I read through the official Scrapy documentation on ImagesPipeline (Downloading Item Images) and found it fairly thin. For example, it says that assigning IMAGES_STORE in settings.py sets the directory images are saved to, and that by default each file is named with the SHA1 hash of its URL. What I want is to save each image under its original filename, and I can't see how to do that. I'd appreciate a pointer from anyone who has done this.
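For reference, the configuration involved is just a couple of lines in settings.py; the directory below is only an example, and depending on the Scrapy version ITEM_PIPELINES is either a list or a dict of class paths:

# settings.py
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
IMAGES_STORE = '/path/to/images'  # downloaded images are written below this directory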
Also, can anyone recommend some books on Scrapy (Chinese or English are both fine)?
Looking at the ImagesPipeline source, you can override the file_path method to change how the image files are named, for example:
def file_path(self, request, response=None, info=None):
    # log each image URL, then use the last segment of the URL as the filename
    open("image_urls.txt", "a").write(request.url + "\n")
    image_guid = request.url.split('/')[-1]
    return 'full/%s' % (image_guid)
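To actually use such an override, you put it in your own pipeline class and register that class instead of the stock one. A minimal, untested sketch; the project name myproject, the class name, and the use of os.path.basename (instead of splitting on '/') are my own choices, and URLs with query strings would need extra handling:

import os
from scrapy.contrib.pipeline.images import ImagesPipeline  # scrapy.pipelines.images in newer versions


class OriginalNameImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # keep the filename from the URL instead of the SHA1 hash
        return 'full/%s' % os.path.basename(request.url)

Then in settings.py point ITEM_PIPELINES at 'myproject.pipelines.OriginalNameImagesPipeline' instead of the built-in pipeline.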
The ImagesPipeline source is as follows:
class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implement the image thumbnail generation logic
    """

    MEDIA_NAME = 'image'
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    THUMBS = {}
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    @classmethod
    def from_settings(cls, settings):
        cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
        cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
        cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
        cls.THUMBS = settings.get('IMAGES_THUMBS', {})
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']

        cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
        cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
        store_uri = settings['IMAGES_STORE']
        return cls(store_uri)

    def file_downloaded(self, response, request, info):
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(StringIO(response.body))

        width, height = orig_image.size
        if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.MIN_WIDTH, self.MIN_HEIGHT))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        for thumb_id, size in self.THUMBS.iteritems():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        if image.format == 'PNG' and image.mode == 'RGBA':
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = StringIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]

    def item_completed(self, results, item, info):
        if self.IMAGES_RESULT_FIELD in item.fields:
            item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)

    # deprecated
    def file_key(self, url):
        return self.image_key(url)
    file_key._base = True

    # deprecated
    def image_key(self, url):
        return self.file_path(url)
    image_key._base = True

    # deprecated
    def thumb_key(self, url, thumb_id):
        return self.thumb_path(url, thumb_id)
    thumb_key._base = True
On "I want to save the images under their original filenames": there doesn't seem to be a setting you can pass for this directly; you need to implement your own images pipeline.
The get_media_requests(item, info) method of scrapy.contrib.pipeline.images.ImagesPipeline issues the image downloads, and the results are fed to the item_completed() method. Each result is a tuple (success, image_info_or_failure), where success is a bool indicating whether the download succeeded, and image_info_or_failure contains three fields: url, path and checksum. path is the path of the saved file, relative to IMAGES_STORE and including the filename.
[(True, {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
         'path': 'full/7d97e98f8af710c7e7fe703abc8f639e0ee507c4.jpg',
         'url': 'http://www.example.com/images/product1.jpg'}),
 (True, {'checksum': 'b9628c4ab9b595f72f280b90c4fd093d',
         'path': 'full/1ca5879492b8fd606df1964ea3c1e2f4520f076f.jpg',
         'url': 'http://www.example.com/images/product2.jpg'}),
 (False, Failure(...))]
The above is the example from the official documentation.
So what you need to do is override the item_completed(results, item, info) method and replace the hashed paths in item['image_paths'] with the original filenames.
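A minimal sketch of that approach; it is untested, assumes the field names image_urls/image_paths used above, and hard-codes the same directory you set as IMAGES_STORE ('/path/to/images' here is a placeholder):

import os

from scrapy.contrib.pipeline.images import ImagesPipeline  # scrapy.pipelines.images in newer versions

IMAGES_DIR = '/path/to/images'  # must match IMAGES_STORE


class RenameToOriginalPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        new_paths = []
        for ok, result in results:
            if not ok:
                continue
            # result['path'] is relative to IMAGES_STORE, e.g. 'full/<sha1>.jpg'
            original_name = result['url'].split('/')[-1]
            new_rel = 'full/%s' % original_name
            os.rename(os.path.join(IMAGES_DIR, result['path']),
                      os.path.join(IMAGES_DIR, new_rel))
            new_paths.append(new_rel)
        item['image_paths'] = new_paths
        return item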
Overriding file_path digs fairly deep into the original code; if all you want is a different file path, you can instead rename the file in item_completed.
import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.contrib.pipeline.images import ImagesPipeline


class NeteaseautoImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # request the 800x600 version instead of the 120x90 thumbnail
            yield scrapy.Request(image_url.replace('120x90', '800x600'))

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        if item['jk']:
            newname = item['car'] + '-' + item['jk'] + '-' + item['model'] + '.jpg'
        else:
            newname = item['car'] + '-' + item['model'] + '.jpg'
        # "/neteaseauto/" is this project's IMAGES_STORE directory
        os.rename("/neteaseauto/" + image_paths[0], "/neteaseauto/" + newname)
        return item
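If you would rather not hard-code the directory, the default local filesystem store keeps the directory derived from IMAGES_STORE in self.store.basedir (an implementation detail, not a documented setting), so the rename could be written as, for example:

# inside item_completed(), replacing the hard-coded "/neteaseauto/":
basedir = self.store.basedir
os.rename(os.path.join(basedir, image_paths[0]),
          os.path.join(basedir, newname))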