Crawling desktop wallpaper images with the Scrapy framework

2022-10-17

Target data: ZOL desktop wallpapers, specifically the images in every album across the 19 pages of the [风景] (scenery) [1920*1080] category.

items.py

import scrapy


class Zol2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # image_urls / images are the field names the ImagesPipeline expects;
    # image_title is used by the custom pipeline to build the file name
    image_urls = scrapy.Field()
    images = scrapy.Field()

    image_title = scrapy.Field()

 

pipelines.py 

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    # num = 1
    def get_media_requests(self, item, info):
        # image_urls holds a single URL string in this project, so yield one
        # Request and pass the item along in meta so file_path() can read it
        image_url = item["image_urls"]
        if image_url:
            # self.num + 1
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        # name every downloaded file after the sequential title set by the spider
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
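The file_path() override above carries the deprecation shim copied from the Scrapy 1.x source. On newer Scrapy releases, where file_key()/image_key() no longer exist, only the naming logic is needed. A minimal sketch, assuming a recent Scrapy version (this simplified variant is not part of the original project):

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # image_urls is a single URL string here, not a list
        if item.get("image_urls"):
            yield Request(url=item["image_urls"], meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):
        # name each file after the sequential title assigned by the spider
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])

Either way, each image ends up under IMAGES_STORE as desk/<image_title>.jpg.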

 

middlewares.py

from scrapy import signals
from zol2.useragents import agents


class Zol2SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Zol2DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
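Note that `agents`, imported from `zol2.useragents`, is never actually used by the two template classes above, and the useragents module itself is not shown in the post. If the intent was to rotate user agents, a downloader middleware is the usual place to do it. A minimal sketch under that assumption (the class name RandomUserAgentMiddleware is hypothetical, and zol2/useragents.py is assumed to expose a list of UA strings named agents):

import random

from zol2.useragents import agents


class RandomUserAgentMiddleware(object):
    # pick a random User-Agent for every outgoing request
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(agents)

It would also have to be registered in DOWNLOADER_MIDDLEWARES; see the note after the settings listing below.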

 

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for zol2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zol2'

SPIDER_MODULES = ['zol2.spiders']
NEWSPIDER_MODULE = 'zol2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'zol2.pipelines.Zol2Pipeline': 300,
}
IMAGES_STORE = "/home/pyvip/env_spider/zol2/zol2/images"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
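If the hypothetical RandomUserAgentMiddleware sketched after middlewares.py were actually implemented, it would need to be enabled here as well, instead of relying only on the static USER_AGENT, e.g.:

# settings.py addition -- only needed if the hypothetical
# RandomUserAgentMiddleware is actually added to middlewares.py
DOWNLOADER_MIDDLEWARES = {
    'zol2.middlewares.RandomUserAgentMiddleware': 400,
}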

 

pazol2.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zol2.items import Zol2Item

class Pazol2Spider(CrawlSpider):
    name = 'pazol2'
    # allowed_domains = ['desk.zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/fengjing/1920x1080/']
    front_url = "http://desk.zol.com.cn"
    num = 1

    rules = (
        # 1. Follow the pagination links of the category listing
        #    (follow=True is all this rule needs; no callback is defined for it)
        Rule(LinkExtractor(allow=r'/fengjing/1920x1080/[0-1]?[0-9]?.html'), follow=True),
        # 2. Enter every image page of each album
        Rule(LinkExtractor(allow=r'/bizhi/\d+_\d+_\d+.html', restrict_xpaths=("//div[@class='main']/ul[@class='pic-list2  clearfix']/li", "//div[@class='photo-list-box']")), follow=True),
        # 3. Follow each image's 1920*1080 button to reach the full-size picture page
        Rule(LinkExtractor(allow=r'/showpic/1920x1080_\d+_\d+.html'), callback='get_img', follow=True),
    )

    def get_img(self, response):
        item = Zol2Item()
        # the full-size page contains a single <img> directly under <body>
        item['image_urls'] = response.xpath("//body/img[1]/@src").extract_first()
        item['image_title'] = str(self.num)
        self.num += 1
        yield item
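To start the crawl, run `scrapy crawl pazol2` from the project root. Alternatively, a small runner script can launch it programmatically; a minimal sketch (the file name run.py is an assumption, it is not part of the original post):

# run.py -- place next to scrapy.cfg and run with: python run.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('pazol2')  # spider name defined in pazol2.py
process.start()          # blocks until the crawl finishes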

 

Crawl results

 

 

A total of 4,517 images were crawled, taking 108 minutes.

I put them in the desktop wallpaper gallery, set to rotate every half hour. Very satisfying.

 
