import scrapy
# 需要继承scrapy.Spider类
class mingyan(scrapy.Spider):
    """Minimal spider that downloads two listing pages and saves them to disk.

    It illustrates the basic crawl flow:

    1. define the URLs to fetch;
    2. download each page through a request;
    3. handle the downloaded response in a callback (here the raw page is
       only written to a file; no data is extracted).
    """

    # Spider name, used to select it on the command line:
    #   scrapy crawl mingyan2
    name = "mingyan2"

    def start_requests(self):
        """Yield one request per start URL, each handled by ``parse``."""
        # URLs to crawl.
        urls = [
            'http://lab.scrapyd.cn/page/1/',
            'http://lab.scrapyd.cn/page/2/',
        ]
        for url in urls:
            # Hand every downloaded page over to ``parse``.
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Save the downloaded page body to ``mingyan-<page>.html``.

        No extraction (XPath, CSS or regular expressions) is done here; the
        raw response body is written unchanged to a file in the current
        working directory.

        Args:
            response: The downloaded page. Its URL is expected to end in
                ``/page/<n>/`` so that the page number can be read from it.
        """
        # The second-to-last "/"-separated piece of the URL is the page
        # number, e.g. ".../page/1/" -> "1".
        page = response.url.split("/")[-2]
        # Build the output file name, e.g. "mingyan-1.html" for page 1.
        filename = 'mingyan-%s.html' % page
        # ``response.body`` holds the raw bytes of the page, hence the
        # binary write mode.
        with open(filename, 'wb') as f:
            f.write(response.body)
        # Record which file was written.
        self.log('保存文件: %s' % filename)
运行结果:
night@localhost:~/Project/PycharmProjects/test_scrapy$ scrapy crawl mingyan2
2019-02-03 14:28:28 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: test_scrapy)
2019-02-03 14:28:28 [scrapy.utils.log] INFO: Versions: lxml 4.3.0.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 18.9.0, Python 3.7.2 (default, Jan 15 2019, 15:43:38) - [GCC 5.4.0 20160609], pyOpenSSL 18.0.0 (OpenSSL 1.1.0j 20 Nov 2018), cryptography 2.4.2, Platform Linux-4.15.0-43-generic-x86_64-with-debian-stretch-sid
2019-02-03 14:28:28 [scrapy.crawler] INFO: Overridden settings: {'BOT_NAME': 'test_scrapy', 'NEWSPIDER_MODULE': 'test_scrapy.spiders', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['test_scrapy.spiders']}
2019-02-03 14:28:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.logstats.LogStats']
2019-02-03 14:28:28 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-02-03 14:28:28 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-02-03 14:28:28 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-02-03 14:28:28 [scrapy.core.engine] INFO: Spider opened
2019-02-03 14:28:28 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-02-03 14:28:28 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2019-02-03 14:28:29 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/robots.txt> (failed 1 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:29 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/robots.txt> (failed 2 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:29 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET http://lab.scrapyd.cn/robots.txt> (failed 3 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:29 [scrapy.downloadermiddlewares.robotstxt] ERROR: Error downloading <GET http://lab.scrapyd.cn/robots.txt>: Connection was refused by other side: 111: Connection refused.
Traceback (most recent call last):
File "/home/night/.pyenv/versions/3.7.2/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.ConnectionRefusedError: Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/page/1/> (failed 1 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/page/2/> (failed 1 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/page/2/> (failed 2 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET http://lab.scrapyd.cn/page/1/> (failed 2 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET http://lab.scrapyd.cn/page/1/> (failed 3 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <GET http://lab.scrapyd.cn/page/2/> (failed 3 times): Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.core.scraper] ERROR: Error downloading <GET http://lab.scrapyd.cn/page/1/>
Traceback (most recent call last):
File "/home/night/.pyenv/versions/3.7.2/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.ConnectionRefusedError: Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:30 [scrapy.core.scraper] ERROR: Error downloading <GET http://lab.scrapyd.cn/page/2/>
Traceback (most recent call last):
File "/home/night/.pyenv/versions/3.7.2/lib/python3.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.ConnectionRefusedError: Connection was refused by other side: 111: Connection refused.
2019-02-03 14:28:31 [scrapy.core.engine] INFO: Closing spider (finished)
2019-02-03 14:28:31 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/exception_count': 9,
'downloader/exception_type_count/twisted.internet.error.ConnectionRefusedError': 9,
'downloader/request_bytes': 1989,
'downloader/request_count': 9,
'downloader/request_method_count/GET': 9,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 2, 3, 6, 28, 31, 70341),
'log_count/DEBUG': 10,
'log_count/ERROR': 3,
'log_count/INFO': 7,
'memusage/max': 50855936,
'memusage/startup': 50855936,
'retry/count': 6,
'retry/max_reached': 3,
'retry/reason_count/twisted.internet.error.ConnectionRefusedError': 6,
'scheduler/dequeued': 6,
'scheduler/dequeued/memory': 6,
'scheduler/enqueued': 6,
'scheduler/enqueued/memory': 6,
'start_time': datetime.datetime(2019, 2, 3, 6, 28, 28, 982760)}
2019-02-03 14:28:31 [scrapy.core.engine] INFO: Spider closed (finished)
这是怎么回事?所有请求(包括 robots.txt 和两个分页)都提示 Connection refused(错误码 111),重试 3 次后放弃,没有保存任何页面。求大佬帮忙看看。
本文介绍使用Scrapy框架进行网页爬取的全过程,包括定义爬虫类、请求页面、解析内容并保存文件。通过实例演示了Scrapy的基本工作流程。

2389

被折叠的 条评论
为什么被折叠?



