Scrapy is a crawling framework built on top of Twisted that ties together a number of other modules.
Install the downloaded .whl file with pip3; after that the scrapy command is available.
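On Windows the install typically looks like the sketch below (the Twisted wheel filename is a placeholder that depends on your Python version and architecture; on Linux/macOS `pip3 install scrapy` alone is usually enough):
pip3 install wheel
pip3 install Twisted-<version>-<cp-tag>-win_amd64.whl  # the downloaded Twisted wheel
pip3 install pywin32
pip3 install scrapy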
Examples
#1. Global commands: make sure you are NOT inside a project directory, so the output is not affected by that project's settings
scrapy startproject MyProject # create a project
scrapy genspider baidu www.baidu.com # create a spider
scrapy settings --get XXX # if run inside a project directory, the project's own value is shown instead
scrapy runspider baidu.py # run a single spider file
scrapy shell https://www.baidu.com # interactive shell; inside it you can inspect:
response
response.status
response.body
view(response)
scrapy view https://www.taobao.com # open the downloaded page in a browser; anything missing is loaded by AJAX, which makes such content easy to spot
scrapy fetch --nolog --headers https://www.taobao.com # fetch the page and print the response headers
scrapy version # Scrapy's version
scrapy version -v # also the versions of its dependencies
#2. Project commands: run from inside the project directory
scrapy crawl baidu # run the spider named baidu
scrapy check # run the contract checks
scrapy list # list the spiders in the project
scrapy parse http://quotes.toscrape.com/ --callback parse # fetch the URL and process it with the given callback
scrapy bench # quick local benchmark
Project root directory
│ entrypoint.py # startup script: runs the crawler programmatically
│ scrapy.cfg # project deployment config
└─Amazon
  │ items.py # Item definitions: the data structures for scraped data
  │ middlewares.py # spider / downloader middlewares
  │ pipelines.py # item pipelines: persistence of the structured data
  │ settings.py # project settings, e.g. crawl depth, concurrency, download delay
  │ __init__.py
  └─spiders # spiders directory: one file per spider with its crawl rules
    │ amazon.py # the spider itself
    └─ __init__.py
entrypoint.py:
from scrapy.cmdline import execute

# execute() does not return (it exits the process), so keep exactly one call active:
# execute(["scrapy", "crawl", "amazon"])
# execute(["scrapy", "crawl", "amazon", "--nolog"])

# pass an argument to the spider (received by its __init__ as "keyword"):
keyword = input("Enter a search keyword >>> ")
execute(["scrapy", "crawl", "amazon", "-a", "keyword=%s" % keyword])
The spider, spiders/amazon.py:
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urlencode
# the scheduler (imported here only for reference; not used below)
from scrapy.core.scheduler import Scheduler
# the default dedup filter
# from scrapy.dupefilters import RFPDupeFilter

class AmazonSpider(scrapy.Spider):
    name = 'amazon'  # spider name
    allowed_domains = ['www.amazon.com']  # domains the spider is allowed to crawl
    start_urls = ['http://www.amazon.com/']  # URLs to start from
    # per-spider settings that override the project settings
    custom_settings = {
        "BOT_NAME": "chuck",
        "DEFAULT_REQUEST_HEADERS": {}
    }

    def __init__(self, keyword="iphone8", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword = keyword
    # entry point: generates the initial requests
    def start_requests(self):
        # identical Requests are dropped by the dupefilter unless dont_filter=True, e.g.:
        # yield scrapy.Request(
        #     "https://www.amazon.com/b/ref=unrec_bubbler_2/136-4368269-0847354?_encoding=UTF8&node=12847721&ref=unrec_bubbler_2&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=&pf_rd_r=B8D3YND7QYX4393P8SXD&pf_rd_t=36701&pf_rd_p=e00909ac-c77c-445a-9c6c-9c021da40fa3&pf_rd_i=desktop",
        #     self.parse,
        #     # dont_filter=True
        # )
        yield scrapy.Request(
            "https://www.amazon.com/s/ref=nb_sb_noss_1?{0}".format(urlencode({"field-keywords": self.keyword})),
            callback=self.parse,
        )
    # handles the response of each request
    def parse(self, response):
        # dump the response to a uniquely named file, for debugging:
        # import hashlib
        # import time
        # md = hashlib.md5()
        # md.update(str(time.time()).encode("utf-8"))
        # with open("%s.html" % md.hexdigest(), "w", encoding="utf-8") as wf:
        #     wf.write(response.text)
        print("=========>", len(response.text))

    def close(spider, reason):
        # called once when the spider is closed
        print("spider closed")
Dedup, method 1: filter manually inside the parse callback (self.visited is a set created in __init__):
def parse(self, response):
    if response.url in self.visited:
        return None
    .......
    self.visited.add(response.url)
- Improvement on method 1: URLs can be long, so store a hash of the URL instead
```python
import hashlib

def parse(self, response):
    url = hashlib.md5(response.request.url.encode("utf-8")).hexdigest()
    if url in self.visited:
        return None
    .......
    self.visited.add(url)
```
Settings for the built-in dedup (method 2):
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter' # the default dedup rule; fingerprints are kept in memory
DUPEFILTER_DEBUG = False
JOBDIR = "directory where the crawl state is saved, e.g. /root/" # fingerprints end up in /root/requests.seen, i.e. the dedup state is persisted to a file
from scrapy.dupefilters import RFPDupeFilter # where the default filter is defined
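RFPDupeFilter does not compare raw URLs; it deduplicates on a request fingerprint, so the same URL with its query parameters in a different order still counts as one request. A quick sketch of checking that by hand (request_fingerprint is the helper used by older Scrapy versions; newer releases expose a fingerprinter API instead):
```python
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

r1 = Request("http://www.example.com/?a=1&b=2")
r2 = Request("http://www.example.com/?b=2&a=1")  # same parameters, different order
print(request_fingerprint(r1) == request_fingerprint(r2))  # True: considered duplicates
```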
Step 1: define a custom dedup class in dup.py inside the project package
class UrlFilter(object):
    def __init__(self):
        self.visited = set()  # could also live in a database

    @classmethod
    def from_settings(cls, settings):
        return cls()

    def request_seen(self, request):
        if request.url in self.visited:
            return True
        self.visited.add(request.url)

    def open(self):  # can return a deferred
        pass

    def close(self, reason):  # can return a deferred
        pass

    def log(self, request, spider):  # log that a request has been filtered
        pass
Step 2: point to it in settings.py:
DUPEFILTER_CLASS = 'Amazon.dup.UrlFilter'  # i.e. '<project package>.dup.UrlFilter'
Register the pipelines in settings.py (the number is the order: lower values run first):
ITEM_PIPELINES = {
    'Amazon.pipelines.CustomPipeline': 200,
    'Amazon.pipelines.CustomPipeline2': 300,
}
The pipeline classes, in pipelines.py:
from pymongo import MongoClient

class CustomPipeline(object):
    def __init__(self, host, port, user, pwd, db, table):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = db
        self.table = table

    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy first checks (via getattr) whether we defined from_crawler;
        if so, it is used to build the instance.
        """
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        """
        Runs once when the spider starts.
        """
        self.client = MongoClient('mongodb://%s:%s@%s:%s' % (self.user, self.pwd, self.host, self.port))

    def close_spider(self, spider):
        """
        Runs once when the spider closes.
        """
        self.client.close()

    def process_item(self, item, spider):
        # persist the item to MongoDB
        self.client[self.db][self.table].insert_one(dict(item))
        # return the item so the next pipeline (e.g. CustomPipeline2) receives it too
        return item
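process_item converts the item with dict(item), which works for plain dicts as well as for Item classes declared in items.py. A minimal sketch of such a declaration (AmazonItem and its fields are placeholders, not taken from the original project):
```python
# items.py
import scrapy

class AmazonItem(scrapy.Item):
    # declare one Field per piece of data the spider yields
    keyword = scrapy.Field()
    title = scrapy.Field()
```
The spider can then yield AmazonItem(keyword=..., title=...) instead of a plain dict.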
Spider middleware (in middlewares.py, from the project template):
from scrapy import signals

class AmazonSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
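A spider middleware only runs once it is enabled in settings.py; a sketch of the entry (543 is the value the project template uses by default, any number that gives the desired ordering works):
```python
# settings.py
SPIDER_MIDDLEWARES = {
    'Amazon.middlewares.AmazonSpiderMiddleware': 543,
}
```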
#1. Create proxy_handle.py next to middlewares.py (it talks to a proxy-pool service listening on 127.0.0.1:5010)
import requests

def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").text

def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

#2. Use it from a downloader middleware in middlewares.py
from Amazon.proxy_handle import get_proxy, delete_proxy
class DownMiddleware1(object):
    def process_request(self, request, spider):
        """
        Called for every request, by each downloader middleware in turn, before it is downloaded.
        :param request:
        :param spider:
        :return:
            None: continue to the next middleware and download the request
            Response object: stop calling process_request and start calling process_response
            Request object: stop the middleware chain and send the Request back to the scheduler
            raise IgnoreRequest: stop calling process_request and start calling process_exception
        """
        proxy = "http://" + get_proxy()
        request.meta['download_timeout'] = 20
        request.meta["proxy"] = proxy
        print('adding proxy for %s: %s ' % (request.url, proxy), end='')
        print('request.meta:', request.meta)

    def process_response(self, request, response, spider):
        """
        Called with the response on the way back from the downloader.
        :param request:
        :param response:
        :param spider:
        :return:
            Response object: passed on to the remaining process_response methods
            Request object: stop the middleware chain, the request is rescheduled for download
            raise IgnoreRequest: Request.errback is called
        """
        print('response status:', response.status)
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or a process_request() (of a downloader middleware) raises an exception.
        :param request:
        :param exception:
        :param spider:
        :return:
            None: hand the exception to the remaining middlewares
            Response object: stop calling the remaining process_exception methods
            Request object: stop the middleware chain, the request is rescheduled for download
        """
        print('proxy %s failed for %s: %s' % (request.meta['proxy'], request.url, exception))
        import time
        time.sleep(5)
        # drop the dead proxy from the pool and retry with a fresh one
        delete_proxy(request.meta['proxy'].split("//")[-1])
        request.meta['proxy'] = 'http://' + get_proxy()
        return request
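Like spider middlewares, downloader middlewares have to be enabled in settings.py before they take effect; a sketch (the priority 543 is arbitrary here):
```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'Amazon.middlewares.DownMiddleware1': 543,
}
```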
Selectors (xpath / css / re)
scrapy shell https://doc.scrapy.org/en/latest/_static/selectors-sample1.html
#1 // vs /
>>> response.xpath('//body/a') # a leading // searches the whole document; the / after body matches only direct children of body
[]
>>> response.xpath('//body//a') # the // after body matches all descendants of body
[<Selector xpath='//body//a' data='<a href="image1.html">Name: My image 1 <'>, <Selector xpath='//body//a' data='<a href="image2.html">Name: My image 2 <'>, <Selector xpath='//body//a' data='<a href="image3.html">Name: My image 3 <'>, <Selector xpath='//body//a' data='<a href="image4.html">Name: My image 4 <'>, <Selector xpath='//body//a' data='<a href="image5.html">Name: My image 5 <'>]
#2 text
>>> response.xpath('//body//a/text()')
>>> response.css('body a::text')
#3. extract and extract_first: pull the data out of the Selector objects
>>> response.xpath('//div/a/text()').extract()
['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
>>> response.css('div a::text').extract()
['Name: My image 1 ', 'Name: My image 2 ', 'Name: My image 3 ', 'Name: My image 4 ', 'Name: My image 5 ']
>>> response.xpath('//div/a/text()').extract_first()
'Name: My image 1 '
>>> response.css('div a::text').extract_first()
'Name: My image 1 '
#4. Attributes: in xpath, prefix the attribute name with @
>>> response.xpath('//div/a/@href').extract_first()
'image1.html'
>>> response.css('div a::attr(href)').extract_first()
'image1.html'
#5. Nested (chained) lookups
>>> response.xpath('//div').css('a').xpath('@href').extract_first()
'image1.html'
#6. Provide a default value when nothing matches
>>> response.xpath('//div[@id="xxx"]').extract_first(default="not found")
'not found'
#7. Select by exact attribute value
response.xpath('//div[@id="images"]/a[@href="image3.html"]/text()').extract()
response.css('#images a[href="image3.html"]::text').extract()
#8. Fuzzy (substring) attribute matching
response.xpath('//a[contains(@href,"image")]/@href').extract()
response.css('a[href*="image"]::attr(href)').extract()
response.xpath('//a[contains(@href,"image")]/img/@src').extract()
response.css('a[href*="imag"] img::attr(src)').extract()
response.xpath('//*[@href="image1.html"]')
response.css('*[href="image1.html"]')
#9. Regular expressions
response.xpath('//a/text()').re(r'Name: (.*)')
response.xpath('//a/text()').re_first(r'Name: (.*)')
#10. Relative xpath paths
>>> res=response.xpath('//a[contains(@href,"3")]')[0]
>>> res.xpath('img')
[<Selector xpath='img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('./img')
[<Selector xpath='./img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('.//img')
[<Selector xpath='.//img' data='<img src="image3_thumb.jpg">'>]
>>> res.xpath('//img') # // without a leading dot scans the whole document again
[<Selector xpath='//img' data='<img src="image1_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image2_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image3_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image4_thumb.jpg">'>, <Selector xpath='//img' data='<img src="image5_thumb.jpg">'>]
#11. xpath with variables
>>> response.xpath('//div[@id=$xxx]/a/text()', xxx='images').extract_first()
'Name: My image 1 '
>>> response.xpath('//div[count(a)=$yyy]/@id', yyy=5).extract_first() # the id of the div that contains 5 <a> tags
'images'
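The same selector API also works outside scrapy shell, on any piece of HTML, via scrapy.selector.Selector, which is handy for quick experiments. A small sketch:
```python
from scrapy.selector import Selector

html = '<div id="images"><a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a></div>'
sel = Selector(text=html)
print(sel.css('a::attr(href)').extract_first())         # 'image1.html'
print(sel.xpath('//a/text()').re_first(r'Name: (.*)'))  # 'My image 1 '
```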