实现的原理

首先需要在 settings.py 中配置
# Connection info for the shared redis instance that coordinates the crawl.
REDIS_HOST = 'XXX.XXX.XXX.XXX'
REDIS_PORT = 6379
REDIS_PARAMS = {'password':'xxxx'}  # can be omitted if redis has no password set
# Use the scheduler provided by scrapy_redis.
# Fixed typo: the setting name is SCHEDULER (was SCHDULER) and the class is
# 'scrapy_redis.scheduler.Scheduler' (was 'Schduler'); Scrapy would have
# silently ignored the misspelled setting and used its default scheduler.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Redis-backed request-fingerprint deduplication shared across workers.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Keep scheduler/dupefilter state in redis so a crawl can be paused and resumed.
SCHEDULER_PERSIST = True
需要在爬虫文件中将 start_urls 注释掉,并且让爬虫继承自 RedisCrawlSpider:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
# Fixed import path: the module is scrapy_redis.spiders (plural), not
# scrapy_redis.spider — the original line raises ModuleNotFoundError.
from scrapy_redis.spiders import RedisCrawlSpider


class ExampleSpider(RedisCrawlSpider):
    """Distributed crawl-spider that pulls its start URLs from redis.

    Instead of a hard-coded ``start_urls`` list, workers block on the redis
    list named by ``redis_key``; seed it (in redis db 0) with:
    ``lpush example:start_urls https://example.com``
    """

    name = 'example'
    allowed_domains = ['example.com']
    # "example" is this spider's own prefix; the ":start_urls" suffix is the
    # scrapy_redis convention and must match the lpush key.
    redis_key = "example:start_urls"

    rules = (
        Rule(LinkExtractor(allow=r'patterns'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Fixed NameError: the original yielded an undefined name `item`.
        # Build a real item (or populated dict) from the response here.
        item = {'url': response.url}
        yield item

# redis seeding command: lpush example:start_urls http://www.example.com
网友评论