一个 spider 爬取多种内容
需求:在同一个 spider 中同时爬取音乐详情和乐评
# Crawl configuration for a spider that scrapes both music detail pages and
# music review pages of music.douban.com.
# NOTE(review): these look like class attributes of a Scrapy CrawlSpider
# subclass; the class header and the imports of Rule / LinkExtractor are not
# shown in this snippet -- confirm against the full file.
name = 'music'
# Restrict crawling to this domain.
allowed_domains = ['music.douban.com']
# Entry points: the tag index page and its "cloud" view.
start_urls = ['https://music.douban.com/tag/',
'https://music.douban.com/tag/?view=cloud'
]
# Link-following rules. The first four rules have no callback, so they only
# discover links; the last two hand matching pages to a parse callback.
# NOTE(review): per the Scrapy docs, when several rules match the same link
# the first one defined is used, so keep the order in mind when editing.
rules = (
# Category (tag) page: URL ends with /tag/<digits | CJK characters | word chars>.
Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
# Category pagination: same tag URL followed by ?start=<n>&type=T.
Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
# Review list of a music subject, sorted by time.
Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
# Review list pagination: same URL followed by &start=<n>.
Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
# Music detail page: handled by parse_music (defined outside this snippet);
# links found on the page keep being followed.
Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_music", follow=True),
# Review detail page: handled by parse_review (defined outside this snippet);
# links found on the page keep being followed.
Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
)
首页.png
目标分类.png
音乐详情页.png
乐评详情页.png







网友评论