1 Overview
Uses Twisted for network communication, asynchronous
Makes it easier to build large-scale crawling projects
Built-in extraction mechanisms called selectors: regex | css | xpath
Fast thanks to asynchronous processing
Automatically throttles the crawl speed
Free and open source
Can export data as JSON, CSV, XML, etc.
Installation
# Install Twisted
# search for and download the matching .whl file
# pywin32
pip install pywin32
pip install scrapy
# Create a project
scrapy startproject a_hello
# Create a spider
scrapy genspider baidu baidu.com
# Change the UA in settings.py
# Run
scrapy crawl baidu
# Start from code instead, which makes debugging easier
main.py
------------
from scrapy.cmdline import execute
execute('scrapy crawl baidu'.split())
--------------
# The console output lists Scrapy's default downloader middlewares and pipelines
First spider code
import scrapy


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # domains the spider is allowed to crawl
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    # handle the response
    def parse(self, response):
        print(response.text)
Chinese docs:
https://www.osgeo.cn/scrapy/
English (latest):
https://docs.scrapy.org/en/latest/
2 pipeline
Pipelines can take the parsed data and write it to files | store it in a database, etc.
Defining an item
items.py
import scrapy

# define the fields
class MovieItem(scrapy.Item):
    name = scrapy.Field()
    score = scrapy.Field()
Yielding data spiders/maoyan.py
import scrapy
from ..items import MovieItem


class MaoyanSpider(scrapy.Spider):
    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    start_urls = ['https://maoyan.com/films']

    def parse(self, response):
        print(response.text)
        names = response.xpath('//div[@class="channel-detail movie-item-title"]/@title').extract()
        scores_div = response.xpath('//div[@class="channel-detail channel-detail-orange"]')
        scores = []
        for score in scores_div:
            # with a single result extract_first() is recommended; extract()[0] may raise IndexError
            scores.append(score.xpath('string(.)').extract_first())
        for name, score in zip(names, scores):
            # print(name, ":", score)
            # # push to the pipeline
            # # only dicts and Items can be yielded
            # yield {'name': name, 'score': score}
            # create a fresh item per movie instead of reusing one instance
            item = MovieItem()
            item['name'] = name
            item['score'] = score
            yield item
Pipeline data handling pipelines.py
import json
from itemadapter import ItemAdapter


# data processing
class AHelloPipeline:
    def open_spider(self, spider):
        self.file = open('movie.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # could branch on a type field set by the spider
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()
# multiple pipelines can be configured; the number sets the priority and lower values run first
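For reference, the pipeline also has to be enabled in settings.py (a minimal sketch, assuming the a_hello project layout created above):
# settings.py
ITEM_PIPELINES = {
    'a_hello.pipelines.AHelloPipeline': 300,
}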
3 Exercise - Qidian novel site
Saving data as JSON and other formats
scrapy crawl qidian -o book.json
scrapy crawl qidian -o book.xml
# the CSV output may come out garbled (encoding issue)
scrapy crawl qidian -o book.csv
spiders\qidian.py
import scrapy


class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    # multiple start URLs can be provided
    start_urls = ['https://www.qidian.com/rank/fengyun?style=1']

    # requests can also be built by hand
    # def start_requests(self):
    #     yield scrapy.Request("https://www.qidian.com/rank/fengyun?style=1")

    def parse(self, response):
        # //h4/a/text()
        # print(response.text)
        # use extract() when there are multiple results
        names = response.xpath('//h4/a/text()').extract()
        authors = response.xpath('//p[@class="author"]/a[1]/text()').extract()
        print(names)
        print(authors)
        book = []
        for name, author in zip(names, authors):
            book.append({'name': name, 'author': author})
        # returned data is handed to the pipeline / feed exporter
        return book
Storing different categories in different directories
You can add a type field to the data, then branch on it in the pipeline (see the sketch below)
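A minimal sketch of that idea, assuming the spider adds a hypothetical 'type' field to every yielded dict (field name, directory layout and file names are illustrative only):
# pipelines.py (sketch)
import json
import os


class TypeRoutingPipeline:
    def open_spider(self, spider):
        # one output file per category, opened lazily
        self.files = {}

    def process_item(self, item, spider):
        category = item.get('type', 'default')
        if category not in self.files:
            os.makedirs(category, exist_ok=True)
            self.files[category] = open(os.path.join(category, 'data.txt'), 'w', encoding='utf-8')
        self.files[category].write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()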
4 Exercise - scraping a novel
https://www.81zw.us/book/606/424359.html
Watch the li order; if the li list is unordered, open a chapter page and follow its "next chapter" link instead
'''
spiders/zww.py
Extract the title and body text and push them to the pipeline
Extract the next-page URL and keep parsing
'''
import scrapy


class ZwwSpider(scrapy.Spider):
    name = 'zww'
    allowed_domains = ['81zw.us']
    start_urls = ['https://www.81zw.us/book/606/424359.html']

    def parse(self, response):
        with open('1.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        # extract the title
        title = response.xpath('//h1/text()').extract_first()
        # extract the body, turning the indent characters into newlines
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
        # push to the pipeline
        yield {
            'title': title,
            'content': content
        }
        # get the next-page URL
        next_url = response.xpath('//div[@class="bottem2"]/a[4]/@href').extract_first()
        base_url = 'https://www.81zw.us{}'.format(next_url)
        # only follow if it points at a chapter page
        if base_url.find('.html') != -1:
            # parse the next page
            yield scrapy.Request(response.urljoin(base_url), callback=self.parse)

    def parse_info(self, response):
        pass
'''
pipelines.py
Write the data to a file
By default the buffer is only written to disk once it fills up, so flush manually
Defines open and close hooks for the file
'''
class BNovelPipeline:
    def open_spider(self, spider):
        self.file = open('lwcs.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        info = title + '\n' + content + '\n'
        self.file.write(info)
        # flush to disk manually
        self.file.flush()
        return item

    def close_spider(self, spider):
        self.file.close()
Link extractor - LinkExtractor
# create the spider from the crawl template
scrapy genspider -t crawl zww2 81zw.us
'''
spiders/zww2.py
Rules with a link extractor pull out the next-page URLs
Newly extracted URLs go back to the scheduler for downloading
The first-chapter URL needs to be matched separately
'''
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Zww2Spider(CrawlSpider):
    name = 'zww2'
    allowed_domains = ['81zw.us']
    start_urls = ['https://www.81zw.us/book/1215']

    rules = (
        # several rules can be defined; LinkExtractor extracts links, callback parses them, follow decides whether to keep following
        # it is enough to locate the <a> tag
        # the first-chapter URL gets its own rule
        Rule(LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[8]/a'), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="bottem2"]/a[4]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
        yield {
            'title': title,
            'content': content
        }
5 Downloading images - ImagesPipeline
Downloads images, converts them to a common format, and deduplicates
Here we scrape an image set from the ZOL wallpaper site
http://desk.zol.com.cn/bizhi/9257_113237_2.html
# settings.py
# register the image pipeline
ITEM_PIPELINES = {
    'c_image.pipelines.ImagePipeline': 300,
}
# directory where images are saved
IMAGES_STORE = 'd:/dat/img'
'''
spiders/zol.py
Push the image data to the pipeline
'''
import scrapy


class ZolSpider(scrapy.Spider):
    name = 'zol'
    allowed_domains = ['zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/bizhi/9257_113237_2.html']

    def parse(self, response):
        # extract the image URL and name
        image_url = response.xpath('//img[@id="bigImg"]/@src').extract()
        image_name = response.xpath('string(//h3)').extract_first()
        # push to the pipeline
        yield {
            "image_urls": image_url,
            # the extra name field needs a custom pipeline
            "image_name": image_name
        }
        # extract the URL of the next image page
        next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
        if next_url.find('.html') != -1:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
'''
pipelines.py
Saving the images
'''
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class CImagePipeline(object):
    def process_item(self, item, spider):
        return item


class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_name': item['image_name']})

    def file_path(self, request, response=None, info=None):
        # clean whitespace out of the name before using it as the file name
        file_name = request.meta['image_name'].strip().replace('/', '_').replace('\r\n\t\t', '') + '.jpg'
        return file_name
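ImagesPipeline also exposes item_completed() for checking the download results; a minimal sketch (the class name is made up here, and dropping failed items is an extra behaviour, not something the notes do):
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CheckedImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, detail) tuples, one per requested image URL
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("image download failed")
        item['image_paths'] = image_paths
        return item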
6 Downloader middleware
Set the UA | proxy, etc.
# settings.py
# define a UA list
USER_AGENTS = ['a', 'b', 'c']
...
# register the downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'c_image.proxymiddleware.ProxyMiddleware': 343
}
from random import choice
from fake_useragent import UserAgent
from .settings import USER_AGENTS
'''
middlewares.py
A downloader middleware can set the UA on every request
Two options: choose randomly from the list, or use fake_useragent
'''
class UserAgentDownloadMiddleware(object):
    def process_request(self, request, spider):
        # option 1: choose a random UA from the settings list
        # request.headers.setdefault(b'User-Agent', choice(USER_AGENTS))
        # option 2: let fake_useragent pick one
        request.headers.setdefault(b'User-Agent', UserAgent().random)
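The ProxyMiddleware registered in settings.py above is not shown in these notes; a minimal sketch, with a placeholder proxy list:
# proxymiddleware.py (sketch)
from random import choice

PROXIES = ['http://127.0.0.1:8888']  # placeholder, replace with real proxies


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = choice(PROXIES)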
7 Request | Response
FormRequest is used for submitting forms; it is a subclass of Request
'''
spiders/login.py
Submitting a form - FormRequest
'''
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['sxt.cn']
    start_urls = ['http://sxt.cn/']

    def start_requests(self):
        # how would a slider captcha be handled here?
        url = 'https://www'
        # form fields to submit
        form_data = {
            "user": "",
            "password": ""
        }
        yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)
Callbacks can chain parsing across several steps
    def parse(self, response):
        print(response.text)
        yield scrapy.Request("", callback=self.parse_info)

    def parse_info(self, response):
        print(response.text)
Sending cookies
'''
spiders/login2.py
Send cookies along with the request
'''
...
    def start_requests(self):
        cookie_str = "UM_distinctid=174a12e00ae82-0943674d9bf601-333769-144000-174a12e00af331; CNZZDATA1261969808=758276302-1600431365-%7C1600431365; PHPSESSID=jundabfrl22tpqpvjf0nk6o2p7; user=a%3A2%3A%7Bs%3A2%3A%22id%22%3Bi%3A5170%3Bs%3A5%3A%22phone%22%3Bs%3A11%3A%2215053710260%22%3B%7D"
        # convert the cookie string into a dict; build the dict once, outside the loop, so every cookie is kept
        cookies = {}
        for cookie in cookie_str.split(";"):
            key, value = cookie.split('=', 1)
            # strip the surrounding whitespace
            cookies[key.strip()] = value.strip()
        yield scrapy.Request("http://sxt.cn/", cookies=cookies, callback=self.parse)
...
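The same conversion can be written as a one-line dict comprehension (just a sketch of an alternative):
cookies = {k.strip(): v.strip() for k, v in (c.split('=', 1) for c in cookie_str.split(';'))}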
Exercise - Ganji login
https://passport.ganji.com/login.php
'''
spiders/login3.py
Ganji login example
'''
import re

import scrapy


class Login3Spider(scrapy.Spider):
    name = 'login3'
    allowed_domains = ['ganji.com']
    start_urls = ['https://passport.ganji.com/login.php']

    def parse(self, response):
        # extract the randomly generated hash_code from the page; it goes into the login form
        hash_code = re.findall(r'"__hash__":"(.+)"', response.text)[0]
        # extract the captcha image URL
        img_url = response.xpath('//div[@class="form-div js-checkcode"]//img[@class="login-img-checkcode"]/@data-url').extract_first()
        # request the captcha and pass hash_code along via meta
        yield scrapy.Request(img_url, callback=self.parse_info, meta={'hash_code': hash_code})

    def parse_info(self, response):
        hash_code = response.meta['hash_code']
        # save the captcha image to a file
        with open('yzm.jpg', 'wb') as f:
            f.write(response.body)
        # type it in manually at the console
        code = input("Enter the captcha: ")
        # build the login form
        form_data = {
            "username": "15053710260",
            "password": "Ww458141",
            "setcookie": "0",
            "checkCode": code,
            "next": "/",
            "source": "passport",
            "__hash__": hash_code
        }
        login_url = 'https://passport.ganji.com/login.php'
        # post the form to the login URL
        yield scrapy.FormRequest(login_url, callback=self.after_login, formdata=form_data)

    # handle the post-login response
    def after_login(self, response):
        # sample response: ({"site_id":1,"status":1,"user_id":757983285,"sscode":"iu+F3qykQgBhd443iukQlOVf","cookie_expire":1600434536,"site_list":{"5":{"cookie_domain":"ganji.com.cn","host":"http:\/\/sync.ganji.com.cn","login_url":"passport\/sync.php","logout_url":"passport\/sync.php","site_id":5}},"next":"http:\/\/www.ganji.com\/user\/login_success.php?username=15053710260&next=%2F"})
        print(response.text)
8 MongoDB | Scrapy integration
Overview
NoSQL database
High-concurrency reads and writes
In traditional relational databases, transactions | read-your-writes guarantees | join queries limit performance
Collection-oriented; a collection can hold an unlimited number of documents and corresponds to a table
Use cases:
1. Caching | large-volume, low-value data | high scalability
2. Object and JSON storage
Installation
Download:
https://www.mongodb.com/try/download/community
Reference:
https://www.runoob.com/mongodb/mongodb-window-install.html
Note: do not select MongoDB Compass in the installer, it tends to hang
# config file mongod.cfg
------------
systemLog:
  destination: file
  path: d:\dat\mongo\mongod.log
storage:
  dbPath: d:\dat\mongo
------------
# install as a Windows service
C:\sof\MongoDB\Server\4.4\bin\mongod.exe --config "C:\sof\MongoDB\Server\4.4\mongod.cfg" --install
# start it (run cmd as administrator)
net start MongoDB
Install the GUI tool Robo 3T
Download:
https://robomongo.org/download
Just click through the installer
Mongo basics
_id is set automatically: 12 bytes shown as hex, built from a timestamp + machine id + process id + counter
Dynamic columns (documents are schema-less)
Run mongo.exe
# list databases
show dbs
# create/switch to a database
use person
# a new database does not show up in `show dbs` until data is written; it only lives in memory
# create a collection
db.createCollection('student')
use person
# drop the database
db.dropDatabase()
# equivalent to show collections
show tables
# drop a collection
db.student.drop()
# insert does not check for duplicate data; a duplicate _id raises an error
db.student.insert({"name":"Vincent","age":"18"})
# save replaces an existing document with the same _id
# insert several documents at once
db.student.save([{"name": "张三"}, {"name": "李四"}])
# the shell supports loops
for(i=0;i<5;i++){db.student.insert({"name":"张三", "age": i})}
# update (replaces the whole document)
db.student.update({"name":"张三"}, {"name": "张三", "age": 20})
# update a single field
db.student.update({"name":"习近平"}, {$set:{age:30}})
# update multiple documents
db.student.update({"name":"张三"}, {$set:{age:30}}, {multi: true})
# remove
db.student.remove({name:"习近平"})
# remove only one document
db.student.remove({name:"张三"}, {justOne: true})
# remove all documents
db.student.remove({})
# query
db.student.find()
# query with a condition
db.student.find({name:"Vincent"})
# return only one document
db.student.findOne({name:"Vincent"})
# greater-than condition
db.student.find({age:{$gt:18}})
# or condition
db.student.find({$or: [{name: "Vincent"}, {age: 18}]})
# in condition
db.student.find({name:{$in:["Vincent", null]}})
# fuzzy matching with a regex
db.student.find({name: /^Vin/})
# alternative regex syntax
db.student.find({name: {$regex:"^Vin"}})
# condition expressed as a function
db.student.find({$where:function(){return this.age == null}})
# limit the number of results
db.student.find().limit(2)
# skip
db.student.find().skip(1)
# sort: 1 ascending, -1 descending
db.student.find().sort({name:-1})
# count
db.student.find().count()
Python | Mongo interaction
pip install pymongo
import pymongo

# connect to the server
client = pymongo.MongoClient()
# select the database
person = client.person
# select the collection
student = person.student
# read data
# result = student.find()
# print(result)
# for r in result:
#     print(r)
# print(result.next())
# filter data
# result = student.find({"name":"Vincent"})
# for r in result:
#     print(r)
# sort
# result = student.find().sort("age", pymongo.ASCENDING)
# for r in result:
#     print(r)
# pagination
# result = student.find().limit(1).skip(1)
# for r in result:
#     print(r)
# count
# result = student.find().count()
# print(result)
# insert data
# data = {"name": "lisi", "age": 90}
# student.insert(data)
# data = {"name": "wangwu", "age": 80}
# student.insert_one(data)
# delete
# student.remove({"name":"wangwu"})
# update
data = {"name":"lisi"}
result = student.find_one(data)
result["country"] = "China"
# update() is deprecated (removed in pymongo 4); see the modern equivalents below
student.update(data, {"$set": result})
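Note: insert()/update()/remove() are deprecated and gone in pymongo 4; a sketch of the modern equivalents on the same collection:
student.insert_one({"name": "lisi", "age": 90})
student.insert_many([{"name": "zhaoliu"}, {"name": "sunqi"}])
student.update_one({"name": "lisi"}, {"$set": {"country": "China"}})
student.delete_one({"name": "wangwu"})
print(student.count_documents({}))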
Scrapy | Mongo integration - Douban Top 250
import scrapy
# imported up front to make shell/IDE debugging easier
from scrapy.linkextractors import LinkExtractor
'''
spiders/douban.py
Scrape the Douban Top 250 list
'''
class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    # build the list of page URLs
    start_urls = ['https://movie.douban.com/top250?start={}&filter='.
                  format(num*25) for num in range(10)]

    def parse(self, response):
        # extract the movie titles
        names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
        # extract the ratings
        stars = response.xpath('//span[@class="rating_num"]/text()').extract()
        for name, star in zip(names, stars):
            yield {
                "name": name,
                "star": star
            }
import pymongo
'''
pipelines.py
Store the movie data in Mongo
'''
class EMongoPipeline(object):
    def open_spider(self, spider):
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # item is a plain dict here; use dict(item) if Item objects are yielded
        self.client.douban.movie.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()
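The pipeline still needs to be enabled; a minimal sketch for settings.py (the e_mongo package name is inferred from the class name and may differ):
# settings.py
ITEM_PIPELINES = {
    'e_mongo.pipelines.EMongoPipeline': 300,
}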
9 Debugging
scrapy shell lets you experiment in the console
scrapy shell https://movie.douban.com/top250
from scrapy.linkextractors import LinkExtractor
link = LinkExtractor
link = LinkExtractor(allow=r'/films/\d+')
link.extract_links(response)
LinkExtractor(allow=r'/subject/\d+').extract_links(response)
When debugging with PyCharm, it helps to import the class up front
# imported up front for debugging
from scrapy.linkextractors import LinkExtractor
10 Splash | Scrapy integration
Overview
Renders dynamic pages so the crawler gets the final HTML
Requires Docker
On Windows the official site provides Docker Toolbox
Virtualization must be enabled
Scripted with Lua
Download:
https://get.daocloud.io/toolbox/
Copy boot2docker.iso to C:\Users\Vincent\.docker\machine
Issues
Issue 1: Kitematic hangs at 100%
Some users report that closing Chrome fixes it; a proxy is also worth trying
Issue 2: Docker Quickstart Terminal asks Windows to locate bash.exe
Right-click -> Properties -> point it at the Git bash.exe location
Environment
# pull the image
# the pull can be very slow
docker pull scrapinghub/splash
docker run -p 8050:8050 scrapinghub/splash
# configure the Aliyun registry mirror to speed it up
docker-machine ssh default
sudo sed -i "s|EXTRA_ARGS='|EXTRA_ARGS='--registry-mirror=https://1vrs21kz.mirror.aliyuncs.com |g" /var/lib/boot2docker/profile
exit
docker-machine restart default
# access URL
http://192.168.99.100:8050/
# the service keeps working after the terminal is closed, since the VM keeps running
docker ps
docker kill
Python | Splash interaction
Simple example
'''
hello_splash.py
Minimal Python-Splash interaction via the render.html endpoint
'''
import requests
from fake_useragent import UserAgent

splash_url = "http://192.168.99.100:8050/render.html?url={}&wait=3"
url = "https://www.guazi.com/bj/buy"
response = requests.get(splash_url.format(url), headers={"User-Agent": UserAgent().random})
response.encoding = 'utf-8'
print(response.text)
Using a Lua script
splash_url = "http://192.168.99.100:8050/execute?lua_source={}"
# lua = '''
# function main(splash, args)
#     return 'test'
# end
# '''
url = "https://www.guazi.com/bj/buy"
# note the quotes: the page URL is substituted into the Lua single quotes
lua = '''
function main(splash, args)
    splash:go('{}')
    splash:wait(1)
    return splash:html()
end
'''.format(url)
response = requests.get(splash_url.format(lua), headers={"User-Agent": UserAgent().random})
response.encoding = 'utf-8'
print(response.text)
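Interpolating the Lua script straight into the URL skips URL-encoding; letting requests build the query string is safer (a sketch reusing the lua variable defined above):
import requests
from fake_useragent import UserAgent

response = requests.get(
    "http://192.168.99.100:8050/execute",
    params={"lua_source": lua},  # requests URL-encodes the script
    headers={"User-Agent": UserAgent().random},
)
response.encoding = 'utf-8'
print(response.text)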
Combining Scrapy with Splash
pip install scrapy-splash
# settings.py
# Splash service URL
SPLASH_URL = 'http://192.168.99.100:8050/'
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
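Per the scrapy-splash README, the downloader middlewares also need to be registered, otherwise SplashRequest is not routed through Splash:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}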
'''
spiders/guazi.py
Simple SplashRequest usage
'''
import scrapy
from scrapy_splash import SplashRequest


class GuaziSpider(scrapy.Spider):
    name = 'guazi'
    allowed_domains = ['guazi.com']
    # start_urls = ['https://www.guazi.com/bj/buy']

    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        yield SplashRequest(url, callback=self.parse, args={'wait': 1})

    def parse(self, response):
        print(response.text)
'''
spiders/guazi2.py
Interaction driven by a Lua script
'''
import scrapy
from scrapy_splash import SplashRequest


class Guazi2Spider(scrapy.Spider):
    name = 'guazi2'
    allowed_domains = ['guazi.com']
    # start_urls = ['https://www.guazi.com/bj/buy']

    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        lua = '''
        function main(splash, args)
            assert(splash:go(args.url))
            assert(splash:wait(1))
            return {
                html = splash:html()
            }
        end
        '''
        # yield SplashRequest(url, callback=self.parse, args={'wait': 1})
        yield SplashRequest(url, callback=self.parse, endpoint='execute', args={'lua_source': lua})

    def parse(self, response):
        print(response.text)
11 Selenium | Scrapy integration
Configuration
# settings.py
# register the downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'g_splash.middlewares.SeleniumMiddleware': 543,
}
Spider
# guazi3.py
import scrapy
from selenium import webdriver
from scrapy import signals


class GuaziSpider(scrapy.Spider):
    name = 'guazi3'
    allowed_domains = ['guazi.com']
    start_urls = ['https://www.guazi.com/bj/buy']

    def parse(self, response):
        print(response.text)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # create one shared Chrome instance and hook the spider_closed signal
        spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.chrome = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    # quit the browser when the spider closes
    def spider_closed(self, spider):
        spider.chrome.quit()
        print("Spider finished!!!")
Downloader middleware
from selenium import webdriver
from scrapy import signals
# returning a response here means the downloader skips fetching the request itself
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    # def __init__(self):
    #     self.chrome = webdriver.Chrome()

    def process_request(self, request, spider):
        url = request.url
        # chrome = webdriver.Chrome()
        # chrome.get(url)
        # reuse the spider's browser instead of creating a Chrome instance per request
        # self.chrome.get(url)
        # html = self.chrome.page_source
        # print(html)
        spider.chrome.get(url)
        html = spider.chrome.page_source
        return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')
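page_source is read right after get(), so late JS-rendered content can be missed; an explicit wait is one option (the CSS selector below is a placeholder, not taken from the site):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_page(driver, css_selector='ul.carlist', timeout=10):
    # block until the placeholder selector shows up, then return the rendered HTML
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
    )
    return driver.page_source
This could be called from process_request before building the HtmlResponse.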
12 Exercise - a certain housing site
13 scrapy-redis
Third-party component for distributed crawling
Centralized URL scheduling and centralized data storage
Storing scraped data on the crawler machines is not recommended
The scheduler puts URLs into Redis first
https://github.com/rmax/scrapy-redis
# prepare two Linux machines as worker servers
# upload the Python source tarball and extract it
# dependencies
yum install gcc -y
yum install zlib* openssl* -y
./configure --prefix=/usr/local/Python-3.6.10 --enable-optimizations
make install
cd ~
vi .bashrc
----------
PATH=$PATH:/usr/local/Python-3.6.10/bin
---------
. .bashrc
# test
python36
# install Twisted
# extract Twisted-17.1.0.tar.bz2
python3 setup.py install
pip3 install scrapy-redis
# Aliyun PyPI mirror speeds up installs
# https://blog.csdn.net/qq_42744532/article/details/84955936
# scrapy-redis does not seem to be on the Aliyun mirror
pip install scrapy_redis
# start Redis with a config file, otherwise the Linux crawlers hit a "protected mode" error
------
# bind 127.0.0.1
# disable protected mode (protected-mode no)
------
# upload the Python files to the Linux machines, cd into the spiders directory, then run
scrapy runspider qiubai.py
# on the Windows Redis, push the start URL
LPUSH qiubai:start_urls https://www.qiushibaike.com/text/
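The qiubai.py spider itself is not included in these notes; a minimal sketch based on the scrapy-redis README (spider class, Redis key and XPath are assumptions):
# spiders/qiubai.py (sketch)
from scrapy_redis.spiders import RedisSpider


class QiubaiSpider(RedisSpider):
    name = 'qiubai'
    # the spider blocks on this Redis list for start URLs (matches the LPUSH key above)
    redis_key = 'qiubai:start_urls'

    def parse(self, response):
        for div in response.xpath('//div[@class="content"]'):  # placeholder XPath
            yield {'content': div.xpath('string(.)').get()}
# settings.py (per the scrapy-redis README)
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
ITEM_PIPELINES = {'scrapy_redis.pipelines.RedisPipeline': 300}
REDIS_URL = 'redis://127.0.0.1:6379'  # placeholder, point this at the master Redis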
Importing Redis data into Mongo
pip install redis
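A minimal sketch of moving the scraped items from Redis into Mongo, assuming scrapy-redis's RedisPipeline stored them under the default '<spider>:items' list key:
import json

import pymongo
import redis

r = redis.Redis(host='127.0.0.1', port=6379)  # placeholder Redis address
client = pymongo.MongoClient()
collection = client.qiubai.duanzi  # placeholder database/collection names

while True:
    # blpop blocks until an item is available and returns a (key, value) pair
    source, data = r.blpop('qiubai:items')
    collection.insert_one(json.loads(data))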