Python crawler 02 - Scrapy

Author: 西海岸虎皮猫大人 | Published 2020-09-19 02:17

1 Overview

Uses Twisted for network communication; asynchronous
Easier to build large-scale projects
Built-in extraction mechanism called selectors: regex | css | xpath (see the short example after this list)
Asynchronous processing is fast
Automatically throttles the crawl speed
Free and open source
Can export JSON, CSV, XML and other formats
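
A quick way to try all three selector flavors is scrapy shell (covered again in section 9); the expressions below are only illustrative and assume the page you point the shell at has a title element:

scrapy shell https://movie.douban.com/top250
# XPath selector
response.xpath('//title/text()').extract_first()
# CSS selector
response.css('title::text').extract_first()
# regex applied on top of a selector
response.css('title::text').re_first(r'(.+)')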

Installation
# install Twisted
# search for and download the matching .whl file
# pywin32
pip install pywin32
pip install scrapy
# create a project
scrapy startproject a_hello
# create a spider
scrapy genspider baidu baidu.com

# modify the User-Agent in settings.py

# run
scrapy crawl baidu

# start the crawl from code, which makes debugging easier
main.py
------------
from scrapy.cmdline import execute

execute('scrapy crawl baidu'.split())
--------------
# the console output lists Scrapy's default downloader middlewares and pipelines
The first spider code
class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    # domains the spider is allowed to crawl
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    # handle the response
    def parse(self, response):
        print(response.text)

Chinese documentation:
https://www.osgeo.cn/scrapy/
English (latest):
https://docs.scrapy.org/en/latest/

2 pipeline

A pipeline can write the parsed data to a file, insert it into a database, and so on
Define the item
items.py

import scrapy

# define the item fields
class MovieItem(scrapy.Item):
    name = scrapy.Field()
    score = scrapy.Field()

Pushing data to the pipeline - spiders/maoyan.py

class MaoyanSpider(scrapy.Spider):
    name = 'maoyan'
    allowed_domains = ['maoyao.com']
    start_urls = ['https://maoyan.com/films']

    def parse(self, response):
        print(response.text)
        names = response.xpath('//div[@class="channel-detail movie-item-title"]/@title').extract()
        scores_div = response.xpath('//div[@class="channel-detail channel-detail-orange"]')
        scores = []
        for score in scores_div:
            # when there is only one result, prefer extract_first(); extract()[0] may raise an error
            scores.append(score.xpath('string(.)').extract_first())
        item = MovieItem()
        for name, score in zip(names, scores):
            # print(name, ":", score)
            # # push to the pipeline
            # # only dicts and Item objects can be yielded
            # yield {'name':name, 'score':score}

            item['name'] = name
            item['score'] = score

            yield item

Processing the data in the pipeline - pipelines.py

import json

from itemadapter import ItemAdapter

# data processing
class AHelloPipeline:
    def open_spider(self, spider):
        self.filename = open('movie.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # different actions could be taken here based on a type set in the spider
        self.filename.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.filename.close()


# multiple pipelines can be configured; the number sets the priority (lower runs first), see the settings sketch below
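
For reference, a sketch of how that registration looks in settings.py, assuming both pipeline classes live in the same project's pipelines.py; lower numbers run first:

# settings.py (sketch)
ITEM_PIPELINES = {
    'a_hello.pipelines.AHelloPipeline': 300,
    'a_hello.pipelines.BNovelPipeline': 400,
}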

3 Exercise - Qidian novels (起点小说网)

Saving the data as JSON and other formats

scrapy crawl qidian -o book.json
scrapy crawl qidian -o book.xml
# the CSV output shows garbled characters (see the note after this block)
scrapy crawl qidian -o book.csv
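
The CSV garbling usually shows up when the exported file is opened in Excel; a common fix is to force a BOM-prefixed encoding for feed exports in settings.py:

# settings.py
FEED_EXPORT_ENCODING = 'utf-8-sig'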

spiders\qidian.py

class QidianSpider(scrapy.Spider):
    name = 'qidian'
    allowed_domains = ['qidian.com']
    # multiple start URLs can be passed
    start_urls = ['https://www.qidian.com/rank/fengyun?style=1']

    # custom request construction
    # def start_requests(self):
    #     yield scrapy.Request("https://www.qidian.com/rank/fengyun?style=1")

    def parse(self, response):
        # //h4/a/text()
        # print(response.text)
        # use extract() when there are multiple results
        names = response.xpath('//h4/a/text()').extract()
        authors = response.xpath('//p[@class="author"]/a[1]/text()').extract()
        print(names)
        print(authors)

        book = []
        for name, author in zip(names, authors):
            book.append({'name': name, 'author': author})
        # the returned data is handed to the pipeline
        return book
Storing different categories in different directories

You can set a type field on the data and then branch on it in the pipeline, as sketched below
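
A minimal sketch of that idea, assuming each yielded item carries a hypothetical 'type' field and using placeholder file names:

# pipelines.py (sketch)
import json

class TypeRoutingPipeline:
    def open_spider(self, spider):
        # one output file per category; the names are only examples
        self.files = {
            'fantasy': open('fantasy.txt', 'w', encoding='utf-8'),
            'wuxia': open('wuxia.txt', 'w', encoding='utf-8'),
        }

    def process_item(self, item, spider):
        # route the item by the 'type' field set in the spider
        f = self.files.get(item.get('type'))
        if f:
            f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        for f in self.files.values():
            f.close()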

4 Exercise - scraping a novel

https://www.81zw.us/book/606/424359.html
Note the order of the li elements; if they are unordered, open the chapter detail page and follow the next-chapter link instead

'''
spiders/zww.py
Extract the title and body text and push them to the pipeline
Extract the next-page URL and keep parsing
'''
class ZwwSpider(scrapy.Spider):
    name = 'zww'
    allowed_domains = ['81zw.us']
    start_urls = ['https://www.81zw.us/book/606/424359.html']

    def parse(self, response):
        with open('1.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        # extract the title
        title = response.xpath('//h1/text()').extract_first()
        # extract the body text
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
        # push to the pipeline
        yield {
            'title': title,
            'content': content
        }
        # get the next-page URL
        next_url = response.xpath('//div[@class="bottem2"]/a[4]/@href').extract_first()
        base_url = 'https://www.81zw.us{}'.format(next_url)
        # if there is a next page
        if base_url.find('.html') != -1:
            # parse the next page
            yield scrapy.Request(response.urljoin(base_url), callback=self.parse)

    def parse_info(self, response):
        pass
'''
pipelines.py
Write the data to a file
The file buffer is only flushed to disk once enough data accumulates, so flush manually
Define open and close handlers for the file
'''
class BNovelPipeline:
    def open_spider(self, spider):
        self.file = open('lwcs.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        info = title + '\n' + content + '\n'
        self.file.write(info)
        # flush to disk manually
        self.file.flush()
        return item

    def close_spider(self, spider):
        self.file.close()
Link extractor - LinkExtractor
# create the spider
scrapy genspider -t crawl zww2 81zw.us
'''
spiders/zww2.py
Use link-extractor rules to pull out the next-page URLs
The downloader hands newly extracted URLs to the scheduler
The first-chapter URL needs its own rule
'''
class Zww2Spider(CrawlSpider):
    name = 'zww2'
    allowed_domains = ['81zw.us']
    start_urls = ['https://www.81zw.us/book/1215']

    rules = (
        # multiple rules can be defined; LinkExtractor extracts the links, callback parses them, follow controls whether to keep following
        # pointing the XPath at the <a> tag is enough
        # the first-chapter URL needs its own rule
        Rule(LinkExtractor(restrict_xpaths=r'//*[@id="list"]/dl/dd[8]/a'), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="bottem2"]/a[4]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        title = response.xpath('//h1/text()').extract_first()
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')
        yield {
            'title': title,
            'content': content
        }

5 Downloading images - ImagesPipeline

Downloads images, converts them to a common format, and deduplicates (the pipeline requires the Pillow library)
Here we grab a wallpaper set from the ZOL desktop wallpaper site
http://desk.zol.com.cn/bizhi/9257_113237_2.html

# settings.py
# register the image pipeline
ITEM_PIPELINES = {
     'c_image.pipelines.ImagePipeline': 300,
}
# directory where the images are stored
IMAGES_STORE = 'd:/dat/img'
'''
spiders/zol.py
push the image data to the pipeline
'''
class ZolSpider(scrapy.Spider):
    name = 'zol'
    allowed_domains = ['zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/bizhi/9257_113237_2.html']

    def parse(self, response):
        # extract the image URL and name
        image_url = response.xpath('//img[@id="bigImg"]/@src').extract()
        image_name = response.xpath('string(//h3)').extract_first()
        # push to the pipeline
        yield {
            "image_urls": image_url,
            # a custom pipeline is needed to use this extra field
            "image_name": image_name
        }
        # extract the URL of the next wallpaper page
        next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
        if next_url.find('.html') != -1:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
'''
pipelines.py
save the images
'''

class CImagePipeline(object):
    def process_item(self, item, spider):
        return item


class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_name': item['image_name']})

    def file_path(self, request, response=None, info=None):
        # strip whitespace from the name and save it as .jpg
        file_name = request.meta['image_name'].strip().replace('/', '_').replace('\r\n\t\t', r'') + '.jpg'
        return file_name

6 Downloader middleware

Set the User-Agent, proxy, etc.

# settings.py
# define a list of User-Agents
USER_AGENTS = ['a', 'b', 'c']
...
# register the downloader middleware (a ProxyMiddleware sketch follows below)
DOWNLOADER_MIDDLEWARES = {
    'c_image.proxymiddleware.ProxyMiddleware': 343
}
from random import choice
from fake_useragent import UserAgent

from .settings import USER_AGENTS
'''
middlewares.py
The downloader middleware can set the User-Agent
Two approaches are shown: random choice from a list and fake_useragent
'''
class UserAgentDownloadMiddleware(object):
    def process_request(self, request, spider):
        # if self.user_agent:
        # pick a random UA from the list
        # request.headers.setdefault(b'User-Agent', choice(USER_AGENTS))
        request.headers.setdefault(b'User-Agent', UserAgent().random)
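
The DOWNLOADER_MIDDLEWARES entry above points at a ProxyMiddleware whose code is not shown; a minimal sketch, assuming a hypothetical PROXIES list in settings.py with entries like 'http://1.2.3.4:8080':

# proxymiddleware.py (sketch)
from random import choice

from .settings import PROXIES  # hypothetical list of proxy URLs

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = choice(PROXIES)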

7 Request | Response

FormRequest is used to submit forms; it is a subclass of Request

'''
spiders/login.py
Submitting a form - FormRequest
'''
class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['sxt.cn']
    start_urls = ['http://sxt.cn/']

    def start_requests(self):
        # how would a slider captcha be handled here?
        url = 'https://www'
        # form data to submit
        form_data = {
            "user": "",
            "password": ""
        }
        yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)

Callbacks can be chained for multi-step parsing

    def parse(self, response):
        print(response.text)
        yield scrapy.Request("", callback=self.parse_info)

    def parse_info(self, response):
        print(response.text)
Carrying cookies
'''
spiders/login2.py
carry cookies via a cookie string
'''
...
    def start_requests(self):
        cookie_str = "UM_distinctid=174a12e00ae82-0943674d9bf601-333769-144000-174a12e00af331; CNZZDATA1261969808=758276302-1600431365-%7C1600431365; PHPSESSID=jundabfrl22tpqpvjf0nk6o2p7; user=a%3A2%3A%7Bs%3A2%3A%22id%22%3Bi%3A5170%3Bs%3A5%3A%22phone%22%3Bs%3A11%3A%2215053710260%22%3B%7D"
        # convert the string to a dict
        cookies = {}
        for cookie in cookie_str.split(";"):
            key, value = cookie.split('=', 1)
            # strip whitespace
            cookies[key.strip()] = value.strip()
        yield scrapy.Request("http://sxt.cn/", cookies=cookies, callback=self.parse)
...
Exercise - logging in to Ganji (赶集网)

https://passport.ganji.com/login.php

'''
spiders/login3.py
Ganji login example
'''
class Login3Spider(scrapy.Spider):
    name = 'login3'
    allowed_domains = ['ganji.com']
    start_urls = ['https://passport.ganji.com/login.php']

    def parse(self, response):
        # extract the randomly generated hash_code from the page; it is needed in the login form
        hash_code = re.findall(r'"__hash__":"(.+)"', response.text)[0]
        # extract the captcha image URL
        img_url = response.xpath('//div[@class="form-div js-checkcode"]//img[@class="login-img-checkcode"]/@data-url').extract_first()
        # request the captcha URL and pass the hash_code along
        yield scrapy.Request(img_url, callback=self.parse_info, meta={'hash_code': hash_code})

    def parse_info(self, response):
        hash_code = response.meta['hash_code']
        # save the captcha image to a file
        with open('yzm.jpg', 'wb') as f:
            f.write(response.body)
        # type the captcha manually in the console
        code = input("Enter the captcha: ")
        # build the form data
        form_data = {
             "username": "15053710260",
             "password": "Ww458141",
             "setcookie": "0",
             "checkCode": code,
             "next": "/",
             "source": "passport",
             "__hash__": hash_code
        }
        login_url = 'https://passport.ganji.com/login.php'
        # request the login URL with the form data
        yield scrapy.FormRequest(login_url, callback=self.after_login, formdata=form_data)
    # post-login handling
    def after_login(self, response):
        # ({"site_id":1,"status":1,"user_id":757983285,"sscode":"iu+F3qykQgBhd443iukQlOVf","cookie_expire":1600434536,"site_list":{"5":{"cookie_domain":"ganji.com.cn","host":"http:\/\/sync.ganji.com.cn","login_url":"passport\/sync.php","logout_url":"passport\/sync.php","site_id":5}},"next":"http:\/\/www.ganji.com\/user\/login_success.php?username=15053710260&next=%2F"})
        print(response.text)

8 MongoDB | Scrapy integration

Overview

NoSQL database
High-concurrency reads and writes
In traditional relational databases, transactions, read-your-writes guarantees, and join queries limit performance
Collection-oriented; a collection can hold an unlimited number of documents and corresponds to a table
Use cases:
1. Caching | large, low-value data | high scalability
2. Object and JSON storage

Installation

Download:
https://www.mongodb.com/try/download/community
Reference:
https://www.runoob.com/mongodb/mongodb-window-install.html
Note: do not select MongoDB Compass during installation; it tends to hang

# config file mongod.cfg
------------
systemLog:
    destination: file
    path: d:\dat\mongo\mongod.log
storage:
    dbPath: d:\dat\mongo
------------

# install as a Windows service
C:\sof\MongoDB\Server\4.4\bin\mongod.exe --config "C:\sof\MongoDB\Server\4.4\mongod.cfg" --install
# start it (run cmd as administrator)
net start MongoDB
Install the GUI tool Robo 3T

Download:
https://robomongo.org/download
Just click through the installer

MongoDB basics

_id is set automatically: 12 hex-encoded bytes made of timestamp + machine id + process id + counter
Dynamic columns (schemaless)
Run mongo.exe

# list databases
show dbs
# create/switch to a database
use person
# a new db does not show up in show dbs until it has data; it only exists in memory
# create a collection
db.createCollection('student')
use person
# drop the database
db.dropDatabase()
# equivalent to show collections
show tables
# drop the collection
db.student.drop()


# insert raises an error on a duplicate _id; save replaces instead
db.student.insert({"name":"Vincent","age":"18"})
# save replaces an existing document

# insert multiple documents
db.student.save([{"name": "张三"}, {"name": "李四"}])

# the shell supports loops
for(i=0;i<5;i++){db.student.insert({"name":"张三", "age": i})}

# update (replaces the whole document)
db.student.update({"name":"张三"}, {"name": "张三", "age": 20})
# update a single field
db.student.update({"name":"张三"}, {$set:{age:30}})
# update multiple documents
db.student.update({"name":"张三"}, {$set:{age:30}}, {multi: true})

# remove
db.student.remove({name:"张三"})
# remove only one
db.student.remove({name:"张三"}, {justOne: true})
# remove everything
db.student.remove({})

# query
db.student.find()
# query with a condition
db.student.find({name:"Vincent"})
# return only one document
db.student.findOne({name:"Vincent"})
# greater-than condition
db.student.find({age:{$gt:18}})
# or condition
db.student.find({$or: [{name: "Vincent"}, {age: 18}]})
# in condition
db.student.find({name:{$in:["Vincent", null]}})

# fuzzy matching with a regex
db.student.find({name: /^Vin/})
# alternative regex syntax
db.student.find({name: {$regex:"^Vin"}})
# condition expressed as a function
db.student.find({$where:function(){return this.age == null}})

# limit the number of results
db.student.find().limit(2)
# skip
db.student.find().skip(1)

# sort: 1 ascending, -1 descending
db.student.find().sort({name:-1})
# count
db.student.find().count()

Python | MongoDB interaction

pip install pymongo

import pymongo

# connect to the database
client = pymongo.MongoClient()
# select the database
person = client.person
# select the collection
student = person.student
# work with the data
# result = student.find()
# print(result)

# for r in result:
#     print(r)
# print(result.next())

# filter the data
# result = student.find({"name":"Vincent"})
# for r in result:
#     print(r)

# sort
# result = student.find().sort("age", pymongo.ASCENDING)
# for r in result:
#     print(r)

# paginate
# result = student.find().limit(1).skip(1)
# for r in result:
#     print(r)

# count
# result = student.find().count()
# print(result)

# insert data
# data = {"name": "lisi", "age": 90}
# student.insert(data)

# data = {"name": "wangwu", "age": 80}
# student.insert_one(data)

# delete
# student.remove({"name":"wangwu"})

# update (Collection.update is deprecated in newer pymongo, see the sketch below)
data = {"name":"lisi"}
result = student.find_one(data)
result["country"] = "China"
student.update(data, {"$set": result})
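
Note that Collection.insert/remove/update and Cursor.count are deprecated in pymongo 3.x and removed in 4.0; a sketch of the same operations with the newer API:

# newer pymongo equivalents (sketch)
student.insert_one({"name": "wangwu", "age": 80})
student.delete_one({"name": "wangwu"})
student.update_one({"name": "lisi"}, {"$set": {"country": "China"}})
print(student.count_documents({}))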

Scrapy | MongoDB integration - Douban Top 250

import scrapy
# imported up front to make debugging easier (see section 9)
from scrapy.linkextractors import LinkExtractor

'''
spiders/douban.py
scrape the Douban Top 250 data
'''
class DoubanSpider(scrapy.Spider):
    name = 'douban'
    allowed_domains = ['douban.com']
    # build the list of page URLs
    start_urls = ['https://movie.douban.com/top250?start={}&filter='.
                      format(num*25) for num in range(10)]

    def parse(self, response):
        # extract the movie titles
        names = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
        # extract the movie ratings
        stars = response.xpath('//span[@class="rating_num"]/text()').extract()
        for name, star in zip(names, stars):
            yield {
                "name": name,
                "star": star
            }
import pymongo
'''
pipelines.py
store the movie data in MongoDB
'''
class EMongoPipeline(object):
    def open_spider(self, spider):
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        self.client.douban.movie.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()

9 Debugging

You can use scrapy shell to debug interactively in the console

scrapy shell https://movie.douban.com/top250

from scrapy.linkextractors import LinkExtractor
link = LinkExtractor(allow=r'/films/\d+')
link.extract_links(response)

LinkExtractor(allow=r'/subject/\d+').extract_links(response)

When debugging with PyCharm, import the class up front

# import first for debugging
from scrapy.linkextractors import LinkExtractor

10 Splash | Scrapy integration

Overview

Crawls dynamic pages by fetching the rendered result
Requires Docker
For Windows, the official site provides Docker Toolbox
Virtualization must be enabled
Scripts are written in Lua
Download:
https://get.daocloud.io/toolbox/
Copy the boot2docker.iso file to C:\Users\Vincent\.docker\machine

Issues
Issue 1: Kitematic hangs at 100%
Some users report that closing Chrome fixes it; a proxy is also worth trying
Issue 2: Docker Quickstart Terminal asks Windows to locate bash.exe
Right-click -> Properties -> point it at the Git bash.exe location
Environment
# pull the image
# pulling can be very slow
docker pull scrapinghub/splash
docker run -p 8050:8050 scrapinghub/splash

# configure the Aliyun registry mirror to speed up pulls
docker-machine ssh default 
sudo sed -i "s|EXTRA_ARGS='|EXTRA_ARGS='--registry-mirror=https://1vrs21kz.mirror.aliyuncs.com |g" /var/lib/boot2docker/profile 
exit 
docker-machine restart default
# access URL
http://192.168.99.100:8050/
# the service stays available after the terminal is closed; the VM keeps running
docker ps
docker kill
Python | Splash interaction

Simple example

'''
hello_splash.py
simple Python-Splash interaction
'''
import requests
from fake_useragent import UserAgent

splash_url = "http://192.168.99.100:8050/render.html?url={}&wait=3"
url = "https://www.guazi.com/bj/buy"
response = requests.get(splash_url.format(url), headers={"User-Agent": UserAgent().random})
response.encoding = 'utf-8'
print(response.text)

Using a Lua script

splash_url = "http://192.168.99.100:8050/execute?lua_source={}"
# lua = '''
# function main(splash, args)
#   return 'test'
# end
# '''
url = "https://www.guazi.com/bj/buy"
# mind the quoting
lua = '''
function main(splash, args)
  splash:go('{}')
  splash:wait(1)
  return splash:html()
end
'''.format(url)
response = requests.get(splash_url.format(lua), headers={"User-Agent": UserAgent().random})
response.encoding = 'utf-8'
print(response.text)
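
Formatting the Lua script straight into the URL leaves it un-encoded; letting requests build the query string is safer. A sketch against the same Splash instance:

# let requests handle the URL-encoding of lua_source
response = requests.get("http://192.168.99.100:8050/execute",
                        params={"lua_source": lua},
                        headers={"User-Agent": UserAgent().random})
print(response.text)
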
Combining Scrapy with Splash
pip install scrapy-splash
# settings.py
# point at the Splash instance
SPLASH_URL = 'http://192.168.99.100:8050/'

SPIDER_MIDDLEWARES = {
  'scrapy_splash.SplashDeduplicateArgsMiddleware': 100
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
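
The scrapy-splash README also registers its downloader middlewares; if SplashRequest does not render anything, these entries (values taken from the project README) are usually what is missing:

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
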
'''
spiders/guazi.py
simple interaction via SplashRequest
'''
class GuaziSpider(scrapy.Spider):
    name = 'guazi'
    allowed_domains = ['guazi.com']
    # start_urls = ['https://www.guazi.com/bj/buy']
    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        yield SplashRequest(url, callback=self.parse, args={'wait': 1})

    def parse(self, response):
        print(response.text)
'''
spiders/guazi2.py
interaction via a Lua script
'''
class GuaziSpider(scrapy.Spider):
    name = 'guazi2'
    allowed_domains = ['guazi.com']
    # start_urls = ['https://www.guazi.com/bj/buy']
    def start_requests(self):
        url = 'https://www.guazi.com/bj/buy'
        lua = '''
        function main(splash, args)
          assert(splash:go(args.url))
          assert(splash:wait(1))
          return {
            html = splash:html()
          }
        end
       '''
        # yield SplashRequest(url, callback=self.parse, args={'wait': 1})
        yield SplashRequest(url, callback=self.parse, endpoint='execute', args={'lua_source': lua})
    def parse(self, response):
        print(response.text)

11 Selenium | Scrapy integration

Configuration

# settings.py
# register the downloader middleware
DOWNLOADER_MIDDLEWARES = {
   'g_splash.middlewares.SeleniumMiddleware': 543,
}

Spider

# guazi3.py
import scrapy
from scrapy_splash import SplashRequest
from selenium import webdriver
from scrapy import signals


class GuaziSpider(scrapy.Spider):
    name = 'guazi3'
    allowed_domains = ['guazi.com']
    start_urls = ['https://www.guazi.com/bj/buy']
    def parse(self, response):
        print(response.text)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.chrome = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    # close the browser window when the spider finishes
    def spider_closed(self, spider):
        spider.chrome.quit()
        print("Spider finished!!!")

Downloader middleware

from selenium import webdriver
from scrapy import signals
# returning an HtmlResponse means the downloader will not fetch the page itself
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    # def __init__(self):
    #     self.chrome = webdriver.Chrome()

    def process_request(self, request, spider):
        url = request.url
        # chrome = webdriver.Chrome()
        # chrome.get(url)

        # avoid creating a new Chrome instance for every request
        # self.chrome.get(url)
        # html = self.chrome.page_source
        # print(html)
        spider.chrome.get(url)
        html = spider.chrome.page_source
        return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')

12 Exercise - 某家 (a housing site)

13 scrapy-redis

Third-party extension for distributed crawling
URLs are scheduled centrally and the data is stored centrally (see the settings sketch below)
Storing the data on the crawler machines themselves is not recommended
The scheduler writes URLs to Redis first
https://github.com/rmax/scrapy-redis
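
A minimal settings.py sketch based on the scrapy-redis README; the REDIS_URL host is a placeholder for the central Redis server:

# settings.py (sketch)
# use the redis-backed scheduler and dupefilter
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# keep the queues so a crawl can be paused and resumed
SCHEDULER_PERSIST = True
# store scraped items in redis as well
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}
# address of the central redis server (placeholder)
REDIS_URL = 'redis://192.168.1.100:6379'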

# prepare two Linux machines as worker servers
# upload the Python source tarball and extract it
# dependencies
yum install gcc -y
yum install zlib* openssl* -y
./configure --prefix=/usr/local/Python-3.6.10 --enable-optimizations
make install
cd ~
vi .bashrc
----------
PATH=$PATH:/usr/local/Python-3.6.10/bin
---------
. .bashrc
# test
python36
# install Twisted
# extract Twisted-17.1.0.tar.bz2
python3 setup.py install
pip3 install scrapy-redis
# Aliyun PyPI mirror for faster installs
# https://blog.csdn.net/qq_42744532/article/details/84955936
# scrapy-redis does not seem to be available on the Aliyun mirror


# start redis with an explicit config file, otherwise the Linux crawlers
# will fail with a "protected mode" error
------
# comment out: bind 127.0.0.1
# disable protected mode
------

# upload the spider file to the Linux machines, switch to the spiders directory, and start it
scrapy runspider qiubai.py

# on the Windows Redis instance, push the start URL
LPUSH qiubai:start_urls https://www.qiushibaike.com/text/
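
qiubai.py itself is not shown in these notes; a minimal sketch of what a scrapy-redis spider reading from that key could look like (the XPath is only illustrative):

# qiubai.py (sketch)
from scrapy_redis.spiders import RedisSpider


class QiubaiSpider(RedisSpider):
    name = 'qiubai'
    # the spider blocks until a URL is pushed to this redis key
    redis_key = 'qiubai:start_urls'

    def parse(self, response):
        # illustrative extraction; adjust the XPath to the real page structure
        for text in response.xpath('//div[@class="content"]/span/text()').extract():
            yield {'content': text.strip()}
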
Importing the Redis data into MongoDB (sketched after this block)
pip install redis
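
With RedisPipeline enabled, scraped items sit in Redis as JSON strings under the '<spider name>:items' list; a small sketch of draining that list into MongoDB (hosts are placeholders):

# redis_to_mongo.py (sketch)
import json

import pymongo
import redis

r = redis.Redis(host='127.0.0.1', port=6379)
client = pymongo.MongoClient()

while True:
    # blocking pop from the list written by scrapy_redis.pipelines.RedisPipeline
    _, data = r.blpop('qiubai:items')
    client.qiubai.items.insert_one(json.loads(data))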

Exercise - scraping Jin10 (金十) data

https://www.jin10.com/
