Coroutines Are Not Just for Crawlers

Author: 大卫同學 | Published 2018-12-10 00:05

Tags: tornado, crawler
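
This post walks through two small Tornado coroutine demos: a crawler that spiders the Tornado documentation site with a queue of worker coroutines, and a benchmark that times 100 concurrent fetches against 100 sequential requests.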


from urllib.parse import urljoin

from bs4 import BeautifulSoup
from tornado import gen, httpclient, ioloop, queues

base_url = "http://www.tornadoweb.org/en/stable/"
concurrency = 20

async def get_url_links(url):
    """Fetch a page and return all of its links as absolute URLs."""
    response = await httpclient.AsyncHTTPClient().fetch(url)
    html = response.body.decode("utf8")
    soup = BeautifulSoup(html, "html.parser")
    links = [urljoin(base_url, a.get("href")) for a in soup.find_all("a", href=True)]
    return links

async def main():
    seen_set = set()
    q = queues.Queue()

    async def fetch_url(current_url):
        # Producer: fetch a page and enqueue every in-site link found on it.
        if current_url in seen_set:
            return

        print("fetching: {}".format(current_url))
        seen_set.add(current_url)
        next_urls = await get_url_links(current_url)
        for new_url in next_urls:
            if new_url.startswith(base_url):
                # Only follow links under base_url; enqueue them for the workers.
                await q.put(new_url)

    async def worker():
        """Consumer: pull URLs off the queue until a None sentinel arrives."""
        async for url in q:
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print("exception: {}".format(e))
            finally:
                q.task_done()

    # Seed the queue with the start URL
    await q.put(base_url)

    # Start the worker coroutines and block until the queue is drained
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join()

    # One None sentinel per worker lets each one exit its loop
    for _ in range(concurrency):
        await q.put(None)

    await workers


if __name__ == "__main__":
    import asyncio
    asyncio.get_event_loop().run_until_complete(main())
    # Equivalent on Tornado 5+, which runs on the asyncio event loop:
    # io_loop = ioloop.IOLoop.current()
    # io_loop.run_sync(main)
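
Both demos rely on the same shutdown pattern: q.join() blocks until every dequeued URL has been matched by a task_done() call, then one None sentinel per worker is enqueued so that each async for loop can return, and a final await workers lets gen.multi collect the finished coroutines. Below is a minimal sketch of just this pattern, stripped of the crawling logic (the names demo and handle are illustrative, not from the original code):

from tornado import gen, ioloop, queues

async def demo():
    q = queues.Queue()
    for i in range(5):
        await q.put(i)

    async def handle():
        async for item in q:
            if item is None:   # sentinel: tells this worker to exit
                return
            try:
                print("handled", item)
            finally:
                q.task_done()  # balance every dequeued work item

    workers = gen.multi([handle() for _ in range(2)])
    await q.join()             # returns once all five items are task_done()
    for _ in range(2):
        await q.put(None)      # one sentinel per worker
    await workers              # wait for both handle() coroutines to finish

if __name__ == "__main__":
    ioloop.IOLoop.current().run_sync(demo)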

Concurrent request demo

The second demo issues the same request 100 times, first sequentially with requests and then concurrently through Tornado's AsyncHTTPClient with 50 worker coroutines, printing the elapsed time of each run.

#!/usr/bin/env python
#-*- coding:utf-8 -*-
# author:zenwan
# datetime:2018/10/29 22:54
# file: PyCharm

import asyncio
import time

import requests
from tornado import gen, httpclient, queues
from tornado.ioloop import IOLoop

concurrency = 50     # number of worker coroutines
num = 100            # requests to make with each approach
base_url = 'http://mirrors.163.com/fedora/'
client = httpclient.AsyncHTTPClient()
res0, res1 = [], []  # results of the sequential / concurrent runs

async def main():
    q = queues.Queue()
    for i in range(num):
        # Queue.put is a coroutine and must be awaited
        await q.put(base_url)

    async def f():
        # Consumer: fetch URLs until a None sentinel arrives.
        async for url in q:
            if url is None:
                return
            try:
                res = await client.fetch(url)
                res1.append(res.body.decode("utf8").strip()[:10])
            except Exception as e:
                print(e)
            finally:
                # Must run even on failure, or q.join() would hang forever.
                q.task_done()

    workers = gen.multi([f() for _ in range(concurrency)])
    await q.join()
    for _ in range(concurrency):
        await q.put(None)
    await workers


if __name__ == "__main__":
    # io_loop = IOLoop.current()
    # # run_sync stops the event loop once the given coroutine finishes
    # io_loop.run_sync(main)
    from pprint import pprint

    # Sequential baseline: num blocking requests, one after another.
    time0 = time.time()
    for i in range(num):
        res = requests.get(base_url)
        res0.append(res.text.strip()[:10])
    time1 = time.time()
    # pprint(res0)
    print(time1 - time0)

    # Concurrent run: the same num requests via coroutines.
    asyncio.get_event_loop().run_until_complete(main())
    time2 = time.time()
    # pprint(res1)
    print(time2 - time1)
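
A caveat on the measurement: Tornado's default AsyncHTTPClient (SimpleAsyncHTTPClient) limits in-flight requests to max_clients=10, so the 50 worker coroutines above share at most 10 simultaneous connections, and the remaining fetches wait inside the client. If all 50 should really run in parallel, the client can be configured before its first instance is created; a sketch, assuming the default simple client:

from tornado import httpclient

# configure() must run before AsyncHTTPClient() is first instantiated,
# because instances are cached per IOLoop.
httpclient.AsyncHTTPClient.configure(None, max_clients=50)
client = httpclient.AsyncHTTPClient()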
