"""
@author:Rudy
@time : 12月4日
@message:爬取糗事百科热门上的段子用户昵称
"""
"""
注意事项:
1 知道url的规律,知道一共多少页,准备url列表,比如豆瓣电影
2 不知道url的规律,或者不知道一共多少页,准备satrt_url,比如贴吧
"""
import requests
from lxml import etree
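
# A minimal sketch of strategy 2 from the notes above: start from a start_url
# and keep following the "next page" link until none is found. The start URL,
# the domain, and the //a[@class='next'] selector are illustrative assumptions,
# not taken from a real page.
def crawl_from_start_url(start_url, headers):
    url = start_url
    while url:  # stop once the page has no "next page" link
        html = etree.HTML(requests.get(url, headers=headers).content.decode())
        # ... extract data from `html` here ...
        next_href = html.xpath("//a[@class='next']/@href")  # hypothetical selector
        url = ("https://example.com" + next_href[0]) if next_href else None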

class QiuBai:
    def __init__(self):
        self.temp_url = "https://www.qiushibaike.com/8hr/page{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"}

    def get_url_list(self):
        # the hot section has 13 pages, so the full URL list can be built up front
        return [self.temp_url.format(i) for i in range(1, 14)]

    def parse_url(self, url):
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):  # extract the data
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")  # one div per joke
        content_list = []
        for div in div_list:
            item = {}
            item["user_name"] = [i.strip() for i in div.xpath(".//h2/text()")]  # strip surrounding newlines
            item["content"] = [i.strip() for i in div.xpath(".//div[@class='content']/span/text()")]  # list comprehension
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        for content in content_list:
            print(content)
    def run(self):  # implement the main logic
        # 1. prepare the URL list
        url_list = self.get_url_list()
        # 2. send a request for each URL and get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save (here: just print)
            self.save_content_list(content_list)
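
# save_content_list above only prints each item; a minimal sketch of persisting
# the results instead, one JSON object per line. The file name "qiubai.jsonl" is
# an arbitrary choice for this example.
import json

def save_as_jsonl(content_list, path="qiubai.jsonl"):
    with open(path, "a", encoding="utf-8") as f:
        for content in content_list:
            # ensure_ascii=False keeps the Chinese text human-readable in the file
            f.write(json.dumps(content, ensure_ascii=False) + "\n")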

if __name__ == '__main__':
    qiubai = QiuBai()
    qiubai.run()