主要收获
- 首次使用正则表达式匹配进行爬取。
- 使用 yield 逐条返回数据。
- 利用 json 将数据写入文本文件。
import requests
import re
import json
# URLs of listing pages 1 through 13 of the text section, in crawl order.
urls = [
    "https://www.qiushibaike.com/text/page/{}/".format(page_number)
    for page_number in range(1, 14)
]
def get_one_page(url, timeout=10):
    """Download a page and return its body as text.

    The HTTP status code is not inspected: whatever body the server sends
    back is returned.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Defaults to 10.

    Returns:
        The response body as decoded by requests (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # unresponsive server would stall the whole crawl.
    response = requests.get(url, timeout=timeout)
    return response.text
# Three capture groups per entry: the text inside <h2>, the text inside the
# next bare <span>, and the run of digits closed by </i> after "number".
# Written as a raw string so that ``\d`` is passed to the regex engine
# verbatim instead of being an invalid string escape. re.S lets ``.`` span
# newlines, which the multi-line markup requires.
_ENTRY_PATTERN = re.compile(
    r'<h2>(.*?)</h2>.*?</div>.*?<span>(.*?)</span>.*?</div>.*?number.*?>(\d+)</i>',
    re.S,
)


def parse_one_page(html):
    """Extract entries from a page's HTML, one dict at a time.

    Args:
        html: Page markup as a string.

    Yields:
        A dict per regex match with the keys ``"title"`` (stripped <h2>
        text), ``"article"`` (stripped <span> text with ``<br/>`` turned
        into newlines) and ``"funny"`` (the matched digits, kept as a
        string). Nothing is yielded when the markup contains no match.
    """
    for title, article, funny in _ENTRY_PATTERN.findall(html):
        yield {
            "title": title.strip(),
            # Strip first, then convert line breaks, so a leading or
            # trailing <br/> still ends up as a newline in the output.
            "article": article.strip().replace('<br/>', '\n'),
            "funny": funny,
        }
def write_to_file(content, path='re_douban.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        content: JSON-serialisable object to store.
        path: File to append to. Defaults to ``re_douban.txt`` in the
            current working directory; the file is created if missing.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output rather
    # than escaping it as \uXXXX, which is why the file is opened as UTF-8.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
def main():
    """Crawl every URL in ``urls`` and store each parsed entry.

    Pages are handled one after another: each is downloaded, parsed, and
    its entries are handed to ``write_to_file`` as they are produced.
    """
    for page_url in urls:
        page_html = get_one_page(page_url)
        for record in parse_one_page(page_html):
            write_to_file(record)
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
网友评论