# 爬取糗事百科文字
import re
import requests
def data_capture(url):
    """Fetch one listing page and print the text of every post on it.

    The page is downloaded with a browser-like User-Agent, the inner text of
    the first ``<span>`` following each ``<div class="content">`` is extracted
    with a regular expression, ``<br/>`` tags are removed, and each post is
    printed with surrounding whitespace stripped.

    Args:
        url: Address of the page to download.

    Returns:
        None. The extracted posts are written to standard output.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {
        # Fixed: the original value started with "ozilla/5.0" (missing "M").
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    # re.S (re.DOTALL) lets "." match newlines, since posts span several lines.
    posts = re.findall(
        r'<div class="content">.*?<span>(.*?)</span>', html, re.S
    )
    for post in posts:
        # Drop inline line-break tags, then trim leading/trailing whitespace
        # (spaces and newlines) before printing.
        print(post.replace('<br/>', '').strip())
def spider(max_pages=1):
    """Crawl the text listing pages, printing the posts found on each one.

    Pages are numbered from 1 and requested in order through
    ``data_capture``.

    Args:
        max_pages: Number of consecutive pages to fetch, starting at page 1.
            The default of 1 matches the previous behaviour, where the loop
            over pages 1-5 was cut short by an unconditional ``break`` after
            the first page. Pass 5 to crawl the full range that loop named.

    Returns:
        None.
    """
    url_template = 'https://www.qiushibaike.com/text/page/{}/'
    for page in range(1, max_pages + 1):
        data_capture(url_template.format(page))
# Start the crawl. There is no `if __name__ == "__main__":` guard, so this
# also runs whenever the module is imported.
spider()
# (Stray page text, not part of the script: "网友评论" — "Reader comments".)