import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pymongo
import time
import datetime as dt
import random
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
print('导入成功!','\n','-'*20)
def get_urls(n, uid=82366241):
    """Build the paginated video-list URLs for a Bilibili user space.

    Args:
        n: number of pages to generate (pages 1 through n).
        uid: Bilibili user (space) id; defaults to the id originally
            hard-coded in this script, so existing callers are unaffected.

    Returns:
        list[str]: one list-page URL per page number.
    """
    base = ('https://space.bilibili.com/%i/video'
            '?tid=0&page=%i&keyword=&order=pubdate')
    return [base % (uid, page) for page in range(1, n + 1)]
def get_data(ui, d_h, d_c1, d_c2, table):
    """Scrape one Bilibili video page plus its stat API and insert a record into MongoDB.

    Args:
        ui: video-page URL; must contain an ``av<digits>`` id.
        d_h: request headers dict (User-Agent).
        d_c1: cookies dict for the video-page request.
        d_c2: cookies dict for the stat-API request.
        table: pymongo collection the record is inserted into.

    Returns:
        int: number of fields in the inserted record.

    Raises:
        AttributeError: if any expected element/regex is missing from the page.
    """
    r1 = requests.get(url=ui, headers=d_h, cookies=d_c1)
    soup1 = BeautifulSoup(r1.text, 'lxml')
    title = soup1.h1['title']
    # Publication timestamp, e.g. "2020-01-02 03:04:05".
    # fix: renamed from `time` — it shadowed the imported `time` module.
    pub_time = re.search(r'(\d*-\d*-\d* \d*:\d*:\d*)',
                         soup1.find('div', class_="video-data").text).group(1)
    # Numeric av-id extracted from the URL.
    aid = re.search(r'av(\d*).*', ui).group(1)
    up = soup1.find('div', class_="name").a.text
    # Second request: JSON stat endpoint for view/danmaku/reply/... counts.
    r2 = requests.get(url='https://api.bilibili.com/x/web-interface/archive/stat?aid=%s' % aid,
                      headers=d_h, cookies=d_c2)
    soup2 = BeautifulSoup(r2.text, 'lxml')
    name = soup1.find('div', class_="info open").text
    # Slice the raw JSON text between "view" and "now_rank" into
    # '"key":value' fragments (last two fragments dropped).
    info = re.search(r'"view.*(\d*),"now_rank"', soup2.text).group().split(',')[:-2]
    date = str(dt.date.today())
    dic = {}
    dic['标题'] = title
    dic['介绍'] = name
    dic['aid'] = aid
    dic['上线时间'] = pub_time
    dic['up主'] = up
    dic['采集时间'] = date
    # Unpack each '"key":value' fragment into the record.
    for i in info:
        dic[i.split(':')[0].replace('"', '')] = i.split(':')[1]
    table.insert_one(dic)
    return len(dic)
if __name__ == "__main__":
    # --- Stage 1: collect video-page hrefs from the JS-rendered list pages ---
    urllst = get_urls(14)
    lilst = []
    brower = webdriver.Chrome()
    try:
        for u in urllst:
            brower.get(u)
            time.sleep(1)  # let the JS-rendered list finish loading
            # NOTE(review): find_element_by_* was removed in Selenium 4 —
            # confirm the pinned selenium version, or migrate to
            # find_element(By.CLASS_NAME, 'list-list').
            ul = brower.find_element_by_class_name('list-list')
            lis = ul.find_elements_by_tag_name('a')
            for li in lis:
                lilst.append(li.get_attribute('href'))
    finally:
        # fix: the Chrome process was never released (leak on any failure).
        brower.quit()
    lilst = list(set(lilst))  # de-duplicate hrefs gathered across pages

    # --- Stage 2: build request headers and cookie dicts ---
    dic_h = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/70.0.3521.2 Safari/537.36'}
    dic_c1 = {}
    cookies1 = '''xxx'''  # replace with real cookies
    for i in cookies1.split('; '):
        # fix: cookie VALUES may themselves contain '=' — split('=')[1]
        # would truncate them; partition keeps everything after the first '='.
        k, _, v = i.partition('=')
        dic_c1[k] = v
    dic_c2 = {}
    cookies2 = '''xxx'''  # replace with real cookies
    for i in cookies2.split('; '):
        k, _, v = i.partition('=')
        dic_c2[k] = v

    # --- Stage 3: scrape each video page and store records into MongoDB ---
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    db = myclient['看电影了没']
    datatable = db['视频信息' + str(dt.date.today())]
    starttime = time.time()  # presumably for an elapsed-time report — unused in this chunk
    errorlst = []
    datalst = []
    for ui in lilst:
        try:
            datalst.append(get_data(ui, dic_h, dic_c1, dic_c2, datatable))
            print('数据采集成功,总共采集%i条数据' % len(datalst))
        except Exception:  # fix: bare except also trapped KeyboardInterrupt/SystemExit
            errorlst.append(ui)
            print('数据采集失败,数据网址为:', ui)
        time.sleep(random.randint(1, 3))  # polite random delay between requests
# (stray page residue: "网友评论" / "reader comments" — leftover text from the
# blog page this script was copied from; commented out so the file parses)