The layout of the project folder was shown above. Below, we walk through each of the six modules in detail.
1. URL Downloader
The URL downloader works in two steps: first it collects the URLs in the site's left-hand navigation bar, then it follows each navigation URL to gather the list of page links under every sub-column.
Below is the code that collects all the links in the left-hand navigation bar and generates the navigation file.
# -*- coding: utf-8 -*-
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
import os


class get_catalog(object):
    '''Create and work with the navigation (catalog) file.'''

    def save_catalog(self):
        '''Fetch the entries and URLs of Stockstar's left-hand navigation bar and save them.'''
        # fetch the page
        url = 'http://quote.stockstar.com'
        request = urllib.request.Request(url=url)
        response = urllib.request.urlopen(request)
        content = response.read().decode('gbk')
        # keep only the left-hand navigation block
        soup = BeautifulSoup(content, "lxml")
        soup = BeautifulSoup(str(soup.find_all('div', class_="subMenuBox")), "lxml")
        # initialise DataFrames for the first- and second-level sub-catalogs
        catalog1 = pd.DataFrame(columns=["cata1", "cata2", "url2"])
        catalog2 = pd.DataFrame(columns=["url2", "cata3", "url3"])
        # collect the catalog entries and their links
        index1 = 0
        index2 = 0
        for content1 in soup.find_all('div', class_=re.compile("list submenu?")):
            cata1 = re.findall('>(.*?)<', str(content1.h3.a))
            for content2 in content1.find_all('dl'):
                cata2 = re.findall('>(.*?)<', str(content2.dt.a).replace('\r\n', ''))
                url2 = url + content2.dt.a['href']
                catalog1.loc[index1] = {'cata1': cata1[0], 'cata2': cata2[0].split()[0], 'url2': url2}
                index1 += 1
                for content3 in content2.find_all('li'):
                    cata3 = re.findall('·(.*?)<', str(content3.a))
                    url3 = url + content3.a['href']
                    catalog2.loc[index2] = {'url2': url2, 'cata3': cata3[0], 'url3': url3}
                    index2 += 1
        # join the two catalog tables and save the result
        catalog = pd.merge(catalog1, catalog2, on='url2', how='left')
        catalog.to_csv('catalog.csv', encoding='gbk')  # write with the same encoding that load_catalog reads

    def load_catalog(self):
        '''Create the navigation file if it does not exist yet, then load it.'''
        if 'catalog.csv' not in os.listdir():
            self.save_catalog()
            print('网址导航文件已生成')
        else:
            print('网址导航文件已存在')
        catalog = pd.read_csv('catalog.csv', encoding='gbk', usecols=range(1, 6))
        print("网址导航文件已载入")
        return(catalog)

    def index_info(self, catalog, index):
        '''Build the row's table name (used as the database table name) and get the row's final URL.'''
        if str(catalog.loc[index]['cata3']) == 'nan':
            table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2']
            url = catalog.loc[index]['url2']
        else:
            # characters such as '+' and '()' are not allowed in table names, so replace or drop them
            if '+' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('+', '')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            elif '(' in catalog.loc[index]['cata3']:
                cata3 = catalog.loc[index]['cata3'].replace('(', '').replace(')', '')
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + cata3
            else:
                table_name = catalog.loc[index]['cata1'] + '_' + catalog.loc[index]['cata2'] + '_' + catalog.loc[index]['cata3']
            url = catalog.loc[index]['url3']
        return(table_name, url)
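As a rough usage sketch (assuming the module file is named get_catalog.py, as the later imports suggest), the navigation file can be built and queried like this:

from get_catalog import get_catalog

getcata = get_catalog()
catalog = getcata.load_catalog()                  # creates catalog.csv on the first run, then loads it
table_name, url = getcata.index_info(catalog, 0)  # database table name and page URL for the first row
print(table_name, url)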
Below is the code that collects every page link under each sub-column.
import pandas as pd
from selenium import webdriver
import time
import re
import math
from get_catalog import get_catalog


class get_urls(object):
    '''Collect the list of page links for one column.'''

    def __init__(self, browser, url):
        self.browser = browser  # selenium browser object
        self.url = url          # URL to crawl

    def get_browser(self):
        '''Open the URL in the browser, retrying up to five times.'''
        state = 0
        test = 0
        while state == 0 and test < 5:
            try:
                self.browser.get(self.url)
                state = 1
                print('成功连接 %s' % self.url)
            except:
                test += 1

    def get_element(self):
        '''Collect the link targets of the pagination buttons.'''
        self.get_browser()
        element_list = []
        for i in range(1, 8):
            try:
                element = self.browser.find_element_by_xpath('//*[@id="divPageControl1"]/a[%d]' % i).get_attribute('href')
                element_list.append(element)
            except:
                time.sleep(0.2)
        return(element_list)

    def get_urllist(self):
        '''Build the full list of page URLs from the pagination buttons.'''
        element_list = self.get_element()
        if len(element_list) <= 1:
            urls = [self.url]
        else:
            try:
                max_number = re.search('_(\d*)\.', element_list[len(element_list) - 3])
                begin = max_number.start() + 1
                end = max_number.end() - 1
                int_max_number = int(element_list[len(element_list) - 3][begin:end])
                urls = []
                for i in range(1, int_max_number + 1):
                    url = element_list[len(element_list) - 3][:begin] + str(i) + element_list[len(element_list) - 3][end:]
                    urls.append(url)
            except:
                urls = [self.url]
        return(urls)
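A minimal sketch of driving get_urls on its own, assuming a working chromedriver; the column URL below is only an illustrative placeholder, in practice it comes from the navigation file:

from selenium import webdriver
from get_urls import get_urls

browser = webdriver.Chrome()                                      # requires a matching chromedriver
column_url = 'http://quote.stockstar.com/stock/ranklist_a.shtml'  # example column URL (assumed)
urls = get_urls(browser, column_url).get_urllist()                # one URL per page of the column
print(len(urls), urls[:3])
browser.quit()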
2. URL Manager

The URL manager keeps two sets, one for URLs that still have to be crawled and one for URLs that have already been crawled, so the same page is never downloaded twice. Below is the URL manager code.

# coding:utf-8
class UrlManager(object):
    '''URL manager.'''

    def __init__(self):
        self.new_urls = set()  # URLs not yet crawled
        self.old_urls = set()  # URLs already crawled

    def has_new_url(self):
        '''Check whether there are URLs left to crawl.'''
        return(self.new_url_size() != 0)

    def get_new_url(self):
        '''Pop one uncrawled URL and mark it as crawled.'''
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return(new_url)

    def add_new_url(self, url):
        '''Add a single new URL to the uncrawled set.'''
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        '''Add a list of new URLs to the uncrawled set.'''
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        '''Size of the uncrawled URL set.'''
        return(len(self.new_urls))
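A small sketch of the intended call pattern (the URLs are hypothetical; the module path matches the firstSpider imports used later):

from firstSpider.UrlManager import UrlManager

manager = UrlManager()
manager.add_new_urls(['http://quote.stockstar.com/page_1.html',   # hypothetical URLs
                      'http://quote.stockstar.com/page_2.html',
                      'http://quote.stockstar.com/page_1.html'])  # duplicate, silently ignored
while manager.has_new_url():
    print(manager.get_new_url())                                  # prints each URL exactly once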
3. HTML Downloader
The HTML downloader fetches the pages themselves. Pay attention to the page encoding here, so that the downloaded content is not garbled.
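Stockstar's quote pages are GBK-encoded, which is why the downloader below decodes with 'gbk'. A minimal sketch of the idea (the UTF-8 fallback is an assumption of this sketch, not part of the downloader that follows):

import urllib.request

raw = urllib.request.urlopen('http://quote.stockstar.com').read()
try:
    content = raw.decode('gbk')                      # Stockstar serves GBK
except UnicodeDecodeError:
    content = raw.decode('utf-8', errors='replace')  # assumed fallback for other pages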
When fetching pages our IP may get blocked, so we first scrape a pool of proxy IPs for the HTML downloader to fall back on.
Below is the code that builds the proxy IP pool.
import urllib.request
import re
import time
import random
import socket
import threading


class proxy_ip(object):
    '''Fetch working proxy IPs and save them.'''

    def __init__(self, url, total_page):
        self.url = url                # target URL used to validate the proxies
        self.total_page = total_page  # number of proxy-list pages to walk through

    def get_proxys(self):
        '''Scrape candidate proxy IPs.'''
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)",
                      'Mozilla/5.0 (Windows NT 6.3; WOW64)',
                      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                      'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                      'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        ip_totle = []
        for page in range(1, self.total_page + 1):
            #url = 'http://www.httpsdaili.com/?page=' + str(page)
            #url = 'http://www.kuaidaili.com/free/inha/' + str(page) + '/'
            url = 'http://www.xicidaili.com/nn/' + str(page)  # xicidaili free proxy list
            headers = {"User-Agent": random.choice(user_agent)}
            try:
                request = urllib.request.Request(url=url, headers=headers)
                response = urllib.request.urlopen(request)
                content = response.read().decode('utf-8')
                print('get page', page)
                # grab the text between <td> and </td> that starts with a digit
                # (the tag delimiters were stripped by the original post's formatting and are assumed here)
                pattern = re.compile('<td>(\d.*?)</td>')
                ip_page = re.findall(pattern, str(content))
                ip_totle.extend(ip_page)
            except Exception as e:
                print(e)
            time.sleep(random.choice(range(1, 5)))
        # print what was scraped
        print('代理IP地址 ', '\t', '端口', '\t', '速度', '\t', '验证时间')
        for i in range(0, len(ip_totle), 4):
            print(ip_totle[i], ' ', '\t', ip_totle[i + 1], '\t', ip_totle[i + 2], '\t', ip_totle[i + 3])
        # reformat the proxies for urllib's ProxyHandler
        proxys = []
        for i in range(0, len(ip_totle), 4):
            proxy_host = ip_totle[i] + ':' + ip_totle[i + 1]
            proxy_temp = {"http": proxy_host}
            proxys.append(proxy_temp)
        return(proxys)

    def test(self, lock, proxys, i, f):
        '''Check whether one proxy IP actually works.'''
        socket.setdefaulttimeout(15)  # global socket timeout
        url = self.url
        try:
            proxy_support = urllib.request.ProxyHandler(proxys[i])
            opener = urllib.request.build_opener(proxy_support)
            opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
            urllib.request.install_opener(opener)
            #res = urllib.request.urlopen(url).read().decode('gbk')
            res = urllib.request.urlopen(url).read().decode('utf-8')
            print(res)
            lock.acquire()                    # acquire the lock
            print(proxys[i], 'is OK')
            f.write('%s\n' % str(proxys[i]))  # record this working proxy
            lock.release()                    # release the lock
        except Exception as e:
            lock.acquire()
            print(proxys[i], e)
            lock.release()

    def get_ip(self):
        '''Validate the proxies with multiple threads.'''
        f = open('proxy_ip.txt', 'a+')  # file that stores the working proxies
        lock = threading.Lock()         # lock shared by the worker threads
        # validate the proxies in parallel
        proxys = self.get_proxys()
        threads = []
        for i in range(len(proxys)):
            thread = threading.Thread(target=self.test, args=[lock, proxys, i, f])
            threads.append(thread)
            thread.start()
        # block the main thread until all workers finish
        for thread in threads:
            thread.join()
        f.close()  # close the file
Below is the code for the HTML downloader module.
# _*_ coding:utf-8 _*_
from firstSpider.get_proxy_ip import proxy_ip
import urllib.request
import random
import os
import socket
import time
import re


class HtmlDownloader(object):
    '''Download page content.'''

    def download(self, url):
        user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)",
                      'Mozilla/5.0 (Windows NT 6.3; WOW64)',
                      'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
                      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
                      'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
                      'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
                      'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
                      'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
                      'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
                      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
        state = 0
        test = 0
        socket.setdefaulttimeout(20)  # global socket timeout
        while state == 0 and test < 5:
            try:
                # pick a random User-Agent from the list
                request = urllib.request.Request(url=url, headers={"User-Agent": random.choice(user_agent)})
                response = urllib.request.urlopen(request)
                readhtml = response.read()
                content = readhtml.decode('gbk')  # decode the page content (the site is GBK-encoded)
                time.sleep(random.randrange(1, 6))
                if re.search('Auth Result', content) == None:
                    state = 1
            except Exception as e:
                print('系统IP获取网页失败', '', e)
                if 'proxy_ip.txt' not in os.listdir() or os.path.getsize('proxy_ip.txt') == 0:
                    print('代理IP池不存在,新建代理IP池')
                    pool = proxy_ip(url, 5)
                    pool.get_ip()
                    print('代理IP池创建完毕')
                else:
                    f = open('proxy_ip.txt', 'r')
                    proxys_ip = f.readlines()
                    f.close()
                    random.shuffle(proxys_ip)
                    for i in range(len(proxys_ip)):
                        try:
                            proxy_support = urllib.request.ProxyHandler(eval(proxys_ip[i][:-1]))
                            opener = urllib.request.build_opener(proxy_support)
                            opener.addheaders = [("User-Agent", random.choice(user_agent))]
                            urllib.request.install_opener(opener)
                            response = urllib.request.urlopen(url)
                            readhtml = response.read()
                            content = readhtml.decode('gbk')
                            time.sleep(random.randrange(1, 6))
                            if re.search('Auth Result', content) == None:  # skip responses that flag us as an invalid user
                                state = 1
                                print('成功接入代理IP', proxys_ip[i])
                                break
                        except urllib.error.HTTPError as e:  # specific errors first, otherwise the generic handler shadows them
                            print(proxys_ip[i], '请求失败', e.code)
                        except urllib.error.URLError as e:
                            print(proxys_ip[i], '请求失败', e.reason)
                        except Exception as e:
                            print(proxys_ip[i], '请求失败', e)
                    try:
                        if i == len(proxys_ip) - 1:
                            os.remove('proxy_ip.txt')
                            print('代理IP池失效,已删除')
                    except:  # proxys_ip was empty, so i was never assigned
                        os.remove('proxy_ip.txt')
                        print('代理IP池为空,文件已删除')
                time.sleep(60)
                test += 1
        if test == 5:
            print('未成功获取 %s 页面内容' % url)
            content = None
        return(content)
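Used on its own, the downloader returns the decoded page text, or None after five failed rounds. A sketch:

from firstSpider.HtmlDownloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download('http://quote.stockstar.com')  # falls back to the proxy pool if the local IP fails
print(html is None, len(html or ''))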
4. HTML Parser
The HTML parser takes the page content fetched by the HTML downloader and extracts the pieces we want.
The parsing in this article relies mainly on regular expressions and BeautifulSoup. Below is the HTML parser code.
# coding:utf-8
import re
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import numpy as np
import time
import datetime


class HtmlParser(object):
    '''Parse page content.'''

    def __init__(self, content):
        self.soup = BeautifulSoup(content, "lxml")  # content to parse

    def get_header(self):
        '''Get the table header.'''
        try:
            header = []
            for tag in self.soup.thead.find_all('td'):
                title = str(tag)
                title = title.replace(' ', '')
                title = title.replace('\n', '')
                header.extend(re.findall('>(.*?)<', title))
            header_name = []
            for data in header:
                if data != '':
                    header_name.append(data.strip())
            header_name.append('数据时间')
        except:
            # no header: return an empty list, which also marks the page as unusable
            header_name = []
            return(header_name)
        h2_len = len(self.soup.thead.find_all('td', class_="h2"))
        datalist_len = len(self.soup.find_all('tbody', id="datalist") + self.soup.find_all('tbody', id="datalist1") + self.soup.find_all('tbody', id="datalist2"))
        if h2_len >= 6 or datalist_len == 0:  # drop pages with inconsistent headers or with no data
            header_name = []
        return(header_name)

    def get_header2(self):
        '''Get the table header when it has two levels.'''
        stati_date = []
        for date in self.soup.thead.find_all('td', class_="double align_center"):
            stati_date.extend(re.findall('>(.*?)<', str(date)))
        header_total = self.get_header()
        header_name = header_total[:-5]
        header_name = header_name[:2] + header_total[-5:-1] + header_name[2:]
        if stati_date[0] in header_name:
            header_name.remove(stati_date[0])
        if stati_date[1] in header_name:
            header_name.remove(stati_date[1])
        header_name.append('三四列统计时间')
        header_name.append('五六列统计时间')
        header_name.append('数据时间')
        return(header_name, stati_date)

    def get_datatime(self):
        '''Get the data date.'''
        try:
            date = re.findall('数据时间:(.*?)<', str(self.soup.find_all('span', class_="fl")))[0][0:10]
        except:
            # if the page does not show it, infer the last trading day from the system date
            now_time = time.localtime()
            if time.strftime("%w", now_time) in ['1', '2', '3', '4', '5']:
                date = time.strftime("%Y-%m-%d", now_time)
            elif time.strftime("%w", now_time) == '6':
                dt = (datetime.datetime.now() - datetime.timedelta(days=1))
                date = dt.strftime("%Y-%m-%d")
            else:
                dt = (datetime.datetime.now() - datetime.timedelta(days=2))
                date = dt.strftime("%Y-%m-%d")
        return(date)

    def get_datalist(self):
        '''Get the table data.'''
        if len(self.soup.find_all('tbody', id="datalist")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist")[0]), "lxml")
        elif len(self.soup.find_all('tbody', id="datalist1")) >= 1:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist1")[0]), "lxml")
        else:
            soup = BeautifulSoup(str(self.soup.find_all('tbody', id="datalist2")[0]), "lxml")
        date = self.get_datatime()
        row = len(soup.tbody.find_all('tr'))
        # initialise the array for the single-level and double-level header cases
        if len(self.soup.thead.find_all('td', class_="double align_center")) == 0:
            header_name = self.get_header()
            col = len(header_name)
            datalist = np.array([''] * (row * col), dtype='U24').reshape(row, col)
            flag = 1
        else:
            header_name = self.get_header2()[0]
            col = len(header_name)
            datalist = np.array([''] * (row * col), dtype='U24').reshape(row, col)
            flag = 2
        for i in range(row):
            # extract one row and write it into the array
            detail = re.findall('>(.*?)<', str(soup.find_all('tr')[i]))
            for blank in range(detail.count('')):
                detail.remove("")
            try:
                if flag == 1:
                    detail.append(date)
                    datalist[i] = detail
                elif flag == 2:
                    stati_date = self.get_header2()[1]
                    detail.append(stati_date[0])
                    detail.append(stati_date[1])
                    detail.append(date)
                    datalist[i] = detail
            except:
                datalist[i][0] = detail[0]
                datalist[i][col - 1] = date
        return(datalist, header_name)

    def get_dataframe(self):
        '''Combine the header and the data into a DataFrame.'''
        datalist, header_name = self.get_datalist()
        table = pd.DataFrame(datalist, columns=header_name)
        return(table)
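Chained with the downloader, the parser turns a page into a pandas DataFrame, roughly like this (a sketch; the URL is only an example, and the page must contain one of the datalist tables):

from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser

html = HtmlDownloader().download('http://quote.stockstar.com/stock/ranklist_a.shtml')  # example URL (assumed)
parser = HtmlParser(html)
if len(parser.get_header()) > 0:  # an empty header means the page has no usable table
    print(parser.get_datatime())
    print(parser.get_dataframe().head())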
6. Spider Scheduler
The spider scheduler ties the modules above together so that each does its own job and the whole task is completed efficiently.
The scheduler uses a thread pool (the threadpool package) to speed up execution. Below is the scheduler module's code.
from firstSpider.UrlManager import UrlManager
from firstSpider.HtmlDownloader import HtmlDownloader
from firstSpider.HtmlParser import HtmlParser
from firstSpider.DataOutput import DataOutput
from sqlalchemy import create_engine
import threadpool, time


class SpiderMan(object):
    '''Spider robot.'''

    def __init__(self, engine, table_name):
        self.engine = engine                # database engine
        self.table_name = table_name        # table name
        self.manager = UrlManager()         # URL manager
        self.downloader = HtmlDownloader()  # HTML downloader

    def spider(self, url):
        '''Crawl a single page.'''
        # download the page with the HTML downloader
        html = self.downloader.download(url)
        f = open('stock.txt', 'w')
        f.write(html)
        f.close()
        # extract the data with the HTML parser
        parser = HtmlParser(html)
        if len(parser.get_header()) > 0:
            data = parser.get_dataframe()
            # store the data with the data output module
            out = DataOutput(self.engine, data, self.table_name)
            out.output()
            print('%s 的数据已存入表 %s' % (url, self.table_name))
        time.sleep(1)
        return(parser.get_datatime())

    def crawl(self, urls):
        '''Crawl every page in one column's URL list.'''
        self.manager.add_new_urls(urls)
        pool = threadpool.ThreadPool(10)
        # keep going while the URL manager still has uncrawled URLs
        while(self.manager.has_new_url()):
            # take one new URL from the manager
            new_url = self.manager.get_new_url()
            requests = threadpool.makeRequests(self.spider, (new_url,))
            pool.putRequest(requests[0])
        pool.wait()
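Putting it together for a single column, a sketch of the call pattern (the connection string, table name, and URL below are placeholders):

from firstSpider.SpiderMan import SpiderMan
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:password@localhost:3306/stockdb?charset=utf8')  # placeholder credentials
urls = ['http://quote.stockstar.com/stock/ranklist_a.shtml']  # page list normally produced by get_urls (example)
spider_man = SpiderMan(engine, 'demo_table')
spider_man.crawl(urls)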
Create a .py file for each of the modules above, put them in the firstSpider folder, and run the following main program to fetch stock data from the entire Stockstar site.
from firstSpider.get_proxy_ip import proxy_ip
from firstSpider.get_catalog import get_catalog
from firstSpider.get_urls import get_urls
from firstSpider.SpiderMan import SpiderMan
from selenium import webdriver
from sqlalchemy import create_engine
import time

'''Download all of Stockstar's data for the day, following the left-hand sub-navigation.'''
if __name__ == "__main__":
    print('获取代理IP并验证有效性')
    ip_pool = proxy_ip('http://quote.stockstar.com', 8)
    ip_pool.get_ip()
    print('代理IP池建立完毕')
    getcata = get_catalog()
    catalog = getcata.load_catalog()
    start = 0
    end = len(catalog)
    catalog = catalog[start:end]
    print('初始化浏览器')
    browser = webdriver.Chrome()
    engine = create_engine('mysql+pymysql://root:Jwd116875@localhost:3306/scott?charset=utf8')
    for index in range(start, end):
        table_name, url = getcata.index_info(catalog, index)
        stop_url = ['http://quote.stockstar.com/gold/globalcurrency.shtml']  # page links to skip
        if url not in stop_url:
            geturls = get_urls(browser, url)
            urls = geturls.get_urllist()
            print('已获取 %s 的链接列表' % table_name)
            Spider_man = SpiderMan(engine, table_name)
            Spider_man.crawl(urls)
            datatime = Spider_man.spider(urls[0])
            print('%s: %s 栏目 %s 的增量数据爬取完毕' % (index, table_name, datatime))
Small as it is, this crawler has all the essential parts: a simple framework that crawls the whole site. There is still plenty of room to improve execution speed and request disguising, and I hope to keep learning and improving together with readers.