2018-07-16-北京新发地市场爬虫

作者: 加勒比海带_4bbc | 来源:发表于2018-07-16 00:06 被阅读0次

2018-07-16-北京新发地市场爬虫
北京
2020-06-16
北京新发地
北京新发地
2020-06-14
北京的疫情很严重
北京新发地市场
2020-6-13 流水账日常
#挪威三文鱼“养殖罪恶”#

coding=utf-8

import requests
from bs4 import BeautifulSoup
import re
import os
import pymongo
import json
import pandas as pd
import numpy as np
import xlrd
import datetime
from pyecharts import Line,Grid,EffectScatter,Overlap

def getPriceSoup_table():
soup_table=BeautifulSoup('',"lxml")
for i in range(500):
url = "http://www.xinfadi.com.cn/marketanalysis/4/list/"+str(i)+".shtml"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0)' +
'Gecko/20100101 Firefox/61.0'}
r = requests.get(url,headers=headers)
html = r.text.encode(encoding='utf_8').decode()
soup = BeautifulSoup(html,"lxml")
table = soup.find('table', attrs={'class' :{'hq_table'}})
soup_table.append(table)
return soup_table

def soup_table2DataFrame(table): #构造dataframe 准备存储表格
#收集表头
columns = [x.text for x in table.tr.findAll('td')]
columns = [x.replace('\xa0','') for x in columns]
print(columns)
rows=[]
for row in table.findAll('tr'):
if row not in rows and row.find('td')!=None:
rows.append(row)
print(rows)
height = len(rows)-1
width = len(columns)
#逐行分析表格
df = pd.DataFrame(data = np.full((height,width),'',dtype = 'U'),columns = columns)
#逐行分析表格
for i in range(height):
cells = rows[i+1].findAll('td')
if len(cells) == width:
df.iloc[i] = [cell.text.replace(' ','').replace('\n','') for cell in cells] #去点空格和换行
else:
w=len(cells)
df.iloc[i,width-w:] = [cell.text.replace(' ','').replace('\n','') for cell in cells]
return df

def getPriceLine(product_name,specification,days):
price_data = []
price_date = []
for i in range(days):
spiderDay = datetime.date.today()- datetime.timedelta(days=i+1)
spiderDayStr =str(spiderDay) #2018-07-11格式
sendtimeStr=re.sub("-","",spiderDayStr) #20180711格式
#outputfilePath="D:/xlsx/"+sendtimeStr+".水产品价格xlsx"
if os.path.exists(outputfilePath):
ExcelFile = xlrd.open_workbook(outputfilePath)
sheet = ExcelFile.sheet_by_index(0)
columnIndex = None
rowIndex = None
for j in range(sheet.ncols):
for i in range(sheet.nrows):
if sheet.cell_value(i, j) == '平均价':
columnIndex = j
break
if sheet.cell_value(i, j) == product_name:
if sheet.cell_value(i, j+6) == spiderDayStr:
if sheet.cell_value(i, j+4) == specification:
rowIndex = i
break
print('-------------columnIndex----------------'+str(columnIndex))
print('-------------rowIndex----------------'+str(rowIndex))
if not (rowIndex == None) and not (columnIndex == None):
print(sheet.cell_value(rowIndex, columnIndex))
price_data.append(sheet.cell_value(rowIndex, columnIndex))
price_date.append(sendtimeStr)
print(price_data)
print(price_date)
attr = price_date[::-1]
v1 = price_data[::-1]
line = Line(product_name+str(days)+'天'+'价格走势图')
line.add(product_name,attr,v1,is_smooth = False,mark_point = ['max','min'],mark_line=["average"],yaxis_formatter="元/斤")
grid =Grid()
grid.add(line,grid_top="10%")

判断是否存在目标文件夹

isExists=os.path.exists('D:/价格走势图')
if not isExists:

如果不存在则创建目录

    os.makedirs('D:/价格走势图') 

    print ('创建D:/价格走势图文件夹成功')
#return True
else:
    # 如果目录存在则不创建，并提示目录已存在

print ('目录已存在')

    grid.render('D:/价格走势图/北京新发地市场'+product_name+str(days)+'天价格走势图.html')
    print('已得到'+str(days)+'天'+product_name+'价格走势图')

es = EffectScatter()

es.add('',attr,v1,effect_scale=8) #闪烁

overlop = Overlap()

overlop.add(line) #必须先添加line,在添加es

overlop.add(es)

overlop.render('./line-es01.html')

return  
    
#client= pymongo.MongoClient()
#获取一个数据库   
#db=client.priceSpider
#创建 或获取一个集合，并在collection下新建books
#account=db.prcie
#data=xlrd.open_workbook("D:/"+sendtimeStr+".xlsx")
#table=data.sheets()[0]
#读取excel第一行数据作为存入mongodb的字段名
#rowstag=table.row_values(0)
#nrows=table.nrows
#print('-------------nrows----------------'+str(nrows))
#ncols=table.ncols #print rows
#returnData={}
#for i in range(1,nrows):
    #将字段名和excel数据存储为字典形式，并转换为json格式
    #returnData[i]=json.dumps(dict(zip(rowstag,table.row_values(i))))
    #通过编解码还原数据
    #returnData[i]=json.loads(returnData[i])
    #print returnData[i]
    #account.insert(returnData[i])
    #return daySpider

if name=="main":
table = getPriceSoup_table()
print(table)
print('---------------------------------------------')
if not table == None:
df = soup_table2DataFrame(table)
isExists=os.path.exists('D:/xlsx')
if not isExists:
# 如果不存在则创建目录
os.makedirs('D:/xlsx')
print ('创建D:/xlsx文件夹成功')
#return True
else:
outputfilePath="D:/xlsx/北京新发地市场水产品每日价格行情.xlsx"
df.to_excel(outputfilePath)
print('-------------------------------')

outputfilePath="D:/xlsx/北京新发地市场水产品每日价格行情.xlsx"
getPriceLine('桂鱼','标桂',120)

网友评论

本文标题：2018-07-16-北京新发地市场爬虫

本文链接：https://www.haomeiwen.com/subject/ervmpftx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

2018-07-16-北京新发地市场爬虫

coding=utf-8

判断是否存在目标文件夹

如果不存在则创建目录

print ('目录已存在')

es = EffectScatter()

es.add('',attr,v1,effect_scale=8) #闪烁

overlop = Overlap()

overlop.add(line) #必须先添加line,在添加es

overlop.add(es)

overlop.render('./line-es01.html')

相关文章

2018-07-16-北京新发地市场爬虫

北京

2020-06-16

北京新发地

北京新发地

2020-06-14

北京的疫情很严重

北京新发地市场

2020-6-13 流水账日常

#挪威三文鱼“养殖罪恶”#

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读