1.饼状图
# Pie chart demo: each wedge is one employee's (random) salary share.
from random import randint
from matplotlib import pyplot as plt

# Wedge values: a random salary for each of the 8 employees.
salaries = [randint(3000, 15000) for _ in range(8)]
# One label per wedge.
labels = ['员工{}'.format(n) for n in range(1, 9)]
# Fewer colors than wedges is fine — matplotlib cycles through them.
colors = ['red', 'green', 'yellow', 'pink', 'orange', 'blue']
# Radial offset of each wedge from the centre (0..1); pop the last wedge out.
explode = [0, 0, 0, 0, 0, 0, 0, 0.2]

plt.pie(x=salaries,
        labels=labels,
        colors=colors,
        explode=explode,
        shadow=True,
        startangle=270,       # angle at which the first wedge starts
        autopct='%1.1f%%')    # print each wedge's percentage on it
plt.axis('equal')   # equal axes so the pie is a true circle
plt.legend(loc=2)   # legend in the upper-left corner
plt.title('某大型公司员工工资占比')
plt.show()
2.切片
- 使用2个冒号分隔的3个数字来完成,第一个数字表示切片的开始位置(默认为0),第二个数字表示切片截止(但不包含)的位置(默认为列表长度),第三个数字表示切片的步长(默认为1),当步长省略时可以省略最后一个冒号
# Slicing demo: l1[:3] keeps the first three records
# (start defaults to 0, step defaults to 1).
l1 = [
    {'name': name, 'grade': grade}
    for name, grade in [
        ('张三', 56),
        ('李四', 90),
        ('王五', 6),
        ('赵六', 33),
        ('田七', 67),
        ('老八', 12),
    ]
]
print(l1[:3])
# -> [{'name': '张三', 'grade': 56}, {'name': '李四', 'grade': 90}, {'name': '王五', 'grade': 6}]
3.爬取豆瓣top250
# Scrape page 1 of the Douban movie Top250: rank, cover image, title, link,
# credits, rating, vote count and tagline for each movie; save the covers to
# ./top_img/ and the collected records to a csv file.
import os

import requests
from lxml import html
import pandas as pd

url = 'https://movie.douban.com/top250'
# Desktop-browser User-Agent so Douban serves the normal page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'}
respon = requests.get(url=url, headers=headers)
html_date = respon.text
selector = html.fromstring(html_date)
# One <li> per movie on the page.
top_list = selector.xpath('//ol[@class="grid_view"]/li')
print('搜索到{}部电影'.format(len(top_list)))


def _first(node, xpath_expr, default='对不起,没有查询到数据'):
    """Return the first result of *xpath_expr* under *node*, or *default* when empty."""
    found = node.xpath(xpath_expr)
    return default if len(found) == 0 else found[0]


movie_info_list = []
for movie in top_list:
    movie_info_list.append({
        'movie_number': _first(movie, 'div[@class="item"]/div[@class="pic"]/em[@class=""]/text()',
                               default='我是有底线的'),
        'movie_name': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a[@class=""]/span[@class="title"]/text()'),
        'movie_link': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/@href'),
        'movie_img': _first(movie, 'div[@class="item"]/div[@class="pic"]/a/img/@src'),
        'movie_conmunicate': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class=""]/text()'),
        'movie_level': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'),
        'movie_people': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()'),
        'movie_lookdian': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'),
    })

# Bug fix: open() below fails with FileNotFoundError if the directory is missing.
os.makedirs('./top_img', exist_ok=True)
for movie in movie_info_list:
    print(movie)
    with open('./top_img/{}.png'.format(movie['movie_name']), 'wb') as f:
        f.write(requests.get(movie['movie_img'], headers=headers).content)
# Bug fix: to_csv() without a path only RETURNS the csv text (it was being
# discarded); write an actual file, matching the optimized version below.
pd.DataFrame(movie_info_list).to_csv('豆瓣top250.csv')

爬取到的图片
3.爬取豆瓣top250(优化版)
# Scrape all 10 pages of the Douban movie Top250 (optimized version):
# collect rank, cover, title, link, credits, rating, vote count and tagline,
# save the covers to ./top_img/ and the records to 豆瓣top250.csv.
import os

import requests
from lxml import html
import pandas as pd

# Desktop-browser User-Agent (hoisted: it is the same for every request).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'}


def _first(node, xpath_expr, default='对不起,没有查询到数据'):
    """Return the first result of *xpath_expr* under *node*, or *default* when empty."""
    found = node.xpath(xpath_expr)
    return default if len(found) == 0 else found[0]


def spider():
    """Crawl the 10 Top250 pages, download each cover and write the csv."""
    sun = 0  # pages fetched so far
    movie_info_list = []
    # Douban paginates 25 movies per page via the ?start= offset.
    for start in range(0, 250, 25):
        respon = requests.get(
            url='https://movie.douban.com/top250?start={}&filter='.format(start),
            headers=HEADERS)
        sun += 1
        selector = html.fromstring(respon.text)
        for movie in selector.xpath('//ol[@class="grid_view"]/li'):
            movie_info_list.append({
                'movie_number': _first(movie, 'div[@class="item"]/div[@class="pic"]/em[@class=""]/text()',
                                       default='我是有底线的'),
                'movie_name': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a[@class=""]/span[@class="title"]/text()'),
                'movie_link': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/@href'),
                'movie_img': _first(movie, 'div[@class="item"]/div[@class="pic"]/a/img/@src'),
                'movie_conmunicate': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class=""]/text()'),
                'movie_level': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'),
                'movie_people': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()'),
                'movie_lookdian': _first(movie, 'div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()'),
            })

    # Bug fix: open() below fails with FileNotFoundError if the dir is missing.
    os.makedirs('./top_img', exist_ok=True)
    for movie in movie_info_list:
        print(movie)
        with open('./top_img/{}.png'.format(movie['movie_name']), 'wb') as f:
            f.write(requests.get(movie['movie_img'], headers=HEADERS).content)
    pd.DataFrame(movie_info_list).to_csv('豆瓣top250.csv')
    # Bug fix: int(movie_number) reused a stale loop variable and raised
    # ValueError whenever the last rank fell back to the placeholder string;
    # the message means "how many movies were collected", so count the list.
    print('{}页共搜索到{}部电影'.format(sun, len(movie_info_list)))


spider()

爬取的图片

豆瓣top250.csv
网友评论