
Scraping the Douban Top 250 with Python, plus Slicing and Pie Charts

Author: C_Z_Q_ | Published 2019-10-30 18:56

1. Pie chart

from random import randint
from matplotlib import pyplot as plt
#x holds the value of each slice in the pie
x = [randint(3000,15000) for i in range(8)]
#label for each slice
labels=['员工{}'.format(i) for i in range(1,9)]
colors=['red','green','yellow','pink','orange','blue']  #only 6 colors for 8 slices; matplotlib cycles through the list
#offset of each slice from the center, between 0 and 1
explode = [0,0,0,0,0,0,0,0.2]
plt.pie(x=x,
        labels=labels,
        colors=colors,
        shadow=True,
        startangle=270,    #starting angle
        explode=explode,
        autopct='%1.1f%%'  #show each slice's percentage
        )
plt.axis('equal')    #keep the pie circular
plt.legend(loc=2)    #place the legend in the upper-left corner
plt.title('某大型公司员工工资占比')
plt.show()
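The title and labels above contain Chinese text, and with matplotlib's default font they often render as empty boxes. A minimal workaround sketch, assuming a CJK-capable font such as SimHei is installed on your system (the font name is an assumption; substitute whatever Chinese font you have):

from matplotlib import pyplot as plt
# point matplotlib at a CJK-capable font; 'SimHei' is assumed to be installed
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False   # keep the minus sign from rendering as a box

Put these two lines before the plotting calls.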

2. Slicing

  • A slice is written as three numbers separated by two colons: the first number is the start position (default 0), the second is the stop position, which is excluded (default: the length of the list), and the third is the step (default 1). When the step is omitted, the last colon can be omitted as well. A short sketch of the step argument follows the example below.
l1 = [{'name':'张三','grade':56},
      {'name':'李四','grade':90},
      {'name':'王五','grade':6},
      {'name': '赵六', 'grade': 33},
      {'name': '田七', 'grade': 67},
      {'name': '老八', 'grade': 12}
      ]
print(l1[:3])
#[{'name': '张三', 'grade': 56}, {'name': '李四', 'grade': 90}, {'name': '王五', 'grade': 6}]
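A minimal sketch of the step argument, reusing the same l1 list:

print(l1[::2])    # every other record: 张三, 王五, 田七
print(l1[::-1])   # the whole list in reverse order
print(l1[1:5:2])  # indices 1 and 3: 李四, 赵六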

3. Scraping the Douban Top 250

import os
import requests
from lxml import html
import pandas as pd
url = 'https://movie.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'}
respon = requests.get(url=url, headers=headers)
html_date = respon.text
#print(html_date)
selector = html.fromstring(html_date)
top_list = selector.xpath('//ol[@class="grid_view"]/li')
print('搜索到{}部电影'.format(len(top_list)))
movie_info_list = []
for movie in top_list:
    movie_number = movie.xpath('div[@class="item"]/div[@class="pic"]/em[@class=""]/text()')
    movie_number = '我是有底线的' if len(movie_number) ==0 else movie_number[0]

    movie_img = movie.xpath('div[@class="item"]/div[@class="pic"]/a/img/@src')
    movie_img = '对不起,没有查询到数据' if len(movie_img) == 0 else movie_img[0]

    movie_name = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a[@class=""]/span[@class="title"]/text()')
    movie_name = '对不起,没有查询到数据' if len(movie_name)==0 else movie_name[0]

    movie_link = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/@href')
    movie_link = '对不起,没有查询到数据' if len(movie_link) == 0 else movie_link[0]

    movie_conmunicate =movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class=""]/text()')
    movie_conmunicate = '对不起,没有查询到数据' if len(movie_conmunicate) == 0 else movie_conmunicate[0]

    movie_level = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
    movie_level = '对不起,没有查询到数据' if len(movie_level) == 0 else movie_level[0]

    movie_people = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')
    movie_people = '对不起,没有查询到数据' if len(movie_people) == 0 else movie_people[0]

    movie_lookdian = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()')
    movie_lookdian = '对不起,没有查询到数据' if len(movie_lookdian) == 0 else movie_lookdian[0]

    movie_info_list.append({
        'movie_number':movie_number,
        'movie_name':movie_name,
        'movie_link':movie_link,
        'movie_img':movie_img,
        'movie_conmunicate':movie_conmunicate,
        'movie_level':movie_level,
        'movie_people':movie_people,
        'movie_lookdian':movie_lookdian
    })
#print(movie_info_list)
os.makedirs('./top_img', exist_ok=True)   # make sure the image directory exists
for movie in movie_info_list:
    print(movie)
    with open('./top_img/{}.png'.format(movie['movie_name']), 'wb') as f:
        f.write(requests.get(movie['movie_img'], headers=headers).content)
pd.DataFrame(movie_info_list).to_csv('豆瓣top250.csv')   # to_csv() without a path only returns a string, so pass a filename
[Figure: the downloaded movie posters]

4. Scraping the Douban Top 250 (improved version)


import os
import requests
from lxml import html
import pandas as pd

def spider():
    sun = 0    # page counter
    movie_info_list = []
    for i in range(0, 250, 25):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'}
        respon = requests.get(url='https://movie.douban.com/top250?start={}&filter='.format(i), headers=headers)
        html_date = respon.text
        # print(html_date)
        sun += 1
        selector = html.fromstring(html_date)
        top_list = selector.xpath('//ol[@class="grid_view"]/li')
        # print('第{}页搜索到{}部电影'.format(sun, len(top_list)))
        for movie in top_list:
            movie_number = movie.xpath('div[@class="item"]/div[@class="pic"]/em[@class=""]/text()')
            movie_number = '我是有底线的' if len(movie_number) ==0 else movie_number[0]

            movie_img = movie.xpath('div[@class="item"]/div[@class="pic"]/a/img/@src')
            movie_img = '对不起,没有查询到数据' if len(movie_img) == 0 else movie_img[0]

            movie_name = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a[@class=""]/span[@class="title"]/text()')
            movie_name = '对不起,没有查询到数据' if len(movie_name)==0 else movie_name[0]

            movie_link = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/@href')
            movie_link = '对不起,没有查询到数据' if len(movie_link) == 0 else movie_link[0]

            movie_conmunicate =movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class=""]/text()')
            movie_conmunicate = '对不起,没有查询到数据' if len(movie_conmunicate) == 0 else movie_conmunicate[0]

            movie_level = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
            movie_level = '对不起,没有查询到数据' if len(movie_level) == 0 else movie_level[0]

            movie_people = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()')
            movie_people = '对不起,没有查询到数据' if len(movie_people) == 0 else movie_people[0]

            movie_lookdian = movie.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()')
            movie_lookdian = '对不起,没有查询到数据' if len(movie_lookdian) == 0 else movie_lookdian[0]

            movie_info_list.append({
                'movie_number': movie_number,
                'movie_name': movie_name,
                'movie_link': movie_link,
                'movie_img': movie_img,
                'movie_conmunicate': movie_conmunicate,
                'movie_level': movie_level,
                'movie_people': movie_people,
                'movie_lookdian': movie_lookdian
            })
    os.makedirs('./top_img', exist_ok=True)   # make sure the image directory exists
    for movie in movie_info_list:
        print(movie)
        with open('./top_img/{}.png'.format(movie['movie_name']), 'wb') as f:
            f.write(requests.get(movie['movie_img'], headers=headers).content)
    pd.DataFrame(movie_info_list).to_csv('豆瓣top250.csv')
    print('{}页共搜索到{}部电影'.format(sun, len(movie_info_list)))
spider()
[Figures: the downloaded movie posters and the exported 豆瓣top250.csv]
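Once the data is saved, the pie chart from section 1 can be reused on it. A minimal sketch, assuming the 豆瓣top250.csv produced above is in the working directory and the column names match the dictionary keys used in spider():

import pandas as pd
from matplotlib import pyplot as plt

df = pd.read_csv('豆瓣top250.csv')
# count how many movies fall on each rating value
counts = df['movie_level'].value_counts().sort_index()
plt.pie(x=counts.values,
        labels=counts.index,
        autopct='%1.1f%%')
plt.axis('equal')
plt.title('Douban Top 250 rating distribution')
plt.show()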
