# -*- coding:utf-8 -*-
# @Time :
# @Author :gcy
# @Email :
# @File :
import re
import urllib2
import requests
import json
import socket
import time
import logging
import pymongo
import threading
from fake_useragent import UserAgent
from lxml import etree
# # Set up a log output file
# log_filename="logging.txt"
ua_list = UserAgent()
# Collect singer ids from the artist index page
def get_singer_id(url):
    """Crawl the artist index page at *url* and scrape each artist it links to.

    The page is downloaded with a random User-Agent, every ``href`` matched by
    ``//div//ul//a`` is inspected, and for each link containing
    ``/artist/<digits>`` the numeric id is passed to ``get_singer_img``.
    Each distinct id is processed once per call, even if the page links to
    the same artist several times.

    A failure while scraping one artist is reported and the crawl moves on to
    the next one; a failure to download the index page itself propagates to
    the caller.

    :param url: address of the artist listing page.
    """
    my_headers = {
        "User-Agent": ua_list.random
    }
    # Named ``request`` so the imported ``requests`` module is not shadowed.
    request = urllib2.Request(url, headers=my_headers)
    # Same timeout as the per-artist fetch, so a stalled server cannot hang
    # the crawl forever.
    html = urllib2.urlopen(request, timeout=60).read()
    content = etree.HTML(html)
    if content is None:
        # Defensive: nothing parseable came back, so there are no links.
        print("no parseable HTML returned for %s" % url)
        return

    seen_ids = set()
    for href in content.xpath('//div//ul//a/@href'):
        match = re.search(r'/artist/([0-9]+)', href)
        if match is None:
            # Not an artist link.
            continue
        singer_id = match.group(1)
        if singer_id in seen_ids:
            # Already scraped during this crawl; avoid a duplicate record.
            continue
        seen_ids.add(singer_id)
        try:
            get_singer_img(singer_id)
        except Exception as e:
            # Best effort: one bad artist page must not stop the crawl, but
            # the failure is reported instead of being silently dropped.
            print("failed to scrape singer %s: %r" % (singer_id, e))
def _xpath_text(content, path, default='null'):
    """Join the string results of XPath *path* on *content*.

    Returns *default* when the query yields no text at all.
    """
    text = ''.join(content.xpath(path))
    return text if text else default


def get_singer_img(singer_id):
    """Scrape one artist page and store its profile in MongoDB.

    Downloads ``http://music.baidu.com/artist/<singer_id>``, extracts the
    thumbnail URL, a derived large-image URL, name, popularity, region,
    birthday and encyclopedia link, inserts them as one document into the
    ``singerimginfo`` collection of the ``baidu`` database on
    ``localhost:27017``, and prints each stored value.

    Fields that cannot be found on the page are stored as the string
    ``'null'``. If the page cannot be downloaded the error is printed and
    nothing is stored.

    :param singer_id: numeric artist id, as a string.
    """
    my_headers = {
        "User-Agent": ua_list.random
    }
    url = "http://music.baidu.com/artist/" + singer_id
    try:
        # Named ``request`` so the imported ``requests`` module is not shadowed.
        request = urllib2.Request(url, headers=my_headers)
        html = urllib2.urlopen(request, timeout=60).read()
        content = etree.HTML(html)
    except (urllib2.URLError, socket.error) as e:
        # socket.error also covers socket.timeout raised while reading the
        # body, which is not a URLError.
        print(e)
        return
    if content is None:
        # Defensive: nothing parseable came back for this artist.
        return

    ting_uid = singer_id  # singer id

    # Thumbnail URL, and the large-image URL derived from it.
    imgurl1 = ''.join(content.xpath('//*[@id="baseInfo"]/div[1]/span/img/@src'))
    imgurl2 = 'null'
    if not imgurl1 or imgurl1 == '/static/images/default/artist_default_130.png':
        # No picture, or only the site's default placeholder.
        imgurl1 = 'null'
    else:
        # The large image is the thumbnail URL with everything after its last
        # comma replaced by ``w_480``. Without a comma there is nothing to
        # derive it from, so the large image stays 'null'.
        head, comma, _ = imgurl1.rpartition(',')
        if comma:
            imgurl2 = head + comma + 'w_480'

    # Singer name
    singer_name = _xpath_text(content, '//*[@id="baseInfo"]/div[2]/div/h2/text()')

    # Popularity: keep the text from the first digit to the end of the line.
    hot = ''.join(content.xpath('//*[@id="baseInfo"]/div[2]/div/div[1]/text()'))
    hot2 = ''.join(re.findall(r'[0-9].*', hot)) or "null"

    # Region
    address2 = _xpath_text(content, '//*[@id="baseInfo"]/div[2]/ul/li[1]/span[1]/text()')
    # Birthday
    birthday2 = _xpath_text(content, '//*[@id="baseInfo"]/div[2]/ul/li[1]/span[2]/text()')
    # Encyclopedia (baike) link
    singer_encyclopedia_url2 = _xpath_text(content, '//*[@id="baike_artist"]/@href')

    data = {
        "ting_uid": ting_uid,
        "head_image_small": imgurl1,
        "head_image_big": imgurl2,
        "singer_name": singer_name,
        "hot": hot2,
        "address": address2,
        "birthday": birthday2,
        "singer_encyclopedia_url": singer_encyclopedia_url2,
    }

    # Open the connection only once there is something to store, and always
    # release it: this function runs once per artist, so an unclosed client
    # would leak a connection on every call.
    client = pymongo.MongoClient('localhost', 27017)
    try:
        client['baidu']['singerimginfo'].insert_one(data)
    finally:
        client.close()

    for value in (ting_uid, imgurl1, imgurl2, singer_name, hot2,
                  address2, birthday2, singer_encyclopedia_url2):
        print(value)
if __name__ == '__main__':
    # Entry point: crawl the artist index in a worker thread.
    url = "https://music.baidu.com/artist"
    # Pass the callable and its argument separately. Writing
    # ``target=get_singer_id(url)`` would run the whole crawl right here in
    # the main thread and hand its return value (None) to Thread as the target.
    t2 = threading.Thread(target=get_singer_id, args=(url,))
    t2.start()
    # Wait for the crawl to finish rather than sleeping a fixed 4 seconds.
    t2.join()
# (Stray page text left over from the original post: "网友评论" / "User comments".)