# webdriver
#
# 作者: FDDDDDDD_ | 来源:发表于2018-06-22 13:14 被阅读0次
import requests
import os
import pymysql
import uuid
import re
from selenium import webdriver
from bs4 import BeautifulSoup


def getHeaders():
    """Build the HTTP headers sent with article page requests.

    Returns:
        dict: A newly created dictionary with a single ``User-Agent`` entry
        carrying an Internet Explorer style browser identification string.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)'
    return {'User-Agent': user_agent}

def fillList(infos, i, timeout=10):
    """Extract one news item from a feed element and store it in ``infos``.

    The function reads the headline text and link from the feed entry,
    downloads the linked article page and keeps all of its ``<p>...</p>``
    fragments (with ``<img>`` tags removed), then downloads the entry's
    thumbnail into an ``image`` directory located in the parent of the
    current working directory.

    Args:
        infos: Dict that receives the keys ``title``, ``url``, ``html`` and
            ``img``. It is modified in place.
        i: Selenium element for one feed entry. Only ``find_element`` is
            called on it.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The same ``infos`` dict. On success all four keys hold the values of
        this entry. On any failure the four keys are removed, so values from
        a previously processed entry can never be mistaken for this one.
    """
    fields = ('title', 'url', 'html', 'img')
    try:
        # 'xpath' is the value of selenium's By.XPATH. find_element(by, value)
        # exists in Selenium 3 and 4, while find_element_by_xpath was removed
        # in Selenium 4.3.
        link = i.find_element('xpath', './div[1]/div/div/a')
        record = {'title': link.text, 'url': link.get_attribute("href")}
        print('----', record['url'])

        # Fetch the second-level (article) page.
        resp_detail = requests.get(url=record['url'], headers=getHeaders(),
                                   timeout=timeout)
        resp_detail.raise_for_status()
        content_detail = resp_detail.content.decode('utf-8')
        paragraphs = re.findall(r'<p>.*</p>', content_detail)
        if not paragraphs:
            # Treated as a failure, as before, rather than storing empty HTML.
            raise ValueError('no <p> content found in %s' % record['url'])
        # Join every matched fragment (the original discarded the join result
        # and kept only the first match), then drop inline images.
        record['html'] = re.sub(r'<img.*?>', '', ''.join(paragraphs))

        # Fetch and save the thumbnail image.
        img_url = i.find_element('xpath', './div[2]/a/img').get_attribute("src")
        resp_img = requests.get(url=img_url, timeout=timeout)
        # Avoid writing an HTTP error page to disk as if it were a picture.
        resp_img.raise_for_status()
        parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        folder_path = os.path.join(parent_dir, 'image')
        os.makedirs(folder_path, exist_ok=True)
        # uuid1 gives each downloaded file a distinct name.
        img_name = str(uuid.uuid1()) + '.jpg'
        with open(os.path.join(folder_path, img_name), 'wb') as f:
            f.write(resp_img.content)
        record['img'] = img_name
    except Exception as e:
        # Best-effort scraping: report the cause and leave no partial or
        # stale fields behind for the caller to insert.
        for key in fields:
            infos.pop(key, None)
        print('提取信息失败', e)
    else:
        infos.update(record)
        print('提取信息成功')
    return infos


def printInfo(infos, inf, limit=7):
    """Store the first ``limit`` feed entries in the MySQL ``news`` table.

    Creates the ``news`` table when it does not exist yet, then for each
    entry calls ``fillList`` to extract its data and inserts one row with
    the title, image file name, article URL and article HTML.

    Args:
        infos: Dict reused as scratch storage for the current entry. Its
            ``title``, ``img``, ``url`` and ``html`` keys are cleared before
            each entry is processed.
        inf: Iterable of Selenium elements, one per feed entry.
        limit: Maximum number of entries to process; ``None`` means no limit.
    """
    fields = ('title', 'img', 'url', 'html')
    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider moving them to configuration.
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='root', db='jh_project01', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # "if not exists" keeps repeated runs from failing on this step.
            sqlc = '''
                    create table if not exists news(
                    id int primary key auto_increment,
                    title varchar(60),
                    img varchar(60),
                    url varchar(100),
                    html longtext)DEFAULT CHARSET=utf8;
                    '''
            try:
                cur.execute(sqlc)
                conn.commit()
                print("成功")
            except pymysql.MySQLError as e:
                print("错误", e)

            sqla = '''
                insert into news(title,img,url,html)
                values(%s,%s,%s,%s);
               '''
            for item, i in enumerate(inf):
                if limit is not None and item >= limit:
                    break
                # Drop the previous entry's values so a failed extraction
                # cannot re-insert them.
                for key in fields:
                    infos.pop(key, None)
                infos = fillList(infos, i)
                if any(key not in infos for key in fields):
                    # Extraction failed or was incomplete; skip this entry.
                    print("失败")
                    continue
                try:
                    cur.execute(sqla, tuple(infos[key] for key in fields))
                    conn.commit()
                    print("成功")
                except pymysql.MySQLError as e:
                    # Undo the failed statement and carry on with the rest.
                    conn.rollback()
                    print("失败", e)
        finally:
            cur.close()
    finally:
        conn.close()

def main():
    """Scrape the Toutiao tech channel feed and store the entries in MySQL.

    Opens the channel page in Chrome, scrolls down, collects the feed entry
    elements, and hands them to ``printInfo``. The browser is always shut
    down afterwards, even when scraping fails.
    """
    infos = {}
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.toutiao.com/ch/news_tech/')
        # Scroll down 500px; presumably this triggers loading of the feed
        # entries -- confirm against the live page.
        js = "var q=document.documentElement.scrollTop=500"
        driver.execute_script(js)
        # 'xpath' is the value of selenium's By.XPATH. find_elements(by, value)
        # exists in Selenium 3 and 4, while find_elements_by_xpath was removed
        # in Selenium 4.3.
        inf = driver.find_elements('xpath', '//div[@class="wcommonFeed"]/ul/li[@class="item    "]/div[@class="item-inner y-box"]')
        # Skip the first entry, as the original did; slicing does not raise
        # when nothing matched.
        # NOTE(review): the reason for skipping it is not visible here.
        inf = inf[1:]
        print(inf)
        printInfo(infos, inf)
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()

# Run the scraper only when this file is executed as a script, so that
# importing the module does not launch a browser.
if __name__ == "__main__":
    main()

# 相关文章
#
# 网友评论
#
#       本文标题:webdriver
#
#       本文链接:https://www.haomeiwen.com/subject/ezdkyftx.html