美文网首页python爬虫学习
使用scrapy爬取当当网商品信息

使用scrapy爬取当当网商品信息

作者: tonyemail_st | 来源:发表于2017-10-24 10:40 被阅读42次

参考文档:https://doc.scrapy.org/en/latest/intro/tutorial.html

dd.py
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request

class DdSpider(scrapy.Spider):
    """Crawl Dangdang search-result pages and extract product data.

    Requests 80 pages of search results (the query string is a
    GBK-percent-encoded keyword) and yields one DangdangItem per page
    holding parallel lists of titles, links and review counts.
    """
    name = 'dd'
    # allowed_domains must contain bare domains, not URLs: with a URL here
    # the offsite middleware cannot match search.dangdang.com and filters
    # out every generated request.
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=%CD%E2%CC%D7%C5%AE&act=input&page_index=1']

    def start_requests(self):
        """Yield one request per result page, pages 1 through 80."""
        for i in range(1, 81):
            url = 'http://search.dangdang.com/?key=%CD%E2%CC%D7%C5%AE&act=input&page_index=' + str(i)
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract parallel title/link/comment lists from one result page.

        The three lists are index-aligned: entry i of each list describes
        the same product (title and href come from the same anchor).
        """
        item = DangdangItem()
        item["title"] = response.xpath('//a[@name="itemlist-picture"]/@title').extract()
        item['link'] = response.xpath('//a[@name="itemlist-picture"]/@href').extract()
        item['comment'] = response.xpath("//a[@dd_name='单品评论']/text()").extract()
        yield item

pipelines.py

# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DangdangPipeline(object):
    """Persist each scraped item's parallel lists into MySQL table `dd.goods`."""

    def process_item(self, item, spider):
        """Insert one row per product found in the item.

        The item carries parallel lists (title/link/comment); rows are
        inserted one at a time so a single bad row does not abort the rest.
        Returns the item unchanged so later pipelines still see it.
        """
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="dd", charset="utf8")
        try:
            with conn.cursor() as cursor:
                # Parameterized query: the previous string-concatenated SQL
                # was injectable and broke on titles containing quotes.
                sql = "insert into goods(title, link, comment) values(%s, %s, %s)"
                # zip stops at the shortest list, avoiding the IndexError the
                # old index-based loop raised when the lists were uneven.
                for title, link, comment in zip(item["title"], item["link"], item["comment"]):
                    print("正在处理:" + title)
                    try:
                        cursor.execute(sql, (title, link, comment))
                        conn.commit()
                    except Exception as err:
                        # Best-effort: log and continue with the next row.
                        print(err)
        finally:
            # Close the connection even if an insert raised unexpectedly;
            # the old code leaked the connection on any uncaught error.
            conn.close()
        return item

settings.py

# Enable the MySQL pipeline; 300 is its order among pipelines (0-1000,
# lower values run first). The dotted path must match pipelines.py.
ITEM_PIPELINES = {
   'dangdang.pipelines.DangdangPipeline': 300,
}

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DangdangItem(scrapy.Item):
    """One search-result page worth of product data.

    Every field holds a list; the three lists are index-aligned, so
    entry i of each describes the same product.
    """

    title = scrapy.Field()    # product titles from the picture anchors
    link = scrapy.Field()     # product detail-page URLs
    comment = scrapy.Field()  # review-count texts

相关文章

网友评论

    本文标题:使用scrapy爬取当当网商品信息

    本文链接:https://www.haomeiwen.com/subject/wfdtpxtx.html