美文网首页python爬虫学习
使用scrapy爬取当当网商品信息

使用scrapy爬取当当网商品信息

作者: tonyemail_st | 来源:发表于2017-10-24 10:40 被阅读42次

参考文档:https://doc.scrapy.org/en/latest/intro/tutorial.html

dd.py
# -*- coding: utf-8 -*-
import scrapy
from dangdang.items import DangdangItem
from scrapy.http import Request

class DdSpider(scrapy.Spider):
    """Crawl Dangdang search-result pages and extract product data.

    Requests 80 pages of search results (the query string is a
    GBK-percent-encoded keyword) and yields one DangdangItem per page
    holding parallel lists of titles, links and review counts.
    """
    name = 'dd'
    # allowed_domains must contain bare domains, not URLs: with a URL here
    # the offsite middleware cannot match search.dangdang.com and filters
    # out every generated request.
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=%CD%E2%CC%D7%C5%AE&act=input&page_index=1']

    def start_requests(self):
        """Yield one request per result page, pages 1 through 80."""
        for i in range(1, 81):
            url = 'http://search.dangdang.com/?key=%CD%E2%CC%D7%C5%AE&act=input&page_index=' + str(i)
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract parallel title/link/comment lists from one result page.

        The three lists are index-aligned: entry i of each list describes
        the same product (title and href come from the same anchor).
        """
        item = DangdangItem()
        item["title"] = response.xpath('//a[@name="itemlist-picture"]/@title').extract()
        item['link'] = response.xpath('//a[@name="itemlist-picture"]/@href').extract()
        item['comment'] = response.xpath("//a[@dd_name='单品评论']/text()").extract()
        yield item

pipelines.py

# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class DangdangPipeline(object):
    """Persist each scraped item's parallel lists into MySQL table `dd.goods`."""

    def process_item(self, item, spider):
        """Insert one row per product found in the item.

        The item carries parallel lists (title/link/comment); rows are
        inserted one at a time so a single bad row does not abort the rest.
        Returns the item unchanged so later pipelines still see it.
        """
        conn = pymysql.connect(host="127.0.0.1", user="root", passwd="123456", db="dd", charset="utf8")
        try:
            with conn.cursor() as cursor:
                # Parameterized query: the previous string-concatenated SQL
                # was injectable and broke on titles containing quotes.
                sql = "insert into goods(title, link, comment) values(%s, %s, %s)"
                # zip stops at the shortest list, avoiding the IndexError the
                # old index-based loop raised when the lists were uneven.
                for title, link, comment in zip(item["title"], item["link"], item["comment"]):
                    print("正在处理:" + title)
                    try:
                        cursor.execute(sql, (title, link, comment))
                        conn.commit()
                    except Exception as err:
                        # Best-effort: log and continue with the next row.
                        print(err)
        finally:
            # Close the connection even if an insert raised unexpectedly;
            # the old code leaked the connection on any uncaught error.
            conn.close()
        return item

settings.py

# Enable the MySQL pipeline; 300 is its order among pipelines (0-1000,
# lower values run first). The dotted path must match pipelines.py.
ITEM_PIPELINES = {
   'dangdang.pipelines.DangdangPipeline': 300,
}

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DangdangItem(scrapy.Item):
    """One search-result page worth of product data.

    Every field holds a list; the three lists are index-aligned, so
    entry i of each describes the same product.
    """

    title = scrapy.Field()    # product titles from the picture anchors
    link = scrapy.Field()     # product detail-page URLs
    comment = scrapy.Field()  # review-count texts

相关文章

网友评论

    本文标题:使用scrapy爬取当当网商品信息

    本文链接:https://www.haomeiwen.com/subject/wfdtpxtx.html