<!-- Sample gallery page: the markup that the selector examples in this file query. -->
<html>
<head>
<base href="http://example.com/">
<title>Example website</title>
</head>
<body>
<div id="images">
<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>
<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>
<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>
<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>
<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>
</div>
</body>
</html>
# Open an interactive Scrapy shell on the sample page (URL quoted so the command-line shell passes it through untouched).
scrapy shell "http://example.com/"
# `response` is the object the Scrapy shell provides for the fetched page;
# `r` is just a shorter alias for it.
r = response

# Read the href attribute of <base> (XPath first, then the CSS equivalent).
r.xpath('//base/@href').extract_first()
r.css('base::attr(href)').extract_first()

# Read the text of <title>.
r.xpath('//title/text()').extract_first()
r.css('title::text').extract_first()

# Walk down the gallery: the container div, its links, the link targets,
# and the thumbnail <img> elements.
r.xpath('//div[@id="images"]').extract()
r.xpath('//div[@id="images"]/a').extract()
r.xpath('//div[@id="images"]/a/@href').extract()
r.xpath('//div[@id="images"]/a/img').extract()
# XPath and CSS can be chained: XPath selects the <a> elements, CSS then
# pulls the src of each nested <img>.
r.xpath('//a').css('img::attr(src)').extract()

# Conditional extraction
# href values of every <a> whose href contains the given substring.
r.xpath('//a[contains(@href,"image")]/@href').extract()
r.xpath('//a[contains(@href,"i")]/@href').extract()
r.xpath('//a[contains(@href,"2")]/@href').extract()
# CSS form of the "image" query; .extract() makes it return strings like
# the XPath queries above instead of a list of selector objects.
r.css('a[href*=image]::attr(href)').extract()
# Thumbnail src values for links whose href contains the substring.
r.xpath('//a[contains(@href,"image")]/img/@src').extract()
r.xpath('//a[contains(@href,"image1")]/img/@src').extract()

# Regular expressions
# ':' is not a regex metacharacter, so it is written unescaped; '\:' inside
# a normal string literal is an invalid escape sequence.
r.css('a::text').re('Name:(.*)')
# default='' guarantees a string, so .strip() cannot fail on a page where
# nothing matches (re_first returns None by default in that case).
r.css('a::text').re_first('Name:(.*)', default='').strip()  # trim surrounding whitespace
网友评论