爬虫解析库XPath使用
1.XPath简介
XPath 是一门在 XML 文档中查找信息的语言。XPath 用于在 XML 文档中通过元素和属性进行导航。XPath 使用路径表达式来选取 XML 文档中的节点或者节点集。这些路径表达式和我们在常规的电脑文件系统中看到的表达式非常相似。XPath 含有超过 100 个内建的函数。这些函数用于字符串值、数值、日期和时间比较、节点和 QName 处理、序列处理、逻辑值等等。XPath 是 XSLT 标准中的主要元素。如果没有 XPath 方面的知识,您就无法创建 XSLT 文档。XPath 于 1999 年 11 月 16 日 成为 W3C 标准。XPath 被设计为供 XSLT、XPointer 以及其他 XML 解析软件使用。
2.XPath术语
节点(Node)在 XPath 中,有七种类型的节点:元素、属性、文本、命名空间、处理指令、注释以及文档(根)节点。XML 文档是被作为节点树来对待的。树的根被称为文档节点或者根节点。
节点关系:父(Parent)、子(Children)、同胞(Sibling)、先辈(Ancestor)、后代(Descendant)共五种。
3.XPath常用规则
| 表达式 | 描述 |
|---|---|
| nodename | 选取此节点的所有子节点 |
| / | 从当前节点选取直接子节点 |
| // | 从当前节点选取子孙节点 |
| . | 选取当前节点 |
| .. | 选取当前节点的父节点 |
| @ | 选取属性 |

| 通配符 | 描述 |
|---|---|
| * | 匹配任何元素节点 |
| @* | 匹配任何属性节点 |
| node() | 匹配任何类型的节点 |
4.示例
from lxml import etree #导入lxml库的etree模块
# Sample markup used by the example; note that the last <li> is not closed.
text = '''<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
# etree.HTML() parses the string and returns an element that supports XPath queries.
html = etree.HTML(text)
# tostring() serialises the (auto-corrected) tree and returns bytes,
# so the value is converted to str before it is printed.
result = etree.tostring(html)
print(str(result, 'utf-8'))
<html><body><div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>
</body></html>
另外,也可以直接读取文本文件进行解析:
from lxml import etree
# test.html contains the same markup as the example above; parse() reads the
# file and builds the tree with an HTML parser.
parser = etree.HTMLParser()
html = etree.parse('./test.html', parser)
result = etree.tostring(html)
print(str(result, 'utf-8'))
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body><div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>
</body></html>
from lxml import etree
# Parse ./test.html with an HTML parser and keep the resulting tree in `html`.
html_parser = etree.HTMLParser()
html = etree.parse('./test.html', html_parser)
匹配所有节点
# '//*' selects every element node in the document.
all_elements = '//*'
result = html.xpath(all_elements)
print(result)
[<Element html at 0x11185a4b0>, <Element body at 0x11185ab40>, <Element div at 0x11185abe0>, <Element ul at 0x11185adc0>, <Element li at 0x11185ae10>, <Element a at 0x11185af50>, <Element li at 0x11185ac30>, <Element a at 0x112344780>, <Element li at 0x112344820>, <Element a at 0x112344280>, <Element li at 0x112344370>, <Element a at 0x1123440a0>, <Element li at 0x1123440f0>, <Element a at 0x112344140>]
匹配子节点
child_queries = (
    '//li/a',   # <a> elements that are direct children of an <li>
    '//ul//a',  # <a> elements at any depth below a <ul>
)
for query in child_queries:
    result = html.xpath(query)
    print(result)
[<Element a at 0x111c68a00>, <Element a at 0x111c68d70>, <Element a at 0x111c68e10>, <Element a at 0x111c68f00>, <Element a at 0x111c68f50>]
[<Element a at 0x111c68a00>, <Element a at 0x111c68d70>, <Element a at 0x111c68e10>, <Element a at 0x111c68f00>, <Element a at 0x111c68f50>]
父节点
parent_queries = (
    # '..' steps from the <a> whose href is link4.html up to its parent.
    '//a[@href="link4.html"]/..',
    # Same parent, then read that parent's class attribute.
    '//a[@href="link4.html"]/../@class',
)
for query in parent_queries:
    result = html.xpath(query)
    print(result)
[<Element li at 0x112368e10>]
['item-1']
属性匹配
# The [@class="item-0"] predicate keeps only <li> elements whose class is item-0.
item0_elements = '//li[@class="item-0"]'
result = html.xpath(item0_elements)
print(result)
[<Element li at 0x112368fa0>, <Element li at 0x112368b90>]
文本获取
text_queries = (
    # '//text()' collects every text node at any depth inside the matching <li>.
    '//li[@class="item-0"]//text()',
    # '/a/text()' collects only the text of the <a> that is a direct child of the <li>.
    '//li[@class="item-0"]/a/text()',
)
for query in text_queries:
    result = html.xpath(query)
    print(result)
['first item', 'fifth item', '\n']
['first item', 'fifth item']
属性获取
# '@href' returns the href attribute value of each matched <a>.
href_values = '//li/a/@href'
result = html.xpath(href_values)
print(result)
['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
属性多值匹配
from lxml import etree
text = '''
<li class="li li-first"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)
class_queries = (
    # Compare against the complete value of the class attribute.
    '//li[@class="li li-first"]/a/text()',
    # contains() tests the class attribute: the first argument is the attribute,
    # the second is the value it must contain.
    '//li[contains(@class,"li")]/a/text()',
)
for query in class_queries:
    result = html.xpath(query)
    print(result)
['first item']
['first item']
多属性匹配
from lxml import etree
text = '''
<li class="li li-first" name="item"><a href="link1.html">first item</a></li>
'''
html = etree.HTML(text)
# The `and` operator joins two conditions inside one predicate:
# class must contain "li" and name must equal "item".
combined_query = '//li[contains(@class,"li") and @name="item"]/a/text()'
result = html.xpath(combined_query)
print(result)
['first item']
拓展:XPath中的运算符 https://www.w3school.com.cn/xpath/xpath_operators.asp
按序选择
from lxml import etree
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# Positional predicates; [1] is the first item.
positional_queries = (
    '//li[1]/a/text()',             # first <li>
    '//li[last()]/a/text()',        # last <li>
    '//li[position()<3]/a/text()',  # first two <li>
    '//li[last()-2]/a/text()',      # third <li> counted from the end
)
for query in positional_queries:
    result = html.xpath(query)
    print(result)
['first item']
['fifth item']
['first item', 'second item']
['third item']
节点轴选择
from lxml import etree
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# Each query starts at the first <li> and walks one axis (axis::node-test).
axis_queries = (
    '//li[1]/ancestor::*',                      # every ancestor element
    '//li[1]/ancestor::div',                    # only the <div> ancestor
    '//li[1]/attribute::*',                     # all attribute values of the <li>
    '//li[1]/child::a[@href="link1.html"]',     # direct child <a> whose href is link1.html
    '//li[1]/descendant::span',                 # <span> descendants only (the <a> is not returned)
    '//li[1]/following::*[2]',                  # second element among all nodes after the <li>
    '//li[1]/following-sibling::*',             # all later siblings of the <li>
)
for query in axis_queries:
    result = html.xpath(query)
    print(result)
[<Element html at 0x111afc730>, <Element body at 0x111e79960>, <Element div at 0x111afc870>, <Element ul at 0x111afcf00>]
[<Element div at 0x111afc870>]
['item-0']
[<Element a at 0x111879af0>]
[<Element span at 0x111e79960>]
[<Element a at 0x110e182d0>]
[<Element li at 0x111e79960>, <Element li at 0x111879af0>, <Element li at 0x111879f00>, <Element li at 0x111879fa0>]
实战中常用的获取下一页链接的方法:
from lxml import etree
text = '''
<div class="ListPageWrap">
<a href="javascript:void(0);"> < </a>
<a class="Pagecurt" href="index_shopping_dianshang.html">1</a>
<a href="index_shopping_dianshang_2.html">2</a>
<a href="index_shopping_dianshang_3.html">3</a>
<a href="index_shopping_dianshang_4.html">4</a>
<a href="index_shopping_dianshang_5.html">5</a>
<a href="index_shopping_dianshang_6.html">6</a>
<a href="index_shopping_dianshang_7.html">7</a>
<a href="index_shopping_dianshang_8.html">8</a>
<span>...</span><a href="index_shopping_dianshang_64.html">64</a>
<a href="index_shopping_dianshang_2.html">下一页</a>
</div>
'''
html = etree.HTML(text)
# Three ways to pick the href of the "下一页" (next page) link inside the pager <div>.
next_page_queries = (
    # Method 1: the string value of the <a> contains the label.
    '//div[@class="ListPageWrap"]/a[contains(.,"下一页")]/@href',
    # Method 2: the text node of the <a> contains the label.
    '//div[@class="ListPageWrap"]/a[contains(text(),"下一页")]/@href',
    # Method 3: the text node of the <a> equals the label exactly.
    '//div[@class="ListPageWrap"]/a[text()="下一页"]/@href',
)
for query in next_page_queries:
    result = html.xpath(query)
    print(result)
['index_shopping_dianshang_2.html']
['index_shopping_dianshang_2.html']
['index_shopping_dianshang_2.html']









网友评论