Python XPath 解析网页应用实例

本文介绍了Python中使用XPath解析网页的实际应用,包括基础概念学习和实战案例展示。

首先请阅读文档 http://www.runoob.com/xpath/xpath-tutorial.html ,这里有背景和概念的介绍。

下面展示一些实际中的应用。

一、准备:

from lxml import etree # 加载模块

 
html='''
<html>
    <head>
        <title>哈哈测试一下</title>
        <link type="text/css" rel="stylesheet" href="haha.css" />
        <link type="text/css" rel="stylesheet" href="haha1.css" />
        <link type="text/css" rel="stylesheet" href="haha2.css" />
        <script type="text/javascript" src="haha.js"></script>
        <script type="text/javascript" src="haha1.js"></script>
        <script type="text/javascript" src="haha2.js"></script>
    </head>
    <body>
        <div id="id1" class="class1">
            <div id="id2" class="class2">
                <ul class="cls_ul1">
                    <li class="cls_li1">
                        <div class="cls_3">
                            <span>span_text1</span>
                            <span>span_text2</span>
                            <i>text_1</i>
                        </div>
                        <div>
                            <a href="a_1.html">a_1</a>
                            <a href="a_2.html">a_2</a>
                            <a href="a_3.html">a_3</a>
                        </div>
                        <div class="cls_4">
                            <a href="a_4.html">
                                <img href="a_img1.jpg" />
                            </a>
                        </div>
                    </li>
                    <li class="cls_li1">
                        <div class="cls_3">
                            <span>span_text3</span>
                            <span>span_text4</span>
                            <i>text_2</i>
                            <i>text_22</i>
                        </div>
                        <div>
                            <a href="a_4.html">a_4</a>
                            <a href="a_5.html">a_5</a>
                            <a href="a_6.html">a_6</a>
                        </div>
                        <div class="cls_4">
                            <a href="a_5.html">
                                <img href="a_img2.jpg" />
                            </a>
                        </div>
                    </li>
                </ul>
            </div>
            <div id="id3" class="class3">
                <ul class="cls_ul2">
                    <li class="cls_li2">
                        <div class="cls_5">
                            <span>span_text5</span>
                            <span>span_text6</span>
                            <i>text_3</i>
                        </div>
                        <div>
                            <a href="a_1.html">a_1</a>
                            <a href="a_2.html">a_2</a>
                            <a href="a_3.html">a_3</a>
                        </div>
                        <div class="cls_6">
                            <a href="a_4.html">
                                <img href="a_img3.jpg" />
                            </a>
                        </div>
                    </li>
                    <li class="cls_li2">
                        <div class="cls_5">
                            <span>span_text7</span>
                            <span>span_text8</span>
                            <i>text_4</i>
                        </div>
                        <div>
                            <a href="a_4.html">a_4</a>
                            <a href="a_5.html">a_5</a>
                            <a href="a_6.html">a_6</a>
                        </div>
                        <div class="cls_6">
                            <a href="a_5.html">
                                <img href="a_img4.jpg" />
                            </a>
                        </div>
                    </li>
                </ul>
            </div>
        </div>
    </body>
</html>
'''
html_data = etree.HTML(html) # 将HTML字符串解析为元素树,返回根元素<html>

二、应用:

# 从根节点开始,沿着XML路径一步一步选择节点,text()表示节点内容
content = html_data.xpath("/html/head/title/text()")
for con in content:
    print(con)
>>>哈哈测试一下
# 从根节点开始,沿着路径一步一步选择节点;这里先取到节点对象,再通过它的text属性读取节点内容
node = html_data.xpath("/html/head/title") 
for n in node:
    print(n.text)
>>>哈哈测试一下
# '//'表示在整个文档中查找匹配的节点,不考虑它们所在的位置;text()表示节点内容
content = html_data.xpath("//title/text()") 
for con in content:
    print(con)
>>>哈哈测试一下
# 获取路径/html/body/div/div下所有div节点的id属性值
nodes = html_data.xpath("/html/body/div/div") 
for i in range(len(nodes)): # 循环节点
    content = nodes[i].xpath("@id")
    for con in content:
        print(con)
>>>id2
   id3
# 相对路径(不以'/'开头):从当前节点html出发,选取body节点下某节点的属性值
content = html_data.xpath("body/div/div[@id='id2']/ul/li[1]/div[2]/a/@href") 
for con in content:
    print(con)
>>>a_1.html
   a_2.html
   a_3.html
# div[@id='id2']节点下某节点的属性值
content = html_data.xpath("//div[@id='id2']/ul/li[1]/div[2]/a/@href") 
for con in content:
    print(con)
>>>a_1.html
   a_2.html
   a_3.html
# div[@id='id2']节点下某节点的内容
content = html_data.xpath("//div[@id='id2']/ul/li[1]/div[2]/a/text()") 
for con in content:
    print(con)
>>>a_1
   a_2
   a_3
# 用'*'来匹配任何元素节点(这里匹配当前节点html的任意子元素,即head和body)
content = html_data.xpath("*//div[@id='id2']/ul/li[1]/div[2]/a/text()") 
for con in content:
    print(con)
>>>a_1
   a_2
   a_3
# 选取多个节点
nodes = html_data.xpath("//i|//span") 
for i in range(len(nodes)):
    print(nodes[i].text)
>>>span_text1
   span_text2
   text_1
   span_text3
   span_text4
   text_2
   text_22
   span_text5
   span_text6
   text_3
   span_text7
   span_text8
   text_4
# 选取所有li节点
nodes = html_data.xpath("//li") 
for i in range(len(nodes)):
    content = nodes[i].xpath("div/@class") # li节点下所有div节点的class属性值
    print(i,'='*5)
    for con in content:
        print(con)
>>>0 =====
   cls_3
   cls_4
   1 =====
   cls_3
   cls_4
   2 =====
   cls_5
   cls_6
   3 =====
   cls_5
   cls_6
# 选取所有li节点
nodes = html_data.xpath("//li") 
for i in range(len(nodes)):
    content = nodes[i].xpath("div[last()]/@class") # li节点下最后一个div节点的class属性值
    print(i,'='*5)
    for con in content:
        print(con)
>>>0 =====
   cls_4
   1 =====
   cls_4
   2 =====
   cls_6
   3 =====
   cls_6
# 这里应用了'..'和'@':'..'表示父节点,这里就是上一步(title)的父节点head;'@'表示属性,后面接的是属性名,这里的意思就是取属性href的值
content = html_data.xpath("/html/head/title/../link/@href") 
for con in content:
    print(con)
>>>haha.css
   haha1.css
   haha2.css
# div[@class='cls_3']的子节点span之后的同级(兄弟)节点i;following-sibling只选取当前节点之后的兄弟节点
nodes = html_data.xpath("//div[@class='cls_3']/span/following-sibling::i") 
for i in range(len(nodes)):
    content = nodes[i].xpath("./text()") # 当前节点的内容
    for con in content:
        print(con)
>>>text_1
   text_2
   text_22
# li[@class='cls_li1']后代节点里第一个div的class属性值
content = html_data.xpath("//li[@class='cls_li1']/descendant::div[1]/@class") 
for con in content:
    print(con)
>>>cls_3
   cls_3
# li[@class='cls_li1']后代节点里span的内容
content = html_data.xpath("//li[@class='cls_li1']/descendant::span/text()") 
for con in content:
    print(con)
>>>span_text1
   span_text2
   span_text3
   span_text4
# 用'*'来匹配任何元素,并用not(@class)筛选出不包含class属性的div节点
content = html_data.xpath("*//div[@id='id2']/ul/li[1]/div[not(@class)]/a/text()")
for con in content:
    print(con)
>>>a_1
   a_2
   a_3

==================================

# 多个条件的情况(用and连接,需要同时满足)
xpath("//div[@class='cls_1' and @id='id_1']")
# contains:属性值包含指定字符串的情况
xpath("//div[contains(@class,'cls_1')]") # 可以匹配 <div class='cls_1 cls_2'></div>

 

 

 

 

 

 

 

 

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值