python 爬虫（lxml与jsonpath）使用示例

最新推荐文章于 2026-06-18 18:31:27 发布

原创 · 最新推荐文章于 2026-06-18 18:31:27 发布 · 524 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#1024程序员节 #爬虫 #python

python 专栏收录该内容

1 篇文章

订阅专栏

一、lxml 主要用来解析 HTML 数据，并通过 XPath 提取其中的内容

from lxml import etree

# Sample HTML document that every XPath query below runs against.
html = '''
<html>
  <body>
    <div>
      <h1>Hello World</h1>
      <p>This is a paragraph.</p>
      <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
      </ul>
    </div>
  </body>
</html>
'''

# Parse the markup into an element tree that can be queried with XPath.
selector = etree.HTML(html)

# Link text of every <li> that has a class attribute, whatever its value.
items = selector.xpath('//li[@class]/a/text()')
print(items)

# The <li> elements whose class is exactly "item-0" (prints element objects).
items = selector.xpath('//li[@class="item-0"]')
print(items)

# Text contained in those <li> elements. The text lives inside the nested <a>,
# so //text() (all descendant text nodes) is required; /text() matches only
# direct child text nodes and returns an empty list for this document.
texts = selector.xpath('//li[@class="item-0"]//text()')
print(texts)

# href attribute of the <a> inside each "item-0" <li>.
links = selector.xpath('//li[@class="item-0"]/a/@href')
print(links)

# Text of the <a> inside each "item-0" <li>.
texts = selector.xpath('//li[@class="item-0"]/a/text()')
print(texts)

# The first "item-0" <li> in the whole document. The parentheses make [1]
# index the complete result set; without them the predicate is evaluated per
# parent element and would return the first match under *each* parent.
item = selector.xpath('(//li[@class="item-0"])[1]')
print(item)

# Link text of that first "item-0" <li>; xpath() returns a list, so take [0].
text = selector.xpath('(//li[@class="item-0"])[1]/a/text()')[0]
print(text)

# Link text of every "item-0" <li>, stripped of surrounding whitespace and
# with whitespace-only entries dropped.
texts = selector.xpath('//li[@class="item-0"]/a/text()')
clean_texts = [t.strip() for t in texts if t.strip()]
print(clean_texts)

# Same as above, additionally converted to upper case.
texts = selector.xpath('//li[@class="item-0"]/a/text()')
clean_texts = [t.strip().upper() for t in texts if t.strip()]
print(clean_texts)

二、jsonpath 主要用来从 JSON 数据中按路径表达式提取内容

import json
import jsonpath

# Alternative input: the same books as a top-level JSON array.
# Uncomment this literal together with the array queries further down to try it.
# json_data = '''
# [
#     { "category": "reference", "title": "Sayings of the Century", "price": 8.95 },
#     { "category": "fiction", "title": "Sword of Honour", "price": 12.99 },
#     { "category": "fiction", "title": "Moby Dick", "isbn": "0-553-21311-3", "price": 8.99 }
# ]
# '''

# Input used by default: a JSON object with the book list nested under "store".
json_data = '''
{
    "store": {
        "book": [
            { "category": "reference", "title": "Sayings of the Century", "price": 8.95 },
            { "category": "fiction", "title": "Sword of Honour", "price": 12.99 },
            { "category": "fiction", "title": "Moby Dick", "isbn": "0-553-21311-3", "price": 8.99 }
        ],
        "bicycle": {
            "color": "red",
            "price": 19.95
        }
    }
}
'''

# Decode the JSON text into Python objects; the queries run on the result.
data = json.loads(json_data)

# Queries for the top-level array variant of the input:
# all_books = jsonpath.jsonpath(data, "$[*]")                  # every element
# first_book = jsonpath.jsonpath(data, "$[0]")                 # first element
# cheap_books = jsonpath.jsonpath(data, "$[?(@.price < 10)]")  # price below 10
# first_two_books = jsonpath.jsonpath(data, "$[0:2]")          # elements 0 and 1

# Run the four queries against the object-shaped input, in this order:
# every book, the first book, books priced below 10, and books 0 and 1.
all_books, first_book, cheap_books, first_two_books = (
    jsonpath.jsonpath(data, expression)
    for expression in (
        "$.store.book[*]",
        "$.store.book[0]",
        "$.store.book[?(@.price < 10)]",
        "$.store.book[0:2]",
    )
)

# Print each result next to its label.
for label, matches in (
    ("All books:", all_books),
    ("First book:", first_book),
    ("Cheap books:", cheap_books),
    ("First two books:", first_two_books),
):
    print(label, matches)