可用于美食推荐、食品安全预警及改善研究。
相关字段
店铺id 店铺名称 城市 区县 商圈信息 地址 营业时间 电话 店铺综合评分 人均消费 菜系分类 推荐菜系 评论数量 差评数量 用户id 用户名称 用户详情评分 评论时间 评论内容 商家回复
样例数据如下



代码如下
import os
import random
import sys
import time
import pandas as pd
import requests
from lxml import etree
from openpyxl.reader.excel import load_workbook
from pypinyin import pinyin, Style
session = requests.Session()
# Default request headers, sent with every request made through this session.
# They imitate a desktop Chrome browser on Windows.
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    # The cookie value itself contains single quotes, so it is written as a
    # double-quoted literal; inside a single-quoted literal it is a syntax error.
    # NOTE(review): this is a hard-coded session cookie and looks partly
    # masked at the end -- presumably it has to be replaced with the cookie
    # of your own logged-in browser session; confirm before running.
    'cookie': "_lxsdk_cuid=19517f27440c8-07c7a1cdd46c17-26011a51-144000-19517f27440c8; _lxsdk=19517f27440c8-07c7a1cdd46c17-26011a51-144000-19517f27440c8; _hc.v=8eb6e1f0-0f75-ba28-2dca-ea03bc499c10.1765254412; WEBDFPID=w55725w711xx5xuyyz8v142v0v8x2w5w80482u7208x97958817922v5-1765340811987-1739863519709EQICKSKfd79fef3d01d5e9aadc18ccd4d0c95075212; utm_source_rg=AM%25efN-m-m%25276; qruuid=bffb0e34-1fc0-47cd-a49e-df6f23299adf; dplet=497d0e6dassxd2dd7b2a4a3887de3274dd7; dper=0202b82088aa5c740cbab9135cfce2014fecb64f42284810197f5c557a5cdd3d47e616a87c6142ede8f90b34aea924b3a98a30425dbaa1248c21000000005c2f0000916fa9353822e039ed3104c8653f13312e4084ea5c239d315af7e97baddbb260142bb36e2169f79b45b86ac1c344b685; ua=Star; ctu=ed15460f96197f8ee30928db4d710dc2380d79939cc902441ae6d877de0e3e70; ll=7fd06e815b796be3df069dec7836c3df; fspop=test; logan_session_token=xvl4ay78w7tnzqxex6qv; _lxsdk_s=19b0bdc7230-f7b-ffe-1fb%7C%7C2; s_ViewType=10x'x's",
})
def save_current_page(page_number):
    """Write the given page number to the progress file.

    The file ``大众_progress.txt`` in the current working directory is
    overwritten with ``str(page_number)`` so a later run can resume from it.

    Args:
        page_number: Page number to store.
    """
    # Explicit encoding keeps the file readable regardless of the platform's
    # default text encoding.
    with open('大众_progress.txt', 'w', encoding='utf-8') as f:
        f.write(str(page_number))
def load_last_page():
    """Read the page number stored in the progress file.

    Returns:
        The stored page number as an int, or 1 when ``大众_progress.txt`` does
        not exist, does not contain a plain number, or contains a number
        below 1.
    """
    # Open directly and handle the missing-file case, instead of checking for
    # existence first and then opening.
    try:
        with open('大众_progress.txt', 'r', encoding='utf-8') as f:
            progress = f.read().strip()
    except FileNotFoundError:
        return 1
    if not progress.isdigit():
        return 1
    try:
        page = int(progress)
    except ValueError:
        # str.isdigit() also accepts characters such as '²' that int() rejects.
        return 1
    # Never hand back a page number below 1.
    return max(page, 1)
def clean_text_list(text):
    """Split text into its non-empty, whitespace-free pieces.

    Any run of whitespace (spaces, tabs, newlines) acts as a separator, so
    no returned item contains leading, trailing or interior whitespace.

    Args:
        text: The raw string to split.

    Returns:
        A list of the non-empty pieces, in their original order.
    """
    # str.split() without an argument splits on any whitespace run and drops
    # empty pieces; splitting on a single space would keep newlines and tabs
    # inside the items.
    return text.split()
def join_text_list(text_list):
    """Join the non-empty strings of a list with the separator "、".

    Args:
        text_list: Iterable of strings; items equal to "" are skipped.

    Returns:
        The remaining items concatenated with "、" between them, or an empty
        string when nothing remains.
    """
    return "、".join(item for item in text_list if item != "")
def safe_xpath(tree, xpath_expr, default="NULL", index=0):
    """Evaluate an XPath expression and return one stripped result.

    Args:
        tree: Object exposing an ``xpath(expr)`` method.
        xpath_expr: XPath expression passed to ``tree.xpath``.
        default: Value returned when no usable result is available.
        index: Position of the wanted item in the result.

    Returns:
        ``result[index].strip()`` when the result has more than ``index``
        items; otherwise ``default``. Any exception raised on the way is
        reported with ``print`` and also yields ``default``.
    """
    try:
        result = tree.xpath(xpath_expr)
        if len(result) > index:
            return result[index].strip()
        else:
            return default
    except IndexError:
        # Reached e.g. for a negative index on a result that is too short.
        print(f"⚠️ XPath结果越界: {xpath_expr}")
        return default
    except AttributeError:
        # Reached when `tree` has no `xpath` method (e.g. it is None) or the
        # selected item has no `strip` method.
        print(f"⚠️ XPath节点不存在或类型错误: {xpath_expr}")
        return default
    except Exception as e:
        # Best-effort extraction: any other failure is reported and replaced
        # by the default instead of aborting the caller.
        print(f"⚠️ 解析XPath时出错: {xpath_expr} -> {type(e).__name__}: {e}")
        return default
def chinese_to_pinyin(text):
    """Convert text to a single pinyin string without tone marks.

    Args:
        text: The text to convert.

    Returns:
        The first reading of every entry returned by ``pinyin`` joined
        together without separators.
    """
    # Style.NORMAL requests pinyin without tones; every entry of the result
    # is a sequence of which only the first item is used.
    syllables = pinyin(text, style=Style.NORMAL)
    return "".join(item[0] for item in syllables)
def human_sleep(min_s, max_s):
    """Pick a random pause length that imitates a human delay.

    Despite its name, this function does not sleep; it only returns the
    delay, and the caller is expected to pass it to ``time.sleep``.

    Args:
        min_s: One bound of the delay, in seconds.
        max_s: The other bound of the delay, in seconds.

    Returns:
        A float drawn with ``random.uniform(min_s, max_s)``.
    """
    return random.uniform(min_s, max_s)
def save_to_excel(data_row, sheet_name):
    """Append one row of shop data to a sheet of ``大众点评.xlsx``.

    The workbook is created on first use; a missing sheet is added to the
    existing workbook; otherwise the row is written below the sheet's last
    used row.

    Args:
        data_row: Sequence of seven values matching the column order
            店铺名称, 评论数量, 人均消费, 售卖主食, 商圈信息, 推荐菜系, 详情链接.
        sheet_name: Name of the worksheet to write to.
    """
    columns = ['店铺名称', '评论数量', '人均消费', '售卖主食', '商圈信息', '推荐菜系', '详情链接']
    df = pd.DataFrame([data_row], columns=columns)
    file_name = '大众点评.xlsx'
    if not os.path.isfile(file_name):
        # No workbook yet: create it with the header row and this first row.
        df.to_excel(file_name, index=False, header=True, sheet_name=sheet_name, engine='openpyxl')
    else:
        # The workbook is loaded to inspect its sheets and the last used row.
        book = load_workbook(file_name)
        if sheet_name not in book.sheetnames:
            # New sheet in an existing workbook: write header plus row.
            with pd.ExcelWriter(file_name, engine='openpyxl', mode='a') as writer:
                df.to_excel(writer, index=False, header=True, sheet_name=sheet_name)
        else:
            # Existing sheet: max_row is the 1-based index of the last used
            # row, which as a 0-based startrow points at the row right below.
            start_row = book[sheet_name].max_row
            with pd.ExcelWriter(file_name, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
                df.to_excel(writer, index=False, header=False, startrow=start_row, sheet_name=sheet_name)
def parse_page(page_number, pinyin_str, city_chinese, timeout=15):
    """Fetch one listing page and store every shop found on it.

    Up to 15 list entries are read from the page. Each entry that exists is
    appended to the Excel sheet named after the city, and once the page is
    done the next page number is written to the progress file.

    Args:
        page_number: Page index placed in the listing URL.
        pinyin_str: Pinyin city slug placed in the listing URL.
        city_chinese: City name, used as the Excel sheet name.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Raises:
        SystemExit: With exit code 1 when the server answers with HTTP 403.
    """
    # NOTE(review): the "g34351r8842o11" part of the path is hard-coded; it
    # presumably selects one category/region/sort order -- confirm it fits
    # the requested city.
    url = f'https://www.dianping.com/{pinyin_str}/ch10/g34351r8842o11p{page_number}'
    # Without a timeout a stalled connection would hang the crawl forever.
    response = session.get(url, timeout=timeout)
    # Random pause after every request to keep the request rate low.
    time.sleep(human_sleep(3, 5))
    if response.status_code == 403:
        print("封号了!换号或等待一段时间")
        sys.exit(1)
    else:
        html_tree = etree.HTML(response.text)
        for i in range(1, 16):
            item_xpath = f'//*[@id="shop-all-list"]/ul/li[{i}]'
            shop_name = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[1]/a/h4/text()')
            review_count = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[2]/a[1]/span/b/text()')
            avg_price = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[2]/a[2]/b/text()')
            main_food = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[3]/a[1]/span/text()')
            business_area = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[3]/a[2]/span/text()')
            # string(...) yields the concatenated text of the whole block,
            # which is then reduced to a "、"-separated list.
            recommended_dish = html_tree.xpath(f'string({item_xpath}/div[2]/div[4])')
            recommended_dish_clean = join_text_list(clean_text_list(recommended_dish))
            detail_link = safe_xpath(html_tree, f'{item_xpath}/div[2]/div[1]/a/@href')
            if shop_name == "NULL" and detail_link == "NULL":
                # Neither a name nor a link: this list slot does not exist
                # (e.g. a page with fewer than 15 shops), so no row is saved.
                continue
            row_data = [shop_name, review_count, avg_price, main_food, business_area, recommended_dish_clean,
                        detail_link]
            save_to_excel(row_data, city_chinese)
            print(f'第{page_number}页,第{i}条: {shop_name}')
        # Remember the next page so a later run does not crawl this one again.
        save_current_page(page_number + 1)
if __name__ == '__main__':
    # Surrounding whitespace is stripped so it ends up neither in the URL
    # slug nor in the Excel sheet name.
    city_chinese = input("请输入城市名: ").strip()
    pinyin_str = chinese_to_pinyin(city_chinese)
    # Resume from the page recorded by the previous run (1 if none).
    # NOTE(review): the progress file is not keyed by city, so switching
    # cities resumes from the previous city's page -- delete the progress
    # file first if that is not intended.
    start_page = load_last_page()
    # Pages are crawled up to and including page 50.
    for page_num in range(start_page, 51):
        parse_page(page_num, pinyin_str, city_chinese)
如有问题或需要,欢迎留言评论。

182

被折叠的 条评论
为什么被折叠?



