分享一批南宁市美食热搜榜top750的店铺基础信息以及好评评论信息。

可用于美食推荐、食品安全预警及改善研究。

相关字段

店铺id    店铺名称    城市    区县    商圈信息    地址    营业时间    电话    店铺综合评分    人均消费    菜系分类    推荐菜系    评论数量    差评数量    用户id    用户名称    用户详情评分    评论时间    评论内容    商家回复

样例数据如下

代码如下

import os
import random
import sys
import time
import pandas as pd
import requests
from lxml import etree
from openpyxl.reader.excel import load_workbook
from pypinyin import pinyin, Style

session = requests.Session()
# Browser-like request headers sent with every request made through this session.
# The cookie may be supplied through the DIANPING_COOKIE environment variable; the
# value from the original script is kept as the fallback. The fallback is written
# with double quotes because it contains apostrophes, which ended the original
# single-quoted literal early and made the file unparsable.
# NOTE(review): the fallback looks like a redacted sample value - confirm and
# replace it with a live cookie before running.
session.headers.update({
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="140", "Not=A?Brand";v="24", "Google Chrome";v="140"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'cookie': os.environ.get(
        'DIANPING_COOKIE',
        "_lxsdk_cuid=19517f27440c8-07c7a1cdd46c17-26011a51-144000-19517f27440c8; _lxsdk=19517f27440c8-07c7a1cdd46c17-26011a51-144000-19517f27440c8; _hc.v=8eb6e1f0-0f75-ba28-2dca-ea03bc499c10.1765254412; WEBDFPID=w55725w711xx5xuyyz8v142v0v8x2w5w80482u7208x97958817922v5-1765340811987-1739863519709EQICKSKfd79fef3d01d5e9aadc18ccd4d0c95075212; utm_source_rg=AM%25efN-m-m%25276; qruuid=bffb0e34-1fc0-47cd-a49e-df6f23299adf; dplet=497d0e6dassxd2dd7b2a4a3887de3274dd7; dper=0202b82088aa5c740cbab9135cfce2014fecb64f42284810197f5c557a5cdd3d47e616a87c6142ede8f90b34aea924b3a98a30425dbaa1248c21000000005c2f0000916fa9353822e039ed3104c8653f13312e4084ea5c239d315af7e97baddbb260142bb36e2169f79b45b86ac1c344b685; ua=Star; ctu=ed15460f96197f8ee30928db4d710dc2380d79939cc902441ae6d877de0e3e70; ll=7fd06e815b796be3df069dec7836c3df; fspop=test; logan_session_token=xvl4ay78w7tnzqxex6qv; _lxsdk_s=19b0bdc7230-f7b-ffe-1fb%7C%7C2; s_ViewType=10x'x's",
    ),
})


def save_current_page(page_number):
    """Write page_number to the progress file, replacing whatever was stored before."""
    with open('大众_progress.txt', 'w') as progress_file:
        serialized = str(page_number)
        progress_file.write(serialized)


def load_last_page():
    """Return the page number stored in the progress file.

    Falls back to 1 when the file does not exist or when its stripped
    content is not made up solely of digits.
    """
    progress_path = '大众_progress.txt'
    if not os.path.exists(progress_path):
        return 1

    with open(progress_path, 'r') as progress_file:
        stored = progress_file.read().strip()

    if stored.isdigit():
        return int(stored)
    return 1


def clean_text_list(text):
    """Split text into its non-empty fragments, dropping all surrounding whitespace.

    Any run of whitespace (spaces, tabs, newlines) acts as a separator, so
    fragments separated only by a line break are returned as separate items
    instead of one item with an embedded newline.

    Args:
        text: The raw string to split.

    Returns:
        A list of fragments; empty for an empty or whitespace-only string.
    """
    # str.split() without arguments splits on whitespace runs and never
    # yields empty strings, so no extra strip/filter pass is needed.
    return text.split()


def join_text_list(text_list):
    """Concatenate the non-empty entries of text_list, separated by "、"."""
    non_empty = filter(lambda entry: entry != "", text_list)
    return "、".join(non_empty)


def safe_xpath(tree, xpath_expr, default="NULL", index=0):
    """Evaluate xpath_expr on tree and return the stripped match at index.

    ``default`` is returned when the result has no item at ``index``.
    It is also returned, after printing a warning, when evaluating the
    expression or stripping the match raises an exception.
    """
    try:
        matches = tree.xpath(xpath_expr)
        if len(matches) <= index:
            return default
        return matches[index].strip()
    except Exception as e:
        # One handler; the warning text depends on the exception type.
        if isinstance(e, IndexError):
            print(f"⚠️ XPath结果越界: {xpath_expr}")
        elif isinstance(e, AttributeError):
            print(f"⚠️ XPath节点不存在或类型错误: {xpath_expr}")
        else:
            print(f"⚠️ 解析XPath时出错: {xpath_expr} -> {type(e).__name__}: {e}")
        return default


def chinese_to_pinyin(text):
    """Return text as one unbroken pinyin string without tone marks.

    Only the first reading reported for each unit is used.
    """
    syllable_groups = pinyin(text, style=Style.NORMAL)
    return "".join(group[0] for group in syllable_groups)


def human_sleep(min_s, max_s):
    """Pick a random pause length between min_s and max_s.

    The function only returns the delay; it does not sleep itself, so the
    caller is expected to pass the result to a sleep call.
    """
    return random.uniform(min_s, max_s)


def save_to_excel(data_row, sheet_name):
    """Append one record to the sheet sheet_name of the workbook '大众点评.xlsx'.

    The workbook is created when it does not exist, and the sheet is added
    (with a header row) when the workbook exists but lacks it. Otherwise the
    record is written below the rows already present in the sheet.
    """
    workbook_path = '大众点评.xlsx'
    frame = pd.DataFrame(
        [data_row],
        columns=['店铺名称', '评论数量', '人均消费', '售卖主食', '商圈信息', '推荐菜系', '详情链接'],
    )

    # No workbook yet: create it with the header row.
    if not os.path.isfile(workbook_path):
        frame.to_excel(workbook_path, index=False, header=True, sheet_name=sheet_name, engine='openpyxl')
        return

    workbook = load_workbook(workbook_path)
    if sheet_name in workbook.sheetnames:
        # max_row is used directly as the 0-based startrow, so the record
        # lands on the row right after the last one in use.
        next_row = workbook[sheet_name].max_row
        with pd.ExcelWriter(workbook_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
            frame.to_excel(writer, index=False, header=False, startrow=next_row, sheet_name=sheet_name)
        return

    # Workbook exists but the sheet does not: add the sheet with a header row.
    with pd.ExcelWriter(workbook_path, engine='openpyxl', mode='a') as writer:
        frame.to_excel(writer, index=False, header=True, sheet_name=sheet_name)


def parse_page(page_number, pinyin_str, city_chinese):
    """Fetch one listing page, store every shop found on it and record progress.

    Args:
        page_number: Page number inserted into the listing URL.
        pinyin_str: City name in pinyin, used as the first URL path segment.
        city_chinese: City name passed on as the Excel sheet name.

    Up to 15 list entries are read per page. Entries with neither a shop
    name nor a detail link are skipped, so no all-"NULL" rows are written.
    The process exits with status 1 when the site answers HTTP 403.
    """
    url = f'https://www.dianping.com/{pinyin_str}/ch10/g34351r8842o11p{page_number}'
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = session.get(url, timeout=30)
    time.sleep(human_sleep(3, 5))
    if response.status_code == 403:
        print("封号了!换号或等待一段时间")
        sys.exit(1)

    html_tree = etree.HTML(response.text)
    saved_count = 0
    # A body that cannot be parsed may yield None; treat it as a page with no shops.
    slots = range(1, 16) if html_tree is not None else ()
    for i in slots:
        item = f'//*[@id="shop-all-list"]/ul/li[{i}]/div[2]'
        shop_name = safe_xpath(html_tree, f'{item}/div[1]/a/h4/text()')
        detail_link = safe_xpath(html_tree, f'{item}/div[1]/a/@href')
        if shop_name == "NULL" and detail_link == "NULL":
            # This list slot holds no shop (short page or unexpected markup).
            continue

        review_count = safe_xpath(html_tree, f'{item}/div[2]/a[1]/span/b/text()')
        avg_price = safe_xpath(html_tree, f'{item}/div[2]/a[2]/b/text()')
        main_food = safe_xpath(html_tree, f'{item}/div[3]/a[1]/span/text()')
        business_area = safe_xpath(html_tree, f'{item}/div[3]/a[2]/span/text()')

        # string() flattens the whole recommendation block into one text value.
        recommended_dish = html_tree.xpath(f'string({item}/div[4])')
        recommended_dish_clean = join_text_list(clean_text_list(recommended_dish))

        row_data = [shop_name, review_count, avg_price, main_food, business_area, recommended_dish_clean,
                    detail_link]
        save_to_excel(row_data, city_chinese)
        saved_count += 1
        print(f'第{page_number}页,第{i}条: {shop_name}')

    if saved_count == 0:
        print(f'⚠️ 第{page_number}页未解析到任何店铺,可能被拦截或已无数据')
    # Record the next page so a later run does not fetch this one again.
    save_current_page(page_number + 1)


if __name__ == '__main__':
    # Surrounding whitespace would otherwise end up in the URL path and the sheet name.
    city_chinese = input("请输入城市名: ").strip()
    if not city_chinese:
        print("城市名不能为空")
        sys.exit(1)
    pinyin_str = chinese_to_pinyin(city_chinese)
    # Resume from the page recorded by the previous run; page 50 is the last one requested.
    start_page = load_last_page()
    for page_num in range(start_page, 51):
        parse_page(page_num, pinyin_str, city_chinese)

如有需要或疑问,欢迎留言评论。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值