import random
import time
from urllib.parse import urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Root page of the site; the crawl below treats each card on it as a province.
url = "https://youbian.cc/"  # replace with the URL of the site you want to crawl

# Browser-like headers sent with every request of this script. They are
# defined before the first request so the landing page is fetched with them too.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# A timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() matches the later requests in this file: an HTTP error
# page is reported instead of being parsed as if it were the real page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

# Cards of the landing page, minus the first one and the last two
# (presumably non-province cards -- confirm against the page layout).
alist = soup.select(".uk-card-body")[1:-2]

# Result rows, filled by the crawl loop and written to CSV at the end.
data = []
def extract_first_dir(path: str) -> str:
    """Return the first directory of *path* in the form ``'/<segment>/'``.

    Only the path component is inspected: a scheme, host, query string or
    fragment contained in *path* is ignored, so the absolute link
    ``https://host/beijing/110000/`` yields ``'/beijing/'`` exactly like the
    relative ``/beijing/110000/`` does.

    Args:
        path: A URL path or a full URL, e.g. the value of an ``href``.

    Returns:
        ``'/<first segment>/'``, or ``'/'`` when the path contains no
        non-empty segment.
    """
    # Leading, trailing or doubled slashes produce empty pieces; drop them.
    parts = [p for p in urlsplit(path).path.split('/') if p]
    return '/' + parts[0] + '/' if parts else '/'
# Crawl province -> city -> district pages and collect one row per postcode.
# Uses `alist`, `headers`, `data` and `extract_first_dir` defined earlier in
# this file.

BASE_URL = "https://youbian.cc"
# Seconds to wait for a response; without a timeout one stalled connection
# would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10
# Every listing page is read through this selector.
TABLE_SELECTOR = ".uk-card-small table tbody"
# Rows are only treated as data rows when they have at least this many cells.
MIN_CELLS = 5


def _fetch_soup(page_url: str) -> BeautifulSoup:
    """Download *page_url* with the shared headers and return it parsed.

    Raises:
        requests.RequestException: on connection problems, timeouts or an
            HTTP error status.
    """
    resp = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()  # do not parse error pages
    return BeautifulSoup(resp.content, "html.parser")


try:
    for a in alist:
        try:
            # Basic province information taken from the landing-page card.
            path = extract_first_dir(a.select(".uk-grid-small a")[0]['href'])
            print(path)
            title = a.select(".uk-card-title")[0].text.split("邮政")[0].strip()
            print(f"正在处理省份: {title}")

            # urljoin copes with relative and absolute hrefs alike.
            soup1 = _fetch_soup(urljoin(BASE_URL, path))

            # City list of the province.
            rows = soup1.select(TABLE_SELECTOR)[0].find_all('tr')
            for row in rows:
                try:
                    cells = row.find_all('td')
                    if len(cells) < MIN_CELLS:
                        # Not a data row (e.g. a header); such rows were
                        # never collected, they are now skipped quietly
                        # instead of via an IndexError.
                        continue
                    city_td = cells[0]
                    url_c = urljoin(BASE_URL, city_td.a['href'])
                    citys = city_td.text.strip("»").strip()

                    # Random delay to avoid getting blocked.
                    time.sleep(random.uniform(1.0, 2.0))

                    # City detail page listing the districts.
                    soup_c = _fetch_soup(url_c)
                    row_cs = soup_c.select(TABLE_SELECTOR)[0].find_all('tr')

                    # One row per district; the first row is the table header.
                    for row_c in row_cs[1:]:
                        try:
                            tds = row_c.find_all('td')
                            if len(tds) < MIN_CELLS:
                                continue
                            city = tds[0].text.strip("»").strip()
                            xingzheng = tds[2].text.strip()
                            tel = tds[3].text.strip()
                            url_t = urljoin(BASE_URL, tds[0].a['href'])

                            # District detail page.
                            time.sleep(random.uniform(0.5, 1.5))
                            soup_t = _fetch_soup(url_t)

                            # Every non-empty cell of the last table is
                            # recorded as one postcode of this district.
                            cells_t = soup_t.select(TABLE_SELECTOR)[-1].find_all('td')
                            data.extend(
                                [youbiant, title, citys, city, xingzheng, tel]
                                for youbiant in (c.text.strip() for c in cells_t)
                                if youbiant
                            )
                            print(f"已处理: {citys} {city} {title}")
                        except Exception as e:
                            # Best effort: one broken district must not stop
                            # the rest of the city.
                            print(f"处理区县时出错: {e}")
                            continue
                except Exception as e:
                    print(f"处理城市时出错: {e}")
                    continue
        except Exception as e:
            print(f"处理省份时出错: {e}")
            continue
except Exception as e:
    print(f"全局错误: {e}")
finally:
    # Runs even when the crawl is interrupted, so everything collected up to
    # that point is still written out.
    df = pd.DataFrame(data, columns=['邮编', '省', '市', '区', '行政区号', '电话区号'])
    print("数据爬取完成,共获取记录:", len(df))
    df.to_csv('postal_codes.csv', index=False, encoding='utf-8-sig')
最新省市区邮编行政区电话区号划分获取
于 2025-06-12 15:08:47 首次发布

819

被折叠的 条评论
为什么被折叠?



