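# 最新省市区邮编行政区电话区号划分获取 (Fetch the latest province/city/district postal codes, administrative division codes and telephone area codes)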

import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Landing page of the site to crawl; replace with the URL of the site you
# want to scrape.
url = "https://youbian.cc/"

# Accumulator for scraped rows and the request headers shared by every
# request. They are defined before the first request so that the landing-page
# fetch sends the same browser-like User-Agent as the later requests.
data = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# requests applies no timeout by default, so an explicit one keeps a stalled
# connection from blocking the script forever.
response = requests.get(url, headers=headers, timeout=15)
# Fail fast on an HTTP error instead of parsing an error page and silently
# producing an empty result.
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml")

# Drop the first card and the last two cards of the landing page.
# NOTE(review): presumably those cards are not province entries - confirm
# against the live page layout.
alist = soup.select(".uk-card-body")[1:-2]

def extract_first_dir(path):
    """Return the first non-empty segment of *path* wrapped in slashes.

    For example ``"/beijing/haidian/"`` becomes ``"/beijing/"``. When *path*
    contains no non-empty segment (empty string or only slashes), ``"/"`` is
    returned.
    """
    for segment in path.split('/'):
        if segment:
            # The first non-empty piece is the top-level directory.
            return f'/{segment}/'
    return '/'

# Crawl province -> city -> district pages, collect every postal code listed
# on the district pages into ``data``, then write the result to a CSV file.

_BASE_URL = "https://youbian.cc"
_TABLE_BODY = ".uk-card-small table tbody"
# Seconds to wait for a response. requests applies no timeout by default, so
# without this a single stalled connection would hang the whole crawl.
_REQUEST_TIMEOUT = 15


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with ``html.parser``.

    Uses the module-level ``headers`` for the request.

    Raises:
        requests.RequestException: on network errors, timeouts, or an HTTP
            error status (via ``raise_for_status``).
    """
    resp = requests.get(page_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "html.parser")


def _clean_name(text):
    """Strip surrounding whitespace and the "»" link marker from *text*.

    Whitespace is stripped first so that the marker is also removed when it
    is surrounded by spaces or newlines.
    """
    return text.strip().strip("»").strip()


def _row_cells(row):
    """Return the ``<td>`` cells of *row*, requiring at least five of them.

    Raises:
        ValueError: if the row has fewer than five cells, so that the
            caller's error handling reports and skips the row.
    """
    cells = row.find_all('td')
    if len(cells) < 5:
        raise ValueError(f"expected at least 5 columns, got {len(cells)}")
    return cells


def _scrape_district(row_c, province, city_name):
    """Append one record per postal code found on a district's detail page.

    *row_c* is a table row of the city page; its first cell carries the
    district name and link, its third and fourth cells the administrative
    code and telephone area code.
    """
    cells = _row_cells(row_c)
    district = _clean_name(cells[0].text)
    admin_code = cells[2].text.strip()
    tel_code = cells[3].text.strip()
    district_url = _BASE_URL + cells[0].a['href']

    # Random pause before the request to avoid hammering the site.
    time.sleep(random.uniform(0.5, 1.5))
    soup_t = _fetch_soup(district_url)

    # Every non-empty cell of the last matching table is taken as a postal code.
    for cell in soup_t.select(_TABLE_BODY)[-1].find_all('td'):
        postal_code = cell.text.strip()
        if postal_code:
            data.append([postal_code, province, city_name, district, admin_code, tel_code])

    print(f"已处理: {city_name} {district} {province}")


def _scrape_city(row, province):
    """Visit the city linked from *row* and scrape each of its districts."""
    cells = _row_cells(row)
    city_td = cells[0]
    city_url = _BASE_URL + city_td.a['href']
    city_name = _clean_name(city_td.text)

    # Random delay so the crawler is less likely to be blocked.
    time.sleep(random.uniform(1.0, 2.0))
    soup_c = _fetch_soup(city_url)
    district_rows = soup_c.select(_TABLE_BODY)[0].find_all('tr')

    # The first row of the city table is the header, so skip it.
    for row_c in district_rows[1:]:
        try:
            _scrape_district(row_c, province, city_name)
        except Exception as e:
            # Best effort: report the failing district and move on.
            print(f"处理区县时出错: {e}")


def _scrape_province(card):
    """Visit the province linked from a landing-page *card* and scrape its cities."""
    path = extract_first_dir(card.select(".uk-grid-small a")[0]['href'])
    print(path)
    # The province name is the part of the card title before "邮政".
    title = card.select(".uk-card-title")[0].text.split("邮政")[0].strip()
    print(f"正在处理省份: {title}")

    soup1 = _fetch_soup(_BASE_URL + path)
    for row in soup1.select(_TABLE_BODY)[0].find_all('tr'):
        try:
            _scrape_city(row, title)
        except Exception as e:
            # Best effort: report the failing city and move on.
            print(f"处理城市时出错: {e}")


try:
    for a in alist:
        try:
            _scrape_province(a)
        except Exception as e:
            # Best effort: report the failing province and move on.
            print(f"处理省份时出错: {e}")

except Exception as e:
    print(f"全局错误: {e}")

finally:
    # Always persist what was collected, even if the crawl was interrupted.
    df = pd.DataFrame(data, columns=['邮编', '省', '市', '区', '行政区号', '电话区号'])
    print("数据爬取完成,共获取记录:", len(df))
    df.to_csv('postal_codes.csv', index=False, encoding='utf-8-sig')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值