python爬虫
1、下载一些python库
import csv
from bs4 import BeautifulSoup # 网页解析,获取数据
import urllib.request, urllib.error # 制定URL,获取网页数据
如果 `from bs4 import BeautifulSoup` 这一行标红报错,说明还没有安装该库;先用 pip 安装 beautifulsoup4(导入时的包名为 bs4),就不会报错了
python下载安装BeautifulSoup报错
2、爬取网页内容
def askURL(url):
    """Fetch ``url`` and return the response body decoded to ``str``.

    The charset is read from the response's ``Content-Type`` header. When the
    header is missing, carries no ``charset`` parameter, or names a codec
    Python does not know, the body is decoded as GBK instead.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The decoded page source.

    Raises:
        urllib.error.URLError: If the request itself fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    fallback_charset = "GBK"
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        # get_content_charset() copes with a missing header, quoted values
        # and trailing parameters, which a plain split on "charset=" does not.
        charset = response.headers.get_content_charset() or fallback_charset
        raw = response.read()
    try:
        return raw.decode(charset)
    except LookupError:
        # The server declared a codec name Python does not recognise.
        return raw.decode(fallback_charset)
3、解析网页,获取数据
def getData(baseurl):
    """Download ``baseurl`` and extract one record per result row.

    Every ``<tr class="t_tr1">`` element of the page is treated as a row.
    From each row the text of the first, second and last ``<td>`` cells is
    collected.

    Args:
        baseurl: Address of the page to scrape; fetched through ``askURL``.

    Returns:
        A list of ``[first, second, last]`` string triples, one per row, in
        document order. Empty when the page has no matching rows.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")

    datalist = []
    for row in soup.find_all('tr', class_='t_tr1'):
        cells = row.find_all('td')
        # A row without at least two cells cannot supply the fields below;
        # skip it rather than abort the whole scrape with an IndexError.
        if len(cells) < 2:
            continue
        datalist.append([
            cells[0].text.strip(),           # e.g. 24174
            # strip=True trims every text fragment before joining them.
            cells[1].get_text(strip=True),   # e.g. digits 0 0 8
            cells[-1].text.strip(),          # e.g. 2024-07-02
        ])
    return datalist
4、将数据写入csv文件
def saveData(datalist, savepath):
    """Write a header row followed by ``datalist`` to a UTF-8 CSV file.

    Args:
        datalist: Iterable of rows; each row is an iterable of cell values.
        savepath: Destination file path. An existing file is overwritten.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    col = ("期号", "中奖号码", "开奖日期")  # column names
    # newline='' lets the csv module control line endings itself.
    with open(savepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(col)
        writer.writerows(datalist)
    print(f"数据已保存到 {savepath}")
完整代码
import csv
from bs4 import BeautifulSoup # 网页解析,获取数据
import urllib.request, urllib.error # 制定URL,获取网页数据
def getData(baseurl):
    """Download ``baseurl`` and extract one record per result row.

    Every ``<tr class="t_tr1">`` element of the page is treated as a row.
    From each row the text of the first, second and last ``<td>`` cells is
    collected.

    Args:
        baseurl: Address of the page to scrape; fetched through ``askURL``.

    Returns:
        A list of ``[first, second, last]`` string triples, one per row, in
        document order. Empty when the page has no matching rows.
    """
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")

    datalist = []
    for row in soup.find_all('tr', class_='t_tr1'):
        cells = row.find_all('td')
        # A row without at least two cells cannot supply the fields below;
        # skip it rather than abort the whole scrape with an IndexError.
        if len(cells) < 2:
            continue
        datalist.append([
            cells[0].text.strip(),           # e.g. 24174
            # strip=True trims every text fragment before joining them.
            cells[1].get_text(strip=True),   # e.g. digits 0 0 8
            cells[-1].text.strip(),          # e.g. 2024-07-02
        ])
    return datalist
def askURL(url):
    """Fetch ``url`` and return the response body decoded to ``str``.

    The charset is read from the response's ``Content-Type`` header. When the
    header is missing, carries no ``charset`` parameter, or names a codec
    Python does not know, the body is decoded as GBK instead.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The decoded page source.

    Raises:
        urllib.error.URLError: If the request itself fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    fallback_charset = "GBK"
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as response:
        # get_content_charset() copes with a missing header, quoted values
        # and trailing parameters, which a plain split on "charset=" does not.
        charset = response.headers.get_content_charset() or fallback_charset
        raw = response.read()
    try:
        return raw.decode(charset)
    except LookupError:
        # The server declared a codec name Python does not recognise.
        return raw.decode(fallback_charset)
def saveData(datalist, savepath):
    """Write a header row followed by ``datalist`` to a UTF-8 CSV file.

    Args:
        datalist: Iterable of rows; each row is an iterable of cell values.
        savepath: Destination file path. An existing file is overwritten.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    col = ("期号", "中奖号码", "开奖日期")  # column names
    # newline='' lets the csv module control line endings itself.
    with open(savepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(col)
        writer.writerows(datalist)
    print(f"数据已保存到 {savepath}")
def main(start, end):
    """Scrape the records for ``start``..``end`` and save them as a CSV file.

    Builds the query URL, collects the rows with ``getData`` and writes them
    to ``彩票.csv`` in the current working directory with ``saveData``.

    Args:
        start: First number of the range, sent as the ``start`` parameter.
        end: Last number of the range (inclusive), sent as ``end``.
    """
    # NOTE(review): "url" is a placeholder, not a real address; substitute
    # the actual endpoint before running. ``limit`` is the inclusive size of
    # the requested range.
    baseurl = f"url?limit={end - start + 1}&start={start}&end={end}"
    datalist = getData(baseurl)
    savepath = "彩票.csv"  # created in the current directory
    saveData(datalist, savepath)
if __name__ == "__main__":
    # Run the scraper only when the file is executed as a script, not when
    # it is imported as a module.
    main(24000, 24259)
    print("爬取完毕!")

3687

被折叠的 条评论
为什么被折叠?



