下载所有的壁纸,并且将壁纸和描述保存到数据库
数据库字段:id, type, small, big, title
运行时请注意设置延时,不要暴力抓取,以免影响他人正常使用
import json
import os
import random
import time
import pymysql
import requests
from bs4 import BeautifulSoup
# Host name of the wallpaper site and the Referer sent with every request.
host = 'www.netbian.com'
Referer = 'http://www.netbian.com/dongman/index_2.htm'

# Pool of request-header sets; they differ only in the User-Agent string.
headers = [
    {'Referer': Referer, 'Host': host, 'User-Agent': user_agent}
    for user_agent in (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    )
]
# Module-wide MySQL connection shared by the insert helper.
# NOTE(review): the credentials are hard-coded; consider loading them from
# the environment or a config file before sharing this script.
db = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    # `database` is the supported keyword; `db` is a deprecated alias.
    database='taici',
    charset='utf8mb4',
)
def download_img(img_url):
    """Download one image into the local ``./img`` directory.

    The file is named after the last path component of ``img_url``. A file
    that already exists under that name is left untouched. The URL and the
    HTTP status code are printed on every call; nothing is saved unless the
    server answers with status 200.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    h = {
        'Referer': Referer,
        'Host': "img.netbian.com",
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    }
    # The response is used as a context manager so the streamed connection
    # is always released, even when saving the file fails.
    with requests.get(img_url, headers=h, stream=True) as r:
        print(img_url)
        print(r.status_code)  # HTTP status code
        if r.status_code == 200:
            # Make sure the target directory exists before writing into it.
            os.makedirs('./img', exist_ok=True)
            path = './img/%s' % getUrlFileName(img_url)
            if not os.path.exists(path):
                # Read the body before opening the file: a failed transfer
                # must not leave an empty file that later runs would skip.
                data = r.content
                with open(path, 'wb') as f:
                    f.write(data)
            print("done")
def getUrlFileName(url):
    """Return the last path component of ``url`` (the text after the final slash)."""
    _directory, file_name = os.path.split(url)
    return file_name
def inserSql(type, small, big, title):
    """Insert one wallpaper record into the ``bizhi`` table.

    The values are passed to the driver as query parameters, so quotes or
    other special characters in a title can neither break the statement nor
    inject SQL. A failure is printed and swallowed; the transaction is
    committed only when the insert succeeds.

    Args:
        type: Category label stored in the ``type`` column.
        small: File name of the thumbnail image.
        big: File name of the full-size image.
        title: Description text of the wallpaper.
    """
    sql = "INSERT INTO bizhi(type, small, big, title) VALUES (%s, %s, %s, %s)"
    try:
        # The cursor is closed automatically when the block exits.
        with db.cursor() as cursor:
            cursor.execute(sql, (type, small, big, title))
    except Exception as e:
        print("插入数据失败:", e)
    else:
        db.commit()
def pa(url):
    """Fetch ``url`` and return the page parsed as a BeautifulSoup tree.

    One header set is picked at random from the module-level ``headers``
    pool for each request.

    Args:
        url: Address of the page to download.

    Returns:
        The ``BeautifulSoup`` document built with the ``html.parser`` backend.
    """
    # random.choice follows the pool size, so adding header sets needs no
    # change here.
    h = random.choice(headers)
    # Close the session after the request so its connections do not pile up
    # over the many calls made during a crawl.
    with requests.session() as s:
        response = s.request('get', url, headers=h).content
    return BeautifulSoup(response, 'html.parser')
def main(start_page=2, last_page=141):
    """Crawl the listing pages and store every wallpaper found on them.

    For each listing page, every entry's thumbnail and full-size image are
    downloaded and one row per wallpaper is written to the database. The
    function sleeps 3 seconds before each page and 1 second before each
    entry to keep the load on the site low.

    Args:
        start_page: First listing page to crawl. The default matches the
            original hard-coded start; pass 1 to include the first page.
        last_page: Last listing page to crawl (inclusive).
    """
    for index in range(start_page, last_page + 1):
        time.sleep(3)
        # The first listing page has no numeric suffix in its URL.
        if index != 1:
            url = 'http://www.netbian.com/dongman/index_%d.htm' % index
        else:
            url = 'http://www.netbian.com/dongman/index.htm'
        print(url)
        html = pa(url)
        ce = html.find("div", {"class": "list"})
        table = ce.find("ul")
        liArr = table.find_all("li")
        # Walk through every picture entry of this page.
        for lis in liArr:
            time.sleep(1)
            # BeautifulSoup returns `class` as a list of names, so test for
            # membership; comparing it to a plain string never matches.
            if "nextpage" in (lis.get("class") or []):
                break
            li = lis.find("img")
            link = lis.find("a")
            if li is None or link is None:
                # Not a wallpaper entry (no image or no detail link).
                continue
            small = li.attrs["src"]
            # Download the thumbnail.
            download_img(small)
            title = li.attrs["alt"]
            href = link.attrs["href"]
            # Site-relative links need the host in front; startswith is
            # also safe for an empty href.
            if href.startswith('/'):
                href = "http://www.netbian.com" + href
            imgHref = pa(href)
            big = imgHref.find("div", {"class": "pic"}).find("img").attrs["src"]
            download_img(big)
            inserSql('动漫', getUrlFileName(small), getUrlFileName(big), title)
        print("OK", index)
if __name__ == '__main__':
    try:
        main()
    finally:
        # Release the MySQL connection even when the crawl stops on an error.
        db.close()

该脚本用于从网站www.netbian.com批量下载动漫壁纸,同时将壁纸的缩略图和大图以及标题保存到本地的MySQL数据库中。程序采用了BeautifulSoup进行网页解析,通过requests库进行网络请求,使用了随机User-Agent防止被封禁,并在每次请求之间添加延时,确保不会对服务器造成过大压力。数据库表结构包含id、type、small(缩略图文件名)、big(大图文件名)和title(壁纸描述)等字段。

336

被折叠的 条评论
为什么被折叠?



