Python爬虫壁纸下载

最新推荐文章于 2024-08-10 15:41:11 发布

原创最新推荐文章于 2024-08-10 15:41:11 发布 · 446 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#python #爬虫 #开发语言

资源同时被 2 个专栏收录

21 篇文章

订阅专栏

python

11 篇文章

订阅专栏

该脚本用于从网站 www.netbian.com 批量下载动漫壁纸：缩略图和大图保存到本地的 ./img 目录，两者的文件名以及壁纸标题则写入本地的 MySQL 数据库。程序通过 requests 库发起网络请求，使用 BeautifulSoup 解析网页，页面请求会随机选择 User-Agent 以降低被封禁的风险，并在每次请求之间添加延时，确保不会对服务器造成过大压力。数据库表结构包含 id、type、small（缩略图文件名）、big（大图文件名）和 title（壁纸描述）等字段。

下载所有壁纸，并将壁纸的文件名和描述保存到数据库。

数据库字段：id, type, small, big, title

运行时请注意保留延时，不要暴力抓取，以免影响他人正常使用。

import json
import os
import random
import time
from urllib.parse import urlsplit

import pymysql
import requests
from bs4 import BeautifulSoup

# Host name of the crawled site and the Referer sent with every request.
host = 'www.netbian.com'
Referer = 'http://www.netbian.com/dongman/index_2.htm'

# One header set per User-Agent string; the sets differ only in that field.
headers = [
    {'Referer': Referer, 'Host': host, 'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    )
]

# Module-wide MySQL connection shared by the insert helper and closed by the
# script entry point.
db = pymysql.connect(host='localhost', user='root', password='123456', db='taici', charset='utf8mb4')


def download_img(img_url):
    """Download one image into the local ``./img`` directory.

    The file is named after the last path component of ``img_url``. A file
    that already exists locally is kept as-is and no request is sent for it.
    Responses with a status code other than 200 are reported and ignored.

    Args:
        img_url: Absolute URL of the image to fetch.
    """
    print(img_url)
    path = './img/%s' % getUrlFileName(img_url)
    if os.path.exists(path):
        # Already on disk: skip the request instead of downloading the image
        # again only to throw the bytes away.
        print("done")
        return

    h = {
        'Referer': Referer,
        'Host': "img.netbian.com",
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    }
    # stream=True defers fetching the body until r.content is read, so the
    # body of a non-200 response is never downloaded.
    r = requests.get(img_url, headers=h, stream=True)
    try:
        print(r.status_code)  # HTTP status code of the response
        if r.status_code == 200:
            # Create the target directory on first use instead of failing.
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, 'wb') as f:
                f.write(r.content)  # write the image bytes to disk
            print("done")
    finally:
        # Release the connection even when writing the file fails.
        r.close()


def getUrlFileName(url):
    """Return the file name (last path component) of a URL.

    Only the path part of the URL is considered, so a query string or
    fragment (``?v=1``, ``#top``) never ends up in the returned name.

    Args:
        url: URL or plain path of the resource.

    Returns:
        The final path component, or an empty string when there is none.
    """
    return os.path.basename(urlsplit(url).path)


def inserSql(type, small, big, title):
    """Insert one wallpaper record into the ``bizhi`` table.

    The values are passed to the driver as query parameters rather than
    being formatted into the SQL text, so quotes or other special characters
    in a value (for example in ``title``) cannot break or alter the
    statement.

    A failed insert is reported on stdout and not committed; the exception
    is not propagated to the caller.

    Args:
        type: Value for the ``type`` column.
        small: Value for the ``small`` column.
        big: Value for the ``big`` column.
        title: Value for the ``title`` column.
    """
    sql = "INSERT INTO bizhi(type, small, big, title) VALUES (%s, %s, %s, %s)"
    cursor = db.cursor()
    try:
        # Let the driver escape and quote the values.
        cursor.execute(sql, (type, small, big, title))
    except Exception as e:
        print("插入数据失败:", e)
    else:
        db.commit()
    finally:
        cursor.close()


def pa(url):
    """Fetch a page and return it parsed as a BeautifulSoup document.

    One of the header sets in the module-level ``headers`` list is picked at
    random for the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the raw response body with the
        ``html.parser`` backend.
    """
    # random.choice follows the actual length of the list instead of a
    # hard-coded index range.
    h = random.choice(headers)
    # The context manager closes the session (and its pooled connections)
    # once the body has been read.
    with requests.session() as s:
        response = s.get(url, headers=h).content
    return BeautifulSoup(response, 'html.parser')


def main(start_page=2, last_page=141):
    """Crawl the wallpaper list pages and store every wallpaper found.

    For each list page the thumbnail of every entry is downloaded; when the
    entry links to a site-relative detail page, that page is fetched, its
    full-size image is downloaded and a record is inserted into the database.
    Pages or entries whose expected elements are missing are skipped.

    Args:
        start_page: Number of the first list page to crawl (1 is the
            un-numbered ``index.htm`` page).
        last_page: Number of the last list page to crawl, inclusive.
    """
    for index in range(start_page, last_page + 1):
        time.sleep(3)  # pause between list pages to keep the load low
        if index != 1:
            url = 'http://www.netbian.com/dongman/index_%d.htm' % index
        else:
            url = 'http://www.netbian.com/dongman/index.htm'
        print(url)
        html = pa(url)
        ce = html.find("div", {"class": "list"})
        table = ce.find("ul") if ce is not None else None
        if table is None:
            # Unexpected page layout: report it and move on to the next page.
            print("skip", url)
            continue
        # Walk through every entry of this list page.
        for lis in table.find_all("li"):
            # BeautifulSoup exposes ``class`` as a list of names, so test for
            # membership. Skipping (rather than stopping) keeps any entries
            # that follow the "next page" tile.
            if "nextpage" in (lis.get("class") or []):
                continue
            li = lis.find("img")
            link = lis.find("a")
            if li is None or link is None:
                continue
            small = li.get("src")
            if not small:
                continue
            time.sleep(1)  # pause between entries
            # Download the thumbnail.
            download_img(small)
            title = li.get("alt", "")
            href = link.get("href", "")
            if not href.startswith('/'):
                # Only site-relative links point to a detail page.
                continue
            href = "http://www.netbian.com" + href
            imgHref = pa(href)
            pic = imgHref.find("div", {"class": "pic"})
            big_img = pic.find("img") if pic is not None else None
            big = big_img.get("src") if big_img is not None else None
            if not big:
                print("skip", href)
                continue
            download_img(big)
            inserSql('动漫', getUrlFileName(small), getUrlFileName(big), title)
        print("OK", index)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Close the database connection even when the crawl aborts with an
        # exception.
        db.close()