1. 相关网址和库
网址
http://www.allitebooks.org/

需要用到的库
requests、beautifulsoup4、lxml(BeautifulSoup 使用的 'lxml' 解析器依赖它)
2. 代码实现
import requests
from lxml import etree
from bs4 import BeautifulSoup
import json
import csv
class BookSpider(object):
    """Scrape book summaries from allitebooks listing pages and save them.

    The spider builds the listing-page URLs, downloads each page, extracts
    one record per ``<article>`` element, and finally writes every record
    to ``book.json`` and ``book.csv`` in the current working directory.

    Args:
        page_count: Number of listing pages to crawl, starting at page 1.
            Defaults to 5.
    """

    # Seconds to wait on the server before a request is abandoned; without
    # a timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, page_count=5):
        self.base_url = 'http://www.allitebooks.com/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36 Edg/83.0.478.45'
        }
        self.page_count = page_count
        # One dict per scraped book, filled in by parse_bs4_data().
        self.data_list = []

    # 1. Build every listing-page URL.
    def get_url_list(self):
        """Return the listing-page URLs for pages 1..page_count."""
        return [
            self.base_url.format(page)
            for page in range(1, self.page_count + 1)
        ]

    # 2. Send the request.
    def send_request(self, url):
        """Download ``url`` and return the response body decoded as UTF-8.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        data = response.content.decode('utf-8')
        print(url)
        return data

    @staticmethod
    def _select_text(node, selector):
        """Return the text of the first match for ``selector``, or ''."""
        match = node.select_one(selector)
        return match.get_text() if match is not None else ''

    # 3. Parse the data.
    def parse_bs4_data(self, data):
        """Extract every book on one listing page into ``self.data_list``.

        Each ``<article>`` element yields a dict with the keys
        ``book_name``, ``book_img_url``, ``book_author`` and ``book_info``.
        An element missing from the page produces an empty string (or
        ``None`` for the cover URL) instead of aborting the whole crawl.

        Args:
            data: HTML source of a listing page.
        """
        bs4_data = BeautifulSoup(data, 'lxml')
        # 1) Collect every book on the page.
        for book in bs4_data.select('article'):
            # 2) Pull the individual fields out of each book.
            cover = book.select_one('.attachment-post-thumbnail')
            self.data_list.append({
                # Title
                'book_name': self._select_text(book, '.entry-title'),
                # Cover image URL
                'book_img_url': cover.get('src') if cover is not None else None,
                # Author
                'book_author': self._select_text(book, '.entry-author'),
                # Summary
                'book_info': self._select_text(book, '.entry-summary'),
            })

    # 4. Save the data.
    def save_data(self):
        """Write all collected records to ``book.json``."""
        # The context manager guarantees the file is flushed and closed.
        with open("book.json", 'w', encoding='utf-8') as json_file:
            json.dump(self.data_list, json_file)

    def save_data_csv(self):
        """Write all collected records to ``book.csv``.

        The header row is taken from the keys of the first record. When no
        records were collected there is no header to derive, so nothing is
        written.
        """
        if not self.data_list:
            return
        sheet_title = list(self.data_list[0].keys())
        sheet_data = [list(record.values()) for record in self.data_list]
        # newline='' lets the csv module control line endings itself;
        # without it Windows output gets an extra blank line per row.
        with open("book.csv", 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(sheet_title)
            writer.writerows(sheet_data)

    def run(self):
        """Crawl every listing page, then save the results as JSON and CSV."""
        # Request and parse each page in turn.
        for url in self.get_url_list():
            self.parse_bs4_data(self.send_request(url))
        self.save_data()
        self.save_data_csv()
if __name__ == '__main__':
    # Start the crawl only when executed as a script, never on import.
    spider = BookSpider()
    spider.run()
3. 小结
爬取电子书概要信息,保存为 json 或 csv 格式文件。
还可以进一步进入书目详情页,获取电子书详细信息及下载链接。
本文介绍了如何使用Python爬虫技术获取免费电子书的概要信息,包括涉及的网址和所需的requests、beautifulsoup4库。通过爬取,数据被保存为json或csv文件。还提及可以深入爬取书目详情以获取更多电子书的详细信息和下载链接。
 获取免费电子书信息&spm=1001.2101.3001.5002&articleId=106951229&d=1&t=3&u=4989e7ed6f5e4119915c5e9f449cdeb6)
1831

被折叠的 条评论
为什么被折叠?



