使用python制作epub
前期工作
- ebookLib库
- 关于该库,如果pypi版本太低,需要去gitlab上clone,然后运行python setup.py install
- zhconv库,主要用来简繁转换
- 一个允许抓取数据的小说网站novel-backup
- 一点点时间
开始制作
第一步、分析网站
根据自己抓取的网站,获取所有章节的链接
https://novels.novel-backup.cf/index/1558018541.json
根据获得的内容,对内容进行分析
(已省略部分数据)
[
{
"name": "41.成套的葡萄酒杯",
"id": 7460
},
{
"name": "42.烤肉午餐",
"id": 7550
}
]
里面的id就是下面章节内容的链接xx/yy.json的yy
再获取章节内容,对其内容进行分析
https://novels.novel-backup.cf/novels/93065.json
(已省略部分数据)
{
"code_id": 1558018541,
"title": "第1卷插圖",
"create_date": "2020-10-07 20:51:33",
"content": "<p><img src=\"https://live.staticflickr.com/65535/50431755246_afecb655fc_o.png[/img][/url][url=https://flic.kr/p/2jQtVPu]魔導具師ダリヤはうつむかない 1-0[/url] by [url=https://www.flickr.com/photos/55799173@N00/]jameslam518[/url], on Flickr\" class=\"fr-fic fr-dib\"></p><p><br></p>",
"author": "職業量地官",
"views": 2896
}
对于我们来说,有用的是title、content、author
第二步、抓取数据并清洗
ebookLib的章节顺序是按照add_item来排的,所以我们需要对抓取的章节进行排序。
首先新建一个py文件,然后新建一个类Espider
def getJson(self, url):
    """Fetch *url* and return its decoded JSON payload.

    Raises ``requests.HTTPError`` on a 4xx/5xx response instead of trying
    to decode an error page, and gives up after 30 seconds so a stalled
    connection cannot block a worker thread forever.
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    return res.json()
def getDictList(self, url):
    """Return the JSON document at *url*; the chapter index is a list of dicts."""
    chapter_index: typing.List[dict] = self.getJson(url)
    return chapter_index
def getFilter(self, li_list):
    """Return the chapter entries of *li_list* sorted by chapter number.

    Each returned dict is a copy of the input entry with three changes:
    ``name`` is converted to Simplified Chinese, ``num`` holds the chapter
    number used for sorting, and ``i`` holds the zero-based position in
    the sorted result as a string.

    The chapter number is taken from a ``"174. title"`` or ``"第3话 title"``
    style name. Entries without such a number are placed right after the
    highest number seen so far.
    """
    # The dot is escaped and only the integer part is captured, so names
    # such as "12.5 ..." no longer crash int() and "第10" is not read as 1.
    dotted_re = re.compile(r'([1-9]\d*)\.')
    hua_re = re.compile(r'第([1-9]\d*)话')

    maxx = 0
    id_dicts = []
    for li in li_list:
        idict = dict(li)  # copy: do not mutate the caller's entries
        idict['name'] = convert(idict['name'], 'zh-hans')
        found = dotted_re.search(idict['name']) or hua_re.search(idict['name'])
        if found:
            num = int(found.group(1))
            maxx = max(maxx, num)
        else:
            maxx += 1
            num = maxx
        idict['num'] = num
        id_dicts.append(idict)

    # list.sort is stable, so entries sharing a number keep index order.
    id_dicts.sort(key=lambda it: it['num'])
    for i, idict in enumerate(id_dicts):
        idict['i'] = str(i)
    return id_dicts
首先是获取数据,然后将数据转换格式(getJson,getDictList)
getFilter长长的代码简单理解就是:将章节列表中每个章节名(如"174. 疲勞與真心話"或"第3話 商業公會")前面的数字取出来作为排序用的序号num;如果某个章节名中不存在数字,就让这个章节的num=maxx+1(即当前最大序号加一),最后按num排序。
获取文章内容
对于图片需要特殊处理,先保存到本地后再添加到epub文件里
def getDict(self, url):
    """Return the JSON document at *url*; a chapter document is a single dict."""
    chapter_doc: dict = self.getJson(url)
    return chapter_doc
def saveImg(self, title, src):
    """Download the image at *src* into ``Images/<title>/`` and record it.

    Appends ``{'src': local_path, 'uid': file_stem}`` to ``self.img_list``
    and returns the local path, which the caller writes back into the
    ``<img>`` tag.

    Raises ``requests.HTTPError`` if the download fails.
    """
    # Characters that are not allowed in directory names on common
    # filesystems are replaced so a chapter title cannot break the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    path = 'Images/{}'.format(safe_title)
    # makedirs also creates the parent "Images" directory and tolerates
    # another thread having created the directory first.
    os.makedirs(path, exist_ok=True)

    # Some src attributes carry forum BBCode after the real URL
    # ("...png[/img][/url]..."); only the part before it is the image URL.
    url = src.split('[/img]')[0]
    # File name = last path segment, without any query string.
    name = url.split('?')[0].rstrip('/').rsplit('/', 1)[-1]
    file_path = '{}/{}'.format(path, name)

    res = requests.get(url, timeout=30)
    res.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(res.content)

    self.img_list.append({
        'src': file_path,
        'uid': name.split('.')[0],
    })
    return file_path
def contentCheck(self, title, content: str):
    """Localize every image in the chapter HTML and return the ``<body>`` markup.

    Each ``<img>`` with a ``src`` is downloaded through ``saveImg`` and its
    ``src`` is rewritten to the local path. Returns an empty string when
    *content* produces no body (e.g. empty content).
    """
    soup = BeautifulSoup(content, 'lxml')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # An <img> without src has nothing to download.
            continue
        img['src'] = self.saveImg(title, src)
    if soup.body is None:
        # str(None) would otherwise leak the text "None" into the chapter.
        return ''
    return str(soup.body)
def getContent(self, id):
    """Download chapter *id* and return its HTML with a credit line prepended.

    *id* may be a ``str`` or an ``int`` (the index JSON delivers ints).
    Author, title and content are converted to Simplified Chinese; images
    in the content are saved locally via ``contentCheck``.
    """
    # format() accepts both str and int ids; plain concatenation raised
    # TypeError for ints.
    url = 'https://novels.novel-backup.cf/novels/{}.json'.format(id)
    print(url)
    js = self.getDict(url)
    author = convert(js['author'], 'zh-hans')
    title = convert(js['title'], 'zh-hans')
    content = convert(js['content'], 'zh-hans')
    return '<p>搬运:' + author + '</p>' + self.contentCheck(title, content)
getDict是获取数据,然后getContent对js取出属性,使用contentCheck对内容处理,之后将其保存到List中。
第三步、保存到Epub中
新建一个ebook.py文件,将ebooklib和Espider引入
# Module-level state shared by the helper functions below.
book = epub.EpubBook()  # the EPUB being assembled
chp_list = []           # downloaded chapters, filled by the worker threads
toc = []                # table-of-contents links, in reading order
spine = ['nav']         # reading order; starts with the navigation document
def init(title, author):
    """Set the book metadata and add the navigation files and stylesheet.

    *title* and *author* are written to the metadata of the module-level
    ``book``.
    """
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    # 'cn' is a country code, not a language code; Chinese is 'zh'.
    book.set_language('zh')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous',
                    role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style (a ';' was missing after "border-radius:3px", which
    # invalidated that declaration together with margin-top)
    style = (
        'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;'
        'color:#263238;line-height:1.6;font-size:13px;border-radius:3px;'
        'margin-top:0;margin-bottom:1em;overflow:auto}'
        'b,strong{font-weight:bolder}'
        '#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}'
        'hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    )
    nav_css = epub.EpubItem(
        uid="style_nav", file_name="style/nav.css",
        media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)
def saveChapter():
    """Add the preface and every chapter in ``chp_list`` to book, TOC and spine."""

    def register(chapter):
        # A chapter must be added to the book, linked from the table of
        # contents and appended to the spine.
        book.add_item(chapter)
        toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
        spine.append(chapter)

    preface = getChapter(
        '前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    register(preface)
    for entry in chp_list:
        register(entry['chapter'])
def saveImage(img_list: typing.List[dict]):
    """Embed every downloaded image listed in *img_list* into the book.

    Each entry provides ``src`` (local file path, also used as the path
    inside the EPUB) and ``uid`` (item id).
    """
    import mimetypes

    for img in img_list:
        # The with-block closes the file handle; the original leaked it.
        with open(img['src'], 'rb') as f:
            image_content = f.read()
        # Derive the media type from the file extension, falling back to
        # the previously hard-coded PNG type.
        media_type = mimetypes.guess_type(img['src'])[0] or 'image/png'
        item = epub.EpubImage(uid=img['uid'], file_name=img['src'],
                              media_type=media_type, content=image_content)
        book.add_item(item)
def saveEpub(file_name):
    """Finalize TOC and spine and write the book to ``epub/<file_name>``."""
    import os

    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # The output directory may not exist on a fresh checkout.
    os.makedirs('epub', exist_ok=True)
    # write to the file
    epub.write_epub('epub/' + file_name, book, {})
def getChapter(title, content, id):
    """Build an ``EpubHtml`` chapter stored as ``chap_<id>.xhtml``.

    *title* becomes the chapter title and its heading, *content* is the
    chapter HTML, and *id* is the (zero-padded) string used in the file name.
    """
    # The text is Chinese; 'hr' (Croatian) was left over from sample code.
    c1 = epub.EpubHtml(title=title,
                       file_name='chap_' + id + '.xhtml', lang='zh')
    # The heading was "closed" with a second <h1> instead of </h1>.
    c1.content = '<h1>' + title + '</h1>' + content
    return c1
def poChapter(it, llen):
    """Thread worker: download one chapter and queue it in ``chp_list``.

    *it* is an entry produced by ``Espider.getFilter``; *llen* is the width
    used to zero-pad the chapter number in the file name.
    """
    seq = 1 + int(it['i'])  # 1-based; '000' is taken by the preface
    name = it['name']
    body = es.getContent(str(it['id']))
    chapter = getChapter(name, body, str(seq).zfill(llen))
    chp_list.append({'chapter': chapter, 'id': seq})
if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    # Width needed to zero-pad the largest chapter number.
    llen = len(str(len(id_dicts)))
    # One download thread per chapter. Iterating the list directly avoids
    # the IndexError the old "groups of 4" indexing (id_dicts[i+j]) raised
    # whenever the chapter count was not a multiple of 4.
    threads = [
        threading.Thread(target=poChapter, args=(it, llen))
        for it in id_dicts
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    # Threads finish in arbitrary order; restore reading order.
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')
init来自ebookLib官方文档给出的函数,str(i).zfill(llen)是对数字进行数位补齐,如’chap_002.xhtml’
引入threading是为了在爬虫的时候进行多线程,提高效率。
全部代码
# spider.py
import requests
from bs4 import BeautifulSoup
import typing
import re
import os
from zhconv import convert
class Espider:
    """Scraper for the novel-backup JSON API.

    Index URL example:   https://novels.novel-backup.cf/index/1558018541.json
    Chapter URL example: https://novels.novel-backup.cf/novels/7460.json
    """

    # Seconds to wait for the server before giving up on a request.
    _TIMEOUT = 30
    # Chapter number in "174. title" style names. The dot is escaped and
    # only the integer part is captured.
    _NUM_DOT_RE = re.compile(r'([1-9]\d*)\.')
    # Chapter number in "第3话 title" style names.
    _NUM_HUA_RE = re.compile(r'第([1-9]\d*)话')

    def __init__(self):
        # Downloaded images as {'src': local_path, 'uid': file_stem}.
        # Kept per instance; it used to be a class-level list shared by
        # every Espider object.
        self.img_list = []

    def getJson(self, url):
        """Fetch *url* and return its decoded JSON payload.

        Raises ``requests.HTTPError`` on a 4xx/5xx response.
        """
        res = requests.get(url, timeout=self._TIMEOUT)
        res.raise_for_status()
        return res.json()

    def getDict(self, url):
        """Return the JSON document at *url*; a chapter document is a dict."""
        js: dict = self.getJson(url)
        return js

    def getDictList(self, url):
        """Return the JSON document at *url*; the chapter index is a list of dicts."""
        js: typing.List[dict] = self.getJson(url)
        return js

    def saveImg(self, title, src):
        """Download the image at *src* into ``Images/<title>/`` and record it.

        Appends ``{'src': local_path, 'uid': file_stem}`` to
        ``self.img_list`` and returns the local path.

        Raises ``requests.HTTPError`` if the download fails.
        """
        # Replace characters that are not allowed in directory names on
        # common filesystems.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        path = 'Images/{}'.format(safe_title)
        # Creates the parent "Images" directory too and tolerates another
        # thread having created the directory first.
        os.makedirs(path, exist_ok=True)

        # Some src attributes carry forum BBCode after the real URL
        # ("...png[/img][/url]..."); only the part before it is the URL.
        url = src.split('[/img]')[0]
        # File name = last path segment, without any query string.
        name = url.split('?')[0].rstrip('/').rsplit('/', 1)[-1]
        file_path = '{}/{}'.format(path, name)

        res = requests.get(url, timeout=self._TIMEOUT)
        res.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(res.content)

        self.img_list.append({
            'src': file_path,
            'uid': name.split('.')[0],
        })
        return file_path

    def contentCheck(self, title, content: str):
        """Localize every image in the chapter HTML and return the ``<body>`` markup.

        Returns an empty string when *content* produces no body.
        """
        soup = BeautifulSoup(content, 'lxml')
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                # An <img> without src has nothing to download.
                continue
            img['src'] = self.saveImg(title, src)
        if soup.body is None:
            # str(None) would otherwise leak the text "None" into the chapter.
            return ''
        return str(soup.body)

    def getContent(self, id):
        """Download chapter *id* and return its HTML with a credit line prepended.

        *id* may be a ``str`` or an ``int``. Author, title and content are
        converted to Simplified Chinese.
        """
        url = 'https://novels.novel-backup.cf/novels/{}.json'.format(id)
        print(url)
        js = self.getDict(url)
        author = convert(js['author'], 'zh-hans')
        title = convert(js['title'], 'zh-hans')
        content = convert(js['content'], 'zh-hans')
        return '<p>搬运:' + author + '</p>' + self.contentCheck(title, content)

    @staticmethod
    def _chapter_number(name):
        """Return the chapter number found in *name*, or ``None`` if there is none."""
        found = Espider._NUM_DOT_RE.search(name) or Espider._NUM_HUA_RE.search(name)
        return int(found.group(1)) if found else None

    def getFilter(self, li_list):
        """Return the chapter entries of *li_list* sorted by chapter number.

        Each returned dict is a copy of the input entry with ``name``
        converted to Simplified Chinese, ``num`` set to the sort number and
        ``i`` set to the zero-based sorted position as a string. Entries
        without a recognizable number are placed right after the highest
        number seen so far.
        """
        maxx = 0
        id_dicts = []
        for li in li_list:
            idict = dict(li)  # copy: do not mutate the caller's entries
            idict['name'] = convert(idict['name'], 'zh-hans')
            num = self._chapter_number(idict['name'])
            if num is None:
                maxx += 1
                num = maxx
            else:
                maxx = max(maxx, num)
            idict['num'] = num
            id_dicts.append(idict)

        # list.sort is stable, so entries sharing a number keep index order.
        id_dicts.sort(key=lambda it: it['num'])
        for i, idict in enumerate(id_dicts):
            idict['i'] = str(i)
        return id_dicts

    def getIdList(self, li_list):
        """Return the ``id`` of every entry in *li_list* as a string."""
        id_list: typing.List[str] = [str(it['id']) for it in li_list]
        return id_list
if __name__ == "__main__":
    # Manual smoke test: fetch two chapters and print the resulting HTML.
    print("爬取开始")
    es = Espider()
    for chapter_id in ('112353', '16733'):
        print(es.getContent(chapter_id))
    print('爬取结束')
# ebook.py
import threading
import typing
from ebooklib import epub
from spider import Espider
# Module-level state shared by the helper functions below.
book = epub.EpubBook()  # the EPUB being assembled
chp_list = []           # downloaded chapters, filled by the worker threads
toc = []                # table-of-contents links, in reading order
spine = ['nav']         # reading order; starts with the navigation document
def init(title, author):
    """Set the book metadata and add the navigation files and stylesheet.

    *title* and *author* are written to the metadata of the module-level
    ``book``.
    """
    # set metadata
    book.set_identifier('id123456')
    book.set_title(title)
    # 'cn' is a country code, not a language code; Chinese is 'zh'.
    book.set_language('zh')
    book.add_author(author)
    book.add_author('Anonymous', file_as='Anonymous',
                    role='ill', uid='coauthor')
    # add default NCX and Nav file
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    # define CSS style (a ';' was missing after "border-radius:3px", which
    # invalidated that declaration together with margin-top)
    style = (
        'pre{white-space:pre-wrap;background:#f7f9fa;padding:10px 15px;'
        'color:#263238;line-height:1.6;font-size:13px;border-radius:3px;'
        'margin-top:0;margin-bottom:1em;overflow:auto}'
        'b,strong{font-weight:bolder}'
        '#title{font-size:16px;color:#212121;font-weight:600;margin-bottom:10px}'
        'hr{height:10px;border:0;box-shadow:0 10px 10px -10px #8c8b8b inset}'
    )
    nav_css = epub.EpubItem(
        uid="style_nav", file_name="style/nav.css",
        media_type="text/css", content=style)
    # add CSS file
    book.add_item(nav_css)
def saveChapter():
    """Add the preface and every chapter in ``chp_list`` to book, TOC and spine."""

    def register(chapter):
        # A chapter must be added to the book, linked from the table of
        # contents and appended to the spine.
        book.add_item(chapter)
        toc.append(epub.Link(chapter.file_name, chapter.title, chapter.title))
        spine.append(chapter)

    preface = getChapter(
        '前言', '<p>使用python ebooklib整合,数据来源https://novel-backup.cf/,仅供参考请勿商用</p>', '000')
    register(preface)
    for entry in chp_list:
        register(entry['chapter'])
def saveImage(img_list: typing.List[dict]):
    """Embed every downloaded image listed in *img_list* into the book.

    Each entry provides ``src`` (local file path, also used as the path
    inside the EPUB) and ``uid`` (item id).
    """
    import mimetypes

    for img in img_list:
        # The with-block closes the file handle; the original leaked it.
        with open(img['src'], 'rb') as f:
            image_content = f.read()
        # Derive the media type from the file extension, falling back to
        # the previously hard-coded PNG type.
        media_type = mimetypes.guess_type(img['src'])[0] or 'image/png'
        item = epub.EpubImage(uid=img['uid'], file_name=img['src'],
                              media_type=media_type, content=image_content)
        book.add_item(item)
def saveEpub(file_name):
    """Finalize TOC and spine and write the book to ``epub/<file_name>``."""
    import os

    # define Table Of Contents
    book.toc = tuple(toc)
    # basic spine
    book.spine = spine
    # The output directory may not exist on a fresh checkout.
    os.makedirs('epub', exist_ok=True)
    # write to the file
    epub.write_epub('epub/' + file_name, book, {})
def getChapter(title, content, id):
    """Build an ``EpubHtml`` chapter stored as ``chap_<id>.xhtml``.

    *title* becomes the chapter title and its heading, *content* is the
    chapter HTML, and *id* is the (zero-padded) string used in the file name.
    """
    # The text is Chinese; 'hr' (Croatian) was left over from sample code.
    c1 = epub.EpubHtml(title=title,
                       file_name='chap_' + id + '.xhtml', lang='zh')
    # The heading was "closed" with a second <h1> instead of </h1>.
    c1.content = '<h1>' + title + '</h1>' + content
    print("g", c1.file_name, c1.title, id)
    return c1
def poChapter(it, llen):
    """Thread worker: download one chapter and queue it in ``chp_list``.

    *it* is an entry produced by ``Espider.getFilter``; *llen* is the width
    used to zero-pad the chapter number in the file name.
    """
    seq = 1 + int(it['i'])  # 1-based; '000' is taken by the preface
    name = it['name']
    body = es.getContent(str(it['id']))
    chapter = getChapter(name, body, str(seq).zfill(llen))
    chp_list.append({'chapter': chapter, 'id': seq})
if __name__ == '__main__':
    init('魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~', '自动叉积·整合')
    es = Espider()
    li_url = 'https://novels.novel-backup.cf/index/1558018541.json'
    li_list = es.getDictList(li_url)
    id_dicts = es.getFilter(li_list)
    # Width needed to zero-pad the largest chapter number.
    llen = len(str(len(id_dicts)))
    # One download thread per chapter. Iterating the list directly avoids
    # the IndexError the old "groups of 4" indexing (id_dicts[i+j]) raised
    # whenever the chapter count was not a multiple of 4.
    threads = [
        threading.Thread(target=poChapter, args=(it, llen))
        for it in id_dicts
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('Main thread has ended!')
    # Threads finish in arbitrary order; restore reading order.
    chp_list.sort(key=lambda it: it['id'])
    saveChapter()
    saveImage(es.img_list)
    saveEpub('《魔導具師妲莉雅不會低頭 ~從今天開始自由的職人生活~》.epub')

999

被折叠的 条评论
为什么被折叠?



