# python3爬小说然后转为PDF,用于手机小说阅读器观看
# (Crawl a web novel with python3 and convert it to PDF for mobile e-readers.)
# -*- coding: utf-8 -*-
#1,首先导入库
import requests
from bs4 import BeautifulSoup
import pdfkit
import lxml
import lxml.etree
import os
import os.path
from PyPDF2 import PdfFileReader, PdfFileWriter
#2.获取url列表
def get_url_list(url):
    """Fetch the novel's table-of-contents page and collect chapter URLs.

    :param url: URL of the index page (the site serves GBK-encoded HTML).
    :return: list of absolute chapter URLs, in page order.

    NOTE(review): relies on the module-level global ``url2`` as the base
    prefix joined onto each relative href — confirm it matches the site.
    """
    url_list = []
    body = requests.get(url)
    # Decode explicitly as GBK instead of trusting requests' charset guess;
    # errors="replace" keeps a stray bad byte from aborting the whole crawl.
    html = body.content.decode("gbk", errors="replace")
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select('a'):
        # Chapter links all start with "第" ("Chapter ..."); skip nav links.
        if tag.get_text().startswith("第"):
            href = tag.get("href")
            # Guard: an <a> without an href would otherwise make the
            # original code concatenate str + None and raise TypeError.
            if href:
                url_list.append(url2 + href)
    return url_list
#3,获取需要的文本和标题
def get_content(url):
    """Download one chapter page and render it into the HTML template.

    :param url: chapter URL
    :return: a complete HTML document string (global ``html_template``
             filled with the chapter's title tag and cleaned body)
    """
    page = requests.get(url)
    doc = BeautifulSoup(page.content, "html5lib")
    chapter_title = doc.select("#h1")[0]
    chapter_body = doc.select("#content")[0]
    # Strip the site's placeholder/ad marker before rendering.
    cleaned = str(chapter_body).replace("****", "")
    return html_template.format(content=cleaned, title=chapter_title)
#4,保存每一个到一个PDF
def save_pdf(html, filename):
    """Render an HTML string to a single PDF file via wkhtmltopdf.

    :param html: full HTML document to convert
    :param filename: output pdf file name
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        # Build PDF bookmarks from headings up to this depth.
        'outline-depth': 10,
    }
    # wkhtmltopdf is an external binary; point pdfkit at its install path
    # explicitly rather than depending on it being on PATH.
    path_wk = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    pdfkit.from_string(html, filename, options=options, configuration=config)
#5,获取同一个文件夹下的所有PDF文件名
def getFileName(filepath):
    """Recursively collect all file paths under *filepath*, sorted.

    The original returned paths in raw ``os.walk`` order, which is
    filesystem-dependent and made the merged PDF's chapter order
    unstable. Chapters are saved as "0.pdf", "1.pdf", ..., so sort
    numeric stems as integers ("10.pdf" after "2.pdf") and fall back to
    lexicographic order for anything else.

    :param filepath: directory to scan
    :return: deterministic sorted list of full file paths
    """
    def _order_key(path):
        stem = os.path.splitext(os.path.basename(path))[0]
        # Numeric stems first, as ints; others after, by full path.
        if stem.isdigit():
            return (0, int(stem), "")
        return (1, 0, path)

    file_list = []
    for root, dirs, files in os.walk(filepath):
        for name in files:
            file_list.append(os.path.join(root, name))
    file_list.sort(key=_order_key)
    return file_list
#6,合并同一个文件夹下所有PDF文件
def MergePDF(filepath, outfile):
    """Merge every PDF under *filepath* into one output file.

    :param filepath: directory containing the part PDFs
    :param outfile: name of the merged PDF, created inside *filepath*
    :return: None
    """
    output = PdfFileWriter()
    total_pages = 0
    # PyPDF2 reads page data lazily, so every source file must stay open
    # until output.write() has finished; track handles and close in finally
    # (the original leaked them all).
    handles = []
    try:
        for each in getFileName(filepath):
            print(each)
            src = open(each, "rb")
            handles.append(src)
            reader = PdfFileReader(src)
            # Encrypted PDFs must be decrypted before pages can be read.
            # NOTE(review): "map" looks like a site-specific password — confirm.
            if reader.isEncrypted:
                reader.decrypt("map")
            page_count = reader.getNumPages()
            total_pages += page_count
            print(page_count)
            for page_index in range(page_count):
                output.addPage(reader.getPage(page_index))
        print("All Pages Number:" + str(total_pages))
        # os.path.join works whether or not filepath has a trailing
        # separator (the original's bare concatenation required one).
        with open(os.path.join(filepath, outfile), "wb") as out_stream:
            output.write(out_stream)
    finally:
        for src in handles:
            src.close()
    print("finished")
if __name__ == '__main__':
    # NOTE(review): placeholder URLs — fill in the real site before running.
    url = 'http://www.'    # table-of-contents page
    # Base prefix for relative chapter hrefs; read as a global by get_url_list.
    url2 = 'http://www/'
    # Minimal page template; read as a global by get_content.
    html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{title}
{content}
</body>
</html>
"""
    # Crawl every chapter and save each one as k.pdf in the working dir.
    for k, url in enumerate(get_url_list(url)):
        print(url)
        html = get_content(url)
        print(html)
        save_pdf(html, str(k) + ".pdf")
    # Trailing separator kept because MergePDF concatenates filepath + outfile.
    # (A raw string cannot end in a backslash, hence the doubled form.)
    file_dir = 'C:\\Users\\Administrator\\Desktop\\dailyScript\\'
    out = "总裁.pdf"
    MergePDF(file_dir, out)
# 版权声明:本文为hyolyn原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# (Copyright notice: original article by hyolyn, CC 4.0 BY-SA — keep
#  attribution and this notice when reposting.)