# python3爬小说然后转为PDF,用于手机小说阅读器观看
# (Crawl a web novel with python3 and convert it to PDF for mobile e-readers.)
# -*- coding: utf-8 -*-
#1,首先导入库
import requests
from bs4 import BeautifulSoup
import pdfkit
import lxml
import lxml.etree
import os
import os.path
from PyPDF2 import PdfFileReader, PdfFileWriter
#2.获取url列表
def get_url_list(url):
    """Fetch the novel's table-of-contents page and collect chapter URLs.

    :param url: URL of the index page (the site serves GBK-encoded HTML).
    :return: list of absolute chapter URLs, in page order.

    NOTE(review): relies on the module-level global ``url2`` as the base
    prefix joined onto each relative href — confirm it matches the site.
    """
    url_list = []
    body = requests.get(url)
    # Decode explicitly as GBK instead of trusting requests' charset guess;
    # errors="replace" keeps a stray bad byte from aborting the whole crawl.
    html = body.content.decode("gbk", errors="replace")
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select('a'):
        # Chapter links all start with "第" ("Chapter ..."); skip nav links.
        if tag.get_text().startswith("第"):
            href = tag.get("href")
            # Guard: an <a> without an href would otherwise make the
            # original code concatenate str + None and raise TypeError.
            if href:
                url_list.append(url2 + href)
    return url_list
#3,获取需要的文本和标题
def get_content(url):
    """Download one chapter page and render it into the HTML template.

    :param url: chapter URL
    :return: a complete HTML document string (global ``html_template``
             filled with the chapter's title tag and cleaned body)
    """
    page = requests.get(url)
    doc = BeautifulSoup(page.content, "html5lib")
    chapter_title = doc.select("#h1")[0]
    chapter_body = doc.select("#content")[0]
    # Strip the site's placeholder/ad marker before rendering.
    cleaned = str(chapter_body).replace("****", "")
    return html_template.format(content=cleaned, title=chapter_title)
#4,保存每一个到一个PDF
def save_pdf(html, filename):
    """Render an HTML string to a single PDF file via wkhtmltopdf.

    :param html: full HTML document to convert
    :param filename: output pdf file name
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        # Build PDF bookmarks from headings up to this depth.
        'outline-depth': 10,
    }
    # wkhtmltopdf is an external binary; point pdfkit at its install path
    # explicitly rather than depending on it being on PATH.
    path_wk = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    pdfkit.from_string(html, filename, options=options, configuration=config)
#5,获取同一个文件夹下的所有PDF文件名
def getFileName(filepath):
    """Recursively collect all file paths under *filepath*, sorted.

    The original returned paths in raw ``os.walk`` order, which is
    filesystem-dependent and made the merged PDF's chapter order
    unstable. Chapters are saved as "0.pdf", "1.pdf", ..., so sort
    numeric stems as integers ("10.pdf" after "2.pdf") and fall back to
    lexicographic order for anything else.

    :param filepath: directory to scan
    :return: deterministic sorted list of full file paths
    """
    def _order_key(path):
        stem = os.path.splitext(os.path.basename(path))[0]
        # Numeric stems first, as ints; others after, by full path.
        if stem.isdigit():
            return (0, int(stem), "")
        return (1, 0, path)

    file_list = []
    for root, dirs, files in os.walk(filepath):
        for name in files:
            file_list.append(os.path.join(root, name))
    file_list.sort(key=_order_key)
    return file_list
#6,合并同一个文件夹下所有PDF文件
def MergePDF(filepath, outfile):
    """Merge every PDF under *filepath* into one output file.

    :param filepath: directory containing the part PDFs
    :param outfile: name of the merged PDF, created inside *filepath*
    :return: None
    """
    output = PdfFileWriter()
    total_pages = 0
    # PyPDF2 reads page data lazily, so every source file must stay open
    # until output.write() has finished; track handles and close in finally
    # (the original leaked them all).
    handles = []
    try:
        for each in getFileName(filepath):
            print(each)
            src = open(each, "rb")
            handles.append(src)
            reader = PdfFileReader(src)
            # Encrypted PDFs must be decrypted before pages can be read.
            # NOTE(review): "map" looks like a site-specific password — confirm.
            if reader.isEncrypted:
                reader.decrypt("map")
            page_count = reader.getNumPages()
            total_pages += page_count
            print(page_count)
            for page_index in range(page_count):
                output.addPage(reader.getPage(page_index))
        print("All Pages Number:" + str(total_pages))
        # os.path.join works whether or not filepath has a trailing
        # separator (the original's bare concatenation required one).
        with open(os.path.join(filepath, outfile), "wb") as out_stream:
            output.write(out_stream)
    finally:
        for src in handles:
            src.close()
    print("finished")
if __name__ == '__main__':
    # NOTE(review): placeholder URLs — fill in the real site before running.
    url = 'http://www.'    # table-of-contents page
    # Base prefix for relative chapter hrefs; read as a global by get_url_list.
    url2 = 'http://www/'
    # Minimal page template; read as a global by get_content.
    html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{title}
{content}
</body>
</html>
"""
    # Crawl every chapter and save each one as k.pdf in the working dir.
    for k, url in enumerate(get_url_list(url)):
        print(url)
        html = get_content(url)
        print(html)
        save_pdf(html, str(k) + ".pdf")
    # Trailing separator kept because MergePDF concatenates filepath + outfile.
    # (A raw string cannot end in a backslash, hence the doubled form.)
    file_dir = 'C:\\Users\\Administrator\\Desktop\\dailyScript\\'
    out = "总裁.pdf"
    MergePDF(file_dir, out)
# 版权声明:本文为hyolyn原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# (Copyright notice: original article by hyolyn, CC 4.0 BY-SA — keep
#  attribution and this notice when reposting.)