爬取《坏蛋是怎样炼成的》

# 导入第三方库
import requests
from bs4 import BeautifulSoup
# Request headers that present a desktop-browser User-Agent (the original
# note describes this as an anti-scraping workaround).
# The value is built with implicit string concatenation instead of a
# backslash continuation inside the literal, so the indentation of the
# second source line no longer ends up inside the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    ),
}


def open_url(url, timeout=10):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        The page content as ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as
    # if it were a chapter.
    r.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # derived from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text


def get_title(tit):
    """Extract the chapter title from a page's HTML.

    Args:
        tit: HTML source of the chapter page.

    Returns:
        The text of the ``<h2>`` inside the first ``<div class="post_title">``,
        wrapped in a leading and a trailing newline.

    Raises:
        IndexError: If the page has no ``<div class="post_title">``.
        AttributeError: If that div contains no ``<h2>``.
    """
    soup = BeautifulSoup(tit, 'lxml')
    title = soup.find_all('div', class_='post_title')[0].h2.get_text()
    return '\n' + title + '\n'


def get_text(txt):
    """Extract the paragraph tags of the chapter body from a page's HTML.

    Only the first ``<div class="post_entry">`` is used, which matches the
    effective behaviour of the previous loop-with-return.

    Args:
        txt: HTML source of the chapter page.

    Returns:
        A list of the ``<p>`` tags found inside the first
        ``<div class="post_entry">``; an empty list when the page has no
        such div (previously ``None`` was returned implicitly, which broke
        callers that iterate over the result).
    """
    soup = BeautifulSoup(txt, 'lxml')
    entries = soup.find_all('div', class_='post_entry')
    if not entries:
        return []
    return entries[0].find_all('p')


def save_title(filename, tit):
    """Append the chapter title to ``<filename>.txt``.

    Args:
        filename: Output path without the ``.txt`` extension.
        tit: Title text to append.
    """
    # Explicit UTF-8 so the Chinese text is written identically on every
    # platform instead of depending on the locale's default codec.
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tit)


def save_text(filename, tex, num):
    """Append chapter text to ``<filename>.txt``.

    Args:
        filename: Output path without the ``.txt`` extension.
        tex: Text to append.
        num: Chapter number; not used by this function, kept so existing
            callers keep working.
    """
    # Same encoding as save_title: both functions append to the same file.
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tex)


def main():
    """Ask for a chapter number, download that chapter and append it to disk.

    The chapter page is fetched from ``http://www.huaidan1.com/<num>.html``;
    its title and paragraphs are appended to ``坏蛋是怎样炼成的.txt`` in the
    current working directory. Invalid input (non-numeric, zero or negative)
    is reported and the function returns without downloading anything.
    """
    answer = input('你想要下载《坏蛋是怎样炼成的》第几章?(共346章节)')
    try:
        num = int(answer)
    except ValueError:
        num = 0
    if num < 1:
        # Previously a non-numeric answer crashed with a traceback and a
        # non-positive one produced a meaningless URL.
        print('请输入有效的章节编号(正整数)!')
        return

    filename = '坏蛋是怎样炼成的'
    url = 'http://www.huaidan1.com/' + str(num) + '.html'
    text = open_url(url)
    tit = get_title(text)
    # get_text may yield nothing when the page has no body block.
    tex = get_text(text) or []
    save_title(filename, tit)
    # Build the whole chapter once and write it with a single call instead
    # of reopening the output file for every paragraph.
    body = ''.join('    ' + p.get_text() + '\n' for p in tex)
    if body:
        save_text(filename, body, num)
    print('第{}章已经下载完成!'.format(num))


# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

版权声明:本文为jsxxd原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/jsxxd/p/13740838.html