# 爬取“盗墓笔记”小说 (scrape the novel "The Grave Robbers' Chronicles" chapter by chapter)

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the site serves pages to this script
# instead of rejecting obviously non-browser clients.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}


def open_url(url):
    """Fetch *url* and return the response body decoded as GBK text.

    The target site serves GBK-encoded pages without a reliable charset
    header, so the encoding is forced instead of trusting requests'
    auto-detection. A timeout is set so a stalled connection cannot
    hang the interactive loop forever.
    """
    r = requests.get(url, headers=headers, timeout=30)
    r.encoding = 'gbk'
    return r.text


def get_title(html):
    """Extract the chapter title from a chapter page.

    Looks for <div class="h1title"> and returns its text with a trailing
    newline so the title sits on its own line in the output file.

    Raises AttributeError if the div is absent — e.g. when the requested
    chapter number maps to a non-existent page. TODO confirm the site's
    error-page markup before relying on that.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.find('div', class_='h1title')
    return title_tag.text + '\n'


def get_text(html):
    """Extract the chapter body text from a chapter page.

    The body lives in <div id="htmlContent">; returns its plain text.
    Raises AttributeError if that div is absent (bad chapter number).
    """
    soup = BeautifulSoup(html, 'lxml')
    text_tag = soup.find('div', id='htmlContent')
    return text_tag.text


def save(title, text):
    """Append one chapter (title, then body) to 盗墓笔记.txt as UTF-8.

    Append mode lets successive chapters accumulate in one file across
    loop iterations and across runs. Mode 'a' (not 'a+') — nothing is
    ever read back from the file here.
    """
    with open('盗墓笔记.txt', 'a', encoding='utf-8') as file:
        file.write(title)
        file.write(text)
    print('下载完成!')


def main():
    """Interactive loop: ask for a chapter number, download and save it.

    Chapter N lives at page id N + 78209 on this mirror, hence the
    offset. Repeats until the user answers anything other than 'y'.
    Note: int(input(...)) raises ValueError on non-numeric input — left
    unhandled, matching the script's throwaway nature.
    """
    while True:
        # Map the human-facing chapter number to the site's page id.
        num = int(input('请输入你想要下载第几章:')) + 78209
        url = 'http://www.taiuu.com/0/67/' + str(num) + '.html'
        html = open_url(url)
        title = get_title(html)
        text = get_text(html)
        save(title, text)
        repeat = input('请问还要继续下载吗?(y/n)')
        if repeat != 'y':
            # Print the farewell BEFORE breaking — in the original it sat
            # after `break` and was unreachable.
            print('已退出!')
            break


# Run the interactive downloader only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()

# 版权声明:本文为jsxxd原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 本文链接:https://www.cnblogs.com/jsxxd/p/13740812.html