爬取“全书网”《斗罗大陆》小说

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Fzy on 2018/12/27 17:14
import requests
import re
# HTTP request headers shared by all page fetches. The User-Agent value
# identifies the client as desktop Chrome 70 on Windows 10.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}


def get_html(first_url):
    """Download a page and return its body decoded as GBK.

    Args:
        first_url: URL of the page to fetch.

    Returns:
        The response body as text decoded with the GBK codec, or None when
        the request fails. Failures are printed, not raised.
    """
    try:
        # Without a timeout a single stalled connection would block the
        # whole download forever.
        r = requests.get(first_url, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(str(e) + '\n访问失败!')
        return None
    # Decode as GBK rather than whatever encoding requests inferred.
    r.encoding = 'gbk'
    return r.text


def get_info(html):
    """Extract the book title and author text from the index page markup.

    Args:
        html: HTML source of the book's index page.

    Returns:
        A dict with two keys:
        'title'  - text of the first <strong> element that follows the
                   opening <div class="chapName"> on the same line;
        'author' - text of the <span class="r"> that immediately follows
                   that opening div (presumably the author line).

    Raises:
        IndexError: If either field cannot be located in ``html``.
    """
    title = re.search(r'<div class="chapName">.*?<strong>(.*?)</strong>', html)
    author = re.search(r'<div class="chapName"><span class="r">(.*?)</span>', html)
    if title is None or author is None:
        # Keep the exception type that indexing an empty findall() result
        # would produce, but say what is actually missing.
        raise IndexError('book title/author not found in page HTML')
    return {'title': title.group(1), 'author': author.group(1)}


def get_urls(html):
    """Collect the chapter links listed in the index page's directory block.

    Args:
        html: HTML source of the book's index page.

    Returns:
        A list with the ``href`` value of every ``<a href="...">`` found
        inside the first ``<div class="clearfix dirconone">`` element, in
        document order.

    Raises:
        IndexError: If the directory block is not present in ``html``.
    """
    # re.S lets the block span several lines; re.I accepts any tag casing.
    block = re.search(r'<DIV class="clearfix dirconone">(.*?)</div>', html, re.S | re.I)
    if block is None:
        # Keep the exception type that indexing an empty findall() result
        # would produce, but say what is actually missing.
        raise IndexError('chapter directory block not found in page HTML')
    return re.findall(r'<a href="(.*?)"', block.group(1))


def save_text(urls, txt_info):
    """Download each chapter page and append its text to '<title>.txt'.

    The file first receives the title and author lines, then the paragraphs
    of every chapter in the order of ``urls``. The first paragraph of each
    saved chapter is printed as a progress indicator.

    Args:
        urls: Iterable of chapter page URLs.
        txt_info: Dict with 'title' and 'author' strings; the title also
            names the output file.
    """
    # Paragraphs start with four &nbsp; entities and normally end at <br />.
    paragraph_re = re.compile(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<br />')
    # One paragraph ends at an inline <script> tag instead (presumably the
    # chapter's last one); it is appended after the others.
    tail_re = re.compile(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">')

    # Open the file once instead of once per paragraph, and write UTF-8
    # explicitly so the result does not depend on the platform's default
    # encoding being able to represent the text.
    with open(txt_info['title'] + '.txt', 'a', encoding='utf-8') as file:
        file.write(txt_info['title'] + '\n\n')
        file.write(txt_info['author'] + '\n')
        print('正在下载《{}》全本小说(共六百八十七章),时间稍长,请稍等......'.format(txt_info['title']))
        for url in urls:
            html = get_html(url)
            if html is None:
                # The fetch failed and get_html has already reported it;
                # skip this chapter rather than crash inside re.
                continue
            text = paragraph_re.findall(html)
            tail = tail_re.search(html)
            if tail is not None:
                text.append(tail.group(1))
            if not text:
                # Page contained no recognizable paragraphs.
                continue
            file.writelines('    ' + line + '\n' for line in text)
            print(text[0])


def main():
    """Fetch the book's index page, then download all of its chapters."""
    first_url = 'http://www.quanshuwang.com/book/44/44683'
    html = get_html(first_url)
    if html is None:
        # The index page could not be fetched (get_html printed the error),
        # so there is nothing to parse.
        return
    txt_info = get_info(html)
    urls = get_urls(html)
    save_text(urls, txt_info)


# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

版权声明:本文为jsxxd原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/jsxxd/p/13740824.html