Python爬虫之爬取小说(二)
爬取“全书网”《斗罗大陆》小说
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Fzy on 2018/12/27 17:14
"""Download a whole novel from quanshuwang.com into a local text file.

The script fetches the book's index page, extracts the title, the author
line and the chapter links with regular expressions, then downloads every
chapter and appends its paragraphs to ``<title>.txt``.
"""
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}

# Seconds to wait for the server before giving up on a request; without a
# timeout a single stalled connection would block the download forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once here instead of on every chapter iteration.
_TITLE_RE = re.compile(r'<div class="chapName">.*?<strong>(.*?)</strong>')
_AUTHOR_RE = re.compile(r'<div class="chapName"><span class="r">(.*?)</span>')
_CHAPTER_LIST_RE = re.compile(
    r'<DIV class="clearfix dirconone">(.*?)</div>', re.S | re.I)
_HREF_RE = re.compile(r'<a href="(.*?)"')
# NOTE(review): the leading whitespace in the two chapter patterns below may
# originally have been "&nbsp;" entities that were lost when this listing was
# published -- confirm against the live page markup before relying on them.
_PARAGRAPH_RE = re.compile(r' (.*?)<br />')
_LAST_PARAGRAPH_RE = re.compile(r' (.*?)<script type="text/javascript">')


def get_html(first_url):
    """Fetch *first_url* and return the response body decoded as GBK.

    Returns ``None`` after printing the error when the request fails
    (connection problem, timeout or HTTP error status), so callers must
    check the result before parsing it.
    """
    try:
        r = requests.get(first_url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        r.raise_for_status()
    except requests.RequestException as e:
        peat = str(e) + '\n访问失败!'
        print(peat)
        return None
    # The body is decoded as GBK regardless of what the response headers say.
    r.encoding = 'gbk'
    return r.text


def get_info(html):
    """Return a dict with the book's ``'title'`` and ``'author'`` strings.

    Raises ``IndexError`` when *html* does not contain the expected
    ``chapName`` markup.
    """
    txt_info = {}
    txt_info['title'] = _TITLE_RE.findall(html)[0]
    txt_info['author'] = _AUTHOR_RE.findall(html)[0]
    return txt_info


def get_urls(html):
    """Return the chapter link targets found in the index page *html*.

    The links are the ``href`` values inside the first
    ``clearfix dirconone`` block. Raises ``IndexError`` when that block is
    not present.
    """
    li_tags = _CHAPTER_LIST_RE.findall(html)[0]
    return _HREF_RE.findall(li_tags)


def save_text(urls, txt_info):
    """Download every chapter in *urls* and append it to ``<title>.txt``.

    The file is written as UTF-8 and opened once for the whole run. A
    chapter whose page cannot be fetched is skipped rather than aborting
    the download. The first paragraph of each saved chapter is printed as
    a progress indicator.
    """
    path = txt_info['title'] + '.txt'
    # An explicit encoding keeps the output independent of the platform's
    # default codec, which may not be able to represent Chinese text.
    with open(path, 'a', encoding='utf-8') as file:
        file.write(txt_info['title'] + '\n\n')
        file.write(txt_info['author'] + '\n')
        print('正在下载《{}》全本小说(共六百八十七章),时间稍长,请稍等......'.format(txt_info['title']))
        for url in urls:
            html = get_html(url)
            if html is None:
                # get_html() has already reported the failure.
                continue
            text = _PARAGRAPH_RE.findall(html)
            # The closing paragraph is followed by a <script> tag rather
            # than <br />, so it needs its own pattern.
            last = _LAST_PARAGRAPH_RE.search(html)
            if last is not None:
                text.append(last.group(1))
            file.writelines(' ' + paragraph + '\n' for paragraph in text)
            if text:
                print(text[0])


def main():
    """Download the novel whose index page is hard-coded below."""
    first_url = 'http://www.quanshuwang.com/book/44/44683'
    html = get_html(first_url)
    if html is None:
        # Nothing to parse; get_html() has already printed the reason.
        return
    txt_info = get_info(html)
    urls = get_urls(html)
    save_text(urls, txt_info)


if __name__ == '__main__':
    main()
版权声明:本文为jsxxd原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。