python爬虫之爬取小说(四)
爬取《坏蛋是怎样炼成的》
# Third-party libraries
import requests
from bs4 import BeautifulSoup

# Browser-like headers so the site does not reject the request as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}


def open_url(url):
    """Download a page and return its decoded text.

    The response encoding is replaced by the one detected from the body
    (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as a string.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail here on an HTTP error instead of parsing an error page later.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def get_title(tit):
    """Extract the chapter title from the page HTML.

    Args:
        tit: HTML source of the chapter page.

    Returns:
        The text of the ``<h2>`` inside the first ``div.post_title``,
        wrapped in a leading and a trailing newline.

    Raises:
        IndexError: If the page contains no ``div.post_title`` element.
    """
    soup = BeautifulSoup(tit, 'lxml')
    title = soup.find_all('div', class_="post_title")[0].h2.get_text()
    return '\n' + title + '\n'


def get_text(txt):
    """Collect the paragraph tags that make up the chapter body.

    Args:
        txt: HTML source of the chapter page.

    Returns:
        A list with every ``<p>`` tag found inside ``div.post_entry``
        containers, in document order. The list is empty when the page has
        no such container (previously this case left the result variable
        unassigned).
    """
    soup = BeautifulSoup(txt, 'lxml')
    paragraphs = []
    for entry in soup.find_all('div', class_="post_entry"):
        paragraphs.extend(entry.find_all('p'))
    return paragraphs


def save_title(filename, tit):
    """Append the chapter title to ``<filename>.txt``.

    Args:
        filename: Output file name without the ``.txt`` extension.
        tit: Title text to append.
    """
    # Explicit UTF-8: the platform default codec may be unable to encode
    # some characters found in scraped pages.
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tit)


def save_text(filename, tex, num=None):
    """Append chapter text to ``<filename>.txt``.

    Args:
        filename: Output file name without the ``.txt`` extension.
        tex: Text to append.
        num: Unused; kept so existing three-argument calls keep working.
    """
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tex)


def main():
    """Ask for a chapter number, download that chapter and append it to a file."""
    raw = input('你想要下载《坏蛋是怎样炼成的》第几章?(共346章节)')
    try:
        num = int(raw)
    except ValueError:
        num = 0
    if num < 1:
        # Non-numeric or non-positive input cannot name a chapter page.
        print('请输入有效的章节数字!')
        return

    filename = '坏蛋是怎样炼成的'
    url = 'http://www.huaidan1.com/' + str(num) + '.html'
    try:
        text = open_url(url)
    except requests.RequestException as err:
        print('第{}章下载失败:{}'.format(num, err))
        return

    tit = get_title(text)
    tex = get_text(text)
    if not tex:
        # Do not write a bare title and then report success.
        print('第{}章没有找到正文内容!'.format(num))
        return

    save_title(filename, tit)
    # Build the whole chapter body first so the file is opened only once.
    body = ''.join(' ' + p.get_text() + '\n' for p in tex)
    save_text(filename, body, num)
    print('第{}章已经下载完成!'.format(num))


if __name__ == '__main__':
    main()
版权声明:本文为jsxxd原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。