爬虫批量自动下载小说
下载排行榜的所有小说
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss
"""Batch-download novels from www.xxbiquge.com.

Walks the site's ranking page, follows every listed book to its chapter
index, and appends each chapter's cleaned text to ``books/<book title>.txt``.
"""

import os
import time

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}


def get_text(url, title1):
    """Download one chapter page and append it to ``books/<title1>.txt``.

    :param url: absolute URL of a single chapter page
    :param title1: book title, used as the output file name
    """
    resp = requests.get(url, headers=headers)
    time.sleep(0.5)  # throttle so we don't hammer the server
    # The site serves UTF-8 but requests guesses ISO-8859-1 from the headers;
    # setting the encoding explicitly replaces the fragile
    # .encode('ISO-8859-1').decode('utf-8') round-trip of the original.
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')
    text = soup.select('div.content_read > div > div#content')[0].text
    title2 = soup.select('div.content_read > div > div.bookname > h1')[0].text

    # Strip the injected ad snippet and layout whitespace.  The original
    # tested each character for membership in the literal string
    # "' \n','\xa0','readx();'", which also deleted every r/e/a/d/x/(/)/;/,
    # character from the chapter body itself -- a bug.
    text = text.replace('readx();', '')
    text = ''.join(ch for ch in text if ch not in (' ', '\n', '\xa0'))

    path = os.path.join('books', title1 + '.txt')  # portable, unlike '.\\books\\'
    with open(path, 'ab+') as f:
        # Write the chapter heading (the original wrote the book title here
        # before every chapter, while printing the chapter title below).
        f.write((title2 + '\r\n').encode())
        f.write(text.encode())
        f.write('\r\n\r\n'.encode())
    print('正在下载{}'.format(title2))


def get_one_links(url):
    """Download every chapter of the book whose index page is *url*."""
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')
    chapters = soup.select('div#list > dl > dd')
    title = soup.select('div#maininfo > div#info > h1')[0].text
    print('开始下载{}'.format(title))
    for dd in chapters:
        for a in dd.select('a'):
            get_text('https://www.xxbiquge.com' + a.get('href'), title)


def get_all():
    """Walk the ranking page and download every book it lists."""
    url = 'https://www.xxbiquge.com/xbqgph.html'
    resp = requests.get(url, headers=headers)
    time.sleep(0.5)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')
    for li in soup.select('div.novelslist2 > ul > li'):
        for a in li.select('span.s2 > a'):
            # Bug fix: the original built the URL from data[0] inside this
            # loop, so a list item with several links downloaded the first
            # book repeatedly instead of each one.
            get_one_links('https://www.xxbiquge.com' + a.get('href'))


if __name__ == '__main__':
    # exist_ok avoids the crash the original hit if ./books already existed
    # between the check and the mkdir.
    os.makedirs('books', exist_ok=True)
    get_all()
版权声明:本文为ssxsy原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。