python爬虫爬小说,来不及解释了。
网上没新的txt下载,自己想办法下载来看,可以根据网页标签不同来修改。 还没入门,不智能慢慢研究吧。
bs4要装一下,pip install BeautifulSoup4
1 #coding utf-8 2 import urllib.request 3 from bs4 import BeautifulSoup 4 import time 5 import re 6 7 def get_html(url): 8 page = urllib.request.urlopen(url) 9 html = page.read() 10 # print(bytes.decode(html)) 11 return html 12 13 \'\'\' 14 page=\'https://www.xuehong.cc/book/36273/\' 15 p1 = BeautifulSoup(get_html(page).decode(\'utf-8\'), \'html.parser\') 16 p2=[] 17 for p in p1.find_all(\'a\',): 18 p2.append(p[\'href\']) 19 print(p2) 20 \'\'\' 21 22 p3=[\'/book/36273/31737154.html\', \'/book/36273/31737155.html\', \'/book/36273/31737156.html\', \'/book/36273/31737157.html\', \'/book/36273/31737158.html\', \'/book/36273/31737159.html\', \'/book/36273/31737160.html\', \'/book/36273/31737161.html\', \'/book/36273/31737162.html\', \'/book/36273/31737163.html\', \'/book/36273/31737164.html\', \'/book/36273/31737165.html\', \'/book/36273/31737166.html\', \'/book/36273/31737167.html\', \'/book/36273/31737168.html\', \'/book/36273/31737169.html\', \'/book/36273/31737170.html\', \'/book/36273/31863549.html\', \'/book/36273/32060318.html\', \'/book/36273/32060319.html\', \'/book/36273/32060320.html\', \'/book/36273/32157836.html\', \'/book/36273/32675620.html\', \'/book/36273/32693741.html\', \'/book/36273/32705629.html\', \'/book/36273/32720993.html\', \'/book/36273/32720995.html\', \'/book/36273/32751825.html\', \'/book/36273/32969531.html\', \'/book/36273/32969532.html\', \'/book/36273/32969533.html\', \'/book/36273/32969534.html\', \'/book/36273/32969535.html\', \'/book/36273/32969536.html\', \'/book/36273/32969537.html\', \'/book/36273/32969538.html\', \'/book/36273/32969539.html\', \'/book/36273/32969540.html\', \'/book/36273/32969541.html\', \'/book/36273/33178998.html\', \'/book/36273/33179002.html\', \'/book/36273/33179005.html\', \'/book/36273/33179008.html\', \'/book/36273/33415818.html\', \'/book/36273/33434196.html\', \'/book/36273/35213931.html\', \'/book/36273/35213932.html\', \'/book/36273/35213933.html\', \'/book/36273/35213934.html\', \'/book/36273/35213935.html\', \'/book/36273/35213936.html\', \'/book/36273/35213937.html\', \'/book/36273/35213938.html\', \'/book/36273/35213939.html\', \'/book/36273/35213940.html\', \'/book/36273/35213941.html\', \'/book/36273/35213942.html\', \'/book/36273/35213943.html\', \'/book/36273/35262823.html\', \'/book/36273/35318036.html\', \'/book/36273/35318037.html\', \'/book/36273/35362277.html\', \'/book/36273/35390213.html\', \'/book/36273/35397646.html\', \'/book/36273/35398640.html\', \'/book/36273/35410795.html\', \'/book/36273/35418366.html\', \'/book/36273/35454975.html\', \'/book/36273/35455295.html\', \'/book/36273/35456452.html\', \'/book/36273/35458123.html\', \'/book/36273/35488936.html\', \'/book/36273/35488937.html\', \'/book/36273/35495130.html\', \'/book/36273/35498675.html\', \'/book/36273/35503958.html\', \'/book/36273/35510595.html\', \'/book/36273/35510628.html\', \'/book/36273/35517338.html\', \'/book/36273/35522119.html\', \'/book/36273/35529846.html\', \'/book/36273/35536421.html\', \'/book/36273/35590637.html\', \'/book/36273/35590638.html\', \'/book/36273/35601859.html\', \'/book/36273/35657475.html\', \'/book/36273/35662329.html\', \'/book/36273/35675638.html\', \'/book/36273/35693345.html\', \'/book/36273/35693346.html\', \'/book/36273/35735160.html\', \'/book/36273/35740864.html\', \'/book/36273/35750550.html\', \'/book/36273/35754379.html\', \'/book/36273/35786823.html\'] 23 24 url=\'https://www.xuehong.cc/book/36273/31737154.html\' 25 i=0 26 for num in p3: 27 urlNum=\'https://www.xuehong.cc\'+p3[i] 28 29 soup = BeautifulSoup(get_html(urlNum).decode(\'utf-8\'), \'html.parser\') 30 for j in soup.find_all(\'h1\',): 31 print(j) 32 with open(\'F:\\book.txt\', \'a\',encoding=\'utf-8\') as f: # 设置文件对象 33 f.write(str(j)+"\n\n") 34 35 for k in soup.find_all(\'div\', id=\'content\'): 36 k1=str(k).replace(" ","") 37 k2=k1.replace("<br/><br/>","\n\n") 38 print(k2) 39 with open(\'F:\\book.txt\', \'a\',encoding=\'utf-8\') as f: # 设置文件对象 40 f.write(k2+"\n\n\n\n")
41 i=i+1
版权声明:本文为rood原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。