网上找不到这本小说最新的 txt 下载,于是自己写了个脚本把各章节抓下来看;换成别的网站时,可以根据网页标签的不同修改解析部分。 刚入门,脚本还不够智能,以后再慢慢研究改进。

需要先安装 bs4:pip install beautifulsoup4

 1 # -*- coding: utf-8 -*-
 2 import urllib.request
 3 from bs4 import BeautifulSoup
 4 import time
 5 import re
 6 
 7 def get_html(url):
 8     page = urllib.request.urlopen(url)
 9     html = page.read()
10 #    print(bytes.decode(html))
11     return html
12 
# Chapter paths of the book, relative to the site root
# (https://www.xuehong.cc), in the order they are downloaded.
#
# NOTE(review): the list was presumably captured once from the book's index
# page with the snippet below and then pasted here; the snippet is kept
# disabled so the index is not re-scraped on every run. Be aware that it
# collects the href of *every* <a> tag (and raises KeyError on an <a>
# without href), so its output needs filtering before use.
#
#     page = 'https://www.xuehong.cc/book/36273/'
#     p1 = BeautifulSoup(get_html(page).decode('utf-8'), 'html.parser')
#     p2 = []
#     for p in p1.find_all('a'):
#         p2.append(p['href'])
#     print(p2)
p3 = [
    '/book/36273/31737154.html',
    '/book/36273/31737155.html',
    '/book/36273/31737156.html',
    '/book/36273/31737157.html',
    '/book/36273/31737158.html',
    '/book/36273/31737159.html',
    '/book/36273/31737160.html',
    '/book/36273/31737161.html',
    '/book/36273/31737162.html',
    '/book/36273/31737163.html',
    '/book/36273/31737164.html',
    '/book/36273/31737165.html',
    '/book/36273/31737166.html',
    '/book/36273/31737167.html',
    '/book/36273/31737168.html',
    '/book/36273/31737169.html',
    '/book/36273/31737170.html',
    '/book/36273/31863549.html',
    '/book/36273/32060318.html',
    '/book/36273/32060319.html',
    '/book/36273/32060320.html',
    '/book/36273/32157836.html',
    '/book/36273/32675620.html',
    '/book/36273/32693741.html',
    '/book/36273/32705629.html',
    '/book/36273/32720993.html',
    '/book/36273/32720995.html',
    '/book/36273/32751825.html',
    '/book/36273/32969531.html',
    '/book/36273/32969532.html',
    '/book/36273/32969533.html',
    '/book/36273/32969534.html',
    '/book/36273/32969535.html',
    '/book/36273/32969536.html',
    '/book/36273/32969537.html',
    '/book/36273/32969538.html',
    '/book/36273/32969539.html',
    '/book/36273/32969540.html',
    '/book/36273/32969541.html',
    '/book/36273/33178998.html',
    '/book/36273/33179002.html',
    '/book/36273/33179005.html',
    '/book/36273/33179008.html',
    '/book/36273/33415818.html',
    '/book/36273/33434196.html',
    '/book/36273/35213931.html',
    '/book/36273/35213932.html',
    '/book/36273/35213933.html',
    '/book/36273/35213934.html',
    '/book/36273/35213935.html',
    '/book/36273/35213936.html',
    '/book/36273/35213937.html',
    '/book/36273/35213938.html',
    '/book/36273/35213939.html',
    '/book/36273/35213940.html',
    '/book/36273/35213941.html',
    '/book/36273/35213942.html',
    '/book/36273/35213943.html',
    '/book/36273/35262823.html',
    '/book/36273/35318036.html',
    '/book/36273/35318037.html',
    '/book/36273/35362277.html',
    '/book/36273/35390213.html',
    '/book/36273/35397646.html',
    '/book/36273/35398640.html',
    '/book/36273/35410795.html',
    '/book/36273/35418366.html',
    '/book/36273/35454975.html',
    '/book/36273/35455295.html',
    '/book/36273/35456452.html',
    '/book/36273/35458123.html',
    '/book/36273/35488936.html',
    '/book/36273/35488937.html',
    '/book/36273/35495130.html',
    '/book/36273/35498675.html',
    '/book/36273/35503958.html',
    '/book/36273/35510595.html',
    '/book/36273/35510628.html',
    '/book/36273/35517338.html',
    '/book/36273/35522119.html',
    '/book/36273/35529846.html',
    '/book/36273/35536421.html',
    '/book/36273/35590637.html',
    '/book/36273/35590638.html',
    '/book/36273/35601859.html',
    '/book/36273/35657475.html',
    '/book/36273/35662329.html',
    '/book/36273/35675638.html',
    '/book/36273/35693345.html',
    '/book/36273/35693346.html',
    '/book/36273/35735160.html',
    '/book/36273/35740864.html',
    '/book/36273/35750550.html',
    '/book/36273/35754379.html',
    '/book/36273/35786823.html',
]
23 
# Sample chapter URL; not referenced by the loop below.
url = 'https://www.xuehong.cc/book/36273/31737154.html'

# Download every chapter listed in p3 and append its heading and body to the
# output file. The loop iterates over the paths directly: the previous
# version indexed p3 with a counter that was only incremented after the loop
# had finished, so it fetched the first chapter over and over.
for chapter_path in p3:
    # p3 holds site-relative paths, so prefix the host to get a full URL.
    chapter_url = 'https://www.xuehong.cc' + chapter_path

    soup = BeautifulSoup(get_html(chapter_url).decode('utf-8'), 'html.parser')

    # Open the output once per chapter in append mode so earlier chapters
    # are preserved and the handle is closed even if a write fails.
    with open('F:\\book.txt', 'a', encoding='utf-8') as f:
        # Chapter heading(s): the <h1> element is written as-is, tags included.
        for heading in soup.find_all('h1'):
            print(heading)
            f.write(str(heading) + "\n\n")

        # Chapter body: drop the four-space indentation runs and turn
        # double line breaks into blank lines.
        for content in soup.find_all('div', id='content'):
            text = str(content).replace("    ", "")
            text = text.replace("<br/><br/>", "\n\n")
            print(text)
            f.write(text + "\n\n\n\n")

 

版权声明:本文为rood原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/rood/p/11592835.html