# -*- coding:UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import sys

if __name__ == "__main__":
    # Create the output txt file
    file = open('一念永恒.txt', 'w', encoding='utf-8')
    # URL of the novel's table-of-contents page
    target_url = 'http://www.biqukan.com/1_1094/'
    # User-Agent header
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
    target_req = request.Request(url = target_url, headers = head)
    target_response = request.urlopen(target_req)
    target_html = target_response.read().decode('gbk', 'ignore')
    # Create a BeautifulSoup object
    listmain_soup = BeautifulSoup(target_html, 'lxml')

    # Search the document tree for all div tags whose class is "listmain"
    chapters = listmain_soup.find_all('div', class_ = 'listmain')
    # Build another BeautifulSoup object from the result and keep parsing it
    download_soup = BeautifulSoup(str(chapters), 'lxml')
    # Count the number of chapters
    numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
    index = 1
    # Flag that marks where to start recording: only the links under the
    # main-text section are wanted; the "latest chapters" links are skipped
    begin_flag = False
    # Iterate over all children of the dl tag
    for child in download_soup.dl.children:
        # Skip newline nodes
        if child != '\n':
            # When the main-text section header is found, enable the flag
            if child.string == u"《一念永恒》正文卷":
                begin_flag = True
            # Crawl each chapter link and download its content
            if begin_flag == True and child.a != None:
                download_url = "http://www.biqukan.com" + child.a.get('href')
                download_req = request.Request(url = download_url, headers = head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read().decode('gbk', 'ignore')
                download_name = child.string
                soup_texts = BeautifulSoup(download_html, 'lxml')
                texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
                soup_text = BeautifulSoup(str(texts), 'lxml')
                write_flag = True
                file.write(download_name + '\n\n')
                # Write the scraped chapter text to the file character by character
                for each in soup_text.div.text.replace('\xa0', ''):
                    # Stop writing once the trailing "http://..." site link is reached
                    if each == 'h':
                        write_flag = False
                    if write_flag == True and each != ' ':
                        file.write(each)
                    if write_flag == True and each == '\r':
                        file.write('\n')
                file.write('\n\n')
                # Print download progress (index/numbers is a fraction, so multiply by 100 for a percentage)
                sys.stdout.write("已下载:%.3f%%" % float(index / numbers * 100) + '\r')
                sys.stdout.flush()
                index += 1
    file.close()

>>> for link in soup.find_all('a'):
...     print(link.get('href'))
# Used to extract the link (href) of every <a> tag
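
The `soup` in the snippet above is assumed to be a BeautifulSoup object that has already parsed a page. A minimal self-contained sketch of the same pattern, with a made-up HTML fragment standing in for a real chapter list:

from bs4 import BeautifulSoup

# A small, made-up chapter list standing in for a real table-of-contents page
html = '''
<div class="listmain">
  <dl>
    <dd><a href="/1_1094/1.html">Chapter 1</a></dd>
    <dd><a href="/1_1094/2.html">Chapter 2</a></dd>
  </dl>
</div>
'''

# html.parser is the built-in parser; 'lxml' also works if it is installed
soup = BeautifulSoup(html, 'html.parser')
# find_all('a') returns every <a> tag; .get('href') reads its link target
for link in soup.find_all('a'):
    print(link.get('href'))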

Beautiful Soup 4.4.0 documentation: http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/

The following article comes from a cnblogs author: http://www.cnblogs.com/sakura3/p/8460224.html (copied here to make reviewing easier; thanks).

Scraping a novel:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

# get_url_list: collect the URLs of all chapters into a list
def get_url_list(url):
    content = requests.get(url).content           # fetch the page content
    soup = BeautifulSoup(content, 'lxml')         # instantiate a BeautifulSoup object
    url_list = []                                 # empty list of chapter URLs
    # Alternative: urls = soup.find('div', {'id': 'list'}).find('dl').find_all('dd')
    urls = soup.select('#list > dl > dd > a')     # select the chapter links with a CSS selector
    for i in urls:                                # iterate over each chapter link
        i = i.get('href')                         # extract the href value
        i = 'http://www.biquge.com.tw' + i        # build the full chapter URL from the relative path
        url_list.append(i)                        # append it to url_list
    # print(url_list)
    return url_list

# Fetch and save the content of one chapter
def get_data(url):
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    f = open(r'C:\Users\HBX\Documents\staudy\HMXX.txt', 'a+', encoding='utf-8')  # without encoding='utf-8' writing raises encoding errors
    text_name = soup.find('div', {'class': 'bookname'}).find('h1').text          # chapter title
    # text_content = soup.select('#content')
    text_content = soup.find('div', {'id': 'content'}).get_text()   # chapter body; a CSS select (line above) would also work
    book = text_name + '\r\n' + text_content      # the whole chapter
    # print(book)
    f.write(book + '\r\n')                        # write it out, followed by a newline
    f.close()                                     # close the file
    # for x in text_content:
    #     a = x.text.replace('readx();', '')
    #     print(a)


if __name__ == '__main__':
    url = 'http://www.biquge.com.tw/18_18049/'    # table-of-contents page of the novel on biquge
    url_list = get_url_list(url)                  # get all chapter URLs
    for i in url_list:                            # loop over each chapter URL
        get_data(i)                               # download and save the chapter content

Copyright notice: this is an original article by kangdong, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/kangdong/p/8489532.html