Python: from web-scraping basics to scraping a web novel, with a worked example
1. Scraping basics
1.1 The requests library
1.1.1 The seven request methods
requests.request()  the base method; the shortcuts below all build on it
requests.get(url, *args)
requests.head()  fetch the response headers only
requests.post()
requests.put()
requests.patch()  modify part of a resource
requests.delete()
url = "http://quanben5.com/n/doupocangqiong/6.html" headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"
} r = requests.get(url, headers=headers) data = { "pinyin": "doupocangqiong", "content_id": "4", } r = requests.post(url, data=data, headers=headers)
1.1.2 The parameters available in *args
params: dict or byte sequence, appended to the URL as query parameters
data: dict, byte sequence, or file object, sent as the request body
json: JSON data sent as the request body
headers: dict, used to spoof the request headers
cookies: dict or CookieJar
auth: tuple, for HTTP authentication
files: dict, for uploading files
timeout: timeout in seconds
proxies: dict, sets proxy servers for the request and can carry login credentials, e.g. pxs={"http": "http://user:pass@10.10.1.1234"}
allow_redirects: default True, whether to follow redirects
stream: default False, meaning the response body is downloaded immediately; set True to stream it
verify: default True, toggles SSL certificate verification
cert: path to a local SSL client certificate (several of these are combined in the sketch below)
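A minimal sketch combining several of these keyword arguments in one call; the search endpoint comes from the script later in this post, while the keyword, proxy address, and other values are only illustrative.

import requests

r = requests.get(
    "http://quanben5.com/index.php",
    params={"c": "book", "a": "search", "keywords": "doupocangqiong"},  # appended to the URL as ?c=book&a=search&...
    headers={"User-Agent": "Mozilla/5.0"},               # spoofed browser header
    timeout=10,                                          # give up after 10 seconds
    allow_redirects=True,                                # follow redirects (the default)
    # proxies={"http": "http://user:pass@10.10.1.1234"}, # optional proxy with login credentials
    # verify=False,                                      # skip SSL certificate verification
)
print(r.status_code, r.url)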
1.2 The BeautifulSoup class
soup = BeautifulSoup("","html.parser")
soup.prettify()
soup.find_all(name, attrs, recursive, string, **kwargs)
name: the tag name to match
attrs: tag attributes to match
string: the string to search for
soup.head
soup.head.contents  a list of the direct child nodes
soup.body.contents[1]
soup.body.children
soup.body.descendants
.parent
.parents
.next_sibling  the next sibling
.previous_sibling  the previous sibling
.next_siblings  all following siblings, as an iterator
.previous_siblings  all preceding siblings, as an iterator (a short example follows)
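A short self-contained example of find_all plus sibling navigation. The toy HTML below is made up to mirror the chapter-list structure scraped later in this post; it is not the real page.

from bs4 import BeautifulSoup

html = """
<div class="listmain">
  <dl>
    <dt>最新章节</dt>
    <dt>正文卷</dt>
    <dd><a href="/1.html">第一章</a></dd>
    <dd><a href="/2.html">第二章</a></dd>
  </dl>
</div>
"""
soup = BeautifulSoup(html, "html.parser")

div = soup.find_all("div", "listmain")[0]   # the first div with class="listmain"
second_dt = div.find_all("dt")[1]           # the second dt tag
for sib in second_dt.next_siblings:         # everything that follows that dt
    if sib.name == "dd":
        print(sib.a["href"], sib.a.string)  # prints /1.html 第一章, /2.html 第二章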
1.3 selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time


chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

d = webdriver.Chrome(chrome_options=chrome_options)  # run Chrome without showing a browser window
d.get('http://quanben5.com/n/dazhuzai/23241.html')
time.sleep(2)
print(d.page_source)  # prints the page content after JavaScript has finished loading
Finding a single element and finding multiple elements
d.find_element_by_id
d.find_elements_by_id
Interacting with elements
from selenium import webdriver

d = webdriver.Chrome()
d.get('http://www.baidu.com')
inputText = d.find_element_by_id("kw")
inputText.send_keys("萝莉")
button = d.find_element_by_id("su")
button.click()
2. Hands-on practice
First, find a site where the novel can be read online.
A quick Baidu search turned up the Quanben 5200 novel site ("https://www.qb5200.tw").
Open the chapter index of a novel:
https://www.qb5200.tw/xiaoshuo/0/357/
Looking at the page source,
the main-text volume sits under the second dt tag inside the div whose class is listmain,
and the chapter links after it are a tags.
url_text = soup.find_all('div', 'listmain')[0]  # the first div with class="listmain"
main_text = url_text.find_all('dt')[1]          # the second dt tag below it
for tag in main_text.next_siblings:
    try:
        url = ''.join(['https://', host, tag.a.attrs['href']])
        print('parsing', url)
    except:
        continue
This collects the URL of every chapter.
Next, open any one chapter
and inspect its source the same way. Putting everything together:
import requests, time
from bs4 import BeautifulSoup

def getHTMLText(url, headers={}):
    """
    Fetch the page source.
    :param url:
    :return: class Response
    """
    if headers == {}:  # fall back to a default User-Agent when no headers are given
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"}

    # proxies = get_random_ip()
    proxies = {}
    try:
        # print("start", url)
        r = requests.get(url, proxies=proxies, headers=headers)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding
        # print('end', url)
        return r
    except:
        return r.status_code

def parseByQB5200(soup, host, f):
    """
    Scrape the novel from the qb5200 site's page source.
    :param soup, host:
    :param f:
    :return:
    """
    url_text = soup.find_all('div', 'listmain')[0]
    main_text = url_text.find_all('dt')[1]
    x = 0
    for tag in main_text.next_siblings:
        time.sleep(1)
        try:
            url = ''.join(['https://', host, tag.a.attrs['href']])
            print('parsing', url)
            soup = BeautifulSoup(getHTMLText(url).text, "html.parser")
            passage = soup.find_all("div", "content")[0]
            title = passage.h1.string
            f.writelines(''.join([title, '\n']))
            passage = soup.find_all("div", "showtxt")[0]
            for i in passage.descendants:
                if i.name != "br":
                    st = i.string
                    if st.strip().startswith('http') or st.strip().startswith('请记住本书'):
                        continue
                    f.writelines(''.join([' ', st.strip(), '\n']))
            x += 1
            print('%d.%s 下载完成' % (x, title))
        except:
            continue

def getNovelUrls(url):
    """
    Work out which site the chapter-index URL belongs to
    and dispatch to that site's scraper.
    :param url:
    :return:
    """

    response = getHTMLText(url)
    host = url.split('//')[1].split('/')[0]
    host_list = {
        "www.qb5200.tw": parseByQB5200,
        # "www.qu.la": parseByQuLa,
        "quanben5.com": parseByQB5  # defined later, in section 4
    }
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    with open('1.txt', 'w', encoding='utf8') as f:
        host_list[host](soup, host, f)

if __name__ == '__main__':
    getNovelUrls("https://www.qb5200.tw/xiaoshuo/0/357/")
Scraping the novel into a txt file from Quanben 5200
The problem:
under this brute-force fetching, the site ("www.qb5200.tw") allows only about three crawls before it starts returning 403 errors.
I first assumed it was limiting the number of visits per IP, but the problem persisted after switching to proxy IPs,
so I guessed it had to do with the request headers.
I therefore attached every header the browser sends when visiting the site and made the User-Agent random:
import random

USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)'
]
headers = {
    "User-Agent": random.choice(USER_AGENT_LIST),
    "Host": "www.qb5200.tw",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "https://www.qb5200.tw/xiaoshuo/0/355/",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "UM_distinctid=16736c47b1d8b-04bd85d2b5e8ab-50422618-144000-16736c47b1e1f2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260750615=1497982366-1542811927-%7C1542882690; fikker-7ZUK-qXIL=qDaKEBihatSNEFgtMlGVIKubCHYLi89J; fikker-7ZUK-qXIL=qDaKEBihatSNEFgtMlGVIKubCHYLi89J; fikker-Sbwr-GN9F=GbMX3HOhwvoDMxopLMGt3VWliXQIK0SP; fikker-Sbwr-GN9F=GbMX3HOhwvoDMxopLMGt3VWliXQIK0SP; fikker-yJ3O-W61D=UfinETMnCR38ADWZEl1KNHQRU2m81Fwb; fikker-yJ3O-W61D=UfinETMnCR38ADWZEl1KNHQRU2m81Fwb; fikker-rWK3-6KHs=T9T5b7lYVJReTQviPm2IdLPyHu83RwFM; fikker-rWK3-6KHs=T9T5b7lYVJReTQviPm2IdLPyHu83RwFM"
}
Randomized request headers
This fixed the 403 problem; one way to plug these headers into every request is sketched below.
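Roughly how the randomized headers can be attached to each request. The helper name build_headers is mine, not part of the original script, and it assumes the USER_AGENT_LIST and headers dict defined above are in scope.

import random
import requests

def build_headers():
    # refresh the User-Agent on every call; keep the other browser headers fixed
    h = dict(headers)  # the full header dict shown above
    h["User-Agent"] = random.choice(USER_AGENT_LIST)
    return h

r = requests.get("https://www.qb5200.tw/xiaoshuo/0/357/", headers=build_headers())
r.raise_for_status()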
3. An example of AJAX dynamic loading:
the Quanben5 novel site ("quanben5.com")
Inspecting the page source the same way,
I ran into a problem:
the HTML page that is served contains only half of the content.
So I pressed F12 to open the developer tools
and found that all of the text lives in an XHR response.
Clicking it to view the request headers
shows that it is a POST request.
The request URL is /index.php?c=book&a=ajax_content,
and the request data sits in the form payload at the bottom.
Searching the page source and the JS files for these form fields,
I found them in ajax.js and in the page source respectively.
The data form can be built directly from the values in the page source,
and ajax.js shows that the rndval field is the current time in milliseconds.
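Before this gets folded into the full script in the next section, here is a bare-bones sketch of replaying that POST. The pinyin and content_id values are placeholders; in practice they are pulled out of the ajax_post(...) call embedded in each chapter page, as the section 4 code does.

import time
import requests

ajax_url = "http://quanben5.com/index.php?c=book&a=ajax_content"
form = {
    "pinyin": "doupocangqiong",          # placeholder: scraped from the chapter page in practice
    "content_id": "4",                   # placeholder: scraped from the chapter page in practice
    "_type": "ajax",
    "rndval": int(time.time() * 1000),   # current time in milliseconds, as ajax.js builds it
}
r = requests.post(ajax_url, data=form, headers={"User-Agent": "Mozilla/5.0"})
print(r.text[:200])                      # the returned fragment carries the chapter text in <p> tags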
4. Optimization
I switched to gevent for asynchronous I/O: each page is saved under a temp folder, and at the end the files are merged into a single txt.
A search feature was also added; it currently supports only one novel site.
The code:
#!/usr/bin/env python
# encoding: utf-8

"""
@version:
@author: Wish Chen
@contact: 986859110@qq.com
@file: get_novels.py
@time: 2018/11/21 19:43

"""
import gevent
from gevent import monkey
monkey.patch_all()
import requests, time, random, re, os
from bs4 import BeautifulSoup

dir = os.path.dirname(os.path.abspath(__file__))

def getHTMLText(url, data=[], method='get'):
    """
    Fetch the page source.
    The request defaults to GET.
    :param url:
    :param data:
    :param method:
    :return:
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"}
    proxies = {}
    try:
        # print("start", url)
        r = requests.request(method, url, proxies=proxies, headers=headers, data=data)
        r.raise_for_status()
        # r.encoding = r.apparent_encoding
        # print('end', url)
        return r
    except:
        return r.status_code


def fetch_async(x, url):
    """
    The job run by each greenlet:
    fetch the chapter's source page,
    replay the AJAX POST to get the full text,
    and write the chapter into the temp folder.
    :param x:
    :param url:
    :return:
    """
    url_main = "http://quanben5.com/index.php?c=book&a=ajax_content"
    r = getHTMLText(url)  # fetch the chapter's page source
    title = re.search(r'<h1 class="title1">(.*)</h1>', r.text).group(1)
    result = re.search(r'<script type="text/javascript">ajax_post\((.*)\)</script>', r.text).group(1)
    num_list = result.split("','")
    num_list[9] = num_list[9][:-1]
    content = {}  # build the form that the AJAX POST expects
    for i in range(1, 5):
        content[num_list[i * 2]] = num_list[i * 2 + 1]
    content['_type'] = "ajax"
    content['rndval'] = int(time.time() * 1000)
    r = getHTMLText(url_main, data=content, method='post')  # replay the POST request
    soup = BeautifulSoup(r.text, "lxml")
    with open(os.path.join(dir, 'temp', "%s.txt" % x), "w", encoding='utf8') as f:
        f.writelines(''.join([str(x), '.', title, '\n\n']))
        for tag in soup.body.children:
            if tag.name == 'p':
                f.writelines(''.join([' ', tag.string.strip(), '\n\n']))
    print('%d.%s 下载完成' % (x, title))


def get_together(name, author, x):
    """
    Merge the per-chapter txt files under the temp directory
    into a single file, then (optionally) delete the temp files.
    :param name:
    :param author:
    :return:
    """
    with open(os.path.join(dir, "%s.txt" % name), "w", encoding='utf8') as f:
        f.writelines(''.join([name, '\n\n作者:', author, '\n\n']))

        for i in range(x):
            try:
                f.write(open(os.path.join(dir, 'temp', "%s.txt" % (i+1)), "r", encoding='utf8').read())
                f.write('\n\n')
                # os.remove(os.path.join(dir, 'temp', "%s.txt" % (i+1)))
            except:
                continue


def parseByQB5(response, host):
    """
    Scrape the novel from the quanben5.com page source,
    extract the book name and author,
    and speed things up with gevent asynchronous I/O.
    :param response:
    :param host:
    :return:
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    url_text = soup.find_all('div', 'box')[2]
    main_text = url_text.find_all('h2')[0].next_sibling
    url_list = []
    for tag in main_text.descendants:
        if tag.name == 'li':
            url = ''.join(['http://', host, tag.a.attrs['href']])
            url_list.append(url)
    from gevent.pool import Pool
    pool = Pool(100)

    gevent.joinall([pool.spawn(fetch_async, i+1, url=url_list[i]) for i in range(len(url_list))])

    name = re.search(r"<h3><span>(.*)</span></h3>", response.text).group(1)
    author = re.search(r'<span class="author">(.*)</span></p>', response.text).group(1)
    print("%d文档已下载,正在合并..." % len(url_list))
    get_together(name, author, len(url_list))


def getNovelUrls(url):
    """
    Work out which site the chapter-index URL belongs to
    and dispatch to that site's scraper.
    :param url:
    :return:
    """

    response = getHTMLText(url)
    host = url.split('//')[1].split('/')[0]
    host_list = {
        "quanben5.com": parseByQB5
    }
    host_list[host](response, host)


def get_url():
    input_name = input('>>')
    r = getHTMLText("http://quanben5.com//index.php?c=book&a=search&keywords=%s" % input_name)
    soup = BeautifulSoup(r.text, "html.parser")
    main_book = soup.find_all("div", "pic_txt_list")
    for i in range(len(main_book)):
        tag = main_book[i].h3
        print("%s.%s %s" % (i, tag.span.text, tag.next_sibling.next_sibling.text))
    choice = int(input(">>"))
    if choice in range(len(main_book)):
        return ''.join(["http://quanben5.com", main_book[choice].h3.a["href"], "xiaoshuo.html"])


if __name__ == '__main__':
    # url_list = [
    #     "https://www.qu.la/book/365/",
    #     "https://www.qb5200.tw/xiaoshuo/0/357/",
    #     "http://quanben5.com/n/doupocangqiong/xiaoshuo.html",
    #     "http://quanben5.com/n/dazhuzai/xiaoshuo.html",
    #     "http://quanben5.com/n/douluodalu/xiaoshuo.html",
    #     "http://quanben5.com/n/renxingderuodian/xiaoshuo.html"
    # ]
    # if not os.path.exists('temp'):
    #     os.mkdir('temp')
    # getNovelUrls(url_list[5])
    while True:
        url = get_url()
        time_start = time.time()
        getNovelUrls(url)
        print("成功爬取! 用时:%ds" % (int((time.time()-time_start)*100)/100))
Asynchronous I/O + search
The encapsulation is still not great;
ideally each site would be wrapped in its own class (a rough sketch of the idea is below),
or the Scrapy framework could be adopted.
That redesign is still in progress…
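One possible shape for that refactor, sketched under the assumption that each site only needs a chapter-list step and a chapter-text step; the class and method names here are mine, not a settled design.

import requests
from bs4 import BeautifulSoup

class NovelSite:
    """Base class: one subclass per novel site."""
    host = ""

    def fetch(self, url):
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")

    def chapter_urls(self, index_url):
        raise NotImplementedError

    def chapter_text(self, chapter_url):
        raise NotImplementedError

class QB5200(NovelSite):
    host = "www.qb5200.tw"

    def chapter_urls(self, index_url):
        # same selectors as the section 2 code: second dt under div.listmain
        soup = self.fetch(index_url)
        dt = soup.find_all("div", "listmain")[0].find_all("dt")[1]
        return ["https://" + self.host + dd.a["href"]
                for dd in dt.next_siblings if dd.name == "dd"]

getNovelUrls would then map each host to a class instance instead of a bare function.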
5. Open problems:
Proxy IPs:
so far I only have free proxy-IP sites to draw from;
the plan is to pick random proxy IPs and route requests through them (a rough sketch is at the end of this section).
Multi-site support:
each site's chapter-index and chapter-page source structure has to be inspected,
and each site can then get its own parse function to handle its content.
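For the proxy item, the intended usage is roughly the following. The pool entries are placeholders; something like the get_random_ip() helper commented out in the section 2 code would fill the pool from a free proxy site.

import random
import requests

PROXY_POOL = [
    "http://10.10.1.10:3128",   # placeholder addresses, to be replaced by scraped free proxies
    "http://10.10.1.11:1080",
]

def get_with_proxy(url, headers):
    proxies = {"http": random.choice(PROXY_POOL)}   # route this request through a random proxy
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)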