Crawling Liao Xuefeng's Python 3 Tutorial
I got started with Python through Liao Xuefeng's tutorial, and lately I have been learning about web scraping, so I gave it a try.
The code is fairly basic: no multithreading and no IP proxy pool.
Because the site's robots.txt disallows crawling, I rotate through a handful of User-Agent strings and slow the crawler down with random delays between pages. This helps to some extent, but it may still take several runs to fetch all of the articles.
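As a minimal sketch of just this throttling idea (separate from the full script further down), each request picks a random User-Agent from a small pool and then sleeps for a random interval; polite_get is a hypothetical helper and the pool below holds only two example strings:

import random
import time
import requests

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
]

def polite_get(url):
    # send each request with a randomly chosen User-Agent
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    response = requests.get(url, headers=headers)
    # wait 20-40 seconds before the caller issues the next request
    time.sleep(random.randint(20, 40))
    return response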
This is only meant as a reference for fellow beginners.
Libraries and tools used (look up the installation details yourself):
1. BeautifulSoup4
2. pdfkit
3. requests
4. wkhtmltopdf (its bin directory must be added to the PATH environment variable; see the note after this list)
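The first three are ordinary Python packages; wkhtmltopdf is a standalone binary that pdfkit shells out to. If you would rather not touch the PATH, pdfkit can also be pointed at the binary explicitly; the path below is only an example for a typical Windows install and has to be adjusted to your own machine:

import pdfkit

# explicit path to the wkhtmltopdf binary (example path, adjust to your install)
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")
pdfkit.from_file("index0.html", "test.pdf", configuration=config)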
The code is as follows:
# -*- coding:utf-8 -*-
# @author:lijinxi
# @file: __init__.py.py
# @time: 2018/05/07

import requests
from bs4 import BeautifulSoup
import pdfkit
import time
import os
import re
import random


class Crawel(object):
    def __init__(self):
        self.htmlTemplate = '''
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
        </head>
        <body>
        {content}
        </body>
        </html>
        '''
        # robots.txt disallows crawling, so send browser-like request headers
        user_agent = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299",
            "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; LCTE; rv:11.0) like Gecko",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"
        ]
        self.headers = {
            "Proxy-Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent": random.choice(user_agent),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "DNT": "1",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4",
            "Accept-Charset": "gb2312,gbk;q=0.7,utf-8;q=0.7,*;q=0.7",
            "Referer": "https://www.liaoxuefeng.com/",
        }

    def getPageLinks(self):
        '''
        Collect the URLs of every tutorial page from the index menu.
        :return: list of absolute page URLs
        '''
        response = requests.get(
            "https://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000",
            headers=self.headers)
        bsObj = BeautifulSoup(response.text, "lxml")
        menu_list = bsObj.find("ul", {"id": "x-wiki-index", "class": "uk-nav uk-nav-side"})
        pageLinks = []
        for pageLink in menu_list.findAll("a", {"class": "x-wiki-index-item"}):
            if pageLink.attrs["href"] is not None:
                newLink = "https://www.liaoxuefeng.com" + pageLink.attrs["href"]
                pageLinks.append(newLink)
        return pageLinks

    def getUrlContent(self, url, file):
        '''
        Fetch one page, pull out the article body and title, and save them as an HTML file.
        :param url: page URL
        :param file: name of the HTML file to save
        :return: the saved file name
        '''
        response = requests.get(url, headers=self.headers)
        bsObj = BeautifulSoup(response.text, "lxml")
        # article body
        pageContent = bsObj.find("div", {"class": "x-wiki-content x-main-content"})
        # article title
        pageTitle = bsObj.find("h4").get_text()
        # put the title above the body, centered
        center_tag = bsObj.new_tag("center")
        title_tag = bsObj.new_tag("h1")
        title_tag.string = pageTitle
        center_tag.insert(1, title_tag)
        pageContent.insert(0, center_tag)
        html = str(pageContent)
        html = self.htmlTemplate.format(content=html)
        html = html.encode("utf-8")
        with open(file, 'wb+') as f:
            f.write(html)
        return file

    def sloveImage(self, filename1, filename2):
        '''
        Fix images that do not display: the site lazy-loads them through the
        data-src attribute, so rewrite data-src to src.
        :param filename1: original HTML file
        :param filename2: fixed HTML file to save
        :return: the fixed file name
        '''
        with open(filename1, "rb+") as f:
            text = f.read().decode("utf-8")
            text = text.replace("data-src", "src")
        with open(filename2, "wb+") as f:
            f.write(text.encode("utf-8"))
        return filename2

    def savePdf(self, htmls, filename):
        '''
        Merge all of the HTML files into one PDF.
        :param htmls: list of HTML file names
        :param filename: output PDF file name
        :return:
        '''
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
        pdfkit.from_file(htmls, filename, options=options)


def main():
    '''
    Crawl every page, save it as HTML, and merge everything into one PDF.
    :return:
    '''
    start = time.time()
    crawer = Crawel()
    filename = "liaoxuefeng_blogs_python3.pdf"
    pageLinks = crawer.getPageLinks()
    htmls = []  # list of saved HTML files
    for index, pageLink in enumerate(pageLinks):
        if index < 18:  # skip pages already fetched by a previous run
            continue
        filename1 = "index" + str(index) + ".html"
        filename2 = "indexc" + str(index) + ".html"
        crawer.getUrlContent(pageLink, filename1)
        waittime = random.randint(0, 20) + 20
        time.sleep(waittime)  # throttle 20-40 seconds so the site does not block us
        html = crawer.sloveImage(filename1, filename2)
        htmls.append(html)
        print("Page %d fetched........." % index)
    crawer.savePdf(htmls, filename)
    # remove the intermediate HTML files (uncomment if you want them deleted)
    # rex = re.compile(r"^index.*\.html$")
    # for i in os.listdir():
    #     if rex.match(i):
    #         os.remove(i)
    total_time = time.time() - start
    print("Total running time: %d seconds" % total_time)


if __name__ == '__main__':
    main()
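A single run is often cut off before all pages are fetched, which is what the `if index < 18: continue` guard is for: set it to the number of pages a previous run already saved and the crawler resumes from there. A small hypothetical helper, assuming the indexcN.html naming scheme used in the script, can work out where to resume:

import os
import re

def last_saved_index(directory="."):
    # highest N among the fixed files indexcN.html written by a previous run
    pattern = re.compile(r"^indexc(\d+)\.html$")
    indices = []
    for name in os.listdir(directory):
        match = pattern.match(name)
        if match:
            indices.append(int(match.group(1)))
    return max(indices) if indices else -1

print("resume from index", last_saved_index() + 1)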
Further improvements to come... ^<>^