Web scraping: requests.get parameters
URL and request header parameters - url and headers
res = requests.get(url, headers=headers) sends a request to the site and returns the response object
Parameters
- url: the URL to scrape
- headers: request headers
- timeout: timeout in seconds; if the request takes longer, an exception is raised (see the sketch after this list)
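A minimal sketch of how the timeout parameter behaves, assuming the public test endpoint http://httpbin.org/delay/10 (it responds after 10 seconds, so a 3-second timeout is exceeded); the Timeout exception is caught explicitly:

import requests

url = 'http://httpbin.org/delay/10'     # test endpoint that responds after 10 seconds
headers = {'User-Agent': 'Mozilla/5.0'}

try:
    res = requests.get(url, headers=headers, timeout=3)   # give up after 3 seconds
    print(res.status_code)
except requests.exceptions.Timeout:
    print('request timed out')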
Response object (res) attributes
- encoding: response character encoding, e.g. res.encoding = 'utf-8'
- text: page source as a string
- content: page source as raw bytes
- status_code: HTTP response status code
- url: the URL the data was actually served from
import requests

url = 'http://www.baidu.com/'   # scrape the Baidu homepage
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'}

res = requests.get(url, headers=headers)
print(res.encoding)                  # check the site's reported encoding: ISO-8859-1

# text attribute: response content as a string (page source)
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
html = res.text

# content attribute: response content as bytes (page source)
res = requests.get(url, headers=headers)
html = res.content.decode('utf-8')

print(res.status_code)               # check the status code: 200
print(res.url)                       # check the URL actually accessed: https://www.baidu.com/
Saving unstructured data
Unstructured data such as zip archives and image files can be saved this way:
with open('xxx.jpg', 'wb') as f:
    f.write(res.content)
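For large files such as zip archives, a streamed download avoids holding the whole file in memory. A minimal sketch, assuming a hypothetical file_url; stream=True and Response.iter_content come from requests itself:

import requests

file_url = 'http://example.com/big_file.zip'   # hypothetical URL, replace with a real file address
headers = {'User-Agent': 'Mozilla/5.0'}

# stream=True defers downloading the body; iter_content() yields it chunk by chunk
res = requests.get(file_url, headers=headers, stream=True, timeout=10)
with open('big_file.zip', 'wb') as f:
    for chunk in res.iter_content(chunk_size=8192):
        if chunk:
            f.write(chunk)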
Example: save a photo of Zhao Liying locally
import requests

url = 'http://dingyue.nosdn.127.net/lL1JH2YdpAWrzEhfp8BrJ8lTHa1602AEX9E7qpTpH5NzW1535203788506compressflag.jpg'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, headers=headers).content

# save the image locally
with open('赵丽颖.jpg', 'wb') as f:
    f.write(html)
Baidu Tieba image scraping
Goal: scrape all images from a specified Tieba forum
Approach
- Get the forum's main-page URL and the next-page URL, and work out the URL pattern across pages
- Get the URLs of all posts on one page: [post link 1, post link 2, ...]
- Request each post link and extract the image URLs
- Request each image URL and write it to a local file in 'wb' mode
Tieba URL pattern: http://tieba.baidu.com/f?kw=??&pn=50 (see the pagination sketch below)
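A minimal sketch of how the pn offset maps to page numbers (50 posts per page); the forum name is URL-encoded with urllib.parse.quote, exactly as the full spider further down does:

from urllib import parse

base_url = 'http://tieba.baidu.com/f?kw={}&pn={}'
kw = parse.quote('赵丽颖吧')              # URL-encode the forum name

# page 1 -> pn=0, page 2 -> pn=50, page 3 -> pn=100, ...
for page in range(1, 4):
    pn = (page - 1) * 50
    print(base_url.format(kw, pn))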
XPath expressions
1. Post-link XPath. We select on the class attribute here because elements of the same kind share the same styling.
//div[@class="t_con cleafix"]/div/div/div/a/@href
2. Image-link XPath
//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src
3. Video-link XPath
//div[@class="video_src_wrapper"]/embed/@data-video
# Note: for video links the frontend reworks the response content, so inspect the page source (view source) and paste the HTML into an online formatter to read it
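To check these expressions, run them against the raw page source with lxml. A minimal sketch, assuming page_html holds HTML fetched with requests (not the DOM rendered by the browser):

from lxml import etree

page_html = '...'   # page source obtained with requests.get(...).text

parse_html = etree.HTML(page_html)
post_links = parse_html.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
img_links = parse_html.xpath('//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src')
video_links = parse_html.xpath('//div[@class="video_src_wrapper"]/embed/@data-video')
print(post_links, img_links, video_links)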
Baidu Tieba video anti-scraping mechanism (the response content is rewritten)
The page source is:

<div class="video_src_wrapper">
  <embed data-video="http://tb-video.bdstatic.com/tieba-smallvideo-transcode-cae/2754153_8fcd225842344de9901c1489e49defbe_0_cae.mp4">

The markup located with F12 DevTools is:

<div class="video_src_wrapper">
  <div class="video_src_wrap_main">
    <video src="http://tb-video.bdstatic.com/tie-cae/f2358e8_0_cae.mp4"></video>
  </div>
</div>

If you write the XPath against what F12 shows, it matches nothing: requests fetches the raw page source, so the XPath must always be written against the page source.
import requests
from lxml import etree
import random
import time
from urllib import parse


class BaiduImageSpider(object):
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?kw={}&pn={}'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
        ]

    # fetch the page source
    def get_html(self, url):
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(url=url, headers=headers).content.decode('utf-8', 'ignore')
        return html

    # parse html with an xpath expression
    def xpath_func(self, html, xpath_bds):
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath(xpath_bds)
        return r_list

    # image scraping for one forum page
    def parse_html(self, one_url):
        html = self.get_html(one_url)
        xpath_bds = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        # extract post links: r_list is like ['/p/32323', ...]
        r_list = self.xpath_func(html, xpath_bds)
        for r in r_list:
            t_url = 'http://tieba.baidu.com' + r     # build the full post URL
            self.get_image(t_url)                    # save all images in the post locally
            time.sleep(random.uniform(0, 2))         # sleep 0-2 seconds after finishing each post

    # given one post URL, save all of its images locally
    def get_image(self, t_url):
        html = self.get_html(t_url)
        # xpath OR (|): image links + video links
        xpath_bds = '//div[@class="d_post_content j_d_post_content clearfix"]/img[@class="BDE_Image"]/@src | //div[@class="video_src_wrapper"]/embed/@data-video'
        img_list = self.xpath_func(html, xpath_bds)  # ['http://xxx.jpg', ...]
        print(img_list)
        for img in img_list:
            html_bytes = requests.get(url=img, headers={'User-Agent': random.choice(self.ua_list)}).content
            self.save_img(html_bytes, img)

    # save one image
    def save_img(self, html_bytes, img):
        filename = img[-10:]
        with open(filename, 'wb') as f:
            f.write(html_bytes)
            print('%s downloaded' % filename)

    # entry point
    def main(self):
        name = input('Forum name: ')
        begin = int(input('Start page: '))
        end = int(input('End page: '))
        kw = parse.quote(name)                       # URL-encode the forum name
        for page in range(begin, end + 1):
            pn = (page - 1) * 50
            url = self.url.format(kw, pn)
            self.parse_html(url)


if __name__ == '__main__':
    spider = BaiduImageSpider()
    spider.main()
Query string parameter - params
res = requests.get(url,params=params,headers=headers)
url is the base URL and does not contain query parameters; requests automatically URL-encodes the params dict and appends it to the url
Parameter type: dict; its key-value pairs become the query string
import requests

baseurl = 'http://tieba.baidu.com/f?'
params = {
    'kw': '赵丽颖吧',
    'pn': '50'
}
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'}

# params is URL-encoded and appended to the URL automatically before the request is sent
res = requests.get(baseurl, headers=headers, params=params)
res.encoding = 'utf-8'
print(res.text)
Web client authentication parameter - auth
res = requests.get(url, headers=headers, auth=('username', 'password'))
For sites that require username/password authentication from the web client: auth = ('username', 'password')
Tarena course notes
import requests
from lxml import etree
import random
import os


class CodeSpider(object):
    def __init__(self):
        self.url = 'http://code.tarena.com.cn/AIDCode/aid1904/14-redis/'
        self.auth = ('tarenacode', 'code_2013')
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
        ]

    def parse_html(self):
        # fetch the directory listing (requires basic auth)
        html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth)
        html = html.content.decode('utf-8', 'ignore')
        # parse all links: r_list is like ['../', 'day01', 'day02', 'redis_day01.zip']
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//a/@href')
        for r in r_list:
            if r.endswith('.zip') or r.endswith('.rar'):
                self.save_files(r)

    def save_files(self, r):
        directory = '/home/tarena/AID/redis/'
        if not os.path.exists(directory):
            os.makedirs(directory)
        # build the file URL and save the archive to the target directory
        url = self.url + r
        # filename: /home/tarena/AID/redis/xxx.zip
        filename = directory + r
        html = requests.get(url=url, headers={'User-Agent': random.choice(self.ua_list)}, auth=self.auth).content
        with open(filename, 'wb') as f:
            f.write(html)
            print('%s downloaded' % r)


if __name__ == '__main__':
    spider = CodeSpider()
    spider.parse_html()
SSL certificate verification parameter - verify
response = requests.get(url=url,params=params,headers=headers,verify=False)
The verify parameter applies to https sites whose certificate was not issued by a trusted certificate authority; such sites usually raise an SSLError, in which case consider using this parameter
verify: True (default) verifies the certificate; False (commonly used) skips certificate verification (see the sketch below)
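A minimal sketch of falling back to verify=False when certificate verification fails, assuming the self-signed test host https://self-signed.badssl.com/; urllib3.disable_warnings() silences the InsecureRequestWarning that verify=False otherwise prints:

import requests
import urllib3

# verify=False makes urllib3 emit an InsecureRequestWarning; silence it
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://self-signed.badssl.com/'   # test host with a self-signed certificate
headers = {'User-Agent': 'Mozilla/5.0'}

try:
    res = requests.get(url, headers=headers, timeout=5)                  # verify=True by default
except requests.exceptions.SSLError:
    res = requests.get(url, headers=headers, timeout=5, verify=False)    # skip certificate verification
print(res.status_code)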
Proxy parameter - proxies
Definition: an IP address that talks to the network in place of your own, hiding your real IP so it does not get banned.
Ordinary proxies
Sites that provide proxy IPs: Xici Proxy, Kuaidaili, Quanwang Proxy, Daili Jingling, ...
Syntax
proxies = {'protocol': 'protocol://IP:port'}   # same format for http and https

proxies = {
    'http': 'http://59.172.27.6:38380',
    'https': 'https://59.172.27.6:38380'
}
Use a free ordinary proxy IP to access the test site http://httpbin.org/get
import requests

url = 'http://httpbin.org/get'
headers = {'User-Agent': 'Mozilla/5.0'}
# define the proxy; look up a free proxy IP on one of the proxy sites
proxies = {
    'http': 'http://309435365:szayclhp@43.226.164.156:16818',
    'https': 'https://309435365:szayclhp@43.226.164.156:16818'
}

html = requests.get(url, proxies=proxies, headers=headers, timeout=5).text
print(html)
IP pool
Scrape IPs from Xici Proxy, test whether each one works, and build your own proxy IP pool that you keep updated and use for scraping
import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent


class GetProxyIP(object):
    def __init__(self):
        # {} is the page number, e.g. https://www.xicidaili.com/nn/2
        self.url = 'https://www.xicidaili.com/nn/{}'
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999'
        }

    # generate a random User-Agent
    def get_random_ua(self):
        ua = UserAgent()
        useragent = ua.random
        return useragent

    # fetch random proxy IPs from the Xici Proxy site
    def get_ip_file(self, url):
        headers = {'User-Agent': self.get_random_ua()}
        # request the Xici domestic high-anonymity list and find all tr nodes
        html = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        # base xpath: one tr node per proxy IP row
        tr_list = parse_html.xpath('//tr')
        for tr in tr_list[1:]:
            ip = tr.xpath('./td[2]/text()')[0]
            port = tr.xpath('./td[3]/text()')[0]
            # check whether ip:port is usable
            self.test_proxy_ip(ip, port)

    # test whether a scraped proxy IP works
    def test_proxy_ip(self, ip, port):
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
            if res.status_code == 200:
                print(ip, ":", port, 'Success')
                with open('proxies.txt', 'a') as f:
                    f.write(ip + ':' + port + '\n')
        except Exception as e:
            print(ip, port, 'Failed')

    # entry point
    def main(self):
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    spider = GetProxyIP()
    spider.main()
Taking an IP from the pool
Write a crawler that picks a random proxy IP from the file
import random
import requests


class BaiduSpider(object):
    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.retries = 1          # retry counter

    def get_proxies(self):
        with open('proxies.txt', 'r') as f:
            result = f.readlines()                    # read all proxy IPs as a list of lines
        proxy_ip = random.choice(result)[:-1]         # pick one at random and strip the trailing newline
        L = proxy_ip.split(':')
        proxy_ip = {
            'http': 'http://{}:{}'.format(L[0], L[1]),
            'https': 'https://{}:{}'.format(L[0], L[1])
        }
        return proxy_ip

    def get_html(self):
        proxies = self.get_proxies()
        if self.retries <= 3:
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                print(html)
            except Exception as e:
                print('Retry')
                self.retries += 1
                self.get_html()


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.get_html()
Paid proxy API
Write an interface that fetches proxies from a paid open-proxy API
# interface for fetching open proxies
import requests
from fake_useragent import UserAgent

ua = UserAgent()
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    url = 'http://www.baidu.com/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        return True
    else:
        return False


# fetch the proxy IP list
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')
    for ip in ip_port_list:
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()
Private (authenticated) proxies
1. Syntax
proxies = {'protocol': 'protocol://username:password@IP:port'}

proxies = {
    'http': 'http://username:password@IP:port',
    'https': 'https://username:password@IP:port'
}

proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https': 'https://309435365:szayclhp@106.75.71.140:16816',
}
The username and password are supplied together with the API URL; they are not your account name and account password.
# interface for fetching private (paid) proxies
import requests
from fake_useragent import UserAgent

ua = UserAgent()
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    ip_port = ip.split(':')
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
    }
    res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    if res.status_code == 200:
        print("OK")
        return True
    else:
        print(res.status_code)
        print("Error")
        return False


# fetch the proxy IP list
def get_ip_list():
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url).content.decode('utf-8', 'ignore')
    ip_port_list = html.split('\n')
    for ip in ip_port_list:
        with open('proxy_ip.txt', 'a') as f:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()