反爬很重要的手段之一就是根据ip来限制,包括新浪微博搜索页、微信搜索页、360全系网站(360搜索、360百科、360问答、360新闻),这些都明确提示了是根据ip反爬的,所以需要买ip。买的是快代理和芝麻代理。

 

芝麻代理是4600元包年,每天200个ip,每个ip可用时长为25到180分钟,也就意味着同一小时内,大概只有不到10个ip可用,因为如果一下子提取完了200ip,会造成3小时后没ip可用了,所以要少提取一点。芝麻代理优点是质量好ip非常稳定,可用性高,有效时长高,劣势是ip数量有限制,每小时几个ip应付不了每秒都需要高并发的爬虫。

快代理是3000元包年,每天ip无限量,优势是ip多,劣势是失效速度非常之快,得到一些ip后,可能十几秒之后就失效了,而且刚获取的ip里面有大半不可用。但是量大,失效无所谓,反正每分钟都可以补充新的ip,非常适合爬虫。

 

需要定时从快代理和芝麻代理的提取接口获取http和https代理,获取后需要检测可用性再放到redis;同时要开另外几个线程专门检测redis中已存在ip有没有失效,毕竟快代理的失效速度是相当之快的,把失效的代理从redis中及时删除,避免影响爬虫效率。

https的ip和端口可以用来请求http网址,但http的ip和端口却不一定能请求https网址。

 

代码如下,使用一个没有反爬的https网址和一个http网址来检测ip可用性,一定要选个没反爬而且页面代码少的网址,不然会浪费大把的流量还降低了检测效率。或者用讯代理的检测接口来检查ip,可以看到匿名度和所在地。

#coding=utf-8
import requests ,random,re,json,time,threading,multiprocessing
from concurrent.futures import ThreadPoolExecutor
from gevent import monkey
from gevent.pool import Pool


from  connect2redis import r

import logging
# Module-wide logger; the name 'daili' matches the logger name that the
# __main__ block passes to the project's Logger helper.
logger = logging.getLogger('daili')
from Logger import Logger

import sys
# Python 2-only idiom: re-load sys so that setdefaultencoding is reachable,
# then make UTF-8 the process-wide default codec for implicit str/unicode
# conversions (the log messages in this file contain Chinese text).
reload(sys)
sys.setdefaultencoding('utf8')

class Daili(object):
    """Maintain pools of validated HTTP/HTTPS proxies in redis sets.

    Proxies are fetched from two providers (kuaidaili and zhima), probed
    against a lightweight URL and, when usable, stored in the redis set named
    after their site key ('kuaidaili:https', 'kuaidaili:http' or 'zhima').
    Separate loops re-probe the stored proxies and remove the dead ones.

    Every proxy is passed around as a dict ``{'adress': 'ip:port', 'site': key}``
    (the misspelt 'adress' key is kept because it is part of the data format).
    """

    pool_executor = ThreadPoolExecutor(max_workers=10)
    gevent_pool = Pool(200)    # probes freshly fetched proxies
    gevent_pool2 = Pool(120)   # re-probes proxies already stored in redis
    gevent_pool3 = Pool(110)

    # Provider extraction endpoints (order ids are redacted in the source).
    get_proxy_kuaidaili_https = 'http://svip.kuaidaili.com/api/getproxy/?orderid=94015562773xxxx&num=100&protocol=2&method=1&an_an=1&an_ha=1&sp1=1&sp2=1&sp3=1&quality=1&sort=2&format=json&sep=1'
    get_proxy_kuaidaili_http = 'http://svip.kuaidaili.com/api/getproxy/?orderid=9401556277xxxxx&num=100&protocol=1&method=1&an_an=1&an_ha=1&quality=1&sort=1&format=json&sep=1'
    get_proxy_zhima = 'http://http-webapi.zhimaruanjian.com/getip?num=1&type=2&pro=&city=0&yys=0&port=11&pack=387&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1'

    # Probe targets. Other candidates that were tried for the http probe:
    #   http://n.sinaimg.cn/ent/home1704/js/pl/echo.min.js
    #   http://images1.jyimg.com/w4/mai/c/jy_mai.css
    # and for the https probe: https://www.baidu.com/
    check_https_url = 'https://www.baidu.com/content-search.xml'
    check_http_url = 'http://mini.eastday.com/assets/v1/js/search_word.js'

    # Timeout in seconds for every outgoing request made by this class.
    TIMEOUT = 30

    # site key -> (display name used in log messages, proxy scheme)
    _SITE_KINDS = {
        'kuaidaili:https': ('快https', 'https'),
        'kuaidaili:http': ('快http', 'http'),
        'zhima': ('芝麻', 'https'),
    }

    # Status codes of the probe response that count as "proxy is usable".
    _USABLE_STATUS = (200, 429)

    def _site_entry(self, site):
        """Return ``(display_name, scheme)`` for *site*.

        Raises:
            ValueError: if *site* is not one of the known site keys.
        """
        try:
            return self._SITE_KINDS[site]
        except KeyError:
            raise ValueError('unknown proxy site: %r' % (site,))

    def _fetch_kuaidaili(self, http_or_https):
        """Fetch one batch from kuaidaili; return a list of proxy dicts.

        *http_or_https* is 'http' or 'https'. Any failure is logged and
        followed by a 3 second pause; whatever was collected so far is
        returned.
        """
        redis_key = 'kuaidaili:%s' % http_or_https
        kuaidaili_name, _ = self._site_entry(redis_key)
        if http_or_https == 'https':
            kuaidaili_url = self.get_proxy_kuaidaili_https
        else:
            kuaidaili_url = self.get_proxy_kuaidaili_http

        found = []
        try:
            # A timeout keeps a stalled provider from blocking the fetch loop forever.
            resp = requests.get(kuaidaili_url, timeout=self.TIMEOUT)
            dictx = json.loads(resp.content)
            proxy_list = dictx['data']['proxy_list']
            logger.info('从%s代理获取的 %s 个ip分别是:%s' % (kuaidaili_name, len(proxy_list), resp.content))
            for address in proxy_list:
                found.append({'adress': address, "site": redis_key})
        except Exception as e:
            logger.warning('从%s代理获取代理错误: %s' % (kuaidaili_name, str(e)))
            time.sleep(3)
        return found

    def _fetch_zhima(self):
        """Fetch one batch from zhima; return a list of proxy dicts.

        Returns an empty list when the daily quota message is present in the
        response. Any failure is logged and followed by a 3 second pause.
        """
        found = []
        try:
            resp = requests.get(self.get_proxy_zhima, timeout=self.TIMEOUT)
            # The provider answers with this message once the daily quota is used up.
            if '您的套餐今日已到达上限' in resp.content:
                logger.warning('从芝麻代理获取ip已达上限 %s' % resp.content)
                return found
            dictx = json.loads(resp.content)
            logger.info('从芝麻代理获取的 %s 个ip分别是:%s' % (len(dictx['data']), resp.content))
            for item in dictx['data']:
                found.append({'adress': "%s:%s" % (item['ip'], item['port']), "site": "zhima"})
        except Exception as e:
            logger.warning('从芝麻代理获取代理错误: %s' % str(e))
            time.sleep(3)
        return found

    def get_proxies(self, zhima, kuaidaili_https, kuaidaili_http):
        """Run forever: top up the redis proxy pools from the enabled providers.

        Each flag enables its provider when equal to 1. A provider is queried
        only while its redis set is below its threshold (6 for zhima, 100 for
        kuaidaili https, 80 for kuaidaili http). Fetched proxies are probed
        concurrently with ``check_proxy``.
        """
        while 1:
            prs = []
            if zhima == 1 and r.scard('zhima') < 6:
                prs.extend(self._fetch_zhima())
            if kuaidaili_https == 1 and r.scard('kuaidaili:https') < 100:
                prs.extend(self._fetch_kuaidaili('https'))
            if kuaidaili_http == 1 and r.scard('kuaidaili:http') < 80:
                prs.extend(self._fetch_kuaidaili('http'))

            if not prs:
                # Nothing to probe (pools full, quota reached or fetch failed):
                # pause instead of spinning on redis and flooding the log.
                time.sleep(1)
                continue

            self.gevent_pool.map(self.check_proxy, prs)
            logger.info('检测完成%s' % ('-' * 300))

    def _get_kind(self, dictx):
        """Return ``(display_name, check_url, proxies)`` for a proxy dict.

        *proxies* is the mapping handed to ``requests.get(proxies=...)``.

        Raises:
            ValueError: if ``dictx['site']`` is not a known site key.
        """
        daili_name, scheme = self._site_entry(dictx.get('site'))
        if scheme == 'https':
            check_url = self.check_https_url
        else:
            check_url = self.check_http_url
        proxy = {scheme: '%s://%s' % (scheme, dictx['adress'])}
        return daili_name, check_url, proxy

    def check_proxy(self, pr):
        """Probe a freshly fetched proxy and add it to redis when usable.

        Proxies already present in their redis set are not probed again.
        Request and redis errors are logged at debug level and swallowed, so
        one bad proxy cannot abort the whole batch.
        """
        daili_name, check_url, proxy = self._get_kind(pr)
        try:
            if r.sismember(pr['site'], pr['adress']):
                logger.debug('check检测%s代理ip %s 已存在redis中', daili_name, pr['adress'])
                return

            resp = requests.get(check_url, proxies=proxy, timeout=self.TIMEOUT)
            if resp.status_code in self._USABLE_STATUS:
                logger.debug('check检测%s代理ip %s 可用', daili_name, str(pr['adress']))
                r.sadd(pr['site'], pr['adress'])
            else:
                # e.g. 407: the proxy requires authentication
                logger.debug('check检测%s代理ip %s 返回状态不是200,返回状态是 %s ',
                             daili_name, str(pr['adress']), resp.status_code)
        except Exception as e:
            logger.debug('check检测%s代理ip %s 不可用原因是: %s', daili_name, str(pr['adress']), str(e))

    def drop_redis_daili(self, redis_key, time_sleep):
        """Run forever: re-probe every proxy stored under *redis_key*.

        After each full pass the remaining pool size is logged and the loop
        sleeps *time_sleep* seconds.

        Raises:
            ValueError: if *redis_key* is not a known site key.
        """
        daili_name, _ = self._site_entry(redis_key)
        while 1:
            prs = [{'adress': '%s' % address, 'site': redis_key}
                   for address in r.smembers(redis_key)]
            self.gevent_pool2.map(self.drop_daili, prs)
            logger.info('清除完成%s代理,redis中有%s个%s代理%s' % (daili_name, r.scard(redis_key), daili_name, '#' * 300))
            time.sleep(time_sleep)

    def drop_daili(self, pr):
        """Re-probe one stored proxy and remove it from redis when it is dead.

        A proxy is removed when the probe request raises, or when it answers
        with a status that ``check_proxy`` would not have accepted.
        """
        daili_name, check_url, proxy = self._get_kind(pr)
        try:
            resp = requests.get(check_url, proxies=proxy, timeout=self.TIMEOUT)
        except Exception as e:
            logger.debug('redis检测%s代理ip %s 失效了原因是: %s', daili_name, pr, str(e))
            r.srem(pr['site'], pr['adress'])
            return

        if resp.status_code in self._USABLE_STATUS:
            logger.debug('redis检测%s代理ip %s 没有失效,返回状态是 %s', daili_name, pr, resp.status_code)
        else:
            # Same acceptance rule as check_proxy: an answering proxy that
            # returns e.g. 407 is of no use to the crawlers.
            logger.debug('redis检测%s代理ip %s 失效了原因是: 返回状态是 %s', daili_name, pr, resp.status_code)
            r.srem(pr['site'], pr['adress'])



# class Xdaili(Daili):   ##使用讯代理接口检测
#
#     check_url=\'http://www.xdaili.cn/ipagent//checkIp/ipList\'
#     TIMEOUT2=30
#     def check_proxy(self,pr):
#
#         if pr[\'site\']==\'kuaidaili\':
#             daili_name=\'快\'
#         if pr[\'site\']==\'zhima\':
#             daili_name=\'芝麻\'
#         try:
#             if r.sismember(pr[\'site\'], pr[\'adress\']) == 0 or r.sismember(pr[\'site\'], pr[\'adress\']) ==1 :   ###ip如果已经在redis中了就不检测
#                 paramsx={\'ip_ports[]\':pr[\'adress\']}
#                 resp=requests.get(self.check_url,params=paramsx , timeout=self.TIMEOUT2)
#
#                 #print resp.content
#                 if \'time\' in resp.content:
#                     logger.info(\'check检测%s代理ip %s 可用\'%(daili_name,str(pr[\'adress\'])))
#                     r.sadd(pr[\'site\'], pr[\'adress\'])
#                     print \'使用讯代理检测ip可用\',resp.content
#                 else:
#                     print  \'使用讯代理检测ip不可用\',resp.content
#
#             else:
#                 logger.info(\'check检测%s代理ip %s 已存在redis中\' % (daili_name,pr[\'adress\']))
#
#         except Exception, e:
#             print e
#             #logger.warning(\'check检测%s代理ip %s 不可用原因是: %s\' %(daili_name,str(pr[\'adress\']),str(e)))





if __name__=="__main__":
    # Script entry point: configure logging, then start one fetch loop and one
    # clean-up loop per redis proxy set, each in its own thread.
    #
    # NOTE(review): gevent's documentation recommends monkey-patching as early
    # as possible, before other modules are imported; here requests, threading
    # and concurrent.futures are already imported when this runs — confirm the
    # patch takes effect as intended.
    monkey.patch_all(select=True, socket=True)
    lg = Logger(logname='daili_dandu.txt', loglevel=1, logger="daili").getlog()
    Logger(logname='daili_future_log.txt', loglevel=2, logger="concurrent.futures").getlog()
    daili = Daili()
    # Manual single-proxy check, handy when debugging:
    # daili.check_proxy({'adress':'58.16.42.140:80','site':'kuaidaili:http'})
    ths = [
        # Fetch from all three sources (each flag set to 1 enables a provider).
        threading.Thread(target=daili.get_proxies, args=(1, 1, 1)),
        # Second argument is the pause in seconds between clean-up passes.
        threading.Thread(target=daili.drop_redis_daili, args=('kuaidaili:https', 1)),
        threading.Thread(target=daili.drop_redis_daili, args=('kuaidaili:http', 1)),
        threading.Thread(target=daili.drop_redis_daili, args=('zhima', 500)),
    ]
    for t in ths:
        t.start()

 



 







版权声明:本文为ydf0509原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/ydf0509/p/7298133.html