首先需要从 IP 代理网站爬取可用的 IP,并保存到数据库中。

  1. import requests
  2. from scrapy.selector import Selector
  3. import pymysql
  4. conn = pymysql.connect(host = \'127.0.0.1\', user = \'root\' ,passwd = \'root\',db = \'mysql18_text\',charset = \'utf8\')
  5. cursor = conn.cursor()
  6. def crawl_ips():
  7. #爬取xici的免费ip代理
  8. agent = \'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0\'
  9. header = {
  10. \'User-Agent\':agent
  11. }
  12. for i in range(1,3458):
  13. reas = requests.get(\'http://www.xicidaili.com/nn/\',headers = header)
  14. Selectora = Selector(reas)
  15. all_trs = Selectora.xpath(\'//table[@id="ip_list"]/tr\')
  16. ip_list = []
  17. for tr in all_trs[1:]:
  18. spend_str = tr.xpath(\'./td/div[@class="bar"]/@title\').extract()[0] ##提取速度
  19. if spend_str:
  20. speed = float(spend_str.split(\'\')[0])
  21. all_text = tr.xpath(\'./td/text()\').extract()
  22. ip = all_text[0]
  23. port = all_text[1]
  24. proxy_type = all_text[5]
  25. ip_list.append((ip,port,speed,proxy_type))
  26. for ip_info in ip_list:
  27. cursor.execute(
  28. """insert project_ip(ip,port,speed,proxy_type) VALUES(\'{0}\',\'{1}\',\'{2}\',\'HTTP\')""".format(
  29. ip_info[0],ip_info[1],ip_info[2]
  30. )
  31. )
  32. conn.commit()
  33. print(ip_list)
  34. crawl_ips()
  35. conn.close()
  36. cursor.close()

从数据库中随机获取一个可用 IP 的代码:

  1. class GetIP(object):
  2. def delete_ip(self,ip):
  3. #从数据库中删除无效的ip
  4. delete_sql = """
  5. delete from project_ip where ip=\'{0}\'
  6. """.format(ip)
  7. cursor.execute(delete_sql)
  8. conn.commit()
  9. return True
  10. def judge_ip(self,ip,port):
  11. #判断一个ip是否可用
  12. http_url = \'http://www.baidu.com\'
  13. proxy_url = \'https://{0}:{1}\'.format(ip,port)
  14. try:
  15. proxy_dict = {
  16. \'http\':proxy_url,
  17. }
  18. requests.get(http_url,proxies = proxy_dict)
  19. return True
  20. except Exception as e:
  21. print("ip出现异常")
  22. #出现异常后就把这个ip给删除掉
  23. self.delete_ip(ip)
  24. return False
  25. else:
  26. code = response.status_code
  27. if code>=200 and code<300:
  28. print(\'effective ip\')
  29. return True
  30. else:
  31. print(\'invalid\')
  32. self.delete_ip(ip)
  33. return False
  34. def get_random_ip(self):
  35. #从数据库中随机获取到一个可用的ip
  36. random_sql = """
  37. SELECT ip,port FROM project_ip
  38. ORDER BY RAND()
  39. LIMIT 1
  40. """
  41. result = cursor.execute(random_sql)
  42. for ip_info in cursor.fetchall():
  43. ip = ip_info[0]
  44. port = ip_info[1]
  45. judge_re = self.judge_ip(ip,port)
           if judge_re:#如果返回True
             return "http://\'{0}\':\'{1}\'".format(ip,port)
           else:
              return get_random_ip()

在 Middleware 中动态设置 IP 代理:

  1. class RandomProxyMiddleware(object):
  2. def process_request(self,request,spider):
  3. get_ip = GetIP()#这里需要导入那个函数
  4. request.meta[\'proxy\'] = get_ip.get_random_ip()

 

版权声明:本文为ArtisticMonk原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/ArtisticMonk/p/9738921.html