实战1:建立代理IP池
一、爬取免费代理IP
1、爬取代理IP:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
"""Scrape free proxy IPs from xicidaili.com and store them in MongoDB."""
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool  # thread pool, suited to I/O-bound fetching
from pymongo import MongoClient

# Shared result list; list.append is atomic in CPython, so the worker
# threads may append to it concurrently without a lock.
data = []

# Spoofed browser header so the site does not reject the scripted request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}


def getIp(page):
    """Fetch one proxy-listing page and append every 'ip:port' row to ``data``.

    :param page: 1-based page number of the listing.
    """
    url = 'https://www.xicidaili.com/nt/%d' % (page)
    try:
        res = requests.get(url, headers=HEADERS, timeout=10).text  # timeout so a dead page cannot hang a worker
    except requests.RequestException as e:
        # Log and skip a failed page instead of silently swallowing every error.
        print('page %d failed: %s' % (page, e))
        return
    soup = BeautifulSoup(res, 'lxml')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')  # look the cells up once per row (original called find_all twice)
        # Header rows contain no <td>; only keep rows that have ip and port columns.
        if len(cells) > 2:
            data.append({
                'ip': '%s:%s' % (cells[1].get_text(), cells[2].get_text()),
                'verify': False,
            })


pool = ThreadPool(10)
# BUG FIX: the listing pages are numbered from 1; the original range(100)
# also requested the non-existent page 0.
pool.map(getIp, range(1, 101))
pool.close()
pool.join()

print(data)
print(len(data))
if data:  # insert_many raises InvalidOperation on an empty list
    db = MongoClient('127.0.0.1', 27017).test
    db.ippool.insert_many(data)
此处涉及知识点:请求库、解析库、多线程模块、非关系型数据库
二、建立代理IP池
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
"""Validate the scraped proxies and store the working ones back in MongoDB."""
import multiprocessing  # process pool for parallel validation
import time
import requests
from pymongo import MongoClient
import redis

# BUG FIX: was MongoClient(...).text — the scraper stores its results in the
# 'test' database, so the verifier must read from the same one.
db = MongoClient('127.0.0.1', 27017).test
url = 'http://www.baidu.com'  # probe target used to decide whether a proxy works


def verify(ip):
    """Send one HTTP request through ``ip``; persist it with verify=True on success.

    :param ip: proxy address in 'host:port' form.
    """
    proxies = {
        'http': 'http://%s' % (ip)
    }
    try:
        res = requests.get(url, proxies=proxies, timeout=2)
        print(res.status_code)
        if res.status_code == 200:
            # insert_one replaces Collection.insert, which was removed in PyMongo 4.
            db.ippool.insert_one({'ip': ip, 'verify': True})
            print('insert finished'.center(50, '*'))
    except Exception as e:
        # A dead proxy simply times out or refuses — log it and move on.
        print(e)


if __name__ == '__main__':
    # BUG FIX: the __main__ guard is required — multiprocessing re-imports
    # this module in each worker (spawn start method on Windows/macOS), and
    # without the guard the pool creation would recurse.
    ippool = [doc['ip'] for doc in db.ippool.find({'verify': False})]

    start = time.time()
    pool = multiprocessing.Pool(processes=10)
    pool.map(verify, ippool[:100])
    pool.close()
    pool.join()
    print(time.time() - start)
    print('finished')  # typo fix: was 'finshed'
版权声明:本文为1218-mzc原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。