一、爬取免费代理IP

1、爬取代理IP:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool #多线程模块
from pymongo import MongoClient
# Shared accumulator: every worker thread appends scraped proxy dicts here,
# and the driver below bulk-inserts the whole list into MongoDB at the end.
data = []

def getIp(page):
    """Scrape one listing page of xicidaili.com and append proxy
    candidates to the module-level ``data`` list.

    Each appended entry is ``{'ip': 'host:port', 'verify': False}`` —
    unverified until the second script checks it with a live request.
    """
    url = 'https://www.xicidaili.com/nt/%d' % (page)
    # Fake a desktop-browser UA so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, 'lxml')
    for row in soup.find_all('tr'):
        tds = row.find_all('td')  # hoisted: was queried twice per row
        try:
            data.append({'ip': '%s:%s' % (tds[1].get_text(), tds[2].get_text()),
                         'verify': False})
        except IndexError:
            # Header/malformed rows have fewer than three <td> cells — skip
            # them instead of swallowing every exception with a bare except.
            continue

# Fan the 100 listing pages out over 10 threads; threads suit this
# I/O-bound scraping (the GIL is released while waiting on sockets).
pool = ThreadPool(10)
pool.map(getIp, range(100))
pool.close()
pool.join()
print(data)
print(len(data))

db = MongoClient('127.0.0.1', 27017).test
# insert_many raises InvalidOperation on an empty list, so only insert
# when the scrape actually produced candidates.
if data:
    db.ippool.insert_many(data)

此处涉及知识点:请求库、解析库、多线程模块、非关系型数据库

 二、建立代理IP池

 

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:Meng Zhaoce
import multiprocessing #引入多线程模块
import time
import requests
from pymongo import MongoClient
import redis
# The scraper stored candidates in the "test" database; ".text" here was a
# typo pointing at a different (empty) database, so no IPs were ever loaded.
db = MongoClient('127.0.0.1', 27017).test
url = 'http://www.baidu.com'  # cheap, reliable endpoint to test proxies against
# Pull every still-unverified candidate that script 1 stored.
ippool = [doc['ip'] for doc in db.ippool.find({'verify': False})]
start = time.time()
def verify(ip):
    """Check a single ``host:port`` proxy by fetching ``url`` through it.

    Working proxies (HTTP 200 within 2 seconds) are re-inserted into
    MongoDB with ``verify=True``; failures are just logged and dropped.
    """
    proxies = {
        'http': 'http://%s' % (ip)
    }
    try:
        res = requests.get(url, proxies=proxies, timeout=2)
        print(res.status_code)
        if res.status_code == 200:
            # Collection.insert() was deprecated in pymongo 3 and removed
            # in pymongo 4; insert_one is the supported single-doc API.
            db.ippool.insert_one({'ip': ip, 'verify': True})
            print('insert finished'.center(50, '*'))
    except Exception as e:
        # Best-effort filter: dead proxies raise ConnectTimeout /
        # ProxyError etc.; report and move on rather than abort the pool.
        print(e)

# NOTE(review): on platforms using the "spawn" start method (Windows,
# macOS default since 3.8) this top-level code must sit under
# ``if __name__ == '__main__':`` or workers re-execute it — confirm target OS.
pool = multiprocessing.Pool(processes=10)
pool.map(verify, ippool[:100])
pool.close()  # release workers once the map completes (matches script 1)
pool.join()
print(time.time() - start)
print('finished')  # fixed output typo: was 'finshed'

 

版权声明:本文为1218-mzc原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/1218-mzc/p/11780484.html