自己写python爬虫从百度上下载图片脚本

参考URL: https://blog.csdn.net/z704630835/article/details/82992036

1 下载脚本

# 导入需要的库
import requests
import os
import json

# 爬取百度图片，解析页面的函数
def getManyPages(keyword, pages):
    """Query Baidu image search and collect the ``data`` entry of each page.

    Args:
        keyword: Search keyword for the images to download.
        pages: Number of result pages to request; each request asks for
            30 results (``rn``) at an offset (``pn``) that advances by 30.

    Returns:
        A list with one element per page that was fetched and parsed
        successfully: the value stored under ``data`` in the response JSON.
        Pages that fail to download, fail to parse, or carry no ``data``
        are reported on stdout and skipped, so callers never receive
        ``None`` entries.
    """
    url = 'https://image.baidu.com/search/acjson'
    urls = []

    # NOTE(review): the first offset is 30, not 0, so the first 30 results
    # are presumably never requested -- kept as in the original; confirm.
    for offset in range(30, 30 * pages + 30, 30):
        query = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            # Presumably a timestamp-style cache buster copied from a
            # browser request -- unverified.
            '1488942260214': '',
        }
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, params=query, timeout=10)
            data = response.json().get('data')
        except ValueError:
            # Every JSON decode error raised by ``response.json()`` is a
            # ``ValueError`` subclass, whichever JSON backend is in use.
            print("解析出错")
            continue
        except requests.exceptions.RequestException as err:
            print("请求出错：%s" % err)
            continue

        if data is None:
            # A response without ``data`` would break consumers that
            # iterate over every page.
            continue
        urls.append(data)
    return urls

# 下载图片并保存
def getImg(dataList, localPath):
    """Download every ``thumbURL`` found in *dataList* into *localPath*.

    Args:
        dataList: Iterable of pages; each page is an iterable of dicts that
            may carry a ``thumbURL`` key. ``None`` or empty pages are
            skipped.
        localPath: Directory that receives the images; created if missing.
            Files are named ``0.jpg``, ``1.jpg``, ... in download order.
    """
    os.makedirs(localPath, exist_ok=True)

    saved = 0
    for page in dataList:
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL') if item else None
            if thumb_url is None:
                print('图片链接不存在')
                continue

            print('正在下载：%s' % thumb_url)
            try:
                ir = requests.get(thumb_url, timeout=10)
                ir.raise_for_status()
            except requests.exceptions.RequestException as err:
                # One bad link should not abort the whole batch.
                print('下载失败：%s' % err)
                continue

            # os.path.join works whether or not localPath ends with a
            # separator; the context manager closes the file promptly.
            target = os.path.join(localPath, '%d.jpg' % saved)
            with open(target, 'wb') as out_file:
                out_file.write(ir.content)
            saved += 1

# 根据关键词来下载图片
if __name__ == '__main__':
    # Fetch the result pages for the keyword, then save the thumbnails.
    dataList = getManyPages('吃惊', 20)  # arg 1: keyword, arg 2: number of pages
    getImg(dataList, './data/chijing/')  # arg 2: directory to save into

2 通过人脸检测过滤不含人脸的图片，并裁剪出人脸区域

2.1 使用opencv的人脸检测
#!/usr/bin/env python
# -*- coding:utf-8-*-

import os
import os.path as osp
import cv2
import glob

from io_helper import *

# Directory holding OpenCV's Haar cascade XML files (machine-specific path).
cv_root = 'D:/install packages/opencv-3.4.2/data/haarcascades'
# Cascade file used for frontal faces.
cv_face_model_path = cv_root + '/haarcascade_frontalface_alt2.xml'
# Cascade file for profile faces.
cv_face_model_path2 = cv_root + '/haarcascade_profileface.xml'


def test_face_detect_cv(filepath=''):
    """Detect frontal faces in one image and show them in a window.

    Args:
        filepath: Path of the image to inspect. Defaults to the empty
            string of the original script, which cannot be read; in that
            case a message is printed and the function returns.

    Detected faces are outlined with a 1-pixel rectangle in colour
    (0, 0, 255) and the image is shown until a key is pressed. Nothing is
    shown when no face is found.
    """
    classifier1 = cv2.CascadeClassifier(cv_face_model_path)  # frontal faces

    img = cv2.imread(filepath)
    if img is None:
        # Guard against a missing or unreadable file; passing None on to
        # cvtColor would raise.
        print('failed to read image: %r' % filepath)
        return

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faceRects = classifier1.detectMultiScale(gray,
                                             scaleFactor=1.1,
                                             minNeighbors=1,
                                             minSize=(10, 10))
    if len(faceRects):  # non-empty means at least one face was detected
        for x, y, w, h in faceRects:
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 1)

        cv2.imshow('a', img)
        cv2.waitKey(0)
    cv2.destroyAllWindows()


def face_detect_save(path=r'D:/AI/DataSet/emotion/fer2013/train_class'):
    """Crop detected faces out of a class-per-folder image set.

    Every ``<path>/<class>/*.jpg`` image is run through the frontal-face
    cascade. Face crops are written to ``<path>/cut_face/<class>/`` and
    images without a detected face are copied to ``<path>/no_face/<class>/``.

    Args:
        path: Root directory of the data set. Defaults to the location
            hard-coded in the original script.

    When several faces are found in one image, the first crop keeps the
    original file name and later ones get a ``_<index>`` suffix, so no crop
    overwrites another.
    """
    # Imported locally: the file has no explicit ``import shutil``, so the
    # name would only exist if the wildcard import happened to provide it.
    import shutil

    files = glob.glob(path + '/**/*.jpg')

    new_dir = path + '/cut_face'
    new_dir2 = path + '/no_face'
    mkdir_if_not_exist(new_dir)
    mkdir_if_not_exist(new_dir2)

    # OpenCV cascade classifier for frontal faces.
    classifier1 = cv2.CascadeClassifier(cv_face_model_path)
    for filepath in files:
        # Name of the parent folder, independent of the path separator
        # (the original split on a backslash, which fails elsewhere).
        class_name = osp.basename(osp.dirname(filepath))
        chd_dir = new_dir + '/' + class_name
        mkdir_if_not_exist(chd_dir)
        chd_dir2 = new_dir2 + '/' + class_name
        mkdir_if_not_exist(chd_dir2)

        filename = osp.basename(filepath)
        img = cv2.imread(filepath)
        if img is None:
            # Unreadable or corrupt file: skip it instead of crashing.
            print('skip unreadable image: %s' % filepath)
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faceRects = classifier1.detectMultiScale(gray,
                                                 scaleFactor=1.1,
                                                 minNeighbors=1,
                                                 minSize=(10, 10))
        if len(faceRects):
            stem, ext = osp.splitext(filename)
            for idx, (x, y, w, h) in enumerate(faceRects):
                face_roi = img[y:y + h, x:x + w, :]
                if idx == 0:
                    out_name = filename
                else:
                    out_name = '%s_%d%s' % (stem, idx, ext)
                cv2.imwrite(chd_dir + '/' + out_name, face_roi)
        else:
            shutil.copy(filepath, chd_dir2 + '/' + filename)

    print('work is done .')


if __name__ == '__main__':
    # Script entry point: run the crop/filter pass over the data set.
    face_detect_save()

2.2 使用mtcnn的包进行人脸检测
-----------------------------------------
使用 Python 开源包 mtcnn 进行人脸检测和关键点检测
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple mtcnn

gpu_id = 3
# Restrict the visible CUDA devices to the chosen GPU. These are set before
# TensorFlow is imported below, presumably so it only ever sees that device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
import tensorflow as tf
from mtcnn.mtcnn import MTCNN

detector = MTCNN(scale_factor=0.99)
# NOTE(review): `img` must be provided by the surrounding code; this snippet
# never loads it. The mtcnn README feeds RGB images -- if `img` comes from
# cv2.imread (BGR), convert it first; confirm against the caller.
face_list = detector.detect_faces(img)

# Image height and width, needed to clamp the padded box below. The slicing
# at the end indexes rows first, then columns, then channels.
src_h, src_w = img.shape[:2]

for item in face_list:
    box = item['box']
    conf = item['confidence']
    keypoints_dict = item['keypoints']
    # Example value:
    # {'left_eye': (14, 16), 'right_eye': (31, 12),
    #  'nose': (23, 25), 'mouth_left': (19, 35), 'mouth_right': (33, 32)}
    left_eyeXY = keypoints_dict['left_eye']
    right_eyeXY = keypoints_dict['right_eye']
    noseXY = keypoints_dict['nose']
    mouth_leftXY = keypoints_dict['mouth_left']
    mouth_rightXY = keypoints_dict['mouth_right']
    if conf > 0:
        print('detect a face .')
        x, y, w, h = box
        # Grow the box by `offset` pixels on every side, clamped to the
        # image bounds.
        offset = 5
        x = max(0, x - offset)
        y = max(0, y - offset)
        w = min(w + 2 * offset, src_w - x)
        h = min(h + 2 * offset, src_h - y)

        face_img = img[y:y + h, x:x + w, :]


-----------------------------------
2.3 使用关键点来进行人脸对齐

本文链接：https://www.cnblogs.com/dxscode/p/12593950.html

自己写python爬虫从百度上下载图片脚本

自己写python爬虫从百度上下载图片脚本的更多相关文章

随机推荐

热门专题

目录导航