本文介绍两种爬取方式:

1.正则表达式

2.bs4解析HTML

 

以下为正则表达式爬虫,面向对象封装后的代码如下:

import urllib.request  # 用于下载图片
import os
import requests  # 发送http请求
import re   # 正则表达式匹配


class GetJpg(object):
    """Regex-based scraper: download every picture from a list of budejie.com pages."""

    def __init__(self, start_urls):
        # start_urls: iterable of listing-page URLs to crawl.
        self.start_urls = start_urls

    def get_response(self, url):
        """Return the HTML body of *url* as text."""
        return requests.get(url).text

    def get_content(self, html):
        """Return every per-image <div class="j-r-list-c"> section found in *html*."""
        # re.S lets '.' span newlines so each multi-line section matches as one unit.
        pattern = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
        return pattern.findall(html)

    def get_jpg_url(self, content):
        """Return the image URLs (data-original attributes) inside *content*."""
        return re.findall(r'data-original="(.*?)"', content)

    def get_jpg_name(self, content):
        """Return the image titles (anchor text of the detail links) inside *content*."""
        pattern = re.compile(r'<a href="/detail-.{8}.html">(.*?)</a>')
        return pattern.findall(content)

    def download_jpg(self, src_url, path, index):
        """Download *src_url* into the local folder, naming the file after *path*.

        :param src_url: image URL to fetch
        :param path: image title, used as the file name stem
        :param index: file extension taken from the source URL (e.g. 'jpg', 'gif')

        Skips the download when the target file already exists.
        """
        path = ''.join(path.split())  # strip all whitespace from the title
        # Raw string: the Windows path contains backslashes; the original
        # non-raw literal relied on '\P'/'\{' not being escape sequences.
        path = r'E:\Python\爬图片\{name}.{index}'.format(name=path, index=index)
        if not os.path.exists(path):
            urllib.request.urlretrieve(src_url, path)
            print('OK!!!')
        else:
            print('文件已存在')

    def get_url_name(self, start_url):
        """Download every image found on one listing page (best effort per image)."""
        for section in self.get_content(self.get_response(start_url)):
            jpg_url = self.get_jpg_url(section)
            if not jpg_url:
                continue
            jpg_name = self.get_jpg_name(section)
            index = jpg_url[0].split('.')[-1]  # extension of the source URL
            try:
                self.download_jpg(jpg_url[0], jpg_name[0], index)
            except Exception:  # was a bare except; keep best-effort but not SystemExit/KeyboardInterrupt
                continue

    def main(self):
        """Crawl every start URL in turn."""
        # Plain loop: we iterate for side effects, so a list comprehension
        # (as in the original) would build a throwaway list of Nones.
        for start_url in self.start_urls:
            self.get_url_name(start_url)


if __name__ == '__main__':
    # Listing pages 1-9 of budejie.com.
    page_urls = ['http://www.budejie.com/{id}'.format(id=page) for page in range(1, 10)]
    spider = GetJpg(page_urls)
    spider.main()

以下为使用bs4爬取的代码:

from bs4 import BeautifulSoup
import urllib.request
import re


def get_urls(img_girl):
    """Collect the 'src' attribute of each tag in *img_girl*.

    :param img_girl: iterable of <img> tag objects
    :return: list of image source URLs (None for tags without 'src')
    """
    sources = []
    for tag in img_girl:
        sources.append(tag.get('src'))
    return sources


def get_img_name(img_girl):
    """Collect the 'title' attribute of each tag in *img_girl*.

    :param img_girl: iterable of <img> tag objects
    :return: list of image titles (None for tags without 'title')
    """
    titles = []
    for tag in img_girl:
        titles.append(tag.get('title'))
    return titles


def get_img_resource(url):
    '''
    :param url: 网站url
    :return: 网页源码中的所有<img>标签内容
    '''
    # Browser-like request headers. The original used 'Accept - Language'
    # with embedded spaces, which is not a valid HTTP header name, and a
    # value with spaces around '-' and '='; both are fixed here.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
               'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the socket deterministically (the original leaked it).
    with urllib.request.urlopen(req, timeout=20) as res:
        content = res.read()  # raw page bytes
    soup = BeautifulSoup(content, 'html.parser')  # HTML parsing
    img_girl = soup.find_all('img')  # every <img> tag in the page
    return img_girl


def main(url):
    '''
    下载保存图片
    :param url: 网站url
    '''
    # Fetch the page once; the original called get_img_resource(url) twice,
    # downloading the same page over the network two times.
    img_tags = get_img_resource(url)
    src_urls = get_urls(img_tags)
    names = get_img_name(img_tags)
    # zip + enumerate replaces urls.index(src_url), which was O(n) per image
    # and returned the wrong title whenever two images shared a URL.
    for x, (src_url, name) in enumerate(zip(src_urls, names), start=1):
        # Strip non-word characters so the title is a legal file name;
        # `name or ''` guards against <img> tags without a title (was a crash).
        path = ''.join(re.split(r'\W', name or ''))
        # Raw string: Windows path with backslashes.
        path = r'E:\Python\爬图片\BS4\{name}_{index}.jpg'.format(name=path, index=x)
        urllib.request.urlretrieve(src_url, path)
        print('OK')

if __name__ == "__main__":
    # Listing pages 1-9 of dbmeinv.com.
    page_urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page)
                 for page in range(1, 10)]
    for page_url in page_urls:
        main(page_url)

 bs4面向对象封装后代码:

from bs4 import BeautifulSoup
import urllib.request
import re


class GetWebImg(object):
    """bs4-based scraper: download every <img> picture from one listing page."""

    def __init__(self, url, index):
        self.url = url      # page URL to crawl
        self.index = index  # 1-based page number, used in saved file names

    def get_urls(self, img_girl):
        """Return the 'src' attribute of every tag in *img_girl*."""
        return [girl.get('src') for girl in img_girl]

    def get_img_name(self, img_girl):
        """Return the 'title' attribute of every tag in *img_girl*."""
        return [girl.get('title') for girl in img_girl]

    def get_img_resource(self, url):
        """Fetch *url* and return all <img> tags from its HTML.

        :param url: page URL to fetch
        :return: list of bs4 <img> tag objects
        """
        # Browser-like headers. The original 'Accept - Language' (with spaces)
        # is not a valid HTTP header name; value normalized as well.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8'
            }
        req = urllib.request.Request(url, headers=headers)
        # Context manager closes the socket deterministically.
        with urllib.request.urlopen(req, timeout=20) as res:
            content = res.read()  # raw page bytes
        soup = BeautifulSoup(content, 'html.parser')  # HTML parsing
        return soup.find_all('img')

    def main(self):
        """Download and save every image found on self.url."""
        # Fetch the page once; the original downloaded it twice.
        img_tags = self.get_img_resource(self.url)
        url_list = self.get_urls(img_tags)
        name_list = self.get_img_name(img_tags)
        # zip + enumerate replaces url_list.index(src_url), which returned the
        # wrong title whenever two images shared the same URL.
        for x, (src_url, name) in enumerate(zip(url_list, name_list), start=1):
            # Strip non-word characters so the title is a legal file name;
            # `name or ''` guards against <img> tags without a title.
            path = ''.join(re.split(r'\W', name or ''))
            path = r'E:\Python\爬图片\BS4\{name}_{index}_{id}.jpg'.format(name=path, index=self.index, id=x)
            urllib.request.urlretrieve(src_url, path)
            print('第{index}页第{id}张图片下载OK'.format(index=self.index, id=x))


if __name__ == "__main__":
    # Listing pages 1-9; enumerate supplies the page number the original
    # tracked with a manually incremented counter.
    page_urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=page)
                 for page in range(1, 10)]
    for page_no, page_url in enumerate(page_urls, start=1):
        GetWebImg(page_url, page_no).main()

运行结果:

 

版权声明:本文为wolfshining原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/wolfshining/p/9013906.html