python网络爬虫抓取网站图片
本文介绍两种爬取方式:
1.正则表达式
2.bs4解析HTML
以下为正则表达式爬虫,面向对象封装后的代码如下:
import os
import re            # regular-expression matching
import urllib.request  # used to download the image files

import requests      # used to send the HTTP requests


class GetJpg(object):
    """Scrape images from budejie.com list pages using regular expressions.

    Each start URL is fetched, the image container <div> sections are
    extracted with a regex, and every image found is downloaded to a
    local directory.
    """

    def __init__(self, start_urls):
        # start_urls: iterable of page URLs to crawl
        self.start_urls = start_urls

    def get_response(self, url):
        """Return the HTML text of the page at *url*."""
        response = requests.get(url).text
        return response

    def get_content(self, html):
        """Return every image-container <div> section found in *html*."""
        reg = re.compile(r'(<div class="j-r-list-c">.*?</div>.*?</div>)', re.S)
        return re.findall(reg, html)

    def get_jpg_url(self, content):
        """Return the image URLs (data-original attributes) inside *content*."""
        reg = r'data-original="(.*?)"'
        return re.findall(reg, content)

    def get_jpg_name(self, content):
        """Return the image titles (detail-link anchor texts) inside *content*."""
        reg = re.compile(r'<a href="/detail-.{8}.html">(.*?)</a>')
        return re.findall(reg, content)

    def download_jpg(self, src_url, path, index):
        """Download *src_url* and save it to the local image directory.

        :param src_url: URL of the image file
        :param path: image title; internal whitespace is removed
        :param index: file extension taken from the URL (e.g. 'jpg', 'gif')
        """
        path = ''.join(path.split())
        # Raw string: the original non-raw form only worked because '\P'
        # and similar are not recognized escape sequences in CPython.
        path = r'E:\Python\爬图片\{name}.{index}'.format(name=path, index=index)
        if not os.path.exists(path):
            urllib.request.urlretrieve(src_url, path)  # download the image
            print('OK!!!')
        else:
            print('文件已存在')

    def get_url_name(self, start_url):
        """Download every image found on one page (*start_url*).

        Kept as a separate method (rather than inlined in main) to avoid an
        extra level of loop nesting there.
        """
        content = self.get_content(self.get_response(start_url))
        for section in content:
            jpg_url = self.get_jpg_url(section)
            if not jpg_url:
                continue
            jpg_name = self.get_jpg_name(section)
            index = jpg_url[0].split('.')[-1]  # file extension from the URL
            try:
                self.download_jpg(jpg_url[0], jpg_name[0], index)
            except Exception:
                # Best-effort: a failed download or a missing title must not
                # abort the whole crawl (was a bare `except:` which also
                # swallowed KeyboardInterrupt/SystemExit).
                continue

    def main(self):
        """Crawl every configured start URL in order."""
        # A plain loop: the original used a list comprehension purely for
        # its side effects, building a throwaway list of None.
        for start_url in self.start_urls:
            self.get_url_name(start_url)


if __name__ == '__main__':
    start_urls = ['http://www.budejie.com/{id}'.format(id=i) for i in range(1, 10)]
    jpg = GetJpg(start_urls)  # instantiate the crawler
    jpg.main()
以下为使用bs4爬取的代码:
import re
import urllib.request

from bs4 import BeautifulSoup


def get_urls(img_girl):
    """Return the src attribute of every <img> tag in *img_girl*.

    :param img_girl: iterable of <img> tag objects
    :return: list of image URLs (None for tags without src)
    """
    all_urls = [girl.get('src') for girl in img_girl]
    return all_urls


def get_img_name(img_girl):
    """Return the title attribute of every <img> tag in *img_girl*.

    :param img_girl: iterable of <img> tag objects
    :return: list of image titles (None for tags without title)
    """
    all_name = [girl.get('title') for girl in img_girl]
    return all_name


def get_img_resource(url):
    """Fetch *url* and return every <img> tag in its HTML.

    :param url: page URL to fetch
    :return: list of <img> tag objects found in the page source
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
        # Fixed: the original header name 'Accept - Language' contained
        # spaces, which is not a valid HTTP header name.
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }  # request headers that imitate a browser
    req = urllib.request.Request(url, headers=headers)   # build the request
    res = urllib.request.urlopen(req, timeout=20)        # send it
    content = res.read()                                 # raw page source
    soup = BeautifulSoup(content, 'html.parser')         # parse the HTML
    return soup.find_all('img')                          # all <img> tags


def main(url):
    """Download and save every image found on *url*.

    :param url: page URL to crawl
    """
    # Fetch the page once; the original called get_img_resource(url) twice,
    # issuing two identical HTTP requests per page.
    tags = get_img_resource(url)
    urls = get_urls(tags)
    names = get_img_name(tags)
    # enumerate+zip replaces urls.index(src_url), which was O(n) per image
    # and returned the wrong title whenever two images shared the same URL.
    for x, (src_url, name) in enumerate(zip(urls, names), start=1):
        # Strip non-word characters from the title so the filename is legal.
        path = ''.join(re.split(r'\W', name))
        path = r'E:\Python\爬图片\BS4\{name}_{index}.jpg'.format(name=path, index=x)
        urllib.request.urlretrieve(src_url, path)
        print('OK')


if __name__ == "__main__":
    # `i` instead of `id`: the original loop variable shadowed the builtin.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=i)
            for i in range(1, 10)]
    for url in urls:
        main(url)
bs4面向对象封装后代码:
import re
import urllib.request

from bs4 import BeautifulSoup


class GetWebImg(object):
    """Download every image on one page of dbmeinv.com, parsed with bs4."""

    def __init__(self, url, index):
        self.url = url      # page URL to crawl
        self.index = index  # 1-based page number, used in the saved filenames

    def get_urls(self, img_girl):
        """Return the src attribute of every <img> tag in *img_girl*.

        :param img_girl: iterable of <img> tag objects
        :return: list of image URLs
        """
        all_urls = [girl.get('src') for girl in img_girl]
        return all_urls

    def get_img_name(self, img_girl):
        """Return the title attribute of every <img> tag in *img_girl*.

        :param img_girl: iterable of <img> tag objects
        :return: list of image titles
        """
        all_name = [girl.get('title') for girl in img_girl]
        return all_name

    def get_img_resource(self, url):
        """Fetch *url* and return every <img> tag in its HTML.

        :param url: page URL to fetch
        :return: list of <img> tag objects found in the page source
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            # Fixed: the original header name 'Accept - Language' contained
            # spaces, which is not a valid HTTP header name.
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }  # request headers that imitate a browser
        req = urllib.request.Request(url, headers=headers)  # build the request
        res = urllib.request.urlopen(req, timeout=20)       # send it
        content = res.read()                                # raw page source
        soup = BeautifulSoup(content, 'html.parser')        # parse the HTML
        return soup.find_all('img')                         # all <img> tags

    def main(self):
        """Download and save every image found on self.url."""
        # Fetch the page once; the original called get_img_resource twice,
        # issuing two identical HTTP requests per page.
        tags = self.get_img_resource(self.url)
        url_list = self.get_urls(tags)
        name_list = self.get_img_name(tags)
        # enumerate+zip replaces url_list.index(src_url), which was O(n) per
        # image and returned the wrong title for duplicate image URLs.
        for x, (src_url, name) in enumerate(zip(url_list, name_list), start=1):
            # Strip non-word characters so the filename is legal on Windows.
            path = ''.join(re.split(r'\W', name))
            path = r'E:\Python\爬图片\BS4\{name}_{index}_{id}.jpg'.format(
                name=path, index=self.index, id=x)
            urllib.request.urlretrieve(src_url, path)
            print('第{index}页第{id}张图片下载OK'.format(index=self.index, id=x))


if __name__ == "__main__":
    # `i` instead of `id`: the original loop variable shadowed the builtin.
    urls = ['https://www.dbmeinv.com/dbgroup/show.htm?cid=4&pager_offset={i}'.format(i=i)
            for i in range(1, 10)]
    for index, url in enumerate(urls, start=1):
        get_img = GetWebImg(url, index)
        get_img.main()
运行结果:
版权声明:本文为wolfshining原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。