Python爬虫 —— 抓取美女图片 - h_z_cong
代码如下:
# coding: utf-8
"""Download photo galleries listed on the mzitu.com archive page.

The script fetches the archive index, follows each gallery link, walks the
gallery's numbered pages and stores the single image found on every page as
``<page>.jpg`` inside a directory named after the gallery title.

The code runs on both Python 2 and Python 3.
"""
import codecs  # noqa: F401 -- retained from the original import list
import os
import re
import sys

import requests
from lxml import etree

# Directory under which one sub-directory per gallery is created.
DEFAULT_BASE_DIR = u"E:\\学习资料"
# Seconds to wait for the server before giving up on a request.
DEFAULT_TIMEOUT = 15
# Number of galleries downloaded by default before the crawl stops.
DEFAULT_MAX_GROUPS = 5

# Characters that cannot appear in a Windows file or directory name.
_ILLEGAL_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_dir_name(name):
    """Return *name* with characters illegal in directory names replaced."""
    cleaned = _ILLEGAL_NAME_CHARS.sub(u"_", name or u"").strip(u" .")
    return cleaned or u"untitled"


class Spider(object):
    """Crawler that saves every image of the galleries on an archive page."""

    def __init__(self, base_dir=DEFAULT_BASE_DIR, timeout=DEFAULT_TIMEOUT):
        """Prepare the request headers.

        Args:
            base_dir: directory in which the gallery directories are created.
            timeout: per-request timeout in seconds.
        """
        self.base_dir = base_dir
        self.timeout = timeout
        self.headers = {
            # The header name uses a hyphen; "User_Agent" is not a
            # recognised header, so the browser UA would never be sent.
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) "
                "Gecko/20100101 Firefox/60.0"
            ),
            "Referer": "http://www.mzitu.com/all/",
        }

    def _fetch(self, url):
        """GET *url* with the current headers; raise on HTTP error status."""
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Without this an error page body would be parsed or saved as a .jpg.
        response.raise_for_status()
        return response

    def crawl(self, root_url, max_groups=DEFAULT_MAX_GROUPS):
        """Download the galleries linked from the archive page *root_url*.

        Args:
            root_url: URL of the archive index page.
            max_groups: stop after this many galleries; ``None`` downloads
                every gallery found on the page.

        Raises:
            requests.RequestException: if the archive page cannot be fetched.
        """
        html_tree = etree.HTML(self._fetch(root_url).text)
        groups = html_tree.xpath(
            "//div[@class='main-content']//ul[@class='archives']//a"
        )
        print(u"开始抓取:")
        crawled = 0
        for group in groups:
            if max_groups is not None and crawled >= max_groups:
                break
            group_url = group.get("href")
            if not group_url:
                continue  # an anchor without a target cannot be followed
            # .text is None when the anchor has no direct text node.
            title = group.text or u""
            print(u"正在抓取组图:" + title)
            # The gallery title doubles as the name of its directory.
            dirpath = self.makDir(title)
            try:
                self.getGroup(group_url, dirpath)
            except requests.RequestException as exc:
                # One unreachable gallery should not abort the whole crawl.
                print(u"Skipping gallery %s: %s" % (group_url, exc))
            crawled += 1
        # Return normally instead of os._exit(0), which would skip output
        # flushing and interpreter cleanup.
        print(u"抓取完成……")

    def makDir(self, dirname):
        """Create (if needed) and return the directory for gallery *dirname*.

        The name is sanitised first so that titles containing characters such
        as ``?`` or ``:`` do not make directory creation fail.
        """
        dirpath = os.path.join(self.base_dir, _safe_dir_name(dirname))
        try:
            os.makedirs(dirpath)
        except OSError:
            # Already existing is fine; anything else is a real error.
            if not os.path.isdir(dirpath):
                raise
        return dirpath

    def getGroup(self, groupUrl, dirpath):
        """Download every page of the gallery at *groupUrl* into *dirpath*.

        Raises:
            requests.RequestException: if the gallery page cannot be fetched.
        """
        self.headers["Referer"] = groupUrl
        html_tree = etree.HTML(self._fetch(groupUrl).text)
        spans = html_tree.xpath("//div[@class='pagenavi']//span")
        try:
            # The second-to-last pager entry holds the number of pages.
            max_page = int(spans[-2].text)
        except (IndexError, TypeError, ValueError):
            print(u"Cannot determine page count for: " + groupUrl)
            return
        base_url = groupUrl.rstrip("/")
        for page in range(1, max_page + 1):
            # Each image lives on its own page: <gallery url>/<page number>.
            page_url = base_url + "/" + str(page)
            try:
                self.getPage(page_url, page, dirpath)
            except requests.RequestException as exc:
                print(u"Skipping page %s: %s" % (page_url, exc))

    def getPage(self, pageUrl, page, dirpath):
        """Fetch gallery page *pageUrl* and save its image as ``<page>.jpg``.

        Raises:
            requests.RequestException: if the page or the image request fails.
        """
        self.headers["Referer"] = pageUrl
        page_tree = etree.HTML(self._fetch(pageUrl).text)
        images = page_tree.xpath("//div[@class='main-image']//img")
        image_url = images[0].get("src") if images else None
        if not image_url:
            print(u"No image found on page: " + pageUrl)
            return
        # The image request is sent with the page URL as Referer.
        image = self._fetch(image_url).content
        self.saveImage(image, page, dirpath)

    def saveImage(self, image, page, dirpath):
        """Write the raw bytes *image* to ``<dirpath>/<page>.jpg``."""
        imagepath = os.path.join(dirpath, str(page) + u".jpg")
        # The context manager closes the file even if the write fails.
        with open(imagepath, "wb") as image_file:
            image_file.write(image)


if __name__ == "__main__":
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8 so
        # that the Chinese titles can be printed and used in paths.
        reload(sys)  # noqa: F821 -- builtin on Python 2
        sys.setdefaultencoding("utf-8")
    Mzitu = Spider()
    Mzitu.crawl("http://www.mzitu.com/all")