python爬虫之淘宝宝贝图片抓取
写在前面的话:家里有人开淘宝店,作为一个小的淘宝店主,经常要做的就是从别人的店铺(当然是批发商)把图片一张一张存下来。然后再自己做ps做好看一点,再上架。这样存图什么的,挺烦人的,刚好最近在学习python,发现这东西,真心的那叫一个方便。
总的来说,其实也并没有什么技术含量,只是熟悉一下python的语言和正则表达式的使用。
主要步骤 :
1、当然是抓取页面html代码
1 import urllib 2 import urllib2 3 4 #获取html代码 5 def getHtml(url): 6 request = urllib2.Request(url , headers = headers) 7 try: 8 response = urllib2.urlopen(request) 9 html = response.read() 10 return html 11 except urllib2.URLError,e: 12 print e.reason
2、分析页面中的详情图片部分和主图部分
淘宝的html页面相当的整齐,可读性不错。很快就可以找到他们的描述页位置:descUrl .. location.protocol === 'http:' ? '……'
可以写一个正则表达式,提取出来
import re


# Pull the description-page URLs out of the item page: Taobao embeds a
# JS snippet of the form  descUrl ... location.protocol==='http:' ? '//<url>' : ...
# and the protocol-relative URL in the first branch is what we capture.
def descUrl(html):
    pattern = re.compile(r"descUrl.*?location.protocol===\'http:\' \? \'//(.*?)\'.?:", re.I)
    return pattern.findall(html)
再获取这个详情页地址,就可以提取出所有的图片地址了。
# Collect every src="..." attribute value found in the HTML
# (case-insensitive, so SRC= is matched too).
def getImglist(html):
    matcher = re.compile(r'src=\"(.*?)\"', re.I)
    return matcher.findall(html)
3、下载图片
获取到了图片的url后,当然就是把图片下下来,这里做一个指定路径的保存方法。
因此再加一个创建路径
# Ensure that *path* exists as a directory, creating any missing
# intermediate directories.
#
# Bug fix: the original `else` branch called os.mkdir(path) when the
# existing path was a regular file -- that call can never succeed and
# only raised a confusing "File exists" OSError.  We now raise an
# explicit OSError (same exception family callers would have seen)
# with an understandable message.
#
# :param path: directory path to create if absent
# :raises OSError: if *path* already exists as a regular file
def createDir(path):
    if not os.path.exists(path):
        os.makedirs(path)
    elif os.path.isfile(path):
        raise OSError("cannot create directory %r: a file with that name exists" % path)
保存图片
1 #保存所有图片 2 def saveImgTo(imglist , path): 3 createDir(path) 4 imgIndex = 1 5 for imgurl in imglist: 6 splist = imgurl.split(\'.\') 7 filetype = splist[len(splist)-1] 8 print "saving " + imgurl 9 try: 10 urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + \'.\' + filetype ) 11 imgIndex += 1 12 print "==> ok!" 13 except: 14 print "==> err!!!!!!"
以下为一份完整代码,传入存储路径,保存下url.txt 中所有url的淘宝或其他网页图片。新手上路,写的不好的地方轻拍:
1 #coding=utf-8 2 3 import re 4 import urllib 5 import urllib2 6 import cookielib 7 import StringIO, gzip 8 import os 9 import sys 10 11 headers = { 12 \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36\' 13 } 14 15 16 #解压gzip 17 def gzdecode(data) : 18 compressedstream = StringIO.StringIO(data) 19 gziper = gzip.GzipFile(fileobj=compressedstream) 20 data2 = gziper.read() # 读取解压缩后数据 21 return data2 22 23 #获取html代码 24 def getHtml(url): 25 request = urllib2.Request(url , headers = headers) 26 try: 27 response = urllib2.urlopen(request) 28 html = response.read() 29 return html 30 except urllib2.URLError,e: 31 print e.reason 32 33 #目录是否存在,不存在则创建 34 def createDir(path): 35 if not os.path.exists(path): 36 os.makedirs(path) 37 else: 38 if os.path.isfile(path): 39 os.mkdir(path) 40 41 #提取描述url 42 def descUrl(html): 43 reg = r"descUrl.*?location.protocol===\'http:\' \? \'//(.*?)\'.?:" 44 desurlre = re.compile(reg,re.I) 45 desurl = re.findall(desurlre , html) 46 return desurl 47 48 #提取所有图片 49 def getImglist(html): 50 reg = r\'src=\"(.*?)\"\' 51 imgre = re.compile(reg,re.I) 52 imglist = re.findall(imgre , html) 53 return imglist 54 #提取主图 55 def getTitleImg(html, path): 56 createDir(path) 57 reg = r\'auctionImages.*?\[(.*?)\]\' 58 imgre = re.compile(reg,re.I) 59 titleImg = re.findall(imgre , html) 60 titleImg = titleImg[0] 61 imglist = titleImg.split(\',\') 62 titleIndex = 1 63 for imgurl in imglist: 64 print "img ==== > " + imgurl 65 imgurl = imgurl.strip(\'"\') 66 imgurl = \'http:\' + imgurl 67 print imgurl 68 splist = imgurl.split(\'.\') 69 filetype = splist[len(splist)-1] 70 try: 71 urllib.urlretrieve(imgurl , path + "/title"+ str(titleIndex) + \'.\' + filetype ) 72 titleIndex += 1 73 print "==> ok!" 74 except: 75 print "==> err!!!!!!" 
76 77 #保存所有图片 78 def saveImgTo(imglist , path): 79 createDir(path) 80 imgIndex = 1 81 for imgurl in imglist: 82 splist = imgurl.split(\'.\') 83 filetype = splist[len(splist)-1] 84 print "saving " + imgurl 85 try: 86 urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + \'.\' + filetype ) 87 imgIndex += 1 88 print "==> ok!" 89 except: 90 print "==> err!!!!!!" 91 92 #从一个淘宝页面,得到详情图片 93 def getTaoBaoImg(url ,savePath): 94 html = getHtml(url) 95 getTitleImg(html , savePath) 96 desurl = descUrl(html) 97 desurl = "http://" + desurl[0] 98 print "desurl = " + desurl 99 print "----------------------------------------------------------" 100 #得到淘贝详情html 101 desHtml = getHtml(desurl) 102 imglist = getImglist(desHtml) 103 saveImgTo(imglist , savePath) 104 #-------------------------------------我是华丽的分界线 begin Other----------------------------------------- 105 #提取其他详情图片列表 106 def getOtherImgurllist(html): 107 reg = r\'src="(.*?)"\' 108 desre = re.compile(reg,re.S) 109 imgurllist = re.findall(desre , html) 110 return imgurllist 111 112 113 #从其他提取详情图片 114 def getOtherImg(url , savePath): 115 html = getHtml(url) 116 imglist = getOtherImgurllist(html) 117 saveImgTo(imglist , savePath) 118 119 #提取其他主图 120 def getOthertitleImg(html, savePath): 121 print "todo:" 122 123 #-------------------------------------我是华丽的分界线 end Other----------------------------------------- 124 125 #保存原地址 126 def saveUrl(url , savePath): 127 output = open( savePath + "/url.htm" , "w") 128 output.write("""<html> 129 <head> 130 <meta http-equiv="Content-Language" content="zh-CN"> 131 <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312"> 132 <meta http-equiv="refresh" content="0.1;url=""" + url + """\"> 133 <title></title> 134 </head> 135 <body> 136 </body> 137 </html>""") 138 output.close() 139 140 141 savepath = "img" 142 143 input = open(\'url.txt\', \'r\') 144 145 urls = input.read( ) 146 urls = urls.split(\'\r\n\') 147 print urls 148 149 if len(sys.argv)>1 and sys.argv[1]: 150 savepath = 
sys.argv[1] 151 152 print savepath 153 154 urlIndex = 1 155 for url in urls: 156 if len(url) < 10: 157 continue 158 urlSavePath = savepath + \'/\' + str(urlIndex) 159 createDir(urlSavePath) 160 saveUrl(url , urlSavePath) 161 print \'*\'*50 162 print url 163 if url.find(\'taobao\') != -1: 164 getTaoBaoImg(url , urlSavePath) 165 else: 166 getOtherImg(url , urlSavePath) 167 urlIndex += 1 168 169 print "success!"