python抓取网页图片
网页中的图片大多是通过 img 标签引入的,src 通常使用相对路径,例如
<img src="image/bg.jpg"/>
通过匹配可以获取image/bg.jpg,与页面地址组合可以得到图片的地址
除了页面中直接引入的图片,还有通过CSS文件以及页面链接到的其他HTML页面引入的图片,也需要处理
# -*- coding: utf-8 -*-
"""Download the images referenced by a web page and by its stylesheets.

The page (or stylesheet) is fetched line by line, image references are pulled
out with regular expressions, resolved against the URL of the document they
appear in, and saved into a local directory.

The code is written in the common subset of Python 2.7 and Python 3; only the
import block below differs between the two.
"""
import re
import sys

try:  # Python 3
    import http.client as httplib
    from urllib.parse import urljoin, urlsplit
    from urllib.request import urlopen
except ImportError:  # Python 2
    import httplib
    from urllib2 import urlopen
    from urlparse import urljoin, urlsplit

# HTTP statuses that carry a "Location" header pointing at the real resource.
_REDIRECT_STATUSES = (301, 302, 303, 307, 308)

# One CSS ``url(...)`` reference.  The character class stops at the first
# ")" so that several references on one (minified) line stay separate.
_CSS_URL_REGX = r'url\(\s*([^)\s]+)\s*\)'


def _toText(line):
    """Return *line* as a native ``str`` so ``str`` regexes can be applied.

    Python 3 responses yield ``bytes`` lines; Python 2 lines already are
    ``str`` and are returned untouched.
    """
    if isinstance(line, str):
        return line
    return line.decode('utf-8', 'replace')


def httpExists(url, maxRedirects=5):
    """Return True if a HEAD request for *url* ends in an HTTP 200 response.

    Redirects are followed at most *maxRedirects* times, so a redirect loop
    can no longer recurse without bound.  Any connection error is printed and
    reported as "not found" (False).
    """
    scheme, host, path, query = urlsplit(url)[:4]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # no port specified, let the connection class use its default
        port = None

    # An empty path is not a valid request target, and dropping the query
    # string would probe a different resource than the one asked for.
    target = path or '/'
    if query:
        target += '?' + query

    if scheme.lower() == 'https':
        connectionClass = httplib.HTTPSConnection
    else:
        connectionClass = httplib.HTTPConnection

    connection = None
    try:
        connection = connectionClass(host, port=port)
        connection.request("HEAD", target)
        resp = connection.getresponse()
        status, reason = resp.status, resp.reason
        location = resp.getheader('location', '')
    except Exception as e:
        # Best effort: an unreachable host simply means "does not exist".
        print('%s %s %s' % (e.__class__, e, url))
        return False
    finally:
        if connection is not None:
            connection.close()

    if status == 200:
        # normal 'found' status
        return True
    if status in _REDIRECT_STATUSES and location and maxRedirects > 0:
        # recurse on the redirect target, with one fewer hop allowed
        return httpExists(urljoin(url, location), maxRedirects - 1)
    # everything else -> not found
    print("Status %d %s : %s" % (status, reason, url))
    return False


def gGetFileName(url):
    """Return the file name (last path segment) of *url*.

    The query string and fragment are not part of the name, so
    ``.../bg.jpg?v=2`` yields ``bg.jpg``.  ``None`` and ``""`` are returned
    unchanged.
    """
    if url is None:
        return None
    if url == "":
        return ""
    return urlsplit(url)[2].split("/")[-1]


def gDownloadWithFilename(url, savePath, file):
    """Download *url* and write it to ``savePath + file``.

    *savePath* is used as a plain prefix, so it must end with a path
    separator.  Failures are reported on stdout and otherwise ignored so that
    one broken link does not abort a whole batch.
    """
    try:
        response = urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        print('download file url : %s' % url)
        with open(savePath + file, 'wb') as out:
            out.write(data)
    except (IOError, ValueError, httplib.HTTPException):
        print("download error!" + url)


def gDownload(url, savePath):
    """Download *url* into *savePath*, named after the URL's last segment."""
    gDownloadWithFilename(url, savePath, gGetFileName(url))


def getRexgList(lines, regx, searchRegx):
    """Collect the distinct regex groups found in the matching *lines*.

    A line is considered only if *regx* matches it; every match of
    *searchRegx* on that line (not just the first one) then contributes its
    groups.  Both patterns are case-insensitive.  Values are returned in
    first-seen order without duplicates.  Returns None when *lines* is None.
    """
    if lines is None:
        return None
    linePattern = re.compile(regx, re.IGNORECASE)
    valuePattern = re.compile(searchRegx, re.IGNORECASE)
    found = []
    seen = set()
    for line in lines:
        if not linePattern.search(line):
            continue
        for match in valuePattern.finditer(line):
            for value in match.groups():
                # Groups that did not take part in the match are None.
                if value is not None and value not in seen:
                    seen.add(value)
                    found.append(value)
    return found


def checkLine(lines):
    """Debug helper: print the groups of the first ``url(...)`` on each line."""
    if not lines:
        return
    for line in lines:
        matchs = re.search(r'url\((\S+)\)', line, re.IGNORECASE)
        if matchs is not None:
            print(matchs.groups())


def getPageLines(url):
    """Fetch *url* and return its content as a list of text lines.

    Returns None when *url* is None, when the HEAD probe in ``httpExists``
    fails, or when the download itself fails.
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urlopen(url)
        try:
            rawLines = page.readlines()
        finally:
            page.close()
    except Exception:
        print("getPageLines() error!")
        return None
    return [_toText(line) for line in rawLines]


def getCurrentPageImage(url, savePath):
    """Download every ``src="images..."`` image referenced by the page *url*."""
    lines = getPageLines(url)
    if lines is None:
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if regxlists is None:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))
    for ref in regxlists:
        # urljoin copes with page URLs that do not end in "/".
        gDownload(urljoin(url, ref), savePath)


def getCSSImages(link, savePath, url):
    """Download every ``url(...)`` image referenced by the stylesheet *link*.

    *url* is the page the stylesheet was found on; it is only used to make
    *link* absolute.  Image references are resolved against the stylesheet
    itself, which is how relative URLs inside CSS are defined.
    """
    lines = getPageLines(link)
    if lines is None:
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, _CSS_URL_REGX, _CSS_URL_REGX)
    if regxlists is None:
        return
    print('getCSSImages() images.length %d' % len(regxlists))
    base = urljoin(url, link)
    for ref in regxlists:
        # CSS allows url('x'), url("x") and url(x).
        ref = ref.strip('\'"')
        if not ref or ref.lower().startswith('data:'):
            # Nothing to fetch: empty reference or inline data.
            continue
        gDownload(urljoin(base, ref), savePath)


def gGetHtmlLink(url):
    """Return the absolute .htm/.html links found on the page *url* as a list."""
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for ref in getRexgList(lines, regx, r'href="(\S+)"') or []:
        link = urljoin(url, ref)
        if link not in rtnList:
            rtnList.append(link)
            print(link)
    return rtnList


def gGetCSSLink(url):
    """Return the absolute .css links found on the page *url* as a list."""
    rtnList = []
    lines = getPageLines(url)
    regx = r"""href="?(\S+)\.css"""
    for ref in getRexgList(lines, regx, r'href="(\S+)"') or []:
        link = urljoin(url, ref)
        if link not in rtnList:
            rtnList.append(link)
    return rtnList


def getPageImage(url, savePath, pageImages=False, linkedPages=False):
    """Download the images used by the page *url* into *savePath*.

    By default only images referenced from the page's stylesheets are
    fetched.  Set *pageImages* to also fetch the images embedded in the page
    itself, and *linkedPages* to do the same for every .htm/.html page it
    links to.
    """
    if pageImages:
        getCurrentPageImage(url, savePath)
    if linkedPages:
        for link in gGetHtmlLink(url):
            print('get images on link: %s' % link)
            getCurrentPageImage(link, savePath)
    for link in gGetCSSLink(url):
        print('get images on link: %s' % link)
        getCSSImages(link, savePath, url)


if __name__ == '__main__':
    # Usage: script.py [page-url [save-directory-with-trailing-separator]]
    url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    savePath = 'd:/tmp/'
    if len(sys.argv) > 1:
        url = sys.argv[1]
    if len(sys.argv) > 2:
        savePath = sys.argv[2]
    print('download pic from [' + url + ']')
    print('save to [' + savePath + '] ...')
    getPageImage(url, savePath)
    print("download finished")
具体使用的时候根据URL的情况,具体分析得到图片地址的方式。
版权声明:本文为Leo_wl原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。