网页中的图片大多是通过 <img> 标签引入的,使用的是相对路径,例如

<img src="image/bg.jpg"/>

通过匹配可以获取image/bg.jpg,与页面地址组合可以得到图片的地址

除了直接引入的图片,还有通过CSS,HTML引入的图片,也需要处理

复制代码
# -*- coding: utf-8 -*-
import urllib, httplib, urlparse
import sys
import re

def httpExists(url):
    """Check whether *url* is reachable via an HTTP HEAD request.

    Returns True on a 200 response, follows 302 temporary redirects
    recursively, and returns False for every other status, for an
    invalid port number, or for any connection error.
    """
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # explicit port in the netloc, e.g. "example.com:8080"
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print('invalid port number %r' % (port,))
            return False
    else:
        # no port specified, use the scheme's default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:       # normal 'found' status
            found = True
        elif resp.status == 302:     # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else:                        # everything else -> not found
            print("Status %d %s : %s" % (resp.status, resp.reason, url))
            found = False
    except Exception as e:
        # broad catch kept deliberately: this is a best-effort probe and
        # any failure simply means "not reachable"
        print("%s %s %s" % (e.__class__, e, url))
        found = False
    return found

"""根据url获取文件名"""
def gGetFileName(url):
    if url==None: return None
    if url=="" : return ""
    arr=url.split("/")
    return arr[len(arr)-1]

"""根据url下载文件,文件名参数指定"""
def gDownloadWithFilename(url,savePath,file):
    #参数检查,现忽略
    try:
        urlopen=urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        print \'download file url :\',url
        file=open(savePath + file,\'w+b\')
        file.write(data)
        file.close()
    except IOError:
        print "download error!"+ url

def gDownload(url, savePath):
    """Download *url* into *savePath*, deriving the file name from the URL."""
    gDownloadWithFilename(url, savePath, gGetFileName(url))

def getRexgList(lines, regx, searchRegx):
    """Extract regex groups from lines that pass a filter pattern.

    Each line matching *regx* (case-insensitive) is searched with
    *searchRegx*, and every captured group is collected.

    Returns a list of unique group strings in first-seen order, an
    empty list when nothing matches, or None when *lines* is None.
    """
    if lines is None:
        return None
    results = []
    # compile once instead of per line (the original recompiled both
    # patterns on every iteration)
    line_filter = re.compile(regx, re.IGNORECASE)
    extractor = re.compile(searchRegx, re.IGNORECASE)
    for line in lines:
        if line_filter.search(line):
            match = extractor.search(line)
            if match is not None:
                # renamed loop variable: the original shadowed builtin 'str'
                for group in match.groups():
                    if group not in results:
                        results.append(group)
    return results
def checkLine(lines):
    """Debug helper: print the url(...) groups found on each line of *lines*."""
    for line in lines:
        # BUG FIX: the original called re.search(pattern, re.IGNORECASE)
        # without passing the line, which raised TypeError on every call.
        matchs = re.search(r'url\((\S+)\)', line, re.IGNORECASE)
        if matchs is not None:
            print(matchs.groups())
def getPageLines(url):
    """Return the lines of the page at *url*, or None on any failure.

    The URL is probed with httpExists() first so dead links are skipped
    without attempting a full download.
    """
    if url is None:
        return None
    if not httpExists(url):
        return None
    try:
        page = urllib.urlopen(url)
        try:
            return page.readlines()
        finally:
            # close the connection even if readlines() fails
            page.close()
    except Exception:
        # narrowed from a bare except:, which also swallowed
        # KeyboardInterrupt/SystemExit
        print("getPageLines() error!")
        return None
def getCurrentPageImage(url, savePath):
    """Download the images referenced as src="images..." on the page at *url*.

    Relative image paths are resolved by prefixing the page URL.
    """
    lines = getPageLines(url)
    if lines is None:
        # getPageLines() returns None on failure; the original crashed
        # here on len(None)
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'src\s*="images(\S+)"', r'src\s*="(\S+)"')
    if not regxlists:
        return
    print('getCurrentPageImage() images.length %d' % len(regxlists))
    for jpg in regxlists:
        gDownload(url + jpg, savePath)

def getCSSImages(link, savePath, url):
    """Download the images referenced via url(...) inside the CSS file at *link*.

    Relative references are resolved against *url* (the page address).
    """
    lines = getPageLines(link)
    if lines is None:
        # getPageLines() returns None on failure; the original crashed
        # here on len(None)
        return
    print('lines.length %d' % len(lines))
    regxlists = getRexgList(lines, r'url\((\S+)\)', r'url\((\S+)\)')
    if not regxlists:
        return
    # label fixed: the original printed 'getCurrentPageImage()' here
    print('getCSSImages() images.length %d' % len(regxlists))
    for jpg in regxlists:
        gDownload(url + jpg, savePath)

"""根据url获取其上的相关htm、html链接,返回list"""
def gGetHtmlLink(url):
    #参数检查,现忽略
    rtnList=[]
    lines=getPageLines(url)
    regx = r"""href="?(\S+)\.htm"""
    for link in getRexgList(lines,regx,r\'href="(\S+)"\'):
        link =url + link
        if link not in rtnList:
            rtnList.append(link)
            print link
    return rtnList
"""根据url获取其上的相关css链接,返回list"""
def gGetCSSLink(url):
    #参数检查,现忽略
    rtnList=[]
    lines=getPageLines(url)
    regx = r"""href="?(\S+)\.css"""
    for link in getRexgList(lines,regx,r\'href="(\S+)"\'):
        link = url + link
        if link not in rtnList:
            rtnList.append(link)
    return rtnList   
def getPageImage(url, savePath):
    """Download the images used by the page at *url* into *savePath*.

    Currently only images referenced from linked CSS files are fetched;
    the direct-image and linked-html paths below are disabled.
    """
    # Disabled: images referenced directly on the page.
    # getCurrentPageImage(url, savePath)

    # Disabled: images referenced from linked html pages.
    # for link in gGetHtmlLink(url):
    #     getCurrentPageImage(link, savePath)

    links = gGetCSSLink(url)
    for link in links:
        print('get images on link: %s' % link)
        getCSSImages(link, savePath, url)
if __name__ == '__main__':
    # Example run: grab all CSS-referenced images from a template page.
    url = 'http://www.templatemo.com/templates/templatemo_281_chrome/'
    savePath = 'd:/tmp/'
    print('download pic from [' + url + ']')
    print('save to [' + savePath + '] ...')
    getPageImage(url, savePath)
    print("download finished")
复制代码

具体使用的时候根据URL的情况,具体分析得到图片地址的方式。

posted on
2013-08-25 16:16 
HackerVirus 
阅读(477)
评论(0)
编辑 
收藏 
举报

版权声明:本文为Leo_wl原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/Leo_wl/p/3280909.html