from sys import argv
from os import makedirs, unlink, sep, mkdir
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):
    def __init__(self, url):
        self.url = url
        # local file the page will be saved to (hardcoded base directory)
        self.file = 'E:\\install\\Python27\\' + self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse the URL
        path = parsedurl[1] + parsedurl[2]      # host + path
        ext = splitext(path)                    # separate out the extension
        if ext[1] == '':                        # no file extension: use the default file name
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile

        ldir = dirname(path)                    # local directory for the page
        if sep != '/':                          # convert '/' to the OS path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                     # create the directory tree if missing
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path
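    # filename() maps a URL onto a relative local path: for a URL such as
    # 'http://www.example.com/docs/' (a hypothetical URL, only to illustrate the
    # mapping) there is no file extension, so deffile is appended and the result
    # is 'www.example.com/docs/index.htm'.  Note that makedirs() creates that
    # directory tree relative to the current working directory, while the file
    # itself is written to self.file under the hardcoded base directory, so the
    # script expects to be run from E:\install\Python27.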

    def download(self):
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
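    # How the link extraction above works: htmllib.HTMLParser records the href
    # of every <a> tag it parses in its .anchorlist attribute; the
    # AbstractFormatter/DumbWriter/StringIO chain is only there because
    # HTMLParser requires a formatter, and its output is simply discarded.
    # A quick interactive check (Python 2; the HTML string is made up):
    #
    #   >>> p = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
    #   >>> p.feed('<a href="http://www.example.com/a.htm">A</a>')
    #   >>> p.close()
    #   >>> p.anchorlist
    #   ['http://www.example.com/a.htm']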

class Crawler(object):
    count = 0   # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                 # queue of URLs still to download
        self.seen = []                 # URLs already processed
        self.dom = urlparse(url)[1]    # domain of the starting URL

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':           # download failed: skip parsing
            print retval, '...skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()
        for eachLink in links:
            # resolve relative links against the current page's URL
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)

            if find(lower(eachLink), 'mailto:') != -1:
                print '...discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '...discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '...new, added to Q'
                    else:
                        print '...discarded, already in Q'
            else:
                print '...discarded, already processed'

    def go(self):   # process links in the queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
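
How to run it: the script is Python 2 only (it relies on htmllib, formatter, the old urllib/urlparse modules and raw_input, none of which exist under those names in Python 3), and it saves pages under the hardcoded E:\install\Python27\ directory, so that path has to exist, or be edited, before starting. Assuming the listing is saved as crawl.py (the file name is only an assumption), a crawl can be started with the URL on the command line:

    python crawl.py http://www.example.com/

With no argument, the script prompts "Enter starting URL:" instead, and then follows every link it finds that stays inside the starting URL's domain.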

 

Copyright notice: this is an original article by elliottc, released under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/elliottc/p/4947983.html