A general-purpose Python web-crawler script
from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                            # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)       # parse the URL
        path = parsedurl[1] + parsedurl[2]          # host + path
        ext = splitext(path)                        # separate the extension
        if ext[1] == '':                            # no file name: use the default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        if sep != '/':                              # use the OS-specific separator
            path = replace(path, '/', sep)
        # save under the download directory; adjust the base path as needed
        path = r'E:\install\Python27' + sep + path
        ldir = dirname(path)                        # local directory
        if not isdir(ldir):                         # create the archive directory if needed
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                             # download the page itself
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url)
        return retval

    def parseAndGetLinks(self):                     # parse the HTML, save the links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):
    count = 0                                       # static downloaded-page counter

    def __init__(self, url):
        self.q = [url]                              # queue of URLs still to fetch
        self.seen = []                              # URLs already processed
        self.dom = urlparse(url)[1]                 # restrict the crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':                        # error: skip parsing
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()                # get and process the links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                                   # process the links in the queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
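The script can be started from the command line with a seed URL, or it will prompt for one interactively. Below is a minimal usage sketch, assuming the listing above has been saved as crawl.py (that file name is my assumption, not stated in the original):

# from a shell (Python 2.x):
#   python crawl.py http://www.example.com/
# or drive the classes directly from another script or the interactive prompt:
from crawl import Crawler          # 'crawl' is a hypothetical module name

bot = Crawler('http://www.example.com/')
bot.go()                           # crawl until the queue is empty
print 'pages downloaded:', Crawler.count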
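Note that htmllib, formatter, urlparse, cStringIO and urllib.urlretrieve as used above exist only in Python 2. For readers on Python 3, the following is a minimal sketch of the equivalent download-and-extract-links step using the standard library's html.parser and urllib; the names LinkParser and fetch_and_parse are my own illustrations, not part of the original script:

# Python 3 sketch of the Retriever core: download a page and collect its <a href> links.
from html.parser import HTMLParser
from urllib.request import urlretrieve
from urllib.parse import urljoin

class LinkParser(HTMLParser):
    """Collect the href of every anchor tag (stands in for htmllib's anchorlist)."""
    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

def fetch_and_parse(url, filename):
    """Download url to filename and return the absolute URLs of all links found."""
    urlretrieve(url, filename)
    parser = LinkParser()
    with open(filename, encoding='utf-8', errors='replace') as f:
        parser.feed(f.read())
    parser.close()
    return [urljoin(url, link) for link in parser.anchorlist]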
Copyright notice: this is an original article by elliottc, licensed under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when reposting.