Douban Group Crawler ..... ^_^
A crawler I wrote to grab images from Douban groups... you know the kind...
It runs on Python 3.5.2.
Adapted from gdp12315's version found online. Just add the codes of the groups you want to crawl to url_list, as in the sketch below.
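For reference, here is a minimal sketch of what adding another group looks like; the 'NEWCODE' code and 'Example Group' name are placeholders, not a real group. The group code is the part of the discussion URL after /group/:

# Sketch only: each url_list entry is (group code, folder name).
# The group code comes from https://www.douban.com/group/<group code>/discussion?start=
# 'NEWCODE' / 'Example Group' are placeholders for illustration.
url_list = [
    ('tomorrow', '灵异豆瓣'),
    ('439803', '出差男女'),
    ('NEWCODE', 'Example Group'),   # replace with a real group code and a folder name of your choice
]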
It can be stopped at any time, and quite a few checks were added to avoid re-crawling and re-downloading; the downside is that it cannot detect updated topics (topics are rarely updated, so this is simply ignored). See the inspection sketch after the code listing for a way to look at what has already been recorded.
I suggest adding more user_agents; it is an effective way to avoid 403 responses, for example:
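As a rough sketch, extra entries can simply be appended to the user_agents list inside BrowserBase.openurl; the strings below are ordinary example UA strings, not ones used by the original script:

# Sketch only: more user-agent strings for BrowserBase.openurl to choose from.
# random.choice(user_agents) already picks one per request, so adding entries is enough.
user_agents += [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
]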
If you have any modifications or good suggestions, please contact me at lzl_17948876@hotmail.com
# -*- coding: utf-8 -*-
# -----------------------------------------------
#   Program : Douban group image crawler
#   Version : 1.2.2
#   Language: Python 3.5.2
#   Author  : 刘志林
#
#   Thanks: adapted from gdp12315's version 1.0
#           http://blog.csdn.net/gdp12315_gu/article/details/47323613
#
#   2016-11-07
#     Changed how processed links are recorded: one info record per group
#   2016-11-08
#     Fixed the timestamp being recorded at the wrong point: it is now recorded before processing starts
#   2016-11-28
#     Added a counter for the total number of passes
#     Added a check for incremental fetching: if the last page still contains topics that have
#     not been fetched yet, keep fetching further pages until a page contains only known topics
# -----------------------------------------------

import random
import socket, http.cookies, http.cookiejar
import urllib.request
import re
import os, sys
import datetime, time
import pickle


class UrlInfo(object):
    """Per-group crawl state: processed topic ids and the time of the last finished run."""
    __filename = ''
    dic_topic = {}
    lastdt = ''

    def __init__(self, a_filename):
        self.__filename = a_filename
        self.dic_topic = {}
        self.lastdt = ''

    def load(self):
        # Restore a previously saved state, if the info file exists
        if os.path.exists(self.__filename):
            f = open(self.__filename, 'rb')
            try:
                tmp = pickle.load(f)
            finally:
                f.close()
            self.__dict__.update(tmp)

    def save(self):
        # Persist the whole state with pickle
        f = open(self.__filename, 'wb')
        try:
            pickle.dump(self.__dict__, f)
        finally:
            f.close()


class BrowserBase(object):
    def __init__(self):
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        # A pool of user_agents to help avoid 403 responses
        user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        ]
        try:
            cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
            self.opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
            urllib.request.install_opener(self.opener)
            self.opener.addheaders = [
                ('Host', 'www.douban.com'),
                ('Connection', 'keep-alive'),
                ('Accept', '*/*'),
                ('User-Agent', random.choice(user_agents)),
                ('Referer', 'http://www.google.com'),
            ]
            res = self.opener.open(url)
            # print(res.read())
        except Exception as e:
            self.speak(str(e), url)
            raise
        else:
            return res
        finally:
            time.sleep(1)


if __name__ == '__main__':
    splider = BrowserBase()
    # Groups to process; the first value is the group code ->
    # https://www.douban.com/group/<group code>/discussion?start=
    url_list = [
        ('tomorrow', '灵异豆瓣'),
        ('439803', '出差男女'),
    ]
    # Base path for recording processed topics
    workpath = os.getcwd() + '\\'
    loopCount = 0
    while True:
        for url_rec in url_list:
            print('\n-------- (L-%d) %s %s collection started --------'
                  % (loopCount + 1, datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
            # Create the download directory
            filepath = '%sPictures\\%s\\' % (workpath, url_rec[1])
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            url = 'https://www.douban.com/group/%s/discussion?start=' % (url_rec[0])
            try:
                html_topic_list = splider.openurl(url).read().decode('utf-8')
            except:
                continue
            # Load the group's info record
            info = UrlInfo('%sPictures\\%s.info' % (workpath, url_rec[1]))
            info.load()
            # Time of the last finished run; if empty, process everything
            if info.lastdt == '':
                print('First run for this group')
                dt_last = None
            else:
                print('Last finished at: %s' % (info.lastdt))
                dt_last = datetime.datetime.strptime(info.lastdt, '%Y-%m-%d %X')
            page_max = int(re.compile(r'\d+').findall(re.compile(r'data-total-page="\d+"').findall(html_topic_list)[0])[0])
            if dt_last is None:
                page_end = page_max
                num_end = (page_end - 1) * 25
            else:
                t2 = (datetime.datetime.now() - dt_last)
                num_end = t2.days * 24 * 6 + t2.seconds // 300  # assume roughly one new topic every 5 minutes
                page_end = num_end // 25 + 1
            # Record the start time of this run
            _lastdt = datetime.datetime.now().strftime('%Y-%m-%d %X')
            num_begin = 0
            page_begin = 1
            while num_begin <= num_end:
                try:
                    nFullTopicExists = True
                    html_topic_list = splider.openurl(url + str(num_begin)).read().decode('utf-8')
                    # Collect the topic links on this page
                    topic_list = re.compile(r'https://www.douban.com/group/topic/\d+/').findall(html_topic_list)
                    topic_count = len(topic_list)
                    print('%s page: %d/%d - %d' % (url_rec[1], page_begin, page_end, topic_count))
                    for topic_url_index in range(topic_count):
                        topic_url = topic_list[topic_url_index]
                        # print('topic_url ' + topic_url)
                        # Skip topics that have already been processed
                        topic_code = re.findall(r'\d+', topic_url)[0]
                        if topic_code in info.dic_topic:
                            print('#%d ' % (topic_url_index + 1), end='')
                            continue
                        else:
                            nFullTopicExists = False
                            print('%d ' % (topic_url_index + 1), end='')
                        try:
                            html_topic = splider.openurl(topic_url).read().decode('utf-8')
                        except:
                            continue
                        # Mark the topic as processed
                        info.dic_topic[topic_code] = ''
                        info.save()
                        # Collect the image download links
                        img_list = re.compile(r'https://img\d.doubanio.com/view/group_topic/large/public/p\d+.jpg').findall(html_topic)
                        # Download every image that is not already on disk
                        for img_url in img_list:
                            # print('img_url: ' + img_url)
                            filename = '%s\\%s-%s.jpg' % (filepath, topic_code, re.findall(r'p\d+', img_url)[0])
                            if not os.path.exists(filename):
                                try:
                                    # print(filename)
                                    download_img = urllib.request.urlretrieve(img_url, filename)
                                except Exception as e:
                                    print(e)
                                    continue
                                finally:
                                    time.sleep(2)
                        # waittime = random.randint(10, 15)
                        # print('wait %d' % waittime)
                        # time.sleep(waittime)
                    num_begin = num_begin + 25
                    # Incremental mode: if the last page still had new topics, fetch one more page
                    if (dt_last is not None) and (num_begin > num_end) and (not nFullTopicExists):
                        num_end = num_end + 25
                except Exception as e:
                    print(e)
                    continue
                finally:
                    page_begin = page_begin + 1
                    print()
            info.lastdt = _lastdt
            info.save()
            print('-------- %s %s collection finished --------\n'
                  % (datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
        loopCount = loopCount + 1
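To check how far a crawl has gone without re-running anything, note that each group's .info file is simply UrlInfo.__dict__ serialized with pickle, so it can be loaded directly. A minimal inspection sketch, assuming the script was run from the current directory and the group folder name is '灵异豆瓣' (adjust for other groups):

# Sketch only: load a group's .info record (a pickled UrlInfo.__dict__)
# and report the last finished run and how many topics are already recorded.
import os
import pickle

info_file = os.path.join(os.getcwd(), 'Pictures', '灵异豆瓣.info')   # path layout used by the crawler above
with open(info_file, 'rb') as f:
    data = pickle.load(f)

print('last finished run:', data['lastdt'])              # empty string if no run has finished yet
print('topics already recorded:', len(data['dic_topic']))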
Copyright notice: this is an original article by lzl_17948876, licensed under CC 4.0 BY-SA. Please include a link to the original source and this notice when reposting.