Douban Group Crawler ..... ^_^
A crawler I wrote to grab images from Douban groups... you know the kind...
It runs on Python 3.5.2.
Adapted from gdp12315's version found online. Just add the codes of the groups you want to crawl to url_list, as in the sketch below.
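For reference, here is a minimal sketch of what adding another group looks like; the 'NEWCODE' code and 'Example Group' name are placeholders, not a real group. The group code is the part of the discussion URL after /group/:

# Sketch only: each url_list entry is (group code, folder name).
# The group code comes from https://www.douban.com/group/<group code>/discussion?start=
# 'NEWCODE' / 'Example Group' are placeholders for illustration.
url_list = [
    ('tomorrow', '灵异豆瓣'),
    ('439803', '出差男女'),
    ('NEWCODE', 'Example Group'),   # replace with a real group code and a folder name of your choice
]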
It can be stopped at any time, and quite a few checks were added to avoid re-crawling and re-downloading; the downside is that it cannot detect updated topics (topics are rarely updated, so this is simply ignored). See the inspection sketch after the code listing for a way to look at what has already been recorded.
I suggest adding more user_agents; it is an effective way to avoid 403 responses, for example:
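As a rough sketch, extra entries can simply be appended to the user_agents list inside BrowserBase.openurl; the strings below are ordinary example UA strings, not ones used by the original script:

# Sketch only: more user-agent strings for BrowserBase.openurl to choose from.
# random.choice(user_agents) already picks one per request, so adding entries is enough.
user_agents += [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
]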
If you have any modifications or good suggestions, please contact me at lzl_17948876@hotmail.com
# -*- coding: utf-8 -*-
# -----------------------------------------------
#   Program : Douban group image crawler
#   Version : 1.2.2
#   Language: Python 3.5.2
#   Author  : 刘志林
#
#   Thanks: adapted from gdp12315's version 1.0
#           http://blog.csdn.net/gdp12315_gu/article/details/47323613
#
#   2016-11-07
#     Changed how processed links are recorded: one info record per group
#   2016-11-08
#     Fixed the timestamp being recorded at the wrong point: it is now recorded before processing starts
#   2016-11-28
#     Added a counter for the total number of passes
#     Added a check for incremental fetching: if the last page still contains topics that have
#     not been fetched yet, keep fetching further pages until a page contains only known topics
# -----------------------------------------------

import random
import socket, http.cookies, http.cookiejar
import urllib.request
import re
import os, sys
import datetime, time
import pickle


class UrlInfo(object):
    """Per-group crawl state: processed topic ids and the time of the last finished run."""
    __filename = ''
    dic_topic = {}
    lastdt = ''

    def __init__(self, a_filename):
        self.__filename = a_filename
        self.dic_topic = {}
        self.lastdt = ''

    def load(self):
        # Restore a previously saved state, if the info file exists
        if os.path.exists(self.__filename):
            f = open(self.__filename, 'rb')
            try:
                tmp = pickle.load(f)
            finally:
                f.close()
            self.__dict__.update(tmp)

    def save(self):
        # Persist the whole state with pickle
        f = open(self.__filename, 'wb')
        try:
            pickle.dump(self.__dict__, f)
        finally:
            f.close()


class BrowserBase(object):
    def __init__(self):
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        # A pool of user_agents to help avoid 403 responses
        user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        ]
        try:
            cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
            self.opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
            urllib.request.install_opener(self.opener)
            self.opener.addheaders = [
                ('Host', 'www.douban.com'),
                ('Connection', 'keep-alive'),
                ('Accept', '*/*'),
                ('User-Agent', random.choice(user_agents)),
                ('Referer', 'http://www.google.com'),
            ]
            res = self.opener.open(url)
            # print(res.read())
        except Exception as e:
            self.speak(str(e), url)
            raise
        else:
            return res
        finally:
            time.sleep(1)


if __name__ == '__main__':
    splider = BrowserBase()
    # Groups to process; the first value is the group code ->
    # https://www.douban.com/group/<group code>/discussion?start=
    url_list = [
        ('tomorrow', '灵异豆瓣'),
        ('439803', '出差男女'),
    ]
    # Base path for recording processed topics
    workpath = os.getcwd() + '\\'
    loopCount = 0
    while True:
        for url_rec in url_list:
            print('\n-------- (L-%d) %s %s collection started --------'
                  % (loopCount + 1, datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
            # Create the download directory
            filepath = '%sPictures\\%s\\' % (workpath, url_rec[1])
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            url = 'https://www.douban.com/group/%s/discussion?start=' % (url_rec[0])
            try:
                html_topic_list = splider.openurl(url).read().decode('utf-8')
            except:
                continue
            # Load the group's info record
            info = UrlInfo('%sPictures\\%s.info' % (workpath, url_rec[1]))
            info.load()
            # Time of the last finished run; if empty, process everything
            if info.lastdt == '':
                print('First run for this group')
                dt_last = None
            else:
                print('Last finished at: %s' % (info.lastdt))
                dt_last = datetime.datetime.strptime(info.lastdt, '%Y-%m-%d %X')
            page_max = int(re.compile(r'\d+').findall(re.compile(r'data-total-page="\d+"').findall(html_topic_list)[0])[0])
            if dt_last is None:
                page_end = page_max
                num_end = (page_end - 1) * 25
            else:
                t2 = (datetime.datetime.now() - dt_last)
                num_end = t2.days * 24 * 6 + t2.seconds // 300  # assume roughly one new topic every 5 minutes
                page_end = num_end // 25 + 1
            # Record the start time of this run
            _lastdt = datetime.datetime.now().strftime('%Y-%m-%d %X')
            num_begin = 0
            page_begin = 1
            while num_begin <= num_end:
                try:
                    nFullTopicExists = True
                    html_topic_list = splider.openurl(url + str(num_begin)).read().decode('utf-8')
                    # Collect the topic links on this page
                    topic_list = re.compile(r'https://www.douban.com/group/topic/\d+/').findall(html_topic_list)
                    topic_count = len(topic_list)
                    print('%s page: %d/%d - %d' % (url_rec[1], page_begin, page_end, topic_count))
                    for topic_url_index in range(topic_count):
                        topic_url = topic_list[topic_url_index]
                        # print('topic_url ' + topic_url)
                        # Skip topics that have already been processed
                        topic_code = re.findall(r'\d+', topic_url)[0]
                        if topic_code in info.dic_topic:
                            print('#%d ' % (topic_url_index + 1), end='')
                            continue
                        else:
                            nFullTopicExists = False
                            print('%d ' % (topic_url_index + 1), end='')
                        try:
                            html_topic = splider.openurl(topic_url).read().decode('utf-8')
                        except:
                            continue
                        # Mark the topic as processed
                        info.dic_topic[topic_code] = ''
                        info.save()
                        # Collect the image download links
                        img_list = re.compile(r'https://img\d.doubanio.com/view/group_topic/large/public/p\d+.jpg').findall(html_topic)
                        # Download every image that is not already on disk
                        for img_url in img_list:
                            # print('img_url: ' + img_url)
                            filename = '%s\\%s-%s.jpg' % (filepath, topic_code, re.findall(r'p\d+', img_url)[0])
                            if not os.path.exists(filename):
                                try:
                                    # print(filename)
                                    download_img = urllib.request.urlretrieve(img_url, filename)
                                except Exception as e:
                                    print(e)
                                    continue
                                finally:
                                    time.sleep(2)
                        # waittime = random.randint(10, 15)
                        # print('wait %d' % waittime)
                        # time.sleep(waittime)
                    num_begin = num_begin + 25
                    # Incremental mode: if the last page still had new topics, fetch one more page
                    if (dt_last is not None) and (num_begin > num_end) and (not nFullTopicExists):
                        num_end = num_end + 25
                except Exception as e:
                    print(e)
                    continue
                finally:
                    page_begin = page_begin + 1
                    print()
            info.lastdt = _lastdt
            info.save()
            print('-------- %s %s collection finished --------\n'
                  % (datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
        loopCount = loopCount + 1
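To check how far a crawl has gone without re-running anything, note that each group's .info file is simply UrlInfo.__dict__ serialized with pickle, so it can be loaded directly. A minimal inspection sketch, assuming the script was run from the current directory and the group folder name is '灵异豆瓣' (adjust for other groups):

# Sketch only: load a group's .info record (a pickled UrlInfo.__dict__)
# and report the last finished run and how many topics are already recorded.
import os
import pickle

info_file = os.path.join(os.getcwd(), 'Pictures', '灵异豆瓣.info')   # path layout used by the crawler above
with open(info_file, 'rb') as f:
    data = pickle.load(f)

print('last finished run:', data['lastdt'])              # empty string if no run has finished yet
print('topics already recorded:', len(data['dic_topic']))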
Copyright notice: this is an original article by lzl_17948876, licensed under CC 4.0 BY-SA. Please include a link to the original source and this notice when reposting.