Weibo crawler

import json
import queue
import re
import threading

import requests
import xlrd
from lxml import etree
from pymongo import MongoClient

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": "http-dyn.abuyun.com",
    "port": "9020",
    "user": "H6VZC52B4BF2986D",
    "pass": "7850C72DC876E723",
}


class WB:
    def __init__(self):
        self.start_temp_url = "https://m.weibo.cn/api/container/getIndex?type=uid&value={}"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        self.proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        self.save_content_q = queue.Queue()
        self.url_q = queue.Queue()
        self.client = MongoClient(host='localhost', port=27017)
        # MongoDB database used to store the crawled data
        self.db = self.client.WbImageSet

    def save_data_mongodb(self, collect_name, data):
        # Select the collection used for dedup (one collection per blogger)
        self.collect_name = self.db[collect_name]
        history_record = self.collect_name.find_one({"_id": data['id']})
        if history_record:
            # The record already exists in the database
            pass
        else:
            # Not in the database yet, insert it
            self.collect_name.update_one({'_id': data['id']}, {'$set': data}, upsert=True)

    def get_author_info(self, url):
        """
        Fetch the blogger's profile info.
        :return: the blogger's total number of posts (statuses_count)
        """
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies, verify=True)
        self.data = response.content.decode('utf8')
        content = json.loads(self.data).get('data')
        max_content = content.get('userInfo').get('statuses_count')
        return max_content

    def get_containerid(self):
        """Get the containerid of the blogger's Weibo tab; it is required when crawling the posts."""
        containerid = None
        content = json.loads(self.data).get('data')
        for data in content.get('tabsInfo').get('tabs'):
            if data.get('tab_type') == 'weibo':
                containerid = data.get('containerid')
        return containerid

    def get_url(self, containerid, max_content):
        # Roughly 10 posts per page, so max_content // 10 pages in total
        for x in range(int(max_content) // 10):
            wb_content_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + self.wb_id + '&containerid=' + containerid + '&page=' + str(x)
            self.url_q.put(wb_content_url)

    def get_wb_content(self):
        """Fetch each Weibo post and save it; a record holds the post id, cleaned text, and image URLs."""
        num = 0
        while True:
            try:
                if self.url_q.empty():
                    break
                weibo_url = self.url_q.get()
                response = requests.get(url=weibo_url, headers=self.headers, proxies=self.proxies, verify=True)
                content = json.loads(response.content.decode('utf8')).get('data')
                cards = content.get('cards')
                if len(cards) > 0:
                    for j in range(len(cards)):
                        num += 1
                        card_type = cards[j].get('card_type')
                        item = {}
                        if card_type == 9:
                            mblog = cards[j].get('mblog')
                            scheme = cards[j].get('scheme')  # detail-page URL of the post
                            print(scheme)
                            print("-- crawling {} -- post #{} --".format(self.name, num))
                            text_id = mblog.get("id")  # post id, used for dedup
                            text = mblog.get('text')  # post text (HTML)
                            # Build a clean title: strip tags, whitespace, #topics# and @mentions
                            html = etree.HTML(text)
                            x = html.xpath('//text()')
                            title = ','.join(x)
                            title = title.replace('\r', '').replace('\n', '').replace('\t', '')
                            title = re.sub('(#.*#)', '', title)
                            title = re.sub('@', '', title)
                            title = re.sub(' ', '', title)
                            pictures = mblog.get('pics')  # images attached to the post (a list)
                            pic_urls = []  # collected image URLs
                            if pictures:
                                for picture in pictures:
                                    pic_url = picture.get('large').get('url')
                                    pic_urls.append(pic_url)
                            if not pic_urls:
                                continue
                            item['id'] = text_id
                            item['category'] = self.category
                            item['author'] = self.name
                            item['title'] = title
                            item['url'] = pic_urls
                            item['select'] = 0
                            # Dedup and insert
                            self.save_data_mongodb(self.name, item)
            except Exception as e:
                print(e)

    def run(self):
        while True:
            if wb_content_q.empty():
                break
            dict_wb = wb_content_q.get()
            self.category = dict_wb['category']
            self.name = dict_wb['name']
            self.wb_id = dict_wb['id']
            max_content = self.get_author_info(self.start_temp_url.format(self.wb_id))
            containerid = self.get_containerid()
            self.get_url(containerid, max_content)

            # Start five crawler threads, then wait for all of them to finish
            Threads_caiji = []
            for x in range(5):
                t1 = threading.Thread(target=self.get_wb_content)
                Threads_caiji.append(t1)
            for a in Threads_caiji:
                a.start()
            for a in Threads_caiji:
                a.join()


class Excel_path:

    @staticmethod
    def get_excel_info(path, num):
        """
        Read the Excel sheet, split each blogger link to extract the blogger id,
        and push one dict per blogger (name, category, id) onto the queue.
        :param path: path of the Excel file
        :param num: sheet index
        """
        excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
        category_name = excel_sheet.col_values(0)[1:]
        wb_url = excel_sheet.col_values(2)[1:]
        name = excel_sheet.col_values(1)[1:]
        for a, i in enumerate(wb_url):
            item = {}
            # The blogger id is the last path segment of the profile URL
            c = i.split('?')[0]
            d = c.split('/')[-1]
            item['name'] = name[a]
            item['category'] = category_name[a]
            item['id'] = d
            print(item)
            wb_content_q.put(item)


if __name__ == '__main__':
    wb_content_q = queue.Queue()
    excel_path = 'D:\\gongsi_code\\ImageSpider\\微博\\Image_set\\数据源.xlsx'
    excel_index = 0
    Excel_path.get_excel_info(excel_path, int(excel_index))
    WB().run()

The crawler above collects Weibo posts and stores them in MongoDB; every post, together with its image links, is saved in MongoDB, one collection per blogger.
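As a rough sketch of what ends up in MongoDB, the snippet below queries one not-yet-downloaded record. The database and field names come from the crawler above; the blogger name 'some_blogger' and the id value are made-up placeholders.

from pymongo import MongoClient

# Sketch: inspect one record written by the crawler above.
# 'some_blogger' and the id value are placeholders, not real data.
client = MongoClient(host='localhost', port=27017)
db = client.WbImageSet
doc = db['some_blogger'].find_one({"select": 0})
print(doc)
# A stored document looks roughly like:
# {
#     "_id": "4391234567890123",      # Weibo post id (also used for dedup)
#     "id": "4391234567890123",
#     "category": "landscape",
#     "author": "some_blogger",
#     "title": "cleaned post text",
#     "url": ["https://wx1.sinaimg.cn/large/xxx.jpg"],
#     "select": 0                     # 0 = not downloaded yet, 1 = downloaded
# }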

Downloader

 

import os
import queue
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pymongo
import requests
import xlrd


class WbDownload:
    def __init__(self):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client.WbImageSet
        self.info_q = queue.Queue()

    def get_info(self, collection_name):
        # Queue up every record of this blogger that has not been downloaded yet
        self.collection = self.db[collection_name]
        a = self.collection.find({"select": 0})
        for index in a:
            self.info_q.put(index)

    def save_(self):
        """Save the queued records to the local disk, one at a time."""
        while True:
            if self.info_q.empty():
                break
            image = self.info_q.get()
            category_name = image['category']
            upload_time = time.strftime("%Y-%m-%d", time.localtime())
            # Strip whitespace and non-word characters so the title is a valid folder name
            rule = re.compile(r'\s*', re.S)
            rule2 = re.compile(r'\W*', re.S)
            title = rule.sub('', image['title'])
            title = rule2.sub('', title)
            path = 'D:/微博/' + category_name + '/' + str(upload_time) + '/' + title
            if os.path.exists(path):
                continue
            else:
                os.makedirs(path)
            with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                fb.write(str([image['title']]))
            for x_index, x in enumerate(image['url']):
                if x.endswith('.gif'):
                    continue
                response = requests.get(url=x)
                with open(path + '/{}.jpg'.format(x_index), 'wb') as f:
                    f.write(response.content)
            # Flip the select flag so this record is not downloaded again
            self.collection.update_one({"_id": image['_id']}, {"$set": {"select": 1}})
            print('-----  ' + title + '  saved  ------')

    def run(self):
        while True:
            if name_q.empty():
                break
            name = name_q.get()
            self.get_info(name)
            # 20 download threads per blogger; wait until all of them finish
            Threads = []
            for i in range(20):
                t_down = threading.Thread(target=self.save_)
                t_down.start()
                Threads.append(t_down)
            for t in Threads:
                t.join()


class Excel_path:

    @staticmethod
    def get_excel_info(path, num):
        """
        Read the blogger names from the Excel sheet and push them onto the queue;
        each name matches a MongoDB collection created by the crawler.
        :param path: path of the Excel file
        :param num: sheet index
        """
        excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
        name = excel_sheet.col_values(1)[1:]
        for x in name:
            print(x)
            name_q.put(x)


if __name__ == '__main__':
    name_q = queue.Queue()
    path = 'D:\\gongsi_code\\ImageSpider\\微博\\Image_set\\数据源.xlsx'
    excel_index = 0
    Excel_path.get_excel_info(path, excel_index)
    with ThreadPoolExecutor(5) as executor:
        wb = WbDownload()
        executor.submit(wb.run)

 

This script only pulls records from MongoDB and downloads their images; once a record has been downloaded it updates the select field in MongoDB so the same record is not used again.
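As a minimal sketch, the "mark as used" step boils down to the update below. It assumes the schema produced by the crawler; 'some_blogger' and the _id value are placeholders.

from pymongo import MongoClient

# Sketch of the "mark as used" step, assuming the schema written by the crawler.
# 'some_blogger' and the _id value are placeholders.
client = MongoClient(host='localhost', port=27017)
collection = client.WbImageSet['some_blogger']

# After a post's images are on disk, flip its select flag so the next run of
# get_info() (which queries {"select": 0}) skips it.
collection.update_one({"_id": "4391234567890123"}, {"$set": {"select": 1}})

# To re-download everything, reset the flag for the whole collection:
# collection.update_many({}, {"$set": {"select": 0}})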

 

Copyright notice: this is an original article by lqn404, licensed under CC 4.0 BY-SA. Please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/lqn404/p/11325252.html