Fetching Weibo Data with Reduced Coupling
Weibo collector
```python
import json
import queue
import re
import threading

import requests
import xlrd
from lxml import etree
from pymongo import MongoClient

# Abuyun dynamic proxy credentials
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": "http-dyn.abuyun.com",
    "port": "9020",
    "user": "H6VZC52B4BF2986D",
    "pass": "7850C72DC876E723",
}


class WB:
    def __init__(self):
        self.start_temp_url = "https://m.weibo.cn/api/container/getIndex?type=uid&value={}"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        self.proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        self.save_content_q = queue.Queue()
        self.url_q = queue.Queue()
        self.client = MongoClient(host='localhost', port=27017)
        # MongoDB database that holds one collection per blogger
        self.db = self.client.WbImageSet

    def save_data_mongodb(self, collect_name, data):
        # the post id is used as _id, so each post is stored only once
        self.collect_name = self.db[collect_name]
        history_record = self.collect_name.find_one({"_id": data['id']})
        if history_record:
            # already in the database, skip
            pass
        else:
            # not in the database yet, insert it
            self.collect_name.update_one({'_id': data['id']}, {'$set': data}, upsert=True)

    def get_author_info(self, url):
        """
        Fetch the blogger's profile and return the total number of posts.
        """
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies, verify=True)
        self.data = response.content.decode('utf8')
        content = json.loads(self.data).get('data')
        max_content = content.get('userInfo').get('statuses_count')
        return max_content

    def get_containerid(self):
        """Get the containerid of the blogger's Weibo tab; it is required when fetching posts."""
        content = json.loads(self.data).get('data')
        for data in content.get('tabsInfo').get('tabs'):
            if data.get('tab_type') == 'weibo':
                return data.get('containerid')

    def get_url(self, containerid, max_content):
        # ten posts per page, so max_content // 10 pages in total;
        # the value parameter must be the blogger's uid
        for x in range(int(max_content) // 10):
            wb_content_url = ('https://m.weibo.cn/api/container/getIndex?type=uid&value=' + self.wb_id +
                              '&containerid=' + containerid + '&page=' + str(x))
            self.url_q.put(wb_content_url)

    def get_wb_content(self):
        """Fetch the posts and save each one to MongoDB: text, detail-page url, image urls, etc."""
        num = 0
        while True:
            try:
                if self.url_q.empty():
                    break
                weibo_url = self.url_q.get()
                response = requests.get(url=weibo_url, headers=self.headers, proxies=self.proxies, verify=True)
                content = json.loads(response.content.decode('utf8')).get('data')
                cards = content.get('cards')
                if len(cards) > 0:
                    for j in range(len(cards)):
                        num += 1
                        card_type = cards[j].get('card_type')
                        item = {}
                        if card_type == 9:
                            mblog = cards[j].get('mblog')
                            scheme = cards[j].get('scheme')  # detail-page url of the post
                            print(scheme)
                            print("--crawling--{}--post {}--".format(self.name, num))
                            text_id = mblog.get("id")  # post id, used for deduplication
                            text = mblog.get('text')   # post body (html)
                            # strip html tags, hashtags and mentions to build the title
                            html = etree.HTML(text)
                            x = html.xpath('//text()')
                            title = ','.join(x)
                            title = title.replace('\r', '').replace('\n', '').replace('\t', '')
                            title = re.sub('(#.*#)', '', title)
                            title = re.sub('@', '', title)
                            title = re.sub(' ', '', title)
                            pictures = mblog.get('pics')  # attached images, a list
                            pic_urls = []                 # urls of the large versions
                            if pictures:
                                for picture in pictures:
                                    pic_urls.append(picture.get('large').get('url'))
                            if pic_urls == []:
                                # skip posts without images
                                continue
                            item['id'] = text_id
                            item['category'] = self.category
                            item['author'] = self.name
                            item['title'] = title
                            item['url'] = pic_urls
                            item['select'] = 0  # 0 = not downloaded yet
                            # deduplicated insert
                            self.save_data_mongodb(self.name, item)
            except Exception as e:
                print(e)

    def run(self):
        while True:
            if wb_content_q.empty():
                break
            dict_wb = wb_content_q.get()
            self.category = dict_wb['category']
            self.name = dict_wb['name']
            self.wb_id = dict_wb['id']
            max_content = self.get_author_info(self.start_temp_url.format(self.wb_id))
            containerid = self.get_containerid()
            self.get_url(containerid, max_content)

        Threads_caiji = []
        for x in range(5):
            t1 = threading.Thread(target=self.get_wb_content)
            Threads_caiji.append(t1)
        # start all workers first, then wait for them;
        # joining inside the start loop would run the threads one by one
        for a in Threads_caiji:
            a.start()
        for a in Threads_caiji:
            a.join()


class Excel_path:

    @staticmethod
    def get_excel_info(path, num):
        """
        Read the source Excel sheet, split the blogger id out of each profile url,
        and put one dict per blogger (name / category / id) into the queue.
        :param path: path of the excel file
        :param num: sheet index
        """
        excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
        category_name = excel_sheet.col_values(0)[1:]
        wb_url = excel_sheet.col_values(2)[1:]
        name = excel_sheet.col_values(1)[1:]
        for i in wb_url:
            item = {}
            a = wb_url.index(i)
            c = i.split('?')[0]
            d = c.split('/')[-1]
            item['name'] = name[a]
            item['category'] = category_name[a]
            item['id'] = d
            print(item)
            wb_content_q.put(item)


if __name__ == '__main__':
    wb_content_q = queue.Queue()
    excel_path = 'D:\\gongsi_code\\ImageSpider\\微博\\Image_set\\数据源.xlsx'
    excel_index = 0
    Excel_path.get_excel_info(excel_path, int(excel_index))
    WB().run()
```
The collector crawls the Weibo data and stores it in MongoDB: every post becomes one document holding its image links, grouped into one collection per blogger.
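To make the stored shape concrete, here is a minimal sketch of one such document and the `_id`-based upsert the collector relies on for deduplication. It assumes a MongoDB instance on localhost; the collection name `some_blogger` and all field values are made-up examples.

```python
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
db = client.WbImageSet

# One document per post, shaped like the collector's `item` dict.
# The post id doubles as MongoDB's _id, so re-crawling the same post is a no-op.
doc = {
    'id': '4391234567890123',          # sample post id (made up)
    'category': 'travel',              # category column from the Excel sheet
    'author': 'some_blogger',          # blogger name, also used as the collection name
    'title': 'a cleaned-up post title',
    'url': ['https://wx1.sinaimg.cn/large/example.jpg'],  # sample image url
    'select': 0,                       # 0 = not downloaded yet
}

collection = db['some_blogger']
collection.update_one({'_id': doc['id']}, {'$set': doc}, upsert=True)

# The downloader later picks up only records that have not been downloaded.
print(collection.count_documents({'select': 0}))
```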
Downloader
```python
import os
import queue
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pymongo
import requests
import xlrd


class WbDownload:
    def __init__(self):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.db = self.client.WbImageSet
        self.info_q = queue.Queue()

    def get_info(self, collection_name):
        # queue every record of this blogger that has not been downloaded yet
        self.collection = self.db[collection_name]
        a = self.collection.find({"select": 0})
        for index in a:
            self.info_q.put(index)

    def save_(self):
        """Write the title and images of each queued record to the local disk."""
        while True:
            if self.info_q.empty():
                break
            image = self.info_q.get()
            category_name = image['category']
            upload_time = time.strftime("%Y-%m-%d", time.localtime())
            rule = re.compile(r'\s*', re.S)
            rule2 = re.compile(r'\W*', re.S)
            title = rule.sub('', image['title'])
            title = rule2.sub('', title)
            path = 'D:/微博/' + category_name + '/' + str(upload_time) + '/' + title
            if os.path.exists(path):
                continue
            else:
                os.makedirs(path)
            with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                fb.write(str([image['title']]))
            for x in image['url']:
                x_index = image['url'].index(x)
                if x.endswith('gif'):
                    # skip animated images
                    continue
                with open(path + '/{}.jpg'.format(str(x_index)), 'wb') as f:
                    response = requests.get(url=x)
                    f.write(response.content)
            # mark the record as downloaded so it is not picked up again
            self.collection.update_one({"_id": image['_id']}, {"$set": {"select": 1}})
            print('----- ' + title + ' saved ------')

    def run(self):
        while True:
            if name_q.empty():
                break
            name = name_q.get()
            self.get_info(name)
            Threads = []
            for i in range(20):
                t_down = threading.Thread(target=self.save_)
                t_down.start()
                Threads.append(t_down)
            for t in Threads:
                t.join()


class Excel_path:

    @staticmethod
    def get_excel_info(path, num):
        """
        Read the blogger names (second column) from the source Excel sheet and queue them;
        each name is also the name of the MongoDB collection written by the collector.
        :param path: path of the excel file
        :param num: sheet index
        """
        excel_sheet = xlrd.open_workbook(path).sheet_by_index(num)
        name = excel_sheet.col_values(1)[1:]
        for x in name:
            print(x)
            name_q.put(x)


if __name__ == '__main__':
    name_q = queue.Queue()
    path = 'D:\\gongsi_code\\ImageSpider\\微博\\Image_set\\数据源.xlsx'
    excel_index = 0
    Excel_path.get_excel_info(path, excel_index)
    with ThreadPoolExecutor(5) as executor:
        wb = WbDownload()
        executor.submit(wb.run)
```
The downloader only pulls records from MongoDB and downloads the images; once a record has been written to disk, it updates the record's `select` flag in MongoDB so the same data is never processed twice.
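A minimal sketch of that bookkeeping, under the same assumptions as above (local MongoDB, placeholder collection name): records are fetched with `select: 0`, and flipping the flag to 1 after a successful download means a re-run of the downloader skips them.

```python
from pymongo import MongoClient

collection = MongoClient(host='localhost', port=27017).WbImageSet['some_blogger']  # placeholder name

for record in collection.find({'select': 0}):  # only records that are not downloaded yet
    # ... download every url in record['url'] to disk here ...
    # flip the flag so the record is never handed out again
    collection.update_one({'_id': record['_id']}, {'$set': {'select': 1}})
```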
Copyright notice: this article is an original work by lqn404 and is licensed under CC 4.0 BY-SA; please include a link to the original source and this notice when reposting.