在线微博数据可视化:即时采集微博数据,并通过不同形状的词云展示数据。

完整代码gitee地址:https://gitee.com/lyc96/weibo

1.先来效果图(压压惊)

1)输入明星完整名字

 

2)点击查看后,可以看到明星的言语文字可视化,有六种图形,可以随意切换

 

 

2.程序功能介绍

1)根据明星姓名去爬取该明星的微博言论,并存储到文本文件(项目中不涉及任何数据库存储),程序中也会根据日期进行去重爬取,避免程序重复爬取同一条微博。

2)采集到的明星微博文本存储在txt文件中,经jieba分词后使用stylecloud生成词云图片(共六种图形词云,可以在网页端进行切换)

 

3.python后端代码

# -*- coding: utf-8 -*-
"""
Created on Sun Jul 19 12:03:56 2020

@author: 李运辰
"""
import json
import os
import re
import time
from urllib.parse import quote

import jieba
import requests
from flask import Flask, render_template, request, Response, redirect, url_for
from flask_cors import CORS
from stylecloud import gen_stylecloud
  15. 15 #内网ip
  16. 16 app = Flask(__name__)
  17. 17 ###此处改为自己的ip地址,在index.html中两次也记得更改
  18. 18 ip="192.168.0.112"
  19. 19 ###
  20. 20 root="static/data/"
  21. 21 pagedata="pagedata/"
  22. 22 textdata="textdata/"
  23. 23
  24. 24 # 睡眠时间 传入int为休息时间,页面加载和网速的原因 需要给网页加载页面元素的时间
  25. 25 def s(int):
  26. 26 time.sleep(int)
  27. 27 headers = {
  28. 28
  29. 29 \'user-agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36\'
  30. 30 }
  31. 31 """初始化"""
  32. 32 def initialization():
  33. 33 #初始化爬取记录文本
  34. 34 if not os.path.exists(root):
  35. 35 os.mkdir(root)
  36. 36 if not os.path.exists(root+pagedata):
  37. 37 os.mkdir(root+pagedata)
  38. 38 if not os.path.exists(root+textdata):
  39. 39 os.mkdir(root+textdata)
  40. 40
  41. 41 def write(path,t):
  42. 42 #记录当前爬取页数
  43. 43 with open(path,"a+",encoding=\'utf8\') as f:
  44. 44 f.writelines(str(t))
  45. 45 f.writelines("\n")
  46. 46
  47. 47 def search(name_s,url,since_id):
  48. 48
  49. 49 #url = "https://m.weibo.cn/api/container/getIndex?uid=1566301073&t=0&luicode=10000011&lfid=100103type=1&q=贾玲&type=uid&value=1566301073&containerid=1076031566301073"
  50. 50 start=1
  51. 51 if since_id is not None and len(since_id)>1:
  52. 52 url+="&since_id="+since_id
  53. 53 start=0
  54. 54 response = requests.get(url,headers = headers)
  55. 55
  56. 56 datas = response.json()
  57. 57 #print(data)
  58. 58 ok = str(datas[\'ok\'])
  59. 59 try:
  60. 60 with open(root+pagedata+name_s+".txt","r") as f: #设置文件对象
  61. 61 pagelist = f.read()
  62. 62 except:
  63. 63 pagelist=[]
  64. 64
  65. 65 if ok is not None and ok==\'1\':
  66. 66 data = datas[\'data\']
  67. 67 since_ids = data[\'cardlistInfo\'][\'since_id\']
  68. 68 print(since_ids)
  69. 69 cards = data[\'cards\']
  70. 70 print(len(cards))
  71. 71 for i in range(start,len(cards)):
  72. 72 date = cards[i][\'mblog\'][\'created_at\']
  73. 73 if str(date) not in pagelist:
  74. 74 text1 = cards[i][\'mblog\'][\'text\']
  75. 75 write(root+textdata+name_s+".txt",clean(text1))
  76. 76 write(root+pagedata+name_s+".txt",date)
  77. 77
  78. 78 """去掉表情...,等html标签"""
  79. 79 def clean(s):
  80. 80 istart=-1
  81. 81 try:
  82. 82 istart = s.index(\'<\')
  83. 83 iend = s.index(\'>\')
  84. 84 s = s[:istart]+s[iend+1:]
  85. 85 except:
  86. 86 pass
  87. 87 try:
  88. 88 istart = s.index(\'<\')
  89. 89 except:
  90. 90 pass
  91. 91 if istart>=0:
  92. 92 return clean(s)
  93. 93 else:
  94. 94 #print(s)
  95. 95 return(s)
  96. 96
  97. 97 def geturl(name_g):
  98. 98 url1="https://m.weibo.cn/api/container/getIndex?containerid=100103type=1%26q="+name_g+"&page_type=searchall"
  99. 99 response = requests.get(url1,headers = headers)
  100. 100 datas = response.json()
  101. 101 uid = str(datas[\'data\'][\'cards\'][0][\'card_group\'][0][\'user\'][\'id\'])
  102. 102 newurl = "https://m.weibo.cn/api/container/getIndex?uid="+uid+"&t=0&luicode=10000011&lfid=100103type=1&q="+name_g+"&type=uid&value="+uid+"&containerid=107603"+uid
  103. 103 return newurl
  104. 104
  105. 105 def jieba_cloud(file_name,icon):
  106. 106 with open(file_name,\'r\',encoding=\'utf8\') as f:
  107. 107 word_list = jieba.cut(f.read())
  108. 108 result = " ".join(word_list) #分词用 隔开
  109. 109 #制作中文云词
  110. 110 icon_name=""
  111. 111 if icon=="1":
  112. 112 icon_name=\'\'
  113. 113 elif icon=="2":
  114. 114 icon_name=\'fas fa-dragon\'
  115. 115 elif icon=="3":
  116. 116 icon_name=\'fas fa-dog\'
  117. 117 elif icon=="4":
  118. 118 icon_name=\'fas fa-cat\'
  119. 119 elif icon=="5":
  120. 120 icon_name=\'fas fa-dove\'
  121. 121 elif icon=="6":
  122. 122 icon_name=\'fab fa-qq\'
  123. 123 """
  124. 124 # icon_name=\'\',#国旗
  125. 125 # icon_name=\'fas fa-dragon\',#翼龙
  126. 126 icon_name=\'fas fa-dog\',#狗
  127. 127 # icon_name=\'fas fa-cat\',#猫
  128. 128 # icon_name=\'fas fa-dove\',#鸽子
  129. 129 # icon_name=\'fab fa-qq\',#qq
  130. 130 """
  131. 131 picp=file_name.split(\'.\')[0] +str(icon)+\'.png\'
  132. 132 if icon_name is not None and len(icon_name)>0:
  133. 133 gen_stylecloud(text=result,icon_name=icon_name,font_path=\'simsun.ttc\',output_name=picp) #必须加中文字体,否则格式错误
  134. 134 else:
  135. 135 gen_stylecloud(text=result,font_path=\'simsun.ttc\',output_name=picp) #必须加中文字体,否则格式错误
  136. 136
  137. 137 return picp
  138. 138 ############################flask路由
  139. 139 #进入首页
  140. 140 @app.route(\'/\')
  141. 141 def index():
  142. 142 return render_template(\'index.html\')
  143. 143 #获取图片
  144. 144 @app.route(\'/find\')
  145. 145 def find():
  146. 146 #global history
  147. 147 #采集数据
  148. 148 name_i = request.args.get(\'name\')
  149. 149
  150. 150 if not os.path.exists(root+textdata+name_i+\'.txt\'):
  151. 151 u = geturl(name_i)
  152. 152 search(name_i,u,"")
  153. 153 #制作词云
  154. 154 file_name = root+textdata+name_i+\'.txt\'
  155. 155 picpath = jieba_cloud(file_name,"1")
  156. 156
  157. 157 return Response(json.dumps(picpath), mimetype=\'application/json\')
  158. 158 #切换图标
  159. 159 @app.route(\'/switchs\')
  160. 160 def switchs():
  161. 161 #global history
  162. 162 #采集数据
  163. 163 name_i = request.args.get(\'name\')
  164. 164 icon = request.args.get(\'ic\')
  165. 165 #制作词云
  166. 166 file_name = root+textdata+name_i+\'.txt\'
  167. 167 picpath = jieba_cloud(file_name,str(icon))
  168. 168 return Response(json.dumps(picpath), mimetype=\'application/json\')
  169. 169 ############################end
  170. 170
  171. 171 if __name__ == "__main__":
  172. 172 """初始化"""
  173. 173 initialization()
  174. 174 app.run(host=\'\'+ip, port=5000,threaded=True)

View Code

 

4.完整代码gitee地址:https://gitee.com/lyc96/weibo

 

 

关注公众号:Python爬虫数据分析挖掘,学习更多python知识

版权声明:本文为chenlove原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/chenlove/p/13367153.html