csdn论坛页抓取
抓取csdn论坛
实现功能
-
获取论坛分类所有链接,并拼接成推荐精华页的完整的链接
-
获取推荐精华页的帖子状态,赏分,帖子标题,作者,发布时间,回复量,查看量,最后发表时间
-
置顶内容不爬取,只打印置顶内容的帖子标题,作者信息和帖子链接
-
跳过没有内容的版块,并打印出来
-
把获取到的内容添加到数据库
未实现功能
-
获取非技术论坛时遇到一个讨论帖子,前面没有帖子状态,出现index下标异常
-
未爬取推荐精华页的所有帖子(只实现了第一页的爬取)
-
未爬取帖子内容(帖子发布的内容和回复信息等等)
-
未爬取发帖人的个人信息(排名,发帖数,回帖数,结帖率等等)
总结:
-
python的很多基础方法不知道
-
字符串操作不熟练
-
xpath语法不熟练
-
peewee很多方法不知道
-
马虎,判断帖子有没有内容时,后面忘加一个方法导致运行错误,找了半个多小时才找到
-
未实现功能应该都能解决,只是嫌麻烦
""" 抓取 解析 存储 """ import re import ast from urllib import parse import requests from scrapy import Selector import json import time from csdn_spider.models import * domain = 'https://bbs.csdn.net' def get_nodes_json(): left_menu_text = requests.get('https://bbs.csdn.net/dynamic_js/left_menu.js?csdn').text # print(left_menu_text) nodes_str_match = re.search('forumNodes: (.*])',left_menu_text) if nodes_str_match: nodes_str = nodes_str_match.group(1).replace('null','None') nodes_list = ast.literal_eval(nodes_str) # print(nodes_list) return nodes_list return [] url_list = [] def process_nodes_list(nodes_list): #将js的格式提取出url转换到list中 for item in nodes_list: if 'url' in item: if item['url']: url_list.append(item['url']) if 'children' in item: process_nodes_list(item['children']) def get_levell_list(nodes_list): levell_url = [] for item in nodes_list: if 'url' in item and item['url']: levell_url.append(item['url']) return levell_url def get_last_list(): #获取最终需要抓取的url nodes_list = get_nodes_json() process_nodes_list(nodes_list) levell_url = get_levell_list(nodes_list) last_url = [] for url in url_list: if url not in levell_url: last_url.append(url) all_urls = [] for url in last_url: all_urls.append(parse.urljoin(domain, url+'/recommend')) return all_urls def parse_list(url): res_text = requests.get(url).text sel = Selector(text=res_text) all_sel = sel.xpath('//table[@class="forums_tab_table"]/tbody//tr') if len(all_sel.extract()) != 0: if str(re.search('没有帖子', all_sel.extract()[0])) != 'None': print('没有帖子') return for tr in all_sel: if (tr.xpath('td[@class="forums_topic"]/span[1]/text()').extract()) == ['[置顶]']: print('发现置顶!!!') print('置顶账号为:',tr.xpath('td[@class="forums_author"]/a/text()').extract()[0]) print('置顶内容为:',tr.xpath('td[@class="forums_topic"]/a[2]/text()').extract()[0]) print('置顶链接为:',parse.urljoin(domain,tr.xpath('td[@class="forums_topic"]/a[2]/@href').extract()[0])) print('###############') else: #帖子状态 status = 
tr.xpath('td[@class="forums_topic_flag"]/span/text()').extract()[0] # print('帖子状态', status) #赏分 score = tr.xpath('td[@class="forums_score"]/em/text()').extract()[0] # print('赏分', score) #标题链接 topic_url = parse.urljoin(domain,tr.xpath('td[@class="forums_topic"]/a/@href').extract()[0]) # print('标题链接', topic_url) #标题 topic_title = tr.xpath('td[@class="forums_topic"]/a/text()').extract()[0] # print('标题', topic_title) #标题id topic_id = topic_url.split('/')[-1] # print('标题id', topic_id) #作者链接 id_url = tr.xpath('td[@class="forums_author"]/a/@href').extract()[0] # print('作者链接', id_url) #作者id author_id = id_url.split('/')[-1] # print('作者id', author_id) #作者名称 author_name = tr.xpath('td[@class="forums_author"]/a/text()').extract()[0] # print('作者名称', author_name) #发布时间 create_time = datetime.strptime(tr.xpath('td[@class="forums_author"]/em/text()').extract()[0], '%Y-%m-%d %H:%M') # print('发布时间', create_time) #回复和查看的字符串 answer_info = (tr.xpath('td[@class="forums_reply"]/span/text()').extract()[0]).split('/') #回复数量 answer_nums = answer_info[0] # print('回复数量', answer_nums) #查看数量 click_nums = answer_info[-1] # print('查看数量', click_nums) #最后发表时间 last_time = datetime.strptime(tr.xpath('td[@class="forums_last_pub"]/em/text()').extract()[0], '%Y-%m-%d %H:%M') # print('最后发表时间', last_time) #添加到数据库 Topic.create(id=topic_id,status=status,score=score,title_url=topic_url,title=topic_title,author_id=author_id,author_name=author_name,create_time=create_time,answer_nums=answer_nums,click_nums=click_nums,last_answer_time=last_time) if __name__=='__main__': all_urls = get_last_list() for url in all_urls: print('正在连接:',url) parse_list(url) time.sleep(3)
from peewee import *

# Shared MySQL connection (server must be running locally).
db = MySQLDatabase('spider', host='127.0.0.1', port=3306,
                   user='root', password='123456lmr')


class BaseModel(Model):
    """Base model binding every table to the shared MySQL connection."""
    class Meta:
        database = db


# Table-design notes (from the original author, translated):
# - For CharField, set a maximum length where one is known.
# - When the maximum length cannot be determined, use TextField.
# - Remember default= and null=True where fields may be absent.
# - Primary keys of types other than int may not work (possibly a
#   peewee-version issue).


class Topic(BaseModel):
    """A forum topic row scraped from a recommended-posts listing page."""
    title = CharField()                      # topic title
    title_url = CharField(default='')        # topic link
    id = IntegerField(primary_key=True)      # topic id, parsed from the url
    author_id = CharField()                  # author id
    author_name = CharField()                # author display name
    create_time = DateTimeField()            # publish time
    answer_nums = IntegerField(default=0)    # reply count
    click_nums = IntegerField(default=0)     # view count
    score = IntegerField(default=0)          # bounty points
    status = CharField()                     # topic status flag
    last_answer_time = DateTimeField()       # time of the last reply


class Answer(BaseModel):
    """A reply inside a topic (crawling not implemented yet)."""
    topic_id = IntegerField()                # id of the topic replied to
    author = CharField()                     # reply author
    content = TextField(default="")          # reply body
    create_time = DateTimeField()            # reply time
    parised_nums = IntegerField(default=0)   # like count


class Author(BaseModel):
    """A CSDN user profile (crawling not implemented yet)."""
    name = CharField()                        # display name
    sign_name_id = CharField()                # login/profile id
    click_nums = IntegerField(default=0)      # profile visits
    original_nums = IntegerField(default=0)   # original-post count
    forward_nums = IntegerField(default=0)    # repost count
    # Fix: CharField default must be a string, not the int -1.
    rate = CharField(default='-1')            # rank
    answer_nums = IntegerField(default=0)     # comment count
    parised_nums = IntegerField(default=0)    # likes received
    desc = TextField(null=True)               # profile description
    industry = CharField(null=True)           # industry
    location = CharField(null=True)           # location
    follower_nums = IntegerField(default=0)   # follower count
    following_nums = IntegerField(default=0)  # following count


if __name__ == '__main__':
    db.create_tables([Topic, Answer, Author])
版权声明:本文为Ly-233原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。