Python: a Hupu (虎扑) forum scraper
Python, as a high-level programming language, has been popular in the community for quite a while now. To keep up with the times I picked it up myself. As the saying (jokingly attributed to Lu Xun) goes: learning without applying is just fooling around. So I used Python to write a scraper for the Hupu forums. The scripts are rough, meant only for beginner-level exchange and as a reference for myself later. I originally planned to turn this into an analysis post about Hupu, but ran out of steam and never wrote it. Still, as a Spurs fan, I am honored that our team ranks in the top three for forum heat.
Preparation: install Python, install MySQL, and optionally a virtual machine (used later to run the script as a daily scheduled task on a server).
1. Install Python: choose 3.x; installation details omitted.
2. Install MySQL: choose version 5.6 or above; installation details omitted.
3. Virtual machine: any Linux distribution; installation details omitted.
Requirements
Scrape Hupu forum posts to learn each post's content, author, heat (read/reply counts), and so on.
Writing the scripts
The work splits into three parts: Part 1 parses the forum list pages and extracts each post's author and read/reply information; Part 2 fetches the body of each post; Part 3 extracts data about the posters themselves, to feed later analysis. The full scripts follow. One thing to keep in mind: encoding, encoding, encoding. Thank you!
Note: because of Hupu's anti-scraping measures, only the first 10 pages of each sub-forum are readable (I failed to get around this, thank you!). My workaround is to put the script on a server and let a daily run accumulate the data over time.
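Since the encoding caveat above is what trips up most first runs, here is a minimal sketch of the pattern all three scripts rely on: parse raw bytes with BeautifulSoup, write intermediate files as UTF-8, and open the MySQL connection with an explicit charset. The file name demo_page.txt is only a placeholder; the connection parameters are the same ones used below.

# Minimal encoding sketch; demo_page.txt is a placeholder file name.
import requests
from bs4 import BeautifulSoup
import pymysql

resp = requests.get('https://bbs.hupu.com/spurs-1')
soup = BeautifulSoup(resp.content, 'html.parser')   # feed raw bytes and let BeautifulSoup detect the charset

# write intermediate files explicitly as UTF-8
with open('demo_page.txt', 'w', encoding='utf8') as f:
    f.write(str(soup))

# open the MySQL connection with an explicit charset as well
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
conn.close()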
Part 1: scrape each post's title, author, creation time, read/reply counts, author link, etc., and load them into a local MySQL database.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import json
import time
import pymysql

forum_note_sum = []                    # accumulates one record per post
list_d = ['原创', '翻译', '讨论']        # if the first link is one of these tag labels, the real title is the second link


def parent_li_web(num):
    # num: index of the <li> on the current list page; extracts title, author, read/reply counts, etc.
    forum_note_record = {}
    try:
        parent_tiezi = bs_obj.find('ul', class_='for-list').find_all('li')[num]
        div_one = parent_tiezi.find('div', class_='titlelink box')
        div_two = parent_tiezi.find('div', class_='author box')
        span_three = parent_tiezi.find('span', class_='ansour box').string.strip()
        div_four = parent_tiezi.find('div', class_='endreply box')
        subname = div_one.a.string
        sublink = 'https://bbs.hupu.com' + div_one.a['href']
        team_tmp = theme_tmp
        for i in list_d:
            if i == subname:
                subname = div_one.find_all('a')[1].string
                sublink = 'https://bbs.hupu.com' + div_one.find_all('a')[1]['href']
                # print(i, subname, sublink)
        forum_note_record.update({
            'subname': subname,
            'subname_link': sublink,
            'author': div_two.a.string,
            'author_link': div_two.a['href'],
            'author_create_time': div_two.find('a', style='color:#808080;cursor: initial; ').string,
            'read_reply_number': span_three,
            'last_reply_writer': div_four.span.string,
            'last_reply_time': div_four.a.string,
            'team_tmp': team_tmp
        })
        forum_note_sum.append(forum_note_record)
    except Exception:
        # some <li> entries are ads or placeholders; skip anything that does not parse
        return None


if __name__ == '__main__':
    begin_time = time.time()
    print('---------脚本执行时间为:{}------------'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    team_list = ['rockets', 'warriors', 'cavaliers', 'spurs', 'lakers', 'celtics', 'thunder', 'clippers',
                 'timberwolves', 'mavericks', 'knicks', 'bulls', 'nets', 'sixers', 'jazz', 'pacers',
                 'blazers', 'heat', 'suns', 'grizzlies', 'wizards', 'pelicans', 'bucks', 'kings',
                 'raptors', 'nuggets', 'hawks', 'hornets', 'pistons', 'magic']
    for li in team_list:
        theme_tmp = li
        for i in range(1, 11):          # Hupu's anti-scraping only exposes 10 pages per board; run daily to accumulate
            url = 'https://bbs.hupu.com/{}-{}'.format(li, i)
            print(url)
            wb_string = requests.get(url)
            bs_obj = BeautifulSoup(wb_string.content, 'html.parser')
            with open('web_spider_original.txt', 'w', encoding='utf8') as f:
                f.write(str(bs_obj))
                f.write('\r' * 10 + '-----我是分割线-----' + '\r' * 10)
            for j in range(1, 61):      # each list page holds 60 posts
                parent_li_web(j)

    # dump everything scraped today to a JSON-lines file
    with open('hupu_spider_spurs_load.txt', 'w', encoding='utf8') as f:
        for item in forum_note_sum:
            json.dump(item, f, ensure_ascii=False)
            f.write('\r')

    # load the day's records into MySQL
    conn = pymysql.connect(host='localhost', user='root', passwd='1234', db='spider', port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute('delete from hupuforum_spurs_note_daytmp')
    with open('hupu_spider_spurs_load.txt', 'r', encoding='utf8') as f:
        for item in f:
            item = json.loads(item)     # each line is one JSON record; convert it back into a dict
            cur.execute('insert into hupuforum_spurs_note_daytmp(subname,subname_link,author,author_link,author_create_time,read_reply_number,last_reply_writer,last_reply_time,theme_title) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                        (item['subname'], item['subname_link'], item['author'], item['author_link'],
                         item['author_create_time'], item['read_reply_number'], item['last_reply_writer'],
                         item['last_reply_time'], item['team_tmp']))
    conn.commit()
    cur.close()
    conn.close()
    print('Finished!本次执行消耗时间为:{}秒'.format(time.time() - begin_time))
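The script above assumes the staging table hupuforum_spurs_note_daytmp already exists. Below is a sketch of a matching DDL, run through pymysql so it can sit next to the scraper; only the column names come from the INSERT statement above, the column types are my assumptions.

# Sketch: create the staging table the Part 1 script inserts into.
# Column types are assumptions; only the names are taken from the INSERT above.
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS hupuforum_spurs_note_daytmp (
    subname            VARCHAR(255),
    subname_link       VARCHAR(255),
    author             VARCHAR(100),
    author_link        VARCHAR(255),
    author_create_time VARCHAR(50),
    read_reply_number  VARCHAR(50),
    last_reply_writer  VARCHAR(100),
    last_reply_time    VARCHAR(50),
    theme_title        VARCHAR(50)
) DEFAULT CHARSET=utf8;
"""

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()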
Part 2: add the post body and update a few fields.
# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()

# merge the day's staging table into the main table, then refresh the counters of rows that already exist
cur.execute('INSERT INTO hupuforum_spurs_note SELECT * FROM hupuforum_spurs_note_daytmp WHERE subname_link NOT IN (SELECT a.subname_link FROM hupuforum_spurs_note a);')
cur.execute('update hupuforum_spurs_note a,hupuforum_spurs_note_daytmp b set a.read_reply_number=b.read_reply_number,a.last_reply_writer=b.last_reply_writer,a.last_reply_time=b.last_reply_time where a.subname_link=b.subname_link')
conn.commit()

cur.execute('select subname_link from hupuforum_spurs_note where sub_text is null;')
for url in cur.fetchall():
    url = list(url)
    try:
        wb_page = requests.get(url[0], timeout=2)    # some pages hang, so set a timeout
        bs_obj = BeautifulSoup(wb_page.content, 'html.parser')
        tmp_text = bs_obj.select('#tpc > div > div.floor_box > table.case > tbody > tr > td > div.quote-content')
        sub_text = tmp_text[0].get_text(strip=True)
        sub_text = sub_text.replace("'", '’')        # swap single quotes so the formatted SQL below stays valid
        sql = """update hupuforum_spurs_note set sub_text='{}' where subname_link={};""".format(sub_text[:1000], str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
        print('success')
    except IndexError:                               # the post page no longer exists
        sql = """update hupuforum_spurs_note set sub_text='{}' where subname_link={};""".format('网页不存在', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except pymysql.err.InternalError:                # body contains emoji or other 4-byte UTF-8 the utf8 connection cannot store
        sql = """update hupuforum_spurs_note set sub_text='{}' where subname_link={};""".format('内容格式有误,导致出错!', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except requests.exceptions.ReadTimeout:          # the request timed out
        sql = """update hupuforum_spurs_note set sub_text='{}' where subname_link={};""".format('网页打开超时', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()
    except Exception:                                # any other error type
        sql = """update hupuforum_spurs_note set sub_text='{}' where subname_link={};""".format('其他类型错误', str(url).replace('[', '').replace(']', ''))
        sub_cur.execute(sql)
        conn.commit()

conn.commit()
cur.close()
sub_cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished,任务开始时间为:{},结束时间为:{}'.format(begin_time, end_time))
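As an aside, the string-formatted UPDATE above needs the manual quote replacement and still fails on 4-byte emoji. An alternative (not what the script above does) is a parameterized query over a utf8mb4 connection, assuming the table has been converted to utf8mb4; the link and text below are placeholders.

# Sketch of a parameterized UPDATE over utf8mb4; link and sub_text are placeholders.
import pymysql

link = 'https://bbs.hupu.com/12345678.html'   # placeholder post link
sub_text = '帖子正文……'                        # placeholder for the scraped body

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8mb4')
with conn.cursor() as cur:
    # the driver escapes quotes and emoji survive the utf8mb4 connection
    cur.execute('update hupuforum_spurs_note set sub_text=%s where subname_link=%s',
                (sub_text[:1000], link))
conn.commit()
conn.close()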
Part 3: scrape registered-user information.
# coding=utf8
import time
import requests
from bs4 import BeautifulSoup
import pymysql

begin_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='1234', db='spider', charset='utf8')
cur = conn.cursor()
sub_cur = conn.cursor()

cur.execute('select distinct author_link from hupuforum_spurs_note;')
for author_url in cur.fetchall():
    try:
        author_url = list(author_url)
        wb_obj = requests.get(author_url[0], timeout=2)
        bs_obj = BeautifulSoup(wb_obj.content, 'html.parser')
        author = bs_obj.select('#main > div.personal > div.personal_right > h3 > div')[0].string
        author_visited = bs_obj.select('#main > div.personal > div.personal_right > h3 > span')[0].string.replace('有', '').replace('人次访问', '')
        author_info = bs_obj.select('#main > div.personal > div.personal_right > div')[0].get_text(strip=True)
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)',
                        (author, author_url[0], author_visited, author_info, '正常'))
    except IndexError:
        # profile page missing or private; the name may not have been parsed yet, so store an empty string
        sub_cur.execute('insert into hupuforum_authors_info(author,author_link,author_visited,author_info,author_status) values(%s,%s,%s,%s,%s)',
                        ('', author_url[0], '', '', '无法访问'))
    conn.commit()

conn.commit()
cur.close()
sub_cur.close()
conn.close()
end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
print('Finished,任务开始时间为:{},结束时间为:{}'.format(begin_time, end_time))
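Finally, as a hypothetical starting point for the analysis I never got around to, here are a few lines that rank the boards by the number of accumulated posts; theme_title is the team column filled in by Part 1.

# Hypothetical analysis starter: count accumulated posts per team board.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='1234', db='spider', charset='utf8')
with conn.cursor() as cur:
    cur.execute(
        'select theme_title, count(*) as note_cnt '
        'from hupuforum_spurs_note group by theme_title order by note_cnt desc'
    )
    for theme, cnt in cur.fetchall():
        print(theme, cnt)
conn.close()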