Scraping Course Comments from Tencent Classroom (腾讯课堂)
Recently I wanted to get a sense of how to choose an online course and how to judge its quality, so I tried scraping Tencent Classroom (ke.qq.com), limiting the crawl to the IT/Internet category.
Some analysis showed that getting to the comments takes four steps:
- Parse the learning directions (the top-level menu), as shown in the screenshot below:
Inspecting the element with the browser's developer tools shows that the entries live under the tag <dl class="sort-menu sort-menu1 clearfix">.
The parsing code follows:
# _pattern is the regular expression used to extract the href
def get_menu_link(self, url, _pattern):
    headers = {
        'user-agent': self.round_header()
    }
    start = time.perf_counter()
    res = self.s.get(url, headers=headers)
    if res is None:
        return
    content = res.text
    # grab everything inside the sort-menu <dl> first
    menu_pattern = re.compile(r'<dl class="sort-menu sort-menu1 clearfix">(.*?)</dl>', re.S)
    menu = re.findall(menu_pattern, content)
    link_pattern = re.compile(_pattern, re.S | re.M)
    if len(menu) != 0:
        links = re.findall(link_pattern, menu[0])
        end = time.perf_counter()
        _time = end - start
        print('{0} parsed successfully, took {1:f}s'.format(url, _time))
        for item in links:
            # hrefs come back HTML-escaped, so turn &amp; into &
            item = item.replace('&amp;', '&')
            link = 'https://ke.qq.com{0}'.format(item)
            yield link
    else:
        end = time.perf_counter()
        _time = end - start
        print('{0} failed to parse, took {1:f}s'.format(url, _time))
        return None
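As a quick illustration, here is a minimal sketch of driving get_menu_link; t is the Ten instance constructed in the main block at the end, and the pattern is the first-level one used there:

# Minimal usage sketch; assumes t = Ten(url) as in the main block below.
link_pattern = r'<dd class="">\s+<\w+></\w+>\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'
for link in t.get_menu_link('https://ke.qq.com/course/list?mt=1001', link_pattern):
    print(link)  # one absolute category URL per match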
- Parse the categories under each learning direction, as shown in the screenshot below. The structure turns out to be almost identical to step one, so the same method is reused; the two patterns are compared right after this item.
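Comparing the two patterns from the main block below, the only difference is that the second-level <dd> entries carry no empty leading tag:

# first-level menu: each <dd> opens with an empty tag, matched by <\w+></\w+>
link_pattern = r'<dd class="">\s+<\w+></\w+>\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'
# second-level menu: the empty tag is gone, the rest is unchanged
option_pattern = r'<dd class="">\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'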
- Now comes the course information. All courses sit under <ul class="course-card-list" auto-test="">, as shown in the screenshot below.
The parsing code:
def get_course_list(self, url):
    headers = {
        'user-agent': self.round_header()
    }
    start = time.perf_counter()
    res = self.s.get(url, headers=headers)
    if res is None:
        return
    content = res.text
    course_card_list_pattern = re.compile(r'<ul class="course-card-list.+?">\s+(.+)\s+</ul>', re.S)
    course_card_list = re.findall(course_card_list_pattern, content)
    # the five capture groups map to url, courseName, num, source and fee below
    course_list_pattern = re.compile(r'<li class="course-card-item.*?">.*?<h4 class="item-tt">\s+'
                                     + r'<a href="(.*?)" target="_blank" class="item-tt-link.*?">(.*?)</a>\s+</h4>.*?<div '
                                     + r'class="item-line.*?middle">\s+<span class="line-cell.*?">\s+(.*?)\s+</span>\s+<span '
                                     + r'class="item-source">.*?class="item-source-link.*?">(.*?)</a>\s+.*?<div '
                                     + r'class="item-line.*?bottom">\s+<span class="line-cell item-price free">(.*?)</span>\s+</div>\s+</li>',
                                     re.S)
    if len(course_card_list) != 0:
        # only the first three courses per category are kept
        course_list = re.findall(course_list_pattern, course_card_list[0])[0:3]
        end = time.perf_counter()
        _time = end - start
        print('Parsed successfully, took {0}s'.format(_time))
        for item in course_list:
            yield {
                'url': 'https:{0}'.format(item[0]),
                'courseName': item[1],
                'num': item[2],
                'source': item[3],
                'fee': item[4]
            }
    else:
        end = time.perf_counter()
        _time = end - start
        print('No course list found under this link, took {0}s'.format(_time))
        return None
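A minimal sketch of consuming the generator; the category URL here is hypothetical and would normally come out of step two:

# Hypothetical category URL, for illustration only.
for course in t.get_course_list('https://ke.qq.com/course/list?mt=1001&st=2001'):
    print(course['courseName'], course['num'], course['fee'])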
- Finally, the comments. They turn out to be loaded dynamically, so the parameters of the underlying request have to be analyzed first.
With the parameters worked out, the code:
def get_comment(self, url, params, headers):
    # reuse the shared session for the API request
    res = self.s.get(url, params=params, headers=headers)
    if res is None:
        return
    # deserialize the JSON response into a Python object
    result = json.loads(res.text).get('result')
    if result is None:
        return
    return {
        'total_page': result.get('total_page'),
        'comments': result.get('items'),
        'total_num': result.get('total_num')
    }
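For reference, judging from the fields accessed above, the JSON returned by the endpoint presumably has roughly this shape (a sketch inferred from the code, not captured output):

# Assumed response shape, inferred from the keys get_comment reads:
# {
#     "result": {
#         "total_num":  ...,   # total number of comments
#         "total_page": ...,   # number of comment pages
#         "items":      [...]  # the comment entries themselves
#     }
# }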
The call site then assembles the request (this fragment is excerpted from the main block below; _url is one course URL from step three):
# extract the cid (course id) from the course URL
cid = re.search(r'/(\d+)', _url).group(1)
# build a 19-character random number such as 0.12345678901234567 for the r parameter
r = float('{0:.18f}'.format(random.random())[0:19])
params = {
    'cid': cid,              # course id
    'count': 10,             # presumably the number of comments per page
    'page': 0,               # zero-based page index
    'filter_rating': 0,      # presumably 0 means "all ratings"
    'bkn': '',               # left empty here, which appears to be accepted
    'r': r                   # random value, presumably for cache busting
}
headers = {
    'user-agent': t.round_header(),
    'referer': _url,
    'cookie': 'pgv_info=ssid=s6819497920; ts_last=ke.qq.com/course/144558; pgv_pvid=1821056816; ts_uid=7896600315; _pathcode=0.9075570219139721; tdw_auin_data=-; tdw_data={"ver4":"4","ver6":"","refer":"","from_channel":"","path":"eh-0.9075570219139721","auin":"-","uin":null,"real_uin":null}; tdw_first_visited=1; Hm_lvt_0c196c536f609d373a16d246a117fd44=1543998342; Hm_lpvt_0c196c536f609d373a16d246a117fd44=1543998342; tdw_data_new_2={"auin":"-","sourcetype":"","sourcefrom":"","uin":"","visitor_id":"53087919"}'
}
__url = 'https://ke.qq.com/cgi-bin/comment_new/course_comment_list'
print('Fetching comments for cid {0}'.format(cid), end='\t')
comments = t.get_comment(__url, params, headers=headers)
if comments:
    course.update(comments)
That completes the spider. The full code follows:
import requests as req
import sys
import io
import time
import re
import random
import json
import csv
from utils.spider import Spider


class Ten(Spider):
    def __init__(self, url):
        Spider.__init__(self, url)
        self.url = url
        self.s = req.Session()
        self.flag = 1
    def get_menu_link(self, url, _pattern):
        headers = {
            'user-agent': self.round_header()
        }
        start = time.perf_counter()
        res = self.s.get(url, headers=headers)
        if res is None:
            return
        content = res.text
        menu_pattern = re.compile(r'<dl class="sort-menu sort-menu1 clearfix">(.*?)</dl>', re.S)
        menu = re.findall(menu_pattern, content)
        link_pattern = re.compile(_pattern, re.S | re.M)
        if len(menu) != 0:
            links = re.findall(link_pattern, menu[0])
            end = time.perf_counter()
            _time = end - start
            print('{0} parsed successfully, took {1:f}s'.format(url, _time))
            for item in links:
                # unescape &amp; in the extracted hrefs
                item = item.replace('&amp;', '&')
                link = 'https://ke.qq.com{0}'.format(item)
                yield link
        else:
            end = time.perf_counter()
            _time = end - start
            print('{0} failed to parse, took {1:f}s'.format(url, _time))
            return None
    def get_course_list(self, url):
        headers = {
            'user-agent': self.round_header()
        }
        start = time.perf_counter()
        res = self.s.get(url, headers=headers)
        if res is None:
            return
        content = res.text
        course_card_list_pattern = re.compile(r'<ul class="course-card-list.+?">\s+(.+)\s+</ul>', re.S)
        course_card_list = re.findall(course_card_list_pattern, content)
        course_list_pattern = re.compile(r'<li class="course-card-item.*?">.*?<h4 class="item-tt">\s+'
                                         + r'<a href="(.*?)" target="_blank" class="item-tt-link.*?">(.*?)</a>\s+</h4>.*?<div '
                                         + r'class="item-line.*?middle">\s+<span class="line-cell.*?">\s+(.*?)\s+</span>\s+<span '
                                         + r'class="item-source">.*?class="item-source-link.*?">(.*?)</a>\s+.*?<div '
                                         + r'class="item-line.*?bottom">\s+<span class="line-cell item-price free">(.*?)</span>\s+</div>\s+</li>',
                                         re.S)
        if len(course_card_list) != 0:
            # only the first three courses per category are kept
            course_list = re.findall(course_list_pattern, course_card_list[0])[0:3]
            end = time.perf_counter()
            _time = end - start
            print('Parsed successfully, took {0}s'.format(_time))
            for item in course_list:
                yield {
                    'url': 'https:{0}'.format(item[0]),
                    'courseName': item[1],
                    'num': item[2],
                    'source': item[3],
                    'fee': item[4]
                }
        else:
            end = time.perf_counter()
            _time = end - start
            print('No course list found under this link, took {0}s'.format(_time))
            return None
    def get_comment(self, url, params, headers):
        # reuse the shared session for the API request
        res = self.s.get(url, params=params, headers=headers)
        if res is None:
            return
        # deserialize the JSON response into a Python object
        result = json.loads(res.text).get('result')
        if result is None:
            return
        return {
            'total_page': result.get('total_page'),
            'comments': result.get('items'),
            'total_num': result.get('total_num')
        }
    def save(self, data):
        fieldnames = ['url', 'courseName', 'num', 'source', 'fee', 'total_num', 'total_page', 'comments']
        file_name = 'mooc.csv'
        with open(file_name, 'a+', newline='', encoding='utf-8') as f:
            w = csv.DictWriter(f, fieldnames)
            # write the header row only once, on the first call
            if self.flag == 1:
                w.writeheader()
                self.flag = 0
            w.writerow(data)
if __name__ == "__main__":
    # IT / Internet category
    # Step 1: parse the category URLs under IT / Internet
    # Step 2: parse the sub-categories under each first-level menu
    # Step 3: parse the first three course links under each second-level menu
    # Step 4: parse the comments of each course
    url = 'https://ke.qq.com/course/list?mt=1001'
    list_no = []  # categories where no course list was found
    t = Ten(url)
    # 1.
    link_pattern = r'<dd class="">\s+<\w+></\w+>\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'
    print('-------- crawl started --------')
    # materialize the generators so the emptiness checks below actually work
    links = list(t.get_menu_link(url, link_pattern))
    if links:
        for item in links:
            # 2.
            option_pattern = r'<dd class="">\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'
            options = list(t.get_menu_link(item, option_pattern))
            time.sleep(2)
            if options:
                for option in options:
                    print('Parsing {}'.format(option), end=' ====>> ')
                    # 3.
                    course_list = list(t.get_course_list(option))
                    time.sleep(2)
                    if not course_list:
                        list_no.append(option)
                        continue
                    for course in course_list:
                        _url = course.get('url')
                        # 4.
                        cid = re.search(r'/(\d+)', _url).group(1)
                        r = float('{0:.18f}'.format(random.random())[0:19])
                        params = {
                            'cid': cid,
                            'count': 10,
                            'page': 0,
                            'filter_rating': 0,
                            'bkn': '',
                            'r': r
                        }
                        headers = {
                            'user-agent': t.round_header(),
                            'referer': _url,
                            'cookie': 'pgv_info=ssid=s6819497920; ts_last=ke.qq.com/course/144558; pgv_pvid=1821056816; ts_uid=7896600315; _pathcode=0.9075570219139721; tdw_auin_data=-; tdw_data={"ver4":"4","ver6":"","refer":"","from_channel":"","path":"eh-0.9075570219139721","auin":"-","uin":null,"real_uin":null}; tdw_first_visited=1; Hm_lvt_0c196c536f609d373a16d246a117fd44=1543998342; Hm_lpvt_0c196c536f609d373a16d246a117fd44=1543998342; tdw_data_new_2={"auin":"-","sourcetype":"","sourcefrom":"","uin":"","visitor_id":"53087919"}'
                        }
                        __url = 'https://ke.qq.com/cgi-bin/comment_new/course_comment_list'
                        print('Fetching comments for cid {0}'.format(cid), end='\t')
                        comments = t.get_comment(__url, params, headers=headers)
                        if comments:
                            course.update(comments)
                        t.save(course)
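As a quick sanity check, the resulting mooc.csv can be read back with csv.DictReader. A minimal sketch, assuming the run above completed:

import csv

# Assumes mooc.csv was produced by the run above.
with open('mooc.csv', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row['courseName'], row['total_num'])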