动态页面的模拟点击:

以斗鱼直播为例:http://www.douyu.com/directory/all

爬取每页的房间名、直播类型、主播名称、在线人数等数据,然后模拟点击下一页,继续爬取

#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
Simulated clicking on a dynamic page.

Drives the Douyu live directory (http://www.douyu.com/directory/all),
scrapes the room name, live category, anchor name and online viewer count
from each page, then simulates a click on "next page" and keeps scraping.
"""

__author__ = 'mayi'

from selenium import webdriver
import json

# Create a browser object backed by PhantomJS; executable_path gives the
# location of the PhantomJS binary.
# NOTE(review): this module-level `driver` is not referenced by any code
# visible in this file (DouyuSpider.__init__ creates its own PhantomJS
# instance) -- confirm whether it is still needed.
driver = webdriver.PhantomJS(executable_path = r"D:\Program Files\phantomjs\bin\phantomjs")
from bs4 import BeautifulSoup

class DouyuSpider(object):
    """
    Spider that walks the Douyu live directory page by page.

    For every page it extracts each room's name, live category, anchor name
    and online viewer count, appends them to ``douyu.json`` as elements of a
    single JSON array, then clicks the "next page" control until the pager
    reports that the last page has been reached.

    Attributes:
        url: Address of the directory page that is loaded first.
        driver: The PhantomJS WebDriver used to render and click the page.
        file_name: The open output file object (despite the name, this is
            the file handle for ``douyu.json``, not a path).
    """

    # Seconds to pause before reading each page, so requests are not fired
    # too frequently and the page has time to render.
    PAGE_DELAY_SECONDS = 3

    # (output key, tag name, class attribute) for every scraped field, in the
    # order the keys are written to each JSON object.
    _FIELDS = (
        ("房间名", "h3", "ellipsis"),
        ("直播类型", "span", "tag ellipsis"),
        ("主播名称", "span", "dy-name ellipsis fl"),
        ("在线人数", "span", "dy-num fr"),
    )

    def __init__(self):
        """Start the browser and open ``douyu.json`` for writing."""
        self.url = "http://www.douyu.com/directory/all/"
        self.driver = webdriver.PhantomJS()
        try:
            self.file_name = open("douyu.json", "w", encoding="utf-8")
        except OSError:
            # Do not leave a browser process behind if the file cannot be
            # opened.
            self.driver.quit()
            raise

    def run(self):
        """
        Crawl every page and write the results to the output file.

        The output file is closed and the browser is shut down when this
        method returns or raises, so a spider instance can be run only once.
        The closing bracket of the JSON array is written even if the crawl
        fails part-way, so the rooms collected so far remain valid JSON.
        """
        try:
            with self.file_name as out:
                out.write("[")
                try:
                    self._crawl_pages(out)
                finally:
                    out.write("\n]")
        finally:
            self.driver.quit()

    def _crawl_pages(self, out):
        """Load the directory, then scrape and write each page until the last."""
        import time  # local import: keeps this block self-contained

        # The implicit wait is a session-wide element lookup timeout, not a
        # pause, so it only needs to be set once.
        self.driver.implicitly_wait(3)
        self.driver.get(self.url)

        page = 1
        is_first = True
        while True:
            # Actually pause between pages to avoid hitting the site too often.
            time.sleep(self.PAGE_DELAY_SECONDS)
            print("正在处理第{}页......".format(page))
            page += 1

            html = self.driver.page_source
            for item in self._parse_rooms(html):
                out.write(self._format_record(item, is_first))
                is_first = False

            # The pager carries this marker once "next" is disabled.
            if "shark-pager-disable-next" in html:
                break
            # Simulate a click on "next page".
            self.driver.find_element_by_class_name("shark-pager-next").click()

    @classmethod
    def _parse_rooms(cls, html):
        """
        Extract the room records from one page of HTML.

        Args:
            html: Page source as a string.

        Returns:
            A list of dicts, one per room. Returns an empty list when the
            live-list container is absent; list entries that lack any of the
            expected fields are skipped instead of aborting the crawl.
        """
        soup = BeautifulSoup(html, "lxml")
        room_list = soup.find("ul", {"id": "live-list-contentbox"})
        if room_list is None:
            return []

        rooms = []
        for entry in room_list.find_all("li"):
            item = {}
            for key, tag, css_class in cls._FIELDS:
                node = entry.find(tag, {"class": css_class})
                if node is None:
                    break
                item[key] = node.get_text().strip()
            else:
                # Reached only when no field was missing.
                rooms.append(item)
        return rooms

    @staticmethod
    def _format_record(item, is_first):
        """
        Serialize one record as an element of the output JSON array.

        Args:
            item: The record to serialize.
            is_first: True for the first record written; later records are
                prefixed with the separating comma.

        Returns:
            The text to append to the output file. Non-ASCII characters are
            written as-is (the file is UTF-8) rather than as ``\\uXXXX``
            escapes.
        """
        prefix = "\n" if is_first else ",\n"
        return prefix + json.dumps(item, ensure_ascii=False)



if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    douyu = DouyuSpider()
    douyu.run()

 

版权声明:本文为mayi0312原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/mayi0312/p/7236472.html