利用多线程爬取百思不得姐网站(budejie.com)

利用到的库

  • time, requests, lxml, queue, threading

功能

  • 爬取百思不得姐网站中前二十页的段子数据
import time
import requests
from lxml import etree
from queue import Queue
import threading


class bsSpider:
    """Multi-threaded scraper that prints joke text from budejie.com list pages.

    Work flows through two queues: ``urlQueue`` holds page URLs waiting to be
    downloaded and ``resQueue`` holds downloaded HTML waiting to be parsed.
    Downloader threads move items from the first queue to the second; parser
    threads drain the second and print the extracted text.
    """

    def __init__(self, pageCount=20, timeout=10):
        """Set up the request settings and the two work queues.

        Args:
            pageCount: Number of list pages to crawl, starting at page 1.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``.
        """
        self.baseUrl = "http://www.budejie.com/"
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}
        self.pageCount = pageCount
        self.timeout = timeout
        # Queue of page URLs waiting to be requested.
        self.urlQueue = Queue()
        # Queue of response HTML waiting to be parsed.
        self.resQueue = Queue()

    def getUrl(self):
        """Fill ``urlQueue`` with the URLs of pages 1..pageCount."""
        for pNumber in range(1, self.pageCount + 1):
            self.urlQueue.put(self.baseUrl + str(pNumber))

    def getHtml(self):
        """Downloader loop: fetch each queued URL and enqueue its HTML.

        Runs forever; intended to be the target of a daemon thread.
        """
        while True:
            url = self.urlQueue.get()
            try:
                # The timeout keeps a stalled connection from blocking this
                # worker (and therefore urlQueue.join()) indefinitely.
                res = requests.get(url, headers=self.headers,
                                   timeout=self.timeout)
                res.encoding = "utf-8"
                # Enqueue the HTML *before* task_done() below, so that once
                # urlQueue.join() returns every page is already in resQueue.
                self.resQueue.put(res.text)
            except requests.RequestException as err:
                # Best effort: report the failed page and keep the worker alive.
                print("request failed for %s: %s" % (url, err))
            finally:
                # Always mark the task finished; otherwise a single failed
                # request would make urlQueue.join() wait forever.
                self.urlQueue.task_done()

    def getText(self):
        """Parser loop: extract joke text from each queued HTML page and print it.

        Runs forever; intended to be the target of a daemon thread.
        """
        while True:
            html = self.resQueue.get()
            try:
                parseHtml = etree.HTML(html)
                # NOTE(review): depending on the lxml version an empty document
                # appears to either raise or yield None; both are handled here.
                if parseHtml is not None:
                    r_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
                    for r in r_list:
                        print(r+"\n")
            except (etree.LxmlError, ValueError) as err:
                print("failed to parse page: %s" % err)
            finally:
                # Always mark the task finished so resQueue.join() can return
                # even when a page could not be parsed.
                self.resQueue.task_done()

    def run(self):
        """Start the worker threads and block until both queues are drained."""
        thList = []
        # Populate the URL queue before any worker starts.
        self.getUrl()
        # Downloader threads.
        for _ in range(10):
            thList.append(threading.Thread(target=self.getHtml))
        # Parser threads.
        for _ in range(3):
            thList.append(threading.Thread(target=self.getText))
        for th in thList:
            # Daemon threads: the workers loop forever, so they must not keep
            # the interpreter alive after run() returns. The ``daemon``
            # attribute replaces the deprecated ``setDaemon()`` call.
            th.daemon = True
            th.start()

        # Wait for every URL to be processed, then for every page to be parsed.
        self.urlQueue.join()
        self.resQueue.join()


# Script entry point: run the crawl and print the elapsed wall-clock seconds.
if __name__ == '__main__':
    begin = time.time()
    spider = bsSpider()
    spider.run()
    end = time.time()
    print(end - begin)

版权声明:本文为zengsf原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/zengsf/p/10040162.html