scrapy爬虫下载音频文件并储存到本地

玩爬虫，怎么能少了Scrapy框架呢？Scrapy被称为复杂但好用的爬虫框架。

当初学框架的时候是一头雾水，一旦实战成功过后，感觉瞬间打通了任督二脉，很有成就感。

接下来，将对scrapy框架爬虫代码编写流程做简要说明：

import scrapy


class OnlinelistenningItem(scrapy.Item):
    """Item carrying one article's transcript text and its audio download info."""

    # The name on the left is the field name; scrapy.Field() declares it.
    # Items behave like dicts, so values are read and written as item['file_text'].
    file_text = scrapy.Field()  # transcript lines to be written to the text file
    text_path = scrapy.Field()  # path (including file name) of the transcript file
    file_paths = scrapy.Field()  # list of storage paths for the audio file(s)
    file_urls = scrapy.Field()  # list of audio download URLs

四、解析网页，得到相应数据

1. 打开spider_name.py文件

2. 将解析获取到的url音频下载链接返回

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
import time
from scrapy import Selector
from ..items import OnlinelistenningItem
from scrapy.utils.project import get_project_settings


class TingroomSpider(scrapy.Spider):
    """Crawl tingroom.com lessons and yield one item per article with audio.

    The crawl walks three levels: the start page lists categories
    (``parse``), each category is a paginated list of articles
    (``listenningParse``), and each article page holds the transcript and
    the audio link (``listenningArticlePage``).
    """

    name = 'tingroom'
    allowed_domains = ['tingroom.com']
    # Entry page; every further URL is discovered from it.
    start_urls = ['http://www.tingroom.com/lesson/']
    domain = 'http://www.tingroom.com'
    # Storage root, read from the FILES_STORE setting, with a trailing
    # Windows path separator so titles can be appended directly.
    root_dir = get_project_settings().get('FILES_STORE') + '\\'
    # Category titles to crawl. None (the default) crawls every category.
    # May be a list/set of titles, or a comma-separated string when passed
    # on the command line with ``-a download_list=...``.
    download_list = None

    # Characters that are not allowed in Windows file names (plus spaces),
    # mapped to safe replacements in a single pass.
    _TITLE_TRANSLATION = str.maketrans({
        ':': '：',
        '/': '_',
        '\\': '_',
        '*': '',
        '?': '？',
        '"': '”',
        '|': '',
        '<': '《',
        '>': '》',
        ' ': '_',
    })
    # Whitespace characters removed from inside transcript fragments.
    _TEXT_TRANSLATION = str.maketrans('', '', '\n\r\t\xa0')

    def _wanted_titles(self):
        """Return the set of category titles to crawl, or None for all."""
        wanted = self.download_list
        if wanted is None:
            return None
        if isinstance(wanted, str):
            return {part.strip() for part in wanted.split(',') if part.strip()}
        return set(wanted)

    def parse(self, response):
        """Yield one request per selected category link on the start page."""
        wanted = self._wanted_titles()
        for link in response.xpath('/html/body/div[5]//ul[@id="line_01"]//a'):
            # Anchors without text or href cannot be followed; skip them
            # instead of failing on None.
            title = (link.xpath('./text()').extract_first() or '').strip()
            href = (link.xpath('./@href').extract_first() or '').strip()
            if not title or not href:
                continue
            if wanted is not None and title not in wanted:
                continue
            first_url = parse.urljoin(self.domain, href)  # category URL
            print('first_url:', first_url)
            # The target directory travels with the request through meta.
            yield scrapy.http.Request(
                first_url,
                meta={'result_path': self.root_dir + title},
                callback=self.listenningParse,
            )

    def listenningParse(self, response):
        """Follow pagination and yield one request per article in the list."""
        result_path = response.meta.get('result_path')
        # Forward only our own keys: response.meta also holds Scrapy's
        # per-request bookkeeping, which must not leak into child requests.
        carried = {'result_path': result_path}

        # Next page of the same category, handled by this same callback.
        next_href = response.xpath(
            '//div[@class="dede_pages"]//a[text()="下一页"]/@href').extract_first()
        if next_href:
            yield scrapy.http.Request(
                parse.urljoin(self.domain, next_href),
                meta=carried,
                callback=self.listenningParse,
            )

        # Title and URL of every article on this page.
        for link in response.xpath('//a[@class="goog"]'):
            href = (link.xpath('./@href').extract_first() or '').strip()
            raw_title = link.xpath('./text()').extract_first()
            if not href or raw_title is None:
                continue
            # The title becomes a file name, so make it file-system safe.
            article_title = raw_title.strip().translate(self._TITLE_TRANSLATION)
            yield scrapy.http.Request(
                parse.urljoin(self.domain, href),
                meta={'result_path': result_path, 'article_title': article_title},
                callback=self.listenningArticlePage,
            )

    def listenningArticlePage(self, response):
        """Extract the transcript and audio link and yield the item."""
        result_path = response.meta.get('result_path')
        article_title = response.meta.get('article_title')
        if result_path is None or article_title is None:
            return

        # Transcript: strip each text node, drop embedded whitespace
        # characters, and discard fragments that end up empty.
        fragments = response.xpath('//div[@class="content"]//text()').extract()
        cleaned = [frag.strip().translate(self._TEXT_TRANSLATION) for frag in fragments]
        file_text = []
        for rs in cleaned:
            if not rs:
                continue
            if rs == '点击':  # marker separating the body from the vocabulary
                rs = '\n' + '=' * 60 + '\n' + '\n重要词汇：'
            # Skip ad snippets, site boilerplate and purely numeric fragments.
            elif ('google_ad_client' in rs or 'tingroom' in rs or '单词翻译:' in rs
                  or '收听单词发音' == rs or rs.isdigit()):
                continue
            file_text.append(rs + '\n')

        # Audio link, handed to the pipeline for download and storage.
        download_rt = response.xpath('//param[@name="movie"]/@value')
        if not download_rt:
            return
        download_url = download_rt.re_first(r'https?:.*')
        # Extension taken from the link; it varies (mp3, rm, ...).
        file_type = download_rt.re_first(r'com.*(\..*)')
        if not download_url or not file_type:
            return

        file_path = result_path + '\\' + article_title + file_type  # audio target
        text_path = result_path + '\\' + article_title + '.txt'  # transcript target

        item = OnlinelistenningItem()
        item['file_text'] = file_text
        item['text_path'] = text_path  # written by our own code in the pipeline
        item['file_paths'] = [file_path]  # used by the pipeline as storage path
        item['file_urls'] = [download_url]
        yield item

五、储存数据

1. 打开pipelines.py文件：

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don\'t forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from .items import OnlinelistenningItem  # 如果需要判断不同的item，需要导入item，用isinstance(item, item_name)来判断
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class OnlinelistenningSelfPipeline(FilesPipeline):
    """Download each item's audio file and write its transcript next to it."""

    def get_media_requests(self, item, info):
        """Yield one download request per (url, storage path) pair of the item."""
        for file_url, target in zip(item['file_urls'], item['file_paths']):
            # Pass a plain dict holding only what file_path() needs, rather
            # than the Item object itself.
            yield Request(file_url, meta={'file_paths': [target]})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path that was attached to the request.

        The path may be relative to FILES_STORE or absolute. The keyword-only
        ``item`` argument is accepted because newer Scrapy versions pass it.
        """
        return request.meta.get('file_paths')[0]

    def item_completed(self, results, item, info):
        """Write the transcript file and report a successful audio download.

        ``results`` is the list of ``(success, info_or_failure)`` tuples for
        the requests issued by ``get_media_requests``.
        """
        import os

        text_path = item['text_path']
        # The directory is normally created when the audio is stored; create
        # it here too so the transcript can be written if the download failed.
        directory = os.path.dirname(text_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(text_path, 'w', encoding='utf8') as f:
            f.writelines(item['file_text'])
        # Only announce completion when at least one download succeeded.
        if any(ok for ok, _ in results):
            print(f'{item["file_paths"][0]}下载完成！\n{"*" * 50}\n')
        return item

六、user_agent和ip代理设置

1. 打开middlewares.py文件

2. 输入下面代码：

from fake_useragent import UserAgent
class OnlinelistenningUseragentMiddleware(object):
    """Downloader middleware that gives every request a random User-Agent."""

    def __init__(self):
        self.ua = UserAgent()  # source of random user-agent strings

    def process_request(self, request, spider):
        """Set a freshly generated random User-Agent header on the request.

        The header is assigned rather than set with ``setdefault``: Scrapy's
        built-in UserAgentMiddleware runs earlier in the chain and has already
        filled in the header, so ``setdefault`` would leave the default
        agent in place.
        """
        request.headers['User-Agent'] = self.ua.random

import json, random
class OnlinelistenningProxyMiddleware(object):
    """Downloader middleware that routes every request through one proxy."""

    # Proxy address applied to all requests; override in a subclass to change it.
    proxy_url = 'https://114.98.25.25:4216'

    def process_request(self, request, spider):
        """Attach the proxy address to the request via its ``proxy`` meta key."""
        request.meta['proxy'] = self.proxy_url

七、settings.py文件设置

# Do not obey robots.txt. The author notes that no data can be crawled
# while this is enabled.
ROBOTSTXT_OBEY = False

# Root directory for stored files.
# Author's note: mind the slash direction on Windows; with the wrong
# direction the program still runs, but an empty folder with the configured
# name is created under the current code path.
# Backslashes must be doubled: a single backslash followed by digits
# (e.g. \202, \201) would be parsed as an octal escape sequence.
FILES_STORE = 'F:\\在线英语听力室\\听力教程new'

# Download timeout, in seconds.
DOWNLOAD_TIMEOUT = 1800


# Proxy and user-agent middlewares. The number is the order in the
# middleware chain; lower numbers handle the request earlier.
DOWNLOADER_MIDDLEWARES = {
    'OnlineListenning.middlewares.OnlinelistenningUseragentMiddleware': 543,  # random user agent
    'OnlineListenning.middlewares.OnlinelistenningProxyMiddleware': 542,  # ip proxy
}

# Enable the download pipeline (use your own pipeline's name here).
ITEM_PIPELINES = {
    'OnlineListenning.pipelines.OnlinelistenningSelfPipeline': 1,
}

# Delay between downloads, in seconds.
DOWNLOAD_DELAY = 3

# Concurrent requests per domain, kept low to reduce load on the server.
CONCURRENT_REQUESTS_PER_DOMAIN = 8

本文链接：https://www.cnblogs.com/jaysonteng/p/12931775.html

scrapy爬虫下载音频文件并储存到本地

目录

一、新建工程

二、新建spider

三、定义所需爬取字段

四、解析网页，得到相应数据

五、储存数据

六、user_agent和ip代理设置

七、settings.py文件设置

scrapy爬虫下载音频文件并储存到本地的更多相关文章

随机推荐

热门专题

目录导航