1. Introduction:

  A simple little crawler: you give the function three parameters, start_url, high_num and width_num, and it crawls the URL links under start_url. start_url is the URL where crawling begins; high_num is how many URLs to follow downward from start_url, i.e. the depth of the crawl; width_num is the breadth, i.e. how many links to follow on each page. You can picture the result as a two-dimensional table. The URLs extracted from each crawled page are written to a file named after that page's URL.
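  For example, with the run() function defined at the end of the code below (https://www.test.com is just a placeholder domain), a call like the following crawls two levels deep and follows three links per page, producing one link file per crawled page:

run(start_url='https://www.test.com', high_num=2, width_num=3)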

2. Python packages:

  • BeautifulSoup parses the HTML and extracts the URL links on a page
  • requests sends the HTTP requests
  • re filters the matched URLs with a regular expression

  Note: the code below passes 'lxml' to BeautifulSoup, so the lxml parser has to be installed as well (pip install beautifulsoup4 requests lxml).
from bs4 import BeautifulSoup
import requests
import re


# Collect every URL link found on the page at start_url and save the list to a file
def first_url(start_url='https://www.test.com'):
    response = requests.get(url=start_url)
    bs_html = BeautifulSoup(response.content, 'lxml')
    url_list = []
    for href in bs_html.find_all("a"):
        if 'href' in href.attrs:
            # Skip hrefs that are javascript pseudo-links
            url_te = href.attrs['href']
            if url_te != 'javascript:void(0);':
                patten = re.compile(r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*['
                                    r'\w\-\@?^=%&/~\+#])?')
                url_text = patten.search(url_te)
                try:
                    # Keep the URL matched by the regex
                    url_list.append(url_text.group())
                except AttributeError:
                    # No absolute URL matched; try to salvage protocol-relative links
                    if url_te not in ('', '/') and len(url_te) > 10:
                        result_url = 'https:' + str(url_te)
                        # print("Recovered link:", result_url)
                        url_list.append(result_url)
    # Name the output file after the page URL: strip the scheme and replace path separators
    safe_name = start_url.split('://', 1)[-1].rstrip('/').replace('/', '_')
    file_name = '../Drivers/{}.html'.format(safe_name)
    with open(file=file_name, mode='w', encoding='utf-8') as f:
        for i in url_list:
            f.write(str(i)+'\n')
    return file_name


# Read the first width links under start_url and crawl each of them one level down
def next_url(start_url='https://www.test.com/', width=3):
    url_list_test = []
    print('start_url:', start_url)
    file_path = first_url(start_url=start_url)
    with open(file=file_path, mode='r', encoding='utf-8') as f:
        url_list = f.readlines()
    for i in url_list[:width]:
        url_list_test.append(i.strip())
        first_url(start_url=i.strip())
    return url_list_test


# Crawl high_num levels deep, following width_num links per page
def run(start_url='https://www.test.com', high_num=2, width_num=2):
    current_url = start_url
    for _ in range(high_num):
        next_list = next_url(start_url=current_url, width=width_num)
        if not next_list:
            break
        # Descend one level via the first link found on the current page
        current_url = next_list[0]
        print('return_url', current_url)


if __name__ == '__main__':
    run(high_num=2, width_num=2)
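The except branch above only rescues protocol-relative hrefs by prepending 'https:', and the regex itself can mis-match (see note 2 below). A more robust alternative, sketched here rather than taken from the original script, is to resolve every href against the page URL with urllib.parse.urljoin from the standard library (resolve_links is an illustrative name, not part of the code above):

from urllib.parse import urljoin, urlparse


def resolve_links(page_url, hrefs):
    # Resolve relative, protocol-relative, and absolute hrefs against page_url,
    # keeping only http/https results (javascript: and mailto: links drop out)
    urls = []
    for href in hrefs:
        if not href:
            continue
        absolute = urljoin(page_url, href)
        if urlparse(absolute).scheme in ('http', 'https'):
            urls.append(absolute)
    return urls

For example, resolve_links('https://www.test.com/index.html', ['/about', '//cdn.test.com/app.js', 'javascript:void(0);']) returns the first two as absolute https URLs and drops the javascript pseudo-link.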

PS: features still to be added:
1. Handle the case where a page two levels below start_url links back to start_url itself, i.e. skip URLs that have already been crawled (see the first sketch after this list).
2. The regex-matched URLs are not always accurate (the urljoin sketch above this list is one way around that).
3. Add selenium to fetch pages that need JavaScript rendering (see the second sketch after this list).
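For item 1, keeping a set of already-visited URLs is the usual fix. A minimal sketch, where visited and crawl_once are illustrative names wrapping first_url from the script above:

visited = set()


def crawl_once(url):
    # Hypothetical wrapper: skip URLs that were already fetched,
    # including the case where a crawled page links back to start_url
    if url in visited:
        return
    visited.add(url)
    first_url(start_url=url)

For item 3, selenium can render the page before the links are extracted. A sketch assuming Chrome and a matching chromedriver are available; the rendered HTML can then be fed to BeautifulSoup exactly as in first_url:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('https://www.test.com')
html = driver.page_source  # fully rendered HTML, including JS-generated links
driver.quit()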

Copyright notice: this is an original article by yp-blogs, licensed under CC 4.0 BY-SA; please include the original source link and this notice when reposting.
Original link: https://www.cnblogs.com/yp-blogs/p/16398893.html