First, log in to the Luojia-1 (珞珈一号) data system and search for the data you want.

 

 

Use the browser's "inspect element" developer tool to find the page source that contains the download information.

Copy the HTML source of the table on the far right to the clipboard for later use.
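
For orientation, the pasted markup is expected to look roughly like the sketch below (a hypothetical, simplified example; the real cell contents and link paths come from the query results page): a tbody with one tr per scene, whose last four td cells hold the satellite name, the sensor, the acquisition time, and a cell whose final anchor is the download link. That is the structure the script below relies on.

# Hypothetical illustration only; the actual query page determines the exact markup.
html = '''<tbody>
  <tr>
    <td>...</td>
    <td>satellite name</td>
    <td>sensor</td>
    <td>acquisition time</td>
    <td><a href="#">preview</a> <a href="/path/to/scene.tar.gz">download</a></td>
  </tr>
</tbody>'''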

Then use Python to download the data.

 

# -*- coding: utf-8 -*-



import requests
import os
# import urllib.request
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd 


def saveFile(url, fileName):
    '''Stream one archive from `url` into the local data/ directory.'''
    os.makedirs('data', exist_ok=True)  # make sure the output directory exists
    r = requests.get(url, stream=True)
    chunkSize = 256
    with open('data/' + fileName, 'wb') as f:
        pbar = tqdm(unit="B", total=int(r.headers['Content-Length']),
                    desc="downloading..." + fileName)
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive chunks
                pbar.update(len(chunk))
                f.write(chunk)
        pbar.close()


html = '''paste the copied table source here'''

##  get download url and file name

soup = BeautifulSoup(html, "html.parser")
tbody = soup.find_all("tbody")[0]
trs = tbody.find_all("tr")

data = []
for tr in trs:
    # the last four cells hold the satellite, sensor, acquisition time and the link cell
    tds = tr.find_all("td")[-4:]
    temp = []

    for td in tds[:-1]:
        temp.append(td.text)

    # the last <a> in the final cell is the download link
    a = tds[-1].find_all("a")[-1]
    href = "http://59.175.109.173:8888" + a["href"]
    temp.append(href)

    data.append(temp)

# column names are pinyin: weixing = satellite, chuanganqi = sensor
dataSet = pd.DataFrame(data, columns=["weixing", "chuanganqi", "time", "url"])

### file name: satellite + sensor + time + row index, ending in .tar.gz
dataSet.loc[:, "fileName"] = (dataSet.loc[:, "weixing"] + dataSet.loc[:, "chuanganqi"]
                              + dataSet.loc[:, "time"] + "-" + dataSet.index.map(str) + ".tar.gz")
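
# Optional addition (not part of the original post): write the parsed manifest to
# disk so the URL/file-name mapping can be reused without re-parsing the HTML.
os.makedirs("data", exist_ok=True)
dataSet.to_csv("data/luojia1_manifest.csv", index=False)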




#### download


for i in tqdm(range(dataSet.shape[0])):
    # to resume a partial batch, skip already-finished indices here
    # (e.g. `if i < 100: continue`)
    row = dataSet.loc[i, :]
    fileName = row["fileName"]
    url = row["url"]
    saveFile(url, fileName)
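
# Optional sanity check (an addition, not part of the original script): report any
# archives that are missing or empty on disk so they can be downloaded again.
for fileName in dataSet["fileName"]:
    path = os.path.join("data", fileName)
    if (not os.path.exists(path)) or os.path.getsize(path) == 0:
        print("missing or incomplete:", fileName)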

 

 

 
