爬科学基金共享服务网中基金数据

#coding=utf-8
import json
import requests
from lxml import etree
from HTMLParser import HTMLParser
from pymongo import MongoClient

# Form fields posted to the project search endpoint.  main() supplies the
# 'currentPage' value for every request; all search filters are left empty.
data = {
    'pageSize': 10,
    'currentPage': 1,
    'fundingProject.projectNo': '',
    'fundingProject.name': '',
    'fundingProject.person': '',
    'fundingProject.org': '',
    'fundingProject.applyCode': '',
    'fundingProject.grantCode': '',
    'fundingProject.subGrantCode': '',
    'fundingProject.helpGrantCode': '',
    'fundingProject.keyword': '',
    'fundingProject.statYear': '',
    # "Please enter the verification code".  Kept decoded on purpose:
    # requests form-encodes the values of `data` itself, so a pre-encoded
    # '%E8%AF%B7...' string would go out double-encoded ('%25E8%25AF...').
    'checkCode': u'请输入验证码',
}

# Search action that receives the POSTed form.
url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# Request headers sent with every POST.
# There is deliberately no 'Content-Length' entry: the body length changes
# with the page number, and requests derives the header from the real body.
# NOTE(review): the JSESSIONID cookie is a hard-coded session id and has
# presumably expired -- replace it with a live one before running.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
    'Host': 'npd.nsfc.gov.cn',
    'Origin': 'http://npd.nsfc.gov.cn',
    'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

def main(first_page=1, last_page=43183, timeout=30):
    """Crawl the search result pages and store every project in MongoDB.

    Each page is requested by POSTing the module-level ``data`` form with
    ``currentPage`` set to the page number.  Every ``<dl class="time_dl">``
    element of the response is parsed with ``jiexi`` and inserted into the
    ``science_fund`` collection of the ``ScienceFund`` database.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).  The default is
            the page count that was hard-coded in the original script.
        timeout: Seconds to wait for the server before requests raises,
            so that a stalled connection cannot block the crawl forever.
    """
    client = MongoClient('localhost', 27017)
    try:
        db = client.ScienceFund
        # Credentials are blank placeholders in this listing.
        # NOTE(review): Database.authenticate() no longer exists in
        # PyMongo 4; pass the credentials to MongoClient when upgrading.
        db.authenticate("", "")
        collection = db.science_fund

        # One parser instance is enough for the whole crawl.
        unescape = HTMLParser().unescape

        for page in range(first_page, last_page + 1):
            print(page)
            # Build a per-request copy so the shared form dict is not mutated.
            payload = dict(data, currentPage=page)
            result = requests.post(url, data=payload, headers=headers,
                                   timeout=timeout)
            if result.status_code != 200:
                # Report the gap instead of silently parsing an error page.
                print('page %d skipped: HTTP %d' % (page, result.status_code))
                continue

            tree = etree.HTML(result.text)
            if tree is None:
                # Guard against a body lxml could not build a tree from.
                print('page %d skipped: empty document' % page)
                continue

            for item in tree.xpath("//dl[@class='time_dl']"):
                # Serialise the element, then turn character references
                # back into text so jiexi() can search for the labels.
                content = unescape(etree.tostring(item, method='html'))
                collection.insert_one(jiexi(content))
    finally:
        client.close()

        
# (result key, label text) pairs in the order the labels are searched for.
_FIELD_LABELS = (
    ('standard_no', u'批准号'),                # approval number
    ('standard_type', u'项目类别'),            # project category
    ('supporting_institution', u'依托单位'),   # supporting institution
    ('project_principal', u'项目负责人'),      # principal investigator
    ('funds', u'资助经费'),                    # funding amount
    ('year', u'批准年度'),                     # approval year
    ('keywords', u'关键词'),                   # keywords
)


def _extract_field(content, label, start):
    """Return ``(value, end)`` for the field named ``label``.

    The label is searched at or after ``start``.  The value is the text
    between the character following the label (one separator character is
    skipped) and the next ``</dd>``, stripped of surrounding whitespace;
    ``end`` is the index of that ``</dd>``.  When the label or the closing
    tag is missing, ``(u'', start)`` is returned so the caller can keep
    searching for the remaining fields from the same position.
    """
    label_at = content.find(label, start)
    if label_at == -1:
        return u'', start
    close_at = content.find('</dd>', label_at)
    if close_at == -1:
        return u'', start
    value_from = label_at + len(label) + 1
    return content[value_from:close_at].strip(), close_at


def jiexi(content):
    """Parse one project entry (an HTML snippet) into a dict.

    Args:
        content: Text of one result entry, with character references
            already unescaped.

    Returns:
        A dict with the keys ``title``, ``standard_no``, ``standard_type``,
        ``supporting_institution``, ``project_principal``, ``funds``,
        ``year`` and ``keywords``.  A field whose marker is not present in
        ``content`` is returned as an empty string.
    """
    # Title: the text between the first '">' found from offset 20 onwards
    # and the first closing tag.  The offset presumably skips the '">' that
    # ends the 20-character opening tag '<dl class="time_dl">' -- TODO confirm.
    title_open = content.find('">', 20)
    title_close = content.find('</')
    if title_open == -1 or title_close == -1:
        title = u''
    else:
        title = content[title_open + 2:title_close]

    record = {'title': title}

    # Each field is searched after the end of the previous one.
    cursor = max(title_close, 0)
    for key, label in _FIELD_LABELS:
        record[key], cursor = _extract_field(content, label, cursor)
    return record

if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script.
    main()

 

版权声明:本文为zhangtianyuan原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/zhangtianyuan/p/8482255.html