Python 爬基金数据
爬科学基金共享服务网中基金数据
# coding=utf-8
"""Crawl funded-project listings from npd.nsfc.gov.cn into MongoDB.

Each result page of the funding-project search is requested in turn, every
``<dl class="time_dl">`` entry on the page is parsed into a flat dict by
:func:`jiexi`, and the dict is stored in the ``science_fund`` collection of
the ``ScienceFund`` database on a local MongoDB server.

The module is written to run under both Python 2 and Python 3.
"""
import json

try:
    from html import unescape as _unescape_html  # Python 3.4+
except ImportError:
    from HTMLParser import HTMLParser  # Python 2

    _unescape_html = HTMLParser().unescape

# Form fields posted with every search request.  ``currentPage`` is replaced
# per request in main(); all search filters are left empty.
data = {
    'pageSize': 10,
    'currentPage': 1,
    'fundingProject.projectNo': '',
    'fundingProject.name': '',
    'fundingProject.person': '',
    'fundingProject.org': '',
    'fundingProject.applyCode': '',
    'fundingProject.grantCode': '',
    'fundingProject.subGrantCode': '',
    'fundingProject.helpGrantCode': '',
    'fundingProject.keyword': '',
    'fundingProject.statYear': '',
    'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
}

url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# NOTE(review): these headers look like a verbatim copy of one captured
# browser request.  The fixed Content-Length and the JSESSIONID cookie are
# presumably only valid for that capture -- confirm before a long crawl.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '340',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
    'Host': 'npd.nsfc.gov.cn',
    'Origin': 'http://npd.nsfc.gov.cn',
    'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Seconds to wait for the server before a page is reported as failed.
REQUEST_TIMEOUT = 30

# Closing tag that terminates every labelled field of an entry.
_FIELD_END = '</dd>'

# (output key, label text) in the order the labels are searched for.
_FIELD_LABELS = (
    ('standard_no', u'批准号'),                # approval number
    ('standard_type', u'项目类别'),            # project category
    ('supporting_institution', u'依托单位'),   # supporting institution
    ('project_principal', u'项目负责人'),      # project principal
    ('funds', u'资助经费'),                    # funding amount
    ('year', u'批准年度'),                     # approval year
    ('keywords', u'关键词'),                   # keywords
)


def main(first_page=1, last_page=43183):
    """Crawl result pages and insert every parsed entry into MongoDB.

    Args:
        first_page: First result page to request (1-based).  Together with
            ``last_page`` this allows an interrupted crawl to be resumed.
        last_page: Last result page to request, inclusive.  The default
            reproduces the page range that was hard-coded originally.

    A page whose request fails (timeout, connection error, HTTP error
    status) is reported on stdout and skipped; the crawl continues.
    """
    # Third-party packages are imported here rather than at module level so
    # that importing this module (e.g. to reuse jiexi) needs only the stdlib.
    import requests
    from lxml import etree
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    db = client.ScienceFund
    # NOTE(review): the credentials are blank in this file and must be
    # filled in.  Database.authenticate and Collection.insert (used below)
    # are PyMongo 2/3 APIs that were removed in PyMongo 4.
    db.authenticate("", "")
    collection = db.science_fund

    for page in range(first_page, last_page + 1):
        print(page)
        # Build a per-request payload instead of mutating the shared dict.
        payload = dict(data, currentPage=page)
        try:
            response = requests.post(
                url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('page %d failed: %s' % (page, exc))
            continue

        tree = etree.HTML(response.text)
        if tree is None:  # nothing parseable came back
            continue
        for entry in tree.xpath("//dl[@class='time_dl']"):
            # Serialize to text (not bytes) so unescaping also works on
            # Python 3, where html.unescape only accepts str.
            markup = etree.tostring(entry, method='html', encoding='unicode')
            collection.insert(jiexi(_unescape_html(markup)))


def jiexi(content):
    """Parse one serialized ``<dl class="time_dl">`` entry into a dict.

    The title is the text between the first ``">`` found at or after index
    20 and the next ``</``.  Each remaining field is located by its Chinese
    label; its value is the text after the label and the single separator
    character that follows it, up to the next ``</dd>``, with surrounding
    whitespace stripped.  Labels are searched in a fixed order, each search
    starting where the previous field ended.

    Args:
        content: HTML text of one entry, entities already unescaped.

    Returns:
        dict with the keys ``title``, ``standard_no``, ``standard_type``,
        ``supporting_institution``, ``project_principal``, ``funds``,
        ``year`` and ``keywords``.  A field whose label cannot be found is
        an empty string and does not affect the fields after it.
    """
    # Starting at 20 presumably skips the entry's own opening tag:
    # '<dl class="time_dl">' is exactly 20 characters long and ends in '">'.
    title_start = content.find('">', 20)
    title_end = content.find('</', max(title_start, 0))
    if title_start == -1 or title_end == -1:
        title = u''
        cursor = 0
    else:
        title = content[title_start + 2:title_end]
        cursor = title_end

    record = {'title': title}
    for key, label in _FIELD_LABELS:
        label_at = content.find(label, cursor)
        if label_at == -1:
            # Missing label: leave the cursor alone so later fields are
            # still searched from the last known good position.
            record[key] = u''
            continue
        # +1 skips the one-character separator after the label
        # (presumably a colon -- the page markup is not visible here).
        value_start = label_at + len(label) + 1
        value_end = content.find(_FIELD_END, label_at)
        if value_end == -1:
            value_end = len(content)  # unterminated field: take the rest
        record[key] = content[value_start:value_end].strip()
        cursor = value_end
    return record


if __name__ == '__main__':
    main()