数据分析(Pandas模块:人口分析实例)
需求: 导入文件,查看原始数据 将人口数据和各州简称数据进行合并 将合并的数据中重复的abbreviation列进行删除 查看存在缺失数据的列 找到有哪些state/region使得state的值为NaN,进行去重操作 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN 合并各州面积数据areas 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行 去除含有缺失数据的行 找出2010年的全民人口数据 计算各州的人口密度 排序,并找出人口密度最高的五个州 df.sort_values()
import numpy as np from pandas import DataFrame,Series import pandas as pd # 导入文件,查看原始数据 abb = pd.read_csv(\'./state-abbrevs.csv\') pop = pd.read_csv(\'./state-population.csv\') area = pd.read_csv(\'./state-areas.csv\') # 查看abb前三行数据 abb.head(3) #结果:>>> state abbreviation 0 Alabama AL 1 Alaska AK 2 Arizona AZ # 查看pop前三行数据 pop.head(3) #结果: state/region ages year population 0 AL under18 2012 1117489.0 1 AL total 2012 4817528.0 2 AL under18 2010 1130966.0 # 查看area前三行数据 area.head(3) #结果:>>> state area (sq. mi) 0 Alabama 52423 1 Alaska 656425 2 Arizona 114006 # 将人口数据和各州简称数据进行合并 abb_pop = pd.merge(abb,pop,left_on="abbreviation",right_on="state/region",how="outer") abb_pop.head(3) #结果>>> state abbreviation state/region ages year population 0 Alabama AL AL under18 2012 1117489.0 1 Alabama AL AL total 2012 4817528.0 2 Alabama AL AL under18 2010 1130966.0 # 将合并的数据中重复的abbreviation列进行删除 abb_pop.drop(labels="abbreviation", axis=1, inplace=True) abb_pop.head(3) #结果>>> state state/region ages year population 0 Alabama AL under18 2012 1117489.0 1 Alabama AL total 2012 4817528.0 2 Alabama AL under18 2010 1130966.0 # 查看存在缺失数据的列 abb_pop.isnull().any(axis=0) #结果>>> state True state/region False ages False year False population True dtype: bool #找到有哪些state/region使得state的值为NaN,进行去重操作 # 1.找到state的值为空 abb_pop["state"].isnull() # 2.通过state的值找出所有为空的行 abb_pop.loc[abb_pop["state"].isnull()] # 去重 abb_pop.loc[abb_pop["state"].isnull()]["state/region"].unique() # 结果>>> array([\'USA\',\'PR\'], dtype=object) # 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN # 1.找出USA对应state列中的空值 abb_pop["state/region"] == "USA" # 2.取出USA对应的行数据 abb_pop.loc[abb_pop["state/region"] == "USA"] # 3.取出USA对应的行数据的索引值 indexs = abb_pop.loc[abb_pop["state/region"] == "USA"].index # 4.将USA对应的空值覆盖成对应的值 abb_pop.loc[indexs,"state"] = \'United States\' # 1.找出PR对应state列中的空值 abb_pop["state/region"] == "PR" # 2.取出PR对应的行数据 abb_pop.loc[abb_pop["state/region"] == "PR"] # 3.取出PR对应的行数据的索引值 indexs = abb_pop.loc[abb_pop["state/region"] == "PR"].index # 4.将USA对应的空值覆盖成对应的值 abb_pop.loc[indexs,"state"] = \'pppr\' # 合并各州面积数据areas abb_pop_area = pd.merge(abb_pop,area, how="outer") abb_pop_area.head() #结果:>>> state state/region ages year population area (sq. mi) 0 Alabama AL under18 2012.0 1117489.0 52423.0 1 Alabama AL total 2012.0 4817528.0 52423.0 2 Alabama AL under18 2010.0 1130966.0 52423.0 3 Alabama AL total 2010.0 4785570.0 52423.0 4 Alabama AL under18 2011.0 1125763.0 52423.0 # 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行 abb_pop_area["area (sq. mi)"].isnull() # 去除含有缺失数据的行 indexs = abb_pop_area.loc[abb_pop_area["area (sq. mi)"].isnull()].index abb_pop_area.drop(labels=indexs,axis=0,inplace=True) # 找出2010年的全民人口数据 df_2010 = abb_pop_area.query("year == 2010 & ages == \'total\'") df_2010.head() #结果>>> state state/region ages year population area (sq. mi) 3 Alabama AL total 2010.0 4785570.0 52423.0 91 Alaska AK total 2010.0 713868.0 656425.0 101 Arizona AZ total 2010.0 6408790.0 114006.0 189 Arkansas AR total 2010.0 2922280.0 53182.0 197 California CA total 2010.0 37333601.0 163707.0 # 计算各州的人口密度 abb_pop_area["midu"] = abb_pop_area["population"] / abb_pop_area["area (sq. mi)"] abb_pop_area.head() #结果>>> state state/region ages year population area (sq. mi) midu 0 Alabama AL under18 2012.0 1117489.0 52423.0 21.316769 1 Alabama AL total 2012.0 4817528.0 52423.0 91.897221 2 Alabama AL under18 2010.0 1130966.0 52423.0 21.573851 3 Alabama AL total 2010.0 4785570.0 52423.0 91.287603 4 Alabama AL under18 2011.0 1125763.0 52423.0 21.474601 # 排序,并找出人口密度最高的五个州 df.sort_values() abb_pop_area.sort_values(by="midu",axis=0,ascending=False)