import os
import pandas as pd
os.chdir('happiness') # 进入子文件夹
current_path = os.getcwd() # 获取当前文件夹路径
files=os.listdir() # 列出当前文件夹所有文件名
df=[]
for filename in files:
dot_position = filename.find('.')
file_key = filename[:dot_position] #文件名 - 年份
file_path = os.path.join(current_path,filename) #组合绝对路径
t = (pd.read_csv(file_path), int(file_key))
df.append(t)
os.chdir('..') # 跳出当前文件夹
# [(..., 2015), (..., 2016), (..., 2017),..., (..., 2020)]
>>> df[0][0].shape
(138, 10)
>>> df[0][0].columns
Index(['Happiness Rank', 'Country', 'Region', 'Happiness Score', 'Economy',
'Family', 'Health', 'Freedom', 'Trust', 'Generosity'],dtype='object')
# 做一个WHR 2015的副本
>>> df15 = df[0][0].copy()
>>> df15_h = df15.set_index(["Region","Country"])
>>> df15_h.head()
Happiness Rank ... Generosity
Region Country ...
Western Europe Switzerland 1 ... 0.29678
Iceland 2 ... 0.43630
Denmark 3 ... 0.34139
Norway 4 ... 0.34699
North America Canada 5 ... 0.45811
Select Concisely
>>> df15_h.loc["North America"]
Happiness Rank Happiness Score ... Trust Generosity
Country ...
Canada 5 7.427 ... 0.32957 0.45811
United States 15 7.119 ... 0.15890 0.40105
>>> df15_h.loc["North America"].loc["Canada"]
Happiness Rank 5.00000
Happiness Score 7.42700
Economy 1.32629
Family 1.32261
Health 0.90563
Freedom 0.63297
Trust 0.32957
Generosity 0.45811
Name: Canada, dtype: float64
Rearrange
# 重置index
>>> df15_h.reset_index(0)
Region Happiness Rank ... Trust Generosity
Country ...
Switzerland Western Europe 1 ... 0.41978 0.29678
Iceland Western Europe 2 ... 0.14145 0.43630
Denmark Western Europe 3 ... 0.48357 0.34139
...
>>> df15_h.reset_index(1)
Country Happiness Rank ... Trust Generosity
Region ...
Western Europe Switzerland 1 ... 0.41978 0.29678
Western Europe Iceland 2 ... 0.14145 0.43630
Western Europe Denmark 3 ... 0.48357 0.34139
...
>>> df15_h.reset_index()
Region Country ... Trust Generosity
0 Western Europe Switzerland ... 0.41978 0.29678
1 Western Europe Iceland ... 0.14145 0.43630
2 Western Europe Denmark ... 0.48357 0.34139
...
>>> df15_h.reset_index().reset_index()
index Region Country ... Trust Generosity
0 0 Western Europe Switzerland ... 0.41978 0.29678
1 1 Western Europe Iceland ... 0.14145 0.43630
2 2 Western Europe Denmark ... 0.48357 0.34139
...
>>> df15.stack()
0 Happiness Rank 1
Country Switzerland
Region Western Europe
Happiness Score 7.587
Economy 1.39651
... ...
137 Family 0.13995
Health 0.28443
Freedom 0.36453
Trust 0.10731
Generosity 0.16681
Length: 1380, dtype: object
# 反向还原
>>> df15.stack().unstack()
# 提取每一年的排名
rank_list=[]
for i in range(len(df)):
rank_list.append(df[i][0][['Country','Happiness Rank']].copy())
>>> wide=pd.merge(rank_list[0],rank_list[1],
...: on="Country", suffixes=[" 15"," 16"])
>>> wide=wide.rename(columns={"Happiness Rank 15":15,
...: "Happiness Rank 16":16})
>>> wide
Country 15 16
0 Switzerland 1 2
1 Iceland 2 3
2 Denmark 3 1
.. ... ... ...
135 Benin 155 153
136 Burundi 157 157
137 Togo 158 155
>>> df1
name number score
0 Tom 101 77
1 Bob 102 85
2 Amy 103 64
3 Lee 104 92
>>> df2
number class
0 101 one
1 102 one
2 103 two
3 104 three
>>> pd.merge(df1,df2)
name number score class
0 Tom 101 77 one
1 Bob 102 85 one
2 Amy 103 64 two
3 Lee 104 92 three
>>> df3
n class
0 101 one
1 102 one
2 103 two
3 104 three
column names are different
>>> pd.merge(df1,df3,left_on='number',right_on='n')
name number score n class
0 Tom 101 77 101 one
1 Bob 102 85 102 one
2 Amy 103 64 103 two
3 Lee 104 92 104 three
>>> df1
name number score
0 Tom 101 77
1 Bob 102 85
2 Amy 103 64
>>> df4
number f_score
0 101 77
1 101 55
2 102 60
3 103 99
>>> pd.merge(df1,df4)
name number score f_score
0 Tom 101 77 77
1 Tom 101 77 55
2 Bob 102 85 60
3 Amy 103 64 99
>>> df5
name number score
0 Tom 101 77
1 Tom 101 86
2 Bob 102 85
3 Amy 103 64
Many to Many
>>> pd.merge(df5,df4)
name number score f_score
0 Tom 101 77 77
1 Tom 101 77 55
2 Tom 101 86 77
3 Tom 101 86 55
4 Bob 102 85 60
5 Amy 103 64 99
>>> df6
name number score
0 Tom 101 77
1 Bob 102 85
2 Amy 103 64
3 Ivy 105 82
>>> df7
number f_score
0 101 77
1 102 55
2 103 60
3 104 99
>>> pd.merge(df6,df7)
name number score f_score
0 Tom 101 77 77
1 Bob 102 85 55
2 Amy 103 64 60
>>> pd.merge(df6,df7,how='left')
name number score f_score
0 Tom 101 77 77.0
1 Bob 102 85 55.0
2 Amy 103 64 60.0
3 Ivy 105 82 NaN
>>> pd.merge(df6,df7,how="right")
name number score f_score
0 Tom 101 77.0 77
1 Bob 102 85.0 55
2 Amy 103 64.0 60
3 NaN 104 NaN 99
>>> pd.merge(df6,df7,how="outer")
name number score f_score
0 Tom 101 77.0 77.0
1 Bob 102 85.0 55.0
2 Amy 103 64.0 60.0
3 Ivy 105 82.0 NaN
4 NaN 104 NaN 99.0
Merging on Index
>>> df1
name score
number
101 Tom 77
102 Bob 85
103 Amy 64
104 Lee 92
>>> df2
number class
0 101 one
1 102 one
2 103 two
3 104 three
>>> pd.merge(df1,df2,left_index=True,
...: right_on="number")
name score number class
0 Tom 77 101 one
1 Bob 85 102 one
2 Amy 64 103 two
3 Lee 92 104 three
>>> df3
class
number
101 one
102 one
103 two
104 three
A simple way
>>> df1.join(df3)
Concatenating
>>> long = pd.concat([rank_list[0],rank_list[1]])
>>> long
Country Happiness Rank
0 Switzerland 1
1 Iceland 2
2 Denmark 3
.. ... ...
135 Rwanda 152
136 Zimbabwe 131
137 Afghanistan 154
>>> long = pd.concat([rank_list[0],rank_list[1]],
...: ignore_index=True)
>>> long
Country Happiness Rank
0 Switzerland 1
1 Iceland 2
2 Denmark 3
.. ... ...
273 Rwanda 152
274 Zimbabwe 131
275 Afghanistan 154
>>> long = pd.concat([rank_list[0],rank_list[1]],
...: keys=[15,16])
>>> long
Country Happiness Rank
15 0 Switzerland 1
1 Iceland 2
2 Denmark 3
... ...
16 135 Rwanda 152
136 Zimbabwe 131
137 Afghanistan 154
>>> long = long.reset_index(0)
>>> long = long.rename(columns={"level_0": "Year"})
>>> long
Year Country Happiness Rank
0 15 Switzerland 1
1 15 Iceland 2
2 15 Denmark 3
.. ... ... ...
135 16 Rwanda 152
136 16 Zimbabwe 131
137 16 Afghanistan 154
The wide format (用merge得到的数据,country不重,多列年份)
The long format (用concat得到的数据,country重复,单列年份)
>>> wide
Country 15 16
0 Switzerland 1 2
1 Iceland 2 3
2 Denmark 3 1
.. ... ... ...
135 Benin 155 153
136 Burundi 157 157
137 Togo 158 155
>>> long
Year Country Happiness Rank
0 15 Switzerland 1
1 15 Iceland 2
2 15 Denmark 3
.. ... ... ...
135 16 Afghanistan 154
136 16 Togo 155
137 16 Burundi 157
The wide format (用merge得到的数据,country不重,多列年份)
From wide to long
>>> m = pd.melt(wide, id_vars=['Country'],
...: var_name="Year",value_name="rank")
>>> m
Country Year rank
0 Switzerland 15 1
1 Iceland 15 2
2 Denmark 15 3
.. ... ... ...
273 Benin 16 153
274 Burundi 16 157
275 Togo 16 155
>>> m.set_index(['Year', m.index])
Country rank
Year
15 0 Switzerland 1
1 Iceland 2
2 Denmark 3
... ...
16 273 Benin 153
274 Burundi 157
275 Togo 155
The long format (用concat得到的数据,country重复,单列年份)
From long to wide
>>> l = pd.pivot_table(long, index=["Country"],
...: columns="Year", values="Happiness Rank")
Year 15 16
Country
Afghanistan 153 154
Albania 95 109
Algeria 68 38
... ...
Yemen 136 147
Zambia 85 106
Zimbabwe 115 131
>>> l.reset_index()
Year Country 15 16
0 Afghanistan 153 154
1 Albania 95 109
2 Algeria 68 38
.. ... ... ...
135 Yemen 136 147
136 Zambia 85 106
137 Zimbabwe 115 131
table_all =[]
year_all=[]
for table, year in df:
table_all.append(table)
year_all.append(year)
rank_all = pd.concat(table_all, keys=year_all,join='inner')
rank_all = rank_all.reset_index(0)
rank_all = rank_all.rename(columns={'level_0':'Year'})
year Happiness Rank Country ... Freedom Trust Generosity
0 2015 1 Switzerland ... 0.665570 0.419780 0.296780
1 2015 2 Iceland ... 0.628770 0.141450 0.436300
2 2015 3 Denmark ... 0.649380 0.483570 0.341390
3 2015 4 Norway ... 0.669730 0.365030 0.346990
4 2015 5 Canada ... 0.632970 0.329570 0.458110
.. ... ... ... ... ... ... ...
133 2020 147 Botswana ... 0.821328 0.777931 -0.250394
134 2020 148 Tanzania ... 0.821540 0.619799 0.109669
135 2020 150 Rwanda ... 0.900589 0.183541 0.055484
136 2020 151 Zimbabwe ... 0.711458 0.810237 -0.072064
137 2020 153 Afghanistan ... 0.396573 0.933687 -0.096429
>>> rank_all['Region'].unique()
array(['Western Europe', 'North America', 'Australia and New Zealand',
'Middle East and Northern Africa', 'Latin America and Caribbean',
'Southeastern Asia', 'Central and Eastern Europe', 'Eastern Asia',
'Sub-Saharan Africa', 'Southern Asia'], dtype=object)
>>> grouped = rank_all.groupby('Country')['Happiness Rank'].mean()
>>> grouped
Country
Afghanistan 150.000000
Albania 106.166667
Algeria 71.833333
Argentina 35.166667
Armenia 121.666667
...
Venezuela 76.333333
Vietnam 89.500000
Yemen 146.333333
Zambia 118.500000
Zimbabwe 137.500000
Name: Happiness Rank, Length: 138, dtype: float64
>>> g_1 = rank_all.groupby(['Year','Region'])['Happiness Rank'].count()
>>> g_1[2015]
Region
Australia and New Zealand 2
Central and Eastern Europe 28
Eastern Asia 4
Latin America and Caribbean 20
Middle East and Northern Africa 17
North America 2
Southeastern Asia 8
Southern Asia 6
Sub-Saharan Africa 31
Western Europe 20
Name: Happiness Rank, dtype: int64
>>> g_1.unstack()
Region Australia and New Zealand ... Western Europe
Year ...
2015 2 ... 20
2016 2 ... 20
2017 2 ... 20
2018 2 ... 20
2019 2 ... 20
2020 2 ... 20
>>> g_2 = rank_all.groupby(['Year','Region']).mean()
>>> g_3 = g_2.unstack()
>>> g_3
Happiness Rank ... Generosity
Region Australia and New Zealand ... Western Europe
Year ...
2015 9.5 ... 0.304130
2016 8.5 ... 0.306789
2017 9.0 ... 0.305329
2018 9.0 ... 0.223850
2019 9.5 ... 0.221050
2020 10.0 ... 0.040858
Selecting Columns
>>> rank_all.groupby('Region')['Freedom'].mean()
>>> rank_all.groupby(['Year','Region'])[['Freedom','Economy']].mean()
Function Application and Mapping
def f(arr):
return arr.max()-arr.min()
>>> g_4 = rank_all.groupby(['Year','Region'])['Happiness Rank'].apply(f)
Year Region
2015 Australia and New Zealand 1
Central and Eastern Europe 103
Eastern Asia 54
Latin America and Caribbean 107
... ... ...
2016 Australia and New Zealand 1
... ... ...
>>> g_5 = g_4.unstack(0)
Year 2015 2016 2017 2018 2019 2020
Region
Australia and New Zealand 1 1 2 2 3 4
Central and Eastern Europe 103 102 109 117 113 104
Eastern Asia 54 48 49 40 39 33
Latin America and Caribbean 107 122 133 135 135 127
Middle East and Northern Africa 125 136 135 133 138 132
...
Data Aggregation
>>> rank_all.groupby('Region')['Freedom'].agg(['count','mean'])
count mean
Region
Australia and New Zealand 12 0.663815
Central and Eastern Europe 168 0.423580
Eastern Asia 24 0.465703
Latin America and Caribbean 120 0.518843
Middle East and Northern Africa 102 0.406711
North America 12 0.613810
Southeastern Asia 48 0.602917
Southern Asia 36 0.439074
Sub-Saharan Africa 186 0.427333
Western Europe 120 0.574996
# rank_all.groupby('Region').agg(['count','mean'])
# count, sum, mean, median, std, var
# min, max, prod, first, last
>>> rank_all.groupby('Region').agg({'Year':'count', 'Freedom': "mean"})
Year Freedom
Region
Australia and New Zealand 12 0.663815
Central and Eastern Europe 168 0.423580
Eastern Asia 24 0.465703
Latin America and Caribbean 120 0.518843
Middle East and Northern Africa 102 0.406711
North America 12 0.613810
Southeastern Asia 48 0.602917
Southern Asia 36 0.439074
Sub-Saharan Africa 186 0.427333
Western Europe 120 0.574996
def f(arr):
return arr.max()-arr.min()
>>> rank_all.groupby(['Year','Region'])['Happiness Rank'].agg(f)
Region
Australia and New Zealand 4
Central and Eastern Europe 119
Eastern Asia 55
Latin America and Caribbean 136
Middle East and Northern Africa 141
North America 14
Southeastern Asia 123
Southern Asia 88
Sub-Saharan Africa 109
Western Europe 101
Name: Happiness Rank, dtype: int64
Quantile and Bucket Analysis
>>> g_7 = rank_all.groupby(['Year','Country'])['Happiness Rank'].mean()
>>> g_7
Year Country
2015 Afghanistan 153
Albania 95
Algeria 68
... ... ...
2020 Yemen 146
Zambia 141
Zimbabwe 151
>>> interval_50 = pd.cut(g_7.loc[2015].values, bins = [0,50,100,150,200])
>>> interval_50
[(150, 200], (50, 100], (50, 100], ..., (50, 100], (100, 150]]
Length: 138
Categories (4, interval[int64]):
[(0, 50] < (50, 100] < (100, 150] < (150, 200]]
>>> g_7.loc[2015].groupby(interval_50).count()
(0, 50] 45
(50, 100] 42
(100, 150] 44
(150, 200] 7
Pivot Tables and Cross-Tabulation (数据透视表)
>>> rank_all.pivot_table(values="Economy", columns="Year", index="Country")
Year 2015 2016 2017 2018 2019 2020
Country
Afghanistan 0.31982 0.38227 0.401477 0.332 0.350 7.462861
Albania 0.87867 0.95530 0.996193 0.916 0.947 9.417931
Algeria 0.93929 1.05266 1.091864 0.979 1.002 9.537965
... ... ... ... ... ...
Yemen 0.54649 0.57939 0.591683 0.442 0.287 7.759683
Zambia 0.47038 0.61202 0.636407 0.562 0.578 8.224720
Zimbabwe 0.27100 0.35041 0.375847 0.357 0.366 7.865712
>>> rank_all.pivot_table(values="Economy", columns="Year",
...: index="Region", aggfunc="mean")
Year 2015 2016 ... 2019 2020
Region ...
Australia and New Zealand 1.291880 1.402545 ... 1.337500 10.610770
Central and Eastern Europe 0.943293 1.048545 ... 1.022429 9.662970
Eastern Asia 1.058415 1.188970 ... 1.151250 10.047753
Latin America and Caribbean 0.854139 0.959183 ... 0.908950 9.255175
...
>>> rank_all.pivot_table(values="Economy", columns="Year",
...: index="Country", aggfunc="sum", margins= True, margins_name="sum")
Year 2015 2016 2017 ... 2019 2020 sum
Country ...
Afghanistan 0.319820 0.382270 0.401477 ... 0.350000 7.462861 9.248428
Albania 0.878670 0.955300 0.996193 ... 0.947000 9.417931 14.111093
... ... ... ... ... ... ...
Zambia 0.470380 0.612020 0.636407 ... 0.578000 8.224720 11.083527
Zimbabwe 0.271000 0.350410 0.375847 ... 0.366000 7.865712 9.585968
sum 118.20839 132.42220 137.774266 ... 127.743 1289.437513 1930.569370