forked from iSchool-590PR-2019-Spring/Final_Project
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPR_Final_WinYaoPhil_functions.py
2060 lines (1875 loc) · 97.2 KB
/
PR_Final_WinYaoPhil_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# additional package requirements: bs4, gdelt, seaborn
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from pandas import read_excel
import logging
import zipfile
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class Data(object):
    """
    Loader for the raw project datasets stored under ``file_path``.

    Each ``get_*`` method reads exactly one dataset (a CSV file, a zip archive of
    CSV files or, for ``get_peace``, a table scraped from Wikipedia) and returns it
    as pandas object(s).
    """

    def __init__(self: object, file_path: str = "590PR_final_datasets"):
        """
        :param file_path: directory that contains the dataset files read by the loaders.
        """
        self.file_path = file_path
        # Not used by any loader in this class; kept for callers that may rely on it.
        self.df_list = []

    @staticmethod
    def _read_zipped_csvs(archive_path: str, sheet: str = None):
        """
        Read CSV members of a zip archive, closing the archive afterwards.

        :param archive_path: path of the zip archive.
        :param sheet: name of a single archive member to read; ``None`` reads all members.
        :return: one DataFrame when ``sheet`` is given, otherwise a dict that maps
                 each member file name to its DataFrame.
        :raises KeyError: if ``sheet`` is not a member of the archive.
        """
        with zipfile.ZipFile(archive_path) as zf:
            if sheet is not None:
                with zf.open(sheet) as handle:
                    return pd.read_csv(handle)
            frames = {}
            for member in zf.infolist():
                with zf.open(member.filename) as handle:
                    frames[member.filename] = pd.read_csv(handle)
            return frames

    def get_peace(self: object) -> pd.DataFrame:
        """
        Scrape the Global Peace Index (GPI) table from Wikipedia and return it as a data frame.
        Modified from https://www.kaggle.com/kretes/gpi2008-2016

        The GPI measures the relative position of nations' and regions' peacefulness.
        A lower number indicates more peace in the region.

        Side effect: the scraped table is also written to
        ``<file_path>/gpi_2010-2018.csv`` (with the scraper's raw column names).

        Source:
            http://visionofhumanity.org/indexes/global-peace-index/
        Requires:
            import requests
            import re
            from bs4 import BeautifulSoup
            import pandas as pd

        :return: a Data Frame with columns ``Country`` and ``pi_2010`` ... ``pi_2018``.
        :raises requests.HTTPError: if Wikipedia answers with an error status.

        >>> print(type(Data().get_peace()))
        <class 'pandas.core.frame.DataFrame'>
        """
        # A timeout keeps a stalled connection from hanging the whole analysis.
        response = requests.get(url='https://en.wikipedia.org/wiki/Global_Peace_Index', timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        base_year = 2018  # latest year
        years = 9  # number of years to get data
        first_year = base_year - years + 1

        def get_countries_by_gpi():
            for table in soup.find_all('table', re.compile('wikitable sortable')):
                if table.find_all('th')[0].get_text() == 'Country\n':
                    for tr in table.find_all('tr'):
                        country_name = tr.find_all('a')[0].get_text()
                        if not country_name.startswith('['):
                            row = {'country': country_name}
                            cells = tr.find_all('td')
                            # Score cells are read at every second position, counting down
                            # from index 2 * years for the first year to index 2 for base_year.
                            for year, index in zip(range(first_year, base_year + 1),
                                                   range(2 * years, 0, -2)):
                                score = cells[index].get_text()
                                if score != '' and score != '\n':
                                    row['score_%s' % year] = float(score)
                            yield row

        gpi = pd.DataFrame.from_dict(list(get_countries_by_gpi()))
        gpi.to_csv(self.file_path + '/gpi_%s-%s.csv' % (first_year, base_year), index=False)
        # Columns are renamed by position, exactly like the original hard-coded list.
        gpi.columns = ['Country'] + ['pi_%s' % year for year in range(first_year, base_year + 1)]
        return gpi

    def get_trade(self: object) -> pd.DataFrame:
        """
        Read the WITS trade statistics archive and return all of its CSV members
        concatenated into one data frame.

        WITS Trade Stats is a database created by aggregating data from UN COMTRADE and UNCTAD TRAINS.
        It provides information on bilateral trade exports, imports and tariffs for over 180 countries and regions.

        Side effect: configures logging to write DEBUG output to ``test.log``.
        Archive members that cannot be parsed are skipped (and logged) instead of aborting the load.

        Source:
            https://datacatalog.worldbank.org/dataset/world-integrated-trade-solution-trade-stats
        Required:
            import logging

        :return: trade data in form of Data Frame

        >>> print(type(Data().get_trade()))
        <class 'pandas.core.frame.DataFrame'>
        """
        logging.basicConfig(filename="test.log", level=logging.DEBUG)
        file_name = "wits_en_trade_summary_allcountries_allyears.zip"
        frames = []
        with zipfile.ZipFile(self.file_path + '/' + file_name) as zf:
            for member in zf.infolist():
                logging.debug(member.filename)
                try:
                    with zf.open(member.filename) as handle:
                        frames.append(pd.read_csv(handle, header=0))
                except Exception:
                    # Best-effort load: keep going, but leave a trace of what was skipped.
                    logging.warning("skipping unreadable archive member %s",
                                    member.filename, exc_info=True)
        return pd.concat(frames, axis=0, ignore_index=True)

    def get_hunger(self: object) -> pd.DataFrame:
        """
        Read the world hunger data (tab-separated ``Hunger.csv``) and return it as a data frame.

        The prevalence of undernourishment, as a share of the population, is the main hunger indicator used by
        the UN's Food and Agriculture Organization.

        Source:
            https://ourworldindata.org/hunger-and-undernourishment
        Required:
            import pandas as pd

        :return: a Data Frame with 12 columns: ``Country`` followed by
                 ``undernourishment_rate_2009`` ... ``undernourishment_rate_2019``.

        >>> print(type(Data().get_hunger()))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "Hunger.csv"
        df_hunger = pd.read_csv(self.file_path + "/" + file_name, na_values='\t', sep='\t', header=0)
        selected = [df_hunger['Country Name'], df_hunger['Indicator Name']]
        selected.extend(df_hunger[str(year)] for year in range(2009, 2019))
        df_new_hunger = pd.concat(selected, axis=1)
        # NOTE(review): the labels are assigned by position, so the 'Indicator Name'
        # column is labelled 'undernourishment_rate_2009' and the data of source year
        # N is labelled N + 1. Kept unchanged because downstream code depends on these
        # names -- confirm whether the one-year shift is intentional.
        df_new_hunger.columns = ['Country'] + ['undernourishment_rate_%s' % year
                                               for year in range(2009, 2020)]
        return df_new_hunger

    def get_unemployment(self: object) -> pd.DataFrame:
        """
        Read the United Nations world unemployment data and return it as a data frame.

        :return: unemployment data in Pandas Data Frame

        >>> print(type(Data().get_unemployment()))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "unemployment.zip"
        return pd.read_csv(self.file_path + '/' + file_name, compression='zip')

    def get_suicide(self: object) -> pd.DataFrame:
        """
        Read the world suicide data and return it as a data frame.

        Source:
            https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016

        :return: suicide data in Pandas Data Frame

        >>> print(type(Data().get_suicide()))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "suicide-rates-overview-1985-to-2016.zip"
        return pd.read_csv(self.file_path + '/' + file_name, compression='zip')

    def get_freedom(self: object) -> pd.DataFrame:
        """
        Read the Human Freedom Index data and return it as a data frame.

        The Human Freedom Index presents the state of human freedom in the world based on a broad measure that
        encompasses personal, civil, and economic freedom.

        Source:
            https://www.kaggle.com/gsutters/the-human-freedom-index

        :return: freedom data in Pandas Data Frame

        >>> print(type(Data().get_freedom()))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "the-human-freedom-index.zip"
        return pd.read_csv(self.file_path + '/' + file_name, compression='zip')

    def get_happiness(self: object) -> dict:
        """
        Read the World Happiness Report archive and return one data frame per archive member.

        The happiness scores and rankings use data from the Gallup World Poll (Cantril ladder question).

        Source:
            https://www.kaggle.com/unsdsn/world-happiness
        Required:
            import zipfile

        :return: dict that maps each member file name (e.g. ``'2015.csv'``) to its Data Frame

        >>> print(type(Data().get_happiness()['2015.csv']))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "world-happiness-report.zip"
        return self._read_zipped_csvs(self.file_path + '/' + file_name)

    def get_poverty(self: object, sheet: str = None) -> dict:
        """
        Read the World Bank poverty and equity archive.

        Source:
            https://datacatalog.worldbank.org/dataset/poverty-and-equity-database
        Required:
            import zipfile

        :param sheet: name of one CSV member of the archive; ``None`` selects all members.
        :return: the Data Frame of ``sheet`` when it is given, otherwise a dict that maps
                 every member file name to its Data Frame.
        :raises KeyError: if ``sheet`` is not a member of the archive.

        >>> print(type(Data().get_poverty()))
        <class 'dict'>
        """
        file_name = "PovStats_csv.zip"
        return self._read_zipped_csvs(self.file_path + '/' + file_name, sheet)

    def get_marital(self: object) -> pd.DataFrame:
        """
        Read the two United Nations marital status exports (2010-2013 and 2014-2017)
        and return them stacked into one data frame with a fresh 0..n-1 index.

        :return: marital data in Pandas Data Frame

        >>> print(type(Data().get_marital()))
        <class 'pandas.core.frame.DataFrame'>
        """
        file_name = "UNdata_MARITAL_STATUS_2010-2013.csv"
        df = pd.read_csv(self.file_path + '/' + file_name)
        # The last 74 rows are discarded -- presumably export footnotes; verify against the file.
        df.drop(df.tail(74).index, inplace=True)
        file_name = "UNdata_MARITAL_STATUS_2014-2017.csv"
        df2 = pd.read_csv(self.file_path + '/' + file_name)
        # Same for the last 54 rows of the second export.
        df2.drop(df2.tail(54).index, inplace=True)
        return pd.concat([df, df2], axis=0).reset_index(drop=True)

    def get_innovation(self: object, sheet: str = None) -> dict:
        """
        Read the world innovation archive (``Innovation.zip``) from ``file_path``.

        :param sheet: name of one CSV member of the archive; ``None`` selects all members.
        :return: the Data Frame of ``sheet`` when it is given, otherwise a dict that maps
                 every member file name to its Data Frame.
        :raises KeyError: if ``sheet`` is not a member of the archive.

        >>> print(type(Data().get_innovation()))
        <class 'dict'>
        """
        file_name = "Innovation.zip"
        # Honour the configured directory like every other loader (it used to be hard-coded).
        return self._read_zipped_csvs(self.file_path + '/' + file_name, sheet)
def prep_freedom() -> list:
    """
    Reduce the world freedom data to year, country and freedom score, and split it by year.

    Rows with a missing value in any of the three columns are dropped, the remaining
    rows are sorted by ``hf_score`` (ascending) and re-indexed, and ``countries`` is
    renamed to ``Country``. The cleaning steps are chained on a fresh frame rather than
    applied in place to a column slice, which avoids pandas' chained-assignment warnings.

    :return: a list of six Data Frames, one per year from 2013 to 2018, each with the
             columns ``year``, ``Country`` and ``hf_score``

    >>> print(type(prep_freedom()))
    <class 'list'>
    """
    freedom = Data().get_freedom()
    free = (
        freedom[['year', 'countries', 'hf_score']]
        .dropna()
        .sort_values('hf_score', ascending=True)
        .reset_index(drop=True)
        .rename({'countries': 'Country'}, axis='columns')
    )
    return [free[free['year'] == year] for year in range(2013, 2019)]
def prep_innovation() -> list:
    """
    Reduce the world innovation data to country and innovation score, one frame per year.

    The archive is loaded once and the yearly sheets ``Innovation-2013.csv`` ...
    ``Innovation-2018.csv`` are picked from the result. In each sheet ``Economy`` is
    renamed to ``Country`` and ``Score`` to ``Score<year>``.

    :return: a list of six Data Frames (2013 to 2018), each with the columns
             ``Country`` and ``Score<year>``
    :raises KeyError: if one of the yearly sheets is missing from the archive

    >>> print(type(prep_innovation()))
    <class 'list'>
    """
    # Load every sheet once instead of re-opening the archive for each year.
    sheets = Data().get_innovation()
    inno_list = []
    for year in range(2013, 2019):
        score_col = 'Score%s' % year
        df = sheets['Innovation-%s.csv' % year].rename(
            {'Economy': 'Country', 'Score': score_col}, axis='columns')
        inno_list.append(df[['Country', score_col]])
    return inno_list
## Part 1: Analysis of level 1-2, Physiological-Safety by hunger+peace index dataset
# need docstring
def analysis_first_two_level(hunger: pd.DataFrame, peace: pd.DataFrame) -> tuple:
    """
    Pair hunger (level 1) with peace index (level 2) values per country for 2010-2016.

    The two frames are inner-joined on ``Country`` and exact duplicate rows are removed.
    For each year the pair of columns ``undernourishment_rate_<year>`` / ``pi_<year>`` is
    sorted by the hunger value (ascending) and every row with a NaN in either column is
    dropped, so the two arrays of a year always have the same length and order.

    :param hunger: World Hunger DataFrame with ``Country`` and ``undernourishment_rate_<year>`` columns
    :param peace: World Peace Index DataFrame with ``Country`` and ``pi_<year>`` columns
    :return: tuple ``(x_list, y_list, df_level1)``: seven sorted hunger arrays, the seven
             matching peace-index arrays, and the merged DataFrame
    :raises KeyError: if a required year column is missing from the merged frame

    >>> hunger_data = Data().get_hunger()
    >>> peace_data = Data().get_peace()
    >>> type(analysis_first_two_level(hunger_data, peace_data))
    <class 'tuple'>
    """
    df_level1 = pd.merge(hunger, peace, on='Country', how='inner').drop_duplicates(keep='first')
    x_list = []
    y_list = []
    for year in range(2010, 2017):
        hunger_col = 'undernourishment_rate_%s' % year
        peace_col = 'pi_%s' % year
        # Sort first, then drop incomplete rows: dropna keeps the sorted order.
        pairs = df_level1[[hunger_col, peace_col]].sort_values(by=hunger_col, ascending=True).dropna()
        x_list.append(np.asarray(pairs[hunger_col]))
        y_list.append(np.asarray(pairs[peace_col]))
    return x_list, y_list, df_level1
# need docstring
def plot12(x_list: list, y_list: list):
    """
    Draw one line per year of peace index against undernourishment rate, all countries included.

    Every entry of ``x_list`` / ``y_list`` becomes one series, labelled with the years
    2010, 2011, ... in order. Both axes are inverted before the figure is shown.

    :param x_list: List of the lower level sorted data
    :param y_list: List of the higher level corresponding data
    :return: None; the figure is displayed with ``plt.show()``

    >>> hunger_data = Data().get_hunger()
    >>> peace_data = Data().get_peace()
    >>> x_list, y_list, df_level1 = analysis_first_two_level(hunger_data, peace_data)
    >>> type(plot12(x_list, y_list))
    <class 'NoneType'>
    """
    plt.figure(figsize=(30, 15), dpi=100)
    year_labels = [str(year) for year in range(2010, 2017)]
    markers = ['.', '*', '>', '<', '1', '2', 's']
    colors = ['#E11B00', '#1E90FF', '#FF4233', '#FFE333', '#7EFF33', '#33F4FF', '#D433FF']
    for idx, xs in enumerate(x_list):
        plt.plot(xs, y_list[idx], label='data of year ' + year_labels[idx],
                 marker=markers[idx], color=colors[idx])
    plt.xlabel('undernourishment rate')
    plt.ylabel('Peacefulness Index')
    plt.ylim(1, 4)
    plt.xticks(fontsize=8, horizontalalignment='center', alpha=.7)
    plt.yticks(fontsize=12, alpha=.7)
    plt.grid(axis='both', alpha=.3)
    plt.legend()
    axis = plt.gca()
    axis.invert_yaxis()
    plt.gca().invert_xaxis()
    plt.show()
# need docstring
def _mean_by_bin(values, targets, edges) -> np.ndarray:
    """Return the mean of ``targets`` inside each half-open bin ``[edges[k], edges[k+1])`` of ``values``."""
    n_bins = len(edges) - 1
    totals = np.zeros(n_bins)
    counts = np.zeros(n_bins)
    for value, target in zip(values, targets):
        for k in range(n_bins):
            if edges[k] <= value < edges[k + 1]:
                totals[k] += target
                counts[k] += 1
                break
    # Bins without any data point stay NaN so they show up as gaps in the plot.
    means = np.full(n_bins, math.nan)
    filled = counts > 0
    means[filled] = totals[filled] / counts[filled]
    return means


def plot_cat12(x_list: list, y_list: list):
    """
    Plot the mean peace index per undernourishment-rate category, one line per year.

    The undernourishment rate is grouped into ten 5-point wide categories from
    ``[0,5)`` to ``[45,50)``; values outside ``[0, 50)`` are ignored. For each year the
    peace index values are averaged per category, which smooths the relationship
    compared with the raw per-country plot. Both axes are inverted before the figure
    is shown.

    :param x_list: List of Lower Level Sorted data
    :param y_list: List of corresponding higher level data
    :return: None; the figure is displayed with ``plt.show()``

    >>> hunger_data = Data().get_hunger()
    >>> peace_data = Data().get_peace()
    >>> x_list, y_list, df_level1 = analysis_first_two_level(hunger_data, peace_data)
    >>> type(plot_cat12(x_list, y_list))
    <class 'NoneType'>
    """
    plt.figure(figsize=(30, 20), dpi=100)
    edges = list(range(0, 55, 5))
    x_item = ['[%d,%d)' % (low, high) for low, high in zip(edges[:-1], edges[1:])]
    marker = ['.', '*', '>', '<', '1', '2', 's']
    color = ['#E11B00', '#1E90FF', '#FF4233', '#FFE333', '#7EFF33', '#33F4FF', '#D433FF']
    y_new_list = [_mean_by_bin(xs, ys, edges) for xs, ys in zip(x_list, y_list)]
    for s, y_new in enumerate(y_new_list):
        # Series are labelled 2010, 2011, ... in the order they were passed in.
        plt.plot(x_item, y_new, label='data of year ' + str(2010 + s),
                 marker=marker[s], color=color[s])
    plt.xlabel('undernourishment rate')
    plt.ylabel('Peacefulness Index')
    plt.ylim(1.6, 2.8)
    plt.xticks(fontsize=8, horizontalalignment='center', alpha=.7)
    plt.yticks(fontsize=12, alpha=.7)
    plt.grid(axis='both', alpha=.3)
    plt.legend()
    plt.gca().invert_yaxis()
    plt.gca().invert_xaxis()
    plt.show()
# need docstring
def _hunger_category(rates: pd.Series) -> pd.Series:
    """Map undernourishment rates to 5-point wide labels '[0,5)' ... '[45,50)'; NaN or out-of-range values become NaN."""
    edges = list(range(0, 55, 5))
    labels = ['[%d,%d)' % (low, low + 5) for low in edges[:-1]]
    # pd.cut keeps the index of ``rates``, so categories stay aligned with their rows.
    return pd.cut(rates, bins=edges, right=False, labels=labels).astype(object)


def box_plot_level12(df_level1: pd.DataFrame):
    """
    Draw, for each year 2010-2016, a box plot of the peace index per hunger category.

    The seven plots are placed on a 3x3 grid. Each row of ``df_level1`` is assigned a
    hunger category from its ``undernourishment_rate_<year>`` value; rows whose value is
    NaN or outside ``[0, 50)`` get no category and are therefore left out of the plot.
    Categories are computed per row and aligned on the frame's own index, so the result
    is correct even when the index has gaps.

    Requirement:
        import seaborn as sns

    :param df_level1: the dataframe of merge datasets (hunger and peace, see ``analysis_first_two_level``)
    :return: None; the plots are drawn onto a new matplotlib figure

    >>> hunger_data = Data().get_hunger()
    >>> peace_data = Data().get_peace()
    >>> x_list, y_list, df_level1 = analysis_first_two_level(hunger_data, peace_data)
    >>> type(box_plot_level12(df_level1))
    <class 'NoneType'>
    """
    fig, axes = plt.subplots(3, 3, figsize=(60, 40))
    # NOTE(review): '[45,50)' is computed as a category but was never listed in the
    # displayed order; kept as in the original -- confirm whether it should be shown.
    shown_categories = ['[0,5)', '[5,10)', '[10,15)', '[15,20)', '[20,25)', '[25,30)',
                        '[30,35)', '[35,40)', '[40,45)']
    for s, year in enumerate(range(2010, 2017)):
        hunger_col = 'undernourishment_rate_%s' % year
        peace_label = 'Peace_Index_of %s' % year
        new_data = df_level1[[hunger_col, 'pi_%s' % year]].copy()
        new_data.columns = [hunger_col, peace_label]
        new_data['Catergory of Hunger'] = _hunger_category(new_data[hunger_col])
        ax = axes[s // 3, s % 3]
        # whis=(0, 100) draws the whiskers at the data minimum and maximum; it replaces
        # the string "range", which current matplotlib no longer accepts.
        sns.boxplot(x=peace_label, y='Catergory of Hunger', data=new_data, whis=(0, 100),
                    palette="vlag", ax=ax, order=shown_categories)
        ax.set_title('Box Plot for Data of year ' + str(year))
    fig.subplots_adjust(wspace=.4)
## Part 2: Analysis of level 2-3, Safety-Belonging by Peace Index and Marriage data
def get_marriage_rate(married: pd.DataFrame) -> list:
    """
    Compute the share of ever-married people per country for each year from 2010 to 2017.

    The marriage data only contains population counts, so the rate is derived from the
    rows with ``Age == 'Total'``: ``(total - single) / total``, where ``total`` is the sum
    of ``Value`` for marital status ``'Total'`` and ``single`` the sum for
    ``'Single (never married)'``, both grouped by year and country. A country without a
    "single" count gets a NaN rate.

    :param married: the original marriage dataset with the columns ``Year`` (integer),
                    ``Country or Area``, ``Marital status``, ``Age`` and ``Value``
    :return: list of eight Data Frames (2010 ... 2017), each with the columns
             ``Year``, ``Country``, ``Total_pop`` and ``Marriage_Rate``

    >>> married = Data().get_marital()
    >>> married = married.astype({"Year": int}, copy=False)
    >>> type(get_marriage_rate(married))
    <class 'list'>
    """
    keys = ['Year', 'Country or Area']
    # The row filters do not depend on the year, so they are evaluated once.
    all_ages = married[married['Age'] == 'Total']
    total_rows = all_ages[all_ages['Marital status'] == 'Total']
    single_rows = all_ages[all_ages['Marital status'] == 'Single (never married)']

    percent_married = []
    for year in range(2010, 2018):
        total_pop = total_rows[total_rows['Year'] == year].groupby(keys)['Value'].sum()
        single_pop = single_rows[single_rows['Year'] == year].groupby(keys)['Value'].sum()
        # Align the rate on total_pop's index so it can be attached row by row.
        rate = ((total_pop - single_pop) / total_pop).reindex(total_pop.index)
        df = total_pop.to_frame().reset_index()
        df['Marriage_Rate'] = rate.to_numpy()
        df.columns = ['Year', 'Country', 'Total_pop', 'Marriage_Rate']
        percent_married.append(df)
    return percent_married
def analysis_two_third_level(peace: pd.DataFrame, percent_married: list) -> (list, list, pd.DataFrame):
    """
    Pair peace index (level 2) with marriage rate (level 3) values per country for 2010-2017.

    For each of the eight years the marriage-rate frame of that year is inner-joined with
    the ``Country`` and ``pi_<year>`` columns of ``peace``. The joined rows are sorted by
    the peace index (ascending) and every row with a NaN peace index or marriage rate is
    dropped, so the two arrays of a year always have the same length and order.

    :param peace: World Peace Index's DataFrame with ``Country`` and ``pi_<year>`` columns
    :param percent_married: list of eight yearly Data Frames as returned by ``get_marriage_rate``
    :return: ``(x_list, y_list, df_level2)``: eight sorted peace-index arrays, the eight
             matching marriage-rate arrays, and the merged DataFrame of the last year (2017)

    >>> peace_data = Data().get_peace()
    >>> married = Data().get_marital()
    >>> married = married.astype({"Year": int}, copy=False)
    >>> percent_married = get_marriage_rate(married)
    >>> x_list_level2, y_list_level2, df_level2 = analysis_two_third_level(peace_data, percent_married)
    >>> type(x_list_level2)
    <class 'list'>
    """
    x_list = []
    y_list = []
    df_level2 = None
    for offset in range(8):
        peace_col = 'pi_%s' % (2010 + offset)
        df_level2 = pd.merge(percent_married[offset], peace[['Country', peace_col]],
                             on='Country', how='inner')
        # Sort first, then drop incomplete rows: dropna keeps the sorted order.
        pairs = df_level2.sort_values(by=[peace_col], ascending=True)
        pairs = pairs.dropna(subset=[peace_col, 'Marriage_Rate'])
        x_list.append(np.asarray(pairs[peace_col]))
        y_list.append(np.asarray(pairs['Marriage_Rate']))
    return x_list, y_list, df_level2
def plot_level_23(x_list: list, y_list: list):
    """
    Draw one line per year (2010-2017) of marriage rate against peace index, all countries included.

    The first eight entries of ``x_list`` / ``y_list`` are plotted as separate series and
    the x axis is inverted before the figure is shown. According to the project notes the
    marriage data turned out to be poor and small, which is why the happiness data is
    used for the third level afterwards.

    :param x_list: List of the lower level sorted data
    :param y_list: List of the higher level corresponding data
    :return: None; the figure is displayed with ``plt.show()``

    >>> peace_data = Data().get_peace()
    >>> married = Data().get_marital()
    >>> married = married.astype({"Year": int}, copy=False)
    >>> percent_married = get_marriage_rate(married)
    >>> x_list_level2, y_list_level2, df_level2 = analysis_two_third_level(peace_data, percent_married)
    >>> type(plot_level_23(x_list_level2, y_list_level2))
    <class 'NoneType'>
    """
    plt.figure(figsize=(30, 15), dpi=100)
    markers = ['.', '*', '>', '<', '1', '2', 's', '3', '4']
    colors = ['#E11B00', '#1E90FF', '#FF4233', '#FFE333', '#7EFF33', '#33F4FF', '#D433FF',
              '#3351FF', '#D433FF']
    series_count = 8
    for idx in range(series_count):
        year_label = str(2010 + idx)
        plt.plot(x_list[idx], y_list[idx], label='data of year ' + year_label,
                 marker=markers[idx], color=colors[idx])
    plt.xlabel('Peacefulness Index')
    plt.ylabel('Marriage rate')
    plt.ylim(0, 1)
    plt.xticks(fontsize=8, horizontalalignment='center', alpha=.7)
    plt.yticks(fontsize=12, alpha=.7)
    plt.grid(axis='both', alpha=.3)
    plt.legend()
    axis = plt.gca()
    axis.invert_xaxis()
    plt.show()
## Part 2-2: Analysis of level 2-3, Safety-Belonging by Peace Index and Happiness data
def analysis_peace_happiness_level(peace: pd.DataFrame, happiness: pd.DataFrame) -> (list, list, list):
    """
    Prepare the data for the second/third level analysis (Safety vs. Belonging) using
    the peace index as the lower level and the world happiness score as the higher level.

    For each of the years 2015, 2016 and 2017:
      1. take the 'Country' and 'Happiness Score' columns from ``happiness['<year>.csv']``,
      2. take the 'Country' and 'pi_<year>' columns from ``peace``,
      3. inner-merge both on 'Country'.
    Each merged frame is then sorted ascending by its 'pi_<year>' column, and every
    position where either the peace index or the happiness score is NaN is removed
    from the extracted arrays (the merged frames themselves are returned unfiltered).

    :param peace: World peace index data with a 'Country' column and 'pi_2015', 'pi_2016', 'pi_2017' columns
    :param happiness: World happiness data, indexed by '2015.csv', '2016.csv' and '2017.csv'; each entry
                      must provide the columns 'Country' and 'Happiness Score'
    :return: three lists with one entry per year: the sorted lower level arrays, the matching
             higher level arrays and the merged dataframes

    >>> peace_data = Data().get_peace()
    >>> happiness = Data().get_happiness()
    >>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
    >>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
    >>> x_list_p_h, y_list_p_h, p_h_list = analysis_peace_happiness_level(peace_data, happiness)
    >>> print(type(p_h_list))
    <class 'list'>
    """
    years = (2015, 2016, 2017)

    # Step 1: one merged (happiness x peace) frame per year.
    p_h_list = []
    for year in years:
        peace_col = 'pi_' + str(year)
        scores = happiness[str(year) + '.csv'][['Country', 'Happiness Score']]
        peace_year = pd.concat([peace['Country'], peace[peace_col]], axis=1)
        p_h_list.append(pd.merge(scores, peace_year, on='Country', how='inner'))

    # Step 2: sort by the peace index and strip positions holding NaN on either side.
    x_list, y_list = [], []
    for year, merged in zip(years, p_h_list):
        peace_col = 'pi_' + str(year)
        ordered = merged.sort_values(by=[peace_col], ascending=True)
        lower = np.asarray(ordered[peace_col])
        higher = np.asarray(ordered['Happiness Score'])
        nan_positions = [
            pos
            for pos, (low_value, high_value) in enumerate(zip(lower, higher))
            if math.isnan(low_value) or math.isnan(high_value)
        ]
        x_list.append(np.delete(lower, nan_positions))
        y_list.append(np.delete(higher, nan_positions))

    return x_list, y_list, p_h_list
def plot_level_p_h(x_list: list, y_list: list):
    """
    Plot the happiness score (higher level need) against the Peacefulness Index
    (lower level need), drawing one line for each of the years 2015, 2016 and 2017.

    The first three entries of ``x_list`` and ``y_list`` are plotted. Every country is
    an individual data point, so the overall trend is hard to read from this figure.
    The x axis is inverted before the figure is displayed.

    :param x_list: List of the lower level sorted data, one array per year (at least 3 entries)
    :param y_list: List of the higher level corresponding data, one array per year (at least 3 entries)
    :return: None. The figure is displayed through plt.show().

    >>> peace_data = Data().get_peace()
    >>> happiness = Data().get_happiness()
    >>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
    >>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
    >>> x_list_p_h, y_list_p_h, p_h_list = analysis_peace_happiness_level(peace_data, happiness)
    >>> print(type(plot_level_p_h(x_list_p_h, y_list_p_h)))
    <class 'NoneType'>
    """
    # (year label, marker, colour) for each plotted series.
    series_styles = [
        ('2015', '.', '#E11B00'),
        ('2016', '*', '#1E90FF'),
        ('2017', '>', '#FFE333'),
    ]

    plt.figure(figsize=(30, 15), dpi=100)
    for idx, (year, marker, color) in enumerate(series_styles):
        plt.plot(
            x_list[idx],
            y_list[idx],
            label='data of year ' + year,
            marker=marker,
            color=color,
        )

    axis = plt.gca()
    axis.set_xlabel('Peacefulness Index')
    axis.set_ylabel('Happiness Score')
    plt.xticks(fontsize=8, horizontalalignment='center', alpha=.7)
    plt.yticks(fontsize=12, alpha=.7)
    axis.grid(axis='both', alpha=.3)
    axis.legend()
    axis.invert_xaxis()
    plt.show()
def _mean_score_per_peace_bin(peace_values, scores, lower_edges):
    """Average ``scores`` inside each left-closed peace index bin; empty bins yield NaN."""
    peace_values = np.asarray(peace_values, dtype=float)
    scores = np.asarray(scores, dtype=float)
    bin_count = len(lower_edges)

    # np.digitize returns i such that lower_edges[i-1] <= value < lower_edges[i];
    # shifting by one makes bin 0 start at the first edge and the last bin open-ended.
    bin_index = np.digitize(peace_values, lower_edges) - 1
    # Values below the first edge (index -1) and NaN values belong to no bin.
    usable = (bin_index >= 0) & ~np.isnan(peace_values)

    totals = np.bincount(bin_index[usable], weights=scores[usable], minlength=bin_count)
    counts = np.bincount(bin_index[usable], minlength=bin_count)

    means = np.full(bin_count, np.nan)
    np.divide(totals, counts, out=means, where=counts > 0)
    return means


def plot_cat_ph(x_list: list, y_list: list):
    """
    Plot the average happiness score per Peacefulness Index category, one line per year.

    The raw scatter of all countries makes the trend hard to see, so the lower level
    data is grouped into ten left-closed ranges
    ([1,1.25), [1.25,1.5), ..., [2.75,3), [3,3.5) and 3.5+) and the higher level data is
    averaged inside each range. Values below 1 fall into no range and are ignored;
    a range without any data point is plotted as NaN (a gap in the line).
    The series are labelled 2015, 2016 and 2017 in order, and the x axis is inverted
    before the figure is displayed.

    :param x_list: List of Lower Level Sorted data, one array per year (up to 3 entries)
    :param y_list: List of corresponding higher level data, aligned with x_list
    :return: None. The figure is displayed through plt.show().

    >>> peace_data = Data().get_peace()
    >>> happiness = Data().get_happiness()
    >>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
    >>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
    >>> x_list_p_h, y_list_p_h, p_h_list = analysis_peace_happiness_level(peace_data, happiness)
    >>> print(type(plot_cat_ph(x_list_p_h, y_list_p_h)))
    <class 'NoneType'>
    """
    # Lower bound of every category; the last category (3.5+) is open-ended.
    lower_edges = [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.5]
    bin_labels = ['[1,1.25)', '[1.25,1.5)', '[1.5,1.75)', '[1.75,2)', '[2,2.25)', '[2.25,2.5)',
                  '[2.5,2.75)', '[2.75,3)', '[3,3.5)', '3.5+']
    years = ['2015', '2016', '2017']
    markers = ['.', '*', '>', '<', '1', '2', 's']
    colors = ['#E11B00', '#1E90FF', '#FFE333', '#7EFF33', '#33F4FF', '#D433FF']

    binned_means = [
        _mean_score_per_peace_bin(peace_values, scores, lower_edges)
        for peace_values, scores in zip(x_list, y_list)
    ]

    plt.figure(figsize=(30, 20), dpi=100)
    for idx, means in enumerate(binned_means):
        plt.plot(
            bin_labels,
            means,
            # The legend shows the year itself (previously it always read "i").
            label='data of year ' + years[idx],
            marker=markers[idx],
            color=colors[idx],
        )
    plt.xlabel('Peacefulness Index')
    plt.ylabel('Happiness Score')
    plt.xticks(fontsize=8, horizontalalignment='center', alpha=.7)
    plt.yticks(fontsize=12, alpha=.7)
    plt.grid(axis='both', alpha=.3)
    plt.legend()
    plt.gca().invert_xaxis()
    plt.show()
def box_plot_level_ph(p_h_list: list):
    """
    Draw one box plot per year (2015, 2016, 2017) of the happiness score, grouped by
    Peacefulness Index category.

    This is the last step of the analysis of a level pair: it checks whether the result
    holds for all kinds of countries by showing, for every peace index category, the
    distribution of the happiness score, so that outliers or highly skewed categories
    become visible. Because the number of countries is limited, some categories can be
    highly skewed, contain outliers, or hold only one or two data points.

    The peace index is grouped into the left-closed ranges [1,1.25), [1.25,1.5), ...,
    [2.75,3), [3,3.5) and 3.5+. Rows whose peace index is NaN or below 1 get no category
    and therefore do not appear in the plot. The plots are placed on a 2x2 grid, so the
    fourth axes stays empty. The figure is not shown by this function.

    Requirement:
    import seaborn as sns

    :param p_h_list: The list of merged dataframes (at least 3 entries); entry k must hold exactly the
                     columns Country, Happiness Score and pi_<2015+k>, in that order
    :return: None. The box plots are drawn on a newly created figure.

    >>> peace_data = Data().get_peace()
    >>> happiness = Data().get_happiness()
    >>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
    >>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
    >>> x_list_p_h, y_list_p_h, p_h_list = analysis_peace_happiness_level(peace_data, happiness)
    >>> print(type(box_plot_level_ph(p_h_list)))
    <class 'NoneType'>
    """
    score_col = 'Happiness Score'
    category_col = 'Catergory of Peace Index'
    peace_cols = ['pi_2015', 'pi_2016', 'pi_2017']
    # Category boundaries; the trailing infinity makes the last category open-ended.
    bin_edges = [1, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.5, math.inf]
    bin_labels = ['[1,1.25)', '[1.25,1.5)', '[1.5,1.75)', '[1.75,2)',
                  '[2,2.25)', '[2.25,2.5)', '[2.5,2.75)', '[2.75,3)', '[3,3.5)', '3.5+']

    fig, axes = plt.subplots(2, 2, figsize=(50, 40))
    for idx, peace_col in enumerate(peace_cols):
        data = p_h_list[idx]
        # pd.cut keeps the row index, so every category stays aligned with its row even
        # when a value is NaN or below the first boundary (both become NaN).
        categories = pd.cut(data[peace_col], bins=bin_edges, right=False, labels=bin_labels)

        new_data = data.copy()
        new_data[category_col] = categories.astype(object)
        new_data.columns = ['Country', score_col, peace_col, category_col]

        axis = axes[idx // 2, idx % 2]
        # whis=(0, 100) stretches the whiskers to the min and max of the data; it replaces
        # the string "range", which newer matplotlib versions no longer accept.
        sns.boxplot(x=score_col, y=category_col, data=new_data, whis=(0, 100), palette="vlag",
                    ax=axis, order=bin_labels)
        # Title with the year taken from the peace index column name.
        axis.set_title('Box Plot for Data of year ' + peace_col[-4:])
    fig.subplots_adjust(wspace=.4)
## Part 3: Analysis of level 3-4, Belonging-Esteem by World Happiness and freedom dataset
def analysis_happiness_Freedom_level(happiness: pd.DataFrame, df_free: pd.DataFrame) -> (list, list, list):
    """
    Prepare the data for the third/fourth level analysis (Belonging vs. Esteem) using
    the world happiness score as the lower level and the human freedom score as the
    higher level.

    For each of the two years 2015 and 2016:
      1. take the 'Country' and 'Happiness Score' columns from ``happiness['<year>.csv']``,
      2. keep the rows of ``df_free`` whose 'Year' equals that year,
      3. inner-merge both on 'Country'.
    Each merged frame is then sorted ascending by 'Happiness Score', and every position
    where either the happiness score or the freedom score is NaN is removed from the
    extracted arrays (the merged frames themselves are returned unfiltered).

    :param happiness: World Happiness data, indexed by '2015.csv' and '2016.csv'; each entry must
                      provide the columns 'Country' and 'Happiness Score'
    :param df_free: World Freedom Index Dataframe with the columns 'Year', 'Country' and 'Human_Freedom_Score'
    :return: three lists with one entry per year: the sorted lower level arrays, the matching
             higher level arrays and the merged dataframes

    >>> happiness = Data().get_happiness()
    >>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
    >>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
    >>> freedom = Data().get_freedom()
    >>> df_free_data = pd.concat([freedom['year'], freedom['countries'], freedom['hf_score']], axis=1)
    >>> df_free_data.columns = ['Year', 'Country', 'Human_Freedom_Score']
    >>> x_list_h_f, y_list_h_f, h_f_list = analysis_happiness_Freedom_level(happiness, df_free_data)
    >>> type(x_list_h_f)
    <class 'list'>
    """
    years = (2015, 2016)

    # Step 1: one merged (happiness x freedom) frame per year.
    h_f_list = []
    for year in years:
        scores = happiness[str(year) + '.csv'][['Country', 'Happiness Score']]
        freedom_year = df_free[df_free['Year'] == year]
        h_f_list.append(pd.merge(scores, freedom_year, on='Country', how='inner'))

    # Step 2: sort by the happiness score and strip positions holding NaN on either side.
    x_list, y_list = [], []
    for merged in h_f_list:
        ordered = merged.sort_values(by=['Happiness Score'], ascending=True)
        lower = np.asarray(ordered['Happiness Score'])
        higher = np.asarray(ordered['Human_Freedom_Score'])
        nan_positions = [
            pos
            for pos, (low_value, high_value) in enumerate(zip(lower, higher))
            if math.isnan(low_value) or math.isnan(high_value)
        ]
        x_list.append(np.delete(lower, nan_positions))
        y_list.append(np.delete(higher, nan_positions))

    return x_list, y_list, h_f_list
def plot_level_h_f(x_list: list, y_list: list):
"""
This function is to use the two lists of lower and higher level data we get before and plot a figure about their relationship
for all the countries in these two datasets. The figure should contain all the countries as corresponding data points.
So the trend should be harder to see.
:param x_list: List of the lower level sorted data
:param y_list: List of the higher level corresponding data
:return: A plt plot shows how the brief trend look like for all countries.
>>> happiness = Data().get_happiness()
>>> happiness['2017.csv'] = happiness['2017.csv'][['Country', 'Happiness.Score']]
>>> happiness['2017.csv'].columns = ['Country', 'Happiness Score']
>>> freedom = Data().get_freedom()
>>> df_free_data = pd.concat([freedom['year'], freedom['countries'], freedom['hf_score']], axis=1)
>>> df_free_data.columns = ['Year', 'Country', 'Human_Freedom_Score']
>>> x_list_h_f, y_list_h_f, h_f_list = analysis_happiness_Freedom_level(happiness, df_free_data)
>>> print(type(plot_level_h_f(x_list_h_f, y_list_h_f)))
<class 'NoneType'>