World Happiness Data
Table of Contents
Imports
# python
from functools import partial
from pathlib import Path
import os
# pypi
from dotenv import load_dotenv
from expects import be_true, expect
from tabulate import tabulate
import pandas
# Table printer: tabulate pre-configured to emit org-mode ("orgtbl") tables
# that use the data's keys as headers and leave out the index column.
# A single partial is enough; wrapping it in a second one added nothing.
TABLE = partial(tabulate,
                headers="keys",
                tablefmt="orgtbl",
                showindex=False)
# Load the dataset locations from the .env file (override=True is passed so
# the file's values win over anything already set in the environment).
load_dotenv(override=True)

# Index os.environ directly instead of calling os.getenv: a missing variable
# then raises a KeyError naming the variable, rather than Path(None) failing
# with an opaque TypeError.
kaggle_path = Path(os.environ["KAGGLE_WORLD_HAPPINESS"])
figure_path = Path(os.environ["WORLD_HAPPINESS_FIGURE"])
table_path = Path(os.environ["WORLD_HAPPINESS_TABLE"])

print(kaggle_path)
print(figure_path)
print(table_path)

# Stop here if any of the paths does not point to an existing file.
expect(kaggle_path.is_file()).to(be_true)
expect(table_path.is_file()).to(be_true)
expect(figure_path.is_file()).to(be_true)
/home/bravo/data/datasets/kaggle/world-happiness-report/WHR_2016.csv /home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-figure-2.1.csv /home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-table-2.1.csv
# Read the three CSV files into data frames.
kaggle, figure, table = (
    pandas.read_csv(source)
    for source in (kaggle_path, figure_path, table_path)
)

# Show the shape of each frame, one per line.
print(kaggle.shape, figure.shape, table.shape, sep="\n")
(157, 13) (137, 19) (2199, 11)
def column_printer(table, headers=("Column", "Type")):
    """Print a table listing each column of a data frame with its dtype

    Args:
     - table: the data frame whose columns get listed
     - headers: labels for the name and dtype columns of the printed table
    """
    rows = ((name, str(table[name].dtype)) for name in table.columns)
    print(TABLE(rows, headers=headers))
column_printer(kaggle)
Column | Type |
---|---|
Country | object |
Region | object |
Happiness Rank | int64 |
Happiness Score | float64 |
Lower Confidence Interval | float64 |
Upper Confidence Interval | float64 |
Economy (GDP per Capita) | float64 |
Family | float64 |
Health (Life Expectancy) | float64 |
Freedom | float64 |
Trust (Government Corruption) | float64 |
Generosity | float64 |
Dystopia Residual | float64 |
column_printer(figure)
Column | Type |
---|---|
Country name | object |
Ladder score | float64 |
Standard error of ladder score | float64 |
upperwhisker | float64 |
lowerwhisker | float64 |
Logged GDP per capita | float64 |
Social support | float64 |
Healthy life expectancy | float64 |
Freedom to make life choices | float64 |
Generosity | float64 |
Perceptions of corruption | float64 |
Ladder score in Dystopia | float64 |
Explained by: Log GDP per capita | float64 |
Explained by: Social support | float64 |
Explained by: Healthy life expectancy | float64 |
Explained by: Freedom to make life choices | float64 |
Explained by: Generosity | float64 |
Explained by: Perceptions of corruption | float64 |
Dystopia + residual | float64 |
column_printer(table)
Column | Type |
---|---|
Country name | object |
year | int64 |
Life Ladder | float64 |
Log GDP per capita | float64 |
Social support | float64 |
Healthy life expectancy at birth | float64 |
Freedom to make life choices | float64 |
Generosity | float64 |
Perceptions of corruption | float64 |
Positive affect | float64 |
Negative affect | float64 |
It's hard to say exactly, but it looks like Region and Happiness Rank were added by whoever created the Kaggle
dataset, and it isn't clear what the Family column corresponds to. The only column in the UN data without a match is Social support, but it doesn't seem to have the same value range as Family:
# Largest value in each of the two candidate columns, for comparison.
print(kaggle["Family"].max())
print(figure["Social support"].max())
1.18326 0.983
We're not going to use Family anyway, so I'll just ignore it.
Country
# Name of the country column in the figure and table data.
COUNTRY = "Country name"

# Alphabetically first country in each data set.
print(kaggle["Country"].min())
print(table[COUNTRY].min())
print(figure[COUNTRY].min())
Afghanistan Afghanistan Afghanistan
# Number of distinct countries in each data set, one per line.
print(
    len(kaggle["Country"].unique()),
    len(table[COUNTRY].unique()),
    len(figure[COUNTRY].unique()),
    sep="\n",
)
157 165 137
The figure data has quite a few fewer entries than the table data. I was planning to merge them, so I'll have to figure out why those twenty-eight countries are missing.
The Table vs Kaggle
k_countries = set(kaggle["Country"])
t_countries = set(table[COUNTRY])

# Names found only in the table data, a blank line, then names found only
# in the Kaggle data.
print(sorted(t_countries.difference(k_countries)))
print()
print(sorted(k_countries.difference(t_countries)))
['Central African Republic', 'Cuba', 'Czechia', 'Djibouti', 'Eswatini', 'Gambia', 'Guyana', 'Hong Kong S.A.R. of China', 'Lesotho', 'Maldives', 'Mozambique', 'North Macedonia', 'Oman', 'Somaliland region', 'State of Palestine', 'Taiwan Province of China', 'Turkiye'] ['Czech Republic', 'Hong Kong', 'Macedonia', 'North Cyprus', 'Palestinian Territories', 'Puerto Rico', 'Somaliland Region', 'Taiwan', 'Turkey']
Kaggle | World Happiness Report | Wikipedia |
---|---|---|
- Missing - | Central African Republic | Central African Republic |
- Missing - | Cuba | Cuba |
Czech Republic | Czechia | Czech Republic |
- Missing - | Djibouti | Djibouti |
- Missing - | Eswatini | Eswatini |
- Missing - | Gambia | The Gambia |
- Missing - | Guyana | Guyana |
Hong Kong | Hong Kong S.A.R. of China | Hong Kong |
- Missing - | Lesotho | Lesotho |
Macedonia | North Macedonia | North Macedonia |
- Missing - | Maldives | Maldives |
- Missing - | Mozambique | Mozambique |
North Cyprus | - Only recognized by Turkey - | Northern Cyprus |
- Missing - | Oman | Oman |
Palestinian Territories | State of Palestine | State of Palestine, Palestinian Territories |
Puerto Rico | - Territory of U.S. - | Puerto Rico |
Somaliland Region | Somaliland region | |
Taiwan | Taiwan Province of China | Taiwan |
Turkey | Turkiye | Turkey |
I'm more interested in the World Happiness Report so I'll conform Kaggle's country names to match that and ignore the countries that it's missing.
def rename_country(names: dict, data: pandas.DataFrame,
                   country_column: str=COUNTRY) -> pandas.DataFrame:
    """Rename the country column and the countries in the kaggle data

    Args:
     - names: dict mapping kaggle names to names you want
     - data: the kaggle happiness data to rename countries
     - country_column: name to use for the country column

    Returns:
     new data frame with the ``Country`` column renamed and countries replaced
    """
    # The body previously read an undefined ``kaggle_data``; use the ``data``
    # parameter. ``rename`` hands back a new frame, so the column assignment
    # below is made on that frame rather than on the argument.
    renamed = data.rename(columns=dict(Country=country_column))
    renamed[country_column] = renamed[country_column].replace(names)
    return renamed
# Kaggle country names mapped to the spelling used in the table data.
kaggle_to_world = {
    "Czech Republic": "Czechia",
    "Macedonia": "North Macedonia",
    "Palestinian Territories": "State of Palestine",
    "Turkey": "Turkiye",
}

# Work on a copy so the loaded Kaggle frame keeps its original names.
kaggled = kaggle.copy()
kaggled["Country"] = kaggled["Country"].replace(kaggle_to_world)

# Names still unmatched in each direction after the renaming.
print(set(kaggled["Country"]) - set(table[COUNTRY]))
print(set(table[COUNTRY]) - set(kaggled["Country"]))
{'Somaliland Region', 'North Cyprus', 'Taiwan', 'Hong Kong', 'Puerto Rico'} {'Hong Kong S.A.R. of China', 'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Somaliland region', 'Gambia', 'Taiwan Province of China', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}
# Table-data country names mapped to the shorter Kaggle spelling.
world_to_kaggle = {
    "Hong Kong S.A.R. of China": "Hong Kong",
    "Somaliland region": "Somaliland Region",
    "Taiwan Province of China": "Taiwan",
}

# Give the table data the Kaggle column name, then the Kaggle spellings.
tabled = table.rename(columns={COUNTRY: "Country"})
tabled["Country"] = tabled["Country"].replace(world_to_kaggle)

# Names still unmatched in each direction.
print(set(kaggled["Country"]) - set(tabled["Country"]))
print(set(tabled["Country"]) - set(kaggled["Country"]))
{'North Cyprus', 'Puerto Rico'} {'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Gambia', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}
Figure Data
# Apply the same column rename and name replacements to the figure data.
figured = figure.rename(columns={COUNTRY: "Country"})
figured["Country"] = figured["Country"].replace(world_to_kaggle)

# Figure countries absent from the Kaggle data, a blank line, then figure
# countries absent from the table data.
print(set(figured["Country"]) - set(kaggled["Country"]))
print()
print(set(figured["Country"]) - set(tabled["Country"]))
{'Mozambique', 'Gambia'} set()
The Figure Countries
# Country names in the figure data, using the original "Country name" column.
f_countries = set(figure[COUNTRY])
# Figure countries that do not appear in the table data.
f_only = f_countries - t_countries
# NOTE(review): kaggled, as built earlier in this file from kaggle.copy(),
# keeps the column name "Country"; indexing it with COUNTRY ("Country name")
# looks like it would raise a KeyError unless kaggled was rebuilt with a
# renamed column (e.g. via rename_country) - confirm.
kd_countries = set(kaggled[COUNTRY])
# NOTE(review): f_only is already limited to figure countries missing from
# the table data, so this is not the full "figure minus Kaggle" difference -
# confirm that is the intended comparison.
print(sorted(f_only - kd_countries))
print()
# Kaggle countries that are not in the figure data.
print(sorted(kd_countries - f_countries))
[] ['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Haiti', 'Kuwait', 'Libya', 'North Cyprus', 'Puerto Rico', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']
The figure data has twenty fewer countries than the Kaggle data, so it's not surprising that there are a lot left over. It doesn't look like there are any in the figure data that Kaggle doesn't have, though, which is good.
# Table-data countries that are missing from the figure data.
print(sorted(t_countries.difference(f_countries)))
['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Central African Republic', 'Cuba', 'Djibouti', 'Eswatini', 'Guyana', 'Haiti', 'Kuwait', 'Lesotho', 'Libya', 'Maldives', 'Oman', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']
Links
- Kaggle World Happiness Data: The World Happiness Data. There are five separate CSV files available, one for each year from 2015 through 2019.
- World Happiness Report: The website for the World Happiness Report.