World Happiness Data

Imports

# python
from functools import partial
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv
from expects import be_true, expect
from tabulate import tabulate

import pandas
# Pre-configured ``tabulate`` call: org-mode table format, headers taken from
# the data's keys unless overridden, and no index column.
TABLE = partial(tabulate,
                headers="keys",
                tablefmt="orgtbl",
                showindex=False)
# Load the environment settings, then build the dataset paths from them.
load_dotenv(override=True)
kaggle_path, figure_path, table_path = (
    Path(os.getenv(variable))
    for variable in ("KAGGLE_WORLD_HAPPINESS",
                     "WORLD_HAPPINESS_FIGURE",
                     "WORLD_HAPPINESS_TABLE"))

print(kaggle_path, figure_path, table_path, sep="\n")

# Stop right away if any of the paths doesn't point to an existing file.
for required in (kaggle_path, table_path, figure_path):
    expect(required.is_file()).to(be_true)
/home/bravo/data/datasets/kaggle/world-happiness-report/WHR_2016.csv
/home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-figure-2.1.csv
/home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-table-2.1.csv
# Read the three CSV files into data frames.
kaggle, figure, table = (
    pandas.read_csv(source)
    for source in (kaggle_path, figure_path, table_path))

# Show the (rows, columns) shape of each frame.
print(kaggle.shape, figure.shape, table.shape, sep="\n")
(157, 13)
(137, 19)
(2199, 11)
def column_printer(table, headers=("Column", "Type")):
    """Print a table listing each column's name and dtype

    Args:

     - table: the data frame whose columns to list
     - headers: labels for the name and dtype columns of the printed table
    """
    rows = ((name, str(table[name].dtype)) for name in table.columns)
    print(TABLE(rows, headers=headers))
# Column names and dtypes of the kaggle data
column_printer(kaggle)
Column Type
Country object
Region object
Happiness Rank int64
Happiness Score float64
Lower Confidence Interval float64
Upper Confidence Interval float64
Economy (GDP per Capita) float64
Family float64
Health (Life Expectancy) float64
Freedom float64
Trust (Government Corruption) float64
Generosity float64
Dystopia Residual float64
# Column names and dtypes of the figure data
column_printer(figure)
Column Type
Country name object
Ladder score float64
Standard error of ladder score float64
upperwhisker float64
lowerwhisker float64
Logged GDP per capita float64
Social support float64
Healthy life expectancy float64
Freedom to make life choices float64
Generosity float64
Perceptions of corruption float64
Ladder score in Dystopia float64
Explained by: Log GDP per capita float64
Explained by: Social support float64
Explained by: Healthy life expectancy float64
Explained by: Freedom to make life choices float64
Explained by: Generosity float64
Explained by: Perceptions of corruption float64
Dystopia + residual float64
# Column names and dtypes of the table data
column_printer(table)
Column Type
Country name object
year int64
Life Ladder float64
Log GDP per capita float64
Social support float64
Healthy life expectancy at birth float64
Freedom to make life choices float64
Generosity float64
Perceptions of corruption float64
Positive affect float64
Negative affect float64

It's hard to say exactly, but it looks like Region and Happiness Rank were added by whoever created the Kaggle dataset, and it isn't clear what the Family column ties into. The only column in the UN data that isn't matched is Social support, but its values don't seem to cover the same range as Family:

# Largest Family value (kaggle) followed by the largest Social support value
# (figure), to compare their ranges
print(kaggle.Family.max())
print(figure["Social support"].max())
1.18326
0.983

We're not going to use Family anyway, so I'll just ignore it.

Country

# Name of the country column in the table and figure data.
COUNTRY = "Country name"

# Smallest (alphabetically first) country name in each data set.
print(kaggle.Country.min(),
      table[COUNTRY].min(),
      figure[COUNTRY].min(),
      sep="\n")
Afghanistan
Afghanistan
Afghanistan
# Number of distinct country names in each data set.
print(*(len(names.unique())
        for names in (kaggle.Country, table[COUNTRY], figure[COUNTRY])),
      sep="\n")
157
165
137

The figure data has quite a few fewer countries than the table data. I was planning to merge them, so I'll have to figure out why those twenty-eight countries are missing.

The Table vs Kaggle

# Country names from each source as sets so they can be diffed.
k_countries = set(kaggle["Country"])
t_countries = set(table[COUNTRY])

# Names only in the table, a blank line, then names only in kaggle.
print(sorted(t_countries.difference(k_countries)),
      "",
      sorted(k_countries.difference(t_countries)),
      sep="\n")
['Central African Republic', 'Cuba', 'Czechia', 'Djibouti', 'Eswatini', 'Gambia', 'Guyana', 'Hong Kong S.A.R. of China', 'Lesotho', 'Maldives', 'Mozambique', 'North Macedonia', 'Oman', 'Somaliland region', 'State of Palestine', 'Taiwan Province of China', 'Turkiye']

['Czech Republic', 'Hong Kong', 'Macedonia', 'North Cyprus', 'Palestinian Territories', 'Puerto Rico', 'Somaliland Region', 'Taiwan', 'Turkey']
| Kaggle                  | World Happiness Report        | Wikipedia                                   |
|-------------------------+-------------------------------+---------------------------------------------|
| - Missing -             | Central African Republic      | Central African Republic                    |
| - Missing -             | Cuba                          | Cuba                                        |
| Czech Republic          | Czechia                       | Czech Republic                              |
| - Missing -             | Djibouti                      | Djibouti                                    |
| - Missing -             | Eswatini                      | Eswatini                                    |
| - Missing -             | Gambia                        | The Gambia                                  |
| - Missing -             | Guyana                        | Guyana                                      |
| Hong Kong               | Hong Kong S.A.R. of China     | Hong Kong                                   |
| - Missing -             | Lesotho                       | Lesotho                                     |
| Macedonia               | North Macedonia               | North Macedonia                             |
| - Missing -             | Maldives                      | Maldives                                    |
| - Missing -             | Mozambique                    | Mozambique                                  |
| North Cyprus            | - Only recognized by Turkey - | Northern Cyprus                             |
| - Missing -             | Oman                          | Oman                                        |
| Palestinian Territories | State of Palestine            | State of Palestine, Palestinian Territories |
| Puerto Rico             | - Territory of U.S. -         | Puerto Rico                                 |
| Somaliland Region       | Somaliland region             |                                             |
| Taiwan                  | Taiwan Province of China      | Taiwan                                      |
| Turkey                  | Turkiye                       | Turkey                                      |

I'm more interested in the World Happiness Report so I'll conform Kaggle's country names to match that and ignore the countries that it's missing.

def rename_country(names: dict, data: pandas.DataFrame,
                   country_column: str=COUNTRY) -> pandas.DataFrame:
    """Rename the country column and the countries in the kaggle data

    Args:

     - names: dict mapping kaggle names to names you want
     - data: the kaggle happiness data to rename countries
     - country_column: name to use for the country column

    Returns:
     new frame with the ``Country`` column renamed and countries replaced
    """
    # the frame returned by ``rename`` is what gets updated, the ``data``
    # argument itself is never assigned to
    renamed = data.rename(columns=dict(Country=country_column))
    renamed[country_column] = renamed[country_column].replace(names)
    return renamed
# Kaggle spellings mapped to the spellings used in the table data.
kaggle_to_world = {
    "Czech Republic": "Czechia",
    "Macedonia": "North Macedonia",
    "Palestinian Territories": "State of Palestine",
    "Turkey": "Turkiye"
}

# New frame with the country names swapped; ``kaggle`` itself is untouched.
kaggled = kaggle.assign(Country=kaggle.Country.replace(kaggle_to_world))

# What's still only in kaggle, then what's still only in the table.
print(set(kaggled.Country).difference(table[COUNTRY]))
print(set(table[COUNTRY]).difference(kaggled.Country))
{'Somaliland Region', 'North Cyprus', 'Taiwan', 'Hong Kong', 'Puerto Rico'}
{'Hong Kong S.A.R. of China', 'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Somaliland region', 'Gambia', 'Taiwan Province of China', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}
# Table/figure spellings mapped to the shorter kaggle spellings.
world_to_kaggle = {"Hong Kong S.A.R. of China": "Hong Kong",
                   "Somaliland region": "Somaliland Region",
                   "Taiwan Province of China": "Taiwan"}

# Table data with the kaggle column name and the kaggle spellings.
tabled = (
    table
    .rename(columns={COUNTRY: "Country"})
    .assign(Country=lambda frame: frame.Country.replace(world_to_kaggle)))

# What's still only in kaggle, then what's still only in the table.
print(set(kaggled.Country).difference(tabled.Country))
print(set(tabled.Country).difference(kaggled.Country))
{'North Cyprus', 'Puerto Rico'}
{'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Gambia', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}

Figure Data

# Figure data with the kaggle column name and the kaggle spellings.
figured = (
    figure
    .rename(columns={COUNTRY: "Country"})
    .assign(Country=lambda frame: frame.Country.replace(world_to_kaggle)))

# Figure countries missing from kaggle, a blank line, then figure countries
# missing from the table.
print(set(figured.Country).difference(kaggled.Country),
      "",
      set(figured.Country).difference(tabled.Country),
      sep="\n")
{'Mozambique', 'Gambia'}

set()

The Figure Countries

f_countries = set(figure[COUNTRY])
# Countries that are in the figure data but not in the table data.
f_only = f_countries - t_countries
# ``kaggled`` was built from a copy of ``kaggle`` with only the values
# replaced, so its country column is still named "Country"; indexing it with
# COUNTRY ("Country name") raises a KeyError.
kd_countries = set(kaggled.Country)

# Figure-only countries that kaggle is also missing, a blank line, then the
# kaggle countries that aren't in the figure data.
print(sorted(f_only - kd_countries))
print()
print(sorted(kd_countries - f_countries))
[]

['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Haiti', 'Kuwait', 'Libya', 'North Cyprus', 'Puerto Rico', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']

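The figure data has twenty fewer countries than the Kaggle data, so it's not surprising that there are a lot left over. The empty first list only shows that no country is in the figure data without also being in the table data or the Kaggle data (f_only is the figure countries minus the table countries); as the earlier comparison showed, Mozambique and Gambia are in the figure data but not in the Kaggle data.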

# Countries that are in the table data but not in the figure data
print(sorted(t_countries - f_countries))
['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Central African Republic', 'Cuba', 'Djibouti', 'Eswatini', 'Guyana', 'Haiti', 'Kuwait', 'Lesotho', 'Libya', 'Maldives', 'Oman', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']

Links