World Happiness Data

The Cloistered Monkey

2023-12-14 14:58

Imports

# python
from functools import partial
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv
from expects import be_true, expect
from tabulate import tabulate

import pandas

# tabulate preset for org-mode tables: headers come from the data's keys
# unless overridden and the index column is left out
TABLE = partial(tabulate,
                headers="keys",
                tablefmt="orgtbl",
                showindex=False)

load_dotenv(override=True)

# os.environ[...] raises a KeyError naming the missing variable, whereas
# Path(os.getenv(...)) fails with an opaque TypeError when it is unset
kaggle_path = Path(os.environ["KAGGLE_WORLD_HAPPINESS"])
figure_path = Path(os.environ["WORLD_HAPPINESS_FIGURE"])
table_path = Path(os.environ["WORLD_HAPPINESS_TABLE"])

print(kaggle_path)
print(figure_path)
print(table_path)

# stop early if any of the data files is missing
expect(kaggle_path.is_file()).to(be_true)
expect(table_path.is_file()).to(be_true)
expect(figure_path.is_file()).to(be_true)

/home/bravo/data/datasets/kaggle/world-happiness-report/WHR_2016.csv
/home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-figure-2.1.csv
/home/bravo/data/datasets/world-happiness-data/world-happiness-report-2023-data-for-table-2.1.csv

# read each CSV file into its own DataFrame
kaggle = pandas.read_csv(filepath_or_buffer=kaggle_path)
figure = pandas.read_csv(filepath_or_buffer=figure_path)
table = pandas.read_csv(filepath_or_buffer=table_path)

# (rows, columns) for each data set, one per line
print(kaggle.shape, figure.shape, table.shape, sep="\n")

(157, 13)
(137, 19)
(2199, 11)

def column_printer(table, headers=("Column", "Type")):
    """Print a table of the column names and their data types

    Args:

     - table: the data whose columns to list
     - headers: labels for the two columns of the printed table
    """
    rows = ((name, str(table[name].dtype)) for name in table.columns)
    print(TABLE(rows, headers=headers))

# list the columns and dtypes of the kaggle data
column_printer(kaggle)

Column	Type
Country	object
Region	object
Happiness Rank	int64
Happiness Score	float64
Lower Confidence Interval	float64
Upper Confidence Interval	float64
Economy (GDP per Capita)	float64
Family	float64
Health (Life Expectancy)	float64
Freedom	float64
Trust (Government Corruption)	float64
Generosity	float64
Dystopia Residual	float64

# list the columns and dtypes of the figure data
column_printer(figure)

Column	Type
Country name	object
Ladder score	float64
Standard error of ladder score	float64
upperwhisker	float64
lowerwhisker	float64
Logged GDP per capita	float64
Social support	float64
Healthy life expectancy	float64
Freedom to make life choices	float64
Generosity	float64
Perceptions of corruption	float64
Ladder score in Dystopia	float64
Explained by: Log GDP per capita	float64
Explained by: Social support	float64
Explained by: Healthy life expectancy	float64
Explained by: Freedom to make life choices	float64
Explained by: Generosity	float64
Explained by: Perceptions of corruption	float64
Dystopia + residual	float64

# list the columns and dtypes of the table data
column_printer(table)

Column	Type
Country name	object
year	int64
Life Ladder	float64
Log GDP per capita	float64
Social support	float64
Healthy life expectancy at birth	float64
Freedom to make life choices	float64
Generosity	float64
Perceptions of corruption	float64
Positive affect	float64
Negative affect	float64

It's hard to say exactly, but it looks like Region and Happiness Rank were added by whoever created the kaggle dataset, and it isn't clear what the Family column ties into. The only column in the UN data not matched is Social support, but that doesn't seem to have the right value range:

# compare the largest values of the two candidate columns
print(kaggle["Family"].max(), figure["Social support"].max(), sep="\n")

1.18326
0.983

We're not going to use Family anyway, so I'll just ignore it.

Country

# name of the country column in the World Happiness Report table and figure data
COUNTRY = "Country name"

# alphabetically first country in each data set
print(kaggle["Country"].min(),
      table[COUNTRY].min(),
      figure[COUNTRY].min(),
      sep="\n")

Afghanistan
Afghanistan
Afghanistan

# number of distinct countries in each data set
print(*(len(names.unique())
        for names in (kaggle.Country, table[COUNTRY], figure[COUNTRY])),
      sep="\n")

157
165
137

The figure data has quite a few fewer countries than the table data. I was planning to merge them, so I'll have to figure out why those twenty-eight countries are missing.

The Table vs Kaggle

k_countries = set(kaggle["Country"])
t_countries = set(table[COUNTRY])

# countries only in the table data, then countries only in the kaggle data
print(sorted(t_countries.difference(k_countries)))
print()
print(sorted(k_countries.difference(t_countries)))

['Central African Republic', 'Cuba', 'Czechia', 'Djibouti', 'Eswatini', 'Gambia', 'Guyana', 'Hong Kong S.A.R. of China', 'Lesotho', 'Maldives', 'Mozambique', 'North Macedonia', 'Oman', 'Somaliland region', 'State of Palestine', 'Taiwan Province of China', 'Turkiye']

['Czech Republic', 'Hong Kong', 'Macedonia', 'North Cyprus', 'Palestinian Territories', 'Puerto Rico', 'Somaliland Region', 'Taiwan', 'Turkey']

Kaggle	World Happiness Report	Wikipedia
- Missing -	Central African Republic	Central African Republic
- Missing -	Cuba	Cuba
Czech Republic	Czechia	Czech Republic
- Missing -	Djibouti	Djibouti
- Missing -	Eswatini	Eswatini
- Missing -	Gambia	The Gambia
- Missing -	Guyana	Guyana
Hong Kong	Hong Kong S.A.R. of China	Hong Kong
- Missing -	Lesotho	Lesotho
Macedonia	North Macedonia	North Macedonia
- Missing -	Maldives	Maldives
- Missing -	Mozambique	Mozambique
North Cyprus	- Only recognized by Turkey -	Northern Cyprus
- Missing -	Oman	Oman
Palestinian Territories	State of Palestine	State of Palestine, Palestinian Territories
Puerto Rico	- Territory of U.S. -	Puerto Rico
Somaliland Region	Somaliland region
Taiwan	Taiwan Province of China	Taiwan
Turkey	Turkiye	Turkey

I'm more interested in the World Happiness Report so I'll conform Kaggle's country names to match that and ignore the countries that it's missing.

def rename_country(names: dict, data: pandas.DataFrame,
                   country_column: str=COUNTRY) -> pandas.DataFrame:
    """Rename the country column and the countries in the kaggle data

    Args:

     - names: dict mapping kaggle names to names you want
     - data: the kaggle happiness data to rename countries
     - country_column: name to use for the country column

    Returns:
     new frame with the "Country" column renamed and the countries replaced
    """
    # the original body read an undefined ``kaggle_data``; use the ``data``
    # argument. ``rename`` returns a new frame so ``data`` isn't altered.
    renamed = data.rename(columns={"Country": country_column})
    renamed[country_column] = renamed[country_column].replace(names)
    return renamed

# kaggle spellings mapped to the World Happiness Report spellings
kaggle_to_world = {
    "Czech Republic": "Czechia",
    "Macedonia": "North Macedonia",
    "Palestinian Territories": "State of Palestine",
    "Turkey": "Turkiye",
}

# new frame with the renamed countries; kaggle itself is left alone
kaggled = kaggle.assign(Country=kaggle["Country"].replace(kaggle_to_world))

# what still doesn't match, in each direction
print(set(kaggled["Country"]).difference(set(table[COUNTRY])))
print(set(table[COUNTRY]).difference(set(kaggled["Country"])))

{'Somaliland Region', 'North Cyprus', 'Taiwan', 'Hong Kong', 'Puerto Rico'}
{'Hong Kong S.A.R. of China', 'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Somaliland region', 'Gambia', 'Taiwan Province of China', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}

# World Happiness Report spellings mapped to the shorter kaggle spellings
world_to_kaggle = {
    "Hong Kong S.A.R. of China": "Hong Kong",
    "Somaliland region": "Somaliland Region",
    "Taiwan Province of China": "Taiwan",
}

tabled = table.rename(columns={COUNTRY: "Country"})
tabled["Country"] = tabled["Country"].replace(world_to_kaggle)

# what still doesn't match, in each direction
print(set(kaggled["Country"]).difference(set(tabled["Country"])))
print(set(tabled["Country"]).difference(set(kaggled["Country"])))

{'North Cyprus', 'Puerto Rico'}
{'Central African Republic', 'Djibouti', 'Oman', 'Lesotho', 'Mozambique', 'Gambia', 'Eswatini', 'Guyana', 'Cuba', 'Maldives'}

Figure Data

figured = figure.rename(columns={COUNTRY: "Country"})
figured["Country"] = figured["Country"].replace(world_to_kaggle)

# figure countries missing from the kaggle data, then from the table data
print(set(figured["Country"]).difference(set(kaggled["Country"])))
print()
print(set(figured["Country"]).difference(set(tabled["Country"])))

{'Mozambique', 'Gambia'}

set()

The Figure Countries

f_countries = set(figure[COUNTRY])
f_only = f_countries - t_countries

# kaggled keeps kaggle's "Country" column, so kaggled[COUNTRY] raises a
# KeyError. Map the names that world_to_kaggle shortens back to their report
# spelling so they compare equal to the un-renamed figure data.
kaggle_to_report = {kaggle_name: report_name
                    for report_name, kaggle_name in world_to_kaggle.items()}
kd_countries = set(kaggled["Country"].replace(kaggle_to_report))

print(sorted(f_only - kd_countries))
print()
print(sorted(kd_countries - f_countries))

[]

['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Haiti', 'Kuwait', 'Libya', 'North Cyprus', 'Puerto Rico', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']

The figure data has twenty fewer countries than the Kaggle data so it's not surprising that there's a lot left over. The first list is empty, but only because every country in the figure data is also in the table data; as the earlier comparison showed, Mozambique and Gambia are in the figure data but not in Kaggle's.

# countries in the table data that the figure data leaves out
print(sorted(t_countries.difference(f_countries)))

['Angola', 'Azerbaijan', 'Belarus', 'Belize', 'Bhutan', 'Burundi', 'Central African Republic', 'Cuba', 'Djibouti', 'Eswatini', 'Guyana', 'Haiti', 'Kuwait', 'Lesotho', 'Libya', 'Maldives', 'Oman', 'Qatar', 'Rwanda', 'Somalia', 'Somaliland region', 'South Sudan', 'Sudan', 'Suriname', 'Syria', 'Trinidad and Tobago', 'Turkmenistan', 'Yemen']