The Geopandas DataFrame Coordinates Example

Beginning

Imports

# python
from functools import partial

# pypi
import geopandas
import hvplot.pandas
import matplotlib.pyplot as pyplot
import pandas

# my stuff
from graeae import EmbedHoloviews

Set Up

Embed = partial(EmbedHoloviews, folder_path="files/posts/the-geopandas-dataframe-coordinates-example")

Middle

Longitudes and Latitudes

# Five South American capitals with their coordinates.
cities = dict(
    City=['Buenos Aires', 'Brasilia', 'Santiago', 'Bogota', 'Caracas'],
    Country=['Argentina', 'Brazil', 'Chile', 'Colombia', 'Venezuela'],
    Latitude=[-34.58, -15.78, -33.45, 4.60, 10.48],
    Longitude=[-58.66, -47.91, -70.66, -74.08, -66.86],
)
data = pandas.DataFrame(cities)
# points_from_xy takes x (longitude) first, then y (latitude).
points = geopandas.points_from_xy(data.Longitude, data.Latitude)
geoframe = geopandas.GeoDataFrame(data, geometry=points)
print(geoframe.head())
           City    Country  Latitude  Longitude                     geometry
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)
1      Brasilia     Brazil    -15.78     -47.91  POINT (-47.91000 -15.78000)
2      Santiago      Chile    -33.45     -70.66  POINT (-70.66000 -33.45000)
3        Bogota   Colombia      4.60     -74.08    POINT (-74.08000 4.60000)
4       Caracas  Venezuela     10.48     -66.86   POINT (-66.86000 10.48000)

Plot It

# Load the low-resolution country outlines that ship with geopandas.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# Use only the South American countries as the base layer.
south_america = world[world.continent == 'South America']
axe = south_america.plot(color='white', edgecolor='black')

# Draw the city points on the same axes.
geoframe.plot(ax=axe, color='red')

pyplot.savefig("files/posts/the-geopandas-dataframe-coordinates-example/south_america.png")

nil

With Holoviews

Since this is going to be interactive we can add more to the plot.

print(world.head())
     pop_est      continent                      name iso_a3  gdp_md_est  \
0     920938        Oceania                      Fiji    FJI      8374.0   
1   53950935         Africa                  Tanzania    TZA    150600.0   
2     603253         Africa                 W. Sahara    ESH       906.5   
3   35623680  North America                    Canada    CAN   1674000.0   
4  326625791  North America  United States of America    USA  18560000.0   

                                            geometry  
0  MULTIPOLYGON (((180.00000 -16.06713, 180.00000...  
1  POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...  
2  POLYGON ((-8.66559 27.65643, -8.66512 27.58948...  
3  MULTIPOLYGON (((-122.84000 49.00000, -122.9742...  
4  MULTIPOLYGON (((-122.84000 49.00000, -120.0000...  

Let's add the population estimate to the countries and the city to the points.

# Base layer: South American country polygons, with population estimate and
# country name available on hover.
south_america = world[world.continent == "South America"]
world_plot = south_america.hvplot(
    x="Longitude",
    y="Latitude",
    color="white",
    legend=False,
    hover_cols=["pop_est", "name"])

# NOTE: the overlay uses the pandas.DataFrame, not the GeoDataFrame
city_points = data.hvplot.points("Longitude", "Latitude", hover_cols=["City"])

overlaid = world_plot * city_points
plot = overlaid.opts(title="South American Cities", width=700, height=890)

outcome = Embed(plot=plot, file_name="south_america", height_in_pixels=1000)()
print(outcome)

Figure Missing

This was mostly a check to see if I could get it working so I'll have to think about adding more explanations later.

Notes

  • To get matplotlib to work I had to install descartes (pip install descartes)
  • To get hvplot to work I had to install geoviews (pip install geoviews)

Links

Secrets of Mental Math

Citation

  • Benjamin A, Shermer M, Benjamin A, 3M Cloud Library. Secrets of mental math: the mathemagician’s guide to lightning calculation and amazing math tricks [Internet]. Place of publication not identified: Crown Publishing Group; 2006 [cited 2020 Nov 9].

Bibliography for Think Complexity

Citation

  1. Downey A. Think complexity: complexity science and computational modeling. Second edition. Beijing ; Boston: O’Reilly; 2018. 185 p.

Comments

This is a creative commons book that is for sale but is also available to read online or as a free PDF from Green Tea Press. There is also a github repository here.

CPR and Blood Thinners

Beginning

We want to test the hypothesis that blood thinners administered to patients who have been given CPR have some kind of effect (for good or for ill) on survival rates.

  • \(H_0\): Blood thinners don't have an effect on survival rate - the proportions will be the same for each group (\(p_{treatment} = p_{control}\)).
  • \(H_A\): Blood thinners impact survival, either positive or negative (\(p_{treatment} - p_{control} \neq 0\)).

We're going to use a significance level of 0.05.

ALPHA = 0.05

Set Up

Imports

from argparse import Namespace
from functools import partial

# pypi
from numpy.random import default_rng
from tabulate import tabulate

import holoviews
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews, Timer

Random Generator

numpy_random = default_rng()

The Timer

TIMER = Timer()

Plotting

SLUG = "cpr-and-blood-thinners"
path = f"files/posts/{SLUG}"
Embed = partial(EmbedHoloviews, folder_path=path)

# Shared plot settings for this post: figure dimensions, font scaling, and
# the color scheme (tan/blue/red plus a two-color cycle for grouped bars).
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)

The Table Printer

TABLE = partial(tabulate, headers="keys", tablefmt="orgtbl")

Middle

The Study Data

# Contingency table from the study: survival counts for the control group
# (no blood thinners) and the treatment group (blood thinners).
data = pandas.DataFrame.from_dict(
    dict(
        Survived=[11, 14],
        Died=[39, 26]))
data.index = ["Control", "Treatment"]
row = data.sum()
row.name = "Total"
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concat is the supported way to add the totals row.
data = pandas.concat([data, row.to_frame().T])
data["Total"] = data.sum(axis="columns")
print(TABLE(data))
  Survived Died Total
Control 11 39 50
Treatment 14 26 40
Total 25 65 90

The fifty people in the Control Group didn't receive blood thinners and the forty in the Treatment Group did.

Now I'll plot it to see what the differences look like.

# Drop the totals and reshape to long form (one row per group/outcome pair)
# so hvplot can group the bars.
plotter = (data
           .reset_index()
           .drop(columns=["Total"])
           .iloc[:-1]
           .melt(id_vars=["index"], value_vars=["Survived", "Died"])
           .rename(columns={"index": "Group",
                            "variable": "Outcome",
                            "value": "Count"}))

plot = plotter.hvplot.bar(x="Group", y="Count", by="Outcome").opts(
    title="Survived or Died",
    width=Plot.width,
    height=Plot.height,
    color=Plot.color_cycle,
    fontscale=Plot.fontscale,
)

outcome = Embed(plot=plot, file_name="survived_or_died")()
print(outcome)

Figure Missing

Hard to say, but it looks like more of the Treatment Group survived.

The Proportions

We are interested in the difference in survival rates so we need to calculate the proportion that survived in each group and take the difference.

# Survived / Total for each row of the contingency table.
survival_rates = (data["Survived"] / data.Total).reset_index()
survival_rates = survival_rates.rename(
    columns={"index": "Group", 0: "Survival Rate"})
print(TABLE(survival_rates, showindex=False))
Group Survival Rate
Control 0.22
Treatment 0.35
Total 0.277778
survival_rates = survival_rates.set_index("Group")
# Observed difference in survival rates (treatment minus control).
# Note: .loc on a one-column frame returns a Series, so TEST_STATISTIC is a
# one-element Series (hence the dtype line in the printed output), not a
# scalar — downstream code uses .values to compare against it.
TEST_STATISTIC = survival_rates.loc["Treatment"] - survival_rates.loc["Control"]
print(f"Difference: {TEST_STATISTIC}")
Difference: Survival Rate    0.13
dtype: float64

It appears that 13% more people survive when they are given blood thinners than when they aren't.

Simulation

Now I'll run a simulation to build up the null distribution of the differences.

Set Up The Urn

The urn will be based on the total row of the contingency table. The number that survived will be represented as ones and the number that died will be represented as zeros.

total = data.loc["Total"]
# Ones mark survivors, zeros mark deaths; one "ball" per patient.
survived = numpy.ones(total.Survived)
died = numpy.zeros(total.Died)
urn = numpy.concatenate([survived, died])

Now we'll run the simulation to get the distribution.

TRIALS = 10**6
CONTROL_GROUP = data.Total.loc["Control"]
TREATMENT_GROUP = data.Total.loc["Treatment"]
survived = data["Survived"]

with TIMER:
    # Drawing CONTROL_GROUP balls from the urn without replacement and
    # summing the ones (survivors) is exactly a hypergeometric draw, so
    # sample the survivor counts directly instead of shuffling the urn once
    # per trial — the same distribution, orders of magnitude faster.
    survivors_in_urn = int(urn.sum())
    deaths_in_urn = len(urn) - survivors_in_urn
    controls = numpy_random.hypergeometric(survivors_in_urn,
                                           deaths_in_urn,
                                           CONTROL_GROUP,
                                           TRIALS)
    # Survivors not drawn into the control group are in the treatment group.
    treatments = survived.loc["Total"] - controls

    control_proportions = controls/CONTROL_GROUP
    treatment_proportions = treatments/TREATMENT_GROUP

    differences = treatment_proportions - control_proportions
    differences = pandas.DataFrame.from_dict({"Point Estimate": differences})
2020-10-01 19:38:55,370 graeae.timers.timer start: Started: 2020-10-01 19:38:55.370038
2020-10-01 19:39:31,437 graeae.timers.timer end: Ended: 2020-10-01 19:39:31.437364
2020-10-01 19:39:31,438 graeae.timers.timer end: Elapsed: 0:00:36.067326

Plot the distribution.

# Histogram of the simulated null distribution of the difference.
histogram = differences.hvplot.hist("Point Estimate")
plot = histogram.opts(
    title="Null Distribution of Point Estimate",
    color=Plot.tan,
    fontscale=Plot.fontscale,
    height=Plot.height,
    width=Plot.width,
)
outcome = Embed(plot=plot, file_name="null_distribution")()
print(outcome)

Figure Missing

Now we want to try a two-sided hypothesis test.

# Count trials at least as extreme as the observed difference; double the
# tail for the two-sided test (the null distribution is symmetric).
trial_count = len(differences)
right_side = (differences >= TEST_STATISTIC.values).sum()
total = 2 * right_side
p_value = total / trial_count
print(f"One-Sided p-value: {(right_side/trial_count).values[0]}")
print(f"Two-sided p-value: {p_value.values[0]}")
One-Sided p-value: 0.129026
Two-sided p-value: 0.258052
print(f"We reject the Null Hypothesis: {p_value.values[0] < ALPHA}")
We reject the Null Hypothesis: False

Even a single-sided test wouldn't have provided enough evidence to reject the null hypothesis, but with the two-sided version it really looks like the difference was due to chance.

End

Although it initially looked like the use of blood thinners after someone has been given CPR helped survival rates our data does not contain evidence to say that it does.

Opportunity Cost Neglect

Citation

  • Frederick S, Novemsky N, Wang J, Dhar R, Nowlis S. Opportunity cost neglect. Journal of Consumer Research. 2009 Dec 1;36(4):553-61.

Abstract

And

To properly consider the opportunity costs of a purchase, consumers must actively generate the alternatives that it would displace.

But

The current research suggests that consumers often fail to do so. Even under conditions promoting cognitive effort, various cues to consider opportunity costs reduce purchase rates and increase the choice share of more affordable options. Sensitivity to such cues varies with chronic dispositional differences in spending attitudes.

Therefore

We discuss the implications of these results for the marketing strategies of economy and premium brands.

Opportunity Cost

Beginning

We want to answer the question of whether reminding college students that spending money now deprives them of money in the future will affect their spending behavior.

Imports

# python
from argparse import Namespace
from functools import partial

# pypi
from expects import (
    equal,
    expect,
)
from numpy.random import default_rng
from tabulate import tabulate

import holoviews
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews, Timer

Set Up

Randomness

numpy_random = default_rng()

Table

TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys")

Timer and Plotting

TIMER = Timer()

SLUG = "opportunity-cost"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/{SLUG}")

Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)

Middle

Hypotheses

  • \(H_0:\) Null Hypothesis. Reminding students that they can save money for future purchases won't impact their spending decisions.
  • \(H_A:\) Alternative Hypothesis. Reminding students that they can save money for future purchases will reduce their spending.

I'll use a 5% significance level.

ALPHA = "0.05"

The Study Data

150 Students were given the following statement:

Imagine that you have been saving some extra money on the side to make some purchases and on your most recent visit to the video store you come across a special sale on a new video. This video is one with your favorite actor or actress, and your favorite type of movie (such as comedy, drama, thriller, etc.). This particular video that you are considering is one you have been thinking about buying for a long time. It is available for a special sale price of $14.99.

What would you do in this situation? Please circle one of the options below.

Video store? Are we going to have to invent a time machine to run this study? No, we'll just swipe the data published by Frederick et al. in 2009. They conducted their study at Arizona State University, which perhaps still had some DVD stores around back then. The study was building on prior work that showed that people focus exclusively on explicitly presented details and ignore the facts that the explicit statements imply but don't state. (Also, I just looked it up and there's quite a few video stores around me (here in Portland) I guess I just haven't been to one in a while).

The options that the subjects were given to circle depended on which group they belonged to. The control group (75 students) were given these options:

A. Buy this entertaining video.
B. Not buy this entertaining video.

The treatment group (also 75 students) were given these options:

A. Buy this entertaining video.
B. Not buy this entertaining video. Keep the $14.99 for other purchases.

Here's the outcome of the study.

# Labels for the contingency table, kept in one place so the strings are
# never repeated inconsistently.
Outcome = Namespace(
    buy="Buy DVD",
    dont_buy="Not Buy DVD",
    control="Control Group",
    treatment="Treatment Group",
    total="Total"
)

# Study counts: buy/don't-buy decisions for the control and treatment groups.
data = pandas.DataFrame.from_dict({Outcome.buy:[56, 41],
                                   Outcome.dont_buy: [19, 34]})
data.index = [Outcome.control, Outcome.treatment]
row = data.sum()
row.name = Outcome.total
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concat is the supported way to add the totals row.
data = pandas.concat([data, row.to_frame().T])
data[Outcome.total] = data.sum(axis="columns")
print(TABLE(data))
  Buy DVD Not Buy DVD Total
Control Group 56 19 75
Treatment Group 41 34 75
Total 97 53 150
# Totals pulled out of the table once, for convenient reuse below.
Total = Namespace(
    buy=data.loc[Outcome.total, Outcome.buy],
    dont_buy=data.loc[Outcome.total, Outcome.dont_buy],
    group=data.loc[Outcome.control, Outcome.total],
    students=data.loc[Outcome.total, Outcome.total],
)

Now a little plotting.

# Drop the totals and reshape to long form (one row per group/outcome pair)
# so hvplot can group the bars.
plotter = (data
           .reset_index()
           .drop(columns=["Total"])
           .iloc[:-1]
           .melt(id_vars=["index"],
                 value_vars=[Outcome.buy, Outcome.dont_buy])
           .rename(columns={"index": "Group",
                            "variable": "Outcome",
                            "value": "Count"}))

plot = plotter.hvplot.bar(x="Group", y="Count", by="Outcome").opts(
    title="Buy Or Don't Buy DVD",
    width=Plot.width,
    height=Plot.height,
    color=Plot.color_cycle,
    fontscale=Plot.fontscale,
)

outcome = Embed(plot=plot, file_name="buy_dont_buy")()
print(outcome)

Figure Missing

It looks like there was a significant difference since not only did the majority of the treatment group opt not to buy the DVD, but in the control a significant majority did.

Now as row-proportions.

proportions = data.divide(data[Outcome.total], axis="rows")
print(TABLE(proportions))
  Buy DVD Not Buy DVD Total
Control Group 0.746667 0.253333 1
Treatment Group 0.546667 0.453333 1
Total 0.646667 0.353333 1

Looking at the proportions it looks like quite a bit more didn't buy the DVD, but let's run the experiment and see.

Point Estimate of Effect

The thing we are interested in is whether the wording of the second option to not buy the DVD made a difference, so our statistic of interest is the difference in proportions of the control and treatment group participants who didn't buy the DVD.

\begin{align} \hat{p}_{control} &= \frac{\textrm{Control Group that wouldn't buy}}{\textrm{size of Control Group}}\\ \hat{p}_{treatment} &= \frac{\textrm{Treatment Group that wouldn't buy}}{\textrm{size of Treament Group}}\\ \hat{p} &= \hat{p}_{treatment} - \hat{p}_{control} \end{align}
# Observed difference in "don't buy" rates: treatment minus control.
dont_buy_rates = proportions[Outcome.dont_buy]
POINT_ESTIMATE = (dont_buy_rates.loc[Outcome.treatment]
                  - dont_buy_rates.loc[Outcome.control])
print(f"{POINT_ESTIMATE * 100:.2f} %")
20.00 %

About twenty percent more of the treatment group said they would abstain from the purchase than the control group.

Simulating Random Chance

In a previous post looking at Gender Discrimination Inference I split the Urn into a 50-50 split of males and females to see if there was gender bias in choosing whether to promote them. In this case it's going to work a little differently.

We are asking here whether our treatment had an effect. If it didn't then we would expect that the distribution of "buy" and "don't buy" that we saw represents the distribution of the underlying population of ASU students, so we need to split our urn up to match the counts in the "Buy" and "Don't Buy" columns and then randomly split it into two equal groups. If the difference we saw was the result of random chance then we would expect that the difference between "buy" and "don't buy" group would be equal most of the time. This doesn't seem as intuitive a way to set it up as the previous method, but we'll see how it goes.

Set Up the Urn

buy = data.loc[Outcome.total][Outcome.buy]
dont = data.loc[Outcome.total][Outcome.dont_buy]

# Zeros are "buy" and ones are "don't buy", so URN.sum() counts refusals.
bought = numpy.zeros(buy)
didnt = numpy.ones(dont)

URN = numpy.append(bought, didnt)

# BUG FIX: the original referenced an undefined name STUDENTS (NameError);
# the total participant count lives in the Total namespace.
expect(len(URN)).to(equal(Total.students))
expect(buy).to(equal(97))
expect(dont).to(equal(53))
expect(URN.sum()).to(equal(dont))

Simulate

The Process:

  1. Shuffle the urn
  2. Split the shuffled urn in half (control and treatment)
  3. Count the proportion of each half that said they would not buy the DVD
  4. Record the differences between the proportions of the control that wouldn't buy and the treatment that wouldn't buy

But, since I'm doing this with numpy it works a little differently.

My process:

  1. Randomly choose half the urn (without replacement) to get the control group
  2. Sum the choices (this gives the count of the control that said they wouldn't buy)
  3. Subtract the previous sum from the total that said they wouldn't buy (to get the treatment count)
  4. Calculate the proportion of each group that said they wouldn't buy
  5. Find the difference in proportion for each group
TRIALS = 1 * 10**6

with TIMER:
    # Drawing a control-group-sized sample from the urn without replacement
    # and summing the ones ("don't buy" balls) is exactly a hypergeometric
    # draw, so sample the counts directly instead of shuffling the urn once
    # per trial — the same distribution, orders of magnitude faster.
    controls = numpy_random.hypergeometric(int(Total.dont_buy),
                                           int(Total.buy),
                                           int(Total.group),
                                           TRIALS)
    treatments = Total.dont_buy - controls

    control_proportions = controls/Total.group
    treatment_proportions = treatments/Total.group

    # Match the direction of POINT_ESTIMATE (treatment minus control); the
    # original subtracted the other way around, which is harmless under the
    # symmetric null but inconsistent with the observed statistic.
    differences = treatment_proportions - control_proportions
    simulation = pandas.DataFrame.from_dict({
        "Point Estimate": differences,
        "Control": control_proportions,
        "Treatment": treatment_proportions,
    })
2020-09-26 20:02:52,224 graeae.timers.timer start: Started: 2020-09-26 20:02:52.224515
2020-09-26 20:03:29,174 graeae.timers.timer end: Ended: 2020-09-26 20:03:29.174373
2020-09-26 20:03:29,175 graeae.timers.timer end: Elapsed: 0:00:36.949858

Plot the Distribution

# Histogram of the simulated differences under the null hypothesis.
histogram = simulation.hvplot.hist("Point Estimate", bins=25)
plot = histogram.opts(
    title="Opportunity Cost Don't Buy Difference",
    color=Plot.tan,
    height=Plot.height,
    width=Plot.width,
    )
outcome = Embed(plot=plot, file_name="opportunity_cost_simulation")()
print(outcome)

Figure Missing

The distribution appears to be reasonably normal, if a bit peaked in the middle.

# Fraction of simulated differences at least as large as the observed one
# (the mean of a boolean Series is exactly that fraction).
matched = (simulation["Point Estimate"] >= POINT_ESTIMATE).mean()
print(f"Percent of trials >= Point Estimate of Study: {100 * matched:0.3f} %")
Percent of trials >= Point Estimate of Study: 0.817 %

According to our simulation, less than one percent of the time we would see a difference like the study found by random chance alone.

Test Our Hypothesis

print(f"Null Hypothesis is {matched >= POINT_ESTIMATE}")
Null Hypothesis is False

So we'll reject the Null Hypothesis and conclude that the Alternative Hypothesis - that telling students about the opportunity cost of buying a DVD has an effect in getting them to not buy the DVD - is supported.

End

So here we have a walk through of Inference using simulation and an experiment with Control and Treatment groups. Although the conclusion reached is that reminding students of the money they would have in the future if they didn't spend it is "causal", since this was an experiment, I'm not 100% convinced that asking students what they would do reflects what they would actually do.

In the Abstract

  1. Create an urn where the ones and zeros are equal to the totals for each of the outcome columns
  2. Sample from the urn a simulated control group and treatment group (matching the sizes of the original groups)
  3. Find the difference between the proportion of the control group and the treatment group that match the "success" outcome
  4. Calculate the fraction of the differences that are equal to or greater than the differences in the study
  5. Check if the fraction in the simulation meets your confidence level

Gender Discrimination Inference

Beginning

Imports

# python
from argparse import Namespace
from functools import partial

# pypi
from expects import (
    equal,
    expect,    
)
from numpy.random import default_rng
from scipy import stats
from tabulate import tabulate

import holoviews
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews, Timer

Randomness

numpy_random = default_rng()

Plotting, Tables, and a Timer

Plotting

Embed = partial(EmbedHoloviews,
                folder_path="files/posts/gender-discrimination/")
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)

The Table

TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys")

The Timer

TIMER = Timer()

Middle

The Original Study

This starts with a study where bank managers were each given a personnel file and asked to decide if they would promote the person represented by the file to branch manager. The files were identical but half of them were filled out as male and half as female. The researchers wanted to know if these manager were biased in favor of men. There was actually two types of jobs, one described by the authors as "simple" and one as "complex". This only looks at the "simple" job. The table below shows their results.

# Promotion decisions by gender from the original study ("simple" job only).
study = pandas.DataFrame({"Promoted": [21, 14],
                          "Not Promoted":[3, 10]},
                         index=["Male", "Female"])
row = study.sum()
row.name = "Total"
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concat is the supported way to add the totals row.
study = pandas.concat([study, row.to_frame().T])
column = study.sum(axis="columns")
study["Total"] = column
print(TABLE(study))
  Promoted Not Promoted Total
Male 21 3 24
Female 14 10 24
Total 35 13 48

Let's look at the values (without the totals) as a plot.

# Long-form reshape (minus the totals) for grouped bars.
plotter = (study
           .reset_index()
           .drop(columns=["Total"])
           .iloc[:-1]
           .melt(id_vars=["index"],
                 value_vars=["Promoted", "Not Promoted"])
           .rename(columns={"index": "Gender",
                            "variable": "Decision",
                            "value": "Count"}))
plot = plotter.hvplot.bar(x="Gender", y="Count", by="Decision").opts(
    title="Outcome by Gender",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.fontscale,
    color=Plot.color_cycle,
)
outcome = Embed(plot=plot, file_name="promotions_by_gender")()
print(outcome)

Figure Missing

It looks like a considerable majority of the males were promoted whereas for the females only a slight majority were promoted.

# The same counts, grouped by decision instead of gender.
bars = plotter.hvplot.bar(x="Decision", y="Count", by="Gender")
plot = bars.opts(
    title="Male Vs Female by Decision",
    color=Plot.color_cycle,
    fontscale=Plot.fontscale,
    height=Plot.height,
    width=Plot.width,
)
outcome = Embed(plot=plot, file_name="decision_by_gender")()
print(outcome)

Figure Missing

This plot doesn't make the disparity look quite as extreme as the previous plot did, I think.

About the Variables

I started using them without explicitly stating it but we have two variables here - Promoted and Not Promoted are part of the decision variable and Male and Female are part of the gender variable.

Anyway… so more males were chosen for promotion than females, but by what proportion?

print(TABLE(study/study.loc["Total"]))
  Promoted Not Promoted Total
Male 0.6 0.230769 0.5
Female 0.4 0.769231 0.5
Total 1 1 1

So 60% of the promoted were male and 40% of the promoted were female.

print(f"{.6/.4:.2f}")
1.50

If you were promoted you were one and a half times more likely to be male. Another way to look at it is to ask - What proportion of each gender was promoted?

# What fraction of each gender was promoted: divide each row by that row's
# Total, as the opportunity-cost section does. The original divided by
# `study.Total.values`, which broadcasts the totals across the *columns* —
# it only produced the right Male and Female rows because both groups
# happen to have 24 members, and it filled the Total row with nonsense
# (e.g. 35/24 = 1.458).
fractions = study.divide(study["Total"], axis="rows")
print(TABLE(fractions))
  Promoted Not Promoted Total
Male 0.875 0.125 0.5
Female 0.583333 0.416667 0.5
Total 1.45833 0.541667 1

So about 88% of the males were promoted while only 58% of the females were promoted.

# Observed gap between the male and female promotion rates.
promoted = fractions.Promoted
POINT_ESTIMATE = promoted.loc['Male'] - promoted.loc['Female']
print(f"{POINT_ESTIMATE:0.2f}")
0.29

There was a 30% difference between the rate of male promotion and the rate of female promotion. The question we have now is - could this difference have happened by random chance or is this evidence of bias?.

The Experiment

We have a point estimate of the difference of 0.29 - is this evidence of bias?

Our Hypotheses

  • \(H_0\): Null Hypothesis. The variables gender and decision are independent and the observed difference was due to chance.
  • \(H_A\): Alternative Hypothesis. The variables gender and decision are not independent and the difference was not due to chance, but rather women were less likely to be promoted than men.

I'm going to pick an arbitrary significance level of 0.05 (a 95% confidence level).

ALPHA = 0.05

The Simulation

The basic method here is we'll create an "urn" with an equal number of "balls" for men and women (24 of each in this case) and then randomly select 35 balls representing the number that were promoted and see the difference in the fraction of males promoted vs the number of females. To make the math simple I'll run it a number of times that is a power of 10.

  • Some Setup

    To make the counting easier I'll set males to be 1 and females to be 0 (so the number of males promoted is the sum and the females is the remainder).

    # Encode males as 1 and females as 0 so that a draw's sum is the number
    # of males drawn and the remainder is the number of females.
    NUMBER_OF_MALES = NUMBER_OF_FEMALES = 24
    MALE = 1
    FEMALE = 0
    # 35 of the 48 files were marked "promote" in the study.
    PROMOTED = 35
    MALES = numpy.ones(NUMBER_OF_MALES)
    FEMALES = numpy.zeros(NUMBER_OF_FEMALES)
    URN = numpy.append(MALES, FEMALES)
    
    # Sanity checks: the urn holds everyone, and the ones count the males.
    expect(len(URN)).to(equal(NUMBER_OF_MALES + NUMBER_OF_FEMALES))
    expect(URN.sum()).to(equal(NUMBER_OF_MALES))
    
  • The Trials
    TRIALS = 10**7
    # Drawing PROMOTED balls from the urn without replacement and counting
    # the ones is exactly a hypergeometric draw, so sample the male counts
    # directly instead of shuffling the urn ten million times (the same
    # distribution, orders of magnitude faster), and do the proportion
    # arithmetic with vectorized numpy operations instead of Python lists.
    males_promoted = numpy_random.hypergeometric(NUMBER_OF_MALES,
                                                 NUMBER_OF_FEMALES,
                                                 PROMOTED,
                                                 TRIALS)
    females_promoted = PROMOTED - males_promoted
    proportion_males_promoted = males_promoted / NUMBER_OF_MALES
    proportion_females_promoted = females_promoted / NUMBER_OF_FEMALES
    differences = proportion_males_promoted - proportion_females_promoted
    

Looking at the Distribution

# Collect the per-trial counts and proportions into a single frame.
columns = {
    "Point Estimate": differences,
    "Males Promoted": males_promoted,
    "Females Promoted": females_promoted,
    "Proportion Males Promoted": proportion_males_promoted,
    "Proportion Females Promoted": proportion_females_promoted,
}
data = pandas.DataFrame.from_dict(columns)

histogram = data.hvplot.hist("Point Estimate")
plot = histogram.opts(
    title="Distribution of Differences in Gender Promotion",
    fontscale=Plot.fontscale,
    height=Plot.height,
    width=Plot.width,
)

outcome = Embed(plot=plot, file_name="difference_distribution")()
print(outcome)

Figure Missing

The distribution looks mostly normal. As you might guess the distribution is centered around 0 (cases where exactly the same number of males and females were promoted - although 35 were promoted each time so the trials are never exactly 0). The cases where the difference in proportion is as great or greater than as it was in the original study are in the rightmost two bins.

I should note that because the sample size is so small, there's a weird distribution of the points - there's not enough variation in the differences to make a smooth curve (thus the gaps in the histogram).

# Q-Q plot of the simulated differences against normal quantiles.
output = stats.probplot(data["Point Estimate"])
# probplot returns ((theoretical quantiles, ordered values),
#                   (slope, intercept, r)) for the least-squares fit.
theoretical, actual = output[0]
slope, intercept, r = output[1]

first = theoretical.min()
last = theoretical.max()

# Reference line through the probplot fit, drawn from end to end.
# NOTE(review): passing `color` to the Curve constructor rather than .opts()
# may not be accepted by all holoviews versions — confirm this renders.
line = holoviews.Curve([(first, slope * first + intercept), (last, slope * last + intercept)],
                       color=Plot.red)

qq_frame = pandas.DataFrame.from_dict({
    "Theoretical Quantiles": theoretical,
    "Differences In Hiring Rates": actual,
})

# datashade rasterizes the scatter so the ten-million-point plot stays usable.
scatter = qq_frame.hvplot.scatter(x="Theoretical Quantiles",
                                  y="Differences In Hiring Rates", datashade=True)
plot = (scatter * line).opts(
    title="Hiring Probability Plot",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="differences_qq")()
print(outcome)

Figure Missing

Hmm… I'm not sure I'm doing that right. But it looks reasonable, I suppose.

lowest = data["Point Estimate"].min()
data_sorted = data["Point Estimate"].sort_values()

# Index of the 95th percentile within the sorted differences.
max_loc = int(len(data_sorted) * 0.95)
highest = data_sorted.iloc[max_loc]

# NOTE(review): this prints (minimum, 95th percentile) — the lower bound is
# the smallest simulated value, not a percentile, so this is not a
# symmetric 95% interval. Confirm that's the intent.
print(f"Point Estimate: {POINT_ESTIMATE: 0.3f}")
print(f"({lowest: 0.3f} , {highest: 0.3f})")
Point Estimate:  0.292
(-0.542 ,  0.208)

Looking at the confidence interval we can see that the original point estimate is higher than 95 % of the values, but the original book example used a hypothesis test so let's finish it up.

A Hypothesis Test

But anyway, what proportion of our simulations had as much or more of a difference than that found in the study?

# Fraction of trials with a difference at least as large as the observed one.
as_extreme = data["Point Estimate"] >= POINT_ESTIMATE
proportion = as_extreme.sum()/TRIALS
print(f"Proportion: {100 * proportion:0.3f} %")
Proportion: 2.450 %
print(f"Probability of our study's difference in promotion between genders by chance alone: {proportion:0.2f}.")
print(f"Our tolerance was {ALPHA:0.2f}.")
Probability of our study's difference in promotion between genders by chance alone: 0.02.
Our tolerance was 0.05.

So we reject the null hypothesis and conclude that there is a statistically significant chance that the number of women promoted vs men in the original study was the result of bias.

End

Well, that was the replication (sort of) of this problem from Introductory Statistics with Randomization and Simulation. The point of it was to both review Hypothesis Testing and see how it can be done using simulation rather than a p-test.

In Abstract

  1. Is your problem that you suspect some kind of bias in outcomes for two groups?
  2. Get the Point Estimate for the value you want to test
  3. State the Null Hypothesis that what happened could happen by chance and the Alternative Hypothesis that there was bias involved.
  4. Decide on a tolerance level for the probability that it happened by chance.
  5. Set up your urn
  6. Simulate random selections for a large number of trials.
  7. Calculate the proportion of the trials that were greater than the original study's Point Estimate.
  8. Make a conclusion whether the original outcome could have happened by random chance or not.

Source

Oregon Covid-19 Tables

Beginning

Set Up

Imports

# python
from argparse import Namespace
from pathlib import Path
from functools import partial

import os

# pypi
from dotenv import load_dotenv

import holoviews
import hvplot.pandas
import pandas
import tabula

from graeae import EmbedHoloviews

Plotting

# Folder (relative to the site root) where EmbedHoloviews saves the
# rendered plot HTML for this post.
SLUG = "oregon-covid-19-tables"
OUTPUT_FOLDER = f"files/posts/{SLUG}"
Embed = partial(EmbedHoloviews, folder_path=OUTPUT_FOLDER)

# Shared plot settings: figure dimensions, named colors, a font scale,
# and a two-color cycle used for categorical bar charts.
_BLUE, _RED = "#4687b7", "#ce7b6d"
Plot = Namespace(
    width=990,
    height=780,
    tan="#ddb377",
    blue=_BLUE,
    red=_RED,
    font_scale=2,
    color_cycle=holoviews.Cycle([_BLUE, _RED]),
)

Middle

Loading Some Tables

# Filename template for Oregon's weekly COVID-19 PDF reports.
BASE = "COVID-19-Weekly-Report-{date}-FINAL.pdf"

# The report date to load; it is embedded in the PDF's filename.
DATE = "2020-05-27"
# Load environment variables (including OREGON_COVID) from a local .env file.
load_dotenv(Path("~/.local/share/env").expanduser())
# OREGON_COVID points at the folder holding the downloaded report PDFs.
path = Path(os.environ["OREGON_COVID"]).expanduser()
filename = path/BASE.format(date=DATE)
# Fail fast if the expected PDF hasn't been downloaded yet.
assert filename.is_file()
# tabula extracts every table it finds in the PDF, one DataFrame per table.
tables = tabula.read_pdf(filename, pages="all")
print(type(tables))
print(len(tables))
<class 'list'>
15

Okay, so it found 15 tables, which should have been loaded as pandas DataFrames. Since a list doesn't really provide an easy way to get metadata, maybe I'll just traverse the list and see what's there.

for table in tables:
    print(table.head(1))
    print("*" * 20)
    Sex  Cases % of total cases Cases per 10,000a  Deaths Case fatality (%)  \
0  Male  1,874            47.5%               8.7      85              4.5%   

   Hospitalized  
0           402  
********************
  Age group Cases % of total cases  Cases per 10,000a  Deaths  \
0       0-9    53             1.3%                1.1       0   

  Case fatality (%)  Hospitalized  
0                0%             6  
********************
  Unnamed: 0  Race  Cases % of total cases Cases per 10,000b  Deaths  \
0      White   NaN  1,946            49.3%               6.2   109.0   

  Case fatality (%)  Hospitalized  
0              5.6%         452.0  
********************
  Ethnicity Case count % of total cases Cases per 10,000a  Deaths  \
0  Hispanic      1,317            33.4%              23.7      14   

  Case fatality (%)  Hospitalized  
0              1.1%           178  
********************
                Facility name     County First reported  Total casesa  \
0  Healthcare at Foster Creek  Multnomah      3/24/2020         119.0   

   Total deathsa  
0           30.0  
********************
   ZIP code Number of cases Cases per 10,000
0     97002             1–9              n/a
********************
   97053 1–9  n/a
0  97054   0  0.0
********************
   97148   0  0.0
0  97201  13  8.4
********************
   97324   0   0.0
0  97325  10  15.1
********************
   97411    0  0.0
0  97415  1–9  n/a
********************
   97501 1–9  n/a
0  97502  13  4.7
********************
   97813  1–9  n/a
0  97814  1–9  n/a
********************
  Hospital                    Max Patients    Max Patients.1
0      NaN  (Both suspected and confirmed)  (Confirmed only)
********************
                               Hospital
0  Asante Rogue Regional Medical Center
********************
  Providence Newberg Medical Center
0       Providence Seaside Hospital
********************

Most of the tables appear to be either fragments or case-counts based on different categorizations.

# Column names used by the weekly-report tables, collected in one place
# so lookups use attribute access instead of repeated string literals.
_COLUMN_NAMES = dict(
    sex="Sex",
    cases="Cases",
    cases_per_10000="Cases per 10,000a",
    percent="% of total cases",
)
COLUMNS = Namespace(**_COLUMN_NAMES)

By Sex

tables_2 = dict(sex=tables[0])

print(tables_2["sex"])
             Sex  Cases % of total cases Cases per 10,000a  Deaths  \
0           Male  1,874            47.5%               8.7      85   
1         Female  2,071            52.4%               9.9      63   
2     Non-Binary      1               0%               n/a       0   
3  Not available      3             0.1%               n/a       0   
4          Total  3,949           100.0%               9.3     148   

  Case fatality (%)  Hospitalized  
0              4.5%           402  
1              3.0%           344  
2                0%             0  
3                0%             1  
4              3.7%           747  

One thing to notice is that there are "n/a", ",", and "%" strings in there that prevent some columns from being treated as numbers.

tables_2["sex"].info()
males_females = tables[0].iloc[:2]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sex                5 non-null      object
 1   Cases              5 non-null      object
 2   % of total cases   5 non-null      object
 3   Cases per 10,000a  5 non-null      object
 4   Deaths             5 non-null      int64 
 5   Case fatality (%)  5 non-null      object
 6   Hospitalized       5 non-null      int64 
dtypes: int64(2), object(5)
memory usage: 408.0+ bytes

So to use most of the columns you have to do something to convert them back to numbers. I'll start by looking at male and female cases adjusted by population.

def bar_plot(data: pandas.DataFrame, x_column: str, y_column: str,
             title: str, filename: str) -> str:
    """Render the data as a bar chart and embed it in the post

    Args:
     data: frame holding the values to chart
     x_column: column supplying the category labels for the x-axis
     y_column: column supplying each bar's height
     title: heading to display above the chart
     filename: base name to save the plot under

    Returns:
     html: the html to use to embed the plot into org
    """
    # Build the bars with the shared color cycle, then apply the common
    # sizing and font options from the Plot namespace.
    chart = data.hvplot.bar(x=x_column,
                            y=y_column,
                            color=Plot.color_cycle)
    chart = chart.opts(width=Plot.width,
                       height=Plot.height,
                       fontscale=Plot.font_scale,
                       title=title)
    return Embed(plot=chart, file_name=filename)()
gender_bar = partial(bar_plot, data=males_females, x_column=COLUMNS.sex)

Total Cases

First, make the counts integers.

males_females.loc[:, COLUMNS.cases] = males_females[COLUMNS.cases].str.replace(
    ",", "").astype(int)

Now a little bar plot.

outcome = gender_bar(y_column=COLUMNS.cases,
                     title="Cases by Gender",
                     filename="cases_by_gender")
print(outcome)

Figure Missing

There were slightly more female cases than male cases.

Case Percentages

males_females.loc[:, COLUMNS.percent] = males_females[COLUMNS.percent].str.replace(
    "%", "").astype(float)
outcome = gender_bar(y_column=COLUMNS.percent,
                     title="Case Percentage by Gender",
                     filename="percentage_by_gender")
print(outcome)

Figure Missing

Cases Adjusted For Population

males_females.loc[:, COLUMNS.cases_per_10000] = males_females[
    COLUMNS.cases_per_10000].astype(float)

# Built directly with hvplot (rather than through gender_bar) so this
# plot can set its options in a different order; same settings otherwise.
plot = males_females.hvplot.bar(x=COLUMNS.sex, y=COLUMNS.cases_per_10000,
                                color=Plot.color_cycle,
                                fontscale=Plot.font_scale).opts(
    title="Cases per 10,000 Population by Gender",
    width=Plot.width,
    height=Plot.height)

# Render and save the plot, returning the org-embeddable HTML.
outcome = Embed(plot=plot, file_name="cases_by_sex_and_population")()
print(outcome)

Figure Missing

End