State Zip Code GeoJSON Repository
Citation
- GitHub - OpenDataDE/State-zip-code-GeoJSON: Zip code boundaries for each of the 50 states [Internet]. [cited 2020 Nov 20]. Available from: https://github.com/OpenDataDE/State-zip-code-GeoJSON
# python
from functools import partial
# pypi
import geopandas
import hvplot.pandas
import matplotlib.pyplot as pyplot
import pandas
# my stuff
from graeae import EmbedHoloviews
# Pre-configure the embedder with this post's output folder.
Embed = partial(
    EmbedHoloviews,
    folder_path="files/posts/the-geopandas-dataframe-coordinates-example")

# Five South American capital cities with their coordinates.
cities = {
    'City': ['Buenos Aires', 'Brasilia', 'Santiago', 'Bogota', 'Caracas'],
    'Country': ['Argentina', 'Brazil', 'Chile', 'Colombia', 'Venezuela'],
    'Latitude': [-34.58, -15.78, -33.45, 4.60, 10.48],
    'Longitude': [-58.66, -47.91, -70.66, -74.08, -66.86],
}
data = pandas.DataFrame(cities)

# Attach a geometry column of Points built from the longitude/latitude pairs.
geoframe = geopandas.GeoDataFrame(
    data,
    geometry=geopandas.points_from_xy(data.Longitude, data.Latitude))
print(geoframe.head())
City Country Latitude Longitude geometry 0 Buenos Aires Argentina -34.58 -58.66 POINT (-58.66000 -34.58000) 1 Brasilia Brazil -15.78 -47.91 POINT (-47.91000 -15.78000) 2 Santiago Chile -33.45 -70.66 POINT (-70.66000 -33.45000) 3 Bogota Colombia 4.60 -74.08 POINT (-74.08000 4.60000) 4 Caracas Venezuela 10.48 -66.86 POINT (-66.86000 10.48000)
# read in the plotting data
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# restrict it to South America.
ax = world[world.continent == 'South America'].plot(
    color='white', edgecolor='black')
# We can now plot our ``GeoDataFrame`` on the same axes (red city markers).
geoframe.plot(ax=ax, color='red')
# save the matplotlib figure next to the post
pyplot.savefig("files/posts/the-geopandas-dataframe-coordinates-example/south_america.png")
Since this is going to be interactive we can add more to the plot.
print(world.head())
pop_est continent name iso_a3 gdp_md_est \ 0 920938 Oceania Fiji FJI 8374.0 1 53950935 Africa Tanzania TZA 150600.0 2 603253 Africa W. Sahara ESH 906.5 3 35623680 North America Canada CAN 1674000.0 4 326625791 North America United States of America USA 18560000.0 geometry 0 MULTIPOLYGON (((180.00000 -16.06713, 180.00000... 1 POLYGON ((33.90371 -0.95000, 34.07262 -1.05982... 2 POLYGON ((-8.66559 27.65643, -8.66512 27.58948... 3 MULTIPOLYGON (((-122.84000 49.00000, -122.9742... 4 MULTIPOLYGON (((-122.84000 49.00000, -120.0000...
Let's add the population estimate to the countries and the city to the points.
# interactive hvplot version of the map, with population/name on hover
world_plot = world[world.continent == "South America"].hvplot(
    color="white",
    x="Longitude",
    y="Latitude",
    legend=False,
    hover_cols=["pop_est", "name"])
# NOTE: the overlay uses the pandas.DataFrame, not the GeoDataFrame
points = data.hvplot.points("Longitude", "Latitude", hover_cols=["City"])
# overlay the city points on the country outlines
plot = (world_plot * points).opts(
    title="South American Cities",
    width=700,
    height=890,
)
outcome = Embed(plot=plot, file_name="south_america", height_in_pixels=1000)()
print(outcome)
This was mostly a check to see if I could get it working so I'll have to think about adding more explanations later.
pip install descartes
pip install geoviews
This is a creative commons book that is for sale but is also available to read online or as a free PDF from Green Tea Press. There is also a github repository here.
We want to test the hypothesis that blood thinners administered to patients that have been give CPR has some kind of effect (for good or for ill) on survival rates.
We're going to use a significance level of 0.05.
# significance level for the hypothesis test (5%)
ALPHA = 0.05
from argparse import Namespace
from functools import partial
# pypi
from numpy.random import default_rng
from tabulate import tabulate
import holoviews
import hvplot.pandas
import numpy
import pandas
# my stuff
from graeae import EmbedHoloviews, Timer
# random generator for the simulation draws
numpy_random = default_rng()
TIMER = Timer()
SLUG = "cpr-and-blood-thinners"
path = f"files/posts/{SLUG}"
Embed = partial(EmbedHoloviews, folder_path=path)
# shared plot settings for this section
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)
# render DataFrames as org-mode tables
TABLE = partial(tabulate, headers="keys", tablefmt="orgtbl")
# Contingency table from the study: counts of patients who survived/died in
# the control (no blood thinners) and treatment (blood thinners) groups.
data = pandas.DataFrame.from_dict(
    dict(
        Survived=[11, 14],
        Died=[39, 26]))
data.index = ["Control", "Treatment"]
# Column-totals row. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0; pandas.concat is the supported equivalent.
row = data.sum()
row.name = "Total"
data = pandas.concat([data, row.to_frame().T])
# row totals give the group sizes
data["Total"] = data.sum(axis="columns")
print(TABLE(data))  # render the contingency table as an org-mode table
Survived | Died | Total | |
---|---|---|---|
Control | 11 | 39 | 50 |
Treatment | 14 | 26 | 40 |
Total | 25 | 65 | 90 |
The fifty people in the Control Group didn't receive blood thinners and the forty in the Treatment Group did.
Now I'll plot it to see what the differences look like.
# put the groups into a column
plotter = data.reset_index()
# get rid of the totals
del(plotter["Total"])
plotter = plotter.iloc[:-1]
# long-form: one row per (group, outcome, count) for the grouped bar chart
plotter = plotter.melt(id_vars=["index"],
                       value_vars=["Survived", "Died"])
plotter = plotter.rename(columns=dict(
    index="Group",
    variable="Outcome",
    value="Count"))
plot = plotter.hvplot.bar(x="Group", y="Count", by="Outcome").opts(
    title="Survived or Died",
    width=Plot.width,
    height=Plot.height,
    color=Plot.color_cycle,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="survived_or_died")()
print(outcome)
Hard to say, but it looks like more of the Treatment Group survived.
We are interested in the difference in survival rates so we need to calculate the proportion that survived in each group and take the difference.
# survival rate per group (the Total row gives the overall rate)
survival_rates = (data["Survived"] / data.Total).reset_index().rename(
    columns={"index": "Group", 0: "Survival Rate"})
print(TABLE(survival_rates, showindex=False))
Group | Survival Rate |
---|---|
Control | 0.22 |
Treatment | 0.35 |
Total | 0.277778 |
survival_rates = survival_rates.set_index("Group")
# observed difference in survival rates (treatment minus control)
TEST_STATISTIC = survival_rates.loc["Treatment"] - survival_rates.loc["Control"]
print(f"Difference: {TEST_STATISTIC}")
Difference: Survival Rate 0.13 dtype: float64
It appears that 13% more people survive when they are given blood thinners than when they aren't.
Now I'll run a simulation to build up the null distribution of the differences.
The urn will be based on the total row on the contingency table. The that number survived will be represented as ones and the number that died will be represented as zeros.
# Build the urn from the totals row: each survivor is a 1 and each death a 0,
# so the sum of any draw counts the survivors in it.
total = data.loc["Total"]
survived = numpy.ones(total.Survived)
died = numpy.zeros(total.Died)
urn = numpy.append(survived, died)
Now we'll run the simulation to get the distribution.
# Null-distribution simulation: re-assign group labels at random by drawing a
# control group from the urn (without replacement) one million times.
TRIALS = 10**6
CONTROL_GROUP = data.Total.loc["Control"]
TREATMENT_GROUP = data.Total.loc["Treatment"]
survived = data["Survived"]
with TIMER:
    # each draw's sum = survivors randomly assigned to the control group
    controls = numpy.array([numpy_random.choice(urn,
                                                CONTROL_GROUP,
                                                replace=False).sum()
                            for trial in range(TRIALS)])
    # the remaining survivors fall in the treatment group
    treatments = survived.loc["Total"] - controls
    control_proportions = controls/CONTROL_GROUP
    treatment_proportions = treatments/TREATMENT_GROUP
    # simulated difference in survival rates (treatment - control)
    differences = treatment_proportions - control_proportions
differences = pandas.DataFrame.from_dict({"Point Estimate": differences})
2020-10-01 19:38:55,370 graeae.timers.timer start: Started: 2020-10-01 19:38:55.370038 2020-10-01 19:39:31,437 graeae.timers.timer end: Ended: 2020-10-01 19:39:31.437364 2020-10-01 19:39:31,438 graeae.timers.timer end: Elapsed: 0:00:36.067326
Plot the distribution.
# histogram of the simulated null distribution of the difference
plot = differences.hvplot.hist("Point Estimate").opts(
    title="Null Distribution of Point Estimate",
    width=Plot.width,
    height=Plot.height,
    color=Plot.tan,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="null_distribution")()
print(outcome)
Now we want to try a two-sided hypothesis test.
# Two-sided test: count the right tail and double it (the null distribution
# is roughly symmetric around zero).
trial_count = len(differences)
right_tail = (differences >= TEST_STATISTIC.values).sum()
p_value = (right_tail * 2) / trial_count
print(f"One-Sided p-value: {(right_tail/trial_count).values[0]}")
print(f"Two-sided p-value: {p_value.values[0]}")
One-Sided p-value: 0.129026 Two-sided p-value: 0.258052
# reject only if the two-sided p-value falls below the significance level
print(f"We reject the Null Hypothesis: {p_value.values[0] < ALPHA}")
We reject the Null Hypothesis: False
Even a single-sided test wouldn't have provided enough evidence to reject the null hypothesis, but with the two-sided version it looks really like the difference was from chance.
Although it initially looked like the use of blood thinners after someone has been given CPR helped survival rates our data does not contain evidence to say that it does.
To properly consider the opportunity costs of a purchase, consumers must actively generate the alternatives that it would displace.
The current research suggests that consumers often fail to do so. Even under conditions promoting cognitive effort, various cues to consider opportunity costs reduce purchase rates and increase the choice share of more affordable options. Sensitivity to such cues varies with chronic dispositional differences in spending attitudes.
We discuss the implications of these results for the marketing strategies of economy and premium brands.
We want to answer the question of whether reminding college students that spending money now deprives them of money in the future will affect their spending behavior.
# python
from argparse import Namespace
from functools import partial
# pypi
from expects import (
equal,
expect,
)
from numpy.random import default_rng
from tabulate import tabulate
import holoviews
import hvplot.pandas
import numpy
import pandas
# my stuff
from graeae import EmbedHoloviews, Timer
# random generator for the simulation draws
numpy_random = default_rng()
TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys")
TIMER = Timer()
SLUG = "opportunity-cost"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/{SLUG}")
# shared plot settings for this section
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)
I'll use a significance level of 0.05 (a 95% confidence level).
# Significance level for the hypothesis test. This must be a float, not the
# string "0.05", so that comparisons like `p_value < ALPHA` work correctly.
ALPHA = 0.05
150 Students were given the following statement:
Imagine that you have been saving some extra money on the side to make some purchases and on your most recent visit to the video store you come across a special sale on a new video. This video is one with your favorite actor or actress, and your favorite type of movie (such as comedy, drama, thriller, etc.). This particular video that you are considering is one you have been thinking about buying for a long time. It is available for a special sale price of $14.99.
What would you do in this situation? Please circle one of the options below.
Video store? Are we going to have to invent a time machine to run this study? No, we'll just swipe the data published by Frederick et al. in 2009. They conducted their study at Arizona State University, which perhaps still had some DVD stores around back then. The study was building on prior work that showed that people focus exclusively on explicitly presented details and ignore the facts that the explicit statements imply but don't state. (Also, I just looked it up and there's quite a few video stores around me (here in Portland) I guess I just haven't been to one in a while).
The options that the subjects were given to circle depended on which group they belonged to. The control group (75 students) were given these options:
A. Buy this entertaining video.
B. Not buy this entertaining video.
The treatment group (also 75 students) were given these options:
A. Buy this entertaining video.
B. Not buy this entertaining video. Keep the $14.99 for other purchases.
Here's the outcome of the study.
# Column/row labels used throughout this section.
Outcome = Namespace(
    buy="Buy DVD",
    dont_buy="Not Buy DVD",
    control="Control Group",
    treatment="Treatment Group",
    total="Total"
)
# Study counts: how many students in each group chose to buy / not buy.
data = pandas.DataFrame.from_dict({Outcome.buy: [56, 41],
                                   Outcome.dont_buy: [19, 34]})
data.index = [Outcome.control, Outcome.treatment]
# Column-totals row. DataFrame.append was removed in pandas 2.0; pandas.concat
# is the supported equivalent.
row = data.sum()
row.name = Outcome.total
data = pandas.concat([data, row.to_frame().T])
# row totals give the group sizes
data[Outcome.total] = data.sum(axis="columns")
print(TABLE(data))  # render the contingency table as an org-mode table
Buy DVD | Not Buy DVD | Total | |
---|---|---|---|
Control Group | 56 | 19 | 75 |
Treatment Group | 41 | 34 | 75 |
Total | 97 | 53 | 150 |
# Convenience totals pulled from the contingency table.
Total = Namespace(
    buy = data.loc[Outcome.total][Outcome.buy],
    dont_buy = data.loc[Outcome.total][Outcome.dont_buy],
    group = data.loc[Outcome.control][Outcome.total],  # both groups have 75 students
    students = data.loc[Outcome.total][Outcome.total],
)
Now a little plotting.
# put the groups into a column
plotter = data.reset_index()
# get rid of the totals
del(plotter["Total"])
plotter = plotter.iloc[:-1]
# move the outcome headers into a column
plotter = plotter.melt(id_vars=["index"],
                       value_vars=[Outcome.buy, Outcome.dont_buy])
plotter = plotter.rename(columns=dict(
    index="Group",
    variable="Outcome",
    value="Count"))
# grouped bar chart: buy / don't-buy counts per group
plot = plotter.hvplot.bar(x="Group", y="Count", by="Outcome").opts(
    title="Buy Or Don't Buy DVD",
    width=Plot.width,
    height=Plot.height,
    color=Plot.color_cycle,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="buy_dont_buy")()
print(outcome)
It looks like there was a significant difference since not only did the majority of the treatment group opt not to buy the DVD, but in the control a significant majority did.
Now as row-proportions.
# row-proportions: each cell divided by its row's total
proportions = data.divide(data[Outcome.total], axis="rows")
print(TABLE(proportions))
Buy DVD | Not Buy DVD | Total | |
---|---|---|---|
Control Group | 0.746667 | 0.253333 | 1 |
Treatment Group | 0.546667 | 0.453333 | 1 |
Total | 0.646667 | 0.353333 | 1 |
Looking at the proportions it looks like quite a bit more didn't buy the DVD, but let's run the experiment and see.
The thing we are interested in is whether the wording of the second option to not buy the DVD made a difference, so our statistic of interest is the difference in proportions of the control and treatment group participants who didn't buy the DVD.
\begin{align}
\hat{p}_{control} &= \frac{\textrm{Control Group that wouldn't buy}}{\textrm{size of Control Group}}\\
\hat{p}_{treatment} &= \frac{\textrm{Treatment Group that wouldn't buy}}{\textrm{size of Treatment Group}}\\
\hat{p} &= \hat{p}_{treatment} - \hat{p}_{control}
\end{align}

POINT_ESTIMATE = (proportions[Outcome.dont_buy].loc[Outcome.treatment]
                  - proportions[Outcome.dont_buy].loc[Outcome.control])
print(f"{POINT_ESTIMATE * 100:.2f} %")
20.00 %
About twenty percent more of the treatment group said they would abstain from the purchase than control group.
In a previous post looking at Gender Discrimination Inference I split the Urn into a 50-50 split of males and females to see if there was gender bias in choosing whether to promote them. In this case it's going to work a little differently.
We are asking here whether our treatment had an effect. If it didn't then we would expect that the distribution of "buy" and "don't buy" that we saw represents the distribution of the underlying population of ASU students, so we need to split our urn up to match the counts in the "Buy" and "Don't Buy" columns and then randomly split it into two equal groups. If the difference we saw was the result of random chance then we would expect that the difference between "buy" and "don't buy" group would be equal most of the time. This doesn't seem as intuitive a way to set it up as the previous method, but we'll see how it goes.
# Build the urn from the totals row: "don't buy" responses are 1s (so a
# draw's sum counts them) and "buy" responses are 0s.
buy = data.loc[Outcome.total][Outcome.buy]
dont = data.loc[Outcome.total][Outcome.dont_buy]
bought = numpy.zeros(buy)
didnt = numpy.ones(dont)
URN = numpy.append(bought, didnt)
# Sanity checks on the urn's composition. The original referenced an
# undefined STUDENTS name; the total student count lives on the Total
# namespace as Total.students.
expect(len(URN)).to(equal(Total.students))
expect(buy).to(equal(97))
expect(dont).to(equal(53))
expect(URN.sum()).to(equal(dont))
The Process:
But, since I'm doing this with numpy it works a little different.
My process:
TRIALS = 1 * 10**6
with TIMER:
    # number of "don't buy" (1) responses randomly assigned to the control group
    controls = numpy.array([numpy_random.choice(URN,
                                                Total.group,
                                                replace=False).sum()
                            for trial in range(TRIALS)])
    # the remaining "don't buy" responses land in the treatment group
    treatments = Total.dont_buy - controls
    control_proportions = controls/Total.group
    # NOTE(review): both groups have 75 students so Total.group works as the
    # treatment-group size here -- confirm that's intentional.
    treatment_proportions = treatments/Total.group
    # NOTE(review): this is control - treatment, the opposite sign of
    # POINT_ESTIMATE (treatment - control). Under the null the distribution is
    # symmetric, so the tail comparison below still works, but verify.
    differences = control_proportions - treatment_proportions
simulation = pandas.DataFrame.from_dict({
    "Point Estimate": differences,
    "Control": control_proportions,
    "Treatment": treatment_proportions,
})
2020-09-26 20:02:52,224 graeae.timers.timer start: Started: 2020-09-26 20:02:52.224515 2020-09-26 20:03:29,174 graeae.timers.timer end: Ended: 2020-09-26 20:03:29.174373 2020-09-26 20:03:29,175 graeae.timers.timer end: Elapsed: 0:00:36.949858
# histogram of the simulated null distribution
# NOTE(review): unlike the other plots in this section, fontscale is not set
plot = simulation.hvplot.hist("Point Estimate",
                              bins=25).opts(
    title="Opportunity Cost Don't Buy Difference",
    width=Plot.width,
    height=Plot.height,
    color=Plot.tan,
)
outcome = Embed(plot=plot, file_name="opportunity_cost_simulation")()
print(outcome)
The distribution appears to be reasonably normal, if a bit peaked in the middle.
# empirical one-sided p-value: the fraction of simulated differences at least
# as large as the observed point estimate
matched = len(simulation[simulation["Point Estimate"] >= POINT_ESTIMATE])/len(simulation)
print(f"Percent of trials >= Point Estimate of Study: {100 * matched:0.3f} %")
Percent of trials >= Point Estimate of Study: 0.817 %
According to our simulation, less than one percent of the time we would see a difference like the study found by random chance alone.
# The original compared the empirical p-value (`matched`) to the point
# estimate, which mixes up two unrelated quantities. The decision rule is:
# reject the null hypothesis when the p-value falls below the significance
# level (0.05).
print(f"We reject the Null Hypothesis: {matched < 0.05}")
Null Hypothesis is False
So we'll conclude that the Null Hypothesis is false and accept the Alternative Hypothesis: telling students about the opportunity cost of buying a DVD does have an effect in getting them to not buy the DVD.
So here we have a walk through of Inference using simulation and an experiment with Control and Treatment groups. Although the conclusion reached is that reminding students of the money they would have in the future if they didn't spend it is "causal", since this was an experiment, I'm not 100% convinced that asking students what the
# python
from argparse import Namespace
from functools import partial
# pypi
from expects import (
equal,
expect,
)
from numpy.random import default_rng
from scipy import stats
from tabulate import tabulate
import holoviews
import hvplot.pandas
import numpy
import pandas
# my stuff
from graeae import EmbedHoloviews, Timer
# random generator for the simulation draws
numpy_random = default_rng()
Embed = partial(EmbedHoloviews,
                folder_path="files/posts/gender-discrimination/")
# shared plot settings for this section
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)
TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys")
TIMER = Timer()
This starts with a study where bank managers were each given a personnel file and asked to decide if they would promote the person represented by the file to branch manager. The files were identical but half of them were filled out as male and half as female. The researchers wanted to know if these manager were biased in favor of men. There was actually two types of jobs, one described by the authors as "simple" and one as "complex". This only looks at the "simple" job. The table below shows their results.
# Promotion decisions from the study, by the gender on the personnel file.
study = pandas.DataFrame({"Promoted": [21, 14],
                          "Not Promoted": [3, 10]},
                         index=["Male", "Female"])
# Column-totals row. DataFrame.append was removed in pandas 2.0; pandas.concat
# is the supported equivalent.
row = study.sum()
row.name = "Total"
study = pandas.concat([study, row.to_frame().T])
# row totals: 24 files of each gender
column = study.sum(axis="columns")
study["Total"] = column
print(TABLE(study))  # render the study counts as an org-mode table
Promoted | Not Promoted | Total | |
---|---|---|---|
Male | 21 | 3 | 24 |
Female | 14 | 10 | 24 |
Total | 35 | 13 | 48 |
Let's look at the values (without the totals) as a plot.
# long-form frame for plotting: Gender / Decision / Count
plotter = study.reset_index()
del(plotter["Total"])
plotter = plotter.iloc[:-1]  # drop the totals row
plotter = plotter.melt(id_vars=["index"],
                       value_vars=["Promoted", "Not Promoted"])
plotter = plotter.rename(columns=dict(index="Gender",
                                      value="Count",
                                      variable="Decision"))
plot = plotter.hvplot.bar(x="Gender", y="Count", by="Decision").opts(
    title="Outcome by Gender",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.fontscale,
    color=Plot.color_cycle,
)
outcome = Embed(plot=plot, file_name="promotions_by_gender")()
print(outcome)
It looks like a considerable majority of the males were promoted whereas for the females only a slight majority were promoted.
# same data, grouped by decision instead of by gender
plot = plotter.hvplot.bar(x="Decision", y="Count", by="Gender").opts(
    title="Male Vs Female by Decision",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.fontscale,
    color=Plot.color_cycle,
)
outcome = Embed(plot=plot, file_name="decision_by_gender")()
print(outcome)
This plot doesn't make the disparity look quite as extreme as the previous plot did, I think.
I started using them without explicitly stating it, but we have two variables here - Promoted
and Not Promoted
are part of the decision
variable and Male
and Female
are part of the gender
variable.
Anyway… so more males were chosen for promotion than female, but by what proportion?
# column-proportions: each cell divided by its column's total
print(TABLE(study/study.loc["Total"]))
Promoted | Not Promoted | Total | |
---|---|---|---|
Male | 0.6 | 0.230769 | 0.5 |
Female | 0.4 | 0.769231 | 0.5 |
Total | 1 | 1 | 1 |
So 60% of the promoted were male and 40% of the promoted were female.
# ratio of the male to female promotion shares (60% / 40%)
print(f"{.6/.4:.2f}")
1.50
If you were promoted you were one and a half times more likely to be male. Another way to look at it is to ask - What proportion of each gender was promoted?
# Fraction of each gender promoted: divide each row by that row's Total.
# The original `study/study.Total.values` broadcast column-wise over
# [24, 24, 48]; it produced the right Male/Female rows only because both
# group totals happen to be 24 (matching the first two column divisors), and
# its Total row (1.458...) was meaningless. This row-wise form states the
# intent directly and matches the row-proportion pattern used in the
# opportunity-cost section above.
fractions = study.divide(study.Total, axis="rows")
print(TABLE(fractions))
Promoted | Not Promoted | Total | |
---|---|---|---|
Male | 0.875 | 0.125 | 0.5 |
Female | 0.583333 | 0.416667 | 0.5 |
Total | 1.45833 | 0.541667 | 1 |
So about 88% of the males were promoted while only 58% of the females were promoted.
# observed difference in promotion rates (male minus female)
POINT_ESTIMATE = (fractions.Promoted.loc['Male']
                  - fractions.Promoted.loc['Female'])
print(f"{POINT_ESTIMATE:0.2f}")
0.29
There was a 30% difference between the rate of male promotion and the rate of female promotion. The question we have now is - could this difference have happened by random chance, or is it evidence of bias?
We have a point estimate of the difference of 0.29 - is this evidence of bias?
- Null Hypothesis: gender and decision are independent, and the observed difference was due to chance.
- Alternative Hypothesis: gender and decision are not independent; the difference was not due to chance - rather, women were less likely to be promoted than men.

I'm going to pick an arbitrary significance level of 0.05 (a 95% confidence level).
ALPHA = 0.05  # significance level for the hypothesis test
The basic method here is we'll create an "urn" with an equal number of "balls" for men and women (24 of each in this case) and then randomly select 35 balls representing the number that were promoted and see the difference in the fraction of males promoted vs the number of females. To make the math simple I'll run it a number of times that is a power of 10.
To make the counting easier I'll set males to be 1 and females to be 0 (so the number of males promoted is the sum and the females is the remainder).
# males are 1s and females 0s, so a draw's sum counts the males promoted
NUMBER_OF_MALES = NUMBER_OF_FEMALES = 24
MALE = 1
FEMALE = 0
PROMOTED = 35
MALES = numpy.ones(NUMBER_OF_MALES)
FEMALES = numpy.zeros(NUMBER_OF_FEMALES)
URN = numpy.append(MALES, FEMALES)
# sanity checks on the urn's composition
expect(len(URN)).to(equal(NUMBER_OF_MALES + NUMBER_OF_FEMALES))
expect(URN.sum()).to(equal(NUMBER_OF_MALES))
TRIALS = 10**7
# Each trial draws PROMOTED balls from the urn without replacement; the sum
# counts promoted males (males are 1s). The derived quantities below are
# vectorized with numpy instead of the original per-element list
# comprehensions -- identical values with far less Python-level overhead.
males_promoted = numpy.array(
    [numpy_random.choice(URN, PROMOTED, replace=False).sum()
     for trial in range(TRIALS)])
# everyone promoted who isn't male is female
females_promoted = PROMOTED - males_promoted
proportion_males_promoted = males_promoted / NUMBER_OF_MALES
proportion_females_promoted = females_promoted / NUMBER_OF_FEMALES
# simulated difference in promotion rates (male minus female)
differences = proportion_males_promoted - proportion_females_promoted
data = pandas.DataFrame.from_dict({
    "Point Estimate": differences,
    "Males Promoted": males_promoted,
    "Females Promoted": females_promoted,
    "Proportion Males Promoted": proportion_males_promoted,
    "Proportion Females Promoted": proportion_females_promoted,
})
# histogram of the simulated null distribution of the difference
plot = data.hvplot.hist("Point Estimate").opts(
    title="Distribution of Differences in Gender Promotion",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="difference_distribution")()
print(outcome)
The distribution looks mostly normal. As you might guess the distribution is centered around 0 (cases where exactly the same number of males and females were promoted - although 35 were promoted each time so the trials are never exactly 0). The cases where the difference in proportion is as great or greater than as it was in the original study are in the rightmost two bins.
I should note that because the sample size is so small, there's a weird distribution of the points - there's not enough variation in the differences to make a smooth curve (thus the gaps in the histogram).
# QQ-plot: compare the simulated differences against a normal distribution
output = stats.probplot(data["Point Estimate"])
theoretical, actual = output[0]
slope, intercept, r = output[1]
# endpoints of the fitted reference line
first = theoretical.min()
last = theoretical.max()
# NOTE(review): holoviews element styling usually goes through .opts(...);
# confirm Curve accepts color= directly in this holoviews version.
line = holoviews.Curve([(first, slope * first + intercept), (last, slope * last + intercept)],
                       color=Plot.red)
qq_frame = pandas.DataFrame.from_dict({
    "Theoretical Quantiles": theoretical,
    "Differences In Hiring Rates": actual,
})
# datashade because there are ten million points
scatter = qq_frame.hvplot.scatter(x="Theoretical Quantiles",
                                  y="Differences In Hiring Rates", datashade=True)
plot = (scatter * line).opts(
    title="Hiring Probability Plot",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="differences_qq")()
print(outcome)
Hmm… I'm not sure I'm doing that right. But it looks reasonable, I suppose.
# crude interval: from the sample minimum up to the 95th percentile
# NOTE(review): a conventional two-sided 95% interval would use the 2.5th and
# 97.5th percentiles -- confirm the one-sided construction is intended.
lowest = data["Point Estimate"].min()
data_sorted = data["Point Estimate"].sort_values()
max_loc = int(len(data_sorted) * 0.95)
highest = data_sorted.iloc[max_loc]
print(f"Point Estimate: {POINT_ESTIMATE: 0.3f}")
print(f"({lowest: 0.3f} , {highest: 0.3f})")
Point Estimate: 0.292 (-0.542 , 0.208)
Looking at the confidence interval we can see that the original point estimate is higher than 95 % of the values, but the original book example used a hypothesis test so let's finish it up.
But anyway, what proportion of our simulations had as much or more of a difference than that found in the study?
# empirical one-sided p-value: fraction of trials at least as extreme as the
# observed difference
proportion = len(data[data["Point Estimate"] >= POINT_ESTIMATE])/TRIALS
print(f"Proportion: {100 * proportion:0.3f} %")
Proportion: 2.450 %
# compare the empirical p-value against the significance level
print(f"Probability of our study's difference in promotion between genders by chance alone: {proportion:0.2f}.")
print(f"Our tolerance was {ALPHA:0.2f}.")
Probability of our study's difference in promotion between genders by chance alone: 0.02. Our tolerance was 0.05.
So we reject the null hypothesis and conclude that there is a statistically significant chance that the number of women promoted vs men in the original study was the result of bias.
Well, that was the replication (sort of) of this problem from Introductory Statistics with Randomization and Simulation. The point of it was to both review Hypothesis Testing and see how it can be done using simulation rather than a p-test.
# python
from argparse import Namespace
from pathlib import Path
from functools import partial
import os
# pypi
from dotenv import load_dotenv
import holoviews
import hvplot.pandas
import pandas
import tabula
from graeae import EmbedHoloviews
SLUG = "oregon-covid-19-tables"
Embed = partial(EmbedHoloviews, folder_path = f"files/posts/{SLUG}")
# shared plot settings for this section
Plot = Namespace(
    width=990,
    height=780,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    font_scale=2,
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)
# The weekly report PDF lives outside the repository; its folder comes from
# the OREGON_COVID environment variable (loaded from a dotenv file).
BASE = "COVID-19-Weekly-Report-{date}-FINAL.pdf"
DATE = "2020-05-27"
load_dotenv(Path("~/.local/share/env").expanduser())
path = Path(os.environ["OREGON_COVID"]).expanduser()
filename = path/BASE.format(date=DATE)
assert filename.is_file()
# tabula extracts every table it finds in the PDF as a pandas DataFrame
tables = tabula.read_pdf(filename, pages="all")
print(type(tables))
print(len(tables))
<class 'list'> 15
Okay, so it found 15 tables, which should have been loaded as pandas DataFrames. Since a list doesn't really provide an easy way to get metadata, maybe I'll just traverse the list and see what's there.
# peek at the first row of each extracted table to see what tabula found
for table in tables:
    print(table.head(1))
    print("*" * 20)
Sex Cases % of total cases Cases per 10,000a Deaths Case fatality (%) \ 0 Male 1,874 47.5% 8.7 85 4.5% Hospitalized 0 402 ******************** Age group Cases % of total cases Cases per 10,000a Deaths \ 0 0-9 53 1.3% 1.1 0 Case fatality (%) Hospitalized 0 0% 6 ******************** Unnamed: 0 Race Cases % of total cases Cases per 10,000b Deaths \ 0 White NaN 1,946 49.3% 6.2 109.0 Case fatality (%) Hospitalized 0 5.6% 452.0 ******************** Ethnicity Case count % of total cases Cases per 10,000a Deaths \ 0 Hispanic 1,317 33.4% 23.7 14 Case fatality (%) Hospitalized 0 1.1% 178 ******************** Facility name County First reported Total casesa \ 0 Healthcare at Foster Creek Multnomah 3/24/2020 119.0 Total deathsa 0 30.0 ******************** ZIP code Number of cases Cases per 10,000 0 97002 1–9 n/a ******************** 97053 1–9 n/a 0 97054 0 0.0 ******************** 97148 0 0.0 0 97201 13 8.4 ******************** 97324 0 0.0 0 97325 10 15.1 ******************** 97411 0 0.0 0 97415 1–9 n/a ******************** 97501 1–9 n/a 0 97502 13 4.7 ******************** 97813 1–9 n/a 0 97814 1–9 n/a ******************** Hospital Max Patients Max Patients.1 0 NaN (Both suspected and confirmed) (Confirmed only) ******************** Hospital 0 Asante Rogue Regional Medical Center ******************** Providence Newberg Medical Center 0 Providence Seaside Hospital ********************
Most of the tables appear to be either fragments or case-counts based on different categorizations.
# Column names as they appear in the tabula-extracted "Sex" table (the
# "10,000a" carries the PDF's footnote marker).
COLUMNS = Namespace(
    sex="Sex",
    cases="Cases",
    cases_per_10000 = "Cases per 10,000a",
    percent = "% of total cases",
)
# keep identified tables in a dict keyed by what they contain
tables_2 = dict(sex=tables[0])
print(tables_2["sex"])
Sex Cases % of total cases Cases per 10,000a Deaths \ 0 Male 1,874 47.5% 8.7 85 1 Female 2,071 52.4% 9.9 63 2 Non-Binary 1 0% n/a 0 3 Not available 3 0.1% n/a 0 4 Total 3,949 100.0% 9.3 148 Case fatality (%) Hospitalized 0 4.5% 402 1 3.0% 344 2 0% 0 3 0% 1 4 3.7% 747
One thing to notice is that there are "n/a", ",", and "%" strings in there that prevent some columns from being treated as numbers.
tables_2["sex"].info()
# Take just the Male/Female rows. The .copy() makes this an independent frame
# so the in-place column conversions further down mutate our own data rather
# than a view of tables[0] (which triggers pandas' SettingWithCopyWarning).
males_females = tables[0].iloc[:2].copy()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5 entries, 0 to 4 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sex 5 non-null object 1 Cases 5 non-null object 2 % of total cases 5 non-null object 3 Cases per 10,000a 5 non-null object 4 Deaths 5 non-null int64 5 Case fatality (%) 5 non-null object 6 Hospitalized 5 non-null int64 dtypes: int64(2), object(5) memory usage: 408.0+ bytes
So to use most of the columns you have to do something to convert them back to numbers. I'll start by looking at male and female cases adjusted by population.
def bar_plot(data: pandas.DataFrame, x_column: str, y_column: str,
             title: str, filename: str) -> str:
    """Render a bar chart and embed it in the post.

    Args:
     data: frame holding the columns to plot
     x_column: name of the column for the category (x) axis
     y_column: name of the column giving each bar's height
     title: title for the plot
     filename: file-name stem under which to save the plot

    Returns:
     html: the html to use to embed the plot into org
    """
    bars = data.hvplot.bar(x=x_column, y=y_column, color=Plot.color_cycle)
    styled = bars.opts(
        width=Plot.width,
        height=Plot.height,
        fontscale=Plot.font_scale,
        title=title,
    )
    return Embed(plot=styled, file_name=filename)()
# pre-bind the male/female frame and the x-axis column for the gender plots
gender_bar = partial(bar_plot, data=males_females, x_column=COLUMNS.sex)
First, make the counts integers.
# strip the thousands separators so the case counts become integers
males_females.loc[:, COLUMNS.cases] = males_females[COLUMNS.cases].str.replace(
    ",", "").astype(int)
Now a little bar plot.
# bar chart of raw case counts by gender
outcome = gender_bar(y_column=COLUMNS.cases,
                     title="Cases by Gender",
                     filename="cases_by_gender")
print(outcome)
There were slightly more female cases than male cases.
# drop the "%" suffix so the percentages become floats
males_females.loc[:, COLUMNS.percent] = males_females[COLUMNS.percent].str.replace(
    "%", "").astype(float)
outcome = gender_bar(y_column=COLUMNS.percent,
                     title="Case Percentage by Gender",
                     filename="percentage_by_gender")
print(outcome)
# the per-10,000 values are plain numeric strings; just cast to float
males_females.loc[:, COLUMNS.cases_per_10000] = males_females[
    COLUMNS.cases_per_10000].astype(float)
# population-adjusted bar chart (built directly, not via gender_bar, so the
# title can mention the adjustment)
plot = males_females.hvplot.bar(x=COLUMNS.sex, y=COLUMNS.cases_per_10000,
                                color=Plot.color_cycle,
                                fontscale=Plot.font_scale).opts(
    title="Cases per 10,000 Population by Gender",
    width=Plot.width,
    height=Plot.height)
outcome = Embed(plot=plot, file_name="cases_by_sex_and_population")()
print(outcome)