Galton's Sweet Pea Data
Beginning
Imports
# python
from functools import partial
from collections import namedtuple
# pypi
from tabulate import tabulate
import holoviews
import hvplot.pandas
import pandas
# my stuff
from graeae import EmbedHoloviews
Set Up
# Slug of this post; it is used to build the folder path given to the embedder.
SLUG = "galtons-sweet-pea-data"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/{SLUG}")

# Shared plot settings: dimensions, font scaling and the colour palette.
Plot = namedtuple(
    "Plot",
    [
        "width",
        "height",
        "fontscale",
        "tan",
        "blue",
        "red",
    ],
)
PLOT = Plot(width=900, height=750, fontscale=2,
            tan="#ddb377", blue="#4687b7", red="#ce7b6d")
Middle
The Data
# Page that holds the sweet pea table (appendix A of the Stanton article).
URL = "http://jse.amstat.org/v9n3/stanton.html#appendixa"
# Parse the HTML tables found at URL, using row 0 of each as the column header.
frames = pandas.read_html(io=URL, header=0)
There are two tables on the page; we want the second one.
# Index 1 selects the second of the tables parsed from the page.
data = frames[1]
# Render the table in org-mode format, without the row index.
table = tabulate(data, headers="keys", tablefmt="orgtbl", showindex=False)
print(table)
Diameter of Parent Seed(0.01 inch) | Diameter of Daughter Seed(0.01 inch) | Frequency |
---|---|---|
21 | 14.67 | 22 |
21 | 15.67 | 8 |
21 | 16.67 | 10 |
21 | 17.67 | 18 |
21 | 18.67 | 21 |
21 | 19.67 | 13 |
21 | 20.67 | 6 |
21 | 22.67 | 2 |
20 | 14.66 | 23 |
20 | 15.66 | 10 |
20 | 16.66 | 12 |
20 | 17.66 | 17 |
20 | 18.66 | 20 |
20 | 19.66 | 13 |
20 | 20.66 | 3 |
20 | 22.66 | 2 |
19 | 14.07 | 35 |
19 | 15.07 | 16 |
19 | 16.07 | 12 |
19 | 17.07 | 13 |
19 | 18.07 | 11 |
19 | 19.07 | 10 |
19 | 20.07 | 2 |
19 | 22.07 | 1 |
18 | 14.35 | 34 |
18 | 15.35 | 12 |
18 | 16.35 | 13 |
18 | 17.35 | 17 |
18 | 18.35 | 16 |
18 | 19.35 | 6 |
18 | 20.35 | 2 |
17 | 13.92 | 37 |
17 | 14.92 | 16 |
17 | 15.92 | 13 |
17 | 16.92 | 16 |
17 | 17.92 | 13 |
17 | 18.92 | 4 |
17 | 19.92 | 1 |
16 | 14.28 | 34 |
16 | 15.28 | 15 |
16 | 16.28 | 18 |
16 | 17.28 | 16 |
16 | 18.28 | 13 |
16 | 19.28 | 3 |
16 | 20.28 | 1 |
15 | 13.77 | 46 |
15 | 14.77 | 14 |
15 | 15.77 | 9 |
15 | 16.77 | 11 |
15 | 17.77 | 14 |
15 | 18.77 | 4 |
15 | 19.77 | 2 |
Note that the data is somewhat aggregated - the last column is the number of times that each parent/daughter diameter pair occurred in the original data.
# Replace the long column headers with short names usable as attributes.
data.columns = ["Parent", "Daughter", "Frequency"]

# Scatter of daughter against parent diameter, with the Frequency column
# passed as the marker size.
# NOTE(review): both ``c`` and ``color`` are passed below - confirm which one
# hvplot actually applies.
scatter = data.hvplot.scatter(
    x="Parent",
    y="Daughter",
    s=data.Frequency,
    c="Frequency",
    color=PLOT.red,
    title="Sweet Pea Diameter (0.01 Inch)",
)
plot = scatter.opts(
    width=PLOT.width,
    height=PLOT.height,
    fontscale=PLOT.fontscale,
)
embedder = Embed(plot=plot, file_name="scatter_plot")
output = embedder()
print(output)
# Undo the aggregation: repeat every (Parent, Daughter) pair "Frequency" times
# so that each row of ``frame`` stands for one observation.
# Repeating the index labels and selecting with ``.loc`` replaces the
# Python-level loop; it relies on the index labels of ``data`` being unique.
repeated_labels = data.index.repeat(data["Frequency"])
frame = data.loc[repeated_labels, ["Parent", "Daughter"]].reset_index(drop=True)
# Histogram built from ``frame`` (stacked, half-transparent bars).
histogram = frame.hvplot.hist(
    stacked=True,
    title="Diameter Distribution",
    alpha=0.5,
)
plot = histogram.opts(
    width=PLOT.width,
    height=PLOT.height,
    fontscale=PLOT.fontscale,
    xlabel="Diameter",
    ylabel="Count",
)
# Build the embedder and call it straight away; print what it returns.
output = Embed(plot=plot, file_name="histogram")()
print(output)
# Mean daughter diameter at the smallest and at the largest parent diameter.
parent_min, parent_max = frame.Parent.min(), frame.Parent.max()
y_1 = frame.loc[frame.Parent == parent_min, "Daughter"].mean()
y_2 = frame.loc[frame.Parent == parent_max, "Daughter"].mean()

# One dot per observation in the expanded frame.
dots = frame.hvplot.scatter(
    x="Parent",
    y="Daughter",
    color=PLOT.red,
    title="Sweet Pea Diameter (0.01 Inch)",
)

# Straight line joining the two mean points computed above.
endpoints = [(parent_min, y_1), (parent_max, y_2)]
line = holoviews.Curve(endpoints)

overlay = dots * line
plot = overlay.opts(
    width=PLOT.width,
    height=PLOT.height,
    fontscale=PLOT.fontscale,
)
output = Embed(plot=plot, file_name="scatter_plot_with_means")()
print(output)
End
Source
- Stanton JM. Galton, Pearson, and the peas: A brief history of linear regression for statistics instructors. Journal of Statistics Education. 2001 Jan 1;9(3). (http://jse.amstat.org/v9n3/stanton.html)