A First Look At HVPlot
Table of Contents
Introduction
This is a look at HVPlot, a HoloViews based plotting adapter that works directly with pandas or other pandas-like libraries (e.g. dask). I'm starting with their Introduction but might branch out after that. We'll see.
Set Up
Imports
From Python
from datetime import datetime
from functools import partial
from pathlib import Path
from typing import Union
import textwrap
From PyPi
from sklearn.datasets import load_iris
from tabulate import tabulate
import numpy
import pandas
My Stuff
from neurotic.tangles.timer import Timer
The Bokeh Imports
from bokeh.embed import autoload_static
import bokeh.resources
Set Up the HVPlot
I'm not sure exactly what it's doing, but this next import adds an hvplot
method to pandas' DataFrames to do the actual plotting.
import holoviews
import hvplot.pandas
Typing
# A location given either as a plain string or as a pathlib.Path
PathType = Union[str, Path]
Constants
# Folder used as the default destination for the javascript files of this post
FOLDER_PATH = "../files/posts/libraries/a-first-look-at-hvplot/"
Tables
# tabulate pre-set to emit org-mode tables, use the column names as headers
# and leave out the index column
table = partial(tabulate, tablefmt="orgtbl", headers="keys", showindex=False)
Helpers
EmbedBokeh Class
class EmbedBokeh:
    """Embed a bokeh figure

    The plot is rendered with holoviews, the javascript that bokeh generates
    for it is written to a file, and the HTML fragment that loads that file
    is wrapped in an org-mode HTML export block.

    Args:
     plot: a hvplot to embed
     file_name: name of the file to save the javascript in
     folder_path: path to the folder to save the file
     create_folder: if the folder doesn't exist create it
     make_parents: if creating a folder add the missing folders in the path
    """
    # Annotations that refer to third-party or module-level names are quoted
    # so that defining the class does not depend on which bokeh submodules
    # happen to be loaded (or on the name ``Figure`` still existing).
    def __init__(self, plot: "holoviews.core.overlay.NdOverlay",
                 file_name: str,
                 folder_path: "PathType",
                 create_folder: bool = True,
                 make_parents: bool = True) -> None:
        self.plot = plot
        self._figure = None
        # the folder_path setter reads these two so they are set before it
        self.create_folder = create_folder
        self.make_parents = make_parents
        self._folder_path = None
        self.folder_path = folder_path
        self._file_name = None
        self.file_name = file_name
        self._source = None
        self._javascript = None
        self._bokeh_source = None
        self._export_string = None

    @property
    def folder_path(self) -> Path:
        """The path to the folder to store javascript"""
        return self._folder_path

    @folder_path.setter
    def folder_path(self, path: "PathType") -> None:
        """Sets the path to the javascript folder

        Creates the folder if it is missing and ``create_folder`` is set

        Args:
         path: the folder to save the javascript file in
        """
        self._folder_path = Path(path)
        if self.create_folder and not self._folder_path.is_dir():
            self._folder_path.mkdir(parents=self.make_parents)

    @property
    def file_name(self) -> str:
        """The name of the javascript file"""
        return self._file_name

    @file_name.setter
    def file_name(self, name: str) -> None:
        """Sets the filename

        Any folder or extension in the name is dropped and ``.js`` is added

        Args:
         name: name to save the javascript (without the folder)
        """
        self._file_name = "{}.js".format(Path(name).stem)

    @property
    def figure(self) -> "bokeh.plotting.Figure":
        """The Figure to plot"""
        if self._figure is None:
            self._figure = holoviews.render(self.plot)
        return self._figure

    @property
    def bokeh_source(self) -> "bokeh.resources.Resources":
        """The javascript source (bokeh's CDN resources)"""
        if self._bokeh_source is None:
            self._bokeh_source = bokeh.resources.CDN
        return self._bokeh_source

    def _build_static(self) -> None:
        """Runs autoload_static and stores both the javascript and the HTML"""
        self._javascript, self._source = autoload_static(self.figure,
                                                         self.bokeh_source,
                                                         self.file_name)

    @property
    def source(self) -> str:
        """The HTML fragment to export"""
        if self._source is None:
            self._build_static()
        return self._source

    @property
    def javascript(self) -> str:
        """javascript to save"""
        if self._javascript is None:
            self._build_static()
        return self._javascript

    @property
    def export_string(self) -> str:
        """The string to embed the figure into org-mode"""
        if self._export_string is None:
            self._export_string = textwrap.dedent(
                "#+BEGIN_EXPORT html{}\n#+END_EXPORT".format(self.source))
        return self._export_string

    def save_figure(self) -> None:
        """Saves the javascript file"""
        # build the javascript before opening the file so that a failure to
        # render does not leave an empty file behind
        javascript = self.javascript
        target = self.folder_path.joinpath(self.file_name)
        # explicit encoding so the output doesn't depend on the platform default
        with open(target, "w", encoding="utf-8") as writer:
            writer.write(javascript)

    def __call__(self) -> None:
        """Creates the bokeh javascript and emits it"""
        self.save_figure()
        print(self.export_string)

    def reset(self) -> None:
        """Sets the generated (bokeh) properties back to None"""
        self._export_string = None
        self._javascript = None
        self._source = None
        self._figure = None
# EmbedBokeh with the folder pre-filled so calls only need the plot and a name
Embed = partial(EmbedBokeh, folder_path=FOLDER_PATH)
The Timer
# Shared timer, used below as a context manager around the slower steps
TIMER = Timer()
The Data
Portland Crime
This is taken from the Portland Crime Statistics page.
# The data set is stored under the home directory, so "~" is expanded first
portland_path = Path("~/data/datasets/portland/crime-to-january-2018.csv")
portland_path = portland_path.expanduser()
assert portland_path.is_file()

# time how long it takes pandas to load the file
with TIMER:
    crime = pandas.read_csv(portland_path)
Started: 2019-02-02 18:38:59.025251 Ended: 2019-02-02 18:39:00.170796 Elapsed: 0:00:01.145545
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" to the output
crime.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 217224 entries, 0 to 217223 Data columns (total 17 columns): Address 196626 non-null object Case Number 217224 non-null object Crime Against 217224 non-null object Neighborhood 210788 non-null object Number of Records 217224 non-null int64 Occur Month Year 217224 non-null object Occur Date 217224 non-null object Occur Time 217224 non-null int64 Offense Category 217224 non-null object Offense Count 217224 non-null int64 Offense Type 217224 non-null object OpenDataLat 193352 non-null float64 OpenDataLon 193352 non-null float64 OpenDataX 193352 non-null float64 OpenDataY 193352 non-null float64 Report Date 217224 non-null object ReportMonthYear 217224 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 28.2+ MB None
Here's a possible categorical column to use.
# keep "Crime Against" only as a categorical column named "type"
crime["type"] = crime["Crime Against"].astype("category")
crime = crime.drop(columns=["Crime Against"])

# how many records there are of each type
type_counts = crime["type"].value_counts().reset_index()
print(table(type_counts, headers=["Type", "Count"]))
Type | Count |
---|---|
Property | 175567 |
Person | 32109 |
Society | 9548 |
Making the Plot
HoloViews expects you to work in a jupyter notebook and isn't quite so easy to work with in org-mode, so I'll make the plot with hvplot but then convert it to a bokeh figure to embed it in this post.
The Plot
# NOTE(review): the indentation of this block was lost, all four statements
# are assumed to belong to the timed block - confirm against the original
with TIMER:
    # parse the dates and give the case number a shorter name
    crime["date"] = pandas.to_datetime(crime["Occur Date"])
    crime["id"] = crime["Case Number"]
    crime = crime.drop(columns=["Occur Date", "Case Number"])
    # index by date so the data can be resampled
    crime_dates = crime.set_index("date")
Started: 2019-02-01 20:31:47.668915 Ended: 2019-02-01 20:32:09.889378 Elapsed: 0:00:22.220463
# count the records in each week, then plot the weekly case counts
weekly = crime_dates.resample("W").count()
plot = weekly["id"].hvplot()
Embed(plot, "weekly_crime.js")()
That didn't work out as planned. It turns out that the data starts in 1972, but is mostly empty until around May of 2015. It also looks like January is missing values. I think I'll trim the data set.
Trimmed
# keep the rows from the end of May 2015 up to (but not including) 2019
first_day = datetime(2015, 5, 31)
cutoff = datetime(2019, 1, 1)
in_range = (crime_dates.index >= first_day) & (crime_dates.index < cutoff)
crime_dates = crime_dates[in_range]
weekly = crime_dates.resample("W").count()
By Type
HoloViews uses this rather odd way of composing figures. Instead of the object-oriented way you might expect, it overloads the multiplication sign (* for adding to the same plot) and the addition sign (+ for adding an adjacent plot), so to plot the types I'll have to multiply their plots.
# split the records by crime type and count each type by week
types = {name: crime_dates[crime_dates.type == name]
         for name in crime_dates.type.unique()}
weekly_types = {name: data.resample("W").count()
                for name, data in types.items()}

# one line per type, multiplied together so they share a single plot
lines = [counts.hvplot(y="id", label=name)
         for name, counts in weekly_types.items()]
plot = lines[0]
for line in lines[1:]:
    plot *= line
It looks like it could use more trimming, but it also looks like it's mostly property crimes, which is what you'd expect, I guess. Actually I tried another trim and it looks like it always starts at zero because of the way the resampling works, so trimming doesn't make that first anomaly go away. Maybe trimming the weekly would help.
Looking a Little More at the Crimes
By Neighborhood
# the ten neighborhoods with the most records
neighborhood_counts = crime_dates["Neighborhood"].value_counts()
top_ten = neighborhood_counts.head(10).reset_index()
print(table(top_ten, headers=["Neighborhood", "Count"]))
Neighborhood | Count |
---|---|
Downtown | 10237 |
Hazelwood | 10127 |
Lents | 5681 |
Powellhurst-Gilbert | 5605 |
Centennial | 5016 |
Old Town/Chinatown | 4966 |
Northwest | 4648 |
Montavilla | 4026 |
Pearl | 3905 |
Lloyd | 3699 |
# one indicator column per neighborhood, still indexed by the crime date
neighborhoods = pandas.get_dummies(crime_dates["Neighborhood"])

# The first column of top_ten holds the neighborhood names. Selecting it by
# position works whether pandas named it "index" (before 2.0) or
# "Neighborhood" (2.0 and later).
top_ten_names = top_ten.iloc[:, 0]

# monthly totals for just the top ten neighborhoods
neighborhoods = neighborhoods[top_ten_names].resample("M").sum()

# the line plot with a table of four of the neighborhoods next to it
plot = (neighborhoods.hvplot(title="Top Ten Monthly Neighborhood Crime Counts")
        + neighborhoods.hvplot.table(columns=["Downtown", "Hazelwood",
                                              "Lents", "Powellhurst-Gilbert"]))
Embed(plot, "neighborhoods")()
So the first thing to notice is that Downtown and Hazelwood dominate the case counts. There doesn't seem to be any strong upward or downward trend.
I live in Powellhurst-Gilbert, about a block north of Lents, and it looks like if you considered them one big neighborhood (they are adjacent), then they form the highest-crime Neighborhood, but, sticking to the arbitrariness of the boundaries, we are relegated to numbers three and four.
Distribution
# kernel density estimates for the monthly counts
plot = neighborhoods.hvplot.kde(
    title="Distributions of Top Ten Crime Neighborhoods",
    by="Neighborhood",
)
Embed(plot, "neighborhoods_kde")()
I don't know what that mysterious bulge around zero is; all the neighborhoods are in the other peaks.
Irises
Since the previous data was time-series data, I thought I'd load a data set that wasn't, to illustrate the use of the by parameter.
# load the iris data set from sklearn and show the description it ships with
irises = load_iris()
print(irises.DESCR)
.. _iris_dataset: Iris plants dataset -------------------- **Data Set Characteristics:** :Number of Instances: 150 (50 in each of three classes) :Number of Attributes: 4 numeric, predictive attributes and the class :Attribute Information: - sepal length in cm - sepal width in cm - petal length in cm - petal width in cm - class: - Iris-Setosa - Iris-Versicolour - Iris-Virginica :Summary Statistics: ============== ==== ==== ======= ===== ==================== Min Max Mean SD Class Correlation ============== ==== ==== ======= ===== ==================== sepal length: 4.3 7.9 5.84 0.83 0.7826 sepal width: 2.0 4.4 3.05 0.43 -0.4194 petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) ============== ==== ==== ======= ===== ==================== :Missing Attribute Values: None :Class Distribution: 33.3% for each of 3 classes. :Creator: R.A. Fisher :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) :Date: July, 1988 The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken from Fisher's paper. Note that it's the same as in R, but not as in the UCI Machine Learning Repository, which has two wrong data points. This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. .. topic:: References - Fisher, R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950). - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis. (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. - Dasarathy, B.V. 
(1980) "Nosing Around the Neighborhood: A New System Structure and Classification Rule for Recognition in Partially Exposed Environments". IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. PAMI-2, No. 1, 67-71. - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions on Information Theory, May 1972, 431-433. - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II conceptual clustering system finds 3 classes in the data. - Many, many more ...
# put the measurements in a data frame with the feature names as the columns
iris_data = pandas.DataFrame(data=irises.data,
                             columns=irises.feature_names)
print(iris_data.head(5))
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) 0 5.1 3.5 1.4 0.2 1 4.9 3.0 1.4 0.2 2 4.7 3.2 1.3 0.2 3 4.6 3.1 1.5 0.2 4 5.0 3.6 1.4 0.2
I don't know where this convention came from, but you can use the by keyword to specify a categorical column to differentiate the data points. In this case I'll use it to differentiate the species.
# Translate the integer class codes into the species names. The mapping is
# built from however many names there are instead of a hard-coded count.
target_map = dict(enumerate(irises.target_names))
target = pandas.Series(irises.target)
iris_data["target"] = target.map(target_map)

plot = iris_data.hvplot.scatter(x="sepal length (cm)", y="petal length (cm)",
                                by="target", alpha=0.5,
                                title="Iris Sepal Length vs Petal Length")
# same helper as the other plots so the output folder is set in one place
Embed(plot, "irises.js")()
Scatter Matrix
# scatter matrix of the iris columns; c="target" presumably colors the points
# by species - confirm against the hvplot documentation
plot = hvplot.scatter_matrix(iris_data, c="target")
Embed(plot, "iris_scatter_matrix")()
Parallel Coordinates
# parallel coordinates plot of the iris data, with "target" passed as the
# column that identifies the class of each row
plot = hvplot.parallel_coordinates(iris_data, "target")
Embed(plot, "iris_parallel_coordinates")()