A First Look At HVPlot

Introduction

This is a look at HVPlot, a HoloViews-based plotting adapter that works directly with pandas and other pandas-like libraries (e.g. dask). I'm starting with their Introduction but might branch out after that. We'll see.

Set Up

Imports

From Python

from datetime import datetime
from functools import partial
from pathlib import Path
from typing import Union
import textwrap

From PyPi

from sklearn.datasets import load_iris
from tabulate import tabulate
import numpy
import pandas

My Stuff

from neurotic.tangles.timer import Timer

The Bokeh Imports

from bokeh.embed import autoload_static
import bokeh.plotting
import bokeh.resources

Set Up the HVPlot

I'm not sure exactly what it's doing behind the scenes, but this next import adds an hvplot method to pandas DataFrames (and Series) that does the actual plotting.

import holoviews
import hvplot.pandas
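
Just to confirm what the import does, any DataFrame created after it picks up the accessor (a throwaway check, not part of the original notebook):

frame = pandas.DataFrame({"x": range(5), "y": range(5)})
# the import registers an `hvplot` accessor on DataFrames (and Series)
print(hasattr(frame, "hvplot"))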

Typing

PathType = Union[str, Path]

Constants

FOLDER_PATH = "../files/posts/libraries/a-first-look-at-hvplot/"

Tables

table = partial(tabulate, tablefmt="orgtbl", headers="keys", showindex=False)

Helpers

EmbedBokeh Class

class EmbedBokeh:
    """Embed a bokeh figure

    Args:
     plot: a hvplot to embed
     folder_path: path to the folder to save the file
     file_name: name of the file to save the javascript in
     create_folder: if the folder doesn't exist create it
     make_parents: if creating a folder add the missing folders in the path
    """
    def __init__(self, plot: holoviews.core.overlay.NdOverlay,
                 file_name: str,
                 folder_path: PathType,
                 create_folder: bool=True,
                 make_parents: bool=True) -> None:
        self.plot = plot
        self._figure = None
        self.create_folder = create_folder
        self.make_parents = make_parents
        self._folder_path = None
        self.folder_path = folder_path
        self._file_name = None
        self.file_name = file_name
        self._source = None
        self._javascript = None
        self._bokeh_source = None
        self._export_string = None
        return

    @property
    def folder_path(self) -> Path:
        """The path to the folder to store javascript"""
        return self._folder_path

    @folder_path.setter
    def folder_path(self, path: PathType) -> None:
        """Sets the path to the javascript folder"""
        self._folder_path = Path(path)
        if self.create_folder and not self._folder_path.is_dir():
            self._folder_path.mkdir(parents=self.make_parents)
        return

    @property
    def file_name(self) -> str:
        """The name of the javascript file"""
        return self._file_name

    @file_name.setter
    def file_name(self, name: str) -> None:
        """Sets the filename

        Args:
         name: name to save the javascript (without the folder)
        """
        name = Path(name)
        self._file_name = "{}.js".format(name.stem)
        return

    @property
    def figure(self) -> bokeh.plotting.Figure:
        """The Figure to plot"""
        if self._figure is None:
            self._figure = holoviews.render(self.plot)
        return self._figure

    @property
    def bokeh_source(self) -> bokeh.resources.Resources:
        """The javascript source
        """
        if self._bokeh_source is None:
            self._bokeh_source = bokeh.resources.CDN
        return self._bokeh_source

    @property
    def source(self) -> str:
        """The HTML fragment to export"""
        if self._source is None:
            self._javascript, self._source = autoload_static(self.figure,
                                                             self.bokeh_source,
                                                             self.file_name)
        return self._source

    @property
    def javascript(self) -> str:
        """javascript to save"""
        if self._javascript is None:
            self._javascript, self._source = autoload_static(self.figure,
                                                             self.bokeh_source,
                                                             self.file_name)
        return self._javascript

    @property
    def export_string(self) -> str:
        """The string to embed the figure into org-mode"""
        if self._export_string is None:
            self._export_string = textwrap.dedent(
                """#+BEGIN_EXPORT html{}
#+END_EXPORT""".format(self.source))
        return self._export_string

    def save_figure(self) -> None:
        """Saves the javascript file"""
        with open(self.folder_path.joinpath(self.file_name), "w") as writer:
            writer.write(self.javascript)
        return

    def __call__(self) -> None:
        """Creates the bokeh javascript and emits it"""
        self.save_figure()
        print(self.export_string)
        return

    def reset(self) -> None:
        """Sets the generated (bokeh) properties back to None"""
        self._export_string = None
        self._javascript = None
        self._source = None
        self._figure = None
        return
Embed = partial(EmbedBokeh, folder_path=FOLDER_PATH)
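
The partial just pre-fills the output folder, so a call only needs the plot and a file name. A hypothetical example (the DataFrame and the name "example" are placeholders):

# this would write example.js to FOLDER_PATH and print the org-mode export block
example_plot = pandas.DataFrame({"x": range(10), "y": range(10)}).hvplot(x="x", y="y")
Embed(example_plot, "example")()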

The Timer

TIMER = Timer()

The Data

Portland Crime

This is taken from the Portland Crime Statistics page.

portland_path = Path("~/data/datasets/portland/crime-to-january-2018.csv").expanduser()
assert portland_path.is_file()
with TIMER:
    crime = pandas.read_csv(portland_path)
Started: 2019-02-02 18:38:59.025251
Ended: 2019-02-02 18:39:00.170796
Elapsed: 0:00:01.145545
print(crime.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217224 entries, 0 to 217223
Data columns (total 17 columns):
Address              196626 non-null object
Case Number          217224 non-null object
Crime Against        217224 non-null object
Neighborhood         210788 non-null object
Number of Records    217224 non-null int64
Occur Month Year     217224 non-null object
Occur Date           217224 non-null object
Occur Time           217224 non-null int64
Offense Category     217224 non-null object
Offense Count        217224 non-null int64
Offense Type         217224 non-null object
OpenDataLat          193352 non-null float64
OpenDataLon          193352 non-null float64
OpenDataX            193352 non-null float64
OpenDataY            193352 non-null float64
Report Date          217224 non-null object
ReportMonthYear      217224 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 28.2+ MB
None

Here's a possible categorical column to use.

crime["type"] = crime["Crime Against"].astype("category")
crime = crime.drop(columns=["Crime Against"])
print(table(crime.type.value_counts().reset_index(), headers=["Type", "Count"]))
| Type     |  Count |
|----------+--------|
| Property | 175567 |
| Person   |  32109 |
| Society  |   9548 |

Making the Plot

HoloViews expects you to work in a Jupyter notebook and isn't quite so easy to work with in org-mode, so I'll make the plot with hvplot and then convert it to a bokeh figure to embed it in this post.
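
The conversion itself is only a couple of calls: holoviews.render turns the hvplot object into a bokeh figure, and bokeh's autoload_static turns that into a javascript file plus an HTML tag to paste into the post. A minimal sketch with throwaway data (toy and "toy.js" are made-up names):

# hvplot/holoviews object -> bokeh figure
toy = pandas.DataFrame({"x": range(3), "y": range(3)}).hvplot(x="x", y="y")
figure = holoviews.render(toy)
# figure -> standalone javascript plus the <script> tag that loads it
javascript, tag = autoload_static(figure, bokeh.resources.CDN, "toy.js")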

The Plot

with TIMER:
    crime["date"] = pandas.to_datetime(crime["Occur Date"])
    crime["id"] = crime["Case Number"]
    crime = crime.drop(columns=["Occur Date", "Case Number"])
    crime_dates = crime.set_index("date")
Started: 2019-02-01 20:31:47.668915
Ended: 2019-02-01 20:32:09.889378
Elapsed: 0:00:22.220463
weekly = crime_dates.resample("W").count()
plot = weekly.id.hvplot()
Embed(plot, "weekly_crime.js")()

That didn't work out as planned. It turns out that the data starts in 1972, but it's mostly empty until around May of 2015. It also looks like January is missing values. I think I'll trim the data set.
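
Here's a quick way to check that claim about the dates (not part of the original run, just a sanity check):

# the earliest and latest dates, and how the counts spread over the last few years
print(crime_dates.index.min(), crime_dates.index.max())
print(crime_dates.index.year.value_counts().sort_index().tail())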

Trimmed

crime_dates = crime_dates[(crime_dates.index >= datetime(2015, 5, 31))
                          & (crime_dates.index < datetime(2019, 1, 1))]
weekly = crime_dates.resample("W").count()

By Type

HoloViews uses a rather odd way of composing figures. Instead of the object-oriented methods you might expect, it overloads the multiplication operator (* overlays plots on the same axes) and the addition operator (+ places plots side by side), so to plot the types I'll have to multiply their plots.
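
As a quick sketch of the two operators before composing the per-type plots (the second column here is arbitrary, it's only to show the operations):

# * overlays the two curves on the same axes
overlaid = weekly.id.hvplot() * weekly["Offense Count"].hvplot()
# + puts them side by side instead
side_by_side = weekly.id.hvplot() + weekly["Offense Count"].hvplot()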

types = {name: crime_dates[crime_dates.type==name]
         for name in crime_dates.type.unique()}
weekly_types = {name: data.resample("W").count()
                for name, data in types.items()}
keys = list(weekly_types.keys())
first = keys[0]
plot = weekly_types[first].hvplot(y="id", label=first)
for key in keys[1:]:
    plot *= weekly_types[key].hvplot(y="id", label=key)
Embed(plot, "weekly_crime_types")()

It looks like it could use more trimming, but it also looks like it's mostly property crimes, which is what you'd expect, I guess. Actually, I tried another trim and it looks like it always starts at zero because of the way the resampling works, so trimming the data doesn't make that first anomaly go away. Maybe trimming the weekly counts would help.
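
One way to try that (a sketch I didn't actually run above) is to drop the first, partial bin from each per-type weekly count before re-plotting:

# the first resampled bin only covers part of a week, so drop it
weekly_types_trimmed = {name: data.iloc[1:]
                        for name, data in weekly_types.items()}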

Looking a Little More at the Crimes

By Neighborhood

top_ten = crime_dates.Neighborhood.value_counts()[:10].reset_index()
print(table(top_ten, headers="Neighborhood Count".split()))
| Neighborhood        | Count |
|---------------------+-------|
| Downtown            | 10237 |
| Hazelwood           | 10127 |
| Lents               |  5681 |
| Powellhurst-Gilbert |  5605 |
| Centennial          |  5016 |
| Old Town/Chinatown  |  4966 |
| Northwest           |  4648 |
| Montavilla          |  4026 |
| Pearl               |  3905 |
| Lloyd               |  3699 |

neighborhoods = crime_dates["Neighborhood"]
neighborhoods = pandas.get_dummies(neighborhoods)
neighborhoods = neighborhoods[top_ten["index"]].resample("M").sum()
plot = (neighborhoods.hvplot(title="Top Ten Monthly Neighborhood Crime Counts")
        + neighborhoods.hvplot.table(columns=["Downtown", "Hazelwood",
                                              "Lents", "Powellhurst-Gilbert"]))
Embed(plot, "neighborhoods")()

So the first thing to notice is that Downtown and Hazelwood dominate the case counts. There doesn't seem to be any strong upward or downward trend.

I live in Powellhurst-Gilbert, about a block north of Lents, and it looks like if you considered them one big neighborhood (they are adjacent), then they would form the highest-crime neighborhood, but, sticking to the arbitrariness of the boundaries, we are relegated to numbers three and four.
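
A quick check of that claim using the top_ten counts from above:

# Lents plus Powellhurst-Gilbert versus Downtown
lookup = top_ten.set_index("index")["Neighborhood"]
print(lookup["Lents"] + lookup["Powellhurst-Gilbert"], "vs", lookup["Downtown"])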

Distribution

plot = neighborhoods.hvplot.kde(
    by="Neighborhood",
    title="Distributions of Top Ten Crime Neighborhoods")
Embed(plot, "neighborhoods_kde")()

I don't know what that mysterious bulge around zero is; all the neighborhoods are in the other peaks.

Irises

Since the previous data was time-series data, I thought I'd load a data set that isn't, to illustrate the use of the by parameter.

irises = load_iris()
print(irises.DESCR)
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

.. topic:: References

   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...
iris_data = pandas.DataFrame(irises.data, columns=irises.feature_names)
print(iris_data.head())
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

I don't know where this convention came from, but you can use the by keyword to specify a categorical column to differentiate the data points. In this case I'll use it to differentiate the species.

target = pandas.Series(irises.target)
target_map = dict(zip(range(3), irises.target_names))
iris_data["target"] = target.apply(lambda x: target_map[x])
plot = iris_data.hvplot.scatter(x="sepal length (cm)", y="petal length (cm)",
                                by="target", alpha=0.5,
                                title="Iris Sepal Length vs Petal Length")
EmbedBokeh(plot, folder_path=FOLDER_PATH, file_name="irises.js")()

Scatter Matrix

plot = hvplot.scatter_matrix(iris_data, c="target")
Embed(plot, "iris_scatter_matrix")()

Parallel Coordinates

plot = hvplot.parallel_coordinates(iris_data, "target")
Embed(plot, "iris_parallel_coordinates")()