Visualizing Naive Bayes

Cloistered Monkey

2020-08-31 06:16

Beginning

In the previous post I made a class-based version of the Naive Bayes Classifier for tweets. For this post I'm going to plot the model values. It turns out that we need to get at some values that the previous implementations hide so I'm going to re-calculate the likelihoods from scratch rather than alter the previous code.

Set Up

Imports

# python
from argparse import Namespace
from functools import partial
from pathlib import Path

import os
import pickle

# from pypi
from dotenv import load_dotenv
from matplotlib.patches import Ellipse

import holoviews
import hvplot.pandas
import matplotlib.pyplot as pyplot
import matplotlib.transforms as transforms
import numpy
import pandas
import seaborn

# this project
from neurotic.nlp.twitter.counter import WordCounter

# graeae
from graeae import EmbedHoloviews

The Dotenv

env_path = Path("posts/nlp/.env")
assert env_path.is_file()
load_dotenv(env_path)

Plotting

SLUG = "visualizing-naive-bayes"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}", create_folder=False)

plot_path = Path(os.environ["TWITTER_PLOT"])
assert plot_path.is_file()
with plot_path.open("rb") as reader:
    Plot = pickle.load(reader)

seaborn.set_style("whitegrid", rc={"axes.grid": False})
FIGURE_SIZE = (12, 10)

The Data

train_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())

test_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TEST_RAW"]).expanduser()
)

print(f"Training: {len(train_raw):,}")
print(f"Testing: {len(test_raw):,}")

Training: 8,000
Testing: 2,000

The Word Counter

This is a class to clean and tokenize the tweets and build up a Counter with the word counts.

counter = WordCounter(train_raw.tweet, train_raw.label)

Constants

Sentiment = Namespace(
    positive = 1,
    negative = 0,
)

Middle

Log Likelihoods

Calculating the Likelihoods

The first thing to plot are the log-likelihoods for positive and negative tweets. When I implemented the Naive Bayes Classifier I took advantage of the fact that we're making a binary classifier and took the odds ratio when making predictions, but for our plot we're going to need to undo the division and plot the numerator against the denominator.

\begin{align} log \frac{P(tweet|pos)}{P(tweet|neg)} &= log(P(tweet|pos)) - log(P(tweet|neg)) \\ positive = log(P(tweet|pos)) &= \sum_{i=0}^{n}{log P(W_i|pos)}\\ negative = log(P(tweet|neg)) &= \sum_{i=0}^{n}{log P(W_i|neg)}\\ \end{align}

So, let's get the log-likelihoods.

COUNTS = counter.counts

positive_loglikelihood = {}
negative_loglikelihood = {}
log_ratio = {}

all_positive_words = sum(
    (counts[(token, sentiment)] for token, sentiment in COUNTS
     if sentiment == Sentiment.positive))
all_negative_words = sum(
    (counts[(token, sentiment)] for token, sentiment in COUNTS
     if sentiment == Sentiment.negative))

vocabulary = {key[0] for key in COUNTS}
vocabulary_size = len(vocabulary)

for word in vocabulary:
    this_word_positive_count = COUNTS[(word, Sentiment.positive)]
    this_word_negative_count = COUNTS[(word, Sentiment.negative)]

    probability_word_is_positive = ((this_word_positive_count + 1)/
                                    (all_positive_words + vocabulary_size))
    probability_word_is_negative = ((this_word_negative_count + 1)/
                                    (all_negative_words + vocabulary_size))
    positive_loglikelihood[word] = numpy.log(probability_word_is_positive)
    negative_loglikelihood[word] = numpy.log(probability_word_is_negative)
    log_ratio[word] = positive_loglikelihood[word] - negative_loglikelihood[word]

So now we have our positive and negative log-likelihoods and I'll put them into a pandas DataFrame to make it easier to plot.

positive_document_likelihood = []
negative_document_likelihood = []
sentiment = []

for row in train_raw.itertuples():
    tokens = counter.process(row.tweet)

    positive_document_likelihood.append(sum(positive_loglikelihood.get(token, 0)
                                            for token in tokens))
    negative_document_likelihood.append(sum(negative_loglikelihood.get(token, 0)
                                            for token in tokens))
    sentiment.append(row.label)

features = pandas.DataFrame.from_dict(
    dict(
        positive = positive_document_likelihood,
        negative = negative_document_likelihood,
        sentiment=sentiment,
    )
)

print(features.head())

     positive   negative  sentiment
0  -26.305672 -33.940649          1
1  -30.909803 -37.634516          1
2  -42.936400 -33.403567          0
3  -15.983546 -25.501140          1
4 -107.899933 -99.191875          0

Plotting the Likelihoods

plot = features.hvplot.scatter(x="positive", y="negative", by="sentiment",
                               color=Plot.color_cycle, fill_alpha=0).opts(
                                   title="Positive vs Negative",
                                   width=Plot.width,
                                   height=Plot.height,
                                   fontscale=Plot.font_scale,
                               )

outcome = Embed(plot=plot, file_name="positive_vs_negative_sentiment")()

print(outcome)

It looks like the log likelihoods for the negatives are linearly separable.

Confidence Ellipses

Now we're going to plot a Confidence Region, which is a generalization of a confidence interval to higher dimensions. In this case we're going to create confidence ellipses. I'm not really sure about the details of the math to get them, but matplotlib has a page with a function to create a matplotlib plot for a confidence ellipse that I'm going to adapt.

The Ellipse Function

This is taken almost verbatim from matplotlib's page.

def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', **kwargs):
    """
    Create a plot of the covariance confidence ellipse of `x` and `y`

    Parameters
    ----------
    x, y : array_like, shape (n, )
       Input data.

    ax : matplotlib.axes.Axes
       The axes object to draw the ellipse into.

    n_std : float
       The number of standard deviations to determine the ellipse's radiuses.

    Returns
    -------
    matplotlib.patches.Ellipse

    Other parameters
    ----------------
    kwargs : `~matplotlib.patches.Patch` properties
    """
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = numpy.cov(x, y)
    pearson = cov[0, 1]/numpy.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensionl dataset.
    ell_radius_x = numpy.sqrt(1 + pearson)
    ell_radius_y = numpy.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * 2,
                      height=ell_radius_y * 2,
                      facecolor=facecolor,
                      **kwargs
                      )

    # Calculating the stdandard deviation of x from
    # the squareroot of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = numpy.sqrt(cov[0, 0]) * n_std
    mean_x = numpy.mean(x)

    # calculating the stdandard deviation of y ...
    scale_y = numpy.sqrt(cov[1, 1]) * n_std
    mean_y = numpy.mean(y)

    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

figure, axis = pyplot.subplots(figsize = (12, 12))

positives = features[features.sentiment==Sentiment.positive]
negatives = features[features.sentiment==Sentiment.negative]

confidence_ellipse(positives.positive, positives.negative, axis, n_std=2,
                   label=r'$2\sigma$', edgecolor=Plot.red)
confidence_ellipse(negatives.positive, negatives.negative, axis, n_std=2, edgecolor=Plot.blue)

confidence_ellipse(positives.positive, positives.negative, axis, n_std=3,
                   label=r'$3\sigma$', edgecolor=Plot.red)
confidence_ellipse(negatives.positive, negatives.negative, axis, n_std=3, edgecolor=Plot.blue)

SIZE = 0.5
_ = positives.plot.scatter(x="positive", y="negative", s=SIZE, ax=axis, facecolors="none",
                           color=Plot.blue, label="positive")
negatives.plot.scatter(x="positive", y="negative", s=SIZE, ax=axis,
                       label="negative", color=Plot.red)

LIMITS = (-200, 100)
axis.set_xlim(LIMITS)
axis.set_ylim(LIMITS)
axis.legend()
axis.set_title("Confidence Ellipses")

figure.savefig("ellipses.png")

It's a bit squashed looking, since the results are so tight, but you can sort of see that the distributions are "left-skewed", with the points that fall outside of the \(3 \sigma\) range being the cases where the "positive" and "negative" values are both negative.