Class-Based Naive Bayes Tweet Sentiment Classifier

Beginning

I previously implemented a Naive Bayes Classifier for Tweets as separate functions, and while that is useful for learning, I want to re-use it, so I'm going to re-implement it as a class-based system.

The Naive Bayes Classifier

Imports

# python
from argparse import Namespace
from collections import Counter
from typing import Iterable

# pypi
import attr
import numpy

# my stuff
from neurotic.nlp.twitter.counter import WordCounter

The Sentiment Constants

# Integer codes for the two sentiment classes: 0 is negative, 1 is positive.
Sentiment = Namespace(negative=0, positive=1)

The Declaration

@attr.s(auto_attribs=True)
class NaiveBayes:
    """Naive Bayes Sentiment Classifier for Tweets

    The training tweets and labels are given when the object is built. The
    underscore-prefixed attributes all default to ``None``; the properties
    with the matching names (``counter``, ``vocabulary``, ``logprior``,
    ``loglikelihood``) fill them in the first time they are read.

    Args:
     tweets: the training tweets
     labels: the sentiment labels for the training tweets
    """
    # the training data
    tweets: Iterable
    labels: Iterable
    # storage for the lazily-built values (None means "not built yet")
    _counter: WordCounter = None
    _vocabulary: set = None
    _logprior: float = None
    _loglikelihood: dict = None

The Counter

@property
def counter(self) -> WordCounter:
    """The WordCounter for the training tweets and labels

    It is built from ``tweets`` and ``labels`` the first time it is read,
    and that same object is handed back on every later read.
    """
    if self._counter is not None:
        return self._counter
    self._counter = WordCounter(self.tweets, self.labels)
    return self._counter

The Vocabulary

@property
def vocabulary(self) -> set:
    """The set of distinct tokens found in the counter's counts

    Every key in ``counter.counts`` contributes its first element. The set
    is built on the first read and re-used after that.
    """
    if self._vocabulary is not None:
        return self._vocabulary
    self._vocabulary = {entry[0] for entry in self.counter.counts}
    return self._vocabulary

The logprior

@property
def logprior(self) -> float:
    """The log-prior: log(positive documents) - log(negative documents)

    The positive count is the sum of the labels and the negative count is
    whatever is left over. The value is computed on the first read and
    re-used after that.
    """
    if self._logprior is not None:
        return self._logprior
    positives = numpy.sum(self.labels)
    negatives = len(self.labels) - positives
    log_positives, log_negatives = numpy.log(positives), numpy.log(negatives)
    self._logprior = log_positives - log_negatives
    return self._logprior

The loglikelihood

@property
def loglikelihood(self) -> dict:
    """The log-likelihood ratio for every word in the vocabulary

    Each word maps to ``log(P(word|positive)) - log(P(word|negative))``
    where both probabilities use add-one smoothing::

        P(word|sentiment) = (count(word, sentiment) + 1) / (N_sentiment + V)

    ``N_sentiment`` is the sum of the counts for every token with that
    sentiment and ``V`` is the size of the vocabulary.

    The dictionary is built on the first read and re-used after that. It
    is only stored once it is complete, so an error part-way through the
    build doesn't leave an empty or half-filled dictionary behind for
    later reads to return.

    Returns:
     dictionary mapping each word to its log-likelihood ratio
    """
    if self._loglikelihood is None:
        counts = self.counter.counts
        vocabulary = self.vocabulary
        vocabulary_size = len(vocabulary)

        # total token counts for each sentiment, gathered in a single pass
        totals = {Sentiment.positive: 0, Sentiment.negative: 0}
        for (_, sentiment), count in counts.items():
            if sentiment in totals:
                totals[sentiment] += count

        # the denominators don't change from word to word
        positive_denominator = totals[Sentiment.positive] + vocabulary_size
        negative_denominator = totals[Sentiment.negative] + vocabulary_size

        # build into a local so the cache is never seen partly filled
        loglikelihood = {}
        for word in vocabulary:
            probability_word_is_positive = (
                (counts[(word, Sentiment.positive)] + 1) / positive_denominator)
            probability_word_is_negative = (
                (counts[(word, Sentiment.negative)] + 1) / negative_denominator)
            loglikelihood[word] = (numpy.log(probability_word_is_positive)
                                   - numpy.log(probability_word_is_negative))
        self._loglikelihood = loglikelihood
    return self._loglikelihood

Predict Probability

def predict_ratio(self, tweet: str) -> float:
    """Score a tweet with the log of the positive/negative odds-ratio

    The tweet is tokenized with the counter's ``process`` method. Tokens
    that aren't in the log-likelihood dictionary add nothing to the score.

    Args:
     tweet: the tweet to score

    Returns:
     the log-prior plus the log-likelihoods of the tweet's tokens
    """
    tokens = self.counter.process(tweet)
    prior = self.logprior
    likelihoods = sum([self.loglikelihood.get(token, 0) for token in tokens])
    return prior + likelihoods

Predict Sentiment

def predict_sentiment(self, tweet: str) -> int:
    """Predict whether the tweet's sentiment is positive or negative

    A tweet is positive only when its log-odds ratio (``predict_ratio``)
    is greater than zero; a ratio of exactly zero counts as negative.

    Args:
     tweet: the 'document' to analyze

    Returns:
     the sentiment as a plain integer (0=negative, 1=positive)
    """
    # the comparison gives a bool (or numpy bool), so convert it to the
    # integer that the signature and docstring promise
    return int(self.predict_ratio(tweet) > 0)

Check Rep

def check_rep(self) -> None:
    """Check that there is exactly one label for every training tweet

    Raises:
     AssertionError: the number of tweets and labels differ
    """
    tweet_count, label_count = len(self.tweets), len(self.labels)
    # raise explicitly instead of using ``assert`` so the check still runs
    # when python is started with -O (which strips assert statements)
    if tweet_count != label_count:
        raise AssertionError(
            "tweets and labels differ in length: "
            f"{tweet_count} tweets, {label_count} labels")

Testing

Imports

"""NaiveBayes Tweet Sentiment Classifier feature tests."""

# python
from collections import Counter

import math

# pypi
from expects import (
    be,
    be_empty,
    be_true,
    equal,
    expect,
)

from pytest_bdd import (
    given,
    scenarios,
    then,
    when,
)

import pytest_bdd

# this test repo
from fixtures import katamari

# software under test
from neurotic.nlp.twitter.counter import WordCounter
from neurotic.nlp.twitter.naive_bayes import NaiveBayes

Test Setup

# hand the feature file (path as written here) to pytest_bdd's ``scenarios``
scenarios("../../features/twitter/naive_bayes.feature")

Can you construct it?

Feature: NaiveBayes Tweet Sentiment Classifier

Scenario: The user builds the classifier
  Given a Naive Bayes definition
  When the user builds the classifier
  Then it has the expected attributes
# Scenario: The user builds the classifier


@given('a Naive Bayes definition')
def a_naive_bayes_definition(katamari):
    """Store the NaiveBayes class itself (not an instance) on the katamari"""
    katamari.definition = NaiveBayes


@when('the user builds the classifier')
def the_user_builds_the_classifier(katamari):
    """Build a classifier from three one-word tweets and their labels"""
    katamari.tweets = ["alfa", "bravo", "charley"]
    katamari.labels = [0, 1, 1]
    build = katamari.definition
    katamari.classifier = build(tweets=katamari.tweets, labels=katamari.labels)


@then('it has the expected attributes')
def it_has_the_expected_attributes(katamari):
    """Check the classifier holds the very same tweets and labels objects"""
    classifier = katamari.classifier
    held_tweets, given_tweets = classifier.tweets, katamari.tweets
    expect(held_tweets).to(be(given_tweets))
    held_labels, given_labels = classifier.labels, katamari.labels
    expect(held_labels).to(be(given_labels))
    classifier.check_rep()

Does it build the counter?

Scenario: The user checks the counter
  Given a Naive Bayes classifier
  When the user checks the counter
  Then it is the expected counter
# Scenario: The user checks the counter

@given("a Naive Bayes classifier")
def build_naive_classifier(katamari):
    """Build a classifier that has no training tweets or labels"""
    empty_training_data = dict(tweets=[], labels=[])
    katamari.classifier = NaiveBayes(**empty_training_data)


@when("the user checks the counter")
def check_counter(katamari, mocker):
    """Patch in a mock WordCounter class then read the classifier's counter

    The patch has to be in place before ``counter`` is read so that the
    classifier builds its counter from the mock definition.
    """
    katamari.counter = mocker.MagicMock(spec=WordCounter)
    katamari.counter_definition = mocker.MagicMock(
        return_value=katamari.counter)
    mocker.patch("neurotic.nlp.twitter.naive_bayes.WordCounter",
                 katamari.counter_definition)
    katamari.actual_counter = katamari.classifier.counter


@then("it is the expected counter")
def expect_counter(katamari):
    """The counter that was read has to be the mock counter object itself"""
    actual, expected = katamari.actual_counter, katamari.counter
    expect(actual).to(be(expected))

Does it build the logprior?

Scenario: The user checks the log-prior
 Given a valid Naive Bayes Classifier
 When the user checks the log-odds prior
 Then it is close enough
# Scenario: The user checks the log-prior

@given("a valid Naive Bayes Classifier")
def setup_classifier(katamari):
    """Build a classifier and swap known counts into its counter

    The counts are set on the counter's ``_counts`` attribute so the
    expected values in the other steps can be worked out by hand.
    """
    katamari.tweets = ["a blowfish", "b closing", "c that", "d plane"]
    katamari.labels = [1, 1, 0, 1]
    known_counts = {("appl", 0): 5, ("b", 1): 2, ("c", 1): 4}
    katamari.counts = Counter(known_counts)

    classifier = NaiveBayes(tweets=katamari.tweets, labels=katamari.labels)
    katamari.classifier = classifier
    classifier.counter._counts = katamari.counts


@when("the user checks the log-odds prior")
def get_log_odds_prior(katamari):
    """Store the expected log-prior and the one the classifier computes"""
    log_numerator, log_denominator = math.log(3), math.log(1)
    katamari.expected = log_numerator - log_denominator
    katamari.actual = katamari.classifier.logprior


@then("it is close enough")
def expect_close_enough(katamari):
    """The actual value has to match the expected one (by math.isclose)"""
    is_close = math.isclose(katamari.actual, katamari.expected)
    expect(is_close).to(be_true)

Does it build the vocabulary?

Scenario: The user checks the vocabulary
  Given a valid Naive Bayes Classifier
  When the user checks the vocabulary
  Then all the words are there
# Scenario: The user checks the vocabulary
#  Given a valid Naive Bayes Classifier


@when("the user checks the vocabulary")
def check_vocabulary(katamari):
    """Store the expected tokens and the classifier's vocabulary"""
    katamari.expected = {"appl", "b", "c"}
    katamari.actual = katamari.classifier.vocabulary


@then("all the words are there")
def compare_words(katamari):
    """No word may be in only one of the actual and expected sets"""
    only_in_one = katamari.actual ^ katamari.expected
    expect(only_in_one).to(be_empty)

Does it build the log-likelihood?

Scenario: The user gets the log-likelihood dictionary
  Given a valid Naive Bayes Classifier
  When the user checks the loglikelihoods
  Then they are close enough
# Scenario: The user gets the log-likelihood dictionary
#  Given a valid Naive Bayes Classifier


@when("the user checks the loglikelihoods")
def check_log_likelihoods(katamari):
    """Store the hand-computed log-likelihoods and the classifier's

    Each expected value is log(P(word|positive)) - log(P(word|negative)).
    """
    katamari.expected = {
        "appl": math.log(1/9) - math.log(6/8),
        "b": math.log(3/9) - math.log(1/8),
        "c": math.log(5/9) - math.log(1/8),
    }
    katamari.actual = katamari.classifier.loglikelihood


@then("they are close enough")
def expect_close_values(katamari):
    """Check the actual log-likelihoods against the expected ones

    The words are compared first so that a missing (or extra) word fails
    the test instead of being skipped, then each expected value is compared
    to the actual value using ``math.isclose``.
    """
    expected, actual = katamari.expected, katamari.actual
    # looping over the actual dictionary alone would pass if it were empty
    expect(set(actual)).to(equal(set(expected)))
    for word, expected_value in expected.items():
        expect(math.isclose(expected_value, actual[word])).to(be_true)

Does it predict probabilities?

Scenario: User predicts tweet positive probability
  Given a valid Naive Bayes Classifier
  When the user makes a tweet prediction
  Then it is the expected probability
# Scenario: User predicts tweet positive probability
#   Given a valid Naive Bayes Classifier


@when("the user makes a tweet prediction")
def check_prediction(katamari):
    """Store the expected and actual log-odds ratio for 'c you later b'

    Only 'c' and 'b' are looked up for the expected value, so it is the
    log-prior plus the log-likelihoods of those two tokens.
    """
    classifier = katamari.classifier
    katamari.expected = (classifier.logprior
                         + classifier.loglikelihood["c"]
                         + classifier.loglikelihood["b"])
    katamari.actual = classifier.predict_ratio("c you later b")


@then("it is the expected probability")
def expect_probability(katamari):
    """The predicted ratio has to match the expected one (by math.isclose)"""
    is_close = math.isclose(katamari.actual, katamari.expected)
    expect(is_close).to(be_true)

Does it predict the sentiment?

Scenario: The user predicts tweet sentiment
  Given a valid Naive Bayes Classifier
  When the user predicts the sentiment of tweets
  Then the sentiments are the expected ones
# Scenario: The user predicts tweet sentiment
#   Given a valid Naive Bayes Classifier


@when("the user predicts the sentiment of tweets")
def check_predict_sentiment(katamari):
    """Predict the sentiment for two tweets and store what is expected"""
    predict = katamari.classifier.predict_sentiment

    katamari.actual_1, katamari.expected_1 = predict("c you later b"), 1
    katamari.actual_2, katamari.expected_2 = predict("apple banana tart"), 0


@then("the sentiments are the expected ones")
def expect_sentiments(katamari):
    """Both predicted sentiments have to equal their expected values"""
    outcomes = (
        (katamari.actual_1, katamari.expected_1),
        (katamari.actual_2, katamari.expected_2),
    )
    for actual, expected in outcomes:
        expect(actual).to(equal(expected))

End

Now that we have the class-based version, let's do a little visualization of the model.