Class-Based Naive Bayes Tweet Sentiment Classifier
Beginning
I previously implemented a Naive Bayes Classifier for Tweets as a set of separate functions, and while that was useful for learning, I want to be able to re-use it, so I'm going to re-implement it as a class-based system.
The Naive Bayes Classifier
Imports
# python
from argparse import Namespace
from collections import Counter
from typing import Iterable
# pypi
import attr
import numpy
# my stuff
from neurotic.nlp.twitter.counter import WordCounter
The Sentiment Constants
Sentiment = Namespace(
negative = 0,
positive = 1,
)
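Namespace here is just argparse's simple attribute container, so the labels read as names instead of magic numbers:

# a quick check of the constants
print(Sentiment.positive)  # 1
print(Sentiment.negative)  # 0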
The Declaration
@attr.s(auto_attribs=True)
class NaiveBayes:
"""Naive Bayes Sentiment Classifier for Tweets
Args:
tweets: the training tweets
labels: the sentiment labels for the training tweets
"""
tweets: Iterable
labels: Iterable
_counter: WordCounter = None
_vocabulary: set = None
_logprior: float = None
_loglikelihood: dict = None
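One attrs detail this relies on: with auto_attribs, the leading underscore is stripped from the generated __init__ parameter names, so the cached attributes can be passed in directly, which is handy for injecting a pre-built (or mocked) WordCounter in tests. A small sketch (my_word_counter is a stand-in for a real WordCounter):

# the private attributes become plain keyword arguments
classifier = NaiveBayes(tweets=[], labels=[], counter=my_word_counter)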
The Counter
@property
def counter(self) -> WordCounter:
"""The word processor/counter"""
if self._counter is None:
self._counter = WordCounter(self.tweets, self.labels)
return self._counter
The Vocabulary
@property
def vocabulary(self) -> set:
"""The unique tokens in the tweets"""
if self._vocabulary is None:
self._vocabulary = {key[0] for key in self.counter.counts}
return self._vocabulary
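This works because the counter's counts is a Counter keyed by (token, sentiment-label) pairs, so key[0] pulls out just the tokens. An illustrative shape (the tokens and counts here are made up):

Counter({("happi", 1): 12,
         ("sad", 0): 9})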
The logprior
@property
def logprior(self) -> float:
"""the log-odds of the priors"""
if self._logprior is None:
positive_documents = numpy.sum(self.labels)
negative_documents = len(self.labels) - positive_documents
self._logprior = numpy.log(positive_documents) - numpy.log(negative_documents)
return self._logprior
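For reference, this is the standard Naive Bayes prior term, the log-ratio of positive to negative documents in the training set:

\[
\textit{logprior} = \log\left(\frac{D_{pos}}{D_{neg}}\right) = \log D_{pos} - \log D_{neg}
\]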
The loglikelihood
@property
def loglikelihood(self) -> dict:
"""The log-likelihoods for words"""
if self._loglikelihood is None:
self._loglikelihood = {}
counts = self.counter.counts
all_positive_words = sum(
(counts[(token, sentiment)] for token, sentiment in counts
if sentiment == Sentiment.positive))
all_negative_words = sum(
(counts[(token, sentiment)] for token, sentiment in counts
if sentiment == Sentiment.negative))
vocabulary_size = len(self.vocabulary)
for word in self.vocabulary:
this_word_positive_count = counts[(word, Sentiment.positive)]
this_word_negative_count = counts[(word, Sentiment.negative)]
probability_word_is_positive = ((this_word_positive_count + 1)/
(all_positive_words + vocabulary_size))
probability_word_is_negative = ((this_word_negative_count + 1)/
(all_negative_words + vocabulary_size))
self._loglikelihood[word] = (numpy.log(probability_word_is_positive) -
numpy.log(probability_word_is_negative))
return self._loglikelihood
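In equation form, each word gets the difference of its add-one (Laplace) smoothed class-conditional log-probabilities, where \(N_{pos}\) and \(N_{neg}\) are the total token counts per class and \(V\) is the vocabulary size:

\[
P(w|pos) = \frac{count(w, pos) + 1}{N_{pos} + V} \qquad
P(w|neg) = \frac{count(w, neg) + 1}{N_{neg} + V}
\]

\[
\textit{loglikelihood}(w) = \log P(w|pos) - \log P(w|neg)
\]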
Predict Ratio
def predict_ratio(self, tweet: str) -> float:
"""predict the log-odds-ratio of positive to negative sentiment
Args:
tweet: the tweet to predict
Returns:
log-odds-ratio for tweet (positive/negative)
"""
tokens = self.counter.process(tweet)
return self.logprior + sum(self.loglikelihood.get(token, 0) for token in tokens)
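So the score for a tweet whose processed tokens are \(w_1, \ldots, w_m\) is the prior plus the sum of the per-token log-likelihoods, with unknown tokens contributing zero (that's the .get(token, 0)):

\[
\textit{score} = \textit{logprior} + \sum_{i=1}^{m} \textit{loglikelihood}(w_i)
\]

The next method then classifies the tweet as positive exactly when this score is greater than zero.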
Predict Sentiment
def predict_sentiment(self, tweet: str) -> int:
"""Predict whether the tweet's sentiment is positive or negative
Args:
tweet: the 'document' to analyze
Returns:
the sentiment (0=negative, 1=positive)
"""
return int(self.predict_ratio(tweet) > 0)
Check Rep
def check_rep(self) -> None:
"""Does some basic checks of the input arguments"""
assert len(self.tweets) == len(self.labels)
return
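To round out the implementation, here's a minimal usage sketch. The toy tweets and labels are made up, and it assumes the WordCounter does the tokenizing, stop-word removal, and stemming described in the earlier post:

from neurotic.nlp.twitter.naive_bayes import NaiveBayes

# a made-up two-tweet training set
classifier = NaiveBayes(
    tweets=["I am happy because I am learning",
            "this movie was terrible and sad"],
    labels=[1, 0],
)
classifier.check_rep()

# expected: a positive score, then 0 (negative),
# assuming the tokens survive the counter's processing
print(classifier.predict_ratio("happy learning"))
print(classifier.predict_sentiment("so sad"))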
Testing
Imports
"""NaiveBayes Tweet Sentiment Classifier feature tests."""
# python
from collections import Counter
import math
# pypi
from expects import (
be,
be_empty,
be_true,
equal,
expect,
)
from pytest_bdd import (
given,
scenarios,
then,
when,
)
# this test repo
from fixtures import katamari
# software under test
from neurotic.nlp.twitter.counter import WordCounter
from neurotic.nlp.twitter.naive_bayes import NaiveBayes
Test Setup
scenarios("../../features/twitter/naive_bayes.feature")
Can you construct it?
Feature: NaiveBayes Tweet Sentiment Classifier
Scenario: The user builds the classifier
Given a Naive Bayes definition
When the user builds the classifier
Then it has the expected attributes
# Scenario: The user builds the classifier
@given('a Naive Bayes definition')
def a_naive_bayes_definition(katamari):
katamari.definition = NaiveBayes
return
@when('the user builds the classifier')
def the_user_builds_the_classifier(katamari):
katamari.labels = [0, 1, 1]
katamari.tweets = "alfa bravo charley".split()
katamari.classifier = katamari.definition(tweets=katamari.tweets,
labels=katamari.labels)
return
@then('it has the expected attributes')
def it_has_the_expected_attributes(katamari):
expect(katamari.classifier.tweets).to(be(katamari.tweets))
expect(katamari.classifier.labels).to(be(katamari.labels))
katamari.classifier.check_rep()
return
Does it build the counter?
Scenario: The user checks the counter
Given a Naive Bayes classifier
When the user checks the counter
Then it is the expected counter
# Scenario: The user checks the counter
@given("a Naive Bayes classifier")
def build_naive_classifier(katamari):
katamari.classifier = NaiveBayes(tweets=[], labels=[])
return
@when("the user checks the counter")
def check_counter(katamari, mocker):
katamari.counter = mocker.MagicMock(spec=WordCounter)
katamari.counter_definition = mocker.MagicMock()
katamari.counter_definition.return_value = katamari.counter
mocker.patch("neurotic.nlp.twitter.naive_bayes.WordCounter", katamari.counter_definition)
katamari.actual_counter = katamari.classifier.counter
return
@then("it is the expected counter")
def expect_counter(katamari):
expect(katamari.actual_counter).to(be(katamari.counter))
return
Does it build the logprior?
Scenario: The user checks the log-prior
Given a valid Naive Bayes Classifier
When the user checks the log-odds prior
Then it is close enough
# Scenario: The user checks the log-prior
@given("a valid Naive Bayes Classifier")
def setup_classifier(katamari):
katamari.tweets = ["a blowfish", "b closing", "c that", "d plane"]
katamari.labels = [1, 1, 0, 1]
katamari.counts = Counter({
("appl", 0): 5,
("b", 1): 2,
("c", 1): 4,
})
katamari.classifier = NaiveBayes(tweets=katamari.tweets,
                                 labels=katamari.labels)
katamari.classifier.counter._counts = katamari.counts
return
@when("the user checks the log-odds prior")
def get_log_odds_prior(katamari):
katamari.expected = math.log(3) - math.log(1)
katamari.actual = katamari.classifier.logprior
return
@then("it is close enough")
def expect_close_enough(katamari):
expect(math.isclose(katamari.actual, katamari.expected)).to(be_true)
return
Does it build the vocabulary?
Scenario: The user checks the vocabulary
Given a valid Naive Bayes Classifier
When the user checks the vocabulary
Then all the words are there
# Scenario: The user checks the vocabulary
# Given a valid Naive Bayes Classifier
@when("the user checks the vocabulary")
def check_vocabulary(katamari):
katamari.actual = katamari.classifier.vocabulary
katamari.expected = {"appl", "b", "c"}
return
@then("all the words are there")
def compare_words(katamari):
expect(katamari.actual ^ katamari.expected).to(be_empty)
return
Does it build the log-likelihood?
Scenario: The user gets the log-likelihood dictionary
Given a valid Naive Bayes Classifier
When the user checks the loglikelihoods
Then they are close enough
# Scenario: The user gets the log-likelihood dictionary
# Given a valid Naive Bayes Classifier
@when("the user checks the loglikelihoods")
def check_log_likelihoods(katamari):
katamari.expected = dict(
appl=math.log(1/9) - math.log(6/8),
b=math.log(3/9) - math.log(1/8),
c=math.log(5/9) - math.log(1/8)
)
katamari.actual = katamari.classifier.loglikelihood
return
@then("they are close enough")
def expect_close_values(katamari):
for word in katamari.classifier.loglikelihood:
expect(math.isclose(katamari.expected[word],
katamari.actual[word])).to(be_true)
return
Does it predict probabilities?
Scenario: User predicts tweet positive probability
Given a valid Naive Bayes Classifier
When the user makes a tweet prediction
Then it is the expected probability
# Scenario: User predicts tweet positive probability
# Given a valid Naive Bayes Classifier
@when("the user makes a tweet prediction")
def check_prediction(katamari):
katamari.expected = (katamari.classifier.logprior
+ katamari.classifier.loglikelihood["c"]
+ katamari.classifier.loglikelihood["b"])
katamari.actual = katamari.classifier.predict_ratio(
"c you later b"
)
return
@then("it is the expected probability")
def expect_probability(katamari):
expect(math.isclose(katamari.actual, katamari.expected)).to(be_true)
return
Does it predict the sentiment?
Scenario: The user predicts tweet sentiment
Given a valid Naive Bayes Classifier
When the user predicts the sentiment of tweets
Then the sentiments are the expected ones
# Scenario: The user predicts tweet sentiment
# Given a valid Naive Bayes Classifier
@when("the user predicts the sentiment of tweets")
def check_predict_sentiment(katamari):
katamari.actual_1 = katamari.classifier.predict_sentiment("c you later b")
katamari.expected_1 = 1
katamari.actual_2 = katamari.classifier.predict_sentiment("apple banana tart")
katamari.expected_2 = 0
return
@then("the sentiments are the expected ones")
def expect_sentiments(katamari):
expect(katamari.actual_1).to(equal(katamari.expected_1))
expect(katamari.actual_2).to(equal(katamari.expected_2))
return
End
Now that we have the class-based version, let's do a little visualization of the model.