Tweet Classifier Class

Beginning

I implemented the Logistic Regression Tweet Sentiment Analysis classifier in a previous post, but I'm going to re-use it later, so this post just gathers everything together. There's already a class called TweetSentiment, but I'm going to add the training to this one, as well as the tweet pre-processing and vectorization.

Middle

We'll start with the imports.

# from pypi
import attr
import numpy

# this project
from .counter import WordCounter
from .sentiment import TweetSentiment
from .vectorizer import TweetVectorizer

The Logistic Regression Class

@attr.s(auto_attribs=True)
class LogisticRegression:
    """Train a logistic regression model and predict tweet sentiment

    Args:
     iterations: number of times to run gradient descent
     learning_rate: how fast to change the weights during training
     weights: optional starting weights (kept in the private ``_weights``)
     loss: the most recently stored training loss (None until one is set)
    """
    iterations: int
    learning_rate: float
    # numpy.array is a factory function, numpy.ndarray is the actual type
    _weights: numpy.ndarray = None
    loss: float = None

Weights

These are the weights for the regression function (\(\theta\)).

@property
def weights(self) -> numpy.ndarray:
    """The weights for the regression

    If no weights have been set yet, a 3 x 1 array of zeros is created
    (and stored) the first time this is read.

    Returns:
     the current weights
    """
    if self._weights is None:
        self._weights = numpy.zeros((3, 1))
    return self._weights

The Weights Setter

@weights.setter
def weights(self, new_weights: numpy.ndarray) -> None:
    """Set the weights to a new value

    Args:
     new_weights: the array to use as the regression weights
    """
    self._weights = new_weights

Sigmoid

def sigmoid(self, vectors: numpy.ndarray) -> numpy.ndarray:
    """Calculates the logistic function value

    Args:
     vectors: the values to squash into the range 0 to 1 -- during
       training this is the product of the tweet vectors (bias, positive,
       negative word counts) and the weights

    Returns:
     array of probabilities that the tweets are positive
    """
    return 1/(1 + numpy.exp(-vectors))

This is the training function. It runs gradient descent to find the weights.

def gradient_descent(self, x: numpy.ndarray, y: numpy.ndarray):
    """Finds the weights for the model

    Note:
     The loss is calculated before each weight update, so the returned
     value describes the weights going into the final iteration, not the
     weights that are left on the model afterwards.

    Args:
     x: the tweet vectors
     y: the positive/negative labels

    Returns:
     the mean loss from the last iteration (None if no iterations are run)
    """
    assert len(x) == len(y), "x and y need the same number of rows"
    rows = len(x)
    # scale a local copy so that training more than once doesn't keep
    # shrinking the learning rate stored on the object
    step = self.learning_rate / rows
    # stays None when there are no iterations to run
    loss = None
    for _ in range(self.iterations):
        y_hat = self.sigmoid(x.dot(self.weights))
        # average loss
        loss = numpy.squeeze(-((y.T.dot(numpy.log(y_hat))) +
                               (1 - y.T).dot(numpy.log(1 - y_hat))))/rows
        gradient = ((y_hat - y).T.dot(x)).sum(axis=0, keepdims=True)
        self.weights -= step * gradient.T
    return loss

Fit

This is mostly an alias to make it match (somewhat) sklearn's methods.

def fit(self, x_train: numpy.ndarray, y_train: numpy.ndarray) -> float:
    """fits the weights for the logistic regression

    Note:
     as a side effect this also sets the counter and loss attributes and
     updates the weights

    Args:
     x_train: the training tweets
     y_train: the training labels (pandas Series, array, or list)

    Returns:
     The final mean loss (which is also saved as the =.loss= attribute)
    """
    self.counter = WordCounter(x_train, y_train)
    vectorizer = TweetVectorizer(x_train, self.counter.counts, processed=False)
    # asarray takes a pandas Series as well as a plain array or list, so
    # the labels don't need to have a ``.values`` attribute
    y = numpy.asarray(y_train).reshape((-1, 1))
    self.loss = self.gradient_descent(vectorizer.vectors, y)
    return self.loss

Predict

def predict(self, x: numpy.ndarray) -> numpy.ndarray:
    """Label tweets using the model's word counts and current weights

    Args:
     x: a list or array of tweets

    Returns:
     array of predicted labels for the tweets
    """
    classify = TweetSentiment(
        TweetVectorizer(x, self.counter.counts, processed=False),
        self.weights,
    )
    return classify()

Score

def score(self, x: numpy.ndarray, y: numpy.ndarray) -> float:
    """Fraction of the tweets whose predicted label matches the given label

    Args:
     x: array of tweets
     y: labels for the tweets

    Returns:
     mean accuracy
    """
    # predictions come back as a column so take the first row of the
    # transpose to line them up with the labels
    predicted = self.predict(x).T[0]
    matches = predicted == y
    return matches.sum()/len(x)

End

Testing it out.

# python
from argparse import Namespace
from pathlib import Path

import math
import os

# pypi
from dotenv import load_dotenv
from expects import (
    be_true,
    expect
)

import pandas

# this project
from neurotic.nlp.twitter.logistic_regression import LogisticRegression
# read the environment file so the data-set paths used below are defined
load_dotenv("posts/nlp/.env")

# the tweets and labels used to fit the model
train_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())

# the tweets and labels used to score the fitted model
test_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TEST_RAW"]).expanduser()
)


# gradient descent settings: eta is passed in as the learning rate
Settings = Namespace(
    eta = 1e-9,
    iterations = 1500
)
model = LogisticRegression(iterations=Settings.iterations,
                           learning_rate=Settings.eta)
model.fit(x_train=train_raw.tweet, y_train=train_raw.label)
# the training loss the fit should end with (presumably recorded from an
# earlier run -- confirm if the data or settings change)
expected = 0.22043072
expect(math.isclose(model.loss, expected, rel_tol=1e-7)).to(be_true)
# mean accuracy on the test tweets
accuracy = model.score(test_raw.tweet, test_raw.label)
print(f"Accuracy: {accuracy}")
Accuracy: 0.996