Tweet Classifier Class
Table of Contents
Beginning
I implemented the Logistic Regression Tweet Sentiment Analysis classifier in this post, but I'm going to re-use it later, so this just gathers everything together.
There's already a class called TweetSentiment, but I'm going to add the training to this one as well as the tweet pre-processing and vectorization.
Middle
We'll start with the imports.
# from pypi
import attr
import numpy
# this project
from .counter import WordCounter
from .sentiment import TweetSentiment
from .vectorizer import TweetVectorizer
The Logistic Regression Class
@attr.s(auto_attribs=True)
class LogisticRegression:
    """train and predict tweet sentiment

    Args:
        iterations: number of times to run gradient descent
        learning_rate: how fast to change the weights during training
    """
    iterations: int
    learning_rate: float
    # starts unset; the ``weights`` property fills in zeros on first access
    _weights: numpy.ndarray = None
    # set by ``fit`` to the loss returned from gradient descent
    loss: float = None
Weights
These are the weights for the regression function (\(\theta\)).
@property
def weights(self) -> numpy.ndarray:
    """The weights (theta) for the regression

    The array is created lazily: if nothing has been stored in ``_weights``
    yet, a 3 x 1 array of zeros is built and kept, so repeated access hands
    back the same array object.

    Returns:
        the current weights array
    """
    if self._weights is None:
        self._weights = numpy.zeros((3, 1))
    return self._weights
The Weights Setter
@weights.setter
def weights(self, new_weights: numpy.ndarray) -> None:
    """Replace the regression weights

    Args:
        new_weights: the array to store as the weights
    """
    self._weights = new_weights
Sigmoid
def sigmoid(self, vectors: numpy.ndarray) -> numpy.ndarray:
    """Calculates the logistic function value element-wise

    Args:
        vectors: the values to squash; during training this is the product
            of the tweet vectors and the weights

    Returns:
        array with the same shape as the input holding values between
        0 and 1 (the probabilities that the tweets are positive)
    """
    return 1 / (1 + numpy.exp(-vectors))
This is the training function
def gradient_descent(self, x: numpy.ndarray, y: numpy.ndarray):
    """Finds the weights for the model using batch gradient descent

    The ``weights`` attribute is updated in place on every iteration. The
    ``learning_rate`` attribute is left untouched so that calling this
    more than once always trains with the same step size.

    Args:
        x: the tweet vectors (one row per tweet)
        y: the positive/negative labels as a column (one row per tweet)

    Returns:
        the mean loss measured in the last iteration (computed before that
        iteration's weight update), or None if ``iterations`` is zero
    """
    assert len(x) == len(y), "x and y need the same number of rows"
    rows = len(x)
    # average the gradient over the rows using a local step size instead of
    # overwriting self.learning_rate (which would shrink it on every call)
    step = self.learning_rate / rows
    # stays None when there are no iterations to run
    loss = None
    for _ in range(self.iterations):
        y_hat = self.sigmoid(x.dot(self.weights))
        # average loss (binary cross-entropy)
        positive_term = y.T.dot(numpy.log(y_hat))
        negative_term = (1 - y.T).dot(numpy.log(1 - y_hat))
        loss = numpy.squeeze(-(positive_term + negative_term)) / rows
        # the sum collapses the result to a single row (a no-op when the
        # labels are a single column)
        gradient = ((y_hat - y).T.dot(x)).sum(axis=0, keepdims=True)
        self.weights -= step * gradient.T
    return loss
Fit
This is mostly an alias to make it match (somewhat) sklearn's methods.
def fit(self, x_train: numpy.ndarray, y_train: numpy.ndarray) -> float:
    """fits the weights for the logistic regression

    Note:
        as a side effect this also sets the =counter= and =loss= attributes

    Args:
        x_train: the training tweets
        y_train: the training labels (anything numpy can turn into an
            array, e.g. a pandas Series, a numpy array, or a list)

    Returns:
        The final mean loss (which is also saved as the =.loss= attribute)
    """
    self.counter = WordCounter(x_train, y_train)
    vectorizer = TweetVectorizer(x_train, self.counter.counts, processed=False)
    # asarray handles both pandas Series and plain arrays (which have no
    # ``.values``); gradient descent needs the labels as a single column
    y = numpy.asarray(y_train).reshape((-1, 1))
    self.loss = self.gradient_descent(vectorizer.vectors, y)
    return self.loss
Predict
def predict(self, x: numpy.ndarray) -> numpy.ndarray:
    """Label tweets using the current weights

    Args:
        x: a list or array of tweets

    Returns:
        array of predicted labels for the tweets
    """
    # vectorize with the word counts held by the counter that fit() stores
    # NOTE(review): processed=False presumably asks the vectorizer to
    # pre-process the tweets itself - confirm against TweetVectorizer
    tweet_vectors = TweetVectorizer(x, self.counter.counts, processed=False)
    classify = TweetSentiment(tweet_vectors, self.weights)
    return classify()
Score
def score(self, x: numpy.ndarray, y: numpy.ndarray) -> float:
    """Get the mean accuracy

    Args:
        x: array of tweets
        y: labels for the tweets (flat, or a single column)

    Returns:
        fraction of the tweets whose predicted label matches the given label
    """
    predictions = self.predict(x)
    # flatten the labels so a column of labels is compared one-to-one with
    # the predictions instead of being broadcast into a square matrix
    labels = numpy.asarray(y).ravel()
    # count the matches in numpy rather than looping with the builtin sum
    correct = numpy.count_nonzero(predictions.T[0] == labels)
    return correct / len(x)
End
Testing it out.
# python
from argparse import Namespace
from pathlib import Path
import math
import os
# pypi
from dotenv import load_dotenv
from expects import (
be_true,
expect
)
import pandas
# this project
from neurotic.nlp.twitter.logistic_regression import LogisticRegression
# pull the dataset locations into the environment
load_dotenv("posts/nlp/.env")

# load the training and test tweets from the feather files named there
training_path = Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser()
train_raw = pandas.read_feather(training_path)
test_path = Path(os.environ["TWITTER_TEST_RAW"]).expanduser()
test_raw = pandas.read_feather(test_path)

# hyperparameters for the training run
Settings = Namespace(eta=1e-9, iterations=1500)

model = LogisticRegression(
    learning_rate=Settings.eta,
    iterations=Settings.iterations,
)
model.fit(x_train=train_raw.tweet, y_train=train_raw.label)

# the loss should match the previously recorded value
expected = 0.22043072
loss_matches = math.isclose(model.loss, expected, rel_tol=1e-7)
expect(loss_matches).to(be_true)

accuracy = model.score(test_raw.tweet, test_raw.label)
print(f"Accuracy: {accuracy}")
Accuracy: 0.996