Twitter Word Frequencies
Beginning
In the previous post in this series (Twitter Preprocessing With NLTK) I made a pre-processor for tweets; now I'm going to make a counter that tallies how many times each token shows up in positive and negative tweets.
Setup
Imports
# from python
from argparse import Namespace
from functools import partial
from pathlib import Path
import os
import pickle
# from pypi
from bokeh.models import HoverTool
from dotenv import load_dotenv
from nltk.corpus import twitter_samples
from tabulate import tabulate
import holoviews
import hvplot.pandas
import nltk
import numpy
import pandas
# this project
from neurotic.nlp.twitter.processor import TwitterProcessor
from neurotic.nlp.twitter.counter import WordCounter
# some helper stuff
from graeae import EmbedHoloviews
The Data
First we'll load the training data that I set up while building the tweet pre-processor.
load_dotenv("posts/nlp/.env")
path = Path(os.environ["TWITTER_TRAINING_PROCESSED"]).expanduser()
assert path.is_file()
training = pandas.read_feather(path)
training_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())
print(training.head(1))
print(f"Rows: {len(training):,}")
                        tweet  label
0   [park, get, sunlight, :)]      1
Rows: 8,000
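The raw tweets get loaded alongside the processed ones; they aren't used again in this post, but a quick peek (a sketch, assuming the raw frame shares the tweet column) shows what the pre-processor did to the first row.
# compare a raw tweet to its processed form
print(training_raw.tweet.iloc[0])
print(training.tweet.iloc[0])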
I also made an object to pass around so that I wouldn't accidentally switch the numeric encodings for positive and negative, so let's load that too.
path = Path(os.environ["TWITTER_SENTIMENT"]).expanduser()
with path.open("rb") as reader:
    Sentiment = pickle.load(reader)
print(Sentiment)
Namespace(decode={1: 'positive', 0: 'negative'}, encode={'positive': 1, 'negative': 0}, negative=0, positive=1)
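As a quick sanity check, the namespace translates in both directions (based on the printed output above).
# round-trip the encoding
assert Sentiment.decode[Sentiment.positive] == "positive"
assert Sentiment.encode["negative"] == Sentiment.negative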
Plotting and Printing
This is some preliminary setup of the plotter and table-printer so I don't have to keep typing the same things over and over.
SLUG = "01b-twitter-word-frequencies"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}")
path = Path(os.environ["TWITTER_PLOT"]).expanduser()
with path.open("rb") as reader:
    Plot = pickle.load(reader)
TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys", showindex=False)
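Just to show what the TABLE partial produces, here's a minimal call with made-up data.
# a tiny demonstration of the TABLE helper
example = pandas.DataFrame(dict(token=["example"], count=[1]))
print(TABLE(example))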
Middle
Word Frequencies
We're going to build up a Counter of token frequencies. The keys will be (token, sentiment)
tuples and the values will be the counts for the token-sentiment pairs.
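To make that structure concrete, here's a minimal sketch (with made-up tokens) of what the counter will hold.
from collections import Counter

sketch = Counter()
sketch[("happi", 1)] += 1  # "happi" seen once in a positive tweet
sketch[("sad", 0)] += 2    # "sad" seen twice in negative tweets
print(sketch[("happi", 1)])  # 1
print(sketch[("happi", 0)])  # 0: Counters default missing keys to zero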
Tests
These are the tests for the implementation that follows them.
- The Tangles
Feature: A Word Frequency Counter

In order to get a sense of how the words correlate with sentiment
I want to be able to count word-sentiment pairs.

<<counter-feature>>

<<call-feature>>
# pypi
from expects import (
    be,
    equal,
    expect
)
from pytest_bdd import (
    given,
    scenarios,
    then,
    when
)

# testing setup
from fixtures import katamari

# software under test
from neurotic.nlp.twitter.counter import WordCounter
from neurotic.nlp.twitter.processor import TwitterProcessor

scenarios("twitter/word_frequencies.feature")

<<test-creation>>

<<test-call>>
- Setup
Scenario: The Word Counter is created
  Given a word counter class
  When the word counter is created
  Then it has the expected attributes
# Scenario: The Word Counter is created


@given("a word counter class")
def setup_class(katamari):
    katamari.definition = WordCounter
    return


@when("the word counter is created")
def create_word_counter(katamari, faker, mocker):
    katamari.tweets = mocker.Mock(list)
    katamari.labels = mocker.Mock(list)
    katamari.processor = mocker.Mock()
    katamari.counter = katamari.definition(tweets=katamari.tweets,
                                           labels=katamari.labels)
    katamari.counter._process = katamari.processor
    return


@then("it has the expected attributes")
def check_attributes(katamari):
    expect(katamari.counter.tweets).to(be(katamari.tweets))
    expect(katamari.counter.labels).to(be(katamari.labels))
    expect(katamari.counter.process).to(be(katamari.processor))
    return
- The Call
Scenario: The Word Frequency counter is called
  Given a word frequency counter
  When the counter is called
  Then the counts are the expected
# Scenario: The Word Frequency counter is called


@given("a word frequency counter")
def setup_word_frequency_counter(katamari, mocker):
    processor = TwitterProcessor()
    katamari.tweets = ["a b aab a b c"]
    katamari.labels = [1] * len(katamari.tweets)

    katamari.counter = WordCounter(tweets=katamari.tweets,
                                   labels=katamari.labels)
    bad_sentiment = ["c aab aab"]
    katamari.tweets += bad_sentiment
    katamari.labels += [0]

    # since the tokenizer removes and changes words
    # I'm going to mock it out
    katamari.counter._process = mocker.MagicMock(TwitterProcessor)
    katamari.counter.process.side_effect = lambda x: x.split()
    katamari.expected = {("a", 1): 2,
                         ("b", 1): 2,
                         ("c", 1): 1,
                         ("aab", 1): 1,
                         ("c", 0): 1,
                         ("aab", 0): 2}
    return


@when("the counter is called")
def call_counter(katamari):
    katamari.counts = katamari.counter.counts
    return


@then("the counts are the expected")
def check_counts(katamari):
    for key, value in katamari.counts.items():
        expect(katamari.expected[key]).to(equal(value))
    return
Implementation
This is going to be a counter class that pre-processes the tweets and then counts the frequency of word-sentiment pairs.
# A Word Counter
# from python
from collections import Counter
import typing
# from pypi
import attr
# this project
from .processor import TwitterProcessor
@attr.s(auto_attribs=True)
class WordCounter:
"""A word-sentiment counter
Args:
tweets: list of unprocessed tweets
labels: list of 1's (positive) and 0's that identifies sentiment for each tweet
"""
tweets: typing.List[str]
labels: typing.List[int]
_process: TwitterProcessor = None
_processed: list = None
_counts: Counter = None
@property
def process(self) -> TwitterProcessor:
"""A callable to process tweets to lists of words"""
if self._process is None:
self._process = TwitterProcessor()
return self._process
@property
def processed(self) -> list:
"""The processed and tokenized tweets"""
if self._processed is None:
self._processed = [self.process(tweet) for tweet in self.tweets]
return self._processed
@property
def counts(self) -> Counter:
"""Processes the tweets and labels
Returns:
counts of word-sentiment pairs
"""
if self._counts is None:
assert len(self.tweets) == len(self.labels), \
f"Tweets: {len(self.tweets)}, Labels: {len(self.labels)}"
self._counts = Counter()
for tweet, label in zip(self.processed, self.labels):
for word in tweet:
self._counts[(word, label)] += 1
return self._counts
Counting
Now that we've implemented the counter we might as well get to counting. This is a little hacky: I originally wasn't saving the processed data, so the counter was written to do the processing itself. Maybe I'll clean it up later, but for now that's why I'm assigning the already-processed column to the ._processed attribute.
counter = WordCounter(tweets=training.tweet, labels=training.label)
counter._processed = training.tweet
counts = counter.counts
print(f"Total token-sentiment pairs: {len(counts):,}")
Total token-sentiment pairs: 11,476
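Since counts is a collections.Counter, looking up any (token, sentiment) pair is safe even when the pair never occurred; "happi" here is just a hypothetical token.
# missing pairs return 0 instead of raising a KeyError
print(counts[("happi", Sentiment.positive)])
print(counts[("happi", Sentiment.negative)])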
What are the most common? To make the rest of the post easier I'm going to set up a pandas DataFrame.
tokens = []
top_counts = []
sentiments = []
for key, count in counts.most_common():
    token, sentiment = key
    tokens.append(token)
    sentiments.append(sentiment)
    top_counts.append(count)
top_counts = pandas.DataFrame.from_dict(dict(
    token=tokens,
    count=top_counts,
    sentiment=sentiments,
))
top_counts.loc[:, "sentiment"] = top_counts.sentiment.apply(lambda row: Sentiment.decode[row])
print(TABLE(top_counts.iloc[:20]))
| token  | count | sentiment |
|--------+-------+-----------|
| :(     |  3705 | negative  |
| :)     |  2967 | positive  |
| :-)    |   547 | positive  |
| :D     |   537 | positive  |
| thank  |   516 | positive  |
| :-(    |   407 | negative  |
| follow |   349 | positive  |
| love   |   306 | positive  |
| i'm    |   282 | negative  |
| …      |   261 | negative  |
| miss   |   241 | negative  |
| …      |   228 | positive  |
| pleas  |   215 | negative  |
| follow |   211 | negative  |
| get    |   200 | negative  |
| want   |   197 | negative  |
| day    |   194 | positive  |
| u      |   193 | positive  |
| good   |   189 | positive  |
| like   |   189 | positive  |
It's interesting that the only tokens in the top 20 that show up as both positive and negative are the ellipsis ("…") and "follow", and that the four most common tokens are all smileys, although given the nature of tweets the prevalence of smileys (emoticons) probably shouldn't be surprising. I didn't notice it at first, but the most common token of all is a negative one.
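As a quick check on that observation (an addition, not in the original), the tokens that appear with both sentiments in the top twenty can be pulled straight from the DataFrame.
# top-20 tokens that show up under both sentiments
top_20 = top_counts.iloc[:20]
print(top_20[top_20.token.duplicated(keep=False)].token.unique())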
Plotting
The counts themselves are interesting, but it might be more informative to look at their distribution as well as whether some tokens are more positive or negative.
Positive Vs Negative
tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]
hover = HoverTool(tooltips=tooltips)
plot = top_counts.hvplot(kind="bar", x="sentiment", y="count",
                         hover_cols="all").opts(
                             width=Plot.width,
                             height=Plot.height,
                             title="Positive and Negative",
                             fontscale=2,
                             tools=[hover],
                             color=Plot.tan,
                             line_color="white",
                         )
embedded = Embed(plot=plot, file_name="positive_negative_distribution")
output = embedded()
print(output)
So it looks like the negative tokens outnumber the positive ones, even though the tweets themselves were evenly split between the sentiments, perhaps because the negative tweets used a more diverse vocabulary.
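Since top_counts has one row per (token, sentiment) pair, that vocabulary-diversity guess can be checked directly (a quick addition, not in the original post).
# distinct tokens per sentiment: each row is a unique (token, sentiment) pair
print(top_counts.sentiment.value_counts())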
Distribution
tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]
hover = HoverTool(tooltips=tooltips)
CUTOFF = 150
plot = top_counts[:CUTOFF].hvplot.bar(
    y="count", hover_cols=["token", "sentiment"],
    loglog=True).opts(
        tools=[hover],
        width=Plot.width,
        height=Plot.height,
        fontscale=2,
        color=Plot.tan,
        line_color=Plot.tan,
        xaxis=None,
        # a log axis can't start at zero
        ylim=(1, None),
        title=f"Log-Log Count Distribution (top {CUTOFF})")
output = Embed(plot=plot, file_name="count_distribution")()
print(output)
This shows how steep the drop is from the two most common tokens, which are then followed by a long tail. Without the logarithmic axes the drop would look even more pronounced.
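To put a rough number on that steepness, here's a quick check (not in the original post) of how much of the total the two most common tokens account for.
# fraction of all token occurrences taken by the top two tokens
top_two = top_counts["count"].iloc[:2].sum()
print(f"Top two tokens: {top_two / top_counts['count'].sum():.0%} of all occurrences")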
Positive Vs Negative by Tweet
CUTOFF = 250
top_counts.loc[:, "positive"] = top_counts.apply(
lambda row: row["count"] if row.sentiment=="positive" else 0,
axis="columns")
top_counts.loc[:, "negative"] = top_counts.apply(
lambda row: row["count"] if row.sentiment=="negative" else 0,
axis="columns"
)
tooltips = [
    ("Token", "@token"),
    ("Positive", "@positive"),
    ("Negative", "@negative"),
]
hover = HoverTool(tooltips=tooltips)
grouped = top_counts.groupby("token").agg({"positive": "sum", "negative": "sum"})
to_plot = grouped.reset_index()
# log plots can't have zero values
MIN = 1
for column in ("positive", "negative"):
    to_plot.loc[:, column] = to_plot[column] + 1
MAX = to_plot.negative.max() + 1
line = holoviews.Curve(([MIN, MAX], [MIN, MAX])).opts(color=Plot.red)
scatter = to_plot.hvplot.scatter(
    loglog=True,
    color=Plot.blue,
    x="positive", y="negative",
    hover_cols=["token"])
plot = (line * scatter).opts(
    tools=[hover],
    width=Plot.width,
    height=Plot.height,
    xlabel="Positive",
    ylabel="Negative",
    fontscale=2,
    title="Log-Log Positive vs Negative")
output = Embed(plot=plot, file_name="scatter_plot")()
print(output)
The tokens along or around the diagonal are roughly evenly positive and negative, so they probably aren't useful indicators of sentiment by themselves. Those furthest from the diagonal are the most biased toward one sentiment or the other, so we might expect them to be useful in guessing a tweet's sentiment.
There are some tokens with unexpectedly high negative counts, like "love" (400 positive, 152 negative) and "thank" (620 positive, 107 negative), but at this point we haven't really dug into the sentiment yet, so I'll leave further exploration for later.
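One possible follow-up (my own sketch, not part of the original analysis) is to rank tokens by the log-ratio of their positive and negative counts, using the to_plot frame since its +1 smoothing avoids log(0).
# a sentiment-skew score: positive values lean positive, negative lean negative
to_plot["bias"] = numpy.log(to_plot.positive / to_plot.negative)
print(to_plot.sort_values("bias").head())  # most negative-leaning tokens
print(to_plot.sort_values("bias").tail())  # most positive-leaning tokens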
End
Since the counter gets re-used I'm going to pickle it for later.
with Path(os.environ["TWITTER_COUNTER"]).expanduser().open("wb") as writer:
    pickle.dump(counter, writer)
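A quick round-trip check (an addition, not in the original) confirms that the pickle can be loaded back.
# reload the pickled counter and make sure the pair counts survived
with Path(os.environ["TWITTER_COUNTER"]).expanduser().open("rb") as reader:
    restored = pickle.load(reader)
assert len(restored.counts) == len(counts)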
Next in this series: The Tweet Vectorizer