Twitter Word Frequencies

Beginning

In the previous post in this series (Twitter Preprocessing With NLTK) I made a pre-processor for tweets. Now I'm going to make a counter that counts how many times each token shows up in positive and negative tweets.

Setup

Imports

# from python
from argparse import Namespace
from functools import partial
from pathlib import Path

import os
import pickle

# from pypi
from bokeh.models import HoverTool
from dotenv import load_dotenv
from nltk.corpus import twitter_samples
from tabulate import tabulate

import holoviews
import hvplot.pandas
import nltk
import numpy
import pandas

# this project
from neurotic.nlp.twitter.processor import TwitterProcessor
from neurotic.nlp.twitter.counter import WordCounter

# some helper stuff
from graeae import EmbedHoloviews

The Data

First we'll load the training data that I set up while building the tweet pre-processor.

load_dotenv("posts/nlp/.env")
path = Path(os.environ["TWITTER_TRAINING_PROCESSED"]).expanduser()
assert path.is_file()
training = pandas.read_feather(path)
training_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())
print(training.head(1))
print(f"Rows: {len(training):,}")
                       tweet  label
0  [park, get, sunlight, :)]      1
Rows: 8,000

I also made an object to pass around to make sure I didn't switch the numeric positive and negative encodings, so let's load that.

path = Path(os.environ["TWITTER_SENTIMENT"]).expanduser()
with path.open("rb") as reader:
    Sentiment = pickle.load(reader)
print(Sentiment)
Namespace(decode={1: 'positive', 0: 'negative'}, encode={'positive': 1, 'negative': 0}, negative=0, positive=1)
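
For reference, the sentiment holder is just an argparse Namespace, so it was presumably built in the earlier post with something like this (a sketch reconstructed from the printed repr above):

# a sketch of how the Sentiment namespace could be built
# (the real one was created in the preprocessing post and pickled)
from argparse import Namespace

Sentiment = Namespace(
    positive=1,
    negative=0,
    encode={"positive": 1, "negative": 0},
    decode={1: "positive", 0: "negative"},
)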

Plotting and Printing

This is some preliminary setup of the plotter and table-printer so I don't have to keep typing the same things over and over.

SLUG = "01b-twitter-word-frequencies"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}")

path = Path(os.environ["TWITTER_PLOT"]).expanduser()
with path.open("rb") as reader:
    Plot = pickle.load(reader)
TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys", showindex=False)

Middle

Word Frequencies

We're going to build up a Counter of token frequencies. The keys will be (token, sentiment) tuples and the values will be the counts for the token-sentiment pairs.
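
To make the structure concrete, here's a minimal sketch (with made-up tokens) of what the finished counter should hold:

# a toy example of the target data structure
from collections import Counter

tweets = [["happi", ":)"], ["sad", ":("]]
labels = [1, 0]

counts = Counter()
for tweet, label in zip(tweets, labels):
    for token in tweet:
        counts[(token, label)] += 1

print(counts)
Counter({('happi', 1): 1, (':)', 1): 1, ('sad', 0): 1, (':(', 0): 1})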

Tests

These are the tests for the implementation that follows them.

  • The Tangles
    Feature: A Word Frequency Counter
    
    In order to get a sense of how the words correlate with sentiment
    I want to be able to count word-sentiment pairs.
    
    <<counter-feature>>
    
    <<call-feature>>
    
    # pypi
    from expects import (
        be,
        equal,
        expect
        )
    
    from pytest_bdd import (
        given,
        scenarios,
        then,
        when
    )
    
    # testing setup
    from fixtures import katamari
    
    # software under test
    from neurotic.nlp.twitter.counter import WordCounter
    from neurotic.nlp.twitter.processor import TwitterProcessor
    
    scenarios("twitter/word_frequencies.feature")
    
    <<test-creation>>
    
    
    <<test-call>>
    
  • Setup
    Scenario: The Word Counter is created
      Given a word counter class
      When the word counter is created
      Then it has the expected attributes
    
    # Scenario: The Word Counter is created
    
    
    @given("a word counter class")
    def setup_class(katamari):
        katamari.definition = WordCounter
        return
    
    
    @when("the word counter is created")
    def create_word_counter(katamari, mocker):
        katamari.tweets = mocker.Mock(list)
        katamari.labels = mocker.Mock(list)
        katamari.processor = mocker.Mock()
        katamari.counter = katamari.definition(tweets=katamari.tweets,
                                               labels=katamari.labels)
        katamari.counter._process = katamari.processor
        return
    
    
    @then("it has the expected attributes")
    def check_attributes(katamari):
        expect(katamari.counter.tweets).to(be(katamari.tweets))
        expect(katamari.counter.labels).to(be(katamari.labels))
        expect(katamari.counter.process).to(be(katamari.processor))
        return
    
  • The Call
    Scenario: The Word Frequency counter is called
      Given a word frequency counter
      When the counter is called
      Then the counts are the expected
    
    # Scenario: The Word Frequency counter is called
    
    
    @given("a word frequency counter")
    def setup_word_frequency_counter(katamari, mocker):
        katamari.tweets = ["a b aab a b c"]
        katamari.labels = [1] * len(katamari.tweets)
        katamari.counter = WordCounter(tweets=katamari.tweets,
                                       labels=katamari.labels)
    
        bad_sentiment = ["c aab aab"]
        katamari.tweets += bad_sentiment
        katamari.labels += [0]
        # since the tokenizer removes and changes words
        # I'm going to mock it out
        katamari.counter._process = mocker.MagicMock(TwitterProcessor)
        katamari.counter.process.side_effect = lambda x: x.split()
        katamari.expected = {("a", 1): 2, ("b", 1): 2, ("c", 1): 1, ("aab", 1):1,
                             ("c", 0): 1, ("aab", 0): 2}
        return
    
    
    @when("the counter is called")
    def call_counter(katamari):
        katamari.counts = katamari.counter.counts
        return
    
    
    @then("the counts are the expected")
    def check_counts(katamari):
        for key, value in katamari.counts.items():
            expect(katamari.expected[key]).to(equal(value))
        return
    

Implementation

This is going to be a counter class that pre-processes the tweets and then counts the frequency of word-sentiment pairs.

# A Word Counter

# from python
from collections import Counter
import typing

# from pypi
import attr

# this project
from .processor import TwitterProcessor

@attr.s(auto_attribs=True)
class WordCounter:
    """A word-sentiment counter

    Args:
     tweets: list of unprocessed tweets
     labels: list of 1's (positive) and 0's (negative) identifying the sentiment of each tweet
    """
    tweets: typing.List[str]
    labels: typing.List[int]
    _process: TwitterProcessor = None
    _processed: list = None
    _counts: Counter = None

    @property
    def process(self) -> TwitterProcessor:
        """A callable to process tweets to lists of words"""
        if self._process is None:
            self._process = TwitterProcessor()
        return self._process

    @property
    def processed(self) -> list:
        """The processed and tokenized tweets"""
        if self._processed is None:
            self._processed = [self.process(tweet) for tweet in self.tweets]
        return self._processed

    @property
    def counts(self) -> Counter:
        """Processes the tweets and labels

        Returns:
         counts of word-sentiment pairs
        """
        if self._counts is None:
            assert len(self.tweets) == len(self.labels), \
                f"Tweets: {len(self.tweets)}, Labels: {len(self.labels)}"
            self._counts = Counter()
            for tweet, label in zip(self.processed, self.labels):
                for word in tweet:
                    self._counts[(word, label)] += 1
        return self._counts

Counting

Now that we've implemented the counter we might as well get to counting. This is going to be kind of hacky, but I originally wasn't saving the processed data and so was expecting this thing to process it. Maybe I'll change it to look better later. But anyway, that's why I'm assigning the column to the ._processed attribute.

counter = WordCounter(tweets=training.tweet, labels=training.label)
counter._processed = training.tweet
counts = counter.counts
print(f"Total token-sentiment pairs: {len(counts):,}")
Total token-sentiment pairs: 11,476
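
For comparison, the non-hacky version would hand the counter the raw tweets and let it do the tokenizing itself. A sketch (assuming training_raw has the same tweet and label columns as training), at the cost of re-running the pre-processor over all 8,000 tweets:

# let the counter process the raw tweets itself (slower)
raw_counter = WordCounter(tweets=training_raw.tweet,
                          labels=training_raw.label)
raw_counts = raw_counter.counts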

What are the most common? To make the rest of the post easier I'm going to set up a pandas DataFrame.

tokens = []
top_counts = []
sentiments = []

for key, count in counts.most_common():
    token, sentiment = key
    tokens.append(token)
    sentiments.append(sentiment)
    top_counts.append(count)

top_counts = pandas.DataFrame.from_dict(dict(
    token=tokens,
    count=top_counts,
    sentiment=sentiments,
))

top_counts.loc[:, "sentiment"] = top_counts.sentiment.apply(lambda row: Sentiment.decode[row])
print(TABLE(top_counts.iloc[:20]))
| token  |   count | sentiment |
|--------+---------+-----------|
| :(     |    3705 | negative  |
| :)     |    2967 | positive  |
| :-)    |     547 | positive  |
| :D     |     537 | positive  |
| thank  |     516 | positive  |
| :-(    |     407 | negative  |
| follow |     349 | positive  |
| love   |     306 | positive  |
| i'm    |     282 | negative  |
| …      |     261 | negative  |
| miss   |     241 | negative  |
| …      |     228 | positive  |
| pleas  |     215 | negative  |
| follow |     211 | negative  |
| get    |     200 | negative  |
| want   |     197 | negative  |
| day    |     194 | positive  |
| u      |     193 | positive  |
| good   |     189 | positive  |
| like   |     189 | positive  |

It's interesting that the only tokens in the top 20 that are both positive and negative are ellipses and "follow", and that the four most common tokens are smileys. Given the nature of tweets, though, I guess the use of smileys (emoticons?) shouldn't be so surprising. I didn't notice this at first, but the most common token is a negative one.
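
As a quick check on that observation, you can pull out the tokens that appear under both sentiments within the top twenty (a sketch using the DataFrame built above):

# tokens that show up twice (once per sentiment) in the top twenty pairs
top_twenty = top_counts.iloc[:20]
print(TABLE(top_twenty[top_twenty.token.duplicated(keep=False)]))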

Plotting

The counts themselves are interesting, but it might be more informative to look at their distribution as well as whether some tokens are more positive or negative.

Positive Vs Negative

tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]

hover = HoverTool(tooltips=tooltips)

plot = top_counts.hvplot(kind="bar", x="sentiment", y="count",
                         hover_cols="all").opts(    
    width=Plot.width,
    height= Plot.height,
    title="Positive and Negative",
    fontscale=2,
    tools=[hover],
    color=Plot.tan,
    line_color="white",
)
embedded = Embed(plot=plot, file_name="positive_negative_distribution")
output = embedded()
print(output)

Figure Missing

So it looks like negative sentiment is more common among the tokens, even though the tweets themselves were evenly split between positive and negative, maybe because the negative tweets had a more diverse set of tokens.
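
One way to check that guess is to compare the number of distinct tokens and the total token count for each sentiment (a quick sketch):

# distinct tokens and total token occurrences per sentiment
print(TABLE(top_counts.groupby("sentiment").agg(
    unique_tokens=("token", "nunique"),
    total_count=("count", "sum")).reset_index()))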

Distribution

tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]

hover = HoverTool(tooltips=tooltips)

CUTOFF = 150

plot = top_counts[:CUTOFF].hvplot.bar(
    y="count", hover_cols=["token", "sentiment"],
    loglog=True).opts(
        tools=[hover],
        width=Plot.width,
        height=Plot.height,
        fontscale=2,
        color=Plot.tan,
        line_color=Plot.tan,
        xaxis=None,
        ylim=(0, None),
        title=f"Log-Log Count Distribution (top {CUTOFF})")
output = Embed(plot=plot, file_name="count_distribution")()
print(output)

Figure Missing

This shows how steep the drop is from the two most common tokens, which are then followed by a long tail. Without the logarithmic axes the drop is even more pronounced.

Positive Vs Negative by Tweet

top_counts.loc[:, "positive"] = top_counts.apply(
    lambda row: row["count"] if row.sentiment=="positive" else 0,
    axis="columns")

top_counts.loc[:, "negative"] = top_counts.apply(
    lambda row: row["count"] if row.sentiment=="negative" else 0,
    axis="columns"
)

tooltips = [
    ("Token", "@token"),
    ("Positive", "@positive"),
    ("Negative", "@negative"),
]

hover = HoverTool(tooltips=tooltips)

grouped = top_counts.groupby("token").agg({"positive": "sum", "negative": "sum"})
to_plot = grouped.reset_index()

# log plots can't have zero values
MIN = 1
for column in ("positive", "negative"):
    to_plot.loc[:, column] = to_plot[column] + 1

MAX = to_plot.negative.max() + 1
line = holoviews.Curve(([MIN, MAX], [MIN, MAX])).opts(color=Plot.red)
scatter = to_plot.hvplot.scatter(
    loglog=True,
    color=Plot.blue,
    x="positive", y="negative",
    hover_cols=["token"])
plot = (line * scatter ).opts(
        tools=[hover],
        width=Plot.width,
        height=Plot.height,
        xlabel="Positive",
        ylabel="Negative",
        fontscale=2,
        title="Log-Log Positive vs Negative")
output = Embed(plot=plot, file_name="scatter_plot")()
print(output)

Figure Missing

The tokens along or around the diagonal are about evenly positive and negative, so they probably aren't useful indicators of sentiment in and of themselves. Those furthest from the diagonal are the most biased to one side or the other, so we might expect them to be useful in guessing a tweet's sentiment.
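
One way to quantify the distance from the diagonal is a log-ratio of the smoothed counts; here's a sketch that ranks the most one-sided tokens:

# rank tokens by how far they sit from the diagonal
# (using the +1-smoothed counts built for the scatter plot)
to_plot["bias"] = numpy.log(to_plot.positive / to_plot.negative)
most_biased = to_plot.reindex(
    to_plot.bias.abs().sort_values(ascending=False).index)
print(TABLE(most_biased[["token", "positive", "negative", "bias"]].head()))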

There are some unexpectedly negative tokens like "love" (400 positive, 152 negative) and "thank" (620 positive, 107 negative), but at this point we haven't really started to look at the sentiment yet, so I'll leave further exploration for later.

End

Since the counter gets re-used, I'm going to pickle it for later.

with Path(os.environ["TWITTER_COUNTER"]).expanduser().open("wb") as writer:
    pickle.dump(counter, writer)

Next in this series: The Tweet Vectorizer