Twitter Word Frequencies

Cloistered Monkey

2020-07-07 18:19

Beginning

In the previous post in this series (Twitter Preprocessing With NLTK) I made a pre-processor for tweets, now I'm going to make a counter that counts how many times a certain token shows up in positive and negative tweets.

Setup

Imports

# from python
from argparse import Namespace
from functools import partial
from pathlib import Path

import os
import pickle

# from pypi
from bokeh.models import HoverTool
from dotenv import load_dotenv
from nltk.corpus import twitter_samples
from tabulate import tabulate

import holoviews
import hvplot.pandas
import nltk
import numpy
import pandas

# this project
from neurotic.nlp.twitter.processor import TwitterProcessor
from neurotic.nlp.twitter.counter import WordCounter

# some helper stuff
from graeae import EmbedHoloviews

The Data

First we'll load the training data that I set-up while building the tweet pre-processor.

load_dotenv("posts/nlp/.env")
path = Path(os.environ["TWITTER_TRAINING_PROCESSED"]).expanduser()
assert path.is_file()
training = pandas.read_feather(path)
training_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())
print(training.head(1))
print(f"Rows: {len(training):,}")

                       tweet  label
0  [park, get, sunlight, :)]      1
Rows: 8,000

I also made an object to pass around to make sure I didn't switch the numeric positive and negative encodings so let's load that.

path = Path(os.environ["TWITTER_SENTIMENT"]).expanduser()
with path.open("rb") as reader:
    Sentiment = pickle.load(reader)
print(Sentiment)

Namespace(decode={1: 'positive', 0: 'negative'}, encode={'positive': 1, 'negative': 0}, negative=0, positive=1)

Plotting and Printing

This is some preliminary setup of the plotter and table-printer so I don't have to keep typing the same things over and over.

SLUG = "01b-twitter-word-frequencies"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}")

path = Path(os.environ["TWITTER_PLOT"]).expanduser()
with path.open("rb") as reader:
    Plot = pickle.load(reader)

TABLE = partial(tabulate, tablefmt="orgtbl", headers="keys", showindex=False)

Middle

Word Frequencies

We're going to build up a Counter of token frequencies. The keys will be (token, sentiment) tuples and the values will be the counts for the token-sentiment pairs.

Tests

These are the tests for the implementation that follows them.

The Tangles

Feature: A Word Frequency Counter

In order to get a sense of how the words correlate with sentiment
I want to be able to count word-sentiment pairs.

<<counter-feature>>

<<call-feature>>

# pypi
from expects import (
    be,
    equal,
    expect
    )

from pytest_bdd import (
    given,
    scenarios,
    then,
    when
)

# testing setup
from fixtures import katamari

# software under test
from neurotic.nlp.twitter.counter import WordCounter
from neurotic.nlp.twitter.processor import TwitterProcessor

scenarios("twitter/word_frequencies.feature")

<<test-creation>>


<<test-call>>

Setup

Scenario: The Word Counter is created
  Given a word counter class
  When the word counter is created
  Then it has the expected attributes

# Scenario: The Word Counter is created


@given("a word counter class")
def setup_class(katamari):
    katamari.definition = WordCounter
    return


@when("the word counter is created")
def create_word_counter(katamari, faker, mocker):
    katamari.tweets = mocker.Mock(list)
    katamari.labels = mocker.Mock(list)
    katamari.processor = mocker.Mock()
    katamari.counter = katamari.definition(tweets=katamari.tweets,
                                           labels=katamari.labels)
    katamari.counter._process = katamari.processor
    return


@then("it has the expected attributes")
def check_attributes(katamari):
    expect(katamari.counter.tweets).to(be(katamari.tweets))
    expect(katamari.counter.labels).to(be(katamari.labels))
    expect(katamari.counter.process).to(be(katamari.processor))
    return

The Call

Scenario: The Word Frequency counter is called
  Given a word frequency counter
  When the counter is called
  Then the counts are the expected

# Scenario: The Word Frequency counter is called


@given("a word frequency counter")
def setup_word_frequency_counter(katamari, mocker):
    processor = TwitterProcessor()
    katamari.tweets = ["a b aab a b c"]
    katamari.labels = [1] * len(katamari.tweets)
    katamari.counter = WordCounter(tweets=katamari.tweets,
                                   labels=katamari.labels)

    bad_sentiment = ["c aab aab"]
    katamari.tweets += bad_sentiment
    katamari.labels += [0]
    # since the tokenizer removes and changes words
    # I'm going to mock it out
    katamari.counter._process = mocker.MagicMock(TwitterProcessor)
    katamari.counter.process.side_effect = lambda x: x.split()
    katamari.expected = {("a", 1): 2, ("b", 1): 2, ("c", 1): 1, ("aab", 1):1,
                         ("c", 0): 1, ("aab", 0): 2}
    return


@when("the counter is called")
def call_counter(katamari):
    katamari.counts = katamari.counter.counts
    return


@then("the counts are the expected")
def check_counts(katamari):
    for key, value in katamari.counts.items():
        expect(katamari.expected[key]).to(equal(value))
    return

Implementation

This is going to be a counter class that pre-processes the tweets and then counts the frequency of word-sentiment pairs.

# A Word Counter

# from python
from collections import Counter
import typing

# from pypi
import attr

# this project
from .processor import TwitterProcessor

@attr.s(auto_attribs=True)
class WordCounter:
    """A word-sentiment counter

    Args:
     tweets: list of unprocessed tweets
     labels: list of 1's (positive) and 0's that identifies sentiment for each tweet
    """
    tweets: typing.List[str]
    labels: typing.List[int]
    _process: TwitterProcessor = None
    _processed: list = None
    _counts: Counter = None

    @property
    def process(self) -> TwitterProcessor:
        """A callable to process tweets to lists of words"""
        if self._process is None:
            self._process = TwitterProcessor()
        return self._process

    @property
    def processed(self) -> list:
        """The processed and tokenized tweets"""
        if self._processed is None:
            self._processed = [self.process(tweet) for tweet in self.tweets]
        return self._processed

    @property
    def counts(self) -> Counter:
        """Processes the tweets and labels

       Returns:
        counts of word-sentiment pairs
       """
        if self._counts is None:
            assert len(self.tweets) == len(self.labels), \
                f"Tweets: {len(self.tweets)}, Labels: {len(self.labels)}"
            self._counts = Counter()
            for tweet, label in zip(self.processed, self.labels):
                for word in tweet:
                    self._counts[(word, label)] += 1
        return self._counts

Counting

Now that we've implemented the counter we might as well get to counting. This is going to be kind of hacky, but I originally wasn't saving the processed data and so was expecting this thing to process it. Maybe I'll change it to look better late. But, anyway that's why I'm assigning the column to the ._processed attribute.

counter = WordCounter(tweets=training.tweet, labels=training.label)
counter._processed = training.tweet
counts = counter.counts
print(f"Total token-sentiment pairs: {len(counts):,}")

Total token-sentiment pairs: 11,476

What are the most common? To make the rest of the post easier I'm going to set up a pandas DataFrame.

tokens = []
top_counts = []
sentiments = []

for key, count in counts.most_common():
    token, sentiment = key
    tokens.append(token)
    sentiments.append(sentiment)
    top_counts.append(count)

top_counts = pandas.DataFrame.from_dict(dict(
    token=tokens,
    count=top_counts,
    sentiment=sentiments,
))

top_counts.loc[:, "sentiment"] = top_counts.sentiment.apply(lambda row: Sentiment.decode[row])

print(TABLE(top_counts.iloc[:20]))

token	count	sentiment
:(	3705	negative
:)	2967	positive
:-)	547	positive
:D	537	positive
thank	516	positive
:-(	407	negative
follow	349	positive
love	306	positive
i'm	282	negative
…	261	negative
miss	241	negative
…	228	positive
pleas	215	negative
follow	211	negative
get	200	negative
want	197	negative
day	194	positive
u	193	positive
good	189	positive
like	189	positive

It's interesting that the only tokens in the top 20 that are both positive and negative are ellipses and "follow" and that the four most common tokens were smileys, although given the nature of tweets I guess the use of smileys (emoticons?) shouldn't be so surprising. I didn't notice this at first, but the most common token is a negative one.

Plotting

The counts themselves are interesting, but it might be more informative to look at their distribution as well as whether some tokens are more positive or negative.

Positive Vs Negative

tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]

hover = HoverTool(tooltips=tooltips)

plot = top_counts.hvplot(kind="bar", x="sentiment", y="count",
                         hover_cols="all").opts(    
    width=Plot.width,
    height= Plot.height,
    title="Positive and Negative",
    fontscale=2,
    tools=[hover],
    color=Plot.tan,
    line_color="white",
)
embedded = Embed(plot=plot, file_name="positive_negative_distribution")
output = embedded()

print(output)

So it looks like negative sentiment is more common for the tokens, even though the tweets themselves were evenly split, maybe because the negative tweets had more diverse tokens.

Distribution

tooltips = [
    ("Token", "@token"),
    ("Sentiment", "@sentiment"),
    ("Count", "@count"),
]

hover = HoverTool(tooltips=tooltips)

CUTOFF = 150

plot = top_counts[:CUTOFF].hvplot.bar(
    y="count", hover_cols=["token", "sentiment"],
    loglog=True).opts(
        tools=[hover],
        width=Plot.width,
        height=Plot.height,
        fontscale=2,
        color=Plot.tan,
        line_color=Plot.tan,
        xaxis=None,
        ylim=(0, None),
        title=f"Log-Log Count Distribution (top {CUTOFF})")
output = Embed(plot=plot, file_name="count_distribution")()

print(output)

This shows how steep the drop is from the two most common tokens which are then followed by a long tail. Without the logarithmic axes the drop is even more pronounced.

Positive Vs Negative by Tweet

CUTOFF = 250

top_counts.loc[:, "positive"] = top_counts.apply(
    lambda row: row["count"] if row.sentiment=="positive" else 0,
    axis="columns")

top_counts.loc[:, "negative"] = top_counts.apply(
    lambda row: row["count"] if row.sentiment=="negative" else 0,
    axis="columns"
)

tooltips = [
    ("Token", "@token"),
    ("Positive", "@positive"),
    ("Negative", "@negative"),
]

hover = HoverTool(tooltips=tooltips)

grouped = top_counts.groupby("token").agg({"positive": "sum", "negative": "sum"})
to_plot = grouped.reset_index()

# log plots can't have zero values
MIN = 1
for column in ("positive", "negative"):
    to_plot.loc[:, column] = to_plot[column] + 1

MAX = to_plot.negative.max() + 1
line = holoviews.Curve(([MIN, MAX], [MIN, MAX])).opts(color=Plot.red)
scatter = to_plot.hvplot.scatter(
    loglog=True,
    color=Plot.blue,
    x="positive", y="negative",
    hover_cols=["token"])
plot = (line * scatter ).opts(
        tools=[hover],
        width=Plot.width,
        height=Plot.height,
        xlabel="Positive",
        ylabel="Negative",
        fontscale=2,
        title="Log-Log Positive vs Negative")
output = Embed(plot=plot, file_name="scatter_plot")()

print(output)

The tokens along or around the diagonal are evenly positive and negative so they probably aren't useful indicators of sentiment in and of themselves, while those furthest from the diagonal are the most biased to one side or the other so we might expect them to be useful in guessing a tweet's sentiment.

There are some unexpectedly negative tokens like "love" (400, 152) and "thank" (620, 107), but at this point we haven't really started to look at the sentiment yet so I'll leave further exploration for later.

End

Since the counter gets re-used I'm going to pickle it for later.

with Path(os.environ["TWITTER_COUNTER"]).expanduser().open("wb") as writer:
    pickle.dump(counter, writer)

Next in this series: The Tweet Vectorizer

Table of Contents

Beginning

Setup

Imports

The Data

Plotting and Printing

Middle

Word Frequencies

Tests

Implementation

Counting

Plotting

Positive Vs Negative

Distribution

Positive Vs Negative by Tweet

End