Sentiment Classification Lectures

Sentiment Classification & How To "Frame Problems" for a Neural Network

by Andrew Trask

What You Should Already Know

  • neural networks, forward and back-propagation
  • stochastic gradient descent
  • mean squared error
  • train/test splits

Where to Get Help if You Need it

  • Re-watch previous Udacity Lectures
  • Leverage the recommended Course Reading Material - Grokking Deep Learning
  • Shoot Andrew a tweet @iamtrask

Set Up

Debug

%load_ext autoreload
%autoreload 2

Imports

Python Standard Library

from collections import Counter
from datetime import datetime
from functools import partial

From PyPI

from graphviz import Graph
from tabulate import tabulate
import matplotlib.pyplot as pyplot
import numpy
import seaborn

This Project

from neurotic.tangles.data_paths import DataPath

Tables

table = partial(tabulate, tablefmt="orgtbl", headers="keys")
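
As a quick illustration (the data here is made up, not from the reviews), the table partial renders any dict-of-columns as an org-table:

example = {"word": ["excellent", "terrible"], "count": [1, 2]}
print(table(example))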

Printing

%matplotlib inline
seaborn.set_style("whitegrid")
FIGURE_SIZE = (12, 10)

Analysis: What's Going on in the Weights?

Let's start with a model that doesn't do any noise cancellation - passing 0 for both lower_bound and polarity_cutoff means no words get filtered out of the vocabulary.

mlp_full = SentimentNoiseReduction(reviews=x_train, labels=y_train,
                                   lower_bound=0,
                                   polarity_cutoff=0,
                                   learning_rate=0.01)
mlp_full.train()
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 100.00 Error: [-0.38320156] #Correct: 740 #Trained: 1001 Training Accuracy: 73.93 %
Progress: 8.33 % Speed(reviews/sec): 181.82 Error: [-0.26004622] #Correct: 1529 #Trained: 2001 Training Accuracy: 76.41 %
Progress: 12.50 % Speed(reviews/sec): 250.00 Error: [-0.40350302] #Correct: 2376 #Trained: 3001 Training Accuracy: 79.17 %
Progress: 16.67 % Speed(reviews/sec): 285.71 Error: [-0.23990249] #Correct: 3187 #Trained: 4001 Training Accuracy: 79.66 %
Progress: 20.83 % Speed(reviews/sec): 333.33 Error: [-0.14119144] #Correct: 4002 #Trained: 5001 Training Accuracy: 80.02 %
Progress: 25.00 % Speed(reviews/sec): 375.00 Error: [-0.06442389] #Correct: 4829 #Trained: 6001 Training Accuracy: 80.47 %
Progress: 29.17 % Speed(reviews/sec): 411.76 Error: [-0.03508728] #Correct: 5690 #Trained: 7001 Training Accuracy: 81.27 %
Progress: 33.33 % Speed(reviews/sec): 444.44 Error: [-0.05110633] #Correct: 6548 #Trained: 8001 Training Accuracy: 81.84 %
Progress: 37.50 % Speed(reviews/sec): 450.00 Error: [-0.07432703] #Correct: 7404 #Trained: 9001 Training Accuracy: 82.26 %
Progress: 41.67 % Speed(reviews/sec): 476.19 Error: [-0.26512013] #Correct: 8272 #Trained: 10001 Training Accuracy: 82.71 %
Progress: 45.83 % Speed(reviews/sec): 500.00 Error: [-0.14067275] #Correct: 9129 #Trained: 11001 Training Accuracy: 82.98 %
Progress: 50.00 % Speed(reviews/sec): 521.74 Error: [-0.01215903] #Correct: 9994 #Trained: 12001 Training Accuracy: 83.28 %
Progress: 54.17 % Speed(reviews/sec): 541.67 Error: [-0.33825111] #Correct: 10864 #Trained: 13001 Training Accuracy: 83.56 %
Progress: 58.33 % Speed(reviews/sec): 560.00 Error: [-0.00522004] #Correct: 11721 #Trained: 14001 Training Accuracy: 83.72 %
Progress: 62.50 % Speed(reviews/sec): 555.56 Error: [-0.49523538] #Correct: 12553 #Trained: 15001 Training Accuracy: 83.68 %
Progress: 66.67 % Speed(reviews/sec): 571.43 Error: [-0.20026672] #Correct: 13390 #Trained: 16001 Training Accuracy: 83.68 %
Progress: 70.83 % Speed(reviews/sec): 586.21 Error: [-0.20786817] #Correct: 14243 #Trained: 17001 Training Accuracy: 83.78 %
Progress: 75.00 % Speed(reviews/sec): 580.65 Error: [-0.03469862] #Correct: 15108 #Trained: 18001 Training Accuracy: 83.93 %
Progress: 79.17 % Speed(reviews/sec): 593.75 Error: [-0.99460657] #Correct: 15982 #Trained: 19001 Training Accuracy: 84.11 %
Progress: 83.33 % Speed(reviews/sec): 606.06 Error: [-0.0523489] #Correct: 16867 #Trained: 20001 Training Accuracy: 84.33 %
Progress: 87.50 % Speed(reviews/sec): 617.65 Error: [-0.28370015] #Correct: 17734 #Trained: 21001 Training Accuracy: 84.44 %
Progress: 91.67 % Speed(reviews/sec): 611.11 Error: [-0.33222958] #Correct: 18616 #Trained: 22001 Training Accuracy: 84.61 %
Progress: 95.83 % Speed(reviews/sec): 621.62 Error: [-0.17177784] #Correct: 19475 #Trained: 23001 Training Accuracy: 84.67 %
Training Time: 0:00:38.794351
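
As a quick sanity check (assuming the model exposes the word_to_index mapping used below), we can confirm that the zeroed-out cutoffs kept the full training vocabulary.

print(f"Vocabulary size: {len(mlp_full.word_to_index):,}")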

Now here's a function that scores every word in the vocabulary by its similarity to a target (focus) word, using the dot product of the words' input-to-hidden weight vectors.

def get_most_similar_words(focus: str="horrible", count: int=10) -> list:
    """Returns (word, similarity) pairs for the words most similar to the focus word"""
    most_similar = Counter()
    # the input-to-hidden weights for the focus word
    focus_weights = mlp_full.weights_input_to_hidden[mlp_full.word_to_index[focus]]
    for word in mlp_full.word_to_index:
        most_similar[word] = numpy.dot(
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]],
            focus_weights)
    return most_similar.most_common(count)
print(get_most_similar_words("excellent"))
[('excellent', 0.14672474869646132), ('perfect', 0.12529721850063252), ('great', 0.1072983586254582), ('amazing', 0.10168346112776101), ('wonderful', 0.0971402564667566), ('best', 0.09640599864254018), ('today', 0.09064606014006837), ('fun', 0.08859560811231239), ('loved', 0.07914150763452406), ('definitely', 0.07693307843353574)]

excellent was, of course, most similar to itself, but notice that the other high-scoring words are all positive as well - the network 'learned' which words behave like excellent purely from the training set, because words with similar sentiment end up with similar weight vectors.
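
One caveat: the raw dot product mixes direction with magnitude, so words with large weight vectors can score high regardless of sentiment. Normalizing to cosine similarity separates the two - a sketch (the normalization is an addition here, not part of the original lecture):

def get_most_similar_cosine(focus: str="horrible", count: int=10) -> list:
    """Like get_most_similar_words, but using cosine similarity"""
    weights = mlp_full.weights_input_to_hidden
    # scale every word's weight vector to unit length (guarding against zero rows)
    norms = numpy.linalg.norm(weights, axis=1, keepdims=True)
    normalized = weights / numpy.where(norms == 0, 1, norms)
    focus_vector = normalized[mlp_full.word_to_index[focus]]
    most_similar = Counter()
    for word in mlp_full.word_to_index:
        most_similar[word] = numpy.dot(normalized[mlp_full.word_to_index[word]],
                                       focus_vector)
    return most_similar.most_common(count)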

Now a negative example.

print(get_most_similar_words("terrible"))
[('worst', 0.1761389721390966), ('awful', 0.12576492326546337), ('waste', 0.11989143949659276), ('poor', 0.10186721140388931), ('boring', 0.09740050873489904), ('terrible', 0.09719144477251088), ('bad', 0.08198016341605044), ('dull', 0.0812576973066953), ('worse', 0.07504920898991188), ('poorly', 0.07494303321254764)]

Once again, the closer two words are in sentiment, the more similar the weights leading out of their inputs became.

To see this visually, we can grab the 500 most positive and 500 most negative words (using the positive-to-negative ratios calculated earlier), pull out their weight vectors, and project them down to two dimensions.

words_to_visualize = []
for word, ratio in pos_neg_ratios.most_common(500):
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

for word, ratio in list(reversed(pos_neg_ratios.most_common()))[:500]:
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

# green for positive words, black for negative
pos = 0
neg = 0
colors_list = []
vectors_list = []
for word in words_to_visualize:
    if word in pos_neg_ratios:
        vectors_list.append(mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]])
        if pos_neg_ratios[word] > 0:
            pos += 1
            colors_list.append("#00ff00")
        else:
            neg += 1
            colors_list.append("#000000")
from sklearn.manifold import TSNE

# random_state pins the otherwise-stochastic embedding so re-runs match
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(numpy.array(vectors_list))

from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure

output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save", toolbar_location="above", title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:, 0], x2=words_top_ted_tsne[:, 1], names=words_to_visualize, color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6, text_font_size="8pt", text_color="#555555", source=source, text_align='center')
p.add_layout(word_labels)

show(p)
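
Since bokeh isn't in the imports section above while matplotlib and seaborn already are (and FIGURE_SIZE goes unused otherwise), here's a sketch of an equivalent static scatter plot - labeling only a handful of points to keep it readable:

figure_, axe = pyplot.subplots(figsize=FIGURE_SIZE)
axe.set_title("Vector T-SNE for Most Polarized Words")
axe.scatter(words_top_ted_tsne[:, 0], words_top_ted_tsne[:, 1], c=colors_list, s=20)
# annotate only the first twenty words so the plot doesn't turn into a word-pile
for index, word in enumerate(words_to_visualize[:20]):
    axe.annotate(word, (words_top_ted_tsne[index, 0], words_top_ted_tsne[index, 1]),
                 fontsize=8, color="#555555")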