Sentiment Classification Lectures
Table of Contents
Sentiment Classification & How To "Frame Problems" for a Neural Network
by Andrew Trask
- Twitter: @iamtrask
- Blog: http://iamtrask.github.io
What You Should Already Know
- neural networks, forward and back-propagation
- stochastic gradient descent
- mean squared error
- and train/test splits
Where to Get Help if You Need it
- Re-watch previous Udacity Lectures
- Leverage the recommended Course Reading Material - Grokking Deep Learning
- Shoot Andrew a tweet @iamtrask
Set Up
Debug
%load_ext autoreload
%autoreload 2
Imports
Python Standard Library
from collections import Counter
from datetime import datetime
from functools import partial
From PyPI
from graphviz import Graph
from tabulate import tabulate
import matplotlib.pyplot as pyplot
import numpy
import seaborn
This Project
from neurotic.tangles.data_paths import DataPath
Tables
table = partial(tabulate, tablefmt="orgtbl", headers="keys")
Printing
%matplotlib inline
seaborn.set_style("whitegrid")
FIGURE_SIZE = (12, 10)
Analysis: What's Going on in the Weights?
Let's start with a model that doesn't do any noise cancellation: with lower_bound and polarity_cutoff both set to zero, no words get filtered out of the vocabulary.
mlp_full = SentimentNoiseReduction(reviews=x_train, labels=y_train,
                                   lower_bound=0,
                                   polarity_cutoff=0,
                                   learning_rate=0.01)
mlp_full.train()
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 100.00 Error: [-0.38320156] #Correct: 740 #Trained: 1001 Training Accuracy: 73.93 %
Progress: 8.33 % Speed(reviews/sec): 181.82 Error: [-0.26004622] #Correct: 1529 #Trained: 2001 Training Accuracy: 76.41 %
Progress: 12.50 % Speed(reviews/sec): 250.00 Error: [-0.40350302] #Correct: 2376 #Trained: 3001 Training Accuracy: 79.17 %
Progress: 16.67 % Speed(reviews/sec): 285.71 Error: [-0.23990249] #Correct: 3187 #Trained: 4001 Training Accuracy: 79.66 %
Progress: 20.83 % Speed(reviews/sec): 333.33 Error: [-0.14119144] #Correct: 4002 #Trained: 5001 Training Accuracy: 80.02 %
Progress: 25.00 % Speed(reviews/sec): 375.00 Error: [-0.06442389] #Correct: 4829 #Trained: 6001 Training Accuracy: 80.47 %
Progress: 29.17 % Speed(reviews/sec): 411.76 Error: [-0.03508728] #Correct: 5690 #Trained: 7001 Training Accuracy: 81.27 %
Progress: 33.33 % Speed(reviews/sec): 444.44 Error: [-0.05110633] #Correct: 6548 #Trained: 8001 Training Accuracy: 81.84 %
Progress: 37.50 % Speed(reviews/sec): 450.00 Error: [-0.07432703] #Correct: 7404 #Trained: 9001 Training Accuracy: 82.26 %
Progress: 41.67 % Speed(reviews/sec): 476.19 Error: [-0.26512013] #Correct: 8272 #Trained: 10001 Training Accuracy: 82.71 %
Progress: 45.83 % Speed(reviews/sec): 500.00 Error: [-0.14067275] #Correct: 9129 #Trained: 11001 Training Accuracy: 82.98 %
Progress: 50.00 % Speed(reviews/sec): 521.74 Error: [-0.01215903] #Correct: 9994 #Trained: 12001 Training Accuracy: 83.28 %
Progress: 54.17 % Speed(reviews/sec): 541.67 Error: [-0.33825111] #Correct: 10864 #Trained: 13001 Training Accuracy: 83.56 %
Progress: 58.33 % Speed(reviews/sec): 560.00 Error: [-0.00522004] #Correct: 11721 #Trained: 14001 Training Accuracy: 83.72 %
Progress: 62.50 % Speed(reviews/sec): 555.56 Error: [-0.49523538] #Correct: 12553 #Trained: 15001 Training Accuracy: 83.68 %
Progress: 66.67 % Speed(reviews/sec): 571.43 Error: [-0.20026672] #Correct: 13390 #Trained: 16001 Training Accuracy: 83.68 %
Progress: 70.83 % Speed(reviews/sec): 586.21 Error: [-0.20786817] #Correct: 14243 #Trained: 17001 Training Accuracy: 83.78 %
Progress: 75.00 % Speed(reviews/sec): 580.65 Error: [-0.03469862] #Correct: 15108 #Trained: 18001 Training Accuracy: 83.93 %
Progress: 79.17 % Speed(reviews/sec): 593.75 Error: [-0.99460657] #Correct: 15982 #Trained: 19001 Training Accuracy: 84.11 %
Progress: 83.33 % Speed(reviews/sec): 606.06 Error: [-0.0523489] #Correct: 16867 #Trained: 20001 Training Accuracy: 84.33 %
Progress: 87.50 % Speed(reviews/sec): 617.65 Error: [-0.28370015] #Correct: 17734 #Trained: 21001 Training Accuracy: 84.44 %
Progress: 91.67 % Speed(reviews/sec): 611.11 Error: [-0.33222958] #Correct: 18616 #Trained: 22001 Training Accuracy: 84.61 %
Progress: 95.83 % Speed(reviews/sec): 621.62 Error: [-0.17177784] #Correct: 19475 #Trained: 23001 Training Accuracy: 84.67 %
Training Time: 0:00:38.794351
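The log shows the training accuracy climbing from roughly 74% after the first thousand reviews to about 84.7% by the end of the pass, so the full-vocabulary network does learn, it just has to fight through a lot of noise words to do it.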
Now here's a function that finds the words in the vocabulary most similar to a given word, scoring each pair by the dot product of their weight vectors from the input layer to the hidden layer.
def get_most_similar_words(focus: str="horrible", count: int=10) -> list:
    """Returns a list of (word, similarity) tuples based on the weights"""
    most_similar = Counter()
    for word in mlp_full.word_to_index:
        most_similar[word] = numpy.dot(
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]],
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[focus]])
    return most_similar.most_common(count)
print(get_most_similar_words("excellent"))
[('excellent', 0.14672474869646132), ('perfect', 0.12529721850063252), ('great', 0.1072983586254582), ('amazing', 0.10168346112776101), ('wonderful', 0.0971402564667566), ('best', 0.09640599864254018), ('today', 0.09064606014006837), ('fun', 0.08859560811231239), ('loved', 0.07914150763452406), ('definitely', 0.07693307843353574)]
Excellent was, of course, most similar to itself, but we can also see that the closer two words are in sentiment, the more similar their weight vectors become - the network has 'learned' which words resemble excellent using only the training set.
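One caveat: the raw dot product mixes direction with magnitude, so words with large weight vectors can dominate the rankings. As a scale-free alternative - my addition, not part of the original lecture - here is a minimal sketch that normalizes the scores to cosine similarity, reusing the same mlp_full attributes as above.
from collections import Counter

import numpy

def get_most_similar_words_cosine(focus: str="horrible", count: int=10) -> list:
    """Returns (word, cosine similarity) tuples for the focus word"""
    focus_vector = mlp_full.weights_input_to_hidden[mlp_full.word_to_index[focus]]
    focus_norm = numpy.linalg.norm(focus_vector)
    most_similar = Counter()
    for word in mlp_full.word_to_index:
        vector = mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]]
        # dividing by both norms keeps long weight vectors from dominating
        # (the small epsilon guards against an all-zero vector)
        most_similar[word] = numpy.dot(vector, focus_vector) / (
            numpy.linalg.norm(vector) * focus_norm + 1e-10)
    return most_similar.most_common(count)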
Now a negative example.
print(get_most_similar_words("terrible"))
[('worst', 0.1761389721390966), ('awful', 0.12576492326546337), ('waste', 0.11989143949659276), ('poor', 0.10186721140388931), ('boring', 0.09740050873489904), ('terrible', 0.09719144477251088), ('bad', 0.08198016341605044), ('dull', 0.0812576973066953), ('worse', 0.07504920898991188), ('poorly', 0.07494303321254764)]
Once again, the more similar two words were in sentiment, the closer together the weights leading out of their input nodes became.
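The next block plots the most polarized words, using the pos_neg_ratios counter built earlier in this series. As a reminder, here is a minimal sketch of one common way to compute such log positive-to-negative ratios - the variable names, the "POSITIVE" label value, and the minimum count of 50 are assumptions, not necessarily what the earlier code used.
from collections import Counter

import numpy

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

# tally how often each word appears in positive and in negative reviews
# (assumes the reviews are whitespace-separated strings)
for review, label in zip(x_train, y_train):
    for word in review.split(" "):
        total_counts[word] += 1
        if label == "POSITIVE":
            positive_counts[word] += 1
        else:
            negative_counts[word] += 1

pos_neg_ratios = Counter()
for word, count in total_counts.items():
    if count >= 50:  # skip rare words whose ratios would mostly be noise
        # log-ratio: positive words land above 0, negative below, neutral near 0
        # (add-one smoothing keeps the ratio finite for one-sided words)
        pos_neg_ratios[word] = numpy.log(
            (positive_counts[word] + 1.0) / (negative_counts[word] + 1.0))
Back to the lecture's visualization code.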
import matplotlib.colors as colors

words_to_visualize = list()
# grab the 500 most positive and 500 most negative words
# that made it into the network's vocabulary
for word, ratio in pos_neg_ratios.most_common(500):
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

pos = 0
neg = 0
colors_list = list()
vectors_list = list()
# collect each word's weight vector and color it green (positive) or black (negative)
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]])
        if pos_neg_ratios[word] > 0:
            pos += 1
            colors_list.append("#00ff00")
        else:
            neg += 1
            colors_list.append("#000000")
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show

output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:, 0],
                                    x2=words_top_ted_tsne[:, 1],
                                    names=words_to_visualize,
                                    color=colors_list))
p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                       text_font_size="8pt", text_color="#555555",
                       source=source, text_align="center")
p.add_layout(word_labels)
show(p)
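Since the loop above colored positive words green and negative words black, a network that has really learned sentiment should produce a t-SNE projection where the green and black points form largely separate clusters.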