Removing Noise

Set Up

Imports

Python

from collections import Counter
from functools import partial
from pathlib import Path
import pickle

PyPi

from tabulate import tabulate
import numpy

This Project

from network_helpers import update_input_layer
from neurotic.tangles.data_paths import DataPath

Constants

SPLIT_ON_THIS = " "

Understanding Neural Noise

We're going to try to figure out why the Neural Network isn't improving as much as we want it to. First, let's check out the first review.

The Pickles

with DataPath("total_counts.pkl").from_folder.open("rb") as reader:
    total_counts = pickle.load(reader)
vocab_size = len(total_counts)
path = DataPath("reviews.pkl")
with path.from_folder.open("rb") as reader:
    reviews = pickle.load(reader)
vocab = total_counts.keys()
word2index = {word: index for index, word in enumerate(vocab)}
layer_0 = numpy.zeros((1, vocab_size))
counter = update_input_layer(reviews[0], layer_0, word2index)
print(sorted(layer_0[0], reverse=True)[:10])
[27.0, 15.0, 9.0, 6.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0]

So it looks like two of the words in the first review have a disproportionate representation. Let's see what they are.

table = partial(tabulate, tablefmt="orgtbl", headers="keys")
def print_most_common(counter: Counter, count: int=10, bottom: bool=False) -> None:
    """Prints the first `count` tokens (sorted alphabetically) as an org-table"""
    tokens, counts = [], []
    for token, token_count in sorted(counter.items(), reverse=bottom)[:count]:
        tokens.append(token)
        counts.append(token_count)
    print(table(dict(Token=tokens, Count=counts)))
    return
print_most_common(counter)
| Token  | Count |
|--------+-------|
|        |    15 |
| .      |    27 |
| a      |     4 |
| about  |     1 |
| adults |     1 |
| age    |     1 |
| all    |     1 |
| and    |     1 |
| as     |     2 |
| at     |     2 |

So, as I noted before, the dominant tokens aren't revealing as far as sentiment goes. A smarter tokenizer would probably have helped.

print_most_common(total_counts)
| Token       |   Count |
|-------------+---------|
|             | 1049343 |
| .           |  327192 |
| a           |  163009 |
| aa          |       5 |
| aaa         |       9 |
| aaaaaaah    |       1 |
| aaaaah      |       1 |
| aaaaatch    |       1 |
| aaaahhhhhhh |       1 |
| aaaand      |       1 |

You can see that it gets even worse when you look at the overall corpus. All these useless tokens add too much noise to the dataset.

Reducing Noise in Our Input Data

Rather than using a tokenizer that knows stop-words, punctuation, etc., we're going to stop using counts and rely on the neural network to figure out which weights between the input layer and the hidden layer to zero out. To do this we just have to convert the inputs from word counts to binary inputs (is the token in the review or not?).

I'm going to keep extending this class, so I'll tangle it out to a module named sentiment_renetwork so that I can import it elsewhere. The next two blocks are what actually makes up that module.

from sentiment_network import SentimentNetwork
class SentimentRenetwork(SentimentNetwork):
    """Re-do of the Sentiment Network

    .. uml::

       SentimentRenetwork --|> SentimentNetwork

    This is a re-implementation that doesn't use counts as inputs
    """
    def update_input_layer(self, review: str) -> None:
        """Update the input layer with binary token indicators

        Args:
         review: A movie review
        """
        self.input_layer *= 0
        tokens = set(review.split(self.tokenizer))
        for token in tokens:
            if token in self.word_to_index:
                self.input_layer[:, self.word_to_index[token]] = 1
        return
from sentiment_renetwork import SentimentRenetwork
sentimental = SentimentRenetwork(learning_rate=0.1, verbose=True)
with DataPath("x_train.pkl").from_folder.open("rb") as reader:
    x_train = pickle.load(reader)

with DataPath("y_train.pkl").from_folder.open("rb") as reader:
    y_train = pickle.load(reader)
sentimental.train(x_train, y_train)
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 250.00 Error: [-0.12803969] #Correct: 745 #Trained: 1001 Training Accuracy: 74.43 %
Progress: 8.33 % Speed(reviews/sec): 285.71 Error: [-0.05466563] #Correct: 1542 #Trained: 2001 Training Accuracy: 77.06 %
Progress: 12.50 % Speed(reviews/sec): 300.00 Error: [-0.76659525] #Correct: 2378 #Trained: 3001 Training Accuracy: 79.24 %
Progress: 16.67 % Speed(reviews/sec): 285.71 Error: [-0.13244093] #Correct: 3185 #Trained: 4001 Training Accuracy: 79.61 %
Progress: 20.83 % Speed(reviews/sec): 294.12 Error: [-0.03716464] #Correct: 3997 #Trained: 5001 Training Accuracy: 79.92 %
Progress: 25.00 % Speed(reviews/sec): 300.00 Error: [-0.00921009] #Correct: 4835 #Trained: 6001 Training Accuracy: 80.57 %
Progress: 29.17 % Speed(reviews/sec): 304.35 Error: [-0.00274399] #Correct: 5703 #Trained: 7001 Training Accuracy: 81.46 %
Progress: 33.33 % Speed(reviews/sec): 307.69 Error: [-0.0040905] #Correct: 6555 #Trained: 8001 Training Accuracy: 81.93 %
Progress: 37.50 % Speed(reviews/sec): 300.00 Error: [-0.02414385] #Correct: 7412 #Trained: 9001 Training Accuracy: 82.35 %
Progress: 41.67 % Speed(reviews/sec): 303.03 Error: [-0.11133286] #Correct: 8282 #Trained: 10001 Training Accuracy: 82.81 %
Progress: 45.83 % Speed(reviews/sec): 305.56 Error: [-0.05147756] #Correct: 9143 #Trained: 11001 Training Accuracy: 83.11 %
Progress: 50.00 % Speed(reviews/sec): 300.00 Error: [-0.00178148] #Correct: 10006 #Trained: 12001 Training Accuracy: 83.38 %
Progress: 54.17 % Speed(reviews/sec): 302.33 Error: [-0.3016099] #Correct: 10874 #Trained: 13001 Training Accuracy: 83.64 %
Progress: 58.33 % Speed(reviews/sec): 304.35 Error: [-0.00105685] #Correct: 11741 #Trained: 14001 Training Accuracy: 83.86 %
Progress: 62.50 % Speed(reviews/sec): 306.12 Error: [-0.49072786] #Correct: 12584 #Trained: 15001 Training Accuracy: 83.89 %
Progress: 66.67 % Speed(reviews/sec): 307.69 Error: [-0.18036635] #Correct: 13414 #Trained: 16001 Training Accuracy: 83.83 %
Progress: 70.83 % Speed(reviews/sec): 309.09 Error: [-0.17892538] #Correct: 14265 #Trained: 17001 Training Accuracy: 83.91 %
Progress: 75.00 % Speed(reviews/sec): 305.08 Error: [-0.00702446] #Correct: 15127 #Trained: 18001 Training Accuracy: 84.03 %
Progress: 79.17 % Speed(reviews/sec): 306.45 Error: [-0.99885025] #Correct: 16000 #Trained: 19001 Training Accuracy: 84.21 %
Progress: 83.33 % Speed(reviews/sec): 307.69 Error: [-0.02833534] #Correct: 16873 #Trained: 20001 Training Accuracy: 84.36 %
Progress: 87.50 % Speed(reviews/sec): 308.82 Error: [-0.22776195] #Correct: 17746 #Trained: 21001 Training Accuracy: 84.50 %
Progress: 91.67 % Speed(reviews/sec): 305.56 Error: [-0.22165232] #Correct: 18630 #Trained: 22001 Training Accuracy: 84.68 %
Progress: 95.83 % Speed(reviews/sec): 306.67 Error: [-0.13901935] #Correct: 19489 #Trained: 23001 Training Accuracy: 84.73 %
Training Time: 0:01:18.649050
with DataPath("sentimental_renetwork.pkl", check_exists=False).from_folder.open("wb") as writer:
    pickle.dump(sentimental, writer)

Here's how well it does on the test-set.
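The test-set pickles were written out earlier when the data was split; a minimal sketch to re-load them (assuming the same folder layout as the other pickles):

with DataPath("x_test.pkl").from_folder.open("rb") as reader:
    x_test = pickle.load(reader)

with DataPath("y_test.pkl").from_folder.open("rb") as reader:
    y_test = pickle.load(reader)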

sentimental.test(x_test, y_test)
Progress: 0.00% Speed(reviews/sec): 0.00 #Correct: 1 #Tested: 1 Testing Accuracy: 100.00 %
Progress: 10.00% Speed(reviews/sec): 0.00 #Correct: 92 #Tested: 101 Testing Accuracy: 91.09 %
Progress: 20.00% Speed(reviews/sec): 0.00 #Correct: 178 #Tested: 201 Testing Accuracy: 88.56 %
Progress: 30.00% Speed(reviews/sec): 0.00 #Correct: 268 #Tested: 301 Testing Accuracy: 89.04 %
Progress: 40.00% Speed(reviews/sec): 0.00 #Correct: 351 #Tested: 401 Testing Accuracy: 87.53 %
Progress: 50.00% Speed(reviews/sec): 0.00 #Correct: 442 #Tested: 501 Testing Accuracy: 88.22 %
Progress: 60.00% Speed(reviews/sec): 0.00 #Correct: 533 #Tested: 601 Testing Accuracy: 88.69 %
Progress: 70.00% Speed(reviews/sec): 0.00 #Correct: 610 #Tested: 701 Testing Accuracy: 87.02 %
Progress: 80.00% Speed(reviews/sec): 0.00 #Correct: 689 #Tested: 801 Testing Accuracy: 86.02 %
Progress: 90.00% Speed(reviews/sec): 0.00 #Correct: 777 #Tested: 901 Testing Accuracy: 86.24 %

Oddly, it does better on the test set than on the training set, though the training accuracy shown above is a running average over the whole pass, including the early reviews seen before the weights had learned much.

Tensors In PyTorch

Introduction to Deep Learning with PyTorch

In this notebook, you'll get introduced to PyTorch, a framework for building and training neural networks. PyTorch, in a lot of ways, behaves like the arrays you love from Numpy. These Numpy arrays, after all, are just tensors. PyTorch takes these tensors and makes it simple to move them to GPUs for the faster processing needed when training neural networks. It also provides a module that automatically calculates gradients (for backpropagation!) and another module specifically for building neural networks. All together, PyTorch ends up being more coherent with Python and the Numpy/Scipy stack compared to TensorFlow and other frameworks.

Neural Networks

Deep Learning is based on artificial neural networks which have been around in some form since the late 1950s. The networks are built from individual parts approximating neurons, typically called units or simply "neurons." Each unit has some number of weighted inputs. These weighted inputs are summed together (a linear combination) then passed through an activation function to get the unit's output.

Mathematically this looks like:

\[ y = f(w_1 x_1 + w_2 x_2 + b) \\ y = f\left(\sum_i w_i x_i +b \right) \]

With vectors this is the dot/inner product of two vectors:

\[
h = \begin{bmatrix} x_1 \, x_2 \cdots x_n \end{bmatrix}
\begin{bmatrix}
w_1 \\
w_2 \\
\vdots \\
w_n
\end{bmatrix}
\]
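As a quick numeric check (the inputs, weights, and bias here are made up for illustration), the sum form and the dot-product form give the same answer:

import numpy

def sigmoid(x):
    """Squash x into the range (0, 1)."""
    return 1/(1 + numpy.exp(-x))

# hypothetical values for a single two-input unit
x = numpy.array([0.2, -0.5])
w = numpy.array([0.4, 0.1])
b = 0.05

y_sum = sigmoid(w[0] * x[0] + w[1] * x[1] + b)  # f(w1*x1 + w2*x2 + b)
y_dot = sigmoid(x.dot(w) + b)                   # f(x . w + b)
assert numpy.isclose(y_sum, y_dot)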

Tensors

It turns out neural network computations are just a bunch of linear algebra operations on tensors, a generalization of matrices. A vector is a 1-dimensional tensor, a matrix is a 2-dimensional tensor, an array with three indices is a 3-dimensional tensor (RGB color images for example). The fundamental data structure for neural networks are tensors and PyTorch (as well as pretty much every other deep learning framework) is built around tensors.
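A small sketch of those dimensionalities in PyTorch (the shapes are arbitrary examples):

import torch

vector = torch.randn(3)           # 1-dimensional tensor
matrix = torch.randn(3, 4)        # 2-dimensional tensor
image = torch.randn(3, 32, 32)    # 3-dimensional tensor (e.g. an RGB image)
print(vector.dim(), matrix.dim(), image.dim())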

Now that we have the basics covered, it's time to explore how we can use PyTorch to build a simple neural network.

Imports

From PyPi

import numpy
import torch

The Activation Function

Using PyTorch's exp function looks a lot like it does with Numpy.

def activation(x):
    """ Sigmoid activation function 

       Arguments
       ---------
       x: torch.Tensor
    """
    return 1/(1 + torch.exp(-x))

Generate some data

Set the random seed so things are predictable.

torch.manual_seed(7)

Features

Our features will be a tensor of 5 random normal variables created with torch.randn.

features = torch.randn((1, 5))

True weights for our data, random normal variables again

randn_like creates a tensor of random numbers that is the same size as the tensor it is given.

weights = torch.randn_like(features)

And a true bias term.

bias = torch.randn((1, 1))

Above I generated data we can use to get the output of our simple network. This is all just random for now; going forward we'll start using normal data. Going through each relevant line:

`features = torch.randn((1, 5))` creates a tensor with shape `(1, 5)`, one row and five columns, that contains values randomly distributed according to the normal distribution with a mean of zero and standard deviation of one.

`weights = torch.randn_like(features)` creates another tensor with the same shape as `features`, again containing values from a normal distribution.

Finally, `bias = torch.randn((1, 1))` creates a single value from a normal distribution.

PyTorch tensors can be added, multiplied, subtracted, etc, just like Numpy arrays. In general, you'll use PyTorch tensors pretty much the same way you'd use Numpy arrays. They come with some nice benefits though such as GPU acceleration which we'll get to later. For now, use the generated data to calculate the output of this simple single layer network.

Exercise: Calculate the output of the network with input features `features`, weights `weights`, and bias `bias`. Similar to Numpy, PyTorch has a torch.sum() function, as well as a `.sum()` method on tensors, for taking sums. Use the function `activation` defined above as the activation function.

Calculate the output of this network using the weights and bias tensors
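One minimal solution sketch, using an element-wise multiply and a sum (the matrix-multiplication version comes below):

# element-wise multiply, sum, add the bias, then activate
y = activation(torch.sum(features * weights) + bias)
# equivalent, using the tensor method instead of the function
y = activation((features * weights).sum() + bias)
print(y)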

You can do the multiplication and sum in the same operation using a matrix multiplication. In general, you'll want to use matrix multiplications since they are more efficient and accelerated using modern libraries and high-performance computing on GPUs.

Here, we want to do a matrix multiplication of the features and the weights. For this we can use `torch.mm()` or `torch.matmul()`, which is somewhat more complicated and supports broadcasting. If we try to do it with `features` and `weights` as they are, we'll get an error:

torch.mm(features, weights);
 
 ---------------------------------------------------------------------------
 RuntimeError                              Traceback (most recent call last)
 <python-input-13-15d592eb5279> in <module>()
 ----> 1 torch.mm(features, weights)
 
 RuntimeError: size mismatch, m1: [1 x 5], m2: [1 x 5] at /Users/soumith/minicondabuild3/conda-bld/pytorch_1524590658547/work/aten/src/TH/generic/THTensorMath.c:2033

As you're building neural networks in any framework, you'll see this often. Really often. What's happening here is our tensors aren't the correct shapes to perform a matrix multiplication. Remember that for matrix multiplications, the number of columns in the first tensor must equal the number of rows in the second tensor. Both `features` and `weights` have the same shape, `(1, 5)`. This means we need to change the shape of `weights` to get the matrix multiplication to work.

Note: To see the shape of a tensor called `tensor`, use `tensor.shape`. If you're building neural networks, you'll be using this attribute often.
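For example, both of our tensors report the same shape:

print(features.shape)  # torch.Size([1, 5])
print(weights.shape)   # torch.Size([1, 5])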

There are a few options here: `weights.reshape()`, `weights.resize_()`, and `weights.view()`.

  • `weights.reshape(a, b)` will return a new tensor with the same data as `weights` and size `(a, b)`. Sometimes the returned tensor shares the underlying data and sometimes it's a clone, as in it copies the data to another part of memory.
  • `weights.resize_(a, b)` returns the same tensor with a different shape. However, if the new shape results in fewer elements than the original tensor, some elements will be removed from the tensor (but not from memory). If the new shape results in more elements than the original tensor, new elements will be uninitialized in memory. Here I should note that the underscore at the end of the method denotes that it operates in-place. Here is a great forum thread to read more about in-place operations in PyTorch.
  • `weights.view(a, b)` will return a new tensor with the same data as `weights` with size `(a, b)`.

I usually use `.view()`, but any of the three methods will work for this. So, now we can reshape `weights` to have five rows and one column with something like `weights.view(5, 1)`.

Exercise: Calculate the output of our little network using matrix multiplication.

weights = weights.view(5, 1)
product = torch.matmul(features, weights)
total = torch.add(product, bias)
print(activation(total.sum()))
tensor(0.1595)

Stack them up!

That's how you can calculate the output for a single neuron. The real power of this algorithm happens when you start stacking these individual units into layers and stacks of layers, into a network of neurons. The output of one layer of neurons becomes the input for the next layer. With multiple input units and output units, we now need to express the weights as a matrix.

The first layer, shown on the bottom here, contains the inputs and is understandably called the input layer. The middle layer is called the hidden layer, and the final layer (on the right) is the output layer. We can express this network mathematically with matrices again and use matrix multiplication to get linear combinations for each unit in one operation. For example, the hidden layer (\(h_1\) and \(h_2\) here) can be calculated as

\[
\vec{h} = [h_1 \, h_2] =
\begin{bmatrix} x_1 \, x_2 \cdots \, x_n \end{bmatrix}
\begin{bmatrix}
w_{11} & w_{12} \\
w_{21} & w_{22} \\
\vdots & \vdots \\
w_{n1} & w_{n2}
\end{bmatrix}
\]

The output for this small network is found by treating the hidden layer as inputs for the output unit. The network output is expressed simply as

\[ y = f_2 \! \left(\, f_1 \! \left(\vec{x} \, \mathbf{W_1}\right) \mathbf{W_2} \right) \]

Generate some data

Set the random seed so things are predictable

torch.manual_seed(7)

The features are 3 random normal variables that will be your input.

features = torch.randn((1, 3))

Define the size of each layer in our network

n_input = features.shape[1]     # Number of input units, must match number of input features
n_hidden = 2                    # Number of hidden units 
n_output = 1                    # Number of output units

Weights for inputs to hidden layer

W1 = torch.randn(n_input, n_hidden)

Weights for hidden layer to output layer

W2 = torch.randn(n_hidden, n_output)

and bias terms for hidden and output layers

B1 = torch.randn((1, n_hidden))
B2 = torch.randn((1, n_output))

Exercise: Calculate the output for this multi-layer network using the weights `W1` & `W2`, and the biases, `B1` & `B2`.

# Note: these lines add the biases after the activation; the more conventional
# form would be activation(torch.matmul(features, W1) + B1).
hidden_out = activation(torch.matmul(features, W1)) + B1
output = activation(torch.matmul(hidden_out, W2)) + B2
print(output)
tensor([[0.4813]])
expected = numpy.array([[0.4813]])
numpy.testing.assert_allclose(output.numpy(), expected, atol=0.000305)

If you did this correctly, you should see the output tensor([[ 0.4813]]).

The number of hidden units is a parameter of the network, often called a hyperparameter to differentiate it from the weights and biases. As you'll see later when we discuss training a neural network, the more hidden units a network has, and the more layers, the better able it is to learn from data and make accurate predictions.

Numpy to Torch and back

Special bonus section! PyTorch has a great feature for converting between Numpy arrays and Torch tensors. To create a tensor from a Numpy array, use torch.from_numpy(). To convert a tensor to a Numpy array, use the .numpy() method.

a = numpy.random.rand(4,3)
print(a)
[[0.07665652 0.06831265 0.7607324 ]
 [0.71495335 0.34479699 0.67489027]
 [0.45834284 0.78789824 0.40383355]
 [0.28864364 0.21713754 0.62036028]]
b = torch.from_numpy(a)
print(b)
tensor([[0.0767, 0.0683, 0.7607],
        [0.7150, 0.3448, 0.6749],
        [0.4583, 0.7879, 0.4038],
        [0.2886, 0.2171, 0.6204]], dtype=torch.float64)
print(b.numpy())
[[0.07665652 0.06831265 0.7607324 ]
 [0.71495335 0.34479699 0.67489027]
 [0.45834284 0.78789824 0.40383355]
 [0.28864364 0.21713754 0.62036028]]

The memory is shared between the Numpy array and Torch tensor, so if you change the values in-place of one object, the other will change as well.

Multiply PyTorch Tensor by 2, in place

b.mul_(2)

Numpy array matches new values from Tensor

print(a)
[[0.15331305 0.1366253  1.52146479]
 [1.4299067  0.68959399 1.34978053]
 [0.91668568 1.57579648 0.80766711]
 [0.57728729 0.43427509 1.24072056]]

The Sentiment Analyzer

Set Up

Imports

Python

import pickle

This Project

from neurotic.tangles.data_paths import DataPath

The Data

path = DataPath("reviews.pkl")
with path.from_folder.open('rb') as reader:
    reviews = pickle.load(reader)

The Labels

A similar deal, except the labels were cast to upper case before they were pickled.

path = DataPath("labels.pkl")
with path.from_folder.open('rb') as reader:
    labels = pickle.load(reader)

Note: The data in reviews.txt we're using has already been preprocessed a bit and contains only lower case characters. If we were working from raw data, where we didn't know it was all lower case, we would want to add a step here to convert it. That's so we treat different variations of the same word, like `The`, `the`, and `THE`, all the same way.
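If we did need that step, it would be a single pass over the lines (sketched here with the same names used above):

reviews = [review.lower() for review in reviews]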

Encapsulate our neural network in a class

I'm going to try to break up the class so that I can make notes. You can't really do that in a notebook, though, so I'm going to tangle it out. The following class is going to end up in a module named sentiment_network.

<<imports>>

<<constants>>

<<sentiment-network>>

    <<sentiment-network-review-vocabulary>>

    <<sentiment-network-review-vocabulary-size>>

    <<sentiment-network-label-vocabulary>>

    <<sentiment-network-label-vocabulary-size>>

    <<sentiment-network-word-to-index>>

    <<sentiment-network-label-to-index>>

    <<sentiment-network-input-nodes>>

    <<sentiment-network-weights-input-to-hidden>>

    <<sentiment-network-weights-hidden-to-output>>

    <<sentiment-network-input-layer>>

<<sentiment-network-update-input-layer>>

<<sentiment-network-get-target-for-label>>

<<sentiment-network-sigmoid>>

<<sentiment-network-sigmoid-output-2-derivative>>

<<sentiment-network-train>>

<<sentiment-network-test>>

<<sentiment-network-run>>

Imports

# From python
from collections import Counter
from datetime import datetime
from typing import (
    List,
    Union,
    )
# from pypi
import numpy

Constants

SPLIT_ON_THIS = " "
Review = List[str]
Label = List[str]
Classification = Union[int, str]

Sentiment Network Constructor

To make this more like a SKlearn implementation I'm not going to add the training and testing data at this point. This will break one of the examples given. Oh well.

class SentimentNetwork:
    """A network to predict if a review is positive or negative

    Args:
     hidden_nodes: Number of nodes to create in the hidden layer
     learning_rate: Learning rate to use while training        
     output_nodes: Number of output nodes (should always be 1)
     tokenizer: what to split on
     verbose: whether to output update information
    """
    def __init__(self,
                 hidden_nodes: int=10, 
                 learning_rate: float=0.1,
                 output_nodes: int=1,
                 tokenizer:str=" ",
                 verbose:bool=False) -> None:
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development
        numpy.random.seed(1)
        self.hidden_nodes = hidden_nodes
        self.learning_rate = learning_rate
        self.output_nodes = output_nodes
        self.tokenizer = tokenizer
        self.verbose = verbose
        self._review_vocabulary = None
        self._label_vocabulary = None
        self._review_vocabulary_size = None
        self._label_vocabulary_size = None
        self._word_to_index = None
        self._label_to_index = None
        self._input_nodes = None
        self._weights_input_to_hidden = None
        self._weights_hidden_to_output = None
        self._input_layer = None
        return

The Review Vocabulary

This takes the training reviews and tokenizes them so we have a set of unique tokens to work with. This requires that self.reviews and self.tokenizer are set.

@property
def review_vocabulary(self) -> List:
    """list of tokens in the reviews"""
    if self._review_vocabulary is None:
        vocabulary = set()
        for review in self.reviews:
            vocabulary.update(set(review.split(self.tokenizer)))
        self._review_vocabulary = list(vocabulary)
    return self._review_vocabulary

The Review Vocabulary Size

This is the number of tokens we ended up with after tokenizing the training reviews.

@property
def review_vocabulary_size(self) -> int:
    """The amount of tokens in our reviews"""
    if self._review_vocabulary_size is None:
        self._review_vocabulary_size = len(self.review_vocabulary)
    return self._review_vocabulary_size

The Label Vocabulary

These are the labels - there should only be two in this case. This requires that self.labels has been set.

@property
def label_vocabulary(self) -> List:
    """List of sentiment labels"""
    if self._label_vocabulary is None:
        self._label_vocabulary = list(set(self.labels))
    return self._label_vocabulary

The Label Vocabulary Size

The number of labels we ended up with.

@property
def label_vocabulary_size(self) -> int:
    """The amount of tokens in our labels"""
    if self._label_vocabulary_size is None:
        self._label_vocabulary_size = len(self.label_vocabulary)
    return self._label_vocabulary_size

The Word To Index Map

This is a map to find the index in our review vocabulary where a word is. This requires that self.review_vocabulary has been set.

@property
def word_to_index(self) -> dict:
    """maps a word to the index in our review vocabulary"""
    if self._word_to_index is None:
        self._word_to_index = {
            word: index
            for index, word in enumerate(self.review_vocabulary)}
    return self._word_to_index

The Label To Index Map

This finds the index where a label is in our vocabulary of labels. This requires that self.label_vocabulary has been set.

@property
def label_to_index(self) -> dict:
    """maps a label to the index in our label vocabulary"""
    if self._label_to_index is None:
        self._label_to_index = {
            label: index
            for index, label in enumerate(self.label_vocabulary)}
    return self._label_to_index

Input Nodes

The number of input nodes is the size of our vocabulary built from the reviews. This requires self.review_vocabulary to have been set.

@property
def input_nodes(self) -> int:
    """Number of input nodes"""
    if self._input_nodes is None:
        self._input_nodes = len(self.review_vocabulary)
    return self._input_nodes

Weight From the Input Layer To the Hidden Layer

This is a matrix with as many rows as the number of input nodes and as many columns as the number of hidden nodes. This relies on self.input_nodes and self.hidden_nodes.

@property
def weights_input_to_hidden(self) -> numpy.ndarray:
    """Weights for edges from input to hidden layer"""
    if self._weights_input_to_hidden is None:
        self._weights_input_to_hidden = numpy.zeros(
            (self.input_nodes, self.hidden_nodes))
    return self._weights_input_to_hidden

@weights_input_to_hidden.setter
def weights_input_to_hidden(self, weights: numpy.ndarray) -> None:
    """Set the weights"""
    self._weights_input_to_hidden = weights
    return

Weight From the Hidden Layer To the Output Layer

This is a matrix with as many rows as the number of hidden nodes and as many columns as the number of output nodes (which should be 1). This depends on self.hidden_nodes and self.output_nodes.

@property
def weights_hidden_to_output(self) -> numpy.ndarray:
    """Weights for edges from hidden to output layer"""
    if self._weights_hidden_to_output is None:
        self._weights_hidden_to_output = numpy.random.random(
            (self.hidden_nodes, self.output_nodes))
    return self._weights_hidden_to_output

@weights_hidden_to_output.setter
def weights_hidden_to_output(self, weights: numpy.ndarray) -> None:
    """updates the weights"""
    self._weights_hidden_to_output = weights
    return

The Input Layer

This is the layer where we will set the tokens for a particular review that we are going to categorize. This depends on self.input_nodes.

@property
def input_layer(self) -> numpy.ndarray:
    """The Input Layer for the review tokens"""
    if self._input_layer is None:
        self._input_layer = numpy.zeros((1, self.input_nodes))
    return self._input_layer

@input_layer.setter
def input_layer(self, layer: numpy.ndarray) -> None:
    """Set the input layer"""
    self._input_layer = layer
    return

Update the Input Layer

def update_input_layer(self, review: str) -> None:
    """Update the counts in the input layer

    Args:
     review: A movie review
    """
    # reset any previous inputs
    self.input_layer *= 0
    tokens = review.split(self.tokenizer)
    counter = Counter()
    counter.update(tokens)
    for key, value in counter.items():
        if key in self.word_to_index:
            self.input_layer[:, self.word_to_index[key]] = value
    return

Get the Target for the Label

This converts a string label to an integer.

def get_target_for_label(self, label: str) -> int:
    """Convert a label to `0` or `1`.
    Args:
     label(string) - Either "POSITIVE" or "NEGATIVE".
    Returns:
     `0` or `1`.
    """
    return 1 if label=="POSITIVE" else 0

The Sigmoid

def sigmoid(self, x: numpy.ndarray) -> numpy.ndarray:
    """calculates the sigmoid for the input

    Args:
     x: vector to calculate the sigmoid

    Returns:
     sigmoid of x
    """
    return 1/(1 + numpy.exp(-x))

Sigmoid Derivative

def sigmoid_output_to_derivative(self, output: numpy.ndarray) -> numpy.ndarray:
    """Calculates the derivative if the sigmoid

    Args:
     output: the sigmoid output
    """
    return output * (1 - output)
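This shortcut works because the sigmoid's derivative can be written in terms of its own output:

\[ \sigma'(x) = \sigma(x)\left(1 - \sigma(x)\right) \]

Since the forward pass already computed \(\sigma(x)\), we get the derivative without re-evaluating the exponential.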

Train the Network

def train(self, training_reviews: Review, training_labels: Label) -> int:
    """Trains the model

    Args:
     training_reviews: list of reviews
     training_labels: list of labels for the reviews

    Returns:
     count of correct
    """
    # there are side-effects that require self.reviews and self.labels
    self.reviews, self.labels = training_reviews, training_labels

    assert(len(training_reviews) == len(training_labels))
    correct_so_far = 0

    if self.verbose:        
        # Remember when we started for printing time statistics
        start = datetime.now()

    # loop through all the given reviews and run a forward and backward pass,
    # updating weights for every item
    reviews_labels = zip(training_reviews, training_labels)
    n_records = len(training_reviews)

    for index, (review, label) in enumerate(reviews_labels):
        # feed-forward
        self.update_input_layer(review)
        hidden_inputs = self.input_layer.dot(self.weights_input_to_hidden)
        hidden_outputs = hidden_inputs.dot(self.weights_hidden_to_output)
        output = self.sigmoid(hidden_outputs)

        # Backpropagation
        # we need to calculate the output_error separately
        # to update our correct count
        output_error = output - self.get_target_for_label(label)

        # we applied a sigmoid to the output
        # so we need to apply the derivative
        hidden_to_output_delta = (
            output_error
            * self.sigmoid_output_to_derivative(output))

        input_to_hidden_error = hidden_to_output_delta.dot(
            self.weights_hidden_to_output.T)
        # we didn't apply a function to the inputs to the hidden layer
        # so we don't need a derivative
        input_to_hidden_delta = input_to_hidden_error

        # our delta is based on the derivative which is heading
        # in the opposite direction of what we want so we need to negate it
        self.weights_hidden_to_output -= (
            self.learning_rate
            * hidden_inputs.T.dot(hidden_to_output_delta))
        self.weights_input_to_hidden -= (
            self.learning_rate
            * self.input_layer.T.dot(input_to_hidden_delta))

        if ((output < 0.5 and label=="NEGATIVE")
            or (output >= 0.5 and label=="POSITIVE")):
            correct_so_far += 1
        if self.verbose and not index % 1000:
            elapsed_time = datetime.now() - start
            reviews_per_second = (index/elapsed_time.seconds
                                  if elapsed_time.seconds > 0 else 0)
            print(
                "Progress: {:.2f} %".format(100 * index/len(training_reviews))
                + " Speed(reviews/sec): {:.2f}".format(reviews_per_second)
                + " Error: {}".format(output_error[0])
                + " #Correct: {}".format(correct_so_far)
                + " #Trained: {}".format(index+1)
                + " Training Accuracy: {:.2f} %".format(
                    correct_so_far * 100/float(index+1))
                )
    if self.verbose:
        print("Training Time: {}".format(datetime.now() - start))
    return correct_so_far

Test The Model

def test(self, testing_reviews: list, testing_labels:list) -> int:
    """
    Attempts to predict the labels for the given testing_reviews,
    and uses the test_labels to calculate the accuracy of those predictions.

    Returns:
     correct: number of correct predictions
    """

    # keep track of how many correct predictions we make
    correct = 0

    # we'll time how many predictions per second we make
    start = datetime.now()

    # Loop through each of the given reviews and call run to predict
    # its label.
    reviews_and_labels = zip(testing_reviews, testing_labels)
    for index, (review, label) in enumerate(reviews_and_labels):
        prediction = self.run(review)
        if prediction == label:
            correct += 1

        if not index % 100:
            elapsed_time = datetime.now() - start
            reviews_per_second = (index/elapsed_time.seconds
                                  if elapsed_time.seconds > 0 else 0)

            print(
                "Progress: {:.2f}%".format(
                    100 * index/len(testing_reviews))
                + " Speed(reviews/sec): {:.2f}".format(reviews_per_second)
                + " #Correct: {}".format(correct)
                + " #Tested: {}".format(index + 1)
                + " Testing Accuracy: {:.2f} %".format(
                    correct * 100/(index+1))
            )
    return correct

Run a Prediction

def run(self, review: str) -> str:
    """
    Returns a POSITIVE or NEGATIVE prediction for the given review.
    """
    review = review.lower()
    self.update_input_layer(review)
    hidden_inputs = self.input_layer.dot(self.weights_input_to_hidden)
    hidden_outputs = hidden_inputs.dot(self.weights_hidden_to_output)
    output = self.sigmoid(hidden_outputs)
    return "POSITIVE" if output[0] >= 0.5 else "NEGATIVE"

Test The Network

So now we'll actually run the network to see how it does.

%reload_ext autoreload
from sentiment_network import SentimentNetwork

We'll be using the last 1,000 reviews and labels to test the network and all the rest to train it.

BOUNDARY = -1000
x_test, y_test = reviews[BOUNDARY:],labels[BOUNDARY:]
print(len(x_test))
1000
x_train, y_train = reviews[:BOUNDARY],labels[:BOUNDARY]
print(len(x_train))
24000

Since I split this up into multiple posts I'm going to pickle up the data-sets to make sure that they're only being created once.

pickles = dict(x_test=x_test, y_test=y_test,
               x_train=x_train, y_train=y_train)
for potential_pickle, collection in pickles.items():
    potential_path = DataPath("{}.pkl".format(potential_pickle), check_exists=False)
    if not potential_path.from_folder.is_file():
        with potential_path.from_folder.open("wb") as writer:
            pickle.dump(collection, writer)
untrained = SentimentNetwork(learning_rate=0.1, verbose=True)

Run the following cell to actually train the network. During training, it will display the model's accuracy repeatedly as it trains so you can see how well it's doing.

untrained.train(x_train, y_train)
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 125.00 Error: [-0.50133709] #Correct: 492 #Trained: 1001 Training Accuracy: 49.15 %
Progress: 8.33 % Speed(reviews/sec): 153.85 Error: [-0.46896641] #Correct: 940 #Trained: 2001 Training Accuracy: 46.98 %
Progress: 12.50 % Speed(reviews/sec): 150.00 Error: [-0.76053545] #Correct: 1401 #Trained: 3001 Training Accuracy: 46.68 %
Progress: 16.67 % Speed(reviews/sec): 142.86 Error: [-0.5175674] #Correct: 1860 #Trained: 4001 Training Accuracy: 46.49 %
Progress: 20.83 % Speed(reviews/sec): 142.86 Error: [-0.7057053] #Correct: 2329 #Trained: 5001 Training Accuracy: 46.57 %
Progress: 25.00 % Speed(reviews/sec): 146.34 Error: [-0.87768714] #Correct: 2859 #Trained: 6001 Training Accuracy: 47.64 %
Progress: 29.17 % Speed(reviews/sec): 142.86 Error: [-0.42471556] #Correct: 3376 #Trained: 7001 Training Accuracy: 48.22 %
Progress: 33.33 % Speed(reviews/sec): 140.35 Error: [-0.25287871] #Correct: 3931 #Trained: 8001 Training Accuracy: 49.13 %
Progress: 37.50 % Speed(reviews/sec): 138.46 Error: [-0.13143902] #Correct: 4508 #Trained: 9001 Training Accuracy: 50.08 %
Progress: 41.67 % Speed(reviews/sec): 136.99 Error: [-0.30215181] #Correct: 5141 #Trained: 10001 Training Accuracy: 51.40 %
Progress: 45.83 % Speed(reviews/sec): 137.50 Error: [-0.83628373] #Correct: 5690 #Trained: 11001 Training Accuracy: 51.72 %
Progress: 50.00 % Speed(reviews/sec): 136.36 Error: [-0.2236724] #Correct: 6318 #Trained: 12001 Training Accuracy: 52.65 %
Progress: 54.17 % Speed(reviews/sec): 136.84 Error: [-0.00040756] #Correct: 6873 #Trained: 13001 Training Accuracy: 52.87 %
Progress: 58.33 % Speed(reviews/sec): 137.25 Error: [-0.24857157] #Correct: 7463 #Trained: 14001 Training Accuracy: 53.30 %
Progress: 62.50 % Speed(reviews/sec): 136.36 Error: [-0.56169307] #Correct: 8091 #Trained: 15001 Training Accuracy: 53.94 %
Progress: 66.67 % Speed(reviews/sec): 136.75 Error: [-0.30580514] #Correct: 8710 #Trained: 16001 Training Accuracy: 54.43 %
Progress: 70.83 % Speed(reviews/sec): 136.00 Error: [-0.85096669] #Correct: 9343 #Trained: 17001 Training Accuracy: 54.96 %
Progress: 75.00 % Speed(reviews/sec): 136.36 Error: [-0.0031485] #Correct: 9973 #Trained: 18001 Training Accuracy: 55.40 %
Progress: 79.17 % Speed(reviews/sec): 135.71 Error: [-0.73531052] #Correct: 10671 #Trained: 19001 Training Accuracy: 56.16 %
Progress: 83.33 % Speed(reviews/sec): 136.05 Error: [-0.14522187] #Correct: 11341 #Trained: 20001 Training Accuracy: 56.70 %
Progress: 87.50 % Speed(reviews/sec): 135.48 Error: [-0.38478658] #Correct: 11973 #Trained: 21001 Training Accuracy: 57.01 %
Progress: 91.67 % Speed(reviews/sec): 134.97 Error: [-0.39655627] #Correct: 12678 #Trained: 22001 Training Accuracy: 57.62 %
Progress: 95.83 % Speed(reviews/sec): 134.50 Error: [-0.55767025] #Correct: 13345 #Trained: 23001 Training Accuracy: 58.02 %

That most likely didn't train very well. Part of the reason may be because the learning rate is too high. Run the following cell to recreate the network with a smaller learning rate, `0.01`, and then train the new network.

trainer = SentimentNetwork(learning_rate=0.01, verbose=True)
trainer.train(x_train, y_train)
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 250.00 Error: [-0.73627527] #Correct: 482 #Trained: 1001 Training Accuracy: 48.15 %
Progress: 8.33 % Speed(reviews/sec): 333.33 Error: [-0.27663369] #Correct: 1065 #Trained: 2001 Training Accuracy: 53.22 %
Progress: 12.50 % Speed(reviews/sec): 333.33 Error: [-0.41620613] #Correct: 1743 #Trained: 3001 Training Accuracy: 58.08 %
Progress: 16.67 % Speed(reviews/sec): 333.33 Error: [-0.41925862] #Correct: 2378 #Trained: 4001 Training Accuracy: 59.44 %
Progress: 20.83 % Speed(reviews/sec): 333.33 Error: [-0.3792133] #Correct: 3022 #Trained: 5001 Training Accuracy: 60.43 %
Progress: 25.00 % Speed(reviews/sec): 333.33 Error: [-0.31493906] #Correct: 3670 #Trained: 6001 Training Accuracy: 61.16 %
Progress: 29.17 % Speed(reviews/sec): 333.33 Error: [-0.19472257] #Correct: 4380 #Trained: 7001 Training Accuracy: 62.56 %
Progress: 33.33 % Speed(reviews/sec): 333.33 Error: [-0.20326775] #Correct: 5068 #Trained: 8001 Training Accuracy: 63.34 %
Progress: 37.50 % Speed(reviews/sec): 333.33 Error: [-0.17244992] #Correct: 5751 #Trained: 9001 Training Accuracy: 63.89 %
Progress: 41.67 % Speed(reviews/sec): 333.33 Error: [-0.74943668] #Correct: 6475 #Trained: 10001 Training Accuracy: 64.74 %
Progress: 45.83 % Speed(reviews/sec): 333.33 Error: [-0.34768212] #Correct: 7171 #Trained: 11001 Training Accuracy: 65.18 %
Progress: 50.00 % Speed(reviews/sec): 333.33 Error: [-0.23588717] #Correct: 7895 #Trained: 12001 Training Accuracy: 65.79 %
Progress: 54.17 % Speed(reviews/sec): 325.00 Error: [-0.67639111] #Correct: 8634 #Trained: 13001 Training Accuracy: 66.41 %
Progress: 58.33 % Speed(reviews/sec): 325.58 Error: [-0.18425262] #Correct: 9360 #Trained: 14001 Training Accuracy: 66.85 %
Progress: 62.50 % Speed(reviews/sec): 326.09 Error: [-0.31647149] #Correct: 10083 #Trained: 15001 Training Accuracy: 67.22 %
Progress: 66.67 % Speed(reviews/sec): 326.53 Error: [-0.31838031] #Correct: 10791 #Trained: 16001 Training Accuracy: 67.44 %
Progress: 70.83 % Speed(reviews/sec): 326.92 Error: [-0.71363956] #Correct: 11494 #Trained: 17001 Training Accuracy: 67.61 %
Progress: 75.00 % Speed(reviews/sec): 327.27 Error: [-0.03786987] #Correct: 12237 #Trained: 18001 Training Accuracy: 67.98 %
Progress: 79.17 % Speed(reviews/sec): 327.59 Error: [-0.89039967] #Correct: 12995 #Trained: 19001 Training Accuracy: 68.39 %
Progress: 83.33 % Speed(reviews/sec): 327.87 Error: [-0.19787345] #Correct: 13741 #Trained: 20001 Training Accuracy: 68.70 %
Progress: 87.50 % Speed(reviews/sec): 328.12 Error: [-0.60033441] #Correct: 14484 #Trained: 21001 Training Accuracy: 68.97 %
Progress: 91.67 % Speed(reviews/sec): 323.53 Error: [-0.47631941] #Correct: 15242 #Trained: 22001 Training Accuracy: 69.28 %
Progress: 95.83 % Speed(reviews/sec): 323.94 Error: [-0.47388592] #Correct: 15995 #Trained: 23001 Training Accuracy: 69.54 %
Training Time: 0:01:15.489437

This actually did better, but let's see what a smaller learning rate will do.

trainer = SentimentNetwork(learning_rate=0.001, verbose=True)
trainer.train(x_train, y_train)
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 250.00 Error: [-0.42248049] #Correct: 472 #Trained: 1001 Training Accuracy: 47.15 %
Progress: 8.33 % Speed(reviews/sec): 333.33 Error: [-0.27087125] #Correct: 1046 #Trained: 2001 Training Accuracy: 52.27 %
Progress: 12.50 % Speed(reviews/sec): 333.33 Error: [-0.45852835] #Correct: 1708 #Trained: 3001 Training Accuracy: 56.91 %
Progress: 16.67 % Speed(reviews/sec): 333.33 Error: [-0.41728936] #Correct: 2334 #Trained: 4001 Training Accuracy: 58.34 %
Progress: 20.83 % Speed(reviews/sec): 333.33 Error: [-0.37365937] #Correct: 2959 #Trained: 5001 Training Accuracy: 59.17 %
Progress: 25.00 % Speed(reviews/sec): 315.79 Error: [-0.25350906] #Correct: 3595 #Trained: 6001 Training Accuracy: 59.91 %
Progress: 29.17 % Speed(reviews/sec): 318.18 Error: [-0.22273178] #Correct: 4292 #Trained: 7001 Training Accuracy: 61.31 %
Progress: 33.33 % Speed(reviews/sec): 320.00 Error: [-0.22148954] #Correct: 4985 #Trained: 8001 Training Accuracy: 62.30 %
Progress: 37.50 % Speed(reviews/sec): 321.43 Error: [-0.164888] #Correct: 5670 #Trained: 9001 Training Accuracy: 62.99 %
Progress: 41.67 % Speed(reviews/sec): 322.58 Error: [-0.70030978] #Correct: 6381 #Trained: 10001 Training Accuracy: 63.80 %
Progress: 45.83 % Speed(reviews/sec): 305.56 Error: [-0.37677934] #Correct: 7082 #Trained: 11001 Training Accuracy: 64.38 %
Progress: 50.00 % Speed(reviews/sec): 307.69 Error: [-0.25747753] #Correct: 7812 #Trained: 12001 Training Accuracy: 65.09 %
Progress: 54.17 % Speed(reviews/sec): 302.33 Error: [-0.66038851] #Correct: 8550 #Trained: 13001 Training Accuracy: 65.76 %
Progress: 58.33 % Speed(reviews/sec): 304.35 Error: [-0.21017589] #Correct: 9271 #Trained: 14001 Training Accuracy: 66.22 %
Progress: 62.50 % Speed(reviews/sec): 306.12 Error: [-0.32861519] #Correct: 9993 #Trained: 15001 Training Accuracy: 66.62 %
Progress: 66.67 % Speed(reviews/sec): 307.69 Error: [-0.31545046] #Correct: 10705 #Trained: 16001 Training Accuracy: 66.90 %
Progress: 70.83 % Speed(reviews/sec): 309.09 Error: [-0.70497608] #Correct: 11411 #Trained: 17001 Training Accuracy: 67.12 %
Progress: 75.00 % Speed(reviews/sec): 310.34 Error: [-0.04885612] #Correct: 12162 #Trained: 18001 Training Accuracy: 67.56 %
Progress: 79.17 % Speed(reviews/sec): 316.67 Error: [-0.79732231] #Correct: 12916 #Trained: 19001 Training Accuracy: 67.98 %
Progress: 83.33 % Speed(reviews/sec): 312.50 Error: [-0.2568252] #Correct: 13678 #Trained: 20001 Training Accuracy: 68.39 %
Progress: 87.50 % Speed(reviews/sec): 313.43 Error: [-0.59070143] #Correct: 14418 #Trained: 21001 Training Accuracy: 68.65 %
Progress: 91.67 % Speed(reviews/sec): 305.56 Error: [-0.42520887] #Correct: 15181 #Trained: 22001 Training Accuracy: 69.00 %
Progress: 95.83 % Speed(reviews/sec): 302.63 Error: [-0.50276096] #Correct: 15931 #Trained: 23001 Training Accuracy: 69.26 %
Training Time: 0:01:19.701444

Surprisingly it did around the same (maybe a little worse). It looks like tuning the learning rate isn't enough.

The Network Parts

This is an initial exploration of some of the parts that are going to make up the Neural Network as well as a little inspection of the data and how we're going to use it.

Set Up

Imports

The Tangle

from collections import Counter
import numpy
from neurotic.tangles.data_paths import DataPath

Python

import pickle

PyPi

from graphviz import Graph
import numpy

This Project

from neurotic.tangles.data_paths import DataPath

Loading The Pickles

path = DataPath("total_count.pkl")
with path.from_folder.open("rb") as reader:
    total_counts = pickle.load(reader)

Some Constants

SPLIT_ON_THIS = " "

The Data

The Reviews.

path = DataPath("reviews.txt")
output_path = DataPath("reviews.pkl", check_exists=False)
if not output_path.from_folder.is_file():
    with open(path.from_folder,'r') as reader:
        reviews = [line.rstrip() for line in reader]
    with output_path.from_folder.open('wb') as writer:
        pickle.dump(reviews, writer)

The labels.

path = DataPath("labels.txt")
output_path = DataPath("labels.pkl", check_exists=False)
if not output_path.from_folder.is_file():
    with path.from_folder.open() as reader:
        labels = (line.rstrip() for line in reader)
        labels = [line.upper() for line in labels]
    with output_path.from_folder.open("wb") as writer:
        pickle.dump(labels, writer)

Transforming Text into Numbers

def plot_network():
    """
    Creates a simplified plot of our network (simple_network.dot.png)
    """
    graph = Graph(format="png")
    graph.attr(rankdir="LR")

    graph.node("a", "horrible")
    graph.node("b", "excellent")
    graph.node("c", "terrible")
    graph.node("d", "")
    graph.node("e", "")
    graph.node("f", "")
    graph.node("g", "")
    graph.node("h", "positive")

    graph.edges(["ad", "ae", "af", "ag",
                 "bd", "be", "bf", "bg",
                 "cd", "ce" , "cf", "cg"])
    graph.edges(["dh", 'eh', 'fh', 'gh'])
    graph.render("graphs/simple_network.dot")
    return

This is one potential way to classify the sentiment of a review using a neural network. In this case, if any of the terms (horrible, excellent, or terrible) exists in the review, the input for that term is a one, and the output is the weighted sum of the inputs.
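Here's a tiny numeric sketch of that idea (the weights are made up for illustration):

# one input per key-word: [horrible, excellent, terrible], 1 if present
inputs = numpy.array([0, 1, 0])           # the review contains "excellent"
weights = numpy.array([-1.0, 1.0, -1.0])  # hypothetical learned weights
output = inputs.dot(weights)              # sum of the weights times the inputs
print("positive" if output > 0 else "negative")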

Creating the Input/Output Data

The Vocabulary

We're going to create a "vocabulary" which is just a list of all the words in our reviews.

vocab = total_counts.keys()

Here's our vocabulary size.

vocab_size = len(vocab)
print("{:,}".format(vocab_size))
assert vocab_size==74074
74,074

Layer 0

Now we're going to create a numpy array called layer_0 and initialize it to all zeros. This will represent our input layer, so it will be a 2-dimensional matrix with 1 row and vocab_size columns.

layer_0 = numpy.zeros((1, vocab_size))

Now we can double-check the shape to make sure it matches what we're expecting.

shape = layer_0.shape
print("{}, {:,}".format(*shape))
assert shape == (1,74074)
1, 74,074

Word 2 Index

layer_0 contains one entry for every word in the vocabulary. We need to make sure we know the index of each word, so we're going to create a lookup table that stores the index of every word.

word2index = {word: index for index, word in enumerate(vocab)}

Here are the first ten entries in the lookup table.

print("|Term| Index|")
print("|-+-|")
keys = list(word2index.keys())[:10]
for key in keys:
    print("|{}|{}|".format(key, word2index[key]))
| Term     | Index |
|----------+-------|
| bromwell |     0 |
| high     |     1 |
| is       |     2 |
| a        |     3 |
| cartoon  |     4 |
| comedy   |     5 |
| .        |     6 |
| it       |     7 |
| ran      |     8 |
| at       |     9 |

Update Input Layer

The update_input_layer function will count how many times each word is used in the review and then store those counts at the appropriate indices inside layer_0. To make this usable in other posts you have to pass in the layer and the word2index table, but in the actual Neural Network we're going to use a class, so it will look a little different.

def update_input_layer(review:str, layer_0: numpy.ndarray, word2index: dict) -> Counter:
    """ Modify the global layer_0 to represent the vector form of review.
    The element at a given index of layer_0 should represent
    how many times the given word occurs in the review.

    Args:
       review: the string of the review
       layer_0: array representing layer 0
       word2index: dict mapping word to index in layer_0
    Returns:
        counter for the tokens (used for troubleshooting)
    """
    # clear out previous state by resetting the layer to be all 0s
    layer_0 *= 0
    tokens = review.split(SPLIT_ON_THIS)
    counter = Counter()
    counter.update(tokens)
    for key, value in counter.items():
        layer_0[:, word2index[key]] = value
    return counter

Here's what happens when you update layer_0 with the first review.

update_input_layer(reviews[0], layer_0, word2index)
print(layer_0)
[[4. 5. 4. ... 0. 0. 0.]]

It doesn't look exciting, but if we remember that we initialized the values as all zeros, then we can see that something is changing.

Get Target For Labels

get_target_for_label returns 0 or 1, depending on whether the given label is NEGATIVE or POSITIVE, respectively. This will allow us to use the labels as we were given them and map them to numbers inside the neural net. An alternative might be to pre-process the labels or make this a dictionary.

def get_target_for_label(label: str) -> int:
    """Convert a label to `0` or `1`.
    Args:
       label(string) - Either "POSITIVE" or "NEGATIVE".
    Returns:
       `0` or `1`.
    """
    return 1 if label=="POSITIVE" else 0

So, here's the first label.

print(labels[0])
POSITIVE

And here's what we mapped it to.

output = get_target_for_label(labels[0])
assert output == 1
print(output)
1

And here we go with the second label.

print(labels[1])
NEGATIVE
output = get_target_for_label(labels[1])
assert output == 0
print(output)
0

Exploring the Reviews Dataset

Set Up

Imports

Python

from collections import Counter
import pickle
import textwrap

PyPi

import numpy

This Project

from neurotic.tangles.data_paths import DataPath

Lesson 1: Curate a Dataset

The goal of this section is to become familiar with the data and perform any preprocessing that might be needed.

A Helper To Print

def pretty_print_review_and_label(index: int, up_to: int=80) -> None:
    """Prints the label and review

    Args:
     index: the index of the review in the data set
     up_to: number of characters in the review to show
    """
    print("|{}|{}|".format(labels[index], reviews[index][:up_to] + "..."))
    return

The Reviews

It's not really clear what he's doing here. I think he's stripping the newlines off of the reviews, so each review must be one line.

path = DataPath("reviews.txt")
with open(path.from_folder,'r') as reader:
    reviews = [line.rstrip() for line in reader]

The Labels

A similar deal except casting the labels to upper case.

path = DataPath("labels.txt")
with open(path.from_folder,'r') as reader:
    labels = (line.rstrip() for line in reader)
    labels = [line.upper() for line in labels]

Note: The data in reviews.txt we're using has already been preprocessed a bit and contains only lower case characters. If we were working from raw data, where we didn't know it was all lower case, we would want to add a step here to convert it. That's so we treat different variations of the same word, like `The`, `the`, and `THE`, all the same way.

How many reviews do we have?

print("{:,}".format(len(reviews)))
25,000

What does a review look like?

print("\n".join(textwrap.wrap(reviews[0], width=80)))
bromwell high is a cartoon comedy . it ran at the same time as some other
programs about school life  such as  teachers  . my   years in the teaching
profession lead me to believe that bromwell high  s satire is much closer to
reality than is  teachers  . the scramble to survive financially  the insightful
students who can see right through their pathetic teachers  pomp  the pettiness
of the whole situation  all remind me of the schools i knew and their students .
when i saw the episode in which a student repeatedly tried to burn down the
school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a
classic line inspector i  m here to sack one of your teachers . student welcome
to bromwell high . i expect that many adults of my age think that bromwell high
is far fetched . what a pity that it isn  t

Kind of odd looking. It looks like the pre-processor did some bad things.

What does the label for that review look like?

print(labels[0])
POSITIVE

What are the labels available?

At this point we don't have pandas loaded, so I'll just use a set to look at the labels.

print(set(labels))
{'NEGATIVE', 'POSITIVE'}

So there are two labels - "NEGATIVE" and "POSITIVE".

Develop a Predictive Theory

The previous section gave us a rough idea of what's in the data set. Now we want to make a guess as to what the labels mean - why is a review labeled POSITIVE or NEGATIVE?

print("|labels.txt| reviews.txt|")
print("|-+-|")
indices = (2137, 12816, 6267, 21934, 5297, 4998)
for index in indices:
    pretty_print_review_and_label(index)
| labels.txt | reviews.txt |
|------------+-------------|
| NEGATIVE   | this movie is terrible but it has some good effects …. |
| POSITIVE   | adrian pasdar is excellent is this film . he makes a fascinating woman …. |
| NEGATIVE   | comment this movie is impossible . is terrible very improbable bad interpretat… |
| POSITIVE   | excellent episode movie ala pulp fiction . days suicides . it doesnt get more… |
| NEGATIVE   | if you haven t seen this it s terrible . it is pure trash . i saw this about … |
| POSITIVE   | this schiffer guy is a real genius the movie is of excellent quality and both e… |

If you look at the negative reviews, they all have the word 'terrible' in them, and the positives all have the word 'excellent' in them. The theory, then, is that the labels are based on whether a review has a key-word in it that makes it either positive or negative.

Quick Theory Validation

In this section we're going to test our theory that key-words identify whether a review is positive or negative using the Counter class and the numpy library.

Word Counter

We'll create three Counter objects, one for words from positive reviews, one for words from negative reviews, and one for all the words.

positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

Examine all the reviews. For each word in a positive review, increase the count for that word in both your positive counter and the total words counter; likewise, for each word in a negative review, increase the count for that word in both your negative counter and the total words counter.

Note: Throughout these projects, you should use `split(' ')` to divide a piece of text (such as a review) into individual words. If you use `split()` instead, you'll get slightly different results than what the videos and solutions show.

The classifications in the labels list.

class Classification:
    positive = "POSITIVE"
    negative = "NEGATIVE"

What we are splitting on.

class Tokens:
    splitter = " "
with DataPath("labels.pkl").from_folder.open("rb") as reader:
    labels = pickle.load(reader)
for label, review in zip(labels, reviews):
    tokens = review.split(Tokens.splitter)
    total_counts.update(tokens)

    if label == Classification.positive:
        positive_counts.update(tokens)        
    else:
        negative_counts.update(tokens)

Most Common Words

Run the following two cells to list the words used in positive reviews and negative reviews, respectively, ordered from most to least commonly used.

Examine the counts of the most common words in positive reviews

print("|Token| Count|")
print("|-+-|")
for token, count in positive_counts.most_common(10):
    print("|{}|{:,}|".format(token, count))
| Token | Count   |
|-------+---------|
|       | 518,327 |
| the   | 173,324 |
| .     | 159,654 |
| and   |  89,722 |
| a     |  83,688 |
| of    |  76,855 |
| to    |  66,746 |
| is    |  57,245 |
| in    |  50,215 |
| br    |  49,235 |

So, we probably don't want most of the most common tokens.

Examine the counts of the most common words in negative reviews

print("|Token| Count|")
print("|-+-|")
for token, count in negative_counts.most_common(10):
    print("|{}|{:,}|".format(token, count))
| Token | Count   |
|-------+---------|
|       | 531,016 |
| .     | 167,538 |
| the   | 163,389 |
| a     |  79,321 |
| and   |  74,385 |
| of    |  69,009 |
| to    |  68,974 |
| br    |  52,637 |
| is    |  50,083 |
| it    |  48,327 |

As you can see, common words like "the" appear very often in both positive and negative reviews. Instead of finding the most common words in positive or negative reviews, what you really want are the words found in positive reviews more often than in negative reviews, and vice versa. To accomplish this, you'll need to calculate the ratios of word usage between positive and negative reviews.

Check all the words you've seen and calculate the ratio of positive to negative uses, storing that ratio in pos_neg_ratios.

Hint: the positive-to-negative ratio for a given word can be calculated with `positive_counts[word] / float(negative_counts[word]+1)`. Notice the `+1` in the denominator – that ensures we don't divide by zero for words that are only seen in positive reviews.
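
Plugging a few hypothetical counts into that formula shows how the smoothing behaves:

print(10 / float(0 + 1))
print(50 / float(49 + 1))
print(0 / float(10 + 1))
10.0
1.0
0.0

A word seen only in positive reviews gets a large ratio, a word used about equally in both gets a ratio near 1, and a word seen only in negative reviews gets 0.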

Create a Counter object to store positive/negative ratios

pos_neg_ratios = Counter()

Positive to negative ratios

Calculate the ratios of positive and negative uses of the most common words

ratios = {element: positive_counts[element]/(negative_counts[element] + 1)
          for element in total_counts}
pos_neg_ratios.update(ratios)
with DataPath("pos_neg_ratios.pkl",
              check_exists=False).from_folder.open("wb") as writer:
    pickle.dump(pos_neg_ratios, writer)

Examine the ratios you've calculated for a few words:

print("Pos-to-neg ratio for 'the' = {:.2f}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {:.2f}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {:.2f}".format(pos_neg_ratios["terrible"]))
Pos-to-neg ratio for 'the' = 1.06
Pos-to-neg ratio for 'amazing' = 4.02
Pos-to-neg ratio for 'terrible' = 0.18

Looking closely at the values you just calculated, we see the following:

  • Words that you would expect to see more often in positive reviews - like "amazing" - have a ratio greater than 1. The more skewed a word is toward positive, the farther from 1 its positive-to-negative ratio will be.
  • Words that you would expect to see more often in negative reviews - like "terrible" - have positive values that are less than 1. The more skewed a word is toward negative, the closer to zero its positive-to-negative ratio will be.
  • Neutral words, which don't really convey any sentiment because you would expect to see them in all sorts of reviews – like "the" – have values very close to 1. A perfectly neutral word – one that was used in exactly the same number of positive reviews as negative reviews – would be almost exactly 1. The `+1` we suggested you add to the denominator slightly biases words toward negative, but it won't matter because it will be a tiny bias and later we'll be ignoring words that are too close to neutral anyway.

Ok, the ratios tell us which words are used more often in positive or negative reviews, but the specific values we've calculated are a bit difficult to work with. A very positive word like "amazing" has a value above 4, whereas a very negative word like "terrible" has a value around 0.18. Those values aren't easy to compare for a couple of reasons:

  • Right now, 1 is considered neutral, but very positive words have positive-to-negative ratios much farther above 1 than very negative words' ratios fall below it, so there is no way to directly compare two numbers and see whether one word conveys the same magnitude of positive sentiment as another word conveys negative sentiment. We should center all the values around neutral, so that a word's distance from neutral indicates how much sentiment (positive or negative) it conveys.
  • When comparing absolute values it's easier to center around zero than around one.

To fix these issues, we'll convert all of our ratios to new values using logarithms.

Go through all the ratios you calculated and convert them to logarithms (i.e. use `numpy.log(ratio)`).

In the end, extremely positive and extremely negative words will have positive-to-negative ratios with similar magnitudes but opposite signs. Note that you have to create a new counter - the update method adds the new value to the previous values.

log_ratios = {}
for token, ratio in pos_neg_ratios.items():
    if ratio > 1:
        log_ratios[token] = numpy.log(ratio)
    else:
        # equivalent to numpy.log(ratio + 0.01); the 0.01 keeps tokens that
        # never appear in a positive review (ratio of 0) out of log(0)
        log_ratios[token] = -numpy.log(1/(ratio + 0.01))
positive_negative_log_ratios = Counter()
positive_negative_log_ratios.update(log_ratios)

Examine the new ratios you've calculated for the same words from before:

print("Pos-to-neg ratio for 'the' = {:.2f}".format(positive_negative_log_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {:.2f}".format(positive_negative_log_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {:.2f}".format(positive_negative_log_ratios["terrible"]))
Pos-to-neg ratio for 'the' = 0.06
Pos-to-neg ratio for 'amazing' = 1.39
Pos-to-neg ratio for 'terrible' = -1.67
with DataPath("pos_neg_log_ratios.pkl",
              check_exists=False).from_folder.open("wb") as writer:
    pickle.dump(positive_negative_log_ratios, writer)

If everything worked, now you should see neutral words with values close to zero. In this case, "the" is near zero but slightly positive, so it was probably used in more positive reviews than negative reviews. But look at "amazing"'s ratio - it's above 1, showing it is clearly a word with positive sentiment. And "terrible" has a similar score, but in the opposite direction, so it's below -1. It's now clear that both of these words are associated with specific, opposing sentiments.

Now run the following cells to see more ratios.

The first cell displays the ten words most strongly associated with positive reviews.

The second cell displays the ten words most strongly associated with negative reviews by stepping backwards through the full list. (If you want it to display every word, ordered by how associated it is with negative reviews, you could just use `reversed(positive_negative_log_ratios.most_common())`.)

You should continue to see values similar to the earlier ones we checked – neutral words will be close to `0`, words will get more positive as their ratios approach and go above `1`, and words will get more negative as their ratios approach and go below `-1`. That's why we decided to use the logs instead of the raw ratios.
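
As a quick sanity check of that symmetry (hypothetical ratios, not values from our counts):

print(numpy.log(4))
print(numpy.log(1/4))
print(numpy.log(1))
1.3862943611198906
-1.3862943611198906
0.0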

Here are the words whose usage is most skewed toward reviews with a "POSITIVE" label.

print("|Token|Log Ratio|")
print("|-+-|")
for token, ratio in positive_negative_log_ratios.most_common(10):
    print("|{}|{:.2f}|".format(token, ratio))
Token Log Ratio
edie 4.69
antwone 4.48
din 4.41
gunga 4.19
goldsworthy 4.17
gypo 4.09
yokai 4.09
paulie 4.08
visconti 3.93
flavia 3.93

Ummm… okay.

print(positive_counts["edie"])
print(negative_counts["edie"])
109
0

So the words with the most positive log ratios appeared in the positive reviews but never in the negative ones.

Here are the words whose usage is most skewed toward reviews with a "NEGATIVE" label. The Python slice notation is `list_name[first to include : first to exclude : step]`, so `most_common()[:-11:-1]` steps backwards through the last ten entries.
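
As a quick, hypothetical illustration of that slice:

letters = list("abcdefghij")
print(letters[:-4:-1])
['j', 'i', 'h']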

print("|Token|Log Ratio|")
print("|-+-|")
for token, ratio in positive_negative_log_ratios.most_common()[:-11:-1]:
    print("|{}|{:.2f}|".format(token, ratio))
Token Log Ratio
whelk -4.61
pressurized -4.61
bellwood -4.61
mwuhahahaa -4.61
insulation -4.61
hoodies -4.61
yaks -4.61
raksha -4.61
deamon -4.61
ziller -4.61
print(positive_counts["whelk"])
print(negative_counts["whelk"])
0
1

And the words with the most negative log ratios simply never appeared in the positive reviews, even when they appeared only once in the negative reviews.

As with the positive reviews, it's actually hard to figure out from these extremes which tokens genuinely characterize negative reviews.

Did our theory work?

Our theory was that key-words identify whether a review is positive or negative. There is some evidence for this, but really, it's not obvious that this is the case in general.

Pickling

Since the other posts in this section re-use some of this stuff it might make sense to pickle them.

with DataPath("total_counts.pkl", check_exists=False).from_folder.open("wb") as writer:
    pickle.dump(total_counts, writer)

Sentiment Classification Lectures

Sentiment Classification & How To "Frame Problems" for a Neural Network

by Andrew Trask

What You Should Already Know

  • neural networks, forward and back-propagation
  • stochastic gradient descent
  • mean squared error
  • and train/test splits

Where to Get Help if You Need it

  • Re-watch previous Udacity Lectures
  • Leverage the recommended Course Reading Material - Grokking Deep Learning
  • Shoot Andrew a tweet @iamtrask

Set Up

Debug

%load_ext autoreload
%autoreload 2

Imports

Python Standard Library

from datetime import datetime
from functools import partial

From Pypi

from graphviz import Graph
from tabulate import tabulate
import matplotlib.pyplot as pyplot
import numpy
import seaborn

This Project

from neurotic.tangles.data_paths import DataPath

Tables

table = partial(tabulate, tablefmt="orgtbl", headers="keys")

Printing

%matplotlib inline
seaborn.set_style("whitegrid")
FIGURE_SIZE = (12, 10)

Analysis: What's Going on in the Weights?

Let's start with a model that doesn't have any noise cancellation.

mlp_full = SentimentNoiseReduction(reviews=x_train, labels=y_train,
                                   lower_bound=0,
                                   polarity_cutoff=0,
                                   learning_rate=0.01)
mlp_full.train()
Progress: 0.00 % Speed(reviews/sec): 0.00 Error: [-0.5] #Correct: 1 #Trained: 1 Training Accuracy: 100.00 %
Progress: 4.17 % Speed(reviews/sec): 100.00 Error: [-0.38320156] #Correct: 740 #Trained: 1001 Training Accuracy: 73.93 %
Progress: 8.33 % Speed(reviews/sec): 181.82 Error: [-0.26004622] #Correct: 1529 #Trained: 2001 Training Accuracy: 76.41 %
Progress: 12.50 % Speed(reviews/sec): 250.00 Error: [-0.40350302] #Correct: 2376 #Trained: 3001 Training Accuracy: 79.17 %
Progress: 16.67 % Speed(reviews/sec): 285.71 Error: [-0.23990249] #Correct: 3187 #Trained: 4001 Training Accuracy: 79.66 %
Progress: 20.83 % Speed(reviews/sec): 333.33 Error: [-0.14119144] #Correct: 4002 #Trained: 5001 Training Accuracy: 80.02 %
Progress: 25.00 % Speed(reviews/sec): 375.00 Error: [-0.06442389] #Correct: 4829 #Trained: 6001 Training Accuracy: 80.47 %
Progress: 29.17 % Speed(reviews/sec): 411.76 Error: [-0.03508728] #Correct: 5690 #Trained: 7001 Training Accuracy: 81.27 %
Progress: 33.33 % Speed(reviews/sec): 444.44 Error: [-0.05110633] #Correct: 6548 #Trained: 8001 Training Accuracy: 81.84 %
Progress: 37.50 % Speed(reviews/sec): 450.00 Error: [-0.07432703] #Correct: 7404 #Trained: 9001 Training Accuracy: 82.26 %
Progress: 41.67 % Speed(reviews/sec): 476.19 Error: [-0.26512013] #Correct: 8272 #Trained: 10001 Training Accuracy: 82.71 %
Progress: 45.83 % Speed(reviews/sec): 500.00 Error: [-0.14067275] #Correct: 9129 #Trained: 11001 Training Accuracy: 82.98 %
Progress: 50.00 % Speed(reviews/sec): 521.74 Error: [-0.01215903] #Correct: 9994 #Trained: 12001 Training Accuracy: 83.28 %
Progress: 54.17 % Speed(reviews/sec): 541.67 Error: [-0.33825111] #Correct: 10864 #Trained: 13001 Training Accuracy: 83.56 %
Progress: 58.33 % Speed(reviews/sec): 560.00 Error: [-0.00522004] #Correct: 11721 #Trained: 14001 Training Accuracy: 83.72 %
Progress: 62.50 % Speed(reviews/sec): 555.56 Error: [-0.49523538] #Correct: 12553 #Trained: 15001 Training Accuracy: 83.68 %
Progress: 66.67 % Speed(reviews/sec): 571.43 Error: [-0.20026672] #Correct: 13390 #Trained: 16001 Training Accuracy: 83.68 %
Progress: 70.83 % Speed(reviews/sec): 586.21 Error: [-0.20786817] #Correct: 14243 #Trained: 17001 Training Accuracy: 83.78 %
Progress: 75.00 % Speed(reviews/sec): 580.65 Error: [-0.03469862] #Correct: 15108 #Trained: 18001 Training Accuracy: 83.93 %
Progress: 79.17 % Speed(reviews/sec): 593.75 Error: [-0.99460657] #Correct: 15982 #Trained: 19001 Training Accuracy: 84.11 %
Progress: 83.33 % Speed(reviews/sec): 606.06 Error: [-0.0523489] #Correct: 16867 #Trained: 20001 Training Accuracy: 84.33 %
Progress: 87.50 % Speed(reviews/sec): 617.65 Error: [-0.28370015] #Correct: 17734 #Trained: 21001 Training Accuracy: 84.44 %
Progress: 91.67 % Speed(reviews/sec): 611.11 Error: [-0.33222958] #Correct: 18616 #Trained: 22001 Training Accuracy: 84.61 %
Progress: 95.83 % Speed(reviews/sec): 621.62 Error: [-0.17177784] #Correct: 19475 #Trained: 23001 Training Accuracy: 84.67 %
Training Time: 0:00:38.794351

Now here's a function to find the similarity of words in the vocabulary to a word, based on the dot product of the weights from the input layer to the hidden layer.

def get_most_similar_words(focus: str="horrible", count: int=10) -> list:
    """Returns the words whose weights are most similar to the focus word's

    Similarity is the dot product between each word's row of
    input-to-hidden weights and the focus word's row.
    """
    most_similar = Counter()
    for word in mlp_full.word_to_index:
        most_similar[word] = numpy.dot(
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]],
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[focus]])
    return most_similar.most_common(count)
print(get_most_similar_words("excellent"))
print(get_most_similar_words("excellent"))
[('excellent', 0.14672474869646132), ('perfect', 0.12529721850063252), ('great', 0.1072983586254582), ('amazing', 0.10168346112776101), ('wonderful', 0.0971402564667566), ('best', 0.09640599864254018), ('today', 0.09064606014006837), ('fun', 0.08859560811231239), ('loved', 0.07914150763452406), ('definitely', 0.07693307843353574)]

"excellent" was, of course, most similar to itself, but notice that the weights for words with similar sentiment have become similar to each other - the network has 'learned' which words resemble "excellent" from the training set.

Now a negative example.

print(get_most_similar_words("terrible"))
[('worst', 0.1761389721390966), ('awful', 0.12576492326546337), ('waste', 0.11989143949659276), ('poor', 0.10186721140388931), ('boring', 0.09740050873489904), ('terrible', 0.09719144477251088), ('bad', 0.08198016341605044), ('dull', 0.0812576973066953), ('worse', 0.07504920898991188), ('poorly', 0.07494303321254764)]

Once again, the more similar words were in sentiment, the closer the weights leading from their inputs became.

Finally, let's project the input-to-hidden weight vector for each of the most polarized words down to two dimensions with T-SNE and plot them with bokeh - green for positive words, black for negative.

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, output_notebook, show
from sklearn.manifold import TSNE

output_notebook()

# grab the 500 most positive and 500 most negative words
# (by log ratio) that made it into the network's vocabulary
words_to_visualize = []
for word, ratio in positive_negative_log_ratios.most_common(500):
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

for word, ratio in list(reversed(positive_negative_log_ratios.most_common()))[0:500]:
    if word in mlp_full.word_to_index:
        words_to_visualize.append(word)

pos = 0
neg = 0
colors_list = []
vectors_list = []
for word in words_to_visualize:
    if word in positive_negative_log_ratios:
        # each word's vector is its row of input-to-hidden weights
        vectors_list.append(
            mlp_full.weights_input_to_hidden[mlp_full.word_to_index[word]])
        # a positive log ratio means the word leans positive
        if positive_negative_log_ratios[word] > 0:
            pos += 1
            colors_list.append("#00ff00")
        else:
            neg += 1
            colors_list.append("#000000")

tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:, 0],
                                    x2=words_top_ted_tsne[:, 1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                       text_font_size="8pt", text_color="#555555",
                       source=source, text_align='center')
p.add_layout(word_labels)

show(p)

Notes on The Deep Learning Revolution

Intelligence Reimagined (Where did this come from?)

Timeline

  • 1956: Dartmouth Artificial Intelligence Summer Research Project - start of the field of Artificial Intelligence.
  • 1962: Frank Rosenblatt publishes description of the Perceptron
  • 1962: David Hubel and Torsten Wiesel report the first recordings of responses from neurons in the visual cortex
  • 1969: Marvin Minsky and Seymour Papert point out the limits of the perceptron, triggering the AI Winter
  • 1979: Geoffrey Hinton and James Anderson organize Parallel Models of Associative Memory workshop to gather researchers working on neural networks
  • 1987: First Neural Information Processing Systems (NIPS) conference held, bringing machine learning researchers together

The Rise of Machine Learning

What is deep learning?

Deep Learning is a form of machine learning that uses data to train artificial neural networks to do things. When the field of artificial intelligence began in the 1950s there were two camps: one that believed the path to intelligence lay in using formal logic and writing computer programs, and one that believed intelligence would come from learning directly from data. Deep Learning belongs to the second camp, and although it has been around for a long time, it was only able to compete once we had enough computational power and data.

How did self-driving cars come about?

  • In 2005 a group from Stanford led by Sebastian Thrun won the DARPA Grand Challenge. This was the second DARPA challenge and the first in which any vehicles (five of them) were able to finish.
  • Some see self-driving cars as a way to remake society:
    • no need to own a car, use a just-in-time service
    • No need for parking lots and so many lanes on the road
    • Travel time can be productive
    • Once one car learns something it can be taught to all the other cars so 'rare' events will be handled even if it is the first time a car sees the event.

How do machines translate languages?

Originally they worked using a statistical approach, looking for familiar word combinations and counts. Now they are able to keep longer sections of text, which improves the translation because more of it is seen in context. The hope is that when they can be expanded to learn paragraphs or an author's entire body of work, they will be able to learn more of the subtleties and the poetry of the text.

What's the big deal about speech recognition?

Some people think that the next interface to our machines will be the human voice. There have already been demonstrations of live translations made using computer speech recognition and translation.

How good is machine learning at playing poker?

DeepStack played poker against professional poker players and beat all of them. This is important because the nature of the game means that every player is working with imperfect information (the unseen cards and the other players' cards). This could imply that machine learning could be used in other places where you don't have all the information, like politics and negotiations.

Does artificial intelligence pose a threat to humanity?

If you look at the areas where deep learning managed to outdo human competitors (e.g. AlphaGo), what eventually happened was that the human players were able to learn moves from the Artificial Intelligence that they would likely not have come up with themselves. This points the way to the immediate future of Artificial Intelligence. Although AI can sometimes outperform humans, the more open-ended the problem, the more likely it is that humans and machines can complement each other, with the machine creating outcomes we could never think of and the humans contributing the expertise needed to solve human problems. AI is, so far, a complement to human intelligence, not a replacement for it.

The Rebirth of Artificial Intelligence

The Dawn of Neural Networks

Brain-style Computing

Insights from the Visual System

Many Ways To Learn (How does it work?)

The Cocktail Party Problem

The Hopfield Net and Boltzmann Machine

Backpropagating Errors

Convolutional Learning

Reward Learning

Neural Information Processing Systems

Technological and Scientific Impact (What has it done and what might it do?)

The Future of Machine Learning

The Age of Algorithms

Hello, Mr. Chips

Inside Information

Consciousness

Nature Is Cleverer Than We Are

Deep Intelligence

Citation

[TDLR] Sejnowski TJ. The deep learning revolution. MIT Press; 2018 Oct 23.

Reading List

Books

Deep Learning

[DLI] Krohn J. Deep Learning Illustrated: a visual, interactive guide to artificial intelligence. Boston, MA: Addison-Wesley; 2019.

[GDL] Trask AW. Grokking Deep Learning. Shelter Island: Manning; 2019. 309 p.

[TDLR] Sejnowski TJ. The Deep Learning Revolution. MIT Press; 2018 Oct 23.

Links

Bike Sharing Project Answers

Introduction

The Bike Sharing Project uses a neural network to predict daily ridership for a bike-sharing service. The code is split into two parts: a jupyter notebook that you work with and a python file (my_answers.py) where you put the parts of the code that aren't provided. Tangling the blocks below produces the my_answers.py file.

Imports

import numpy

The Neural Network

class NeuralNetwork(object):
    """Implementation of a neural network with one hidden layer

    Args:
     input_nodes: number of input nodes
     hidden_nodes: number of hidden nodes
     output_nodes: number of output_nodes
     learning_rate: rate at which to update the weights
    """
    def __init__(self, input_nodes: int, hidden_nodes: int, output_nodes:int,
                 learning_rate: float) -> None:
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        self.learning_rate = learning_rate

        # Initialize weights
        self._weights_input_to_hidden = None
        self._weights_hidden_to_output = None
        return

Input To Hidden Weights

@property
def weights_input_to_hidden(self) -> numpy.ndarray:
    """Array of weights from input layer to the hidden layer"""
    if self._weights_input_to_hidden is None:
        self._weights_input_to_hidden = numpy.random.normal(
            0.0, self.input_nodes**-0.5, 
            (self.input_nodes, self.hidden_nodes))
    return self._weights_input_to_hidden

The unit-test tries to set the weights so we need a setter.

@weights_input_to_hidden.setter
def weights_input_to_hidden(self, weights: numpy.ndarray) -> None:
    """Sets the weights"""
    self._weights_input_to_hidden = weights
    return
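
The project's unit test builds a tiny network and overwrites the random weights with fixed values so the results are deterministic. A minimal sketch of what that looks like (the specific shapes and values here are assumptions for illustration):

network = NeuralNetwork(input_nodes=3, hidden_nodes=2, output_nodes=1,
                        learning_rate=0.5)
# replace the lazily-initialized random weights with known values
network.weights_input_to_hidden = numpy.array([[0.1, -0.2],
                                               [0.4, 0.5],
                                               [-0.3, 0.2]])
network.weights_hidden_to_output = numpy.array([[0.3],
                                                [-0.1]])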

Hidden To Output Weights

@property
def weights_hidden_to_output(self):
    """Array of weights for edges from hidden layer to output"""
    if self._weights_hidden_to_output is None:
        self._weights_hidden_to_output = numpy.random.normal(
            0.0,
            self.hidden_nodes**-0.5,
            (self.hidden_nodes, self.output_nodes))
    return self._weights_hidden_to_output

Once again, this is for the unit-testing.

@weights_hidden_to_output.setter
def weights_hidden_to_output(self, weights: numpy.ndarray) -> None:
    """sets the weights for edges from hidden layer to output"""
    self._weights_hidden_to_output = weights
    return

Activation Function

def activation_function(self, value):
    """A pass-through to the sigmoid"""
    return self.sigmoid(value)

Sigmoid

def sigmoid(self, value):
    """Calculates the sigmoid of the value"""
    return 1/(1 + numpy.exp(-value))

Train

def train(self, features, targets):
    ''' Train the network on batch of features and targets. 

       Arguments
       ---------

       features: 2D array, each row is one data record, each column is a feature
       targets: 1D array of target values

    '''
    n_records = features.shape[0]
    delta_weights_i_h = numpy.zeros(self.weights_input_to_hidden.shape)
    delta_weights_h_o = numpy.zeros(self.weights_hidden_to_output.shape) 
    for X, y in zip(features, targets):
        final_outputs, hidden_outputs = self.forward_pass_train(X)
        # accumulate this record's weight changes
        delta_weights_i_h, delta_weights_h_o = self.backpropagation(
            final_outputs, hidden_outputs, X, y,
            delta_weights_i_h, delta_weights_h_o)
    self.update_weights(delta_weights_i_h, delta_weights_h_o, n_records)

Forward Pass Train

def forward_pass_train(self, X):
    ''' Implement forward pass here 

       Arguments
       ---------
       X: features batch

    '''
    hidden_inputs = numpy.matmul(X, self.weights_input_to_hidden)
    hidden_outputs = self.activation_function(hidden_inputs)

    final_inputs = numpy.matmul(hidden_outputs, self.weights_hidden_to_output)
    # the output node is linear (identity activation) since this is a regression
    final_outputs = final_inputs
    return final_outputs, hidden_outputs

Back Propagation

def backpropagation(self, final_outputs, hidden_outputs, X, y, delta_weights_i_h, delta_weights_h_o):
    ''' Implement backpropagation

       Arguments
       ---------
       final_outputs: output from forward pass
       hidden_outputs: hidden layer output from forward pass
       X: features for this record
       y: target (i.e. label) for this record
       delta_weights_i_h: change in weights from input to hidden layers
       delta_weights_h_o: change in weights from hidden to output layers

    '''
    error = final_outputs - y

    # push the error back through the hidden-to-output weights
    hidden_error = numpy.matmul(self.weights_hidden_to_output, error)

    # the output activation is the identity, so its derivative is 1
    output_error_term = error

    # hidden_outputs * (1 - hidden_outputs) is the sigmoid's derivative
    hidden_error_term = hidden_error * hidden_outputs * (1 - hidden_outputs)

    delta_weights_i_h += -hidden_error_term * X[:, None]

    delta_weights_h_o += -output_error_term * hidden_outputs[:,None]
    return delta_weights_i_h, delta_weights_h_o
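
The `hidden_outputs * (1 - hidden_outputs)` factor works because the derivative of the sigmoid is sigma'(x) = sigma(x) * (1 - sigma(x)). A quick, standalone numerical check of that identity:

import numpy

def sigmoid(x):
    return 1 / (1 + numpy.exp(-x))

x, h = 0.5, 1e-6
numeric = (sigmoid(x + h) - sigmoid(x - h)) / (2 * h)
analytic = sigmoid(x) * (1 - sigmoid(x))
print(abs(numeric - analytic) < 1e-9)
True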

Update Weights

def update_weights(self, delta_weights_i_h, delta_weights_h_o, n_records):
    ''' Update weights on gradient descent step

       Arguments
       ---------
       delta_weights_i_h: change in weights from input to hidden layers
       delta_weights_h_o: change in weights from hidden to output layers
       n_records: number of records

    '''
    self.weights_hidden_to_output += self.learning_rate * (delta_weights_h_o/n_records)
    self.weights_input_to_hidden += self.learning_rate * (delta_weights_i_h/n_records)
    return

Run

Warning: The MSE function defined in the jupyter notebook won't work if you use numpy.dot instead of numpy.matmul. You can make it work by passing axis=1 to numpy.mean, but I don't think you're allowed to change the code provided in the jupyter notebook.

def run(self, features):
    ''' Run a forward pass through the network with input features 

       Arguments
       ---------
       features: 1D array of feature values
    '''

    hidden_inputs = numpy.matmul(features, self.weights_input_to_hidden)
    hidden_outputs = self.activation_function(hidden_inputs) 

    final_inputs = numpy.matmul(hidden_outputs, self.weights_hidden_to_output)
    final_outputs = final_inputs        
    return final_outputs

The Hyper Parameters

iterations = 7500
learning_rate = 0.4
hidden_nodes = 28
output_nodes = 1
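
For reference, a minimal sketch of wiring these up; `train_features` is assumed to be the notebook's prepared feature array, so using its column count for the input size is an assumption here:

network = NeuralNetwork(input_nodes=train_features.shape[1],
                        hidden_nodes=hidden_nodes,
                        output_nodes=output_nodes,
                        learning_rate=learning_rate)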