Building the Machine Translation Training Set

<<imports>>

<<dotenv>>

<<embeddings>>

<<get-matrices>>

<<training-testing-data>>

Beginning

This continues from a prior post where we built the EmbeddingsLoader to gather the word-embeddings that match our French and English dictionaries.

Imports

# pypi
from dotenv import load_dotenv

import attr
import numpy

# my stuff
from neurotic.nlp.word_embeddings.embeddings import EmbeddingsLoader

Set Up

The Dotenv

This loads the paths to the files.

 load_dotenv("posts/nlp/.env", override=True)

The Embeddings

Instead of using the subset word-embeddings that the course created I'm going to try to load the whole word embeddings from scratch. I defined the EmbeddingsLoader in {% lancelot title="this post" %}english-to-french-data{% /lancelot %} so I'll just load it here.

 loader = EmbeddingsLoader()

Middle

Generate Embedding and Transform Matrices

Our English and French Embeddings are stored as word:vector dictionaries. To work with the embeddings we're going to need to convert them to matrices. At the same time we need to filter out words that are in one set but not the other, so we're going to brute-force it a little.

def get_matrices(en_fr: dict, french_vecs: dict, english_vecs: dict):
    """
    Args:
       en_fr: English to French dictionary
       french_vecs: French words to their corresponding word embeddings.
       english_vecs: English words to their corresponding word embeddings.

    Return: 
       X: a matrix where the rows are the English embeddings.
       Y: a matrix where the rows are the corresponding French embeddings.
    """

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

    # X_l and Y_l are lists of the english and french word embeddings
    # X_l = list()
    # Y_l = list()

    # get the english words (the keys in the dictionary) and store in a set()
    english_set = set(english_vecs)

    # get the french words (keys in the dictionary) and store in a set()
    french_set = set(french_vecs)

    # store the french words that are part of the english-french dictionary (these are the values of the dictionary)
    # french_words = set(en_fr.values())
    filtered = {english_word: french_word
                for english_word, french_word in en_fr.items()
                if english_word in english_set and french_word in french_set}
    X = [english_vecs[english_word] for english_word in filtered]
    Y = [french_vecs[french_word] for french_word in filtered.values()]

    # loop through all english, french word pairs in the english french dictionary

    # for en_word, fr_word in en_fr.items():
    # 
    #     # check that the french word has an embedding and that the english word has an embedding
    #     if fr_word in french_set and en_word in english_set:
    # 
    #         # get the english embedding
    #         en_vec = english_vecs[en_word]
    # 
    #         # get the french embedding
    #         fr_vec = french_vecs[fr_word]
    # 
    #         # add the english embedding to the list
    #         X_l.append(en_vec)
    # 
    #         # add the french embedding to the list
    #         Y_l.append(fr_vec)
    # 
    # # stack the vectors of X_l into a matrix X
    # X = numpy.vstack(X_l)
    # 
    # # stack the vectors of Y_l into a matrix Y
    # Y = numpy.vstack(Y_l)
    ### END CODE HERE ###

    # return X, Y
    return numpy.vstack(X), numpy.vstack(Y)
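
As a quick, hedged usage sketch (assuming the training, english_subset, and french_subset attributes of the EmbeddingsLoader created above):

# a usage sketch - assumes the loader's training dictionary and
# embeddings subsets from the prior post
X, Y = get_matrices(en_fr=loader.training,
                    french_vecs=loader.french_subset,
                    english_vecs=loader.english_subset)

# one row per filtered english-french pair, one column per embedding dimension
print(X.shape, Y.shape)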

Getting the Training Sets

@attr.s(auto_attribs=True)
class TrainingData:
    """Converts the embeddings into a test set

    Args:
     loader: EmbeddingsLoader instance
    """
    _loader: EmbeddingsLoader=None
    _english_vocabulary: set=None
    _french_vocabulary: set=None
    _filtered: dict=None
    _x_train: numpy.ndarray=None
    _y_train: numpy.ndarray=None

    @property
    def loader(self) -> EmbeddingsLoader:
        """A loader for the embeddings subsets"""
        if self._loader is None:
            self._loader = EmbeddingsLoader()
        return self._loader

    @loader.setter
    def loader(self, new_loader: EmbeddingsLoader) -> None:
        """Sets the embeddings loader"""
        self._loader = new_loader
        return

    @property
    def english_vocabulary(self) -> set:
        """The english embeddings subset words"""
        if self._english_vocabulary is None:
            self._english_vocabulary = set(self.loader.english_subset)
        return self._english_vocabulary

    @property
    def french_vocabulary(self) -> set:
        """The french embeddings subset words"""
        if self._french_vocabulary is None:
            self._french_vocabulary = set(self.loader.french_subset)
        return self._french_vocabulary

    @property
    def filtered(self) -> dict:
        """A {enlish:french} dict filtered down

       This is a dict made of the original english-french dictionary created
       by the embeddings loader but filtered down so that the key is in the
       ``english_vocabulary`` and the value is in the ``french_vocabulary``

       This is used to ensure that the training set is created it will only
       contain terms that have entries in both embeddings subsets
       """
        if self._filtered is None:
            self._filtered = {
                english_word: french_word
                for english_word, french_word in self.loader.training.items()
                if (english_word in self.english_vocabulary and
                    french_word in self.french_vocabulary)}
        return self._filtered

    @property
    def x_train(self) -> numpy.ndarray:
        """The english-language embeddings as row-vectors"""
        if self._x_train is None:
            self._x_train = numpy.vstack(
                [self.loader.english_subset[english_word]
                 for english_word in self.filtered]
                )
        return self._x_train

    @property
    def y_train(self) -> numpy.ndarray:
        """The french-language embeddings as row-vectors"""
        if self._y_train is None:
            self._y_train = numpy.vstack(
                [self.loader.french_subset[french_word]
                 for french_word in self.filtered.values()]
            )
        return self._y_train

    def check_rep(self) -> None:
        """Checks the shape of the training data


       Note:
        since this checks those attributes they will be built if they don't
        already exist

       Raises:
        AttributeError - there'se something unexpected about the shape of the data
       """
        rows, columns = self.x_train.shape
        assert rows == len(self.filtered)
        assert columns == len(next(iter(self.loader.english_subset.values())))

        rows, columns = self.y_train.shape
        assert rows == len(self.filtered)
        assert columns == len(next(iter(self.loader.french_subset.values())))            
        return
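
Here's a minimal, hedged sketch of using it - everything is built lazily, so touching x_train and y_train (or calling check_rep) triggers the loading and filtering.

# a minimal usage sketch - attrs strips the leading underscore,
# so the private _loader attribute is passed in as "loader"
data = TrainingData(loader=loader)
data.check_rep()
print(data.x_train.shape)
print(data.y_train.shape)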

End

Loading the English and French Word Embeddings

Beginning

This is the first post in a series - the document with links to all the posts in the series is this post.

The Machine Translation exercise uses word embeddings that are subsets of prebuilt Word2Vec (English) embeddings (GoogleNews-vectors-negative300.bin.gz) and prebuilt French Embeddings (wiki.multi.fr.vec). Coursera provides them but I thought it would be a good exercise to look at how they're built.

Imports

# python
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv

Set Up

The Dotenv

To make loading files more or less portable I'm using a .env file with the paths to the data sets. This loads it into the environment so the values are accessible using os.environ.

load_dotenv("posts/nlp/.env", override=True)

Middle

The Embeddings

As I noted, the English and French embeddings are available from the web. I thought about downloading the files automatically if they don't exist, but the Google News embeddings file is pretty big and takes a while on my internet connection, so it seemed better to download it from a browser anyway. I'm going to assume the files are downloaded and the Google News embeddings are un-zipped (probably using gunzip or pigz, both of which are installed by default on Ubuntu 20.04).

Notes

"""This is a module for word embeddings loaders.
"""

Imports

# python
from argparse import Namespace
from pathlib import Path

import os
import pickle

# from pypi
from gensim.models.keyedvectors import BaseKeyedVectors, KeyedVectors

import attr
import pandas

The Raw Loader

@attr.s(auto_attribs=True)
class Embeddings:
    """Embeddings Loader"""
    path: str
    binary: bool
    _embeddings: BaseKeyedVectors=None
  • The Embeddings
    @property
    def embeddings(self) -> BaseKeyedVectors:
        """The loaded embeddings"""
        if self._embeddings is None:
            self._embeddings = KeyedVectors.load_word2vec_format(self.path,
                                                                 binary=self.binary)
        return self._embeddings
    

The Subset Builder

@attr.s(auto_attribs=True)
class SubsetBuilder:
    """Create subset of embeddings that matches sets

    Args:
     embeddings_1: word embeddings
     embeddings_2: word embeddings
     subset_dict: dict whose keys and values to pull out of the embeddings
     output_1: path to save the first subset to
     output_2: path to save the second subset to
    """
    embeddings_1: KeyedVectors
    embeddings_2: KeyedVectors
    subset_dict: dict
    output_1: Path
    output_2: Path

    _vocabulary_1: set=None
    _vocabulary_2: set=None
    _subset_1: dict=None
    _subset_2: dict=None
  • Subset 1
    @property
    def subset_1(self) -> dict:
        """Subset of embeddings 1"""
        if self._subset_1 is None and self.output_1.is_file():        
            with self.output_1.open("rb") as reader:
                self._subset_1 = pickle.load(reader)
        return self._subset_1
    
  • Subset 2
    @property
    def subset_2(self) -> dict:
        """subset of embeddings 2"""
        if self._subset_2 is None and self.output_2.is_file():
            with self.output_2.open("rb") as reader:
                self._subset_2 = pickle.load(reader)
        return self._subset_2
    
  • Save
    def pickle_it(self):
        """Save the subsets"""
        if self.subset_1 is not None:
            with self.output_1.open("wb") as writer:
                pickle.dump(self.subset_1, writer)
        if self.subset_2 is not None:
            with self.output_2.open("wb") as writer:
                pickle.dump(self.subset_2, writer)
        return
    
  • Clean it
    def clean(self) -> None:
        """Remove any pickled subsets
    
        Also removes any subset dictionaries
        """
        for path in (self.output_1, self.output_2):
            if path.is_file():
                path.unlink()
        self._subset_1 = self._subset_2 = None
        return
    
  • Call the Subset Builder
    def __call__(self, pickle_it: bool=True) -> None:
        """Builds or loads the subsets and saves them as pickles
    
        Args:
         pickle_it: whether to save the subsets
        """
        if self.subset_1 is None or self.subset_2 is None:
            self.clean()
            self._subset_1, self._subset_2 = {}, {}
            for key, value in self.subset_dict.items():
                if key in self.embeddings_1 and value in self.embeddings_2:
                    self._subset_1[key] = self.embeddings_1[key]
                    self._subset_2[value] = self.embeddings_2[value]
            if pickle_it:
                self.pickle_it()
        return
    

Dict Loader

@attr.s(auto_attribs=True)
class DictLoader:
    """Loader for the english and french dictionaries

    This is specifically for the training and testing files
     - CSV-ish (separated by spaces instead of commas)
     - No header: column 1 = English, column 2 = French

    Args:
     path: path to the file
     columns: list of strings
     delimiter: separator for the columns in the source file
    """
    path: str
    columns: list=["English", "French"]
    delimiter: str=" "

    _dataframe: pandas.DataFrame=None
    _dictionary: dict=None
  • Data Frame
    @property
    def dataframe(self) -> pandas.DataFrame:
        """Loads the space-separated file as a dataframe"""
        if self._dataframe is None:
            self._dataframe = pandas.read_csv(self.path,
                                              names=self.columns,
                                              delimiter=self.delimiter)
        return self._dataframe
    
  • Dictionary
    @property
    def dictionary(self) -> dict:
        """english to french dictionary"""
        if self._dictionary is None:
            self._dictionary = dict(zip(self.dataframe[self.columns[0]],
                                        self.dataframe[self.columns[1]]))
        return self._dictionary
    

Loading It

from neurotic.nlp.word_embeddings.embeddings import Embeddings

english_embeddings = Embeddings(os.environ["GOOGLE_EMBEDDINGS"], binary=True)
french_embeddings = Embeddings(os.environ["FRENCH_EMBEDDINGS"], binary=False)
print(english_embeddings.embeddings)
from neurotic.nlp.word_embeddings.embeddings import DictLoader

training = DictLoader(os.environ["ENGLISH_FRENCH_TRAINING"])
testing = DictLoader(os.environ["ENGLISH_FRENCH_TESTING"])

train_keys = set(training.dictionary)
test_keys = set(testing.dictionary)
print(train_keys.intersection(test_keys))
set()

After I made the subset builder it occurred to me that if there was overlap between the testing and training sets, but the shared terms mapped to different definitions, then the way I was going to build the subsets would require two separate dictionaries. As you can see, though, the training and testing sets don't have any English terms in common.

merged = training.dictionary.copy()
merged.update(testing.dictionary)
print(len(training.dictionary))
print(len(testing.dictionary))
print(len(merged))
5000
1500
6500
from neurotic.nlp.word_embeddings.embeddings import SubsetBuilder

english_path = Path(os.environ["ENGLISH_EMBEDDINGS_SUBSET"])
french_path = Path(os.environ["FRENCH_EMBEDDINGS_SUBSET"])

builder = SubsetBuilder(embeddings_1=english_embeddings.embeddings,
                        embeddings_2=french_embeddings.embeddings,
                        subset_dict=merged,
                        output_1=english_path, output_2=french_path)
builder()
builder = SubsetBuilder(embeddings_1=None,
                        embeddings_2=None,
                        subset_dict=None,
                        output_1=english_path, output_2=french_path)
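
Re-creating the builder with everything except the output paths set to None works because the subset_1 and subset_2 properties fall back to the pickled files when they exist - a quick, hedged check:

# the subsets re-load from the pickles, so the raw embeddings aren't needed
print(len(builder.subset_1))
print(len(builder.subset_2))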

More Builders

After I tried using the EmbeddingsLoader on a different computer I realized that I hadn't really simplified the creation of the embeddings all that much, so I'm going to make an overall builder that hides the process from the end-user (although not entirely, since I use environment variables that have to be set).

Source Keys

SourceKeys = Namespace(
    english="GOOGLE_EMBEDDINGS",
    french="FRENCH_EMBEDDINGS",
    training="ENGLISH_FRENCH_TRAINING",
    testing="ENGLISH_FRENCH_TESTING",
    )

Target Keys

TargetKeys = Namespace(
    english="ENGLISH_EMBEDDINGS_SUBSET",
    french="FRENCH_EMBEDDINGS_SUBSET",
    )

Keys

Keys = Namespace(
    source=SourceKeys,
    target=TargetKeys,
    )

Source Paths

@attr.s(auto_attribs=True)
class SourcePaths:
    """Paths to the source files

    These are files provided from other sources
    """
    keys: Namespace=Keys
    _english: Path=None
    _french: Path=None
    _training: Path=None
    _testing: Path=None

    @property
    def english(self) -> Path:
        """Path to the english word-embeddings"""
        if self._english is None:
            self._english = Path(os.environ[self.keys.source.english])
        return self._english

    @property
    def french(self) -> Path:
        """Path to the french word-embeddings"""
        if self._french is None:
            self._french = Path(os.environ[self.keys.source.french])
        return self._french

    @property
    def training(self) -> Path:
        """Path to the training dictionary"""
        if self._training is None:
            self._training = Path(os.environ[self.keys.source.training])
        return self._training

    @property
    def testing(self) -> Path:
        """Path to the testing dictionary"""
        if self._testing is None:
            self._testing = Path(os.environ[self.keys.source.testing])
        return self._testing

Target Paths

@attr.s(auto_attribs=True)
class TargetPaths:
    """Paths to save derived files"""
    keys: Namespace=Keys
    _english: Path=None
    _french: Path=None

    @property
    def english(self) -> Path:
        """Path to derived subset of english embeddings"""
        if self._english is None:
            self._english = Path(os.environ[self.keys.target.english])
        return self._english

    @property
    def french(self) -> Path:
        """Path to derived subset of french embeddings"""
        if self._french is None:
            self._french = Path(os.environ[self.keys.target.french])
        return self._french

Paths

@attr.s(auto_attribs=True)
class Paths:
    """Class to build and hold the source and target file paths"""
    _target: Path=None
    _source: Path=None

    @property
    def target(self) -> TargetPaths:
        """Holds object with paths to created embeddings subsets"""
        if self._target is None:
            self._target = TargetPaths()
        return self._target

    @property
    def source(self) -> SourcePaths:
        """Holds objetw with paths to original source files"""
        if self._source is None:
            self._source = SourcePaths()
        return self._source

Load And Build

@attr.s(auto_attribs=True)
class LoadAndBuild:
    """Loads embeddings and dictionaries and builds subsets"""
    _paths: Paths=None
    _english_embeddings: BaseKeyedVectors=None
    _french_embeddings: BaseKeyedVectors=None
    _training: dict=None
    _testing: dict=None
    _merged_dicts: dict=None
    _subset_builder: SubsetBuilder=None

    @property
    def paths(self) -> Paths:
        """Object with paths to files"""
        if self._paths is None:
            self._paths = Paths()
        return self._paths

    @property
    def english_embeddings(self) -> BaseKeyedVectors:
        """Word embeddings for English"""
        if self._english_embeddings is None:
            self._english_embeddings = Embeddings(self.paths.source.english,
                                                  binary=True).embeddings
        return self._english_embeddings

    @property
    def french_embeddings(self) -> BaseKeyedVectors:
        """Word embeddings for French"""
        if self._french_embeddings is None:
            self._french_embeddings = Embeddings(self.paths.source.french,
                                                 binary=False).embeddings
        return self._french_embeddings

    @property
    def training(self) -> dict:
        """training dictionary"""
        if self._training is None:
            self._training = DictLoader(self.paths.source.training).dictionary
        return self._training

    @property
    def testing(self) -> dict:
        """Testing dictionary"""
        if self._testing is None:
            self._testing = DictLoader(self.paths.source.testing).dictionary
        return self._testing

    @property
    def merged_dicts(self) -> dict:
        """Testing and training merged"""
        if self._merged_dicts is None:
            self._merged_dicts = self.training.copy()
            self._merged_dicts.update(self.testing)
            assert len(self._merged_dicts) == (len(self.training) + len(self.testing))
        return self._merged_dicts

    @property
    def subset_builder(self) -> SubsetBuilder:
        """Builder of the subset dictionaries"""
        if self._subset_builder is None:
            self._subset_builder = SubsetBuilder(
                self.english_embeddings,
                self.french_embeddings,
                self.merged_dicts,
                self.paths.target.english,
                self.paths.target.french)
        return self._subset_builder

    def __call__(self) -> None:
        """Calls the subset builder"""
        self.subset_builder()
        return
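
Before wrapping this up in the loader below, here's a minimal, hedged sketch of using LoadAndBuild on its own (it assumes load_dotenv has already put the paths named in the Keys namespaces into the environment):

# a minimal sketch - builds both subsets and pickles them to the target paths
load_and_build = LoadAndBuild()
load_and_build()

print(len(load_and_build.subset_builder.subset_1))
print(len(load_and_build.subset_builder.subset_2))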

A Loader

As a convenience I'm going to make a loader for all the parts.

EmbeddingsKeys = Namespace(
    english_subset="ENGLISH_EMBEDDINGS_SUBSET",
    french_subset="FRENCH_EMBEDDINGS_SUBSET",
    training="ENGLISH_FRENCH_TRAINING",
    testing="ENGLISH_FRENCH_TESTING",
)
@attr.s(auto_attribs=True)
class EmbeddingsLoader:
    """Loads the embeddings and dictionaries

    Warning:
     this assumes that you've loaded the proper environment variables to
    find the files - it doesn't call ``load_dotenv``

    """
    _loader_builder: LoadAndBuild=None
    _english_subset: dict=None
    _french_subset: dict=None
    _training: dict=None
    _testing: dict=None
    @property
    def loader_builder(self) -> LoadAndBuild:
        """Object to load sources and build subsets"""
        if self._loader_builder is None:
            self._loader_builder = LoadAndBuild()
        return self._loader_builder

    @property
    def english_subset(self) -> dict:
        """The english embeddings subset

        This is a subset of the Google News embeddings that matches the keys in
        the english to french dictionaries
        """
        if self._english_subset is None:
            if not self.loader_builder.paths.target.english.is_file():
                self.loader_builder()
                self._english_subset = self.loader_builder.subset_builder.subset_1
            else:
                with self.loader_builder.paths.target.english.open("rb") as reader:
                    self._english_subset = pickle.load(reader)
        return self._english_subset

    @property
    def french_subset(self) -> dict:
        """Subset of the MUSE French embeddings"""
        if self._french_subset is None:
            if self.loader_builder.paths.target.french.is_file():
                with self.loader_builder.paths.target.french.open("rb") as reader:
                    self._french_subset = pickle.load(reader)
            else:
                self.loader_builder()
                self._french_subset = self.loader_builder.subset_builder.subset_2
        return self._french_subset

    @property
    def training(self) -> dict:
        """The english to french dictionary training set"""
        if self._training is None:
            self._training = DictLoader(self.loader_builder.paths.source.training).dictionary
        return self._training

    @property
    def testing(self) -> dict:
        """testing english to french dictionary"""
        if self._testing is None:
            self._testing = DictLoader(self.loader_builder.paths.source.testing).dictionary
        return self._testing
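
And a short, hedged usage sketch of the EmbeddingsLoader itself, which is what the later posts in the series use:

# a minimal usage sketch - assumes the environment variables are already
# loaded; the subsets are built (and pickled) on first access and
# re-loaded from the pickles after that
loader = EmbeddingsLoader()

print(len(loader.english_subset))
print(len(loader.french_subset))
print(len(loader.training))
print(len(loader.testing))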

End

Approximate kNN for Machine Translation

Beginning

In the previous post we implemented Locality Sensitive Hashing. It's part of a series of posts building an English to French translator whose links are gathered in this post.

Imports

# python
from argparse import Namespace

# pypi
from dotenv import load_dotenv
from nltk.corpus import twitter_samples

import numpy

# this repo
from neurotic.nlp.word_embeddings.embeddings import EmbeddingsLoader
from neurotic.nlp.word_embeddings.english_french import TrainingData
from neurotic.nlp.word_embeddings.hashing import (DocumentsEmbeddings,
                                                  PlanesUniverse,
                                                  HashTable,
                                                  HashTables)
from neurotic.nlp.word_embeddings.nearest_neighbors import NearestNeighbors
from neurotic.nlp.twitter.processor import TwitterProcessor
from neurotic.nlp.word_embeddings.training import TheTrainer

Set Up

The Environment

load_dotenv("posts/nlp/.env")

The Tweets

positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")
tweets = positive_tweets + negative_tweets

The Twitter Processor

process_tweet = TwitterProcessor()

The Embeddings

embeddings = EmbeddingsLoader()
documents = DocumentsEmbeddings(embeddings=embeddings.english_subset,
                                process=process_tweet, documents=tweets)

Some Constants

TWEET = Namespace(
    vectors=len(tweets),
    dimensions=len(next(iter(embeddings.english_subset.values()))),
    universes=25,
    vectors_per_bucket=16
)

The Planes

universes = PlanesUniverse(vector_count=TWEET.vectors,
                           dimensions=TWEET.dimensions,
                           universes=TWEET.universes,
                           vectors_per_bucket=TWEET.vectors_per_bucket)
assert universes.plane_count == 10

The Hash Tables

hasher = HashTables(planes=universes.planes,
                    universes=TWEET.universes,
                    vectors=documents.documents_embeddings)
hash_tables = hasher.hash_tables

The ID Tables

id_tables = hasher.id_tables

The Training Data

data = TrainingData()

Middle

Approximate K-NN

Implement approximate k-nearest neighbors using locality sensitive hashing to search for documents that are similar to a given document at the index doc_id.

Arguments

Variable               Description
doc_id                 index into the document list all_tweets
v                      document vector for the tweet in all_tweets at index doc_id
planes_l               list of planes (the global variable created earlier)
k                      number of nearest neighbors to search for
num_universes_to_use   number of available universes to use (25 by default)

The approximate_knn function finds a subset of candidate vectors that are in the same "hash bucket" as the input vector 'v'. Then it performs the usual k-nearest neighbors search on this subset (instead of searching through all 10,000 tweets).

Hints

  • There are many dictionaries used in this function. Try to print out planes_l, hash_tables, id_tables to understand how they are structured, what the keys represent, and what the values contain.
  • To remove an item from a list, use .remove()
  • To append to a list, use .append()
  • To add to a set, use .add()
# UNQ_C21 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# This is the code used to do the fast nearest neighbor search. Feel free to go over it
def approximate_knn(document_id: int,
                    document_embedding: numpy.ndarray,
                    multiverse_planes: list,
                    k: int=1,
                    universes: int=TWEET.universes):
    """Search for k-NN using hashes

    Args:
     document_id: index for the document in the lists
     document_embedding: vector representing a documents word embeddings
     multiverse_planes: dictionary of planes for the hash-tables
     k: number of neighbors to find
     universes: number of times to repeat the search

    Returns:
     list of indexes for neighbor documents
    """
    assert universes <= TWEET.universes

    # Vectors that will be checked as possible nearest neighbor
    possible_neighbors = list()

    # list of document IDs
    ids_of_possible_neighbors = list()

    # create a set for ids to consider, for faster checking if a document ID already exists in the set
    set_of_ids_of_possible_neighbors = set()
    hasher = HashTable(planes=multiverse_planes, vectors=None)

    # loop through the universes of planes
    for universe in range(universes):

        # get the set of planes from the planes_l list, for this particular universe_id
        planes = multiverse_planes[universe]

        # get the hash value of the vector for this set of planes
        # hash_value = hash_value_of_vector(v, planes)
        hash_value = HashTable(planes=planes, vectors=None).hash_value(document_embedding)

        # get the hash table for this particular universe_id
        hash_table = hash_tables[universe]

        # get the list of document vectors for this hash table, where the key is the hash_value
        document_vectors = hash_table[hash_value]

        # get the id_table for this particular universe_id
        id_table = id_tables[universe]

        # get the subset of documents to consider as nearest neighbors from this id_table dictionary
        new_ids_to_consider = id_table[hash_value]

        ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###

        # remove the id of the document that we're searching
        if document_id in new_ids_to_consider:
            new_ids_to_consider.remove(document_id)
            print(f"removed document_id {document_id} of input vector from new_ids_to_search")

        # loop through the subset of document vectors to consider
        for index, new_id in enumerate(new_ids_to_consider):

            # if the document ID is not yet in the set ids_to_consider...
            if new_id not in set_of_ids_of_possible_neighbors:
                # access document_vectors_l list at index i to get the embedding
                # then append it to the list of vectors to consider as possible nearest neighbors
                document_vector = document_vectors[index]
                possible_neighbors.append(document_vector)

                # append the new_id (the index for the document) to the list of ids to consider
                ids_of_possible_neighbors.append(new_id)

                # also add the new_id to the set of ids to consider
                # (use this to check if new_id is not already in the IDs to consider)
                set_of_ids_of_possible_neighbors.add(new_id)

        ### END CODE HERE ###

    # Now run k-NN on the smaller set of vecs-to-consider.
    print("Fast considering %d vecs" % len(possible_neighbors))

    # convert the vecs to consider set to a list, then to a numpy array
    vecs_to_consider_arr = numpy.array(possible_neighbors)

    # call nearest neighbors on the reduced list of candidate vectors
    nearest_neighbors = NearestNeighbors(candidates=possible_neighbors, k=k)
    nearest_neighbor_ids = nearest_neighbors(document_embedding)

    # Use the nearest neighbor index list as indices into the ids to consider
    # create a list of nearest neighbors by the document ids
    nearest_neighbor_ids = [ids_of_possible_neighbors[index]
                            for index in nearest_neighbor_ids]

    return nearest_neighbor_ids
doc_id = 0
doc_to_search = tweets[doc_id]
vec_to_search = documents.documents_embeddings[doc_id]

print(doc_to_search)
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
nearest_neighbor_ids = approximate_knn(
    document_id=doc_id,
    document_embedding=vec_to_search,
    multiverse_planes=universes.planes,
    k=3, universes=5)

print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {tweets[neighbor_id]}")
Fast considering 79 vecs
Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 254
document contents: Something to get your #Friday off to a great start :) Have a great day all! #Mclaren #FridayFeeling #TGIF http://t.co/LshgwcXsSv
Nearest neighbor at document id 2714
document contents: Current playlist :D http://t.co/PYKQLD4KHr
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)

The first and third neighbors seem reasonable, although the third looks like it's just a re-working of our source tweet.

End

Hash Tables

Beginning

Imports

# python
from argparse import Namespace
from functools import partial

import math
import pprint

# pypi
from numpy.random import default_rng

import hvplot.pandas
import numpy
import pandas

from graeae import EmbedHoloviews

Set Up

Plotting

SLUG = "hash-tables"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/nlp/{SLUG}")
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
)

Random Number Generator

numpy_random = default_rng()

Pretty Printer

pretty = pprint.PrettyPrinter()

Middle

A Basic Hash Table

def basic_hash_table(things_to_hash: list, buckets: int) -> dict:
    """Create a basic hash table

    Args:
     things_to_hash: list of integers to hash
     buckets: number of buckets in the table

    Returns:
     hash_table: the things to hash sorted into their buckets
    """

    def hash_function(value: int, buckets: int) -> int:
        """Maps the value to an integer

        Args:
         value: what to hash
         buckets: number of buckets in the hash table

        Returns:
         the remainder of value divided by buckets (value % buckets)
        """
        return int(value) % buckets

    # Initialize all the buckets in the hash table as empty lists
    hash_table = {bucket: [] for bucket in range(buckets)}

    for value in things_to_hash:
        # Get the hash key for the given value
        hash_value = hash_function(value, buckets)

        # Add the element to the corresponding bucket
        hash_table[hash_value].append(value)
    return hash_table

The basic_hash_table maps values that can be cast to integers to a dictionary of lists. Let's see what it does.

examples = [100, 10, 14, 17, 97]
hash_table_example = basic_hash_table(examples, buckets=10)
pretty.pprint(hash_table_example)
{0: [100, 10],
 1: [],
 2: [],
 3: [],
 4: [14],
 5: [],
 6: [],
 7: [17, 97],
 8: [],
 9: []}

This Basic Hash Table maps the values based on their remainder after dividing the value by the number of buckets. In this case there are ten buckets so the value gets mapped to the value in its ones column.
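For example, 14 % 10 = 4, so 14 lands in bucket 4, while 100 and 10 both leave a remainder of 0 and end up together in bucket 0.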

Multiplane Hash Functions

To visualize it we'll start with a single plane and color some points based on which side of the plane they fall on.

I'll start by defining the vector that we'll use to decide which side of the plane a vector is on (by taking the dot product and checking the sign of the result).

decider = pandas.DataFrame([[1, 2]])

This isn't the separating plane but rather a vector perpendicular to the separating plane. You don't need the separating plane to categorize the vectors, but for the sake of visualization it might be useful to see it. We can create it with a rotation matrix that rotates our original vector 90 degrees.

theta_1 = math.radians(90)

rotation = numpy.array([[math.cos(theta_1), -math.sin(theta_1)],
                        [math.sin(theta_1), math.cos(theta_1)]])

plane = pandas.Series(numpy.dot(rotation, decider.T).T[0])

Now we can plot them along with some categorized points.

First, plot the vector we use to decide which side of the plane the points are on.

# so to plot it I'll add a starting point
COLUMNS = "X Y".split()
start = pandas.DataFrame([[0, 0]])
decider_plotter = pandas.concat([start, plane])
decider_plotter.columns = COLUMNS
plot = decider_plotter.hvplot(x="X", y="Y")

Now plot the plane that separates the categories. I'll scale it a little so the segment extends further across the plot. Also, the rotation gives us only the line segment rotated by +90 degrees, so I'm going to negate it to get the -90 degree segment as well and complete the rendering of the plane.

SCALE = 2
plane_plotter = start.append(plane, ignore_index=True) * SCALE
plane_plotter.columns = COLUMNS
plot *= plane_plotter.hvplot(x="X", y="Y", color=Plot.tan, line_dash="dashed")

plane_plotter *= -1
plot *= plane_plotter.hvplot(x="X", y="Y", color=Plot.tan, line_dash="dashed")

Now we get to the points. The main lines to pay attention to are the calculation of the side_of_plane value and the conditional. The side_of_plane is an array but you can do boolean equality checks with integers as shown.

## Get a pair of random numbers between -4 and 4 
POINTS = 20
LIMIT = 4

for _ in range(0, POINTS):
    vector = pandas.DataFrame([numpy_random.uniform(-LIMIT, LIMIT, 2)], 
                              columns=["x", "y"])
    side_of_plane = numpy.sign(numpy.dot(plane, vector.T)) 

    if side_of_plane == 1:
        plot *= vector.hvplot.scatter(x="x", y="y", color=Plot.blue)
    else:
        plot *= vector.hvplot.scatter(x="x", y="y", color=Plot.red)

plot = plot.opts(
    title="Plane Hash Table",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
    xlim=(-LIMIT, LIMIT),
    ylim=(-LIMIT, LIMIT)
)
outcome = Embed(plot=plot, file_name="multiplane_hash")()
print(outcome)

Figure Missing

So the dashed tan line is our separation plane and the blue line segment is the vector we use to decide which side of the plane the dots are on. The blue dots have a positive dot product with the blue vector and the red dots have a negative dot product with the blue vector.

Multiple Planes

plane_1 = numpy.array([[1, 1]])
plane_2 = numpy.array([[-1, 1]])
plane_3 = numpy.array([[-1, -1]])
multi_plane = [plane_1, plane_2, plane_3]

search_vector = numpy.array([[2, 2]])
def side_of_plane(plane: numpy.ndarray, vector: numpy.ndarray) -> int:
    """Finds the side of the plane that the vector is

    Args:
     plane: separating plane
     vector: location to check 

    Returns:
     sign of the dot product between the plane and the vector
    """
    return numpy.sign(numpy.dot(plane, vector.T)).item()
def hash_multi_plane(planes: list, vector: numpy.ndarray) -> int:
    """Creates hash value for set of planes

    Args:
     planes: list of arrays to hash
     vector: array to determine which side of the planes are positive

    Returns:
     hash_value: the hash for plane matching the vector
    """
    hash_value = 0
    for index, plane in enumerate(planes):
        sign = side_of_plane(plane, vector)

        # set the bit for this plane to 1 if the sign is non-negative
        hash_i = 0 if sign < 0 else 1
        hash_value += 2**index * hash_i
    return hash_value
print(hash_multi_plane(multi_plane, search_vector))
3
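
To check that by hand: the dot products of search_vector with plane_1, plane_2, and plane_3 are 4, 0, and -4, so the bits are 1, 1, and 0 (the zero counts as non-negative) and the hash value is 1·1 + 1·2 + 0·4 = 3.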

Random Planes

numpy_random = default_rng(0)
num_dimensions = 2
num_planes = 3
random_planes_matrix = numpy_random.normal(
                       size=(num_planes,
                             num_dimensions))
print(random_planes_matrix)
[[ 0.12573022 -0.13210486]
 [ 0.64042265  0.10490012]
 [-0.53566937  0.36159505]]
search_vector = numpy.array([[2, 2]])
def side_of_plane_matrix(planes: numpy.ndarray, vector: numpy.ndarray) -> numpy.ndarray:
    """Decides which side of planes vector is on

    Returns:
     side-of-plane value for vector with respect to each plane
    """
    return numpy.sign(numpy.dot(planes, vector.T))
print(side_of_plane_matrix(random_planes_matrix, search_vector))
[[-1.]
 [ 1.]
 [-1.]]
def hash_multi_plane_matrix(planes: numpy.ndarray,
                            vector: numpy.ndarray,
                            num_planes: int):
    """calculates hash for vector with respect to planes"""
    sides_matrix = side_of_plane_matrix(planes, vector)
    hash_value = 0
    for i in range(num_planes):
        sign = sides_matrix[i].item() # Get the value inside the matrix cell
        hash_i = 1 if sign >=0 else 0
        hash_value += 2**i * hash_i # sum 2^i * hash_i
    return hash_value
sm = side_of_plane_matrix(random_planes_matrix, search_vector)
print(hash_multi_plane_matrix(random_planes_matrix, search_vector, num_planes))
2
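
The same hand-check works here: the side-of-plane signs were (-1, +1, -1), so the bits are (0, 1, 0) and the hash value is 0·1 + 1·2 + 0·4 = 2.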

Document Vectors

This is how you would convert a document to an embedding using word vectors (just add up all the vectors for the words in the document).

word_embedding = {"I": numpy.array([1,0,1]),
                  "love": numpy.array([-1,0,1]),
                  "learning": numpy.array([1,0,1])
                  }
document = ['I', 'love', 'learning', 'not_a_word']
document_embedding = numpy.array([0,0,0])
for word in document:
    document_embedding += word_embedding.get(word,0)

print(document_embedding)
[1 0 3]
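
Note that 'not_a_word' isn't in the word_embedding dictionary, so word_embedding.get(word, 0) contributes 0 for it and the result is just the sum of the other three vectors, [1, 0, 3].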

More Matrix Math in Python

Beginning

This is another lab from Coursera's NLP Specialization. This time it's about using numpy to perform vector operations.

Imports

# python
from argparse import Namespace
from functools import partial

import math

# from pypi
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews

Set Up

Plotting

SLUG = "more-matrix-math-in-python"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/nlp/{SLUG}")
Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
 )

Middle

Let's start with a simple matrix. We'll call it R because when we do our machine translation we'll need a rotation matrix which is named R.

R = numpy.array([[2, 0],
                 [0, -2]])

Now we'll create another matrix.

x = numpy.array([[1, 1]])
print(x.shape)
(1, 2)

Note the nested square brackets; this makes it a (one-row) matrix and not a vector.

The Dot Product

y = numpy.dot(x, R)
print(y)
[[ 2 -2]]

The rotation matrix (R) rotates and scales the matrix x. To see the effect we can plot the original vector x and the rotated version y.

X = pandas.DataFrame(dict(X=[0, x[0][0]], Y=[0, x[0][1]]))
Y = pandas.DataFrame(dict(X=[0, y[0][0]], Y=[0, y[0][1]]))

x_plot = X.hvplot(x="X", y="Y", color=Plot.blue)
y_plot = Y.hvplot(x="X", y="Y", color=Plot.red)

plot = (x_plot * y_plot).opts(
    title="Original and Rotated Vectors",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
    xlim=(-2, 2),
    ylim=(-2, 2)
)

outcome = Embed(plot=plot, file_name="original_and_rotate_vectors")()
print(outcome)

Figure Missing

The blue segment is the original vector and the red is the rotated and scaled vector.

More Rotations

In the previous section we rotated the vector using integer values, but if we wanted to rotate the vector a specific number of degrees then the way to do that is to use a rotation matrix.

\[ Ro = \begin{bmatrix} \cos \theta & -\sin \theta \\ \sin \theta & \cos \theta \end{bmatrix} \]

Let's start with a vector and rotate it \(100^\circ\).

theta = math.radians(100)
Ro = pandas.DataFrame([[numpy.cos(theta), -numpy.sin(theta)],
                  [numpy.sin(theta), numpy.cos(theta)]])

x_2 = pandas.Series([2, 2])
y_2 = x_2.dot(Ro)
print("The Rotation Matrix")
print(Ro)
print("\nThe Rotated Vector")
print(y_2)

print(f'\n x2 norm {numpy.linalg.norm(x_2)}')
print(f'\n y2 norm {numpy.linalg.norm(y_2)}')
print(f'\n Rotation matrix norm {numpy.linalg.norm(Ro)}')
print(f" Square Root of 2: {2**0.5}")
The Rotation Matrix
          0         1
0 -0.173648 -0.984808
1  0.984808 -0.173648

The Rotated Vector
0    1.622319
1   -2.316912
dtype: float64

 x2 norm 2.8284271247461903

 y2 norm 2.82842712474619

 Rotation matrix norm 1.414213562373095
 Square Root of 2: 1.4142135623730951

You can see that in this case our transformed vector (y2) didn't change in length the way it did in the previous example. Let's plot it and see what it looks like.

origin = pandas.DataFrame([[0, 0]])
X = origin.append(x_2, ignore_index=True)
Y = origin.append(y_2, ignore_index=True)
COLUMNS = "X Y".split()

X.columns = COLUMNS
Y.columns = COLUMNS

x_plot = X.hvplot(x="X", y="Y", color=Plot.blue)
y_plot = Y.hvplot(x="X", y="Y", color=Plot.red)

plot = (x_plot * y_plot).opts(
    title="100 Degree rotation",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
    xlim=(-3, 3),
    ylim=(-3, 3)
)

outcome = Embed(plot=plot, file_name="one_hundred_degree_rotation")()
print(outcome)

Figure Missing

Rotation matrices rotate vectors anti-clockwise when they multiply a column vector (\(Ro \cdot x\)), but here the row vector was multiplied on the left (x_2.dot(Ro)), which is equivalent to rotating by \(-\theta\) - so what the plot shows is actually a 100 degree clockwise rotation.
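
Here's a minimal sketch that re-creates Ro and x_2 with plain numpy and compares the two multiplication orders:

# a minimal check of the rotation direction, using the same 100 degree matrix:
# multiplying a column vector on the right rotates anti-clockwise, while
# multiplying a row vector on the left (as above) rotates clockwise
import math
import numpy

theta = math.radians(100)
Ro = numpy.array([[math.cos(theta), -math.sin(theta)],
                  [math.sin(theta), math.cos(theta)]])
x_2 = numpy.array([2, 2])

print(Ro.dot(x_2))  # anti-clockwise: approximately [-2.32, 1.62]
print(x_2.dot(Ro))  # clockwise (what was plotted): approximately [1.62, -2.32]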

The Frobenius Norm

\[ \| \vec a \| = \sqrt {{\vec a} \cdot {\vec a}} \]

For an \(m \times n\) matrix \(\mathrm{A}\), the Frobenius Norm looks like this:

\[ \| \mathrm{A} \|_{F} \equiv \sqrt{\sum_{i=1}^{m} \sum_{j=1}^{n}\left|a_{i j}\right|^{2}} \]

We can translate the second equation directly to numpy.

some_array = numpy.array([[2, 2],
                          [2, 2]])
frobenius_norm = numpy.sqrt(numpy.sum(numpy.square(some_array)))

print(f"The Frobenius Norm = {frobenius_norm}")
The Frobenius Norm = 4.0

So, you might be thinking, we've been using numpy.linalg.norm all this time, what's the difference?

old_norm = numpy.linalg.norm(some_array)
print(old_norm)
assert old_norm == frobenius_norm
4.0

It turns out that the default for numpy.linalg.norm on a matrix is the Frobenius Norm, so you can calculate it either way.

PCA Dimensionality Reduction and Word Vectors

Beginning

This is an extension of the previous two posts about Word Embeddings and Principal Component Analysis. Once again we're going to start with pre-trained word embeddings rather than train our own and then take the embeddings and explore them to better understand them.

Imports

# from python
from argparse import Namespace
from functools import partial
from pathlib import Path

import math
import os
import pickle

# from pypi
from dotenv import load_dotenv
from expects import (
    be_true,
    equal,
    expect,
)
from numpy.random import default_rng
from sklearn.decomposition import PCA

import holoviews
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews, Timer

Set Up

The Timer

Just something to tell how long some processes take.

TIMER = Timer()

Plotting

SLUG = "pca-dimensionality-reduction-and-word-vectors"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}")

Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)

Randomness

numpy_random = default_rng(1)

The Environment

load_dotenv("posts/nlp/.env")

The Embeddings

These are the same embeddings as in the Word Embeddings exploration. They're loaded as a dictionary of arrays (vectors). The original source is the Google News pre-trained data set available from the Word2Vec archive, but it is 3.64 gigabytes so Coursera extracted a subset of it to work with.

path = Path(os.environ["WORD_EMBEDDINGS"])
assert path.is_file()

with path.open("rb") as reader:
    embeddings = pickle.load(reader)

assert len(embeddings) == 243

The instructors also provide some code to show you how to create a different subset and I'm assuming that what they're showing is the actual way that they built this dataset. For future reference, this is the code given.

import nltk
from gensim.models import KeyedVectors

embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)
f = open('capitals.txt', 'r').read()
set_words = set(nltk.word_tokenize(f))
select_words = words = ['king', 'queen', 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful']
for w in select_words:
    set_words.add(w)

def get_word_embeddings(embeddings):

    word_embeddings = {}
    for word in embeddings.vocab:
        if word in set_words:
            word_embeddings[word] = embeddings[word]
    return word_embeddings

word_embeddings = get_word_embeddings(embeddings)

The Data

The data set is a space-separated-values file with no header.

path = Path(os.environ["CAPITALS"])
assert path.is_file()

data = pandas.read_csv(path, delimiter=" ",
                       names=["City 1", "Country 1", "City 2", "Country 2"])
print(data.head())
   City 1 Country 1   City 2    Country 2
0  Athens    Greece  Baghdad         Iraq
1  Athens    Greece  Bangkok     Thailand
2  Athens    Greece  Beijing        China
3  Athens    Greece   Berlin      Germany
4  Athens    Greece     Bern  Switzerland

It looks odd because this is actually an evaluation set. The first three columns are used to predict the fourth (e.g. Athens, Greece, and Baghdad are used to predict that Baghdad is the capital of Iraq).

Middle

Predicting Relationships Among Words

This part is about writing a function that will use the word embeddings to predict relationships among words.

Requirements

  • The arguments will be three words
  • The first two will be considered related to each other somehow
  • The function will then predict a fourth word that is related to the third word in a way that is similar to the relationship between the first two words.

Another way to look at it is that if you are given three words - Athens, Greece, and Bangkok - then the function will fill in the blank for "Athens is to Greece as Bangkok is to __".

Because of our input data set what the function will end up doing is finding the capital of a country. But first we need a distance function.

Cosine Similarity

\begin{align} \cos (\theta) &=\frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\|\|\mathbf{B}\|}\\ &= \frac{\sum_{i=1}^{n} A_{i} B_{i}}{\sqrt{\sum_{i=1}^{n} A_{i}^{2}} \sqrt{\sum_{i=1}^{n} B_{i}^{2}}}\\ \end{align}
  • A and B are the word vectors and \(A_i\) or \(B_i\) is the ith item of that vector
  • If the output is 1 then they point in the same direction and if it is -1 then they are opposites
  • If the output is 0 then they are orthogonal (unrelated)
  • The closer the number is to 1 the more similar the vectors are, and the closer it is to -1 the more dissimilar
def cosine_similarity(A: numpy.ndarray, B: numpy.ndarray) -> float:
    '''Calculates the cosine similarity between two arrays

    Args:
       A: a numpy array which corresponds to a word vector
       B: A numpy array which corresponds to a word vector
    Return:
       cos: numerical number representing the cosine similarity between A and B.
    '''
    dot_product = A.dot(B)
    norm_of_A = numpy.linalg.norm(A)
    norm_of_B = numpy.linalg.norm(B)
    cos = dot_product/(norm_of_A * norm_of_B)
    return cos
king = embeddings["king"]
queen = embeddings["queen"]
similarity = cosine_similarity(king, queen)
print(f"The Cosine Similarity between 'king' and 'queen': {similarity:0.2f}.")
expected = 0.6510956
expect(math.isclose(similarity, expected, rel_tol=1e-6)).to(be_true)
The Cosine Similarity between 'king' and 'queen': 0.65.

Euclidean Distance

In addition to the Cosine Similarity we can use the (probably better known) Euclidean Distance.

\begin{aligned} d(\mathbf{A}, \mathbf{B})=d(\mathbf{B}, \mathbf{A}) &=\sqrt{\left(A_{1}-B_{1}\right)^{2}+\left(A_{2}-B_{2}\right)^{2}+\cdots+\left(A_{n}-B_{n}\right)^{2}} \\ &=\sqrt{\sum_{i=1}^{n}\left(A_{i}-B_{i}\right)^{2}} \end{aligned}
  • n is the number of elements in the vector
  • A and B are the corresponding word vectors.
  • The more similar the words, the more likely the Euclidean distance will be close to 0 (and zero means they are the same).
def euclidean(A: numpy.ndarray, B: numpy.ndarray) -> float:
    """Calculate the euclidean distance between two vectors

    Args:
       A: a numpy array which corresponds to a word vector
       B: A numpy array which corresponds to a word vector
    Return:
       d: numerical number representing the Euclidean distance between A and B.
    """
    d = numpy.sqrt(((A - B)**2).sum())
    return d
actual = euclidean(king, queen)
expected = 2.4796925
print(f"The Euclidean Distance between 'king' and 'queen' is {actual:0.2f}.")
expect(math.isclose(actual, expected, rel_tol=1e-6)).to(be_true)
The Euclidean Distance between 'king' and 'queen' is 2.48.

The Predictor

Here's where we make the function that tries to predict the Country for a given Capital City. This will use the cosine similarity. This first version will use brute-force.

def get_country(city1: str, country1: str, city2: str, embeddings: dict) -> tuple:
    """Find the country that has a particular capital city

    Args:
       city1: a string (the capital city of country1)
       country1: a string (the country of capital1)
       city2: a string (the capital city of country2)
       embeddings: a dictionary where the keys are words and values are their embeddings
    Return:
       countries: most likely country, similarity score
    """
    group = set((city1, country1, city2))

    city1_emb = embeddings[city1]

    country1_emb = embeddings[country1]

    city2_emb = embeddings[city2]

    vec = country1_emb - city1_emb  + city2_emb

    # Initialize the similarity to -1 (it will be replaced by similarities that are closer to +1)
    similarity = -1

    # initialize country to an empty string
    country = ''

    for word in embeddings:
        if word not in group:
            word_emb = embeddings[word]
            # calculate cosine similarity between embedding of country 2 and the word in the embeddings dictionary
            cur_similarity = cosine_similarity(vec, word_emb)

            # if the cosine similarity is more similar than the previously best similarity...
            if cur_similarity > similarity:

                # update the similarity to the new, better similarity
                similarity = cur_similarity

                # store the country as a tuple, which contains the word and the similarity
                country = (word, similarity)
    return country
actual_country, actual_similarity = get_country("Athens", "Greece", "Cairo", embeddings)
print(f"Cairo is the capital of {actual_country}.")

expected_country, expected_similarity = "Egypt", 0.7626821
expect(actual_country).to(equal(expected_country))
expect(math.isclose(actual_similarity, expected_similarity, rel_tol=1e-6)).to(be_true)
Cairo is the capital of Egypt.

Checking the Model Accuracy

\[ \text{Accuracy}=\frac{\text{Correct # of predictions}}{\text{Total # of predictions}} \]

country_getter = partial(get_country, embeddings=embeddings)
def get_accuracy(data: pandas.DataFrame) -> float:
    '''Calculate the fraction of correct capitals

    Args:
       data: a dataframe with the city and country columns to check

    Return:
       accuracy: the accuracy of the model
    '''
    num_correct = 0

    # loop through the rows of the dataframe
    for index, row in data.iterrows():

        # get city1
        city1 = row["City 1"]

        # get country1
        country1 = row["Country 1"]

        # get city2
        city2 =  row["City 2"]

        # get country2
        country2 = row["Country 2"]

        # use get_country to find the predicted country2
        predicted_country2, _ = country_getter(city1=city1, country1=country1, city2=city2)

        # if the predicted country2 is the same as the actual country2...
        if predicted_country2 == country2:
            # increment the number of correct by 1
            num_correct += 1

    # get the number of rows in the data dataframe (length of dataframe)
    m = len(data)

    # calculate the accuracy by dividing the number correct by m
    accuracy = num_correct/m
    return accuracy
with TIMER:
    accuracy = get_accuracy(data)
    print(f"Accuracy: {accuracy:0.2f}")
    expect(math.isclose(accuracy, 0.92, rel_tol=0.2)).to(be_true)
2020-10-07 17:50:28,897 graeae.timers.timer start: Started: 2020-10-07 17:50:28.897165
2020-10-07 17:50:50,755 graeae.timers.timer end: Ended: 2020-10-07 17:50:50.755424
2020-10-07 17:50:50,756 graeae.timers.timer end: Elapsed: 0:00:21.858259
Accuracy: 0.92

Plotting With PCA

Computing the PCA

Now we'll write a function to do the Principal Component Analysis for our embeddings.

  • The word vectors are of dimension 300.
  • Use PCA to change the 300 dimensions to n_components dimensions.
  • The new matrix should be of dimension m, n_components (m being the number of rows).
  • First de-mean the data
  • Get the eigenvalues using `linalg.eigh`. Use `eigh` rather than `eig` since the covariance matrix is symmetric. The performance gain when using `eigh` instead of `eig` is substantial.
  • Sort the eigenvectors and eigenvalues by decreasing order of the eigenvalues.
  • Get a subset of the eigenvectors (choose how many principle components you want to use using `n_components`).
  • Return the new transformation of the data by multiplying the eigenvectors with the original data.
def compute_pca(X: numpy.ndarray, n_components: int=2) -> numpy.ndarray:
    """Calculate the principal components for X

    Args:
       X: of dimension (m,n) where each row corresponds to a word vector
       n_components: Number of components you want to keep.

    Return:
       X_reduced: the data transformed into n_components dimensions (columns)
    """
    # you need to set axis to 0 or it will calculate the mean of the entire matrix instead of one per column
    X_demeaned = X - X.mean(axis=0)

    # calculate the covariance matrix
    # the default numpy.cov assumes the rows are variables, not columns so set rowvar to False
    covariance_matrix = numpy.cov(X_demeaned, rowvar=False)

    # calculate eigenvectors & eigenvalues of the covariance matrix
    eigen_vals, eigen_vecs = numpy.linalg.eigh(covariance_matrix)

    # sort eigenvalue in increasing order (get the indices from the sort)
    idx_sorted = numpy.argsort(eigen_vals)

    # reverse the order so that it's from highest to lowest.
    idx_sorted_decreasing = list(reversed(idx_sorted))

    # sort the eigen values by idx_sorted_decreasing
    eigen_vals_sorted = eigen_vals[idx_sorted_decreasing]

    # sort eigenvectors using the idx_sorted_decreasing indices
    # We're only sorting the columns so remember to get all the rows in the slice
    eigen_vecs_sorted = eigen_vecs[:, idx_sorted_decreasing]

    # select the first n_components eigenvectors (the desired dimension
    # of the reduced data)
    # once again, make sure to get all the rows and only slice the columns
    eigen_vecs_subset = eigen_vecs_sorted[:, :n_components]

    # transform the data by multiplying the transpose of the eigenvectors 
    # with the transpose of the de-meaned data
    # Then take the transpose of that product.
    X_reduced = numpy.dot(eigen_vecs_subset.T, X_demeaned.T).T
    return X_reduced

I was getting the wrong values for some reason, so I decided to take out the call to random (since the seed was being set the values were always the same anyway) and just declare the test input array.

X = numpy.array([[4.17022005e-01, 7.20324493e-01, 1.14374817e-04, 3.02332573e-01,
                  1.46755891e-01, 9.23385948e-02, 1.86260211e-01, 3.45560727e-01,
                  3.96767474e-01, 5.38816734e-01],
                 [4.19194514e-01, 6.85219500e-01, 2.04452250e-01, 8.78117436e-01,
                  2.73875932e-02, 6.70467510e-01, 4.17304802e-01, 5.58689828e-01,
                  1.40386939e-01, 1.98101489e-01],
                 [8.00744569e-01, 9.68261576e-01, 3.13424178e-01, 6.92322616e-01,
                  8.76389152e-01, 8.94606664e-01, 8.50442114e-02, 3.90547832e-02,
                  1.69830420e-01, 8.78142503e-01]])
X_reduced = compute_pca(X, n_components=2)
# eigen_vecs, eigen_subset, X_demeaned = compute_pca(X, n_components=2)
print("Your original matrix was " + str(X.shape) + " and it became:")
print(X_reduced)

expected = numpy.array([
 [0.43437323, 0.49820384],
 [0.42077249, -0.50351448],
 [-0.85514571, 0.00531064],
])

numpy.testing.assert_almost_equal(X_reduced, expected)
Your original matrix was (3, 10) and it became:
[[ 0.43437323  0.49820384]
 [ 0.42077249 -0.50351448]
 [-0.85514571  0.00531064]]

Plot It

We'll use most of the non-country words to create a plot to see how well the PCA does.

words = ['oil', 'gas', 'happy', 'sad', 'city', 'town',
         'village', 'country', 'continent', 'petroleum', 'joyful']
subset = numpy.array([embeddings[word] for word in words])
reduced = compute_pca(subset)
reduced = pandas.DataFrame(reduced, columns="X Y".split())
reduced["Word"] = words
labels = reduced.hvplot.labels(x="X", y="Y", text="Word", text_baseline="top")

points = reduced.hvplot.scatter(x="X", y="Y", color=Plot.blue, padding=0.5)

plot = (points * labels).opts(
    title="PCA of Words",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)

outcome = Embed(plot=plot, file_name="pca_words")()
print(outcome)

Figure Missing

It appears to have worked fairly well.

Sklearn Comparison

As a comparison here's what SKlearn's PCA does.

model = PCA(n_components=2)
reduced = model.fit(subset).transform(subset)
reduced = pandas.DataFrame(reduced, columns="X Y".split())
reduced["Word"] = words

labels = reduced.hvplot.labels(x="X", y="Y", text="Word", text_baseline="top")

points = reduced.hvplot.scatter(x="X", y="Y", color=Plot.blue, padding=0.5)

plot = (points * labels).opts(
    title="PCA of Words (SKLearn)",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)

outcome = Embed(plot=plot, file_name="sklearn_pca_words")()
print(outcome)

Figure Missing

They look fairly comparable, so I'll conclude that they're close enough.
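
To back that up with numbers rather than eyeballing the plots, here's a small sketch (re-using compute_pca, subset, and PCA from above) that compares the two reductions after aligning the arbitrary sign of each component.

mine = compute_pca(subset, n_components=2)
theirs = PCA(n_components=2).fit_transform(subset)

# each principal component is only defined up to a sign flip,
# so line the signs up (using the first row) before comparing
for column in range(mine.shape[1]):
    if numpy.sign(mine[0, column]) != numpy.sign(theirs[0, column]):
        theirs[:, column] *= -1

print(numpy.allclose(mine, theirs, atol=1e-5))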

PCA Exploration

Beginning

In this post I'm going to walk through the Lab for Coursera's NLP Specialization in which we take a look at Principal Component Analysis, which we're going to use for Dimensionality Reduction later on. While PCA can be used as a black box, it's useful to get an intuitive understanding of what it's doing, so we'll take a look at a couple of simplified examples and pick apart a little bit of what's going on.

Imports

Just the usual suspects.

# python
from argparse import Namespace
from functools import partial

import math
import random

# pypi
from numpy.random import default_rng
from sklearn.decomposition import PCA

import holoviews
import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews

Set Up

Plotting

This is a little bit of convenience code for the HoloViews plotting.

SLUG = "pca-exploration"
Embed = partial(EmbedHoloviews,
                folder_path=f"files/posts/nlp/{SLUG}")

Plot = Namespace(
    width=990,
    height=780,
    fontscale=2,
    tan="#ddb377",
    blue="#4687b7",
    red="#ce7b6d",
    color_cycle = holoviews.Cycle(["#4687b7", "#ce7b6d"])
)

Randomness

numpy's default_rng creates a random-number generator. The only argument it takes is the seed, which I'll set to 0.

numpy_random = default_rng(0)

Middle

Example One: Random Uniform Data

This first example will use data that lies on a straight line so that we can see a really basic example of what PCA does to it.

The Dataset

To start I'll create a dataset generated by numpy's uniform function, which takes three arguments - low, high, and size.

correlation = 1
x = numpy_random.uniform(1, 2, 1000)
y = x.copy()

Since \(x=y\) we're going to end up with a line segment at a 45 degree angle.

Center It

For PCA they recommend that you center the data by subtracting the mean.

x -= numpy.mean(x)
y -= numpy.mean(y)

Note: according to sklearn's PCA documentation they center it for you so this is probably an unnecessary step.

data = pandas.DataFrame(dict(x=x, y=y))

The PCA Transformation

We're going to use sklearn's PCA for Principal Component Analysis. The n_components argument is the number of components it will keep - we'll keep 2.

pca = PCA(n_components=2)

Now fit it to the data.

transformation_model = pca.fit(data)

And then transform it.

pca_data = pandas.DataFrame(
    transformation_model.transform(data),
    columns=["Principal Component 1", "Principal Component 2"])

Plot the Transformation

original = data.hvplot.scatter(x="x", y="y", color=Plot.blue)
transformed = pca_data.hvplot.scatter(x="Principal Component 1", y="Principal Component 2", color=Plot.red)
plot = (original * transformed).opts(
    title="Correlated and PCA Data",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.fontscale,
)

outcome = Embed(plot=plot, file_name="correlated_data")()
print(outcome)

Figure Missing

Our blue line is the original data and the red line is the transformed data. So it looks like the PCA transform rotates the line to a horizontal one.
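
One way to see this numerically (a quick sketch using the pca_data frame from above): since all of the variance lies along the line, the second component should be zero up to floating-point noise.

print(pca_data["Principal Component 2"].abs().max())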

Understanding the Model

Now that we have the model we can look at the Eigenvalues and Eigenvectors that it created to do the transformation.

  • The Eigenvectors (principal component).
    print(transformation_model.components_)
    
    [[ 0.70710678  0.70710678]
     [-0.70710678  0.70710678]]
    

    The numbers look a little inscrutable at first, but what you need to know is that it's a rotation matrix.

    \[ R = \begin{bmatrix} \cos(45^\circ) & \sin(45^\circ)\\ -\sin(45^\circ) & \cos(45^\circ)\\ \end{bmatrix} \]

    And since our line is at a \(45^\circ\) angle, the values in the Eigenvectors are the sin and cos of \(45^\circ\) that are used to rotate the line flat.

    print(math.cos(math.radians(45)))
    print(math.sin(math.radians(45)))
    
    0.7071067811865476
    0.7071067811865475
    
  • The Eigenvalues (explained variance).

    Also part of the model are the eigenvalues which give the amount of variance explained by each of the components.

    print(transformation_model.explained_variance_)
    
    [1.59912782e-01 7.31437644e-33]
    

    So, what does that mean? Start with the fact that the equation for variance of a uniform distribution is:

    \[ Var = \frac{(b - a)^2}{12} \]

    And remember that when we called the uniform function we set b to 2 and a to 1, so we get:

    print((2 - 1)**2/12)
    
    0.08333333333333333
    

    If you look at the Eigenvalues we got, the second term is \(7 \times 10^{-33}\) which is pretty much zero, and the first term is about 0.16, so what we have here is.

    \begin{align} Var &= \langle Var(x) + Var(y), 0\rangle\\ &= \langle 0.083 + 0.083, 0 \rangle\\ &= \langle 0.16, 0 \rangle\\ \end{align}

    The sum actually rounds to 0.167 rather than 0.16, but it's close enough; the point is that the first component contributed essentially all of the variance and the second contributed none.
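
    To make that concrete, here's a quick sketch putting the theoretical sum next to the first eigenvalue from the fitted model.

    theoretical = 2 * (2 - 1)**2/12
    print(f"theoretical: {theoretical:0.3f}")
    print(f"model:       {transformation_model.explained_variance_[0]:0.3f}")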

Example Two: Normal Random Data

Now we'll move onto normally-distributed data so we can see something a little more interesting.

Generate the Data

Now we'll use numpy's random normal function to generate the data. The three arguments it takes are loc (the mean), scale (the standard deviation), and size (the number of values to generate).

standard_deviation_1 = 1
standard_deviation_2 = 0.333
points = 10**3

x = numpy_random.normal(0, standard_deviation_1, points)
y = numpy_random.normal(0, standard_deviation_2, points)

Even though we specify that the mean is 0, because the data is generated randomly it isn't exactly zero, so we'll center it.

print(f"x mean start: {x.mean()}")
print(f"y mean start: {y.mean()}")
x = x - x.mean()
y = y - y.mean()

print(f"\nx mean: {x.mean()}")
print(f"y mean: {y.mean()}")
x mean start: -0.012000607736595292
y mean start: -0.01409218413437418

x mean: 3.552713678800501e-18
y mean: 2.6645352591003758e-18

Plot It

And now a plot to show the data.

data = pandas.DataFrame(dict(x=x, y=y))
plot = data.hvplot.scatter(x="x", y="y").opts(
    title="Random Normal Data",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.fontscale,
    color=Plot.blue,
)
outcome = Embed(plot=plot, file_name="random_normal_data")()
print(outcome)

Figure Missing

As you can see, the data is pretty uncorrelated so we're going to rotate it to make it a little less of a blob.

Rotate The Data

Now we're going to put the x and y data into a matrix and rotate it.

covariance = 1
in_degrees = 45
angle = math.radians(in_degrees)
print(f"angle: {math.degrees(angle)}\n")

rotation_matrix = numpy.array([[numpy.cos(angle), numpy.sin(angle)],
                               [-numpy.sin(angle), numpy.cos(angle)]])
print(rotation_matrix)
angle: 45.0

[[ 0.70710678  0.70710678]
 [-0.70710678  0.70710678]]

You might notice that this is the same rotation matrix that we had before with the sklearn eigenvectors, so we could have used that, but this is how you would roll your own.

Now we can apply the rotation by taking the dot-product between the data array and the rotation-matrix.

rotated = data.dot(rotation_matrix)
rotated.columns = ["x", "y"]

Plot The Rotated Data

To get a sense of what our transformation did we can plot it, along with the axes created by the rotation matrix, so we can see how they're related. The first thing is to unpack the axes contained within the rotation matrix. We'll also scale the axes by the standard deviation we used along each of the original axes to see how that relates to the shape of the data.

FIRST_ROW, SECOND_ROW = 0, 1
FIRST_COLUMN, SECOND_COLUMN = 0, 1
ORIGIN = [0, 0]
SCALAR = 3
FIRST_SPREAD, SECOND_SPREAD = (standard_deviation_1 * SCALAR,
                               standard_deviation_2 * SCALAR)
COLUMNS = "x y".split()

first_axis = pandas.DataFrame([
    ORIGIN,
    rotation_matrix[FIRST_ROW][:]],
                              columns=COLUMNS)
first_axis *= FIRST_SPREAD


second_axis = pandas.DataFrame([
    ORIGIN,
    rotation_matrix[SECOND_ROW][:]],
                               columns=COLUMNS)
second_axis *= SECOND_SPREAD
first_axis_plot = first_axis.hvplot(x="x", y="y", color="red")
second_axis_plot = second_axis.hvplot(x="x", y="y", color="orange")
rotated_plot = rotated.hvplot.scatter(x="x", y="y", color=Plot.blue)

plot = (rotated_plot * first_axis_plot * second_axis_plot).opts(
    title="Rotated Normal Data",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale,
)
outcome = Embed(plot=plot, file_name="rotated_normal_data")()
print(outcome)

Figure Missing

So our data is now grouped around a 45-degree angle and spread further along the axis that had more variance.
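
A quick numerical check (a sketch using the data and rotated frames from above): the original x and y were generated independently, so their correlation should be near zero, while the rotated columns should be strongly correlated.

print(data.corr().round(3))
print(rotated.corr().round(3))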

Apply the PCA

pca = PCA(n_components=2)
fitted = pca.fit(rotated)

Once again, the Eigenvectors (the transformation matrix).

print(fitted.components_)
[[-0.70844626 -0.70576476]
 [-0.70576476  0.70844626]]

And then the Eigenvalues (the variance).

variance = fitted.explained_variance_
print(variance)
[1.05270169 0.10604603]

Now we apply the PCA transformation.

pca_data = fitted.transform(rotated)
pca_data = pandas.DataFrame(pca_data, columns="x y".split())

Plot the PCA Transformed Data

We're going to plot the rotated data and the PCA-transformed data along with the axes for the rotated data so we can compare them.

transformed = pca_data.hvplot.scatter(x="x", y="y", color=Plot.red, fill_alpha=0)
rotated_plot = rotated.hvplot.scatter(x="x", y="y", color=Plot.blue, fill_alpha=0)
first_axis_plot = first_axis.hvplot(x="x", y="y", color="red")
second_axis_plot = second_axis.hvplot(x="x", y="y", color="orange")

plot = (transformed * rotated_plot * first_axis_plot * second_axis_plot).opts(
    title="PCA of Random Normal Data",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale
)

outcome = Embed(plot=plot, file_name="pca_random_normal")()
print(outcome)

Figure Missing

Looking at the model

  • The rotation matrix took the original uncorrelated variables and transformed them into correlated variables (the blue circles).
  • Fitting the PCA to our correlated data finds the rotation matrix that was used to create the blue points.
  • Applying the PCA transformation undoes the rotation (but the spread doesn't return).

Our original standard deviations were 1 and 0.333, and the Explained Variance is roughly those standard deviations squared, so taking its square root gets us approximately back to them.

print(numpy.sqrt(variance))
[0.99140088 0.32958007]

Dimensionality Reduction

The previous sections were meant to build an understanding of what PCA is doing, but in practice we use PCA to reduce the number of dimensions of a data set so that it can be plotted. We can get a sense of how that works here by looking at our rotated data set with either all of the y values set to 0 or all of the x values set to 0.

first_component = rotated.copy()
first_component["y"] = 0
second_component = rotated.copy()
second_component["x"] = 0

original = rotated.hvplot.scatter(x="x", y="y", color=Plot.tan,
                                  fill_alpha=0)
first = first_component.hvplot.scatter(x="x", y="y",
                                       color=Plot.blue, fill_alpha=0)
second = second_component.hvplot.scatter(x="x", y="y",
                                         color=Plot.red, fill_alpha=0)

plot = (original * first * second).opts(
    title="Data Decomposition",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.fontscale
)
outcome = Embed(plot=plot, file_name="data_decomposed")()
print(outcome)

Figure Missing

This is only a teaser for doing an actual dimensionality reduction.
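
To make the teaser slightly more concrete, here's a minimal sketch (re-using the rotated frame and sklearn's PCA) of an actual reduction, keeping only the first principal component so the two columns collapse to one.

reducer = PCA(n_components=1)
one_dimensional = reducer.fit_transform(rotated)

# one column left, and it holds most of the variance
print(one_dimensional.shape)
print(reducer.explained_variance_ratio_)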

End

This is a walk-through of a lab for Coursera's NLP Specialization.

Word Embeddings

Beginning

This is a walk-through of a lab for week 3 of Coursera's Natural Language Processing course. It uses some pretrained word embeddings to develop a sense of how to work with them.

Set Up

Imports

# python
from functools import partial
from pathlib import Path

import os
import pickle

# pypi
from dotenv import load_dotenv
from expects import (
    equal,
    expect,
)

import hvplot.pandas
import numpy
import pandas

# my stuff
from graeae import EmbedHoloviews

Plotting

load_dotenv("posts/nlp/.env")
SLUG = "word-embeddings"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/nlp/{SLUG}")
plot_path = Path(os.environ["TWITTER_PLOT"])
assert plot_path.is_file()
with plot_path.open("rb") as reader:
    Plot = pickle.load(reader)

The Embeddings

Like I mentioned above, I'm going to use pre-trained word embeddings that have been pickled so I'll load them here.

path = Path(os.environ["WORD_EMBEDDINGS"])
with path.open("rb") as reader:
    embeddings = pickle.load(reader)
expect(len(embeddings)).to(equal(243))

Middle

Inspecting the Embeddings

The embeddings object is a dictionary mapping words to the word-vectors that represent them. Here are the first five words.

print(type(embeddings))
print(list(embeddings.keys())[:5])
<class 'dict'>
['country', 'city', 'China', 'Iraq', 'oil']
vector = embeddings["country"]
print(type(vector))
print(vector.shape)
<class 'numpy.ndarray'>
(300,)

Each word-embedding vector has 300 entries.

Plotting

Since there are 300 columns you can't easily visualize them without PCA or some other dimensionality-reduction method, but this is more about getting an intuition for how the linear algebra works, so instead we're going to take just two of the columns for a subset of words so that we can plot them.

words = ['oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful']
plot_data = pandas.DataFrame([embeddings[word] for word in words])
plot_columns = [3, 2]
plot_data = plot_data[plot_columns]
plot_data.columns = ["x", "y"]
plot_data["Word"] = words
origins = plot_data * 0
origins["Word"] = words
combined_plot_data = pandas.concat([origins, plot_data])

segment_plot = combined_plot_data.hvplot(x="x", y="y", by="Word")
scatter_plot = plot_data.hvplot.scatter(x="x", y="y", by="Word")

plot = (segment_plot * scatter_plot).opts(
    title="Embeddings Columns 3 and 2",
    width=Plot.width,
    height=Plot.height,
    fontscale=Plot.font_scale
)
outcome = Embed(plot=plot, file_name="embeddings_segments")()
print(outcome)

Figure Missing

You can see that words like "village" and "town" are similar while "city" and "oil" point in opposite directions for whatever reason. Oddly, "joyful" and "country" are also very similar (although I'm only looking at two out of three hundred columns, so that might not hold once the other columns come into play).

Word Distance

This is supposed to be a visualization of the difference vectors between "sad" and "happy" and between "town" and "village", but as far as I can see holoviews doesn't have an equivalent of matplotlib's arrow, which lets you draw an arrow from a base coordinate and a distance in each dimension, so it's kind of a fake version where I use the points directly. Oh, well.

words = ['sad', 'happy', 'town', 'village']
plot_data = pandas.DataFrame([embeddings[word] for word in words])
plot_data = plot_data[plot_columns]
plot_data.columns = ["x", "y"]
plot_data.index = words

This is the fake part - when you take the difference between two "points" it gives you a vector with the base at the origin so you have to add the base point back in to move it from the origin, but then all you're doing is undoing the subtraction, giving you what you started with.
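
Here's a tiny sketch of that point using the plot_data frame built above: adding the base back in just recreates the original point.

difference_vector = plot_data.loc["happy"] - plot_data.loc["sad"]
print(numpy.allclose(difference_vector + plot_data.loc["sad"],
                     plot_data.loc["happy"]))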

difference = pandas.DataFrame([
    plot_data.loc["happy"] - plot_data.loc["sad"] + plot_data.loc["sad"],
    plot_data.loc["town"] - plot_data.loc["village"] + plot_data.loc["village"]
])

difference["Word"] = ["sad", "village"]
plot_data = plot_data.reset_index().rename(columns=dict(index="Word"))

difference = pandas.concat([difference,
                            plot_data[plot_data.Word=="sad"],
                            plot_data[plot_data.Word=="village"]])


with_origin = pandas.concat([origins[origins.Word.isin(words)], plot_data])
scatter = plot_data.hvplot.scatter(x="x", y="y", by="Word")
segments = with_origin.hvplot(x="x", y="y", by="Word")
distances = difference.hvplot(x="x", y="y", by="Word")

plot = (distances * segments * scatter).opts(
    title="Vector Differences",
    height=Plot.height,
    width=Plot.width,
    fontscale=Plot.font_scale,
)

outcome = Embed(plot=plot, file_name="vector_differences")()
print(outcome)

Figure Missing

Linear Algebra on Word Embeddings

The norm

First I'll check out the norm of some word vectors using numpy.linalg.norm. This calculates the Euclidean length of a vector (taking the norm of the difference between two vectors gives their Euclidean distance), but oddly we won't use it here.

print(numpy.linalg.norm(embeddings["town"]))
print(numpy.linalg.norm(embeddings["sad"]))
2.3858097
2.9004838

Predicting Capitals

Here we'll see how to use the embeddings to predict what country a city is the capital of. To encode the concept of "capital" into a vector we'll use the difference between a specific country and its real capital (in this case France and Paris).

capital = embeddings["France"] - embeddings["Paris"]

Now that we have the concept of a capital encoded as a word embedding we can add it to the embedding of "Madrid" to get a vector near where "Spain" would be. Note that although there is a "Spain" in the embeddings we're going to use this to see if we can find it without knowing that Madrid is the capital of Spain.

country = embeddings["Madrid"] + capital

To make a prediction we have to find the embedding that's closest to our constructed country vector. We're going to convert the embeddings to a pandas DataFrame, and since our embeddings are a dictionary of arrays we'll have to do a little unpacking first.

keys = embeddings.keys()
embeddings = pandas.DataFrame([embeddings[key] for key in keys], index=keys)

Now we'll make a function to find the closest embeddings for a word vector.

def closest_word(vector: numpy.ndarray) -> str:
    """Find the word closest to a given vector

    Args:
     vector: the vector to match

    Returns:
     name of the closest embedding
    """
    differences = embeddings - vector
    expect(differences.shape).to(equal(embeddings.shape))

    distances = (differences**2).sum(axis="columns")
    expect(distances.shape).to(equal((len(differences),)))

    return embeddings.iloc[numpy.argmin(distances)].name

Now we can check what word most closely matches Madrid + (France - Paris).

print(closest_word(country))
Spain

Like magic.

More Countries

What happens if we use a different known country and its capital instead of France and Paris?

print(closest_word(embeddings.loc['Italy'] - embeddings.loc['Rome']
                   + embeddings.loc['Madrid']))
Spain

So swapping the capital derivation didn't change the prediction. Now we'll go back to using France - Paris but try different cities.

for word in "Tokyo Moscow".split():
    print(f"{word} is the capital of {closest_word(embeddings.loc[word] + capital)}")
Tokyo is the capital of Japan
Moscow is the capital of Russia

That seems to be working, but here's a case where our search fails.

print(closest_word(embeddings.loc['Lisbon'] + capital))
Lisbon

For some reason "Lisbon" is closer to itself than to Portugal. I tried it with Germany and Italy instead of France as the template capital, but it still didn't work. If you try random cities from the embeddings you'll see that a fair number of them fail.
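
Here's one way to spot-check a handful of pairs at once (a sketch; the city-country pairs are my own picks and assume those words are all in the embeddings).

for city, country in (("Madrid", "Spain"), ("Tokyo", "Japan"),
                      ("Moscow", "Russia"), ("Lisbon", "Portugal")):
    prediction = closest_word(embeddings.loc[city] + capital)
    verdict = "correct" if prediction == country else f"wrong (predicted {prediction})"
    print(f"{city} -> {country}: {verdict}")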

Sentence Vectors

To use this for sentences you stack the vectors for each word in the sentence into a matrix and then sum each column to get back to a single vector.

sentence = "Canada oil city town".split()
vectors = [embeddings.loc[token] for token in sentence]
summed = numpy.sum(vectors, axis=0)
print(closest_word(summed))
city

Not exciting, but that's how you do it.

Tweet Classifier Class

Beginning

I implemented the Logistic Regression Tweet Sentiment Analysis classifier in this post but I'm going to re-use it later so this just gathers everything together. There's already a class called TweetSentiment but I'm going to add the training to this one as well as the tweet pre-processing and vectorization.

Middle

We'll start with the imports.

# from pypi
import attr
import numpy

# this project
from .counter import WordCounter
from .sentiment import TweetSentiment
from .vectorizer import TweetVectorizer

The Logistic Regression Class

@attr.s(auto_attribs=True)
class LogisticRegression:
    """train and predict tweet sentiment

    Args:
     iterations: number of times to run gradient descent
     learning_rate: how fast to change the weights during training
    """
    iterations: int
    learning_rate: float
    _weights: numpy.array = None
    loss: float=None

Weights

These are the weights for the regression function (\(\theta\)).

@property
def weights(self) -> numpy.array:
    """The weights for the regression

    Initially this will be an array of zeros.
    """
    if self._weights is None:
        self._weights = numpy.zeros((3, 1))
    return self._weights

The Weights Setter

@weights.setter
def weights(self, new_weights: numpy.array) -> None:
    """Set the weights to a new value"""
    self._weights = new_weights
    return

Sigmoid
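
The logistic function being computed below is

\[ \sigma(z) = \frac{1}{1 + e^{-z}} \]

applied element-wise to its input array.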

def sigmoid(self, vectors: numpy.ndarray) -> float:
    """Calculates the logistic function value

    Args:
     vectors: a matrix of bias, positive, and negative word counts

    Returns:
     array of probabilities that the tweets are positive
    """
    return 1/(1 + numpy.exp(-vectors))

This is the training function.
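
The update it performs is batch gradient descent on the mean cross-entropy loss:

\[ J = -\frac{1}{m}\left(\mathbf{y}^T \log(\mathbf{\hat{y}}) + (1 - \mathbf{y})^T \log(1 - \mathbf{\hat{y}})\right) \]

\[ \theta \gets \theta - \frac{\alpha}{m} X^T (\mathbf{\hat{y}} - \mathbf{y}) \]

(in the code the \(\frac{1}{m}\) is folded into the learning rate once, before the loop starts).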

def gradient_descent(self, x: numpy.ndarray, y: numpy.ndarray):
    """Finds the weights for the model

    Args:
     x: the tweet vectors
     y: the positive/negative labels
    """
    assert len(x) == len(y)
    rows = len(x)
    self.learning_rate /= rows
    for iteration in range(self.iterations):
        y_hat = self.sigmoid(x.dot(self.weights))
        # average loss
        loss = numpy.squeeze(-((y.T.dot(numpy.log(y_hat))) +
                               (1 - y.T).dot(numpy.log(1 - y_hat))))/rows
        gradient = ((y_hat - y).T.dot(x)).sum(axis=0, keepdims=True)
        self.weights -= self.learning_rate * gradient.T
    return loss

Fit

This is mostly an alias to make it match (somewhat) sklearn's methods.

def fit(self, x_train: numpy.ndarray, y_train:numpy.ndarray) -> float:
    """fits the weights for the logistic regression

    Note:
     as a side effect this also sets the counter and loss attributes

    Args:
     x_train: the training tweets
     y_train: the training labels

    Returns:
     The final mean loss (which is also saved as the =.loss= attribute)
    """
    self.counter = WordCounter(x_train, y_train)
    vectorizer = TweetVectorizer(x_train, self.counter.counts, processed=False)
    y = y_train.values.reshape((-1, 1))
    self.loss = self.gradient_descent(vectorizer.vectors, y)
    return self.loss

Predict

def predict(self, x: numpy.ndarray) -> numpy.ndarray:
    """Predict the labels for the inputs

    Args:
     x: a list or array of tweets

    Returns:
     array of predicted labels for the tweets
    """
    vectorizer = TweetVectorizer(x, self.counter.counts, processed=False)
    sentimenter = TweetSentiment(vectorizer, self.weights)
    return sentimenter()

Score

def score(self, x: numpy.ndarray, y: numpy.ndarray) -> float:
    """Get the mean accuracy

    Args:
     x: array of tweets
     y: labels for the tweets

    Returns:
     mean accuracy
    """
    predictions = self.predict(x)
    correct = sum(predictions.T[0] == y)
    return correct/len(x)

End

Testing it out.

# python
from argparse import Namespace
from pathlib import Path

import math
import os

# pypi
from dotenv import load_dotenv
from expects import (
    be_true,
    expect
)

import pandas

# this project
from neurotic.nlp.twitter.logistic_regression import LogisticRegression
load_dotenv("posts/nlp/.env")

train_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TRAINING_RAW"]).expanduser())

test_raw = pandas.read_feather(
    Path(os.environ["TWITTER_TEST_RAW"]).expanduser()
)


Settings = Namespace(
    eta = 1e-9,
    iterations = 1500
)
model = LogisticRegression(iterations=Settings.iterations,
                           learning_rate=Settings.eta)
model.fit(x_train=train_raw.tweet, y_train=train_raw.label)
expected = 0.22043072
expect(math.isclose(model.loss, expected, rel_tol=1e-7)).to(be_true)
accuracy = model.score(test_raw.tweet, test_raw.label)
print(f"Accuracy: {accuracy}")
Accuracy: 0.996