Word Embeddings: Shakespeare Data
Beginning
This is the first part of a series on building word embeddings using a Continuous Bag of Words (CBOW) model. There's an overview post that has links to all of the posts in the series.
Imports
# python
import os
import random
import re
# pypi
from expects import equal, expect
Middle
We're going to be using the same dataset that we used in building the autocorrect system.
A Little Cleaning
Imports
# python
from pathlib import Path
import os
import re
# pypi
from dotenv import load_dotenv
import attr
import nltk
The Cleaner
@attr.s(auto_attribs=True)
class DataCleaner:
"""A cleaner for the word-embeddings data
Args:
key: environment key with path to the data file
env_path: path to the .env file
"""
key: str="SHAKESPEARE"
env_path: str="posts/nlp/.env"
stop: str="."
_data_path: str=None
_data: str=None
_unpunctuated: str=None
_punctuation: re.Pattern=None
_tokens: list=None
_processed: list=None
- The Path To the Data
@property
def data_path(self) -> Path:
    """The path to the data file"""
    if self._data_path is None:
        load_dotenv(self.env_path)
        self._data_path = Path(os.environ[self.key]).expanduser()
    return self._data_path
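As an aside, the path lookup relies on python-dotenv loading the .env file into the environment before the key is read. Here's a minimal sketch of that mechanism, using the same key but a hypothetical .env entry and path (not the real repository layout).
# python
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv

# hypothetical .env entry: SHAKESPEARE=~/data/shakespeare.txt
load_dotenv("posts/nlp/.env")

# after load_dotenv the key is available like any other environment variable
path = Path(os.environ["SHAKESPEARE"]).expanduser()
print(path.is_file())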
- The Data
@property
def data(self) -> str:
    """The data-file read in as a string"""
    if self._data is None:
        with self.data_path.open() as reader:
            self._data = reader.read()
    return self._data
- The Punctuation Expression
@property
def punctuation(self) -> re.Pattern:
    """The regular expression to find punctuation"""
    if self._punctuation is None:
        self._punctuation = re.compile("[,!?;-]")
    return self._punctuation
- The Un-Punctuated
@property
def unpunctuated(self) -> str:
    """The data with punctuation replaced by stop"""
    if self._unpunctuated is None:
        self._unpunctuated = self.punctuation.sub(self.stop, self.data)
    return self._unpunctuated
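To see what the substitution is doing, here's a small standalone check of the same pattern on a made-up line (the sample text is just for illustration, it isn't from the dataset).
# python
import re

# the same character class that the cleaner compiles
punctuation = re.compile("[,!?;-]")

sample = "O brave new world, that has such people in it!"
print(punctuation.sub(".", sample))
# O brave new world. that has such people in it.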
- The Tokens
We're going to use NLTK's word_tokenize function to tokenize the string.
@property
def tokens(self) -> list:
    """The tokenized data"""
    if self._tokens is None:
        self._tokens = nltk.word_tokenize(self.unpunctuated)
    return self._tokens
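If you haven't used word_tokenize before, this little example shows how it splits punctuation off into its own tokens (you may need to download NLTK's punkt tokenizer data first, and the sample sentence is just an illustration).
# python
import nltk

# the tokenizer data may need to be downloaded once
# nltk.download("punkt")

print(nltk.word_tokenize("O for a Muse of fire."))
# ['O', 'for', 'a', 'Muse', 'of', 'fire', '.']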
- The Processed Tokens
The final processed data will be all lowercased words and periods only.
@property
def processed(self) -> list:
    """The final processed tokens"""
    if self._processed is None:
        self._processed = [token.lower() for token in self.tokens
                           if token.isalpha() or token == self.stop]
    return self._processed
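The filtering is just str.isalpha plus a check for the stop token, so anything with digits or leftover punctuation gets dropped. Here's a quick check on a hand-made token list (not the real data).
# python
STOP = "."
tokens = ["O", "for", "a", "Muse", "of", "fire", ".", "2", "--"]
processed = [token.lower() for token in tokens
             if token.isalpha() or token == STOP]
print(processed)
# ['o', 'for', 'a', 'muse', 'of', 'fire', '.']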
The Counter
@attr.s(auto_attribs=True)
class MetaData:
"""Compile some basic data about the data
Args:
data: the cleaned and tokenized data
"""
data: list
_distribution: nltk.probability.FreqDist=None
_vocabulary: tuple=None
_word_to_index: dict=None
- The Frequency Distribution
According to the doc-string, the FreqDist is meant to hold outcomes from experiments. It looks like a Counter with extra methods added.
@property
def distribution(self) -> nltk.probability.FreqDist:
    """The Token Frequency Distribution"""
    if self._distribution is None:
        self._distribution = nltk.FreqDist(self.data)
    return self._distribution
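To make the Counter comparison concrete, here's a toy example showing that FreqDist supports the same most_common interface (the token list is made up).
# python
from collections import Counter

# pypi
import nltk

tokens = ["the", "play", "the", "thing", "the", "play"]
distribution = nltk.FreqDist(tokens)
print(distribution.most_common(2))
# [('the', 3), ('play', 2)]
print(Counter(tokens).most_common(2))
# [('the', 3), ('play', 2)]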
- The Vocabulary
@property
def vocabulary(self) -> tuple:
    """The sorted unique tokens in the data"""
    if self._vocabulary is None:
        self._vocabulary = tuple(sorted(set(self.data)))
    return self._vocabulary
- The Word-To-Index Mapping
@property
def word_to_index(self) -> dict:
    """Maps words to their index in the vocabulary"""
    if self._word_to_index is None:
        self._word_to_index = {word: index
                               for index, word in enumerate(self.vocabulary)}
    return self._word_to_index
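Since the vocabulary is sorted and the mapping is built by enumerating it, a word's position in the vocabulary and its value in word_to_index should always agree. A small sketch of the same construction on a toy token list:
# python
tokens = ["the", "play", "the", "thing"]
vocabulary = tuple(sorted(set(tokens)))
word_to_index = {word: index for index, word in enumerate(vocabulary)}
print(vocabulary)
# ('play', 'the', 'thing')
print(word_to_index["thing"] == vocabulary.index("thing"))
# True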
The Cleaned
from neurotic.nlp.word_embeddings import DataCleaner
cleaner = DataCleaner()
print(cleaner.unpunctuated[:50])
print(cleaner.tokens[:10])
print(cleaner.processed[:10])
print(f"Tokens: {len(cleaner.processed):,}")
O for a Muse of fire. that would ascend The bright
['O', 'for', 'a', 'Muse', 'of', 'fire', '.', 'that', 'would', 'ascend']
['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend']
Tokens: 60,996
The Data Data
from neurotic.nlp.word_embeddings import MetaData
counter = MetaData(cleaner.processed)
print(f"Size of vocabulary: {len(counter.distribution):,}")
for token in counter.distribution.most_common(20):
print(f" - {token}")
words = len(counter.distribution)
expect(len(counter.vocabulary)).to(equal(words))
expect(len(counter.word_to_index)).to(equal(words))
print(f"Size of the Vocabulary: {len(counter.vocabulary):,}")
index = random.randrange(words)
word = counter.vocabulary[index]
expect(index).to(equal(counter.word_to_index[word]))
Size of vocabulary: 5,778
 - ('.', 9630)
 - ('the', 1521)
 - ('and', 1394)
 - ('i', 1257)
 - ('to', 1159)
 - ('of', 1093)
 - ('my', 857)
 - ('that', 781)
 - ('in', 770)
 - ('a', 752)
 - ('you', 748)
 - ('is', 630)
 - ('not', 559)
 - ('for', 467)
 - ('it', 460)
 - ('with', 441)
 - ('his', 434)
 - ('but', 417)
 - ('me', 417)
 - ('your', 397)
Size of the Vocabulary: 5,778
End
Now that we have the data set up, it's time to build and train the model.