Word Embeddings: Shakespeare Data
Beginning
This is the first part of a series on building word embeddings using a Continuous Bag of Words (CBOW) model. There's an overview post that has links to all of the posts in the series.
Imports
# python
import os
import random
import re
# pypi
from expects import equal, expect
Middle
We're going to be using the same dataset that we used in building the autocorrect system.
A Little Cleaning
Imports
# python
from pathlib import Path
import os
import re
# pypi
from dotenv import load_dotenv
import attr
import nltk
The Cleaner
@attr.s(auto_attribs=True)
class DataCleaner:
"""A cleaner for the word-embeddings data
Args:
key: environment key with path to the data file
env_path: path to the .env file
"""
key: str="SHAKESPEARE"
env_path: str="posts/nlp/.env"
stop: str="."
_data_path: str=None
_data: str=None
_unpunctuated: str=None
_punctuation: re.Pattern=None
_tokens: list=None
_processed: list=None
- The Path To the Data
@property
def data_path(self) -> Path:
    """The path to the data file"""
    if self._data_path is None:
        load_dotenv(self.env_path)
        self._data_path = Path(os.environ[self.key]).expanduser()
    return self._data_path
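As an aside, the path lookup relies on python-dotenv loading the .env file into the environment before the key is read. Here's a minimal sketch of that mechanism, using the same key but a hypothetical .env entry and path (not the real repository layout).
# python
from pathlib import Path
import os

# pypi
from dotenv import load_dotenv

# hypothetical .env entry: SHAKESPEARE=~/data/shakespeare.txt
load_dotenv("posts/nlp/.env")

# after load_dotenv the key is available like any other environment variable
path = Path(os.environ["SHAKESPEARE"]).expanduser()
print(path.is_file())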
- The Data
@property
def data(self) -> str:
    """The data-file read in as a string"""
    if self._data is None:
        with self.data_path.open() as reader:
            self._data = reader.read()
    return self._data
- The Punctuation Expression
@property
def punctuation(self) -> re.Pattern:
    """The regular expression to find punctuation"""
    if self._punctuation is None:
        self._punctuation = re.compile("[,!?;-]")
    return self._punctuation
- The Un-Punctuated
@property
def unpunctuated(self) -> str:
    """The data with punctuation replaced by stop"""
    if self._unpunctuated is None:
        self._unpunctuated = self.punctuation.sub(self.stop, self.data)
    return self._unpunctuated
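To see what the substitution is doing, here's a small standalone check of the same pattern on a made-up line (the sample text is just for illustration, it isn't from the dataset).
# python
import re

# the same character class that the cleaner compiles
punctuation = re.compile("[,!?;-]")

sample = "O brave new world, that has such people in it!"
print(punctuation.sub(".", sample))
# O brave new world. that has such people in it.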
- The Tokens
We're going to use NLTK's word_tokenize function to tokenize the string.
@property
def tokens(self) -> list:
    """The tokenized data"""
    if self._tokens is None:
        self._tokens = nltk.word_tokenize(self.unpunctuated)
    return self._tokens
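If you haven't used word_tokenize before, this little example shows how it splits punctuation off into its own tokens (you may need to download NLTK's punkt tokenizer data first, and the sample sentence is just an illustration).
# python
import nltk

# the tokenizer data may need to be downloaded once
# nltk.download("punkt")

print(nltk.word_tokenize("O for a Muse of fire."))
# ['O', 'for', 'a', 'Muse', 'of', 'fire', '.']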
- The Processed Tokens
The final processed data will be all lowercased words and periods only.
@property
def processed(self) -> list:
    """The final processed tokens"""
    if self._processed is None:
        self._processed = [token.lower() for token in self.tokens
                           if token.isalpha() or token == self.stop]
    return self._processed
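The filtering is just str.isalpha plus a check for the stop token, so anything with digits or leftover punctuation gets dropped. Here's a quick check on a hand-made token list (not the real data).
# python
STOP = "."
tokens = ["O", "for", "a", "Muse", "of", "fire", ".", "2", "--"]
processed = [token.lower() for token in tokens
             if token.isalpha() or token == STOP]
print(processed)
# ['o', 'for', 'a', 'muse', 'of', 'fire', '.']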
The Counter
@attr.s(auto_attribs=True)
class MetaData:
"""Compile some basic data about the data
Args:
data: the cleaned and tokenized data
"""
data: list
_distribution: nltk.probability.FreqDist=None
_vocabulary: tuple=None
_word_to_index: dict=None
- The Frequency Distribution
According to the doc-string, the FreqDist is meant to hold outcomes from experiments. It looks like a Counter with extra methods added.
@property
def distribution(self) -> nltk.probability.FreqDist:
    """The Token Frequency Distribution"""
    if self._distribution is None:
        self._distribution = nltk.FreqDist(self.data)
    return self._distribution
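To make the Counter comparison concrete, here's a toy example showing that FreqDist supports the same most_common interface (the token list is made up).
# python
from collections import Counter

# pypi
import nltk

tokens = ["the", "play", "the", "thing", "the", "play"]
distribution = nltk.FreqDist(tokens)
print(distribution.most_common(2))
# [('the', 3), ('play', 2)]
print(Counter(tokens).most_common(2))
# [('the', 3), ('play', 2)]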
- The Vocabulary
@property
def vocabulary(self) -> tuple:
    """The sorted unique tokens in the data"""
    if self._vocabulary is None:
        self._vocabulary = tuple(sorted(set(self.data)))
    return self._vocabulary
- The Word-To-Index Mapping
@property
def word_to_index(self) -> dict:
    """Maps words to their index in the vocabulary"""
    if self._word_to_index is None:
        self._word_to_index = {word: index
                               for index, word in enumerate(self.vocabulary)}
    return self._word_to_index
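Since the vocabulary is sorted and the mapping is built by enumerating it, a word's position in the vocabulary and its value in word_to_index should always agree. A small sketch of the same construction on a toy token list:
# python
tokens = ["the", "play", "the", "thing"]
vocabulary = tuple(sorted(set(tokens)))
word_to_index = {word: index for index, word in enumerate(vocabulary)}
print(vocabulary)
# ('play', 'the', 'thing')
print(word_to_index["thing"] == vocabulary.index("thing"))
# True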
The Cleaned
from neurotic.nlp.word_embeddings import DataCleaner
cleaner = DataCleaner()
print(cleaner.unpunctuated[:50])
print(cleaner.tokens[:10])
print(cleaner.processed[:10])
print(f"Tokens: {len(cleaner.processed):,}")
O for a Muse of fire. that would ascend The bright
['O', 'for', 'a', 'Muse', 'of', 'fire', '.', 'that', 'would', 'ascend']
['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend']
Tokens: 60,996
The Data Data
from neurotic.nlp.word_embeddings import MetaData
counter = MetaData(cleaner.processed)
print(f"Size of vocabulary: {len(counter.distribution):,}")
for token in counter.distribution.most_common(20):
print(f" - {token}")
words = len(counter.distribution)
expect(len(counter.vocabulary)).to(equal(words))
expect(len(counter.word_to_index)).to(equal(words))
print(f"Size of the Vocabulary: {len(counter.vocabulary):,}")
index = random.randrange(words)
word = counter.vocabulary[index]
expect(index).to(equal(counter.word_to_index[word]))
Size of vocabulary: 5,778
 - ('.', 9630)
 - ('the', 1521)
 - ('and', 1394)
 - ('i', 1257)
 - ('to', 1159)
 - ('of', 1093)
 - ('my', 857)
 - ('that', 781)
 - ('in', 770)
 - ('a', 752)
 - ('you', 748)
 - ('is', 630)
 - ('not', 559)
 - ('for', 467)
 - ('it', 460)
 - ('with', 441)
 - ('his', 434)
 - ('but', 417)
 - ('me', 417)
 - ('your', 397)
Size of the Vocabulary: 5,778
End
Now that we have the data set up, it's time to build and train the model.