Autocorrect System: Data Preprocessing

Beginning

This is part of a series that builds an autocorrect system. The series is introduced in a separate post.

Imports

# python
from collections import Counter
from pathlib import Path

import math
import os
import re

# pypi
from dotenv import load_dotenv

Set Up

The Environment

This loads our environment variables.

load_dotenv("posts/nlp/.env", override=True)

Middle

Our corpus is going to come from a text file with some plays of Shakespeare in it.

The Process Data Function

def process_data(file_name: str) -> list:
    """Read a text file and return its words, lower-cased, in file order.

    Note:
     The original assignment assumes the file will be in the same directory as
     the code - so the argument is called file_name, but it's really the path
     to the file.

    Args:
       file_name: path to the text file holding the corpus

    Returns:
       words: list of all words in the corpus (text file you read) lower-cased
    """
    # Raw string: a backslash-w inside a plain string literal is an invalid
    # escape sequence, which newer Python versions warn about. Compiling once
    # also keeps the pattern lookup out of the per-line loop.
    word_pattern = re.compile(r"\w+")
    words = []

    with open(file_name) as lines:
        for line in lines:
            # the pattern only matches word characters, so the tokens never
            # carry surrounding whitespace and need no stripping
            words.extend(token.lower() for token in word_pattern.findall(line))

    return words
# Build the word list from the file named by the SHAKESPEARE environment variable
words = process_data(os.environ["SHAKESPEARE"])
vocabulary = set(words)  # this will be your new vocabulary
first_ten = words[:10]
print(f"The first ten words in the text are: \n{first_ten}")
print(f"There are {len(vocabulary)} unique words in the vocabulary.")

# Check the opening words and the vocabulary size against the expected values
expected = "o for a muse of fire that would ascend the"
actual = " ".join(first_ten)
assert expected == actual, actual
assert len(vocabulary) == 6116
The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.

Get Count

This creates a dictionary of word: count pairs.

def get_count(word_l: list) -> Counter:
    """Tally how often each word occurs in the corpus

    Args:
       word_l: the corpus as a list of words

    Returns:
       a Counter mapping each distinct word to its number of occurrences
    """
    return Counter(word_l)
# Count the corpus words and check the result against the expected values
word_counter = get_count(words)
print(f"There are {len(word_counter)} key values pairs")
print(f"The count for the word 'thee' is {word_counter['thee']}")
# one entry per distinct word
assert len(word_counter) == 6116
assert word_counter['thee'] == 240
There are 6116 key values pairs
The count for the word 'thee' is 240

Get Probability

Given the dictionary of word counts, compute the probability that each word will appear if randomly selected from the corpus of words.

\[ P(w_i) = \frac{C(w_i)}{M} \tag{Equation-2} \]

where

\(C(w_i)\) is the total number of times \(w_i\) appears in the corpus.

M is the total number of words in the corpus.

For example, the probability of the word 'am' in the sentence 'I am happy because I am learning' is:

\[ P(am) = \frac{C(am)}{M} = \frac{2}{7} \tag{Equation-3} \]

def get_probs(word_count_dict: Counter) -> dict:
    """Converts word counts into relative frequencies

    Args:
      word_count_dict: word:frequency dictionary

    Returns:
      probs: dictionary mapping each word to its count divided by the
        sum of all the counts
    """
    corpus_size = sum(word_count_dict.values())
    return {word: count/corpus_size
            for word, count in word_count_dict.items()}
# Turn the counts into probabilities and check them against the expected values
probabilities = get_probs(word_counter)
print(f"Length of probabilities is {len(probabilities)}")
thee_probability = probabilities["thee"]
print(f"P('thee') is {thee_probability:.4f}")
assert len(probabilities) == 6116
# compared with a tolerance since the expected value is rounded
assert math.isclose(thee_probability, 0.0045, abs_tol=1e-04), thee_probability
Length of probabilities is 6116
P('thee') is 0.0045

End

Now that we have the skeleton, I'll put it all into a class to make it easier to use in another notebook.

Imports

# python
from collections import Counter
from pathlib import Path

import math
import os
import re

# pypi
import attr

Corpus Builder

@attr.s(auto_attribs=True)
class CorpusBuilder:
    """Builds the autocorrect corpus counts

    Args:
     path: Path to the corpus source file
    """
    path: Path
    # Backing fields for the words, counts, probabilities and vocabulary
    # properties; each defaults to None, meaning "not built yet".
    _words: list=None
    _counts: Counter=None
    _probabilities: dict=None
    _vocabulary: set=None

Corpus Words

@property
def words(self) -> list:
    """
    The processed words from the source file

    The file is read the first time this is accessed and the resulting
    list is kept on the instance for later accesses.

    Returns:
      words: list of all words in the corpus lower-cased
    """
    if self._words is None:
        # Raw string: a backslash-w inside a plain string literal is an
        # invalid escape sequence, which newer Python versions warn about.
        word_pattern = re.compile(r"\w+")
        with self.path.open() as lines:
            # the pattern only matches word characters, so the matches
            # never need stripping
            self._words = [word.lower()
                           for line in lines
                           for word in word_pattern.findall(line)]
    return self._words

Corpus Word Counts

@property
def counts(self) -> Counter:
    """Word frequencies for the corpus, built on first access

    Returns:
     a Counter mapping each word to the number of times it appears
    """
    if self._counts is not None:
        return self._counts
    self._counts = Counter(self.words)
    return self._counts

Corpus Word Probabilities

@property
def probabilities(self) -> dict:
    """Relative frequency of every word in the corpus, built on first access

    Returns:
     dictionary mapping each word to its count divided by the sum of
     all the counts
    """
    if self._probabilities is not None:
        return self._probabilities
    tallies = self.counts
    corpus_size = sum(tallies.values())
    self._probabilities = {word: tally/corpus_size
                           for word, tally in tallies.items()}
    return self._probabilities

Vocabulary

The final code is going to use set operations so for convenience I'll duplicate the words as a set.

@property
def vocabulary(self) -> set:
    """The distinct corpus words as a set, built on first access"""
    if self._vocabulary is not None:
        return self._vocabulary
    self._vocabulary = set(self.words)
    return self._vocabulary

Testing the Corpus

from neurotic.nlp.autocorrect.preprocessing import CorpusBuilder

# The corpus file is named by the SHAKESPEARE environment variable
path = Path(os.environ["SHAKESPEARE"])
assert path.is_file()

corpus = CorpusBuilder(path)

words = corpus.words
vocabulary = corpus.vocabulary  # this will be your new vocabulary
first_ten = words[:10]
print(f"The first ten words in the text are: \n{first_ten}")
print(f"There are {len(vocabulary)} unique words in the vocabulary.")

# Same checks that were applied to the output of process_data
expected = "o for a muse of fire that would ascend the"
actual = " ".join(first_ten)
assert expected == actual, actual
assert len(vocabulary) == 6116
The first ten words in the text are: 
['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the']
There are 6116 unique words in the vocabulary.
# Same checks that were applied to the output of get_count
word_counter = corpus.counts
print(f"There are {len(word_counter)} key values pairs")
print(f"The count for the word 'thee' is {word_counter['thee']}")
assert len(word_counter) == 6116
assert word_counter['thee'] == 240
There are 6116 key values pairs
The count for the word 'thee' is 240
# Same checks that were applied to the output of get_probs
probabilities = corpus.probabilities
print(f"Length of probabilities is {len(probabilities)}")
thee_probability = probabilities["thee"]
print(f"P('thee') is {thee_probability:.4f}")
assert len(probabilities) == 6116
# compared with a tolerance since the expected value is rounded
assert math.isclose(thee_probability, 0.0045, abs_tol=1e-04), thee_probability
Length of probabilities is 6116
P('thee') is 0.0045

So, now we have a corpus builder. In the next part - Autocorrect System: Edits - we'll implement some functions to help with creating candidate replacements using edits.