Autocorrect System: Combining the Edits
Table of Contents
Beginning
This is a continuation of a series of posts that were introduced here. Now that we've implemented the string manipulations, we'll create two functions that, given a string, will return all the possible single and double edits on that string. These will be edit_one_letter()
and edit_two_letters()
.
Imports
# python
from pathlib import Path
import math
import os
# pypi
from dotenv import load_dotenv
# this repo
from neurotic.nlp.autocorrect.edits import TheEditor
from neurotic.nlp.autocorrect.preprocessing import CorpusBuilder
Set Up
The CorpusBuilder has both the vocabulary and the probability for each word in the vocabulary. It gets the path to the text-file with the vocabulary in it from an envirnoment variable named SHAKESPEARE
so we have to load that first.
load_dotenv("posts/nlp/.env", override=True)
path = Path(os.environ["SHAKESPEARE"])
And then build the corpus.
corpus = CorpusBuilder(path)
Middle
Combining the edits
Edit one letter
Instructions: Implement the edit_one_letter
function to get all the possible edits that are one edit away from a word. The edits consist of the replace, insert, delete, and optionally the switch operation. You should use the previous functions you have already implemented to complete this function. The 'switch' function is a less common edit function, so its use will be selected by an "allow_switches" input argument.
Note that those functions return lists while this function should return a python set. Utilizing a set eliminates any duplicate entries.
def edit_one_letter(word: str, allow_switches: bool=True) -> set:
"""Get all possible words one edit away from the original
Args:
word: word for which we will generate all possible words one edit away.
Returns:
edit_one_set: a set of words with one possible edit.
"""
edit_one_set = set()
editor = TheEditor(word)
edits = editor.replaced + editor.inserted + editor.deleted
if allow_switches:
edits += editor.switched
edit_one_set = set(edits)
return edit_one_set
tmp_word = "at"
tmp_edit_one_set = edit_one_letter(tmp_word)
# turn this into a list to sort it, in order to view it
tmp_edit_one_l = sorted(list(tmp_edit_one_set))
print(f"input word {tmp_word} \nedit_one_l \n{tmp_edit_one_l}\n")
print(f"The type of the returned object should be a set {type(tmp_edit_one_set)}")
print(f"Number of outputs from edit_one_letter('at') is {len(edit_one_letter('at'))}")
expected = ['a', 'aa', 'aat', 'ab', 'abt', 'ac', 'act', 'ad', 'adt', 'ae', 'aet', 'af', 'aft', 'ag', 'agt', 'ah', 'aht', 'ai', 'ait', 'aj', 'ajt', 'ak', 'akt', 'al', 'alt', 'am', 'amt', 'an', 'ant', 'ao', 'aot', 'ap', 'apt', 'aq', 'aqt', 'ar', 'art', 'as', 'ast', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz', 'au', 'aut', 'av', 'avt', 'aw', 'awt', 'ax', 'axt', 'ay', 'ayt', 'az', 'azt', 'bat', 'bt', 'cat', 'ct', 'dat', 'dt', 'eat', 'et', 'fat', 'ft', 'gat', 'gt', 'hat', 'ht', 'iat', 'it', 'jat', 'jt', 'kat', 'kt', 'lat', 'lt', 'mat', 'mt', 'nat', 'nt', 'oat', 'ot', 'pat', 'pt', 'qat', 'qt', 'rat', 'rt', 'sat', 'st', 't', 'ta', 'tat', 'tt', 'uat', 'ut', 'vat', 'vt', 'wat', 'wt', 'xat', 'xt', 'yat', 'yt', 'zat', 'zt']
assert tmp_edit_one_l == expected
assert len(edit_one_letter("at")) == 129
input word at edit_one_l ['a', 'aa', 'aat', 'ab', 'abt', 'ac', 'act', 'ad', 'adt', 'ae', 'aet', 'af', 'aft', 'ag', 'agt', 'ah', 'aht', 'ai', 'ait', 'aj', 'ajt', 'ak', 'akt', 'al', 'alt', 'am', 'amt', 'an', 'ant', 'ao', 'aot', 'ap', 'apt', 'aq', 'aqt', 'ar', 'art', 'as', 'ast', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz', 'au', 'aut', 'av', 'avt', 'aw', 'awt', 'ax', 'axt', 'ay', 'ayt', 'az', 'azt', 'bat', 'bt', 'cat', 'ct', 'dat', 'dt', 'eat', 'et', 'fat', 'ft', 'gat', 'gt', 'hat', 'ht', 'iat', 'it', 'jat', 'jt', 'kat', 'kt', 'lat', 'lt', 'mat', 'mt', 'nat', 'nt', 'oat', 'ot', 'pat', 'pt', 'qat', 'qt', 'rat', 'rt', 'sat', 'st', 't', 'ta', 'tat', 'tt', 'uat', 'ut', 'vat', 'vt', 'wat', 'wt', 'xat', 'xt', 'yat', 'yt', 'zat', 'zt'] The type of the returned object should be a set <class 'set'> Number of outputs from edit_one_letter('at') is 129
Edit two letters
Now you can generalize this to implement to get two edits on a word. To do so, you would have to get all the possible edits on a single word and then for each modified word, you would have to modify it again.
Instructions: Implement the edit_two_letters
function that returns a set of words that are two edits away. Note that creating additional edits based on the edit_one_letter
function may 'restore' some one_edits to zero or one edits. That is allowed here. This accounted for in get_corrections.
def edit_two_letters(word: str, allow_switches: bool=True) -> set:
"""Make two-letter edits
Args:
word: the input string/word
Returns:
edit_two_set: a set of strings with all possible two edits
"""
edit_two_set = set()
ones = edit_one_letter(word, allow_switches)
for word in ones:
edit_two_set = edit_two_set.union(edit_one_letter(word, allow_switches))
return edit_two_set
tmp_edit_two_set = edit_two_letters("a")
tmp_edit_two_l = sorted(list(tmp_edit_two_set))
twos = len(tmp_edit_two_l)
assert twos == 2654, twos
print(f"Number of strings with edit distance of two: {twos}")
first_ten = tmp_edit_two_l[:10]
assert first_ten == ['', 'a', 'aa', 'aaa', 'aab', 'aac', 'aad', 'aae', 'aaf', 'aag']
print(f"First 10 strings {first_ten}")
last_ten = tmp_edit_two_l[-10:]
assert last_ten == ['zv', 'zva', 'zw', 'zwa', 'zx', 'zxa', 'zy', 'zya', 'zz', 'zza']
print(f"Last 10 strings {last_ten}")
print(f"The data type of the returned object should be a set {type(tmp_edit_two_set)}")
actual = len(edit_two_letters('at'))
expected = 7154
assert expected == actual, actual
print(f"Number of strings that are 2 edit distances from 'at' is {actual}")
Number of strings with edit distance of two: 2654 First 10 strings ['', 'a', 'aa', 'aaa', 'aab', 'aac', 'aad', 'aae', 'aaf', 'aag'] Last 10 strings ['zv', 'zva', 'zw', 'zwa', 'zx', 'zxa', 'zy', 'zya', 'zz', 'zza'] The data type of the returned object should be a set <class 'set'> Number of strings that are 2 edit distances from 'at' is 7154
Suggest Spelling Corrections
Now you will use your edit_two_letters
function to get a set of all the possible 2 edits on your word. You will then use those strings to get the most probable word you meant to type aka your typing suggestion.
Instructions: Implement get_corrections
, which returns a list of zero to n possible suggestion tuples of the form (word, probability_of_word).
- Step 1: Generate suggestions for a supplied word: You'll use the edit functions you have developed. The 'suggestion algorithm' should follow this logic:
- If the word is in the vocabulary, suggest the word.
- Otherwise, if there are suggestions from
edit_one_letter
that are in the vocabulary, use those. - Otherwise, if there are suggestions from
edit_two_letters
that are in the vocabulary, use those. - Otherwise, suggest the input word.*
- The idea is that words generated from fewer edits are more likely than words with more edits.
Note:
- Edits of one or two letters may 'restore' strings to either zero or one edit. This algorithm accounts for this by preferentially selecting lower distance edits first.
- Short circuit
In Python, logical operations such as
and
andor
have two useful properties. They can operate on lists and they have ['short-circuit' behavior](https://docs.python.org/3/library/stdtypes.html). Try these:Example of logical operation on lists or sets.
print( [] and ["a","b"] ) print( [] or ["a","b"] ) #example of Short circuit behavior val1 = ["Most","Likely"] or ["Less","so"] or ["least","of","all"] # selects first, does not evalute remainder print(val1) val2 = [] or [] or ["least","of","all"] # continues evaluation until there is a non-empty list print(val2)
[] ['a', 'b'] ['Most', 'Likely'] ['least', 'of', 'all']
The logical
or
could be used to implement the suggestion algorithm very compactly. Alternately, if/then constructs could be used.Step 2: Create a 'best_words' dictionary where the 'key' is a suggestion and the 'value' is the probability of that word in your vocabulary. If the word is not in the vocabulary, assign it a probability of 0.
Step 3: Select the n best suggestions. There may be fewer than n.
edit_one_letter
andedit_two_letters
return python sets.- Sets have a handy set.intersection feature
- To find the keys that have the highest values in a dictionary, you can use the Counter dictionary to create a Counter object from a regular dictionary. Then you can use Counter.most_common(n) to get the n most common keys.
- To find the intersection of two sets, you can use set.intersection or the & operator.
- If you are not as familiar with short circuit syntax (as shown above), feel free to use if else statements instead.
- To use an if statement to check of a set is empty, use 'if not x:' syntax
def get_corrections(word: str, probs: dict, vocab: set, n: int=2, verbose: bool=False) -> list: """Gets corrections within n edits Args: word: a user entered string to check for suggestions probs: a dictionary that maps each word to its probability in the corpus vocab: a set containing all the vocabulary n: number of possible word corrections you want returned in the dictionary Returns: n_best: a list of tuples with the most probable n corrected words and their probabilities. """ suggestions = [] n_best = [] if word in vocab: n_best = [(word, probs[word])] else: suggestions = vocab.intersection(edit_one_letter(word)) if not suggestions: suggestions = vocab.intersection(edit_two_letters(word)) if suggestions: probabilities = list(reversed(sorted([(probs.get(suggestion, 0), suggestion) for suggestion in suggestions]))) n_best = [(word, probability) for (probability, word) in probabilities[:n]] if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions) return n_best
word = "dbadd" test = get_corrections(word, probs=corpus.probabilities, vocab=corpus.vocabulary, n=2, verbose=True) print(test)
entered word = dbadd suggestions = {'bade', 'band', 'add', 'dead', 'bad'} [('dead', 0.0006341627186928787), ('bad', 0.0002051702913418137)]
word = "days" test = get_corrections(word, probs=corpus.probabilities, vocab=corpus.vocabulary, n=2, verbose=True) assert len(test) == 1, test print(test)
entered word = days suggestions = [] [('days', 0.0004103405826836274)]
# Test your implementation - feel free to try other words in my word my_word = 'dys' tmp_corrections = get_corrections(my_word, corpus.probabilities, set(corpus.words), 2, verbose=True) # keep verbose=True for i, word_prob in enumerate(tmp_corrections): print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}") print(f"data type of corrections {type(tmp_corrections)}") expected = 0.000410 actual = tmp_corrections[0][1] assert math.isclose(expected, actual, abs_tol=1e-6), actual expected = 0.000019 actual = tmp_corrections[1][1] assert math.isclose(expected, actual, abs_tol=1e-6), actual
entered word = dys suggestions = {'days', 'dye'} word 0: days, probability 0.000410 word 1: dye, probability 0.000019 data type of corrections <class 'list'>
End
The next step is to write some code to find the Minimum Edit Distance needed to transform one word into another word.
A Suggestor
Imports
# pypi
import attr
# this repository
from neurotic.nlp.autocorrect.edits import TheEditor
The Suggestor
@attr.s(auto_attribs=True)
class WordSuggestor:
"""Suggests Words for Autocorrection
Args:
corpus: a Corpus Builder object
suggestions: number of suggestions to return for each word
want_switches: also do the =switch= edit
"""
corpus: object
suggestions: int=2
want_switches: bool=True
Edit One Letter
def one_letter_edits(self, word: str) -> set:
"""Get all possible words one edit away from the original
Args:
word: word for which we will generate all possible words one edit away.
Returns:
set of words with one possible edit.
"""
editor = TheEditor(word)
edits = editor.replaced + editor.inserted + editor.deleted
if self.want_switches:
edits += editor.switched
return set(edits)
Two-Letter Edits
def two_letter_edits(self, word: str) -> set:
"""Make two-letter edits
Args:
word: the input string/word
Returns:
set of strings with all possible two-letter edits
"""
ones = self.one_letter_edits(word)
return set.union(*(self.one_letter_edits(one) for one in ones))
The Call
def __call__(self, word: str) -> list:
"""Finds the closest words to the word
If the word is in our corpus then it just returns the word
Args:
word: potential word to correct
Returns:
list of (word, probability) tuples
"""
if word in self.corpus.vocabulary:
best = [(word, self.corpus.probabilities[word])]
else:
suggestions = self.corpus.vocabulary.intersection(self.one_letter_edits(word))
if not suggestions:
suggestions = self.corpus.vocabulary.intersection(self.two_letter_edits(word))
if suggestions:
probabilities = list(reversed(sorted(
[(self.corpus.probabilities.get(suggestion, 0), suggestion)
for suggestion in suggestions])))
best = [(word, probability)
for (probability, word) in probabilities[
:self.suggestions]]
else:
best = [(word, 0)]
return best
Test the Suggestor
from neurotic.nlp.autocorrect.suggestor import WordSuggestor
suggestor = WordSuggestor(corpus=corpus, suggestions=2)
# this doesn't have any one-letter-edits in the corpus so it won't return anything
# unless the two-letter-edits is working
word = "dbadd"
test = suggestor(word)
print(test)
[('dead', 0.0006341627186928787), ('bad', 0.0002051702913418137)]
word = "days"
test = suggestor(word)
assert len(test) == 1, test
assert test[0][0] == word
print(test)
[('days', 0.0004103405826836274)]
word = 'dys'
tmp_corrections = suggestor(word)
for i, word_prob in enumerate(tmp_corrections):
print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")
print(f"data type of corrections {type(tmp_corrections)}")
expected = 0.000410
actual = tmp_corrections[0][1]
assert math.isclose(expected, actual, abs_tol=1e-6), actual
expected = 0.000019
actual = tmp_corrections[1][1]
assert math.isclose(expected, actual, abs_tol=1e-6), actual
word 0: days, probability 0.000410 word 1: dye, probability 0.000019 data type of corrections <class 'list'>