Autocorrect System: Edits
Table of Contents
Beginning
This is part of a series that's introduced in Autocorrect: The System. In the previous post of this series we computed \(P(w_i)\) for all the words in the corpus, so now we'll write a few functions to manipulate strings so that we can edit the erroneous strings and return the right spellings of the words. In this section, we'll implement four functions:
delete_letter
: given a word, it returns all the possible strings that have one character removed.switch_letter
: given a word, it returns all the possible strings that have two adjacent letters switched.replace_letter
: given a word, it returns all the possible strings that have one character replaced by another different letter.insert_letter
: given a word, it returns all the possible strings that have an additional character inserted.
Middle
Delete Letter
We're going to implement a function that, given a word, returns a list of strings with one character deleted.
def delete_letter(word: str, verbose: bool=False) -> list:
"""Delete one letter at a time
Args:
word: the string/word for which you will generate all possible words
in the vocabulary which have 1 missing character
Returns:
delete_l: a list of all possible strings obtained by deleting 1 character from word
"""
delete_l = []
split_l = []
split_l = [(word[:index], word[index:]) for index in range(len(word) + 1)]
delete_l = [left + right[1:] for left, right in split_l if right]
if verbose:
print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")
return delete_l
delete_word_l = delete_letter(word="cans",
verbose=True)
assert delete_word_l == ['ans', 'cns', 'cas', 'can']
input word cans, split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's'), ('cans', '')], delete_l = ['ans', 'cns', 'cas', 'can']
deleted = len(delete_letter('at'))
print(f"Number of outputs of delete_letter('at') is {deleted}")
assert deleted == 2
Number of outputs of delete_letter('at') is 2
Switch Letter
Now implement a function that switches two letters in a word. It takes in a word and returns a list of all the possible switches of two letters that are adjacent to each other.
- For example, given the word 'eta', it returns {'eat', 'tea'}, but does not return 'ate'.
def switch_letter(word: str, verbose: bool=False) -> list:
"""Switches pairs of adjacent letters
Args:
word: input string
Returns:
switches: a list of all possible strings with one adjacent charater switched
"""
switch_l = []
split_l = []
split_l = [(word[:index], word[index:]) for index in range(len(word) + 1)]
switch_l = [left[:-1] + right[0] + left[-1] + right[1:]
for left, right in split_l
if left and right]
if verbose:
print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}")
return switch_l
switch_word_l = switch_letter(word="eta",
verbose=True)
assert switch_word_l == ['tea', 'eat']
Input word = eta split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a'), ('eta', '')] switch_l = ['tea', 'eat']
switches = len(switch_letter('at'))
print(f"Number of outputs of switch_letter('at') is {switches}")
assert switches == 1
Number of outputs of switch_letter('at') is 1
Replace Letter
Now implement a function that takes in a word and returns a list of strings with one replaced letter from the original word.
- Step 1: is the same as in `delete_letter()`
- Step 2: A list comprehension or for loop which form strings by replacing letters. This can be of the form:
[f(a,b,c) for a, b in splits if condition for c in string]
Note the use of the second for loop. It is expected in this routine that one or more of the replacements will include the original word. For example, replacing the first letter of 'ear' with 'e' will return 'ear'.
- Step 3: Remove the original input letter from the output.
def replace_letter(word: str, verbose: bool=False) -> list:
"""Replace each letter in the string with another letter in the alphabet
Args:
word: the input string/word
Returns:
replaces: a list of all possible strings where we replaced one letter from the original word.
"""
letters = 'abcdefghijklmnopqrstuvwxyz'
replace_l = []
split_l = []
split_l = [(word[:index], word[index:]) for index in range(len(word) + 1)]
replace_l = [left + letter + right[1:] for left, right in split_l if right
for letter in letters]
replace_set = set(replace_l)
replace_set.discard(word)
replace_l = sorted(list(replace_set))
if verbose:
print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")
return replace_l
word = "can"
replace_l = replace_letter(word=word,
verbose=True)
expected_replacements = (len(word) * 26) - len(word)
assert len(replace_l) == expected_replacements
print(f"Replacements: {len(replace_l)}")
expected = ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']
assert replace_l == expected
Input word = can split_l = [('', 'can'), ('c', 'an'), ('ca', 'n'), ('can', '')] replace_l ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan'] Replacements: 75
word = "at"
replacements = len(replace_letter(word))
print(f"Number of outputs of replace_letter('at') is {replacements}")
expected = (len(word) * 26) - len(word)
assert expected == replacements
Number of outputs of replace_letter('at') is 50
Insert Letter
Now implement a function that takes in a word and returns a list with a letter inserted at every offset.
- Step 1: is the same as in `delete_letter()`
- Step 2: This can be a list comprehension of the form:
[f(a,b,c) for a, b in splits if condition for c in string]
def insert_letter(word: str, verbose: bool=False) -> list:
"""Stick a letter before and after each letter in the word
Args:
word: the input string/word
Returns:
inserts: a set of all possible strings with one new letter inserted at every offset
"""
letters = 'abcdefghijklmnopqrstuvwxyz'
insert_l = []
split_l = []
split_l = [(word[:index], word[index:]) for index in range(len(word) + 1)]
insert_l = [left + letter + right for left, right in split_l for letter in letters]
if verbose:
print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")
return insert_l
word = "at"
insert_l = insert_letter(word, True)
inserted = len(insert_l)
print(f"Number of strings output by insert_letter('at') is {inserted}")
assert inserted == (len(word) + 1) * 26
expected = ['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz']
assert expected == insert_l
Input word at split_l = [('', 'at'), ('a', 't'), ('at', '')] insert_l = ['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz'] Number of strings output by insert_letter('at') is 78
word = "at"
inserted = len(insert_letter(word))
print(f"Number of outputs of insert_letter('at') is {inserted}")
expected = (len(word) + 1) * 26
assert expected == inserted
Number of outputs of insert_letter('at') is 78
End
The Editor
Now to bundle it up for later.
Imports
# python
from string import ascii_lowercase
# from pypi
import attr
The Editor Class
@attr.s(auto_attribs=True)
class TheEditor:
"""Does various edits to words
Args:
word: string to edit
"""
word: str
_splits: list=None
_deleted: list=None
_switched: list=None
_replaced: list=None
_inserted: list=None
Splits
A list of splits.
@property
def splits(self) -> list:
"""Tuples of splits for word"""
if self._splits is None:
self._splits = [(self.word[:index], self.word[index:])
for index in range(len(self.word) + 1)]
return self._splits
Deleted
@property
def deleted(self) -> list:
"""Deletes one letter at a time from the word
Returns:
list of all possible strings created by deleting one letter
"""
if self._deleted is None:
self._deleted = [left + right[1:]
for left, right in self.splits if right]
return self._deleted
Switched
@property
def switched(self) -> list:
"""switches one letter pair at a time
Returns:
all possible strings with one adjacent charater switched
"""
if self._switched is None:
self._switched = [left[:-1] + right[0] + left[-1] + right[1:]
for left, right in self.splits
if left and right]
return self._switched
Replace a Letter
@property
def replaced(self) -> list:
"""Replace each letter with every other letter of the alphabet
Returns:
replacements in alphabetical order (doesn't include original word)
"""
if self._replaced is None:
self._replaced = set([left + letter + right[1:]
for left, right in self.splits if right
for letter in ascii_lowercase])
self._replaced.discard(self.word)
self._replaced = sorted(list(self._replaced))
return self._replaced
Insert Letters
@property
def inserted(self) -> list:
"""Adds letters before and after each letter
Returns:
all possible strings with one new letter inserted at every offset
"""
if self._inserted is None:
self._inserted = [left + letter + right
for left, right in self.splits
for letter in ascii_lowercase]
return self._inserted
Checking the Editor
from neurotic.nlp.autocorrect.edits import TheEditor
editor = TheEditor(word="cans")
# splits
expected = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's'), ('cans', '')]
assert editor.splits == expected, editor.splits
# deletions
expected = ['ans', 'cns', 'cas', 'can']
assert editor.deleted == expected
# switches
word = "eta"
editor = TheEditor(word=word)
expected = ['tea', 'eat']
assert editor.switched == expected
editor = TheEditor(word="at")
switches = len(editor.switched)
print(f"Number of outputs of switch_letter('at') is {switches}")
assert switches == 1
# replacements
word = "can"
editor = TheEditor(word)
replacements = editor.replaced
expected = (len(word) * 26) - len(word)
assert len(replacements) == expected, f"expected: {expected} actual: {len(replacements)}"
expected = ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']
assert replacements == expected
word = "at"
editor = TheEditor(word)
expected = (len(word) * 26) - len(word)
assert expected == len(editor.replaced)
# Insertions
inserted = len(editor.inserted)
assert inserted == (len(word) + 1) * 26
expected = ['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt', 'ata', 'atb', 'atc', 'atd', 'ate', 'atf', 'atg', 'ath', 'ati', 'atj', 'atk', 'atl', 'atm', 'atn', 'ato', 'atp', 'atq', 'atr', 'ats', 'att', 'atu', 'atv', 'atw', 'atx', 'aty', 'atz']
assert expected == editor.inserted
word = "at"
editor = TheEditor(word)
inserted = len(editor.inserted)
print(f"Number of outputs of insert_letter('at') is {inserted}")
expected = (len(word) + 1) * 26
assert expected == inserted