Autocorrect: Minimum Edit Distance
Table of Contents
Beginning
This is a post in the series of posts introduced here. Now that we implemented auto-correct, how do we evaluate the similarity between two strings (e.g. 'waht' and 'what')?
Also how do you efficiently find the shortest path to go from the word, 'waht' to the word 'what'?
We'll implement a dynamic programming system that will tell us the minimum number of edits required to convert a string into another string.
Imports
# python
from pathlib import Path
import os
# from pypi
from dotenv import load_dotenv
import numpy
import pandas
# this repo
from neurotic.nlp.autocorrect.preprocessing import CorpusBuilder
from neurotic.nlp.autocorrect.suggestor import WordSuggestor
Set Up
load_dotenv("posts/nlp/.env", override=True)
path = Path(os.environ["SHAKESPEARE"])
corpus = CorpusBuilder(path)
suggestor = WordSuggestor(corpus=corpus, suggestions=2, want_switches=False)
Minimum Edit distance
Dynamic Programming
Dynamic Programming breaks a problem down into subproblems which can be combined to form the final solution. Here, given a string \(source_{[0 \ldots i]}\) and a string \(target_{[0 \ldots j]}\), we will compute all the combinations of \(substrings_{[i, j]}\) and calculate their edit distance. To do this efficiently, we will use a table to maintain the previously computed substrings and use those to calculate larger substrings.
We'll create a matrix and update each element in the matrix as follows:
\[ \text{Initialization} \]
\begin{align} D[0,0] &= 0 \\ D[i,0] &= D[i-1,0] + del\_cost(source[i]) \tag{4}\\ D[0,j] &= D[0,j-1] + ins\_cost(target[j]) \\ \end{align}\[ \text{Per Cell Operations} \]
\begin{align} D[i,j] =min \begin{cases} D[i-1,j] + del\_cost\\ D[i,j-1] + ins\_cost\\ D[i-1,j-1] + \left\{\begin{matrix} rep\_cost; & \textit{if src}[i]\neq tar[j]\\ 0 ; & \textit{if src}[i]=tar[j] \end{matrix}\right. \end{cases} \tag{5} \end{align}So converting the source word play to the target word stay, using an insert cost of one, a delete cost of 1, and replace cost of 2 would give you the following table:
# | s | t | a | y | |
# | 0 | 1 | 2 | 3 | 4 |
p | 1 | 2 | 3 | 4 | 5 |
l | 2 | 3 | 4 | 5 | 6 |
a | 3 | 4 | 5 | 4 | 5 |
y | 4 | 5 | 6 | 5 | 4 |
The operations used in this algorithm are 'insert', 'delete', and 'replace'. These correspond to the functions that you defined earlier: insert_letter(), delete_letter() and replace_letter(). switch_letter() is not used here.
The diagram below describes how to initialize the table. Each entry in D[i,j] represents the minimum cost of converting string source[0:i] to string target[0:j]. The first column is initialized to represent the cumulative cost of deleting the source characters to convert string "EER" to "". The first row is initialized to represent the cumulative cost of inserting the target characters to convert from "" to "NEAR".
\begin{align} D[i,j] =min \begin{cases} D[i-1,j] + del\_cost\\ D[i,j-1] + ins\_cost\\ D[i-1,j-1] + \left\{\begin{matrix} rep\_cost; & if src[i]\neq tar[j]\\ 0 ; & if src[i]=tar[j] \end{matrix}\right. \end{cases} \tag{5} \end{align}def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
'''
Input:
source: a string corresponding to the string you are starting with
target: a string corresponding to the string you want to end with
ins_cost: an integer setting the insert cost
del_cost: an integer setting the delete cost
rep_cost: an integer setting the replace cost
Output:
D: a matrix of len(source)+1 by len(target)+1 containing minimum edit distances
med: the minimum edit distance (med) required to convert the source string to the target
'''
# use deletion and insert cost as 1
m = len(source)
n = len(target)
#initialize cost matrix with zeros and dimensions (m+1,n+1)
D = numpy.zeros((m+1, n+1), dtype=int)
# Fill in column 0, from row 1 to row m, both inclusive
for row in range(1, m + 1): # Replace None with the proper range
D[row,0] = D[row - 1, 0] + del_cost
# Fill in row 0, for all columns from 1 to n, both inclusive
for col in range(1, n + 1): # Replace None with the proper range
D[0,col] = D[0, col - 1] + ins_cost
# Loop through row 1 to row m, both inclusive
for row in range(1, m + 1):
# Loop through column 1 to column n, both inclusive
for col in range(1, n + 1):
r_cost = rep_cost
if source[row - 1] == target[col - 1]:
r_cost = 0
D[row,col] = min((D[row-1, col] + del_cost),
D[row, col - 1] + ins_cost,
D[row-1, col-1] + r_cost)
# Set the minimum edit distance with the cost found at row m, column n
med = D[m, n]
return D, med
Testing the Function
play to stay
source = 'play'
target = 'stay'
matrix, min_edits = min_edit_distance(source, target)
print("minimum edits: ",min_edits, "\n")
idx = list('#' + source)
cols = list('#' + target)
expected = pandas.DataFrame(numpy.array([
[0, 1, 2, 3, 4],
[1, 2, 3, 4, 5],
[2, 3, 4, 5, 6],
[3, 4, 5, 4, 5],
[4, 5, 6, 5, 4],
]), index=idx, columns=cols)
actual = pandas.DataFrame(matrix, index=idx, columns= cols)
print(actual)
assert min_edits==4
assert all(expected == actual)
minimum edits: 4 # s t a y # 0 1 2 3 4 p 1 2 3 4 5 l 2 3 4 5 6 a 3 4 5 4 5 y 4 5 6 5 4
eer to near
source = 'eer'
target = 'near'
matrix, min_edits = min_edit_distance(source, target)
print("minimum edits: ",min_edits, "\n")
idx = list(source)
idx.insert(0, '#')
cols = list(target)
cols.insert(0, '#')
actual = pandas.DataFrame(matrix, index=idx, columns= cols)
print(actual)
expected = pandas.DataFrame([
[0, 1, 2, 3, 4],
[1, 2, 1, 2, 3],
[2, 3, 2, 3, 4],
[3, 4, 3, 4, 3],
], index=idx, columns=cols)
assert all(expected == actual)
assert min_edits == 3
minimum edits: 3 # n e a r # 0 1 2 3 4 e 1 2 1 2 3 e 2 3 2 3 4 r 3 4 3 4 3
intention to execution
source = "intention"
target = "execution"
matrix, min_edits = min_edit_distance(source, target)
print("minimum edits: ",min_edits, "\n")
index = list("#" + source)
columns = list("#" + target)
actual = pandas.DataFrame(matrix, index=index, columns=columns)
print(actual)
expected = pandas.DataFrame([
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
[1, 2, 3, 4, 5, 6, 7, 6, 7, 8],
[2, 3, 4, 5, 6, 7, 8, 7, 8, 7],
[3, 4, 5, 6, 7, 8, 7, 8, 9, 8],
[4, 3, 4, 5, 6, 7, 8, 9, 10, 9],
[5, 4, 5, 6, 7, 8, 9, 10, 11, 10],
[6, 5, 6, 7, 8, 9, 8, 9, 10, 11],
[7, 6, 7, 8, 9, 10, 9, 8, 9, 10],
[8, 7, 8, 9, 10, 11, 10, 9, 8, 9],
[9, 8, 9, 10, 11, 12, 11, 10, 9, 8],
], index=index, columns=columns)
assert all(expected == actual)
assert min_edits == 8
minimum edits: 8 # e x e c u t i o n # 0 1 2 3 4 5 6 7 8 9 i 1 2 3 4 5 6 7 6 7 8 n 2 3 4 5 6 7 8 7 8 7 t 3 4 5 6 7 8 7 8 9 8 e 4 3 4 5 6 7 8 9 10 9 n 5 4 5 6 7 8 9 10 11 10 t 6 5 6 7 8 9 8 9 10 11 i 7 6 7 8 9 10 9 8 9 10 o 8 7 8 9 10 11 10 9 8 9 n 9 8 9 10 11 12 11 10 9 8
Finding the Closest of Multiple Strings
One Letter Edits
source = "eer"
targets = suggestor.one_letter_edits(source)
rep_cost = 1
for t in targets:
_, min_edits = min_edit_distance(source, t, rep_cost=rep_cost) # set ins, del, sub costs all to one
if min_edits != 1: print(source, t, min_edits)
Two Letter Edits
The 'replace()' routine utilizes all letters a-z one of which returns the original word.
source = "eer"
targets = suggestor.two_letter_edits(source)
for t in targets:
_, min_edits = min_edit_distance(source, t,rep_cost=rep_cost)
if min_edits != 2 and min_edits != 1: print(source, t, min_edits)
eer eer 0
We have to allow single edits here because some two_edits will restore a single edit.
End
The next post will be about implementing backtrace to find the shortest path to the minimum edit distance.
A Class-Based Minimum Edit Distance
Imports
# pypi
from tabulate import tabulate
import attr
import numpy
import pandas
The Minimum Edit Distance
@attr.s(auto_attribs=True)
class MinimumEdits:
"""Calculates the minimum edit distance between two strings
Uses the Levenshtein distance
Args:
source: the starting string
target: what to transform the source to
insertion_cost: how much inserting a character costs
deletion_cost: how much deleting a character costs
replacement_cost: how much swapping out a character costs
table_format: tabluate table format for printing table
"""
source: str
target: str
insertion_cost: int=1
deletion_cost: int=1
replacement_cost: int=2
table_format: str="orgtbl"
_rows: int=None
_columns: int=None
_distance_table: numpy.ndarray=None
_distance_frame: pandas.DataFrame=None
_minimum_distance: int=None
_backtrace: list=None
Rows
@property
def rows(self) -> int:
"""Rows in the table"""
if self._rows is None:
self._rows = len(self.source)
return self._rows
Columns
@property
def columns(self) -> int:
"""Number of columns for the table"""
if self._columns is None:
self._columns = len(self.target)
return self._columns
The Table
@property
def distance_table(self) -> numpy.ndarray:
"""Table of edit distances"""
if self._distance_table is None:
self._distance_table = numpy.zeros((self.rows + 1, self.columns + 1),
dtype=int)
# initialize the first row
for row in range(1, self.rows + 1):
self._distance_table[row, 0] = (self._distance_table[row - 1, 0]
+ self.deletion_cost)
# initialize the first column
for column in range(1, self.columns + 1):
self._distance_table[0, column] = (self._distance_table[0, column-1]
+ self.insertion_cost)
for row in range(1, self.rows + 1):
one_row_back = row - 1
for column in range(1, self.columns + 1):
one_column_back = column - 1
replacement_cost = (
0 if self.source[one_row_back] == self.target[one_column_back]
else self.replacement_cost)
self._distance_table[row, column] = min(
(self._distance_table[one_row_back, column]
+ self.deletion_cost),
(self._distance_table[row, one_column_back]
+ self.insertion_cost),
(self._distance_table[one_row_back, one_column_back]
+ replacement_cost))
return self._distance_table
Distance Frame
@property
def distance_frame(self) -> pandas.DataFrame:
"""pandas dataframe of the distance table"""
if self._distance_frame is None:
self._distance_frame = pandas.DataFrame(
self.distance_table,
index= list("#" + self.source),
columns = list("#" + self.target),
)
return self._distance_frame
Minimum Distance
@property
def minimum_distance(self) -> int:
"""The minimum edit distance from source to target"""
if self._minimum_distance is None:
self._minimum_distance = self.distance_table[
self.rows, self.columns]
return self._minimum_distance
Distance String
def __str__(self) -> str:
"""tabluate version of distance frame
Returns:
table formatted string of distance table
"""
return tabulate(self.distance_frame, headers="keys", tablefmt=self.table_format)
Test Out the Minimum Distance
from neurotic.nlp.autocorrect.distance import MinimumEdits
SOURCE, TARGET = "cow", "dog"
editor = MinimumEdits(source=SOURCE, target=TARGET)
assert editor.rows == len(SOURCE)
assert editor.columns == len(TARGET)
assert editor.distance_table.shape == (len(SOURCE) + 1, len(TARGET) + 1)
assert (editor.distance_table[:, 0] == numpy.arange(editor.rows + 1, dtype=int)).all()
assert (editor.distance_table[0, :] == numpy.arange(editor.columns + 1, dtype=int)).all()
assert (editor.distance_table == numpy.array([[0, 1, 2, 3],
[1, 2, 3, 4],
[2, 3, 2, 3],
[3, 4, 3, 4]])).all()
assert editor.minimum_distance == 4
editor = MinimumEdits(source="play", target="stay")
assert editor.minimum_distance == 4
editor = MinimumEdits(source="eer", target="near")
assert editor.minimum_distance == 3
editor = MinimumEdits(source="intention", target="execution")
assert editor.minimum_distance == 8
print(editor.distance_frame)
# d o g # 0 1 2 3 c 1 2 3 4 o 2 3 2 3 w 3 4 3 4
print(str(editor))
# | d | o | g | |
---|---|---|---|---|
# | 0 | 1 | 2 | 3 |
c | 1 | 2 | 3 | 4 |
o | 2 | 3 | 2 | 3 |
w | 3 | 4 | 3 | 4 |