Autocorrect: Building the Vocabulary
Table of Contents
Beginning
# python
from collections import Counter
from functools import partial
import re
# pypi
import holoviews
import hvplot.pandas
import pandas
# my stuff
from graeae import EmbedHoloviews
Set Up
Plotting
# Identifier for this post; it forms the last component of the folder path below.
SLUG = "autocorrect-building-the-vocabulary"

# Bind the folder path once so later calls only supply the plot and file name.
Embed = partial(
    EmbedHoloviews,
    folder_path=f"files/posts/nlp/{SLUG}/",
)
Middle
Set Up the Corpus
# A tiny corpus of color names in mixed case, used to demonstrate each step.
text = 'red pink pink blue blue yellow ORANGE BLUE BLUE PINK'

# Show the raw corpus, then its length in characters, each on its own line.
print(text, f"String Length: {len(text)}", sep="\n")
red pink pink blue blue yellow ORANGE BLUE BLUE PINK
String Length: 52
Preprocessing
Lowercasing
# Fold the corpus to lower case so "BLUE" and "blue" become the same word.
text_lowercased = str.lower(text)
print(text_lowercased)
red pink pink blue blue yellow orange blue blue pink
Tokenizing
# Regular-expression pieces, named so the assembled pattern is self-describing.
ALPHANUMERIC_UNDERSCORE = r"\w"
ONE_OR_MORE = r"+"

# A token is one or more consecutive word characters.
TOKEN = f"{ALPHANUMERIC_UNDERSCORE}{ONE_OR_MORE}"

# Collect every non-overlapping match, in order of appearance.
tokens = re.compile(TOKEN).findall(text_lowercased)
print(f"Tokens: {len(tokens)}")
Tokens: 10
Create the Vocabulary
First Way: Distinct Words
# Unpacking into a set drops repeated tokens, leaving only the distinct words.
vocabulary = {*tokens}

# Show the distinct words, then how many there are, each on its own line.
print(vocabulary, f"Count: {len(vocabulary)}", sep="\n")
{'pink', 'red', 'orange', 'blue', 'yellow'}
Count: 5
Second Way: Add Word Counts
- With a Dictionary
# Tally each token in a single pass over the list. Calling ``tokens.count``
# once per token would rescan the whole list for every element.
# Keys are inserted in first-seen order.
counts_from_dict = {}
for token in tokens:
    counts_from_dict[token] = counts_from_dict.get(token, 0) + 1

print(counts_from_dict)
print(f"Unique: {len(counts_from_dict)}")
{'red': 1, 'pink': 3, 'blue': 4, 'yellow': 1, 'orange': 1}
Unique: 5
- With a Counter
# ``Counter`` builds the token -> count mapping directly from the token list.
counts_from_counter = Counter(tokens)
print(counts_from_counter)
print(f"Unique: {len(counts_from_counter)}")

# Sanity check: the Counter must agree with the dictionary tally on every token.
for key, count in counts_from_counter.items():
    assert count == counts_from_dict[key]
Counter({'blue': 4, 'pink': 3, 'red': 1, 'yellow': 1, 'orange': 1})
Unique: 5
Plot the Vocabulary
# The distinct tokens serve both as the x-axis categories and as the values
# handed to the color cycle.
keys = list(counts_from_counter)
colors = holoviews.Cycle(values=keys)

# One row per distinct token; the Count and Token columns line up by position.
data = pandas.DataFrame({
    "Count": list(counts_from_counter.values()),
    "Token": keys,
})

# NOTE(review): ``color_index`` appears to be deprecated in newer HoloViews
# releases in favor of style mapping -- confirm against the installed version.
plot = data.hvplot.bar(x="Token", y="Count").opts(
    title="Token Counts",
    width=990,
    height=780,
    fontscale=2,
    color=colors,
    color_index="Token",
)

# Build the embedder for this plot, call it, and print whatever it returns.
outcome = Embed(plot=plot, file_name="token_counts")()
print(outcome)
End
This is the basic way that we'll be creating a vocabulary for the autocorrect feature.