Auto-Complete: Building the Auto-Complete System

Build the Auto-Complete System

In the previous post we tested the perplexity of our N-Gram language model. In this, the final post in the series, we'll put the pieces together into the auto-complete system itself.

Imports

# python
from itertools import chain
import math
import os

# pypi
from dotenv import load_dotenv
from expects import be_true, equal, expect
# this project
from neurotic.nlp.autocomplete import NGrams, Tokenizer, TrainTestSplit

Set Up

The Environment

load_dotenv("posts/nlp/.env", override=True)

The Data

path = os.environ["TWITTER_AUTOCOMPLETE"]
with open(path) as reader:
    data = reader.read()

Middle

Probabilities Again

Once again, the functions we're defining here depend on the probability functions from the earlier posts, so I'm going to paste them in here.

def estimate_probability(word: str,
                         previous_n_gram: tuple, 
                         n_gram_counts: dict,
                         n_plus1_gram_counts: dict,
                         vocabulary_size: int,
                         k: float=1.0) -> float:
    """
    Estimate the probability of a next word using the n-gram counts with k-smoothing

    Args:
       word: next word
       previous_n_gram: A sequence of words of length n
       n_gram_counts: Dictionary of counts of n-grams
       n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
       vocabulary_size: number of words in the vocabulary
       k: positive constant, smoothing parameter

    Returns:
       A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)

    n_plus1_gram = previous_n_gram + (word,)  
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)       
    return (n_plus1_gram_count + k)/(previous_n_gram_count + k * vocabulary_size)
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """
    Estimate the probabilities of next words using the n-gram counts with k-smoothing

    Args:
       previous_n_gram: A sequence of words of length n
       n_gram_counts: Dictionary of counts of n-grams
       n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
       vocabulary: List of words
       k: positive constant, smoothing parameter

    Returns:
       A dictionary mapping from next words to the probability.
    """

    # convert list to tuple to use it as a dictionary key
    previous_n_gram = tuple(previous_n_gram)

    # add <e> <unk> to the vocabulary
    # <s> is not needed since it should not appear as the next word
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)

    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram, 
                                           n_gram_counts, n_plus1_gram_counts, 
                                           vocabulary_size, k=k)
        probabilities[word] = probability

    return probabilities
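
As a quick sanity check, here's what the k-smoothed estimate looks like on a tiny set of hand-made counts. The counts and the vocabulary size of 9 are made up purely for illustration; they aren't taken from the real data.

toy_unigram_counts = {("like",): 2}
toy_bigram_counts = {("like", "a"): 2}

# P("a"|"like") = (count("like a") + k)/(count("like") + k * V) = (2 + 1)/(2 + 1 * 9)
probability = estimate_probability("a", ("like",),
                                   toy_unigram_counts, toy_bigram_counts,
                                   vocabulary_size=9, k=1.0)
print(f"{probability:.4f}")

0.2727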

Suggest-a-Word

Compute probabilities for all possible next words and suggest the most likely one.

  • This function also takes an optional argument `start_with`, which specifies the first few letters of the next word.

Hints

  • estimate_probabilities returns a dictionary where the key is a word and the value is the word's probability.
  • Use str1.startswith(str2) to determine if a string starts with the letters of another string. For example, 'learning'.startswith('lea') returns True, whereas 'learning'.startswith('ear') returns False. There are two additional parameters in str.startswith(), but you can use the default values for those parameters in this case.
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """
    Get suggestion for the next word

    Args:
       previous_tokens: The sentence as a list of word tokens. Must have length greater than n.
       n_gram_counts: Dictionary of counts of n-grams
       n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
       vocabulary: List of words
       k: positive constant, smoothing parameter
       start_with: If not None, specifies the first few letters of the next word

    Returns:
       A tuple of 
         - string of the most likely next word
         - corresponding probability
    """

    # get the order of the model (n) from the length of the n-gram keys
    n = len(list(n_gram_counts.keys())[0])

    # From the words that the user already typed
    # get the most recent 'n' words as the previous n-gram
    previous_n_gram = previous_tokens[-n:]

    # Estimate the probabilities that each word in the vocabulary
    # is the next word,
    # given the previous n-gram, the dictionary of n-gram counts,
    # the dictionary of n plus 1 gram counts, and the smoothing constant
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)

    # Initialize suggested word to None
    # This will be set to the word with highest probability
    suggestion = None

    # Initialize the highest word probability to 0
    # this will be set to the highest probability 
    # of all words to be suggested
    max_prob = 0

    # For each word and its probability in the probabilities dictionary:
    for word, prob in probabilities.items():

        # If the optional start_with string is set,
        # skip any word that doesn't begin with those letters
        if start_with is not None:
            if not word.startswith(start_with):
                continue

        # Check if this word's probability
        # is greater than the current maximum probability
        if prob > max_prob:

            # If so, save this word as the best suggestion (so far)
            suggestion = word

            # Save the new maximum probability
            max_prob = prob

    return suggestion, max_prob
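
As an aside, once you have the probabilities dictionary, the same selection can be written more compactly with `max` over the filtered candidates. This is just an alternative sketch, not the graded function.

def suggest_a_word_compact(previous_tokens, n_gram_counts, n_plus1_gram_counts,
                           vocabulary, k=1.0, start_with=None):
    """Alternative to suggest_a_word that uses max over the candidates"""
    n = len(list(n_gram_counts.keys())[0])
    probabilities = estimate_probabilities(previous_tokens[-n:],
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    # keep only the words that match the optional prefix
    candidates = {word: probability for word, probability in probabilities.items()
                  if start_with is None or word.startswith(start_with)}
    if not candidates:
        return None, 0
    # pick the word with the highest probability
    suggestion = max(candidates, key=candidates.get)
    return suggestion, candidates[suggestion]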

Test It Out

sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = NGrams(sentences, 1).counts
bigram_counts = NGrams(sentences, 2).counts

previous_tokens = ["i", "like"]
word, probability = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{word}` with a probability of {probability:.4f}")
expected_word, expected_probability = "a", 0.2727
expect(word).to(equal(expected_word))
expect(math.isclose(probability, expected_probability, abs_tol=1e-4)).to(be_true)
print()

# test your code when setting the starts_with
tmp_starts_with = 'c'
word, probability = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{word}` with a probability of {probability:.4f}")

expected_word, expected_probability = "cat", 0.0909
expect(word).to(equal(expected_word))
expect(math.isclose(probability, expected_probability, abs_tol=1e-4)).to(be_true)
The previous words are 'i like',
        and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
        and the suggested word is `cat` with a probability of 0.0909

Multiple Suggestions

The function defined below loops over various n-gram models to get multiple suggestions.

def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    """Get a suggestion from each consecutive pair of n-gram models

    Args:
       previous_tokens: the words typed so far, as a list of tokens
       n_gram_counts_list: list of n-gram count dictionaries, in increasing order of n
       vocabulary: list of words in the vocabulary
       k: positive constant, smoothing parameter
       start_with: if not None, the first few letters of the suggested word

    Returns:
       list of (word, probability) tuples, one per pair of models
    """
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts - 1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i + 1]

        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions
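
Since adjacent models often agree, it can be handy to collapse the output to unique words, keeping the highest probability seen for each. This is just a convenience wrapper around `get_suggestions`, not part of the original assignment.

def get_unique_suggestions(previous_tokens, n_gram_counts_list, vocabulary,
                           k=1.0, start_with=None):
    """Collapse get_suggestions output to unique words, best probability first"""
    best = {}
    for word, probability in get_suggestions(previous_tokens, n_gram_counts_list,
                                             vocabulary, k=k, start_with=start_with):
        # keep the highest probability any of the models assigned to the word
        if word is not None and probability > best.get(word, 0):
            best[word] = probability
    return sorted(best.items(), key=lambda item: item[1], reverse=True)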

Test It

sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = NGrams(sentences, 1).counts
bigram_counts = NGrams(sentences, 2).counts
trigram_counts = NGrams(sentences, 3).counts
quadgram_counts = NGrams(sentences, 4).counts
quintgram_counts = NGrams(sentences, 5).counts

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, quintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are 'i like', the suggestions are:")
display(tmp_suggest3)
The previous words are 'i like', the suggestions are:
a 0.2727272727272727
a 0.2
like 0.1111111111111111
like 0.1111111111111111
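
The repeated `like 0.1111…` entries come from the higher-order models: with only two previous tokens, the trigram and quadgram contexts never match anything in the counts, so the smoothed estimate is the same for every word and whichever word happens to come first in the vocabulary list wins. With nine words in the vocabulary (the seven unique words plus `<e>` and `<unk>`), plugging zero counts into the formula gives that number.

# an unseen (or too-short) context gives every word the same probability
print((0 + 1)/(0 + 1 * 9))

0.1111111111111111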

Multiple Word Suggestions

tokenizer = Tokenizer(data)
splitter = TrainTestSplit(tokenizer.tokenized)
train_data_processed = splitter.training
n_gram_counts_list = [NGrams(train_data_processed, n).counts for n in range(1, 6)]
vocabulary = list(set(chain.from_iterable(train_data_processed)))
previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)
The previous words are ['i', 'am', 'to'], the suggestions are:
be 0.015552924847940564
please 5.4935999560512006e-05
please 5.494354550699157e-05
sucks 2.747403703500192e-05
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)
The previous words are ['i', 'want', 'to', 'go'], the suggestions are:
to 0.006007762241480142
to 0.0019077728115120462
to 0.00030196552102778083
home 0.00016478989288656962
previous_tokens = ["hey", "how", "are"]
tmp_suggest6 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest6)
The previous words are ['hey', 'how', 'are'], the suggestions are:
you 0.010055522861602231
you 0.0014810345300458024
you 5.494656446605676e-05
sucks 2.747403703500192e-05
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest7 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest7)
The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:
're 0.012929170630459223
? 0.0011416145691764065
? 0.0007132863295931524
<e> 5.494656446605676e-05
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="d")

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)
The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:
do 0.004734930381388913
doing 0.000679532481652623
doing 0.0001646045375984198
deserving 2.747328223302838e-05
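
To wrap up, here's one way you might bundle the pieces into a single helper that accepts a raw string. The lower-case-and-split tokenization is a stand-in for the real `Tokenizer` pre-processing, so treat this as a sketch of the interface rather than the finished system.

def autocomplete(text: str, n_gram_counts_list: list, vocabulary: list,
                 k: float=1.0, start_with: str=None) -> list:
    """Suggest next words for a raw string of text

    Note: lower-casing and splitting on whitespace is a simplification of
    the tokenization used to build the training data.
    """
    previous_tokens = text.lower().split()
    return get_suggestions(previous_tokens, n_gram_counts_list,
                           vocabulary, k=k, start_with=start_with)

suggestions = autocomplete("hey how are you", n_gram_counts_list, vocabulary, start_with="d")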

End

So, now we have our system. Here are all the prior posts in this series.