Approximate kNN for Machine Translation
Beginning
In the previous post we implemented Locality Sensitive Hashing. This post is part of a series building an English-to-French translator; links to the whole series are gathered in the Machine Translation post.
Imports
# python
from argparse import Namespace
# pypi
from dotenv import load_dotenv
from nltk.corpus import twitter_samples
import numpy
# this repo
from neurotic.nlp.word_embeddings.embeddings import EmbeddingsLoader
from neurotic.nlp.word_embeddings.english_french import TrainingData
from neurotic.nlp.word_embeddings.hashing import (DocumentsEmbeddings,
PlanesUniverse,
HashTable,
HashTables)
from neurotic.nlp.word_embeddings.nearest_neighbors import NearestNeighbors
from neurotic.nlp.twitter.processor import TwitterProcessor
from neurotic.nlp.word_embeddings.training import TheTrainer
Set Up
The Environment
load_dotenv("posts/nlp/.env")
The Tweets
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")
tweets = positive_tweets + negative_tweets
The Twitter Processor
process_tweet = TwitterProcessor()
The Embeddings
embeddings = EmbeddingsLoader()
documents = DocumentsEmbeddings(embeddings=embeddings.english_subset,
process=process_tweet, documents=tweets)
Some Constants
TWEET = Namespace(
vectors=len(tweets),
dimensions=len(next(iter(embeddings.english_subset.values()))),
universes=25,
vectors_per_bucket=16
)
The Planes
universes = PlanesUniverse(vector_count=TWEET.vectors,
dimensions=TWEET.dimensions,
universes=TWEET.universes,
vectors_per_bucket=TWEET.vectors_per_bucket)
assert universes.plane_count == 10
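The assertion isn't arbitrary: with 10,000 tweets and at most 16 vectors per bucket we need at least 625 buckets, and each extra plane doubles the bucket count, so ten planes (1,024 buckets) is the smallest number that fits. Here's a minimal sketch of that arithmetic; it's an assumption about how PlanesUniverse arrives at plane_count, but it agrees with the assertion above.
import math

# 10,000 tweets / 16 vectors per bucket = 625 buckets needed
buckets_needed = TWEET.vectors / TWEET.vectors_per_bucket
# n planes give 2**n buckets, so we need the smallest n with 2**n >= 625, which is 10
planes_needed = math.ceil(math.log2(buckets_needed))
assert planes_needed == universes.plane_count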
The Hash Tables
hasher = HashTables(planes=universes.planes,
universes=TWEET.universes,
vectors=documents.documents_embeddings)
hash_tables = hasher.hash_tables
The ID Tables
id_tables = hasher.id_tables
The Training Data
data = TrainingData()
Middle
Approximate K-NN
Implement approximate K-nearest neighbors using locality sensitive hashing to search for documents that are similar to a given document at index doc_id.
Arguments
The argument names below come from the original assignment; the implementation further down renames them (doc_id becomes document_id, v becomes document_embedding, planes_l becomes multiverse_planes, and num_universes_to_use becomes universes).

Variable | Description |
---|---|
doc_id | index into the document list all_tweets |
v | document vector for the tweet in all_tweets at index doc_id |
planes_l | list of planes (the global variable created earlier) |
k | number of nearest neighbors to search for |
num_universes_to_use | number of available universes to use (25 by default) |
The approximate_knn function finds a subset of candidate vectors that are in the same "hash bucket" as the input vector 'v'. Then it performs the usual k-nearest neighbors search on this subset (instead of searching through all 10,000 tweets).
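As a reminder of what "same hash bucket" means: the hash value records which side of each plane the vector falls on, as implemented in the previous post. Here is a minimal sketch of that scheme, where hash_vector is a hypothetical stand-in for HashTable.hash_value and the usual sign-of-the-dot-product construction is assumed.
import numpy

def hash_vector(vector: numpy.ndarray, planes: numpy.ndarray) -> int:
    """Fold the side-of-plane signs into a single bucket number"""
    # one projection per plane; >= 0 means the vector lies on the plane's "positive" side
    sides = (numpy.dot(vector, planes) >= 0).astype(int).flatten()
    # read the signs as the bits of a binary number
    return int(sum(side * 2**index for index, side in enumerate(sides)))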
Hints
- There are many dictionaries used in this function. Try to print out planes_l, hash_tables, and id_tables to understand how they are structured, what the keys represent, and what the values contain (a small inspection sketch follows this list).
- To remove an item from a list, use .remove()
- To append to a list, use .append()
- To add to a set, use .add()
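Following the first hint, here's a quick way to peek at how the structures are laid out. It only assumes what the function below also assumes: the planes and tables are indexed by universe first, and each table is then keyed by the bucket hash value (an integer between 0 and 2**plane_count - 1).
# inspect the hashing structures before reading the function
print(f"universes of planes: {len(universes.planes)}")
print(f"buckets in the first hash table: {len(hash_tables[0])}")
print(f"vectors hashed to bucket 0 in universe 0: {len(hash_tables[0][0])}")
print(f"matching tweet ids in the id table: {len(id_tables[0][0])}")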
# UNQ_C21 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# This is the code used to do the fast nearest neighbor search. Feel free to go over it
def approximate_knn(document_id: int,
document_embedding: numpy.ndarray,
multiverse_planes: list,
k: int=1,
universes: int=TWEET.universes):
"""Search for k-NN using hashes
Args:
document_id: index for the document in the lists
document_embedding: vector representing a documents word embeddings
multiverse_planes: dictionary of planes for the hash-tables
k: number of neighbors to find
universes: number of times to repeat the search
Returns:
list of indexes for neighbor documents
"""
assert universes <= TWEET.universes
# Vectors that will be checked as possible nearest neighbor
possible_neighbors = list()
# list of document IDs
ids_of_possible_neighbors = list()
# create a set for ids to consider, for faster checking if a document ID already exists in the set
set_of_ids_of_possible_neighbors = set()
# loop through the universes of planes
for universe in range(universes):
# get the set of planes from the planes_l list, for this particular universe_id
planes = multiverse_planes[universe]
# get the hash value of the vector for this set of planes
# hash_value = hash_value_of_vector(v, planes)
hash_value = HashTable(planes=planes, vectors=None).hash_value(document_embedding)
# get the hash table for this particular universe_id
hash_table = hash_tables[universe]
# get the list of document vectors for this hash table, where the key is the hash_value
document_vectors = hash_table[hash_value]
# get the id_table for this particular universe_id
id_table = id_tables[universe]
# get the subset of documents to consider as nearest neighbors from this id_table dictionary
new_ids_to_consider = id_table[hash_value]
### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
# remove the id of the document that we're searching
if document_id in new_ids_to_consider:
new_ids_to_consider.remove(document_id)
print(f"removed document_id {document_id} of input vector from new_ids_to_search")
# loop through the subset of document vectors to consider
for index, new_id in enumerate(new_ids_to_consider):
# if the document ID is not yet in the set ids_to_consider...
if new_id not in set_of_ids_of_possible_neighbors:
# access document_vectors_l list at index i to get the embedding
# then append it to the list of vectors to consider as possible nearest neighbors
document_vector = document_vectors[index]
possible_neighbors.append(document_vector)
# append the new_id (the index for the document) to the list of ids to consider
ids_of_possible_neighbors.append(new_id)
# also add the new_id to the set of ids to consider
# (use this to check if new_id is not already in the IDs to consider)
set_of_ids_of_possible_neighbors.add(new_id)
### END CODE HERE ###
# Now run k-NN on the smaller set of vecs-to-consider.
print("Fast considering %d vecs" % len(possible_neighbors))
# convert the vecs to consider set to a list, then to a numpy array
vecs_to_consider_arr = numpy.array(possible_neighbors)
# call nearest neighbors on the reduced list of candidate vectors
nearest_neighbors = NearestNeighbors(candidates=possible_neighbors, k=k)
nearest_neighbor_ids = nearest_neighbors(document_embedding)
# Use the nearest neighbor index list as indices into the ids to consider
# create a list of nearest neighbors by the document ids
nearest_neighbor_ids = [ids_of_possible_neighbors[index]
for index in nearest_neighbor_ids]
return nearest_neighbor_ids
doc_id = 0
doc_to_search = tweets[doc_id]
vec_to_search = documents.documents_embeddings[doc_id]
print(doc_to_search)
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
nearest_neighbor_ids = approximate_knn(
document_id=doc_id,
document_embedding=vec_to_search,
multiverse_planes=universes.planes,
k=3, universes=5)
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")
for neighbor_id in nearest_neighbor_ids:
print(f"Nearest neighbor at document id {neighbor_id}")
print(f"document contents: {tweets[neighbor_id]}")
Fast considering 79 vecs
Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 254
document contents: Something to get your #Friday off to a great start :) Have a great day all! #Mclaren #FridayFeeling #TGIF http://t.co/LshgwcXsSv
Nearest neighbor at document id 2714
document contents: Current playlist :D http://t.co/PYKQLD4KHr
Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
The first and third neighbors seem reasonable, although the third looks like it's just a re-working of our source tweet.
End
- The post that collects all the posts in this project is Machine Translation.