Hash Tables
Table of Contents
Beginning
Imports
# python
from argparse import Namespace
from functools import partial
import math
import pprint
# pypi
from numpy.random import default_rng
import hvplot.pandas
import numpy
import pandas
from graeae import EmbedHoloviews
Set Up
Plotting
SLUG = "hash-tables"
Embed = partial(EmbedHoloviews, folder_path=f"files/posts/nlp/{SLUG}")
Plot = Namespace(
width=990,
height=780,
fontscale=2,
tan="#ddb377",
blue="#4687b7",
red="#ce7b6d",
)
Random Number Generator
numpy_random = default_rng()
Pretty Printer
pretty = pprint.PrettyPrinter()
Middle
A Basic Hash Table
def basic_hash_table(things_to_hash: list, buckets: int) -> dict:
"""Create a basic hash table
Args :
things_to_hash: list of integers to hash
buckets: number of buckets in the table
Returns:
hash_table: the things to hash sorted into their buckets
"""
def hash_function(value: int, buckets: int) -> int:
"""Maps the value to an integer
Args:
value: what to hash
n_buckets: number of buckets in the hash table
Returns:
remainder of value//n_buckets
"""
return int(value) % buckets
# Initialize all the buckets in the hash table as empty lists
hash_table = {bucket:[] for bucket in range(buckets)}
for value in things_to_hash:
# Get the hash key for the given value
hash_value = hash_function(value, buckets)
# Add the element to the corresponding bucket
hash_table[hash_value].append(value)
return hash_table
The basic_hash_table
maps values that can be cast to integers to a dictionary of lists. Let's see what it does.
examples = [100, 10, 14, 17, 97]
hash_table_example = basic_hash_table(examples, buckets=10)
pretty.pprint(hash_table_example)
{0: [100, 10], 1: [], 2: [], 3: [], 4: [14], 5: [], 6: [], 7: [17, 97], 8: [], 9: []}
This Basic Hash Table maps the values based on their remainder after dividing the value by the number of buckets. In this case there are ten buckets so the value gets mapped to the value in its ones column.
Multiplane Hash Functions
To visualize it we'll start with a single plane and color some points based on which side of the plane they fall.
I'll start by defining the vector that we'll use to decide which side of the plane a vector is on (by taking the dot product and checking the sign of the result).
decider = pandas.DataFrame([[1, 2]])
This isn't the separating plane but rather a vector perpendicular to the separating plane. You don't need the separating plane to make the categorizations of the vectors, but for the sake of visualization it might be useful to see it. We can create it by creating a rotation matrix that rotates our originar vector 90 degrees.
theta_1 = math.radians(90)
rotation = numpy.array([[math.cos(theta_1), -math.sin(theta_1)],
[math.sin(theta_1), math.cos(theta_1)]])
plane = pandas.Series(numpy.dot(rotation, decider.T).T[0])
Now we can plot them along with some categorized points.
First plot the vector we use to decide what side of the plane the points are.
# so to plot it I'll add a starting point
COLUMNS = "X Y".split()
start = pandas.DataFrame([[0, 0]])
decider_plotter = pandas.concat([start, plane])
decider_plotter.columns = COLUMNS
plot = decider_plotter.hvplot(x="X", y="Y")
Now plot the plane that separates the categories. I'll scale it a little to move the plot back a little. Also the rotation gives us only the line segment rotated by 90 degrees so I'm going to negate it to get the -90 segment as well to complete the rendering of the plane.
SCALE = 2
plane_plotter = start.append(plane, ignore_index=True) * SCALE
plane_plotter.columns = COLUMNS
plot *= plane_plotter.hvplot(x="X", y="Y", color=Plot.tan, line_dash="dashed")
plane_plotter *= -1
plot *= plane_plotter.hvplot(x="X", y="Y", color=Plot.tan, line_dash="dashed")
Now we get to the points. The main lines to pay attention to are the calculation of the side_of_plane
value and the conditional. The side_of_plane
is an array but you can do boolean equality checks with integers as shown.
## Get a pair of random numbers between -4 and 4
POINTS = 20
LIMIT = 4
for _ in range(0, POINTS):
vector = pandas.DataFrame([numpy_random.uniform(-LIMIT, LIMIT, 2)],
columns=["x", "y"])
side_of_plane = numpy.sign(numpy.dot(plane, vector.T))
if side_of_plane == 1:
plot *= vector.hvplot.scatter(x="x", y="y", color=Plot.blue)
else:
plot *= vector.hvplot.scatter(x="x", y="y", color=Plot.red)
plot = plot.opts(
title="Plane Hash Table",
width=Plot.width,
height=Plot.height,
fontscale=Plot.fontscale,
xlim=(-LIMIT, LIMIT),
ylim=(-LIMIT, LIMIT)
)
outcome = Embed(plot=plot, file_name="multiplane_hash")()
print(outcome)
So the dashed tan line is our separation plane and the blue line segment is the vector we use to decide which side of the plane the dots are on. The blue dots have a positive dot product with the blue vector and the red dots have a negative dot product with the blue vector.
Multiple PLanes
plane_1 = numpy.array([[1, 1]])
plane_2 = numpy.array([[-1, 1]])
plane_3 = numpy.array([[-1, -1]])
multi_plane = [plane_1, plane_2, plane_3]
search_vector = numpy.array([[2, 2]])
def side_of_plane(plane: numpy.ndarray, vector: numpy.ndarray) -> int:
"""Finds the side of the plane that the vector is
Args:
plane: separating plane
vector: location to check
Returns:
sign of the dot product between the plane and the vector
"""
return numpy.sign(numpy.dot(plane, vector.T)).item()
def hash_multi_plane(planes: list, vector: numpy.ndarray) -> int:
"""Creates hash value for set of planes
Args:
planes: list of arrays to hash
vector: array to determine which side of the planes are positive
Returns:
hash_value: the hash for plane matching the vector
"""
hash_value = 0
for index, plane in enumerate(planes):
sign = side_of_plane(plane, vector)
# increment the hash if the sign is non-negative
hash_i = 0 if sign < 0 else 1
hash_value += 2**index * hash_i
return hash_value
print(hash_multi_plane(multi_plane, search_vector))
3
Random Planes
numpy_random = default_rng(0)
num_dimensions = 2
num_planes = 3
random_planes_matrix = numpy_random.normal(
size=(num_planes,
num_dimensions))
print(random_planes_matrix)
[[ 0.12573022 -0.13210486] [ 0.64042265 0.10490012] [-0.53566937 0.36159505]]
search_vector = numpy.array([[2, 2]])
def side_of_plane_matrix(planes: numpy.ndarray, vector: numpy.ndarray) -> numpy.ndarray:
"""Decides which side of planes vector is on
Returns:
side-of-plane value for vector with respect to each plane
"""
return numpy.sign(numpy.dot(planes, vector.T))
print(side_of_plane_matrix(random_planes_matrix, search_vector))
[[-1.] [ 1.] [-1.]]
def hash_multi_plane_matrix(planes: numpy.ndarray,
vector: numpy.ndarray,
num_planes: int):
"""calculates hash for vector with respect to planes"""
sides_matrix = side_of_plane_matrix(planes, vector)
hash_value = 0
for i in range(num_planes):
sign = sides_matrix[i].item() # Get the value inside the matrix cell
hash_i = 1 if sign >=0 else 0
hash_value += 2**i * hash_i # sum 2^i * hash_i
return hash_value
sm = side_of_plane_matrix(random_planes_matrix, search_vector)
print(hash_multi_plane_matrix(random_planes_matrix, search_vector, num_planes))
2
Document Vectors
This is how you would convert a document to an embedding using word vectors (just add up all the vectors for the words in the document).
word_embedding = {"I": numpy.array([1,0,1]),
"love": numpy.array([-1,0,1]),
"learning": numpy.array([1,0,1])
}
document = ['I', 'love', 'learning', 'not_a_word']
document_embedding = numpy.array([0,0,0])
for word in document:
document_embedding += word_embedding.get(word,0)
print(document_embedding)
[1 0 3]