Binary Search Tree: Randomized

This is the next post in a series on Binary Search Trees that start with this post. In this post we'll be looking at tree-heights when you randomize the order that the nodes are inserted into the tree.

Introduction

We now have a basic Binary Search Tree and some methods to build and query it. I didn't really go over it but if you look at the basic querying methods (search, min, max, etc.) you can see that they boil down to starting at a node (often the root) and then traversing up or down the levels of the tree until we find what we want or we reach a leaf. This means that at most the number of comparisons you make (everytime you reach a node you compare the node's key to your key to pick a child to follow) will be the height of tree (h).

The height of the tree depends, however, on the order in which you insert and delete the nodes. If you insert them in ascending order, for instance, you end up with all the nodes being right children and a height of n - 1 (the number of edges). But, CLRS shows that if you randomize the insertion you will, on average have a height of \(O(\lg(n))\) so I'm going to take an experimental look at that here.

Imports and Setup

# python
from functools import partial
from pathlib import Path

import random

# pypi
from expects import equal, expect
from joblib import Parallel, delayed

import altair
import networkx
import numpy
import pandas

# this project
from bowling.data_structures.binary_search_tree import (Node, Query, Tree)

# monkey projects
from graeae import Timer
from graeae.visualization.altair_helpers import output_path, save_chart
TIMER = Timer()
SLUG = "binary-search-tree-randomized"
GRAPH_OUTPUT = Path(f"files/posts/{SLUG}/")
ALTAIR_OUTPUT = output_path(SLUG)
if not GRAPH_OUTPUT.is_dir():
    GRAPH_OUTPUT.mkdir()
save_it = partial(save_chart, output_path=ALTAIR_OUTPUT)
def preorder(node: Node, adjacencies: dict) -> None:
    """Build the adjancency dictionary

    Args:
     node: the next node to add
     adjacencies: dictionary of key: adjacent node keys
    """
    if node is not None:
        left = node.left.key if node.left is not None else None
        right = node.right.key if node.right is not None else None
        if any((left, right)):
            if left is not None and right is not None:
                adjacencies[node.key] = (left, right)
            elif left is not None and right is None:
                adjacencies[node.key] = (left, )
            else:
                adjacencies[node.key] = (right,)
        preorder(node.left, adjacencies)
        preorder(node.right, adjacencies)
    return

Some Illustrative Cases

Added In Sorted Order

This is our worst-case where the nodes after the root all end up as right-children and the height is n - 1.

tree = Tree()
query = Query(tree)
n = 5
for node in range(n):
    tree.insert(Node(node))

expect(query.height).to(equal(n - 1))

adjacencies = {}
preorder(tree.root, adjacencies)

graph = networkx.nx_pydot.to_pydot(networkx.DiGraph(adjacencies))
graph.write_png(GRAPH_OUTPUT/"first_tree.png")

nil

If it were in reversed order we'd get a similar graph but the children would all be on the left and in the reverse order, but since graphviz/dot doesn't show left vs right it'd be hard to see what's going on.

A More Balanced Tree

This would be the best case, where the nodes are evenly balanced across the tree. Even though we have more nodes than the prior tree we only have a height of two.

KEYS = [50, 25, 75, 10, 30, 65, 80]
tree = Tree(Node(KEYS[0]))
for key in KEYS[1:]:
    tree.insert(Node(key))

adjacencies = {}
preorder(tree.root, adjacencies)
graph = networkx.nx_pydot.to_pydot(networkx.DiGraph(adjacencies))
graph.write_png(GRAPH_OUTPUT/"second_tree.png")

query = Query(tree)
expect(query.height).to(equal(2))

nil

Something In Between

If we insert the nodes in random order then we would expect to see something better than the worst case but not a perfectly balanced tree either.

random.shuffle(KEYS)
tree = Tree(Node(KEYS[0]))

for key in KEYS[1:]:
    tree.insert(Node(key))

adjacencies = {}
preorder(tree.root, adjacencies)
graph = networkx.nx_pydot.to_pydot(networkx.DiGraph(adjacencies))
graph.write_png(GRAPH_OUTPUT/"third_tree.png")

nil

An Empirical Check

Height vs Nodes

def heights(keys: list) -> tuple:
    """Builds the binary tree from the nodes and gets height

    Args:
     nodes: list of keys for nodes

    Returns:
     (number of nodes, height)
    """
    tree = Tree()
    query = Query(tree)
    for key in keys:
        tree.insert(Node(key))

    return (len(keys), query.height)
things_to_sort = (random.sample(range(count), k=count)
                  for count in range(10, 10**5, 100))

with TIMER:
    height_vs_count = Parallel(n_jobs=-1)(
        delayed(heights)(thing_to_sort)
        for thing_to_sort in things_to_sort
    )
Started: 2022-03-17 20:17:02.325084
Ended: 2022-03-17 20:17:30.486929
Elapsed: 0:00:28.161845
NODES, HEIGHT = 0, 1
unzipped = list(zip(*height_vs_count))
frame = pandas.DataFrame({"Nodes": unzipped[NODES],
                          "Height": unzipped[HEIGHT]})
frame["3Log2"] = 3 * numpy.log2(frame["Nodes"])
points = altair.Chart(frame).mark_point().encode(
    x="Nodes",
    y="Height",
    tooltip = [altair.Tooltip("Nodes", format=","),
               altair.Tooltip("Height", format=",")]
)

line = points.mark_line(color="#feb236").encode(
    x="Nodes",
    y="3Log2",
    tooltip = [altair.Tooltip("Nodes", format=","),
               altair.Tooltip("3Log2", format=",")]
)
chart = (line + points).properties(
    title="Randomized Binary Search Tree Node Count vs Height",
    width=800, height=500,
)
save_it(chart, "nodes-vs-height")

Figure Missing

So it looks like when you insert the nodes randomly you tend to get a tree height that's \(O(\log(n))\).

Node Distribution

nodes = 10**4
urn = list(range(nodes))
samples = 10**4
print(f"log2({nodes}) = {numpy.log2(nodes)}")
things_to_sort = (random.sample(urn, k=nodes) for sample in range(samples))

with TIMER:
    height_distribution = Parallel(n_jobs=-1)(
        delayed(heights)(thing_to_sort)
        for thing_to_sort in things_to_sort
    )
log2(10000) = 13.287712379549449
Started: 2022-03-17 21:15:42.270315
Ended: 2022-03-17 21:16:32.620183
Elapsed: 0:00:50.349868
NODES, HEIGHT = 0, 1
unzipped = list(zip(*height_distribution))
frame = pandas.DataFrame({"Nodes": unzipped[NODES],
                          "Height": unzipped[HEIGHT]})

base = altair.Chart(frame)

histogram = base.mark_bar(color="#50586C").encode(
    x=altair.X("Height", bin=True),
    y="count()", tooltip=altair.Tooltip("Height"))

median = base.mark_rule(color="#DCE2F0").encode(
    x="mean(Height):Q", size=altair.value(5),
    tooltip=altair.Tooltip("mean(Height)"))

chart = (histogram + median).properties(
    title=f"Randomized Binary Search Tree Height Distribution ({nodes:,} Nodes)",
    width=800, height=500,
)
save_it(chart, "height-distribution")

Figure Missing

print(f"Median: {frame.Height.median()}")
print(f"Min: {frame.Height.min()}")
print(f"Max: {frame.Height.max()}")
print(f"3 x log2(n): {3 * numpy.log2(frame.Nodes.iloc[0]):.2f}")
Median: 30.0
Min: 24
Max: 40
3 x log2(n): 39.86

So, with our evenly ranged inputs, even though in the worst case we could have ended up with a tree with a height of 10,000, by inserting the nodes in random order it turns out that most of the time it has a height of 30 and in this sample it has a maximum of 40.