Comparing the Partition Swaps

The Cloistered Monkey

2022-02-23 21:29

Introduction

This is part of a series of posts about the Partition algorithm that starts with this post.

Imports and Setup

# python
from typing import Callable, List, TypeVar, MutableSequence
from dataclasses import dataclass
from functools import partial

import random

# pypi
from expects import be_true, equal, expect
from joblib import Parallel, delayed

import altair
import pandas

# monkey
from graeae.visualization.altair_helpers import output_path, save_chart

SLUG = "comparing-the-partition-swaps"
OUTPUT_PATH = output_path(SLUG)
save_it = partial(save_chart, output_path=OUTPUT_PATH)

Orderable = TypeVar("Orderable")
random.seed(2022)

Comparing the Swaps

Both of the versions of partition shown here traverse the collection once, so looking at the number of comparisons doesn't seem so interesting. According to the Wikipedia page on Quicksort, the swaps for Hoare's method (which is slightly different from Levitin's) is better for the worst cases, so let's see if this matters for Levitin's and CLRS's swaps.

Some Swap Counters

Since we're only calling the partitioners once rather than repeatedly using them to partition sub-lists in the collection we can get rid of the left and right arguments to the functions.

A Levitin Swap Counter

def levitin_swaps(collection: MutableSequence[Orderable]) -> int:
    """Partitions the collection using a variation of Hoare's method

    Args:
     collection: the list to partition

    Returns:
     count of swaps
    """
    left, right = 0, len(collection) - 1
    pivot_element = collection[left]
    partition_left = left
    partition_right = right + 1
    stop_right = len(collection) - 1
    swaps = 0
    while True:
        while partition_left < stop_right:
            partition_left += 1
            if collection[partition_left] >= pivot_element:
                break

        while True:
            partition_right -= 1
            if collection[partition_right] <= pivot_element:
                break

        if partition_left >= partition_right:
            break

        collection[partition_left], collection[partition_right] = (
            collection[partition_right], collection[partition_left]
        )
        swaps += 1

    # move the pivot element from the far left to its final place
    collection[left], collection[partition_right] = (
        collection[partition_right], collection[left]
    )
    swaps += 1
    return swaps

middle = 100
prefix = random.choices(range(middle), k=middle)
suffix = random.choices(range(middle + 1, 201), k=middle)
test = [middle] + prefix + suffix

swaps = levitin_swaps(test)

expect(all(item < middle for item in test[:middle])).to(be_true)
expect(all(item > middle for item in test[middle + 1:])).to(be_true)

A CLRS Swap Counter

def clrs_swaps(collection: MutableSequence[Orderable]) -> int:
    """Partitions the collection around the last element

    Args:
     collection: the list to partition

    Returns:
     count of swaps
    """
    left, right = 0, len(collection) - 1
    pivot_element = collection[right]
    lower_bound = left - 1
    swaps = 0
    for upper_bound in range(left, right):
        if collection[upper_bound] <= pivot_element:
            lower_bound += 1
            (collection[lower_bound],
             collection[upper_bound]) = (collection[upper_bound],
                                         collection[lower_bound])
            swaps += 1
    pivot = lower_bound + 1
    (collection[pivot],
     collection[right]) = (collection[right],
                           collection[pivot])
    swaps += 1
    return swaps

middle = 100
prefix = random.choices(range(middle), k=middle)
suffix = random.choices(range(middle + 1, 201), k=middle)
test = prefix + suffix + [middle]

swaps = clrs_swaps(test)

expect(all(item < middle for item in test[:middle])).to(be_true)
expect(all(item > middle for item in test[middle + 1:])).to(be_true)

A Parallelizer Function

This is what I'll pass to joblib to run the two swap-counter functions.

@dataclass
class SwapCounts:
    size: int
    swaps: int

def swap_counter(collection: MutableSequence, swap_counter: object) -> SwapCounts:
    """Runs the swap_counter over the collection of inputs

    Args:
     collection: elements to partition

    Returns:
     SwapCounts: size, swap-count
    """
    size = len(collection)
    swaps = swap_counter(collection)
    return SwapCounts(size=size, swaps=swaps)

def swap_plots(first_output: List[SwapCounts], second_output: List[SwapCounts],
               first_label: str, second_label: str,
               title: str, filename: str):
    """Plot the swap counts

    Args:
     first_output, second_output: lists of SwapCounts
     first_label, second_label: Algorithm labels for the counts
     title: title for the plot
     filename: name to save the plot to 
    """
    expect(len(first_output)).to(equal(len(second_output)))
    frame = pandas.DataFrame({
        "Input Size": [output.size for output in first_output],
        first_label: [output.swaps for output in first_output],
        second_label: [output.swaps for output in second_output]})

    melted = frame.melt(id_vars="Input Size",
                        value_vars=[first_label, second_label],
                        var_name="Algorithm", value_name="Swaps")

    base = (altair.Chart(melted).mark_circle(opacity=0.5)
            .encode(x=altair.X("Input Size:Q",
                               scale=altair.Scale(
                                   domain=(0, melted["Input Size"].max()))),
                    y=altair.Y("Swaps:Q",scale=altair.Scale(domainMin=0)),
                               color="Algorithm:N",
                    tooltip=[altair.Tooltip("Input Size:Q", format=","),
                             altair.Tooltip("Swaps:Q", format=","),
                             altair.Tooltip("Algorithm:N")]))

    line = (base.transform_regression("Input Size", "Swaps",
                                      groupby=["Algorithm"])
            .mark_line())
    chart = (line + base).properties(
        title=title,
        width=800,
        height=525
    ).interactive()

    save_it(chart, filename)
    return

def swap_ratio_plots(first_output: List[SwapCounts], second_output: List[SwapCounts],
                     first_label: str, second_label: str,
                     title: str, filename: str):
    """Plot the swap count ratios

    Args:
     first_output, second_output: lists of SwapCounts
     first_label, second_label: Algorithm labels for the counts
     title: title for the plot
     filename: name to save the plot to 
    """
    expect(len(first_output)).to(equal(len(second_output)))
    frame = pandas.DataFrame({
        "Input Size": [output.size for output in first_output],
        first_label: [output.swaps for output in first_output],
        second_label: [output.swaps for output in second_output]})

    frame["Ratio"] = frame[first_label]/(frame[second_label] + 1)

    base = (altair.Chart(frame[["Input Size", "Ratio"]]).mark_circle(opacity=0.5)
            .encode(x="Input Size:Q", y=altair.Y("Ratio:Q", scale=altair.Scale(domainMin=0)),
                    tooltip=[altair.Tooltip("Input Size", format=","),
                             altair.Tooltip("Ratio", format=",.2f")]))

    line = (base.transform_regression("Input Size", "Ratio")
            .mark_line())
    chart = (base + line).properties(
        title=title,
        width=800,
        height=525
    ).interactive()

    save_it(chart, filename)
    return

def swap_runner(swapper_one: Callable, swapper_two: Callable,
                label_one: str, label_two: str,
                things_to_partition: List[List],
                title: str, filename: str, plotter: Callable=swap_plots) -> None:
    """Run the partitioners and plot

    Args:
     swapper_one, swapper_two: swap-counter functions
     label_one, label_two: algorithm labels for the plot
     things_to_partition: collections for the partitioners to partition
     title, filename: plot strings
    """
    first_output = Parallel(n_jobs=-1)(
    delayed(swap_counter)(thing_to_partition, swapper_one)
    for thing_to_partition in things_to_partition)

    second_output = Parallel(n_jobs=-1)(
        delayed(swap_counter)(thing_to_partition, swapper_two)
        for thing_to_partition in things_to_partition)

    plotter(first_output, second_output, label_one, label_two,
            title=title,
            filename=filename)
    return

Random Inputs

counts = range(10, 100011, 100)
random_things_to_partition = [random.choices(range(count), k=count)
                              for count in counts]
swap_runner(levitin_swaps, clrs_swaps, "Levitin", "CLRS", random_things_to_partition,
           title="Levitin vs CLRS Partition Swap Count (Randomized Input)",
           filename="swaps_random_2")

swap_runner(clrs_swaps, levitin_swaps, "CLRS", "Levitin", random_things_to_partition,
            title="CLRS/Levitin Partition Swap Ratio (Randomized Input)",
            filename="swaps_random_ratio", plotter=swap_ratio_plots)

Biggest Item Last

What happens if the input has the biggest item at the end of the list?

counts = range(10, 100011, 100)
big_ended_things_to_partition = [random.choices(range(count), k=count) + [count]
                                 for count in counts]

swap_runner(levitin_swaps, clrs_swaps, "Levitin", "CLRS",
            big_ended_things_to_partition,
           title="Levitin vs CLRS Partition Swap Count (Biggest Item Last)",
           filename="swaps_sorted")

Sorted

What happens if the input is already sorted?

counts = range(10, 100011, 100)
sorted_things_to_partition = [list(range(count)) for count in counts]

swap_runner(levitin_swaps, clrs_swaps, "Levitin", "CLRS",
            sorted_things_to_partition,
            title="Levitin vs CLRS Partition Swap Count (Sorted input)",
           filename="swaps_already_sorted")

All The Same

counts = range(10, 100011, 100)
same_things_to_partition = [[count] * count for count in counts]

swap_runner(clrs_swaps, levitin_swaps, "CLRS", "Levitin", same_things_to_partition,
           title="Levitin vs CLRS Partition Swap Count (All Same Input)",
           filename="swaps_all_same")

swap_runner(clrs_swaps, levitin_swaps,"CLRS", "Levitin", same_things_to_partition,
           title="CLRS/Levitin Partition Swap Ratio (All Same Input)",
           filename="swaps_all_same_ratio", plotter=swap_ratio_plots)

Randomized Swaps

A Randomized Swap Counter

def randomized_swaps(collection: MutableSequence[Orderable],
                     partition: Callable, pivot: int) -> int:
    """Partitions the collection around the last element

    Args:
     collection: the list to partition
     partition: function to partition the list
     pivot: index of the element used for the pivot

    Returns:
     count of swaps
    """
    random_index = random.randrange(len(collection))
    collection[pivot], collection[random_index] = (collection[random_index],
                                                   collection[pivot])
    return 1 + partition(collection)

randomized_levitin = partial(randomized_swaps, partition=levitin_swaps, pivot=0)
randomized_clrs = partial(randomized_swaps, partition=clrs_swaps, pivot=-1)

Randomized Levitin Swaps

Shuffled Input

swap_runner(levitin_swaps, randomized_levitin, "Levitin", "Randomized Levitin",
            random_things_to_partition,
           title="Levitin vs Randomized Levitin Partition Swap Count (Randomized Input)",
           filename="swaps_randomized_random")

Smallest Item At the End

counts = range(10, 100011, 100)
backwards_things_to_partition = [random.choices(range(1, count), k=count) + [0]
                                 for count in counts]

swap_runner(levitin_swaps, randomized_levitin, "Levitin", "Randomized Levitin",
            backwards_things_to_partition,
            title=("Levitin vs Randomized Levitin Partition Swap Count (Backwards Input)"),
            filename="swaps_levitin_randomized_sorted_backwards")

All The Same Input

Since only difference between the randomized and the non-randomized input is that the pivot was (probably) swapped out, the input will look the same when all the values are the same to start with so I'm not going to re-plot those.

Randomized CLRS Swaps

Shuffled Input

swap_runner(clrs_swaps, randomized_clrs, "CLRS", "Randomized CLRS",
            random_things_to_partition,
           title="CLRS vs Randomized CLRS Partition Swap Count (Randomized Input)",
           filename="swaps_randomized_clrs_random")

Biggest Item at the End

swap_runner(clrs_swaps, randomized_clrs, "CLRS", "Randomized CLRS",
            big_ended_things_to_partition,
            title="CLRS vs Randomized CLRS Partition Swap Count (Big-Ended Input)",
            filename="swaps_clrs_randomized_big_ended")

Sorted

swap_runner(clrs_swaps, randomized_clrs, "CLRS", "Randomized CLRS",
            sorted_things_to_partition,
            title="CLRS vs Randomized CLRS Partition Swap Count (Sorted Input)",
            filename="swaps_clrs_randomized_sorted")