Quicksort
Table of Contents
The Algorithm
I covered the Partition
function in The Road To Partition and now I'll use it to implement a quicksort. This is the CLRS version.
\begin{algorithm} \caption{QuickSort} \begin{algorithmic} \INPUT An array and left and right locations defining a subarray \OUTPUT The subarray is sorted \PROCEDURE{QuickSort}{A, left, right} \IF {left < right} \STATE pivot $\gets$ \textsc{Partition}(A, left, right) \STATE \textsc{QuickSort}(A, left, pivot - 1) \STATE \textsc{QuickSort}(A, pivot + 1, right) \ENDIF \ENDPROCEDURE \end{algorithmic} \end{algorithm}
Implementing It
Some Imports
# python
from collections.abc import MutableSequence
from dataclasses import dataclass
from functools import partial
from math import log2
import random
# pypi
from expects import contain_exactly, equal, expect
from joblib import Parallel, delayed
import altair
import pandas
# this project
from bowling.sort.insertion import insertion_sort
from bowling.sort.merge import mergesort
# other monkey stuff
from graeae.visualization.altair_helpers import output_path, save_chart
from graeae import Timer
Some Setup
@dataclass
class PartitionOutput:
"""Keeps the output for the partition funcion"""
pivot: int
count: int
TIMER = Timer()
SLUG = "quicksort"
OUTPUT_PATH = output_path(SLUG)
save_it = partial(save_chart, output_path=OUTPUT_PATH)
The Partition
Even though I implemented it in that other post I didn't make it so it returns counts so we can estimate the runtime so I'll do that here.
def partition(collection: MutableSequence,
left: int, right: int) -> PartitionOutput:
"""Partitions the collection around the last element
Args:
collection: the list to partition
left: index of the first element in the sub-list to partition
right: index of the last element in the sub-list to partition
Returns:
PartitionOutput(pivot, count)
"""
count = 0
pivot_element = collection[right]
lower_bound = left - 1
for upper_bound in range(left, right):
count += 1
if collection[upper_bound] <= pivot_element:
lower_bound += 1
(collection[lower_bound],
collection[upper_bound]) = (collection[upper_bound],
collection[lower_bound])
pivot = lower_bound + 1
(collection[pivot],
collection[right]) = (collection[right],
collection[pivot])
count += 1
return PartitionOutput(pivot=pivot, count=count)
Some Checks
The First Example
start = [5, 7, 9, 4, 6]
test = start.copy()
expected = [5, 4, 6, 7, 9]
first_expected_pivot = 2
output = partition(test, 0, 4)
expect(output.pivot).to(equal(first_expected_pivot))
expect(test).to(contain_exactly(*expected))
expect(output.count).to(equal(len(test)))
And to make sure the sub-list works.
left, right = [100, 20], [999, 888, 777]
test = left + start.copy() + right
output = partition(test, 2, 6)
# all we did was shift the sub-list to spots to the right
expect(output.pivot).to(equal(first_expected_pivot + 2))
# only the sub-list should be partitioned
expect(test).to(contain_exactly(*(left + expected + right)))
# the count should match our sub-array
expect(output.count).to(equal(len(start)))
The Pivot Is the Biggest Element
start = [9, 6, 25, 4, 100]
test = start.copy()
output = partition(test, 0, 4)
# the pivot should be the last element
expect(output.pivot).to(equal(4))
# nothing changes in the list
expect(test).to(contain_exactly(*start))
# once again, count should match the size of the input
expect(output.count).to(equal(len(test)))
The QuickSort
def quicksort(collection: MutableSequence, left: int, right: int) -> int:
"""Recursive quicksort
Args:
collection: list to sort
left: index of start of sub-list in collection to sort
right: index of end of sub-list in collection to sort
Returns:
count of comparisons
"""
count = 0
if left < right:
output = partition(collection, left, right)
count += output.count
count += quicksort(collection, left, output.pivot - 1)
count += quicksort(collection, output.pivot + 1, right)
return count
Check It Out
start = list(range(10))
items = start.copy()
random.shuffle(items)
length = len(items)
count = quicksort(items, 0, length-1)
print(f"count: {count}")
print(f"Theoretical Average: {length * log2(length):.2f}")
print(f"Theoretical Worst: {length**2}")
expect(items).to(contain_exactly(*start))
count: 37 Theoretical Average: 33.22 Theoretical Worst: 100
Plotting The Quicksort Runtimes
@dataclass
class QuicksortOutput:
"""Holds the output of the quicksort counts"""
comparisons: int
size: int
def quicksorter(collection: MutableSequence) -> QuicksortOutput:
"""runs the quicksort and outputs count and size of collection
Args:
collection: thing to sort
Returns:
QuicksortOutput(count, size)
"""
size = len(collection)
count = quicksort(collection, 0, size - 1)
return QuicksortOutput(comparisons=count, size=size)
With Random Input
things_to_sort = [list(range(count)) for count in range(1, 10**5, 1000)]
for things in things_to_sort:
random.shuffle(things)
with TIMER:
quick_output = Parallel(n_jobs=-1)(
delayed(quicksorter)(thing_to_sort)
for thing_to_sort in things_to_sort)
Started: 2022-02-05 00:57:12.120397 Ended: 2022-02-05 00:57:14.397235 Elapsed: 0:00:02.276838
with TIMER:
merge_output = Parallel(n_jobs=-1)(
delayed(mergesort)(thing_to_sort)
for thing_to_sort in things_to_sort)
Started: 2022-02-05 00:58:40.087042 Ended: 2022-02-05 00:58:42.326204 Elapsed: 0:00:02.239162
Note to future self: Pypy is much faster with the python inputs and much slower with numpy inputs.
counts = [output.comparisons for output in quick_output]
sizes = [output.size for output in quick_output]
frame = pandas.DataFrame({"Size": sizes, "QuickSort": counts})
frame["Merge Sort"] = [output for output in merge_output]
melted = frame.melt(id_vars=["Size"],
var_name="Sort Algorithm", value_name="Comparisons")
chart = altair.Chart(melted).mark_point().encode(
x="Size",
y="Comparisons",
color="Sort Algorithm",
tooltip=[altair.Tooltip("Size", format=","),
altair.Tooltip("Comparisons", format=","),
"Sort Algorithm"]
).properties(
title="QuickSort vs Merge Sort",
width=800,
height=525,
)
save_it(chart, "quicksort-runtime")
I originally had Insertion Sort in the plot too, but it does so poorly that it just squashes both the Merge Sort and Quick Sort runtimes to a flat line. This is kind of an interesting plot. Quick Sort does much, much better than Insertion Sort, but it still doesn't quite keep up with Merge Sort. The trade-off being that Quick Sort does its sorting in place while Merge Sort creates all these temporary copies.
Worst-Case Input
Remember that case in the the partition post where the last item (the pivot) was the largest item, and how it resulted in nothing being moved around? What if no matter what sub-array you picked, the last item was always the largest? In other words, what if it's already sorted?
For one thing with really big inputs the interpreter throws an error because you've made too many recursive calls, so that tells you that something bad is happening.
things_to_sort = [list(range(count)) for count in range(1, 10**3, 100)]
with TIMER:
quick_output = Parallel(n_jobs=-1)(
delayed(quicksorter)(thing_to_sort)
for thing_to_sort in things_to_sort)
Started: 2022-02-05 01:06:19.614254 Ended: 2022-02-05 01:06:20.159874 Elapsed: 0:00:00.545620
with TIMER:
merge_output = Parallel(n_jobs=-1)(
delayed(mergesort)(thing_to_sort)
for thing_to_sort in things_to_sort)
Started: 2022-02-05 01:06:23.592928 Ended: 2022-02-05 01:06:23.661150 Elapsed: 0:00:00.068222
Note to future self: Pypy is much faster with the python inputs and much slower with numpy inputs.
counts = [output.comparisons for output in quick_output]
sizes = [output.size for output in quick_output]
frame = pandas.DataFrame({"Size": sizes, "QuickSort": counts})
frame["Merge Sort"] = [output for output in merge_output]
melted = frame.melt(id_vars=["Size"],
var_name="Sort Algorithm", value_name="Comparisons")
chart = altair.Chart(melted).mark_point().encode(
x="Size",
y="Comparisons",
color="Sort Algorithm",
tooltip=[altair.Tooltip("Size", format=","),
altair.Tooltip("Comparisons", format=","),
"Sort Algorithm"]
).properties(
title="QuickSort vs Merge Sort (Worst-Case)",
width=800,
height=525,
)
save_it(chart, "quicksort-runtime-worst")
Looking At the Sort In Progress
def quicksort_tracer(collection: MutableSequence,
left: int, right: int, tracer: dict=None) -> int:
"""Recursive quicksort
Args:
collection: list to sort
left: index of start of sub-list in collection to sort
right: index of end of sub-list in collection to sort
tracer: dict of element: list of locations
Returns:
tracer dict
"""
if tracer is None:
tracer = {element: [index] for index, element in enumerate(collection)}
if left < right:
output = partition(collection, left, right)
quicksort_tracer(collection, left, output.pivot - 1, tracer)
quicksort_tracer(collection, output.pivot + 1, right, tracer)
for index, element in enumerate(collection):
tracer[element].append(index)
return tracer
size = 20
start = list(range(size))
inputs = start.copy()
inputs.reverse()
tracer = quicksort_tracer(inputs, 0, size - 1)
frame = pandas.DataFrame(tracer)
frame = frame.reset_index().rename(columns={"index": "Quicksort Call"})
melted = frame.melt(id_vars=["Quicksort Call"], var_name="Element", value_name="Location")
chart = altair.Chart(melted).mark_line().encode(
x="Quicksort Call",
y="Location",
color="Element",
tooltip=["Quicksort Call", "Location", "Element"]
).properties(
title="Quicksort Trace (Reversed Input)",
width=800,
height=525,
)
save_it(chart, "quicksort-trace-backwards")
size = 20
start = list(range(size))
inputs = start.copy()
random.shuffle(inputs)
tracer = quicksort_tracer(inputs, 0, size - 1)
frame = pandas.DataFrame(tracer)
frame = frame.reset_index().rename(columns={"index": "Quicksort Call"})
melted = frame.melt(id_vars=["Quicksort Call"], var_name="Element", value_name="Location")
chart = altair.Chart(melted).mark_line().encode(
x="Quicksort Call",
y="Location",
color="Element",
tooltip=["Quicksort Call", "Location", "Element"]
).properties(
title="Quicksort Trace (Shuffled Input)",
width=800,
height=525,
)
save_it(chart, "quicksort-trace-shuffled")