From 99b5f0c77f46b93e2166903a21aafad00b107235 Mon Sep 17 00:00:00 2001
From: Jason Paulos <jasonpaulos@users.noreply.github.com>
Date: Mon, 12 Sep 2022 16:31:23 -0700
Subject: [PATCH 1/3] An attempt to share scratch slot assignments among
 subroutines

---
 .../application/abi/algobank_approval.teal    |  10 +-
 pyteal/compiler/compiler.py                   |  19 ++-
 pyteal/compiler/scratchslots.py               | 160 ++++++++++++++++--
 pyteal/compiler/subroutines.py                | 114 ++++++++++---
 pyteal/compiler/subroutines_test.py           | 139 +++++++++++++--
 5 files changed, 387 insertions(+), 55 deletions(-)

diff --git a/examples/application/abi/algobank_approval.teal b/examples/application/abi/algobank_approval.teal
index be89cd0d3..553b3fd20 100644
--- a/examples/application/abi/algobank_approval.teal
+++ b/examples/application/abi/algobank_approval.teal
@@ -202,23 +202,23 @@ retsub
 
 // withdraw
 withdraw_3:
-store 8
-store 7
+store 5
+store 6
 txn Sender
 byte "balance"
 txn Sender
 byte "balance"
 app_local_get
-load 7
+load 6
 -
 app_local_put
 itxn_begin
 int pay
 itxn_field TypeEnum
-load 8
+load 5
 txnas Accounts
 itxn_field Receiver
-load 7
+load 6
 itxn_field Amount
 int 0
 itxn_field Fee
diff --git a/pyteal/compiler/compiler.py b/pyteal/compiler/compiler.py
index 472cfdba2..82dcefb9a 100644
--- a/pyteal/compiler/compiler.py
+++ b/pyteal/compiler/compiler.py
@@ -124,7 +124,7 @@ def verifyOpsForMode(teal: List[TealComponent], mode: Mode):
 def compileSubroutine(
     ast: Expr,
     options: CompileOptions,
-    subroutineGraph: Dict[SubroutineDefinition, Set[SubroutineDefinition]],
+    subroutine_graph: Dict[Optional[SubroutineDefinition], Set[SubroutineDefinition]],
     subroutine_start_blocks: Dict[Optional[SubroutineDefinition], TealBlock],
     subroutine_end_blocks: Dict[Optional[SubroutineDefinition], TealBlock],
 ) -> None:
@@ -202,15 +202,14 @@ def compileSubroutine(
             for subroutine in stmt.getSubroutines():
                 referencedSubroutines.add(subroutine)
 
-    if currentSubroutine is not None:
-        subroutineGraph[currentSubroutine] = referencedSubroutines
+    subroutine_graph[currentSubroutine] = referencedSubroutines
 
     newSubroutines = referencedSubroutines - subroutine_start_blocks.keys()
     for subroutine in sorted(newSubroutines, key=lambda subroutine: subroutine.id):
         compileSubroutine(
             subroutine.get_declaration(),
             options,
-            subroutineGraph,
+            subroutine_graph,
             subroutine_start_blocks,
             subroutine_end_blocks,
         )
@@ -272,11 +271,13 @@ def compileTeal(
 
     options = CompileOptions(mode=mode, version=version, optimize=optimize)
 
-    subroutineGraph: Dict[SubroutineDefinition, Set[SubroutineDefinition]] = dict()
+    subroutine_graph: Dict[
+        Optional[SubroutineDefinition], Set[SubroutineDefinition]
+    ] = dict()
     subroutine_start_blocks: Dict[Optional[SubroutineDefinition], TealBlock] = dict()
     subroutine_end_blocks: Dict[Optional[SubroutineDefinition], TealBlock] = dict()
     compileSubroutine(
-        ast, options, subroutineGraph, subroutine_start_blocks, subroutine_end_blocks
+        ast, options, subroutine_graph, subroutine_start_blocks, subroutine_end_blocks
     )
 
     # note: optimizations are off by default, in which case, apply_global_optimizations
@@ -291,14 +292,16 @@ def compileTeal(
         for start in subroutine_start_blocks.values():
             apply_global_optimizations(start, options.optimize)
 
-    localSlotAssignments = assignScratchSlotsToSubroutines(subroutine_start_blocks)
+    localSlotAssignments = assignScratchSlotsToSubroutines(
+        subroutine_start_blocks, subroutine_graph
+    )
 
     subroutineMapping: Dict[
         Optional[SubroutineDefinition], List[TealComponent]
     ] = sort_subroutine_blocks(subroutine_start_blocks, subroutine_end_blocks)
 
     spillLocalSlotsDuringRecursion(
-        version, subroutineMapping, subroutineGraph, localSlotAssignments
+        version, subroutineMapping, subroutine_graph, localSlotAssignments
     )
 
     subroutineLabels = resolveSubroutines(subroutineMapping)
diff --git a/pyteal/compiler/scratchslots.py b/pyteal/compiler/scratchslots.py
index 3d316b269..66905d109 100644
--- a/pyteal/compiler/scratchslots.py
+++ b/pyteal/compiler/scratchslots.py
@@ -4,6 +4,7 @@
 from pyteal.ir import TealBlock, Op
 from pyteal.errors import TealInternalError
 from pyteal.config import NUM_SLOTS
+from pyteal.compiler.subroutines import find_callstack_exclusive_subroutines
 
 
 def collect_unoptimized_slots(
@@ -89,6 +90,7 @@ def collectSlotsFromBlock(block: TealBlock, slots: Set[ScratchSlot]):
 
 def assignScratchSlotsToSubroutines(
     subroutineBlocks: Dict[Optional[SubroutineDefinition], TealBlock],
+    subroutine_graph: dict[Optional[SubroutineDefinition], set[SubroutineDefinition]],
 ) -> Dict[Optional[SubroutineDefinition], Set[int]]:
     """Assign scratch slot values for an entire program.
 
@@ -113,7 +115,119 @@ def assignScratchSlotsToSubroutines(
         *local_slots.values()
     )
 
-    slotAssignments: Dict[ScratchSlot, int] = dict()
+    # This is an imperfect greedy algorithm to share scratch slot assignments between callstack
+    # exclusive subroutines.
+    # * It's granularity is at the subroutine level, meaning it decides that two subroutines must share
+    #   all of their scratch slot assignments, or none of them.
+    # * It uses the "exclusivity" of a subroutine (i.e. how many other subroutines it's callstack
+    #   exclusive with) as a heuristic to combine subroutines.
+    # * Analysis has not been done to prove that this algorithm always terminates (or if its results
+    #   are anywhere near optimal).
+    # * WARNING: this algorithm DOES NOT honor user-defined scratch slots. Those slots may be
+    #   assigned to a numeric slot which IS NOT what the user specified.
+    callstack_exclusive_subroutines = find_callstack_exclusive_subroutines(
+        subroutine_graph
+    )
+    # combined_subroutine_groups is a makeshift union-find data structure
+    combined_subroutine_groups: list[set[Optional[SubroutineDefinition]]] = [
+        {s} for s in callstack_exclusive_subroutines.keys()
+    ]
+    if len(callstack_exclusive_subroutines) != 0:
+        # choose "most exclusive" (i.e. most compatible) subroutine to start
+        current_subroutine = max(
+            callstack_exclusive_subroutines.keys(),
+            key=lambda s: len(callstack_exclusive_subroutines[s]),
+        )
+        while True:
+            group_index = -1
+            for i, group in enumerate(combined_subroutine_groups):
+                if current_subroutine in group:
+                    group_index = i
+                    break
+
+            # only look at subroutines we're not already grouped with
+            new_callstack_exclusive = [
+                s
+                for s in callstack_exclusive_subroutines[current_subroutine]
+                if s not in combined_subroutine_groups[group_index]
+            ]
+            if len(new_callstack_exclusive) == 0:
+                # nothing else to do
+                break
+
+            # choose the "most exclusive" subroutine that is exclusive to `current_subroutine`
+            to_combine = max(
+                new_callstack_exclusive,
+                key=lambda s: len(callstack_exclusive_subroutines[s]),
+            )
+            # Share scratch slot assignments between `current_subroutine` and `to_combine`.
+            to_combine_group_index = -1
+            for i, group in enumerate(combined_subroutine_groups):
+                if to_combine in group:
+                    to_combine_group_index = i
+                    break
+            combined_subroutine_groups[group_index] |= combined_subroutine_groups[
+                to_combine_group_index
+            ]
+            combined_subroutine_groups.pop(to_combine_group_index)
+
+            # BEWARE! Now that we've decided to share scratch slot assignments between the two
+            # subroutines, this potentially limits the other subroutines that they can share assignments
+            # with. Specifically, if even if `current_subroutine` is callstack exclusive with another
+            # subroutine `X`, if `to_combine` is not callstack exclusive with `X`, it's no longer safe
+            # for `current_subroutine` to share assignments with `X`. We encode this constraint by
+            # taking the intersection of `current_subroutine` and `to_combine`'s callstack exclusive
+            # subroutines.
+            intersection = (
+                callstack_exclusive_subroutines[current_subroutine]
+                & callstack_exclusive_subroutines[to_combine]
+            )
+            callstack_exclusive_subroutines[current_subroutine] = intersection | {
+                to_combine
+            }
+            callstack_exclusive_subroutines[to_combine] = intersection | {
+                cast(SubroutineDefinition, current_subroutine)
+            }
+
+            current_subroutine = max(
+                callstack_exclusive_subroutines.keys(),
+                key=lambda s: len(callstack_exclusive_subroutines[s]),
+            )
+
+    # the "spokesperson" for a group is the subroutine with the largest number of local slots
+    combined_subroutine_groups_spokesperson: list[Optional[SubroutineDefinition]] = []
+    # all other subroutines in the group will have their local slots mapped to their spokesperson's
+    local_slot_mappings_to_spokesperson: list[dict[ScratchSlot, ScratchSlot]] = []
+    for group in combined_subroutine_groups:
+        spokesperson = max(group, key=lambda s: len(local_slots[s]))
+        spokesperson_local_slots = list(local_slots[spokesperson])
+        local_slot_mappings = {slot: slot for slot in spokesperson_local_slots}
+
+        for subroutine in group:
+            if subroutine is spokesperson:
+                continue
+            for i, slot in enumerate(local_slots[subroutine]):
+                local_slot_mappings[slot] = spokesperson_local_slots[i]
+
+        combined_subroutine_groups_spokesperson.append(spokesperson)
+        local_slot_mappings_to_spokesperson.append(local_slot_mappings)
+
+    slots_to_assign: set[ScratchSlot] = global_slots | cast(
+        set[ScratchSlot], set()
+    ).union(
+        *[
+            local_slots[spokesperson]
+            for spokesperson in combined_subroutine_groups_spokesperson
+        ]
+    )
+
+    if len(slots_to_assign) > NUM_SLOTS:
+        raise TealInternalError(
+            "Too many slots in use: {}, maximum is {}".format(
+                slots_to_assign, NUM_SLOTS
+            )
+        )
+
     slotIds: Set[int] = set()
 
     for slot in allSlots:
@@ -127,12 +241,15 @@ def assignScratchSlotsToSubroutines(
             )
         slotIds.add(slot.id)
 
-    if len(allSlots) > NUM_SLOTS:
-        # TODO: identify which slots can be reused
-        # subroutines which never invoke each other can use the same slot ID for local slots
-        raise TealInternalError(
-            "Too many slots in use: {}, maximum is {}".format(len(allSlots), NUM_SLOTS)
-        )
+    # Run the above check on all slots (before subroutine combination optimization), but clear it out
+    # and populate slotIds again. We only do this because the optimization algorithm above doesn't
+    # honor user-defined slot IDs.
+    slotIds.clear()
+
+    for slot in slots_to_assign:
+        if not slot.isReservedSlot:
+            continue
+        slotIds.add(slot.id)
 
     # verify that all local slots are assigned to before being loaded.
     # TODO: for simplicity, the current implementation does not perform this check with global slots
@@ -145,8 +262,9 @@ def assignScratchSlotsToSubroutines(
             )
             raise TealInternalError(msg) from errors[0]
 
+    slotAssignments: Dict[ScratchSlot, int] = dict()
     nextSlotIndex = 0
-    for slot in sorted(allSlots, key=lambda slot: slot.id):
+    for slot in sorted(slots_to_assign, key=lambda slot: slot.id):
         # Find next vacant slot that compiler can assign to
         while nextSlotIndex in slotIds:
             nextSlotIndex += 1
@@ -158,14 +276,32 @@ def assignScratchSlotsToSubroutines(
             slotAssignments[slot] = nextSlotIndex
             slotIds.add(nextSlotIndex)
 
-    for start in subroutineBlocks.values():
+    for subroutine, start in subroutineBlocks.items():
+        group_index = -1
+        for i, group in enumerate(combined_subroutine_groups):
+            if subroutine in group:
+                group_index = i
+                break
+        assert group_index != -1
+
+        slot_mapping = local_slot_mappings_to_spokesperson[group_index]
+
         for block in TealBlock.Iterate(start):
             for op in block.ops:
                 for slot in op.getSlots():
-                    op.assignSlot(slot, slotAssignments[slot])
+                    if slot in slot_mapping:
+                        # a local slot
+                        op.assignSlot(slot, slotAssignments[slot_mapping[slot]])
+                    else:
+                        # a global slot
+                        op.assignSlot(slot, slotAssignments[slot])
 
     assignedLocalSlots: Dict[Optional[SubroutineDefinition], Set[int]] = dict()
-    for subroutine, slots in local_slots.items():
-        assignedLocalSlots[subroutine] = set(slotAssignments[slot] for slot in slots)
+    for i, group in enumerate(combined_subroutine_groups):
+        slot_mapping = local_slot_mappings_to_spokesperson[i]
+        for subroutine in group:
+            assignedLocalSlots[subroutine] = set(
+                slotAssignments[slot_mapping[slot]] for slot in local_slots[subroutine]
+            )
 
     return assignedLocalSlots
diff --git a/pyteal/compiler/subroutines.py b/pyteal/compiler/subroutines.py
index 403549b1f..95e8754f3 100644
--- a/pyteal/compiler/subroutines.py
+++ b/pyteal/compiler/subroutines.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Dict, Set, Optional, TypeVar
+from typing import List, Dict, Set, Optional, TypeVar, cast
 from collections import OrderedDict
 
 from pyteal.errors import TealInputError
@@ -32,38 +32,111 @@ def graph_search(graph: Dict[Node, Set[Node]], start: Node, end: Node) -> bool:
     return False
 
 
-def findRecursionPoints(
-    subroutineGraph: Dict[SubroutineDefinition, Set[SubroutineDefinition]]
-) -> Dict[SubroutineDefinition, Set[SubroutineDefinition]]:
+def find_callstack_exclusive_subroutines(
+    subroutine_graph: dict[Optional[SubroutineDefinition], set[SubroutineDefinition]]
+) -> dict[Optional[SubroutineDefinition], set[SubroutineDefinition]]:
+    """Find all subroutines which are "callstack exclusive."
+
+    Two subroutines are "callstack exclusive" if they never appear on the callstack at the same time.
+
+    Args:
+        subroutine_graph: A graph of subroutines. Each key is a subroutine (the main routine should
+            be present), which represents a node in the graph. Each value is a set of all
+            subroutines that specific subroutine calls, which represent directional edges in the
+            graph.
+
+    Returns:
+        A dictionary whose keys are the same as subroutine_graph, and whose values are a subset of
+        the key's values from subroutine_graph. Each element in this subset represents a subroutine
+        which is callstack exclusive with the subroutine key.
+    """
+    all_subroutines = subroutine_graph.keys()
+    callstack_exclusives: dict[
+        Optional[SubroutineDefinition], set[SubroutineDefinition]
+    ] = {s: set() for s in all_subroutines}
+
+    for s1 in all_subroutines:
+        for s2 in all_subroutines:
+            if s1 is s2:
+                continue
+            # need to use the casts to help mypy detect the generic type assignment
+            if graph_search(
+                cast(
+                    dict[
+                        Optional[SubroutineDefinition],
+                        set[Optional[SubroutineDefinition]],
+                    ],
+                    subroutine_graph,
+                ),
+                s1,
+                s2,
+            ) or graph_search(
+                cast(
+                    dict[
+                        Optional[SubroutineDefinition],
+                        set[Optional[SubroutineDefinition]],
+                    ],
+                    subroutine_graph,
+                ),
+                s2,
+                s1,
+            ):
+                # Is there a path from s1 to s2, or s2 to s1? If so, these subroutines can exist on
+                # the callstack at the same time.
+                continue
+            callstack_exclusives[s1].add(
+                cast(SubroutineDefinition, s2)
+            )  # this is annoying, we should reconsider using None to represent the main routine
+            callstack_exclusives[s2].add(cast(SubroutineDefinition, s1))
+
+    return callstack_exclusives
+
+
+def find_recursion_points(
+    subroutine_graph: dict[Optional[SubroutineDefinition], set[SubroutineDefinition]]
+) -> dict[Optional[SubroutineDefinition], set[SubroutineDefinition]]:
     """Find all subroutine calls which may result in the current subroutine being called again
     recursively.
 
     Args:
-        subroutineGraph: A graph of subroutines. Each key is a subroutine (the main routine should
+        subroutine_graph: A graph of subroutines. Each key is a subroutine (the main routine should
             be present), which represents a node in the graph. Each value is a set of all
             subroutines that specific subroutine calls, which represent directional edges in the
             graph.
 
     Returns:
-        A dictionary whose keys are the same as subroutineGraph, and whose values are a subset of
-        the key's values from subroutineGraph. Each element in this subset represents a subroutine
+        A dictionary whose keys are the same as subroutine_graph, and whose values are a subset of
+        the key's values from subroutine_graph. Each element in this subset represents a subroutine
         which may reenter the calling subroutine.
     """
-    reentryPoints: Dict[SubroutineDefinition, Set[SubroutineDefinition]] = dict()
+    reentry_points: dict[
+        Optional[SubroutineDefinition], set[SubroutineDefinition]
+    ] = dict()
 
-    for subroutine in subroutineGraph.keys():
+    for subroutine in subroutine_graph.keys():
         # perform a depth first search to see which callers (if any) have a path to invoke the calling subroutine again
-        reentryPoints[subroutine] = set(
+        reentry_points[subroutine] = set(
             callee
-            for callee in subroutineGraph[subroutine]
-            if graph_search(subroutineGraph, callee, subroutine)
+            for callee in subroutine_graph[subroutine]
+            # need to use the cast to help mypy detect the generic type assignment
+            if graph_search(
+                cast(
+                    dict[
+                        Optional[SubroutineDefinition],
+                        set[Optional[SubroutineDefinition]],
+                    ],
+                    subroutine_graph,
+                ),
+                callee,
+                subroutine,
+            )
         )
 
-    return reentryPoints
+    return reentry_points
 
 
 def find_recursive_path(
-    subroutine_graph: Dict[SubroutineDefinition, Set[SubroutineDefinition]],
+    subroutine_graph: Dict[Optional[SubroutineDefinition], Set[SubroutineDefinition]],
     subroutine: SubroutineDefinition,
 ) -> List[SubroutineDefinition]:
     visited = set()
@@ -91,7 +164,7 @@ def dfs(x):
 def spillLocalSlotsDuringRecursion(
     version: int,
     subroutineMapping: Dict[Optional[SubroutineDefinition], List[TealComponent]],
-    subroutineGraph: Dict[SubroutineDefinition, Set[SubroutineDefinition]],
+    subroutineGraph: Dict[Optional[SubroutineDefinition], Set[SubroutineDefinition]],
     localSlots: Dict[Optional[SubroutineDefinition], Set[int]],
 ) -> None:
     """In order to prevent recursion from modifying the local scratch slots a subroutine uses,
@@ -114,11 +187,11 @@ def spillLocalSlotsDuringRecursion(
         localSlots: The output from the function `assignScratchSlotsToSubroutines`, which indicates
             the local slots which must be spilled for each subroutine.
     """
-    recursivePoints = findRecursionPoints(subroutineGraph)
+    recursivePoints = find_recursion_points(subroutineGraph)
 
     recursive_byref = None
     for k, v in recursivePoints.items():
-        if v and k.by_ref_args:
+        if v and k is not None and k.by_ref_args:
             recursive_byref = k
             break
 
@@ -214,7 +287,7 @@ def spillLocalSlotsDuringRecursion(
 
                 hideReturnValueInFirstSlot = False
 
-                if subroutine.return_type != TealType.none:
+                if subroutine is not None and subroutine.return_type != TealType.none:
                     # if the subroutine returns a value on the stack, we need to preserve this after
                     # restoring all local slots.
 
@@ -245,7 +318,10 @@ def spillLocalSlotsDuringRecursion(
                         # clear out the duplicate arguments that were dug up previously, since dig
                         # does not pop the dug values -- once we use cover/uncover to properly set up
                         # the spilled slots, this will no longer be necessary
-                        if subroutine.return_type != TealType.none:
+                        if (
+                            subroutine is not None
+                            and subroutine.return_type != TealType.none
+                        ):
                             # if there is a return value on top of the stack, we need to preserve
                             # it, so swap it with the subroutine argument that's below it on the
                             # stack
diff --git a/pyteal/compiler/subroutines_test.py b/pyteal/compiler/subroutines_test.py
index bd3879a94..fa79585fb 100644
--- a/pyteal/compiler/subroutines_test.py
+++ b/pyteal/compiler/subroutines_test.py
@@ -5,21 +5,138 @@
 import pyteal as pt
 
 from pyteal.compiler.subroutines import (
-    findRecursionPoints,
+    find_callstack_exclusive_subroutines,
+    find_recursion_points,
     spillLocalSlotsDuringRecursion,
     resolveSubroutines,
 )
 
 
-def test_findRecursionPoints_empty():
+def test_find_callstack_exclusive_subroutines_empty():
     subroutines = dict()
 
     expected = dict()
-    actual = findRecursionPoints(subroutines)
+    actual = find_callstack_exclusive_subroutines(subroutines)
     assert actual == expected
 
 
-def test_findRecursionPoints_none():
+def test_find_callstack_exclusive_subroutines_none():
+    def sub1Impl():
+        return None
+
+    def sub2Impl(a1):
+        return None
+
+    def sub3Impl(a1, a2, a3):
+        return None
+
+    subroutine1 = pt.SubroutineDefinition(sub1Impl, pt.TealType.uint64)
+    subroutine2 = pt.SubroutineDefinition(sub2Impl, pt.TealType.bytes)
+    subroutine3 = pt.SubroutineDefinition(sub3Impl, pt.TealType.none)
+
+    subroutines = {
+        subroutine1: {subroutine2},
+        subroutine2: {subroutine3},
+        subroutine3: set(),
+    }
+
+    expected = {
+        subroutine1: set(),
+        subroutine2: set(),
+        subroutine3: set(),
+    }
+
+    actual = find_callstack_exclusive_subroutines(subroutines)
+    assert actual == expected
+
+
+def test_find_callstack_exclusive_subroutines_leaf_subroutines():
+    def sub1Impl():
+        return None
+
+    def sub2Impl(a1):
+        return None
+
+    def sub3Impl(a1, a2, a3):
+        return None
+
+    def sub4Impl():
+        return None
+
+    def sub5Impl():
+        return None
+
+    subroutine1 = pt.SubroutineDefinition(sub1Impl, pt.TealType.uint64)
+    subroutine2 = pt.SubroutineDefinition(sub2Impl, pt.TealType.bytes)
+    subroutine3 = pt.SubroutineDefinition(sub3Impl, pt.TealType.none)
+    subroutine4 = pt.SubroutineDefinition(sub4Impl, pt.TealType.none)
+    subroutine5 = pt.SubroutineDefinition(sub5Impl, pt.TealType.none)
+
+    subroutines = {
+        subroutine1: {subroutine2, subroutine4},
+        subroutine2: {subroutine3, subroutine5},
+        subroutine3: set(),
+        subroutine4: set(),
+        subroutine5: set(),
+    }
+
+    expected = {
+        subroutine1: set(),
+        subroutine2: {subroutine4},
+        subroutine3: {subroutine4, subroutine5},
+        subroutine4: {subroutine2, subroutine3, subroutine5},
+        subroutine5: {subroutine3, subroutine4},
+    }
+
+    actual = find_callstack_exclusive_subroutines(subroutines)
+    assert actual == expected
+
+
+def test_find_callstack_exclusive_subroutines_leaf_subroutines_recursive():
+    def sub1Impl():
+        return None
+
+    def sub2Impl(a1):
+        return None
+
+    def sub3Impl(a1, a2, a3):
+        return None
+
+    def sub4Impl():
+        return None
+
+    subroutine1 = pt.SubroutineDefinition(sub1Impl, pt.TealType.uint64)
+    subroutine2 = pt.SubroutineDefinition(sub2Impl, pt.TealType.bytes)
+    subroutine3 = pt.SubroutineDefinition(sub3Impl, pt.TealType.none)
+    subroutine4 = pt.SubroutineDefinition(sub4Impl, pt.TealType.none)
+
+    subroutines = {
+        subroutine1: {subroutine2, subroutine3},
+        subroutine2: {subroutine1, subroutine4},
+        subroutine3: set(),
+        subroutine4: set(),
+    }
+
+    expected = {
+        subroutine1: set(),
+        subroutine2: set(),
+        subroutine3: {subroutine4},
+        subroutine4: {subroutine3},
+    }
+
+    actual = find_callstack_exclusive_subroutines(subroutines)
+    assert actual == expected
+
+
+def test_find_recursion_points_empty():
+    subroutines = dict()
+
+    expected = dict()
+    actual = find_recursion_points(subroutines)
+    assert actual == expected
+
+
+def test_find_recursion_points_none():
     def sub1Impl():
         return None
 
@@ -45,11 +162,11 @@ def sub3Impl(a1, a2, a3):
         subroutine3: set(),
     }
 
-    actual = findRecursionPoints(subroutines)
+    actual = find_recursion_points(subroutines)
     assert actual == expected
 
 
-def test_findRecursionPoints_direct_recursion():
+def test_find_recursion_points_direct_recursion():
     def sub1Impl():
         return None
 
@@ -75,11 +192,11 @@ def sub3Impl(a1, a2, a3):
         subroutine3: set(),
     }
 
-    actual = findRecursionPoints(subroutines)
+    actual = find_recursion_points(subroutines)
     assert actual == expected
 
 
-def test_findRecursionPoints_mutual_recursion():
+def test_find_recursion_points_mutual_recursion():
     def sub1Impl():
         return None
 
@@ -105,11 +222,11 @@ def sub3Impl(a1, a2, a3):
         subroutine3: set(),
     }
 
-    actual = findRecursionPoints(subroutines)
+    actual = find_recursion_points(subroutines)
     assert actual == expected
 
 
-def test_findRecursionPoints_direct_and_mutual_recursion():
+def test_find_recursion_points_direct_and_mutual_recursion():
     def sub1Impl():
         return None
 
@@ -135,7 +252,7 @@ def sub3Impl(a1, a2, a3):
         subroutine3: set(),
     }
 
-    actual = findRecursionPoints(subroutines)
+    actual = find_recursion_points(subroutines)
     assert actual == expected
 
 

From ab9b3418e5fd5f319f45684b0a25ed58b9f08174 Mon Sep 17 00:00:00 2001
From: Jason Paulos <jasonpaulos@users.noreply.github.com>
Date: Tue, 13 Sep 2022 14:17:54 -0700
Subject: [PATCH 2/3] Fix missing len in error message

---
 pyteal/compiler/scratchslots.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyteal/compiler/scratchslots.py b/pyteal/compiler/scratchslots.py
index 66905d109..480610ae0 100644
--- a/pyteal/compiler/scratchslots.py
+++ b/pyteal/compiler/scratchslots.py
@@ -224,7 +224,7 @@ def assignScratchSlotsToSubroutines(
     if len(slots_to_assign) > NUM_SLOTS:
         raise TealInternalError(
             "Too many slots in use: {}, maximum is {}".format(
-                slots_to_assign, NUM_SLOTS
+                len(slots_to_assign), NUM_SLOTS
             )
         )
 

From 61fbc82e7f0c9985f6d67c4fc479167cfbbe6d58 Mon Sep 17 00:00:00 2001
From: Jason Paulos <jasonpaulos@users.noreply.github.com>
Date: Wed, 14 Sep 2022 08:17:58 -0700
Subject: [PATCH 3/3] Move greedy algorithm to separate function to better show
 where it starts and ends

---
 .../application/abi/algobank_approval.teal    |   8 +-
 pyteal/compiler/scratchslots.py               | 183 +++++++++++-------
 2 files changed, 112 insertions(+), 79 deletions(-)

diff --git a/examples/application/abi/algobank_approval.teal b/examples/application/abi/algobank_approval.teal
index 553b3fd20..b79c430fb 100644
--- a/examples/application/abi/algobank_approval.teal
+++ b/examples/application/abi/algobank_approval.teal
@@ -202,23 +202,23 @@ retsub
 
 // withdraw
 withdraw_3:
-store 5
 store 6
+store 5
 txn Sender
 byte "balance"
 txn Sender
 byte "balance"
 app_local_get
-load 6
+load 5
 -
 app_local_put
 itxn_begin
 int pay
 itxn_field TypeEnum
-load 5
+load 6
 txnas Accounts
 itxn_field Receiver
-load 6
+load 5
 itxn_field Amount
 int 0
 itxn_field Fee
diff --git a/pyteal/compiler/scratchslots.py b/pyteal/compiler/scratchslots.py
index 480610ae0..61bbbcf97 100644
--- a/pyteal/compiler/scratchslots.py
+++ b/pyteal/compiler/scratchslots.py
@@ -88,6 +88,100 @@ def collectSlotsFromBlock(block: TealBlock, slots: Set[ScratchSlot]):
     return global_slots, local_slots
 
 
+def combine_subroutine_slot_assignments_greedy_algorithm(
+    combined_subroutine_groups: list[set[Optional[SubroutineDefinition]]],
+    subroutine_graph: dict[Optional[SubroutineDefinition], set[SubroutineDefinition]],
+) -> None:
+    """This is an imperfect greedy algorithm to share scratch slot assignments between callstack
+    exclusive subroutines.
+
+    * It's granularity is at the subroutine level, meaning it decides that two subroutines must share
+      all of their scratch slot assignments, or none of them.
+    * It uses the "exclusivity" of a subroutine (i.e. how many other subroutines it's callstack
+      exclusive with) as a heuristic to combine subroutines.
+    * Analysis has not been done to prove that this algorithm always terminates (or if its results
+      are anywhere near optimal).
+    * WARNING: this algorithm DOES NOT honor user-defined scratch slots. Those slots may be
+      assigned to a numeric slot which IS NOT what the user specified.
+
+    Args:
+        combined_subroutine_groups: A list of sets of subroutines. Each set indicates subroutines
+            which will share scratch slot assignments. This is a makeshift union-find data
+            structure.
+        subroutine_graph: A graph of subroutines. Each key is a subroutine (the main routine should
+            not be present), which represents a node in the graph. Each value is a set of all
+            subroutines that specific subroutine calls, which represent directional edges in the
+            graph.
+    """
+    callstack_exclusive_subroutines = find_callstack_exclusive_subroutines(
+        subroutine_graph
+    )
+
+    if len(callstack_exclusive_subroutines) == 0:
+        return
+
+    # choose "most exclusive" (i.e. most compatible) subroutine to start
+    current_subroutine = max(
+        callstack_exclusive_subroutines.keys(),
+        key=lambda s: len(callstack_exclusive_subroutines[s]),
+    )
+    while True:
+        group_index = -1
+        for i, group in enumerate(combined_subroutine_groups):
+            if current_subroutine in group:
+                group_index = i
+                break
+
+        # only look at subroutines we're not already grouped with
+        new_callstack_exclusive = [
+            s
+            for s in callstack_exclusive_subroutines[current_subroutine]
+            if s not in combined_subroutine_groups[group_index]
+        ]
+        if len(new_callstack_exclusive) == 0:
+            # nothing else to do
+            break
+
+        # choose the "most exclusive" subroutine that is exclusive to `current_subroutine`
+        to_combine = max(
+            new_callstack_exclusive,
+            key=lambda s: len(callstack_exclusive_subroutines[s]),
+        )
+        # Share scratch slot assignments between `current_subroutine` and `to_combine`.
+        to_combine_group_index = -1
+        for i, group in enumerate(combined_subroutine_groups):
+            if to_combine in group:
+                to_combine_group_index = i
+                break
+        combined_subroutine_groups[group_index] |= combined_subroutine_groups[
+            to_combine_group_index
+        ]
+        combined_subroutine_groups.pop(to_combine_group_index)
+
+        # BEWARE! Now that we've decided to share scratch slot assignments between the two
+        # subroutines, this potentially limits the other subroutines that they can share assignments
+        # with. Specifically, if even if `current_subroutine` is callstack exclusive with another
+        # subroutine `X`, if `to_combine` is not callstack exclusive with `X`, it's no longer safe
+        # for `current_subroutine` to share assignments with `X`. We encode this constraint by
+        # taking the intersection of `current_subroutine` and `to_combine`'s callstack exclusive
+        # subroutines.
+        intersection = (
+            callstack_exclusive_subroutines[current_subroutine]
+            & callstack_exclusive_subroutines[to_combine]
+        )
+        callstack_exclusive_subroutines[current_subroutine] = intersection | {
+            to_combine
+        }
+        callstack_exclusive_subroutines[to_combine] = intersection | {
+            cast(SubroutineDefinition, current_subroutine)
+        }
+
+        current_subroutine = max(
+            callstack_exclusive_subroutines.keys(),
+            key=lambda s: len(callstack_exclusive_subroutines[s]),
+        )
+
+
 def assignScratchSlotsToSubroutines(
     subroutineBlocks: Dict[Optional[SubroutineDefinition], TealBlock],
     subroutine_graph: dict[Optional[SubroutineDefinition], set[SubroutineDefinition]],
@@ -99,6 +193,10 @@ def assignScratchSlotsToSubroutines(
             blocks. The key None is taken to mean the main program routine. The values of this
             map will be modified in order to assign specific slot values to all referenced scratch
             slots.
+        subroutine_graph: A graph of subroutines. Each key is a subroutine (the main routine should
+            not be present), which represents a node in the graph. Each value is a set of all
+            subroutines that specific subroutine calls, which represent directional edges in the
+            graph.
 
     Raises:
         TealInternalError: if the scratch slots referenced by the program do not fit into 256 slots,
@@ -115,84 +213,19 @@ def assignScratchSlotsToSubroutines(
         *local_slots.values()
     )
 
-    # This is an imperfect greedy algorithm to share scratch slot assignments between callstack
-    # exclusive subroutines.
-    # * It's granularity is at the subroutine level, meaning it decides that two subroutines must share
-    #   all of their scratch slot assignments, or none of them.
-    # * It uses the "exclusivity" of a subroutine (i.e. how many other subroutines it's callstack
-    #   exclusive with) as a heuristic to combine subroutines.
-    # * Analysis has not been done to prove that this algorithm always terminates (or if its results
-    #   are anywhere near optimal).
-    # * WARNING: this algorithm DOES NOT honor user-defined scratch slots. Those slots may be
-    #   assigned to a numeric slot which IS NOT what the user specified.
-    callstack_exclusive_subroutines = find_callstack_exclusive_subroutines(
-        subroutine_graph
-    )
-    # combined_subroutine_groups is a makeshift union-find data structure
+    # combined_subroutine_groups is a makeshift union-find data structure which identifies which
+    # subroutines will share scratch slot assignments.
+    # TODO: replace this with an actual union-find data structure
+    # TODO: it may make more sense to decide whether two subroutines can share an assignment on a
+    # slot-by-slot basis, instead of grouping all a subroutine's slots together.
     combined_subroutine_groups: list[set[Optional[SubroutineDefinition]]] = [
-        {s} for s in callstack_exclusive_subroutines.keys()
+        {s} for s in subroutine_graph.keys()
     ]
-    if len(callstack_exclusive_subroutines) != 0:
-        # choose "most exclusive" (i.e. most compatible) subroutine to start
-        current_subroutine = max(
-            callstack_exclusive_subroutines.keys(),
-            key=lambda s: len(callstack_exclusive_subroutines[s]),
-        )
-        while True:
-            group_index = -1
-            for i, group in enumerate(combined_subroutine_groups):
-                if current_subroutine in group:
-                    group_index = i
-                    break
-
-            # only look at subroutines we're not already grouped with
-            new_callstack_exclusive = [
-                s
-                for s in callstack_exclusive_subroutines[current_subroutine]
-                if s not in combined_subroutine_groups[group_index]
-            ]
-            if len(new_callstack_exclusive) == 0:
-                # nothing else to do
-                break
 
-            # choose the "most exclusive" subroutine that is exclusive to `current_subroutine`
-            to_combine = max(
-                new_callstack_exclusive,
-                key=lambda s: len(callstack_exclusive_subroutines[s]),
-            )
-            # Share scratch slot assignments between `current_subroutine` and `to_combine`.
-            to_combine_group_index = -1
-            for i, group in enumerate(combined_subroutine_groups):
-                if to_combine in group:
-                    to_combine_group_index = i
-                    break
-            combined_subroutine_groups[group_index] |= combined_subroutine_groups[
-                to_combine_group_index
-            ]
-            combined_subroutine_groups.pop(to_combine_group_index)
-
-            # BEWARE! Now that we've decided to share scratch slot assignments between the two
-            # subroutines, this potentially limits the other subroutines that they can share assignments
-            # with. Specifically, if even if `current_subroutine` is callstack exclusive with another
-            # subroutine `X`, if `to_combine` is not callstack exclusive with `X`, it's no longer safe
-            # for `current_subroutine` to share assignments with `X`. We encode this constraint by
-            # taking the intersection of `current_subroutine` and `to_combine`'s callstack exclusive
-            # subroutines.
-            intersection = (
-                callstack_exclusive_subroutines[current_subroutine]
-                & callstack_exclusive_subroutines[to_combine]
-            )
-            callstack_exclusive_subroutines[current_subroutine] = intersection | {
-                to_combine
-            }
-            callstack_exclusive_subroutines[to_combine] = intersection | {
-                cast(SubroutineDefinition, current_subroutine)
-            }
-
-            current_subroutine = max(
-                callstack_exclusive_subroutines.keys(),
-                key=lambda s: len(callstack_exclusive_subroutines[s]),
-            )
+    # TODO: implement a way to opt into this optimization -- don't always run it
+    combine_subroutine_slot_assignments_greedy_algorithm(
+        combined_subroutine_groups, subroutine_graph
+    )
 
     # the "spokesperson" for a group is the subroutine with the largest number of local slots
     combined_subroutine_groups_spokesperson: list[Optional[SubroutineDefinition]] = []