From 56cd3614a572cc463281c096e4710f35c4fb145e Mon Sep 17 00:00:00 2001
From: Max Marrone <max@opentrons.com>
Date: Mon, 7 Oct 2024 11:11:46 -0400
Subject: [PATCH] feat(robot-server): Wire up global error recovery setting
 (#16416)

## Overview

Closes EXEC-719.

## Changelog

* Connect the new `PATCH /errorRecovery/settings` HTTP API so it
actually affects run behavior. This closes EXEC-719.
    * One quirk to my implementation:

This will not affect preexisting runs, *unless* you *also* do `PUT
/runs/{id}/errorRecoveryPolicy`. In effect, `PUT
/runs/{id}/errorRecoveryPolicy` refreshes the run's snapshot of
`/errorRecovery/settings`. This is probably confusing; it would be
better if `PATCH /errorRecovery/settings` immediately affected the
preexisting run, or if `PUT /runs/{id}/errorRecoveryPolicy` did not have
that refresh effect.

I don't think this matters in practice because this button will be
disabled or inaccessible in our UIs while there's an active run.

Also fix a couple of nearby, unticketed bugs:

* If a client gave us an empty error recovery policy, we were no-opping.
Instead, we want to overwrite the current policy with the new empty one.
* The logic in `error_recovery_mapping.py` was unintentionally enabling
error recovery on OT-2 robots if the policy was specific enough. We want
it to always be disabled on OT-2s.
---
 .../protocol_engine/error_recovery_policy.py  |   1 +
 .../error_recovery/settings/router.py         |   4 -
 .../error_recovery/settings/store.py          |  13 +-
 .../robot_server/runs/dependencies.py         |  10 +-
 .../runs/error_recovery_mapping.py            | 121 +++++++----
 .../runs/error_recovery_models.py             |  49 +++--
 .../robot_server/runs/router/base_router.py   |  17 +-
 .../robot_server/runs/run_data_manager.py     |  34 ++-
 .../runs/run_orchestrator_store.py            |   8 +-
 .../error_recovery/settings/test_store.py     |   4 +-
 .../tests/runs/router/test_base_router.py     |   8 +-
 .../tests/runs/test_error_recovery_mapping.py |   6 +-
 .../tests/runs/test_run_data_manager.py       | 199 +++++++++---------
 ...tore.py => test_run_orchestrator_store.py} |  12 ++
 14 files changed, 283 insertions(+), 203 deletions(-)
 rename robot-server/tests/runs/{test_engine_store.py => test_run_orchestrator_store.py} (94%)

diff --git a/api/src/opentrons/protocol_engine/error_recovery_policy.py b/api/src/opentrons/protocol_engine/error_recovery_policy.py
index f9f39d99f4d..d959651393e 100644
--- a/api/src/opentrons/protocol_engine/error_recovery_policy.py
+++ b/api/src/opentrons/protocol_engine/error_recovery_policy.py
@@ -40,6 +40,7 @@ class ErrorRecoveryPolicy(Protocol):
     and return an appropriate `ErrorRecoveryType`.
 
     Args:
+        config: The config of the calling `ProtocolEngine`.
         failed_command: The command that failed, in its final `status=="failed"` state.
         defined_error_data: If the command failed with a defined error, details about
             that error. If the command failed with an undefined error, `None`.
diff --git a/robot-server/robot_server/error_recovery/settings/router.py b/robot-server/robot_server/error_recovery/settings/router.py
index 1a302b05582..4fdfeee5498 100644
--- a/robot-server/robot_server/error_recovery/settings/router.py
+++ b/robot-server/robot_server/error_recovery/settings/router.py
@@ -61,10 +61,6 @@ async def _get_current_response(
     store: ErrorRecoverySettingStore,
 ) -> PydanticResponse[SimpleBody[ResponseData]]:
     is_enabled = store.get_is_enabled()
-    if is_enabled is None:
-        # todo(mm, 2024-09-30): This defaulting will probably need to move down a layer
-        # when we connect this setting to `POST /runs`.
-        is_enabled = True
     return await PydanticResponse.create(
         SimpleBody.construct(data=ResponseData.construct(enabled=is_enabled))
     )
diff --git a/robot-server/robot_server/error_recovery/settings/store.py b/robot-server/robot_server/error_recovery/settings/store.py
index 6cef66aae2e..7bad6b5c77c 100644
--- a/robot-server/robot_server/error_recovery/settings/store.py
+++ b/robot-server/robot_server/error_recovery/settings/store.py
@@ -10,24 +10,25 @@
 from robot_server.persistence.tables import boolean_setting_table, BooleanSettingKey
 
 
+_ERROR_RECOVERY_ENABLED_DEFAULT = True
+
+
 class ErrorRecoverySettingStore:
     """Persistently stores settings related to error recovery."""
 
     def __init__(self, sql_engine: sqlalchemy.engine.Engine) -> None:
         self._sql_engine = sql_engine
 
-    def get_is_enabled(self) -> bool | None:
-        """Get the value of the "error recovery enabled" setting.
-
-        `None` is the default, i.e. it's never been explicitly set one way or the other.
-        """
+    def get_is_enabled(self) -> bool:
+        """Get the value of the "error recovery enabled" setting."""
         with self._sql_engine.begin() as transaction:
-            return transaction.execute(
+            result: bool | None = transaction.execute(
                 sqlalchemy.select(boolean_setting_table.c.value).where(
                     boolean_setting_table.c.key
                     == BooleanSettingKey.ENABLE_ERROR_RECOVERY
                 )
             ).scalar_one_or_none()
+        return result if result is not None else _ERROR_RECOVERY_ENABLED_DEFAULT
 
     def set_is_enabled(self, is_enabled: bool | None) -> None:
         """Set the value of the "error recovery enabled" setting.
diff --git a/robot-server/robot_server/runs/dependencies.py b/robot-server/robot_server/runs/dependencies.py
index 146297715a9..60c96d1e5e6 100644
--- a/robot-server/robot_server/runs/dependencies.py
+++ b/robot-server/robot_server/runs/dependencies.py
@@ -2,6 +2,10 @@
 from typing import Annotated
 
 from fastapi import Depends, status
+from robot_server.error_recovery.settings.store import (
+    ErrorRecoverySettingStore,
+    get_error_recovery_setting_store,
+)
 from robot_server.protocols.dependencies import get_protocol_store
 from robot_server.protocols.protocol_models import ProtocolKind
 from robot_server.protocols.protocol_store import ProtocolStore
@@ -156,12 +160,16 @@ async def get_run_data_manager(
     ],
     run_store: Annotated[RunStore, Depends(get_run_store)],
     runs_publisher: Annotated[RunsPublisher, Depends(get_runs_publisher)],
+    error_recovery_setting_store: Annotated[
+        ErrorRecoverySettingStore, Depends(get_error_recovery_setting_store)
+    ],
 ) -> RunDataManager:
     """Get a run data manager to keep track of current/historical run data."""
     return RunDataManager(
-        task_runner=task_runner,
         run_orchestrator_store=run_orchestrator_store,
         run_store=run_store,
+        error_recovery_setting_store=error_recovery_setting_store,
+        task_runner=task_runner,
         runs_publisher=runs_publisher,
     )
 
diff --git a/robot-server/robot_server/runs/error_recovery_mapping.py b/robot-server/robot_server/runs/error_recovery_mapping.py
index fb936eddadd..d29ebf4b054 100644
--- a/robot-server/robot_server/runs/error_recovery_mapping.py
+++ b/robot-server/robot_server/runs/error_recovery_mapping.py
@@ -1,5 +1,4 @@
 """Functions used for managing error recovery policy."""
-from typing import Optional
 from opentrons.protocol_engine.state.config import Config
 from robot_server.runs.error_recovery_models import ErrorRecoveryRule, ReactionIfMatch
 from opentrons.protocol_engine.commands.command_unions import (
@@ -14,52 +13,96 @@
 
 def create_error_recovery_policy_from_rules(
     rules: list[ErrorRecoveryRule],
+    enabled: bool,
 ) -> ErrorRecoveryPolicy:
-    """Given a list of error recovery rules return an error recovery policy."""
+    """Map a robot-server error recovery policy to an opentrons.protocol_engine one.
 
-    def _policy(
+    In its HTTP API, robot-server expresses error recovery policies as Pydantic models.
+    But opentrons.protocol_engine is more general, expressing them as Python callables.
+
+    Args:
+        rules: The rules in the robot-server error recovery policy.
+        enabled: Whether error recovery should be enabled at all.
+            If `False`, `rules` is ignored.
+
+    Returns:
+        An error recovery policy in `opentrons.protocol_engine` terms.
+    """
+
+    def mapped_policy(
         config: Config,
         failed_command: Command,
-        defined_error_data: Optional[CommandDefinedErrorData],
+        defined_error_data: CommandDefinedErrorData | None,
     ) -> ErrorRecoveryType:
-        for rule in rules:
-            command_type_matches = (
-                failed_command.commandType == rule.matchCriteria.command.commandType
-            )
-            error_type_matches = (
-                defined_error_data is not None
-                and defined_error_data.public.errorType
-                == rule.matchCriteria.command.error.errorType
+        first_matching_rule = next(
+            (
+                rule
+                for rule in rules
+                if _rule_matches_error(rule, failed_command, defined_error_data)
+            ),
+            None,
+        )
+        robot_is_flex = config.robot_type == "OT-3 Standard"
+        error_is_defined = defined_error_data is not None
+
+        if not enabled:
+            return ErrorRecoveryType.FAIL_RUN
+        elif not robot_is_flex:
+            # Although error recovery can theoretically work on OT-2s, we haven't tested
+            # it, and it's generally scarier because the OT-2 has much less hardware
+            # feedback.
+            return ErrorRecoveryType.FAIL_RUN
+        elif first_matching_rule is not None:
+            # The input policy explicitly deals this error, so do what it says.
+            return _map_error_recovery_type(first_matching_rule.ifMatch)
+        else:
+            # The input policy doesn't explicitly deal with this error, so the decision
+            # is our own.
+            #
+            # We try to WAIT_FOR_RECOVERY whenever we can, for two reasons:
+            #
+            # 1. It matches the frontend's current expectations.
+            #    For example, the frontend expects us to WAIT_FOR_RECOVERY on
+            #    overpressure errors, but it does not send us an error recovery policy
+            #    that explicitly says that; it relies on this default.
+            # 2. Philosophically, we always want to give the operator a shot at
+            #    recovery, even if we don't know the details of the problem and can't
+            #    guarantee good robot behavior if they keep using it.
+            #
+            # We currently FAIL_RUN for undefined errors, with the thinking that they
+            # are especially likely to have messed something up in Protocol Engine's
+            # internal state, and that they are especially likely to cause confusing
+            # behavior. But we might want to change that--see point (2) above.
+            return (
+                ErrorRecoveryType.WAIT_FOR_RECOVERY
+                if error_is_defined
+                else ErrorRecoveryType.FAIL_RUN
             )
 
-            if command_type_matches and error_type_matches:
-                if rule.ifMatch == ReactionIfMatch.IGNORE_AND_CONTINUE:
-                    return ErrorRecoveryType.IGNORE_AND_CONTINUE
-                elif rule.ifMatch == ReactionIfMatch.FAIL_RUN:
-                    return ErrorRecoveryType.FAIL_RUN
-                elif rule.ifMatch == ReactionIfMatch.WAIT_FOR_RECOVERY:
-                    return ErrorRecoveryType.WAIT_FOR_RECOVERY
+    return mapped_policy
 
-        return default_error_recovery_policy(config, failed_command, defined_error_data)
 
-    return _policy
+def _rule_matches_error(
+    rule: ErrorRecoveryRule,
+    failed_command: Command,
+    defined_error_data: CommandDefinedErrorData | None,
+) -> bool:
+    command_type_matches = (
+        failed_command.commandType == rule.matchCriteria.command.commandType
+    )
+    error_type_matches = (
+        defined_error_data is not None
+        and defined_error_data.public.errorType
+        == rule.matchCriteria.command.error.errorType
+    )
+    return command_type_matches and error_type_matches
 
 
-def default_error_recovery_policy(
-    config: Config,
-    failed_command: Command,
-    defined_error_data: Optional[CommandDefinedErrorData],
-) -> ErrorRecoveryType:
-    """The `ErrorRecoveryPolicy` to use when none has been set on a run."""
-    # Although error recovery can theoretically work on OT-2s, we haven't tested it,
-    # and it's generally scarier because the OT-2 has much less hardware feedback.
-    robot_is_flex = config.robot_type == "OT-3 Standard"
-    # If the error is defined, we're taking that to mean that we should
-    # WAIT_FOR_RECOVERY. This is not necessarily the right long-term logic--we might
-    # want to FAIL_RUN on certain defined errors and WAIT_FOR_RECOVERY on certain
-    # undefined errors--but this is convenient for now.
-    error_is_defined = defined_error_data is not None
-    if robot_is_flex and error_is_defined:
-        return ErrorRecoveryType.WAIT_FOR_RECOVERY
-    else:
-        return ErrorRecoveryType.FAIL_RUN
+def _map_error_recovery_type(reaction_if_match: ReactionIfMatch) -> ErrorRecoveryType:
+    match reaction_if_match:
+        case ReactionIfMatch.IGNORE_AND_CONTINUE:
+            return ErrorRecoveryType.IGNORE_AND_CONTINUE
+        case ReactionIfMatch.FAIL_RUN:
+            return ErrorRecoveryType.FAIL_RUN
+        case ReactionIfMatch.WAIT_FOR_RECOVERY:
+            return ErrorRecoveryType.WAIT_FOR_RECOVERY
diff --git a/robot-server/robot_server/runs/error_recovery_models.py b/robot-server/robot_server/runs/error_recovery_models.py
index 5558c65a8ac..a2990a007cb 100644
--- a/robot-server/robot_server/runs/error_recovery_models.py
+++ b/robot-server/robot_server/runs/error_recovery_models.py
@@ -4,13 +4,31 @@
 
 from pydantic import BaseModel, Field
 
+# There's a lot of nested classes here.
+# Here's an example of a JSON document that this code models:
+# {
+#   "policyRules": [
+#     {
+#       "matchCriteria": {
+#         "command": {
+#           "commandType": "foo",
+#           "error": {
+#             "errorType": "bar"
+#           }
+#         }
+#       },
+#       "ifMatch": "ignoreAndContinue"
+#     }
+#   ]
+# }
+
 
 class ReactionIfMatch(Enum):
-    """The type of the error recovery setting.
+    """How to handle a given error.
 
-    * `"ignoreAndContinue"`: Ignore this error and future errors of the same type.
-    * `"failRun"`: Errors of this type should fail the run.
-    * `"waitForRecovery"`: Instances of this error should initiate a recover operation.
+    * `"ignoreAndContinue"`: Ignore this error and continue with the next command.
+    * `"failRun"`: Fail the run.
+    * `"waitForRecovery"`: Enter interactive error recovery mode.
 
     """
 
@@ -19,20 +37,6 @@ class ReactionIfMatch(Enum):
     WAIT_FOR_RECOVERY = "waitForRecovery"
 
 
-# There's a lot of nested classes here. This is the JSON schema this code models.
-# "ErrorRecoveryRule": {
-#     "matchCriteria": {
-#         "command": {
-#             "commandType": "foo",
-#             "error": {
-#                 "errorType": "bar"
-#             }
-#         }
-#     },
-#     "ifMatch": "baz"
-# }
-
-
 class ErrorMatcher(BaseModel):
     """The error type that this rule applies to."""
 
@@ -67,7 +71,7 @@ class ErrorRecoveryRule(BaseModel):
     )
     ifMatch: ReactionIfMatch = Field(
         ...,
-        description="The specific recovery setting that will be in use if the type parameters match.",
+        description="How to handle errors matched by this rule.",
     )
 
 
@@ -76,6 +80,9 @@ class ErrorRecoveryPolicy(BaseModel):
 
     policyRules: List[ErrorRecoveryRule] = Field(
         ...,
-        description="A list or error recovery rules to apply for a run's recovery management."
-        "The rules are evaluated first-to-last. The first exact match will dectate recovery management.",
+        description=(
+            "A list of error recovery rules to apply for a run's recovery management."
+            " The rules are evaluated first-to-last."
+            " The first exact match will dictate recovery management."
+        ),
     )
diff --git a/robot-server/robot_server/runs/router/base_router.py b/robot-server/robot_server/runs/router/base_router.py
index 3b0e7040e02..b9bd8cd24b2 100644
--- a/robot-server/robot_server/runs/router/base_router.py
+++ b/robot-server/robot_server/runs/router/base_router.py
@@ -426,11 +426,13 @@ async def update_run(
 @PydanticResponse.wrap_route(
     base_router.put,
     path="/runs/{runId}/errorRecoveryPolicy",
-    summary="Set run policies",
+    summary="Set a run's error recovery policy",
     description=dedent(
         """
         Update how to handle different kinds of command failures.
-        The following rules will persist during the run.
+
+        For this to have any effect, error recovery must also be enabled globally.
+        See `PATCH /errorRecovery/settings`.
         """
     ),
     status_code=status.HTTP_201_CREATED,
@@ -451,12 +453,11 @@ async def put_error_recovery_policy(
         request_body:  Request body with run policies data.
         run_data_manager: Current and historical run data management.
     """
-    policies = request_body.data.policyRules
-    if policies:
-        try:
-            run_data_manager.set_policies(run_id=runId, policies=policies)
-        except RunNotCurrentError as e:
-            raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e
+    rules = request_body.data.policyRules
+    try:
+        run_data_manager.set_error_recovery_rules(run_id=runId, rules=rules)
+    except RunNotCurrentError as e:
+        raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e
 
     return await PydanticResponse.create(
         content=SimpleEmptyBody.construct(),
diff --git a/robot-server/robot_server/runs/run_data_manager.py b/robot-server/robot_server/runs/run_data_manager.py
index b756b185bf1..4168b1d4d5d 100644
--- a/robot-server/robot_server/runs/run_data_manager.py
+++ b/robot-server/robot_server/runs/run_data_manager.py
@@ -22,6 +22,7 @@
     CSVRuntimeParamPaths,
 )
 
+from robot_server.error_recovery.settings.store import ErrorRecoverySettingStore
 from robot_server.protocols.protocol_store import ProtocolResource
 from robot_server.service.task_runner import TaskRunner
 from robot_server.service.notifications import RunsPublisher
@@ -35,6 +36,9 @@
 from opentrons.protocol_engine.types import DeckConfigurationType, RunTimeParameter
 
 
+_INITIAL_ERROR_RECOVERY_RULES: list[ErrorRecoveryRule] = []
+
+
 def _build_run(
     run_resource: Union[RunResource, BadRunResource],
     state_summary: Union[StateSummary, BadStateSummary],
@@ -145,11 +149,13 @@ def __init__(
         self,
         run_orchestrator_store: RunOrchestratorStore,
         run_store: RunStore,
+        error_recovery_setting_store: ErrorRecoverySettingStore,
         task_runner: TaskRunner,
         runs_publisher: RunsPublisher,
     ) -> None:
         self._run_orchestrator_store = run_orchestrator_store
         self._run_store = run_store
+        self._error_recovery_setting_store = error_recovery_setting_store
         self._task_runner = task_runner
         self._runs_publisher = runs_publisher
 
@@ -190,6 +196,7 @@ async def create(
         """
         prev_run_id = self._run_orchestrator_store.current_run_id
         if prev_run_id is not None:
+            # Allow clear() to propagate RunConflictError.
             prev_run_result = await self._run_orchestrator_store.clear()
             self._run_store.update_run_state(
                 run_id=prev_run_id,
@@ -197,9 +204,18 @@ async def create(
                 commands=prev_run_result.commands,
                 run_time_parameters=prev_run_result.parameters,
             )
+
+        error_recovery_is_enabled = self._error_recovery_setting_store.get_is_enabled()
+        initial_error_recovery_policy = (
+            error_recovery_mapping.create_error_recovery_policy_from_rules(
+                _INITIAL_ERROR_RECOVERY_RULES, error_recovery_is_enabled
+            )
+        )
+
         state_summary = await self._run_orchestrator_store.create(
             run_id=run_id,
             labware_offsets=labware_offsets,
+            initial_error_recovery_policy=initial_error_recovery_policy,
             deck_configuration=deck_configuration,
             protocol=protocol,
             run_time_param_values=run_time_param_values,
@@ -215,6 +231,7 @@ async def create(
         self._run_store.insert_csv_rtp(
             run_id=run_id, run_time_parameters=run_time_parameters
         )
+
         self._runs_publisher.start_publishing_for_run(
             get_current_command=self.get_current_command,
             get_recovery_target_command=self.get_recovery_target_command,
@@ -498,16 +515,23 @@ def get_all_commands_as_preserialized_list(
             run_id, include_fixit_commands
         )
 
-    def set_policies(self, run_id: str, policies: List[ErrorRecoveryRule]) -> None:
-        """Create run policy rules for error recovery."""
+    def set_error_recovery_rules(
+        self, run_id: str, rules: List[ErrorRecoveryRule]
+    ) -> None:
+        """Set the run's error recovery policy.
+
+        The input rules get combined with the global error recovery enabled/disabled
+        setting, which this method retrieves automatically.
+        """
         if run_id != self._run_orchestrator_store.current_run_id:
             raise RunNotCurrentError(
                 f"Cannot update {run_id} because it is not the current run."
             )
-        policy = error_recovery_mapping.create_error_recovery_policy_from_rules(
-            policies
+        is_enabled = self._error_recovery_setting_store.get_is_enabled()
+        mapped_policy = error_recovery_mapping.create_error_recovery_policy_from_rules(
+            rules, is_enabled
         )
-        self._run_orchestrator_store.set_error_recovery_policy(policy=policy)
+        self._run_orchestrator_store.set_error_recovery_policy(policy=mapped_policy)
 
     def _get_state_summary(self, run_id: str) -> Union[StateSummary, BadStateSummary]:
         if run_id == self._run_orchestrator_store.current_run_id:
diff --git a/robot-server/robot_server/runs/run_orchestrator_store.py b/robot-server/robot_server/runs/run_orchestrator_store.py
index 46a384b96ea..03af7315ef9 100644
--- a/robot-server/robot_server/runs/run_orchestrator_store.py
+++ b/robot-server/robot_server/runs/run_orchestrator_store.py
@@ -53,8 +53,6 @@
 )
 from opentrons_shared_data.labware.types import LabwareUri
 
-from .error_recovery_mapping import default_error_recovery_policy
-
 _log = logging.getLogger(__name__)
 
 
@@ -119,8 +117,6 @@ def run_handler_in_engine_thread_from_hardware_thread(
 class RunOrchestratorStore:
     """Factory and in-memory storage for ProtocolEngine."""
 
-    _run_orchestrator: Optional[RunOrchestrator] = None
-
     def __init__(
         self,
         hardware_api: HardwareControlAPI,
@@ -138,6 +134,7 @@ def __init__(
         self._hardware_api = hardware_api
         self._robot_type = robot_type
         self._deck_type = deck_type
+        self._run_orchestrator: Optional[RunOrchestrator] = None
         self._default_run_orchestrator: Optional[RunOrchestrator] = None
         hardware_api.register_callback(_get_estop_listener(self))
 
@@ -194,6 +191,7 @@ async def create(
         self,
         run_id: str,
         labware_offsets: List[LabwareOffsetCreate],
+        initial_error_recovery_policy: error_recovery_policy.ErrorRecoveryPolicy,
         deck_configuration: DeckConfigurationType,
         notify_publishers: Callable[[], None],
         protocol: Optional[ProtocolResource],
@@ -235,7 +233,7 @@ async def create(
                     RobotTypeEnum.robot_literal_to_enum(self._robot_type)
                 ),
             ),
-            error_recovery_policy=default_error_recovery_policy,
+            error_recovery_policy=initial_error_recovery_policy,
             load_fixed_trash=load_fixed_trash,
             deck_configuration=deck_configuration,
             notify_publishers=notify_publishers,
diff --git a/robot-server/tests/error_recovery/settings/test_store.py b/robot-server/tests/error_recovery/settings/test_store.py
index cc69f5d307f..e67f5e72ee9 100644
--- a/robot-server/tests/error_recovery/settings/test_store.py
+++ b/robot-server/tests/error_recovery/settings/test_store.py
@@ -17,7 +17,7 @@ def subject(
 
 def test_error_recovery_setting_store(subject: ErrorRecoverySettingStore) -> None:
     """Test `ErrorRecoverySettingStore`."""
-    assert subject.get_is_enabled() is None
+    assert subject.get_is_enabled() is True
 
     subject.set_is_enabled(is_enabled=False)
     assert subject.get_is_enabled() is False
@@ -26,4 +26,4 @@ def test_error_recovery_setting_store(subject: ErrorRecoverySettingStore) -> Non
     assert subject.get_is_enabled() is True
 
     subject.set_is_enabled(is_enabled=None)
-    assert subject.get_is_enabled() is None
+    assert subject.get_is_enabled() is True
diff --git a/robot-server/tests/runs/router/test_base_router.py b/robot-server/tests/runs/router/test_base_router.py
index 37e5cd6dd3d..8a10af1940d 100644
--- a/robot-server/tests/runs/router/test_base_router.py
+++ b/robot-server/tests/runs/router/test_base_router.py
@@ -698,8 +698,8 @@ async def test_create_policies(
         run_data_manager=mock_run_data_manager,
     )
     decoy.verify(
-        mock_run_data_manager.set_policies(
-            run_id="rud-id", policies=policies.policyRules
+        mock_run_data_manager.set_error_recovery_rules(
+            run_id="rud-id", rules=policies.policyRules
         )
     )
 
@@ -710,8 +710,8 @@ async def test_create_policies_raises_not_active_run(
     """It should raise that the run is not current."""
     policies = decoy.mock(cls=ErrorRecoveryPolicy)
     decoy.when(
-        mock_run_data_manager.set_policies(
-            run_id="rud-id", policies=policies.policyRules
+        mock_run_data_manager.set_error_recovery_rules(
+            run_id="rud-id", rules=policies.policyRules
         )
     ).then_raise(RunNotCurrentError())
     with pytest.raises(ApiError) as exc_info:
diff --git a/robot-server/tests/runs/test_error_recovery_mapping.py b/robot-server/tests/runs/test_error_recovery_mapping.py
index a142fbc5e30..fba8e4315d9 100644
--- a/robot-server/tests/runs/test_error_recovery_mapping.py
+++ b/robot-server/tests/runs/test_error_recovery_mapping.py
@@ -71,7 +71,7 @@ def test_create_error_recovery_policy_with_rules(
     mock_rule: ErrorRecoveryRule,
 ) -> None:
     """Should return IGNORE_AND_CONTINUE if that's what we specify as the rule."""
-    policy = create_error_recovery_policy_from_rules([mock_rule])
+    policy = create_error_recovery_policy_from_rules([mock_rule], enabled=True)
     exampleConfig = Config(
         robot_type="OT-3 Standard",
         deck_type=DeckType.OT3_STANDARD,
@@ -86,7 +86,7 @@ def test_create_error_recovery_policy_undefined_error(
     decoy: Decoy, mock_command: LiquidProbe
 ) -> None:
     """Should return a FAIL_RUN policy when error is not defined."""
-    policy = create_error_recovery_policy_from_rules(rules=[])
+    policy = create_error_recovery_policy_from_rules(rules=[], enabled=True)
     exampleConfig = Config(
         robot_type="OT-3 Standard",
         deck_type=DeckType.OT3_STANDARD,
@@ -99,7 +99,7 @@ def test_create_error_recovery_policy_defined_error(
     decoy: Decoy, mock_command: LiquidProbe, mock_error_data: CommandDefinedErrorData
 ) -> None:
     """Should return a WAIT_FOR_RECOVERY policy when error is defined."""
-    policy = create_error_recovery_policy_from_rules(rules=[])
+    policy = create_error_recovery_policy_from_rules(rules=[], enabled=True)
     exampleConfig = Config(
         robot_type="OT-3 Standard",
         deck_type=DeckType.OT3_STANDARD,
diff --git a/robot-server/tests/runs/test_run_data_manager.py b/robot-server/tests/runs/test_run_data_manager.py
index 49399447597..981a0e7177c 100644
--- a/robot-server/tests/runs/test_run_data_manager.py
+++ b/robot-server/tests/runs/test_run_data_manager.py
@@ -1,10 +1,10 @@
 """Tests for RunDataManager."""
 from datetime import datetime
 from typing import Optional, List, Dict
+from unittest.mock import sentinel
 
 import pytest
 from decoy import Decoy, matchers
-from pathlib import Path
 
 from opentrons.protocol_engine import (
     EngineStatus,
@@ -21,16 +21,15 @@
     LabwareOffset,
     Liquid,
 )
-from opentrons.protocol_engine.error_recovery_policy import ErrorRecoveryPolicy
 from opentrons.protocol_engine.types import BooleanParameter, CSVParameter
 from opentrons.protocol_runner import RunResult
-from opentrons.types import DeckSlotName
 
 from opentrons.hardware_control.nozzle_manager import NozzleMap
 
 from opentrons_shared_data.errors.exceptions import InvalidStoredData
 from opentrons_shared_data.labware.labware_definition import LabwareDefinition
 
+from robot_server.error_recovery.settings.store import ErrorRecoverySettingStore
 from robot_server.protocols.protocol_models import ProtocolKind
 from robot_server.protocols.protocol_store import ProtocolResource
 from robot_server.runs import error_recovery_mapping
@@ -74,6 +73,12 @@ def mock_run_store(decoy: Decoy) -> RunStore:
     return decoy.mock(cls=RunStore)
 
 
+@pytest.fixture
+def mock_error_recovery_setting_store(decoy: Decoy) -> ErrorRecoverySettingStore:
+    """Get a mock ErrorRecoverySettingStore."""
+    return decoy.mock(cls=ErrorRecoverySettingStore)
+
+
 @pytest.fixture()
 def mock_task_runner(decoy: Decoy) -> TaskRunner:
     """Get a mock background TaskRunner."""
@@ -102,6 +107,20 @@ def engine_state_summary() -> StateSummary:
     )
 
 
+@pytest.fixture(autouse=True)
+def patch_error_recovery_mapping(decoy: Decoy, monkeypatch: pytest.MonkeyPatch) -> None:
+    """Replace the members of the error_recovery_mapping module with mocks."""
+    monkeypatch.setattr(
+        error_recovery_mapping,
+        "create_error_recovery_policy_from_rules",
+        decoy.mock(
+            func=decoy.mock(
+                func=error_recovery_mapping.create_error_recovery_policy_from_rules
+            )
+        ),
+    )
+
+
 @pytest.fixture()
 def run_time_parameters() -> List[pe_types.RunTimeParameter]:
     """Get a RunTimeParameter list."""
@@ -150,6 +169,7 @@ def run_command() -> commands.Command:
 def subject(
     mock_run_orchestrator_store: RunOrchestratorStore,
     mock_run_store: RunStore,
+    mock_error_recovery_setting_store: ErrorRecoverySettingStore,
     mock_task_runner: TaskRunner,
     mock_runs_publisher: RunsPublisher,
 ) -> RunDataManager:
@@ -157,6 +177,7 @@ def subject(
     return RunDataManager(
         run_orchestrator_store=mock_run_orchestrator_store,
         run_store=mock_run_store,
+        error_recovery_setting_store=mock_error_recovery_setting_store,
         task_runner=mock_task_runner,
         runs_publisher=mock_runs_publisher,
     )
@@ -166,6 +187,7 @@ async def test_create(
     decoy: Decoy,
     mock_run_orchestrator_store: RunOrchestratorStore,
     mock_run_store: RunStore,
+    mock_error_recovery_setting_store: ErrorRecoverySettingStore,
     subject: RunDataManager,
     engine_state_summary: StateSummary,
     run_resource: RunResource,
@@ -173,102 +195,33 @@ async def test_create(
     """It should create an engine and a persisted run resource."""
     run_id = "hello world"
     created_at = datetime(year=2021, month=1, day=1)
-
-    decoy.when(
-        await mock_run_orchestrator_store.create(
-            run_id=run_id,
-            labware_offsets=[],
-            protocol=None,
-            deck_configuration=[],
-            run_time_param_values=None,
-            run_time_param_paths=None,
-            notify_publishers=mock_notify_publishers,
-        )
-    ).then_return(engine_state_summary)
-
-    decoy.when(mock_run_orchestrator_store.get_run_time_parameters()).then_return([])
-
-    decoy.when(
-        mock_run_store.insert(
-            run_id=run_id,
-            protocol_id=None,
-            created_at=created_at,
-        )
-    ).then_return(run_resource)
-
-    decoy.when(mock_run_orchestrator_store.get_run_time_parameters()).then_return([])
-
-    result = await subject.create(
-        run_id=run_id,
-        created_at=created_at,
-        labware_offsets=[],
-        protocol=None,
-        deck_configuration=[],
-        run_time_param_values=None,
-        run_time_param_paths=None,
-        notify_publishers=mock_notify_publishers,
-    )
-
-    assert result == Run(
-        id=run_resource.run_id,
-        protocolId=run_resource.protocol_id,
-        createdAt=run_resource.created_at,
-        current=True,
-        actions=run_resource.actions,
-        status=engine_state_summary.status,
-        errors=engine_state_summary.errors,
-        hasEverEnteredErrorRecovery=engine_state_summary.hasEverEnteredErrorRecovery,
-        labware=engine_state_summary.labware,
-        labwareOffsets=engine_state_summary.labwareOffsets,
-        pipettes=engine_state_summary.pipettes,
-        modules=engine_state_summary.modules,
-        liquids=engine_state_summary.liquids,
-    )
-    decoy.verify(mock_run_store.insert_csv_rtp(run_id=run_id, run_time_parameters=[]))
-
-
-async def test_create_with_options(
-    decoy: Decoy,
-    mock_run_orchestrator_store: RunOrchestratorStore,
-    mock_run_store: RunStore,
-    subject: RunDataManager,
-    engine_state_summary: StateSummary,
-    run_resource: RunResource,
-) -> None:
-    """It should handle creation with a protocol, labware offsets and parameters."""
-    run_id = "hello world"
-    created_at = datetime(year=2021, month=1, day=1)
-
     protocol = ProtocolResource(
-        protocol_id="protocol-id",
+        protocol_id=sentinel.protocol_id,
         created_at=datetime(year=2022, month=2, day=2),
         source=None,  # type: ignore[arg-type]
         protocol_key=None,
         protocol_kind=ProtocolKind.STANDARD,
     )
 
-    labware_offset = pe_types.LabwareOffsetCreate(
-        definitionUri="namespace/load_name/version",
-        location=pe_types.LabwareOffsetLocation(slotName=DeckSlotName.SLOT_5),
-        vector=pe_types.LabwareOffsetVector(x=1, y=2, z=3),
-    )
-
     decoy.when(
         await mock_run_orchestrator_store.create(
             run_id=run_id,
-            labware_offsets=[labware_offset],
+            labware_offsets=sentinel.labware_offsets,
+            initial_error_recovery_policy=sentinel.initial_error_recovery_policy,
             protocol=protocol,
-            deck_configuration=[],
-            run_time_param_values={"foo": "bar"},
-            run_time_param_paths={"xyzzy": Path("zork")},
+            deck_configuration=sentinel.deck_configuration,
+            run_time_param_values=sentinel.run_time_param_values,
+            run_time_param_paths=sentinel.run_time_param_paths,
             notify_publishers=mock_notify_publishers,
         )
     ).then_return(engine_state_summary)
 
+    decoy.when(mock_run_orchestrator_store.get_run_time_parameters()).then_return([])
+
     decoy.when(
         mock_run_store.insert(
             run_id=run_id,
-            protocol_id="protocol-id",
+            protocol_id=protocol.protocol_id,
             created_at=created_at,
         )
     ).then_return(run_resource)
@@ -276,21 +229,30 @@ async def test_create_with_options(
     bool_parameter = BooleanParameter(
         displayName="foo", variableName="bar", default=True, value=False
     )
-
     file_parameter = CSVParameter(displayName="my_file", variableName="file-id")
-
     decoy.when(mock_run_orchestrator_store.get_run_time_parameters()).then_return(
         [bool_parameter, file_parameter]
     )
 
+    expected_initial_error_recovery_rules: list[ErrorRecoveryRule] = []
+    decoy.when(mock_error_recovery_setting_store.get_is_enabled()).then_return(
+        sentinel.error_recovery_enabled
+    )
+    decoy.when(
+        error_recovery_mapping.create_error_recovery_policy_from_rules(
+            rules=expected_initial_error_recovery_rules,
+            enabled=sentinel.error_recovery_enabled,
+        )
+    ).then_return(sentinel.initial_error_recovery_policy)
+
     result = await subject.create(
         run_id=run_id,
         created_at=created_at,
-        labware_offsets=[labware_offset],
+        labware_offsets=sentinel.labware_offsets,
         protocol=protocol,
-        deck_configuration=[],
-        run_time_param_values={"foo": "bar"},
-        run_time_param_paths={"xyzzy": Path("zork")},
+        deck_configuration=sentinel.deck_configuration,
+        run_time_param_values=sentinel.run_time_param_values,
+        run_time_param_paths=sentinel.run_time_param_paths,
         notify_publishers=mock_notify_publishers,
     )
 
@@ -321,12 +283,24 @@ async def test_create_engine_error(
     decoy: Decoy,
     mock_run_orchestrator_store: RunOrchestratorStore,
     mock_run_store: RunStore,
+    mock_error_recovery_setting_store: ErrorRecoverySettingStore,
     subject: RunDataManager,
 ) -> None:
     """It should not create a resource if engine creation fails."""
     run_id = "hello world"
     created_at = datetime(year=2021, month=1, day=1)
 
+    expected_initial_error_recovery_rules: list[ErrorRecoveryRule] = []
+    decoy.when(mock_error_recovery_setting_store.get_is_enabled()).then_return(
+        sentinel.error_recovery_enabled
+    )
+    decoy.when(
+        error_recovery_mapping.create_error_recovery_policy_from_rules(
+            rules=expected_initial_error_recovery_rules,
+            enabled=sentinel.error_recovery_enabled,
+        )
+    ).then_return(sentinel.initial_error_recovery_policy)
+
     decoy.when(
         await mock_run_orchestrator_store.create(
             run_id,
@@ -336,6 +310,7 @@ async def test_create_engine_error(
             run_time_param_values=None,
             run_time_param_paths=None,
             notify_publishers=mock_notify_publishers,
+            initial_error_recovery_policy=matchers.Anything(),
         )
     ).then_raise(RunConflictError("oh no"))
 
@@ -783,6 +758,7 @@ async def test_create_archives_existing(
     run_command: commands.Command,
     mock_run_orchestrator_store: RunOrchestratorStore,
     mock_run_store: RunStore,
+    mock_error_recovery_setting_store: ErrorRecoverySettingStore,
     subject: RunDataManager,
 ) -> None:
     """It should persist the previously current run when a new run is created."""
@@ -798,11 +774,23 @@ async def test_create_archives_existing(
         )
     )
 
+    expected_initial_error_recovery_rules: list[ErrorRecoveryRule] = []
+    decoy.when(mock_error_recovery_setting_store.get_is_enabled()).then_return(
+        sentinel.error_recovery_enabled
+    )
+    decoy.when(
+        error_recovery_mapping.create_error_recovery_policy_from_rules(
+            rules=expected_initial_error_recovery_rules,
+            enabled=sentinel.error_recovery_enabled,
+        )
+    ).then_return(sentinel.initial_error_recovery_policy)
+
     decoy.when(
         await mock_run_orchestrator_store.create(
             run_id=run_id_new,
             labware_offsets=[],
             protocol=None,
+            initial_error_recovery_policy=sentinel.initial_error_recovery_policy,
             deck_configuration=[],
             run_time_param_values=None,
             run_time_param_paths=None,
@@ -1126,35 +1114,36 @@ async def test_create_policies_raises_run_not_current(
         "not-current-run-id"
     )
     with pytest.raises(RunNotCurrentError):
-        subject.set_policies(
-            run_id="run-id", policies=decoy.mock(cls=List[ErrorRecoveryRule])
+        subject.set_error_recovery_rules(
+            run_id="run-id", rules=decoy.mock(cls=List[ErrorRecoveryRule])
         )
 
 
 async def test_create_policies_translates_and_calls_orchestrator(
     decoy: Decoy,
-    monkeypatch: pytest.MonkeyPatch,
     mock_run_orchestrator_store: RunOrchestratorStore,
+    mock_error_recovery_setting_store: ErrorRecoverySettingStore,
     subject: RunDataManager,
 ) -> None:
     """Should translate rules into policy and call orchestrator."""
-    monkeypatch.setattr(
-        error_recovery_mapping,
-        "create_error_recovery_policy_from_rules",
-        decoy.mock(
-            func=decoy.mock(
-                func=error_recovery_mapping.create_error_recovery_policy_from_rules
-            )
-        ),
+    decoy.when(mock_error_recovery_setting_store.get_is_enabled()).then_return(
+        sentinel.is_enabled
     )
-    input_rules = decoy.mock(cls=List[ErrorRecoveryRule])
-    expected_output = decoy.mock(cls=ErrorRecoveryPolicy)
     decoy.when(
-        error_recovery_mapping.create_error_recovery_policy_from_rules(input_rules)
-    ).then_return(expected_output)
-    decoy.when(mock_run_orchestrator_store.current_run_id).then_return("run-id")
-    subject.set_policies(run_id="run-id", policies=input_rules)
-    decoy.verify(mock_run_orchestrator_store.set_error_recovery_policy(expected_output))
+        error_recovery_mapping.create_error_recovery_policy_from_rules(
+            rules=sentinel.input_rules,
+            enabled=sentinel.is_enabled,
+        )
+    ).then_return(sentinel.expected_output)
+    decoy.when(mock_run_orchestrator_store.current_run_id).then_return(
+        sentinel.current_run_id
+    )
+    subject.set_error_recovery_rules(
+        run_id=sentinel.current_run_id, rules=sentinel.input_rules
+    )
+    decoy.verify(
+        mock_run_orchestrator_store.set_error_recovery_policy(sentinel.expected_output)
+    )
 
 
 def test_get_nozzle_map_current_run(
diff --git a/robot-server/tests/runs/test_engine_store.py b/robot-server/tests/runs/test_run_orchestrator_store.py
similarity index 94%
rename from robot-server/tests/runs/test_engine_store.py
rename to robot-server/tests/runs/test_run_orchestrator_store.py
index 46f25f3edb4..e34c1340359 100644
--- a/robot-server/tests/runs/test_engine_store.py
+++ b/robot-server/tests/runs/test_run_orchestrator_store.py
@@ -6,6 +6,7 @@
 from opentrons_shared_data import get_shared_data_root
 from opentrons_shared_data.robot.types import RobotType
 
+from opentrons.protocol_engine.error_recovery_policy import never_recover
 from opentrons.protocol_engine.errors.exceptions import EStopActivatedError
 from opentrons.types import DeckSlotName
 from opentrons.hardware_control import HardwareControlAPI, API
@@ -58,6 +59,7 @@ async def test_create_engine(decoy: Decoy, subject: RunOrchestratorStore) -> Non
     result = await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         protocol=None,
         deck_configuration=[],
         notify_publishers=mock_notify_publishers,
@@ -85,6 +87,7 @@ async def test_create_engine_uses_robot_type(
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -106,6 +109,7 @@ async def test_create_engine_with_labware_offsets(
     result = await subject.create(
         run_id="run-id",
         labware_offsets=[labware_offset],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -129,6 +133,7 @@ async def test_archives_state_if_engine_already_exists(
     await subject.create(
         run_id="run-id-1",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -138,6 +143,7 @@ async def test_archives_state_if_engine_already_exists(
         await subject.create(
             run_id="run-id-2",
             labware_offsets=[],
+            initial_error_recovery_policy=never_recover,
             deck_configuration=[],
             protocol=None,
             notify_publishers=mock_notify_publishers,
@@ -151,6 +157,7 @@ async def test_clear_engine(subject: RunOrchestratorStore) -> None:
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -172,6 +179,7 @@ async def test_clear_engine_not_stopped_or_idle(
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -187,6 +195,7 @@ async def test_clear_idle_engine(subject: RunOrchestratorStore) -> None:
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -238,6 +247,7 @@ async def test_get_default_orchestrator_current_unstarted(
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -252,6 +262,7 @@ async def test_get_default_orchestrator_conflict(subject: RunOrchestratorStore)
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,
@@ -269,6 +280,7 @@ async def test_get_default_orchestrator_run_stopped(
     await subject.create(
         run_id="run-id",
         labware_offsets=[],
+        initial_error_recovery_policy=never_recover,
         deck_configuration=[],
         protocol=None,
         notify_publishers=mock_notify_publishers,