feat(robot-server): Wire up global error recovery setting (#16416)

## Overview Closes EXEC-719. ## Changelog * Connect the new `PATCH /errorRecovery/settings` HTTP API so it actually affects run behavior. This closes EXEC-719. * One quirk to my implementation: This will not affect preexisting runs, *unless* you *also* do `PUT /runs/{id}/errorRecoveryPolicy`. In effect, `PUT /runs/{id}/errorRecoveryPolicy` refreshes the run's snapshot of `/errorRecovery/settings`. This is probably confusing; it would be better if `PATCH /errorRecovery/settings` immediately affected the preexisting run, or if `PUT /runs/{id}/errorRecoveryPolicy` did not have that refresh effect. I don't think this matters in practice because this button will be disabled or inaccessible in our UIs while there's an active run. Also fix a couple of nearby, unticketed bugs: * If a client gave us an empty error recovery policy, we were no-opping. Instead, we want to overwrite the current policy with the new empty one. * The logic in `error_recovery_mapping.py` was unintentionally enabling error recovery on OT-2 robots if the policy was specific enough. We want it to always be disabled on OT-2s.
Opentrons · Oct 7, 2024 · 56cd361 · 56cd361
1 parent 9ac5105
commit 56cd361
Show file tree

Hide file tree

Showing 14 changed files with 283 additions and 203 deletions.
diff --git a/api/src/opentrons/protocol_engine/error_recovery_policy.py b/api/src/opentrons/protocol_engine/error_recovery_policy.py
@@ -40,6 +40,7 @@ class ErrorRecoveryPolicy(Protocol):
     and return an appropriate `ErrorRecoveryType`.
 
     Args:
+        config: The config of the calling `ProtocolEngine`.
         failed_command: The command that failed, in its final `status=="failed"` state.
         defined_error_data: If the command failed with a defined error, details about
             that error. If the command failed with an undefined error, `None`.

diff --git a/robot-server/robot_server/error_recovery/settings/router.py b/robot-server/robot_server/error_recovery/settings/router.py
@@ -61,10 +61,6 @@ async def _get_current_response(
     store: ErrorRecoverySettingStore,
 ) -> PydanticResponse[SimpleBody[ResponseData]]:
     is_enabled = store.get_is_enabled()
-    if is_enabled is None:
-        # todo(mm, 2024-09-30): This defaulting will probably need to move down a layer
-        # when we connect this setting to `POST /runs`.
-        is_enabled = True
     return await PydanticResponse.create(
         SimpleBody.construct(data=ResponseData.construct(enabled=is_enabled))
     )
diff --git a/robot-server/robot_server/error_recovery/settings/store.py b/robot-server/robot_server/error_recovery/settings/store.py
@@ -10,24 +10,25 @@
 from robot_server.persistence.tables import boolean_setting_table, BooleanSettingKey
 
 
+_ERROR_RECOVERY_ENABLED_DEFAULT = True
+
+
 class ErrorRecoverySettingStore:
     """Persistently stores settings related to error recovery."""
 
     def __init__(self, sql_engine: sqlalchemy.engine.Engine) -> None:
         self._sql_engine = sql_engine
 
-    def get_is_enabled(self) -> bool | None:
-        """Get the value of the "error recovery enabled" setting.
-
-        `None` is the default, i.e. it's never been explicitly set one way or the other.
-        """
+    def get_is_enabled(self) -> bool:
+        """Get the value of the "error recovery enabled" setting."""
         with self._sql_engine.begin() as transaction:
-            return transaction.execute(
+            result: bool | None = transaction.execute(
                 sqlalchemy.select(boolean_setting_table.c.value).where(
                     boolean_setting_table.c.key
                     == BooleanSettingKey.ENABLE_ERROR_RECOVERY
                 )
             ).scalar_one_or_none()
+        return result if result is not None else _ERROR_RECOVERY_ENABLED_DEFAULT
 
     def set_is_enabled(self, is_enabled: bool | None) -> None:
         """Set the value of the "error recovery enabled" setting.

diff --git a/robot-server/robot_server/runs/dependencies.py b/robot-server/robot_server/runs/dependencies.py
@@ -2,6 +2,10 @@
 from typing import Annotated
 
 from fastapi import Depends, status
+from robot_server.error_recovery.settings.store import (
+    ErrorRecoverySettingStore,
+    get_error_recovery_setting_store,
+)
 from robot_server.protocols.dependencies import get_protocol_store
 from robot_server.protocols.protocol_models import ProtocolKind
 from robot_server.protocols.protocol_store import ProtocolStore
@@ -156,12 +160,16 @@ async def get_run_data_manager(
     ],
     run_store: Annotated[RunStore, Depends(get_run_store)],
     runs_publisher: Annotated[RunsPublisher, Depends(get_runs_publisher)],
+    error_recovery_setting_store: Annotated[
+        ErrorRecoverySettingStore, Depends(get_error_recovery_setting_store)
+    ],
 ) -> RunDataManager:
     """Get a run data manager to keep track of current/historical run data."""
     return RunDataManager(
-        task_runner=task_runner,
         run_orchestrator_store=run_orchestrator_store,
         run_store=run_store,
+        error_recovery_setting_store=error_recovery_setting_store,
+        task_runner=task_runner,
         runs_publisher=runs_publisher,
     )
 

diff --git a/robot-server/robot_server/runs/error_recovery_mapping.py b/robot-server/robot_server/runs/error_recovery_mapping.py
@@ -1,5 +1,4 @@
 """Functions used for managing error recovery policy."""
-from typing import Optional
 from opentrons.protocol_engine.state.config import Config
 from robot_server.runs.error_recovery_models import ErrorRecoveryRule, ReactionIfMatch
 from opentrons.protocol_engine.commands.command_unions import (
@@ -14,52 +13,96 @@
 
 def create_error_recovery_policy_from_rules(
     rules: list[ErrorRecoveryRule],
+    enabled: bool,
 ) -> ErrorRecoveryPolicy:
-    """Given a list of error recovery rules return an error recovery policy."""
+    """Map a robot-server error recovery policy to an opentrons.protocol_engine one.
 
-    def _policy(
+    In its HTTP API, robot-server expresses error recovery policies as Pydantic models.
+    But opentrons.protocol_engine is more general, expressing them as Python callables.
+
+    Args:
+        rules: The rules in the robot-server error recovery policy.
+        enabled: Whether error recovery should be enabled at all.
+            If `False`, `rules` is ignored.
+
+    Returns:
+        An error recovery policy in `opentrons.protocol_engine` terms.
+    """
+
+    def mapped_policy(
         config: Config,
         failed_command: Command,
-        defined_error_data: Optional[CommandDefinedErrorData],
+        defined_error_data: CommandDefinedErrorData | None,
     ) -> ErrorRecoveryType:
-        for rule in rules:
-            command_type_matches = (
-                failed_command.commandType == rule.matchCriteria.command.commandType
-            )
-            error_type_matches = (
-                defined_error_data is not None
-                and defined_error_data.public.errorType
-                == rule.matchCriteria.command.error.errorType
+        first_matching_rule = next(
+            (
+                rule
+                for rule in rules
+                if _rule_matches_error(rule, failed_command, defined_error_data)
+            ),
+            None,
+        )
+        robot_is_flex = config.robot_type == "OT-3 Standard"
+        error_is_defined = defined_error_data is not None
+
+        if not enabled:
+            return ErrorRecoveryType.FAIL_RUN
+        elif not robot_is_flex:
+            # Although error recovery can theoretically work on OT-2s, we haven't tested
+            # it, and it's generally scarier because the OT-2 has much less hardware
+            # feedback.
+            return ErrorRecoveryType.FAIL_RUN
+        elif first_matching_rule is not None:
+            # The input policy explicitly deals this error, so do what it says.
+            return _map_error_recovery_type(first_matching_rule.ifMatch)
+        else:
+            # The input policy doesn't explicitly deal with this error, so the decision
+            # is our own.
+            #
+            # We try to WAIT_FOR_RECOVERY whenever we can, for two reasons:
+            #
+            # 1. It matches the frontend's current expectations.
+            #    For example, the frontend expects us to WAIT_FOR_RECOVERY on
+            #    overpressure errors, but it does not send us an error recovery policy
+            #    that explicitly says that; it relies on this default.
+            # 2. Philosophically, we always want to give the operator a shot at
+            #    recovery, even if we don't know the details of the problem and can't
+            #    guarantee good robot behavior if they keep using it.
+            #
+            # We currently FAIL_RUN for undefined errors, with the thinking that they
+            # are especially likely to have messed something up in Protocol Engine's
+            # internal state, and that they are especially likely to cause confusing
+            # behavior. But we might want to change that--see point (2) above.
+            return (
+                ErrorRecoveryType.WAIT_FOR_RECOVERY
+                if error_is_defined
+                else ErrorRecoveryType.FAIL_RUN
             )
 
-            if command_type_matches and error_type_matches:
-                if rule.ifMatch == ReactionIfMatch.IGNORE_AND_CONTINUE:
-                    return ErrorRecoveryType.IGNORE_AND_CONTINUE
-                elif rule.ifMatch == ReactionIfMatch.FAIL_RUN:
-                    return ErrorRecoveryType.FAIL_RUN
-                elif rule.ifMatch == ReactionIfMatch.WAIT_FOR_RECOVERY:
-                    return ErrorRecoveryType.WAIT_FOR_RECOVERY
+    return mapped_policy
 
-        return default_error_recovery_policy(config, failed_command, defined_error_data)
 
-    return _policy
+def _rule_matches_error(
+    rule: ErrorRecoveryRule,
+    failed_command: Command,
+    defined_error_data: CommandDefinedErrorData | None,
+) -> bool:
+    command_type_matches = (
+        failed_command.commandType == rule.matchCriteria.command.commandType
+    )
+    error_type_matches = (
+        defined_error_data is not None
+        and defined_error_data.public.errorType
+        == rule.matchCriteria.command.error.errorType
+    )
+    return command_type_matches and error_type_matches
 
 
-def default_error_recovery_policy(
-    config: Config,
-    failed_command: Command,
-    defined_error_data: Optional[CommandDefinedErrorData],
-) -> ErrorRecoveryType:
-    """The `ErrorRecoveryPolicy` to use when none has been set on a run."""
-    # Although error recovery can theoretically work on OT-2s, we haven't tested it,
-    # and it's generally scarier because the OT-2 has much less hardware feedback.
-    robot_is_flex = config.robot_type == "OT-3 Standard"
-    # If the error is defined, we're taking that to mean that we should
-    # WAIT_FOR_RECOVERY. This is not necessarily the right long-term logic--we might
-    # want to FAIL_RUN on certain defined errors and WAIT_FOR_RECOVERY on certain
-    # undefined errors--but this is convenient for now.
-    error_is_defined = defined_error_data is not None
-    if robot_is_flex and error_is_defined:
-        return ErrorRecoveryType.WAIT_FOR_RECOVERY
-    else:
-        return ErrorRecoveryType.FAIL_RUN
+def _map_error_recovery_type(reaction_if_match: ReactionIfMatch) -> ErrorRecoveryType:
+    match reaction_if_match:
+        case ReactionIfMatch.IGNORE_AND_CONTINUE:
+            return ErrorRecoveryType.IGNORE_AND_CONTINUE
+        case ReactionIfMatch.FAIL_RUN:
+            return ErrorRecoveryType.FAIL_RUN
+        case ReactionIfMatch.WAIT_FOR_RECOVERY:
+            return ErrorRecoveryType.WAIT_FOR_RECOVERY
diff --git a/robot-server/robot_server/runs/error_recovery_models.py b/robot-server/robot_server/runs/error_recovery_models.py
@@ -4,13 +4,31 @@
 
 from pydantic import BaseModel, Field
 
+# There's a lot of nested classes here.
+# Here's an example of a JSON document that this code models:
+# {
+#   "policyRules": [
+#     {
+#       "matchCriteria": {
+#         "command": {
+#           "commandType": "foo",
+#           "error": {
+#             "errorType": "bar"
+#           }
+#         }
+#       },
+#       "ifMatch": "ignoreAndContinue"
+#     }
+#   ]
+# }
+
 
 class ReactionIfMatch(Enum):
-    """The type of the error recovery setting.
+    """How to handle a given error.
 
-    * `"ignoreAndContinue"`: Ignore this error and future errors of the same type.
-    * `"failRun"`: Errors of this type should fail the run.
-    * `"waitForRecovery"`: Instances of this error should initiate a recover operation.
+    * `"ignoreAndContinue"`: Ignore this error and continue with the next command.
+    * `"failRun"`: Fail the run.
+    * `"waitForRecovery"`: Enter interactive error recovery mode.
 
     """
 
@@ -19,20 +37,6 @@ class ReactionIfMatch(Enum):
     WAIT_FOR_RECOVERY = "waitForRecovery"
 
 
-# There's a lot of nested classes here. This is the JSON schema this code models.
-# "ErrorRecoveryRule": {
-#     "matchCriteria": {
-#         "command": {
-#             "commandType": "foo",
-#             "error": {
-#                 "errorType": "bar"
-#             }
-#         }
-#     },
-#     "ifMatch": "baz"
-# }
-
-
 class ErrorMatcher(BaseModel):
     """The error type that this rule applies to."""
 
@@ -67,7 +71,7 @@ class ErrorRecoveryRule(BaseModel):
     )
     ifMatch: ReactionIfMatch = Field(
         ...,
-        description="The specific recovery setting that will be in use if the type parameters match.",
+        description="How to handle errors matched by this rule.",
     )
 
 
@@ -76,6 +80,9 @@ class ErrorRecoveryPolicy(BaseModel):
 
     policyRules: List[ErrorRecoveryRule] = Field(
         ...,
-        description="A list or error recovery rules to apply for a run's recovery management."
-        "The rules are evaluated first-to-last. The first exact match will dectate recovery management.",
+        description=(
+            "A list of error recovery rules to apply for a run's recovery management."
+            " The rules are evaluated first-to-last."
+            " The first exact match will dictate recovery management."
+        ),
     )
diff --git a/robot-server/robot_server/runs/router/base_router.py b/robot-server/robot_server/runs/router/base_router.py
@@ -426,11 +426,13 @@ async def update_run(
 @PydanticResponse.wrap_route(
     base_router.put,
     path="/runs/{runId}/errorRecoveryPolicy",
-    summary="Set run policies",
+    summary="Set a run's error recovery policy",
     description=dedent(
         """
         Update how to handle different kinds of command failures.
-        The following rules will persist during the run.
+
+        For this to have any effect, error recovery must also be enabled globally.
+        See `PATCH /errorRecovery/settings`.
         """
     ),
     status_code=status.HTTP_201_CREATED,
@@ -451,12 +453,11 @@ async def put_error_recovery_policy(
         request_body:  Request body with run policies data.
         run_data_manager: Current and historical run data management.
     """
-    policies = request_body.data.policyRules
-    if policies:
-        try:
-            run_data_manager.set_policies(run_id=runId, policies=policies)
-        except RunNotCurrentError as e:
-            raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e
+    rules = request_body.data.policyRules
+    try:
+        run_data_manager.set_error_recovery_rules(run_id=runId, rules=rules)
+    except RunNotCurrentError as e:
+        raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e
 
     return await PydanticResponse.create(
         content=SimpleEmptyBody.construct(),