Skip to content

Commit

Permalink
feat(robot-server): Wire up global error recovery setting (#16416)
Browse files Browse the repository at this point in the history
## Overview

Closes EXEC-719.

## Changelog

* Connect the new `PATCH /errorRecovery/settings` HTTP API so it
actually affects run behavior. This closes EXEC-719.
    * One quirk to my implementation:
    
This will not affect preexisting runs, *unless* you *also* do `PUT
/runs/{id}/errorRecoveryPolicy`. In effect, `PUT
/runs/{id}/errorRecoveryPolicy` refreshes the run's snapshot of
`/errorRecovery/settings`. This is probably confusing; it would be
better if `PATCH /errorRecovery/settings` immediately affected the
preexisting run, or if `PUT /runs/{id}/errorRecoveryPolicy` did not have
that refresh effect.
    
I don't think this matters in practice because this button will be
disabled or inaccessible in our UIs while there's an active run.

Also fix a couple of nearby, unticketed bugs:

* If a client gave us an empty error recovery policy, we were no-opping.
Instead, we want to overwrite the current policy with the new empty one.
* The logic in `error_recovery_mapping.py` was unintentionally enabling
error recovery on OT-2 robots if the policy was specific enough. We want
it to always be disabled on OT-2s.
  • Loading branch information
SyntaxColoring authored Oct 7, 2024
1 parent 9ac5105 commit 56cd361
Show file tree
Hide file tree
Showing 14 changed files with 283 additions and 203 deletions.
1 change: 1 addition & 0 deletions api/src/opentrons/protocol_engine/error_recovery_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class ErrorRecoveryPolicy(Protocol):
and return an appropriate `ErrorRecoveryType`.
Args:
config: The config of the calling `ProtocolEngine`.
failed_command: The command that failed, in its final `status=="failed"` state.
defined_error_data: If the command failed with a defined error, details about
that error. If the command failed with an undefined error, `None`.
Expand Down
4 changes: 0 additions & 4 deletions robot-server/robot_server/error_recovery/settings/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,6 @@ async def _get_current_response(
store: ErrorRecoverySettingStore,
) -> PydanticResponse[SimpleBody[ResponseData]]:
is_enabled = store.get_is_enabled()
if is_enabled is None:
# todo(mm, 2024-09-30): This defaulting will probably need to move down a layer
# when we connect this setting to `POST /runs`.
is_enabled = True
return await PydanticResponse.create(
SimpleBody.construct(data=ResponseData.construct(enabled=is_enabled))
)
13 changes: 7 additions & 6 deletions robot-server/robot_server/error_recovery/settings/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,25 @@
from robot_server.persistence.tables import boolean_setting_table, BooleanSettingKey


_ERROR_RECOVERY_ENABLED_DEFAULT = True


class ErrorRecoverySettingStore:
"""Persistently stores settings related to error recovery."""

def __init__(self, sql_engine: sqlalchemy.engine.Engine) -> None:
self._sql_engine = sql_engine

def get_is_enabled(self) -> bool | None:
"""Get the value of the "error recovery enabled" setting.
`None` is the default, i.e. it's never been explicitly set one way or the other.
"""
def get_is_enabled(self) -> bool:
"""Get the value of the "error recovery enabled" setting."""
with self._sql_engine.begin() as transaction:
return transaction.execute(
result: bool | None = transaction.execute(
sqlalchemy.select(boolean_setting_table.c.value).where(
boolean_setting_table.c.key
== BooleanSettingKey.ENABLE_ERROR_RECOVERY
)
).scalar_one_or_none()
return result if result is not None else _ERROR_RECOVERY_ENABLED_DEFAULT

def set_is_enabled(self, is_enabled: bool | None) -> None:
"""Set the value of the "error recovery enabled" setting.
Expand Down
10 changes: 9 additions & 1 deletion robot-server/robot_server/runs/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
from typing import Annotated

from fastapi import Depends, status
from robot_server.error_recovery.settings.store import (
ErrorRecoverySettingStore,
get_error_recovery_setting_store,
)
from robot_server.protocols.dependencies import get_protocol_store
from robot_server.protocols.protocol_models import ProtocolKind
from robot_server.protocols.protocol_store import ProtocolStore
Expand Down Expand Up @@ -156,12 +160,16 @@ async def get_run_data_manager(
],
run_store: Annotated[RunStore, Depends(get_run_store)],
runs_publisher: Annotated[RunsPublisher, Depends(get_runs_publisher)],
error_recovery_setting_store: Annotated[
ErrorRecoverySettingStore, Depends(get_error_recovery_setting_store)
],
) -> RunDataManager:
"""Get a run data manager to keep track of current/historical run data."""
return RunDataManager(
task_runner=task_runner,
run_orchestrator_store=run_orchestrator_store,
run_store=run_store,
error_recovery_setting_store=error_recovery_setting_store,
task_runner=task_runner,
runs_publisher=runs_publisher,
)

Expand Down
121 changes: 82 additions & 39 deletions robot-server/robot_server/runs/error_recovery_mapping.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Functions used for managing error recovery policy."""
from typing import Optional
from opentrons.protocol_engine.state.config import Config
from robot_server.runs.error_recovery_models import ErrorRecoveryRule, ReactionIfMatch
from opentrons.protocol_engine.commands.command_unions import (
Expand All @@ -14,52 +13,96 @@

def create_error_recovery_policy_from_rules(
rules: list[ErrorRecoveryRule],
enabled: bool,
) -> ErrorRecoveryPolicy:
"""Given a list of error recovery rules return an error recovery policy."""
"""Map a robot-server error recovery policy to an opentrons.protocol_engine one.
def _policy(
In its HTTP API, robot-server expresses error recovery policies as Pydantic models.
But opentrons.protocol_engine is more general, expressing them as Python callables.
Args:
rules: The rules in the robot-server error recovery policy.
enabled: Whether error recovery should be enabled at all.
If `False`, `rules` is ignored.
Returns:
An error recovery policy in `opentrons.protocol_engine` terms.
"""

def mapped_policy(
config: Config,
failed_command: Command,
defined_error_data: Optional[CommandDefinedErrorData],
defined_error_data: CommandDefinedErrorData | None,
) -> ErrorRecoveryType:
for rule in rules:
command_type_matches = (
failed_command.commandType == rule.matchCriteria.command.commandType
)
error_type_matches = (
defined_error_data is not None
and defined_error_data.public.errorType
== rule.matchCriteria.command.error.errorType
first_matching_rule = next(
(
rule
for rule in rules
if _rule_matches_error(rule, failed_command, defined_error_data)
),
None,
)
robot_is_flex = config.robot_type == "OT-3 Standard"
error_is_defined = defined_error_data is not None

if not enabled:
return ErrorRecoveryType.FAIL_RUN
elif not robot_is_flex:
# Although error recovery can theoretically work on OT-2s, we haven't tested
# it, and it's generally scarier because the OT-2 has much less hardware
# feedback.
return ErrorRecoveryType.FAIL_RUN
elif first_matching_rule is not None:
# The input policy explicitly deals this error, so do what it says.
return _map_error_recovery_type(first_matching_rule.ifMatch)
else:
# The input policy doesn't explicitly deal with this error, so the decision
# is our own.
#
# We try to WAIT_FOR_RECOVERY whenever we can, for two reasons:
#
# 1. It matches the frontend's current expectations.
# For example, the frontend expects us to WAIT_FOR_RECOVERY on
# overpressure errors, but it does not send us an error recovery policy
# that explicitly says that; it relies on this default.
# 2. Philosophically, we always want to give the operator a shot at
# recovery, even if we don't know the details of the problem and can't
# guarantee good robot behavior if they keep using it.
#
# We currently FAIL_RUN for undefined errors, with the thinking that they
# are especially likely to have messed something up in Protocol Engine's
# internal state, and that they are especially likely to cause confusing
# behavior. But we might want to change that--see point (2) above.
return (
ErrorRecoveryType.WAIT_FOR_RECOVERY
if error_is_defined
else ErrorRecoveryType.FAIL_RUN
)

if command_type_matches and error_type_matches:
if rule.ifMatch == ReactionIfMatch.IGNORE_AND_CONTINUE:
return ErrorRecoveryType.IGNORE_AND_CONTINUE
elif rule.ifMatch == ReactionIfMatch.FAIL_RUN:
return ErrorRecoveryType.FAIL_RUN
elif rule.ifMatch == ReactionIfMatch.WAIT_FOR_RECOVERY:
return ErrorRecoveryType.WAIT_FOR_RECOVERY
return mapped_policy

return default_error_recovery_policy(config, failed_command, defined_error_data)

return _policy
def _rule_matches_error(
rule: ErrorRecoveryRule,
failed_command: Command,
defined_error_data: CommandDefinedErrorData | None,
) -> bool:
command_type_matches = (
failed_command.commandType == rule.matchCriteria.command.commandType
)
error_type_matches = (
defined_error_data is not None
and defined_error_data.public.errorType
== rule.matchCriteria.command.error.errorType
)
return command_type_matches and error_type_matches


def default_error_recovery_policy(
config: Config,
failed_command: Command,
defined_error_data: Optional[CommandDefinedErrorData],
) -> ErrorRecoveryType:
"""The `ErrorRecoveryPolicy` to use when none has been set on a run."""
# Although error recovery can theoretically work on OT-2s, we haven't tested it,
# and it's generally scarier because the OT-2 has much less hardware feedback.
robot_is_flex = config.robot_type == "OT-3 Standard"
# If the error is defined, we're taking that to mean that we should
# WAIT_FOR_RECOVERY. This is not necessarily the right long-term logic--we might
# want to FAIL_RUN on certain defined errors and WAIT_FOR_RECOVERY on certain
# undefined errors--but this is convenient for now.
error_is_defined = defined_error_data is not None
if robot_is_flex and error_is_defined:
return ErrorRecoveryType.WAIT_FOR_RECOVERY
else:
return ErrorRecoveryType.FAIL_RUN
def _map_error_recovery_type(reaction_if_match: ReactionIfMatch) -> ErrorRecoveryType:
match reaction_if_match:
case ReactionIfMatch.IGNORE_AND_CONTINUE:
return ErrorRecoveryType.IGNORE_AND_CONTINUE
case ReactionIfMatch.FAIL_RUN:
return ErrorRecoveryType.FAIL_RUN
case ReactionIfMatch.WAIT_FOR_RECOVERY:
return ErrorRecoveryType.WAIT_FOR_RECOVERY
49 changes: 28 additions & 21 deletions robot-server/robot_server/runs/error_recovery_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,31 @@

from pydantic import BaseModel, Field

# There's a lot of nested classes here.
# Here's an example of a JSON document that this code models:
# {
# "policyRules": [
# {
# "matchCriteria": {
# "command": {
# "commandType": "foo",
# "error": {
# "errorType": "bar"
# }
# }
# },
# "ifMatch": "ignoreAndContinue"
# }
# ]
# }


class ReactionIfMatch(Enum):
"""The type of the error recovery setting.
"""How to handle a given error.
* `"ignoreAndContinue"`: Ignore this error and future errors of the same type.
* `"failRun"`: Errors of this type should fail the run.
* `"waitForRecovery"`: Instances of this error should initiate a recover operation.
* `"ignoreAndContinue"`: Ignore this error and continue with the next command.
* `"failRun"`: Fail the run.
* `"waitForRecovery"`: Enter interactive error recovery mode.
"""

Expand All @@ -19,20 +37,6 @@ class ReactionIfMatch(Enum):
WAIT_FOR_RECOVERY = "waitForRecovery"


# There's a lot of nested classes here. This is the JSON schema this code models.
# "ErrorRecoveryRule": {
# "matchCriteria": {
# "command": {
# "commandType": "foo",
# "error": {
# "errorType": "bar"
# }
# }
# },
# "ifMatch": "baz"
# }


class ErrorMatcher(BaseModel):
"""The error type that this rule applies to."""

Expand Down Expand Up @@ -67,7 +71,7 @@ class ErrorRecoveryRule(BaseModel):
)
ifMatch: ReactionIfMatch = Field(
...,
description="The specific recovery setting that will be in use if the type parameters match.",
description="How to handle errors matched by this rule.",
)


Expand All @@ -76,6 +80,9 @@ class ErrorRecoveryPolicy(BaseModel):

policyRules: List[ErrorRecoveryRule] = Field(
...,
description="A list or error recovery rules to apply for a run's recovery management."
"The rules are evaluated first-to-last. The first exact match will dectate recovery management.",
description=(
"A list of error recovery rules to apply for a run's recovery management."
" The rules are evaluated first-to-last."
" The first exact match will dictate recovery management."
),
)
17 changes: 9 additions & 8 deletions robot-server/robot_server/runs/router/base_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,11 +426,13 @@ async def update_run(
@PydanticResponse.wrap_route(
base_router.put,
path="/runs/{runId}/errorRecoveryPolicy",
summary="Set run policies",
summary="Set a run's error recovery policy",
description=dedent(
"""
Update how to handle different kinds of command failures.
The following rules will persist during the run.
For this to have any effect, error recovery must also be enabled globally.
See `PATCH /errorRecovery/settings`.
"""
),
status_code=status.HTTP_201_CREATED,
Expand All @@ -451,12 +453,11 @@ async def put_error_recovery_policy(
request_body: Request body with run policies data.
run_data_manager: Current and historical run data management.
"""
policies = request_body.data.policyRules
if policies:
try:
run_data_manager.set_policies(run_id=runId, policies=policies)
except RunNotCurrentError as e:
raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e
rules = request_body.data.policyRules
try:
run_data_manager.set_error_recovery_rules(run_id=runId, rules=rules)
except RunNotCurrentError as e:
raise RunStopped(detail=str(e)).as_error(status.HTTP_409_CONFLICT) from e

return await PydanticResponse.create(
content=SimpleEmptyBody.construct(),
Expand Down
Loading

0 comments on commit 56cd361

Please sign in to comment.