refactor(protocol-engine): Keep track of failed commands' error recov…

…ery types (#14795)
Opentrons · Apr 4, 2024 · f95af4f · f95af4f
1 parent eecf117
commit f95af4f
Show file tree

Hide file tree

Showing 4 changed files with 142 additions and 6 deletions.
diff --git a/api/src/opentrons/protocol_engine/state/commands.py b/api/src/opentrons/protocol_engine/state/commands.py
@@ -4,7 +4,7 @@
 import enum
 from dataclasses import dataclass
 from datetime import datetime
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union
 from typing_extensions import assert_never
 
 from opentrons_shared_data.errors import EnumeratedError, ErrorCodes, PythonException
@@ -164,6 +164,19 @@ class CommandState:
     # that we're doing error recovery. See if we can implement robot-server pagination
     # atop simpler concepts, like "the last command that ran" or "the next command that
     # would run."
+    #
+    # TODO(mm, 2024-04-03): Can this be replaced by
+    # CommandHistory.get_terminal_command() now?
+
+    command_error_recovery_types: Dict[str, ErrorRecoveryType]
+    """For each command that failed (indexed by ID), what its recovery type was.
+
+    This only includes commands that actually failed, not the ones that we mark as
+    failed but that are effectively "cancelled" because a command before them failed.
+
+    This separate attribute is a stopgap until error recovery concepts are a bit more
+    stable. Eventually, we might want this info to be stored directly on each command.
+    """
 
     finish_error: Optional[ErrorOccurrence]
     """The error that happened during the post-run finish steps (homing & dropping tips), if any."""
@@ -199,6 +212,7 @@ def __init__(
             run_error=None,
             finish_error=None,
             failed_command=None,
+            command_error_recovery_types={},
             run_completed_at=None,
             run_started_at=None,
             latest_command_hash=None,
@@ -253,11 +267,11 @@ def handle_action(self, action: Action) -> None:  # noqa: C901
                 error=action.error,
             )
 
-            # TODO(mc, 2022-06-06): add new "cancelled" status or similar
             self._update_to_failed(
                 command_id=action.command_id,
                 failed_at=action.failed_at,
                 error_occurrence=error_occurrence,
+                error_recovery_type=action.type,
                 notes=action.notes,
             )
 
@@ -271,10 +285,12 @@ def handle_action(self, action: Action) -> None:  # noqa: C901
                     self._state.command_history.get_setup_queue_ids()
                 )
                 for command_id in other_command_ids_to_fail:
+                    # TODO(mc, 2022-06-06): add new "cancelled" status or similar
                     self._update_to_failed(
                         command_id=command_id,
                         failed_at=action.failed_at,
                         error_occurrence=None,
+                        error_recovery_type=None,
                         notes=None,
                     )
                 self._state.command_history.clear_setup_queue()
@@ -289,10 +305,12 @@ def handle_action(self, action: Action) -> None:  # noqa: C901
                         self._state.command_history.get_queue_ids()
                     )
                     for command_id in other_command_ids_to_fail:
+                        # TODO(mc, 2022-06-06): add new "cancelled" status or similar
                         self._update_to_failed(
                             command_id=command_id,
                             failed_at=action.failed_at,
                             error_occurrence=None,
+                            error_recovery_type=None,
                             notes=None,
                         )
                     self._state.command_history.clear_queue()
@@ -376,6 +394,7 @@ def _update_to_failed(
         command_id: str,
         failed_at: datetime,
         error_occurrence: Optional[ErrorOccurrence],
+        error_recovery_type: Optional[ErrorRecoveryType],
         notes: Optional[List[CommandNote]],
     ) -> None:
         prev_entry = self._state.command_history.get(command_id)
@@ -391,6 +410,8 @@ def _update_to_failed(
             }
         )
         self._state.command_history.set_command_failed(failed_command)
+        if error_recovery_type is not None:
+            self._state.command_error_recovery_types[command_id] = error_recovery_type
 
     @staticmethod
     def _map_run_exception_to_error_occurrence(
@@ -709,6 +730,13 @@ def raise_fatal_command_error(self) -> None:
                 message=failed_command.command.error.detail,
             )
 
+    def get_error_recovery_type(self, command_id: str) -> ErrorRecoveryType:
+        """Return the error recovery type with which the given command failed.
+
+        The command ID is assumed to point to a failed command.
+        """
+        return self.state.command_error_recovery_types[command_id]
+
     def get_is_stopped(self) -> bool:
         """Get whether an engine stop has completed."""
         return self._state.run_completed_at is not None

diff --git a/api/tests/opentrons/protocol_engine/state/test_command_state.py b/api/tests/opentrons/protocol_engine/state/test_command_state.py
@@ -0,0 +1,79 @@
+"""Tests for the CommandStore+CommandState+CommandView trifecta.
+
+The trifecta is tested here as a single unit, treating CommandState as a private
+implementation detail.
+"""
+
+from datetime import datetime
+
+from opentrons_shared_data.errors import PythonException
+
+from opentrons.protocol_engine import actions, commands
+from opentrons.protocol_engine.error_recovery_policy import ErrorRecoveryType
+from opentrons.protocol_engine.state.commands import CommandStore, CommandView
+from opentrons.protocol_engine.state.config import Config
+from opentrons.protocol_engine.types import DeckType
+
+
+def _make_config() -> Config:
+    return Config(
+        # Choice of robot and deck type is arbitrary.
+        robot_type="OT-2 Standard",
+        deck_type=DeckType.OT2_STANDARD,
+    )
+
+
+def test_error_recovery_type_tracking() -> None:
+    """It should keep track of each failed command's error recovery type."""
+    subject = CommandStore(config=_make_config(), is_door_open=False)
+
+    subject.handle_action(
+        actions.QueueCommandAction(
+            command_id="c1",
+            created_at=datetime.now(),
+            request=commands.CommentCreate(
+                params=commands.CommentParams(message="yeehaw"),
+            ),
+            request_hash=None,
+        )
+    )
+    subject.handle_action(
+        actions.QueueCommandAction(
+            command_id="c2",
+            created_at=datetime.now(),
+            request=commands.CommentCreate(
+                params=commands.CommentParams(message="yeehaw"),
+            ),
+            request_hash=None,
+        )
+    )
+    subject.handle_action(
+        actions.RunCommandAction(command_id="c1", started_at=datetime.now())
+    )
+    subject.handle_action(
+        actions.FailCommandAction(
+            command_id="c1",
+            error_id="c1-error",
+            failed_at=datetime.now(),
+            error=PythonException(RuntimeError("new sheriff in town")),
+            notes=[],
+            type=ErrorRecoveryType.WAIT_FOR_RECOVERY,
+        )
+    )
+    subject.handle_action(
+        actions.RunCommandAction(command_id="c2", started_at=datetime.now())
+    )
+    subject.handle_action(
+        actions.FailCommandAction(
+            command_id="c2",
+            error_id="c2-error",
+            failed_at=datetime.now(),
+            error=PythonException(RuntimeError("new sheriff in town")),
+            notes=[],
+            type=ErrorRecoveryType.FAIL_RUN,
+        )
+    )
+
+    view = CommandView(subject.state)
+    assert view.get_error_recovery_type("c1") == ErrorRecoveryType.WAIT_FOR_RECOVERY
+    assert view.get_error_recovery_type("c2") == ErrorRecoveryType.FAIL_RUN
diff --git a/...otocol_engine/state/test_command_store.py → ...ol_engine/state/test_command_store_old.py b/...otocol_engine/state/test_command_store.py → ...ol_engine/state/test_command_store_old.py
@@ -1,4 +1,10 @@
-"""Tests for the command lifecycle state."""
+"""Tests for CommandStore.
+
+DEPRECATED: Testing CommandStore independently of CommandView is no longer helpful.
+Add new tests to test_command_state.py, where they can be tested together.
+"""
+
+
 import pytest
 from datetime import datetime
 from typing import NamedTuple, Type
@@ -79,6 +85,7 @@ def test_initial_state(
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         latest_command_hash=None,
         stopped_by_estop=False,
     )
@@ -826,6 +833,7 @@ def test_command_store_handles_pause_action(pause_source: PauseSource) -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         latest_command_hash=None,
         stopped_by_estop=False,
     )
@@ -850,6 +858,7 @@ def test_command_store_handles_play_action(pause_source: PauseSource) -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=datetime(year=2021, month=1, day=1),
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -880,6 +889,7 @@ def test_command_store_handles_finish_action() -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=datetime(year=2021, month=1, day=1),
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -925,6 +935,7 @@ def test_command_store_handles_stop_action(from_estop: bool) -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=datetime(year=2021, month=1, day=1),
         latest_command_hash=None,
         stopped_by_estop=from_estop,
@@ -954,6 +965,7 @@ def test_command_store_cannot_restart_after_should_stop() -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=None,
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -1085,6 +1097,7 @@ def test_command_store_wraps_unknown_errors() -> None:
         ),
         run_started_at=None,
         failed_command=None,
+        command_error_recovery_types={},
         latest_command_hash=None,
         stopped_by_estop=False,
     )
@@ -1145,6 +1158,7 @@ def __init__(self, message: str) -> None:
             errorCode=ErrorCodes.PIPETTE_NOT_PRESENT.value.code,
         ),
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=None,
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -1176,6 +1190,7 @@ def test_command_store_ignores_stop_after_graceful_finish() -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=datetime(year=2021, month=1, day=1),
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -1207,6 +1222,7 @@ def test_command_store_ignores_finish_after_non_graceful_stop() -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=datetime(year=2021, month=1, day=1),
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -1219,6 +1235,8 @@ def test_command_store_ignores_finish_after_non_graceful_stop() -> None:
 
 def test_command_store_handles_command_failed() -> None:
     """It should store an error and mark the command if it fails."""
+    error_recovery_type = ErrorRecoveryType.FAIL_RUN
+
     expected_error_occurrence = errors.ErrorOccurrence(
         id="error-id",
         errorType="ProtocolEngineError",
@@ -1281,7 +1299,7 @@ def test_command_store_handles_command_failed() -> None:
                     source="source",
                 )
             ],
-            type=ErrorRecoveryType.FAIL_RUN,
+            type=error_recovery_type,
         )
     )
 
@@ -1299,6 +1317,7 @@ def test_command_store_handles_command_failed() -> None:
         run_error=None,
         finish_error=None,
         failed_command=failed_command_entry,
+        command_error_recovery_types={expected_failed_command.id: error_recovery_type},
         run_started_at=None,
         latest_command_hash=None,
         stopped_by_estop=False,
@@ -1327,6 +1346,7 @@ def test_handles_hardware_stopped() -> None:
         run_error=None,
         finish_error=None,
         failed_command=None,
+        command_error_recovery_types={},
         run_started_at=None,
         latest_command_hash=None,
         stopped_by_estop=False,

diff --git a/...rotocol_engine/state/test_command_view.py → ...col_engine/state/test_command_view_old.py b/...rotocol_engine/state/test_command_view.py → ...col_engine/state/test_command_view_old.py
@@ -1,8 +1,14 @@
-"""Labware state store tests."""
+"""Tests for CommandView.
+
+DEPRECATED: Testing CommandView independently of CommandStore is no longer helpful.
+Add new tests to test_command_state.py, where they can be tested together.
+"""
+
+
 import pytest
 from contextlib import nullcontext as does_not_raise
 from datetime import datetime
-from typing import List, NamedTuple, Optional, Sequence, Type, Union
+from typing import Dict, List, NamedTuple, Optional, Sequence, Type, Union
 
 from opentrons.protocol_engine import EngineStatus, commands as cmd, errors
 from opentrons.protocol_engine.actions import (
@@ -14,6 +20,7 @@
 )
 from opentrons.protocol_engine.actions.actions import ResumeFromRecoveryAction
 
+from opentrons.protocol_engine.error_recovery_policy import ErrorRecoveryType
 from opentrons.protocol_engine.state.commands import (
     CommandState,
     CommandView,
@@ -50,6 +57,7 @@ def get_command_view(
     queued_setup_command_ids: Sequence[str] = (),
     run_error: Optional[errors.ErrorOccurrence] = None,
     failed_command: Optional[CommandEntry] = None,
+    command_error_recovery_types: Optional[Dict[str, ErrorRecoveryType]] = None,
     finish_error: Optional[errors.ErrorOccurrence] = None,
     commands: Sequence[cmd.Command] = (),
     latest_command_hash: Optional[str] = None,
@@ -81,6 +89,7 @@ def get_command_view(
         run_error=run_error,
         finish_error=finish_error,
         failed_command=failed_command,
+        command_error_recovery_types=command_error_recovery_types or {},
         run_started_at=run_started_at,
         latest_command_hash=latest_command_hash,
         stopped_by_estop=False,