Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: unify k8s+vm upgrade pattern #83

Closed
wants to merge 12 commits into from
223 changes: 113 additions & 110 deletions lib/charms/data_platform_libs/v0/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 8
LIBPATCH = 9

PYDEPS = ["pydantic>=1.10,<2"]

Expand Down Expand Up @@ -652,18 +652,13 @@ class UpgradeGrantedEvent(EventBase):
"""Used to tell units that they can process an upgrade."""


# NOTE(review): appears in the removed side of this diff — previously emitted
# per-unit on k8s after `set_unit_completed` to run post-upgrade operations;
# presumably dropped in favour of the unified `_update_stack` flow. Confirm
# against the full PR before relying on this event.
class UpgradeFinishedEvent(EventBase):
    """Used to tell units that they finished the upgrade."""


class UpgradeEvents(CharmEvents):
    """Upgrade events.

    This class defines the events that the lib can emit.
    """

    # Tells a unit that it has been granted permission to run its upgrade steps.
    upgrade_granted = EventSource(UpgradeGrantedEvent)
upgrade_finished = EventSource(UpgradeFinishedEvent)


# --- EVENT HANDLER ---
Expand Down Expand Up @@ -699,16 +694,14 @@ def __init__(
)
self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm)
self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted)
self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished)

# actions
self.framework.observe(
getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action
)
if self.substrate == "k8s":
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)

@property
def peer_relation(self) -> Optional[Relation]:
Expand Down Expand Up @@ -791,6 +784,33 @@ def unit_states(self) -> list:

return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units]

@property
def resume_strategy(self) -> Optional[str]:
    """The upgrade resume-strategy requested during the pre-upgrade-check action.

    Returns:
        "safe" or "auto", or None when the upgrade peer relation is absent
    """
    relation = self.peer_relation
    if not relation:
        return None

    # k8s deployments always pause after the first upgraded unit
    if self.substrate == "k8s":
        return "safe"

    # on vm, honour whatever the action stored in app relation data
    stored = relation.data[self.charm.app].get("resume-strategy", "auto")
    return stored
Comment on lines +797 to +800
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can allow users to also select the auto resume strategy on k8s.


@property
def first_unit(self) -> bool:
    """Flag to check if the current top-of-stack unit is the first to upgrade.

    Returns:
        True if the top-of-stack unit is the first to upgrade. Otherwise False
    """
    relation = self.peer_relation
    stack = self.upgrade_stack
    if not relation or not stack:
        return True

    # nothing has been popped yet when the stack still covers every peer unit
    return len(stack) == len(relation.units)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following change is needed as the upgrade stack has not changed when the resume upgrade action is called on k8s.

Suggested change
return len(self.upgrade_stack) == len(self.peer_relation.units)
return len(self.upgrade_stack) == len(self.peer_relation.units) + 1

However, it makes the upgrade process not respect the resume strategy (it no longer waits for the call to the resume-upgrade action), so I think we need different comparisons for those two cases.


@property
def cluster_state(self) -> Optional[str]:
"""Current upgrade state for cluster units.
Expand Down Expand Up @@ -905,17 +925,15 @@ def set_unit_completed(self) -> None:
# now leader pulls a fresh stack from newly updated relation data
if self.charm.unit.is_leader():
self._upgrade_stack = None
# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)

self.charm.unit.status = MaintenanceStatus("upgrade completed")
self.peer_relation.data[self.charm.unit].update({"state": "completed"})

# Emit upgrade_finished event to run unit's post upgrade operations.
if self.substrate == "k8s":
logger.debug(
f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..."
)
getattr(self.on, "upgrade_finished").emit()

def _on_upgrade_created(self, event: RelationCreatedEvent) -> None:
"""Handler for `upgrade-relation-created` events."""
if not self.peer_relation:
Expand Down Expand Up @@ -977,6 +995,12 @@ def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None:
event.fail(message="Unknown error found.")
return

# always 'safe' on k8s, default to 'auto' on vm
resume_strategy = (
"safe" if self.substrate == "k8s" else event.params.get("resume-strategy", "auto")
)
self.peer_relation.data[self.charm.app].update({"resume-strategy": resume_strategy})

logger.info("Setting upgrade-stack to relation data...")
self.upgrade_stack = built_upgrade_stack

Expand All @@ -985,31 +1009,21 @@ def _on_resume_upgrade_action(self, event: ActionEvent) -> None:

Continue the upgrade by setting the partition to the next unit.
"""
if not self.peer_relation:
event.fail(message="Could not find upgrade relation.")
return
# FIXME: use switch-case here when 3.10

if not self.charm.unit.is_leader():
event.fail(message="Action must be ran on the Juju leader.")
return

if not self.upgrade_stack:
event.fail(message="Nothing to resume, upgrade stack unset.")
return
checks = [
(self.peer_relation, "Could not find upgrade relation."),
(self.charm.unit.is_leader(), "Action must be ran on the Juju leader."),
(self.upgrade_stack, "Nothing to resume, upgrade-stack unset."),
(self.first_unit, "Upgrade can be resumed only once after juju refresh is called."),
]

# Check whether this is being run after juju refresh was called
# (the size of the upgrade stack should match the number of total
# unit minus one).
if len(self.upgrade_stack) != len(self.peer_relation.units):
event.fail(message="Upgrade can be resumed only once after juju refresh is called.")
return
for check in checks:
if not check[0]:
event.fail(message=check[1])
return

try:
next_partition = self.upgrade_stack[-1]
self._set_rolling_update_partition(partition=next_partition)
event.set_results({"message": f"Upgrade will resume on unit {next_partition}"})
except KubernetesClientError:
event.fail(message="Cannot set rolling update partition.")
self._update_stack()

def _upgrade_supported_check(self) -> None:
"""Checks if previous versions can be upgraded to new versions.
Expand Down Expand Up @@ -1056,32 +1070,24 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.")
return

# for VM, run version checks on leader only
# for K8s, run version checks on first unit only
if (self.charm.unit.is_leader() and self.substrate == "vm") or (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on VM does it really matters where the upgrade_supported_check is done, since every unit will have a new charm code.
I think we could have same approach for both, i.e. check on the first

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're probably right now that I think about it. For VM, the unit that does the check is the last unit to mark itself as 'ready'. I suppose it could be the first!

self.first_unit and self.substrate == "k8s"
):
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return

if self.substrate == "vm":
# for VM run version checks on leader only
if self.charm.unit.is_leader():
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
self.charm.unit.status = WaitingStatus("other units upgrading first...")
self.peer_relation.data[self.charm.unit].update({"state": "ready"})

else:
# for k8s run version checks only on highest ordinal unit
if (
self.charm.unit.name
== f"{self.charm.app.name}/{self.charm.app.planned_units() -1}"
):
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
# On K8s an unit that receives the upgrade-charm event is upgrading
self.charm.unit.status = MaintenanceStatus("upgrading unit")
if self.substrate == "k8s":
self.charm.unit.status = MaintenanceStatus("upgrading unit...")
self.peer_relation.data[self.charm.unit].update({"state": "upgrading"})

def on_upgrade_changed(self, event: EventBase) -> None:
Expand Down Expand Up @@ -1109,7 +1115,13 @@ def on_upgrade_changed(self, event: EventBase) -> None:
if self.charm.unit.is_leader():
logger.debug("Persisting new dependencies to upgrade relation data...")
self.peer_relation.data[self.charm.app].update(
{"dependencies": json.dumps(self.dependency_model.dict())}
{
"dependencies": json.dumps(
self.dependency_model.dict()
), # persisting new deps
"resume-strategy": "", # resetting resume-strategy for future upgrades
"upgrade-stack": "", # resetting upgrade-stack for future upgrades
}
)
return

Expand All @@ -1130,20 +1142,21 @@ def on_upgrade_changed(self, event: EventBase) -> None:
top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}")
top_state = self.peer_relation.data[top_unit].get("state")

# if top of stack is completed, leader pops it
if self.charm.unit.is_leader() and top_state == "completed":
logger.debug(f"{top_unit} has finished upgrading, updating stack...")
# persist the new popped stack to relation data
if top_state == "completed":
# don't modify stack and trigger new unit upgrades if 'safe' resume-strategy
# only pause on first unit
if self.first_unit and self.resume_strategy == "safe":
logger.info(
f"Please evaluate unit {self.charm.app}/{self.upgrade_stack[-1]}, and run `resume-upgrade` action to continue upgrade"
)
self.charm.unit.status = BlockedStatus("awaiting resume-upgrade action...")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good addition!

return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.on_upgrade_changed(event)
self._update_stack()

# if unit top of stack and all units ready (i.e stack), emit granted event
# only runs on VM, as K8s units will be 'idle' due to not setting 'ready' in upgrade-charm
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L1081? Or do you mean that not-yet-upgraded k8s units will be 'idle' because the upgrade-charm event has not been fired for them yet?

Suggested change
# only runs on VM, as K8s units will be 'idle' due to not setting 'ready' in upgrade-charm
# only runs on VM, as K8s units will be 'upgrading' due to not setting 'ready' in upgrade-charm

if (
self.charm.unit == top_unit
and top_state in ["ready", "upgrading"]
Expand Down Expand Up @@ -1179,42 +1192,6 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:

raise NotImplementedError

def _on_upgrade_finished(self, _) -> None:
    """Handler for `upgrade-finished` events.

    K8s-only: advances the StatefulSet rolling-update partition to the unit
    below this one once this unit's upgrade has finished. No-op on vm or
    without the peer relation.
    """
    if self.substrate == "vm" or not self.peer_relation:
        return

    # Emit the upgrade relation changed event in the leader to update the upgrade_stack.
    if self.charm.unit.is_leader():
        self.charm.on[self.relation_name].relation_changed.emit(
            self.model.get_relation(self.relation_name)
        )

    # This hook shouldn't run for the last unit (the first that is upgraded). For that unit it
    # should be done through an action after the upgrade success on that unit is double-checked.
    # NOTE(review): `peer_relation.units` excludes this unit itself, so the highest
    # ordinal equals len(units) — presumably assumes ordinals are contiguous; confirm.
    unit_number = int(self.charm.unit.name.split("/")[1])
    if unit_number == len(self.peer_relation.units):
        logger.info(
            f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade"
        )
        return

    # Also, the hook shouldn't run for the first unit (the last that is upgraded).
    if unit_number == 0:
        logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete")
        return

    try:
        # Use the unit number instead of the upgrade stack to avoid race conditions
        # (i.e. the leader updates the upgrade stack after this hook runs).
        next_partition = unit_number - 1
        logger.debug(f"Set rolling update partition to unit {next_partition}")
        self._set_rolling_update_partition(partition=next_partition)
    except KubernetesClientError:
        # partition patch failed: mark this unit failed and tell the operator how to roll back
        logger.exception("Cannot set rolling update partition")
        self.set_unit_failed()
        self.log_rollback_instructions()

def _set_rolling_update_partition(self, partition: int) -> None:
"""Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`.

Expand Down Expand Up @@ -1243,3 +1220,29 @@ def _set_rolling_update_partition(self, partition: int) -> None:
return

raise NotImplementedError

def _update_stack(self) -> None:
"""Updates the upgrade-stack on the leader unit."""
if not self.peer_relation or not self.charm.unit.is_leader() or not self.upgrade_stack:
return

# don't set partition on last unit, as upgrade_stack attr should be empty
if self.substrate == "k8s" and self.upgrade_stack:
try:
# stack is already popped, new partition is current top of stack
logger.debug(f"Setting rolling update partition to unit {self.upgrade_stack[-1]}")
self._set_rolling_update_partition(self.upgrade_stack[-1])
except KubernetesClientError:
self.set_unit_failed()
return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ target-version="py310"
src = ["src", "tests", "lib"]

[tool.ruff.mccabe]
max-complexity = 12
max-complexity = 13

[tool.pyright]
include = ["src", "lib"]
Expand Down
Loading