Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: unify k8s+vm upgrade pattern #83

Closed
wants to merge 12 commits into from
223 changes: 113 additions & 110 deletions lib/charms/data_platform_libs/v0/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 8
LIBPATCH = 9

PYDEPS = ["pydantic>=1.10,<2"]

Expand Down Expand Up @@ -652,18 +652,13 @@ class UpgradeGrantedEvent(EventBase):
"""Used to tell units that they can process an upgrade."""


# NOTE(review): appears in the removed side of this diff — previously emitted
# per-unit on k8s after `set_unit_completed` to run post-upgrade operations;
# presumably dropped in favour of the unified `_update_stack` flow. Confirm
# against the full PR before relying on this event.
class UpgradeFinishedEvent(EventBase):
    """Used to tell units that they finished the upgrade."""


class UpgradeEvents(CharmEvents):
    """Upgrade events.

    This class defines the events that the lib can emit.
    """

    # Tells a unit that it has been granted permission to run its upgrade steps.
    upgrade_granted = EventSource(UpgradeGrantedEvent)
upgrade_finished = EventSource(UpgradeFinishedEvent)


# --- EVENT HANDLER ---
Expand Down Expand Up @@ -699,16 +694,14 @@ def __init__(
)
self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm)
self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted)
self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished)

# actions
self.framework.observe(
getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action
)
if self.substrate == "k8s":
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)

@property
def peer_relation(self) -> Optional[Relation]:
Expand Down Expand Up @@ -791,6 +784,33 @@ def unit_states(self) -> list:

return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units]

@property
def resume_strategy(self) -> Optional[str]:
    """The upgrade resume-strategy requested during the pre-upgrade-check action.

    Returns:
        "safe" or "auto", or None when the upgrade peer relation is absent
    """
    relation = self.peer_relation
    if not relation:
        return None

    # k8s deployments always pause after the first upgraded unit
    if self.substrate == "k8s":
        return "safe"

    # on vm, honour whatever the action stored in app relation data
    stored = relation.data[self.charm.app].get("resume-strategy", "auto")
    return stored
Comment on lines +797 to +800
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can allow users to also select the auto resume strategy on k8s.


@property
def first_unit(self) -> bool:
    """Flag to check if the current top-of-stack unit is the first to upgrade.

    Returns:
        True if the top-of-stack unit is the first to upgrade. Otherwise False
    """
    relation = self.peer_relation
    stack = self.upgrade_stack
    if not relation or not stack:
        return True

    # nothing has been popped yet when the stack still covers every peer unit
    return len(stack) == len(relation.units)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following change is needed as the upgrade stack has not changed when the resume upgrade action is called on k8s.

Suggested change
return len(self.upgrade_stack) == len(self.peer_relation.units)
return len(self.upgrade_stack) == len(self.peer_relation.units) + 1

However, it makes the upgrade process not respect the resume strategy (it no longer waits for the call to the resume-upgrade action), so I think we need different comparisons for those two cases.


@property
def cluster_state(self) -> Optional[str]:
"""Current upgrade state for cluster units.
Expand Down Expand Up @@ -905,17 +925,15 @@ def set_unit_completed(self) -> None:
# now leader pulls a fresh stack from newly updated relation data
if self.charm.unit.is_leader():
self._upgrade_stack = None
# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)

self.charm.unit.status = MaintenanceStatus("upgrade completed")
self.peer_relation.data[self.charm.unit].update({"state": "completed"})

# Emit upgrade_finished event to run unit's post upgrade operations.
if self.substrate == "k8s":
logger.debug(
f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..."
)
getattr(self.on, "upgrade_finished").emit()

def _on_upgrade_created(self, event: RelationCreatedEvent) -> None:
"""Handler for `upgrade-relation-created` events."""
if not self.peer_relation:
Expand Down Expand Up @@ -977,6 +995,12 @@ def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None:
event.fail(message="Unknown error found.")
return

# always 'safe' on k8s, default to 'auto' on vm
resume_strategy = (
"safe" if self.substrate == "k8s" else event.params.get("resume-strategy", "auto")
)
self.peer_relation.data[self.charm.app].update({"resume-strategy": resume_strategy})

logger.info("Setting upgrade-stack to relation data...")
self.upgrade_stack = built_upgrade_stack

Expand All @@ -985,31 +1009,21 @@ def _on_resume_upgrade_action(self, event: ActionEvent) -> None:

Continue the upgrade by setting the partition to the next unit.
"""
if not self.peer_relation:
event.fail(message="Could not find upgrade relation.")
return
# FIXME: use switch-case here when 3.10

if not self.charm.unit.is_leader():
event.fail(message="Action must be ran on the Juju leader.")
return

if not self.upgrade_stack:
event.fail(message="Nothing to resume, upgrade stack unset.")
return
checks = [
(self.peer_relation, "Could not find upgrade relation."),
(self.charm.unit.is_leader(), "Action must be ran on the Juju leader."),
(self.upgrade_stack, "Nothing to resume, upgrade-stack unset."),
(self.first_unit, "Upgrade can be resumed only once after juju refresh is called."),
]

# Check whether this is being run after juju refresh was called
# (the size of the upgrade stack should match the number of total
# unit minus one).
if len(self.upgrade_stack) != len(self.peer_relation.units):
event.fail(message="Upgrade can be resumed only once after juju refresh is called.")
return
for check in checks:
if not check[0]:
event.fail(message=check[1])
return

try:
next_partition = self.upgrade_stack[-1]
self._set_rolling_update_partition(partition=next_partition)
event.set_results({"message": f"Upgrade will resume on unit {next_partition}"})
except KubernetesClientError:
event.fail(message="Cannot set rolling update partition.")
self._update_stack()

def _upgrade_supported_check(self) -> None:
"""Checks if previous versions can be upgraded to new versions.
Expand Down Expand Up @@ -1056,32 +1070,24 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.")
return

# for VM, run version checks on leader only
# for K8s, run version checks on first unit only
if (self.charm.unit.is_leader() and self.substrate == "vm") or (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on VM does it really matters where the upgrade_supported_check is done, since every unit will have a new charm code.
I think we could have same approach for both, i.e. check on the first

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're probably right now that I think about it. For VM, the unit that does the check is the last unit to mark itself as 'ready'. I suppose it could be the first!

self.first_unit and self.substrate == "k8s"
):
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return

if self.substrate == "vm":
# for VM run version checks on leader only
if self.charm.unit.is_leader():
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
self.charm.unit.status = WaitingStatus("other units upgrading first...")
self.peer_relation.data[self.charm.unit].update({"state": "ready"})

else:
# for k8s run version checks only on highest ordinal unit
if (
self.charm.unit.name
== f"{self.charm.app.name}/{self.charm.app.planned_units() -1}"
):
try:
self._upgrade_supported_check()
except VersionError as e: # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
# On K8s an unit that receives the upgrade-charm event is upgrading
self.charm.unit.status = MaintenanceStatus("upgrading unit")
if self.substrate == "k8s":
self.charm.unit.status = MaintenanceStatus("upgrading unit...")
self.peer_relation.data[self.charm.unit].update({"state": "upgrading"})

def on_upgrade_changed(self, event: EventBase) -> None:
Expand Down Expand Up @@ -1109,7 +1115,13 @@ def on_upgrade_changed(self, event: EventBase) -> None:
if self.charm.unit.is_leader():
logger.debug("Persisting new dependencies to upgrade relation data...")
self.peer_relation.data[self.charm.app].update(
{"dependencies": json.dumps(self.dependency_model.dict())}
{
"dependencies": json.dumps(
self.dependency_model.dict()
), # persisting new deps
"resume-strategy": "", # resetting resume-strategy for future upgrades
"upgrade-stack": "", # resetting upgrade-stack for future upgrades
}
)
return

Expand All @@ -1130,20 +1142,21 @@ def on_upgrade_changed(self, event: EventBase) -> None:
top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}")
top_state = self.peer_relation.data[top_unit].get("state")

# if top of stack is completed, leader pops it
if self.charm.unit.is_leader() and top_state == "completed":
logger.debug(f"{top_unit} has finished upgrading, updating stack...")
# persist the new popped stack to relation data
if top_state == "completed":
# don't modify stack and trigger new unit upgrades if 'safe' resume-strategy
# only pause on first unit
if self.first_unit and self.resume_strategy == "safe":
logger.info(
f"Please evaluate unit {self.charm.app}/{self.upgrade_stack[-1]}, and run `resume-upgrade` action to continue upgrade"
)
self.charm.unit.status = BlockedStatus("awaiting resume-upgrade action...")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good addition!

return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.on_upgrade_changed(event)
self._update_stack()

# if unit top of stack and all units ready (i.e stack), emit granted event
# only runs on VM, as K8s units will be 'idle' due to not setting 'ready' in upgrade-charm
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L1081? Or do you mean that not-yet-upgraded k8s units will be 'idle' because the upgrade-charm event has not been fired for them yet?

Suggested change
# only runs on VM, as K8s units will be 'idle' due to not setting 'ready' in upgrade-charm
# only runs on VM, as K8s units will be 'upgrading' due to not setting 'ready' in upgrade-charm

if (
self.charm.unit == top_unit
and top_state in ["ready", "upgrading"]
Expand Down Expand Up @@ -1179,42 +1192,6 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:

raise NotImplementedError

def _on_upgrade_finished(self, _) -> None:
    """Handler for `upgrade-finished` events.

    K8s-only: advances the StatefulSet rolling-update partition to the unit
    below this one once this unit's upgrade has finished. No-op on vm or
    without the peer relation.
    """
    if self.substrate == "vm" or not self.peer_relation:
        return

    # Emit the upgrade relation changed event in the leader to update the upgrade_stack.
    if self.charm.unit.is_leader():
        self.charm.on[self.relation_name].relation_changed.emit(
            self.model.get_relation(self.relation_name)
        )

    # This hook shouldn't run for the last unit (the first that is upgraded). For that unit it
    # should be done through an action after the upgrade success on that unit is double-checked.
    # NOTE(review): `peer_relation.units` excludes this unit itself, so the highest
    # ordinal equals len(units) — presumably assumes ordinals are contiguous; confirm.
    unit_number = int(self.charm.unit.name.split("/")[1])
    if unit_number == len(self.peer_relation.units):
        logger.info(
            f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade"
        )
        return

    # Also, the hook shouldn't run for the first unit (the last that is upgraded).
    if unit_number == 0:
        logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete")
        return

    try:
        # Use the unit number instead of the upgrade stack to avoid race conditions
        # (i.e. the leader updates the upgrade stack after this hook runs).
        next_partition = unit_number - 1
        logger.debug(f"Set rolling update partition to unit {next_partition}")
        self._set_rolling_update_partition(partition=next_partition)
    except KubernetesClientError:
        # partition patch failed: mark this unit failed and tell the operator how to roll back
        logger.exception("Cannot set rolling update partition")
        self.set_unit_failed()
        self.log_rollback_instructions()

def _set_rolling_update_partition(self, partition: int) -> None:
"""Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`.

Expand Down Expand Up @@ -1243,3 +1220,29 @@ def _set_rolling_update_partition(self, partition: int) -> None:
return

raise NotImplementedError

def _update_stack(self) -> None:
"""Updates the upgrade-stack on the leader unit."""
if not self.peer_relation or not self.charm.unit.is_leader() or not self.upgrade_stack:
return

# don't set partition on last unit, as upgrade_stack attr should be empty
if self.substrate == "k8s" and self.upgrade_stack:
try:
# stack is already popped, new partition is current top of stack
logger.debug(f"Setting rolling update partition to unit {self.upgrade_stack[-1]}")
self._set_rolling_update_partition(self.upgrade_stack[-1])
except KubernetesClientError:
self.set_unit_failed()
return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ target-version="py310"
src = ["src", "tests", "lib"]

[tool.ruff.mccabe]
max-complexity = 12
max-complexity = 13

[tool.pyright]
include = ["src", "lib"]
Expand Down
Loading