refactor: unify k8s+vm upgrade pattern #83
@@ -284,7 +284,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 8
LIBPATCH = 9

PYDEPS = ["pydantic>=1.10,<2"]

@@ -652,18 +652,13 @@ class UpgradeGrantedEvent(EventBase):
"""Used to tell units that they can process an upgrade."""


class UpgradeFinishedEvent(EventBase):
"""Used to tell units that they finished the upgrade."""


class UpgradeEvents(CharmEvents):
"""Upgrade events.

This class defines the events that the lib can emit.
"""

upgrade_granted = EventSource(UpgradeGrantedEvent)
upgrade_finished = EventSource(UpgradeFinishedEvent)


# --- EVENT HANDLER ---

@@ -699,16 +694,14 @@ def __init__(
)
self.framework.observe(self.charm.on.upgrade_charm, self._on_upgrade_charm)
self.framework.observe(getattr(self.on, "upgrade_granted"), self._on_upgrade_granted)
self.framework.observe(getattr(self.on, "upgrade_finished"), self._on_upgrade_finished)

# actions
self.framework.observe(
getattr(self.charm.on, "pre_upgrade_check_action"), self._on_pre_upgrade_check_action
)
if self.substrate == "k8s":
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)
self.framework.observe(
getattr(self.charm.on, "resume_upgrade_action"), self._on_resume_upgrade_action
)

@property
def peer_relation(self) -> Optional[Relation]:

@@ -791,6 +784,33 @@ def unit_states(self) -> list:

return [self.peer_relation.data[unit].get("state", "") for unit in self.app_units]

@property
def resume_strategy(self) -> Optional[str]:
"""Gets upgrade resume-strategy requested during pre-upgrade-check action.

Returns:
String of "auto" or "safe" resume-strategy
"""
if not self.peer_relation:
return None

if self.substrate == "k8s":
return "safe"

return self.peer_relation.data[self.charm.app].get("resume-strategy", "auto")

@property
def first_unit(self) -> bool:
"""Flag to check if the current top-of-stack unit is the first to upgrade.

Returns:
True if top-of-stack unit is the first to upgrade. Otherwise False
"""
if not self.peer_relation or not self.upgrade_stack:
return True

return len(self.upgrade_stack) == len(self.peer_relation.units)

Reviewer comment: The following change is needed, as the upgrade stack has not changed when the resume-upgrade action is called on k8s.

Suggested change

However, it makes the upgrade process not respect the resume strategy (making it not wait for the call to the `resume-upgrade` action).
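
A minimal, runnable sketch (stand-in booleans and strings only, not the charm's real attributes) of the interaction this comment raises: with the `first_unit` and `resume_strategy` definitions above, the guard added later in `on_upgrade_changed` pauses only for the first completed unit under the 'safe' strategy.

# Illustrative sketch only: mirrors the `first_unit and resume_strategy == "safe"`
# guard added in `on_upgrade_changed` further down this diff.
def should_pause(first_unit: bool, resume_strategy: str) -> bool:
    return first_unit and resume_strategy == "safe"


print(should_pause(True, "safe"))   # True  -> BlockedStatus, wait for `resume-upgrade`
print(should_pause(True, "auto"))   # False -> the stack keeps rolling automatically
print(should_pause(False, "safe"))  # False -> later units never pause again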

@property
def cluster_state(self) -> Optional[str]:
"""Current upgrade state for cluster units.

@@ -905,17 +925,15 @@ def set_unit_completed(self) -> None:
# now leader pulls a fresh stack from newly updated relation data
if self.charm.unit.is_leader():
self._upgrade_stack = None
# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)

self.charm.unit.status = MaintenanceStatus("upgrade completed")
self.peer_relation.data[self.charm.unit].update({"state": "completed"})

# Emit upgrade_finished event to run unit's post upgrade operations.
if self.substrate == "k8s":
logger.debug(
f"{self.charm.unit.name} has completed the upgrade, emitting `upgrade_finished` event..."
)
getattr(self.on, "upgrade_finished").emit()

def _on_upgrade_created(self, event: RelationCreatedEvent) -> None:
"""Handler for `upgrade-relation-created` events."""
if not self.peer_relation:

@@ -977,6 +995,12 @@ def _on_pre_upgrade_check_action(self, event: ActionEvent) -> None:
event.fail(message="Unknown error found.")
return

# always 'safe' on k8s, default to 'auto' on vm
resume_strategy = (
"safe" if self.substrate == "k8s" else event.params.get("resume-strategy", "auto")
)
self.peer_relation.data[self.charm.app].update({"resume-strategy": resume_strategy})

logger.info("Setting upgrade-stack to relation data...")
self.upgrade_stack = built_upgrade_stack

@@ -985,31 +1009,21 @@ def _on_resume_upgrade_action(self, event: ActionEvent) -> None:

Continue the upgrade by setting the partition to the next unit.
"""
if not self.peer_relation:
event.fail(message="Could not find upgrade relation.")
return
# FIXME: use switch-case here when 3.10

if not self.charm.unit.is_leader():
event.fail(message="Action must be ran on the Juju leader.")
return

if not self.upgrade_stack:
event.fail(message="Nothing to resume, upgrade stack unset.")
return
checks = [
(self.peer_relation, "Could not find upgrade relation."),
(self.charm.unit.is_leader(), "Action must be ran on the Juju leader."),
(self.upgrade_stack, "Nothing to resume, upgrade-stack unset."),
(self.first_unit, "Upgrade can be resumed only once after juju refresh is called."),
]

# Check whether this is being run after juju refresh was called
# (the size of the upgrade stack should match the number of total
# unit minus one).
if len(self.upgrade_stack) != len(self.peer_relation.units):
event.fail(message="Upgrade can be resumed only once after juju refresh is called.")
return
for check in checks:
if not check[0]:
event.fail(message=check[1])
return

try:
next_partition = self.upgrade_stack[-1]
self._set_rolling_update_partition(partition=next_partition)
event.set_results({"message": f"Upgrade will resume on unit {next_partition}"})
except KubernetesClientError:
event.fail(message="Cannot set rolling update partition.")
self._update_stack()

def _upgrade_supported_check(self) -> None:
"""Checks if previous versions can be upgraded to new versions.

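As a side note on the refactor above: the sequential guard clauses collapse into a single list of (condition, failure message) pairs that is scanned for the first failure. A self-contained sketch of that pattern, with illustrative stand-ins for the real charm attributes:

from typing import Any, Optional


def first_failure(checks: list[tuple[Any, str]]) -> Optional[str]:
    """Return the failure message of the first falsy check, or None if all pass."""
    for condition, message in checks:
        if not condition:
            return message
    return None


# Stand-in values for peer_relation, is_leader() and upgrade_stack.
print(first_failure([
    ({"state": "idle"}, "Could not find upgrade relation."),
    (True, "Action must be ran on the Juju leader."),
    ([2, 1, 0], "Nothing to resume, upgrade-stack unset."),
]))  # prints None -> every guard passed, the action proceeds
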
@@ -1056,32 +1070,24 @@ def _on_upgrade_charm(self, event: UpgradeCharmEvent) -> None:
logger.error("Cluster upgrade failed, ensure pre-upgrade checks are ran first.")
return

# for VM, run version checks on leader only
# for K8s, run version checks on first unit only
if (self.charm.unit.is_leader() and self.substrate == "vm") or (

Reviewer comment: On VM, does it really matter where the upgrade_supported_check is done, since every unit will have the new charm code?

Reply: You're probably right, now that I think about it. For VM, the unit that does the check is the last unit to mark itself as 'ready'. I suppose it could be the first!

(A runnable sketch of the unified check condition appears after this hunk.)

self.first_unit and self.substrate == "k8s"
):
try:
self._upgrade_supported_check()
except VersionError as e:  # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return

if self.substrate == "vm":
# for VM run version checks on leader only
if self.charm.unit.is_leader():
try:
self._upgrade_supported_check()
except VersionError as e:  # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
self.charm.unit.status = WaitingStatus("other units upgrading first...")
self.peer_relation.data[self.charm.unit].update({"state": "ready"})

else:
# for k8s run version checks only on highest ordinal unit
if (
self.charm.unit.name
== f"{self.charm.app.name}/{self.charm.app.planned_units() -1}"
):
try:
self._upgrade_supported_check()
except VersionError as e:  # not ready if not passed check
logger.error(e)
self.set_unit_failed()
return
# On K8s an unit that receives the upgrade-charm event is upgrading
self.charm.unit.status = MaintenanceStatus("upgrading unit")
if self.substrate == "k8s":
self.charm.unit.status = MaintenanceStatus("upgrading unit...")
self.peer_relation.data[self.charm.unit].update({"state": "upgrading"})

def on_upgrade_changed(self, event: EventBase) -> None:

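The runnable sketch referenced in the review conversation above, mirroring the unified version-check condition from this hunk: the check runs on the leader for VM and on the first-to-upgrade unit for k8s. Inputs are assumed stand-ins, not the charm's real properties.

def runs_version_check(substrate: str, is_leader: bool, first_unit: bool) -> bool:
    # Mirrors: (is_leader and substrate == "vm") or (first_unit and substrate == "k8s")
    return (is_leader and substrate == "vm") or (first_unit and substrate == "k8s")


print(runs_version_check("vm", True, False))   # True  -> VM leader runs the check
print(runs_version_check("vm", False, True))   # False -> other VM units skip it
print(runs_version_check("k8s", False, True))  # True  -> first k8s unit runs it
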
@@ -1109,7 +1115,13 @@ def on_upgrade_changed(self, event: EventBase) -> None:
if self.charm.unit.is_leader():
logger.debug("Persisting new dependencies to upgrade relation data...")
self.peer_relation.data[self.charm.app].update(
{"dependencies": json.dumps(self.dependency_model.dict())}
{
"dependencies": json.dumps(
self.dependency_model.dict()
),  # persisting new deps
"resume-strategy": "",  # resetting resume-strategy for future upgrades
"upgrade-stack": "",  # resetting upgrade-stack for future upgrades
}
)
return

@@ -1130,20 +1142,21 @@ def on_upgrade_changed(self, event: EventBase) -> None:
top_unit = self.charm.model.get_unit(f"{self.charm.app.name}/{top_unit_id}")
top_state = self.peer_relation.data[top_unit].get("state")

# if top of stack is completed, leader pops it
if self.charm.unit.is_leader() and top_state == "completed":
logger.debug(f"{top_unit} has finished upgrading, updating stack...")
# persist the new popped stack to relation data
if top_state == "completed":
# don't modify stack and trigger new unit upgrades if 'safe' resume-strategy
# only pause on first unit
if self.first_unit and self.resume_strategy == "safe":
logger.info(
f"Please evaluate unit {self.charm.app}/{self.upgrade_stack[-1]}, and run `resume-upgrade` action to continue upgrade"
)
self.charm.unit.status = BlockedStatus("awaiting resume-upgrade action...")

Reviewer comment: Good addition!

return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.on_upgrade_changed(event)
self._update_stack()

# if unit top of stack and all units ready (i.e stack), emit granted event
# only runs on VM, as K8s units will be 'idle' due to not setting 'ready' in upgrade-charm

Reviewer comment: L1081? Or do you mean something like:

Suggested change

if (
self.charm.unit == top_unit
and top_state in ["ready", "upgrading"]

@@ -1179,42 +1192,6 @@ def _on_upgrade_granted(self, event: UpgradeGrantedEvent) -> None:

raise NotImplementedError

def _on_upgrade_finished(self, _) -> None:
"""Handler for `upgrade-finished` events."""
if self.substrate == "vm" or not self.peer_relation:
return

# Emit the upgrade relation changed event in the leader to update the upgrade_stack.
if self.charm.unit.is_leader():
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)

# This hook shouldn't run for the last unit (the first that is upgraded). For that unit it
# should be done through an action after the upgrade success on that unit is double-checked.
unit_number = int(self.charm.unit.name.split("/")[1])
if unit_number == len(self.peer_relation.units):
logger.info(
f"{self.charm.unit.name} unit upgraded. Evaluate and run `resume-upgrade` action to continue upgrade"
)
return

# Also, the hook shouldn't run for the first unit (the last that is upgraded).
if unit_number == 0:
logger.info(f"{self.charm.unit.name} unit upgraded. Upgrade is complete")
return

try:
# Use the unit number instead of the upgrade stack to avoid race conditions
# (i.e. the leader updates the upgrade stack after this hook runs).
next_partition = unit_number - 1
logger.debug(f"Set rolling update partition to unit {next_partition}")
self._set_rolling_update_partition(partition=next_partition)
except KubernetesClientError:
logger.exception("Cannot set rolling update partition")
self.set_unit_failed()
self.log_rollback_instructions()

def _set_rolling_update_partition(self, partition: int) -> None:
"""Patch the StatefulSet's `spec.updateStrategy.rollingUpdate.partition`.

@@ -1243,3 +1220,29 @@ def _set_rolling_update_partition(self, partition: int) -> None:
return

raise NotImplementedError

def _update_stack(self) -> None:
"""Updates the upgrade-stack on the leader unit."""
if not self.peer_relation or not self.charm.unit.is_leader() or not self.upgrade_stack:
return

# don't set partition on last unit, as upgrade_stack attr should be empty
if self.substrate == "k8s" and self.upgrade_stack:
try:
# stack is already popped, new partition is current top of stack
logger.debug(f"Setting rolling update partition to unit {self.upgrade_stack[-1]}")
self._set_rolling_update_partition(self.upgrade_stack[-1])
except KubernetesClientError:
self.set_unit_failed()
return

# writes the mutated attr back to rel data
self.peer_relation.data[self.charm.app].update(
{"upgrade-stack": json.dumps(self.upgrade_stack)}
)

# recurse on leader to ensure relation changed event not lost
# in case leader is next or the last unit to complete
self.charm.on[self.relation_name].relation_changed.emit(
self.model.get_relation(self.relation_name)
)

Reviewer comment: I think we can allow users to also select the `auto` resume strategy on k8s.
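
One possible way to act on this suggestion, sketched against the names used in this diff (a hypothetical helper, not part of the PR): resolve the strategy from the `pre-upgrade-check` action parameter on both substrates and keep 'safe' as the k8s default so current behaviour is preserved.

from typing import Optional


def resolve_resume_strategy(substrate: str, requested: Optional[str]) -> str:
    """Hypothetical helper: honour a user-requested strategy on both substrates."""
    default = "safe" if substrate == "k8s" else "auto"
    return requested or default


print(resolve_resume_strategy("k8s", None))    # safe -> today's k8s behaviour
print(resolve_resume_strategy("k8s", "auto"))  # auto -> what this comment asks for
print(resolve_resume_strategy("vm", None))     # auto -> today's VM default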