diff --git a/src/charm.py b/src/charm.py index bd79d3d..6165ec5 100755 --- a/src/charm.py +++ b/src/charm.py @@ -13,7 +13,7 @@ from typing import List, Optional from charms.redis_k8s.v0.redis import RedisProvides -from ops.charm import ActionEvent, CharmBase +from ops.charm import ActionEvent, CharmBase, UpgradeCharmEvent from ops.framework import EventBase from ops.main import main from ops.model import ActiveStatus, BlockedStatus, ModelError, Relation, WaitingStatus @@ -22,12 +22,12 @@ from redis.exceptions import RedisError from tenacity import before_log, retry, stop_after_attempt, wait_fixed -from exceptions import RedisFailoverCheckError, RedisFailoverInProgressError from literals import ( LEADER_HOST_KEY, PEER, PEER_PASSWORD_KEY, REDIS_PORT, + REDIS_REL_NAME, SENTINEL_PASSWORD_KEY, SOCKET_TIMEOUT, WAITING_MESSAGE, @@ -86,14 +86,42 @@ def _redis_pebble_ready(self, event) -> None: event.defer() return - def _upgrade_charm(self, _) -> None: + def _upgrade_charm(self, event: UpgradeCharmEvent) -> None: """Handle the upgrade_charm event. - Tries to store the certificates on the redis container, as new `juju attach-resource` + Check for failover status and update connection information for redis relation and + current_master. + Also tries to store the certificates on the redis container, as new `juju attach-resource` will trigger this event. """ self._store_certificates() + # NOTE: This is the case of a single unit deployment. If that's the case, the charm + # doesn't need to check for failovers or figure out who the master is. 
+ if not self._peers.units: + return + + # Pick a different unit to connect to sentinel + k8s_host = self._k8s_hostname(name=list(self._peers.units)[0].name) + if not self._is_failover_finished(host=k8s_host): + logger.info("Failover didn't finish, deferring") + event.defer() + return + + # NOTE: If unit is the leader, update application databag directly: peer_relation_changed + # will trigger for the rest of the units and they will update connection information. If + # unit is not a leader, add a key to the unit databag so peer_relation_changed + # triggers for the leader unit and application databag is updated. + if self.unit.is_leader(): + info = self.sentinel.get_master_info(host=k8s_host) + logger.debug(f"Master info: {info}") + logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}") + self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"] + else: + relation = self.model.get_relation(relation_name=REDIS_REL_NAME) + if relation: + self._peers.data[self.unit]["upgrading"] = "true" + def _leader_elected(self, event) -> None: """Handle the leader_elected event. 
@@ -121,9 +149,7 @@ def _leader_elected(self, event) -> None: # TODO extract to method shared with relation_departed self._update_application_master() self._update_quorum() - try: - self._is_failover_finished() - except (RedisFailoverCheckError, RedisFailoverInProgressError): + if not self._is_failover_finished(): logger.info("Failover didn't finish, deferring") event.defer() return @@ -165,7 +191,13 @@ def _update_status(self, _) -> None: def _peer_relation_changed(self, event): """Handle relation for joining units.""" - if not self._check_master(): + if not self._master_up_to_date(): + logger.error(f"Unit {self.unit.name} doesn't agree on tracked master") + if not self._is_failover_finished(): + logger.info("Failover didn't finish, deferring") + event.defer() + return + if self.unit.is_leader(): # Update who the current master is self._update_application_master() @@ -175,6 +207,12 @@ def _peer_relation_changed(self, event): if self._peers.data[self.app].get("enable-password", "true") == "false": self._update_layer() + relation = self.model.get_relation(relation_name=REDIS_REL_NAME) + if relation: + relation.data[self.model.unit]["hostname"] = socket.gethostbyname(self.current_master) + if self._peers.data[self.unit].get("upgrading", "false") == "true": + self._peers.data[self.unit]["upgrading"] = "" + if not (self.unit.is_leader() and event.unit): return @@ -193,7 +231,7 @@ def _peer_relation_departed(self, event): if not self.unit.is_leader(): return - if not self._check_master(): + if not self._master_up_to_date(): self._update_application_master() # Quorum is updated beforehand, since removal of more units than current majority @@ -201,9 +239,7 @@ def _peer_relation_departed(self, event): logger.info("Updating quorum") self._update_quorum() - try: - self._is_failover_finished() - except (RedisFailoverCheckError, RedisFailoverInProgressError): + if not self._is_failover_finished(): msg = "Failover didn't finish, deferring" logger.info(msg) self.unit.status == 
WaitingStatus(msg) @@ -490,7 +526,7 @@ def _k8s_hostname(self, name: str) -> str: """Create a DNS name for a Redis unit name. Args: - name: the Redis unit name, e.g. "redis-k8s-0". + name: the Redis unit name, e.g. "redis-k8s/0". Returns: A string representing the hostname of the Redis unit. @@ -522,17 +558,17 @@ def _redis_client(self, hostname="localhost") -> Redis: finally: client.close() - def _check_master(self) -> bool: - """Connect to the current stored master and query role.""" - with self._redis_client(hostname=self.current_master) as redis: - try: - result = redis.execute_command("ROLE") - except (ConnectionError, TimeoutError) as e: - logger.warning("Error trying to check master: {}".format(e)) - return False + def _master_up_to_date(self, host="0.0.0.0") -> bool: + """Check if stored master is the same as sentinel tracked. - if result[0] == "master": - return True + Args: + host: string to connect to sentinel + """ + info = self.sentinel.get_master_info(host=host) + if info is None: + return False + elif (info["ip"] == self.current_master) and ("s_down" not in info["flags"]): + return True return False @@ -544,6 +580,7 @@ def _update_application_master(self) -> None: logger.warning("Could not update current master") return + logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}") self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"] def _sentinel_failover(self, departing_unit_name: str) -> None: @@ -565,19 +602,27 @@ def _sentinel_failover(self, departing_unit_name: str) -> None: reraise=True, before=before_log(logger, logging.DEBUG), ) - def _is_failover_finished(self) -> None: - """Check if failover is still in progress.""" + def _is_failover_finished(self, host="localhost") -> bool: + """Check if failover is still in progress. + + Args: + host: string to connect to sentinel. 
+ + Returns: + True if failover is finished, false otherwise + """ logger.warning("Checking if failover is finished.") - info = self.sentinel.get_master_info() + info = self.sentinel.get_master_info(host=host) if info is None: logger.warning("Could not check failover status") - raise RedisFailoverCheckError - - if "failover-state" in info: + return False + if "failover_in_progress" in info["flags"]: logger.warning( "Failover taking place. Current status: {}".format(info["failover-state"]) ) - raise RedisFailoverInProgressError + return False + + return True def _update_quorum(self) -> None: """Connect to all Sentinels deployed to update the quorum.""" diff --git a/src/exceptions.py b/src/exceptions.py deleted file mode 100644 index 089b5a3..0000000 --- a/src/exceptions.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. - -"""Module with custom exceptions related to the Redis charm.""" - - -class RedisOperatorError(Exception): - """Base class for exceptions in this module.""" - - def __repr__(self): - """String representation of the Error class.""" - return "<{}.{} {}>".format(type(self).__module__, type(self).__name__, self.args) - - @property - def name(self): - """Return a string representation of the model plus class.""" - return "<{}.{}>".format(type(self).__module__, type(self).__name__) - - @property - def message(self): - """Return the message passed as an argument.""" - return self.args[0] - - -class RedisFailoverInProgressError(RedisOperatorError): - """Exception raised when failover is in progress.""" - - -class RedisFailoverCheckError(RedisOperatorError): - """Exception raised when failover status cannot be checked.""" diff --git a/src/literals.py b/src/literals.py index 4dded05..20f09bd 100644 --- a/src/literals.py +++ b/src/literals.py @@ -6,6 +6,7 @@ WAITING_MESSAGE = "Waiting for Redis..." 
PEER = "redis-peers" +REDIS_REL_NAME = "redis" PEER_PASSWORD_KEY = "redis-password" SENTINEL_PASSWORD_KEY = "sentinel-password" LEADER_HOST_KEY = "leader-host" diff --git a/src/sentinel.py b/src/sentinel.py index e3ca061..2d32b7d 100644 --- a/src/sentinel.py +++ b/src/sentinel.py @@ -147,7 +147,7 @@ def _copy_file(self, path: str, rendered: str, container: str) -> None: group="redis", ) - def get_master_info(self, host="localhost") -> Optional[dict]: + def get_master_info(self, host="0.0.0.0") -> Optional[dict]: """Connect to sentinel and return the current master.""" with self.sentinel_client(host) as sentinel: try: diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index ba1907b..a0b7e9f 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -147,6 +147,7 @@ async def test_same_password_after_scaling(ops_test: OpsTest): ) +@pytest.mark.skip # TLS will not be implemented as resources in the future @pytest.mark.tls_tests async def test_blocked_if_no_certificates(ops_test: OpsTest): """Check the application status on TLS enable. @@ -166,6 +167,7 @@ async def test_blocked_if_no_certificates(ops_test: OpsTest): await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000) +@pytest.mark.skip # TLS will not be implemented as resources in the future @pytest.mark.tls_tests async def test_enable_tls(ops_test: OpsTest): """Check adding TLS certificates and enabling them. 
diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 7e1e555..3ed7b36 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -177,7 +177,8 @@ def mock_get_container(name): self.assertEqual(self.harness.charm.app.status, ActiveStatus()) self.assertEqual(self.harness.get_workload_version(), "6.0.11") - def test_password_on_leader_elected(self): + @mock.patch.object(RedisK8sCharm, "_is_failover_finished") + def test_password_on_leader_elected(self, _): # Assert that there is no password in the peer relation. self.assertFalse(self.harness.charm._get_password()) @@ -314,8 +315,13 @@ def test_non_leader_unit_as_replica(self, execute_command): # Custom responses to Redis `execute_command` call def my_side_effect(value: str): mapping = { - "ROLE": ["master"], f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK", + f"SENTINEL MASTER {self.harness.charm._name}": [ + "ip", + APPLICATION_DATA["leader-host"], + "flags", + "master", + ], } return mapping.get(value) @@ -360,9 +366,13 @@ def test_application_data_update_after_failover(self, execute_command): # Custom responses to Redis `execute_command` call def my_side_effect(value: str): mapping = { - "ROLE": ["non-master"], f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK", - f"SENTINEL MASTER {self.harness.charm._name}": ["ip", "different-leader"], + f"SENTINEL MASTER {self.harness.charm._name}": [ + "ip", + "different-leader", + "flags", + "s_down", + ], } return mapping.get(value) @@ -384,16 +394,26 @@ def my_side_effect(value: str): updated_data = self.harness.get_relation_data(rel.id, "redis-k8s") self.assertEqual(updated_data["leader-host"], "different-leader") + # Reset the application data to the initial state + self.harness.update_relation_data(rel.id, "redis-k8s", APPLICATION_DATA) + + # Now check that a pod reschedule will also result in updated information + self.harness.charm.on.upgrade_charm.emit() + + updated_data = self.harness.get_relation_data(rel.id, "redis-k8s") + 
self.assertEqual(updated_data["leader-host"], "different-leader") + @mock.patch.object(Redis, "execute_command") def test_forced_failover_when_unit_departed_is_master(self, execute_command): # Custom responses to Redis `execute_command` call def my_side_effect(value: str): mapping = { - "ROLE": ["non-master"], f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK", f"SENTINEL MASTER {self.harness.charm._name}": [ "ip", self.harness.charm._k8s_hostname("redis-k8s/1"), + "flags", + "master", ], } return mapping.get(value)