Update relation ip on primary change (#55)
* update relation ip on primary change
zmraul committed Oct 17, 2022
1 parent 8e41c51 commit dc437fb
Showing 6 changed files with 104 additions and 67 deletions.
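
For orientation: the `Sentinel.get_master_info` helper that this change leans on wraps Redis Sentinel's `SENTINEL MASTER <name>` command, whose reply is a flat list of field/value pairs (the unit-test fixtures further down mock exactly that shape). The following is an illustrative sketch, not part of the diff, of how such a reply maps onto the `info` dict (`info["ip"]`, `info["flags"]`) used throughout the change; the sample values are assumptions.

from typing import List, Optional

def parse_master_info(reply: Optional[List[str]]) -> Optional[dict]:
    """Pair consecutive items of a flat Sentinel reply into a dict, or return None."""
    if not reply:
        return None
    return dict(zip(reply[0::2], reply[1::2]))

# Sample reply shaped like the fixtures in tests/unit/test_charm.py (values are made up).
sample = ["ip", "redis-k8s-0.redis-k8s-endpoints", "flags", "master", "num-other-sentinels", "2"]
info = parse_master_info(sample)
print(info["ip"])     # redis-k8s-0.redis-k8s-endpoints
print(info["flags"])  # master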
105 changes: 75 additions & 30 deletions src/charm.py
@@ -13,7 +13,7 @@
from typing import List, Optional

from charms.redis_k8s.v0.redis import RedisProvides
from ops.charm import ActionEvent, CharmBase
from ops.charm import ActionEvent, CharmBase, UpgradeCharmEvent
from ops.framework import EventBase
from ops.main import main
from ops.model import ActiveStatus, BlockedStatus, ModelError, Relation, WaitingStatus
@@ -22,12 +22,12 @@
from redis.exceptions import RedisError
from tenacity import before_log, retry, stop_after_attempt, wait_fixed

from exceptions import RedisFailoverCheckError, RedisFailoverInProgressError
from literals import (
LEADER_HOST_KEY,
PEER,
PEER_PASSWORD_KEY,
REDIS_PORT,
REDIS_REL_NAME,
SENTINEL_PASSWORD_KEY,
SOCKET_TIMEOUT,
WAITING_MESSAGE,
@@ -86,14 +86,42 @@ def _redis_pebble_ready(self, event) -> None:
event.defer()
return

def _upgrade_charm(self, _) -> None:
def _upgrade_charm(self, event: UpgradeCharmEvent) -> None:
"""Handle the upgrade_charm event.
Tries to store the certificates on the redis container, as new `juju attach-resource`
Check for failover status and update connection information for redis relation and
current_master.
Also tries to store the certificates on the redis container, as new `juju attach-resource`
will trigger this event.
"""
self._store_certificates()

# NOTE: This is the case of a single unit deployment. If that's the case, the charm
# doesn't need to check for failovers or figure out who the master is.
if not self._peers.units:
return

# Pick a different unit to connect to sentinel
k8s_host = self._k8s_hostname(name=list(self._peers.units)[0].name)
if not self._is_failover_finished(host=k8s_host):
logger.info("Failover didn't finish, deferring")
event.defer()
return

# NOTE: If unit is the leader, update application databag directly: peer_relation_changed
# will trigger for the rest of the units and they will update connection information. If
# unit is not a leader, add a key to the application databag so peer_relation_changed
# triggers for the leader unit and application databag is updated.
if self.unit.is_leader():
info = self.sentinel.get_master_info(host=k8s_host)
logger.debug(f"Master info: {info}")
logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}")
self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"]
else:
relation = self.model.get_relation(relation_name=REDIS_REL_NAME)
if relation:
self._peers.data[self.unit]["upgrading"] = "true"
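
To make the NOTE above concrete: writing a key into a peer unit's databag fires relation-changed for the other units on the peer relation, which is how a non-leader nudges the leader into refreshing the master information (the flag is reset again in `_peer_relation_changed`). The sketch below demonstrates that mechanism with the ops test harness; the charm class, application name and metadata are illustrative assumptions, not code from this repository.

from ops.charm import CharmBase
from ops.testing import Harness


class MinimalPeerCharm(CharmBase):
    """Toy charm that records peer relation-changed events."""

    def __init__(self, *args):
        super().__init__(*args)
        self.seen = []  # names of units whose databag changes we observed
        self.framework.observe(
            self.on["redis-peers"].relation_changed, self._on_peer_changed
        )

    def _on_peer_changed(self, event):
        # In the real charm the leader refreshes LEADER_HOST_KEY at this point.
        self.seen.append(event.unit.name if event.unit else None)


METADATA = """
name: toy-redis
peers:
  redis-peers:
    interface: redis-peers
"""

harness = Harness(MinimalPeerCharm, meta=METADATA)
harness.begin()
harness.set_leader(True)
rel_id = harness.add_relation("redis-peers", "toy-redis")
harness.add_relation_unit(rel_id, "toy-redis/1")

# A peer unit flags that it is upgrading; relation-changed fires on our (leader) unit.
harness.update_relation_data(rel_id, "toy-redis/1", {"upgrading": "true"})
print("toy-redis/1" in harness.charm.seen)  # True
harness.cleanup()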

def _leader_elected(self, event) -> None:
"""Handle the leader_elected event.
@@ -121,9 +149,7 @@ def _leader_elected(self, event) -> None:
# TODO extract to method shared with relation_departed
self._update_application_master()
self._update_quorum()
try:
self._is_failover_finished()
except (RedisFailoverCheckError, RedisFailoverInProgressError):
if not self._is_failover_finished():
logger.info("Failover didn't finish, deferring")
event.defer()
return
@@ -165,7 +191,13 @@ def _update_status(self, _) -> None:

def _peer_relation_changed(self, event):
"""Handle relation for joining units."""
if not self._check_master():
if not self._master_up_to_date():
logger.error(f"Unit {self.unit.name} doesn't agree on tracked master")
if not self._is_failover_finished():
logger.info("Failover didn't finish, deferring")
event.defer()
return

if self.unit.is_leader():
# Update who the current master is
self._update_application_master()
@@ -175,6 +207,12 @@ def _peer_relation_changed(self, event):
if self._peers.data[self.app].get("enable-password", "true") == "false":
self._update_layer()

relation = self.model.get_relation(relation_name=REDIS_REL_NAME)
if relation:
relation.data[self.model.unit]["hostname"] = socket.gethostbyname(self.current_master)
if self._peers.data[self.unit].get("upgrading", "false") == "true":
self._peers.data[self.unit]["upgrading"] = ""

if not (self.unit.is_leader() and event.unit):
return
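
The `hostname` update above is the heart of the commit title: the `redis` relation databag is refreshed with the address resolved from the tracked master, so consuming charms keep a reachable endpoint after the master moves. A tiny illustrative sketch of that resolution step; the host name, fallback and port are assumptions.

import socket

# current_master holds the pod DNS name the charm tracks (illustrative value below).
current_master = "redis-k8s-0.redis-k8s-endpoints"
try:
    resolved = socket.gethostbyname(current_master)  # e.g. "10.1.33.17"
except socket.gaierror:
    resolved = current_master  # fall back to the DNS name if resolution fails
relation_update = {"hostname": resolved, "port": "6379"}
print(relation_update)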

@@ -193,17 +231,15 @@ def _peer_relation_departed(self, event):
if not self.unit.is_leader():
return

if not self._check_master():
if not self._master_up_to_date():
self._update_application_master()

# Quorum is updated beforehand, since removal of more units than current majority
# could lead to the cluster never reaching quorum.
logger.info("Updating quorum")
self._update_quorum()

try:
self._is_failover_finished()
except (RedisFailoverCheckError, RedisFailoverInProgressError):
if not self._is_failover_finished():
msg = "Failover didn't finish, deferring"
logger.info(msg)
self.unit.status = WaitingStatus(msg)
@@ -490,7 +526,7 @@ def _k8s_hostname(self, name: str) -> str:
"""Create a DNS name for a Redis unit name.
Args:
name: the Redis unit name, e.g. "redis-k8s-0".
name: the Redis unit name, e.g. "redis-k8s/0".
Returns:
A string representing the hostname of the Redis unit.
@@ -522,17 +558,17 @@ def _redis_client(self, hostname="localhost") -> Redis:
finally:
client.close()

def _check_master(self) -> bool:
    """Connect to the current stored master and query role."""
    with self._redis_client(hostname=self.current_master) as redis:
        try:
            result = redis.execute_command("ROLE")
        except (ConnectionError, TimeoutError) as e:
            logger.warning("Error trying to check master: {}".format(e))
            return False

        if result[0] == "master":
            return True

    return False

def _master_up_to_date(self, host="0.0.0.0") -> bool:
    """Check if stored master is the same as sentinel tracked.

    Args:
        host: string to connect to sentinel.

    Returns:
        True if the stored master matches the sentinel-tracked master, false otherwise.
    """
    info = self.sentinel.get_master_info(host=host)
    if info is None:
        return False
    elif (info["ip"] == self.current_master) and ("s_down" not in info["flags"]):
        return True

    return False
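
Restated as a standalone function, the new check's acceptance criteria are easy to see: the address Sentinel reports must match the stored leader host, and the master must not carry the `s_down` (subjectively down) flag. This is an illustrative restatement, not charm code; the sample addresses are made up.

def master_up_to_date(info, stored_master):
    # Mirrors _master_up_to_date above: None means Sentinel could not be queried.
    if info is None:
        return False
    return info["ip"] == stored_master and "s_down" not in info["flags"]

leader = "redis-k8s-0.redis-k8s-endpoints"  # stand-in for the stored LEADER_HOST_KEY value
print(master_up_to_date({"ip": leader, "flags": "master"}, leader))         # True
print(master_up_to_date({"ip": leader, "flags": "s_down,master"}, leader))  # False: flagged down
print(master_up_to_date({"ip": "10.1.33.99", "flags": "master"}, leader))   # False: master moved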

@@ -544,6 +580,7 @@ def _update_application_master(self) -> None:
logger.warning("Could not update current master")
return

logger.info(f"Unit {self.unit.name} updating master info to {info['ip']}")
self._peers.data[self.app][LEADER_HOST_KEY] = info["ip"]

def _sentinel_failover(self, departing_unit_name: str) -> None:
@@ -565,19 +602,27 @@ def _sentinel_failover(self, departing_unit_name: str) -> None:
reraise=True,
before=before_log(logger, logging.DEBUG),
)
def _is_failover_finished(self) -> None:
"""Check if failover is still in progress."""
def _is_failover_finished(self, host="localhost") -> bool:
"""Check if failover is still in progress.
Args:
host: string to connect to sentinel.
Returns:
True if failover is finished, false otherwise
"""
logger.warning("Checking if failover is finished.")
info = self.sentinel.get_master_info()
info = self.sentinel.get_master_info(host=host)
if info is None:
logger.warning("Could not check failover status")
raise RedisFailoverCheckError

if "failover-state" in info:
return False
if "failover_in_progress" in info["flags"]:
logger.warning(
"Failover taking place. Current status: {}".format(info["failover-state"])
)
raise RedisFailoverInProgressError
return False

return True

def _update_quorum(self) -> None:
"""Connect to all Sentinels deployed to update the quorum."""
31 changes: 0 additions & 31 deletions src/exceptions.py

This file was deleted.

1 change: 1 addition & 0 deletions src/literals.py
@@ -6,6 +6,7 @@

WAITING_MESSAGE = "Waiting for Redis..."
PEER = "redis-peers"
REDIS_REL_NAME = "redis"
PEER_PASSWORD_KEY = "redis-password"
SENTINEL_PASSWORD_KEY = "sentinel-password"
LEADER_HOST_KEY = "leader-host"
2 changes: 1 addition & 1 deletion src/sentinel.py
@@ -147,7 +147,7 @@ def _copy_file(self, path: str, rendered: str, container: str) -> None:
group="redis",
)

def get_master_info(self, host="localhost") -> Optional[dict]:
def get_master_info(self, host="0.0.0.0") -> Optional[dict]:
"""Connect to sentinel and return the current master."""
with self.sentinel_client(host) as sentinel:
try:
2 changes: 2 additions & 0 deletions tests/integration/test_charm.py
@@ -147,6 +147,7 @@ async def test_same_password_after_scaling(ops_test: OpsTest):
)


@pytest.mark.skip # TLS will not be implemented as resources in the future
@pytest.mark.tls_tests
async def test_blocked_if_no_certificates(ops_test: OpsTest):
"""Check the application status on TLS enable.
@@ -166,6 +167,7 @@ async def test_blocked_if_no_certificates(ops_test: OpsTest):
await ops_test.model.wait_for_idle(apps=[APP_NAME], status="active", timeout=1000)


@pytest.mark.skip # TLS will not be implemented as resources in the future
@pytest.mark.tls_tests
async def test_enable_tls(ops_test: OpsTest):
"""Check adding TLS certificates and enabling them.
30 changes: 25 additions & 5 deletions tests/unit/test_charm.py
@@ -177,7 +177,8 @@ def mock_get_container(name):
self.assertEqual(self.harness.charm.app.status, ActiveStatus())
self.assertEqual(self.harness.get_workload_version(), "6.0.11")

def test_password_on_leader_elected(self):
@mock.patch.object(RedisK8sCharm, "_is_failover_finished")
def test_password_on_leader_elected(self, _):
# Assert that there is no password in the peer relation.
self.assertFalse(self.harness.charm._get_password())

@@ -314,8 +315,13 @@ def test_non_leader_unit_as_replica(self, execute_command):
# Custom responses to Redis `execute_command` call
def my_side_effect(value: str):
mapping = {
"ROLE": ["master"],
f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK",
f"SENTINEL MASTER {self.harness.charm._name}": [
"ip",
APPLICATION_DATA["leader-host"],
"flags",
"master",
],
}
return mapping.get(value)

@@ -360,9 +366,13 @@ def test_application_data_update_after_failover(self, execute_command):
# Custom responses to Redis `execute_command` call
def my_side_effect(value: str):
mapping = {
"ROLE": ["non-master"],
f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK",
f"SENTINEL MASTER {self.harness.charm._name}": ["ip", "different-leader"],
f"SENTINEL MASTER {self.harness.charm._name}": [
"ip",
"different-leader",
"flags",
"s_down",
],
}
return mapping.get(value)

@@ -384,16 +394,26 @@ def my_side_effect(value: str):
updated_data = self.harness.get_relation_data(rel.id, "redis-k8s")
self.assertEqual(updated_data["leader-host"], "different-leader")

# Reset the application data to the initial state
self.harness.update_relation_data(rel.id, "redis-k8s", APPLICATION_DATA)

# Now check that a pod reschedule will also result in updated information
self.harness.charm.on.upgrade_charm.emit()

updated_data = self.harness.get_relation_data(rel.id, "redis-k8s")
self.assertEqual(updated_data["leader-host"], "different-leader")

@mock.patch.object(Redis, "execute_command")
def test_forced_failover_when_unit_departed_is_master(self, execute_command):
# Custom responses to Redis `execute_command` call
def my_side_effect(value: str):
mapping = {
"ROLE": ["non-master"],
f"SENTINEL CKQUORUM {self.harness.charm._name}": "OK",
f"SENTINEL MASTER {self.harness.charm._name}": [
"ip",
self.harness.charm._k8s_hostname("redis-k8s/1"),
"flags",
"master",
],
}
return mapping.get(value)
