From 63dd4bc1d1386b821b99d13d72c4099ad72ef753 Mon Sep 17 00:00:00 2001
From: Arthur Novik
Date: Wed, 26 Jun 2024 12:20:35 +0300
Subject: [PATCH] DC lost during wait

The initial error we faced during our online storage upgrade:

```
Command: x Exascaler Install: apply_lustre_params,create_udev_rules,email,emf_agent,emf_node_manager,ha,hosts,ipmi,kdump,logging,lustre,lvm,mdt_backup,modprobe,nics,ntp,os,ost_pools,restart_network,serial,start_cluster,sync_exa_toml (Config ver. 1) failed
User: api
Job: x es-install --steps start_cluster on node5 failed
Step: x Run config-pacemaker on node5 failed (took: 12s 534ms 171us 586ns)
Result (Error): Bad Exit Code: 1.
Started: 2024-02-07T03:26:16.158Z
Ended: 2024-02-07T03:26:28.692Z
Stdout: Running Command: config-pacemaker --unmanaged-emf
Stderr: x Command has failed. Code: exit status: 1
Stdout: INFO: cib.commit: committed '5e8558de-1ceb-46c2-bd70-1ab4d8504c9f' shadow CIB to the cluster
Stderr: WARNING: DC lost during wait
```

The source of our problems is case 3 below (a DC election, or voting, during cluster recalculation):

```
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
Designated Controller is: es-2-virt1
0
[root@es-1-virt1 ~]# crm cluster stop
INFO: The cluster stack stopped on es-1-virt1
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: Could not connect to controller: Connection refused
error: Command failed: Connection refused
102
[root@es-1-virt1 ~]# crm cluster start
INFO: The cluster stack started on es-1-virt1
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: No reply received from controller before timeout (1000ms)
error: Command failed: Connection timed out
124
```

Potentially, dc_waiter() could loop forever, but that would mean pacemaker itself is stuck in the same state, and in the worst case the wait should not exceed 'dc-deadtime'.
---
 crmsh/utils.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/crmsh/utils.py b/crmsh/utils.py
index b91abbc71..1d022d163 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -991,12 +991,12 @@ def append_file(dest, src):
 
 def get_dc(peer=None):
     cmd = "crmadmin -D -t 1"
-    _, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
+    rc, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
     if not out:
-        return None
+        return (None, rc)
     if not out.startswith("Designated"):
-        return None
-    return out.split()[-1]
+        return (None, rc)
+    return (out.split()[-1], rc)
 
 
 def wait4dc(what="", show_progress=True):
@@ -1022,8 +1022,21 @@ def wait4dc(what="", show_progress=True):
 
     There's no timeout, as we expect the DC to eventually becomes idle.
     '''
-    dc = get_dc()
-    if not dc:
+    def dc_waiter():
+        while True:
+            dc, rc = get_dc()
+            if rc == 0:
+                return dc
+            if rc == 102 or rc == 1:
+                logger.warning("Could not connect to the controller: Connection refused")
+                return None
+            if rc == 124:
+                logger.warning("No reply received from the controller before timeout")
+                continue
+            logger.warning("Unknown return code from crmadmin: %d", rc)
+            return None
+
+    if not dc_waiter():
         logger.warning("can't find DC")
         return False
     cmd = "crm_attribute -Gq -t crm_config -n crmd-transition-delay 2> /dev/null"
@@ -1039,7 +1052,7 @@ def wait4dc(what="", show_progress=True):
     max_sleep = 1.00
     sleep_time = init_sleep
     while True:
-        dc = get_dc()
+        dc = dc_waiter()
         if not dc:
             logger.warning("DC lost during wait")
             return False
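
For reviewers, a minimal standalone sketch (not part of the patch) of the retry behaviour dc_waiter() implements, with get_dc() stubbed out instead of going through crmsh's cluster shell; the node name "node-2" and the reply sequence are invented for illustration. The rc values mirror the crmadmin transcript above: 0 means a DC is known, 1/102 mean connection refused, 124 means no reply before the timeout.

```python
# Standalone sketch of the dc_waiter() retry logic, driven by a stubbed
# get_dc(). Not part of the patch; "node-2" and the reply sequence are
# invented for illustration.

# Simulate two "DC election in progress" timeouts followed by a stable DC.
_replies = iter([(None, 124), (None, 124), ("node-2", 0)])

def fake_get_dc():
    return next(_replies)

def dc_waiter(get_dc=fake_get_dc):
    while True:
        dc, rc = get_dc()
        if rc == 0:
            return dc          # DC elected and reachable
        if rc in (1, 102):
            print("controller connection refused, giving up")
            return None        # cluster stack is down; retrying will not help
        if rc == 124:
            print("no reply before timeout, election in progress, retrying")
            continue           # in practice bounded by pacemaker's dc-deadtime
        print("unknown crmadmin return code %d, giving up" % rc)
        return None

if __name__ == "__main__":
    print("DC:", dc_waiter())  # two retries, then prints "DC: node-2"
```

Treating rc 124 as "keep waiting" rather than as a failure is what avoids the premature "DC lost during wait" abort while the election is still settling.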