From 63dd4bc1d1386b821b99d13d72c4099ad72ef753 Mon Sep 17 00:00:00 2001
From: Arthur Novik
Date: Wed, 26 Jun 2024 12:20:35 +0300
Subject: [PATCH] DC lost during wait

The initial error we faced during our online storage upgrade:

```
Command: x Exascaler Install: apply_lustre_params,create_udev_rules,email,emf_agent,emf_node_manager,ha,hosts,ipmi,kdump,logging,lustre,lvm,mdt_backup,modprobe,nics,ntp,os,ost_pools,restart_network,serial,start_cluster,sync_exa_toml (Config ver. 1) failed
User: api
Job: x es-install --steps start_cluster on node5 failed
Step: x Run config-pacemaker on node5 failed (took: 12s 534ms 171us 586ns)
Result (Error): Bad Exit Code: 1.
Started: 2024-02-07T03:26:16.158Z
Ended: 2024-02-07T03:26:28.692Z
Stdout: Running Command: config-pacemaker --unmanaged-emf
Stderr: x Command has failed. Code: exit status: 1
Stdout: INFO: cib.commit: committed '5e8558de-1ceb-46c2-bd70-1ab4d8504c9f' shadow CIB to the cluster
Stderr: WARNING: DC lost during wait
```

The source of our problems is case 3 below (a DC election, or voting, during cluster recalculation):

```
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
Designated Controller is: es-2-virt1
0
[root@es-1-virt1 ~]# crm cluster stop
INFO: The cluster stack stopped on es-1-virt1
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: Could not connect to controller: Connection refused
error: Command failed: Connection refused
102
[root@es-1-virt1 ~]# crm cluster start
INFO: The cluster stack started on es-1-virt1
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: No reply received from controller before timeout (1000ms)
error: Command failed: Connection timed out
124
```

Potentially, dc_waiter() could loop forever, but that would mean pacemaker itself is stuck in the same state, and in the worst case the wait should not exceed 'dc-deadtime'.
---
 crmsh/utils.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/crmsh/utils.py b/crmsh/utils.py
index b91abbc71..1d022d163 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -991,12 +991,12 @@ def append_file(dest, src):
 
 def get_dc(peer=None):
     cmd = "crmadmin -D -t 1"
-    _, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
+    rc, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
     if not out:
-        return None
+        return (None, rc)
     if not out.startswith("Designated"):
-        return None
-    return out.split()[-1]
+        return (None, rc)
+    return (out.split()[-1], rc)
 
 
 def wait4dc(what="", show_progress=True):
@@ -1022,8 +1022,21 @@ def wait4dc(what="", show_progress=True):
 
     There's no timeout, as we expect the DC to eventually becomes idle.
     '''
-    dc = get_dc()
-    if not dc:
+    def dc_waiter():
+        while True:
+            dc, rc = get_dc()
+            if rc == 0:
+                return dc
+            if rc == 102 or rc == 1:
+                logger.warning("Could not connect to the controller: Connection refused")
+                return None
+            if rc == 124:
+                logger.warning("No reply received from the controller before timeout")
+                continue
+            logger.warning("Unknown return code from crmadmin: %d", rc)
+            return None
+
+    if not dc_waiter():
         logger.warning("can't find DC")
         return False
     cmd = "crm_attribute -Gq -t crm_config -n crmd-transition-delay 2> /dev/null"
@@ -1039,7 +1052,7 @@ def wait4dc(what="", show_progress=True):
     max_sleep = 1.00
     sleep_time = init_sleep
     while True:
-        dc = get_dc()
+        dc = dc_waiter()
         if not dc:
             logger.warning("DC lost during wait")
             return False
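
For reviewers, a minimal standalone sketch (not part of the patch) of the retry behaviour dc_waiter() implements, with get_dc() stubbed out instead of going through crmsh's cluster shell; the node name "node-2" and the reply sequence are invented for illustration. The rc values mirror the crmadmin transcript above: 0 means a DC is known, 1/102 mean connection refused, 124 means no reply before the timeout.

```python
# Standalone sketch of the dc_waiter() retry logic, driven by a stubbed
# get_dc(). Not part of the patch; "node-2" and the reply sequence are
# invented for illustration.

# Simulate two "DC election in progress" timeouts followed by a stable DC.
_replies = iter([(None, 124), (None, 124), ("node-2", 0)])

def fake_get_dc():
    return next(_replies)

def dc_waiter(get_dc=fake_get_dc):
    while True:
        dc, rc = get_dc()
        if rc == 0:
            return dc          # DC elected and reachable
        if rc in (1, 102):
            print("controller connection refused, giving up")
            return None        # cluster stack is down; retrying will not help
        if rc == 124:
            print("no reply before timeout, election in progress, retrying")
            continue           # in practice bounded by pacemaker's dc-deadtime
        print("unknown crmadmin return code %d, giving up" % rc)
        return None

if __name__ == "__main__":
    print("DC:", dc_waiter())  # two retries, then prints "DC: node-2"
```

Treating rc 124 as "keep waiting" rather than as a failure is what avoids the premature "DC lost during wait" abort while the election is still settling.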