Skip to content

Commit

Permalink
"DC lost during wait" during online upgrade
Browse files Browse the repository at this point in the history
```
Command: x Exascaler Install: apply_lustre_params,create_udev_rules,email,emf_agent,emf_node_manager,ha,hosts,ipmi,kdump,logging,lustre,lvm,mdt_backup,modprobe,nics,ntp,os,ost_pools,restart_network,serial,start_cluster,sync_exa_toml (Config ver. 1) failed
User: api

  Job: x es-install --steps start_cluster on node5 failed

    Step: x Run config-pacemaker on node5 failed (took: 12s 534ms 171us 586ns)
    Result (Error):
      Bad Exit Code: 1.
    Started: 2024-02-07T03:26:16.158Z
    Ended: 2024-02-07T03:26:28.692Z
    Stdout:
      Running Command: config-pacemaker --unmanaged-emf
    Stderr:
      x Command has failed.
      Code: exit status: 1
      Stdout: INFO: cib.commit: committed '5e8558de-1ceb-46c2-bd70-1ab4d8504c9f' shadow CIB to the cluster

      Stderr: WARNING: DC lost during wait
```
  • Loading branch information
freishutz committed Feb 8, 2024
1 parent 19d5796 commit 0a16a10
Showing 1 changed file with 20 additions and 7 deletions.
27 changes: 20 additions & 7 deletions crmsh/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -959,12 +959,12 @@ def append_file(dest, src):

def get_dc(peer=None):
cmd = "crmadmin -D -t 1"
_, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
rc, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
if not out:
return None
return (None, rc)
if not out.startswith("Designated"):
return None
return out.split()[-1]
return (None, rc)
return (out.split()[-1], rc)


def wait4dc(what="", show_progress=True):
Expand All @@ -990,8 +990,21 @@ def wait4dc(what="", show_progress=True):
There's no timeout, as we expect the DC to eventually become
idle.
'''
dc = get_dc()
if not dc:
def dc_waiter():
while True:
dc, rc = get_dc()
if rc == 0:
return dc
if rc == 102 or rc == 1:
logger.warning("Could not connect to controller: Connection refused")
return None
if rc == 124:
logger.warning("No reply received from controller before timeout")

This comment has been minimized.

Copy link
@jgrund

jgrund Feb 8, 2024

Member

It seems like we have the potential to loop forever here, no?

This comment has been minimized.

Copy link
@utopiabound

utopiabound Feb 8, 2024

Does crmadmin ever return EMEDIUMTYPE?

This comment has been minimized.

Copy link
@freishutz

freishutz Feb 8, 2024

Author Collaborator

Only if we always get 124, which means that the machine(s) are voting; but this state is under pacemaker's control, and pacemaker won't allow a node to remain in it for longer than dc-deadtime (20s):

[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
Designated Controller is: es-2-virt1
0
[root@es-1-virt1 ~]# crm cluster stop
INFO: The cluster stack stopped on es-1-virt1
[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: Could not connect to controller: Connection refused
error: Command failed: Connection refused
102

[root@es-1-virt1 ~]# crm cluster start
INFO: The cluster stack started on es-1-virt1

[root@es-1-virt1 ~]# crmadmin -D -t 1; echo $?
error: No reply received from controller before timeout (1000ms)
error: Command failed: Connection timed out
124

This comment has been minimized.

Copy link
@jgrund

jgrund Feb 8, 2024

Member

got it, thanks.

continue
logger.warning("Unknown return code from crmadmin: %d", rc)
return None

if not dc_waiter:
logger.warning("can't find DC")
return False
cmd = "crm_attribute -Gq -t crm_config -n crmd-transition-delay 2> /dev/null"
Expand All @@ -1007,7 +1020,7 @@ def wait4dc(what="", show_progress=True):
max_sleep = 1.00
sleep_time = init_sleep
while True:
dc = get_dc()
dc = dc_waiter()
if not dc:
logger.warning("DC lost during wait")
return False
Expand Down

0 comments on commit 0a16a10

Please sign in to comment.