Fix race condition in leader election for OCP
In OCP we have two steps to prepare a node before we drain it:
first we take the drain leader election and annotate the node with
Draining, and then we pause the MCP and mark the node as MCP paused.

If the config-daemon gets restarted between the first step and the
second, it gets stuck: one node takes the leader election BUT does not
mark itself as Draining, because another node is already draining,
while the node that already carries the Draining annotation tries to
take the drain lock again even though the first node holds it.

With this change, a node that already has the Draining or MCP paused
annotation does not try to take the lock again and simply continues
after the restart.

Signed-off-by: Sebastian Sch <[email protected]>
SchSeba committed Jun 11, 2023
1 parent 109c2c2 commit e81d9ad
Showing 1 changed file with 24 additions and 18 deletions.
42 changes: 24 additions & 18 deletions pkg/daemon/daemon.go
@@ -507,18 +507,22 @@ func (dn *Daemon) nodeStateSyncHandler() error {
 				go dn.getDrainLock(ctx, done)
 				<-done
 			}
 		}
 
-		if dn.openshiftContext.IsOpenshiftCluster() && !dn.openshiftContext.IsHypershift() {
-			glog.Infof("nodeStateSyncHandler(): pause MCP")
-			if err := dn.pauseMCP(); err != nil {
-				return err
-			}
+		if dn.openshiftContext.IsOpenshiftCluster() && !dn.openshiftContext.IsHypershift() && !dn.isMcpPaused() {
+			glog.Infof("nodeStateSyncHandler(): pause MCP")
+			if err := dn.pauseMCP(); err != nil {
+				return err
+			}
 		}
 
-		glog.Info("nodeStateSyncHandler(): drain node")
-		if err := dn.drainNode(); err != nil {
-			return err
+		if dn.disableDrain {
+			glog.Info("nodeStateSyncHandler(): disable drain is true skipping drain")
+		} else {
+			glog.Info("nodeStateSyncHandler(): drain node")
+			if err := dn.drainNode(); err != nil {
+				return err
+			}
 		}
 	}

@@ -578,18 +582,25 @@ func (dn *Daemon) nodeHasAnnotation(annoKey string, value string) bool {
 	return false
 }
 
+// isNodeDraining: check if the node is draining;
+// both the Draining and MCP paused annotations return true
 func (dn *Daemon) isNodeDraining() bool {
 	anno, ok := dn.node.Annotations[annoKey]
 	if !ok {
 		return false
 	}
 
-	if dn.openshiftContext.IsOpenshiftCluster() && !dn.openshiftContext.IsHypershift() {
-		// for openshift cluster draining should be true only if the annotation has MCP paused
-		return anno == annoMcpPaused
-	}
+	return anno == annoDraining || anno == annoMcpPaused
+}
 
-	return anno == annoDraining || anno == annoMcpPaused
+// isMcpPaused: return true only if the node has the MCP paused annotation
+func (dn *Daemon) isMcpPaused() bool {
+	anno, ok := dn.node.Annotations[annoKey]
+	if !ok {
+		return false
+	}
+
+	return anno == annoMcpPaused
 }
 
 func (dn *Daemon) completeDrain() error {
@@ -599,7 +610,7 @@ func (dn *Daemon) completeDrain() error {
 		}
 	}
 
-	if dn.openshiftContext.IsOpenshiftCluster() && !dn.openshiftContext.IsHypershift() {
+	if dn.openshiftContext.IsOpenshiftCluster() && !dn.openshiftContext.IsHypershift() && dn.isMcpPaused() {
 		glog.Infof("completeDrain(): resume MCP %s", dn.mcpName)
 		pausePatch := []byte("{\"spec\":{\"paused\":false}}")
 		if _, err := dn.openshiftContext.McClient.MachineconfigurationV1().MachineConfigPools().Patch(context.Background(), dn.mcpName, types.MergePatchType, pausePatch, metav1.PatchOptions{}); err != nil {
@@ -895,11 +906,6 @@ func (dn *Daemon) pauseMCP() error {
 }
 
 func (dn *Daemon) drainNode() error {
-	if dn.disableDrain {
-		glog.Info("drainNode(): disable drain is true skipping drain")
-		return nil
-	}
-
 	glog.Info("drainNode(): Update prepared")
 	var err error

