Skip to content

Commit

Permalink
impl compensation mechanism for driver.removeNotReadyTaint()
Browse files Browse the repository at this point in the history
  • Loading branch information
abbshr committed Mar 15, 2024
1 parent bed1afa commit 6ef167b
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
6 changes: 2 additions & 4 deletions pkg/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"net"
"strings"
"time"

"github.com/container-storage-interface/spec/lib/go/csi"
"google.golang.org/grpc"
Expand Down Expand Up @@ -129,10 +130,7 @@ func (d *Driver) Run() error {

// Remove taint from node to indicate driver startup success
// This is done at the last possible moment to prevent race conditions or false positive removals
err = removeNotReadyTaint(cloud.DefaultKubernetesAPIClient)
if err != nil {
klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
}
go tryRemoveNotReadyTaintUntilSucceed(cloud.DefaultKubernetesAPIClient, time.Second)

klog.Infof("Listening for connections on address: %#v", listener.Addr())
return d.srv.Serve(listener)
Expand Down
16 changes: 15 additions & 1 deletion pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"path/filepath"
"strconv"
"strings"
"time"

"github.com/container-storage-interface/spec/lib/go/csi"
"github.com/kubernetes-sigs/aws-efs-csi-driver/pkg/cloud"
Expand Down Expand Up @@ -452,7 +453,7 @@ type JSONPatch struct {
Value interface{} `json:"value"`
}

// removeNotReadyTaint removes the taint ebs.csi.aws.com/agent-not-ready from the local node
// removeNotReadyTaint removes the taint efs.csi.aws.com/agent-not-ready from the local node
// This taint can be optionally applied by users to prevent startup race conditions such as
// https://github.com/kubernetes/kubernetes/issues/95911
func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {
Expand Down Expand Up @@ -512,3 +513,16 @@ func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {
klog.InfoS("Removed taint(s) from local node", "node", nodeName)
return nil
}

// tryRemoveNotReadyTaintUntilSucceed calls removeNotReadyTaint repeatedly until
// it returns nil, so that a transient failure does not leave the taint on the
// node permanently.
//
// Each failed attempt is logged and followed by a pause of interval before the
// next try. A non-positive interval would make time.Sleep return immediately
// and turn the loop into a busy spin, so it is replaced with a one-second
// default.
//
// The function blocks until removal succeeds and has no other exit path, so
// callers that must not block should run it in its own goroutine.
func tryRemoveNotReadyTaintUntilSucceed(k8sClient cloud.KubernetesAPIClient, interval time.Duration) {
	// Fallback pause used when the caller passes a zero or negative interval.
	const defaultRetryInterval = time.Second
	if interval <= 0 {
		interval = defaultRetryInterval
	}

	for {
		err := removeNotReadyTaint(k8sClient)
		if err == nil {
			return
		}

		klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
		time.Sleep(interval)
	}
}

0 comments on commit 6ef167b

Please sign in to comment.