From 91ea65f928469c8cd5b8357d0a19e60fe8e96d99 Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Wed, 28 Jun 2023 14:35:56 +0200 Subject: [PATCH] webhook: Avoid setting DisableDrain while updating Disable the drain while a node is updating its configuration prevents the config-daemon to uncordon the node, leading to a node that needs to be manually uncordoned. Signed-off-by: Andrea Panattoni --- pkg/webhook/client.go | 2 +- pkg/webhook/validate.go | 43 ++++++++++++++++++++++++++++ pkg/webhook/validate_test.go | 54 +++++++++++++++++++++++++++++++----- 3 files changed, 91 insertions(+), 8 deletions(-) diff --git a/pkg/webhook/client.go b/pkg/webhook/client.go index 270e0903d..c4d3686af 100644 --- a/pkg/webhook/client.go +++ b/pkg/webhook/client.go @@ -11,7 +11,7 @@ import ( snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" ) -var snclient *snclientset.Clientset +var snclient snclientset.Interface var kubeclient *kubernetes.Clientset func SetupInClusterClient() error { diff --git a/pkg/webhook/validate.go b/pkg/webhook/validate.go index 5e298c1dc..91b3922fa 100644 --- a/pkg/webhook/validate.go +++ b/pkg/webhook/validate.go @@ -17,6 +17,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" ) @@ -47,9 +49,50 @@ func validateSriovOperatorConfig(cr *sriovnetworkv1.SriovOperatorConfig, operati warnings = append(warnings, "Node draining is disabled for applying SriovNetworkNodePolicy, it may result in workload interruption.") } + err := validateSriovOperatorConfigDisableDrain(cr) + if err != nil { + return false, warnings, err + } + return true, warnings, nil } +// validateSriovOperatorConfigDisableDrain checks if the user is setting `.Spec.DisableDrain` from false to true while +// operator is updating one or more nodes. Disabling the drain at this stage would prevent the operator to uncordon a node at +// the end of the update operation, keeping nodes un-schedulable until manual intervention. +func validateSriovOperatorConfigDisableDrain(cr *sriovnetworkv1.SriovOperatorConfig) error { + if !cr.Spec.DisableDrain { + return nil + } + + previousConfig, err := snclient.SriovnetworkV1().SriovOperatorConfigs(cr.Namespace).Get(context.Background(), cr.Name, metav1.GetOptions{}) + if err != nil { + if k8serrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("can't validate SriovOperatorConfig[%s] DisableDrain against its previous value: %q", cr.Name, err) + } + + if previousConfig.Spec.DisableDrain == cr.Spec.DisableDrain { + // DisableDrain didn't change + return nil + } + + // DisableDrain has been changed `false -> true`, check if any node is updating + nodeStates, err := snclient.SriovnetworkV1().SriovNetworkNodeStates(namespace).List(context.Background(), metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("can't validate SriovOperatorConfig[%s] DisableDrain transition to true: %q", cr.Name, err) + } + + for _, nodeState := range nodeStates.Items { + if nodeState.Status.SyncStatus == "InProgress" { + return fmt.Errorf("can't set Spec.DisableDrain = true while node[%s] is updating", nodeState.Name) + } + } + + return nil +} + func validateSriovNetworkNodePolicy(cr *sriovnetworkv1.SriovNetworkNodePolicy, operation v1.Operation) (bool, []string, error) { glog.V(2).Infof("validateSriovNetworkNodePolicy: %v", cr) var warnings []string diff --git a/pkg/webhook/validate_test.go b/pkg/webhook/validate_test.go index 028d62a5e..a82796312 100644 --- a/pkg/webhook/validate_test.go +++ b/pkg/webhook/validate_test.go @@ -1,6 +1,7 @@ package webhook import ( + "context" "fmt" "os" "testing" @@ -14,6 +15,8 @@ import ( . "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" constants "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" + + fakesnclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned/fake" ) func TestMain(m *testing.M) { @@ -129,11 +132,8 @@ func NewNode() *corev1.Node { return &corev1.Node{Spec: corev1.NodeSpec{ProviderID: "openstack"}} } -func TestValidateSriovOperatorConfigWithDefaultOperatorConfig(t *testing.T) { - var err error - var ok bool - var w []string - config := &SriovOperatorConfig{ +func newDefaultOperatorConfig() *SriovOperatorConfig { + return &SriovOperatorConfig{ ObjectMeta: metav1.ObjectMeta{ Name: "default", }, @@ -145,8 +145,15 @@ func TestValidateSriovOperatorConfigWithDefaultOperatorConfig(t *testing.T) { LogLevel: 2, }, } +} + +func TestValidateSriovOperatorConfigWithDefaultOperatorConfig(t *testing.T) { g := NewGomegaWithT(t) - ok, _, err = validateSriovOperatorConfig(config, "DELETE") + + config := newDefaultOperatorConfig() + snclient = fakesnclientset.NewSimpleClientset() + + ok, _, err := validateSriovOperatorConfig(config, "DELETE") g.Expect(err).To(HaveOccurred()) g.Expect(ok).To(Equal(false)) @@ -154,7 +161,7 @@ func TestValidateSriovOperatorConfigWithDefaultOperatorConfig(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) g.Expect(ok).To(Equal(true)) - ok, w, err = validateSriovOperatorConfig(config, "UPDATE") + ok, w, err := validateSriovOperatorConfig(config, "UPDATE") g.Expect(err).NotTo(HaveOccurred()) g.Expect(ok).To(Equal(true)) g.Expect(w[0]).To(ContainSubstring("Node draining is disabled")) @@ -164,6 +171,39 @@ func TestValidateSriovOperatorConfigWithDefaultOperatorConfig(t *testing.T) { g.Expect(ok).To(Equal(true)) } +func TestValidateSriovOperatorConfigDisableDrain(t *testing.T) { + g := NewGomegaWithT(t) + + config := newDefaultOperatorConfig() + config.Spec.DisableDrain = false + + nodeState := &SriovNetworkNodeState{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-1", Namespace: namespace}, + Status: SriovNetworkNodeStateStatus{ + SyncStatus: "InProgress", + }, + } + + snclient = fakesnclientset.NewSimpleClientset( + config, + nodeState, + ) + + config.Spec.DisableDrain = true + ok, _, err := validateSriovOperatorConfig(config, "UPDATE") + g.Expect(err).To(MatchError("can't set Spec.DisableDrain = true while node[worker-1] is updating")) + g.Expect(ok).To(Equal(false)) + + // Simulate node update finished + nodeState.Status.SyncStatus = "Succeeded" + snclient.SriovnetworkV1().SriovNetworkNodeStates(namespace). + Update(context.Background(), nodeState, metav1.UpdateOptions{}) + + ok, _, err = validateSriovOperatorConfig(config, "UPDATE") + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(ok).To(Equal(true)) +} + func TestValidateSriovNetworkNodePolicyWithDefaultPolicy(t *testing.T) { var err error var ok bool