From 36036551ff2deb1e0fa242c4d79340ab8f9f42f9 Mon Sep 17 00:00:00 2001
From: Claudio Busse
Date: Fri, 31 May 2024 13:11:03 +0200
Subject: [PATCH] fix(10661): volumes don't block deletion of unreachable nodes

---
 .../controllers/machine/machine_controller.go |  11 +-
 .../machine/machine_controller_test.go        | 111 ++++++++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go
index f519f33db114..decc312cc9ac 100644
--- a/internal/controllers/machine/machine_controller.go
+++ b/internal/controllers/machine/machine_controller.go
@@ -401,6 +401,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
 
 	// After node draining is completed, and if isNodeVolumeDetachingAllowed returns True, make sure all
 	// volumes are detached before proceeding to delete the Node.
+	// In case the node is unreachable, the detachment is skipped.
 	if r.isNodeVolumeDetachingAllowed(m) {
 		// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
 		// so its transition time can be used to record the first time we wait for volume detachment.
@@ -690,7 +691,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 	return ctrl.Result{}, nil
 }
 
-// shouldWaitForNodeVolumes returns true if node status still have volumes attached
+// shouldWaitForNodeVolumes returns true if node status still has volumes attached and the node is reachable
 // pod deletion and volume detach happen asynchronously, so pod could be deleted before volume detached from the node
 // this could cause issue for some storage provisioner, for example, vsphere-volume this is problematic
 // because if the node is deleted before detach success, then the underline VMDK will be deleted together with the Machine
@@ -712,6 +713,14 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus
 		return true, err
 	}
 
+	if noderefutil.IsNodeUnreachable(node) {
+		// If a node is unreachable, we can't detach the volume.
+		// We need to skip the detachment as we otherwise block deletions
+		// of unreachable nodes when a volume is attached.
+ log.Info("Skipping volume detachment as node is unreachable.") + return true, nil + } + return len(node.Status.VolumesAttached) != 0, nil } diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index b547f6afa87e..95fdb497d5f6 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -1599,6 +1599,117 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) { } } +func TestShouldWaitForNodeVolumes(t *testing.T) { + testCluster := &clusterv1.Cluster{ + TypeMeta: metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()}, + ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: "test-cluster"}, + } + + attachedVolumes := []corev1.AttachedVolume{ + { + Name: "test-volume", + DevicePath: "test-path", + }, + } + + tests := []struct { + name string + node *corev1.Node + expected bool + }{ + { + name: "Node has volumes attached", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + VolumesAttached: attachedVolumes, + }, + }, + expected: true, + }, + { + name: "Node has no volumes attached", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + expected: false, + }, + { + name: "Node is unreachable and has volumes attached", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unreachable-node", + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionUnknown, + }, + }, + VolumesAttached: attachedVolumes, + }, + }, + expected: true, + }, + { + name: "Node is unreachable and has no volumes attached", + node: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "unreachable-node", + }, + Status: corev1.NodeStatus{ + Conditions: []corev1.NodeCondition{ + { + Type: corev1.NodeReady, + Status: corev1.ConditionUnknown, + }, + }, + }, + }, + expected: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + + var objs []client.Object + objs = append(objs, testCluster, tt.node) + + c := fake.NewClientBuilder().WithObjects(objs...).Build() + tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster)) + r := &Reconciler{ + Client: c, + UnstructuredCachingClient: c, + Tracker: tracker, + } + + got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(got).To(Equal(tt.expected)) + }) + } +} + func TestIsDeleteNodeAllowed(t *testing.T) { deletionts := metav1.Now()