From 3adac9075ef7fa150f9c5d8b23308b5c38414e1c Mon Sep 17 00:00:00 2001
From: Yuki Iwai
Date: Fri, 15 Mar 2024 02:29:53 +0900
Subject: [PATCH] WaitForPodsReady: Reset the requeueState while reconciling
 instead of webhook (#1843)

Signed-off-by: Yuki Iwai
---
 charts/kueue/templates/webhook/webhook.yaml   |  2 --
 config/components/webhook/manifests.yaml      |  2 --
 pkg/controller/core/workload_controller.go    |  6 ++++
 pkg/webhooks/workload_webhook.go              |  6 +---
 pkg/webhooks/workload_webhook_test.go         | 16 ----------
 .../scheduler/podsready/scheduler_test.go     | 25 ++++++++++++++-
 test/integration/webhook/workload_test.go     | 31 -------------------
 7 files changed, 31 insertions(+), 57 deletions(-)

diff --git a/charts/kueue/templates/webhook/webhook.yaml b/charts/kueue/templates/webhook/webhook.yaml
index 1de8e46241..413ebb38e4 100644
--- a/charts/kueue/templates/webhook/webhook.yaml
+++ b/charts/kueue/templates/webhook/webhook.yaml
@@ -287,10 +287,8 @@ webhooks:
     - v1beta1
     operations:
     - CREATE
-    - UPDATE
     resources:
     - workloads
-    - workloads/status
   sideEffects: None
 ---
 apiVersion: admissionregistration.k8s.io/v1
diff --git a/config/components/webhook/manifests.yaml b/config/components/webhook/manifests.yaml
index 46378039a5..c7bed96edc 100644
--- a/config/components/webhook/manifests.yaml
+++ b/config/components/webhook/manifests.yaml
@@ -267,10 +267,8 @@ webhooks:
     - v1beta1
     operations:
     - CREATE
-    - UPDATE
     resources:
     - workloads
-    - workloads/status
   sideEffects: None
 ---
 apiVersion: admissionregistration.k8s.io/v1
diff --git a/pkg/controller/core/workload_controller.go b/pkg/controller/core/workload_controller.go
index 9c8aa9666f..6452ca8b5a 100644
--- a/pkg/controller/core/workload_controller.go
+++ b/pkg/controller/core/workload_controller.go
@@ -149,6 +149,12 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 	ctx = ctrl.LoggerInto(ctx, log)
 	log.V(2).Info("Reconciling Workload")
 
+	// If a deactivated workload is re-activated, we need to reset the RequeueState.
+	if wl.Status.RequeueState != nil && ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(&wl) {
+		wl.Status.RequeueState = nil
+		return ctrl.Result{}, workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
+	}
+
 	if len(wl.ObjectMeta.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() {
 		return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, &wl)
 	}
diff --git a/pkg/webhooks/workload_webhook.go b/pkg/webhooks/workload_webhook.go
index d9f2edc3ec..86ee9a0e84 100644
--- a/pkg/webhooks/workload_webhook.go
+++ b/pkg/webhooks/workload_webhook.go
@@ -50,7 +50,7 @@ func setupWebhookForWorkload(mgr ctrl.Manager) error {
 		Complete()
 }
 
-// +kubebuilder:webhook:path=/mutate-kueue-x-k8s-io-v1beta1-workload,mutating=true,failurePolicy=fail,sideEffects=None,groups=kueue.x-k8s.io,resources=workloads;workloads/status,verbs=create;update,versions=v1beta1,name=mworkload.kb.io,admissionReviewVersions=v1
+// +kubebuilder:webhook:path=/mutate-kueue-x-k8s-io-v1beta1-workload,mutating=true,failurePolicy=fail,sideEffects=None,groups=kueue.x-k8s.io,resources=workloads,verbs=create,versions=v1beta1,name=mworkload.kb.io,admissionReviewVersions=v1
 
 var _ webhook.CustomDefaulter = &WorkloadWebhook{}
 
@@ -76,10 +76,6 @@ func (w *WorkloadWebhook) Default(ctx context.Context, obj runtime.Object) error
 		}
 	}
 
-	// If a deactivated workload is re-activated, we need to reset the RequeueState.
-	if ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(wl) && workload.HasRequeueState(wl) {
-		wl.Status.RequeueState = nil
-	}
 	return nil
 }
 
diff --git a/pkg/webhooks/workload_webhook_test.go b/pkg/webhooks/workload_webhook_test.go
index 78051be403..8414a22286 100644
--- a/pkg/webhooks/workload_webhook_test.go
+++ b/pkg/webhooks/workload_webhook_test.go
@@ -78,22 +78,6 @@ func TestWorkloadWebhookDefault(t *testing.T) {
 				},
 			},
 		},
-		"re-activated workload with re-queue state is reset the re-queue state": {
-			wl: *testingutil.MakeWorkload(testWorkloadName, testWorkloadNamespace).
-				Condition(metav1.Condition{
-					Type:   kueue.WorkloadEvicted,
-					Status: metav1.ConditionTrue,
-					Reason: kueue.WorkloadEvictedByDeactivation,
-				}).RequeueState(ptr.To[int32](5), ptr.To(metav1.Now())).
-				Obj(),
-			wantWl: *testingutil.MakeWorkload(testWorkloadName, testWorkloadNamespace).
-				Condition(metav1.Condition{
-					Type:   kueue.WorkloadEvicted,
-					Status: metav1.ConditionTrue,
-					Reason: kueue.WorkloadEvictedByDeactivation,
-				}).
-				Obj(),
-		},
 	}
 	for name, tc := range cases {
 		t.Run(name, func(t *testing.T) {
diff --git a/test/integration/scheduler/podsready/scheduler_test.go b/test/integration/scheduler/podsready/scheduler_test.go
index e76ac9b6b9..523890354d 100644
--- a/test/integration/scheduler/podsready/scheduler_test.go
+++ b/test/integration/scheduler/podsready/scheduler_test.go
@@ -248,7 +248,7 @@ var _ = ginkgo.Describe("SchedulerWithWaitForPodsReady", func() {
 			// To avoid flakiness, we don't verify if the workload has a QuotaReserved=false with pending reason here.
 		})
 
-		ginkgo.It("Should re-admit a timed out workload and deactivate a workload exceeded the re-queue count limit", func() {
+		ginkgo.It("Should re-admit a timed out workload and deactivate a workload exceeded the re-queue count limit. After that re-activating a workload", func() {
 			ginkgo.By("create the 'prod' workload")
 			prodWl := testing.MakeWorkload("prod", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
 			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
@@ -276,6 +276,29 @@ var _ = ginkgo.Describe("SchedulerWithWaitForPodsReady", func() {
 				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl))
 				g.Expect(ptr.Deref(prodWl.Spec.Active, true)).Should(gomega.BeFalse())
 			}, util.Timeout, util.Interval).Should(gomega.Succeed())
+
+			ginkgo.By("verify the re-activated inactive 'prod' workload re-queue state is reset")
+			// TODO: Once we move a logic to issue the Eviction with InactiveWorkload reason, we need to remove the below updates.
+			// REF: https://github.com/kubernetes-sigs/kueue/issues/1841
+			gomega.Eventually(func(g gomega.Gomega) {
+				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
+				apimeta.SetStatusCondition(&prodWl.Status.Conditions, metav1.Condition{
+					Type:    kueue.WorkloadEvicted,
+					Status:  metav1.ConditionTrue,
+					Reason:  kueue.WorkloadEvictedByDeactivation,
+					Message: "evicted by Test",
+				})
+				g.Expect(k8sClient.Status().Update(ctx, prodWl)).Should(gomega.Succeed())
+			}, util.Timeout, util.Interval).Should(gomega.Succeed(), "Job reconciler should add an Evicted condition with InactiveWorkload to the Workload")
+			gomega.Eventually(func(g gomega.Gomega) {
+				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
+				prodWl.Spec.Active = ptr.To(true)
+				g.Expect(k8sClient.Update(ctx, prodWl)).Should(gomega.Succeed())
+			}, util.Timeout, util.Interval).Should(gomega.Succeed(), "Reactivate inactive Workload")
+			gomega.Eventually(func(g gomega.Gomega) {
+				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
+				g.Expect(prodWl.Status.RequeueState).Should(gomega.BeNil())
+			}, util.Timeout, util.Interval).Should(gomega.Succeed())
 		})
 
 		ginkgo.It("Should unblock admission of new workloads in other ClusterQueues once the admitted workload exceeds timeout", func() {
diff --git a/test/integration/webhook/workload_test.go b/test/integration/webhook/workload_test.go
index 8b97543b0f..bf49b6a365 100644
--- a/test/integration/webhook/workload_test.go
+++ b/test/integration/webhook/workload_test.go
@@ -82,37 +82,6 @@ var _ = ginkgo.Describe("Workload defaulting webhook", func() {
 			gomega.Expect(created.Spec.PodSets[0].Name).Should(gomega.Equal(kueue.DefaultPodSetName))
 		})
 
-		ginkgo.It("Should reset re-queue state", func() {
-			ginkgo.By("Creating a new inactive Workload")
-			workload := testing.MakeWorkload(workloadName, ns.Name).
-				Active(false).
-				Obj()
-			gomega.Expect(k8sClient.Create(ctx, workload)).Should(gomega.Succeed())
-			gomega.Eventually(func(g gomega.Gomega) {
-				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload))
-				workload.Status = kueue.WorkloadStatus{
-					Conditions: []metav1.Condition{{
-						Type:               kueue.WorkloadEvicted,
-						Reason:             kueue.WorkloadEvictedByDeactivation,
-						Status:             metav1.ConditionTrue,
-						LastTransitionTime: metav1.Now(),
-					}},
-					RequeueState: &kueue.RequeueState{
-						Count:     ptr.To[int32](10),
-						RequeueAt: ptr.To(metav1.Now()),
-					},
-				}
-				g.Expect(k8sClient.Status().Update(ctx, workload)).Should(gomega.Succeed())
-			}, util.Timeout, util.Interval).Should(gomega.Succeed())
-			ginkgo.By("Activate a Workload")
-			gomega.Eventually(func(g gomega.Gomega) {
-				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload))
-				workload.Spec.Active = ptr.To(true)
-				g.Expect(k8sClient.Update(ctx, workload)).Should(gomega.Succeed())
-				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload))
-				g.Expect(workload.Status.RequeueState).Should(gomega.BeNil(), "re-queue state should be reset")
-			}, util.Timeout, util.Interval).Should(gomega.Succeed())
-		})
 
 	})
 })