diff --git a/pkg/cache/scheduler_callback.go b/pkg/cache/scheduler_callback.go index 643b4c719..d00853bc6 100644 --- a/pkg/cache/scheduler_callback.go +++ b/pkg/cache/scheduler_callback.go @@ -93,12 +93,11 @@ func (callback *AsyncRMCallback) UpdateAllocation(response *si.AllocationRespons // update cache callback.context.ForgetPod(release.GetAllocationKey()) - // TerminationType 0 mean STOPPED_BY_RM - if release.TerminationType != si.TerminationType_STOPPED_BY_RM { - // send release app allocation to application states machine - ev := NewReleaseAppAllocationEvent(release.ApplicationID, release.TerminationType, release.AllocationKey) - dispatcher.Dispatch(ev) - } + // TerminationType 0 mean STOPPED_BY_RM, but we also need to do the release when task failed, + // we also should send release event to application in case task failed but the pod is still pending. + // send release app allocation to application states machine + ev := NewReleaseAppAllocationEvent(release.ApplicationID, release.TerminationType, release.AllocationKey) + dispatcher.Dispatch(ev) } return nil diff --git a/pkg/cache/scheduler_callback_test.go b/pkg/cache/scheduler_callback_test.go index d98f5f6f7..619b58f5e 100644 --- a/pkg/cache/scheduler_callback_test.go +++ b/pkg/cache/scheduler_callback_test.go @@ -215,7 +215,9 @@ func TestUpdateAllocation_AllocationReleased_StoppedByRM(t *testing.T) { assert.NilError(t, err, "error updating allocation") assert.Assert(t, !context.schedulerCache.IsAssumedPod(taskUID1)) err = utils.WaitForCondition(deleteCalled.Load, 10*time.Millisecond, 500*time.Millisecond) - assert.Error(t, err, "timeout waiting for condition") // pod is not expected to be deleted + // Pod should be deleted, because TerminationType_STOPPED_BY_RM will also be called when task fail. + // If we don't delete the pod, the pod will be stuck in pending state. + assert.NilError(t, err, "pod has not been deleted") } func TestUpdateApplication_Accepted(t *testing.T) {