Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YUNIKORN-2884] Task fail with post allocated but the pod will keep p… #916

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions pkg/cache/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"github.com/looplab/fsm"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/apache/yunikorn-k8shim/pkg/common"
Expand Down Expand Up @@ -636,9 +637,19 @@

if task, ok := app.taskMap[taskID]; ok {
task.setTaskTerminationType(terminationType)
err := task.DeleteTaskPod()
if err != nil {
log.Log(log.ShimCacheApplication).Error("failed to release allocation from application", zap.Error(err))
pod := task.GetTaskPod()
// Get the pod first to check if it exists
// For example, users manually killed the pod, so the pod already deleted before release.
_, err := task.context.apiProvider.GetAPIs().KubeClient.Get(pod.Namespace, pod.Name)
if err != nil && k8serrors.IsNotFound(err) {
// Pod does not exist, no need to delete
log.Log(log.ShimCacheApplication).Info("pod not found, it already deleted or released")

Check warning on line 646 in pkg/cache/application.go

View check run for this annotation

Codecov / codecov/patch

pkg/cache/application.go#L646

Added line #L646 was not covered by tests
} else {
// Pod exists, delete it
err = task.DeleteTaskPod()
if err != nil {
log.Log(log.ShimCacheApplication).Error("failed to release allocation from application", zap.Error(err))

Check warning on line 651 in pkg/cache/application.go

View check run for this annotation

Codecov / codecov/patch

pkg/cache/application.go#L651

Added line #L651 was not covered by tests
}
}
app.publishPlaceholderTimeoutEvents(task)
} else {
Expand Down
11 changes: 5 additions & 6 deletions pkg/cache/scheduler_callback.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,11 @@ func (callback *AsyncRMCallback) UpdateAllocation(response *si.AllocationRespons
// update cache
callback.context.ForgetPod(release.GetAllocationKey())

// TerminationType 0 mean STOPPED_BY_RM
if release.TerminationType != si.TerminationType_STOPPED_BY_RM {
// send release app allocation to application states machine
ev := NewReleaseAppAllocationEvent(release.ApplicationID, release.TerminationType, release.AllocationKey)
dispatcher.Dispatch(ev)
}
// TerminationType 0 mean STOPPED_BY_RM, but we also need to do the release when task failed,
// we also should send release event to application in case task failed but the pod is still pending.
// send release app allocation to application states machine
ev := NewReleaseAppAllocationEvent(release.ApplicationID, release.TerminationType, release.AllocationKey)
dispatcher.Dispatch(ev)
}

return nil
Expand Down
4 changes: 3 additions & 1 deletion pkg/cache/scheduler_callback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ func TestUpdateAllocation_AllocationReleased_StoppedByRM(t *testing.T) {
assert.NilError(t, err, "error updating allocation")
assert.Assert(t, !context.schedulerCache.IsAssumedPod(taskUID1))
err = utils.WaitForCondition(deleteCalled.Load, 10*time.Millisecond, 500*time.Millisecond)
assert.Error(t, err, "timeout waiting for condition") // pod is not expected to be deleted
// Pod should be deleted, because TerminationType_STOPPED_BY_RM will also be called when task fail.
// If we don't delete the pod, the pod will be stuck in pending state.
assert.NilError(t, err, "pod has not been deleted")
}

func TestUpdateApplication_Accepted(t *testing.T) {
Expand Down
Loading