Skip to content

Commit 8fe8de2

Browse files
committed
fix: mark taskresult complete when failed or error. Fixes #12993, Fixes #13533 (#13798)
Signed-off-by: isubasinghe <[email protected]>
1 parent 70c58a6 commit 8fe8de2

File tree

3 files changed

+6
-11
lines changed

3 files changed

+6
-11
lines changed

docs/environment-variables.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ most users. Environment variables may be removed at any time.
4545
| `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int` | `6` | The number of buckets to collect the metric for the operation duration. |
4646
| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. |
4747
| `RECENTLY_STARTED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently started. |
48-
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `10s` | The duration of a pod before the pod is considered to be recently deleted. |
48+
| `RECENTLY_DELETED_POD_DURATION` | `time.Duration` | `2m` | The duration of a pod before the pod is considered to be recently deleted. |
4949
| `RETRY_BACKOFF_DURATION` | `time.Duration` | `10ms` | The retry back-off duration when retrying API calls. |
5050
| `RETRY_BACKOFF_FACTOR` | `float` | `2.0` | The retry back-off factor when retrying API calls. |
5151
| `RETRY_BACKOFF_STEPS` | `int` | `5` | The retry back-off steps when retrying API calls. |

pkg/apis/workflow/v1alpha1/workflow_types.go

-5
Original file line numberDiff line numberDiff line change
@@ -2418,11 +2418,6 @@ func (n NodeStatus) IsExitNode() bool {
24182418
return strings.HasSuffix(n.DisplayName, ".onExit")
24192419
}
24202420

2421-
// IsPodDeleted returns whether node is error with pod deleted.
2422-
func (n NodeStatus) IsPodDeleted() bool {
2423-
return n.Phase == NodeError && n.Message == "pod deleted"
2424-
}
2425-
24262421
func (n NodeStatus) Succeeded() bool {
24272422
return n.Phase == NodeSucceeded
24282423
}

workflow/controller/taskresult.go

+5-5
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
5454
}
5555

5656
func recentlyDeleted(node *wfv1.NodeStatus) bool {
57-
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second)
57+
return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
5858
}
5959

6060
func (woc *wfOperationCtx) taskResultReconciliation() {
@@ -83,19 +83,19 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
8383
if err != nil {
8484
continue
8585
}
86+
8687
// Mark task result as completed if it has no chance to be completed.
87-
if label == "false" && old.IsPodDeleted() {
88+
if label == "false" && old.Completed() && !woc.nodePodExist(*old) {
8889
if recentlyDeleted(old) {
8990
woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
9091
// If the pod was deleted, then it is possible that the controller never gets another informer message about it.
9192
// In this case, the workflow will only be requeued after the resync period (20m). This means
9293
// the workflow will not update for 20m. Requeuing here prevents that from happening.
9394
woc.requeue()
9495
continue
95-
} else {
96-
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
97-
woc.wf.Status.MarkTaskResultComplete(nodeID)
9896
}
97+
woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
98+
woc.wf.Status.MarkTaskResultComplete(nodeID)
9999
}
100100
newNode := old.DeepCopy()
101101
if result.Outputs.HasOutputs() {

0 commit comments

Comments
 (0)