fix: mark taskresult complete when failed or error. Fixes #12993, Fixes #13533 (#13798)

isubasinghe · isubasinghe · commit 8fe8de2e16ec · 2024-10-30T21:24:59.000+11:00
Signed-off-by: isubasinghe &lt;isitha@pipekit.io&gt;
diff --git a/docs/environment-variables.md b/docs/environment-variables.md
@@ -45,7 +45,7 @@ most users. Environment variables may be removed at any time.
 | `OPERATION_DURATION_METRIC_BUCKET_COUNT` | `int`               | `6`                                                                                         | The number of buckets to collect the metric for the operation duration.                                                                                                                                                                                                  |
 | `POD_NAMES`                              | `string`            | `v2`                                                                                        | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server.                                                                                                                                                |
 | `RECENTLY_STARTED_POD_DURATION`          | `time.Duration`     | `10s`                                                                                       | The duration of a pod before the pod is considered to be recently started.                                                                                                                                                                                               |
-| `RECENTLY_DELETED_POD_DURATION`          | `time.Duration`     | `10s`                                                                                       | The duration of a pod before the pod is considered to be recently deleted.                                                                                                                                                                                               |
+| `RECENTLY_DELETED_POD_DURATION`          | `time.Duration`     | `2m`                                                                                       | The duration of a pod before the pod is considered to be recently deleted.                                                                                                                                                                                               |
 | `RETRY_BACKOFF_DURATION`                 | `time.Duration`     | `10ms`                                                                                      | The retry back-off duration when retrying API calls.                                                                                                                                                                                                                     |
 | `RETRY_BACKOFF_FACTOR`                   | `float`             | `2.0`                                                                                       | The retry back-off factor when retrying API calls.                                                                                                                                                                                                                       |
 | `RETRY_BACKOFF_STEPS`                    | `int`               | `5`                                                                                         | The retry back-off steps when retrying API calls.                                                                                                                                                                                                                        |
diff --git a/pkg/apis/workflow/v1alpha1/workflow_types.go b/pkg/apis/workflow/v1alpha1/workflow_types.go
@@ -2418,11 +2418,6 @@ func (n NodeStatus) IsExitNode() bool {
 	return strings.HasSuffix(n.DisplayName, ".onExit")
 }
 
-// IsPodDeleted returns whether node is error with pod deleted.
-func (n NodeStatus) IsPodDeleted() bool {
-	return n.Phase == NodeError && n.Message == "pod deleted"
-}
-
 func (n NodeStatus) Succeeded() bool {
 	return n.Phase == NodeSucceeded
 }
diff --git a/workflow/controller/taskresult.go b/workflow/controller/taskresult.go
@@ -54,7 +54,7 @@ func (wfc *WorkflowController) newWorkflowTaskResultInformer() cache.SharedIndex
 }
 
 func recentlyDeleted(node *wfv1.NodeStatus) bool {
-	return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 10*time.Second)
+	return time.Since(node.FinishedAt.Time) <= envutil.LookupEnvDurationOr("RECENTLY_DELETED_POD_DURATION", 2*time.Minute)
 }
 
 func (woc *wfOperationCtx) taskResultReconciliation() {
@@ -83,19 +83,19 @@ func (woc *wfOperationCtx) taskResultReconciliation() {
 		if err != nil {
 			continue
 		}
+
 		// Mark task result as completed if it has no chance to be completed.
-		if label == "false" && old.IsPodDeleted() {
+		if label == "false" && old.Completed() && !woc.nodePodExist(*old) {
 			if recentlyDeleted(old) {
 				woc.log.WithField("nodeID", nodeID).Debug("Wait for marking task result as completed because pod is recently deleted.")
 				// If the pod was deleted, then it is possible that the controller never get another informer message about it.
 				// In this case, the workflow will only be requeued after the resync period (20m). This means
 				// workflow will not update for 20m. Requeuing here prevents that happening.
 				woc.requeue()
 				continue
-			} else {
-				woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
-				woc.wf.Status.MarkTaskResultComplete(nodeID)
 			}
+			woc.log.WithField("nodeID", nodeID).Info("Marking task result as completed because pod has been deleted for a while.")
+			woc.wf.Status.MarkTaskResultComplete(nodeID)
 		}
 		newNode := old.DeepCopy()
 		if result.Outputs.HasOutputs() {

Original file line number	Diff line number	Diff line change
`@@ -2418,11 +2418,6 @@ func (n NodeStatus) IsExitNode() bool {`
`2418`	`2418`	`return strings.HasSuffix(n.DisplayName, ".onExit")`
`2419`	`2419`	`}`
`2420`	`2420`
`2421`		`-// IsPodDeleted returns whether node is error with pod deleted.`
`2422`		`-func (n NodeStatus) IsPodDeleted() bool {`
`2423`		`- return n.Phase == NodeError && n.Message == "pod deleted"`
`2424`		`-}`
`2425`		`-`
`2426`	`2421`	`func (n NodeStatus) Succeeded() bool {`
`2427`	`2422`	`return n.Phase == NodeSucceeded`
`2428`	`2423`	`}`