Skip to content

Commit 5f98022

Browse files
authored
Treat kubelet NodeAffinity status.reason as retryable system error (#6461)
Signed-off-by: Mike Hotan <mike@union.ai>
1 parent a75cea0 commit 5f98022

2 files changed

Lines changed: 32 additions & 4 deletions

File tree

flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,15 @@ const primaryContainerTemplateName = "primary"
3838
const primaryInitContainerTemplateName = "primary-init"
3939
const PrimaryContainerKey = "primary_container_name"
4040

41-
// nodePreemptionStatusReasons are the status reasons that a pod's respective node
42-
// is preempted by the scheduler
43-
var nodePreemptionStatusReasons = sets.NewString("Shutdown", "Terminated", "NodeShutdown")
41+
var retryableStatusReasons = sets.NewString(
42+
// Reasons that indicate the node was preempted aggressively.
43+
// Kubelet can miss deleting the pod prior to the node being shut down.
44+
"Shutdown",
45+
"Terminated",
46+
"NodeShutdown",
47+
// kubelet admission rejects the pod before the node gets assigned appropriate labels.
48+
"NodeAffinity",
49+
)
4450

4551
// AddRequiredNodeSelectorRequirements adds the provided v1.NodeSelectorRequirement
4652
// objects to an existing v1.Affinity object. If there are no existing required
@@ -1207,7 +1213,7 @@ func DemystifyFailure(ctx context.Context, status v1.PodStatus, info pluginsCore
12071213

12081214
var isSystemError bool
12091215
// In some versions of GKE the reason can also be "Terminated" or "NodeShutdown"
1210-
if nodePreemptionStatusReasons.Has(code) {
1216+
if retryableStatusReasons.Has(code) {
12111217
isSystemError = true
12121218
}
12131219

flyteplugins/go/tasks/pluginmachinery/flytek8s/pod_helper_test.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1721,6 +1721,28 @@ func TestDemystifyFailure(t *testing.T) {
17211721
})
17221722
}
17231723
})
1724+
1725+
t.Run("Kubelet admission denies pod due to missing node label", func(t *testing.T) {
1726+
for _, reason := range []string{
1727+
"NodeAffinity",
1728+
} {
1729+
t.Run(reason, func(t *testing.T) {
1730+
message := "Pod was rejected: Predicate NodeAffinity failed: node(s) didn't match Pod's node affinity/selector"
1731+
phaseInfo, err := DemystifyFailure(ctx, v1.PodStatus{
1732+
Message: message,
1733+
Reason: reason,
1734+
Phase: v1.PodFailed,
1735+
// Can't always rely on GCP returning container statuses when node is preempted
1736+
ContainerStatuses: []v1.ContainerStatus{},
1737+
}, pluginsCore.TaskInfo{}, "")
1738+
assert.Nil(t, err)
1739+
assert.Equal(t, pluginsCore.PhaseRetryableFailure, phaseInfo.Phase())
1740+
assert.Equal(t, "Interrupted", phaseInfo.Err().GetCode())
1741+
assert.Equal(t, core.ExecutionError_SYSTEM, phaseInfo.Err().GetKind())
1742+
assert.Equal(t, message, phaseInfo.Err().GetMessage())
1743+
})
1744+
}
1745+
})
17241746
}
17251747

17261748
func TestDemystifyPending_testcases(t *testing.T) {

0 commit comments

Comments
 (0)