@@ -26,8 +26,11 @@ import (
2626 "fmt"
2727 "github.com/microsoft/frameworkcontroller/pkg/common"
2828 core "k8s.io/api/core/v1"
29+ apiErrors "k8s.io/apimachinery/pkg/api/errors"
30+ "k8s.io/apimachinery/pkg/util/net"
2931 "reflect"
3032 "regexp"
33+ "strings"
3134 "time"
3235)
3336
@@ -63,17 +66,19 @@ const (
6366
6467 // [-999, -1]: Predefined Framework Error
6568 // -1XX: Transient Error
66- CompletionCodeConfigMapExternalDeleted CompletionCode = - 100
67- CompletionCodePodExternalDeleted CompletionCode = - 101
68- CompletionCodeConfigMapCreationTimeout CompletionCode = - 110
69- CompletionCodePodCreationTimeout CompletionCode = - 111
69+ CompletionCodeConfigMapExternalDeleted CompletionCode = - 100
70+ CompletionCodePodExternalDeleted CompletionCode = - 101
71+ CompletionCodeConfigMapLocalCacheCreationTimeout CompletionCode = - 110
72+ CompletionCodePodLocalCacheCreationTimeout CompletionCode = - 111
73+ CompletionCodePodCreationTransientError CompletionCode = - 120
7074 // -2XX: Permanent Error
71- CompletionCodePodSpecPermanentError CompletionCode = - 200
75+ CompletionCodePodCreationPermanentError CompletionCode = - 200
7276 CompletionCodeStopFrameworkRequested CompletionCode = - 210
7377 CompletionCodeFrameworkAttemptCompletion CompletionCode = - 220
7478 CompletionCodeDeleteTaskRequested CompletionCode = - 230
7579 // -3XX: Unknown Error
7680 CompletionCodePodFailedWithoutFailedContainer CompletionCode = - 300
81+ CompletionCodePodCreationUnknownError CompletionCode = - 310
7782)
7883
// completionCodeInfoList is the package-level list of *CompletionCodeInfo that
// MatchCompletionCodeInfos iterates over in order. It is declared as an empty,
// non-nil slice here.
// NOTE(review): presumably populated during initialization by
// initCompletionCodeInfos - confirm against the rest of the file.
7984var completionCodeInfoList = []* CompletionCodeInfo {}
@@ -152,20 +157,28 @@ func initCompletionCodeInfos() {
152157 []CompletionTypeAttribute {CompletionTypeAttributeTransient }},
153158 },
154159 {
155- Code : CompletionCodeConfigMapCreationTimeout .Ptr (),
156- Phrase : "ConfigMapCreationTimeout " ,
160+ Code : CompletionCodeConfigMapLocalCacheCreationTimeout .Ptr (),
161+ Phrase : "ConfigMapLocalCacheCreationTimeout " ,
157162 Type : CompletionType {CompletionTypeNameFailed ,
158163 []CompletionTypeAttribute {CompletionTypeAttributeTransient }},
159164 },
160165 {
161- Code : CompletionCodePodCreationTimeout .Ptr (),
162- Phrase : "PodCreationTimeout " ,
166+ Code : CompletionCodePodLocalCacheCreationTimeout .Ptr (),
167+ Phrase : "PodLocalCacheCreationTimeout " ,
163168 Type : CompletionType {CompletionTypeNameFailed ,
164169 []CompletionTypeAttribute {CompletionTypeAttributeTransient }},
165170 },
166171 {
167- Code : CompletionCodePodSpecPermanentError .Ptr (),
168- Phrase : "PodSpecPermanentError" ,
172+ // Only used to distinguish this case from others; it will never be used to
173+ // complete a TaskAttempt.
174+ Code : CompletionCodePodCreationTransientError .Ptr (),
175+ Phrase : "PodCreationTransientError" ,
176+ Type : CompletionType {CompletionTypeNameFailed ,
177+ []CompletionTypeAttribute {CompletionTypeAttributeTransient }},
178+ },
179+ {
180+ Code : CompletionCodePodCreationPermanentError .Ptr (),
181+ Phrase : "PodCreationPermanentError" ,
169182 Type : CompletionType {CompletionTypeNameFailed ,
170183 []CompletionTypeAttribute {CompletionTypeAttributePermanent }},
171184 },
@@ -193,6 +206,12 @@ func initCompletionCodeInfos() {
193206 Type : CompletionType {CompletionTypeNameFailed ,
194207 []CompletionTypeAttribute {}},
195208 },
209+ {
210+ Code : CompletionCodePodCreationUnknownError .Ptr (),
211+ Phrase : "PodCreationUnknownError" ,
212+ Type : CompletionType {CompletionTypeNameFailed ,
213+ []CompletionTypeAttribute {}},
214+ },
196215 })
197216}
198217
@@ -238,6 +257,9 @@ type MatchedContainer struct {
238257}
239258
240259// Match ANY CompletionCodeInfo
260+ // The returned CompletionCode may not be within CompletionCodeInfos, such as
261+ // for ContainerUnrecognizedFailed, so it should not be passed to
262+ // NewTaskAttemptCompletionStatus or NewFrameworkAttemptCompletionStatus later.
241263func MatchCompletionCodeInfos (pod * core.Pod ) PodMatchResult {
242264 for _ , codeInfo := range completionCodeInfoList {
243265 for _ , podPattern := range codeInfo .PodPatterns {
@@ -404,6 +426,55 @@ func generatePodUnmatchedResult(pod *core.Pod) PodMatchResult {
404426 }
405427}
406428
429+ // The returned CompletionCode must be within CompletionCodeInfos.
430+ func ClassifyPodCreationError (apiErr error ) PodMatchResult {
431+ diag := fmt .Sprintf ("Failed to create Pod: %v" , common .ToJson (apiErr ))
432+
433+ // Treat Platform Error as Transient Error, such as Pod decoding error.
434+ if strings .Contains (apiErr .Error (), "object provided is unrecognized" ) ||
435+ strings .Contains (apiErr .Error (), "exceeded quota" ) {
436+ return PodMatchResult {
437+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationTransientError ],
438+ Diagnostics : diag ,
439+ }
440+ }
441+
442+ // Treat General Framework Error as Unknown Error for safety.
443+ if apiErrors .IsBadRequest (apiErr ) ||
444+ apiErrors .IsForbidden (apiErr ) {
445+ return PodMatchResult {
446+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationUnknownError ],
447+ Diagnostics : diag ,
448+ }
449+ }
450+
451+ // Treat Permanent Framework Error as Permanent Error only if it must be
452+ // Permanent Error.
453+ if apiErrors .IsInvalid (apiErr ) ||
454+ apiErrors .IsRequestEntityTooLargeError (apiErr ) {
455+ // TODO: Also check net.IsConnectionRefused
456+ if net .IsConnectionReset (apiErr ) || net .IsProbableEOF (apiErr ) {
457+ // The ApiServer Permanent Error may be caused by Network Transient Error,
458+ // so treat it as Unknown Error for safety.
459+ return PodMatchResult {
460+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationUnknownError ],
461+ Diagnostics : diag ,
462+ }
463+ } else {
464+ return PodMatchResult {
465+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationPermanentError ],
466+ Diagnostics : diag ,
467+ }
468+ }
469+ }
470+
471+ // Treat all other errors as Transient Error, including all non-APIStatus errors.
472+ return PodMatchResult {
473+ CodeInfo : completionCodeInfoMap [CompletionCodePodCreationTransientError ],
474+ Diagnostics : diag ,
475+ }
476+ }
477+
407478///////////////////////////////////////////////////////////////////////////////////////
408479// Completion Utils
409480///////////////////////////////////////////////////////////////////////////////////////
0 commit comments