Skip to content

Commit 5d1f0da

Browse files
committed
feat: add aggressive kueue gating and enhanced verbose output
Replace 6 per-resource-type kueue label checks with a single DataIntegrityCheck that validates three cluster-wide invariants: 1. Every workload in a kueue-managed namespace has the queue-name label 2. Every workload with the queue-name label is in a kueue-managed namespace 3. Within each top-level CR's ownership tree, all resources agree on the queue-name label value Key design decisions: - Top-down traversal from monitored CRs (Notebook, InferenceService, LLMInferenceService, RayCluster, RayJob, PyTorchJob) through an ownership graph of intermediate types (Deployment, StatefulSet, ReplicaSet, DaemonSet, Job, CronJob, Pod) - Ownership graph built once per namespace and reused across all CRs - Single KueueConsistency condition (True/False) with aborting impact - First-violation-wins per CR to avoid redundant diagnostics - ImpactedObjects populated with TypeMeta for consistency with other checks - Pre-filter to only process kueue-relevant namespaces (union of kueue-managed namespaces and namespaces with kueue-labeled workloads) Introduces ImpactAborting as the highest severity level ("aborting"), representing conditions where an upgrade MUST NOT proceed. The kueue data-integrity check uses this impact to gate upgrades when label inconsistencies are detected. Fatal severity support: - New SeverityLevelFatal filter level for --severity flag - Fatal banner (box-drawn border) rendered above the summary table when any aborting conditions are detected, listing all fatal findings - Double exclamation mark (‼) status icon for fatal conditions - Bold "fatal" severity label (other severities remain non-bold) - FATAL verdict in output when aborting conditions are present - FilterBySeverity updated to support fatal-only filtering Refactors NotebookVerboseFormatter into a centrally defined EnhancedVerboseFormatter in pkg/lint/check/: - Handles mixed resource types by deriving per-object CRD FQN from TypeMeta, with annotation preference for single-kind results - Exports CRDFullyQualifiedName and DeriveCRDFQNFromTypeMeta helpers - Supports optional per-object context via AnnotationObjectContext, rendered as a sub-bullet beneath each impacted object reference - Notebook checks updated to embed check.EnhancedVerboseFormatter - Kueue DataIntegrityCheck embeds EnhancedVerboseFormatter for CRD-qualified verbose output across its 6 monitored resource types, with per-object context indicating which invariant was violated Adds resource type definitions for StatefulSet, ReplicaSet, DaemonSet, Job, and CronJob. Updates ToPartialObjectMetadata helper to preserve UID and OwnerReferences needed for ownership graph construction. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Andy Stoneberg <astonebe@redhat.com>
1 parent 2bed7c2 commit 5d1f0da

37 files changed

Lines changed: 2353 additions & 2412 deletions

pkg/lint/check/result/result.go

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ const (
1616
// (e.g., "notebooks.kubeflow.org"). Automatically set by SetImpactedObjects and
1717
// AddImpactedObjects from the ResourceType.
1818
AnnotationResourceCRDName = "resource.opendatahub.io/crd-name"
19+
20+
// AnnotationObjectContext is an optional per-object annotation key that provides
21+
// brief clarifying context about why a specific object is impacted. When set on
22+
// an ImpactedObject's ObjectMeta.Annotations, the EnhancedVerboseFormatter renders
23+
// it as a sub-bullet beneath the object reference.
24+
AnnotationObjectContext = "result.opendatahub.io/context"
1925
)
2026

2127
const (
@@ -33,8 +39,9 @@ const (
3339
// Impact represents the upgrade impact level of a diagnostic condition.
3440
type Impact string
3541

36-
// Impact levels for diagnostic conditions.
42+
// Impact levels for diagnostic conditions, ordered by severity (highest first).
3743
const (
44+
ImpactAborting Impact = "aborting" // Upgrade MUST NOT proceed
3845
ImpactBlocking Impact = "blocking" // Upgrade CANNOT proceed
3946
ImpactAdvisory Impact = "advisory" // Upgrade CAN proceed with warning
4047
ImpactNone Impact = "" // No impact (omitted from JSON/YAML)
@@ -71,16 +78,16 @@ func (c Condition) Validate() error {
7178
// False/Unknown status must have impact specified.
7279
if c.Impact == ImpactNone || c.Impact == "" {
7380
return fmt.Errorf(
74-
"condition with Status=%q must have Impact specified (Blocking or Advisory), got Impact=%q",
81+
"condition with Status=%q must have Impact specified (Aborting, Blocking, or Advisory), got Impact=%q",
7582
c.Status, c.Impact,
7683
)
7784
}
7885

7986
// Validate impact values.
80-
if c.Impact != ImpactBlocking && c.Impact != ImpactAdvisory {
87+
if c.Impact != ImpactAborting && c.Impact != ImpactBlocking && c.Impact != ImpactAdvisory {
8188
return fmt.Errorf(
82-
"invalid Impact=%q, must be %q or %q",
83-
c.Impact, ImpactBlocking, ImpactAdvisory,
89+
"invalid Impact=%q, must be %q, %q, or %q",
90+
c.Impact, ImpactAborting, ImpactBlocking, ImpactAdvisory,
8491
)
8592
}
8693

@@ -254,10 +261,14 @@ func (r *DiagnosticResult) GetImpact() Impact {
254261

255262
for _, cond := range r.Status.Conditions {
256263
switch cond.Impact {
264+
case ImpactAborting:
265+
return ImpactAborting
257266
case ImpactBlocking:
258-
return ImpactBlocking
267+
maxImpact = ImpactBlocking
259268
case ImpactAdvisory:
260-
maxImpact = ImpactAdvisory
269+
if maxImpact == ImpactNone {
270+
maxImpact = ImpactAdvisory
271+
}
261272
case ImpactNone:
262273
// No impact - continue checking other conditions
263274
}

pkg/lint/check/result/result_test.go

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,53 @@ func TestDiagnosticResult_Validate_EmptyConditionReason(t *testing.T) {
307307
g.Expect(err.Error()).To(ContainSubstring("has empty reason"))
308308
}
309309

310+
func TestConditionValidate_AbortingWithStatusFalse(t *testing.T) {
311+
g := NewWithT(t)
312+
313+
cond := result.Condition{
314+
Condition: metav1.Condition{
315+
Type: "Consistency",
316+
Status: metav1.ConditionFalse,
317+
Reason: "Inconsistent",
318+
},
319+
Impact: result.ImpactAborting,
320+
}
321+
322+
g.Expect(cond.Validate()).ToNot(HaveOccurred())
323+
}
324+
325+
func TestConditionValidate_AbortingWithStatusUnknown(t *testing.T) {
326+
g := NewWithT(t)
327+
328+
cond := result.Condition{
329+
Condition: metav1.Condition{
330+
Type: "Consistency",
331+
Status: metav1.ConditionUnknown,
332+
Reason: "CannotDetermine",
333+
},
334+
Impact: result.ImpactAborting,
335+
}
336+
337+
g.Expect(cond.Validate()).ToNot(HaveOccurred())
338+
}
339+
340+
func TestConditionValidate_AbortingWithStatusTrue_Invalid(t *testing.T) {
341+
g := NewWithT(t)
342+
343+
cond := result.Condition{
344+
Condition: metav1.Condition{
345+
Type: "Consistency",
346+
Status: metav1.ConditionTrue,
347+
Reason: "Consistent",
348+
},
349+
Impact: result.ImpactAborting,
350+
}
351+
352+
err := cond.Validate()
353+
g.Expect(err).To(HaveOccurred())
354+
g.Expect(err.Error()).To(ContainSubstring("Status=True must have Impact=None"))
355+
}
356+
310357
// T023: Multiple conditions array tests
311358

312359
func TestMultipleConditions_OrderedByExecutionSequence(t *testing.T) {
@@ -790,6 +837,103 @@ func TestSetCondition_MultipleConditionTypes(t *testing.T) {
790837
}))
791838
}
792839

840+
// GetImpact tests
841+
842+
func TestGetImpact_ReturnsAbortingAsHighest(t *testing.T) {
843+
g := NewWithT(t)
844+
845+
dr := result.New("workload", "kueue", "data-integrity", "test")
846+
dr.SetCondition(check.NewCondition(
847+
"Advisory",
848+
metav1.ConditionFalse,
849+
check.WithReason("Reason"),
850+
check.WithMessage("advisory finding"),
851+
check.WithImpact(result.ImpactAdvisory),
852+
))
853+
dr.SetCondition(check.NewCondition(
854+
"Blocking",
855+
metav1.ConditionFalse,
856+
check.WithReason("Reason"),
857+
check.WithMessage("blocking finding"),
858+
check.WithImpact(result.ImpactBlocking),
859+
))
860+
dr.SetCondition(check.NewCondition(
861+
"Aborting",
862+
metav1.ConditionFalse,
863+
check.WithReason("Reason"),
864+
check.WithMessage("aborting finding"),
865+
check.WithImpact(result.ImpactAborting),
866+
))
867+
868+
g.Expect(dr.GetImpact()).To(Equal(result.ImpactAborting))
869+
}
870+
871+
func TestGetImpact_AbortingShortCircuits(t *testing.T) {
872+
g := NewWithT(t)
873+
874+
// Aborting condition first — should return immediately without checking others.
875+
dr := result.New("workload", "kueue", "data-integrity", "test")
876+
dr.SetCondition(check.NewCondition(
877+
"Aborting",
878+
metav1.ConditionFalse,
879+
check.WithReason("Reason"),
880+
check.WithMessage("aborting finding"),
881+
check.WithImpact(result.ImpactAborting),
882+
))
883+
dr.SetCondition(check.NewCondition(
884+
"Passing",
885+
metav1.ConditionTrue,
886+
check.WithReason("Reason"),
887+
check.WithMessage("all good"),
888+
))
889+
890+
g.Expect(dr.GetImpact()).To(Equal(result.ImpactAborting))
891+
}
892+
893+
func TestGetImpact_BlockingWithoutAborting(t *testing.T) {
894+
g := NewWithT(t)
895+
896+
dr := result.New("component", "kserve", "removal", "test")
897+
dr.SetCondition(check.NewCondition(
898+
"Blocking",
899+
metav1.ConditionFalse,
900+
check.WithReason("Reason"),
901+
check.WithMessage("blocking finding"),
902+
check.WithImpact(result.ImpactBlocking),
903+
))
904+
dr.SetCondition(check.NewCondition(
905+
"Advisory",
906+
metav1.ConditionFalse,
907+
check.WithReason("Reason"),
908+
check.WithMessage("advisory finding"),
909+
check.WithImpact(result.ImpactAdvisory),
910+
))
911+
912+
g.Expect(dr.GetImpact()).To(Equal(result.ImpactBlocking))
913+
}
914+
915+
func TestGetImpact_NoneWhenAllPassing(t *testing.T) {
916+
g := NewWithT(t)
917+
918+
dr := result.New("component", "dashboard", "status", "test")
919+
dr.SetCondition(check.NewCondition(
920+
"Available",
921+
metav1.ConditionTrue,
922+
check.WithReason("Ready"),
923+
check.WithMessage("all good"),
924+
))
925+
926+
g.Expect(dr.GetImpact()).To(Equal(result.ImpactNone))
927+
}
928+
929+
func TestGetImpact_NoneWhenNoConditions(t *testing.T) {
930+
g := NewWithT(t)
931+
932+
dr := result.New("component", "test", "check", "test")
933+
934+
g.Expect(dr.GetImpact()).To(Equal(result.ImpactNone))
935+
}
936+
793937
// SetImpactedObjects and AddImpactedObjects tests
794938

795939
func TestSetImpactedObjects(t *testing.T) {

0 commit comments

Comments
 (0)