Skip to content

Commit 0463e2d

Browse files
authored
feat: add expected-resources deployment check for validating Kubernetes resources exist (#149)
1 parent ed4973b commit 0463e2d

File tree

15 files changed

+1147
-71
lines changed

15 files changed

+1147
-71
lines changed

docs/integrator/recipe-development.md

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,15 +254,23 @@ constraints:
254254
Optional multi-phase validation beyond basic constraints:
255255

256256
```yaml
257+
# expectedResources are declared on componentRefs, not under validation
258+
componentRefs:
259+
- name: gpu-operator
260+
type: Helm
261+
expectedResources:
262+
- kind: Deployment
263+
name: gpu-operator
264+
namespace: gpu-operator
265+
- kind: DaemonSet
266+
name: nvidia-driver-daemonset
267+
namespace: gpu-operator
268+
257269
validation:
258270
readiness:
259271
checks: [gpu-hardware-detection, kernel-parameters]
260272
deployment:
261-
checks: [operator-health]
262-
expectedResources:
263-
- apiVersion: v1
264-
kind: Pod
265-
namespace: gpu-operator
273+
checks: [operator-health, expected-resources]
266274
performance:
267275
infrastructure: nccl-doctor
268276
checks: [nccl-bandwidth-test]

examples/recipes/eks-gb200-ubuntu-training-with-validation.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,16 @@ componentRefs:
9797
source: https://helm.ngc.nvidia.com/nvidia
9898
version: v25.3.3
9999
valuesFile: components/gpu-operator/values-eks-training.yaml
100+
expectedResources:
101+
- kind: Deployment
102+
name: gpu-operator
103+
namespace: gpu-operator
104+
- kind: DaemonSet
105+
name: nvidia-driver-daemonset
106+
namespace: gpu-operator
107+
- kind: DaemonSet
108+
name: nvidia-device-plugin-daemonset
109+
namespace: gpu-operator
100110
overrides:
101111
cdi:
102112
default: false

pkg/defaults/timeouts.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,27 @@ const (
112112
CLISnapshotTimeout = 5 * time.Minute
113113
)
114114

115+
// Default timeouts for the validation phases.
116+
// These are used when the recipe does not specify a timeout.
117+
const (
118+
// ValidateReadinessTimeout is the default timeout for readiness validation.
119+
ValidateReadinessTimeout = 5 * time.Minute
120+
121+
// ValidateDeploymentTimeout is the default timeout for deployment validation.
122+
ValidateDeploymentTimeout = 10 * time.Minute
123+
124+
// ValidatePerformanceTimeout is the default timeout for performance validation.
125+
// Performance tests may take longer due to GPU benchmarks.
126+
ValidatePerformanceTimeout = 30 * time.Minute
127+
128+
// ValidateConformanceTimeout is the default timeout for conformance validation.
129+
ValidateConformanceTimeout = 15 * time.Minute
130+
131+
// ResourceVerificationTimeout is the timeout for verifying that a single
132+
// expected resource exists and is healthy during deployment validation.
133+
ResourceVerificationTimeout = 10 * time.Second
134+
)
135+
115136
// Pod operation timeouts for validation and agent operations.
116137
const (
117138
// PodWaitTimeout is the maximum time to wait for pod operations to complete.

pkg/defaults/timeouts_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@ func TestTimeoutConstants(t *testing.T) {
5050
// HTTP client timeouts
5151
{"HTTPClientTimeout", HTTPClientTimeout, 10 * time.Second, 60 * time.Second},
5252
{"HTTPConnectTimeout", HTTPConnectTimeout, 1 * time.Second, 15 * time.Second},
53+
54+
// Validation phase timeouts
55+
{"ValidateReadinessTimeout", ValidateReadinessTimeout, 1 * time.Minute, 10 * time.Minute},
56+
{"ValidateDeploymentTimeout", ValidateDeploymentTimeout, 5 * time.Minute, 30 * time.Minute},
57+
{"ValidatePerformanceTimeout", ValidatePerformanceTimeout, 10 * time.Minute, 60 * time.Minute},
58+
{"ValidateConformanceTimeout", ValidateConformanceTimeout, 5 * time.Minute, 30 * time.Minute},
59+
{"ResourceVerificationTimeout", ResourceVerificationTimeout, 5 * time.Second, 30 * time.Second},
5360
}
5461

5562
for _, tt := range tests {
@@ -101,6 +108,19 @@ func TestHTTPClientTimeoutRelationships(t *testing.T) {
101108
}
102109
}
103110

111+
func TestValidationPhaseTimeoutRelationships(t *testing.T) {
112+
// Readiness should be the shortest phase
113+
if ValidateReadinessTimeout > ValidateDeploymentTimeout {
114+
t.Errorf("ValidateReadinessTimeout (%v) should not exceed ValidateDeploymentTimeout (%v)",
115+
ValidateReadinessTimeout, ValidateDeploymentTimeout)
116+
}
117+
// Resource verification should be much shorter than phase timeout
118+
if ResourceVerificationTimeout >= ValidateDeploymentTimeout {
119+
t.Errorf("ResourceVerificationTimeout (%v) should be less than ValidateDeploymentTimeout (%v)",
120+
ResourceVerificationTimeout, ValidateDeploymentTimeout)
121+
}
122+
}
123+
104124
func TestCollectorTimeoutLessThanK8s(t *testing.T) {
105125
// Individual collector timeout should be less than K8s collector timeout
106126
// since K8s operations may involve multiple API calls

pkg/recipe/metadata_store.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ func (s *MetadataStore) initBaseMergedSpec() (RecipeMetadataSpec, []string) {
241241
mergedSpec := RecipeMetadataSpec{
242242
Constraints: make([]Constraint, len(s.Base.Spec.Constraints)),
243243
ComponentRefs: make([]ComponentRef, len(s.Base.Spec.ComponentRefs)),
244+
Validation: s.Base.Spec.Validation,
244245
}
245246
copy(mergedSpec.Constraints, s.Base.Spec.Constraints)
246247
copy(mergedSpec.ComponentRefs, s.Base.Spec.ComponentRefs)

pkg/validator/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,15 @@ validation:
313313
314314
**Checks** - Named validation tests:
315315
```yaml
316+
# expected-resources check requires expectedResources on componentRefs
317+
componentRefs:
318+
- name: gpu-operator
319+
type: Helm
320+
expectedResources:
321+
- kind: Deployment
322+
name: gpu-operator
323+
namespace: gpu-operator
324+
316325
validation:
317326
deployment:
318327
checks:
@@ -323,6 +332,18 @@ validation:
323332
### Multi-Phase Recipe Example
324333
325334
```yaml
335+
# expectedResources are declared on componentRefs (used by expected-resources check)
336+
componentRefs:
337+
- name: gpu-operator
338+
type: Helm
339+
expectedResources:
340+
- kind: Deployment
341+
name: gpu-operator
342+
namespace: gpu-operator
343+
- kind: DaemonSet
344+
name: nvidia-driver-daemonset
345+
namespace: gpu-operator
346+
326347
validation:
327348
# Phase 1: Readiness (pre-deployment validation)
328349
readiness:

pkg/validator/checks/README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,18 @@ make generate-validator ARGS="--constraint Deployment.my-app.version --phase dep
213213
### Example Recipe Usage
214214

215215
```yaml
216+
# expectedResources are declared on componentRefs (used by expected-resources check)
217+
componentRefs:
218+
- name: gpu-operator
219+
type: Helm
220+
expectedResources:
221+
- kind: Deployment
222+
name: gpu-operator
223+
namespace: gpu-operator
224+
- kind: DaemonSet
225+
name: nvidia-driver-daemonset
226+
namespace: gpu-operator
227+
216228
validation:
217229
deployment:
218230
constraints:
@@ -224,7 +236,7 @@ validation:
224236
checks:
225237
# These also run inside the Job
226238
- operator-health
227-
- expected-resources
239+
- expected-resources # validates componentRefs[].expectedResources
228240
```
229241
230242
## Registration Pattern
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package deployment
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"strings"
21+
22+
"github.com/NVIDIA/eidos/pkg/defaults"
23+
"github.com/NVIDIA/eidos/pkg/errors"
24+
"github.com/NVIDIA/eidos/pkg/recipe"
25+
"github.com/NVIDIA/eidos/pkg/validator/checks"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/client-go/kubernetes"
28+
)
29+
30+
func init() {
31+
// Register this check
32+
checks.RegisterCheck(&checks.Check{
33+
Name: "expected-resources",
34+
Description: "Verify expected Kubernetes resources exist and are healthy after component deployment",
35+
Phase: "deployment",
36+
Func: validateExpectedResources,
37+
TestName: "TestCheckExpectedResources",
38+
})
39+
}
40+
41+
// validateExpectedResources verifies that all expected Kubernetes resources declared
42+
// in the recipe's componentRefs exist and are healthy in the live cluster.
43+
func validateExpectedResources(ctx *checks.ValidationContext) error {
44+
if ctx.Clientset == nil {
45+
return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
46+
}
47+
if ctx.Recipe == nil {
48+
return errors.New(errors.ErrCodeInvalidRequest, "recipe is not available")
49+
}
50+
51+
var failures []string
52+
53+
for _, ref := range ctx.Recipe.ComponentRefs {
54+
for _, er := range ref.ExpectedResources {
55+
if err := verifyResource(ctx.Context, ctx.Clientset, er); err != nil {
56+
failures = append(failures, fmt.Sprintf("%s %s/%s (%s): %s",
57+
er.Kind, er.Namespace, er.Name, ref.Name, err.Error()))
58+
}
59+
}
60+
}
61+
62+
if len(failures) > 0 {
63+
return errors.New(errors.ErrCodeNotFound,
64+
fmt.Sprintf("expected resource check failed:\n %s", strings.Join(failures, "\n ")))
65+
}
66+
return nil
67+
}
68+
69+
// verifyResource checks that a single expected resource exists and is healthy.
70+
func verifyResource(ctx context.Context, clientset kubernetes.Interface, er recipe.ExpectedResource) error {
71+
ctx, cancel := context.WithTimeout(ctx, defaults.ResourceVerificationTimeout)
72+
defer cancel()
73+
74+
switch er.Kind {
75+
case "Deployment":
76+
deploy, err := clientset.AppsV1().Deployments(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
77+
if err != nil {
78+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
79+
}
80+
expected := int32(1)
81+
if deploy.Spec.Replicas != nil {
82+
expected = *deploy.Spec.Replicas
83+
}
84+
if deploy.Status.AvailableReplicas < expected {
85+
return errors.New(errors.ErrCodeInternal,
86+
fmt.Sprintf("not healthy: %d/%d replicas available",
87+
deploy.Status.AvailableReplicas, expected))
88+
}
89+
90+
case "DaemonSet":
91+
ds, err := clientset.AppsV1().DaemonSets(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
92+
if err != nil {
93+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
94+
}
95+
if ds.Status.NumberReady < ds.Status.DesiredNumberScheduled {
96+
return errors.New(errors.ErrCodeInternal,
97+
fmt.Sprintf("not healthy: %d/%d pods ready",
98+
ds.Status.NumberReady, ds.Status.DesiredNumberScheduled))
99+
}
100+
101+
case "StatefulSet":
102+
ss, err := clientset.AppsV1().StatefulSets(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
103+
if err != nil {
104+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
105+
}
106+
expected := int32(1)
107+
if ss.Spec.Replicas != nil {
108+
expected = *ss.Spec.Replicas
109+
}
110+
if ss.Status.ReadyReplicas < expected {
111+
return errors.New(errors.ErrCodeInternal,
112+
fmt.Sprintf("not healthy: %d/%d replicas ready",
113+
ss.Status.ReadyReplicas, expected))
114+
}
115+
116+
case "Service":
117+
_, err := clientset.CoreV1().Services(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
118+
if err != nil {
119+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
120+
}
121+
122+
case "ConfigMap":
123+
_, err := clientset.CoreV1().ConfigMaps(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
124+
if err != nil {
125+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
126+
}
127+
128+
case "Secret":
129+
_, err := clientset.CoreV1().Secrets(er.Namespace).Get(ctx, er.Name, metav1.GetOptions{})
130+
if err != nil {
131+
return errors.Wrap(errors.ErrCodeNotFound, "not found", err)
132+
}
133+
134+
default:
135+
return errors.New(errors.ErrCodeInvalidRequest,
136+
fmt.Sprintf("unsupported resource kind %q", er.Kind))
137+
}
138+
139+
return nil
140+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package deployment
16+
17+
import (
18+
"testing"
19+
20+
"github.com/NVIDIA/eidos/pkg/validator/checks"
21+
)
22+
23+
// TestCheckExpectedResources is the integration test for expected-resources.
24+
// This runs inside validator Jobs and invokes the validator.
25+
func TestCheckExpectedResources(t *testing.T) {
26+
if testing.Short() {
27+
t.Skip("Skipping integration test in short mode")
28+
}
29+
30+
// Load Job environment
31+
runner, err := checks.NewTestRunner(t)
32+
if err != nil {
33+
t.Skipf("Not in Job environment: %v", err)
34+
}
35+
defer runner.Cancel()
36+
37+
// Check if this check is enabled in recipe
38+
if !runner.HasCheck("deployment", "expected-resources") {
39+
t.Skip("Check expected-resources not enabled in recipe")
40+
}
41+
42+
t.Logf("Running check: expected-resources")
43+
44+
// Run the validator
45+
ctx := runner.Context()
46+
err = validateExpectedResources(ctx)
47+
48+
if err != nil {
49+
t.Errorf("Check failed: %v", err)
50+
} else {
51+
t.Logf("✓ Check passed: expected-resources")
52+
}
53+
}

0 commit comments

Comments
 (0)