Skip to content

Commit b975bcc

Browse files
committed
feat(validator): add Chainsaw-style health check assertions via --data flag
1 parent 1e3474d commit b975bcc

File tree

14 files changed

+1183
-1
lines changed

14 files changed

+1183
-1
lines changed

Dockerfile.validator

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ RUN CGO_ENABLED=0 go test -c -o /out/deployment.test ./pkg/validator/checks/depl
5757
# test2json (e.g. readiness.test -test.v | test2json).
5858
RUN CGO_ENABLED=0 go build -o /out/test2json cmd/test2json
5959

60+
# Install Chainsaw CLI for component health check assertions.
61+
ARG CHAINSAW_VERSION=v0.2.12
62+
RUN set -e; \
63+
ARCH=$(go env GOARCH); \
64+
curl -fsSL "https://github.com/kyverno/chainsaw/releases/download/${CHAINSAW_VERSION}/chainsaw_linux_${ARCH}.tar.gz" \
65+
| tar xz -C /out chainsaw
66+
6067
# ---------------------------------------------------------------------------
6168
# Final stage: CUDA runtime provides nvidia-smi for GPU validation
6269
# ---------------------------------------------------------------------------
@@ -71,6 +78,7 @@ COPY --from=builder /out/aicr /usr/local/bin/aicr
7178
COPY --from=builder /out/deployment.test /usr/local/bin/deployment.test
7279
COPY --from=builder /out/conformance.test /usr/local/bin/conformance.test
7380
COPY --from=builder /out/test2json /usr/local/bin/test2json
81+
COPY --from=builder /out/chainsaw /usr/local/bin/chainsaw
7482

7583
# Copy testdata needed by deployment tests at runtime (loaded via os.ReadFile
7684
# relative to CWD, e.g. testdata/nvidia-smi-verify-pod.yaml)

pkg/cli/validate.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ func validateCmdFlags() []cli.Flag {
366366
Name: "result",
367367
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Requires --phase conformance and --evidence-dir.",
368368
},
369+
dataFlag,
369370
outputFlag,
370371
formatFlag,
371372
kubeconfigFlag,
@@ -433,7 +434,12 @@ Use a saved result file for evidence instead of the live run:
433434
Action: func(ctx context.Context, cmd *cli.Command) error {
434435
// Validate single-value flags are not duplicated
435436
// Note: --phase allows multiple values so it's not included here
436-
if err := validateSingleValueFlags(cmd, "recipe", "snapshot", "output", "format", "namespace", "validation-namespace", "image", "job-name", "service-account-name", "timeout", "resume", "result"); err != nil {
437+
if err := validateSingleValueFlags(cmd, "recipe", "snapshot", "output", "format", "namespace", "validation-namespace", "image", "job-name", "service-account-name", "timeout", "resume", "result", "data"); err != nil {
438+
return err
439+
}
440+
441+
// Initialize external data provider if --data flag is set
442+
if err := initDataProvider(cmd); err != nil {
437443
return err
438444
}
439445

pkg/defaults/timeouts.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ const (
137137
ComponentRenderTimeout = 60 * time.Second
138138
)
139139

140+
// Chainsaw assertion timeouts for component health checks.
141+
const (
142+
// ChainsawAssertTimeout is the timeout for Chainsaw CLI assertions
143+
// when evaluating component health check assert files.
// NOTE(review): presumably supplied as the timeout argument of the Chainsaw
// runner; the call site is not visible here — confirm.
144+
ChainsawAssertTimeout = 2 * time.Minute
145+
)
146+
140147
// Conformance test timeouts for DRA and gang scheduling validation.
141148
const (
142149
// DRATestPodTimeout is the timeout for the DRA test pod to complete.

pkg/recipe/components.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,17 @@ type ComponentConfig struct {
6060

6161
// Validations defines component-specific validation checks.
6262
Validations []ComponentValidationConfig `yaml:"validations,omitempty"`
63+
64+
// HealthCheck defines custom health check configuration for this component.
65+
HealthCheck HealthCheckConfig `yaml:"healthCheck,omitempty"`
66+
}
67+
68+
// HealthCheckConfig defines custom health check settings for a component.
// It is decoded from the "healthCheck" key of a component's registry entry.
69+
type HealthCheckConfig struct {
70+
// AssertFile is the path to a Chainsaw-style assert YAML file (relative to data directory).
71+
// When set, the expected-resources check uses Chainsaw CLI to evaluate assertions
72+
// instead of the default auto-discovery + typed replica checks.
// The key is "assertFile" in YAML and is omitted when the value is empty.
73+
AssertFile string `yaml:"assertFile,omitempty"`
6374
}
6475

6576
// HelmConfig contains default Helm chart settings for a component.
@@ -161,6 +172,15 @@ func GetComponentRegistry() (*ComponentRegistry, error) {
161172
return globalRegistry, globalRegistryErr
162173
}
163174

175+
// ResetComponentRegistryForTesting resets the singleton registry so it will be
176+
// reloaded from the current DataProvider on the next call to GetComponentRegistry.
177+
// This must only be called from tests.
//
// NOTE(review): the three package-level variables are reassigned without any
// visible locking, so this is presumably unsafe to call concurrently with
// GetComponentRegistry — confirm before using it from parallel tests.
178+
func ResetComponentRegistryForTesting() {
179+
// Drop the cached registry and the cached load error.
globalRegistry = nil
180+
globalRegistryErr = nil
181+
// Replace the Once with a fresh zero value so the load runs again.
globalRegistryOnce = sync.Once{}
182+
}
183+
164184
// MustGetComponentRegistry returns the global component registry or panics.
165185
// Use this in init() functions where the registry must be available.
166186
func MustGetComponentRegistry() *ComponentRegistry {

pkg/recipe/metadata.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ type ComponentRef struct {
111111
// ExpectedResources lists Kubernetes resources that should exist after deployment.
112112
// Used by deployment phase validation to verify component health.
113113
ExpectedResources []ExpectedResource `json:"expectedResources,omitempty" yaml:"expectedResources,omitempty"`
114+
115+
// HealthCheckAsserts contains raw Chainsaw-style assert YAML loaded from the
116+
// registry's healthCheck.assertFile via the DataProvider. When non-empty, the
117+
// expected-resources check runs Chainsaw CLI to evaluate assertions instead of
118+
// the default auto-discovery + typed replica checks.
119+
HealthCheckAsserts string `json:"healthCheckAsserts,omitempty" yaml:"healthCheckAsserts,omitempty"`
114120
}
115121

116122
// ExpectedResource represents a Kubernetes resource that should exist after deployment.
@@ -479,6 +485,11 @@ func mergeComponentRef(base, overlay ComponentRef) ComponentRef {
479485
result.ExpectedResources = overlay.ExpectedResources
480486
}
481487

488+
// HealthCheckAsserts: overlay takes precedence if set
489+
if overlay.HealthCheckAsserts != "" {
490+
result.HealthCheckAsserts = overlay.HealthCheckAsserts
491+
}
492+
482493
return result
483494
}
484495

pkg/validator/chainsaw/runner.go

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package chainsaw executes Chainsaw-style assertions against a live Kubernetes cluster.
16+
package chainsaw
17+
18+
import (
19+
"bytes"
20+
"context"
21+
"fmt"
22+
"log/slog"
23+
"os"
24+
"os/exec"
25+
"path/filepath"
26+
"sync"
27+
"text/template"
28+
"time"
29+
30+
"github.com/NVIDIA/aicr/pkg/errors"
31+
)
32+
33+
// ComponentAssert holds the data needed to run Chainsaw for one component.
// Run accepts a slice of these and produces one Result per entry.
34+
type ComponentAssert struct {
35+
// Name is the component name (e.g., "gpu-operator").
// It is used as the generated Chainsaw test's metadata.name, as the name of
// the per-run test directory, and as Result.Component.
36+
Name string
37+
38+
// AssertYAML is the raw Chainsaw assert file content.
// It is written unchanged to assert.yaml inside the test directory.
39+
AssertYAML string
40+
}
41+
42+
// Result holds the outcome of a Chainsaw assertion run for one component.
43+
type Result struct {
44+
// Component is the component name.
// It is copied from ComponentAssert.Name.
45+
Component string
46+
47+
// Passed indicates whether the assertion passed.
// It is set to true only when the chainsaw command returns without error.
48+
Passed bool
49+
50+
// Output contains Chainsaw stdout/stderr for diagnostics.
// Both streams are captured into a single buffer; the field stays empty
// when the run fails before chainsaw is invoked.
51+
Output string
52+
53+
// Error contains any error from executing Chainsaw.
// It also carries failures from preparing the temporary test files.
54+
Error error
55+
}
56+
57+
// chainsawTestTemplate is the Chainsaw test manifest template.
// It is parsed once at package initialization; template.Must panics if the
// template text is invalid. The template is rendered with a chainsawTestData
// value and describes a single step that asserts against the sibling file
// assert.yaml using the configured assert timeout.
// NOTE(review): .Name is inserted into the YAML without quoting, so this
// assumes component names are plain YAML scalars — confirm upstream validation.
58+
var chainsawTestTemplate = template.Must(template.New("chainsaw-test").Parse(`apiVersion: chainsaw.kyverno.io/v1alpha1
59+
kind: Test
60+
metadata:
61+
name: {{ .Name }}
62+
spec:
63+
timeouts:
64+
assert: {{ .Timeout }}
65+
steps:
66+
- try:
67+
- assert:
68+
file: assert.yaml
69+
`))
70+
71+
// chainsawTestData holds template parameters for generating chainsaw-test.yaml.
72+
type chainsawTestData struct {
73+
// Name is rendered as the test's metadata.name.
Name string
74+
// Timeout is rendered as spec.timeouts.assert; generateTestManifest
// formats it as a whole number of seconds followed by "s".
Timeout string
75+
}
76+
77+
// Run executes Chainsaw assertions for a set of components.
78+
// Creates a temp directory structure and runs `chainsaw test` per component.
79+
// Components are run concurrently with bounded parallelism.
80+
func Run(ctx context.Context, asserts []ComponentAssert, timeout time.Duration) []Result {
81+
if len(asserts) == 0 {
82+
return nil
83+
}
84+
85+
results := make([]Result, len(asserts))
86+
87+
var wg sync.WaitGroup
88+
// Limit concurrency to 4 parallel Chainsaw runs.
89+
sem := make(chan struct{}, 4)
90+
91+
for i, ca := range asserts {
92+
wg.Add(1)
93+
go func() {
94+
defer wg.Done()
95+
sem <- struct{}{}
96+
defer func() { <-sem }()
97+
98+
results[i] = runSingle(ctx, ca, timeout)
99+
}()
100+
}
101+
102+
wg.Wait()
103+
return results
104+
}
105+
106+
// runSingle executes Chainsaw for a single component.
107+
func runSingle(ctx context.Context, ca ComponentAssert, timeout time.Duration) Result {
108+
result := Result{Component: ca.Name}
109+
110+
// Create temp directory for this component's test files.
111+
baseDir, err := os.MkdirTemp("", "chainsaw-run-*")
112+
if err != nil {
113+
result.Error = errors.Wrap(errors.ErrCodeInternal, "failed to create temp directory", err)
114+
return result
115+
}
116+
defer os.RemoveAll(baseDir)
117+
118+
testDir := filepath.Join(baseDir, ca.Name)
119+
if err := os.MkdirAll(testDir, 0o750); err != nil {
120+
result.Error = errors.Wrap(errors.ErrCodeInternal, "failed to create test directory", err)
121+
return result
122+
}
123+
124+
// Write assert.yaml.
125+
assertPath := filepath.Join(testDir, "assert.yaml")
126+
if err := os.WriteFile(assertPath, []byte(ca.AssertYAML), 0o600); err != nil {
127+
result.Error = errors.Wrap(errors.ErrCodeInternal, "failed to write assert.yaml", err)
128+
return result
129+
}
130+
131+
// Generate chainsaw-test.yaml.
132+
testYAMLPath := filepath.Join(testDir, "chainsaw-test.yaml")
133+
if err := generateTestManifest(testYAMLPath, ca.Name, timeout); err != nil {
134+
result.Error = err
135+
return result
136+
}
137+
138+
// Execute chainsaw test.
139+
output, execErr := execChainsaw(ctx, testDir)
140+
result.Output = output
141+
142+
if execErr != nil {
143+
result.Passed = false
144+
result.Error = execErr
145+
slog.Warn("chainsaw health check failed",
146+
"component", ca.Name,
147+
"error", execErr)
148+
} else {
149+
result.Passed = true
150+
slog.Info("chainsaw health check passed", "component", ca.Name)
151+
}
152+
153+
return result
154+
}
155+
156+
// generateTestManifest writes a chainsaw-test.yaml file for the given component.
157+
func generateTestManifest(path, componentName string, timeout time.Duration) error {
158+
data := chainsawTestData{
159+
Name: componentName,
160+
Timeout: fmt.Sprintf("%ds", int(timeout.Seconds())),
161+
}
162+
163+
var buf bytes.Buffer
164+
if err := chainsawTestTemplate.Execute(&buf, data); err != nil {
165+
return errors.Wrap(errors.ErrCodeInternal, "failed to render chainsaw test template", err)
166+
}
167+
168+
if err := os.WriteFile(path, buf.Bytes(), 0o600); err != nil {
169+
return errors.Wrap(errors.ErrCodeInternal, "failed to write chainsaw-test.yaml", err)
170+
}
171+
172+
return nil
173+
}
174+
175+
// execChainsaw runs `chainsaw test --test-dir <dir> --no-color` and returns
176+
// combined stdout+stderr output. Returns nil error on exit code 0, otherwise
177+
// wraps the exec error.
178+
func execChainsaw(ctx context.Context, testDir string) (string, error) {
179+
cmd := exec.CommandContext(ctx, "chainsaw", "test", "--test-dir", testDir, "--no-color")
180+
181+
var combined bytes.Buffer
182+
cmd.Stdout = &combined
183+
cmd.Stderr = &combined
184+
185+
slog.Debug("executing chainsaw", "dir", testDir, "cmd", cmd.String())
186+
187+
err := cmd.Run()
188+
output := combined.String()
189+
190+
if err != nil {
191+
return output, errors.Wrap(errors.ErrCodeInternal,
192+
fmt.Sprintf("chainsaw test failed (exit code: %v)", cmd.ProcessState.ExitCode()), err)
193+
}
194+
195+
return output, nil
196+
}

0 commit comments

Comments
 (0)