Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ aicr/
- **Location**: `pkg/validator/`
- **Purpose**: Multi-phase validation of cluster configuration against recipe requirements
- **Phases**:
- **Readiness**: Validates infrastructure prerequisites (K8s version, OS, kernel) and runs readiness checks
- **Readiness**: Evaluates constraints inline against snapshot (K8s version, OS, kernel) — no checks or Jobs
- **Deployment**: Validates component deployment health and expected resources
- **Performance**: Validates system performance and network fabric health
- **Conformance**: Validates workload-specific requirements and conformance
Expand Down
4 changes: 1 addition & 3 deletions Dockerfile.validator
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ RUN set -e; \
./cmd/aicr

# Pre-compile test binaries for in-cluster validation Jobs
RUN CGO_ENABLED=0 go test -c -o /out/readiness.test ./pkg/validator/checks/readiness && \
CGO_ENABLED=0 go test -c -o /out/deployment.test ./pkg/validator/checks/deployment && \
RUN CGO_ENABLED=0 go test -c -o /out/deployment.test ./pkg/validator/checks/deployment && \
CGO_ENABLED=0 go test -c -o /out/conformance.test ./pkg/validator/checks/conformance

# Build test2json tool — converts verbose test output to JSON event stream.
Expand All @@ -69,7 +68,6 @@ LABEL org.opencontainers.image.title="aicr-validator" \

# Copy compiled binaries
COPY --from=builder /out/aicr /usr/local/bin/aicr
COPY --from=builder /out/readiness.test /usr/local/bin/readiness.test
COPY --from=builder /out/deployment.test /usr/local/bin/deployment.test
COPY --from=builder /out/conformance.test /usr/local/bin/conformance.test
COPY --from=builder /out/test2json /usr/local/bin/test2json
Expand Down
1 change: 0 additions & 1 deletion cmd/aicr/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
// Each package's init() function registers its validators.
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
_ "github.com/NVIDIA/aicr/pkg/validator/checks/deployment"
_ "github.com/NVIDIA/aicr/pkg/validator/checks/readiness"
)

func main() {
Expand Down
3 changes: 1 addition & 2 deletions docs/integrator/recipe-development.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,7 @@ componentRefs:
namespace: gpu-operator

validation:
readiness:
checks: [gpu-hardware-detection, kernel-parameters]
# Readiness phase has no checks — constraints are evaluated inline from snapshot.
deployment:
checks: [operator-health, expected-resources]
performance:
Expand Down
16 changes: 1 addition & 15 deletions docs/user/cli-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ Validation can be run in different phases to validate different aspects of the d

| Phase | Description | When to Run |
|-------|-------------|-------------|
| `readiness` | Validates infrastructure prerequisites (K8s version, OS, kernel) and runs readiness checks | Before deploying any components |
| `readiness` | Evaluates constraints inline against snapshot (K8s version, OS, kernel) — no checks or Jobs | Before deploying any components |
| `deployment` | Validates component deployment health and expected resources | After deploying components |
| `performance` | Validates system performance and network fabric health | After components are running |
| `conformance` | Validates workload-specific requirements and conformance | Before running production workloads |
Expand Down Expand Up @@ -570,13 +570,6 @@ phases:
expected: ubuntu
actual: ubuntu
status: passed
checks:
- name: gpu-hardware-detection
status: pass
- name: kernel-parameters
status: pass
- name: os-prerequisites
status: pass
duration: 20.5µs
```

Expand Down Expand Up @@ -608,13 +601,6 @@ phases:
expected: ubuntu
actual: ubuntu
status: passed
checks:
- name: gpu-hardware-detection
status: pass
- name: kernel-parameters
status: pass
- name: os-prerequisites
status: pass
duration: 20.7µs
deployment:
status: pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,9 @@ constraints:

# Multi-phase validation configuration
validation:
# Readiness phase: Validate infrastructure before installing components
readiness:
checks:
- gpu-hardware-detection
- kernel-parameters
- os-prerequisites
# Readiness phase: Constraint-only gate (evaluated inline from snapshot, no Jobs)
# Constraints are defined at the top level (spec.constraints) and evaluated
# during readiness. No checks or Jobs needed.

# Deployment phase: Validate component deployment
deployment:
Expand Down
8 changes: 5 additions & 3 deletions pkg/recipe/builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,9 +278,11 @@ func TestGetEmbeddedFS(t *testing.T) {

// TestConstraintWarning tests the ConstraintWarning struct.
func TestConstraintWarning(t *testing.T) {
const k8sVersionConstraint = "K8s.server.version"

warning := ConstraintWarning{
Overlay: "h100-eks-ubuntu-training-kubeflow",
Constraint: "K8s.server.version",
Constraint: k8sVersionConstraint,
Expected: ">= 1.32.4",
Actual: "1.30.0",
Reason: "expected >= 1.32.4, got 1.30.0",
Expand All @@ -289,8 +291,8 @@ func TestConstraintWarning(t *testing.T) {
if warning.Overlay != "h100-eks-ubuntu-training-kubeflow" {
t.Errorf("expected overlay h100-eks-ubuntu-training-kubeflow, got %q", warning.Overlay)
}
if warning.Constraint != "K8s.server.version" {
t.Errorf("expected constraint K8s.server.version, got %q", warning.Constraint)
if warning.Constraint != k8sVersionConstraint {
t.Errorf("expected constraint %s, got %q", k8sVersionConstraint, warning.Constraint)
}
if warning.Expected != ">= 1.32.4" {
t.Errorf("expected expression >= 1.32.4, got %q", warning.Expected)
Expand Down
6 changes: 3 additions & 3 deletions pkg/recipe/metadata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ func TestMergeValidationConfig(t *testing.T) {
base := RecipeMetadataSpec{
Validation: &ValidationConfig{
Readiness: &ValidationPhase{
Checks: []string{"gpu-hardware-detection"},
Constraints: []Constraint{{Name: "K8s.server.version", Value: ">= 1.30"}},
},
Deployment: &ValidationPhase{
Timeout: "5m",
Expand Down Expand Up @@ -517,8 +517,8 @@ func TestMergeValidationConfig(t *testing.T) {
if base.Validation.Readiness == nil {
t.Fatal("readiness should be preserved from base")
}
if base.Validation.Readiness.Checks[0] != "gpu-hardware-detection" {
t.Error("readiness checks should be preserved from base")
if base.Validation.Readiness.Constraints[0].Name != "K8s.server.version" {
t.Error("readiness constraints should be preserved from base")
}
if base.Validation.Deployment.Timeout != "10m" {
t.Errorf("deployment timeout = %q, want 10m (from overlay)", base.Validation.Deployment.Timeout)
Expand Down
38 changes: 20 additions & 18 deletions pkg/validator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ fmt.Printf("Status: %s, Passed: %d, Failed: %d\n",

| Phase | Execution | Data Source | Purpose |
|-------|-----------|-------------|---------|
| **Readiness** | Constraints inline, Checks in Jobs | Snapshot only | Validate prerequisites before deployment |
| **Readiness** | Constraints evaluated inline (no cluster access) | Snapshot only | Validate prerequisites before deployment |
| **Deployment** | All in Jobs | Snapshot + Live cluster | Verify deployed resources |
| **Performance** | All in Jobs | Snapshot + Live cluster | Measure system performance |
| **Conformance** | All in Jobs | Snapshot + Live cluster | Validate API conformance |
Expand All @@ -48,7 +48,7 @@ Recipe Definition
┌─────────────────────────────────────────────────────┐
│ Readiness Phase │
│ • Constraints: Evaluated inline (snapshot) │
│ • Checks: Run in Jobs (GPU detection, kernel, OS)
│ • No checks — constraint-only gate
└─────────────────────────────────────────────────────┘
↓ (if passed)
┌─────────────────────────────────────────────────────┐
Expand Down Expand Up @@ -141,22 +141,30 @@ Checks execute via Go's standard test framework:

```go
// Check function (registered in init())
func CheckGPUHardwareDetection(ctx *checks.ValidationContext) error {
// Access snapshot data and K8s API
for _, m := range ctx.Snapshot.Measurements {
if m.Type == measurement.TypeGPU { /* validate */ }
func CheckOperatorHealth(ctx *checks.ValidationContext) error {
// Access live cluster via K8s API
pods, err := ctx.Clientset.CoreV1().Pods("gpu-operator").List(
ctx.Context,
metav1.ListOptions{LabelSelector: "app=gpu-operator"},
)
if err != nil {
return fmt.Errorf("failed to list pods: %w", err)
}
// Verify at least one pod is running
for _, pod := range pods.Items {
if pod.Status.Phase == "Running" { return nil }
}
return nil
return fmt.Errorf("no GPU operator pods running")
}

// Test wrapper (enables Job execution)
func TestGPUHardwareDetection(t *testing.T) {
func TestOperatorHealth(t *testing.T) {
runner, err := checks.NewTestRunner(t) // Loads context from Job env
if err != nil {
t.Skipf("Skipping (not in Kubernetes): %v", err)
return
}
runner.RunCheck("gpu-hardware-detection") // Executes check
runner.RunCheck("operator-health") // Executes check
}
```

Expand All @@ -180,7 +188,7 @@ Each validation run is assigned a unique **RunID** for resource isolation and re
All resources created during a validation run include the RunID:
- Input ConfigMaps: `aicr-snapshot-{runID}`, `aicr-recipe-{runID}` (shared by all phases)
- Output ConfigMap: `aicr-validation-result-{runID}` (progressively updated)
- Jobs: `aicr-{runID}-readiness`, `aicr-{runID}-deployment`, etc. (one per phase)
- Jobs: `aicr-{runID}-deployment`, `aicr-{runID}-performance`, etc. (one per Job-based phase; readiness is evaluated inline and has no Job)

**Benefits:**
- **Concurrent Validations**: Multiple validation runs can execute simultaneously without conflicts
Expand Down Expand Up @@ -345,7 +353,7 @@ componentRefs:
namespace: gpu-operator

validation:
# Phase 1: Readiness (pre-deployment validation)
# Phase 1: Readiness (pre-deployment validation, constraints only, no cluster access)
readiness:
constraints:
- name: GPU.count
Expand All @@ -354,10 +362,6 @@ validation:
value: "== ubuntu"
- name: Kernel.version
value: ">= 5.15.0"
checks:
- gpu-hardware-detection
- kernel-parameters
- os-prerequisites

# Phase 2: Deployment (verify deployed resources)
deployment:
Expand Down Expand Up @@ -551,7 +555,7 @@ Deployment, performance, and conformance constraints need **live cluster access*
- Measure network bandwidth
- Check API conformance

Only readiness constraints can evaluate inline because they only need snapshot data.
Readiness is constraint-only and evaluates inline because it only needs snapshot data (no cluster access, no Jobs).

### Why ConfigMaps for Results?

Expand Down Expand Up @@ -612,8 +616,6 @@ validation:
constraints:
- name: GPU.count
value: ">= 8"
checks:
- gpu-hardware-detection

deployment:
constraints:
Expand Down
12 changes: 6 additions & 6 deletions pkg/validator/agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ The validator reads Job logs and updates the unified ValidationResult ConfigMap.
The validation Job needs permissions to:
- Read/write ConfigMaps (for inputs and results)
- Read pods, services, deployments (for deployment phase checks)
- Read nodes (for readiness phase checks)
- Read nodes (for deployment phase checks)

## Usage Example

Expand All @@ -59,13 +59,13 @@ clientset, err := k8sclient.GetKubeClient()
// Configure validation agent
config := agent.Config{
Namespace: "aicr-validation",
JobName: "aicr-validation-readiness",
JobName: "aicr-validation-deployment",
Image: "ghcr.io/nvidia/aicr-validator:latest", // Validator image with Go toolchain
ServiceAccountName: "aicr-validator",
SnapshotConfigMap: "aicr-snapshot",
RecipeConfigMap: "aicr-recipe",
TestPackage: "./pkg/validator/checks/readiness",
TestPattern: "TestGpuHardwareDetection",
TestPackage: "./pkg/validator/checks/deployment",
TestPattern: "TestOperatorHealth",
Timeout: 5 * time.Minute,
Cleanup: true,
}
Expand Down Expand Up @@ -123,9 +123,9 @@ The validator package uses this agent to run checks:

```go
// In pkg/validator/phases.go
func (v *Validator) validateReadiness(ctx context.Context, ...) {
func (v *Validator) validateDeployment(ctx context.Context, ...) {
// For each check in recipe
for _, checkName := range recipe.Validation.Readiness.Checks {
for _, checkName := range recipe.Validation.Deployment.Checks {
// Deploy Job for this check
deployer := agent.NewDeployer(...)
deployer.Deploy(ctx)
Expand Down
4 changes: 2 additions & 2 deletions pkg/validator/agent/deployer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ func createConfig() Config {
Image: "ghcr.io/nvidia/aicr-validator:latest",
SnapshotConfigMap: "test-snapshot",
RecipeConfigMap: "test-recipe",
TestPackage: "./pkg/validator/checks/readiness",
TestPattern: "TestGpuHardwareDetection",
TestPackage: "./pkg/validator/checks/deployment",
TestPattern: "TestOperatorHealth",
Timeout: 5 * time.Minute,
ImagePullSecrets: []string{"regcred"},
NodeSelector: map[string]string{
Expand Down
2 changes: 1 addition & 1 deletion pkg/validator/agent/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ func (d *Deployer) buildJobSpec() *batchv1.Job {
}

// testBinaryName returns the file name of the pre-compiled test binary for a
// TestPackage path: the final path element with ".test" appended.
// Example: "./pkg/validator/checks/deployment" → "deployment.test"
func testBinaryName(testPackage string) string {
	pkgDir := filepath.Base(testPackage)
	return pkgDir + ".test"
}
Expand Down
14 changes: 7 additions & 7 deletions pkg/validator/agent/job_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,12 @@ func TestBuildTestCommand(t *testing.T) {
}{
{
name: "basic test command",
testPackage: "./pkg/validator/checks/readiness",
testPattern: "TestGpuHardwareDetection",
testPackage: "./pkg/validator/checks/deployment",
testPattern: "TestOperatorHealth",
wantContain: []string{
"readiness.test",
"deployment.test",
"-test.v",
"-test.run 'TestGpuHardwareDetection'",
"-test.run 'TestOperatorHealth'",
"| test2json |",
"tee /tmp/test-output.json",
"--- BEGIN TEST OUTPUT ---",
Expand All @@ -327,10 +327,10 @@ func TestBuildTestCommand(t *testing.T) {
},
{
name: "no pattern",
testPackage: "./pkg/validator/checks/readiness",
testPackage: "./pkg/validator/checks/deployment",
testPattern: "",
wantContain: []string{
"readiness.test -test.v",
"deployment.test -test.v",
"| test2json |",
},
},
Expand Down Expand Up @@ -358,7 +358,7 @@ func TestTestBinaryName(t *testing.T) {
testPackage string
want string
}{
{"./pkg/validator/checks/readiness", "readiness.test"},
{"./pkg/validator/checks/deployment", "deployment.test"},
{"./pkg/validator/checks/deployment", "deployment.test"},
{"./pkg/validator/checks/performance", "performance.test"},
{"./pkg/validator/checks/conformance", "conformance.test"},
Expand Down
Loading
Loading