Skip to content

Commit 2c05fe1

Browse files
authored
Merge branch 'main' into ci/workflow-improvements
2 parents 3194f31 + 361120e commit 2c05fe1

32 files changed

+1301
-121
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ jobs:
109109
--namespace gpu-operator \
110110
--kubeconfig="${HOME}/.kube/config" \
111111
--require-gpu \
112-
--image=ko.local:smoke-test
112+
--image=ko.local:smoke-test \
113+
--output=validation-result.yaml \
114+
--evidence-dir=conformance-evidence
113115
114116
- name: Install chainsaw
115117
run: |
@@ -228,6 +230,16 @@ jobs:
228230
--kubeconfig="${HOME}/.kube/config" \
229231
--debug
230232
233+
- name: Upload conformance evidence
234+
if: always()
235+
uses: actions/upload-artifact@v4
236+
with:
237+
name: conformance-evidence
238+
path: |
239+
conformance-evidence/
240+
validation-result.yaml
241+
if-no-files-found: warn
242+
231243
- name: Debug diagnostics
232244
if: failure()
233245
run: |

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,9 @@ jobs:
122122
--namespace gpu-operator \
123123
--kubeconfig="${HOME}/.kube/config" \
124124
--require-gpu \
125-
--image=ko.local:smoke-test
125+
--image=ko.local:smoke-test \
126+
--output=validation-result.yaml \
127+
--evidence-dir=conformance-evidence
126128
127129
# --- Evidence collection ---
128130

@@ -140,6 +142,16 @@ jobs:
140142
--kubeconfig="${HOME}/.kube/config" \
141143
--debug
142144
145+
- name: Upload conformance evidence
146+
if: always()
147+
uses: actions/upload-artifact@v4
148+
with:
149+
name: conformance-evidence
150+
path: |
151+
conformance-evidence/
152+
validation-result.yaml
153+
if-no-files-found: warn
154+
143155
# --- Debug diagnostics (before cleanup so resources still exist) ---
144156

145157
- name: Debug diagnostics

docs/conformance/cncf/README.md

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,26 @@ docs/conformance/cncf/
3535

3636
## Usage
3737

38+
Evidence is generated from `aicr validate` conformance results when `--evidence-dir` is set:
39+
3840
```bash
39-
# Collect all evidence
40-
./docs/conformance/cncf/collect-evidence.sh all
41-
42-
# Collect evidence for a single feature
43-
./docs/conformance/cncf/collect-evidence.sh dra
44-
./docs/conformance/cncf/collect-evidence.sh gang
45-
./docs/conformance/cncf/collect-evidence.sh secure
46-
./docs/conformance/cncf/collect-evidence.sh metrics
47-
./docs/conformance/cncf/collect-evidence.sh gateway
48-
./docs/conformance/cncf/collect-evidence.sh operator
41+
# Generate evidence during validation
42+
aicr validate -r recipe.yaml -s snapshot.yaml \
43+
--phase conformance --evidence-dir ./evidence
44+
45+
# Render evidence from a saved result file instead of the live run's result (validation still runs)
46+
aicr validate -r recipe.yaml -s snapshot.yaml \
47+
--phase conformance --evidence-dir ./evidence \
48+
--result validation-result.yaml
4949
```
5050

51+
The chainsaw assertion evidence (`go run ./tests/chainsaw/ai-conformance/`) checks
52+
resource existence (CRDs, deployments, etc.) and is complementary to the behavioral
53+
validation evidence generated by `aicr validate --evidence-dir`.
54+
55+
> **Note:** `collect-evidence.sh` is deprecated and now exits with an error. Use `aicr validate --evidence-dir`
56+
> instead.
57+
5158
## Evidence
5259

5360
See [evidence/index.md](evidence/index.md) for a summary of all collected evidence and results.
@@ -63,7 +70,5 @@ See [evidence/index.md](evidence/index.md) for a summary of all collected eviden
6370
| 5 | Inference API Gateway | `ai_inference` | [evidence/inference-gateway.md](evidence/inference-gateway.md) |
6471
| 6 | Robust AI Operator | `robust_controller` | [evidence/robust-operator.md](evidence/robust-operator.md) |
6572

66-
## TODO
67-
68-
- [ ] **Cluster Autoscaling** (`cluster_autoscaling`, MUST) — Demonstrate Karpenter or cluster autoscaler scaling GPU node groups based on pending pod requests
69-
- [ ] **Pod Autoscaling** (`pod_autoscaling`, MUST) — Demonstrate HPA scaling pods based on custom GPU metrics (e.g., `gpu_utilization` from prometheus-adapter)
73+
| 7 | Cluster Autoscaling | `cluster_autoscaling` | [evidence/cluster-autoscaling.md](evidence/cluster-autoscaling.md) |
74+
| 8 | Pod Autoscaling | `pod_autoscaling` | [evidence/pod-autoscaling.md](evidence/pod-autoscaling.md) |

docs/conformance/cncf/collect-evidence.sh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,14 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
# CNCF AI Conformance Evidence Collection
17-
# Collects evidence for Must-have requirements (Kubernetes 1.34)
16+
# DEPRECATED: Use 'aicr validate --evidence-dir' instead.
1817
#
19-
# Usage: ./docs/conformance/cncf/collect-evidence.sh [section]
20-
# Sections: dra, gang, secure, metrics, gateway, operator, all (default: all)
21-
#
22-
# Each section produces a separate evidence file under docs/conformance/cncf/evidence/
18+
# Evidence is now generated directly from validation results:
19+
# aicr validate -r recipe.yaml --phase conformance --evidence-dir ./evidence
20+
# aicr validate -r recipe.yaml --phase conformance --evidence-dir ./evidence --result result.yaml
2321

24-
set -euo pipefail
22+
echo "DEPRECATED: Use 'aicr validate --evidence-dir' instead." >&2
23+
exit 1
2524

2625
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
2726
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"

pkg/cli/validate.go

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525

2626
"github.com/NVIDIA/aicr/pkg/defaults"
2727
"github.com/NVIDIA/aicr/pkg/errors"
28+
"github.com/NVIDIA/aicr/pkg/evidence"
2829
"github.com/NVIDIA/aicr/pkg/recipe"
2930
"github.com/NVIDIA/aicr/pkg/serializer"
3031
"github.com/NVIDIA/aicr/pkg/snapshotter"
@@ -158,6 +159,8 @@ func runValidation(
158159
cleanup bool,
159160
imagePullSecrets []string,
160161
noCluster bool,
162+
evidenceDir string,
163+
evidenceResultPath string,
161164
) error {
162165

163166
slog.Info("running validation",
@@ -218,6 +221,30 @@ func runValidation(
218221
"skipped", result.Summary.Skipped,
219222
"duration", result.Summary.Duration)
220223

224+
// Generate evidence if requested. Strict: failure is an error.
225+
if evidenceDir != "" {
226+
// Use a saved result file for evidence when --result is provided,
227+
// otherwise use the result from the validation run we just completed.
228+
evidenceSource := result
229+
if evidenceResultPath != "" {
230+
slog.Info("loading saved result for evidence rendering", "path", evidenceResultPath)
231+
saved, loadErr := serializer.FromFile[validator.ValidationResult](evidenceResultPath)
232+
if loadErr != nil {
233+
return errors.Wrap(errors.ErrCodeInvalidRequest, "failed to load evidence result", loadErr)
234+
}
235+
evidenceSource = saved
236+
}
237+
238+
evidenceCtx, evidenceCancel := context.WithTimeout(ctx, defaults.EvidenceRenderTimeout)
239+
defer evidenceCancel()
240+
241+
renderer := evidence.New(evidence.WithOutputDir(evidenceDir))
242+
if err := renderer.Render(evidenceCtx, evidenceSource); err != nil {
243+
return errors.Wrap(errors.ErrCodeInternal, "evidence rendering failed", err)
244+
}
245+
slog.Info("conformance evidence written", "dir", evidenceDir)
246+
}
247+
221248
// If cleanup is disabled, provide helpful debugging info
222249
if !cleanup {
223250
slog.Info("cleanup disabled - Jobs and RBAC kept for debugging",
@@ -332,6 +359,14 @@ func validateCmdFlags() []cli.Flag {
332359
Sources: cli.EnvVars("AICR_REQUIRE_GPU"),
333360
Usage: "Request nvidia.com/gpu resource for the agent pod. Required in CDI environments where GPU devices are only injected when explicitly requested.",
334361
},
362+
&cli.StringFlag{
363+
Name: "evidence-dir",
364+
Usage: "Write CNCF conformance evidence markdown to this directory. Requires --phase conformance.",
365+
},
366+
&cli.StringFlag{
367+
Name: "result",
368+
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Requires --phase conformance and --evidence-dir.",
369+
},
335370
outputFlag,
336371
formatFlag,
337372
kubeconfigFlag,
@@ -385,15 +420,48 @@ Run validation without failing on constraint errors (informational mode):
385420
386421
Resume a previous validation run from where it left off:
387422
aicr validate -r recipe.yaml -s snapshot.yaml --resume 20260206-140523-a3f9
423+
424+
Generate conformance evidence alongside validation:
425+
aicr validate -r recipe.yaml -s snapshot.yaml \
426+
--phase conformance --evidence-dir ./evidence
427+
428+
Use a saved result file for evidence instead of the live run:
429+
aicr validate -r recipe.yaml -s snapshot.yaml \
430+
--phase conformance --evidence-dir ./evidence \
431+
--result validation-result.yaml
388432
`,
389433
Flags: validateCmdFlags(),
390434
Action: func(ctx context.Context, cmd *cli.Command) error {
391435
// Validate single-value flags are not duplicated
392436
// Note: --phase allows multiple values so it's not included here
393-
if err := validateSingleValueFlags(cmd, "recipe", "snapshot", "output", "format", "namespace", "validation-namespace", "image", "job-name", "service-account-name", "timeout", "resume"); err != nil {
437+
if err := validateSingleValueFlags(cmd, "recipe", "snapshot", "output", "format", "namespace", "validation-namespace", "image", "job-name", "service-account-name", "timeout", "resume", "result"); err != nil {
394438
return err
395439
}
396440

441+
evidenceDir := cmd.String("evidence-dir")
442+
resultPath := cmd.String("result")
443+
444+
// Parse phases (default to readiness if none specified)
445+
phases, err := parseValidationPhases(cmd.StringSlice("phase"))
446+
if err != nil {
447+
return err
448+
}
449+
450+
// Validate evidence flag constraints.
451+
hasConformance := false
452+
for _, p := range phases {
453+
if p == validator.PhaseConformance || p == validator.PhaseAll {
454+
hasConformance = true
455+
break
456+
}
457+
}
458+
if evidenceDir != "" && !hasConformance {
459+
return errors.New(errors.ErrCodeInvalidRequest, "--evidence-dir requires --phase conformance")
460+
}
461+
if resultPath != "" && evidenceDir == "" {
462+
return errors.New(errors.ErrCodeInvalidRequest, "--result requires --evidence-dir")
463+
}
464+
397465
recipeFilePath := cmd.String("recipe")
398466
snapshotFilePath := cmd.String("snapshot")
399467
kubeconfig := cmd.String("kubeconfig")
@@ -418,12 +486,6 @@ Resume a previous validation run from where it left off:
418486

419487
failOnError := cmd.Bool("fail-on-error")
420488

421-
// Parse phases (default to readiness if none specified)
422-
phases, err := parseValidationPhases(cmd.StringSlice("phase"))
423-
if err != nil {
424-
return err
425-
}
426-
427489
slog.Info("loading recipe", "uri", recipeFilePath)
428490

429491
// Load recipe
@@ -460,7 +522,7 @@ Resume a previous validation run from where it left off:
460522
}
461523
}
462524

463-
return runValidation(ctx, rec, snap, phases, recipeFilePath, snapshotSource, cmd.String("output"), outFormat, failOnError, validationNamespace, cmd.String("resume"), cmd.String("image"), cmd.Bool("cleanup"), cmd.StringSlice("image-pull-secret"), cmd.Bool("no-cluster"))
525+
return runValidation(ctx, rec, snap, phases, recipeFilePath, snapshotSource, cmd.String("output"), outFormat, failOnError, validationNamespace, cmd.String("resume"), cmd.String("image"), cmd.Bool("cleanup"), cmd.StringSlice("image-pull-secret"), cmd.Bool("no-cluster"), evidenceDir, resultPath)
464526
},
465527
}
466528
}

pkg/defaults/timeouts.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,20 @@ const (
167167
KarpenterPollInterval = 10 * time.Second
168168
)
169169

170+
// Gang scheduling co-scheduling validation.
171+
const (
172+
// CoScheduleWindow is the maximum time span between PodScheduled timestamps
173+
// for gang-scheduled pods. If pods are scheduled further apart than this,
174+
// they are not considered co-scheduled.
175+
CoScheduleWindow = 30 * time.Second
176+
)
177+
178+
// Evidence rendering timeouts.
179+
const (
180+
// EvidenceRenderTimeout is the timeout for rendering conformance evidence markdown.
181+
EvidenceRenderTimeout = 30 * time.Second
182+
)
183+
170184
// Deployment and pod scheduling test timeouts for conformance validation.
171185
const (
172186
// DeploymentScaleTimeout is the timeout for waiting for Deployment controller

pkg/defaults/timeouts_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ func TestTimeoutConstants(t *testing.T) {
5757
{"ValidatePerformanceTimeout", ValidatePerformanceTimeout, 10 * time.Minute, 60 * time.Minute},
5858
{"ValidateConformanceTimeout", ValidateConformanceTimeout, 5 * time.Minute, 30 * time.Minute},
5959
{"ResourceVerificationTimeout", ResourceVerificationTimeout, 5 * time.Second, 30 * time.Second},
60+
61+
// Gang scheduling co-scheduling window
62+
{"CoScheduleWindow", CoScheduleWindow, 10 * time.Second, 60 * time.Second},
63+
64+
// Evidence rendering timeout
65+
{"EvidenceRenderTimeout", EvidenceRenderTimeout, 10 * time.Second, 60 * time.Second},
6066
}
6167

6268
for _, tt := range tests {

0 commit comments

Comments
 (0)