Skip to content

Commit 3181cc5

Browse files
committed
feat: add 'aicr evidence' command for CNCF conformance evidence collection
Add a new top-level 'aicr evidence' command that collects behavioral CNCF AI Conformance evidence from a live GPU cluster. The command embeds the collect-evidence.sh script and test manifests, invoking them against the cluster to produce submission-ready markdown evidence files.

Usage:
  aicr evidence -o ./evidence                 # collect all evidence
  aicr evidence -o ./evidence -f dra -f hpa   # specific features
  aicr evidence --list                        # list available features

Features: dra, gang, secure, metrics, gateway, operator, hpa, cluster-autoscaling

This is separate from 'aicr validate --phase conformance' which performs structural pass/fail checks for CI. The evidence command captures rich, human-reviewable proof (command outputs, workload logs, metric queries) for CNCF submission.

Signed-off-by: Yuan Chen <yuanchen97@gmail.com>
1 parent 1f1758a commit 3181cc5

File tree

9 files changed

+383
-64
lines changed

9 files changed

+383
-64
lines changed

docs/conformance/cncf/README.md

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,9 @@ recipe meets the Must-have requirements for Kubernetes v1.34.
1919
```
2020
docs/conformance/cncf/
2121
├── README.md
22-
├── collect-evidence.sh
23-
├── manifests/
24-
│ ├── dra-gpu-test.yaml
25-
│ ├── gang-scheduling-test.yaml
26-
│ └── hpa-gpu-test.yaml
22+
├── submission/
23+
│ ├── PRODUCT.yaml
24+
│ └── README.md
2725
└── evidence/
2826
├── index.md
2927
├── dra-support.md
@@ -34,6 +32,13 @@ docs/conformance/cncf/
3432
├── robust-operator.md
3533
├── pod-autoscaling.md
3634
└── cluster-autoscaling.md
35+
36+
pkg/evidence/scripts/ # Evidence collection script + test manifests
37+
├── collect-evidence.sh
38+
└── manifests/
39+
├── dra-gpu-test.yaml
40+
├── gang-scheduling-test.yaml
41+
└── hpa-gpu-test.yaml
3742
```
3843

3944
## Usage
@@ -58,23 +63,25 @@ aicr validate -r recipe.yaml -s snapshot.yaml \
5863

5964
### Step 2: Behavioral Test Evidence
6065

61-
`collect-evidence.sh` deploys test workloads and collects behavioral evidence
66+
`aicr evidence` deploys test workloads and collects behavioral evidence
6267
(DRA GPU allocation, gang scheduling, HPA autoscaling, etc.) that requires
6368
running actual GPU workloads on the cluster:
6469

6570
```bash
6671
# Collect all behavioral evidence
67-
./docs/conformance/cncf/collect-evidence.sh all
68-
69-
# Collect evidence for a single feature
70-
./docs/conformance/cncf/collect-evidence.sh dra
71-
./docs/conformance/cncf/collect-evidence.sh gang
72-
./docs/conformance/cncf/collect-evidence.sh secure
73-
./docs/conformance/cncf/collect-evidence.sh metrics
74-
./docs/conformance/cncf/collect-evidence.sh gateway
75-
./docs/conformance/cncf/collect-evidence.sh operator
76-
./docs/conformance/cncf/collect-evidence.sh hpa
77-
./docs/conformance/cncf/collect-evidence.sh cluster-autoscaling
72+
aicr evidence -o ./evidence
73+
74+
# Collect specific features
75+
aicr evidence -o ./evidence -f dra -f hpa
76+
77+
# List available features
78+
aicr evidence --list
79+
```
80+
81+
Alternatively, run the script directly:
82+
```bash
83+
./pkg/evidence/scripts/collect-evidence.sh all
84+
./pkg/evidence/scripts/collect-evidence.sh dra
7885
```
7986

8087
> **Note:** The HPA test (`hpa`) deploys a GPU stress workload (nbody) and waits

pkg/cli/doc.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
//
1717
// # Overview
1818
//
19-
// The aicr CLI provides commands for the four-stage workflow: capturing system snapshots,
20-
// generating configuration recipes, validating constraints, and creating deployment bundles.
19+
// The aicr CLI provides commands for the five-stage workflow: capturing system snapshots,
20+
// generating configuration recipes, validating constraints, creating deployment bundles,
21+
// and collecting CNCF AI Conformance evidence.
2122
// It is designed for cluster administrators and SREs managing NVIDIA GPU infrastructure.
2223
//
2324
// # Commands

pkg/cli/evidence.go

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cli
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"log/slog"
21+
"strings"
22+
"time"
23+
24+
"github.com/NVIDIA/aicr/pkg/errors"
25+
"github.com/NVIDIA/aicr/pkg/evidence"
26+
"github.com/urfave/cli/v3"
27+
)
28+
29+
func evidenceCmd() *cli.Command {
30+
return &cli.Command{
31+
Name: "evidence",
32+
Category: functionalCategoryName,
33+
EnableShellCompletion: true,
34+
Usage: "Collect CNCF AI Conformance evidence from a live cluster.",
35+
Description: `Deploy test workloads and collect behavioral evidence for CNCF AI
36+
Conformance submission. This captures detailed command outputs, workload logs,
37+
and metric queries that demonstrate conformance requirements are met.
38+
39+
This is separate from 'aicr validate --phase conformance' which performs
40+
structural pass/fail checks for CI. Evidence collection captures rich,
41+
human-reviewable proof for CNCF submission.
42+
43+
Requires GPU hardware on the target cluster.
44+
45+
Examples:
46+
47+
Collect all evidence:
48+
aicr evidence -o ./evidence
49+
50+
Collect specific features:
51+
aicr evidence -o ./evidence -f dra -f hpa
52+
53+
List available features:
54+
aicr evidence --list
55+
`,
56+
Flags: []cli.Flag{
57+
&cli.StringFlag{
58+
Name: "output-dir",
59+
Aliases: []string{"o"},
60+
Usage: "Evidence output directory",
61+
},
62+
&cli.StringSliceFlag{
63+
Name: "feature",
64+
Aliases: []string{"f"},
65+
Usage: "Feature to collect evidence for (repeatable, default: all)",
66+
},
67+
&cli.BoolFlag{
68+
Name: "list",
69+
Usage: "List available evidence features",
70+
},
71+
&cli.BoolFlag{
72+
Name: "no-cleanup",
73+
Usage: "Skip test namespace cleanup after collection",
74+
},
75+
&cli.DurationFlag{
76+
Name: "timeout",
77+
Usage: "Overall timeout for evidence collection",
78+
Value: 20 * time.Minute,
79+
},
80+
},
81+
Action: runEvidence,
82+
}
83+
}
84+
85+
func runEvidence(ctx context.Context, cmd *cli.Command) error {
86+
// Handle --list flag.
87+
if cmd.Bool("list") {
88+
fmt.Println("Available evidence features:")
89+
fmt.Println()
90+
fmt.Printf(" %-24s %s\n", "Feature", "Description")
91+
fmt.Printf(" %-24s %s\n", strings.Repeat("─", 24), strings.Repeat("─", 45))
92+
for _, f := range evidence.ValidFeatures {
93+
fmt.Printf(" %-24s %s\n", f, evidence.FeatureDescriptions[f])
94+
}
95+
fmt.Println()
96+
fmt.Println("Use -f/--feature to select specific features, or omit for all.")
97+
return nil
98+
}
99+
100+
// Require --output-dir.
101+
outputDir := cmd.String("output-dir")
102+
if outputDir == "" {
103+
return errors.New(errors.ErrCodeInvalidRequest,
104+
"--output-dir is required (use -o ./evidence)")
105+
}
106+
107+
// Validate features.
108+
features := cmd.StringSlice("feature")
109+
if err := validateFeatures(features); err != nil {
110+
return err
111+
}
112+
113+
// Apply timeout.
114+
timeout := cmd.Duration("timeout")
115+
ctx, cancel := context.WithTimeout(ctx, timeout)
116+
defer cancel()
117+
118+
slog.Info("starting evidence collection",
119+
"outputDir", outputDir,
120+
"features", features,
121+
"timeout", timeout)
122+
123+
// Run collector.
124+
collector := evidence.NewCollector(outputDir,
125+
evidence.WithFeatures(features),
126+
evidence.WithNoCleanup(cmd.Bool("no-cleanup")),
127+
)
128+
129+
if err := collector.Run(ctx); err != nil {
130+
return errors.Wrap(errors.ErrCodeInternal, "evidence collection failed", err)
131+
}
132+
133+
slog.Info("evidence collection complete", "outputDir", outputDir)
134+
return nil
135+
}
136+
137+
// validateFeatures checks that all specified features are valid.
138+
func validateFeatures(features []string) error {
139+
valid := make(map[string]bool, len(evidence.ValidFeatures)+1)
140+
for _, f := range evidence.ValidFeatures {
141+
valid[f] = true
142+
}
143+
valid["all"] = true
144+
for _, f := range features {
145+
if !valid[f] {
146+
return errors.New(errors.ErrCodeInvalidRequest,
147+
fmt.Sprintf("unknown feature %q, valid features: %s",
148+
f, strings.Join(evidence.ValidFeatures, ", ")))
149+
}
150+
}
151+
return nil
152+
}

pkg/cli/root.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ func Execute() {
133133
recipeCmd(),
134134
bundleCmd(),
135135
validateCmd(),
136+
evidenceCmd(),
136137
},
137138
ShellComplete: commandLister,
138139
}

0 commit comments

Comments
 (0)