Skip to content

Commit d0e6322

Browse files
committed
feat(evidence): add artifact capture for conformance evidence
Add an artifact capture mechanism so conformance checks record rich diagnostic evidence during execution, flowing it through the pipeline into evidence markdown. Single command, rich output.

Infrastructure:
- Artifact type, ArtifactCollector with thread-safe Record()/Drain(), base64 encode/decode, 8KB per-artifact / 20 per-check caps
- Pipeline: runner.go Cancel() emits via t.Logf → phases.go extracts using Contains+SplitN (handles t.Logf source prefixes) → evidence renderer emits labeled code blocks in markdown
- Artifacts are ephemeral (json:"-") — never persisted in saved results
- Failed artifact decodes log a warning and preserve the line in Reason

Conformance checks instrumented (9 checks):
- dra_support_check: controller, kubelet plugin, ResourceSlices
- accelerator_metrics_check: DCGM metrics sample, required metrics
- ai_service_metrics_check: Prometheus query, custom metrics API
- inference_gateway_check: GatewayClass, Gateway, CRDs, data plane
- robust_controller_check: Dynamo operator, webhook, rejection test
- secure_access_check: DRA test pod, access patterns, isolation test
- gang_scheduling_check: KAI scheduler, GPU availability, gang results
- pod_autoscaling_check: custom/external metrics API, HPA test
- cluster_autoscaling_check: Karpenter, NodePools, autoscaling test

Testing:
- Artifact encode/decode round-trip, cap enforcement, thread safety
- extractArtifacts() with realistic source-prefixed t.Logf lines
- Evidence renderer with/without artifacts
1 parent 02c786c commit d0e6322

25 files changed

+937
-33
lines changed

.github/actions/gpu-snapshot-validate/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ runs:
3333
- name: Run aicr snapshot
3434
shell: bash
3535
run: |
36-
./aicr snapshot --deploy-agent \
36+
./aicr snapshot \
3737
--kubeconfig="${HOME}/.kube/config" \
3838
--namespace=default \
3939
--image=ko.local:smoke-test \

pkg/cli/validate.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ func runValidation(
227227
evidenceSource := result
228228
if evidenceResultPath != "" {
229229
slog.Info("loading saved result for evidence rendering", "path", evidenceResultPath)
230+
slog.Warn("saved results do not include diagnostic artifacts; evidence output will contain check status only")
230231
saved, loadErr := serializer.FromFile[validator.ValidationResult](evidenceResultPath)
231232
if loadErr != nil {
232233
return errors.Wrap(errors.ErrCodeInvalidRequest, "failed to load evidence result", loadErr)
@@ -364,7 +365,7 @@ func validateCmdFlags() []cli.Flag {
364365
},
365366
&cli.StringFlag{
366367
Name: "result",
367-
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Requires --phase conformance and --evidence-dir.",
368+
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Note: saved results do not include diagnostic artifacts captured during live runs. Requires --phase conformance and --evidence-dir.",
368369
},
369370
outputFlag,
370371
formatFlag,

pkg/defaults/timeouts.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,17 @@ const (
211211
PodReadyTimeout = 2 * time.Minute
212212
)
213213

214+
// Limits applied to diagnostic artifacts captured as conformance evidence.
const (
	// ArtifactMaxDataSize caps the Data field of one artifact, in bytes.
	// The cap keeps every base64-encoded ARTIFACT: line comfortably below
	// the default 64KB bufio.Scanner token limit: base64 grows input by
	// roughly 4/3, so 8KB of data encodes to about 11KB.
	ArtifactMaxDataSize = 8 << 10

	// ArtifactMaxPerCheck caps how many artifacts one check may record.
	ArtifactMaxPerCheck = 20
)
224+
214225
// Job configuration constants.
215226
const (
216227
// JobTTLAfterFinished is the time-to-live for completed Jobs.

pkg/evidence/renderer.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,11 @@ func (r *Renderer) buildEntries(conformance *validator.PhaseResult) []EvidenceEn
132132
}
133133

134134
entry := CheckEntry{
135-
Name: check.Name,
136-
Status: cr.Status,
137-
Reason: cr.Reason,
138-
Duration: cr.Duration,
135+
Name: check.Name,
136+
Status: cr.Status,
137+
Reason: cr.Reason,
138+
Duration: cr.Duration,
139+
Artifacts: cr.Artifacts,
139140
}
140141

141142
g, exists := groups[check.EvidenceFile]

pkg/evidence/renderer_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"time"
2424

2525
"github.com/NVIDIA/aicr/pkg/validator"
26+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2627

2728
// Import conformance checks to register them.
2829
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
@@ -337,3 +338,105 @@ func TestRenderIndexContent(t *testing.T) {
337338
t.Error("index.md should contain run ID")
338339
}
339340
}
341+
342+
func TestRenderWithArtifacts(t *testing.T) {
343+
dir := t.TempDir()
344+
r := New(WithOutputDir(dir))
345+
346+
result := &validator.ValidationResult{
347+
RunID: "test-artifacts",
348+
Phases: map[string]*validator.PhaseResult{
349+
"conformance": {
350+
Checks: []validator.CheckResult{
351+
{
352+
Name: "dra-support",
353+
Status: validator.ValidationStatusPass,
354+
Reason: "DRA controller healthy",
355+
Duration: 5 * time.Second,
356+
Artifacts: []checks.Artifact{
357+
{Label: "DRA Controller Pods", Data: "NAME READY STATUS\ndra-controller-abc12 1/1 Running"},
358+
{Label: "ResourceSlice Count", Data: "Total ResourceSlices: 8"},
359+
},
360+
},
361+
},
362+
},
363+
},
364+
}
365+
366+
if err := r.Render(context.Background(), result); err != nil {
367+
t.Fatalf("Render() error = %v", err)
368+
}
369+
370+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
371+
if err != nil {
372+
t.Fatalf("failed to read dra-support.md: %v", err)
373+
}
374+
375+
s := string(content)
376+
377+
// Verify artifact labels are present.
378+
if !strings.Contains(s, "#### DRA Controller Pods") {
379+
t.Error("evidence should contain artifact label 'DRA Controller Pods'")
380+
}
381+
if !strings.Contains(s, "#### ResourceSlice Count") {
382+
t.Error("evidence should contain artifact label 'ResourceSlice Count'")
383+
}
384+
385+
// Verify artifact data is present.
386+
if !strings.Contains(s, "dra-controller-abc12") {
387+
t.Error("evidence should contain artifact data")
388+
}
389+
if !strings.Contains(s, "Total ResourceSlices: 8") {
390+
t.Error("evidence should contain ResourceSlice count data")
391+
}
392+
393+
// Verify the reason is also present (artifacts don't replace reason).
394+
if !strings.Contains(s, "DRA controller healthy") {
395+
t.Error("evidence should still contain the reason text")
396+
}
397+
}
398+
399+
func TestRenderWithoutArtifacts(t *testing.T) {
400+
dir := t.TempDir()
401+
r := New(WithOutputDir(dir))
402+
403+
result := &validator.ValidationResult{
404+
RunID: "test-no-artifacts",
405+
Phases: map[string]*validator.PhaseResult{
406+
"conformance": {
407+
Checks: []validator.CheckResult{
408+
{
409+
Name: "dra-support",
410+
Status: validator.ValidationStatusPass,
411+
Reason: "all healthy",
412+
Duration: 3 * time.Second,
413+
},
414+
},
415+
},
416+
},
417+
}
418+
419+
if err := r.Render(context.Background(), result); err != nil {
420+
t.Fatalf("Render() error = %v", err)
421+
}
422+
423+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
424+
if err != nil {
425+
t.Fatalf("failed to read dra-support.md: %v", err)
426+
}
427+
428+
s := string(content)
429+
430+
// Verify basic content is present.
431+
if !strings.Contains(s, "dra-support") {
432+
t.Error("evidence should contain check name")
433+
}
434+
if !strings.Contains(s, "all healthy") {
435+
t.Error("evidence should contain reason")
436+
}
437+
438+
// Verify no artifact headers appear.
439+
if strings.Contains(s, "####") {
440+
t.Error("evidence without artifacts should not contain #### headers")
441+
}
442+
}

pkg/evidence/templates.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,13 @@ const evidenceTemplate = `# {{ .Title }}
5454
{{ .Reason }}
5555
` + "```" + `
5656
{{- end }}
57+
{{- range .Artifacts }}
58+
59+
#### {{ .Label }}
60+
61+
` + "```" + `
62+
{{ .Data }}
63+
` + "```" + `
64+
{{- end }}
5765
{{ end }}
5866
`

pkg/evidence/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"time"
1919

2020
"github.com/NVIDIA/aicr/pkg/validator"
21+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2122
)
2223

2324
// EvidenceEntry holds all data needed to render a single evidence document.
@@ -58,4 +59,7 @@ type CheckEntry struct {
5859

5960
// Duration is how long the check took.
6061
Duration time.Duration
62+
63+
// Artifacts contains diagnostic evidence captured during check execution.
64+
Artifacts []checks.Artifact
6165
}

pkg/validator/checks/artifact.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package checks
16+
17+
import (
18+
"encoding/base64"
19+
"encoding/json"
20+
"sync"
21+
22+
"github.com/NVIDIA/aicr/pkg/defaults"
23+
"github.com/NVIDIA/aicr/pkg/errors"
24+
)
25+
26+
// Artifact is one piece of diagnostic evidence captured by a conformance
// check: a human-readable label plus a text payload (kubectl output, metric
// samples, resource YAML, and so on). Evidence markdown renders the payload
// as a fenced code block under the label.
type Artifact struct {
	// Label is the title shown to readers, e.g. "DRA Driver Pods".
	Label string `json:"label"`

	// Data holds the captured text: command output, metric text, YAML, etc.
	Data string `json:"data"`
}
37+
38+
// Encode returns a base64-encoded JSON representation of the artifact,
39+
// suitable for emission via t.Logf("ARTIFACT:%s", encoded).
40+
func (a Artifact) Encode() (string, error) {
41+
jsonBytes, err := json.Marshal(a)
42+
if err != nil {
43+
return "", errors.Wrap(errors.ErrCodeInternal, "failed to marshal artifact", err)
44+
}
45+
return base64.StdEncoding.EncodeToString(jsonBytes), nil
46+
}
47+
48+
// DecodeArtifact decodes a base64-encoded JSON artifact string.
49+
func DecodeArtifact(encoded string) (*Artifact, error) {
50+
jsonBytes, err := base64.StdEncoding.DecodeString(encoded)
51+
if err != nil {
52+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to decode artifact base64", err)
53+
}
54+
var a Artifact
55+
if err := json.Unmarshal(jsonBytes, &a); err != nil {
56+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to unmarshal artifact JSON", err)
57+
}
58+
return &a, nil
59+
}
60+
61+
// ArtifactCollector is a thread-safe accumulator for artifacts within a single check execution.
62+
// It enforces per-artifact size limits and per-check count limits.
63+
type ArtifactCollector struct {
64+
mu sync.Mutex
65+
artifacts []Artifact
66+
}
67+
68+
// NewArtifactCollector creates a new empty artifact collector.
69+
func NewArtifactCollector() *ArtifactCollector {
70+
return &ArtifactCollector{}
71+
}
72+
73+
// Record adds a labeled artifact. Data exceeding defaults.ArtifactMaxDataSize is truncated.
74+
// Returns an error if the per-check artifact count limit is reached.
75+
func (c *ArtifactCollector) Record(label, data string) error {
76+
c.mu.Lock()
77+
defer c.mu.Unlock()
78+
79+
if len(c.artifacts) >= defaults.ArtifactMaxPerCheck {
80+
return errors.New(errors.ErrCodeInvalidRequest, "artifact limit reached")
81+
}
82+
83+
if len(data) > defaults.ArtifactMaxDataSize {
84+
data = data[:defaults.ArtifactMaxDataSize] + "\n... [truncated]"
85+
}
86+
87+
c.artifacts = append(c.artifacts, Artifact{Label: label, Data: data})
88+
return nil
89+
}
90+
91+
// Drain returns the collected artifacts and resets the internal list.
92+
// Returns nil if no artifacts were recorded.
93+
func (c *ArtifactCollector) Drain() []Artifact {
94+
c.mu.Lock()
95+
defer c.mu.Unlock()
96+
arts := c.artifacts
97+
c.artifacts = nil
98+
return arts
99+
}

0 commit comments

Comments
 (0)