Skip to content

Commit 5036c02

Browse files
committed
feat(evidence): add artifact capture for conformance evidence
Add a diagnostic artifact capture mechanism so conformance checks can record rich evidence (deployment status, metrics samples, test results) during execution. The captured artifacts flow through the pipeline into the evidence markdown.

The artifact pipeline: checks call ctx.Artifacts.Record() → Cancel() emits base64-encoded ARTIFACT: lines via t.Logf → phases.go extracts them and populates CheckResult.Artifacts → the evidence renderer outputs labeled code blocks. Artifacts are ephemeral (json:"-") and never persisted.

Infrastructure (pkg/validator/checks/artifact.go):
- Artifact type with Encode/DecodeArtifact for base64 JSON transport
- ArtifactCollector with thread-safe Record/Drain, 8KB/20-count caps
- ValidationContext.Artifacts field, TestRunner init + Cancel() emit

Pipeline integration:
- phases.go: extract ARTIFACT: lines from test output, filter them from the reason
- evidence types/renderer/templates: pass artifacts through to markdown

All 9 submission requirement checks now record diagnostic artifacts:
- DRA: controller/DaemonSet status, ResourceSlice count
- Accelerator metrics: DCGM sample, required metrics presence
- AI service metrics: Prometheus query results, custom metrics API
- Inference gateway: GatewayClass/Gateway status, CRDs, data plane
- Robust controller: operator status, webhook config, rejection test
- Secure access: DRA pod status, access patterns, isolation test
- Gang scheduling: KAI components, GPU availability, co-scheduling
- Pod autoscaling: metrics APIs, HPA scale-up/down proof
- Cluster autoscaling: Karpenter status, NodePools, node provisioning
1 parent 399a2dc commit 5036c02

22 files changed

+735
-24
lines changed

pkg/evidence/renderer.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,11 @@ func (r *Renderer) buildEntries(conformance *validator.PhaseResult) []EvidenceEn
132132
}
133133

134134
entry := CheckEntry{
135-
Name: check.Name,
136-
Status: cr.Status,
137-
Reason: cr.Reason,
138-
Duration: cr.Duration,
135+
Name: check.Name,
136+
Status: cr.Status,
137+
Reason: cr.Reason,
138+
Duration: cr.Duration,
139+
Artifacts: cr.Artifacts,
139140
}
140141

141142
g, exists := groups[check.EvidenceFile]

pkg/evidence/renderer_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"time"
2424

2525
"github.com/NVIDIA/aicr/pkg/validator"
26+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2627

2728
// Import conformance checks to register them.
2829
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
@@ -337,3 +338,105 @@ func TestRenderIndexContent(t *testing.T) {
337338
t.Error("index.md should contain run ID")
338339
}
339340
}
341+
342+
func TestRenderWithArtifacts(t *testing.T) {
343+
dir := t.TempDir()
344+
r := New(WithOutputDir(dir))
345+
346+
result := &validator.ValidationResult{
347+
RunID: "test-artifacts",
348+
Phases: map[string]*validator.PhaseResult{
349+
"conformance": {
350+
Checks: []validator.CheckResult{
351+
{
352+
Name: "dra-support",
353+
Status: validator.ValidationStatusPass,
354+
Reason: "DRA controller healthy",
355+
Duration: 5 * time.Second,
356+
Artifacts: []checks.Artifact{
357+
{Label: "DRA Controller Pods", Data: "NAME READY STATUS\ndra-controller-abc12 1/1 Running"},
358+
{Label: "ResourceSlice Count", Data: "Total ResourceSlices: 8"},
359+
},
360+
},
361+
},
362+
},
363+
},
364+
}
365+
366+
if err := r.Render(context.Background(), result); err != nil {
367+
t.Fatalf("Render() error = %v", err)
368+
}
369+
370+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
371+
if err != nil {
372+
t.Fatalf("failed to read dra-support.md: %v", err)
373+
}
374+
375+
s := string(content)
376+
377+
// Verify artifact labels are present.
378+
if !strings.Contains(s, "#### DRA Controller Pods") {
379+
t.Error("evidence should contain artifact label 'DRA Controller Pods'")
380+
}
381+
if !strings.Contains(s, "#### ResourceSlice Count") {
382+
t.Error("evidence should contain artifact label 'ResourceSlice Count'")
383+
}
384+
385+
// Verify artifact data is present.
386+
if !strings.Contains(s, "dra-controller-abc12") {
387+
t.Error("evidence should contain artifact data")
388+
}
389+
if !strings.Contains(s, "Total ResourceSlices: 8") {
390+
t.Error("evidence should contain ResourceSlice count data")
391+
}
392+
393+
// Verify the reason is also present (artifacts don't replace reason).
394+
if !strings.Contains(s, "DRA controller healthy") {
395+
t.Error("evidence should still contain the reason text")
396+
}
397+
}
398+
399+
func TestRenderWithoutArtifacts(t *testing.T) {
400+
dir := t.TempDir()
401+
r := New(WithOutputDir(dir))
402+
403+
result := &validator.ValidationResult{
404+
RunID: "test-no-artifacts",
405+
Phases: map[string]*validator.PhaseResult{
406+
"conformance": {
407+
Checks: []validator.CheckResult{
408+
{
409+
Name: "dra-support",
410+
Status: validator.ValidationStatusPass,
411+
Reason: "all healthy",
412+
Duration: 3 * time.Second,
413+
},
414+
},
415+
},
416+
},
417+
}
418+
419+
if err := r.Render(context.Background(), result); err != nil {
420+
t.Fatalf("Render() error = %v", err)
421+
}
422+
423+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
424+
if err != nil {
425+
t.Fatalf("failed to read dra-support.md: %v", err)
426+
}
427+
428+
s := string(content)
429+
430+
// Verify basic content is present.
431+
if !strings.Contains(s, "dra-support") {
432+
t.Error("evidence should contain check name")
433+
}
434+
if !strings.Contains(s, "all healthy") {
435+
t.Error("evidence should contain reason")
436+
}
437+
438+
// Verify no artifact headers appear.
439+
if strings.Contains(s, "####") {
440+
t.Error("evidence without artifacts should not contain #### headers")
441+
}
442+
}

pkg/evidence/templates.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,13 @@ const evidenceTemplate = `# {{ .Title }}
5454
{{ .Reason }}
5555
` + "```" + `
5656
{{- end }}
57+
{{- range .Artifacts }}
58+
59+
#### {{ .Label }}
60+
61+
` + "```" + `
62+
{{ .Data }}
63+
` + "```" + `
64+
{{- end }}
5765
{{ end }}
5866
`

pkg/evidence/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"time"
1919

2020
"github.com/NVIDIA/aicr/pkg/validator"
21+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2122
)
2223

2324
// EvidenceEntry holds all data needed to render a single evidence document.
@@ -58,4 +59,7 @@ type CheckEntry struct {
5859

5960
// Duration is how long the check took.
6061
Duration time.Duration
62+
63+
// Artifacts contains diagnostic evidence captured during check execution.
64+
Artifacts []checks.Artifact
6165
}

pkg/validator/checks/artifact.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package checks
16+
17+
import (
18+
"encoding/base64"
19+
"encoding/json"
20+
"sync"
21+
22+
"github.com/NVIDIA/aicr/pkg/errors"
23+
)
24+
25+
const (
	// MaxArtifactDataSize is the maximum number of bytes of caller-supplied
	// data kept in a single artifact's Data field. Record truncates longer
	// payloads to this size and then appends a short truncation marker, so the
	// stored Data can exceed this value by the length of that marker.
	// Ensures each base64-encoded ARTIFACT: line stays well under the bufio.Scanner
	// default 64KB limit (base64 expands ~4/3, so 8KB → ~11KB encoded).
	MaxArtifactDataSize = 8 * 1024

	// MaxArtifactsPerCheck is the maximum number of artifacts a single check can record.
	// Record returns an error once a collector already holds this many.
	MaxArtifactsPerCheck = 20
)
34+
35+
// Artifact represents a captured piece of diagnostic evidence from a conformance check.
// Each artifact has a human-readable label and a data payload (kubectl output,
// metric samples, resource YAML, etc.) that is rendered as a fenced code block
// in evidence markdown.
//
// Both fields are serialized to JSON (keys "label" and "data") by Encode and
// restored by DecodeArtifact.
type Artifact struct {
	// Label is the human-readable title (e.g., "DRA Driver Pods").
	Label string `json:"label"`

	// Data is the captured content (command output, metric text, YAML, etc.).
	Data string `json:"data"`
}
46+
47+
// Encode returns a base64-encoded JSON representation of the artifact,
48+
// suitable for emission via t.Logf("ARTIFACT:%s", encoded).
49+
func (a Artifact) Encode() (string, error) {
50+
jsonBytes, err := json.Marshal(a)
51+
if err != nil {
52+
return "", errors.Wrap(errors.ErrCodeInternal, "failed to marshal artifact", err)
53+
}
54+
return base64.StdEncoding.EncodeToString(jsonBytes), nil
55+
}
56+
57+
// DecodeArtifact decodes a base64-encoded JSON artifact string.
58+
func DecodeArtifact(encoded string) (*Artifact, error) {
59+
jsonBytes, err := base64.StdEncoding.DecodeString(encoded)
60+
if err != nil {
61+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to decode artifact base64", err)
62+
}
63+
var a Artifact
64+
if err := json.Unmarshal(jsonBytes, &a); err != nil {
65+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to unmarshal artifact JSON", err)
66+
}
67+
return &a, nil
68+
}
69+
70+
// ArtifactCollector is a thread-safe accumulator for artifacts within a single check execution.
// It enforces per-artifact size limits and per-check count limits.
//
// The struct embeds a sync.Mutex by value, so it should be shared by pointer
// (as NewArtifactCollector returns it) rather than copied.
type ArtifactCollector struct {
	// mu guards artifacts; Record and Drain hold it for their full duration.
	mu sync.Mutex
	// artifacts holds the recorded artifacts in insertion order; nil when empty.
	artifacts []Artifact
}
76+
77+
// NewArtifactCollector creates a new empty artifact collector.
78+
func NewArtifactCollector() *ArtifactCollector {
79+
return &ArtifactCollector{}
80+
}
81+
82+
// Record adds a labeled artifact. Data exceeding MaxArtifactDataSize is truncated.
83+
// Returns an error if the per-check artifact count limit is reached.
84+
func (c *ArtifactCollector) Record(label, data string) error {
85+
c.mu.Lock()
86+
defer c.mu.Unlock()
87+
88+
if len(c.artifacts) >= MaxArtifactsPerCheck {
89+
return errors.New(errors.ErrCodeInvalidRequest, "artifact limit reached")
90+
}
91+
92+
if len(data) > MaxArtifactDataSize {
93+
data = data[:MaxArtifactDataSize] + "\n... [truncated]"
94+
}
95+
96+
c.artifacts = append(c.artifacts, Artifact{Label: label, Data: data})
97+
return nil
98+
}
99+
100+
// Drain returns the collected artifacts and resets the internal list.
101+
// Returns nil if no artifacts were recorded.
102+
func (c *ArtifactCollector) Drain() []Artifact {
103+
c.mu.Lock()
104+
defer c.mu.Unlock()
105+
arts := c.artifacts
106+
c.artifacts = nil
107+
return arts
108+
}

0 commit comments

Comments
 (0)