Skip to content

Commit 1d15151

Browse files
committed
feat(evidence): add artifact capture for conformance evidence
Add an artifact capture mechanism so conformance checks record rich diagnostic evidence during execution, flowing it through the pipeline into evidence markdown. Single command, rich output.

Infrastructure:
- Artifact type, ArtifactCollector with thread-safe Record()/Drain(), base64 encode/decode, 8KB per-artifact / 20 per-check caps
- Pipeline: runner.go Cancel() emits via t.Logf → phases.go extracts using Contains+SplitN (handles t.Logf source prefixes) → evidence renderer emits labeled code blocks in markdown
- Artifacts are ephemeral (json:"-") — never persisted in saved results
- Failed artifact decodes log a warning and preserve the line in Reason

Conformance checks instrumented (9 checks):
- dra_support_check: controller, kubelet plugin, ResourceSlices
- accelerator_metrics_check: DCGM metrics sample, required metrics
- ai_service_metrics_check: Prometheus query, custom metrics API
- inference_gateway_check: GatewayClass, Gateway, CRDs, data plane
- robust_controller_check: Dynamo operator, webhook, rejection test
- secure_access_check: DRA test pod, access patterns, isolation test
- gang_scheduling_check: KAI scheduler, GPU availability, gang results
- pod_autoscaling_check: custom/external metrics API, HPA test
- cluster_autoscaling_check: Karpenter, NodePools, autoscaling test

Testing:
- Artifact encode/decode round-trip, cap enforcement, thread safety
- extractArtifacts() with realistic source-prefixed t.Logf lines
- Evidence renderer with/without artifacts
1 parent ccca286 commit 1d15151

28 files changed

+982
-45
lines changed

.github/actions/gpu-operator-install/action.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,13 @@ runs:
107107
# features not available in kind (DRA feature gates, driver modules).
108108
# The explicit "Wait for GPU operands" step below gates on what
109109
# actually matters (device plugin readiness).
110+
# --best-effort: some components (e.g. network-operator) have Helm
111+
# hooks that may time out in Kind; continue deploying remaining
112+
# components so the overall stack is functional.
110113
chmod +x deploy.sh
111114
echo "--- deploy.sh ---"
112115
cat deploy.sh
113-
./deploy.sh --no-wait
116+
./deploy.sh --no-wait --best-effort
114117
115118
- name: Wait for GPU operands (bundle)
116119
if: inputs.method == 'bundle'

.github/actions/gpu-snapshot-validate/action.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ runs:
3333
- name: Run aicr snapshot
3434
shell: bash
3535
run: |
36-
./aicr snapshot --deploy-agent \
36+
./aicr snapshot \
3737
--kubeconfig="${HOME}/.kube/config" \
3838
--namespace=default \
3939
--image=ko.local:smoke-test \

kwok/scripts/install-karpenter-kwok.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ install_kwok() {
6767
helm upgrade --install kwok-controller kwok/kwok \
6868
--namespace kube-system \
6969
--set hostNetwork=true \
70-
--wait --timeout 120s
70+
--wait --timeout 300s
7171

7272
helm upgrade --install kwok-stage-fast kwok/stage-fast \
7373
--namespace kube-system

pkg/bundler/deployer/helm/templates/deploy.sh.tmpl

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@ set -euo pipefail
44
# Cloud Native Stack Deployment Script
55
# Generated by AICR Bundler {{ .BundlerVersion }}
66
#
7-
# Usage: ./deploy.sh [--no-wait]
8-
# --no-wait Skip waiting for each component to become ready
7+
# Usage: ./deploy.sh [--no-wait] [--best-effort]
8+
# --no-wait Skip waiting for each component to become ready
9+
# --best-effort Continue past individual component failures (log warnings)
910

1011
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1112

@@ -16,9 +17,24 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT
1617
cd "${HELM_WORKDIR}"
1718

1819
WAIT_ARGS="--wait --timeout 10m"
19-
if [[ "${1:-}" == "--no-wait" ]]; then
20-
WAIT_ARGS=""
21-
fi
20+
BEST_EFFORT=false
21+
FAILED_COMPONENTS=""
22+
23+
for arg in "$@"; do
24+
case "${arg}" in
25+
--no-wait) WAIT_ARGS="" ;;
26+
--best-effort) BEST_EFFORT=true ;;
27+
esac
28+
done
29+
30+
helm_failed() {
31+
if [[ "${BEST_EFFORT}" == "true" ]]; then
32+
echo "WARNING: $1 install failed, continuing (--best-effort)"
33+
FAILED_COMPONENTS="${FAILED_COMPONENTS} $1"
34+
else
35+
exit 1
36+
fi
37+
}
2238

2339
# Components that use operator patterns with custom resources that reconcile
2440
# asynchronously. Helm --wait may time out waiting for CR readiness even though
@@ -41,19 +57,24 @@ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
4157
--version {{ .Version }} \
4258
-n {{ .Namespace }} --create-namespace \
4359
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
44-
${COMPONENT_WAIT_ARGS}
60+
${COMPONENT_WAIT_ARGS} \
61+
|| helm_failed "{{ .Name }}"
4562
{{ else -}}
4663
helm upgrade --install {{ .Name }} {{ .ChartName }} \
4764
--repo {{ .Repository }} \
4865
--version {{ .Version }} \
4966
-n {{ .Namespace }} --create-namespace \
5067
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
51-
${COMPONENT_WAIT_ARGS}
68+
${COMPONENT_WAIT_ARGS} \
69+
|| helm_failed "{{ .Name }}"
5270
{{ end -}}
5371
{{ end -}}
5472
{{ if .HasManifests -}}
5573
echo "Applying manifests for {{ .Name }}..."
5674
kubectl apply -f "${SCRIPT_DIR}/{{ .Name }}/manifests/"
5775
{{ end -}}
5876
{{ end }}
77+
if [[ -n "${FAILED_COMPONENTS}" ]]; then
78+
echo "WARNING: the following components failed:${FAILED_COMPONENTS}"
79+
fi
5980
echo "Deployment complete."

pkg/cli/validate.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ func runValidation(
227227
evidenceSource := result
228228
if evidenceResultPath != "" {
229229
slog.Info("loading saved result for evidence rendering", "path", evidenceResultPath)
230+
slog.Warn("saved results do not include diagnostic artifacts; evidence output will contain check status only")
230231
saved, loadErr := serializer.FromFile[validator.ValidationResult](evidenceResultPath)
231232
if loadErr != nil {
232233
return errors.Wrap(errors.ErrCodeInvalidRequest, "failed to load evidence result", loadErr)
@@ -364,7 +365,7 @@ func validateCmdFlags() []cli.Flag {
364365
},
365366
&cli.StringFlag{
366367
Name: "result",
367-
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Requires --phase conformance and --evidence-dir.",
368+
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Note: saved results do not include diagnostic artifacts captured during live runs. Requires --phase conformance and --evidence-dir.",
368369
},
369370
outputFlag,
370371
formatFlag,

pkg/defaults/timeouts.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,17 @@ const (
211211
PodReadyTimeout = 2 * time.Minute
212212
)
213213

214+
// Artifact limits for conformance evidence capture.
215+
const (
216+
// ArtifactMaxDataSize is the maximum size in bytes of a single artifact's Data field.
217+
// Ensures each base64-encoded ARTIFACT: line stays well under the bufio.Scanner
218+
// default 64KB limit (base64 expands ~4/3, so 8KB → ~11KB encoded).
219+
ArtifactMaxDataSize = 8 * 1024
220+
221+
// ArtifactMaxPerCheck is the maximum number of artifacts a single check can record.
222+
ArtifactMaxPerCheck = 20
223+
)
224+
214225
// Job configuration constants.
215226
const (
216227
// JobTTLAfterFinished is the time-to-live for completed Jobs.

pkg/evidence/renderer.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,10 +132,11 @@ func (r *Renderer) buildEntries(conformance *validator.PhaseResult) []EvidenceEn
132132
}
133133

134134
entry := CheckEntry{
135-
Name: check.Name,
136-
Status: cr.Status,
137-
Reason: cr.Reason,
138-
Duration: cr.Duration,
135+
Name: check.Name,
136+
Status: cr.Status,
137+
Reason: cr.Reason,
138+
Duration: cr.Duration,
139+
Artifacts: cr.Artifacts,
139140
}
140141

141142
g, exists := groups[check.EvidenceFile]

pkg/evidence/renderer_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"time"
2424

2525
"github.com/NVIDIA/aicr/pkg/validator"
26+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2627

2728
// Import conformance checks to register them.
2829
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
@@ -337,3 +338,105 @@ func TestRenderIndexContent(t *testing.T) {
337338
t.Error("index.md should contain run ID")
338339
}
339340
}
341+
342+
func TestRenderWithArtifacts(t *testing.T) {
343+
dir := t.TempDir()
344+
r := New(WithOutputDir(dir))
345+
346+
result := &validator.ValidationResult{
347+
RunID: "test-artifacts",
348+
Phases: map[string]*validator.PhaseResult{
349+
"conformance": {
350+
Checks: []validator.CheckResult{
351+
{
352+
Name: "dra-support",
353+
Status: validator.ValidationStatusPass,
354+
Reason: "DRA controller healthy",
355+
Duration: 5 * time.Second,
356+
Artifacts: []checks.Artifact{
357+
{Label: "DRA Controller Pods", Data: "NAME READY STATUS\ndra-controller-abc12 1/1 Running"},
358+
{Label: "ResourceSlice Count", Data: "Total ResourceSlices: 8"},
359+
},
360+
},
361+
},
362+
},
363+
},
364+
}
365+
366+
if err := r.Render(context.Background(), result); err != nil {
367+
t.Fatalf("Render() error = %v", err)
368+
}
369+
370+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
371+
if err != nil {
372+
t.Fatalf("failed to read dra-support.md: %v", err)
373+
}
374+
375+
s := string(content)
376+
377+
// Verify artifact labels are present.
378+
if !strings.Contains(s, "#### DRA Controller Pods") {
379+
t.Error("evidence should contain artifact label 'DRA Controller Pods'")
380+
}
381+
if !strings.Contains(s, "#### ResourceSlice Count") {
382+
t.Error("evidence should contain artifact label 'ResourceSlice Count'")
383+
}
384+
385+
// Verify artifact data is present.
386+
if !strings.Contains(s, "dra-controller-abc12") {
387+
t.Error("evidence should contain artifact data")
388+
}
389+
if !strings.Contains(s, "Total ResourceSlices: 8") {
390+
t.Error("evidence should contain ResourceSlice count data")
391+
}
392+
393+
// Verify the reason is also present (artifacts don't replace reason).
394+
if !strings.Contains(s, "DRA controller healthy") {
395+
t.Error("evidence should still contain the reason text")
396+
}
397+
}
398+
399+
func TestRenderWithoutArtifacts(t *testing.T) {
400+
dir := t.TempDir()
401+
r := New(WithOutputDir(dir))
402+
403+
result := &validator.ValidationResult{
404+
RunID: "test-no-artifacts",
405+
Phases: map[string]*validator.PhaseResult{
406+
"conformance": {
407+
Checks: []validator.CheckResult{
408+
{
409+
Name: "dra-support",
410+
Status: validator.ValidationStatusPass,
411+
Reason: "all healthy",
412+
Duration: 3 * time.Second,
413+
},
414+
},
415+
},
416+
},
417+
}
418+
419+
if err := r.Render(context.Background(), result); err != nil {
420+
t.Fatalf("Render() error = %v", err)
421+
}
422+
423+
content, err := os.ReadFile(filepath.Join(dir, "dra-support.md"))
424+
if err != nil {
425+
t.Fatalf("failed to read dra-support.md: %v", err)
426+
}
427+
428+
s := string(content)
429+
430+
// Verify basic content is present.
431+
if !strings.Contains(s, "dra-support") {
432+
t.Error("evidence should contain check name")
433+
}
434+
if !strings.Contains(s, "all healthy") {
435+
t.Error("evidence should contain reason")
436+
}
437+
438+
// Verify no artifact headers appear.
439+
if strings.Contains(s, "####") {
440+
t.Error("evidence without artifacts should not contain #### headers")
441+
}
442+
}

pkg/evidence/templates.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,13 @@ const evidenceTemplate = `# {{ .Title }}
5454
{{ .Reason }}
5555
` + "```" + `
5656
{{- end }}
57+
{{- range .Artifacts }}
58+
59+
#### {{ .Label }}
60+
61+
` + "```" + `
62+
{{ .Data }}
63+
` + "```" + `
64+
{{- end }}
5765
{{ end }}
5866
`

pkg/evidence/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"time"
1919

2020
"github.com/NVIDIA/aicr/pkg/validator"
21+
"github.com/NVIDIA/aicr/pkg/validator/checks"
2122
)
2223

2324
// EvidenceEntry holds all data needed to render a single evidence document.
@@ -58,4 +59,7 @@ type CheckEntry struct {
5859

5960
// Duration is how long the check took.
6061
Duration time.Duration
62+
63+
// Artifacts contains diagnostic evidence captured during check execution.
64+
Artifacts []checks.Artifact
6165
}

0 commit comments

Comments
 (0)