Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/actions/gpu-operator-install/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ runs:
# features not available in kind (DRA feature gates, driver modules).
# The explicit "Wait for GPU operands" step below gates on what
# actually matters (device plugin readiness).
# --best-effort: some components (e.g. network-operator) have Helm
# hooks that may time out in Kind; continue deploying remaining
# components so the overall stack is functional.
chmod +x deploy.sh
echo "--- deploy.sh ---"
cat deploy.sh
./deploy.sh --no-wait
./deploy.sh --no-wait --best-effort

- name: Wait for GPU operands (bundle)
if: inputs.method == 'bundle'
Expand Down
2 changes: 1 addition & 1 deletion .github/actions/gpu-snapshot-validate/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ runs:
- name: Run aicr snapshot
shell: bash
run: |
./aicr snapshot --deploy-agent \
./aicr snapshot \
--kubeconfig="${HOME}/.kube/config" \
--namespace=default \
--image=ko.local:smoke-test \
Expand Down
2 changes: 1 addition & 1 deletion kwok/scripts/install-karpenter-kwok.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ install_kwok() {
helm upgrade --install kwok-controller kwok/kwok \
--namespace kube-system \
--set hostNetwork=true \
--wait --timeout 120s
--wait --timeout 300s

helm upgrade --install kwok-stage-fast kwok/stage-fast \
--namespace kube-system
Expand Down
35 changes: 28 additions & 7 deletions pkg/bundler/deployer/helm/templates/deploy.sh.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ set -euo pipefail
# Cloud Native Stack Deployment Script
# Generated by AICR Bundler {{ .BundlerVersion }}
#
# Usage: ./deploy.sh [--no-wait]
# --no-wait Skip waiting for each component to become ready
# Usage: ./deploy.sh [--no-wait] [--best-effort]
# --no-wait Skip waiting for each component to become ready
# --best-effort Continue past individual component failures (log warnings)

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

Expand All @@ -16,9 +17,24 @@ trap 'rm -rf "${HELM_WORKDIR}"' EXIT
cd "${HELM_WORKDIR}"

WAIT_ARGS="--wait --timeout 10m"
if [[ "${1:-}" == "--no-wait" ]]; then
WAIT_ARGS=""
fi
BEST_EFFORT=false
FAILED_COMPONENTS=""

for arg in "$@"; do
case "${arg}" in
--no-wait) WAIT_ARGS="" ;;
--best-effort) BEST_EFFORT=true ;;
esac
done

# helm_failed COMPONENT
#
# Failure handler invoked when a component's helm install does not succeed.
# When BEST_EFFORT is not "true" the script terminates with exit status 1.
# When BEST_EFFORT is "true" a warning is printed and COMPONENT is appended
# (space-prefixed) to FAILED_COMPONENTS so the caller can keep going.
helm_failed() {
  # Strict mode: stop the whole deployment on the first failure.
  if [[ "${BEST_EFFORT}" != "true" ]]; then
    exit 1
  fi

  local component="$1"
  echo "WARNING: ${component} install failed, continuing (--best-effort)"
  FAILED_COMPONENTS="${FAILED_COMPONENTS} ${component}"
}

# Components that use operator patterns with custom resources that reconcile
# asynchronously. Helm --wait may time out waiting for CR readiness even though
Expand All @@ -41,19 +57,24 @@ helm upgrade --install {{ .Name }} {{ .Repository }}/{{ .ChartName }} \
--version {{ .Version }} \
-n {{ .Namespace }} --create-namespace \
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
${COMPONENT_WAIT_ARGS}
${COMPONENT_WAIT_ARGS} \
|| helm_failed "{{ .Name }}"
{{ else -}}
helm upgrade --install {{ .Name }} {{ .ChartName }} \
--repo {{ .Repository }} \
--version {{ .Version }} \
-n {{ .Namespace }} --create-namespace \
-f "${SCRIPT_DIR}/{{ .Name }}/values.yaml" \
${COMPONENT_WAIT_ARGS}
${COMPONENT_WAIT_ARGS} \
|| helm_failed "{{ .Name }}"
{{ end -}}
{{ end -}}
{{ if .HasManifests -}}
echo "Applying manifests for {{ .Name }}..."
kubectl apply -f "${SCRIPT_DIR}/{{ .Name }}/manifests/"
{{ end -}}
{{ end }}
if [[ -n "${FAILED_COMPONENTS}" ]]; then
echo "WARNING: the following components failed:${FAILED_COMPONENTS}"
fi
echo "Deployment complete."
3 changes: 2 additions & 1 deletion pkg/cli/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ func runValidation(
evidenceSource := result
if evidenceResultPath != "" {
slog.Info("loading saved result for evidence rendering", "path", evidenceResultPath)
slog.Warn("saved results do not include diagnostic artifacts; evidence output will contain check status only")
saved, loadErr := serializer.FromFile[validator.ValidationResult](evidenceResultPath)
if loadErr != nil {
return errors.Wrap(errors.ErrCodeInvalidRequest, "failed to load evidence result", loadErr)
Expand Down Expand Up @@ -364,7 +365,7 @@ func validateCmdFlags() []cli.Flag {
},
&cli.StringFlag{
Name: "result",
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Requires --phase conformance and --evidence-dir.",
Usage: "Use a saved validation result file as the source for evidence rendering (live validation still runs). Note: saved results do not include diagnostic artifacts captured during live runs. Requires --phase conformance and --evidence-dir.",
},
outputFlag,
formatFlag,
Expand Down
17 changes: 17 additions & 0 deletions pkg/defaults/timeouts.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,12 @@ const (

// Conformance test timeouts for DRA and gang scheduling validation.
const (
// CheckExecutionTimeout is the parent context timeout for checks running
// inside a K8s Job. Must be long enough for behavioral checks (DRA pod
// creation + image pull + GPU allocation + isolation verification) and
// shorter than the Job-level ValidateConformanceTimeout.
CheckExecutionTimeout = 10 * time.Minute

// DRATestPodTimeout is the timeout for the DRA test pod to complete.
// The pod runs a simple CUDA device check but may need time for image pull.
DRATestPodTimeout = 5 * time.Minute
Expand Down Expand Up @@ -211,6 +217,17 @@ const (
PodReadyTimeout = 2 * time.Minute
)

// Limits applied to artifacts captured as conformance evidence.
const (
	// ArtifactMaxDataSize caps the Data field of one artifact, in bytes (8 KiB).
	// Base64 grows a payload by roughly 4/3, so 8 KiB becomes about 11 KiB once
	// encoded, which keeps each ARTIFACT: line comfortably below the default
	// 64 KiB limit of bufio.Scanner.
	ArtifactMaxDataSize = 8 << 10

	// ArtifactMaxPerCheck caps how many artifacts one check may record.
	ArtifactMaxPerCheck = 20
)

// Job configuration constants.
const (
// JobTTLAfterFinished is the time-to-live for completed Jobs.
Expand Down
17 changes: 17 additions & 0 deletions pkg/defaults/timeouts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ func TestTimeoutConstants(t *testing.T) {
{"ValidateConformanceTimeout", ValidateConformanceTimeout, 5 * time.Minute, 30 * time.Minute},
{"ResourceVerificationTimeout", ResourceVerificationTimeout, 5 * time.Second, 30 * time.Second},

// Conformance check execution timeout
{"CheckExecutionTimeout", CheckExecutionTimeout, 5 * time.Minute, 15 * time.Minute},

// Gang scheduling co-scheduling window
{"CoScheduleWindow", CoScheduleWindow, 10 * time.Second, 60 * time.Second},

Expand Down Expand Up @@ -127,6 +130,20 @@ func TestValidationPhaseTimeoutRelationships(t *testing.T) {
}
}

// TestCheckExecutionTimeoutRelationships verifies that CheckExecutionTimeout
// sits strictly between DRATestPodTimeout and ValidateConformanceTimeout.
func TestCheckExecutionTimeoutRelationships(t *testing.T) {
	var (
		execLimit   = CheckExecutionTimeout
		jobLimit    = ValidateConformanceTimeout
		draPodLimit = DRATestPodTimeout
	)

	// The conformance Job deadline has to outlast the check execution context
	// so the Job can observe completion before its own deadline expires.
	if jobLimit <= execLimit {
		t.Errorf("CheckExecutionTimeout (%v) should be less than ValidateConformanceTimeout (%v)",
			execLimit, jobLimit)
	}

	// A single check's timeout has to fit inside the execution context.
	if execLimit <= draPodLimit {
		t.Errorf("DRATestPodTimeout (%v) should be less than CheckExecutionTimeout (%v)",
			draPodLimit, execLimit)
	}
}

func TestCollectorTimeoutLessThanK8s(t *testing.T) {
// Individual collector timeout should be less than K8s collector timeout
// since K8s operations may involve multiple API calls
Expand Down
9 changes: 5 additions & 4 deletions pkg/evidence/renderer.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ func (r *Renderer) buildEntries(conformance *validator.PhaseResult) []EvidenceEn
}

entry := CheckEntry{
Name: check.Name,
Status: cr.Status,
Reason: cr.Reason,
Duration: cr.Duration,
Name: check.Name,
Status: cr.Status,
Reason: cr.Reason,
Duration: cr.Duration,
Artifacts: cr.Artifacts,
}

g, exists := groups[check.EvidenceFile]
Expand Down
103 changes: 103 additions & 0 deletions pkg/evidence/renderer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"github.com/NVIDIA/aicr/pkg/validator"
"github.com/NVIDIA/aicr/pkg/validator/checks"

// Import conformance checks to register them.
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
Expand Down Expand Up @@ -337,3 +338,105 @@ func TestRenderIndexContent(t *testing.T) {
t.Error("index.md should contain run ID")
}
}

// TestRenderWithArtifacts renders a conformance result whose single check
// carries two artifacts, then reads dra-support.md from the output directory
// and asserts that each artifact label, each artifact payload, and the check's
// reason text all appear in it.
func TestRenderWithArtifacts(t *testing.T) {
	outDir := t.TempDir()
	renderer := New(WithOutputDir(outDir))

	draCheck := validator.CheckResult{
		Name:     "dra-support",
		Status:   validator.ValidationStatusPass,
		Reason:   "DRA controller healthy",
		Duration: 5 * time.Second,
		Artifacts: []checks.Artifact{
			{Label: "DRA Controller Pods", Data: "NAME READY STATUS\ndra-controller-abc12 1/1 Running"},
			{Label: "ResourceSlice Count", Data: "Total ResourceSlices: 8"},
		},
	}
	input := &validator.ValidationResult{
		RunID: "test-artifacts",
		Phases: map[string]*validator.PhaseResult{
			"conformance": {Checks: []validator.CheckResult{draCheck}},
		},
	}

	if err := renderer.Render(context.Background(), input); err != nil {
		t.Fatalf("Render() error = %v", err)
	}

	raw, err := os.ReadFile(filepath.Join(outDir, "dra-support.md"))
	if err != nil {
		t.Fatalf("failed to read dra-support.md: %v", err)
	}
	doc := string(raw)

	// Each row pairs a substring that must be present with the message to
	// report when it is missing. Rows are checked in order.
	expectations := []struct {
		needle  string
		failure string
	}{
		// Artifact labels are rendered as level-4 headings.
		{"#### DRA Controller Pods", "evidence should contain artifact label 'DRA Controller Pods'"},
		{"#### ResourceSlice Count", "evidence should contain artifact label 'ResourceSlice Count'"},
		// Artifact payloads.
		{"dra-controller-abc12", "evidence should contain artifact data"},
		{"Total ResourceSlices: 8", "evidence should contain ResourceSlice count data"},
		// Artifacts are shown in addition to the reason, not instead of it.
		{"DRA controller healthy", "evidence should still contain the reason text"},
	}
	for _, want := range expectations {
		if !strings.Contains(doc, want.needle) {
			t.Error(want.failure)
		}
	}
}

// TestRenderWithoutArtifacts renders a conformance result whose single check
// has no artifacts, then reads dra-support.md from the output directory and
// asserts that the check name and reason appear while no "####" heading does.
func TestRenderWithoutArtifacts(t *testing.T) {
	outDir := t.TempDir()
	renderer := New(WithOutputDir(outDir))

	plainCheck := validator.CheckResult{
		Name:     "dra-support",
		Status:   validator.ValidationStatusPass,
		Reason:   "all healthy",
		Duration: 3 * time.Second,
	}
	input := &validator.ValidationResult{
		RunID: "test-no-artifacts",
		Phases: map[string]*validator.PhaseResult{
			"conformance": {Checks: []validator.CheckResult{plainCheck}},
		},
	}

	if err := renderer.Render(context.Background(), input); err != nil {
		t.Fatalf("Render() error = %v", err)
	}

	raw, err := os.ReadFile(filepath.Join(outDir, "dra-support.md"))
	if err != nil {
		t.Fatalf("failed to read dra-support.md: %v", err)
	}
	doc := string(raw)

	// The basic content must still be rendered.
	required := []struct {
		needle  string
		failure string
	}{
		{"dra-support", "evidence should contain check name"},
		{"all healthy", "evidence should contain reason"},
	}
	for _, want := range required {
		if !strings.Contains(doc, want.needle) {
			t.Error(want.failure)
		}
	}

	// With no artifacts recorded, no artifact heading may be emitted.
	if strings.Contains(doc, "####") {
		t.Error("evidence without artifacts should not contain #### headers")
	}
}
8 changes: 8 additions & 0 deletions pkg/evidence/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,13 @@ const evidenceTemplate = `# {{ .Title }}
{{ .Reason }}
` + "```" + `
{{- end }}
{{- range .Artifacts }}

#### {{ .Label }}

` + "```" + `
{{ .Data }}
` + "```" + `
{{- end }}
{{ end }}
`
4 changes: 4 additions & 0 deletions pkg/evidence/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"time"

"github.com/NVIDIA/aicr/pkg/validator"
"github.com/NVIDIA/aicr/pkg/validator/checks"
)

// EvidenceEntry holds all data needed to render a single evidence document.
Expand Down Expand Up @@ -58,4 +59,7 @@ type CheckEntry struct {

// Duration is how long the check took.
Duration time.Duration

// Artifacts contains diagnostic evidence captured during check execution.
Artifacts []checks.Artifact
}
Loading
Loading