Skip to content

Commit 399a2dc

Browse files
authored
fix: prevent snapshot agent Job from nesting agent deployment (#200)
1 parent fa0446c commit 399a2dc

File tree

3 files changed

+28
-67
lines changed

3 files changed

+28
-67
lines changed

pkg/cli/snapshot.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ package cli
1616

1717
import (
1818
"context"
19+
"os"
1920

2021
"github.com/urfave/cli/v3"
2122

@@ -237,6 +238,12 @@ See examples/templates/snapshot-template.md.tmpl for a sample template.
237238
return errors.Wrap(errors.ErrCodeInvalidRequest, "invalid toleration", err)
238239
}
239240

241+
// When running inside an agent Job, collect locally instead of
242+
// deploying another agent (prevents infinite nesting).
243+
if os.Getenv("AICR_AGENT_MODE") == "true" {
244+
return ns.Measure(ctx)
245+
}
246+
240247
// Configure agent deployment
241248
ns.AgentConfig = &snapshotter.AgentConfig{
242249
Kubeconfig: cmd.String("kubeconfig"),

pkg/k8s/agent/job.go

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,10 @@ func (d *Deployer) buildPodSpec(args []string) corev1.PodSpec {
118118
Command: []string{"aicr"},
119119
Args: args,
120120
Env: []corev1.EnvVar{
121+
{
122+
Name: "AICR_AGENT_MODE",
123+
Value: "true",
124+
},
121125
{
122126
Name: "AICR_LOG_PREFIX",
123127
Value: "agent",

tests/e2e/run.sh

Lines changed: 17 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -601,84 +601,34 @@ test_snapshot() {
601601
msg "=========================================="
602602

603603
if [ "$FAKE_GPU_ENABLED" != "true" ]; then
604-
skip "snapshot/deploy-agent" "Fake GPU not enabled"
604+
skip "snapshot/agent" "Fake GPU not enabled"
605605
return 0
606606
fi
607607

608608
# Clean up any existing snapshot
609609
kubectl delete cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1
610610

611-
# Test: Snapshot with deploy-agent using custom Job (with nvidia-smi hostPath)
612-
msg "--- Test: Snapshot with deploy-agent ---"
611+
# Test: Snapshot via agent deployment from the CI runner.
612+
# The snapshot command always deploys a Job to capture data on a cluster node.
613+
msg "--- Test: Snapshot via agent deployment ---"
613614
detail "Image: ${AICR_IMAGE}"
614615
detail "Output: cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"
615616

616-
# Create a custom Job that mounts nvidia-smi from host
617-
echo -e "${DIM} \$ kubectl apply -f snapshot-job.yaml${NC}"
618-
kubectl delete job aicr-e2e-snapshot -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1
619-
sleep 2
617+
echo -e "${DIM} \$ aicr snapshot --image ${AICR_IMAGE} --namespace ${SNAPSHOT_NAMESPACE} -o cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}${NC}"
618+
local snapshot_output
619+
snapshot_output=$("${AICR_BIN}" snapshot \
620+
--image "${AICR_IMAGE}" \
621+
--namespace "${SNAPSHOT_NAMESPACE}" \
622+
--output "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
623+
--timeout 120s \
624+
--privileged \
625+
--node-selector kubernetes.io/os=linux 2>&1) || true
620626

621-
kubectl apply -f - << EOF
622-
apiVersion: batch/v1
623-
kind: Job
624-
metadata:
625-
name: aicr-e2e-snapshot
626-
namespace: ${SNAPSHOT_NAMESPACE}
627-
spec:
628-
completions: 1
629-
backoffLimit: 0
630-
ttlSecondsAfterFinished: 300
631-
template:
632-
spec:
633-
serviceAccountName: aicr
634-
restartPolicy: Never
635-
nodeSelector:
636-
kubernetes.io/os: linux
637-
hostPID: true
638-
hostNetwork: true
639-
containers:
640-
- name: aicr
641-
image: ${AICR_IMAGE}
642-
command: ["aicr"]
643-
args: ["snapshot", "-o", "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"]
644-
env:
645-
- name: AICR_LOG_PREFIX
646-
value: agent
647-
- name: NODE_NAME
648-
valueFrom:
649-
fieldRef:
650-
fieldPath: spec.nodeName
651-
securityContext:
652-
privileged: true
653-
runAsUser: 0
654-
volumeMounts:
655-
- name: tmp
656-
mountPath: /tmp
657-
- name: run-systemd
658-
mountPath: /run/systemd
659-
readOnly: true
660-
- name: nvidia-smi
661-
mountPath: /usr/bin/nvidia-smi
662-
readOnly: true
663-
volumes:
664-
- name: tmp
665-
emptyDir: {}
666-
- name: run-systemd
667-
hostPath:
668-
path: /run/systemd
669-
type: Directory
670-
- name: nvidia-smi
671-
hostPath:
672-
path: /usr/local/bin/nvidia-smi
673-
type: File
674-
EOF
675-
676-
# Wait for job to complete
677-
if kubectl wait --for=condition=complete job/aicr-e2e-snapshot -n "$SNAPSHOT_NAMESPACE" --timeout=120s > /dev/null 2>&1; then
678-
pass "snapshot/deploy-agent"
627+
if kubectl get cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" > /dev/null 2>&1; then
628+
pass "snapshot/agent"
679629
else
680-
kubectl logs -n "$SNAPSHOT_NAMESPACE" -l job-name=aicr-e2e-snapshot 2>/dev/null || true
681-
fail "snapshot/deploy-agent" "Job did not complete"
630+
echo "$snapshot_output"
631+
fail "snapshot/agent" "Snapshot ConfigMap not created"
682632
return 1
683633
fi
684634

0 commit comments

Comments
 (0)