@@ -601,84 +601,34 @@ test_snapshot() {
601601 msg " =========================================="
602602
603603 if [ " $FAKE_GPU_ENABLED " != " true" ]; then
604- skip " snapshot/deploy- agent" " Fake GPU not enabled"
604+ skip " snapshot/agent" " Fake GPU not enabled"
605605 return 0
606606 fi
607607
608608 # Clean up any existing snapshot
609609 kubectl delete cm " $SNAPSHOT_CM " -n " $SNAPSHOT_NAMESPACE " --ignore-not-found=true > /dev/null 2>&1
610610
611- # Test: Snapshot with deploy-agent using custom Job (with nvidia-smi hostPath)
612- msg " --- Test: Snapshot with deploy-agent ---"
611+ # Test: Snapshot via agent deployment from the CI runner.
612+ # The snapshot command always deploys a Job to capture data on a cluster node.
613+ msg " --- Test: Snapshot via agent deployment ---"
613614 detail " Image: ${AICR_IMAGE} "
614615 detail " Output: cm://${SNAPSHOT_NAMESPACE} /${SNAPSHOT_CM} "
615616
616- # Create a custom Job that mounts nvidia-smi from host
617- echo -e " ${DIM} \$ kubectl apply -f snapshot-job.yaml${NC} "
618- kubectl delete job aicr-e2e-snapshot -n " $SNAPSHOT_NAMESPACE " --ignore-not-found=true > /dev/null 2>&1
619- sleep 2
617+ echo -e " ${DIM} \$ aicr snapshot --image ${AICR_IMAGE} --namespace ${SNAPSHOT_NAMESPACE} -o cm://${SNAPSHOT_NAMESPACE} /${SNAPSHOT_CM}${NC} "
618+ local snapshot_output
619+ snapshot_output=$( " ${AICR_BIN} " snapshot \
620+ --image " ${AICR_IMAGE} " \
621+ --namespace " ${SNAPSHOT_NAMESPACE} " \
622+ --output " cm://${SNAPSHOT_NAMESPACE} /${SNAPSHOT_CM} " \
623+ --timeout 120s \
624+ --privileged \
625+ --node-selector kubernetes.io/os=linux 2>&1 ) || true
620626
621- kubectl apply -f - << EOF
622- apiVersion: batch/v1
623- kind: Job
624- metadata:
625- name: aicr-e2e-snapshot
626- namespace: ${SNAPSHOT_NAMESPACE}
627- spec:
628- completions: 1
629- backoffLimit: 0
630- ttlSecondsAfterFinished: 300
631- template:
632- spec:
633- serviceAccountName: aicr
634- restartPolicy: Never
635- nodeSelector:
636- kubernetes.io/os: linux
637- hostPID: true
638- hostNetwork: true
639- containers:
640- - name: aicr
641- image: ${AICR_IMAGE}
642- command: ["aicr"]
643- args: ["snapshot", "-o", "cm://${SNAPSHOT_NAMESPACE} /${SNAPSHOT_CM} "]
644- env:
645- - name: AICR_LOG_PREFIX
646- value: agent
647- - name: NODE_NAME
648- valueFrom:
649- fieldRef:
650- fieldPath: spec.nodeName
651- securityContext:
652- privileged: true
653- runAsUser: 0
654- volumeMounts:
655- - name: tmp
656- mountPath: /tmp
657- - name: run-systemd
658- mountPath: /run/systemd
659- readOnly: true
660- - name: nvidia-smi
661- mountPath: /usr/bin/nvidia-smi
662- readOnly: true
663- volumes:
664- - name: tmp
665- emptyDir: {}
666- - name: run-systemd
667- hostPath:
668- path: /run/systemd
669- type: Directory
670- - name: nvidia-smi
671- hostPath:
672- path: /usr/local/bin/nvidia-smi
673- type: File
674- EOF
675-
676- # Wait for job to complete
677- if kubectl wait --for=condition=complete job/aicr-e2e-snapshot -n " $SNAPSHOT_NAMESPACE " --timeout=120s > /dev/null 2>&1 ; then
678- pass " snapshot/deploy-agent"
627+ if kubectl get cm " $SNAPSHOT_CM " -n " $SNAPSHOT_NAMESPACE " > /dev/null 2>&1 ; then
628+ pass " snapshot/agent"
679629 else
680- kubectl logs -n " $SNAPSHOT_NAMESPACE " -l job-name=aicr-e2e-snapshot 2> /dev/null || true
681- fail " snapshot/deploy- agent" " Job did not complete "
630+ echo " $snapshot_output "
631+ fail " snapshot/agent" " Snapshot ConfigMap not created "
682632 return 1
683633 fi
684634
0 commit comments