-
Notifications
You must be signed in to change notification settings - Fork 27
Expand file tree
/
Copy pathrun.sh
More file actions
executable file
·2043 lines (1799 loc) · 68.4 KB
/
run.sh
File metadata and controls
executable file
·2043 lines (1799 loc) · 68.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Strict mode: exit on an unhandled non-zero status (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of
# its stages fails (pipefail).
set -euo pipefail
# =============================================================================
# E2E Tests for aicr with Tilt Cluster
# =============================================================================
#
# This script tests the full aicr workflow with a running Kubernetes cluster
# and the aicrd API server (via Tilt).
#
# Prerequisites:
# - Tilt cluster running: make dev-env
# - aicrd accessible at localhost:8080
#
# Usage:
# ./tests/e2e/run.sh
#
# =============================================================================
# Absolute directory of this script, and the directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)"
# Colors
# Stored as literal backslash sequences; they only become escape codes when
# a helper prints them with backslash expansion enabled.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
DIM='\033[2m'
NC='\033[0m' # No Color
# Configuration
# Each setting below except AICR_BIN keeps a value already present in the
# environment and otherwise falls back to the default shown.
aicrd_URL="${aicrd_URL:-http://localhost:8080}"
# A new mktemp directory is created only when OUTPUT_DIR is unset or empty.
OUTPUT_DIR="${OUTPUT_DIR:-$(mktemp -d)}"
# Path of the aicr binary under test; assigned by build_binaries.
AICR_BIN=""
AICR_IMAGE="${AICR_IMAGE:-localhost:5001/aicr:local}"
AICR_VALIDATOR_IMAGE="${AICR_VALIDATOR_IMAGE:-localhost:5001/aicr-validator:local}"
SNAPSHOT_NAMESPACE="${SNAPSHOT_NAMESPACE:-gpu-operator}"
SNAPSHOT_CM="${SNAPSHOT_CM:-aicr-e2e-snapshot}"
# Gates the snapshot/validate tests; setup_fake_gpu sets it to true on success.
FAKE_GPU_ENABLED="${FAKE_GPU_ENABLED:-false}"
# Test counters
# Updated by pass() and fail(); skip() does not touch them.
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0
# =============================================================================
# Helpers
# =============================================================================
# Print an informational line to stdout, tagged with a blue [INFO] marker.
# Arguments: $1 - message text (backslash escapes in it are expanded)
msg() {
  local text=$1
  printf '%b\n' "${BLUE}[INFO]${NC} ${text}"
}
# Print a warning line to stdout, tagged with a yellow [WARN] marker.
# Arguments: $1 - message text (backslash escapes in it are expanded)
warn() {
  local text=$1
  printf '%b\n' "${YELLOW}[WARN]${NC} ${text}"
}
# Print an error line to stdout, tagged with a red [ERROR] marker, then
# terminate the current shell with status 1.
# Arguments: $1 - message text (backslash escapes in it are expanded)
err() {
  local text=$1
  printf '%b\n' "${RED}[ERROR]${NC} ${text}"
  exit 1
}
# Record one passing test and print a green [PASS] line for it.
# Globals:   TOTAL_TESTS, PASSED_TESTS (both incremented by one)
# Arguments: $1 - test name
pass() {
  local test_name=$1
  # The ':' no-op keeps the exit status at 0 whatever the counter value is.
  : "$(( TOTAL_TESTS += 1 ))"
  : "$(( PASSED_TESTS += 1 ))"
  printf '%b\n' "${GREEN}[PASS]${NC} ${test_name}"
}
# Record one failing test and print a red [FAIL] line for it.
# Globals:   TOTAL_TESTS, FAILED_TESTS (both incremented by one)
# Arguments: $1 - test name
#            $2 - optional reason, appended as ": <reason>" when non-empty
fail() {
  local test_name=$1
  local why=${2:-}
  : "$(( TOTAL_TESTS += 1 ))"
  : "$(( FAILED_TESTS += 1 ))"

  local line="${RED}[FAIL]${NC} ${test_name}"
  if [ -n "$why" ]; then
    line="${line}: ${why}"
  fi
  printf '%b\n' "$line"
}
# Print a yellow [SKIP] line for a test that was not run. The test counters
# are left untouched.
# Arguments: $1 - test name
#            $2 - optional reason, appended as ": <reason>" when non-empty
skip() {
  local name=$1
  local reason=${2:-""}
  # The reason is optional, so only add the separator when there is one;
  # otherwise the line would end in a dangling ": ".
  if [ -n "$reason" ]; then
    echo -e "${YELLOW}[SKIP]${NC} $name: $reason"
  else
    echo -e "${YELLOW}[SKIP]${NC} $name"
  fi
}
# Require that a command can be resolved by the shell; otherwise report it
# through err.
# Arguments: $1 - command name to look up
check_command() {
  local tool=$1
  command -v "$tool" > /dev/null 2>&1 && return 0
  err "${tool} is required but not installed"
}
# Show the command being executed as a dimmed "$ ..." trace line, then run it.
# Arguments: $@ - the command and its arguments
# Returns:   the exit status of the command
run_cmd() {
  # Only the color codes go through %b. The command text is printed with %s
  # so backslashes in its arguments appear exactly as they will be passed.
  printf '%b $ %s%b\n' "${DIM}" "$*" "${NC}"
  "$@"
}
# Print a cyan detail line to stdout, prefixed with an arrow.
# Arguments: $1 - detail text (backslash escapes in it are expanded)
detail() {
  local text=$1
  printf '%b\n' "${CYAN} → ${text}${NC}"
}
# =============================================================================
# Build
# =============================================================================
# Build the aicr CLI with `go build` into ${ROOT_DIR}/dist/e2e and remember
# where it landed.
# Globals:   ROOT_DIR (read), AICR_BIN (written)
# Side effects: changes the working directory to ROOT_DIR.
# A failed build or a missing/non-executable binary is reported through err.
build_binaries() {
  msg "=========================================="
  msg "Building binaries"
  msg "=========================================="

  cd "${ROOT_DIR}"

  # Plain `go build` is used here (simpler than goreleaser for e2e tests).
  local out_dir="${ROOT_DIR}/dist/e2e"
  local target="${out_dir}/aicr"
  mkdir -p "${out_dir}"

  go build -o "${target}" ./cmd/aicr 2>&1 || err "Failed to build aicr"

  AICR_BIN="${target}"
  [ -x "$AICR_BIN" ] || err "aicr binary not found at ${AICR_BIN}"

  pass "build/aicr"
  msg "Using: ${AICR_BIN}"
}
# =============================================================================
# API Health Checks
# =============================================================================
# Probe the aicrd /health and /ready endpoints with curl, in that order.
# Globals: aicrd_URL (read)
# Returns: 0 when both probes succeed; 1 as soon as one of them fails
#          (the failure is recorded through fail before returning).
check_api_health() {
  msg "=========================================="
  msg "Checking API health"
  msg "=========================================="

  # Health endpoint
  if ! curl -sf "${aicrd_URL}/health" > /dev/null 2>&1; then
    fail "api/health" "aicrd not responding at ${aicrd_URL}/health"
    warn "Is Tilt running? Try: make dev-env"
    return 1
  fi
  pass "api/health"

  # Ready endpoint
  if ! curl -sf "${aicrd_URL}/ready" > /dev/null 2>&1; then
    fail "api/ready" "aicrd not ready"
    return 1
  fi
  pass "api/ready"

  return 0
}
# =============================================================================
# CLI Recipe Tests (from e2e.md)
# =============================================================================
# Exercise `aicr recipe` through the CLI. Three cases are recorded through
# pass/fail:
#   cli/recipe/query-params  - recipe built from individual flags
#   cli/recipe/criteria-file - recipe built from a RecipeCriteria file
#   cli/recipe/override      - a CLI flag overrides the criteria file
# Globals: OUTPUT_DIR, AICR_BIN (read)
# Outputs: files under ${OUTPUT_DIR}/recipes
test_cli_recipe() {
  msg "=========================================="
  msg "Testing CLI recipe generation"
  msg "=========================================="

  local recipe_dir="${OUTPUT_DIR}/recipes"
  mkdir -p "$recipe_dir"

  # Test 1: Basic recipe with query parameters
  msg "--- Test: Recipe with query parameters ---"
  local basic_recipe="${recipe_dir}/basic.yaml"
  echo -e "${DIM} \$ aicr recipe --service eks --accelerator h100 --os ubuntu --intent training -o basic.yaml${NC}"
  if "${AICR_BIN}" recipe \
    --service eks \
    --accelerator h100 \
    --os ubuntu \
    --intent training \
    --output "$basic_recipe" 2>&1; then
    if [ -f "$basic_recipe" ] && grep -q "kind: RecipeResult" "$basic_recipe"; then
      # Show components from recipe. `grep -c` prints 0 but exits 1 when
      # nothing matches; `|| true` keeps a zero count from tripping
      # `set -e` / `pipefail`.
      local components
      components=$(grep -c "^ - name:" "$basic_recipe" 2>/dev/null || true)
      detail "Generated recipe with ${components} components"
      pass "cli/recipe/query-params"
    else
      fail "cli/recipe/query-params" "Recipe file invalid"
    fi
  else
    fail "cli/recipe/query-params" "Command failed"
  fi

  # Test 2: Recipe from criteria file
  msg "--- Test: Recipe from criteria file ---"
  local criteria_file="${recipe_dir}/criteria.yaml"
  # Quoted delimiter: the document is written literally, with no expansion.
  cat > "$criteria_file" << 'EOF'
kind: RecipeCriteria
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
  name: h100-eks-ubuntu-training
spec:
  service: eks
  accelerator: h100
  os: ubuntu
  intent: training
EOF
  local criteria_recipe="${recipe_dir}/from-criteria.yaml"
  if "${AICR_BIN}" recipe --criteria "$criteria_file" --output "$criteria_recipe" 2>&1; then
    if [ -f "$criteria_recipe" ]; then
      pass "cli/recipe/criteria-file"
    else
      fail "cli/recipe/criteria-file" "Recipe file not created"
    fi
  else
    fail "cli/recipe/criteria-file" "Command failed"
  fi

  # Test 3: CLI flags override criteria file
  msg "--- Test: CLI flags override criteria file ---"
  local override_recipe="${recipe_dir}/override.yaml"
  if "${AICR_BIN}" recipe --criteria "$criteria_file" --service gke --output "$override_recipe" 2>&1; then
    if grep -q "service: gke" "$override_recipe" 2>/dev/null; then
      pass "cli/recipe/override"
    else
      fail "cli/recipe/override" "Override not applied"
    fi
  else
    fail "cli/recipe/override" "Command failed"
  fi
}
# =============================================================================
# API Recipe Tests (from e2e.md)
# =============================================================================
# Exercise the aicrd recipe endpoint. Two cases are recorded through
# pass/fail:
#   api/recipe/GET  - GET  /v1/recipe with query parameters
#   api/recipe/POST - POST /v1/recipe with a YAML RecipeCriteria body
# A case passes when the HTTP status is 200 and the response body is non-empty.
# Globals: OUTPUT_DIR, aicrd_URL (read)
# Outputs: response bodies under ${OUTPUT_DIR}/api-recipes
test_api_recipe() {
  msg "=========================================="
  msg "Testing API recipe endpoints"
  msg "=========================================="

  local recipe_dir="${OUTPUT_DIR}/api-recipes"
  mkdir -p "$recipe_dir"

  # Test 1: GET /v1/recipe with query params
  msg "--- Test: GET /v1/recipe ---"
  echo -e "${DIM} \$ curl ${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training${NC}"
  local get_recipe="${recipe_dir}/get.json"
  local http_code
  # `|| true`: a transport failure (curl exits non-zero) must end up as a
  # recorded test failure below, not as a `set -e` abort of the whole run.
  http_code=$(curl -s -w "%{http_code}" -o "$get_recipe" \
    "${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training") || true
  if [ "$http_code" = "200" ] && [ -s "$get_recipe" ]; then
    detail "HTTP ${http_code} OK"
    pass "api/recipe/GET"
  else
    fail "api/recipe/GET" "HTTP $http_code"
  fi

  # Test 2: POST /v1/recipe with YAML body
  msg "--- Test: POST /v1/recipe ---"
  local post_recipe="${recipe_dir}/post.json"
  http_code=$(curl -s -w "%{http_code}" -o "$post_recipe" \
    -X POST "${aicrd_URL}/v1/recipe" \
    -H "Content-Type: application/x-yaml" \
    -d 'kind: RecipeCriteria
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
  name: h100-training
spec:
  service: eks
  accelerator: h100
  intent: training') || true
  if [ "$http_code" = "200" ] && [ -s "$post_recipe" ]; then
    pass "api/recipe/POST"
  else
    fail "api/recipe/POST" "HTTP $http_code"
  fi
}
# =============================================================================
# CLI Bundle Tests (from e2e.md)
# =============================================================================
test_cli_bundle() {
msg "=========================================="
msg "Testing CLI bundle generation"
msg "=========================================="
# First generate a recipe to use
local recipe_file="${OUTPUT_DIR}/bundle-test-recipe.yaml"
"${AICR_BIN}" recipe \
--service eks \
--accelerator h100 \
--os ubuntu \
--intent training \
--output "$recipe_file" 2>&1 || true
if [ ! -f "$recipe_file" ]; then
fail "cli/bundle/prerequisite" "Could not generate recipe for bundle tests"
return 1
fi
# Test 1: Basic bundle generation
msg "--- Test: Basic bundle ---"
local basic_bundle="${OUTPUT_DIR}/bundles/basic"
mkdir -p "$basic_bundle"
echo -e "${DIM} \$ aicr bundle --recipe recipe.yaml --output bundles/basic${NC}"
if "${AICR_BIN}" bundle \
--recipe "$recipe_file" \
--output "$basic_bundle" 2>&1; then
if [ -f "${basic_bundle}/deploy.sh" ] && [ -f "${basic_bundle}/README.md" ]; then
local file_count
file_count=$(find "$basic_bundle" -type f | wc -l | tr -d ' ')
detail "Generated ${file_count} files in bundle"
# Verify at least one component directory has values.yaml
local comp_count
comp_count=$(find "$basic_bundle" -mindepth 2 -name "values.yaml" | wc -l | tr -d ' ')
if [ "$comp_count" -gt 0 ]; then
detail "Found ${comp_count} component directories"
pass "cli/bundle/basic"
else
fail "cli/bundle/basic" "No component directories with values.yaml"
fi
else
fail "cli/bundle/basic" "Missing deploy.sh or README.md"
fi
else
fail "cli/bundle/basic" "Command failed"
fi
# Test 2: Bundle with node selectors and tolerations
msg "--- Test: Bundle with scheduling options ---"
local sched_bundle="${OUTPUT_DIR}/bundles/scheduling"
mkdir -p "$sched_bundle"
if "${AICR_BIN}" bundle \
--recipe "$recipe_file" \
--output "$sched_bundle" \
--system-node-selector nodeGroup=system-pool \
--accelerated-node-selector nodeGroup=customer-gpu \
--accelerated-node-toleration nvidia.com/gpu=present:NoSchedule 2>&1; then
# Search across all component values files for the node selector
local found_selector=false
for vfile in "${sched_bundle}"/*/values.yaml; do
if [ -f "$vfile" ] && grep -q "system-pool" "$vfile" 2>/dev/null; then
found_selector=true
break
fi
done
if [ "$found_selector" = true ]; then
pass "cli/bundle/scheduling"
else
fail "cli/bundle/scheduling" "Node selector not found in component values"
fi
else
fail "cli/bundle/scheduling" "Command failed"
fi
# Test 3: Bundle with ArgoCD deployer
msg "--- Test: Bundle with ArgoCD deployer ---"
local argocd_bundle="${OUTPUT_DIR}/bundles/argocd"
mkdir -p "$argocd_bundle"
if "${AICR_BIN}" bundle \
--recipe "$recipe_file" \
--output "$argocd_bundle" \
--deployer argocd 2>&1; then
if [ -f "${argocd_bundle}/app-of-apps.yaml" ]; then
pass "cli/bundle/argocd"
else
fail "cli/bundle/argocd" "app-of-apps.yaml not found"
fi
else
fail "cli/bundle/argocd" "Command failed"
fi
# Test 4: Verify bundle integrity (checksums)
msg "--- Test: Bundle integrity ---"
if [ -f "${basic_bundle}/checksums.txt" ]; then
cd "$basic_bundle"
if shasum -a 256 -c checksums.txt > /dev/null 2>&1; then
pass "cli/bundle/integrity"
else
fail "cli/bundle/integrity" "Checksum verification failed"
fi
cd - > /dev/null
else
skip "cli/bundle/integrity" "No checksums.txt"
fi
# Test 5: deploy.sh is executable
msg "--- Test: deploy.sh executable ---"
if [ -x "${basic_bundle}/deploy.sh" ]; then
pass "cli/bundle/deploy-script"
elif [ -f "${basic_bundle}/deploy.sh" ]; then
fail "cli/bundle/deploy-script" "deploy.sh exists but is not executable"
else
fail "cli/bundle/deploy-script" "deploy.sh not found"
fi
}
# =============================================================================
# API Bundle Tests (from e2e.md)
# =============================================================================
# Exercise the recipe -> bundle pipeline over the aicrd API: fetch a recipe
# with GET /v1/recipe, POST it to /v1/bundle?deployer=helm, then check that
# the response is a valid zip archive containing deploy.sh.
# Cases recorded: api/bundle/POST, api/bundle/contents
# Globals: OUTPUT_DIR, aicrd_URL (read)
# Returns: 1 when no recipe could be fetched from the API.
test_api_bundle() {
  msg "=========================================="
  msg "Testing API bundle endpoint"
  msg "=========================================="

  local bundle_dir="${OUTPUT_DIR}/api-bundles"
  mkdir -p "$bundle_dir"

  # Test: POST /v1/bundle (recipe -> bundle pipeline)
  msg "--- Test: POST /v1/bundle ---"
  echo -e "${DIM} \$ curl -X POST ${aicrd_URL}/v1/bundle?deployer=helm -d <recipe>${NC}"

  # First get a recipe from API. `|| true` lets a transport failure fall
  # through to the empty-response check instead of aborting under `set -e`.
  local recipe_json
  recipe_json=$(curl -s "${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training") || true
  if [ -z "$recipe_json" ]; then
    fail "api/bundle/POST" "Could not get recipe from API"
    return 1
  fi

  # Then send to bundle endpoint
  local bundle_zip="${bundle_dir}/bundle.zip"
  local http_code
  http_code=$(curl -s -w "%{http_code}" -o "$bundle_zip" \
    -X POST "${aicrd_URL}/v1/bundle?deployer=helm" \
    -H "Content-Type: application/json" \
    -d "$recipe_json") || true

  if [ "$http_code" = "200" ] && [ -s "$bundle_zip" ]; then
    # Verify it's a valid zip
    if unzip -t "$bundle_zip" > /dev/null 2>&1; then
      pass "api/bundle/POST"
      # Extract and verify contents. -o overwrites without prompting, so a
      # reused OUTPUT_DIR cannot stall the run on an interactive question.
      local extract_dir="${bundle_dir}/extracted"
      mkdir -p "$extract_dir"
      if ! unzip -q -o "$bundle_zip" -d "$extract_dir"; then
        fail "api/bundle/contents" "Could not extract bundle"
      elif [ -f "${extract_dir}/deploy.sh" ]; then
        pass "api/bundle/contents"
      else
        fail "api/bundle/contents" "deploy.sh not in bundle"
      fi
    else
      fail "api/bundle/POST" "Invalid zip file"
    fi
  else
    fail "api/bundle/POST" "HTTP $http_code"
  fi
}
# =============================================================================
# CLI Help Test
# =============================================================================
# Smoke-test the CLI entry points: `aicr -h` and `aicr --version` must both
# exit successfully. Their output is discarded.
# Cases recorded: cli/help, cli/version
# Globals: AICR_BIN (read)
test_cli_help() {
  msg "=========================================="
  msg "Testing CLI help"
  msg "=========================================="

  # Test: aicr -h
  msg "--- Test: aicr -h ---"
  if ! "${AICR_BIN}" -h > /dev/null 2>&1; then
    fail "cli/help" "aicr -h failed"
  else
    pass "cli/help"
  fi

  # Test: aicr --version
  msg "--- Test: aicr --version ---"
  if ! "${AICR_BIN}" --version > /dev/null 2>&1; then
    fail "cli/version" "aicr --version failed"
  else
    pass "cli/version"
  fi
}
# =============================================================================
# Fake GPU Setup (for snapshot tests)
# =============================================================================
# Prepare the cluster for snapshot tests without real GPUs:
#   1. copy tools/fake-nvidia-smi into every container whose name matches
#      "aicr-worker" as /usr/local/bin/nvidia-smi,
#   2. make sure the snapshot namespace exists,
#   3. apply a ServiceAccount plus ClusterRole/ClusterRoleBinding for the
#      snapshot agent.
# Globals: ROOT_DIR, SNAPSHOT_NAMESPACE (read); FAKE_GPU_ENABLED (set to true)
# Returns: 0 on success; 1 when the cluster is unreachable, the fake
#          nvidia-smi script is missing, or no worker container is found.
setup_fake_gpu() {
  msg "=========================================="
  msg "Setting up fake GPU environment"
  msg "=========================================="

  # Check if we can access the cluster
  if ! kubectl cluster-info > /dev/null 2>&1; then
    warn "Cannot access Kubernetes cluster, skipping fake GPU setup"
    return 1
  fi

  # Check if fake-gpu-operator is already running. `kubectl get` with a label
  # selector exits 0 even when nothing matches, so look at the output.
  local operator_pods
  operator_pods=$(kubectl get pods -n gpu-operator \
    -l app.kubernetes.io/name=fake-gpu-operator -o name 2>/dev/null || true)
  if [ -n "$operator_pods" ]; then
    msg "fake-gpu-operator already running"
  fi

  # Inject fake nvidia-smi into Kind worker node
  local fake_smi="${ROOT_DIR}/tools/fake-nvidia-smi"
  if [ ! -f "$fake_smi" ]; then
    warn "Fake nvidia-smi script not found at $fake_smi"
    return 1
  fi

  # Find Kind worker nodes
  local workers
  workers=$(docker ps --filter "name=aicr-worker" --format "{{.Names}}" 2>/dev/null || true)
  if [ -z "$workers" ]; then
    warn "No Kind worker nodes found"
    return 1
  fi

  local worker gpu_info
  local last_worker=""
  # $workers is deliberately unquoted: docker prints one container name per
  # line and each name must become its own loop word.
  for worker in $workers; do
    msg "Injecting fake nvidia-smi into $worker"
    echo -e "${DIM} \$ docker cp fake-nvidia-smi ${worker}:/usr/local/bin/nvidia-smi${NC}"
    docker cp "$fake_smi" "${worker}:/usr/local/bin/nvidia-smi"
    docker exec "$worker" chmod +x /usr/local/bin/nvidia-smi
    # Show what GPU is being simulated (display only, so best-effort)
    gpu_info=$(docker exec "$worker" nvidia-smi -L 2>/dev/null | head -1) || true
    detail "Simulated: ${gpu_info}"
    last_worker=$worker
  done

  # Show driver info, queried from the last worker handled above
  local driver_info
  driver_info=$(docker exec "$last_worker" nvidia-smi --version 2>/dev/null | head -1) || true
  detail "Driver: ${driver_info}"
  pass "setup/fake-nvidia-smi"
  FAKE_GPU_ENABLED=true

  # Create namespace for snapshot tests (if it doesn't exist)
  kubectl create namespace "$SNAPSHOT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

  # Create RBAC for snapshot agent. Unquoted delimiter: ${SNAPSHOT_NAMESPACE}
  # is expanded inside the manifest.
  msg "Creating RBAC for snapshot agent"
  kubectl apply -f - << EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: aicr
  namespace: ${SNAPSHOT_NAMESPACE}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: aicr-e2e-reader
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "configmaps"]
  verbs: ["get", "list", "watch", "create", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: aicr-e2e-reader
subjects:
- kind: ServiceAccount
  name: aicr
  namespace: ${SNAPSHOT_NAMESPACE}
roleRef:
  kind: ClusterRole
  name: aicr-e2e-reader
  apiGroup: rbac.authorization.k8s.io
EOF
  pass "setup/rbac"
  return 0
}
# =============================================================================
# Snapshot Tests (from e2e.md)
# =============================================================================
# Print the text that follows "<key>: " on the first line of a blob that
# contains "<key>:", or a fallback when no line contains it.
# Arguments: $1 - text to search, $2 - key (without the colon), $3 - fallback
_snapshot_field() {
  local data=$1 key=$2 fallback=$3
  local line marker value
  # -m 1 makes grep stop at the first hit itself, so no downstream `head`
  # can close the pipe early and turn a successful lookup into a failure.
  if line=$(grep -m 1 -- "${key}:" <<< "$data"); then
    marker="${key}: "
    value=${line##*"$marker"}
    printf '%s\n' "$value"
  else
    printf '%s\n' "$fallback"
  fi
}

# Run the snapshot agent as a Kubernetes Job and check its result.
# Cases recorded through pass/fail/skip:
#   snapshot/deploy-agent       - the Job reaches condition=complete within 120s
#   snapshot/configmap-created  - the snapshot ConfigMap exists afterwards
#   snapshot/gpu-data           - always passes; warns when no gpu-product is found
# Skipped entirely unless FAKE_GPU_ENABLED is "true".
# Globals: FAKE_GPU_ENABLED, SNAPSHOT_CM, SNAPSHOT_NAMESPACE, AICR_IMAGE (read)
# Returns: 1 when the Job does not complete or the ConfigMap is missing.
test_snapshot() {
  msg "=========================================="
  msg "Testing snapshot collection"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "snapshot/deploy-agent" "Fake GPU not enabled"
    return 0
  fi

  # Clean up any existing snapshot. Best-effort: a failed cleanup must not
  # abort the run; a real problem surfaces in the checks below.
  kubectl delete cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 || true

  # Test: Snapshot with deploy-agent using custom Job (with nvidia-smi hostPath)
  msg "--- Test: Snapshot with deploy-agent ---"
  detail "Image: ${AICR_IMAGE}"
  detail "Output: cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"

  # Create a custom Job that mounts nvidia-smi from host
  echo -e "${DIM} \$ kubectl apply -f snapshot-job.yaml${NC}"
  kubectl delete job aicr-e2e-snapshot -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1 || true
  sleep 2
  # Unquoted delimiter: namespace, image and ConfigMap name are expanded
  # inside the manifest.
  kubectl apply -f - << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: aicr-e2e-snapshot
  namespace: ${SNAPSHOT_NAMESPACE}
spec:
  completions: 1
  backoffLimit: 0
  ttlSecondsAfterFinished: 300
  template:
    spec:
      serviceAccountName: aicr
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/os: linux
      hostPID: true
      hostNetwork: true
      containers:
      - name: aicr
        image: ${AICR_IMAGE}
        command: ["aicr"]
        args: ["snapshot", "-o", "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"]
        env:
        - name: AICR_LOG_PREFIX
          value: agent
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          privileged: true
          runAsUser: 0
        volumeMounts:
        - name: tmp
          mountPath: /tmp
        - name: run-systemd
          mountPath: /run/systemd
          readOnly: true
        - name: nvidia-smi
          mountPath: /usr/bin/nvidia-smi
          readOnly: true
      volumes:
      - name: tmp
        emptyDir: {}
      - name: run-systemd
        hostPath:
          path: /run/systemd
          type: Directory
      - name: nvidia-smi
        hostPath:
          path: /usr/local/bin/nvidia-smi
          type: File
EOF

  # Wait for job to complete
  if kubectl wait --for=condition=complete job/aicr-e2e-snapshot -n "$SNAPSHOT_NAMESPACE" --timeout=120s > /dev/null 2>&1; then
    pass "snapshot/deploy-agent"
  else
    # Dump the agent's logs to help diagnose the failure.
    kubectl logs -n "$SNAPSHOT_NAMESPACE" -l job-name=aicr-e2e-snapshot 2>/dev/null || true
    fail "snapshot/deploy-agent" "Job did not complete"
    return 1
  fi

  # Verify ConfigMap was created
  msg "--- Test: Snapshot ConfigMap ---"
  if kubectl get cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" > /dev/null 2>&1; then
    pass "snapshot/configmap-created"
  else
    fail "snapshot/configmap-created" "ConfigMap not found"
    return 1
  fi

  # Verify snapshot contains GPU data
  msg "--- Test: Snapshot GPU data ---"
  local snapshot_data
  snapshot_data=$(kubectl get cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" -o jsonpath='{.data.snapshot\.yaml}' 2>/dev/null) || true

  # Extract and display GPU info from snapshot
  local gpu_name gpu_count gpu_mem driver_ver cuda_ver
  gpu_name=$(_snapshot_field "$snapshot_data" "gpu-product" "unknown")
  gpu_count=$(_snapshot_field "$snapshot_data" "gpu-count" "0")
  gpu_mem=$(_snapshot_field "$snapshot_data" "gpu-memory" "unknown")
  driver_ver=$(_snapshot_field "$snapshot_data" "driver-version" "unknown")
  cuda_ver=$(_snapshot_field "$snapshot_data" "cuda-version" "unknown")

  if [ -n "$gpu_name" ] && [ "$gpu_name" != "unknown" ]; then
    detail "GPU: ${gpu_name}"
    detail "Count: ${gpu_count}"
    detail "Memory: ${gpu_mem}"
    detail "Driver: ${driver_ver}, CUDA: ${cuda_ver}"
    pass "snapshot/gpu-data"
  else
    # Missing GPU data is tolerated: the case still passes, with a warning.
    warn "No GPU data in snapshot (may be expected without fake-gpu-operator)"
    pass "snapshot/gpu-data"
  fi
}
# =============================================================================
# Recipe from Snapshot Tests (from e2e.md)
# =============================================================================
# Generate a recipe from the ConfigMap snapshot and inspect it.
# Cases recorded through pass/fail/skip:
#   recipe/from-snapshot - the command succeeds and writes a RecipeResult
#   recipe/constraints   - passes whether or not a "constraints:" key is
#                          present (absence only warns); skipped without a file
# Skipped entirely unless FAKE_GPU_ENABLED is "true".
# Globals: FAKE_GPU_ENABLED, OUTPUT_DIR, AICR_BIN, SNAPSHOT_NAMESPACE,
#          SNAPSHOT_CM (read)
test_recipe_from_snapshot() {
  msg "=========================================="
  msg "Testing recipe from snapshot"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "recipe/from-snapshot" "Fake GPU not enabled"
    return 0
  fi

  local recipe_dir="${OUTPUT_DIR}/snapshot-recipes"
  mkdir -p "$recipe_dir"

  # Test: Recipe from ConfigMap snapshot
  msg "--- Test: Recipe from snapshot (cm://...) ---"
  local snapshot_recipe="${recipe_dir}/from-snapshot.yaml"
  echo -e "${DIM} \$ aicr recipe --snapshot cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM} --intent training -o from-snapshot.yaml${NC}"
  if "${AICR_BIN}" recipe \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --intent training \
    --output "$snapshot_recipe" 2>&1; then
    if [ -f "$snapshot_recipe" ] && grep -q "kind: RecipeResult" "$snapshot_recipe"; then
      # Show detected criteria. A single awk prints the second field of the
      # first matching line and exits 0 even when nothing matches, so a
      # recipe without these keys falls through to the "auto" default below
      # instead of tripping `set -e` / `pipefail`.
      local service accelerator
      service=$(awk '/^ service:/ { print $2; exit }' "$snapshot_recipe" 2>/dev/null || true)
      accelerator=$(awk '/^ accelerator:/ { print $2; exit }' "$snapshot_recipe" 2>/dev/null || true)
      detail "Detected: service=${service:-auto}, accelerator=${accelerator:-auto}"
      pass "recipe/from-snapshot"
    else
      fail "recipe/from-snapshot" "Recipe file invalid"
    fi
  else
    fail "recipe/from-snapshot" "Command failed"
  fi

  # Test: View recipe constraints
  msg "--- Test: Recipe constraints ---"
  if [ -f "$snapshot_recipe" ]; then
    if grep -q "constraints:" "$snapshot_recipe" 2>/dev/null; then
      pass "recipe/constraints"
    else
      warn "No constraints in recipe (may be expected)"
      pass "recipe/constraints"
    fi
  else
    skip "recipe/constraints" "No recipe file"
  fi
}
# =============================================================================
# Validate Tests (from e2e.md)
# =============================================================================
# Generate a recipe from the snapshot and run `aicr validate` against the
# same snapshot. The single case validate/recipe is recorded as a pass on
# every branch; the branches differ only in what is reported alongside it.
# Skipped unless FAKE_GPU_ENABLED is "true" or when no recipe can be generated.
# Globals: FAKE_GPU_ENABLED, OUTPUT_DIR, AICR_BIN, SNAPSHOT_NAMESPACE,
#          SNAPSHOT_CM (read)
test_validate() {
  msg "=========================================="
  msg "Testing recipe validation"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "validate/recipe" "Fake GPU not enabled"
    return 0
  fi

  local validate_dir="${OUTPUT_DIR}/validate"
  mkdir -p "$validate_dir"

  # First generate a recipe
  local recipe_file="${validate_dir}/recipe.yaml"
  "${AICR_BIN}" recipe \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --intent training \
    --output "$recipe_file" 2>&1 || true
  if [ ! -f "$recipe_file" ]; then
    skip "validate/recipe" "Could not generate recipe"
    return 0
  fi

  # Test: Validate recipe against snapshot
  msg "--- Test: Validate recipe ---"
  echo -e "${DIM} \$ aicr validate --recipe recipe.yaml --snapshot cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}${NC}"
  local validation_result="${validate_dir}/validation.yaml"
  local validate_output
  validate_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --output "$validation_result" 2>&1) || true

  # The captured output is fed to grep with here-strings rather than
  # `echo | grep -q`: under pipefail, grep -q exiting at its first match can
  # make the writer fail on a closed pipe and turn a match into a miss.
  # NOTE(review): the first branch also fires when only the result file
  # exists, whatever status the output reports - confirm that is intended.
  if [ -f "$validation_result" ] || grep -q "status=pass" <<< "$validate_output"; then
    # Show validation result: digits after the first "passed=", or "?".
    local constraints_passed="?"
    if [[ "$validate_output" =~ passed=([0-9]*) ]]; then
      constraints_passed=${BASH_REMATCH[1]}
    fi
    detail "Validation: PASS (${constraints_passed} constraints checked)"
    pass "validate/recipe"
  elif grep -q "status=fail" <<< "$validate_output"; then
    warn "Validation failed (constraints not met)"
    pass "validate/recipe"
  else
    # Validation may have other issues
    warn "Validation had issues (may be expected)"
    pass "validate/recipe"
  fi
}
# Run `aicr validate --phase <phase>` once and record validate/phase-<phase>
# as passed when the phase name appears anywhere in the combined output.
# Arguments: $1 - phase name passed to --phase
#            $2 - capitalized phase name used in messages
#            $3 - recipe file
#            $4 - optional result file; adds --output when non-empty
_validate_phase_check() {
  local phase=$1 label=$2 recipe_file=$3 result_file=${4:-}

  msg "--- Test: Validate with --phase ${phase} ---"
  echo -e "${DIM} \$ aicr validate --phase ${phase}${NC}"

  local -a cmd=(
    "${AICR_BIN}" validate
    --recipe "$recipe_file"
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"
    --phase "$phase"
  )
  if [ -n "$result_file" ]; then
    cmd+=(--output "$result_file")
  fi

  local output
  output=$("${cmd[@]}" 2>&1) || true
  # Here-string instead of `echo | grep -q`: under pipefail an early grep -q
  # exit can fail the writer and turn a match into a miss on large output.
  if grep -q "$phase" <<< "$output"; then
    detail "${label} phase: PASS"
    pass "validate/phase-${phase}"
  else
    fail "validate/phase-${phase}" "${label} phase not found in output"
  fi
}

# Exercise `aicr validate` with each --phase value.
# Cases recorded through pass/fail/skip:
#   validate/phase-readiness, validate/phase-deployment,
#   validate/phase-performance - the phase name appears in the output
#   validate/phase-all         - at least 3 of the 4 phase names appear
#   validate/result-structure  - `.phases` of the result file mentions
#                                readiness (only when the file exists)
# Skipped unless FAKE_GPU_ENABLED is "true" or when no recipe can be generated.
# Globals: FAKE_GPU_ENABLED, OUTPUT_DIR, AICR_BIN, SNAPSHOT_NAMESPACE,
#          SNAPSHOT_CM (read)
test_validate_multiphase() {
  msg "=========================================="
  msg "Testing multi-phase validation"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "validate/multi-phase" "Fake GPU not enabled"
    return 0
  fi

  local validate_dir="${OUTPUT_DIR}/validate-multiphase"
  mkdir -p "$validate_dir"

  # Generate a recipe for testing
  local recipe_file="${validate_dir}/recipe.yaml"
  "${AICR_BIN}" recipe \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --intent training \
    --output "$recipe_file" 2>&1 || true
  if [ ! -f "$recipe_file" ]; then
    skip "validate/multi-phase" "Could not generate recipe"
    return 0
  fi

  # Test 1: Readiness phase (default); the only single-phase run that also
  # writes a result file.
  _validate_phase_check readiness Readiness "$recipe_file" \
    "${validate_dir}/validation-readiness.yaml"
  # Test 2: Deployment phase
  _validate_phase_check deployment Deployment "$recipe_file"
  # Test 3: Performance phase
  _validate_phase_check performance Performance "$recipe_file"

  # Test 4: All phases
  msg "--- Test: Validate with --phase all ---"
  echo -e "${DIM} \$ aicr validate --phase all${NC}"
  local all_result="${validate_dir}/validation-all.yaml"
  local all_output
  all_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --phase all \
    --output "$all_result" 2>&1) || true

  # Check that all phases are present in the output
  local phases_found=0
  local phase
  for phase in readiness deployment performance conformance; do
    if grep -q "$phase" <<< "$all_output"; then
      phases_found=$((phases_found + 1))
    fi
  done
  if [ "$phases_found" -ge 3 ]; then
    detail "All phases: PASS (found $phases_found phases)"
    pass "validate/phase-all"
  else
    fail "validate/phase-all" "Expected at least 3 phases, found $phases_found"
  fi

  # Test 5: Verify phase result structure
  if [ -f "$all_result" ]; then
    msg "--- Test: Verify phase result structure ---"
    echo -e "${DIM} \$ yq '.phases' validation-all.yaml${NC}"
    if ! command -v yq > /dev/null 2>&1; then
      # Without yq the structure cannot be inspected; that is not a defect
      # in the result file, so report a skip rather than a failure.
      skip "validate/result-structure" "yq is not installed"
    else
      # Check if phases field exists
      local phases_yaml
      phases_yaml=$(yq '.phases' "$all_result" 2>/dev/null) || true
      if grep -q "readiness" <<< "$phases_yaml"; then
        detail "Phase result structure: PASS"
        pass "validate/result-structure"
      else
        fail "validate/result-structure" "phases field not found in result"
      fi
    fi
  fi
}
# =============================================================================
# External Data Tests (--data flag)
# =============================================================================
# =============================================================================
# Deployment Phase Constraint Tests
# =============================================================================
test_validate_deployment_constraints() {
msg "=========================================="
msg "Testing deployment phase constraints"
msg "=========================================="
# Create validation namespace for constraint tests
kubectl create namespace aicr-validation 2>&1 || true
if [ "$FAKE_GPU_ENABLED" != "true" ]; then
skip "validate/deployment-constraints" "Fake GPU not enabled"
return 0
fi
local validate_dir="${OUTPUT_DIR}/validate-deployment"
mkdir -p "$validate_dir"
# Create a fake GPU operator deployment for testing
msg "--- Setup: Create fake GPU operator deployment ---"
kubectl create namespace gpu-operator --dry-run=client -o yaml | kubectl apply -f - 2>&1 || true
cat <<YAML | kubectl apply -f - 2>&1 || true
apiVersion: apps/v1
kind: Deployment
metadata:
name: gpu-operator
namespace: gpu-operator
labels:
app.kubernetes.io/name: gpu-operator
app.kubernetes.io/version: v24.6.0
spec:
replicas: 1
selector:
matchLabels:
app: gpu-operator
template:
metadata:
labels:
app: gpu-operator
spec:
containers:
- name: gpu-operator
image: nvcr.io/nvidia/gpu-operator:v24.6.0
imagePullPolicy: IfNotPresent
YAML
if [ $? -eq 0 ]; then
detail "Created fake GPU operator deployment (v24.6.0)"
else
skip "validate/deployment-constraints" "Could not create GPU operator deployment"
return 0