aicr/tests/e2e/run.sh at 7460763d431950a7e9a2437dca7ca9e482c24c1c · NVIDIA/aicr · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# =============================================================================
# End-to-end tests for aicr against a Tilt cluster
# =============================================================================
#
# Exercises the full aicr workflow against a running Kubernetes cluster and
# the aicrd API server (via Tilt).
#
# Prerequisites:
#   - Tilt cluster running: make dev-env
#   - aicrd accessible at localhost:8080
#
# Usage:
#   ./tests/e2e/run.sh
#
# =============================================================================

# Absolute path of the directory holding this script, and of the directory two
# levels above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "${SCRIPT_DIR}/../.." && pwd)

# ANSI color sequences, stored as literal backslash text and expanded only when
# printed with escape interpretation enabled.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[0;33m"
BLUE="\033[0;34m"
CYAN="\033[0;36m"
DIM="\033[2m"
NC="\033[0m" # reset ("no color")

# Settings. Each one (except AICR_BIN) keeps a non-empty value inherited from
# the environment and otherwise falls back to the default shown here.
aicrd_URL=${aicrd_URL:-http://localhost:8080}
OUTPUT_DIR=${OUTPUT_DIR:-$(mktemp -d)} # fresh temp dir unless caller supplies one
AICR_BIN=                              # filled in later, once the binary is built
AICR_IMAGE=${AICR_IMAGE:-localhost:5001/aicr:local}
AICR_VALIDATOR_IMAGE=${AICR_VALIDATOR_IMAGE:-localhost:5001/aicr-validator:local}
SNAPSHOT_NAMESPACE=${SNAPSHOT_NAMESPACE:-gpu-operator}
SNAPSHOT_CM=${SNAPSHOT_CM:-aicr-e2e-snapshot}
FAKE_GPU_ENABLED=${FAKE_GPU_ENABLED:-false}

# Running tallies maintained by the pass/fail helpers.
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0

# =============================================================================
# Helpers
# =============================================================================

# Print an informational line to stdout, tagged with a blue [INFO] prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
msg() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

# Print a warning line to stdout, tagged with a yellow [WARN] prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

# Print a red [ERROR] line to stdout and terminate the shell with status 1.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
err() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
  exit 1
}

# Record a passing test: bump TOTAL_TESTS and PASSED_TESTS, then print a green
# [PASS] line.
# Arguments: $1 - test name
pass() {
  local label=$1
  : $(( TOTAL_TESTS += 1 ))
  : $(( PASSED_TESTS += 1 ))
  printf '%b\n' "${GREEN}[PASS]${NC} ${label}"
}

# Record a failing test: bump TOTAL_TESTS and FAILED_TESTS, then print a red
# [FAIL] line. The reason is appended as ": <reason>" only when non-empty.
# Arguments: $1 - test name
#            $2 - optional failure reason
fail() {
  local label=$1
  local why=${2:-""}
  : $(( TOTAL_TESTS += 1 ))
  : $(( FAILED_TESTS += 1 ))
  printf '%b\n' "${RED}[FAIL]${NC} ${label}${why:+: ${why}}"
}

# Print a yellow [SKIP] line for a test that was not run. Counters are left
# untouched. The reason is appended as ": <reason>" only when one is given,
# so a missing reason no longer leaves a dangling "name: " in the output.
# Arguments: $1 - test name
#            $2 - optional reason for skipping
skip() {
  local name=$1
  local reason=${2:-""}
  if [ -n "$reason" ]; then
    echo -e "${YELLOW}[SKIP]${NC} $name: $reason"
  else
    echo -e "${YELLOW}[SKIP]${NC} $name"
  fi
}

# Ensure an executable/builtin/function named $1 is resolvable; otherwise hand
# an explanatory message to err.
# Arguments: $1 - command name to look up
check_command() {
  command -v "$1" > /dev/null 2>&1 || err "$1 is required but not installed"
}

# Show command being executed
# Echo a command line (dimmed, prefixed with "$ ") and then execute it.
# Only the color variables are escape-expanded; the command text itself is
# printed verbatim, so arguments containing backslashes (e.g. 'a\tb') are shown
# exactly as they will be passed to the command.
# Arguments: $@ - command and its arguments
# Returns:   exit status of the executed command
run_cmd() {
  printf '%b  $ %s%b\n' "${DIM}" "$*" "${NC}"
  "$@"
}

# Show detail/info line
# Print an indented, cyan "→ ..." detail line to stdout.
# Arguments: $1 - detail text (backslash escapes in it are interpreted)
detail() {
  printf '%b\n' "${CYAN}     → $1${NC}"
}

# =============================================================================
# Build
# =============================================================================

# Compile the aicr CLI into ${ROOT_DIR}/dist/e2e and publish its path.
# Globals:  ROOT_DIR (read), AICR_BIN (written)
# Effects:  changes the working directory to ROOT_DIR; calls err (which
#           terminates) when the build fails or the binary is not executable.
build_binaries() {
  local rule="=========================================="
  msg "$rule"
  msg "Building binaries"
  msg "$rule"

  cd "${ROOT_DIR}"

  # Plain `go build` is used here (simpler than goreleaser for e2e tests).
  local out_dir="${ROOT_DIR}/dist/e2e"
  mkdir -p "${out_dir}"

  go build -o "${out_dir}/aicr" ./cmd/aicr 2>&1 || err "Failed to build aicr"

  AICR_BIN="${out_dir}/aicr"
  [ -x "$AICR_BIN" ] || err "aicr binary not found at ${AICR_BIN}"

  pass "build/aicr"
  msg "Using: ${AICR_BIN}"
}

# =============================================================================
# API Health Checks
# =============================================================================

# Probe the aicrd /health and /ready endpoints in that order.
# Globals: aicrd_URL (read)
# Returns: 0 when both probes succeed; 1 as soon as one fails (the failure is
#          recorded via fail, and a hint is printed for the /health case).
check_api_health() {
  local rule="=========================================="
  msg "$rule"
  msg "Checking API health"
  msg "$rule"

  # Liveness
  if ! curl -sf "${aicrd_URL}/health" > /dev/null 2>&1; then
    fail "api/health" "aicrd not responding at ${aicrd_URL}/health"
    warn "Is Tilt running? Try: make dev-env"
    return 1
  fi
  pass "api/health"

  # Readiness
  if ! curl -sf "${aicrd_URL}/ready" > /dev/null 2>&1; then
    fail "api/ready" "aicrd not ready"
    return 1
  fi
  pass "api/ready"

  return 0
}

# =============================================================================
# CLI Recipe Tests (from e2e.md)
# =============================================================================

# CLI recipe generation tests (from e2e.md). Runs three checks against the
# aicr binary and records each via pass/fail:
#   cli/recipe/query-params  - recipe built from individual flags
#   cli/recipe/criteria-file - recipe built from a RecipeCriteria file
#   cli/recipe/override      - a CLI flag overrides a value from that file
# Globals: AICR_BIN, OUTPUT_DIR, DIM, NC (read)
# Outputs: files under ${OUTPUT_DIR}/recipes
test_cli_recipe() {
  msg "=========================================="
  msg "Testing CLI recipe generation"
  msg "=========================================="

  local recipe_dir="${OUTPUT_DIR}/recipes"
  mkdir -p "$recipe_dir"

  # Test 1: Basic recipe with query parameters
  msg "--- Test: Recipe with query parameters ---"
  local basic_recipe="${recipe_dir}/basic.yaml"
  echo -e "${DIM}  \$ aicr recipe --service eks --accelerator h100 --os ubuntu --intent training -o basic.yaml${NC}"
  if "${AICR_BIN}" recipe \
    --service eks \
    --accelerator h100 \
    --os ubuntu \
    --intent training \
    --output "$basic_recipe" 2>&1; then
    if [ -f "$basic_recipe" ] && grep -q "kind: RecipeResult" "$basic_recipe"; then
      # Count component entries for display only. `grep -c` prints 0 but exits
      # 1 when nothing matches; "|| true" stops that from aborting the whole
      # run under errexit/pipefail.
      local components
      components=$(grep -c "^  - name:" "$basic_recipe" 2>/dev/null || true)
      detail "Generated recipe with ${components} components"
      pass "cli/recipe/query-params"
    else
      fail "cli/recipe/query-params" "Recipe file invalid"
    fi
  else
    fail "cli/recipe/query-params" "Command failed"
  fi

  # Test 2: Recipe from criteria file
  msg "--- Test: Recipe from criteria file ---"
  local criteria_file="${recipe_dir}/criteria.yaml"
  cat > "$criteria_file" << 'EOF'
kind: RecipeCriteria
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
  name: h100-eks-ubuntu-training
spec:
  service: eks
  accelerator: h100
  os: ubuntu
  intent: training
EOF

  local criteria_recipe="${recipe_dir}/from-criteria.yaml"
  if "${AICR_BIN}" recipe --criteria "$criteria_file" --output "$criteria_recipe" 2>&1; then
    if [ -f "$criteria_recipe" ]; then
      pass "cli/recipe/criteria-file"
    else
      fail "cli/recipe/criteria-file" "Recipe file not created"
    fi
  else
    fail "cli/recipe/criteria-file" "Command failed"
  fi

  # Test 3: CLI flags override criteria file (file says eks, flag says gke)
  msg "--- Test: CLI flags override criteria file ---"
  local override_recipe="${recipe_dir}/override.yaml"
  if "${AICR_BIN}" recipe --criteria "$criteria_file" --service gke --output "$override_recipe" 2>&1; then
    if grep -q "service: gke" "$override_recipe" 2>/dev/null; then
      pass "cli/recipe/override"
    else
      fail "cli/recipe/override" "Override not applied"
    fi
  else
    fail "cli/recipe/override" "Command failed"
  fi
}

# =============================================================================
# API Recipe Tests (from e2e.md)
# =============================================================================

# API recipe endpoint tests (from e2e.md). Issues a GET and a POST against
# ${aicrd_URL}/v1/recipe and records each via pass/fail:
#   api/recipe/GET  - query-parameter form
#   api/recipe/POST - RecipeCriteria YAML body
# A check passes when the HTTP status is 200 and the response body is
# non-empty.
# Globals: aicrd_URL, OUTPUT_DIR, DIM, NC (read)
# Outputs: response bodies under ${OUTPUT_DIR}/api-recipes
test_api_recipe() {
  msg "=========================================="
  msg "Testing API recipe endpoints"
  msg "=========================================="

  local recipe_dir="${OUTPUT_DIR}/api-recipes"
  mkdir -p "$recipe_dir"

  # Test 1: GET /v1/recipe with query params
  msg "--- Test: GET /v1/recipe ---"
  echo -e "${DIM}  \$ curl ${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training${NC}"
  local get_recipe="${recipe_dir}/get.json"
  local http_code
  # "|| true": a transport-level curl failure must be recorded as a failed
  # test below, not abort the whole script under errexit. Whatever curl wrote
  # for %{http_code} is still captured.
  http_code=$(curl -s -w "%{http_code}" -o "$get_recipe" \
    "${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training") || true

  if [ "$http_code" = "200" ] && [ -s "$get_recipe" ]; then
    detail "HTTP ${http_code} OK"
    pass "api/recipe/GET"
  else
    fail "api/recipe/GET" "HTTP $http_code"
  fi

  # Test 2: POST /v1/recipe with YAML body
  msg "--- Test: POST /v1/recipe ---"
  local post_recipe="${recipe_dir}/post.json"
  http_code=$(curl -s -w "%{http_code}" -o "$post_recipe" \
    -X POST "${aicrd_URL}/v1/recipe" \
    -H "Content-Type: application/x-yaml" \
    -d 'kind: RecipeCriteria
apiVersion: aicr.nvidia.com/v1alpha1
metadata:
  name: h100-training
spec:
  service: eks
  accelerator: h100
  intent: training') || true

  if [ "$http_code" = "200" ] && [ -s "$post_recipe" ]; then
    pass "api/recipe/POST"
  else
    fail "api/recipe/POST" "HTTP $http_code"
  fi
}

# =============================================================================
# CLI Bundle Tests (from e2e.md)
# =============================================================================

test_cli_bundle() {
  msg "=========================================="
  msg "Testing CLI bundle generation"
  msg "=========================================="

  # First generate a recipe to use
  local recipe_file="${OUTPUT_DIR}/bundle-test-recipe.yaml"
  "${AICR_BIN}" recipe \
    --service eks \
    --accelerator h100 \
    --os ubuntu \
    --intent training \
    --output "$recipe_file" 2>&1 || true

  if [ ! -f "$recipe_file" ]; then
    fail "cli/bundle/prerequisite" "Could not generate recipe for bundle tests"
    return 1
  fi

  # Test 1: Basic bundle generation
  msg "--- Test: Basic bundle ---"
  local basic_bundle="${OUTPUT_DIR}/bundles/basic"
  mkdir -p "$basic_bundle"
  echo -e "${DIM}  \$ aicr bundle --recipe recipe.yaml --output bundles/basic${NC}"
  if "${AICR_BIN}" bundle \
    --recipe "$recipe_file" \
    --output "$basic_bundle" 2>&1; then
    if [ -f "${basic_bundle}/deploy.sh" ] && [ -f "${basic_bundle}/README.md" ]; then
      local file_count
      file_count=$(find "$basic_bundle" -type f | wc -l | tr -d ' ')
      detail "Generated ${file_count} files in bundle"
      # Verify at least one component directory has values.yaml
      local comp_count
      comp_count=$(find "$basic_bundle" -mindepth 2 -name "values.yaml" | wc -l | tr -d ' ')
      if [ "$comp_count" -gt 0 ]; then
        detail "Found ${comp_count} component directories"
        pass "cli/bundle/basic"
      else
        fail "cli/bundle/basic" "No component directories with values.yaml"
      fi
    else
      fail "cli/bundle/basic" "Missing deploy.sh or README.md"
    fi
  else
    fail "cli/bundle/basic" "Command failed"
  fi

  # Test 2: Bundle with node selectors and tolerations
  msg "--- Test: Bundle with scheduling options ---"
  local sched_bundle="${OUTPUT_DIR}/bundles/scheduling"
  mkdir -p "$sched_bundle"
  if "${AICR_BIN}" bundle \
    --recipe "$recipe_file" \
    --output "$sched_bundle" \
    --system-node-selector nodeGroup=system-pool \
    --accelerated-node-selector nodeGroup=customer-gpu \
    --accelerated-node-toleration nvidia.com/gpu=present:NoSchedule 2>&1; then
    # Search across all component values files for the node selector
    local found_selector=false
    for vfile in "${sched_bundle}"/*/values.yaml; do
      if [ -f "$vfile" ] && grep -q "system-pool" "$vfile" 2>/dev/null; then
        found_selector=true
        break
      fi
    done
    if [ "$found_selector" = true ]; then
      pass "cli/bundle/scheduling"
    else
      fail "cli/bundle/scheduling" "Node selector not found in component values"
    fi
  else
    fail "cli/bundle/scheduling" "Command failed"
  fi

  # Test 3: Bundle with ArgoCD deployer
  msg "--- Test: Bundle with ArgoCD deployer ---"
  local argocd_bundle="${OUTPUT_DIR}/bundles/argocd"
  mkdir -p "$argocd_bundle"
  if "${AICR_BIN}" bundle \
    --recipe "$recipe_file" \
    --output "$argocd_bundle" \
    --deployer argocd 2>&1; then
    if [ -f "${argocd_bundle}/app-of-apps.yaml" ]; then
      pass "cli/bundle/argocd"
    else
      fail "cli/bundle/argocd" "app-of-apps.yaml not found"
    fi
  else
    fail "cli/bundle/argocd" "Command failed"
  fi

  # Test 4: Verify bundle integrity (checksums)
  msg "--- Test: Bundle integrity ---"
  if [ -f "${basic_bundle}/checksums.txt" ]; then
    cd "$basic_bundle"
    if shasum -a 256 -c checksums.txt > /dev/null 2>&1; then
      pass "cli/bundle/integrity"
    else
      fail "cli/bundle/integrity" "Checksum verification failed"
    fi
    cd - > /dev/null
  else
    skip "cli/bundle/integrity" "No checksums.txt"
  fi

  # Test 5: deploy.sh is executable
  msg "--- Test: deploy.sh executable ---"
  if [ -x "${basic_bundle}/deploy.sh" ]; then
    pass "cli/bundle/deploy-script"
  elif [ -f "${basic_bundle}/deploy.sh" ]; then
    fail "cli/bundle/deploy-script" "deploy.sh exists but is not executable"
  else
    fail "cli/bundle/deploy-script" "deploy.sh not found"
  fi
}

# =============================================================================
# API Bundle Tests (from e2e.md)
# =============================================================================

# API bundle endpoint test (from e2e.md). Fetches a recipe from
# GET /v1/recipe, posts it to POST /v1/bundle?deployer=helm and records:
#   api/bundle/POST     - HTTP 200 with a non-empty, valid zip body
#   api/bundle/contents - the extracted zip contains deploy.sh
# Globals: aicrd_URL, OUTPUT_DIR, DIM, NC (read)
# Outputs: bundle.zip and extracted/ under ${OUTPUT_DIR}/api-bundles
# Returns: 1 when no recipe could be fetched from the API
test_api_bundle() {
  msg "=========================================="
  msg "Testing API bundle endpoint"
  msg "=========================================="

  local bundle_dir="${OUTPUT_DIR}/api-bundles"
  mkdir -p "$bundle_dir"

  # Test: POST /v1/bundle (recipe -> bundle pipeline)
  msg "--- Test: POST /v1/bundle ---"
  echo -e "${DIM}  \$ curl -X POST ${aicrd_URL}/v1/bundle?deployer=helm -d <recipe>${NC}"

  # First get a recipe from API. "|| true" lets a curl transport failure fall
  # through to the empty-response check below instead of aborting the script
  # under errexit.
  local recipe_json
  recipe_json=$(curl -s "${aicrd_URL}/v1/recipe?service=eks&accelerator=h100&intent=training") || true

  if [ -z "$recipe_json" ]; then
    fail "api/bundle/POST" "Could not get recipe from API"
    return 1
  fi

  # Then send to bundle endpoint (same "|| true" reasoning as above; the
  # status code curl printed is still captured and reported).
  local bundle_zip="${bundle_dir}/bundle.zip"
  local http_code
  http_code=$(curl -s -w "%{http_code}" -o "$bundle_zip" \
    -X POST "${aicrd_URL}/v1/bundle?deployer=helm" \
    -H "Content-Type: application/json" \
    -d "$recipe_json") || true

  if [ "$http_code" = "200" ] && [ -s "$bundle_zip" ]; then
    # Verify it's a valid zip
    if unzip -t "$bundle_zip" > /dev/null 2>&1; then
      pass "api/bundle/POST"

      # Extract and verify contents. -o overwrites without prompting, so a
      # reused OUTPUT_DIR with files from an earlier run cannot stall the run.
      local extract_dir="${bundle_dir}/extracted"
      mkdir -p "$extract_dir"
      unzip -q -o "$bundle_zip" -d "$extract_dir"
      if [ -f "${extract_dir}/deploy.sh" ]; then
        pass "api/bundle/contents"
      else
        fail "api/bundle/contents" "deploy.sh not in bundle"
      fi
    else
      fail "api/bundle/POST" "Invalid zip file"
    fi
  else
    fail "api/bundle/POST" "HTTP $http_code"
  fi
}

# =============================================================================
# CLI Help Test
# =============================================================================

# Smoke-test the aicr binary's -h and --version flags. Each flag is run with
# its output discarded; exit status alone decides pass or fail.
# Globals: AICR_BIN (read)
test_cli_help() {
  local rule="=========================================="
  msg "$rule"
  msg "Testing CLI help"
  msg "$rule"

  # Each entry is "<flag>:<test name>".
  local entry flag name
  for entry in "-h:cli/help" "--version:cli/version"; do
    flag=${entry%%:*}
    name=${entry#*:}
    msg "--- Test: aicr ${flag} ---"
    if "${AICR_BIN}" "$flag" > /dev/null 2>&1; then
      pass "$name"
    else
      fail "$name" "aicr ${flag} failed"
    fi
  done
}

# =============================================================================
# Fake GPU Setup (for snapshot tests)
# =============================================================================

# Prepare a fake GPU environment for the snapshot tests:
#   1. confirm the cluster is reachable,
#   2. note whether fake-gpu-operator pods exist in the gpu-operator namespace,
#   3. copy tools/fake-nvidia-smi into every container whose name matches
#      "aicr-worker" as /usr/local/bin/nvidia-smi,
#   4. create the snapshot namespace plus a ServiceAccount, ClusterRole and
#      ClusterRoleBinding for the snapshot agent.
# Globals: ROOT_DIR, SNAPSHOT_NAMESPACE, DIM, NC (read); FAKE_GPU_ENABLED (set
#          to true once nvidia-smi has been injected)
# Returns: 0 on success; 1 when the cluster is unreachable, no worker
#          container is found, or the fake nvidia-smi script is missing
setup_fake_gpu() {
  msg "=========================================="
  msg "Setting up fake GPU environment"
  msg "=========================================="

  # Check if we can access the cluster
  if ! kubectl cluster-info > /dev/null 2>&1; then
    warn "Cannot access Kubernetes cluster, skipping fake GPU setup"
    return 1
  fi

  # Check if fake-gpu-operator is already running. The listing itself is
  # inspected: a label query that matches nothing still exits 0, so the exit
  # status alone cannot tell whether any pod exists.
  local operator_pods
  operator_pods=$(kubectl get pods -n gpu-operator \
    -l app.kubernetes.io/name=fake-gpu-operator -o name 2>/dev/null || true)
  if [ -n "$operator_pods" ]; then
    msg "fake-gpu-operator already running"
  fi

  # Inject fake nvidia-smi into Kind worker node
  local fake_smi="${ROOT_DIR}/tools/fake-nvidia-smi"
  if [ -f "$fake_smi" ]; then
    # Find Kind worker nodes
    local workers worker gpu_info driver_info
    workers=$(docker ps --filter "name=aicr-worker" --format "{{.Names}}" 2>/dev/null || true)
    if [ -n "$workers" ]; then
      # Intentionally unquoted: one container name per whitespace-separated word.
      for worker in $workers; do
        msg "Injecting fake nvidia-smi into $worker"
        echo -e "${DIM}  \$ docker cp fake-nvidia-smi ${worker}:/usr/local/bin/nvidia-smi${NC}"
        docker cp "$fake_smi" "${worker}:/usr/local/bin/nvidia-smi"
        docker exec "$worker" chmod +x /usr/local/bin/nvidia-smi
        # Show what GPU is being simulated. Display only, hence best-effort:
        # a failing probe must not abort the run under errexit/pipefail.
        gpu_info=$(docker exec "$worker" nvidia-smi -L 2>/dev/null | head -1 || true)
        detail "Simulated: ${gpu_info}"
      done
      # Show driver info (queried from the last worker visited by the loop;
      # display only, best-effort as above)
      driver_info=$(docker exec "$worker" nvidia-smi --version 2>/dev/null | head -1 || true)
      detail "Driver: ${driver_info}"
      pass "setup/fake-nvidia-smi"
      FAKE_GPU_ENABLED=true
    else
      warn "No Kind worker nodes found"
      return 1
    fi
  else
    warn "Fake nvidia-smi script not found at $fake_smi"
    return 1
  fi

  # Create namespace for snapshot tests (if it doesn't exist); rendering with
  # --dry-run and piping into apply keeps this step repeatable.
  kubectl create namespace "$SNAPSHOT_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -

  # Create RBAC for snapshot agent
  msg "Creating RBAC for snapshot agent"
  kubectl apply -f - << EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: aicr
  namespace: ${SNAPSHOT_NAMESPACE}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: aicr-e2e-reader
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "configmaps"]
  verbs: ["get", "list", "watch", "create", "update", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: aicr-e2e-reader
subjects:
- kind: ServiceAccount
  name: aicr
  namespace: ${SNAPSHOT_NAMESPACE}
roleRef:
  kind: ClusterRole
  name: aicr-e2e-reader
  apiGroup: rbac.authorization.k8s.io
EOF
  pass "setup/rbac"

  return 0
}

# =============================================================================
# Snapshot Tests (from e2e.md)
# =============================================================================

# Snapshot collection tests (from e2e.md). When FAKE_GPU_ENABLED is "true",
# runs a Job that executes `aicr snapshot` with the host's
# /usr/local/bin/nvidia-smi mounted in, then records:
#   snapshot/deploy-agent      - the Job reached condition=complete in 120s
#   snapshot/configmap-created - the target ConfigMap exists
#   snapshot/gpu-data          - always recorded as passed; GPU fields are
#                                shown when present, a warning otherwise
# Globals: FAKE_GPU_ENABLED, SNAPSHOT_NAMESPACE, SNAPSHOT_CM, AICR_IMAGE,
#          DIM, NC (read)
# Returns: 0 when skipped or completed; 1 if the Job or ConfigMap check fails
test_snapshot() {
  local rule="=========================================="
  msg "$rule"
  msg "Testing snapshot collection"
  msg "$rule"

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "snapshot/deploy-agent" "Fake GPU not enabled"
    return 0
  fi

  local job="aicr-e2e-snapshot"

  # Drop any ConfigMap left behind by a previous run.
  kubectl delete cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1

  # Test: Snapshot with deploy-agent using custom Job (with nvidia-smi hostPath)
  msg "--- Test: Snapshot with deploy-agent ---"
  detail "Image: ${AICR_IMAGE}"
  detail "Output: cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"

  # Remove a stale Job of the same name, pause briefly, then submit the Job.
  printf '%b\n' "${DIM}  \$ kubectl apply -f snapshot-job.yaml${NC}"
  kubectl delete job "$job" -n "$SNAPSHOT_NAMESPACE" --ignore-not-found=true > /dev/null 2>&1
  sleep 2

  kubectl apply -f - << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: aicr-e2e-snapshot
  namespace: ${SNAPSHOT_NAMESPACE}
spec:
  completions: 1
  backoffLimit: 0
  ttlSecondsAfterFinished: 300
  template:
    spec:
      serviceAccountName: aicr
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/os: linux
      hostPID: true
      hostNetwork: true
      containers:
      - name: aicr
        image: ${AICR_IMAGE}
        command: ["aicr"]
        args: ["snapshot", "-o", "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"]
        env:
        - name: AICR_LOG_PREFIX
          value: agent
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        securityContext:
          privileged: true
          runAsUser: 0
        volumeMounts:
        - name: tmp
          mountPath: /tmp
        - name: run-systemd
          mountPath: /run/systemd
          readOnly: true
        - name: nvidia-smi
          mountPath: /usr/bin/nvidia-smi
          readOnly: true
      volumes:
      - name: tmp
        emptyDir: {}
      - name: run-systemd
        hostPath:
          path: /run/systemd
          type: Directory
      - name: nvidia-smi
        hostPath:
          path: /usr/local/bin/nvidia-smi
          type: File
EOF

  # Block until the Job completes; on timeout or error dump its logs and stop.
  if ! kubectl wait --for=condition=complete "job/${job}" -n "$SNAPSHOT_NAMESPACE" --timeout=120s > /dev/null 2>&1; then
    kubectl logs -n "$SNAPSHOT_NAMESPACE" -l "job-name=${job}" 2>/dev/null || true
    fail "snapshot/deploy-agent" "Job did not complete"
    return 1
  fi
  pass "snapshot/deploy-agent"

  # The agent is expected to have written its result into the ConfigMap.
  msg "--- Test: Snapshot ConfigMap ---"
  if ! kubectl get cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" > /dev/null 2>&1; then
    fail "snapshot/configmap-created" "ConfigMap not found"
    return 1
  fi
  pass "snapshot/configmap-created"

  # Pull the snapshot.yaml payload and pick out the GPU-related fields.
  msg "--- Test: Snapshot GPU data ---"
  local snap_yaml
  snap_yaml=$(kubectl get cm "$SNAPSHOT_CM" -n "$SNAPSHOT_NAMESPACE" -o jsonpath='{.data.snapshot\.yaml}' 2>/dev/null)

  # For each field: first matching line, with everything up to and including
  # the key stripped.
  local product count memory driver cuda
  product=$(echo "$snap_yaml" | grep "gpu-product:" | head -1 | sed 's/.*gpu-product: //' || echo "unknown")
  count=$(echo "$snap_yaml" | grep "gpu-count:" | head -1 | sed 's/.*gpu-count: //' || echo "0")
  memory=$(echo "$snap_yaml" | grep "gpu-memory:" | head -1 | sed 's/.*gpu-memory: //' || echo "unknown")
  driver=$(echo "$snap_yaml" | grep "driver-version:" | head -1 | sed 's/.*driver-version: //' || echo "unknown")
  cuda=$(echo "$snap_yaml" | grep "cuda-version:" | head -1 | sed 's/.*cuda-version: //' || echo "unknown")

  # Missing GPU data only produces a warning; the check passes either way.
  if [ -z "$product" ] || [ "$product" = "unknown" ]; then
    warn "No GPU data in snapshot (may be expected without fake-gpu-operator)"
  else
    detail "GPU: ${product}"
    detail "Count: ${count}"
    detail "Memory: ${memory}"
    detail "Driver: ${driver}, CUDA: ${cuda}"
  fi
  pass "snapshot/gpu-data"
}

# =============================================================================
# Recipe from Snapshot Tests (from e2e.md)
# =============================================================================

# Recipe-from-snapshot tests (from e2e.md). When FAKE_GPU_ENABLED is "true",
# builds a recipe from the snapshot ConfigMap and records:
#   recipe/from-snapshot - command succeeded and produced a RecipeResult
#   recipe/constraints   - always recorded as passed when a recipe file exists
#                          (a warning is printed if it has no constraints)
# Globals: FAKE_GPU_ENABLED, AICR_BIN, OUTPUT_DIR, SNAPSHOT_NAMESPACE,
#          SNAPSHOT_CM, DIM, NC (read)
# Outputs: ${OUTPUT_DIR}/snapshot-recipes/from-snapshot.yaml
test_recipe_from_snapshot() {
  msg "=========================================="
  msg "Testing recipe from snapshot"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "recipe/from-snapshot" "Fake GPU not enabled"
    return 0
  fi

  local recipe_dir="${OUTPUT_DIR}/snapshot-recipes"
  mkdir -p "$recipe_dir"

  # Test: Recipe from ConfigMap snapshot
  msg "--- Test: Recipe from snapshot (cm://...) ---"
  local snapshot_recipe="${recipe_dir}/from-snapshot.yaml"
  echo -e "${DIM}  \$ aicr recipe --snapshot cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM} --intent training -o from-snapshot.yaml${NC}"
  if "${AICR_BIN}" recipe \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --intent training \
    --output "$snapshot_recipe" 2>&1; then
    if [ -f "$snapshot_recipe" ] && grep -q "kind: RecipeResult" "$snapshot_recipe"; then
      # Show detected criteria: second field of the first matching line.
      # A single awk (plus "|| true") replaces a grep|head|awk pipeline whose
      # non-zero status on "no match" would abort the script under
      # errexit/pipefail before the ":-auto" fallback below could apply.
      local service accelerator
      service=$(awk '/^  service:/ {print $2; exit}' "$snapshot_recipe" 2>/dev/null || true)
      accelerator=$(awk '/^  accelerator:/ {print $2; exit}' "$snapshot_recipe" 2>/dev/null || true)
      detail "Detected: service=${service:-auto}, accelerator=${accelerator:-auto}"
      pass "recipe/from-snapshot"
    else
      fail "recipe/from-snapshot" "Recipe file invalid"
    fi
  else
    fail "recipe/from-snapshot" "Command failed"
  fi

  # Test: View recipe constraints
  msg "--- Test: Recipe constraints ---"
  if [ -f "$snapshot_recipe" ]; then
    if grep -q "constraints:" "$snapshot_recipe" 2>/dev/null; then
      pass "recipe/constraints"
    else
      warn "No constraints in recipe (may be expected)"
      pass "recipe/constraints"
    fi
  else
    skip "recipe/constraints" "No recipe file"
  fi
}

# =============================================================================
# Validate Tests (from e2e.md)
# =============================================================================

# Recipe validation test (from e2e.md). When FAKE_GPU_ENABLED is "true",
# generates a recipe from the snapshot ConfigMap and runs `aicr validate`
# against the same snapshot. The validate/recipe check is recorded as passed
# whatever the validation outcome; only the accompanying message differs:
#   - output contains "status=fail"            -> warning (constraints not met)
#   - result file exists or "status=pass" seen -> "Validation: PASS" detail
#   - anything else                            -> generic warning
# An explicit "status=fail" is checked first so that the mere existence of the
# result file is never reported as a PASS.
# Globals: FAKE_GPU_ENABLED, AICR_BIN, OUTPUT_DIR, SNAPSHOT_NAMESPACE,
#          SNAPSHOT_CM, DIM, NC (read)
# Outputs: recipe.yaml and validation.yaml under ${OUTPUT_DIR}/validate
test_validate() {
  msg "=========================================="
  msg "Testing recipe validation"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "validate/recipe" "Fake GPU not enabled"
    return 0
  fi

  local validate_dir="${OUTPUT_DIR}/validate"
  mkdir -p "$validate_dir"

  # First generate a recipe. Failure is tolerated here and detected by the
  # file check right below.
  local recipe_file="${validate_dir}/recipe.yaml"
  "${AICR_BIN}" recipe \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --intent training \
    --output "$recipe_file" 2>&1 || true

  if [ ! -f "$recipe_file" ]; then
    skip "validate/recipe" "Could not generate recipe"
    return 0
  fi

  # Test: Validate recipe against snapshot
  msg "--- Test: Validate recipe ---"
  echo -e "${DIM}  \$ aicr validate --recipe recipe.yaml --snapshot cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}${NC}"
  local validation_result="${validate_dir}/validation.yaml"
  local validate_output
  validate_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}" \
    --output "$validation_result" 2>&1) || true

  # Here-strings are used instead of `echo | grep -q`: with pipefail, grep -q
  # exiting early can make the pipeline report failure despite a match.
  if grep -q "status=fail" <<< "$validate_output"; then
    warn "Validation failed (constraints not met)"
  elif [ -f "$validation_result" ] || grep -q "status=pass" <<< "$validate_output"; then
    # Show validation result; "?" when no passed=<n> counter was printed.
    local constraints_passed="?"
    if [[ "$validate_output" =~ passed=([0-9]*) ]]; then
      constraints_passed=${BASH_REMATCH[1]}
    fi
    detail "Validation: PASS (${constraints_passed} constraints checked)"
  else
    # Validation may have other issues
    warn "Validation had issues (may be expected)"
  fi
  pass "validate/recipe"
}

# Exercises `aicr validate --phase <name>` for readiness, deployment,
# performance and "all", then inspects the result file of the "all" run.
#
# A recipe is generated from the snapshot ConfigMap first and every validate
# call runs against it. A phase check passes when the combined stdout/stderr
# of the command mentions the phase name. CLI exit codes are deliberately
# ignored (`|| true`); only the captured output and result file are asserted.
#
# Globals read: FAKE_GPU_ENABLED, OUTPUT_DIR, AICR_BIN, SNAPSHOT_NAMESPACE,
#               SNAPSHOT_CM, DIM, NC
# Reports via:  msg, detail, pass, fail, skip
# Files:        writes under ${OUTPUT_DIR}/validate-multiphase
test_validate_multiphase() {
  msg "=========================================="
  msg "Testing multi-phase validation"
  msg "=========================================="

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "validate/multi-phase" "Fake GPU not enabled"
    return 0
  fi

  local validate_dir="${OUTPUT_DIR}/validate-multiphase"
  mkdir -p "$validate_dir"

  local snapshot_ref="cm://${SNAPSHOT_NAMESPACE}/${SNAPSHOT_CM}"

  # Generate a recipe for testing. Any recipe left over from an earlier run is
  # removed first: file existence is the only success signal checked below, so
  # a stale file would hide a failed generation.
  local recipe_file="${validate_dir}/recipe.yaml"
  rm -f -- "$recipe_file"
  "${AICR_BIN}" recipe \
    --snapshot "$snapshot_ref" \
    --intent training \
    --output "$recipe_file" 2>&1 || true

  if [ ! -f "$recipe_file" ]; then
    skip "validate/multi-phase" "Could not generate recipe"
    return 0
  fi

  # Output is matched with [[ ... == *literal* ]] rather than
  # `echo ... | grep -q`: with `set -o pipefail`, grep -q exiting on the first
  # match can SIGPIPE the echo on large output and turn a match into a failure.

  # Test 1: Readiness phase (default)
  msg "--- Test: Validate with --phase readiness ---"
  echo -e "${DIM}  \$ aicr validate --phase readiness${NC}"
  local readiness_result="${validate_dir}/validation-readiness.yaml"
  local readiness_output
  readiness_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "$snapshot_ref" \
    --phase readiness \
    --output "$readiness_result" 2>&1) || true

  if [[ "$readiness_output" == *"readiness"* ]]; then
    detail "Readiness phase: PASS"
    pass "validate/phase-readiness"
  else
    fail "validate/phase-readiness" "Readiness phase not found in output"
  fi

  # Test 2: Deployment phase
  msg "--- Test: Validate with --phase deployment ---"
  echo -e "${DIM}  \$ aicr validate --phase deployment${NC}"
  local deployment_output
  deployment_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "$snapshot_ref" \
    --phase deployment 2>&1) || true

  if [[ "$deployment_output" == *"deployment"* ]]; then
    detail "Deployment phase: PASS"
    pass "validate/phase-deployment"
  else
    fail "validate/phase-deployment" "Deployment phase not found in output"
  fi

  # Test 3: Performance phase
  msg "--- Test: Validate with --phase performance ---"
  echo -e "${DIM}  \$ aicr validate --phase performance${NC}"
  local performance_output
  performance_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "$snapshot_ref" \
    --phase performance 2>&1) || true

  if [[ "$performance_output" == *"performance"* ]]; then
    detail "Performance phase: PASS"
    pass "validate/phase-performance"
  else
    fail "validate/phase-performance" "Performance phase not found in output"
  fi

  # Test 4: All phases
  msg "--- Test: Validate with --phase all ---"
  echo -e "${DIM}  \$ aicr validate --phase all${NC}"
  local all_result="${validate_dir}/validation-all.yaml"
  local all_output
  # Drop any stale result so Test 5 only ever inspects this run's file.
  rm -f -- "$all_result"
  all_output=$("${AICR_BIN}" validate \
    --recipe "$recipe_file" \
    --snapshot "$snapshot_ref" \
    --phase all \
    --output "$all_result" 2>&1) || true

  # Count how many of the four known phase names appear in the output.
  local phases_found=0
  local phase_name
  for phase_name in readiness deployment performance conformance; do
    if [[ "$all_output" == *"$phase_name"* ]]; then
      phases_found=$((phases_found + 1))
    fi
  done

  # Three of the four names are enough for a pass.
  if (( phases_found >= 3 )); then
    detail "All phases: PASS (found $phases_found phases)"
    pass "validate/phase-all"
  else
    fail "validate/phase-all" "Expected at least 3 phases, found $phases_found"
  fi

  # Test 5: Verify phase result structure
  if [ ! -f "$all_result" ]; then
    # Report the omission instead of silently dropping the test.
    skip "validate/result-structure" "Result file not written"
    return 0
  fi

  msg "--- Test: Verify phase result structure ---"
  echo -e "${DIM}  \$ yq '.phases' validation-all.yaml${NC}"

  # Check if phases field exists. Tool problems get their own message so they
  # are not mistaken for a malformed result file.
  local phases_section=""
  if ! command -v yq >/dev/null 2>&1; then
    fail "validate/result-structure" "yq not found in PATH"
  elif ! phases_section=$(yq '.phases' "$all_result"); then
    fail "validate/result-structure" "yq could not read result file"
  elif [[ "$phases_section" == *"readiness"* ]]; then
    detail "Phase result structure: PASS"
    pass "validate/result-structure"
  else
    fail "validate/result-structure" "phases field not found in result"
  fi
}

# =============================================================================
# External Data Tests (--data flag)
# =============================================================================


# =============================================================================
# Deployment Phase Constraint Tests
# =============================================================================

test_validate_deployment_constraints() {
  msg "=========================================="
  msg "Testing deployment phase constraints"
  msg "=========================================="

  # Create validation namespace for constraint tests
  kubectl create namespace aicr-validation 2>&1 || true

  if [ "$FAKE_GPU_ENABLED" != "true" ]; then
    skip "validate/deployment-constraints" "Fake GPU not enabled"
    return 0
  fi

  local validate_dir="${OUTPUT_DIR}/validate-deployment"
  mkdir -p "$validate_dir"

  # Create a fake GPU operator deployment for testing
  msg "--- Setup: Create fake GPU operator deployment ---"
  kubectl create namespace gpu-operator --dry-run=client -o yaml | kubectl apply -f - 2>&1 || true

  cat <<YAML | kubectl apply -f - 2>&1 || true
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-operator
  namespace: gpu-operator
  labels:
    app.kubernetes.io/name: gpu-operator
    app.kubernetes.io/version: v24.6.0
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-operator
  template:
    metadata:
      labels:
        app: gpu-operator
    spec:
      containers:
      - name: gpu-operator
        image: nvcr.io/nvidia/gpu-operator:v24.6.0
        imagePullPolicy: IfNotPresent
YAML

  if [ $? -eq 0 ]; then
    detail "Created fake GPU operator deployment (v24.6.0)"
  else
    skip "validate/deployment-constraints" "Could not create GPU operator deployment"
    return 0