forked from wso2/agent-manager
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinstall.sh
More file actions
executable file
·1536 lines (1336 loc) · 54.1 KB
/
install.sh
File metadata and controls
executable file
·1536 lines (1336 loc) · 54.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
set -euo pipefail
# ============================================================================
# OpenChoreo Development Environment Setup
# ============================================================================
# This script provides a comprehensive, idempotent installation that:
# 1. Creates a k3d cluster
# 2. Installs OpenChoreo (Control Plane, Data Plane, Workflow Plane, Observability Plane)
# 3. Registers planes and configures observability
# 4. Installs Agent Management Platform
#
# The script is idempotent - it can be run multiple times safely.
# Only public helm charts are used - no local charts or custom images.
# ============================================================================
# Configuration
CLUSTER_NAME="amp-local"
# kubectl context name that k3d generates for this cluster ("k3d-<name>")
CLUSTER_CONTEXT="k3d-${CLUSTER_NAME}"
OPENCHOREO_VERSION="1.0.1-hotfix.1"
# Absolute directory of this script (works when invoked via a relative path)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
K3D_CONFIG="${SCRIPT_DIR}/k3d-config.yaml"
# WSO2 API Platform / Gateway Operator versions
GATEWAY_OPERATOR_VERSION="0.6.0"
GATEWAY_CHART_VERSION="1.0.1"
# Source AMP installation helpers
# NOTE(review): install-helpers.sh presumably defines VERSION, DATA_PLANE_NS
# and BUILD_CI_NS referenced later in this script -- confirm; under `set -u`
# the script aborts if any of them is unset.
source "${SCRIPT_DIR}/install-helpers.sh"
# Timeouts (in seconds)
TIMEOUT_K3D_READY=60
TIMEOUT_CONTROL_PLANE=600
TIMEOUT_DATA_PLANE=600
TIMEOUT_BUILD_PLANE=600
TIMEOUT_OBSERVABILITY_PLANE=900
# Colors for output (8-bit mode for maximum compatibility)
RED='\033[1;31m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
NC='\033[0m'  # reset / no color
# Helper functions
log_info() {
    # Informational message: "ℹ" marker, uncolored (NC only).
    printf '%b\n' "${NC}ℹ${NC} $1"
}
log_success() {
    # Success message: green check mark followed by the caller's text.
    printf '%b\n' "${GREEN}✓${NC} $1"
}
log_warning() {
    # Warning message: yellow warning sign followed by the caller's text.
    printf '%b\n' "${YELLOW}⚠${NC} $1"
}
log_error() {
    # Error message: red cross followed by the caller's text.
    printf '%b\n' "${RED}✗${NC} $1"
}
log_step() {
    # Print a section banner: blank line, rule, title, rule, blank line.
    local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
    printf '\n'
    printf '%b\n' "${NC}${rule}${NC}"
    printf '%b\n' "${NC}$1${NC}"
    printf '%b\n' "${NC}${rule}${NC}"
    printf '\n'
}
# Check if command exists
command_exists() {
    # Succeeds iff "$1" resolves to a runnable command (binary, builtin,
    # function, or alias). Produces no output either way.
    if command -v "$1" &>/dev/null; then
        return 0
    fi
    return 1
}
# Check if a port is available
# Check if a TCP port is free to bind.
# $1 - port number. Returns 0 when no LISTEN socket holds the port, 1 when
# the port is in use. If neither /proc/net/tcp* nor lsof is usable, assumes
# available (the prerequisite check verifies lsof exists separately).
check_port_available() {
    local port=$1
    local hex_port
    # /proc/net/tcp stores the local port as 4 uppercase hex digits.
    hex_port=$(printf "%04X" "${port}")
    # Prefer /proc/net/tcp{,6} (reliable inside Linux containers where
    # lsof may not exist); socket state 0A == LISTEN.
    local proc_files=()
    [ -r /proc/net/tcp ] && proc_files+=(/proc/net/tcp)
    [ -r /proc/net/tcp6 ] && proc_files+=(/proc/net/tcp6)
    if [ ${#proc_files[@]} -gt 0 ]; then
        if grep -qE ":${hex_port} .* 0A " "${proc_files[@]}" 2>/dev/null; then
            return 1 # Port is in use
        fi
        return 0 # Port is available
    fi
    # Fall back to lsof for macOS. Previously a missing lsof produced a
    # noisy "command not found" and was silently treated as "available";
    # make that fallback explicit and quiet.
    if ! command -v lsof >/dev/null 2>&1; then
        return 0 # Cannot probe; assume available
    fi
    if lsof -iTCP:"${port}" -sTCP:LISTEN -Pn >/dev/null 2>&1; then
        return 1 # Port is in use
    fi
    return 0 # Port is available
}
# Check all required ports are available
# Verify every host port the k3d cluster maps is free before creating it.
# Returns 0 when all ports are available, 1 (after logging the busy ones)
# otherwise.
check_required_ports() {
    # Host ports and their consumers:
    # 3000  - AMP Console UI
    # 8080  - kgateway HTTP (Thunder auth + OpenChoreo API routing)
    # 8443  - kgateway HTTPS
    # 8084  - AI Gateway HTTP
    # 8243  - AI Gateway HTTPS
    # 9000  - AMP API service
    # 9098  - AMP Traces Observer
    # 9243  - AMP Internal API endpoint
    # 10082 - Container Registry (Workflow Plane)
    # 11080 - Observer API (Observability Plane)
    # 11082 - OpenSearch API
    # 11085 - OpenSearch HTTPS
    # 19080 - Data Plane Gateway HTTP (agent workloads)
    # 19443 - Data Plane Gateway HTTPS
    # 21893 - OTel Collector
    # 22893 - Observability Gateway HTTP
    # 22894 - Observability Gateway HTTPS
    local needed=(3000 8080 8443 8084 8243 9000 9098 9243 10082 11080 11082 11085 19080 19443 21893 22893 22894)
    local busy=()
    local p
    for p in "${needed[@]}"; do
        check_port_available "$p" || busy+=("$p")
    done
    if [ ${#busy[@]} -eq 0 ]; then
        return 0
    fi
    log_error "The following required ports are already in use: ${busy[*]}"
    log_info "Please free these ports before running the installer."
    log_info "To find processes using these ports, run: lsof -i :<port>"
    return 1
}
# Wait for k3d cluster to be ready
wait_for_k3d_cluster() {
# Wait until the named k3d cluster reports running AND its kubeconfig
# context is usable, merging the kubeconfig as needed.
# $1 - cluster name, $2 - timeout in seconds.
# Side effect: may reassign the global CLUSTER_CONTEXT if only a
# differently-named k3d context turns out to be reachable.
local cluster_name=$1
local timeout=$2
local elapsed=0
log_info "Waiting for k3d cluster '${cluster_name}' to be ready..."
while true; do
# Check if cluster exists and get its status
# NOTE(review): plain grep also matches superstring names (e.g.
# 'amp-local-2'); consider an anchored/word match -- confirm first.
CLUSTER_LINE=$(k3d cluster list 2>/dev/null | grep "${cluster_name}" || echo "")
# Check if cluster is running - k3d shows status in various formats
# Format can be: "amp-local 1/1 0/0 true" or "amp-local running"
if [ -n "${CLUSTER_LINE}" ]; then
# Check for "running" text or "true" status (which indicates running)
if echo "${CLUSTER_LINE}" | grep -qE "(running|true)" || \
echo "${CLUSTER_LINE}" | grep -qE "[0-9]+/[0-9]+.*true"; then
# Give k3d a moment to register the kubeconfig context
sleep 2
# Always try to merge kubeconfig to ensure it's up to date
k3d kubeconfig merge "${cluster_name}" --kubeconfig-merge-default 2>/dev/null || true
sleep 2
# Check if context exists in kubeconfig
# NOTE(review): '&>/dev/null 2>&1' is redundant -- '&>' already
# redirects both streams; harmless but could be simplified.
if kubectl config get-contexts "${CLUSTER_CONTEXT}" &>/dev/null 2>&1; then
# Set context
kubectl config use-context "${CLUSTER_CONTEXT}" &>/dev/null 2>&1 || true
# Verify cluster is actually accessible (try multiple methods)
# Method 1: cluster-info without context flag (uses current context)
if kubectl cluster-info &>/dev/null 2>&1; then
return 0
fi
# Method 2: cluster-info with context flag
if kubectl cluster-info --context "${CLUSTER_CONTEXT}" &>/dev/null 2>&1; then
return 0
fi
# Method 3: Try a simple get nodes command
if kubectl get nodes &>/dev/null 2>&1; then
return 0
fi
else
# Context doesn't exist yet, continue waiting
# elapsed advances in steps of 2, so this logs every 10s.
if [ $((elapsed % 10)) -eq 0 ]; then
log_info "Context ${CLUSTER_CONTEXT} not yet available, waiting... (${elapsed}s elapsed)"
fi
fi
fi
fi
if [ $elapsed -ge $timeout ]; then
log_error "Cluster not ready after ${timeout}s"
log_info "Cluster status: ${CLUSTER_LINE:-not found}"
log_info "Available contexts:"
kubectl config get-contexts 2>/dev/null || true
log_info "Expected context: ${CLUSTER_CONTEXT}"
log_info "Trying to merge kubeconfig one more time..."
k3d kubeconfig merge "${cluster_name}" --kubeconfig-merge-default 2>&1 || true
sleep 2
log_info "Contexts after merge:"
kubectl config get-contexts 2>/dev/null || true
# Try one last time with any k3d context
if kubectl config get-contexts 2>/dev/null | grep -q "k3d"; then
# NOTE(review): awk '{print $2}' assumes the current-context '*'
# marker occupies field 1; for non-current rows field positions
# shift and $2 may be the wrong column -- confirm.
K3D_CTX=$(kubectl config get-contexts --no-headers 2>/dev/null | grep "k3d" | awk '{print $2}' | head -1)
if [ -n "${K3D_CTX}" ]; then
log_info "Trying with context: ${K3D_CTX}"
kubectl config use-context "${K3D_CTX}" 2>/dev/null || true
if kubectl cluster-info &>/dev/null 2>&1; then
log_warning "Cluster accessible with context ${K3D_CTX}, but expected ${CLUSTER_CONTEXT}"
# Update CLUSTER_CONTEXT to match
CLUSTER_CONTEXT="${K3D_CTX}"
return 0
fi
fi
fi
return 1
fi
sleep 2
elapsed=$((elapsed + 2))
done
}
# Wait for kubectl to be ready (assumes context is already set)
# Poll `kubectl cluster-info` until it succeeds or the timeout elapses.
# Assumes the kubectl context is already set. $1 - timeout in seconds.
wait_for_kubectl() {
    local timeout=$1
    local waited=0
    log_info "Waiting for kubectl to be ready..."
    until kubectl cluster-info &>/dev/null; do
        if [ "$waited" -ge "$timeout" ]; then
            log_error "kubectl not ready after ${timeout}s"
            return 1
        fi
        sleep 2
        waited=$((waited + 2))
    done
    return 0
}
# Install helm chart with idempotency check
# Install a helm release unless it already exists (idempotent).
# $1 release name, $2 chart reference, $3 namespace, $4 timeout in seconds;
# any remaining arguments are forwarded verbatim to `helm install`.
# Returns 0 on success or when already installed, 1 on install failure.
helm_install_idempotent() {
    local release=$1 chart=$2 ns=$3 secs=$4
    shift 4
    # `helm status` succeeds iff the release exists in the namespace.
    if helm status "${release}" -n "${ns}" &>/dev/null; then
        log_info "${release} already installed, skipping..."
        return 0
    fi
    log_info "Installing ${release}..."
    log_info "This may take several minutes..."
    if ! helm install "${release}" "${chart}" \
        --namespace "${ns}" \
        --create-namespace \
        --timeout "${secs}s" \
        "$@"; then
        log_error "Failed to install ${release}"
        return 1
    fi
    log_success "${release} installed successfully"
    return 0
}
# Wait for workloads to be ready in a namespace.
# Uses deployment and statefulset waits instead of raw pod waits to avoid a
# race condition where kubectl wait captures a Job pod in Running phase, then
# hangs indefinitely when it transitions to Succeeded (since Succeeded pods
# never become Ready).
wait_for_pods() {
# Wait for workloads in a namespace to become ready.
# $1 namespace, $2 timeout in seconds, $3 optional pod label selector.
# Failures are logged as warnings only; this function never hard-fails.
local namespace=$1
local timeout=$2
local selector=${3:-""}
local all_ready=true
if [ -n "$selector" ]; then
log_info "Waiting for pods (${selector}) in ${namespace} to be ready (timeout: ${timeout}s)..."
# Exclude Succeeded pods: completed Job pods never turn Ready and
# would hang this wait (see comment above the function).
kubectl wait --for=condition=Ready pod -l "${selector}" --field-selector=status.phase!=Succeeded -n "${namespace}" --timeout="${timeout}s" || {
log_warning "Some pods may still be starting (non-fatal)"
all_ready=false
}
else
log_info "Waiting for workloads in ${namespace} to be ready (timeout: ${timeout}s)..."
# Deployments: wait on the Available condition.
kubectl wait --for=condition=Available deployment --all -n "${namespace}" --timeout="${timeout}s" 2>/dev/null || {
log_warning "Some deployments may still be starting (non-fatal)"
all_ready=false
}
# StatefulSets: use rollout status (no Available condition exists).
for sts in $(kubectl get statefulset -n "${namespace}" -o name 2>/dev/null); do
kubectl rollout status "${sts}" -n "${namespace}" --timeout="${timeout}s" 2>/dev/null || {
log_warning "StatefulSet ${sts} may still be starting (non-fatal)"
all_ready=false
}
done
# Jobs: a Failed job will never become Complete, so skip its wait.
for job in $(kubectl get jobs -n "${namespace}" -o name 2>/dev/null); do
if kubectl get "${job}" -n "${namespace}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null | grep -q '^True$'; then
log_warning "Job ${job} has failed; skipping Complete wait (non-fatal)"
all_ready=false
continue
fi
kubectl wait --for=condition=Complete "${job}" -n "${namespace}" --timeout="${timeout}s" 2>/dev/null || {
log_warning "Job ${job} may still be running (non-fatal)"
all_ready=false
}
done
fi
if [ "${all_ready}" = true ]; then
log_success "Workloads in ${namespace} are ready"
else
log_warning "Workloads in ${namespace} are running but some were not fully ready"
fi
}
# Wait for deployments to be available
# Block until every deployment in a namespace reports Available.
# $1 namespace, $2 timeout in seconds. Always returns 0 (a timeout is
# logged as a non-fatal warning).
wait_for_deployments() {
    local ns=$1 secs=$2
    log_info "Waiting for deployments in ${ns} to be available (timeout: ${secs}s)..."
    if ! kubectl wait --for=condition=Available deployment --all -n "${ns}" --timeout="${secs}s"; then
        log_warning "Some deployments may still be starting (non-fatal)"
        return 0
    fi
    log_success "Deployments in ${ns} are available"
}
# Wait for statefulsets to be ready
# Wait for every statefulset in a namespace to finish rolling out.
# $1 namespace, $2 timeout in seconds. Rollout timeouts are non-fatal.
wait_for_statefulsets() {
    local ns=$1 secs=$2
    local sts
    log_info "Waiting for statefulsets in ${ns} to be ready (timeout: ${secs}s)..."
    for sts in $(kubectl get statefulset -n "${ns}" -o name 2>/dev/null); do
        kubectl rollout status "${sts}" -n "${ns}" --timeout="${secs}s" \
            || log_warning "StatefulSet ${sts} may still be starting (non-fatal)"
    done
    log_success "Statefulsets in ${ns} are ready"
}
# Wait for a job to complete
# Wait for a Kubernetes Job to appear and then complete.
# $1 job name, $2 namespace, $3 overall timeout in seconds.
# Returns 0 when the job completes, 1 if it never appears or times out.
wait_for_job() {
    local job_name=$1
    local namespace=$2
    local timeout=$3
    local interval=5
    local elapsed=0
    log_info "Waiting for job '${job_name}' in ${namespace} to exist (timeout: ${timeout}s)..."
    # Phase 1: poll for the Job object (an operator may create it some time
    # after the chart install returns). Fix: quote ${CLUSTER_CONTEXT} so an
    # empty/whitespace value cannot break word-splitting of the command.
    while [ $elapsed -lt $timeout ]; do
        if kubectl get job/"${job_name}" -n "${namespace}" --context "${CLUSTER_CONTEXT}" &>/dev/null; then
            log_info "Job '${job_name}' found, waiting for completion..."
            break
        fi
        sleep $interval
        elapsed=$((elapsed + interval))
    done
    if [ $elapsed -ge $timeout ]; then
        log_warning "Job '${job_name}' not found after ${timeout}s"
        return 1
    fi
    # Phase 2: wait for completion with whatever budget remains (min 10s).
    local remaining_timeout=$((timeout - elapsed))
    if [ $remaining_timeout -lt 10 ]; then
        remaining_timeout=10
    fi
    if kubectl wait --for=condition=Complete job/"${job_name}" -n "${namespace}" --context "${CLUSTER_CONTEXT}" --timeout="${remaining_timeout}s" 2>/dev/null; then
        log_success "Job '${job_name}' completed successfully"
        return 0
    else
        log_warning "Job '${job_name}' did not complete in time"
        return 1
    fi
}
# Wait for a secret to exist
# Poll until a Secret exists in a namespace.
# $1 namespace, $2 secret name, $3 optional timeout in seconds (default 120).
# Returns 0 once the secret is visible, 1 on timeout.
wait_for_secret() {
    local ns=$1
    local secret=$2
    local limit=${3:-120}
    local step=5
    local spent=0
    log_info "Waiting for secret '${secret}' in ${ns} (timeout: ${limit}s)..."
    while [ "$spent" -lt "$limit" ]; do
        if kubectl get secret "${secret}" -n "${ns}" &>/dev/null; then
            log_success "Secret '${secret}' is ready"
            return 0
        fi
        sleep "$step"
        spent=$((spent + step))
    done
    log_warning "Timeout waiting for secret '${secret}' in ${ns}"
    return 1
}
# Copy cluster-gateway-ca certificate resources from control plane to a target plane namespace
create_plane_cert_resources() {
# Copy the control plane's cluster-gateway CA into a target plane namespace
# as a configmap so that plane's components can trust the cluster gateway.
# $1 - target namespace (created if absent). Returns 1 on any failure.
local target_namespace=$1
log_info "Creating certificate resources in ${target_namespace}..."
# Create namespace if it doesn't exist (dry-run | apply is idempotent)
kubectl create namespace "${target_namespace}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null
# Wait for cert-manager to issue the cluster-gateway CA
log_info "Waiting for cluster-gateway-ca certificate to be ready..."
if ! kubectl wait -n openchoreo-control-plane \
--for=condition=Ready certificate/cluster-gateway-ca --timeout=120s 2>/dev/null; then
log_warning "Timeout waiting for cluster-gateway-ca certificate"
return 1
fi
# Get CA certificate from control plane secret
# NOTE(review): CA_CRT is not declared 'local', so it leaks into the
# global scope -- confirm nothing later relies on it before scoping it.
CA_CRT=$(kubectl get secret cluster-gateway-ca \
-n openchoreo-control-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d)
if [ -z "$CA_CRT" ]; then
log_warning "Could not retrieve CA certificate from control plane"
return 1
fi
# Create configmap in target namespace (dry-run | apply keeps it idempotent)
if kubectl create configmap cluster-gateway-ca \
--from-literal=ca.crt="$CA_CRT" \
-n "${target_namespace}" --dry-run=client -o yaml | kubectl apply -f - 2>/dev/null; then
log_success "cluster-gateway-ca configmap created in ${target_namespace}"
else
log_error "Failed to create cluster-gateway-ca configmap in ${target_namespace}"
return 1
fi
return 0
}
# ============================================================================
# MAIN INSTALLATION FLOW
# ============================================================================
log_step "OpenChoreo Development Environment Setup"
# Verify Docker is pointing at the dedicated 'agent-manager' Colima profile.
# Detection works from inside the dev container because `docker info` queries
# the daemon: Colima names the VM node after its profile ('colima' for the
# default profile, 'colima-<profile>' otherwise). Non-Colima Docker setups
# (Docker Desktop, Rancher, Linux) fall through this check.
# Constant consumed by check_colima_profile below.
EXPECTED_COLIMA_PROFILE="agent-manager"
# Enforce that a Colima-backed Docker daemon runs under the expected profile.
# Non-Colima daemons (node name not 'colima'/'colima-*') pass unconditionally.
check_colima_profile() {
    local daemon_node
    daemon_node=$(docker info --format '{{.Name}}' 2>/dev/null || echo "")
    case "${daemon_node}" in
        colima|colima-*)
            # A Colima VM -- fall through to the profile comparison.
            ;;
        *)
            # Docker Desktop / Rancher / native Linux: nothing to enforce.
            return 0
            ;;
    esac
    if [ "${daemon_node}" = "colima-${EXPECTED_COLIMA_PROFILE}" ]; then
        log_success "Colima profile '${EXPECTED_COLIMA_PROFILE}' is active"
        return 0
    fi
    log_error "Expected Colima profile '${EXPECTED_COLIMA_PROFILE}', got '${daemon_node}'."
    log_info "Start it with: colima start --profile ${EXPECTED_COLIMA_PROFILE} --vm-type=vz --vz-rosetta --network-address --cpu 4 --memory 8"
    return 1
}
# Check and fix Docker permissions
check_docker_permissions() {
local docker_sock="/var/run/docker.sock"
if [ ! -S "${docker_sock}" ]; then
log_error "Docker socket not found at ${docker_sock}"
log_info "Make sure Docker is running and the socket is mounted"
return 1
fi
# Check if we can access Docker
if docker ps &>/dev/null; then
log_success "Docker access verified"
check_colima_profile || return 1
return 0
fi
# Try to fix permissions
log_warning "Docker socket permissions issue detected. Attempting to fix..."
if sudo chmod 666 "${docker_sock}" 2>/dev/null; then
log_success "Docker socket permissions fixed"
return 0
else
log_error "Cannot fix Docker socket permissions. Please run: sudo chmod 666 ${docker_sock}"
return 1
fi
}
# Check prerequisites
log_step "Step 1/13: Verifying prerequisites"
# Docker must be usable before anything else.
if ! check_docker_permissions; then
    log_error "Docker permission check failed"
    exit 1
fi
# Required CLI tools; messages match the per-tool checks they replace.
for tool in k3d kubectl helm curl lsof; do
    if ! command_exists "${tool}"; then
        log_error "${tool} is not installed"
        exit 1
    fi
done
# Port availability only matters when the cluster does not exist yet.
if ! k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then
    log_info "Checking required ports are available..."
    check_required_ports || exit 1
    log_success "All required ports are available"
fi
log_success "All prerequisites verified"
# ============================================================================
# Step 2: Setup k3d Cluster
# ============================================================================
log_step "Step 2/13: Setting up k3d cluster"
# Check if cluster already exists
# NOTE(review): plain grep also matches superstring cluster names
# (e.g. 'amp-local-2') -- confirm before tightening the match.
if k3d cluster list 2>/dev/null | grep -q "${CLUSTER_NAME}"; then
log_info "k3d cluster '${CLUSTER_NAME}' already exists"
# Check cluster status - k3d shows status in various formats
CLUSTER_LINE=$(k3d cluster list 2>/dev/null | grep "${CLUSTER_NAME}" || echo "")
if [ -n "${CLUSTER_LINE}" ] && (echo "${CLUSTER_LINE}" | grep -qE "(running|true)" || \
echo "${CLUSTER_LINE}" | grep -qE "[0-9]+/[0-9]+.*true"); then
CLUSTER_STATUS="running"
else
CLUSTER_STATUS="stopped"
fi
if [ "${CLUSTER_STATUS}" = "running" ]; then
log_info "Cluster is running, verifying access..."
# Set context first (might not be set yet)
kubectl config use-context "${CLUSTER_CONTEXT}" 2>/dev/null || true
# Verify cluster is accessible
if kubectl cluster-info --context "${CLUSTER_CONTEXT}" &>/dev/null; then
log_success "Cluster is running and accessible"
else
log_info "Cluster is running but not accessible yet. Merging kubeconfig and waiting..."
# Merge kubeconfig to ensure context is available
k3d kubeconfig merge "${CLUSTER_NAME}" --kubeconfig-merge-default 2>/dev/null || true
sleep 2
if ! wait_for_k3d_cluster "${CLUSTER_NAME}" "${TIMEOUT_K3D_READY}"; then
log_error "Cluster failed to become ready"
exit 1
fi
fi
else
log_info "Cluster exists but is not running. Starting cluster..."
k3d cluster start "${CLUSTER_NAME}"
# Merge kubeconfig to ensure context is available
log_info "Merging k3d kubeconfig..."
k3d kubeconfig merge "${CLUSTER_NAME}" --kubeconfig-merge-default 2>/dev/null || true
sleep 2
# Wait for cluster to be fully ready (context registered and API accessible)
if ! wait_for_k3d_cluster "${CLUSTER_NAME}" "${TIMEOUT_K3D_READY}"; then
log_error "Cluster failed to become ready"
exit 1
fi
log_success "Cluster is now ready"
fi
# Ensure context is set
kubectl config use-context "${CLUSTER_CONTEXT}" || {
log_error "Failed to set kubectl context"
exit 1
}
log_success "Using existing cluster"
else
# Fresh install: create the cluster from the checked-in k3d config.
log_info "Creating k3d cluster..."
# Create shared directory for OpenChoreo
mkdir -p /tmp/k3d-shared
# Create k3d cluster
if k3d cluster create --config "${K3D_CONFIG}"; then
log_success "k3d cluster created successfully"
else
log_error "Failed to create k3d cluster"
exit 1
fi
# Merge kubeconfig to ensure context is available
log_info "Merging k3d kubeconfig..."
k3d kubeconfig merge "${CLUSTER_NAME}" --kubeconfig-merge-default 2>/dev/null || true
sleep 2
# Set kubectl context
kubectl config use-context "${CLUSTER_CONTEXT}" || {
log_error "Failed to set kubectl context"
exit 1
}
# Wait for cluster to be ready
if wait_for_kubectl "${TIMEOUT_K3D_READY}"; then
log_success "Cluster is ready"
else
log_error "Cluster failed to become ready"
exit 1
fi
log_info "Cluster info:"
kubectl cluster-info --context "${CLUSTER_CONTEXT}"
echo ""
log_info "Cluster nodes:"
kubectl get nodes
fi
# ============================================================================
# Step 3: Apply CoreDNS Custom Configuration
# ============================================================================
log_step "Step 3/13: Applying CoreDNS Custom Configuration"
# VERSION is expected to come from install-helpers.sh (sourced at the top).
# Assert it explicitly so a missing value fails with a clear message here,
# instead of the terse `set -u` "unbound variable" abort.
: "${VERSION:?VERSION must be set (expected from install-helpers.sh)}"
log_info "Applying CoreDNS custom configuration for OpenChoreo and AMP..."
COREDNS_FILE="https://raw.githubusercontent.com/wso2/agent-manager/amp/v${VERSION}/deployments/k8s/coredns-amp-custom.yaml"
if kubectl apply -f "${COREDNS_FILE}"; then
    log_success "CoreDNS custom configuration applied successfully"
else
    log_error "Failed to apply CoreDNS custom configuration"
    exit 1
fi
# ============================================================================
# Step 4: Generate Machine IDs for observability
# ============================================================================
log_step "Step 4/13: Generating Machine IDs for observability"
log_info "Generating Machine IDs for Fluent Bit observability..."
# Extract this cluster's node names from `k3d node list` JSON without jq.
NODES=$(k3d node list -o json | grep -o '"name"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"name"[[:space:]]*:[[:space:]]*"//;s/"$//' | grep "^k3d-$CLUSTER_NAME-")
if [[ -z "$NODES" ]]; then
    log_error "Could not retrieve node list"
else
    for NODE in $NODES; do
        log_info "Generating machine ID for ${NODE}..."
        # Write a random /etc/machine-id inside the node container.
        # Fix: quote "${NODE}" (previously unquoted in the docker exec call).
        if docker exec "${NODE}" sh -c "cat /proc/sys/kernel/random/uuid | tr -d '-' > /etc/machine-id" 2>/dev/null; then
            log_success "Machine ID generated for ${NODE}"
        else
            log_error "Could not generate Machine ID for ${NODE}"
        fi
    done
fi
log_success "Machine ID generation complete"
# ============================================================================
# Step 5: Install Cluster Prerequisites
# ============================================================================
log_step "Step 5/13: Installing Cluster Prerequisites (Cert Manager, Gateway API CRDs, External Secrets, kgateway)"
# Cert Manager: issues the inter-plane TLS certificates used in later steps.
log_info "Installing Cert Manager..."
helm_install_idempotent \
    "cert-manager" \
    "oci://quay.io/jetstack/charts/cert-manager" \
    "cert-manager" \
    300 \
    --version v1.19.2 \
    --set crds.enabled=true
wait_for_pods "cert-manager" 300
# Install Gateway API CRDs (experimental channel).
log_info "Installing Gateway API CRDs..."
GATEWAY_API_CRD="https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.4.1/experimental-install.yaml"
if kubectl apply --server-side --force-conflicts -f "${GATEWAY_API_CRD}" &>/dev/null; then
    log_success "Gateway API CRDs applied successfully"
else
    log_error "Failed to apply Gateway API CRDs"
    exit 1
fi
# External Secrets Operator: syncs secrets from OpenBao into the cluster.
# Fix throughout this step: quote "${CLUSTER_CONTEXT}" (previously unquoted).
log_info "Installing External Secret Operator..."
if helm upgrade --install external-secrets oci://ghcr.io/external-secrets/charts/external-secrets \
    --kube-context "${CLUSTER_CONTEXT}" \
    --namespace external-secrets \
    --create-namespace \
    --version 1.3.2 \
    --set installCRDs=true \
    --timeout 180s &>/dev/null; then
    log_success "External Secret Operator installed successfully"
else
    log_error "Failed to install External Secret Operator"
    exit 1
fi
log_info "Waiting for External Secret Operator to be ready..."
if kubectl wait --for=condition=Available deployment/external-secrets -n external-secrets --context "${CLUSTER_CONTEXT}" --timeout=180s 2>/dev/null; then
    log_success "External Secret Operator is ready"
else
    log_warning "External Secret Operator may still be starting (non-fatal)"
fi
wait_for_pods "external-secrets" 180
# kgateway CRDs + controller (API gateway for the control plane).
log_info "Installing kgateway CRDs..."
if helm upgrade --install kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds \
    --kube-context "${CLUSTER_CONTEXT}" \
    --namespace openchoreo-control-plane \
    --create-namespace \
    --version v2.2.1 \
    --timeout 180s &>/dev/null; then
    log_success "kgateway CRDs installed successfully"
else
    log_error "Failed to install kgateway CRDs"
    exit 1
fi
log_info "Installing kgateway..."
if helm upgrade --install kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway \
    --kube-context "${CLUSTER_CONTEXT}" \
    --namespace openchoreo-control-plane \
    --create-namespace \
    --version v2.2.1 \
    --set controller.extraEnv.KGW_ENABLE_GATEWAY_API_EXPERIMENTAL_FEATURES=true \
    --timeout 180s &>/dev/null; then
    log_success "kgateway installed successfully"
else
    log_error "Failed to install kgateway"
    exit 1
fi
# ============================================================================
# Step 6: Setup Secrets (OpenBao for Workflow Plane)
# ============================================================================
log_step "Step 6/13: Setup Secrets (OpenBao for Workflow Plane)"
# OpenBao backs the External Secrets ClusterSecretStore configured below.
# Fix in this step: quote "${CLUSTER_CONTEXT}" (previously unquoted).
# NOTE(review): VERSION is presumably set by install-helpers.sh -- confirm.
log_info "Installing OpenBao for Workflow Plane..."
if helm upgrade --install openbao oci://ghcr.io/openbao/charts/openbao \
    --kube-context "${CLUSTER_CONTEXT}" \
    --namespace openbao \
    --create-namespace \
    --version 0.25.6 \
    --values "https://raw.githubusercontent.com/wso2/agent-manager/amp/v${VERSION}/deployments/single-cluster/values-openbao.yaml" \
    --timeout 180s &>/dev/null; then
    log_success "OpenBao installed successfully"
else
    log_error "Failed to install OpenBao"
    exit 1
fi
log_info "Waiting for OpenBao to be ready..."
if kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=openbao -n openbao --context "${CLUSTER_CONTEXT}" --timeout=120s 2>/dev/null; then
    log_success "OpenBao is ready"
else
    log_warning "OpenBao may still be starting (non-fatal)"
fi
# Point External Secrets at OpenBao's KV v2 engine via a ClusterSecretStore.
log_info "Configuring External Secrets ClusterSecretStore for OpenBao..."
if kubectl --context "${CLUSTER_CONTEXT}" apply -f - <<EOF
apiVersion: v1
kind: ServiceAccount
metadata:
  name: external-secrets-openbao
  namespace: openbao
---
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
  name: default
spec:
  provider:
    vault:
      server: "http://openbao.openbao.svc:8200"
      path: "secret"
      version: "v2"
      auth:
        kubernetes:
          mountPath: "kubernetes"
          role: "openchoreo-secret-writer-role"
          serviceAccountRef:
            name: "external-secrets-openbao"
            namespace: "openbao"
EOF
then
    log_success "External Secrets ClusterSecretStore configured for OpenBao"
else
    log_error "Failed to configure ClusterSecretStore for OpenBao"
    exit 1
fi
# ============================================================================
# Step 7: Install OpenChoreo Control Plane
# ============================================================================
log_step "Step 7/13: Installing OpenChoreo Control Plane"
# NOTE(review): VERSION is not defined in this script; presumably exported
# by install-helpers.sh (sourced at the top) -- confirm, since `set -u`
# aborts here if it is unset.
helm_install_idempotent \
"openchoreo-control-plane" \
"oci://ghcr.io/openchoreo/helm-charts/openchoreo-control-plane" \
"openchoreo-control-plane" \
"${TIMEOUT_CONTROL_PLANE}" \
--version "${OPENCHOREO_VERSION}" \
--values "https://raw.githubusercontent.com/wso2/agent-manager/amp/v${VERSION}/deployments/single-cluster/values-cp.yaml"
wait_for_pods "openchoreo-control-plane" "${TIMEOUT_CONTROL_PLANE}"
# ============================================================================
# Step 8: Install OpenChoreo Data Plane
# ============================================================================
log_step "Step 8/13: Installing OpenChoreo Data Plane"
create_plane_cert_resources "openchoreo-data-plane"
# NOTE(review): DATA_PLANE_NS and VERSION are presumably defined by
# install-helpers.sh -- confirm; `set -u` aborts here if either is unset.
# DATA_PLANE_NS is presumably "openchoreo-data-plane", matching the
# namespaces used by the registration and wait steps below.
helm_install_idempotent \
"openchoreo-data-plane" \
"oci://ghcr.io/openchoreo/helm-charts/openchoreo-data-plane" \
"${DATA_PLANE_NS}" \
"${TIMEOUT_DATA_PLANE}" \
--version "${OPENCHOREO_VERSION}" \
--values "https://raw.githubusercontent.com/wso2/agent-manager/amp/v${VERSION}/deployments/single-cluster/values-dp.yaml"
# Register Data Plane with Control Plane
log_info "Registering Data Plane with Control Plane..."
# Wait for the cert-manager Certificate to be Ready (not just the Secret) to avoid a race
# where cert-manager re-issues the cert after we read it but before the agent connects.
log_info "Waiting for data plane agent certificate to be ready..."
if ! kubectl wait -n openchoreo-data-plane \
--for=condition=Ready certificate/cluster-agent-dataplane-tls --timeout=180s; then
log_error "Data plane agent certificate not ready. Cannot register data plane."
exit 1
fi
# Read the agent CA so the control plane can authenticate the data plane agent.
CA_CERT=$(kubectl get secret cluster-agent-tls -n openchoreo-data-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d || echo "")
if [ -n "$CA_CERT" ]; then
# The $(... sed ...) substitution below re-indents the PEM so it nests
# under the 'value: |' block scalar in the applied YAML.
if kubectl apply -f - <<EOF
apiVersion: openchoreo.dev/v1alpha1
kind: ClusterDataPlane
metadata:
name: default
namespace: default
spec:
planeID: "default"
clusterAgent:
clientCA:
value: |
$(echo "$CA_CERT" | sed 's/^/ /')
gateway:
ingress:
external:
name: gateway-default
namespace: openchoreo-data-plane
http:
host: "openchoreoapis.localhost"
listenerName: http
port: 19080
https:
host: "openchoreoapis.localhost"
listenerName: https
port: 19443
secretStoreRef:
name: amp-openbao-store
EOF
then
log_success "Data Plane registered with Control Plane successfully"
else
log_warning "Failed to register Data Plane (non-fatal)"
fi
else
log_warning "CA certificate not found, skipping Data Plane registration"
fi
# Verify ClusterDataPlane resource
if kubectl get clusterdataplane default -n default &>/dev/null; then
log_success "ClusterDataPlane resource 'default' exists"
else
log_warning "ClusterDataPlane resource not found"
fi
wait_for_pods "openchoreo-data-plane" "${TIMEOUT_DATA_PLANE}"
# ============================================================================
# Step 9: Install OpenChoreo Workflow Plane
# ============================================================================
log_step "Step 9/13: Installing OpenChoreo Workflow Plane"
create_plane_cert_resources "openchoreo-workflow-plane"
# Install Docker Registry for Workflow Plane
log_info "Installing Docker Registry for Workflow Plane..."
if helm status registry -n openchoreo-workflow-plane &>/dev/null; then
log_info "Docker Registry already installed, skipping..."
else
if helm upgrade --install registry docker-registry \
--repo https://twuni.github.io/docker-registry.helm \
--namespace openchoreo-workflow-plane \
--create-namespace \
--values https://raw.githubusercontent.com/openchoreo/openchoreo/v${OPENCHOREO_VERSION}/install/k3d/single-cluster/values-registry.yaml \
--timeout 120s; then
log_success "Docker Registry installed successfully"
else
log_error "Failed to install Docker Registry"
exit 1
fi
fi
log_info "Waiting for Docker Registry to be ready..."
if kubectl wait --for=condition=available deployment/registry -n openchoreo-workflow-plane --timeout=120s 2>/dev/null; then
log_success "Docker Registry is ready"
else
log_warning "Docker Registry may still be starting (non-fatal)"
fi
# NOTE(review): BUILD_CI_NS is presumably defined by install-helpers.sh
# (likely "openchoreo-workflow-plane", matching the namespaces above) --
# confirm; `set -u` aborts here if it is unset.
helm_install_idempotent \
"openchoreo-workflow-plane" \
"oci://ghcr.io/openchoreo/helm-charts/openchoreo-workflow-plane" \
"${BUILD_CI_NS}" \
"${TIMEOUT_BUILD_PLANE}" \
--version "${OPENCHOREO_VERSION}"
# Register Workflow Plane with Control Plane
log_info "Registering Workflow Plane with Control Plane..."
wait_for_secret "openchoreo-workflow-plane" "cluster-agent-tls" 180
# Read the agent CA so the control plane can authenticate the workflow
# plane agent; empty on failure so the check below can skip registration.
BP_CA_CERT=$(kubectl get secret cluster-agent-tls -n openchoreo-workflow-plane -o jsonpath='{.data.ca\.crt}' 2>/dev/null | base64 -d || echo "")
if [ -n "$BP_CA_CERT" ]; then
if kubectl apply -f - <<EOF
apiVersion: openchoreo.dev/v1alpha1
kind: ClusterWorkflowPlane
metadata:
name: default
namespace: default
spec:
planeID: "default"
secretStoreRef:
name: default
clusterAgent:
clientCA:
value: |
$(echo "$BP_CA_CERT" | sed 's/^/ /')
EOF
then
log_success "Workflow Plane registered with Control Plane successfully"
else
log_warning "Failed to register Workflow Plane (non-fatal)"
fi
else
log_warning "Workflow Plane CA certificate not found, skipping Workflow Plane registration"
fi