-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathdeploy_fma.sh
More file actions
executable file
·188 lines (157 loc) · 6.97 KB
/
deploy_fma.sh
File metadata and controls
executable file
·188 lines (157 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env bash
# Usage: $0
# Current working directory must be the root of the Git repository.
#
# Deploys the FMA controllers (dual-pods controller + launcher-populator)
# and waits for them to be available.
#
# Required environment variables:
#   FMA_NAMESPACE            - target Kubernetes namespace
#   FMA_CHART_INSTANCE_NAME  - Helm chart instance name
#   CONTAINER_IMG_REG        - container image registry/namespace
#                              (e.g. ghcr.io/llm-d-incubation/llm-d-fast-model-actuation)
#   IMAGE_TAG                - image tag for all components (e.g. ref-abcd1234)
#
# Optional environment variables:
#   NODE_VIEW_CLUSTER_ROLE   - ClusterRole granting node read access.
#                              Unset or empty: no ClusterRole is configured
#                              (consistent with the Helm chart default).
#                              "create/please": the script creates one named
#                              "${FMA_CHART_INSTANCE_NAME}-node-view".
#                              Any other value: name of an existing ClusterRole.
#   RUNTIME_CLASS_NAME       - if set, adds runtimeClassName to GPU pod specs
#                              (e.g. "nvidia" when the GPU operator requires it)
#   POLICIES_ENABLED         - "true"/"false"; auto-detected if unset
#   FMA_DEBUG                - "true" to enable shell tracing (set -x)
#   HELM_EXTRA_ARGS          - additional Helm arguments appended to the
#                              `helm upgrade --install` invocation
#                              (e.g. "--set global.local=true --set dualPodsController.sleeperLimit=4")

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Opt-in command tracing for troubleshooting.
if [[ "${FMA_DEBUG:-false}" == "true" ]]; then
  set -x
fi
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Progress counters for the numbered banners printed by step().
step_num=0
total_steps=6

# step <description...>
# Advance the step counter and print a banner announcing the current step,
# surrounded by blank lines so each step stands out in CI logs.
step() {
  step_num=$((step_num + 1))
  local banner="========================================"
  printf '\n%s\n' "$banner"
  printf '[deploy_fma] Step %s/%s: %s\n' "$step_num" "$total_steps" "$*"
  printf '%s\n\n' "$banner"
}
# ---------------------------------------------------------------------------
# Step 1: Validate required environment variables
# ---------------------------------------------------------------------------
step "Validate required environment variables"

# Collect every required variable that is unset or empty, then fail once
# with the complete list instead of stopping at the first omission.
missing=()
for required in FMA_NAMESPACE FMA_CHART_INSTANCE_NAME CONTAINER_IMG_REG IMAGE_TAG; do
  [[ -n "${!required:-}" ]] || missing+=("$required")
done
if (( ${#missing[@]} > 0 )); then
  echo "ERROR: Missing required environment variables: ${missing[*]}" >&2
  exit 1
fi

# Echo the effective configuration so CI logs capture what was deployed.
echo "Configuration:"
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"
echo " CONTAINER_IMG_REG: $CONTAINER_IMG_REG"
echo " IMAGE_TAG: $IMAGE_TAG"
echo " NODE_VIEW_CLUSTER_ROLE: ${NODE_VIEW_CLUSTER_ROLE:-<none>}"
echo " RUNTIME_CLASS_NAME: ${RUNTIME_CLASS_NAME:-<unset>}"
echo " POLICIES_ENABLED: ${POLICIES_ENABLED:-<auto-detect>}"
echo " HELM_EXTRA_ARGS: ${HELM_EXTRA_ARGS:-<none>}"
# ---------------------------------------------------------------------------
# Step 2: Apply FMA CRDs
# ---------------------------------------------------------------------------
step "Apply FMA CRDs"

# Collect applied CRD names in an array (not a space-joined string) so each
# name survives later expansion intact without relying on word-splitting.
crd_names=()
for crd_file in config/crd/*.yaml; do
  # Guard against an unmatched glob: without nullglob the literal pattern
  # "config/crd/*.yaml" would be handed to kubectl and fail confusingly.
  [ -e "$crd_file" ] || continue
  # Resolve the CRD's metadata.name from the manifest without touching the
  # cluster (client-side dry run).
  crd_name=$(kubectl apply --dry-run=client -f "$crd_file" -o jsonpath='{.metadata.name}')
  crd_names+=("$crd_name")
  if kubectl get crd "$crd_name" &>/dev/null; then
    echo " CRD $crd_name already exists, skipping"
  else
    echo " Applying $crd_file ($crd_name)"
    kubectl apply --server-side -f "$crd_file"
  fi
done

echo "Waiting for CRDs to become Established..."
# Block until the API server reports each CRD as Established.
# The ${arr[@]+...} form keeps the expansion safe under `set -u` when the
# array is empty (needed for bash < 4.4).
for crd_name in ${crd_names[@]+"${crd_names[@]}"}; do
  kubectl wait --for=condition=Established "crd/$crd_name" --timeout=120s
done
echo "All CRDs established"
# ---------------------------------------------------------------------------
# Step 3: Create node-viewer ClusterRole
# ---------------------------------------------------------------------------
step "Configure node-viewer ClusterRole"

# Resolve CLUSTER_ROLE_NAME from NODE_VIEW_CLUSTER_ROLE:
#   unset/empty     -> no ClusterRole (matches the Helm chart default)
#   "create/please" -> create "<instance>-node-view" if it does not exist
#   anything else   -> treat the value as the name of an existing ClusterRole
case "${NODE_VIEW_CLUSTER_ROLE:-}" in
  "")
    CLUSTER_ROLE_NAME=""
    echo "Skipped (NODE_VIEW_CLUSTER_ROLE not set)"
    ;;
  "create/please")
    CLUSTER_ROLE_NAME="${FMA_CHART_INSTANCE_NAME}-node-view"
    if kubectl get clusterrole "$CLUSTER_ROLE_NAME" &>/dev/null; then
      echo "ClusterRole $CLUSTER_ROLE_NAME already exists, skipping"
    else
      kubectl create clusterrole "$CLUSTER_ROLE_NAME" --verb=get,list,watch --resource=nodes
      echo "ClusterRole $CLUSTER_ROLE_NAME created"
    fi
    ;;
  *)
    CLUSTER_ROLE_NAME="${NODE_VIEW_CLUSTER_ROLE}"
    echo "Using existing ClusterRole: $CLUSTER_ROLE_NAME"
    ;;
esac
# ---------------------------------------------------------------------------
# Step 4: Detect and apply ValidatingAdmissionPolicies
# ---------------------------------------------------------------------------
step "ValidatingAdmissionPolicies"

# When the caller did not decide, probe the API server for the
# admissionregistration resource that backs ValidatingAdmissionPolicy.
if [ -z "${POLICIES_ENABLED:-}" ]; then
  if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
      | grep -q 'validatingadmissionpolicies'; then
    POLICIES_ENABLED=true
  else
    POLICIES_ENABLED=false
  fi
  echo "Auto-detected POLICIES_ENABLED=$POLICIES_ENABLED"
fi

if [ "$POLICIES_ENABLED" = "true" ]; then
  echo "Applying ValidatingAdmissionPolicy resources..."
  kubectl apply -f config/validating-admission-policies/
else
  echo "ValidatingAdmissionPolicy not supported or disabled, skipping"
fi
# ---------------------------------------------------------------------------
# Step 5: Deploy FMA controllers via Helm
# ---------------------------------------------------------------------------
step "Deploy FMA controllers via Helm"
# Build the Helm argument list as an array so values containing spaces are
# passed to helm as single arguments.
HELM_ARGS=(
  --set global.imageRegistry="${CONTAINER_IMG_REG}"
  --set global.imageTag="${IMAGE_TAG}"
)
# Append any caller-supplied Helm arguments (e.g. --set global.local=true)
# NOTE(review): read -ra splits HELM_EXTRA_ARGS on whitespace, so a single
# extra argument cannot itself contain spaces (e.g. --set foo="a b" would
# be split into two arguments) — confirm callers never need that.
if [ -n "${HELM_EXTRA_ARGS:-}" ]; then
  read -ra _extra <<< "$HELM_EXTRA_ARGS"
  HELM_ARGS+=("${_extra[@]}")
fi
# CLUSTER_ROLE_NAME is computed in Step 3; empty means "no ClusterRole",
# in which case the chart default (no node-view role) is left in place.
if [ -n "$CLUSTER_ROLE_NAME" ]; then
  HELM_ARGS+=(--set global.nodeViewClusterRole="${CLUSTER_ROLE_NAME}")
fi
# Idempotent install-or-upgrade of the controllers chart into the target
# namespace.
helm upgrade --install "$FMA_CHART_INSTANCE_NAME" charts/fma-controllers \
  -n "$FMA_NAMESPACE" \
  "${HELM_ARGS[@]}"
# ---------------------------------------------------------------------------
# Step 6: Wait for controllers to be ready
# ---------------------------------------------------------------------------
step "Wait for controllers to be ready"

# Block (up to 120s each) until both controller Deployments report the
# Available condition.
for controller in dual-pods-controller launcher-populator; do
  kubectl wait --for=condition=available --timeout=120s \
    deployment "${FMA_CHART_INSTANCE_NAME}-${controller}" -n "$FMA_NAMESPACE"
done
echo "Both controllers are available"

echo ""
echo "[deploy_fma] All steps completed successfully"