-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathdeploy_fma.sh
More file actions
executable file
·188 lines (157 loc) · 6.97 KB
/
deploy_fma.sh
File metadata and controls
executable file
·188 lines (157 loc) · 6.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/usr/bin/env bash
# Usage: $0
# Current working directory must be the root of the Git repository.
#
# Deploys the FMA controllers (dual-pods controller + launcher-populator)
# and waits for them to be available.
#
# Required environment variables:
#   FMA_NAMESPACE            - target Kubernetes namespace
#   FMA_CHART_INSTANCE_NAME  - Helm chart instance name
#   CONTAINER_IMG_REG        - container image registry/namespace
#                              (e.g. ghcr.io/llm-d-incubation/llm-d-fast-model-actuation)
#   IMAGE_TAG                - image tag for all components (e.g. ref-abcd1234)
#
# Optional environment variables:
#   NODE_VIEW_CLUSTER_ROLE   - ClusterRole granting node read access.
#                              Unset or empty: no ClusterRole is configured
#                              (consistent with the Helm chart default).
#                              "create/please": the script creates one named
#                              "${FMA_CHART_INSTANCE_NAME}-node-view".
#                              Any other value: name of an existing ClusterRole.
#   RUNTIME_CLASS_NAME       - if set, adds runtimeClassName to GPU pod specs
#                              (e.g. "nvidia" when the GPU operator requires it)
#   POLICIES_ENABLED         - "true"/"false"; auto-detected if unset
#   FMA_DEBUG                - "true" to enable shell tracing (set -x)
#   HELM_EXTRA_ARGS          - additional Helm arguments appended to the
#                              `helm upgrade --install` invocation
#                              (e.g. "--set global.local=true --set dualPodsController.sleeperLimit=4")

# Fail fast: abort on command errors, unset variables, and pipeline failures.
set -euo pipefail

# Opt-in command tracing for troubleshooting.
if [[ "${FMA_DEBUG:-false}" == "true" ]]; then
  set -x
fi
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Progress counters for the numbered banners printed by step().
step_num=0
total_steps=6

# step <description...>
# Advance the step counter and print a banner announcing the current step,
# surrounded by blank lines so each step stands out in CI logs.
step() {
  step_num=$((step_num + 1))
  local banner="========================================"
  printf '\n%s\n' "$banner"
  printf '[deploy_fma] Step %s/%s: %s\n' "$step_num" "$total_steps" "$*"
  printf '%s\n\n' "$banner"
}
# ---------------------------------------------------------------------------
# Step 1: Validate required environment variables
# ---------------------------------------------------------------------------
step "Validate required environment variables"

# Collect every required variable that is unset or empty, then fail once
# with the complete list instead of stopping at the first omission.
missing=()
for required in FMA_NAMESPACE FMA_CHART_INSTANCE_NAME CONTAINER_IMG_REG IMAGE_TAG; do
  [[ -n "${!required:-}" ]] || missing+=("$required")
done
if (( ${#missing[@]} > 0 )); then
  echo "ERROR: Missing required environment variables: ${missing[*]}" >&2
  exit 1
fi

# Echo the effective configuration so CI logs capture what was deployed.
echo "Configuration:"
echo " FMA_NAMESPACE: $FMA_NAMESPACE"
echo " FMA_CHART_INSTANCE_NAME: $FMA_CHART_INSTANCE_NAME"
echo " CONTAINER_IMG_REG: $CONTAINER_IMG_REG"
echo " IMAGE_TAG: $IMAGE_TAG"
echo " NODE_VIEW_CLUSTER_ROLE: ${NODE_VIEW_CLUSTER_ROLE:-<none>}"
echo " RUNTIME_CLASS_NAME: ${RUNTIME_CLASS_NAME:-<unset>}"
echo " POLICIES_ENABLED: ${POLICIES_ENABLED:-<auto-detect>}"
echo " HELM_EXTRA_ARGS: ${HELM_EXTRA_ARGS:-<none>}"
# ---------------------------------------------------------------------------
# Step 2: Apply FMA CRDs
# ---------------------------------------------------------------------------
step "Apply FMA CRDs"

# Collect applied CRD names in an array (not a space-joined string) so each
# name survives later expansion intact without relying on word-splitting.
crd_names=()
for crd_file in config/crd/*.yaml; do
  # Guard against an unmatched glob: without nullglob the literal pattern
  # "config/crd/*.yaml" would be handed to kubectl and fail confusingly.
  [ -e "$crd_file" ] || continue
  # Resolve the CRD's metadata.name from the manifest without touching the
  # cluster (client-side dry run).
  crd_name=$(kubectl apply --dry-run=client -f "$crd_file" -o jsonpath='{.metadata.name}')
  crd_names+=("$crd_name")
  if kubectl get crd "$crd_name" &>/dev/null; then
    echo " CRD $crd_name already exists, skipping"
  else
    echo " Applying $crd_file ($crd_name)"
    kubectl apply --server-side -f "$crd_file"
  fi
done

echo "Waiting for CRDs to become Established..."
# Block until the API server reports each CRD as Established.
# The ${arr[@]+...} form keeps the expansion safe under `set -u` when the
# array is empty (needed for bash < 4.4).
for crd_name in ${crd_names[@]+"${crd_names[@]}"}; do
  kubectl wait --for=condition=Established "crd/$crd_name" --timeout=120s
done
echo "All CRDs established"
# ---------------------------------------------------------------------------
# Step 3: Create node-viewer ClusterRole
# ---------------------------------------------------------------------------
step "Configure node-viewer ClusterRole"

# Resolve CLUSTER_ROLE_NAME from NODE_VIEW_CLUSTER_ROLE:
#   unset/empty     -> no ClusterRole (matches the Helm chart default)
#   "create/please" -> create "<instance>-node-view" if it does not exist
#   anything else   -> treat the value as the name of an existing ClusterRole
case "${NODE_VIEW_CLUSTER_ROLE:-}" in
  "")
    CLUSTER_ROLE_NAME=""
    echo "Skipped (NODE_VIEW_CLUSTER_ROLE not set)"
    ;;
  "create/please")
    CLUSTER_ROLE_NAME="${FMA_CHART_INSTANCE_NAME}-node-view"
    if kubectl get clusterrole "$CLUSTER_ROLE_NAME" &>/dev/null; then
      echo "ClusterRole $CLUSTER_ROLE_NAME already exists, skipping"
    else
      kubectl create clusterrole "$CLUSTER_ROLE_NAME" --verb=get,list,watch --resource=nodes
      echo "ClusterRole $CLUSTER_ROLE_NAME created"
    fi
    ;;
  *)
    CLUSTER_ROLE_NAME="${NODE_VIEW_CLUSTER_ROLE}"
    echo "Using existing ClusterRole: $CLUSTER_ROLE_NAME"
    ;;
esac
# ---------------------------------------------------------------------------
# Step 4: Detect and apply ValidatingAdmissionPolicies
# ---------------------------------------------------------------------------
step "ValidatingAdmissionPolicies"

# When the caller did not decide, probe the API server for the
# admissionregistration resource that backs ValidatingAdmissionPolicy.
if [ -z "${POLICIES_ENABLED:-}" ]; then
  if kubectl api-resources --api-group=admissionregistration.k8s.io -o name 2>/dev/null \
      | grep -q 'validatingadmissionpolicies'; then
    POLICIES_ENABLED=true
  else
    POLICIES_ENABLED=false
  fi
  echo "Auto-detected POLICIES_ENABLED=$POLICIES_ENABLED"
fi

if [ "$POLICIES_ENABLED" = "true" ]; then
  echo "Applying ValidatingAdmissionPolicy resources..."
  kubectl apply -f config/validating-admission-policies/
else
  echo "ValidatingAdmissionPolicy not supported or disabled, skipping"
fi
# ---------------------------------------------------------------------------
# Step 5: Deploy FMA controllers via Helm
# ---------------------------------------------------------------------------
step "Deploy FMA controllers via Helm"
# Build the Helm argument list as an array so values containing spaces are
# passed to helm as single arguments.
HELM_ARGS=(
  --set global.imageRegistry="${CONTAINER_IMG_REG}"
  --set global.imageTag="${IMAGE_TAG}"
)
# Append any caller-supplied Helm arguments (e.g. --set global.local=true)
# NOTE(review): read -ra splits HELM_EXTRA_ARGS on whitespace, so a single
# extra argument cannot itself contain spaces (e.g. --set foo="a b" would
# be split into two arguments) — confirm callers never need that.
if [ -n "${HELM_EXTRA_ARGS:-}" ]; then
  read -ra _extra <<< "$HELM_EXTRA_ARGS"
  HELM_ARGS+=("${_extra[@]}")
fi
# CLUSTER_ROLE_NAME is computed in Step 3; empty means "no ClusterRole",
# in which case the chart default (no node-view role) is left in place.
if [ -n "$CLUSTER_ROLE_NAME" ]; then
  HELM_ARGS+=(--set global.nodeViewClusterRole="${CLUSTER_ROLE_NAME}")
fi
# Idempotent install-or-upgrade of the controllers chart into the target
# namespace.
helm upgrade --install "$FMA_CHART_INSTANCE_NAME" charts/fma-controllers \
  -n "$FMA_NAMESPACE" \
  "${HELM_ARGS[@]}"
# ---------------------------------------------------------------------------
# Step 6: Wait for controllers to be ready
# ---------------------------------------------------------------------------
step "Wait for controllers to be ready"

# Block (up to 120s each) until both controller Deployments report the
# Available condition.
for controller in dual-pods-controller launcher-populator; do
  kubectl wait --for=condition=available --timeout=120s \
    deployment "${FMA_CHART_INSTANCE_NAME}-${controller}" -n "$FMA_NAMESPACE"
done
echo "Both controllers are available"

echo ""
echo "[deploy_fma] All steps completed successfully"