1919# GPU cluster autoscaling without real cloud infrastructure. It:
2020# 1. Installs KWOK controller via Helm
2121# 2. Clones Karpenter and builds the KWOK provider image via ko
22- # 3. Deploys Karpenter via Helm with the locally-built image
23- # 4. Configures GPU instance types (p5.48xlarge, g5.xlarge, etc.)
22+ # 3. Creates GPU instance types ConfigMap
23+ # 4. Deploys Karpenter via Helm with the locally-built image and instance types
2424#
2525# Prerequisites: Go, ko, Helm, kubectl, kind cluster running
2626#
@@ -93,97 +93,101 @@ build_karpenter() {
9393
9494 cd " ${KARPENTER_CLONE_DIR} "
9595
96- # ko build with kind.local side-loads the image directly into the kind cluster
96+ # ko build with kind.local side-loads the image directly into the kind cluster.
97+ # Output format: kind.local/<name>:<content-hash>
9798 CONTROLLER_IMG=$( KO_DOCKER_REPO=kind.local \
9899 KIND_CLUSTER_NAME=" ${KIND_CLUSTER_NAME} " \
99100 ko build sigs.k8s.io/karpenter/kwok 2>&1 | tail -1)
100101
101- IMG_REPOSITORY=$( echo " ${CONTROLLER_IMG} " | cut -d " :" -f 1)
102+ # Extract repository and tag from the ko output.
103+ # ko outputs "kind.local/<name>:<hash>" — split on the first colon after the repo.
104+ if [[ " ${CONTROLLER_IMG} " == * " :" * ]]; then
105+ IMG_REPOSITORY=" ${CONTROLLER_IMG%%:* } "
106+ IMG_TAG=" ${CONTROLLER_IMG#*: } "
107+ else
108+ IMG_REPOSITORY=" ${CONTROLLER_IMG} "
109+ IMG_TAG=" "
110+ fi
102111
103112 log_info " Built image: ${CONTROLLER_IMG} "
104113 log_info " Repository: ${IMG_REPOSITORY} "
114+ log_info " Tag: ${IMG_TAG:- <none>} "
105115
106116 # Export for use in deploy step
107- export CONTROLLER_IMG IMG_REPOSITORY
117+ export CONTROLLER_IMG IMG_REPOSITORY IMG_TAG
108118}
109119
110120# -------------------------------------------------------------------
111121# Step 3: Deploy Karpenter via Helm
122+ # Creates the instance types ConfigMap first, then deploys Karpenter
123+ # with volume mounts and env vars configured via Helm values.
112124# -------------------------------------------------------------------
113125deploy_karpenter () {
114126 log_info " Deploying Karpenter to namespace ${KARPENTER_NAMESPACE} ..."
115127
116128 # Apply CRDs first
117129 kubectl apply -f " ${KARPENTER_CLONE_DIR} /kwok/charts/crds"
118130
119- # Deploy via Helm with the locally-built image
120- helm upgrade --install karpenter " ${KARPENTER_CLONE_DIR} /kwok/charts" \
121- --namespace " ${KARPENTER_NAMESPACE} " --create-namespace \
122- --set controller.image.repository=" ${IMG_REPOSITORY} " \
123- --set controller.image.tag=latest \
124- --set logLevel=info \
125- --set controller.resources.requests.cpu=500m \
126- --set controller.resources.requests.memory=512Mi \
127- --set controller.resources.limits.cpu=1 \
128- --set controller.resources.limits.memory=1Gi \
129- --wait --timeout 120s
130-
131- log_info " Karpenter deployed"
132- }
133-
134- # -------------------------------------------------------------------
135- # Step 4: Configure GPU instance types
136- # Patches the Karpenter deployment to mount the instance-types.json
137- # ConfigMap, enabling Karpenter to provision GPU-capable KWOK nodes.
138- # -------------------------------------------------------------------
139- configure_instance_types () {
140- log_info " Configuring GPU instance types..."
131+ # Create namespace and instance types ConfigMap before Helm install
132+ # so the volume mount can reference it immediately.
133+ kubectl create namespace " ${KARPENTER_NAMESPACE} " --dry-run=client -o yaml | kubectl apply -f -
141134
142135 local instance_types_file=" ${MANIFESTS_DIR} /instance-types.json"
143136 if [[ ! -f " ${instance_types_file} " ]]; then
144137 log_error " Instance types file not found: ${instance_types_file} "
145138 exit 1
146139 fi
147-
148- # Create ConfigMap from instance types JSON
149140 kubectl create configmap -n " ${KARPENTER_NAMESPACE} " karpenter-instance-types \
150141 --from-file=instance-types.json=" ${instance_types_file} " \
151142 --dry-run=client -o yaml | kubectl apply -f -
152143
153- # Patch deployment to mount the ConfigMap and set the env var
154- kubectl -n " ${KARPENTER_NAMESPACE} " patch deployment karpenter --type=json -p=' [
155- {
156- "op": "add",
157- "path": "/spec/template/spec/volumes/-",
158- "value": {
159- "name": "instance-types",
160- "configMap": {
161- "name": "karpenter-instance-types"
162- }
163- }
164- },
165- {
166- "op": "add",
167- "path": "/spec/template/spec/containers/0/volumeMounts/-",
168- "value": {
169- "name": "instance-types",
170- "mountPath": "/etc/karpenter/instance-types",
171- "readOnly": true
172- }
173- },
174- {
175- "op": "add",
176- "path": "/spec/template/spec/containers/0/env/-",
177- "value": {
178- "name": "INSTANCE_TYPES_FILE_PATH",
179- "value": "/etc/karpenter/instance-types/instance-types.json"
180- }
181- }
182- ]'
144+ # Build the image tag argument. If ko provided a tag, use it.
145+ # If not, omit it and let the chart default to its AppVersion.
146+ local tag_arg=" "
147+ if [[ -n " ${IMG_TAG} " ]]; then
148+ tag_arg=" --set controller.image.tag=${IMG_TAG} "
149+ fi
183150
184- kubectl -n " ${KARPENTER_NAMESPACE} " rollout status deployment/karpenter --timeout=60s
151+ # Deploy via Helm with the locally-built image.
152+ # - imagePullPolicy=Never: image is side-loaded into kind, no registry to pull from
153+ # - staticCapacity=true: required by the deployment template but missing from chart defaults
154+ # - extraVolumes + extraVolumeMounts: mount the instance types ConfigMap
155+ # - controller.env: set INSTANCE_TYPES_FILE_PATH for the KWOK provider
156+ # shellcheck disable=SC2086
157+ helm upgrade --install karpenter " ${KARPENTER_CLONE_DIR} /kwok/charts" \
158+ --namespace " ${KARPENTER_NAMESPACE} " --create-namespace \
159+ --set controller.image.repository=" ${IMG_REPOSITORY} " \
160+ ${tag_arg} \
161+ --set imagePullPolicy=Never \
162+ --set logLevel=info \
163+ --set settings.featureGates.staticCapacity=true \
164+ --set controller.resources.requests.cpu=500m \
165+ --set controller.resources.requests.memory=512Mi \
166+ --set controller.resources.limits.cpu=1 \
167+ --set controller.resources.limits.memory=1Gi \
168+ --set ' extraVolumes[0].name=instance-types' \
169+ --set ' extraVolumes[0].configMap.name=karpenter-instance-types' \
170+ --set ' controller.extraVolumeMounts[0].name=instance-types' \
171+ --set ' controller.extraVolumeMounts[0].mountPath=/etc/karpenter/instance-types' \
172+ --set ' controller.extraVolumeMounts[0].readOnly=true' \
173+ --set ' controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \
174+ --set ' controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \
175+ --wait --timeout 300s \
176+ || {
177+ log_error " Helm install failed. Diagnostics:"
178+ kubectl -n " ${KARPENTER_NAMESPACE} " get pods -o wide 2> /dev/null || true
179+ kubectl -n " ${KARPENTER_NAMESPACE} " describe deployment karpenter 2> /dev/null || true
180+ local POD
181+ POD=$( kubectl -n " ${KARPENTER_NAMESPACE} " get pods -l app.kubernetes.io/name=karpenter \
182+ -o jsonpath=' {.items[0].metadata.name}' 2> /dev/null)
183+ if [[ -n " ${POD} " ]]; then
184+ kubectl -n " ${KARPENTER_NAMESPACE} " describe pod " ${POD} " 2> /dev/null || true
185+ kubectl -n " ${KARPENTER_NAMESPACE} " logs " ${POD} " --tail=50 2> /dev/null || true
186+ fi
187+ exit 1
188+ }
185189
186- log_info " GPU instance types configured"
190+ log_info " Karpenter deployed with GPU instance types configured"
187191}
188192
189193# -------------------------------------------------------------------
@@ -198,7 +202,6 @@ main() {
198202 install_kwok
199203 build_karpenter
200204 deploy_karpenter
201- configure_instance_types
202205
203206 log_info " === Karpenter KWOK provider ready ==="
204207 log_info " Create a NodePool + KWOKNodeClass to start autoscaling"
0 commit comments