Skip to content

Commit 8cd4d0a

Browse files
committed
fix(ci): fix Karpenter KWOK provider deployment failures
Three issues prevented the Karpenter Deployment from becoming Ready:

1. Image tag mismatch: ko outputs kind.local/<name>:<content-hash> but the script hardcoded --set controller.image.tag=latest. The :latest tag doesn't exist in kind's containerd. Fix: extract the actual tag from ko output and set imagePullPolicy=Never since the image is already side-loaded.

2. staticCapacity feature gate undefined: the deployment template references settings.featureGates.staticCapacity but it's missing from the chart's values.yaml, rendering as StaticCapacity=<no value> which crashes the controller. Fix: set staticCapacity=true.

3. JSON patch on missing arrays: configure_instance_types used JSON patch with /- (append) on volumes/volumeMounts arrays that don't exist in the rendered deployment when extraVolumes is empty. Fix: pass extraVolumes, controller.extraVolumeMounts, and controller.env directly via Helm values, eliminating the patch entirely. The ConfigMap is now created before helm install so the volume mount is available immediately.

Also added diagnostic output (pod describe, logs) on Helm failure and increased timeout to 300s to accommodate slower CI environments.

Signed-off-by: Davanum Srinivas <dsrinivas@nvidia.com>
1 parent 3e408d3 commit 8cd4d0a

File tree

1 file changed

+65
-62
lines changed

1 file changed

+65
-62
lines changed

kwok/scripts/install-karpenter-kwok.sh

Lines changed: 65 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
# GPU cluster autoscaling without real cloud infrastructure. It:
2020
# 1. Installs KWOK controller via Helm
2121
# 2. Clones Karpenter and builds the KWOK provider image via ko
22-
# 3. Deploys Karpenter via Helm with the locally-built image
23-
# 4. Configures GPU instance types (p5.48xlarge, g5.xlarge, etc.)
22+
# 3. Creates GPU instance types ConfigMap
23+
# 4. Deploys Karpenter via Helm with the locally-built image and instance types
2424
#
2525
# Prerequisites: Go, ko, Helm, kubectl, kind cluster running
2626
#
@@ -93,97 +93,101 @@ build_karpenter() {
9393

9494
cd "${KARPENTER_CLONE_DIR}"
9595

96-
# ko build with kind.local side-loads the image directly into the kind cluster
96+
# ko build with kind.local side-loads the image directly into the kind cluster.
97+
# Output format: kind.local/<name>:<content-hash>
9798
CONTROLLER_IMG=$(KO_DOCKER_REPO=kind.local \
9899
KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}" \
99100
ko build sigs.k8s.io/karpenter/kwok 2>&1 | tail -1)
100101

101-
IMG_REPOSITORY=$(echo "${CONTROLLER_IMG}" | cut -d ":" -f 1)
102+
# Extract repository and tag from the ko output.
103+
# ko outputs "kind.local/<name>:<hash>" — split at the first colon (assumes the repository part contains no colon, e.g. no registry port).
104+
if [[ "${CONTROLLER_IMG}" == *":"* ]]; then
105+
IMG_REPOSITORY="${CONTROLLER_IMG%%:*}"
106+
IMG_TAG="${CONTROLLER_IMG#*:}"
107+
else
108+
IMG_REPOSITORY="${CONTROLLER_IMG}"
109+
IMG_TAG=""
110+
fi
102111

103112
log_info "Built image: ${CONTROLLER_IMG}"
104113
log_info "Repository: ${IMG_REPOSITORY}"
114+
log_info "Tag: ${IMG_TAG:-<none>}"
105115

106116
# Export for use in deploy step
107-
export CONTROLLER_IMG IMG_REPOSITORY
117+
export CONTROLLER_IMG IMG_REPOSITORY IMG_TAG
108118
}
109119

110120
# -------------------------------------------------------------------
111121
# Step 3: Deploy Karpenter via Helm
122+
# Creates the instance types ConfigMap first, then deploys Karpenter
123+
# with volume mounts and env vars configured via Helm values.
112124
# -------------------------------------------------------------------
113125
deploy_karpenter() {
114126
log_info "Deploying Karpenter to namespace ${KARPENTER_NAMESPACE}..."
115127

116128
# Apply CRDs first
117129
kubectl apply -f "${KARPENTER_CLONE_DIR}/kwok/charts/crds"
118130

119-
# Deploy via Helm with the locally-built image
120-
helm upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \
121-
--namespace "${KARPENTER_NAMESPACE}" --create-namespace \
122-
--set controller.image.repository="${IMG_REPOSITORY}" \
123-
--set controller.image.tag=latest \
124-
--set logLevel=info \
125-
--set controller.resources.requests.cpu=500m \
126-
--set controller.resources.requests.memory=512Mi \
127-
--set controller.resources.limits.cpu=1 \
128-
--set controller.resources.limits.memory=1Gi \
129-
--wait --timeout 120s
130-
131-
log_info "Karpenter deployed"
132-
}
133-
134-
# -------------------------------------------------------------------
135-
# Step 4: Configure GPU instance types
136-
# Patches the Karpenter deployment to mount the instance-types.json
137-
# ConfigMap, enabling Karpenter to provision GPU-capable KWOK nodes.
138-
# -------------------------------------------------------------------
139-
configure_instance_types() {
140-
log_info "Configuring GPU instance types..."
131+
# Create namespace and instance types ConfigMap before Helm install
132+
# so the volume mount can reference it immediately.
133+
kubectl create namespace "${KARPENTER_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
141134

142135
local instance_types_file="${MANIFESTS_DIR}/instance-types.json"
143136
if [[ ! -f "${instance_types_file}" ]]; then
144137
log_error "Instance types file not found: ${instance_types_file}"
145138
exit 1
146139
fi
147-
148-
# Create ConfigMap from instance types JSON
149140
kubectl create configmap -n "${KARPENTER_NAMESPACE}" karpenter-instance-types \
150141
--from-file=instance-types.json="${instance_types_file}" \
151142
--dry-run=client -o yaml | kubectl apply -f -
152143

153-
# Patch deployment to mount the ConfigMap and set the env var
154-
kubectl -n "${KARPENTER_NAMESPACE}" patch deployment karpenter --type=json -p='[
155-
{
156-
"op": "add",
157-
"path": "/spec/template/spec/volumes/-",
158-
"value": {
159-
"name": "instance-types",
160-
"configMap": {
161-
"name": "karpenter-instance-types"
162-
}
163-
}
164-
},
165-
{
166-
"op": "add",
167-
"path": "/spec/template/spec/containers/0/volumeMounts/-",
168-
"value": {
169-
"name": "instance-types",
170-
"mountPath": "/etc/karpenter/instance-types",
171-
"readOnly": true
172-
}
173-
},
174-
{
175-
"op": "add",
176-
"path": "/spec/template/spec/containers/0/env/-",
177-
"value": {
178-
"name": "INSTANCE_TYPES_FILE_PATH",
179-
"value": "/etc/karpenter/instance-types/instance-types.json"
180-
}
181-
}
182-
]'
144+
# Build the image tag argument. If ko provided a tag, use it.
145+
# If not, omit it and let the chart default to its AppVersion.
146+
local tag_arg=""
147+
if [[ -n "${IMG_TAG}" ]]; then
148+
tag_arg="--set controller.image.tag=${IMG_TAG}"
149+
fi
183150

184-
kubectl -n "${KARPENTER_NAMESPACE}" rollout status deployment/karpenter --timeout=60s
151+
# Deploy via Helm with the locally-built image.
152+
# - imagePullPolicy=Never: image is side-loaded into kind, no registry to pull from
153+
# - staticCapacity=true: required by the deployment template but missing from chart defaults
154+
# - extraVolumes + extraVolumeMounts: mount the instance types ConfigMap
155+
# - controller.env: set INSTANCE_TYPES_FILE_PATH for the KWOK provider
156+
# shellcheck disable=SC2086
157+
helm upgrade --install karpenter "${KARPENTER_CLONE_DIR}/kwok/charts" \
158+
--namespace "${KARPENTER_NAMESPACE}" --create-namespace \
159+
--set controller.image.repository="${IMG_REPOSITORY}" \
160+
${tag_arg} \
161+
--set imagePullPolicy=Never \
162+
--set logLevel=info \
163+
--set settings.featureGates.staticCapacity=true \
164+
--set controller.resources.requests.cpu=500m \
165+
--set controller.resources.requests.memory=512Mi \
166+
--set controller.resources.limits.cpu=1 \
167+
--set controller.resources.limits.memory=1Gi \
168+
--set 'extraVolumes[0].name=instance-types' \
169+
--set 'extraVolumes[0].configMap.name=karpenter-instance-types' \
170+
--set 'controller.extraVolumeMounts[0].name=instance-types' \
171+
--set 'controller.extraVolumeMounts[0].mountPath=/etc/karpenter/instance-types' \
172+
--set 'controller.extraVolumeMounts[0].readOnly=true' \
173+
--set 'controller.env[0].name=INSTANCE_TYPES_FILE_PATH' \
174+
--set 'controller.env[0].value=/etc/karpenter/instance-types/instance-types.json' \
175+
--wait --timeout 300s \
176+
|| {
177+
log_error "Helm install failed. Diagnostics:"
178+
kubectl -n "${KARPENTER_NAMESPACE}" get pods -o wide 2>/dev/null || true
179+
kubectl -n "${KARPENTER_NAMESPACE}" describe deployment karpenter 2>/dev/null || true
180+
local POD
181+
POD=$(kubectl -n "${KARPENTER_NAMESPACE}" get pods -l app.kubernetes.io/name=karpenter \
182+
-o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
183+
if [[ -n "${POD}" ]]; then
184+
kubectl -n "${KARPENTER_NAMESPACE}" describe pod "${POD}" 2>/dev/null || true
185+
kubectl -n "${KARPENTER_NAMESPACE}" logs "${POD}" --tail=50 2>/dev/null || true
186+
fi
187+
exit 1
188+
}
185189

186-
log_info "GPU instance types configured"
190+
log_info "Karpenter deployed with GPU instance types configured"
187191
}
188192

189193
# -------------------------------------------------------------------
@@ -198,7 +202,6 @@ main() {
198202
install_kwok
199203
build_karpenter
200204
deploy_karpenter
201-
configure_instance_types
202205

203206
log_info "=== Karpenter KWOK provider ready ==="
204207
log_info "Create a NodePool + KWOKNodeClass to start autoscaling"

0 commit comments

Comments
 (0)