forked from kubernetes-sigs/gateway-api-inference-extension
-
Notifications
You must be signed in to change notification settings - Fork 0
385 lines (331 loc) · 14.8 KB
/
e2e-prefill-heavy-gke.yaml
File metadata and controls
385 lines (331 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
# End-to-end "prefill heavy" benchmark workflow that runs against a GKE cluster.
name: GKE Prefill Heavy Test

on:
  # Fires on every new issue/PR comment; the check_access job only proceeds
  # when the comment is on a PR and contains /run-gke-prefill-heavy.
  issue_comment:
    types:
      - created
  # Manual run against an explicit PR number or branch name.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'Pull-request number or branch name to test'
        required: true
        default: 'main'
        type: string

# Token scopes granted to the jobs in this workflow.
permissions:
  contents: read
  pull-requests: write

jobs:
# Authorization job: ensures only authorized users can execute the workflow.
# Note: even if a user checks out a branch that modifies this access logic,
# they still need the correct secret keys to deploy to GCP.
check_access:
  runs-on: ubuntu-latest
  if: |
    (github.event_name == 'issue_comment' &&
    github.event.issue.pull_request &&
    contains(github.event.comment.body, '/run-gke-prefill-heavy')) || github.event_name == 'workflow_dispatch'
  outputs:
    # 'true' or 'false', as written to GITHUB_OUTPUT by the auth_logic step.
    authorized: ${{ steps.auth_logic.outputs.authorized }}
  steps:
    - name: Checkout Repository
      uses: actions/checkout@v4
    - name: Authorization Logic
      id: auth_logic
      shell: bash
      # Event data is handed to the script through environment variables
      # instead of being interpolated into the script text, so its content can
      # never be interpreted as shell code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        COMMENT_USER: ${{ github.event.comment.user.login }}
        COMMENT_ROLE: ${{ github.event.comment.author_association }}
        ACTOR: ${{ github.actor }}
      run: |
        authorized='false'
        auth_file=".github/authorized_workflow_users.txt"
        user=""
        role=""
        if [[ "$EVENT_NAME" == "issue_comment" ]]; then
          user="$COMMENT_USER"
          role="$COMMENT_ROLE"
        elif [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
          user="$ACTOR"
        fi
        # NOTE(review): "MAINTAINER" does not appear in GitHub's documented
        # author_association values (OWNER, MEMBER, COLLABORATOR, ...), so that
        # comparison likely never matches -- confirm the intended role.
        if [[ "$role" == "OWNER" || "$role" == "MAINTAINER" ]]; then
          echo "User authorized by role: $role"
          authorized='true'
        # Require a non-empty login: `grep -Fx ""` would otherwise match any
        # blank line in the allow-list file.
        elif [[ -n "$user" ]] && grep -Fxq -- "$user" "$auth_file"; then
          echo "User authorized by file lookup: $auth_file"
          authorized='true'
        fi
        echo "authorized=$authorized" >> "$GITHUB_OUTPUT"
# Deploys the model server, gateway and benchmark to GKE and validates them.
# Runs only for the two supported triggers and only after check_access
# reported authorized == 'true'.
deploy_and_validate:
  name: Test on ${{ matrix.accelerator.name }}
  needs:
    - check_access
  if: |
    (github.event_name == 'workflow_dispatch' || github.event_name == 'issue_comment') &&
    needs.check_access.outputs.authorized == 'true'
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      accelerator:
        - name: GPU
  env:
    # Target cluster.
    GCP_PROJECT_ID: llm-d-scale
    GKE_CLUSTER_NAME: llm-d-e2e-us-east5
    GKE_CLUSTER_ZONE: us-east5
    NAMESPACE: igw-prefill-heavy
    # Gateway settings.
    GATEWAY: gke-l7-regional-external-managed
    GATEWAY_PROVIDER: gke
    # workflow_dispatch input first, then the commented PR number, then 'main'.
    PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || 'main' }}
    # Model server.
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    DEPLOYMENT_NAME: vllm-qwen3-32b
    MODEL: Qwen/Qwen3-32B
    # Benchmark result storage and the service accounts used to reach it.
    GSA_EMAIL: ${{ secrets.GCS_WORKLOAD_SA }}
    GCS_BUCKET: igw-e2e-benchmark-results
    KSA_NAME: igw-e2e-benchmark-sa
  steps:
# Initial checkout; the token is not persisted in the local git config.
- name: Checkout
  uses: actions/checkout@v4
  with:
    persist-credentials: false
# Classifies PR_OR_BRANCH as a PR number (digits only) or a branch name.
# Exports TARGET to later steps and sets the is_pr step output.
- name: Determine if pr_or_branch is a PR number
  id: check_pr
  shell: bash
  run: |
    # PR_OR_BRANCH is user-controlled (workflow_dispatch input). Read it from
    # the job-level environment rather than interpolating the expression into
    # the script text, where quotes or $(...) in the value would be executed.
    VAL="${PR_OR_BRANCH}"
    echo "TARGET=$VAL" >> "$GITHUB_ENV"
    if [[ "$VAL" =~ ^[0-9]+$ ]]; then
      echo "is_pr=true" >> "$GITHUB_OUTPUT"
    else
      echo "is_pr=false" >> "$GITHUB_OUTPUT"
    fi
# TARGET is a PR number here: fetch the PR head into a local pr-<N> branch
# and switch to it.
- name: Fetch and checkout PR
  if: steps.check_pr.outputs.is_pr == 'true'
  run: |
    pr_branch="pr-${TARGET}"
    git fetch origin "pull/${TARGET}/head:${pr_branch}"
    git checkout "${pr_branch}"
# TARGET is a branch name here: fetch it from origin and switch to it.
- name: Checkout branch
  if: steps.check_pr.outputs.is_pr == 'false'
  run: |
    git fetch origin "${TARGET}"
    git checkout "${TARGET}"
# Creates benchmarking/.venv and installs the analysis tooling into it.
- name: Install Python Dependencies
  run: |
    cd benchmarking
    python3 -m venv .venv
    # Activate the venv before installing; otherwise pip targets the runner's
    # Python and the venv that the analysis step activates stays empty.
    source .venv/bin/activate
    pip install inference-perf pandas numpy matplotlib
# Authenticates with the service-account key stored in GCP_SA_KEY.
- name: Authenticate to Google Cloud
  id: auth
  uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
  with:
    credentials_json: ${{ secrets.GCP_SA_KEY }}
# Installs gcloud together with kubectl and the GKE auth plugin.
- name: Set up gcloud CLI and kubectl
  uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
  with:
    project_id: ${{ env.GCP_PROJECT_ID }}
    install_components: 'kubectl,gke-gcloud-auth-plugin'
# Writes kubeconfig credentials for the target cluster; cluster name and
# location come from the job-level env.
- name: Get GKE credentials
  run: |
    gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --zone "${GKE_CLUSTER_ZONE}"
# Ensures the test namespace exists.
- name: Create namespace
  run: |
    set -o pipefail
    # Render the manifest client-side and apply it: a no-op when the namespace
    # already exists, while genuine failures (auth, API errors) still fail the
    # step instead of being reported as "already exists".
    kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
# Creates or updates the hf-token secret in the test namespace.
- name: Create hf-token secret
  run: |
    # HF_TOKEN comes from the job-level env; using the shell variable keeps
    # the secret value out of the generated script text.
    kubectl create secret generic hf-token \
      --from-literal="token=${HF_TOKEN}" \
      --namespace "${NAMESPACE}" \
      --dry-run=client -o yaml | kubectl apply -f -
# Creates the Kubernetes service account (idempotently, via dry-run + apply)
# and annotates it with the GCP service account from GSA_EMAIL.
- name: Create and Annotate KSA for Workload Identity
  run: |
    kubectl create serviceaccount ${KSA_NAME} --namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
    kubectl annotate serviceaccount ${KSA_NAME} \
      iam.gke.io/gcp-service-account=${GSA_EMAIL} \
      --namespace "${NAMESPACE}" \
      --overwrite
# Resolves the highest non-prerelease tag of the upstream project and exports
# it as IGW_CHART_VERSION for the deployment steps.
- name: Fetch Latest IGW Release
  # An explicit bash shell runs with `-eo pipefail`, so a failing curl or jq
  # fails the step.
  shell: bash
  run: |
    # -f turns HTTP errors (e.g. API rate limiting) into a non-zero exit
    # instead of feeding an error document to jq.
    IGW_LATEST_RELEASE=$(curl -fsSL https://api.github.com/repos/kubernetes-sigs/gateway-api-inference-extension/releases \
      | jq -r '.[] | select(.prerelease == false) | .tag_name' \
      | sort -V \
      | tail -n1)
    # An empty version would silently produce broken manifest URLs later on.
    if [[ -z "$IGW_LATEST_RELEASE" || "$IGW_LATEST_RELEASE" == "null" ]]; then
      echo "Error: could not determine the latest IGW release tag." >&2
      exit 1
    fi
    echo "Using IGW release: $IGW_LATEST_RELEASE"
    echo "IGW_CHART_VERSION=$IGW_LATEST_RELEASE" >> "$GITHUB_ENV"
# Applies the vLLM deployment, scales it to 8 replicas and installs the
# release manifests for IGW_CHART_VERSION. Starts the deployment log.
- name: Deploy Model Server and CRDs
  # An explicit bash shell enables pipefail, so a failing `kubectl apply` is
  # not masked by the exit status of `tee`.
  shell: bash
  run: |
    cd config/manifests/vllm
    echo "Deploying Model Server..."
    kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee ~/igw-prefill-heavy-deployment.log
    kubectl scale deployment "${DEPLOYMENT_NAME}" -n "${NAMESPACE}" --replicas=8
    echo "Installing CRDs"
    kubectl apply -f "https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${IGW_CHART_VERSION}/manifests.yaml"
    echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
# Blocks (up to 25 minutes) until every pod in the namespace reports Ready,
# then lists the pods.
- name: Wait for all pods to be ready
  run: |
    kubectl wait pod --all --for=condition=Ready -n "${NAMESPACE}" --timeout=25m
    echo "✅ All pods are ready."
    kubectl get pods -n "${NAMESPACE}"
# Applies the GKE gateway manifest published for IGW_CHART_VERSION.
- name: Deploy Gateway
  # An explicit bash shell enables pipefail, so a failing `kubectl apply` is
  # not masked by the exit status of `tee`.
  shell: bash
  run: |
    echo "Deploying Gateway..."
    # tee -a: append, so the output recorded by earlier steps is kept.
    kubectl apply -f "https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/${IGW_CHART_VERSION}/config/manifests/gateway/gke/gateway.yaml" -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
    echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
# Blocks (up to 500 seconds) until the gateway reports Programmed=True,
# then lists the gateways in the namespace.
- name: Wait for gateway to be ready
  run: |
    GATEWAY_NAME=inference-gateway
    kubectl wait "gateway/${GATEWAY_NAME}" -n "${NAMESPACE}" --for=condition=Programmed=True --timeout=500s
    echo "✅ Gateway is ready."
    kubectl get gateway -n "${NAMESPACE}"
# Installs/upgrades the inferencepool chart at IGW_CHART_VERSION as a release
# named after the model deployment.
- name: Deploy InferencePool and Endpoint Picker Extension
  # An explicit bash shell enables pipefail, so a failing `helm upgrade` is
  # not masked by the exit status of `tee`.
  shell: bash
  run: |
    # tee -a: append, so the output recorded by earlier steps is kept.
    helm upgrade --install "${DEPLOYMENT_NAME}" \
      --dependency-update \
      --namespace "${NAMESPACE}" \
      --set inferencePool.modelServers.matchLabels.app="${DEPLOYMENT_NAME}" \
      --set provider.name="${GATEWAY_PROVIDER}" \
      --set experimentalHttpRoute.enabled=true \
      --version "${IGW_CHART_VERSION}" \
      oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-prefill-heavy-deployment.log
    echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
# Waits for the <deployment>-epp rollout, pauses 30 seconds, then waits (up to
# 5 minutes each) for the HTTPRoute and the InferencePool to report
# ResolvedRefs=True on their first parent.
- name: Wait for all resources to be ready
  run: |
    kubectl rollout status "deployment/${DEPLOYMENT_NAME}-epp" -n ${NAMESPACE} --timeout=120s
    echo "Allow extra time to initialize resources..."
    sleep 30
    resolved_refs='{.status.parents[0].conditions[?(@.type=="ResolvedRefs")].status}'
    for kind in httproute inferencepool; do
      kubectl wait "${kind}/${DEPLOYMENT_NAME}" \
        --for=jsonpath="${resolved_refs}"=True \
        -n ${NAMESPACE} \
        --timeout=5m
    done
# Diagnostic dump of the namespace. The first three listings must succeed;
# the remaining ones are best-effort (|| true).
- name: Show deployment status
  run: |
    # Prints a section banner in the form "=== <title> ===".
    banner() { echo "=== $1 ==="; }

    banner "Deployments"
    kubectl get deployments -n "${NAMESPACE}"
    echo ""
    banner "Pods"
    kubectl get pods -n "${NAMESPACE}"
    echo ""
    banner "Services"
    kubectl get svc -n "${NAMESPACE}"
    echo ""
    banner "Helm releases"
    helm list -n "${NAMESPACE}" || true
    echo ""
    banner "Inference Pools"
    kubectl get inferencepools -n "${NAMESPACE}" -o yaml || true
    echo ""
    banner "HTTPRoutes"
    kubectl get httproutes -n "${NAMESPACE}" -o yaml || true
    echo ""
    banner "Gateway"
    kubectl get Gateway -n "${NAMESPACE}" || true
    echo ""
# Runs the repository's e2e validation script against the namespace and model.
- name: Verify installation and run validation test
  working-directory: .github/scripts/e2e
  run: |
    ./e2e-validate.sh -n "${NAMESPACE}" -v -m ${MODEL}
# Installs the inference-perf chart as the prefill-heavy-benchmark release,
# pointed at the gateway address. Exports TIMESTAMP, which is also used as the
# results sub-path in the GCS bucket.
- name: Run benchmarking test
  run: |
    TIMESTAMP=$(date +"%Y-%m-%d-%H-%M-%S")
    echo "TIMESTAMP=$TIMESTAMP" >> "$GITHUB_ENV"
    cd benchmarking/single-workload

    # Use GATEWAY_HOST when set; otherwise take the first address of the first
    # Gateway found in the namespace.
    host="${GATEWAY_HOST:-$(kubectl get gateway -n "$NAMESPACE" \
      -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)}"
    if [[ -z "$host" ]]; then
      echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
      exit 1
    fi
    port=80
    svc_host="${host}:${port}"

    helm upgrade --install prefill-heavy-benchmark ../inference-perf/ -f prefill-heavy-values.yaml \
      --namespace "${NAMESPACE}" \
      --create-namespace \
      --set token.hfToken="${HF_TOKEN}" \
      --set "config.server.base_url=http://${svc_host}" \
      --set "job.serviceAccountName=$KSA_NAME" \
      --set "job.image.tag=latest" \
      --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
      --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${TIMESTAMP}" \
      --set "gcsPath=${GCS_BUCKET}/datasets/billsum_conversations.json" \
      --set "config.data.path=/gcsDataset/gcs-dataset.json" \
      --set-string 'job.resources.limits.nvidia\.com/gpu=1'
# Waits up to 7200 seconds for the benchmark job to reach condition=complete;
# on timeout or error, prints the job description and the last 50 log lines
# to stderr and fails the step.
- name: Wait for benchmarking job to finish
  run: |
    job_name=prefill-heavy-benchmark-inference-perf-job
    TIMEOUT_DURATION="7200s"
    if kubectl wait --for=condition=complete job/"$job_name" -n "$NAMESPACE" --timeout="$TIMEOUT_DURATION"; then
      echo "✅ Benchmarking Job Completed."
      exit 0
    fi
    echo "Error: Benchmark job $job_name did not complete successfully within $TIMEOUT_DURATION." >&2
    echo "--- Job Description ---" >&2
    kubectl describe job "$job_name" -n "$NAMESPACE" >&2
    echo "--- Pod Logs (Last 50 lines) ---" >&2
    kubectl logs -l job-name="$job_name" -n "$NAMESPACE" --all-containers=true --tail 50 >&2
    exit 1
# Downloads this run's results from the GCS bucket and runs the
# inference-perf analysis on them inside the benchmarking venv.
- name: Analyze Benchmarking Results
  working-directory: benchmarking
  run: |
    python3 -m venv .venv
    source .venv/bin/activate
    # The results path matches the one passed to the benchmark:
    # <namespace>/<timestamp>.
    benchmark_id='prefill-heavy' ./download-results.bash gcs $GCS_BUCKET ${NAMESPACE}/${TIMESTAMP}
    inference-perf --analyze output/default-run/prefill-heavy/results/json/${TIMESTAMP}
# Publishes everything under the downloaded JSON results directory as a
# workflow artifact.
- name: Upload Benchmark Results
  uses: actions/upload-artifact@v4
  with:
    name: benchmarking-artifacts
    path: benchmarking/output/default-run/prefill-heavy/results/json/**/*
# Posts a PR comment that links to this run and lists the generated result
# files. Only runs when the target was a PR number.
- name: Comment on PR
  if: steps.check_pr.outputs.is_pr == 'true'
  env:
    GH_TOKEN: ${{ github.token }}
    # This workflow is triggered by issue_comment / workflow_dispatch, which
    # carry no github.event.pull_request payload. The PR number is the value
    # already resolved into PR_OR_BRANCH (this step only runs when it is
    # numeric).
    PR_NUMBER: ${{ env.PR_OR_BRANCH }}
    RUN_ID: ${{ github.run_id }}
  run: |
    cd benchmarking/output/default-run/prefill-heavy/results/json/${TIMESTAMP}
    FILE_LIST=$(ls | sed 's/^/- /')
    # GITHUB_REPOSITORY is the runner-provided "owner/repo" slug.
    COMMENT_BODY=$(cat <<EOF
    ✅ **Benchmarking Job Completed!**
    You can find the benchmarking artifacts in the [Workflow Summary](https://github.com/${GITHUB_REPOSITORY}/actions/runs/$RUN_ID).
    **Files generated:**
    $FILE_LIST
    EOF
    )
    gh pr comment "$PR_NUMBER" --body "$COMMENT_BODY"
# Gathers per-pod logs and descriptions plus the deployment log into
# pod-logs-inference-prefill-heavy. Runs even when earlier steps failed.
- name: Collect and upload Kubernetes pod logs
  if: always()
  run: |
    mkdir -p pod-logs-inference-prefill-heavy
    cd pod-logs-inference-prefill-heavy
    # Collection is best-effort: `kubectl logs` fails for pods that never
    # started, which makes xargs exit non-zero. `|| true` keeps one such pod
    # from aborting the step before the remaining diagnostics are gathered.
    echo "Fetching ${NAMESPACE} pods log..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1' || true
    echo "Fetching ${NAMESPACE} pods descriptions..."
    kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
      | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1' || true
    mv ~/igw-prefill-heavy-deployment.log . || true
    mv ~/install-deps.log . || true
# Publishes the collected log directory as an artifact named after the matrix
# accelerator. Runs even when earlier steps failed.
- name: Upload pod logs as artifact
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: igw-pod-logs-inference-prefill-heavy-${{ matrix.accelerator.name }}
    path: pod-logs-inference-prefill-heavy
# Sends the job status to the Google Chat webhook, only when a previous step
# has failed.
- name: Send Google Chat notification on failure
  if: failure()
  uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
  with:
    title: '${{ github.workflow }} - ${{ matrix.accelerator.name }}'
    jobStatus: ${{ job.status }}
    webhookUrl: ${{ secrets.GOOGLE_CHAT_WEBHOOK }}
# Removes the Helm releases and Kubernetes resources created by this job.
# Runs even when earlier steps failed.
- name: Cleanup deployment
  if: always()
  run: |
    # Best-effort teardown: attempt every deletion even if one fails, so a
    # single error does not leave the remaining resources running. The step
    # still fails at the end if any deletion failed.
    set +e
    status=0
    GATEWAY_NAME=inference-gateway
    helm uninstall "${DEPLOYMENT_NAME}" -n "${NAMESPACE}" --ignore-not-found || status=1
    helm uninstall prefill-heavy-benchmark -n "${NAMESPACE}" --ignore-not-found || status=1
    kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found || status=1
    kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found || status=1
    kubectl delete deployment "${DEPLOYMENT_NAME}" -n "${NAMESPACE}" --ignore-not-found || status=1
    exit "${status}"