Skip to content

Commit 669f5c4

Browse files
authored
cleanup/refactor deploy scripts (llm-d#959)
* phase 1 clean up
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
* phase 2 decouple
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
* skip modelservice install for e2es
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
* apply review changes from copilot
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
---------
Signed-off-by: Mohammed Abdi <mohammed.munir.abdi@ibm.com>
1 parent f097ac8 commit 669f5c4

32 files changed

Lines changed: 2231 additions & 2118 deletions

Makefile

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ E2E_MONITORING_NAMESPACE ?= workload-variant-autoscaler-monitoring
2727
E2E_EMULATED_LLMD_NAMESPACE ?= llm-d-sim
2828

2929
# Flags for deploy/install.sh installation script
30+
# Full e2e / CI-style cluster infra (WVA + llm-d, no chart VA/HPA): prefer `make deploy-e2e-infra`
31+
# (wraps ./deploy/install.sh with INFRA_ONLY=true; set ENVIRONMENT=kubernetes|openshift|kind-emulator).
3032
CREATE_CLUSTER ?= false
3133
DEPLOY_LLM_D ?= true
3234
DELETE_CLUSTER ?= false
@@ -111,7 +113,7 @@ destroy-kind-cluster:
111113
.PHONY: deploy-wva-emulated-on-kind
112114
deploy-wva-emulated-on-kind: ## Deploy WVA + llm-d on Kind (Prometheus Adapter as scaler backend)
113115
@echo ">>> Deploying workload-variant-autoscaler (cluster args: $(KIND_ARGS), image: $(IMG))"
114-
KIND=$(KIND) KUBECTL=$(KUBECTL) IMG=$(IMG) DEPLOY_LLM_D=$(DEPLOY_LLM_D) ENVIRONMENT=kind-emulator CREATE_CLUSTER=$(CREATE_CLUSTER) CLUSTER_GPU_TYPE=$(CLUSTER_GPU_TYPE) CLUSTER_NODES=$(CLUSTER_NODES) CLUSTER_GPUS=$(CLUSTER_GPUS) MULTI_MODEL_TESTING=$(MULTI_MODEL_TESTING) NAMESPACE_SCOPED=false SCALER_BACKEND=$(SCALER_BACKEND) \
116+
KIND=$(KIND) KUBECTL=$(KUBECTL) IMG=$(IMG) DEPLOY_LLM_D=$(DEPLOY_LLM_D) ENVIRONMENT=kind-emulator CREATE_CLUSTER=$(CREATE_CLUSTER) CLUSTER_GPU_TYPE=$(CLUSTER_GPU_TYPE) CLUSTER_NODES=$(CLUSTER_NODES) CLUSTER_GPUS=$(CLUSTER_GPUS) NAMESPACE_SCOPED=false SCALER_BACKEND=$(SCALER_BACKEND) \
115117
deploy/install.sh
116118

117119
## Undeploy WVA from the emulated environment on Kind.
@@ -296,6 +298,21 @@ test-benchmark-with-setup: deploy-e2e-infra test-benchmark
296298
lint: golangci-lint ## Run golangci-lint linter
297299
$(GOLANGCI_LINT) run
298300

301+
.PHONY: lint-deploy-scripts
302+
lint-deploy-scripts: ## Run bash -n for deploy/install.sh, deploy/lib/*.sh, and deploy plugins
303+
@echo "Syntax-checking deploy shell scripts..."
304+
@bash -n deploy/install.sh
305+
@for script in deploy/lib/*.sh; do bash -n "$$script"; done
306+
@for script in deploy/*/install.sh; do if [ -f "$$script" ]; then bash -n "$$script"; fi; done
307+
@for script in deploy/kind-emulator/*.sh; do if [ -f "$$script" ]; then bash -n "$$script"; fi; done
308+
@echo "deploy script syntax OK"
309+
310+
.PHONY: smoke-deploy-scripts
311+
smoke-deploy-scripts: lint-deploy-scripts ## Non-interactive deploy script smoke check (source order + arg parsing)
312+
@echo "Running deploy script smoke check..."
313+
@SKIP_CHECKS=true E2E_TESTS_ENABLED=true INSTALL_GATEWAY_CTRLPLANE=true ENVIRONMENT=kubernetes ./deploy/install.sh --help >/dev/null
314+
@echo "deploy script smoke OK"
315+
299316
.PHONY: lint-fix
300317
lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
301318
$(GOLANGCI_LINT) run --fix

deploy/README.md

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ export DEPLOY_PROMETHEUS=true # Deploy Prometheus stack
139139
export DEPLOY_WVA=true # Deploy WVA controller
140140
export DEPLOY_LLM_D=true # Deploy llm-d infrastructure
141141
export DEPLOY_PROMETHEUS_ADAPTER=true # Deploy Prometheus Adapter
142-
export DEPLOY_VA=true # Deploy VariantAutoscaling CR
143-
export DEPLOY_HPA=true # Deploy HPA
142+
export DEPLOY_VA=true # Chart-managed VariantAutoscaling (default off; e2e often creates its own)
143+
export DEPLOY_HPA=true # Chart-managed HPA (default off; enable with DEPLOY_VA for demos)
144144

145145
# HPA Configuration
146146
export HPA_STABILIZATION_SECONDS=240 # HPA stabilization window (default: 240s)
@@ -188,6 +188,9 @@ bash install.sh
188188

189189
```bash
190190
export HF_TOKEN="hf_xxxxx"
191+
# Optional: chart-managed VA + HPA for a single-variant demo (install.sh defaults skip these)
192+
export DEPLOY_VA=true
193+
export DEPLOY_HPA=true
191194
make deploy-wva-on-k8s
192195
```
193196

@@ -198,6 +201,8 @@ export HF_TOKEN="hf_xxxxx"
198201
export MODEL_ID="meta-llama/Llama-2-7b-hf"
199202
export SLO_TPOT=5
200203
export SLO_TTFT=500
204+
export DEPLOY_VA=true
205+
export DEPLOY_HPA=true
201206
make deploy-wva-on-k8s
202207
```
203208

@@ -208,6 +213,7 @@ export DEPLOY_WVA=true
208213
export DEPLOY_LLM_D=false
209214
export DEPLOY_PROMETHEUS=true # Prometheus is needed for metrics - disable if it is already installed in your cluster
210215
export DEPLOY_PROMETHEUS_ADAPTER=false
216+
export DEPLOY_VA=true # Create a VariantAutoscaling CR for the existing model service
211217
export DEPLOY_HPA=false
212218
make deploy-wva-on-k8s
213219
```
@@ -216,6 +222,8 @@ make deploy-wva-on-k8s
216222

217223
```bash
218224
export HF_TOKEN="hf_xxxxx"
225+
export DEPLOY_VA=true
226+
export DEPLOY_HPA=true
219227
export HPA_STABILIZATION_SECONDS=30 # Fast scaling for dev/test (default: 240)
220228
make deploy-wva-on-k8s
221229
```
@@ -224,9 +232,10 @@ make deploy-wva-on-k8s
224232

225233
```bash
226234
export HF_TOKEN="hf_xxxxx"
227-
export HPA_STABILIZATION_SECONDS=0 # Immediate scaling for e2e tests
228-
export VLLM_MAX_NUM_SEQS=8 # Low batch size for easy saturation
229235
export E2E_TESTS_ENABLED=true
236+
export INFRA_ONLY=true # Tests create VA/HPA; see also make deploy-e2e-infra
237+
export HPA_STABILIZATION_SECONDS=0 # Only applies if chart HPA is enabled
238+
export VLLM_MAX_NUM_SEQS=8 # Low batch size for easy saturation
230239
make deploy-wva-on-k8s
231240
```
232241

@@ -236,6 +245,8 @@ make deploy-wva-on-k8s
236245
export HF_TOKEN="hf_xxxxx"
237246
export VLLM_MAX_NUM_SEQS=64 # Match desired max batch size
238247
export MODEL_ID="unsloth/Meta-Llama-3.1-8B"
248+
export DEPLOY_VA=true
249+
export DEPLOY_HPA=true
239250
make deploy-wva-on-k8s
240251
```
241252

@@ -650,8 +661,8 @@ Each guide includes platform-specific examples, troubleshooting, and quick start
650661
| `DEPLOY_WVA` | Deploy WVA controller | `true` |
651662
| `DEPLOY_LLM_D` | Deploy llm-d infrastructure | `true` |
652663
| `DEPLOY_PROMETHEUS_ADAPTER` | Deploy Prometheus Adapter | `true` |
653-
| `DEPLOY_VA` | Deploy VariantAutoscaling CR | `true` |
654-
| `DEPLOY_HPA` | Deploy HPA | `true` |
664+
| `DEPLOY_VA` | Deploy VariantAutoscaling CR via WVA Helm chart | `false` |
665+
| `DEPLOY_HPA` | Deploy HPA via WVA Helm chart | `false` |
655666
| `INFRA_ONLY` | Deploy only infrastructure (skip VA/HPA) | `false` |
656667
| `SKIP_CHECKS` | Skip prerequisite checks | `false` |
657668

deploy/inference-objective-e2e.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
# InferenceObjective for GIE queuing (scale-from-zero e2e and flow control).
2-
# Applied when E2E_TESTS_ENABLED or ENABLE_SCALE_TO_ZERO is true.
1+
# InferenceObjective for GIE queuing (scale-from-zero flow control).
2+
# install.sh applies this when ENABLE_SCALE_TO_ZERO=true and E2E_TESTS_ENABLED is false (e2e tests apply the e2e-default objective from Go).
33
# poolRef.name is templated by install.sh to match the deployed InferencePool.
44
apiVersion: inference.networking.x-k8s.io/v1alpha2
55
kind: InferenceObjective

0 commit comments

Comments (0)