Skip to content

Commit 5f7d2bc

Browse files
committed
update: chart for deployment on ocp, kind, k8s
- update the Makefile for ocp: we should not have the prom stack or GW created on ocp - fix the order of the prompt warning asking whether the GW should be created - update helm template for handling tls cert: ocp: just use the annotation which creates the cert in the mounted secret - kind/k8s: if caCertPath is set to a non-default value, use it; install.sh creates the secret prometheus-web-tls with tls.crt as the key inside, so there is no need to extract the secret's tls.crt - add missing config for values-dev.yaml - deprecate "caCert" in favor of the new caCertPath or existingSecret + key Signed-off-by: Wen Zhou <wenzhou@redhat.com> update: cleanup - remove cert extraction for prom adapter - remove creation of configmap - remove reference in OCP Signed-off-by: Wen Zhou <wenzhou@redhat.com> update: bump version for llm-d and all images + add support for podman - use env variable LLM_D_RELEASE to control all images in deploy/install.sh - clone llm-d locally based on the local version if it matches the required release version - use env variable CONTAINER_TOOL to support podman on fedora - remove/update *ignore files Signed-off-by: Wen Zhou <wenzhou@redhat.com> fix: k8s label cannot have "/", sanitize MODEL_ID Signed-off-by: Wen Zhou <wenzhou@redhat.com> fix: add timeout for prom adapter + create secret in wva namespace from prom ns Signed-off-by: Wen Zhou <wenzhou@redhat.com> fix: github action e2e which uses "LLM_D_RELEASE:main" Signed-off-by: Wen Zhou <wenzhou@redhat.com> fix: API bump - old test was based on v1alpha2 of GIE for infpool - new default is on v1 for infpool Signed-off-by: Wen Zhou <wenzhou@redhat.com> update: code review and go fmt Signed-off-by: Wen Zhou <wenzhou@redhat.com>
1 parent 274f908 commit 5f7d2bc

22 files changed

Lines changed: 226 additions & 60 deletions

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ vendor/
1414

1515
# Submodules and sibling repos (not needed for building the manager binary)
1616
sample-data/
17+
llm-d/
1718
llm-d-infra/
1819
# If building from a parent repo that includes llmd or GAIE, add:
1920
# llmd/

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,8 @@ jobs:
398398
HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '240' }}
399399
SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
400400
# Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support)
401-
LLM_D_RELEASE: main
401+
LLM_D_EPP_RELEASE: main
402+
LLM_D_SIM_RELEASE: main
402403
# PR-specific namespaces for isolation between concurrent PR tests
403404
# Primary llm-d namespace (Model A1 + A2)
404405
LLMD_NAMESPACE: llm-d-inference-scheduler-pr-${{ needs.gate.outputs.pr_number || github.run_id }}

.gitignore

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@ gpu.cluster
3030
# llm-d and llm-d-infra directories
3131
llm-d/
3232
llm-d-infra/
33-
llmd/
34-
llmd-infra/
3533

3634
*.tgz
3735
actionlint

Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ CLUSTER_GPU_TYPE ?= nvidia-mix
77
CLUSTER_NODES ?= 3
88
CLUSTER_GPUS ?= 4
99
KUBECONFIG ?= $(HOME)/.kube/config
10-
K8S_VERSION ?= v1.32.0
10+
# Kubernetes version, chosen to match OCP 4.19.
# NOTE: keep this comment on its own line. GNU Make does not strip whitespace
# that precedes a trailing `#` comment, so an inline comment would make the
# value "v1.32.0 " (with a trailing space).
K8S_VERSION ?= v1.32.0
1111

1212
CONTROLLER_NAMESPACE ?= workload-variant-autoscaler-system
1313
MONITORING_NAMESPACE ?= openshift-user-workload-monitoring
@@ -44,7 +44,7 @@ endif
4444
# CONTAINER_TOOL defines the container tool to be used for building images.
4545
# Be aware that the target commands are only tested with Docker which is
4646
# scaffolded by default. However, you might want to replace it to use other
47-
# tools. (i.e. podman)
47+
# tools. (e.g. docker, podman)
4848
CONTAINER_TOOL ?= docker
4949

5050
# Setting SHELL to bash allows bash commands to be executed by recipes.
@@ -127,9 +127,9 @@ undeploy-wva-emulated-on-kind:
127127
## Deploy WVA to OpenShift cluster with specified image.
128128
.PHONY: deploy-wva-on-openshift
129129
deploy-wva-on-openshift: manifests kustomize ## Deploy WVA to OpenShift cluster with specified image.
130-
@echo "Deploying WVA to OpenShift with image: $(IMG)"
130+
@echo "Deploying WVA to OpenShift with image: $(IMG) by default no GW nor Prom stack to be created"
131131
@echo "Target namespace: $(or $(NAMESPACE),workload-variant-autoscaler-system)"
132-
NAMESPACE=$(or $(NAMESPACE),workload-variant-autoscaler-system) IMG=$(IMG) ENVIRONMENT=openshift DEPLOY_LLM_D=$(DEPLOY_LLM_D) ./deploy/install.sh
132+
NAMESPACE=$(or $(NAMESPACE),workload-variant-autoscaler-system) IMG=$(IMG) ENVIRONMENT=openshift INSTALL_GATEWAY_CTRLPLANE=false DEPLOY_PROMETHEUS=false DEPLOY_LLM_D=$(DEPLOY_LLM_D) ./deploy/install.sh
133133

134134
## Undeploy WVA from OpenShift.
135135
.PHONY: undeploy-wva-on-openshift
@@ -197,6 +197,7 @@ deploy-e2e-infra: ## Deploy e2e test infrastructure (infra-only: WVA + llm-d, no
197197
WVA_IMAGE_REPO=$$IMAGE_REPO \
198198
WVA_IMAGE_TAG=$$IMAGE_TAG \
199199
WVA_IMAGE_PULL_POLICY=IfNotPresent \
200+
CONTAINER_TOOL=$(CONTAINER_TOOL) \
200201
./deploy/install.sh; \
201202
else \
202203
echo "IMG not set - using default image from registry (latest)"; \
@@ -207,6 +208,7 @@ deploy-e2e-infra: ## Deploy e2e test infrastructure (infra-only: WVA + llm-d, no
207208
SCALER_BACKEND=$(SCALER_BACKEND) \
208209
INSTALL_GATEWAY_CTRLPLANE=true \
209210
NAMESPACE_SCOPED=false \
211+
CONTAINER_TOOL=$(CONTAINER_TOOL) \
210212
./deploy/install.sh; \
211213
fi
212214

charts/workload-variant-autoscaler/templates/manager/wva-deployment-controller-manager.yaml

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,14 @@ spec:
105105
mountPath: /etc/wva/config.yaml
106106
subPath: config.yaml
107107
readOnly: true
108-
{{- if .Values.wva.prometheus.caCert }}
108+
{{- if ne .Values.wva.prometheus.tls.caCertPath "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" }}
109109
- name: prometheus-ca-cert
110-
mountPath: /etc/ssl/certs/prometheus-ca.crt
110+
mountPath: {{ .Values.wva.prometheus.tls.caCertPath }}
111+
{{- if .Values.wva.prometheus.tls.existingSecret }}
112+
subPath: {{ .Values.wva.prometheus.tls.key }}
113+
{{- else }}
111114
subPath: ca.crt
115+
{{- end }}
112116
readOnly: true
113117
{{- end }}
114118
- name: epp-metrics-token
@@ -118,10 +122,22 @@ spec:
118122
- name: wva-config
119123
configMap:
120124
name: {{ include "workload-variant-autoscaler.fullname" . }}-variantautoscaling-config
121-
{{- if .Values.wva.prometheus.caCert }}
125+
{{- if ne .Values.wva.prometheus.tls.caCertPath "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" }}
122126
- name: prometheus-ca-cert
127+
{{- if .Values.wva.prometheus.tls.existingSecret }}
128+
secret:
129+
secretName: {{ .Values.wva.prometheus.tls.existingSecret }}
130+
{{- if .Values.wva.prometheus.tls.key }}
131+
items:
132+
- key: {{ .Values.wva.prometheus.tls.key }}
133+
path: {{ .Values.wva.prometheus.tls.key }}
134+
{{- end }}
135+
optional: false
136+
{{- else }}
123137
configMap:
124138
name: {{ include "workload-variant-autoscaler.fullname" . }}-prometheus-ca
139+
optional: true
140+
{{- end }}
125141
{{- end }}
126142
- name: epp-metrics-token
127143
secret:

charts/workload-variant-autoscaler/values-dev.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,13 @@ wva:
2727
# Development security configuration (relaxed for easier development)
2828
tls:
2929
insecureSkipVerify: true # Development: true, Production: false
30-
# caCertPath: "/etc/ssl/certs/prometheus-ca.crt" # Only used when caCert is provided
31-
# caCert: | # Uncomment and provide your CA certificate
30+
# On OpenShift, service CA is auto-injected at this path
31+
caCertPath: "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
32+
# Mount existing Secret for CA certificate (recommended)
33+
# This eliminates the need for --set-file and certificate extraction
34+
existingSecret: "" # Name of existing Secret (e.g., "prometheus-web-tls")
35+
key: "tls.crt" # Name of the key in the Secret's data map containing the CA certificate (required when existingSecret is set)
36+
# caCert: | # DEPRECATED: Use above existingSecret instead
3237
# -----BEGIN CERTIFICATE-----
3338
# YOUR_CA_CERTIFICATE_HERE
3439
# -----END CERTIFICATE-----

charts/workload-variant-autoscaler/values.yaml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,16 @@ wva:
3737
monitoringNamespace: openshift-user-workload-monitoring
3838
serviceAccountName: "kube-prometheus-stack-prometheus"
3939
baseURL: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091"
40-
# Development security configuration (relaxed for easier development)
40+
# TLS security configuration
4141
tls:
4242
insecureSkipVerify: true # Development: true, Production: false
43-
# caCertPath: "/etc/ssl/certs/prometheus-ca.crt" # Only used when caCert is provided
44-
# caCert: | # Uncomment and provide your CA certificate
43+
# On OpenShift, service CA is auto-injected at this path
44+
caCertPath: "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
45+
# Mount existing Secret for CA certificate (recommended)
46+
# This eliminates the need for --set-file and certificate extraction
47+
existingSecret: "" # Name of existing Secret (e.g., "prometheus-web-tls" for kind or k8s)
48+
key: "tls.crt" # Name of the key in the Secret's data map containing the CA certificate (required when existingSecret is set)
49+
# caCert: | # DEPRECATED: Use above existingSecret instead
4550
# -----BEGIN CERTIFICATE-----
4651
# YOUR_CA_CERTIFICATE_HERE
4752
# -----END CERTIFICATE-----

config/samples/dummy-va.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Minimal dummy VariantAutoscaling to satisfy the controller
2+
# This prevents "No active VariantAutoscalings found" log messages
3+
# while doing minimal actual work (no real optimization since it's alone)
4+
---
5+
apiVersion: v1
6+
kind: Namespace
7+
metadata:
8+
name: wva-dummy
9+
---
10+
apiVersion: apps/v1
11+
kind: Deployment
12+
metadata:
13+
name: dummy-deployment
14+
namespace: wva-dummy
15+
spec:
16+
replicas: 1
17+
selector:
18+
matchLabels:
19+
app: dummy
20+
template:
21+
metadata:
22+
labels:
23+
app: dummy
24+
spec:
25+
containers:
26+
- name: pause
27+
image: registry.k8s.io/pause:3.9
28+
resources:
29+
requests:
30+
cpu: 1m
31+
memory: 1Mi
32+
limits:
33+
cpu: 1m
34+
memory: 1Mi
35+
---
36+
apiVersion: llmd.ai/v1alpha1
37+
kind: VariantAutoscaling
38+
metadata:
39+
name: dummy-va
40+
namespace: wva-dummy
41+
labels:
42+
inference.optimization/acceleratorName: dummy
43+
spec:
44+
scaleTargetRef:
45+
kind: Deployment
46+
name: dummy-deployment
47+
modelID: "dummy/model"

config/samples/prometheus-adapter-values-ocp.yaml

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,14 @@ logLevel: 4
2020
tls:
2121
enable: false # Inbound TLS (Client → Adapter)
2222

23-
extraVolumes:
24-
- name: prometheus-ca
25-
configMap:
26-
name: prometheus-ca
23+
# No extra volumes needed - OpenShift auto-injects service CA
24+
# extraVolumes: []
2725

28-
extraVolumeMounts:
29-
- name: prometheus-ca
30-
mountPath: /etc/prometheus-ca
31-
readOnly: true
26+
# No extra volume mounts needed - use service CA from projected volume
27+
# extraVolumeMounts: []
3228

3329
extraArguments:
34-
- --prometheus-ca-file=/etc/prometheus-ca/ca.crt
30+
- --prometheus-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
3531
- --prometheus-token-file=/var/run/secrets/kubernetes.io/serviceaccount/token
3632

3733

config/samples/prometheus-adapter-values.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,11 @@ tls:
2222

2323
extraVolumes:
2424
- name: prometheus-ca
25-
configMap:
26-
name: prometheus-ca
25+
secret:
26+
secretName: prometheus-web-tls # Secret containing Prom TLS cert
27+
items:
28+
- key: tls.crt # Extract the cert from the Secret
29+
path: ca.crt # Mount as ca.crt
2730

2831
extraVolumeMounts:
2932
- name: prometheus-ca

0 commit comments

Comments
 (0)