Skip to content

Commit d429d81

Browse files
committed
fix: rebase mess
Signed-off-by: Wen Zhou <wenzhou@redhat.com>
1 parent 5f7d2bc commit d429d81

10 files changed

Lines changed: 25 additions & 138 deletions

File tree

.github/workflows/ci-e2e-openshift.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,7 @@ jobs:
398398
HPA_STABILIZATION_SECONDS: ${{ github.event.inputs.hpa_stabilization_seconds || '240' }}
399399
SKIP_CLEANUP: ${{ github.event.inputs.skip_cleanup || 'false' }}
400400
# Use main branch of llm-d/llm-d for inferencepool chart v1.2.1 (GA API support)
401+
LLM_D_RELEASE: main
401402
LLM_D_EPP_RELEASE: main
402403
LLM_D_SIM_RELEASE: main
403404
# PR-specific namespaces for isolation between concurrent PR tests

config/samples/dummy-va.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

deploy/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -288,15 +288,16 @@ Deploy with a specific llm-d release version and use Podman instead of Docker:
288288
289289
```bash
290290
export HF_TOKEN="hf_xxxxx"
291-
export LLM_D_EPP_RELEASE="v0.7.1" # Pin to specific llm-d-inference-scheduler version
292-
export LLM_D_SIM_RELEASE="v0.8.2" # Pin to specific llm-d-inference-sim version
293-
export CONTAINER_TOOL=podman # Use Podman instead of Docker
291+
export LLM_D_RELEASE="v0.6.0" # Pin llm-d repo clone version
292+
export LLM_D_EPP_RELEASE="v0.7.1" # Pin llm-d-inference-scheduler image tag
293+
export LLM_D_SIM_RELEASE="v0.8.2" # Pin llm-d-inference-sim image tag
294+
export CONTAINER_TOOL=podman # Use Podman instead of Docker
294295
make deploy-wva-emulated-on-kind
295296
296-
# The variable automatically sets:
297+
# These variables independently control:
298+
# - LLM_D_RELEASE: which branch/tag of the llm-d repo to clone
297299
# - LLM_D_EPP_RELEASE: image tag, i.e. LLM_D_INFERENCE_SCHEDULER_IMG=ghcr.io/llm-d/llm-d-inference-scheduler:v0.7.1
298300
# - LLM_D_SIM_RELEASE: image tag, i.e. LLM_D_INFERENCE_SIM_IMG=ghcr.io/llm-d/llm-d-inference-sim:v0.8.2
299-
# - llm-d repository clone version
300301
```
301302

302303
### Method 2: Helm Chart

deploy/install.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ SLO_TPOT=${SLO_TPOT:-10} # Target time-per-output-token SLO (in ms)
9090
SLO_TTFT=${SLO_TTFT:-1000} # Target time-to-first-token SLO (in ms)
9191

9292
# Prometheus Configuration
93-
PROM_CA_CERT_PATH=${PROM_CA_CERT_PATH:-"/tmp/prometheus-ca.crt"}
9493
PROMETHEUS_SECRET_NAME=${PROMETHEUS_SECRET_NAME:-"prometheus-web-tls"}
9594

9695
# Flags for deployment steps

deploy/kind-emulator/install.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ PROMETHEUS_PORT="9090"
6767
PROMETHEUS_URL="$PROMETHEUS_BASE_URL:$PROMETHEUS_PORT"
6868
PROMETHEUS_SECRET_NAME="prometheus-web-tls"
6969
# Prometheus TLS - mount existing secret directly (no extraction needed)
70+
PROM_TLS_SECRET_NAME="prometheus-web-tls"
71+
PROM_TLS_KEY="tls.crt"
7072
PROM_TLS_CA_CERT_PATH="/etc/ssl/certs/prometheus-ca.crt" # must differ from the OCP default path
7173

7274
# KIND cluster configuration

deploy/kubernetes/install.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ PROMETHEUS_PORT="9090"
1919
PROMETHEUS_URL=${PROMETHEUS_URL:-"$PROMETHEUS_BASE_URL:$PROMETHEUS_PORT"}
2020
PROMETHEUS_SECRET_NAME=${PROMETHEUS_SECRET_NAME:-"prometheus-web-tls"}
2121
# Prometheus TLS - mount existing secret directly (no extraction needed)
22+
PROM_TLS_SECRET_NAME="prometheus-web-tls"
23+
PROM_TLS_KEY="tls.crt"
2224
PROM_TLS_CA_CERT_PATH="/etc/ssl/certs/prometheus-ca.crt" # must differ from the OCP default path
2325
DEPLOY_PROMETHEUS=${DEPLOY_PROMETHEUS:-"true"}
2426
SKIP_TLS_VERIFY=${SKIP_TLS_VERIFY:-"true"}

deploy/lib/cleanup.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ undeploy_wva_controller() {
9494
helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NS" 2>/dev/null || \
9595
log_warning "Workload-Variant-Autoscaler not found or already uninstalled"
9696

97-
rm -f "$PROM_CA_CERT_PATH"
98-
9997
log_success "WVA uninstalled"
10098
}
10199

deploy/lib/infra_wva.sh

Lines changed: 9 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ deploy_wva_controller() {
7171
helm upgrade -i "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
7272
-n "$WVA_NS" \
7373
--values $VALUES_FILE \
74-
--set-file wva.prometheus.caCert="$PROM_CA_CERT_PATH" \
7574
--set wva.image.repository="$WVA_IMAGE_REPO" \
7675
--set wva.image.tag="$WVA_IMAGE_TAG" \
7776
--set wva.imagePullPolicy="$WVA_IMAGE_PULL_POLICY" \
@@ -89,6 +88,9 @@ deploy_wva_controller() {
8988
--set llmd.namespace="$LLMD_NS" \
9089
--set wva.prometheus.baseURL="$PROMETHEUS_URL" \
9190
--set wva.prometheus.monitoringNamespace="$MONITORING_NAMESPACE" \
91+
--set wva.prometheus.tls.caCertPath="$PROM_TLS_CA_CERT_PATH" \
92+
${PROM_TLS_SECRET_NAME:+--set wva.prometheus.tls.existingSecret=$PROM_TLS_SECRET_NAME} \
93+
${PROM_TLS_KEY:+--set wva.prometheus.tls.key=$PROM_TLS_KEY} \
9294
--set vllmService.enabled="$VLLM_SVC_ENABLED" \
9395
--set vllmService.port="$VLLM_SVC_PORT" \
9496
--set vllmService.targetPort="$VLLM_SVC_PORT" \
@@ -172,9 +174,12 @@ delete_namespaces_kube_like() {
172174
deploy_wva_prerequisites_kube_like() {
173175
log_info "Deploying Workload-Variant-Autoscaler prerequisites for Kubernetes..."
174176

175-
# Extract Prometheus CA certificate
176-
log_info "Extracting Prometheus TLS certificate"
177-
kubectl get secret "$PROMETHEUS_SECRET_NAME" -n "$MONITORING_NAMESPACE" -o jsonpath='{.data.tls\.crt}' | base64 -d > "$PROM_CA_CERT_PATH"
177+
# Copy the Prometheus TLS Secret into the WVA namespace so it can be mounted directly (no extraction needed)
178+
log_info "Copying $PROMETHEUS_SECRET_NAME Secret to WVA namespace..."
179+
kubectl get secret "$PROMETHEUS_SECRET_NAME" -n "$MONITORING_NAMESPACE" -o yaml | \
180+
sed "s/namespace: $MONITORING_NAMESPACE/namespace: $WVA_NS/" | \
181+
kubectl apply -f - &> /dev/null
182+
log_success "Secret copied to $WVA_NS namespace"
178183

179184
local use_values_dev=false
180185
if [ "$SKIP_TLS_VERIFY" = "true" ]; then
@@ -202,63 +207,3 @@ deploy_wva_prerequisites_kube_like() {
202207

203208
log_success "WVA prerequisites complete"
204209
}
205-
206-
# OpenShift-specific CA extraction used by deploy/openshift/install.sh.
207-
extract_openshift_prometheus_ca() {
208-
# Extract OpenShift Service CA certificate for Thanos verification
209-
# Note: For OpenShift service certificates, we need the Service CA that signed the server cert,
210-
# not the server certificate itself. The server cert is in thanos-querier-tls, but we need the CA.
211-
log_info "Extracting OpenShift Service CA certificate for Thanos verification"
212-
213-
# Method 1: Extract Service CA from openshift-service-ca.crt ConfigMap (preferred)
214-
# This is the actual CA certificate that signs OpenShift service certificates
215-
if kubectl get configmap openshift-service-ca.crt -n "$PROMETHEUS_SECRET_NS" &> /dev/null; then
216-
log_info "Extracting Service CA from openshift-service-ca.crt ConfigMap"
217-
kubectl get configmap openshift-service-ca.crt -n "$PROMETHEUS_SECRET_NS" -o jsonpath='{.data.service-ca\.crt}' > "$PROM_CA_CERT_PATH" 2>/dev/null || true
218-
if [ -s "$PROM_CA_CERT_PATH" ]; then
219-
log_success "Extracted Service CA from openshift-service-ca.crt ConfigMap"
220-
fi
221-
fi
222-
223-
# Method 2: Extract Service CA from openshift-config namespace
224-
if [ ! -s "$PROM_CA_CERT_PATH" ]; then
225-
log_info "Trying to extract Service CA from openshift-config namespace"
226-
kubectl get configmap openshift-service-ca -n openshift-config -o jsonpath='{.data.service-ca\.crt}' > "$PROM_CA_CERT_PATH" 2>/dev/null || true
227-
if [ -s "$PROM_CA_CERT_PATH" ]; then
228-
log_success "Extracted Service CA from openshift-config namespace"
229-
fi
230-
fi
231-
232-
# Method 3: Fallback to thanos-querier-tls secret (as per Helm README)
233-
# Note: This extracts the server certificate, which may work if the cert chain includes the CA
234-
# but it's not ideal - we should use the Service CA instead.
235-
if [ ! -s "$PROM_CA_CERT_PATH" ]; then
236-
log_warning "Service CA not found, falling back to server certificate from thanos-querier-tls"
237-
log_warning "This may cause TLS verification issues - Service CA is preferred"
238-
if kubectl get secret "$PROMETHEUS_SECRET_NAME" -n "$PROMETHEUS_SECRET_NS" &> /dev/null; then
239-
log_info "Extracting certificate from thanos-querier-tls secret (as per Helm README)"
240-
kubectl get secret "$PROMETHEUS_SECRET_NAME" -n "$PROMETHEUS_SECRET_NS" -o jsonpath='{.data.tls\.crt}' | base64 -d > "$PROM_CA_CERT_PATH"
241-
if [ -s "$PROM_CA_CERT_PATH" ]; then
242-
log_success "Extracted certificate from thanos-querier-tls secret"
243-
fi
244-
fi
245-
fi
246-
247-
# Verify we have a valid certificate
248-
if [ ! -s "$PROM_CA_CERT_PATH" ]; then
249-
log_error "Failed to extract OpenShift Service CA certificate"
250-
log_error "Tried: openshift-service-ca.crt ConfigMap, openshift-config ConfigMap, and thanos-querier-tls secret"
251-
exit 1
252-
fi
253-
254-
# Verify the certificate is valid PEM format
255-
if ! openssl x509 -in "$PROM_CA_CERT_PATH" -text -noout &> /dev/null; then
256-
log_warning "Certificate file may not be in valid PEM format, but continuing..."
257-
log_warning "If TLS errors occur, verify the certificate format is correct"
258-
else
259-
# Log certificate details for debugging
260-
local cert_subject
261-
cert_subject=$(openssl x509 -in "$PROM_CA_CERT_PATH" -noout -subject 2>/dev/null | sed 's/subject=//' || echo "unknown")
262-
log_info "Certificate subject: $cert_subject"
263-
fi
264-
}

deploy/lib/scaler_runtime.sh

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# Scaler backend deployment/runtime helpers for deploy/install.sh.
44
# Requires vars: MONITORING_NAMESPACE, KEDA_NAMESPACE, KEDA_CHART_VERSION,
5-
# PROM_CA_CERT_PATH, PROMETHEUS_BASE_URL, PROMETHEUS_PORT, E2E_TESTS_ENABLED.
5+
# PROMETHEUS_BASE_URL, PROMETHEUS_PORT, E2E_TESTS_ENABLED.
66
# Requires funcs: log_info/log_warning/log_success/log_error,
77
# should_skip_helm_repo_update(), retry_until_success().
88
#
@@ -160,22 +160,6 @@ deploy_prometheus_adapter() {
160160
helm repo update
161161
fi
162162

163-
# Create prometheus-ca ConfigMap from the CA certificate
164-
log_info "Creating prometheus-ca ConfigMap for Prometheus Adapter"
165-
if [ ! -f "$PROM_CA_CERT_PATH" ] || [ ! -s "$PROM_CA_CERT_PATH" ]; then
166-
log_error "CA certificate file not found or empty: $PROM_CA_CERT_PATH"
167-
log_error "Please ensure deploy_wva_prerequisites() was called first"
168-
exit 1
169-
fi
170-
171-
# Create or update the prometheus-ca ConfigMap
172-
kubectl create configmap "$PROMETHEUS_CA_CONFIGMAP_NAME" \
173-
--from-file="ca.crt=$PROM_CA_CERT_PATH" \
174-
-n "$MONITORING_NAMESPACE" \
175-
--dry-run=client -o yaml | kubectl apply -f -
176-
177-
log_success "prometheus-ca ConfigMap created/updated"
178-
179163
# Use existing values files from config/samples
180164
local values_file=""
181165
if [ "$ENVIRONMENT" = "openshift" ]; then

deploy/openshift/install.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,10 @@ deploy_prometheus_stack() {
127127

128128
#### REQUIRED FUNCTION used by deploy/install.sh ####
129129
deploy_wva_prerequisites() {
130-
log_info "Deploying Workload-Variant-Autoscaler..."
131-
extract_openshift_prometheus_ca
130+
log_info "Deploying Workload-Variant-Autoscaler prerequisites..."
131+
132+
log_info "OpenShift automatically provides service CA certificate in projected volume"
133+
log_info "Certificate path: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
132134

133135
log_info "Installing LeaderWorkerSet version $LWS_CHART_VERSION into lws-system namespace"
134136
helm upgrade -i lws oci://registry.k8s.io/lws/charts/lws \

0 commit comments

Comments
 (0)