Skip to content

Commit 2a6f718

Browse files
authored
Fixes for WVA nightly E2E failing on CKS and OCP (#958)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
1 parent 611c6bf commit 2a6f718

5 files changed

Lines changed: 322 additions & 29 deletions

File tree

deploy/install.sh

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ VALUES_FILE=${VALUES_FILE:-"$WVA_PROJECT/charts/workload-variant-autoscaler/valu
4343
# Controller instance identifier for multi-controller isolation (optional)
4444
# When set, adds controller_instance label to metrics and HPA selectors
4545
CONTROLLER_INSTANCE=${CONTROLLER_INSTANCE:-""}
46+
# InferencePool API group to watch (defaults to the v1 group inference.networking.k8s.io; auto-detected after llm-d deploy)
47+
# Track whether the user explicitly set POOL_GROUP (an empty value counts as unset) so auto-detection does not override their choice
48+
POOL_GROUP_USER_SET=${POOL_GROUP:+true}
49+
POOL_GROUP=${POOL_GROUP:-"inference.networking.k8s.io"}
4650

4751
# llm-d Configuration
4852
LLM_D_OWNER=${LLM_D_OWNER:-"llm-d"}
@@ -1120,17 +1124,28 @@ deploy_llm_d_infrastructure() {
11201124
# Align WVA with the InferencePool API group in use (scale-from-zero requires WVA to watch the same group).
11211125
# llm-d version determines whether pools are inference.networking.k8s.io (v1) or inference.networking.x-k8s.io (v1alpha2).
11221126
if [ "$DEPLOY_WVA" == "true" ]; then
1123-
detect_inference_pool_api_group
1124-
if [ -n "$DETECTED_POOL_GROUP" ]; then
1125-
log_info "Detected InferencePool API group: $DETECTED_POOL_GROUP; upgrading WVA to watch it (scale-from-zero)"
1126-
if helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
1127-
-n $WVA_NS --reuse-values --set wva.poolGroup=$DETECTED_POOL_GROUP --wait --timeout=60s; then
1128-
log_success "WVA upgraded with wva.poolGroup=$DETECTED_POOL_GROUP"
1127+
# Only auto-detect and upgrade if user didn't explicitly set POOL_GROUP
1128+
if [ "$POOL_GROUP_USER_SET" == "true" ]; then
1129+
log_info "POOL_GROUP explicitly set by user to $POOL_GROUP - skipping auto-detection to respect user's choice"
1130+
else
1131+
detect_inference_pool_api_group
1132+
if [ -n "$DETECTED_POOL_GROUP" ]; then
1133+
# Only upgrade if detected group differs from current POOL_GROUP
1134+
if [ "$DETECTED_POOL_GROUP" != "$POOL_GROUP" ]; then
1135+
log_info "Detected InferencePool API group: $DETECTED_POOL_GROUP (differs from default $POOL_GROUP); upgrading WVA to watch it (scale-from-zero)"
1136+
if helm upgrade "$WVA_RELEASE_NAME" ${WVA_PROJECT}/charts/workload-variant-autoscaler \
1137+
-n $WVA_NS --reuse-values --set wva.poolGroup=$DETECTED_POOL_GROUP --wait --timeout=60s; then
1138+
log_success "WVA upgraded with wva.poolGroup=$DETECTED_POOL_GROUP"
1139+
POOL_GROUP=$DETECTED_POOL_GROUP
1140+
else
1141+
log_warning "WVA upgrade with poolGroup failed - scale-from-zero may not see the InferencePool"
1142+
fi
1143+
else
1144+
log_info "Detected InferencePool API group matches default ($POOL_GROUP) - no upgrade needed"
1145+
fi
11291146
else
1130-
log_warning "WVA upgrade with poolGroup failed - scale-from-zero may not see the InferencePool"
1147+
log_warning "Could not detect InferencePool API group - using default POOL_GROUP=$POOL_GROUP (scale-from-zero may be misconfigured if cluster uses a different group)"
11311148
fi
1132-
else
1133-
log_warning "Could not detect InferencePool API group - WVA may have empty datastore for scale-from-zero"
11341149
fi
11351150
fi
11361151

internal/datastore/datastore.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ import (
3434
)
3535

3636
var (
37-
errPoolNotSynced = errors.New("EndpointPool not found in datastore")
37+
// ErrPoolNotSynced is returned when an EndpointPool is not found in the datastore.
38+
ErrPoolNotSynced = errors.New("EndpointPool not found in datastore")
3839
errPoolIsNull = errors.New("EndpointPool object is nil, does not exist")
3940
)
4041

@@ -149,7 +150,7 @@ func (ds *datastore) PoolGet(namespacedName string) (*poolutil.EndpointPool, err
149150

150151
pool, exist := ds.pools.Load(namespacedName)
151152
if !exist {
152-
return nil, errPoolNotSynced
153+
return nil, ErrPoolNotSynced
153154
}
154155

155156
epp := pool.(*poolutil.EndpointPool)
@@ -182,7 +183,7 @@ func (ds *datastore) PoolGetFromLabels(namespace string, labels map[string]strin
182183
})
183184

184185
if !exist {
185-
return nil, errPoolNotSynced
186+
return nil, ErrPoolNotSynced
186187
}
187188
return ep, nil
188189
}

internal/engines/scalefromzero/engine.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -232,18 +232,33 @@ func (e *Engine) processInactiveVariant(ctx context.Context, deployments map[str
232232
// Find target EPP for metrics collection in the same namespace as the VA
233233
pool, err := e.Datastore.PoolGetFromLabels(va.Namespace, labels)
234234
if err != nil {
235-
logger.Error(err, "Error finding target EPP", "variant", va.Name, "namespace", va.Namespace, "target VA model", va.Spec.ModelID)
235+
// Only skip on "not found" errors - return other errors to surface real datastore failures
236+
if errors.Is(err, datastore.ErrPoolNotSynced) {
237+
logger.V(logging.DEBUG).Info("Skipping variant, target EPP not found in datastore",
238+
"variant", va.Name,
239+
"namespace", va.Namespace,
240+
"modelID", va.Spec.ModelID)
241+
return nil
242+
}
243+
// Unexpected error - log and return to surface the issue
244+
logger.Error(err, "Unexpected error finding target EPP",
245+
"variant", va.Name,
246+
"namespace", va.Namespace,
247+
"modelID", va.Spec.ModelID)
236248
return err
237249
}
238250

239251
// Use EPP source from registry
240-
eppSource := e.Datastore.PoolGetMetricsSource(pool.Namespace + "/" + pool.Name)
252+
namespacedPoolName := pool.Namespace + "/" + pool.Name
253+
eppSource := e.Datastore.PoolGetMetricsSource(namespacedPoolName)
241254
if eppSource == nil {
242-
logger.Info("Scale-from-zero: skipping VA, EPP metrics source not found in datastore",
243-
"va", va.Name,
255+
// This is unexpected - pool exists but metrics source is missing
256+
err := fmt.Errorf("EPP metrics source not found in registry for pool %s", namespacedPoolName)
257+
logger.Error(err, "Datastore inconsistency detected",
258+
"variant", va.Name,
244259
"namespace", va.Namespace,
245-
"pool", pool.Namespace+"/"+pool.Name)
246-
return errors.New("endpointpicker metrics source not found in datastore")
260+
"pool", namespacedPoolName)
261+
return err
247262
}
248263

249264
results, err := eppSource.Refresh(ctx, source.RefreshSpec{})

0 commit comments

Comments
 (0)