Skip to content

Commit 2c2082c

Browse files
committed
fix: update existing EPP ConfigMap instead of adding new --config-file
The Helm chart already deploys the EPP with --config-file pointing to its own ConfigMap (with scorer weights 2/2/3). Adding a second --config-file or --config-text flag broke the EPP and caused Gateway HTTP 500.

New approach:
- Find the EPP deployment's existing ConfigMap volume
- Update the ConfigMap data to add featureGates: [flowControl]
- Trigger a rollout restart via annotation (no arg/volume/env changes)
- Wait for Gateway health after EPP restart
- Gateway is a hard requirement — no fallback to direct vLLM

Made-with: Cursor
1 parent 98ee08a commit 2c2082c

2 files changed

Lines changed: 58 additions & 40 deletions

File tree

test/benchmark/prefill_heavy_benchmark_test.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -226,28 +226,27 @@ var _ = Describe("Prefill Heavy Workload Benchmark", Label("benchmark", "phase4"
226226
GinkgoWriter.Println("--- End Diagnostics ---")
227227
}
228228

229-
// ensureEPPConfig patches the EPP with --config-text containing the
230-
// EndpointPickerConfig (flowControl + scorer weights 2/2/3). It also
231-
// removes the deprecated ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER env
232-
// var to avoid conflicts — the config-text featureGates supersede it.
229+
// ensureEPPConfig updates the EPP's existing ConfigMap to enable
230+
// flowControl and set scorer weights (queue=2, kv-cache=2, prefix-cache=3),
231+
// then triggers a rollout restart and waits for Gateway health.
233232
ensureEPPConfig := func() {
234233
By("Discovering EPP deployment")
235234
eppDeployName, findErr := fixtures.FindEPPDeployment(ctx, k8sClient, benchCfg.LLMDNamespace)
236235
Expect(findErr).NotTo(HaveOccurred(), "Failed to find EPP deployment")
237236
GinkgoWriter.Printf(" Found EPP deployment: %s\n", eppDeployName)
238237

239-
By("Patching EPP deployment with --config-text (scorer weights 2/2/3, flowControl)")
240-
patchErr := fixtures.PatchEPPWithConfigText(ctx, k8sClient, benchCfg.LLMDNamespace, eppDeployName)
241-
Expect(patchErr).NotTo(HaveOccurred(), "Failed to patch EPP deployment with config-text")
242-
GinkgoWriter.Println(" EPP deployment patched and rolled out successfully")
238+
By("Updating EPP ConfigMap with flowControl + scorer weights 2/2/3")
239+
patchErr := fixtures.PatchEPPConfigMap(ctx, k8sClient, benchCfg.LLMDNamespace, eppDeployName)
240+
Expect(patchErr).NotTo(HaveOccurred(), "Failed to update EPP ConfigMap")
241+
GinkgoWriter.Println(" EPP ConfigMap updated and rollout completed")
243242

244243
By("Waiting for Gateway to become healthy after EPP rollout")
245244
Eventually(func(g Gomega) {
246245
gwURL := fmt.Sprintf("http://%s.%s.svc.cluster.local:%d",
247246
benchCfg.GatewayServiceName, benchCfg.LLMDNamespace, benchCfg.GatewayServicePort)
248247
err := fixtures.VerifyGatewayConnectivity(ctx, k8sClient, benchCfg.LLMDNamespace, gwURL, benchCfg.ModelID)
249248
g.Expect(err).NotTo(HaveOccurred(), "Gateway not ready yet after EPP rollout")
250-
}, 5*time.Minute, 15*time.Second).Should(Succeed(), "Gateway failed to become healthy after EPP config patch")
249+
}, 5*time.Minute, 15*time.Second).Should(Succeed(), "Gateway failed to become healthy after EPP config update")
251250
GinkgoWriter.Println(" Gateway is healthy after EPP config update")
252251
}
253252

test/e2e/fixtures/epp_config_builder.go

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@ import (
66
"strings"
77
"time"
88

9-
corev1 "k8s.io/api/core/v1"
109
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1110
"k8s.io/client-go/kubernetes"
1211
)
1312

14-
// EndpointPickerConfigYAML is the full config text passed via --config-text.
15-
// It enables flowControl and sets scorer weights: queue=2, kv-cache=2, prefix-cache=3.
16-
const EndpointPickerConfigYAML = `apiVersion: inference.networking.x-k8s.io/v1alpha1
13+
// DesiredEPPConfig is the EndpointPickerConfig YAML with flowControl enabled
14+
// and scorer weights: queue=2, kv-cache=2, prefix-cache=3.
15+
const DesiredEPPConfig = `apiVersion: inference.networking.x-k8s.io/v1alpha1
1716
kind: EndpointPickerConfig
1817
featureGates:
1918
- flowControl
@@ -29,48 +28,69 @@ schedulingProfiles:
2928
- pluginRef: kv-cache-utilization-scorer
3029
weight: 2
3130
- pluginRef: prefix-cache-scorer
32-
weight: 3`
31+
weight: 3
32+
`
3333

34-
// PatchEPPWithConfigText patches the EPP deployment to use --config-text with
35-
// the EndpointPickerConfig YAML inline. It also removes the deprecated
36-
// ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER env var to avoid conflicts (the
37-
// config-text featureGates supersede it). This approach avoids ConfigMap
38-
// volume mounts which simplifies the patch. Waits for the rollout to complete.
39-
func PatchEPPWithConfigText(ctx context.Context, k8sClient *kubernetes.Clientset, namespace, eppDeploymentName string) error {
34+
// PatchEPPConfigMap updates the EPP's existing ConfigMap to include
35+
// featureGates: [flowControl] and the desired scorer weights, then triggers
36+
// a rollout restart. This avoids changing deployment args, volumes, or env
37+
// vars — only the ConfigMap data is modified.
38+
func PatchEPPConfigMap(ctx context.Context, k8sClient *kubernetes.Clientset, namespace, eppDeploymentName string) error {
4039
dep, err := k8sClient.AppsV1().Deployments(namespace).Get(ctx, eppDeploymentName, metav1.GetOptions{})
4140
if err != nil {
4241
return fmt.Errorf("failed to get EPP deployment %s: %w", eppDeploymentName, err)
4342
}
4443

45-
c := &dep.Spec.Template.Spec.Containers[0]
46-
47-
// Check if already patched
48-
for _, a := range c.Args {
49-
if strings.HasPrefix(a, "--config-text=") {
50-
return nil
44+
// Find the ConfigMap name from the deployment's volumes
45+
var configMapName string
46+
for _, v := range dep.Spec.Template.Spec.Volumes {
47+
if v.ConfigMap != nil {
48+
configMapName = v.ConfigMap.Name
49+
break
5150
}
5251
}
52+
if configMapName == "" {
53+
return fmt.Errorf("EPP deployment %s has no ConfigMap volume", eppDeploymentName)
54+
}
5355

54-
// Remove the ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER env var — the config
55-
// text's featureGates: [flowControl] supersedes it and having both causes
56-
// the EPP to malfunction on v0.5.0-rc.1.
57-
filtered := make([]corev1.EnvVar, 0, len(c.Env))
58-
for _, e := range c.Env {
59-
if e.Name != "ENABLE_EXPERIMENTAL_FLOW_CONTROL_LAYER" {
60-
filtered = append(filtered, e)
56+
// Find the config file key from --config-file arg
57+
configKey := "default-plugins.yaml"
58+
for _, a := range dep.Spec.Template.Spec.Containers[0].Args {
59+
if strings.HasPrefix(a, "--config-file=") {
60+
parts := strings.Split(a, "/")
61+
if len(parts) > 0 {
62+
configKey = parts[len(parts)-1]
63+
}
6164
}
6265
}
63-
c.Env = filtered
6466

65-
// Append --config-text with inline YAML (preserves all existing args)
66-
c.Args = append(c.Args, "--config-text="+EndpointPickerConfigYAML)
67+
// Update the ConfigMap with flowControl + weights 2/2/3
68+
cm, err := k8sClient.CoreV1().ConfigMaps(namespace).Get(ctx, configMapName, metav1.GetOptions{})
69+
if err != nil {
70+
return fmt.Errorf("failed to get EPP ConfigMap %s: %w", configMapName, err)
71+
}
72+
73+
if strings.Contains(cm.Data[configKey], "flowControl") {
74+
return nil
75+
}
76+
77+
cm.Data[configKey] = DesiredEPPConfig
78+
_, err = k8sClient.CoreV1().ConfigMaps(namespace).Update(ctx, cm, metav1.UpdateOptions{})
79+
if err != nil {
80+
return fmt.Errorf("failed to update EPP ConfigMap %s: %w", configMapName, err)
81+
}
6782

83+
// Trigger rollout restart via annotation change so the EPP picks up the new config
84+
if dep.Spec.Template.Annotations == nil {
85+
dep.Spec.Template.Annotations = make(map[string]string)
86+
}
87+
dep.Spec.Template.Annotations["benchmark/restart-trigger"] = time.Now().Format(time.RFC3339)
6888
_, err = k8sClient.AppsV1().Deployments(namespace).Update(ctx, dep, metav1.UpdateOptions{})
6989
if err != nil {
70-
return fmt.Errorf("failed to update EPP deployment with config-text: %w", err)
90+
return fmt.Errorf("failed to trigger EPP rollout restart: %w", err)
7191
}
7292

73-
// Wait for rollout: new pods ready
93+
// Wait for rollout to complete
7494
deadline := time.After(5 * time.Minute)
7595
tick := time.NewTicker(10 * time.Second)
7696
defer tick.Stop()
@@ -84,7 +104,7 @@ func PatchEPPWithConfigText(ctx context.Context, k8sClient *kubernetes.Clientset
84104
continue
85105
}
86106
if d.Status.UpdatedReplicas > 0 && d.Status.ReadyReplicas == d.Status.UpdatedReplicas &&
87-
d.Status.UnavailableReplicas == 0 {
107+
d.Status.UnavailableReplicas == 0 && d.Status.ObservedGeneration >= d.Generation {
88108
return nil
89109
}
90110
}
@@ -115,4 +135,3 @@ func containsAny(s string, substrs ...string) bool {
115135
}
116136
return false
117137
}
118-

0 commit comments

Comments
 (0)