
Commit 4edb5b3

🐛 fix: configure E2E model service pods for real vLLM on GPU clusters (#826)
When running E2E tests with real vLLM (USE_SIMULATOR=false), test-created model service pods need the same configuration as infra-deployed model services:

- Use ghcr.io/llm-d/llm-d-cuda-dev:latest image (matches infra deployment)
- GPU resources (nvidia.com/gpu: 1)
- HF_TOKEN env var from llm-d-hf-token secret (model download auth)
- HF_HOME=/model-cache env var (writable cache directory)
- Volume mounts: model-storage at /model-cache, torch-compile-cache at /.cache, metrics-volume at /.config, triton-cache at /.triton

The /.triton volume is critical: vllm/vllm-openai runs as root in a read-only root filesystem area, causing PermissionError when Triton compiles CUDA kernels. Using the llm-d-cuda-dev image with writable emptyDir mounts resolves this.

Signed-off-by: Andrew Anderson <andy@clubanderson.com>
1 parent ad031cd commit 4edb5b3
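
For real vLLM runs, the pods authenticate model downloads with the HF_TOKEN key of the llm-d-hf-token secret, so a cluster missing that secret fails only once a pod starts pulling weights. Below is a minimal preflight sketch using client-go; the helper name and wiring are illustrative assumptions, not part of this commit:

```go
// Hypothetical preflight helper (not part of this commit): fail fast when the
// llm-d-hf-token secret that real-vLLM pods reference is missing or incomplete.
package fixtures

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func requireHFTokenSecret(ctx context.Context, cs kubernetes.Interface, namespace string) error {
	// The deployment's SecretKeyRef points at secret "llm-d-hf-token", key "HF_TOKEN".
	sec, err := cs.CoreV1().Secrets(namespace).Get(ctx, "llm-d-hf-token", metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("secret llm-d-hf-token not found in %q: %w", namespace, err)
	}
	if _, ok := sec.Data["HF_TOKEN"]; !ok {
		return fmt.Errorf("secret llm-d-hf-token in %q has no HF_TOKEN key", namespace)
	}
	return nil
}
```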

1 file changed: test/e2e/fixtures/model_service_builder.go (66 additions & 16 deletions)
```diff
@@ -83,7 +83,7 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
 	appLabel := name + "-decode"
 	image := "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1"
 	if !useSimulator {
-		image = "vllm/vllm-openai:latest"
+		image = "ghcr.io/llm-d/llm-d-cuda-dev:latest"
 	}
 	args := buildModelServerArgs(modelID, useSimulator, maxNumSeqs)
 	labels := map[string]string{
```
```diff
@@ -93,6 +93,39 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
 		"llm-d.ai/model-pool": poolName,
 		"test-resource":       "true",
 	}
+
+	envVars := []corev1.EnvVar{
+		{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "metadata.name"}}},
+		{Name: "POD_NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "metadata.namespace"}}},
+		{Name: "POD_IP", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "status.podIP"}}},
+	}
+	var volumes []corev1.Volume
+	var volumeMounts []corev1.VolumeMount
+
+	if !useSimulator {
+		envVars = append(envVars,
+			corev1.EnvVar{Name: "HF_HOME", Value: "/model-cache"},
+			corev1.EnvVar{Name: "HF_TOKEN", ValueFrom: &corev1.EnvVarSource{
+				SecretKeyRef: &corev1.SecretKeySelector{
+					LocalObjectReference: corev1.LocalObjectReference{Name: "llm-d-hf-token"},
+					Key:                  "HF_TOKEN",
+				},
+			}},
+		)
+		volumes = []corev1.Volume{
+			{Name: "model-storage", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{SizeLimit: resourcePtr("100Gi")}}},
+			{Name: "torch-compile-cache", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}},
+			{Name: "metrics-volume", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}},
+			{Name: "triton-cache", VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}},
+		}
+		volumeMounts = []corev1.VolumeMount{
+			{Name: "model-storage", MountPath: "/model-cache"},
+			{Name: "torch-compile-cache", MountPath: "/.cache"},
+			{Name: "metrics-volume", MountPath: "/.config"},
+			{Name: "triton-cache", MountPath: "/.triton"},
+		}
+	}
+
 	return &appsv1.Deployment{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: appLabel,
```
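
The writable-mount wiring above only runs in the real-vLLM branch, which makes it easy to regress silently when CI exercises the simulator path. A unit-test sketch of how this could be pinned down follows; the test file is hypothetical, and the trailing bool/int arguments are assumed from the truncated signature in the hunk header:

```go
// Hypothetical test (not part of this commit) pinning the emptyDir mounts
// that back vLLM's HF, torch.compile, metrics, and Triton caches.
package fixtures

import "testing"

func TestRealVLLMWritableMounts(t *testing.T) {
	// useSimulator=false, maxNumSeqs=4 are assumed argument positions.
	dep := buildModelServiceDeployment("e2e", "model", "pool", "org/model-id", false, 4)

	got := map[string]string{}
	for _, vm := range dep.Spec.Template.Spec.Containers[0].VolumeMounts {
		got[vm.Name] = vm.MountPath
	}
	want := map[string]string{
		"model-storage":       "/model-cache",
		"torch-compile-cache": "/.cache",
		"metrics-volume":      "/.config",
		"triton-cache":        "/.triton",
	}
	for name, path := range want {
		if got[name] != path {
			t.Errorf("volume %s: want mount at %s, got %q", name, path, got[name])
		}
	}
}
```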
```diff
@@ -120,30 +153,47 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
 							Ports: []corev1.ContainerPort{
 								{Name: "http", ContainerPort: 8000, Protocol: corev1.ProtocolTCP},
 							},
-							Env: []corev1.EnvVar{
-								{Name: "POD_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "metadata.name"}}},
-								{Name: "POD_NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "metadata.namespace"}}},
-								{Name: "POD_IP", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{APIVersion: "v1", FieldPath: "status.podIP"}}},
-							},
-							Resources: corev1.ResourceRequirements{
-								Requests: corev1.ResourceList{
-									corev1.ResourceCPU:    resource.MustParse("1"),
-									corev1.ResourceMemory: resource.MustParse("2Gi"),
-								},
-								Limits: corev1.ResourceList{
-									corev1.ResourceCPU:    resource.MustParse("2"),
-									corev1.ResourceMemory: resource.MustParse("4Gi"),
-								},
-							},
+							Env:          envVars,
+							Resources:    buildModelServiceResources(useSimulator),
+							VolumeMounts: volumeMounts,
 						},
 					},
+					Volumes:       volumes,
 					RestartPolicy: corev1.RestartPolicyAlways,
 				},
 			},
 		},
 	}
 }
 
+func resourcePtr(s string) *resource.Quantity {
+	q := resource.MustParse(s)
+	return &q
+}
+
+// buildModelServiceResources returns resource requirements appropriate for the
+// deployment mode. Real vLLM requires a GPU to detect the device type at startup;
+// the simulator runs on CPU only.
+func buildModelServiceResources(useSimulator bool) corev1.ResourceRequirements {
+	if useSimulator {
+		return corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("1"),
+				corev1.ResourceMemory: resource.MustParse("2Gi"),
+			},
+			Limits: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("2"),
+				corev1.ResourceMemory: resource.MustParse("4Gi"),
+			},
+		}
+	}
+	return corev1.ResourceRequirements{
+		Limits: corev1.ResourceList{
+			"nvidia.com/gpu": resource.MustParse("1"),
+		},
+	}
+}
+
 func buildModelServerArgs(modelID string, useSimulator bool, maxNumSeqs int) []string {
 	if useSimulator {
 		// Simulator is configured to be deliberately slow so that Prometheus
```

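buildModelServiceResources requests nvidia.com/gpu: 1 with no CPU fallback, so on a cluster without the NVIDIA device plugin the real-vLLM pods simply stay Pending. A hedged preflight sketch that surfaces this up front; the helper and its wiring are assumptions, not part of this commit:

```go
// Hypothetical helper (not part of this commit): report whether any node
// advertises allocatable nvidia.com/gpu capacity before real-vLLM pods are
// created, so a missing device plugin fails loudly instead of hanging Pending.
package fixtures

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func clusterHasGPU(ctx context.Context, cs kubernetes.Interface) (bool, error) {
	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return false, fmt.Errorf("listing nodes: %w", err)
	}
	for _, node := range nodes.Items {
		if q, ok := node.Status.Allocatable[corev1.ResourceName("nvidia.com/gpu")]; ok && !q.IsZero() {
			return true, nil
		}
	}
	return false, nil
}
```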