@@ -83,7 +83,7 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
8383 appLabel := name + "-decode"
8484 image := "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1"
8585 if ! useSimulator {
86- image = "vllm/vllm-openai :latest"
86+ image = "ghcr.io/llm-d/llm-d-cuda-dev :latest"
8787 }
8888 args := buildModelServerArgs (modelID , useSimulator , maxNumSeqs )
8989 labels := map [string ]string {
@@ -93,6 +93,39 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
9393 "llm-d.ai/model-pool" : poolName ,
9494 "test-resource" : "true" ,
9595 }
96+
97+ envVars := []corev1.EnvVar {
98+ {Name : "POD_NAME" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "metadata.name" }}},
99+ {Name : "POD_NAMESPACE" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "metadata.namespace" }}},
100+ {Name : "POD_IP" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "status.podIP" }}},
101+ }
102+ var volumes []corev1.Volume
103+ var volumeMounts []corev1.VolumeMount
104+
105+ if ! useSimulator {
106+ envVars = append (envVars ,
107+ corev1.EnvVar {Name : "HF_HOME" , Value : "/model-cache" },
108+ corev1.EnvVar {Name : "HF_TOKEN" , ValueFrom : & corev1.EnvVarSource {
109+ SecretKeyRef : & corev1.SecretKeySelector {
110+ LocalObjectReference : corev1.LocalObjectReference {Name : "llm-d-hf-token" },
111+ Key : "HF_TOKEN" ,
112+ },
113+ }},
114+ )
115+ volumes = []corev1.Volume {
116+ {Name : "model-storage" , VolumeSource : corev1.VolumeSource {EmptyDir : & corev1.EmptyDirVolumeSource {SizeLimit : resourcePtr ("100Gi" )}}},
117+ {Name : "torch-compile-cache" , VolumeSource : corev1.VolumeSource {EmptyDir : & corev1.EmptyDirVolumeSource {}}},
118+ {Name : "metrics-volume" , VolumeSource : corev1.VolumeSource {EmptyDir : & corev1.EmptyDirVolumeSource {}}},
119+ {Name : "triton-cache" , VolumeSource : corev1.VolumeSource {EmptyDir : & corev1.EmptyDirVolumeSource {}}},
120+ }
121+ volumeMounts = []corev1.VolumeMount {
122+ {Name : "model-storage" , MountPath : "/model-cache" },
123+ {Name : "torch-compile-cache" , MountPath : "/.cache" },
124+ {Name : "metrics-volume" , MountPath : "/.config" },
125+ {Name : "triton-cache" , MountPath : "/.triton" },
126+ }
127+ }
128+
96129 return & appsv1.Deployment {
97130 ObjectMeta : metav1.ObjectMeta {
98131 Name : appLabel ,
@@ -120,30 +153,47 @@ func buildModelServiceDeployment(namespace, name, poolName, modelID string, useS
120153 Ports : []corev1.ContainerPort {
121154 {Name : "http" , ContainerPort : 8000 , Protocol : corev1 .ProtocolTCP },
122155 },
123- Env : []corev1.EnvVar {
124- {Name : "POD_NAME" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "metadata.name" }}},
125- {Name : "POD_NAMESPACE" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "metadata.namespace" }}},
126- {Name : "POD_IP" , ValueFrom : & corev1.EnvVarSource {FieldRef : & corev1.ObjectFieldSelector {APIVersion : "v1" , FieldPath : "status.podIP" }}},
127- },
128- Resources : corev1.ResourceRequirements {
129- Requests : corev1.ResourceList {
130- corev1 .ResourceCPU : resource .MustParse ("1" ),
131- corev1 .ResourceMemory : resource .MustParse ("2Gi" ),
132- },
133- Limits : corev1.ResourceList {
134- corev1 .ResourceCPU : resource .MustParse ("2" ),
135- corev1 .ResourceMemory : resource .MustParse ("4Gi" ),
136- },
137- },
156+ Env : envVars ,
157+ Resources : buildModelServiceResources (useSimulator ),
158+ VolumeMounts : volumeMounts ,
138159 },
139160 },
161+ Volumes : volumes ,
140162 RestartPolicy : corev1 .RestartPolicyAlways ,
141163 },
142164 },
143165 },
144166 }
145167}
146168
169+ func resourcePtr (s string ) * resource.Quantity {
170+ q := resource .MustParse (s )
171+ return & q
172+ }
173+
174+ // buildModelServiceResources returns resource requirements appropriate for the
175+ // deployment mode. Real vLLM requires a GPU to detect the device type at startup;
176+ // the simulator runs on CPU only.
177+ func buildModelServiceResources (useSimulator bool ) corev1.ResourceRequirements {
178+ if useSimulator {
179+ return corev1.ResourceRequirements {
180+ Requests : corev1.ResourceList {
181+ corev1 .ResourceCPU : resource .MustParse ("1" ),
182+ corev1 .ResourceMemory : resource .MustParse ("2Gi" ),
183+ },
184+ Limits : corev1.ResourceList {
185+ corev1 .ResourceCPU : resource .MustParse ("2" ),
186+ corev1 .ResourceMemory : resource .MustParse ("4Gi" ),
187+ },
188+ }
189+ }
190+ return corev1.ResourceRequirements {
191+ Limits : corev1.ResourceList {
192+ "nvidia.com/gpu" : resource .MustParse ("1" ),
193+ },
194+ }
195+ }
196+
147197func buildModelServerArgs (modelID string , useSimulator bool , maxNumSeqs int ) []string {
148198 if useSimulator {
149199 // Simulator is configured to be deliberately slow so that Prometheus
0 commit comments