
Commit 5b03320

Gen AI Tutorial: Remove VLLM option for initial helm chart
This change removes the option of deploying with vLLM from the initial version of the tutorial. The option was removed due to several deployment bugs it introduced.
1 parent 9bf511e commit 5b03320

9 files changed: 25 additions and 42 deletions


Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# See values.yaml for reference values.
+
 gpu:
 - Tesla-T4
 - Tesla-V100-SXM2-16GB

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # See values.yaml for reference values.
 
 gpu:
+- Tesla-T4
 - Tesla-V100-SXM2-16GB
 
 model:

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml

Lines changed: 5 additions & 2 deletions
@@ -15,8 +15,11 @@
 # See values.yaml for reference values.
 
 gpu:
-- NVIDIA-A10G
-- NVIDIA-A100-SXM4-40GB
+- Tesla-T4
+- Tesla-V100-SXM2-16GB
 
 model:
   name: llama-2-7b
+  tensorrtLlm:
+    parallelism:
+      tensor: 2

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # See values.yaml for reference values.
 
 gpu:
+- Tesla-T4
 - Tesla-V100-SXM2-16GB
 
 model:

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,8 +15,8 @@
 # See values.yaml for reference values.
 
 gpu:
-- Tesla-V100-SXM2-16GB
 - Tesla-T4
+- Tesla-V100-SXM2-16GB
 
 model:
   name: opt125m

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml

Lines changed: 0 additions & 20 deletions
@@ -27,7 +27,6 @@
 {{- $model_dt := "float16" }}
 {{- $model_pp := 1 }}
 {{- $model_tp := 1 }}
-{{- $model_trtllm := true }}
 {{- with $.Values.kubernetes }}
 {{- with .hostRootPath }}
 {{- $hostRootPath = . }}
@@ -36,7 +35,6 @@
 {{- with $.Values.model }}
 {{- $model_name = required "Property '.model.name' is required." .name }}
 {{- with .tensorrtLlm }}
-{{- $model_trtllm = .enable }}
 {{- with .dataType }}
 {{- $model_dt = . }}
 {{- end }}
@@ -123,14 +121,10 @@ spec:
 - python3
 - ./server.py
 - exec
-{{- if $model_trtllm }}
 - --engine=trtllm
 - --dt={{ $model_dt }}
 - --pp={{ $model_pp }}
 - --tp={{ $model_tp }}
-{{- else }}
-- --engine=vllm
-{{- end }}
 {{- with $.Values.logging }}
 {{- with .tritonServer }}
 {{- if .useIso8601 }}
@@ -191,11 +185,9 @@ spec:
 memory: {{ $triton_memory }}
 nvidia.com/gpu: {{ $model_gpus }}
 volumeMounts:
-{{- if $model_trtllm }}
 - mountPath: /var/run/engines
   name: engine-repository
   readOnly: false
-{{- end }}
 - mountPath: /var/run/models
   name: model-repository
   readOnly: true
@@ -217,14 +209,10 @@ spec:
 - ./server.py
 - init
 - --model={{ $model_lower }}
-{{- if $model_trtllm }}
 - --engine=trtllm
 - --dt={{ $model_dt }}
 - --pp={{ $model_pp }}
 - --tp={{ $model_tp }}
-{{- else }}
-- --engine=vllm
-{{- end }}
 {{- with $.Values.logging }}
 {{- with .initialization }}
 {{- if .verbose }}
@@ -267,11 +255,9 @@ spec:
 ephemeral-storage: 96Gi
 nvidia.com/gpu: {{ $model_gpus }}
 volumeMounts:
-{{- if $model_trtllm }}
 - mountPath: /var/run/engines
   name: engine-repository
   readOnly: false
-{{- end }}
 - mountPath: /var/run/models
   name: model-repository
   readOnly: false
@@ -297,19 +283,13 @@ spec:
 {{- end }}
 {{- end }}
 volumes:
-{{- if $model_trtllm }}
 - name: engine-repository
   hostPath:
     path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
     type: DirectoryOrCreate
-{{- end }}
 - name: model-repository
   hostPath:
-{{- if $model_trtllm }}
     path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
-{{- else }}
-    path: {{ printf "%s/models/%s/vllm" $hostRootPath $model_lower }}
-{{- end }}
     type: DirectoryOrCreate
 {{- with $.Values.model }}
 {{- with .pullSecret }}

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.schema.json

Lines changed: 1 addition & 3 deletions
@@ -54,9 +54,7 @@
 "enable": {
   "description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.",
   "oneOf": [
-    {
-      "type": "boolean"
-    },
+    { "type": "boolean" },
     { "type": "null" }
   ]
 },

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/values.yaml

Lines changed: 0 additions & 3 deletions
@@ -39,9 +39,6 @@ model: # (required)
   name: # (required)
   # Configuration options related to the conversion of a non-optimized model into TensorRT format.
   tensorrtLlm: # (optional)
-    # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.
-    # When 'false', the init container will fall back to vLLM and parallelism options are ignored.
-    enable: # (default: true)
     # Data type used when compiling and optimizing the model for TensorRT.
     # Supported options are float16, bfloat16, float32
     dataType: # (default: float16)

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py

Lines changed: 14 additions & 13 deletions
@@ -88,7 +88,9 @@ def hugging_face_authenticate(args):
         hugging_face_token = token_file.read()
 
     # Use Hugging Face's CLI to complete the authentication.
-    result = run_command([HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token])
+    result = run_command(
+        [HUGGING_FACE_CLI, "login", "--token"], [hugging_face_token]
+    )
 
     if result != 0:
         raise Exception(f"Hugging Face authentication failed. ({result})")
@@ -165,17 +167,20 @@ def execute_triton(args):
     cmd_args = ["mpirun", "--allow-run-as-root"]
 
     for i in range(world_size):
+        if i != 0:
+            cmd_args += [":"]
+
         cmd_args += [
             "-n",
             "1",
             "tritonserver",
-            f"--model-repository={MODEL_DIRECTORY}",
-            "--disable-auto-complete-config",
-        ]
-        cmd_args += [
+            f"--id=rank{i}",
             f"--http-port={(8000 + i * 10)}",
             f"--grpc-port={(8001 + i * 10)}",
             "--model-load-thread-count=2",
+            f"--model-repository={MODEL_DIRECTORY}",
+            "--disable-auto-complete-config",
+            f"--backend-config=python,shm-region-prefix-name=rank{i}_",
         ]
 
         if i == 0:
@@ -184,7 +189,6 @@ def execute_triton(args):
                 "--allow-gpu-metrics=false",
                 "--allow-metrics=true",
                 "--metrics-interval-ms=1000",
-                f"--id=rank{i}",
             ]
 
             if args.verbose > 0:
@@ -198,14 +202,11 @@
                 "--allow-http=false",
                 "--allow-grpc=false",
                 "--allow-metrics=false",
+                "--log-info=false",
+                "--log-warning=false",
+                "--model-control-mode=explicit",
+                "--load-model=tensorrt_llm",
             ]
-            cmd_args += ["--log-info=false", "--log-warning=false"]
-
-            cmd_args += [
-                "--disable-auto-complete-config",
-                f"--backend-config=python,shm-region-prefix-name=rank{i}_",
-                ":",
-            ]
 
     result = run_command(cmd_args)
     exit(result)
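For orientation, here is a minimal sketch of how execute_triton() assembles the per-rank tritonserver arguments after this change. The world_size value and MODEL_DIRECTORY path are hard-coded placeholders for illustration, the verbosity handling in server.py is omitted, and the else branch for non-zero ranks is inferred from the hunk context, so treat it as an approximation rather than the file's exact contents.

# Illustrative sketch only: mirrors the argument assembly in execute_triton()
# after this commit. Values below are placeholders, not the real configuration.
MODEL_DIRECTORY = "/var/run/models"  # assumed path for the sketch
world_size = 2                       # e.g. a tensor-parallel degree of 2

cmd_args = ["mpirun", "--allow-run-as-root"]

for i in range(world_size):
    # Separate per-rank program blocks with ":" (mpirun multi-program syntax).
    if i != 0:
        cmd_args += [":"]

    # Every rank gets its own tritonserver instance, ports, and shm prefix.
    cmd_args += [
        "-n", "1", "tritonserver",
        f"--id=rank{i}",
        f"--http-port={8000 + i * 10}",
        f"--grpc-port={8001 + i * 10}",
        "--model-load-thread-count=2",
        f"--model-repository={MODEL_DIRECTORY}",
        "--disable-auto-complete-config",
        f"--backend-config=python,shm-region-prefix-name=rank{i}_",
    ]

    if i == 0:
        # Rank 0 exposes metrics (and serves client traffic).
        cmd_args += [
            "--allow-gpu-metrics=false",
            "--allow-metrics=true",
            "--metrics-interval-ms=1000",
        ]
    else:
        # Other ranks run quietly and only load the TensorRT-LLM model.
        cmd_args += [
            "--allow-http=false",
            "--allow-grpc=false",
            "--allow-metrics=false",
            "--log-info=false",
            "--log-warning=false",
            "--model-control-mode=explicit",
            "--load-model=tensorrt_llm",
        ]

print(" ".join(cmd_args))  # inspect the assembled command instead of running it

In short, rank 0 remains the only instance that serves traffic and metrics, while the remaining ranks run with --model-control-mode=explicit and load only the tensorrt_llm model.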
