Skip to content

Commit 7f7836b

Browse files
committed
Triton CLI removed
1 parent 1145f10 commit 7f7836b

File tree

8 files changed

+250
-180
lines changed

8 files changed

+250
-180
lines changed

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md

Lines changed: 172 additions & 88 deletions
Large diffs are not rendered by default.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
apiVersion: v1
2+
kind: PersistentVolumeClaim
3+
metadata:
4+
name: efs-claim-autoscaling-2
5+
spec:
6+
accessModes:
7+
- ReadWriteMany
8+
storageClassName: efs-autoscaling-sc
9+
resources:
10+
requests:
11+
storage: 200Gi
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: efs-autoscaling-pv-2
5+
spec:
6+
capacity:
7+
storage: 200Gi
8+
volumeMode: Filesystem
9+
accessModes:
10+
- ReadWriteMany
11+
persistentVolumeReclaimPolicy: Retain
12+
storageClassName: efs-autoscaling-sc
13+
csi:
14+
driver: efs.csi.aws.com
15+
volumeHandle: fs-0c6ba87870e4be751
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
kind: StorageClass
2+
apiVersion: storage.k8s.io/v1
3+
metadata:
4+
name: efs-autoscaling-sc
5+
provisioner: efs.csi.aws.com

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/deployment.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,6 @@ spec:
190190
readOnly: false
191191
- mountPath: /var/run/models
192192
name: model-repository
193-
readOnly: true
194193
- mountPath: /var/run/cache
195194
name: transformers-cache
196195
readOnly: false
@@ -255,14 +254,12 @@ spec:
255254
ephemeral-storage: 96Gi
256255
nvidia.com/gpu: {{ $model_gpus }}
257256
volumeMounts:
258-
- mountPath: /var/run/engines
259-
name: engine-repository
260-
readOnly: false
261257
- mountPath: /var/run/models
262258
name: model-repository
263-
readOnly: false
264-
- mountPath: /var/run/cache
265-
name: transformers-cache
259+
- mountPath: /dev/shm
260+
name: dshm
261+
- mountPath: /var/run/engines
262+
name: engine-repository
266263
readOnly: false
267264
{{- with $.Values.model }}
268265
{{- if .pullSecret }}
@@ -286,11 +283,14 @@ spec:
286283
- name: engine-repository
287284
hostPath:
288285
path: {{ printf "%s/models/%s/%dx%d/engines" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
289-
type: DirectoryOrCreate
286+
type: DirectoryOrCreate
290287
- name: model-repository
291-
hostPath:
292-
path: {{ printf "%s/models/%s/%dx%d/models" $hostRootPath $model_lower (int $model_pp) (int $model_tp) }}
293-
type: DirectoryOrCreate
288+
persistentVolumeClaim:
289+
claimName: efs-claim-autoscaling-2
290+
- name: dshm
291+
emptyDir:
292+
medium: Memory
293+
sizeLimit: 512Gi
294294
{{- with $.Values.model }}
295295
{{- with .pullSecret }}
296296
- name: hf-secret

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/server.py

Lines changed: 6 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
# Any changes here must also be made there, and vice versa.
2424
CACHE_DIRECTORY = "/var/run/cache"
2525
HUGGING_FACE_TOKEN_PATH = "/var/run/secrets/hugging_face/password"
26-
MODEL_DIRECTORY = "/var/run/models"
26+
MODEL_DIRECTORY = "/var/run/models/tensorrtllm_backend/triton_model_repo"
2727

2828
ERROR_EXIT_DELAY = 15
2929
ERROR_CODE_FATAL = 255
@@ -212,66 +212,6 @@ def execute_triton(args):
212212
exit(result)
213213

214214

215-
# ---
216-
217-
218-
def initialize_model(args):
219-
if args.model is None or len(args.model) == 0:
220-
die("Model name must be provided.")
221-
222-
hugging_face_authenticate(args)
223-
224-
engine_path = os.path.join(ENGINE_DIRECTORY, args.model)
225-
model_path = os.path.join(MODEL_DIRECTORY, args.model)
226-
227-
# When the model and plan already exist, we can exit early, happily.
228-
if os.path.exists(engine_path) and os.path.exists(model_path):
229-
write_output(
230-
f"TensorRT engine and plan detected for {args.model}. No work to do, exiting."
231-
)
232-
exit(EXIT_SUCCESS)
233-
234-
write_output(f"Begin generation of TensorRT engine and plan for {args.model}.")
235-
write_output(" ")
236-
237-
# Build up a set of args for the subprocess call.
238-
cmd_args = [
239-
"triton",
240-
"import",
241-
"--model",
242-
args.model,
243-
"--model-repository",
244-
MODEL_DIRECTORY,
245-
]
246-
247-
if args.engine == "vllm":
248-
cmd_args += ["--backend", "vllm"]
249-
250-
else:
251-
cmd_args += ["--backend", "tensorrtllm"]
252-
253-
if args.dt is not None and args.dt in ["bfloat", "float16", "float32"]:
254-
cmd_args += ["--data-type", args.dt]
255-
256-
if args.pp > 1:
257-
cmd_args += ["--pipeline-parallelism", f"{args.pp}"]
258-
259-
if args.tp > 1:
260-
cmd_args += ["--tensor-parallelism", f"{args.tp}"]
261-
262-
# When verbose, insert the verbose flag.
263-
# It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position.
264-
# This limitation will likely be removed in a future release of triton_cli.
265-
if is_verbose:
266-
cmd_args.insert(1, "--verbose")
267-
268-
result = run_command(cmd_args)
269-
exit(result)
270-
271-
272-
# ---
273-
274-
275215
def parse_arguments():
276216
parser = argparse.ArgumentParser()
277217
parser.add_argument("mode", type=str, choices=["exec", "init"])
@@ -302,31 +242,17 @@ def parse_arguments():
302242
HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY)
303243

304244
is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None
305-
306-
# Validate that `ENGINE_DIRECTORY` isn't empty.
307-
if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0:
308-
raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.")
309-
310-
# Validate that `ENGINE_DIRECTORY` actually exists.
311-
if not os.path.exists(ENGINE_DIRECTORY):
312-
raise Exception(f"Engine directory '{ENGINE_DIRECTORY}' does not exist.")
313-
314-
# Validate that `MODEL_DIRECTORY` actually exists.
315-
if not os.path.exists(MODEL_DIRECTORY):
316-
raise Exception(f"Model directory '{MODEL_DIRECTORY}' does not exist.")
317-
318245
# Parse options provided.
319246
args = parse_arguments()
320247

321-
# Update the is_verbose flag with values passed in by options.
322-
is_verbose = is_verbose or args.verbose > 0
323-
324248
if args.mode == "init":
325-
initialize_model(args)
326-
249+
print("Hello, World!")
250+
exit(EXIT_SUCCESS)
251+
327252
elif args.mode == "exec":
253+
# Update the is_verbose flag with values passed in by options.
254+
is_verbose = is_verbose or args.verbose > 0
328255
execute_triton(args)
329-
330256
else:
331257
write_error(f"usage: server.py <mode> [<options>].")
332258
write_error(f' Invalid mode ("{args.mode}") provided.')

Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/containers/triton_trt-llm.containerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
15+
ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
1616
ARG ENGINE_DEST_PATH=/var/run/engines
1717
ARG HF_HOME=/var/run/cache
1818

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: setup-ssh-efs
5+
labels:
6+
app: setup-ssh-efs
7+
spec:
8+
containers:
9+
- name: triton
10+
image: nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
11+
command: ["sleep", "infinity"]
12+
resources:
13+
limits:
14+
nvidia.com/gpu: 4
15+
requests:
16+
nvidia.com/gpu: 4
17+
volumeMounts:
18+
- mountPath: /var/run/models
19+
name: model-repository
20+
- mountPath: /dev/shm
21+
name: dshm
22+
volumes:
23+
- name: model-repository
24+
persistentVolumeClaim:
25+
claimName: efs-claim-autoscaling-2
26+
- name: dshm
27+
emptyDir:
28+
medium: Memory
29+
sizeLimit: 512Gi

0 commit comments

Comments
 (0)