triton-inference-server
diff --git a/‎Deployment/Kubernetes/README.md
Lines changed: 3 additions & 0 deletions b/‎Deployment/Kubernetes/README.md
Lines changed: 3 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md
Lines changed: 904 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md
Lines changed: 904 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml
Lines changed: 20 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml
Lines changed: 20 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
Lines changed: 20 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
Lines changed: 20 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
Lines changed: 29 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
Lines changed: 23 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml
Lines changed: 29 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
Lines changed: 29 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml
Lines changed: 23 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
Lines changed: 23 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt
Lines changed: 12 additions & 0 deletions b/‎Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,3 @@
+# Guides for Deploying Triton Server on Kubernetes
+
+* [TensorRT-LLM Gen. AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md)
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2
+appVersion: 0.1.0
+description: Triton + TensorRT-LLM autoscaling and load balancing example.
+icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c-50-d@2x.png
+name: triton_trt-llm_aslb-example
+version: 0.1.0
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+gpu:
+- Tesla-T4
+- Tesla-V100-SXM2-16GB
+
+model:
+  name: gpt2
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- Tesla-V100-SXM2-16GB
+
+model:
+  name: llama-2-7b-chat
+  pullSecret: hf-model-pull
+  tensorrtLlm:
+    parallelism:
+      tensor: 2
+
+autoscaling:
+  metric:
+    value: 1500m
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- NVIDIA-A10G
+- NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-2-7b
+  pullSecret: hf-model-pull
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-3-70b-instruct
+  pullSecret: hf-model-pull
+  tensorrtLlm:
+    parallelism:
+      tensor: 8
+
+autoscaling:
+  metric:
+    value: 3500m
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- Tesla-V100-SXM2-16GB
+
+model:
+  name: llama-3-8b-instruct
+  pullSecret: hf-model-pull
+  tensorrtLlm:
+    parallelism:
+      tensor: 2
+
+autoscaling:
+  metric:
+    value: 1500m
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- NVIDIA-A10G
+- NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-3-8b
+  pullSecret: hf-model-pull
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:
+- Tesla-V100-SXM2-16GB
+- Tesla-T4
+
+model:
+  name: opt125m
+  pullSecret: hf-model-pull
@@ -0,0 +1,12 @@
+{{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete.
+
+Release Name: {{ $.Release.Name }}
+Namespace: {{ $.Release.Namespace }}
+Deployment Name: {{ $.Release.Name }}
+Service Name: {{ $.Release.Name }}
+
+Helpful commands:
+
+  $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }}
+  $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }}
+  $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments,pods,hpa,services,podmonitors
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Kubernetes Deployment of Triton Server Guides`
	`2`	`+`
	`3`	`+* [TensorRT-LLM Gen. AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md)`