diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc7a8f96..74570a59 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,7 +65,7 @@ repos: - id: check-json - id: check-toml - id: check-yaml - exclude: ^Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/.+$ + exclude: ^Deployment/Kubernetes/[^/]+/chart/templates/.+$ - id: check-shebang-scripts-are-executable - id: end-of-file-fixer types_or: [c, c++, cuda, proto, textproto, java, python] diff --git a/Deployment/Kubernetes/README.md b/Deployment/Kubernetes/README.md index 72bcd0ea..aecee447 100644 --- a/Deployment/Kubernetes/README.md +++ b/Deployment/Kubernetes/README.md @@ -1,3 +1,4 @@ # Kubernetes Deployment of Triton Server Guides * [TensorRT-LLM Gen. AI Autoscaling & Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md) +* [Multi-Node Generative AI w/ Triton Server and TensorRT-LLM](./TensorRT-LLM_Multi-Node_Distributed_Models/README.md) diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore new file mode 100644 index 00000000..462fe9f8 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/.gitignore @@ -0,0 +1,5 @@ +.vscode/ +**/.vscode/ + +dev_* +**/dev_* diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md new file mode 100644 index 00000000..7846a7b8 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/README.md @@ -0,0 +1,746 @@ + + +# Multi-Node Generative AI w/ Triton Server and TensorRT-LLM + +It almost goes without saying that large language models (LLM) are large. +LLMs often are too large to fit in the memory of a single GPU. +Therefore we need a solution which enables multiple GPUs to cooperate to enable inference serving for this very large models. + +This guide aims to explain how to perform multi-GPU, multi-node deployment of large language models using Triton Server and +TRT-LLM in a Kubernetes cluster. +Setting up multi-node LLM support using Triton Inference Server, TensorRT-LLM, and Kubernetes is not difficult, but it does +require preparation. + +We'll cover the following topics: + +* [Cluster Setup](#cluster-setup) + * [Persistent Volume Setup](#persistent-volume-setup) + * [Core Cluster Services](#core-cluster-services) + * [Kubernetes Node Feature Discovery service](#kubernetes-node-feature-discovery-service) + * [NVIDIA Device Plugin for Kubernetes](#nvidia-device-plugin-for-kubernetes) + * [NVIDIA GPU Feature Discovery service](#nvidia-gpu-feature-discovery-service) + * [Hugging Face Authorization](#hugging-face-authorization) +* [Triton Preparation](#triton-preparation) + * [Model Preparation Script](#model-preparation-script) + * [Custom Container Image](#custom-container-image) + * [Kubernetes Pull Secrets](#kubernetes-pull-secrets) +* [Triton Deployment](#triton-deployment) + * [How It Works](#how-it-works) + * [Potential Improvements](#potential-improvements) + * [Autoscaling and Gang Scheduling](#autoscaling-and-gang-scheduling) + * [Network Topology Aware Scheduling](#network-topology-aware-scheduling) +* [Developing this Guide](#developing-this-guide) + +Prior to beginning this guide/tutorial you will need a couple of things. 
+ +* Kubernetes Control CLI (`kubectl`) + [ [documentation](https://kubernetes.io/docs/reference/kubectl/introduction/) + | [download](https://kubernetes.io/releases/download/) ] +* Helm CLI (`helm`) + [ [documentation](https://helm.sh/) + | [download](https://helm.sh/docs/intro/install) ] +* Docker CLI (`docker`) + [ [documentation](https://docs.docker.com/) + | [download](https://docs.docker.com/get-docker/) ] +* Decent text editing software for editing YAML files. +* Kubernetes cluster. +* Fully configured `kubectl` with administrator permissions to the cluster. + + + +## Cluster Setup + +The following instructions are setting up a Kubernetes cluster for the deployment of LLMs using Triton Server and TRT-LLM. + + +### Prerequisites + +This guide assumes that all nodes with NVIDIA GPUs have the following: +- A node label of `nvidia.com/gpu=present` to more easily identify nodes with NVIDIA GPUs. +- A node taint of `nvidia.com/gpu=present:NoSchedule` to prevent non-GPU pods from being deployed to GPU nodes. + +> [!Tip] +> When using a Kubernetes provider like AKS, EKA, or GKE, it is usually best to use their interface when configuring nodes +> instead of using `kubectl` to do it directly. + + +### Persistent Volume Setup + +To enable multiple pods deployed to multiple nodes to load shards of the same model so that they can used in coordination to +serve inference request too large to loaded by a single GPU, we'll need a common, shared storage location. +In Kubernetes, these common, shared storage locations are referred to as persistent volumes. +Persistent volumes can be volume mapped in to any number of pods and then accessed by processes running inside of said pods +as if they were part of the pod's file system. + +Additionally, we will need to create a persistent-volume claim which can use to assign the persistent volume to a pod. + +Unfortunately, the creation of a persistent volume will depend on how your cluster is setup, and is outside the scope of this +tutorial. +That said, we will provide a basic overview of the process. + +#### Create a Persistent Volume + +If your cluster is hosted by a cloud service provider, (CSP) like Amazon (EKS), Azure (AKS), or gCloud (GKE) +step-by-step instructions are available online for how to setup a persistent volume for your cluster. +Otherwise, you will need to work with your cluster administrator or find a separate guide online on how to setup a +persistent volume for your cluster. + +The following resources can assist with the setting up of persistent volumes for your cluster. + +* [Kubernetes Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) +* [AKS Persistent Volumes](https://learn.microsoft.com/en-us/azure/aks/azure-csi-disk-storage-provision) +* [EKS Persistent Volumes](https://aws.amazon.com/blogs/storage/persistent-storage-for-kubernetes/) +* [GKE Persistent Volumes](https://cloud.google.com/kubernetes-engine/docs/concepts/persistent-volumes) +* [OKE Persistent Volumes](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingpersistentvolumeclaim.htm) + +> [!Important] +> It is important to consider the storage requirements of the models you expect your cluster to host, and be sure to +> sufficiently size the persistent volume for the combined storage size of all models. + +Below are some example values gathered from internal testing of this tutorial. 
+ +| Model | Parallelism | Raw Size | Converted Size | Total Size | +| :-------------- | ----------: | -------: | -------------: | ---------: | +| **Llama-3-8B** | 2 | 15Gi | 32Gi | 47Gi | +| **Llama-3-8B** | 4 | 15Gi | 36Gi | 51Gi | +| **Llama-3-70B** | 8 | 90Gi | 300Gi | 390Gi | + +#### Create a Persistent-Volume Claim + +In order to connect the Triton Server pods to the persistent volume created above, we need to create a persistent-volume +claim (PVC). You can use the [pvc.yaml](./pvc.yaml) file provided as part of this tutorial to create one. + +> [!Important] +> The `volumeName` property must match the `metadata.name` property of the persistent volume created above. + + +### Core Cluster Services + +Once all nodes are correctly labeled and tainted, use the following steps to prepare the cluster deploying large language +models across multiple nodes with Triton Server. + +The following series of steps are intended to prepare a fresh cluster. +For clusters in varying states, it is best to coordinate with your cluster administrator before installing new services and +capabilities. + +#### Kubernetes Node Feature Discovery service + +1. Add the Kubernetes Node Feature Discovery chart repository to the local cache. + + ```bash + helm repo add kube-nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts \ + && helm repo update + ``` + +2. Run the command below to install the service. + + ```bash + helm install -n kube-system node-feature-discovery kube-nfd/node-feature-discovery \ + --set nameOverride=node-feature-discovery \ + --set worker.tolerations[0].key=nvidia.com/gpu \ + --set worker.tolerations[0].operator=Exists \ + --set worker.tolerations[0].effect=NoSchedule + ``` + + > [!Note] + > The above command sets toleration values which allow for the deployment of a pod onto a node with + > a matching taint. + > See this document's [prerequisites](#prerequisites) for the taints this document expected to have been applied to GPU + > nodes in the cluster. + +#### NVIDIA Device Plugin for Kubernetes + +1. This step is unnecessary if the Device Plugin has already been installed in your cluster. + Cloud provider turnkey Kubernetes clusters, such as those from AKS, EKS, and GKE, often have the Device Plugin + automatically once a GPU node as been added to the cluster. + + To check if your cluster requires the NVIDIA Device Plugin for Kubernetes, run the following command and inspect + the output for `nvidia-device-plugin-daemonset`. + + ```bash + kubectl get daemonsets --all-namespaces + ``` + + Example output: + ```text + NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE + kube-system kube-proxy 6 6 6 6 6 + ``` + +2. If `nvidia-device-plugin-daemonset` is not listed, run the command below to install the plugin. + Once installed it will provide containers access to GPUs in your clusters. + + For additional information, see + [Github/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin/blob/main/README.md). + + ```bash + kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml + ``` + +#### NVIDIA GPU Feature Discovery Service + +1. This step is unnecessary if the service has already been installed in your cluster. + + To check if your cluster requires the NVIDIA Device Plugin for Kubernetes, run the following command and inspect + the output for `nvidia-device-plugin-daemonset`. 
+ + ```bash + kubectl get daemonsets --all-namespaces + ``` + + Example output: + ```text + NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE + kube-system kube-proxy 6 6 6 6 6 + kube-system nvidia-device-plugin-daemonset 6 6 6 6 6 + ``` + +2. If `gpu-feature-discovery` is listed, skip this step and the next. + + Otherwise, use the YAML file below to install the GPU Feature Discovery service. + + > [nvidia_gpu-feature-discovery_daemonset.yaml](nvidia_gpu-feature-discovery_daemonset.yaml) + + The file above was created by downloading its contents from + [GitHub/NVIDIA](https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml) + and modified specifically for this tutorial. + + ```bash + curl https://raw.githubusercontent.com/NVIDIA/gpu-feature-discovery/v0.8.2/deployments/static/gpu-feature-discovery-daemonset.yaml \ + > nvidia_gpu-feature-discovery_daemonset.yaml + ``` + +3. Then run the command below to install the + + ```bash + kubectl apply -f ./nvidia_gpu-feature-discovery_daemonset.yaml + ``` + + +### Hugging Face Authorization + +In order to download models from Hugging Face, your pods will require an access token with the appropriate permission to +download models from their servers. + +1. If you do not already have a Hugging Face access token, you will need to created one. + To create a Hugging Face access token, + [follow their guide](https://huggingface.co/docs/hub/en/security-tokens). + +2. Once you have a token, use the command below to persist the token as a secret named `hf-model-pull` in your cluster. + + ```bash + kubectl create secret generic hf-model-pull '--from-literal=password=' + ``` + +3. To verify that your secret has been created, use the following command and inspect the output for your secret. + + ```bash + kubectl get secrets + ``` + + + +## Triton Preparation + + +### Model Preparation Script + +The intention of this script to handle the acquisition of the model file from Hugging Face, the generation of the TensorRT +engine and plan files, and the caching of said generated files. +The script depends on the fact that the Kubernetes deployment scripts we'll be using rely on the persistent volume backing the +persistent-volume claim provided as part of the Helm chart. + +Specially, the model and engine directories will me mapped to folders in the persistent volume and remapped to all subsequent +pods deployed as part of the Helm chart. +This enables the generation script to detect that the plan and engine generation steps have been completed and not repeat work. + +> [!Tip] +> This script will executed as a job every time the Helm chart is installed unless the `.model.skipConversion` property is +> set to `false`. + +When Triton Server is started, the same persistent volume folders will be mounted to its container and Triton will use the +pre-generated model plan and engine files. +Not only does this enable pods on separate nodes to share the same model engine and plan files, it drastically reduces the time +required for subsequent pod starts on the same node. + +> [!Note] +> You can look at the code used to acquire and convert the models in [containers/server.py](containers/server.py). +> This file is copied into the server container image (see below) during its creation and then executed when the conversion +> job pod is deployed. + +#### Custom Container Image + +1. Using the file below, we'll create a custom container image in the next step. 
+ + > [triton_trt-llm.containerfile](containers/triton_trt-llm.containerfile) + +2. Run the following command to create a custom Triton Inference Server w/ all necessary tools to generate TensorRT-LLM + plan and engine files. In this example we'll use the tag `24.04` to match the date portion of `24.04-trtllm-python-py3` + from the base image. + + ```bash + docker build \ + --file ./triton_trt-llm.containerfile \ + --rm \ + --tag triton_trt-llm:24.04 \ + . + ``` + + ##### Custom Version of Triton CLI + + This custom Triton Server container image makes use of a custom version of the Triton CLI. + The relevant changes have been made available as a + [topic branch](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn) in the Triton CLI repository on + GitHub. + The changes in the branch can be + [inspected](https://github.com/triton-inference-server/triton_cli/compare/main...jwyman/aslb-mn) using the GitHub + interface, and primarily contain the addition of the ability to specify tensor parallelism when optimizing models for + TensorRT-LLM and enable support for additional models. + +3. Upload the Container Image to a Cluster Visible Repository. + + In order for your Kubernetes cluster to be able to download out new container image, it will need to be pushed to a + container image repository that nodes in your cluster can reach. + In this example, we'll use the fictional `nvcr.io/example` repository for demonstration purposes. + You will need to determine which repositories you have write access to that your cluster can also access. + + 1. First, re-tag the container image with the repository's name like below. + + ```bash + docker tag \ + triton_trt-llm:24.04 \ + nvcr.io/example/triton_trt-llm:24.04 + ``` + + 2. Next, upload the container image to your repository. + + ```bash + docker push nvcr.io/example/triton_trt-llm:24.04 + ``` + +#### Kubernetes Pull Secrets + +If your container image repository requires credentials to download images from, then you will need to create a Kubernetes +docker-registry secret. +We'll be using the `nvcr.io` container image repository example above for demonstration purposes. +Be sure to properly escape any special characters such as `$` in the password or username values. + +1. Use the command below to create the necessary secret. Secrets for your repository should be similar, but not be identical +to the example below. + + ```bash + kubectl create secret docker-registry ngc-container-pull \ + --docker-password='dGhpcyBpcyBub3QgYSByZWFsIHNlY3JldC4gaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4=' \ + --docker-server='nvcr.io' \ + --docker-username='\$oauthtoken' + ``` + +2. The above command will create a secret in your cluster named `ngc-container-pull`. + You can verify that the secret was created correctly using the following command and inspecting its output for the secret + you're looking for. + + ```bash + kubectl get secrets + ``` + +3. Ensure the contents of the secret are correct, you can run the following command. + + ```bash + kubectl get secret/ngc-container-pull -o yaml + ``` + + You should see an output similar to the following. 
+ + ```yaml + apiVersion: v1 + data: + .dockerconfigjson: eyJhdXRocyI6eyJudmNyLmlvIjp7InVzZXJuYW1lIjoiJG9hdXRodG9rZW4iLCJwYXNzd29yZCI6IlZHaHBjeUJwY3lCdWIzUWdZU0J5WldGc0lITmxZM0psZEN3Z2FYUWdhWE1nYjI1c2VTQm1iM0lnWkdWdGIyNXpkSEpoZEdsdmJpQndkWEp3YjNObGN5ND0iLCJhdXRoIjoiSkc5aGRYUm9kRzlyWlc0NlZrZG9jR041UW5CamVVSjFZak5SWjFsVFFubGFWMFp6U1VoT2JGa3pTbXhrUTNkbllWaFJaMkZZVFdkaU1qVnpaVk5DYldJelNXZGFSMVowWWpJMWVtUklTbWhrUjJ4MlltbENkMlJZU25kaU0wNXNZM2swWjFWSGVHeFpXRTVzU1VjMWJHUnRWbmxKU0ZaNldsTkNRMWxZVG14T2FsRm5aRWM0WjJGSGJHdGFVMEo1V2xkR2MwbElUbXhaTTBwc1pFaE5hQT09In19fQ== + kind: Secret + metadata: + name: ngc-container-pull + namespace: default + type: kubernetes.io/dockerconfigjson + ``` + + The value of `.dockerconfigjson` is a base-64 encoded string which can be decoded into the following. + + ```json + { + "auths": { + "nvcr.io": { + "username":"$oauthtoken", + "password":"VGhpcyBpcyBub3QgYSByZWFsIHNlY3JldCwgaXQgaXMgb25seSBmb3IgZGVtb25zdHJhdGlvbiBwdXJwb3Nlcy4gUGxlYXNlIG5ldmVyIHVzZSBCYXNlNjQgdG8gaGlkZSByZWFsIHNlY3JldHMh", + "auth":"JG9hdXRodG9rZW46VkdocGN5QnBjeUJ1YjNRZ1lTQnlaV0ZzSUhObFkzSmxkQ3dnYVhRZ2FYTWdiMjVzZVNCbWIzSWdaR1Z0YjI1emRISmhkR2x2YmlCd2RYSndiM05sY3k0Z1VHeGxZWE5sSUc1bGRtVnlJSFZ6WlNCQ1lYTmxOalFnZEc4Z2FHbGtaU0J5WldGc0lITmxZM0psZEhNaA==" + } + } + } + ``` + + You can use this compact command line to get the above output with a single command. + + ```bash + kubectl get secret/ngc-container-pull -o json | jq -r '.data[".dockerconfigjson"]' | base64 -d | jq + ``` + + > [!Note] + > The values of `password` and `auth` are also base-64 encoded string. + > We recommend inspecting the values of the following values: + > + > * Value of `.auths['nvcr.io'].username`. + > * Base64 decoded value of `.auths['nvcr.io'].password`. + > * Base64 decoded value of `.auths['nvcr.io'].auths`. + + + +## Triton Deployment + +> [!Note] +> Deploying Triton Server with a model that fits on a single GPU is straightforward but not explained by this guide. +> For instructions and examples of deploying a model using a single GPU or multiple GPUs on a single node, use the +> [Autoscaling and Load Balancing Generative AI w/ Triton Server and TensorRT-LLM Guide](../Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md) instead. + +Given the memory requirements of some AI models it is not possible to host them using a single device. +Triton and TensorRT-LLM provide a mechanism to enable a large model to be hosted by multiple GPU devices working in concert. +The provided sample Helm [chart](./chart/) provides a mechanism for taking advantage of this capability. + +To enable this feature, adjust the `model.tensorrtLlm.parallelism.tensor` value to an integer greater than 1. +Configuring a model to use tensor parallelism enables the TensorRT-LLM runtime to effectively combine the memory of multiple +GPUs to host a model too large to fit on a single GPU. + +Similarly, changing the value of `model.tensorrtLlm.parallelism.pipeline` will enable pipeline parallelism. +Pipeline parallelism is used to combine the compute capacity of multiple GPUs to process inference requests in parallel. + +> [!Important] +> The product of the values of `.tensor` and `.pipeline` should be a power of 2 greater than `0` and less than or equal to +> `32`. + +The number of GPUs required to host the model is equal to product of the values of `.tensor` and `.pipeline`. +When the model is deployed, one pod per GPU required will be created. 
+The Helm chart will create a leader pod and one or more work pods, depending on the number of additional pods required to +host the model. +Additionally, a model conversion job will be created to download the model from Hugging Face and then convert the downloaded +model into TRT-LLM engin and plan files. +To disable the creation of a conversion job by the Helm chart, set the values file's `model.skipConversion` property to +`false`. + +> [!Warning] +> If your cluster has insufficient resources to create the conversion job, the leader pod, and the required worker pods, +> and the job pod is not scheduled to execute first, it is possible for the example Helm chart to become "hung" due to the +> leader pod waiting on the job pod's completion and there being insufficient resources to schedule the job pod. +> +> If this occurs, it is best to delete the Helm installation and retry until the job pod is successfully scheduled. +> Once the job pod completes, it will release its resources and make them available for the other pods to start. + +### Deploying Single GPU Models + +Deploying Triton Server with a model that fits on a single GPU is straightforward using the steps below. + +1. Create a custom values file with required values: + + * Container image name. + * Model name. + * Supported / available GPU. + * Image pull secrets (if necessary). + * Hugging Face secret name. + + The provided sample Helm [chart](./chart/) include several example values files such as + [llama-3-8b_values.yaml](chart/llama-3-8b-instruct_values.yaml). + +2. Deploy LLM on Triton + TRT-LLM. + + Apply the custom values file to override the exported base values file using the command below, and create the Triton + Server Kubernetes deployment. + + > [!Tip] + > The order that the values files are specified on the command line is important with values are applied and + > override existing values in the order they are specified. + + ```bash + helm install \ + --values ./chart/values.yaml \ + --values ./chart/.yaml \ + --set 'triton.image.name=' \ + ./chart/. + ``` + + > [!Important] + > Be sure to substitute the correct values for `` and `` in the example above. + +3. Verify the Chart Installation. + + Use the following commands to inspect the installed chart and to determine if everything is working as intended. + + ```bash + kubectl get deployments,pods,services,jobs --selector='app=' + ``` + + > [!Important] + > Be sure to substitute the correct value for `` in the example above. + + You should output similar to below (assuming the installation name of "llama-3"): + + ```text + NAME READY UP-TO-DATE AVAILABLE + deployment.apps/llama-3 0/1 1 0 + + NAME READY STATUS RESTARTS + pod/llama-3-7989ffd8d-ck62t 0/1 Pending 0 + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) + service/llama-3 ClusterIP 10.100.23.237 8000/TCP,8001/TCP,8002/TCP + ``` + +4. Uninstalling the Chart + + Uninstalling a Helm chart is as straightforward as running the command below. + This is useful when experimenting with various options and configurations. + + ```bash + helm uninstall + ``` + +### How It Works + +The Helm chart creates a model-conversion job and multiple Kubernetes deployments to support the distributed model's tensor parallelism needs. +When a distributed model is deployed, a "leader" pod along with a number of "workers" to meet the model's tensor parallelism requirements are +created. +The leader pod then awaits for the conversion job to complete and for all worker pods to be successfully deployed. 
+ +The model-conversion job is responsible for downloading the configured model from Hugging Face and converting that model into a TensorRT-LLM +ready set of engine and plan files. +The model-conversion job will place all downloaded and converted files on the provided persistent volume. + +> [!Note] +> Model downloads from Hugging Face are reused when possible. +> Converted TRT-LLM models are GPU and tensor-parallelism specific. +> Therefore a converted model will exist for every GPU the model is deployed on to as well as for every configuration of tensor parallelism. + +Once these conditions are met, the leader pod creates an [`mpirun`](https://docs.open-mpi.org/en/v5.0.x/man-openmpi/man1/mpirun.1.html) process which creates a Triton Server process in each pod of the distributed model. + +The leader pod's process is responsible for handling inference request and response functionality, as well as inference request tokenization and +result de-tokenization. +Worker pods' processes provide expanded GPU compute and memory capacity. +All of the processes are coordinated by the original `mpirun` process. +Communications between the processes is accelerated by [NVIDIA Collective Communications Library](https://developer.nvidia.com/nccl) (NCCL). +NCCL enables GPU-to-GPU direct communication and avoids the wasteful data copying from GPU-to-CPU-to-GPU that occur otherwise. + + +### Potential Improvements + +#### Autoscaling and Gang Scheduling + +This guide does not provide any solution for autoscaling or load balancing Triton deployments because Kubernetes horizontal pod +autoscaling (HPA) is not capable of managing deployments composed of multiple pods. +Additionally, because the solution provided in this tutorial makes use of multiple deployments, any automation has a high risk of concurrent, +partial deployments exhausting available resources preventing any of the deployments from succeeding. + +For an example of concurrent, partial deployments preventing each other from successfully deploying, imagine a cluster with 4 nodes, each with 8 GPUs for a total of 32 available GPUs. +Now consider a model which requires 8 GPUs to be deployed and we attempt to deploy 5 copies of it. +When individually deploying the models, each deployment is assigned 8 GPUs until there are zero available GPUs remaining resulting in the model +being successfully deployed 4 times. +At this point, the system understands that there are no more available resources and the 5 model copy fails to deploy. + +However, when attempting to deploy all 5 copies of the model simultaneously, it is highly likely that each copy will get at least 1 GPU resource +assigned to it. +This results in their insufficient resources for at least two of the copies; leaving both deployments stuck in a non-functional, partially +deployed state. + +One solution to this problem would be to leverage a gang scheduler for Kubernetes. +Gang scheduling would enable the Kubernetes scheduler to only create a pod if its entire cohort of pods can be created. +This provides a solution to the partial deployment of model pods blocking each other from being fully deployed. + +> [!Note] +> Read about [gang scheduling on Wikipedia](https://en.wikipedia.org/wiki/Gang_scheduling) for additional information. + +The above solutions, however, does not provide any kind of autoscaling solution. +To achieve this, a custom, gang-schedular-aware autoscaler would be required. 
+ +#### Network Topology Aware Scheduling + +Triton Server w/ TensorRT-LLM leverage a highly-optimized networking stacked known as the +[NVIDIA Collective Communications Library](https://developer.nvidia.com/nccl) (NCCL) to enable tensor parallelization. +NCCL takes advantage of he ability for modern GPUs to leverage +[remote direct memory access](https://en.wikipedia.org/wiki/Remote_direct_memory_access) (RDMA) based network acceleration to optimize operations +between GPUs regardless if they're on the same or nearby machines. +This means that quality of the network between GPUs on separate machines directly affects the performance of distributed models. + +Providing a network topology aware scheduler for Kubernetes, could help ensure that the GPUs assigned to the pods of a model deployment are +relatively local to each other. +Ideally, on the same machine or at least the same networking switch to minimize network latency and the impact of bandwidth limitations. + + +## Developing this Guide + +During the development of this guide, I ran into several problems that needed to be solved before we could provide a useful +guide. +This section will outline and describe the issues I ran into and how we resolved them. + +> _This document was developed using a Kubernetes cluster provided by Amazon EKS._ +> _Clusters provisioned on-premises or provided by other cloud service providers such as Azure AKS or GCloud GKE might require_ +> _modifications to this guide._ + + +### Why This Set of Software Components? + +The set of software packages described in this document is close the minimum viable set of packages without handcrafting +custom Helm charts and YAML files for every package and dependency. +Is this the only set of packages and components that can be used to make this solution work? +Definitely not, there are several alternatives which could meet our requirements. +This set of packages and components is just the set I happen to choose for this guide. + +Below is a high-level description of why each package is listed in this guide. + +#### NVIDIA Device Plugin for Kubernetes + +Required to enable GPUs to be treated as resources by the Kubernetes scheduler. +Without this component, GPUs would not be assigned to containers correctly. + +#### NVIDIA GPU Discovery Service for Kubernetes + +Provides automatic labelling of Kubernetes nodes based on the NVIDIA devices and software available on the node. +Without the provided labels, it would not be possible to specify specific GPU SKUs when deploying models because the +Kubernetes scheduler treats all GPUs as identical (referring to them all with the generic resources name `nvidia.com/gpu`). + +#### Kubernetes Node Discovery Service + +This is a requirement for the [NVIDIA GPU Discovery Service for Kubernetes](#nvidia-gpu-discovery-service-for-kubernetes). + +#### NVIDIA DCGM Exporter + +Provides hardware monitoring and metrics for NVIDIA GPUs and other devices present in the cluster. +Without the metrics this provides, monitoring GPU utilization, temperature and other metrics would not be possible. + +While Triton Server has the capability to collect and serve NVIDIA hardware metrics, relying on Triton Server to provide this +service is non-optimal for several reasons. 
+ +Firstly, many processes on the same machine querying the NVIDIA device driver for current state, filtering the results for +only values that pertain to the individual process, and serving them via Triton's open-metrics server is as wasteful as the +the number of Triton Server process beyond the first on the node. + +Secondly, due to the need to interface with the kernel-mode driver to retrieve hardware metrics, queries get serialized adding +additional overhead and latency to the system. + +Finally, the rate at which metrics are collected from Triton Server is not the same as the rate at which metrics are collected +from the DCGM Exporter. +Separating the metrics collection from Triton Server allows for customized metric collection rates, which enables us to +further minimize the process overhead placed on the node. + +##### Why is the DCGM Exporter Values File Custom? + +I decided to use a custom values file when installing the DCGM Exporter Helm chart for several reasons. + +Firstly, it is my professional opinion that every container in a cluster should specify resource limits and requests. +Not doing so opens the node up to a number of difficult to diagnose failure conditions related to resource exhaustion. +Out of memory errors are the most obvious and easiest to root cause. +Additionally, difficult to reproduce, transient timeout and timing errors caused CPU over-subscription can easily happen when +any container is unconstrained and quickly waste an entire engineering team's time as they attempt to triage, debug, and +resolve them. + +Secondly, the DCGM Exporter process itself spams error logs when it cannot find NVIDIA devices in the system. +This is primarily because the service was originally created for non-Kubernetes environments. +Therefore I wanted to restrict which node the exporter would get deployed to. +Fortunately, the DCGM Helm chart makes this easy by support node selector options. + +Thirdly, because nodes with NVIDIA GPUs have been tainted with the `nvidia.com/gpu=present:NoSchedule` that prevents any +pod which does not explicitly tolerate the taint from be assigned to the node, I need to add the tolerations to the DCGM +Exporter pod. + +Finally, the default Helm chart for DCGM Exporter is missing the required `--kubernetes=true` option being passed in via +command line options when the process is started. +Without this option, DCGM Exporter does not correctly associate hardware metrics with the pods actually using it, and +there would be mechanism for understand how each pod uses the GPU resources assigned to it. + + +### Why Use the Triton CLI and Not Other Tools Provided by NVIDIA? + +I chose to use the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool to optimize models for +TensorRT-LLM instead of other available tools for a couple of reasons. + +Firstly, using the Triton CLI simplifies the conversion and optimization of models into a single command. + +Secondly, relying on the Triton CLI simplifies the creation of the container because all requirements were met with a single +`pip install` command. + +#### Why Use a Custom Branch of Triton CLI Instead of an Official Release? + +I decided to use a custom [branch of Triton CLI](https://github.com/triton-inference-server/triton_cli/tree/jwyman/aslb-mn) +because there are features this guide needed that were not present in any of the official releases available. 
+The branch is not a Merge Request because the method used to add the needed features does not aligned with changes the +maintainers have planned. +Once we can achieve alignment, this guide will be updated to use an official release. + + +### Why Does the Chart Run a Python Script Instead of Triton Server Directly? + +There are two reasons: + +1. In order to retrieve a model from Hugging Face, convert and optimize it for TensorRT-LLM, and cache it on the host, I + decided that [pod initialization container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) was the + most straightforward solution. + + In order to make the best use of the initialization container I chose to use a custom [server.py](./containers/server.py) + script that made of the new [Triton CLI](https://github.com/triton-inference-server/triton_cli) tool. + +2. Multi-GPU deployments require a rather specialized command line to run, and generating it using Helm chart scripting was + not something I wanted to deal with. + Leveraging the custom Python script was the logical, and easiest, solution. + +#### Why is the Python Written Like That? + +Because I'm not a Python developer, but I am learning! +My background is in C/C++ with plenty of experience with shell scripting languages. + + +### Why Use a Custom Triton Image? + +I decided to use a custom image for a few reasons. + +1. Given the answer above and the use of Triton CLI and a custom Python script, the initialization container needed both + components pre-installed in it to avoid unnecessary use of ephemeral storage. + + > [!Warning] + > Use of ephemeral storage can lead to pod eviction, and therefore should be avoided whenever possible. + +2. Since the Triton + TRT-LLM image is already incredibly large, I wanted to avoid consuming additional host storage space + with yet another container image. + + Additionally, the experience of a pod appearing to be stuck in the `Pending` state while it download a container prior to + the initialization container is easier to understand compared to a short `Pending` state before the initialization + container, followed by a much longer `Pending` state before the Triton Server can start. + +3. I wanted a custom, constant environment variable set for `ENGINE_DEST_PATH` that could be used by both the initialization + and Triton Server containers. + +--- + +Software versions featured in this document: + +* Triton Inference Server v2.45.0 (24.04-trtllm-python-py3) +* TensorRT-LLM v0.9.0 +* Triton CLI v0.0.7 +* NVIDIA Device Plugin for Kubernetes v0.15.0 +* NVIDIA GPU Discovery Service for Kubernetes v0.8.2 +* NVIDIA DCGM Exporter v3.3.5 +* Kubernetes Node Discovery Service v0.15.4 +* Prometheus Stack for Kubernetes v58.7.2 +* Prometheus Adapter for Kubernetes v4.10.0 + +--- + +Author: J Wyman, System Software Architect, AI & Distributed Systems + +Copyright © 2024, NVIDIA CORPORATION. All rights reserved. 
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore new file mode 100644 index 00000000..10c40355 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/.gitignore @@ -0,0 +1 @@ +dev_values.yaml diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml new file mode 100644 index 00000000..03e6d381 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +appVersion: 0.1.0 +description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial +icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png +name: triton_trt-llm_multi-node_example +version: 0.1.0 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml new file mode 100644 index 00000000..4afa2eaa --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/gpt2_values.yaml @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: gpt2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml new file mode 100644 index 00000000..803124f1 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-70b_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: NVIDIA-A10G + +model: + name: llama-2-70b + tensorrtLlm: + conversion: + gpu: 8 + memory: 256Gi + parallelism: + tensor: 8 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml new file mode 100644 index 00000000..0a701e24 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b-chat_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: llama-2-7b-chat + tensorrtLlm: + conversion: + gpu: 2 + memory: 64Gi + parallelism: + tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml new file mode 100644 index 00000000..0b0b4666 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-2-7b_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: llama-2-7b + tensorrtLlm: + conversion: + gpu: 2 + memory: 64Gi + parallelism: + tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml new file mode 100644 index 00000000..67b93d5b --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-70b-instruct_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: NVIDIA-A10G + +model: + name: llama-3-70b-instruct + tensorrtLlm: + conversion: + gpu: 8 + memory: 256Gi + parallelism: + tensor: 8 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml new file mode 100644 index 00000000..d849fecd --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b-instruct_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: llama-3-8b-instruct + tensorrtLlm: + conversion: + gpu: 4 + memory: 128Gi + parallelism: + tensor: 4 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml new file mode 100644 index 00000000..9f7b594e --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/llama-3-8b_values.yaml @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: llama-3-8b + tensorrtLlm: + conversion: + gpu: 2 + memory: 64Gi + parallelism: + tensor: 2 diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml new file mode 100644 index 00000000..12a4be4e --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/opt125m_values.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. + +gpu: Tesla-V100-SXM2-16GB + +model: + name: opt125m diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt new file mode 100644 index 00000000..6591ffbe --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/NOTES.txt @@ -0,0 +1,48 @@ +{{- $create_account := true }} +{{- $create_job := true }} +{{- $create_service := true }} +{{- with $.Values.model }} +{{- if .skipConversion }} +{{- $create_job = false }} +{{- end }} +{{- end }} +{{- with $.Values.kubernetes }} +{{- if .noService }} +{{- $create_service = false }} +{{- end }} +{{- if .serviceAccount}} +{{- $create_account = false }} +{{- end }} +{{- end }} + +{{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. + +Release Name: {{ $.Release.Name }} +Namespace: {{ $.Release.Namespace }} +Deployment Name: {{ $.Release.Name }} +{{- if $create_job }} +Conversion Job: {{ $.Release.Name }} +{{- end }} +{{- if $create_service }} +Service Name: {{ $.Release.Name }} +{{- end }} +{{- if $create_account }} +ServiceAccount Name: {{ $.Release.Name }} +{{- end }} + +Helpful commands: + + $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} + $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} + $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments +{{- if $create_job -}} +,jobs +{{- end -}} +,pods +{{- if $create_service -}} +,services +{{- end -}} +,podmonitors +{{- if $create_account -}} +,serviceAccounts +{{- end -}} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml new file mode 100644 index 00000000..705e7e10 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/deployment.yaml @@ -0,0 +1,358 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +{{- $hostRootPath := "/triton" }} +{{- $image_name := "" }} +{{- with $.Values.triton }} +{{- with .image }} +{{- $image_name = required "Property '.triton.image.name' is required." .name }} +{{- else }} +{{- fail "Property '.triton.image' is required." 
}} +{{- end }} +{{- else }} +{{- fail "Property '.triton' is required" }} +{{- end }} +{{- $model_name := "" }} +{{- $model_dt := "float16" }} +{{- $model_pp := 1 }} +{{- $model_tp := 1 }} +{{- with $.Values.kubernetes }} +{{- with .hostRootPath }} +{{- $hostRootPath = . }} +{{- end }} +{{- end }} +{{- with $.Values.model }} +{{- $model_name = required "Property '.model.name' is required." .name }} +{{- with .tensorrtLlm }} +{{- with .dataType }} +{{- $model_dt = . }} +{{- end }} +{{- with .parallelism }} +{{- with .pipeline }} +{{- $model_pp = (int .) }} +{{- end }} +{{- with .tensor }} +{{- $model_tp = (int .) }} +{{- end }} +{{- end }} +{{- end }} +{{- else }} +{{- fail "Property '.model' is required." }} +{{- end }} +{{- $model_lower := lower $model_name }} +{{- $model_upper := upper $model_name }} +{{- $pod_count := mul $model_pp $model_tp }} +{{- $triton_cpu := 4 }} +{{- $triton_memory := "32Gi" }} +{{- with $.Values.triton }} +{{- with .image }} +{{- with .name }} +{{- $image_name = . }} +{{- end }} +{{- end }} +{{- with .resources }} +{{- with .cpu }} +{{- $triton_cpu = (int .) }} +{{- end }} +{{- with .memory }} +{{- $triton_memory = . }} +{{- end }} +{{- end }} +{{- end }} +{{- $engine_path := printf "/var/run/models/%s/%dx%d/engine" $model_lower (int $model_pp) (int $model_tp) }} +{{- $model_path := printf "/var/run/models/%s/%dx%d/model" $model_lower (int $model_pp) (int $model_tp) }} +{{- $skip_conversion := false }} +{{- with $.Values.model }} +{{- with .skipConversion }} +{{- $skip_conversion = . }} +{{- end }} +{{- end }} +{{- $hf_verbosity := "error" }} +{{- with $.Values.logging }} +{{- with .initialization }} +{{- if .verbose }} +{{- $hf_verbosity = "info" }} +{{- end }} +{{- end }} +{{- end }} +{{- $service_account := $.Release.Name }} +{{- with $.Values.kubernetes }} +{{- with .serviceAccount }} +{{- $service_account = . }} +{{- end }} +{{- end }} +{{- range $i := until (int $pod_count) }} +{{- if eq $i 0 }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $.Release.Name }}-leader + labels: + app: {{ $.Release.Name }} +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + replicas: 1 + selector: + matchLabels: + app: {{ $.Release.Name }} + pod-rank: {{ $i | quote }} + template: + metadata: + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: server + pod-rank: {{ $i | quote }} +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 8 }} +{{- end }} +{{- end }} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + - key: nvidia.com/gpu.product + operator: In + values: + - {{ required "Property '.gpu' is required." 
$.Values.gpu }} + containers: + - name: triton + command: + - python3 + - ./server.py + - leader + - --deployment={{ $.Release.Name }} + - --namespace={{ $.Release.Namespace }} + - --dt={{ $model_dt }} + - --pp={{ $model_pp }} + - --tp={{ $model_tp }} + - --multinode +{{- if $skip_conversion }} + - --noconvert +{{- end }} +{{- with $.Values.logging }} +{{- with .tritonServer }} +{{- if .useIso8601 }} + - --iso8601 +{{- end }} +{{- if .verbose }} + - --verbose +{{- end }} +{{- end }} +{{- end }} + env: + - name: ENGINE_DEST_PATH + value: {{ $engine_path }} + - name: MODEL_DEST_PATH + value: {{ $model_path }} +{{- with $.Values.logging }} +{{- with .tritonServer }} +{{- if .verbose }} + - name: NCCL_DEBUG + value: INFO +{{- end }} +{{- end }} +{{- end }} + image: {{ $image_name }} + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/live + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 2 + successThreshold: 1 + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + readinessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/ready + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 2 + successThreshold: 1 + resources: + limits: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: 1 + requests: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: 1 + startupProbe: + failureThreshold: 60 + httpGet: + path: /v2/health/ready + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 15 + successThreshold: 1 + volumeMounts: + - mountPath: /var/run/models + name: model-repository + readOnly: true +{{- with $.Values.triton }} +{{- with .image }} +{{- with .pullSecrets }} + imagePullSecrets: +{{ toYaml . | indent 6 }} +{{- end }} +{{- end }} +{{- end }} + restartPolicy: Always + serviceAccountName: {{ $service_account }} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +{{- with $.Values.kubernetes }} +{{- with .tolerations }} +{{ toYaml . | indent 6 }} +{{- end }} +{{- end }} + volumes: +{{- with $.Values.model }} +{{- with .pullSecret }} + - name: hf-secret + secret: + secretName: {{ . }} +{{- end }} +{{- end }} + - name: model-repository + persistentVolumeClaim: + claimName: {{ $.Values.model.persistentVolumeClaim }} + readOnly: false +{{- else }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $.Release.Name }}-worker{{ $i }} + labels: + app: {{ $.Release.Name }} +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + replicas: 1 + selector: + matchLabels: + app: {{ $.Release.Name }} + pod-rank: {{ $i | quote }} + template: + metadata: + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: worker + pod-rank: {{ $i | quote }} +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 8 }} +{{- end }} +{{- end }} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + - key: nvidia.com/gpu.product + operator: In + values: + - {{ required "Property '.gpu' is required." 
$.Values.gpu }} + containers: + - name: worker-{{ $i }} + command: + - python3 + - ./server.py + - worker + env: + - name: ENGINE_DEST_PATH + value: {{ $engine_path }} + - name: MODEL_DEST_PATH + value: {{ $model_path }} +{{- with $.Values.logging }} +{{- with .tritonServer }} +{{- if .verbose }} + - name: NCCL_DEBUG + value: INFO +{{- end }} +{{- end }} +{{- end }} + image: {{ $image_name }} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: 1 + requests: + cpu: {{ $triton_cpu }} + ephemeral-storage: 1Gi + memory: {{ $triton_memory }} + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /var/run/models + name: model-repository + readOnly: true +{{- with $.Values.triton }} +{{- with .image }} +{{- with .pullSecrets }} + imagePullSecrets: +{{ toYaml . | indent 6 }} +{{- end }} +{{- end }} +{{- end }} + restartPolicy: Always + serviceAccountName: {{ $service_account }} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +{{- with $.Values.kubernetes }} +{{- with .tolerations }} +{{ toYaml . | indent 6 }} +{{- end }} +{{- end }} + volumes: + - name: model-repository + persistentVolumeClaim: + claimName: {{ $.Values.model.persistentVolumeClaim }} + readOnly: true +{{- end }} +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml new file mode 100644 index 00000000..55a64568 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/job.yaml @@ -0,0 +1,227 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +{{- $skip_conversion := false }} +{{- with $.Values.model }} +{{- with .skipConversion }} +{{- $skip_conversion = . }} +{{- end }} +{{- end }} +{{- if not $skip_conversion }} +{{- $hostRootPath := "/triton" }} +{{- $image_name := "" }} +{{- with $.Values.triton }} +{{- with .image }} +{{- $image_name = required "Property '.triton.image.name' is required." .name }} +{{- else }} +{{- fail "Property '.triton.image' is required." }} +{{- end }} +{{- else }} +{{- fail "Property '.triton' is required" }} +{{- end }} +{{- $model_name := "" }} +{{- $model_dt := "float16" }} +{{- $model_pp := 1 }} +{{- $model_tp := 1 }} +{{- with $.Values.kubernetes }} +{{- with .hostRootPath }} +{{- $hostRootPath = . }} +{{- end }} +{{- end }} +{{- with $.Values.model }} +{{- $model_name = required "Property '.model.name' is required." .name }} +{{- with .tensorrtLlm }} +{{- with .dataType }} +{{- $model_dt = . }} +{{- end }} +{{- with .parallelism }} +{{- with .pipeline }} +{{- $model_pp = (int .) }} +{{- end }} +{{- with .tensor }} +{{- $model_tp = (int .) }} +{{- end }} +{{- end }} +{{- end }} +{{- else }} +{{- fail "Property '.model' is required." 
}} +{{- end }} +{{- $model_lower := lower $model_name }} +{{- $model_upper := upper $model_name }} +{{- $pod_count := mul $model_pp $model_tp }} +{{- $model_cpu := 4 }} +{{- $model_gpu := 1 }} +{{- $model_memory := "32Gi" }} +{{- with $.Values.triton }} +{{- with .image }} +{{- with .name }} +{{- $image_name = . }} +{{- end }} +{{- end }} +{{- end }} +{{- with $.Values.model }} +{{- with .tensorrtLlm }} +{{- with .conversion }} +{{- with .cpu }} +{{- $model_cpu = . }} +{{- end }} +{{- with .gpu }} +{{- $model_gpu = (int .) }} +{{- end}} +{{- with .memory }} +{{- $model_memory = . }} +{{- end }} +{{- end }} +{{- end }} +{{- end }} +{{- $engine_path := printf "/var/run/models/%s/%dx%d/engine" $model_lower (int $model_pp) (int $model_tp) }} +{{- $model_path := printf "/var/run/models/%s/%dx%d/model" $model_lower (int $model_pp) (int $model_tp) }} +{{- $hf_verbosity := "error" }} +{{- with $.Values.logging }} +{{- with .initialization }} +{{- if .verbose }} +{{- $hf_verbosity = "info" }} +{{- end }} +{{- end }} +{{- end }} +{{- $service_account := $.Release.Name }} +{{- with $.Values.kubernetes }} +{{- with .serviceAccount }} +{{- $service_account = . }} +{{- end }} +{{- end }} +apiVersion: batch/v1 +kind: Job +metadata: + labels: + app: {{ $.Release.Name }} +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} +spec: + backoffLimit: 4 + template: + metadata: + labels: + app: {{ $.Release.Name }}-converter +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 8 }} +{{- end }} +{{- end }} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + - key: nvidia.com/gpu.product + operator: In + values: + - {{ required "Property '.gpu' is required." $.Values.gpu }} + containers: + - name: converter + command: + - python3 + - ./server.py + - convert + - --model={{ $model_lower }} + - --dt={{ $model_dt }} + - --pp={{ $model_pp }} + - --tp={{ $model_tp }} + - --multinode +{{- with $.Values.logging }} +{{- with .initialization }} +{{- if .verbose }} + - --verbose +{{- end }} +{{- end }} +{{- end }} + env: + - name: ENGINE_DEST_PATH + value: {{ $engine_path }} + - name: HF_HOME + value: /var/run/models/hugging_face + - name: HF_HUB_VERBOSITY + value: {{ $hf_verbosity }} + - name: MODEL_DEST_PATH + value: {{ $model_path }} +{{- with $.Values.logging }} +{{- with .initialization }} +{{- if .verbose }} + - name: NCCL_DEBUG + value: INFO +{{- end }} +{{- end }} +{{- end }} + image: {{ $image_name }} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: {{ $model_cpu }} + ephemeral-storage: 32Gi + memory: {{ $model_memory }} + nvidia.com/gpu: {{ $model_gpu }} + requests: + cpu: {{ $model_cpu }} + ephemeral-storage: 32Gi + memory: {{ $model_memory }} + nvidia.com/gpu: {{ $model_gpu }} + securityContext: + readOnlyRootFilesystem: false + runAsGroup: 0 + runAsUser: 0 + volumeMounts: +{{- with $.Values.model }} +{{- if .pullSecret }} + - mountPath: /var/run/secrets/hugging_face + name: hf-secret + readOnly: true +{{- end }} +{{- end }} + - mountPath: /var/run/models + name: model-repository + readOnly: false +{{- with $.Values.triton }} +{{- with .image }} +{{- with .pullSecrets }} + imagePullSecrets: +{{ toYaml . 
| indent 6 }} +{{- end }} +{{- end }} +{{- end }} + restartPolicy: Never + serviceAccountName: {{ $service_account }} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists +{{- with $.Values.kubernetes }} +{{- with .tolerations }} +{{ toYaml . | indent 6 }} +{{- end }} +{{- end }} + volumes: +{{- with $.Values.model }} +{{- with .pullSecret }} + - name: hf-secret + secret: + secretName: {{ . }} +{{- end }} +{{- end }} + - name: model-repository + persistentVolumeClaim: + claimName: {{ $.Values.model.persistentVolumeClaim }} + readOnly: false +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml new file mode 100644 index 00000000..4b91286d --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/pod-monitor.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: monitor + release: prometheus +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + selector: + matchLabels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: server + podMetricsEndpoints: + - port: metrics + path: /metrics diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml new file mode 100644 index 00000000..59903ae3 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/rbac.yaml @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $service_account := 0 }} +{{- with $.Values.kubernetes }} +{{- with .serviceAccount }} +{{- $service_account = . }} +{{- end }} +{{- end }} +{{- if not $service_account }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . 
| indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} +rules: +- apiGroups: + - '' + - apps + - batch + resources: + - deployments + - jobs + - pods + - pods/status + - services + verbs: + - get + - list +- apiGroups: [''] + resources: + - pods/exec + verbs: + - create + +--- + +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} +subjects: +- kind: ServiceAccount + name: {{ $.Release.Name }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ $.Release.Name }} +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml new file mode 100644 index 00000000..3bf3b3d5 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/templates/service.yaml @@ -0,0 +1,52 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $noService := false }} +{{- with $.Values.kubernetes }} +{{- with .noService }} +{{- $noService = . }} +{{- end }} +{{- end }} +{{- if $noService }} +# Chart values optioned to not create a service. Service not created. +{{- else }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: service +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + ports: + - name: http + port: 8000 + targetPort: http + - name: grpc + port: 8001 + targetPort: grpc + - name: metrics + port: 8002 + targetPort: metrics + selector: + app: {{ $.Release.Name }} + app.kubernetes.io/component: server + pod-rank: {{ 0 | quote}} + type: ClusterIP +{{- end }} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json new file mode 100644 index 00000000..99917ea2 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.schema.json @@ -0,0 +1,324 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "copyright": [ + "# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.", + "# NVIDIA CORPORATION and its licensors retain all intellectual property", + "# and proprietary rights in and to this software, related documentation", + "# and any modifications thereto. 
Any use, reproduction, disclosure or", + "# distribution of this software and related documentation without an express", + "# license agreement from NVIDIA CORPORATION is strictly prohibited." + ], + "properties": { + "gpu": { + "description": "Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label.", + "type": "string" + }, + "model": { + "description": "Configuration options related to the AI model to be deployed.", + "properties": { + "name": { + "description": "Name of the model to be served Triton Server instances.", + "pattern": "(gpt2|opt125m|llama-(2-(7b|70b)(-chat)?|3-(8b|70b)(-instruct)?))", + "type": "string" + }, + "persistentVolumeClaim": { + "description": "Persistent volume claim where model content will be persisted.", + "type": "string" + }, + "pullSecret": { + "description": "Name of the secret used to download the model from Hugging Face.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "skipConversion": { + "description": "When `false` a model conversion job is created and the leader pod will wait for the job to complete before starting Triton; otherwise this doesn't happen.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "tensorrtLlm": { + "description": "Configuration options related to the conversion of a non-optimized model into TensorRT format.", + "oneOf": [ + { + "properties": { + "conversion": { + "description": "Configuration opens related to conversion of non-TensorRT models to TensorRT engine and plan files.", + "oneOf": [ + { + "properties": { + "cpu": { + "description": "Number of logical CPU cores reserved for, and assigned to the model conversion job.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "pattern": "^\\d+m$", + "type": "string" + }, + { "type": "null" } + ] + }, + "gpu": { + "description": "Number of GPUs reserved for, and assigned to the model conversion job.", + "oneOf": [ + { + "minimum": 0, + "type": "integer" + }, + { "type": "null" } + ] + }, + "memory": { + "description": "Amount of CPU-visible system memory allocated to, and reserved for the model conversion job.", + "oneOf": [ + { + "pattern": "^\\d+[GKMgkm]i$", + "type": "string" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "dataType": { + "description": "Data type used when compiling and optimizing the model for TensorRT.", + "oneOf": [ + { + "pattern": "(bfloat16|float16|float32)", + "type": "string" + }, + { "type": "null" } + ] + }, + "enable": { + "description": "When `true`, enables conversion of models into TensorRT format before loading them into Triton Server.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "parallelism": { + "description": "Parallelism configuration options which affect how the model is converted to TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs.", + "oneOf": [ + { + "properties": { + "pipeline": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + }, + "tensor": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ + "name", + "persistentVolumeClaim" + ], + "type": "object" + }, + "triton": { + "description": "Configuration options for Triton Server.", + "properties": { + "image": { + "description": "Configuration options related to the container 
image for Triton Server.", + "properties": { + "pullSecrets": { + "description": "Optional list of pull secrets to be used when downloading the Triton Server container image.", + "oneOf": [ + { + "items": [ + { "type": "object" } + ], + "type": "array" + }, + { "type": "null" } + ] + }, + "name": { + "description": "Name of the container image containing the version of Triton Server to be used.", + "type": "string" + } + }, + "required": [ "name" ], + "type": "object" + }, + "resources": { + "description": "Configuration options managing the resources assigned to individual Triton Server instances. ", + "oneOf": [ + { + "properties": { + "cpu": { + "description": "Number of logical CPU cores reserved for, and assigned to each instance of Triton Server.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "pattern": "^\\d+m$", + "type": "string" + }, + { "type": "null" } + ] + }, + "memory": { + "description": "Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server.", + "oneOf": [ + { + "pattern": "^\\d+[GKMgkm]i$", + "type": "string" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ "image" ], + "type": "object" + }, + "logging": { + "description": "Configuration options related to how various components generate logs.", + "oneOf": [ + { + "properties": { + "initialization": { + "description": "Logging configuration options specific to the initialization container.", + "oneOf": [ + { + "properties": { + "verbose": { + "description": "When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "tritonServer": { + "description": "Logging configuration options specific to Triton Server.", + "oneOf": [ + { + "properties": { + "useIso8601": { + "description": "When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. 
", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "verbose": { + "description": "When `true` Triton Server uses verbose logging; otherwise standard logging is used.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "kubernetes": { + "description": "Configurations option related to the Kubernetes objects created by the chart.", + "oneOf": [ + { + "properties": { + "hostRootPath": { + "description": "Root file-system path used when mounting content to the underlying host.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "labels": { + "description": "Optional set of labels to be applied to created Kubernetes objects.", + "oneOf": [ + { "type": "object" }, + { "type": "null" } + ] + }, + "noService": { + "description": "When `false`, a service will not be created when the chart is installed; otherwise a service will be created.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "tolerations": { + "description": "Tolerations applied to every pod deployed as part of this deployment.", + "oneOf": [ + { + "items": [ + { + "description": "Toleration applied to every pod deployed as part of this deployment.", + "type": "object" + }, + { "type": "null" } + ], + "type": "array" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "required": [ + "gpu", + "model", + "triton" + ] +} diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml new file mode 100644 index 00000000..4d7e7328 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/chart/values.yaml @@ -0,0 +1,126 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The GPU SKU that supports `.model` and to which Triton Server instances can be deployed. +# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label. +# Run 'kubectl get nodes' to find node names. +# Run 'kubectl describe node ' to inspect a node's labels. +gpu: # (required) +# Example values: NVIDIA-A100-SXM4-40GB, NVIDIA-A10G, Tesla-V100-SXM2-16GB, Tesla-T4 + +# Configuration options related to the AI model to be deployed. +model: # (required) + # Name of the model to be served Triton Server instances. + # Supported values are: + # - gpt2 + # - llama-2-7b + # - llama-2-70b + # - llama-2-7b-chat + # - llama-2-70b-chat + # - llama-3-8b + # - llama-3-70b + # - llama-3-8b-instruct + # - llama-3-70b-instruct + # - opt125m + name: # (required) + # Persistent volume claim where model content will be persisted. + # Expected to support read/write many access. + persistentVolumeClaim: # (required) + # Name of the secret used to download the model from Hugging Face. 
+ # GPT2 does not require an access token to download. + # Other models may require per repository permissions to be granted. + pullSecret: # (optional) + # When `false` a model conversion job is created and the leader pod will wait for the job to complete before starting Triton; otherwise this doesn't happen. + # When not relying on the model conversion job, the following must exist on the persistent volume: + # - models: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/model" + # - engine: "/var/run/models/{model_name}/{pipeline_parallelism}x{tensor_parallelism}/engine" + skipConversion: # (default: false) + # Configuration options related to the conversion of a non-optimized model into TensorRT format. + tensorrtLlm: # (optional) + # Configuration opens related to conversion of non-TensorRT models to TensorRT engine and plan files. + # Ignored when `model.skipConversion` is `true`. + conversion: # (optional) + # Number of logical CPU cores reserved for, and assigned to the model conversion job. + cpu: # (default: 4) + # Number of GPUs reserved for, and assigned to the model conversion job. + gpu: # (default: 1) + # Amount of CPU-visible system memory allocated to, and reserved for the model conversion job. + memory: # (default: 32Gi) + # Data type used when compiling and optimizing the model for TensorRT. + # Supported options are float16, bfloat16, float32 + dataType: # (default: float16) + # When `true`, enables conversion of models into TensorRT format before loading them into Triton Server. + # When 'false', the init container will fall back to vLLM and parallelism options are ignored. + enable: true # (default: true) + # Parallelism configuration options which affect how the model is converted to + # TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs. + parallelism: # (optional) + # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a + # subset of layers that is executed on a separate device. + # The main limitation of this method is that, due to the sequential nature of the processing, some devices or + # layers may remain idle while waiting for the output. + pipeline: # (default: 1) + # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, + # independent blocks of computation that can be executed on different devices. + # Attention blocks and multi-layer perceptron (MLP) layers are major components of transformers that can take advantage of + # tensor parallelism. + # In multi-head attention blocks, each head or group of heads can be assigned to a different device so they can be computed + # independently and in parallel. + tensor: # (default: 1) + +# Configuration options for Triton Server. +triton: # (required) + # Configuration options related to the container image for Triton Server. + image: # (required) + # Optional list of pull secrets to be used when downloading the Triton Server container image. + pullSecrets: # (optional) + # - name: ngc-container-pull + # Name of the container image containing the version of Triton Server to be used. + name: # (required) + # Configuration options managing the resources assigned to individual Triton Server instances. + resources: # (optional) + # Number of logical CPU cores reserved for, and assigned to each instance of Triton Server. + cpu: # (default: 4) + # Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server. 
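+    # Values use Kubernetes quantity notation (for example: 32Gi).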
+    memory: # (default: 32Gi)
+
+# Configuration options related to how various components generate logs.
+logging: # (optional)
+  # Logging configuration options specific to the initialization container.
+  initialization:
+    # When `true` the model download and generation of TRT engine and plan use verbose logging; otherwise standard logging is used.
+    verbose: # (default: false)
+  # Logging configuration options specific to Triton Server.
+  tritonServer:
+    # When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used.
+    useIso8601: # (default: false)
+    # When `true` Triton Server uses verbose logging; otherwise standard logging is used.
+    verbose: # (default: false)
+
+# Configuration options related to the Kubernetes objects created by the chart.
+kubernetes: # (optional)
+  # Root file-system path used when mounting content to the underlying host.
+  hostRootPath: # (default: /triton)
+  # Optional set of labels to be applied to created Kubernetes objects.
+  # These labels can be used for association with a preexisting service object.
+  labels: # (optional)
+  #   customLabel: exampleValue
+  # When `true`, a service will not be created when the chart is installed; otherwise a service will be created.
+  noService: # (default: false)
+  # Name of the service account to use when deploying components.
+  # When not provided, a service account will be created.
+  serviceAccount: # (optional)
+  # Tolerations applied to every pod deployed as part of this deployment.
+  # Template already includes `nvidia.com/gpu=present:NoSchedule`.
+  tolerations: # (optional)
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md
new file mode 100644
index 00000000..98a9f49f
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/README.md
@@ -0,0 +1,26 @@
+
+
+
+# Container Generation
+
+The files in this folder are intended to be used to create the Triton Server container image.
+
+Run the following command to create a Triton Server container image.
+
+```bash
+docker build --file ./triton_trt-llm.containerfile --tag <image:tag> .
+```
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh
new file mode 100755
index 00000000..4eb88dab
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/kubessh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
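+
+# kubessh is used by mpirun as its remote-shell agent (see `-mca plm_rsh_agent kubessh` in server.py).
+# It takes a pod name followed by a command, and runs that command inside the pod via `kubectl exec`.
+# Example invocation (hypothetical pod name): ./kubessh example-worker1-pod hostname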
+ +pod=$1 +shift +kubectl exec $pod -- /bin/sh -c "$*" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py new file mode 100644 index 00000000..2b59895d --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/server.py @@ -0,0 +1,611 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil +import signal +import subprocess +import sys +import time + +# These values are expected to match the mount points in the Helm Chart. +# Any changes here must also be made there, and vice versa. +HUGGING_FACE_TOKEN_PATH = "/var/run/secrets/hugging_face/password" + +ERROR_EXIT_DELAY = 15 +ERROR_CODE_FATAL = 255 +ERROR_CODE_USAGE = 253 +EXIT_SUCCESS = 0 + +# Environment variable keys. +CLI_VERBOSE_KEY = "TRITON_CLI_VERBOSE" +ENGINE_PATH_KEY = "ENGINE_DEST_PATH" +HUGGING_FACE_KEY = "HF_HOME" +MODEL_PATH_KEY = "MODEL_DEST_PATH" + +HUGGING_FACE_CLI = "huggingface-cli" +DELAY_BETWEEN_QUERIES = 2 + + +# --- + + +def create_directory(directory_path: str): + if directory_path is None or len(directory_path) == 0: + return + + segments = directory_path.split("/") + path = "" + + for segment in segments: + if segment is None or len(segment) == 0: + continue + + path = f"{path}/{segment}" + + if is_verbose: + write_output(f"> mkdir {path}") + + if not os.path.exists(path): + os.mkdir(path) + + +# --- + + +def die(exit_code: int): + if exit_code is None: + exit_code = ERROR_CODE_FATAL + + write_error(f" Waiting {ERROR_EXIT_DELAY} second before exiting.") + # Delay the process' termination to provide a small window for administrators to capture the logs before it exits and restarts. + time.sleep(ERROR_EXIT_DELAY) + + exit(exit_code) + + +# --- + + +def hugging_face_authenticate(args): + # Validate that `HF_HOME` environment variable was set correctly. + if HUGGING_FACE_HOME is None or len(HUGGING_FACE_HOME) == 0: + raise Exception(f"Required environment variable '{HUGGING_FACE_KEY}' not set.") + + # When a Hugging Face secret has been mounted, we'll use that to authenticate with Hugging Face. + if os.path.exists(HUGGING_FACE_TOKEN_PATH): + with open(HUGGING_FACE_TOKEN_PATH) as token_file: + write_output( + f"Hugging Face token file '{HUGGING_FACE_TOKEN_PATH}' detected, attempting to authenticate w/ Hugging Face." + ) + write_output(" ") + + hugging_face_token = token_file.read() + + # Use Hugging Face's CLI to complete the authentication. + result = run_command( + [HUGGING_FACE_CLI, "login", "--token", hugging_face_token], [3] + ) + + if result != 0: + raise Exception(f"Hugging Face authentication failed. 
({result})") + + write_output("Hugging Face authentication successful.") + write_output(" ") + + +# --- + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("mode", type=str, choices=["convert", "leader", "worker"]) + parser.add_argument("--model", type=str, default=None) + parser.add_argument( + "--dt", + type=str, + default="float16", + choices=["bfloat16", "float16", "float32"], + help="Tensor type.", + ) + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism.") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism.") + parser.add_argument("--iso8601", action="count", default=0) + parser.add_argument("--verbose", action="count", default=0) + parser.add_argument( + "--deployment", type=str, help="Name of the Kubernetes deployment." + ) + parser.add_argument( + "--namespace", + type=str, + default="default", + help="Namespace of the Kubernetes deployment.", + ) + parser.add_argument("--multinode", action="count", default=0) + parser.add_argument( + "--noconvert", + action="count", + default=0, + help="Prevents leader waiting for model conversion before inference serving begins.", + ) + + return parser.parse_args() + + +# --- + + +def remove_path(path: str): + if os.path.exists(path): + if os.path.isfile(path): + if is_verbose: + write_output(f"> rm {path}") + + os.remove(path) + else: + if is_verbose: + write_output(f"> rm -rf {path}") + + shutil.rmtree(path) + + +# --- + + +def run_command(cmd_args: [str], omit_args: [int] = None): + command = "" + + for i, arg in enumerate(cmd_args): + command += " " + if omit_args is not None and i in omit_args: + command += "*****" + else: + command += arg + + write_output(f">{command}") + write_output(" ") + + # Run triton_cli to build the TRT-LLM engine + plan. 
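+    # stdout and stderr are inherited from this process, so the child's output is streamed to the pod's logs.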
+ return subprocess.call(cmd_args, stderr=sys.stderr, stdout=sys.stdout) + + +# --- + + +def signal_handler(sig, frame): + write_output(f"Signal {sig} detected, quitting.") + exit(EXIT_SUCCESS) + + +# --- + + +def wait_for_convert(args): + if args.noconvert != 0: + write_output("Leader skip waiting for model-conversion job.") + return + + write_output("Begin waiting for model-conversion job.") + + cmd_args = [ + "kubectl", + "get", + f"job/{args.deployment}", + "-n", + f"{args.namespace}", + "-o", + 'jsonpath={.status.active}{"|"}{.status.failed}{"|"}{.status.succeeded}', + ] + command = " ".join(cmd_args) + + active = 1 + failed = 0 + succeeded = 0 + + while active > 0 and succeeded == 0: + time.sleep(DELAY_BETWEEN_QUERIES) + + if is_verbose: + write_output(f"> {command}") + + output = subprocess.check_output(cmd_args).decode("utf-8") + if output is None or len(output) == 0: + continue + + if is_verbose: + write_output(output) + + output = output.strip(" ") + if len(output) > 0: + parts = output.split("|") + + if len(parts) > 2 and len(parts[2]) > 0: + succeeded = int(parts[2]) + else: + succeeded = 0 + + if len(parts) > 1 and len(parts[1]) > 0: + failed = int(parts[1]) + else: + failed = 0 + + if len(parts) > 0 and len(parts[0]) > 0: + active = int(parts[0]) + else: + active = 0 + + if active > 0: + write_output("Waiting for model-conversion job.") + elif succeeded > 0: + write_output("Model-conversion job succeeded.") + elif failed > 0: + write_error("Model-conversion job failed.") + raise RuntimeError("Model-conversion job failed.") + + write_output(" ") + + +# --- + + +def wait_for_workers(world_size: int): + if world_size is None or world_size <= 0: + raise RuntimeError("Argument `world_size` must be greater than zero.") + + write_output("Begin waiting for worker pods.") + + cmd_args = [ + "kubectl", + "get", + "pods", + "-n", + f"{args.namespace}", + "-l", + f"app={args.deployment}", + "-o", + "jsonpath='{.items[*].metadata.name}'", + ] + command = " ".join(cmd_args) + + workers = [] + + while len(workers) < world_size: + time.sleep(DELAY_BETWEEN_QUERIES) + + if is_verbose: + write_output(f"> {command}") + + output = subprocess.check_output(cmd_args).decode("utf-8") + + if is_verbose: + write_output(output) + + output = output.strip("'") + + workers = output.split(" ") + + if len(workers) < world_size: + write_output( + f"Waiting for worker pods, {len(workers)} of {world_size} ready." + ) + else: + write_output(f"{len(workers)} of {world_size} workers ready.") + + write_output(" ") + + if workers is not None and len(workers) > 1: + workers.sort() + + return workers + + +# --- + + +def write_output(message: str): + print(message, file=sys.stdout, flush=True) + + +# --- + + +def write_error(message: str): + print(message, file=sys.stderr, flush=True) + + +# --- +# Below this line are the primary functions. +# --- + + +def do_convert(args): + write_output("Initializing Model") + + if args.model is None or len(args.model) == 0: + write_error("fatal: Model name must be provided.") + die(ERROR_CODE_FATAL) + + create_directory(ENGINE_DIRECTORY) + create_directory(MODEL_DIRECTORY) + + hugging_face_authenticate(args) + + engine_path = ENGINE_DIRECTORY + engine_lock_file = os.path.join(engine_path, "lock") + engine_ready_file = os.path.join(engine_path, "ready") + model_path = MODEL_DIRECTORY + model_lock_file = os.path.join(model_path, "lock") + model_ready_file = os.path.join(model_path, "ready") + + # When the model and plan already exist, we can exit early, happily. 
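+    # A "ready" marker file indicates a previous conversion completed successfully; a leftover "lock"
+    # marker means an earlier conversion was interrupted, so the partial directory is removed and rebuilt.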
+    if os.path.exists(engine_ready_file) and os.path.exists(model_ready_file):
+        everything_exists = True
+
+        if os.path.exists(engine_lock_file):
+            write_output("Incomplete engine directory detected, removing.")
+            everything_exists = False
+            remove_path(engine_path)
+
+        if os.path.exists(model_lock_file):
+            write_output("Incomplete model directory detected, removing.")
+            everything_exists = False
+            remove_path(model_path)
+
+        if everything_exists:
+            write_output(
+                f"TensorRT engine and plan detected for {args.model}. No work to do, exiting."
+            )
+            exit(EXIT_SUCCESS)
+
+    write_output(f"Begin generation of TensorRT engine and plan for {args.model}.")
+    write_output(" ")
+
+    create_directory(engine_path)
+
+    # Create a lock file for the engine directory.
+    if is_verbose:
+        write_output(f"> echo '{args.model}' > {engine_lock_file}")
+
+    with open(engine_lock_file, "w") as f:
+        f.write(args.model)
+
+    create_directory(model_path)
+
+    # Create a lock file for the model directory.
+    if is_verbose:
+        write_output(f"> echo '{args.model}' > {model_lock_file}")
+
+    with open(model_lock_file, "w") as f:
+        f.write(args.model)
+
+    try:
+        # Build up a set of args for the subprocess call.
+        cmd_args = [
+            "triton",
+            "import",
+            "--model",
+            args.model,
+            "--model-repository",
+            MODEL_DIRECTORY,
+        ]
+
+        cmd_args += ["--backend", "tensorrtllm"]
+
+        if args.dt is not None and args.dt in ["bfloat16", "float16", "float32"]:
+            cmd_args += ["--data-type", args.dt]
+
+        if args.pp > 1:
+            cmd_args += ["--pipeline-parallelism", f"{args.pp}"]
+
+        if args.tp > 1:
+            cmd_args += ["--tensor-parallelism", f"{args.tp}"]
+
+        if args.tp * args.pp > 1 and args.multinode > 0:
+            cmd_args += ["--disable-custom-all-reduce"]
+
+        # When verbose, insert the verbose flag.
+        # It is important to note that the flag must immediately follow `triton` and cannot be in another ordering position.
+        # This limitation will likely be removed in a future release of triton_cli.
+        if is_verbose:
+            cmd_args.insert(1, "--verbose")
+
+        result = run_command(cmd_args)
+
+        if result == 0:
+            # Create the ready file for the engine directory.
+            if is_verbose:
+                write_output(f"> echo '{args.model}' > {engine_ready_file}")
+
+            with open(engine_ready_file, "w") as f:
+                f.write(args.model)
+
+            # Create the ready file for the model directory.
+            if is_verbose:
+                write_output(f"> echo '{args.model}' > {model_ready_file}")
+
+            with open(model_ready_file, "w") as f:
+                f.write(args.model)
+
+            # Remove the lock files.
+            if is_verbose:
+                write_output(f"> rm {engine_lock_file}")
+
+            os.remove(engine_lock_file)
+
+            if is_verbose:
+                write_output(f"> rm {model_lock_file}")
+
+            os.remove(model_lock_file)
+        else:
+            # Clean the model and engine directories when the command fails.
+            remove_path(engine_path)
+            remove_path(model_path)
+
+        exit(result)
+
+    except Exception as exception:
+        remove_path(engine_path)
+        remove_path(model_path)
+        raise exception
+
+
+# ---
+
+
+def do_leader(args):
+    world_size = args.tp * args.pp
+
+    if world_size <= 0:
+        raise Exception(
+            "usage: Options --pp and --tp must both be equal to or greater than 1."
+        )
+
+    write_output(f"Executing Leader (world size: {world_size})")
+
+    wait_for_convert(args)
+
+    workers = wait_for_workers(world_size)
+
+    if len(workers) != world_size:
+        write_error(f"fatal: {len(workers)} workers found, expected {world_size}.")
+        die(ERROR_CODE_FATAL)
+
+    cmd_args = [
+        "mpirun",
+        "--allow-run-as-root",
+    ]
+
+    if is_verbose > 0:
+        cmd_args += ["--debug-devel"]
+
+    cmd_args += [
+        "--report-bindings",
+        "-mca",
+        "plm_rsh_agent",
+        "kubessh",
+        "-np",
+        f"{world_size}",
+        "--host",
+        ",".join(workers),
+    ]
+
+    # Add per node command lines separated by ':'.
+    for i in range(world_size):
+        if i != 0:
+            cmd_args += [":"]
+
+        cmd_args += [
+            "-n",
+            "1",
+            "tritonserver",
+            "--allow-cpu-metrics=false",
+            "--allow-gpu-metrics=false",
+            "--disable-auto-complete-config",
+            f"--id=rank{i}",
+            "--model-load-thread-count=2",
+            f"--model-repository={MODEL_DIRECTORY}",
+        ]
+
+        # Rank0 node needs to support metrics collection and web services.
+        if i == 0:
+            cmd_args += [
+                "--allow-metrics=true",
+                "--metrics-interval-ms=1000",
+            ]
+
+            if is_verbose > 0:
+                cmd_args += ["--log-verbose=1"]
+
+            if args.iso8601 > 0:
+                cmd_args += ["--log-format=ISO8601"]
+
+        # Rank(N) nodes can disable metrics, web services, and logging.
+        else:
+            cmd_args += [
+                "--allow-http=false",
+                "--allow-grpc=false",
+                "--allow-metrics=false",
+                "--model-control-mode=explicit",
+                "--load-model=tensorrt_llm",
+                "--log-info=false",
+                "--log-warning=false",
+            ]
+
+    result = run_command(cmd_args)
+
+    if result != 0:
+        die(result)
+
+    exit(result)
+
+
+# ---
+
+
+def do_worker(args):
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    write_output("Worker paused awaiting SIGINT or SIGTERM.")
+    signal.pause()
+
+
+# ---
+
+
+write_output("Reporting system information.")
+run_command(["whoami"])
+run_command(["cgget", "-n", "--values-only", "--variable", "memory.limit_in_bytes", "/"])
+run_command(["nvidia-smi"])
+
+ENGINE_DIRECTORY = os.getenv(ENGINE_PATH_KEY)
+HUGGING_FACE_HOME = os.getenv(HUGGING_FACE_KEY)
+MODEL_DIRECTORY = os.getenv(MODEL_PATH_KEY)
+
+is_verbose = os.getenv(CLI_VERBOSE_KEY) is not None
+
+# Validate that `ENGINE_PATH_KEY` isn't empty.
+if ENGINE_DIRECTORY is None or len(ENGINE_DIRECTORY) == 0:
+    raise Exception(f"Required environment variable '{ENGINE_PATH_KEY}' not set.")
+
+# Validate that `MODEL_PATH_KEY` isn't empty.
+if MODEL_DIRECTORY is None or len(MODEL_DIRECTORY) == 0:
+    raise Exception(f"Required environment variable '{MODEL_PATH_KEY}' not set.")
+
+# Parse options provided.
+args = parse_arguments()
+
+# Update the is_verbose flag with values passed in by options.
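+# Verbosity can also be enabled by setting the TRITON_CLI_VERBOSE environment variable (read above).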
+is_verbose = is_verbose or args.verbose > 0
+
+if is_verbose:
+    write_output(f"{ENGINE_PATH_KEY}='{ENGINE_DIRECTORY}'")
+    write_output(f"{HUGGING_FACE_KEY}='{HUGGING_FACE_HOME}'")
+    write_output(f"{MODEL_PATH_KEY}='{MODEL_DIRECTORY}'")
+
+if args.mode == "convert":
+    do_convert(args)
+
+elif args.mode == "leader":
+    do_leader(args)
+
+elif args.mode == "worker":
+    do_worker(args)
+
+else:
+    write_error("usage: server.py <mode> [<options>].")
+    write_error(f' Invalid mode ("{args.mode}") provided.')
+    write_error(' Supported values are "convert", "leader", or "worker".')
+    die(ERROR_CODE_USAGE)
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile
new file mode 100644
index 00000000..e4fc9850
--- /dev/null
+++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/containers/triton_trt-llm.containerfile
@@ -0,0 +1,86 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3
+ARG ENGINE_DEST_PATH=/var/run/models/engine
+ARG HF_HOME=/var/run/hugging_face
+ARG MODEL_DEST_PATH=/var/run/models/model
+
+FROM ${BASE_CONTAINER_IMAGE}
+
+# Set a set of useful labels.
+LABEL "base"="${BASE_CONTAINER_IMAGE}"
+LABEL "role"="server"
+
+# Stop APT (Debian package manager) from complaining about interactivity.
+ENV DEBIAN_FRONTEND=noninteractive
+# Set additional environment values that make usage more pleasant.
+ENV TERM=xterm-256color
+
+RUN apt update \
+ && apt install --yes \
+      apt-transport-https \
+      ca-certificates \
+      curl \
+      gnupg \
+      cgroup-tools \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install kubectl because server.py script depends on it.
+# Step 1: acquire the Kubernetes APT GPG key.
+RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key \
+      | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \
+ && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg
+
+# Step 2: Acquire the API sources list for Kubernetes.
+RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /' \
+      | tee /etc/apt/sources.list.d/kubernetes.list \
+ && chmod 644 /etc/apt/sources.list.d/kubernetes.list
+
+# Step 3: Install kubectl.
+RUN apt update \
+ && apt install --yes \
+      kubectl \
+ && apt autoremove --yes \
+ && apt purge --yes \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set Triton CLI environment variables which control where
+# TRT-LLM engine and model files are downloaded to, as well as
+# the path of the Hugging Face cache.
+ENV ENGINE_DEST_PATH ${ENGINE_DEST_PATH}
+ENV HF_HOME ${HF_HOME}
+ENV MODEL_DEST_PATH ${MODEL_DEST_PATH}
+
+# Set the active working directory.
+WORKDIR /workspace
+
+# Install a custom version of Triton CLI that supports tensor parallelism and
+# the 70B version of Llama models.
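+# Note: this installs Triton CLI from the `jwyman/aslb-mn` branch referenced below; pinning to a
+# specific commit instead of a branch would make image builds more reproducible.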
+RUN pip --verbose install \ + --no-cache-dir \ + --no-color \ + --no-input \ + git+https://github.com/triton-inference-server/triton_cli.git@jwyman/aslb-mn + +# Copy kubessh script w/ executable permissions for everyone. +# This enables the script to be executed no matter the user the container is run as. +# This works around the issue of the file being non-executable when the container is build on a Windows host. +COPY --chmod=555 kubessh . +COPY server.py . + +RUN apt list --installed \ + && pip list --version + +ENTRYPOINT [ "/bin/bash" ] diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml new file mode 100644 index 00000000..30111dad --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_dcgm-exporter_values.yaml @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# All values are defaults unless specified otherwise. + +image: + repository: nvcr.io/nvidia/k8s/dcgm-exporter + pullPolicy: IfNotPresent + tag: 3.3.5-3.4.1-ubuntu22.04 + +arguments: + # Reduces the delay between GPU metrics collection passed to 1 second. +- --collect-interval=1000 +- --collectors=/etc/dcgm-exporter/dcp-metrics-included.csv + # Required. Enables Kubernetes specific metric collection features. +- --kubernetes=true + +serviceAccount: + create: true + annotations: { } + name: + +rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 + +podLabels: { } + +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9400" + # Required by Prometheus Operator for proper metrics collection. + release: prometheus +podSecurityContext: { } + +securityContext: + # Enables advanced GPU metrics features. Optional. + privileged: true + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: [ "SYS_ADMIN" ] + +service: + enable: true + type: ClusterIP + port: 9400 + address: ":9400" + annotations: + prometheus.io/port: "9400" + prometheus.io/scrape: "true" + release: prometheus + +resources: + # Sets proper resource utilization limits, and enables Kubernetes to manage the pod's resource consumption. + # All contains should have these. + limits: + cpu: 2 + memory: 1Gi + # Sets proper resource requirements, and enables Kubernetes to account for the pod's resource consumption. + # All contains should have these. + requests: + cpu: 1 + memory: 1Gi + +serviceMonitor: + enabled: true + # Reduces the delay between metric collection passes. + interval: 1s + honorLabels: false + additionalLabels: + # Useful for helping Prometheus identify metrics collectors. + monitoring: prometheus + # Required by Prometheus to identify metrics collectors. + release: prometheus + +nodeSelector: + # Ensures that DCGM Exporter process is only deployed to nodes with GPUs. 
+ nvidia.com/gpu: present + +tolerations: +# Enables the DCGM Exporter pods to be deployed to nodes with GPUs. +- key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # Ensures that DCGM Exporter process is only deployed to nodes with GPUs. + - key: nvidia.com/gpu + operator: Exists + +kubeletPath: "/var/lib/kubelet/pod-resources" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml new file mode 100644 index 00000000..02ac2cd8 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/nvidia_gpu-feature-discovery_daemonset.yaml @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# In the document below, the version `0.8.2` of the gpu-feature-discovery container is used. +# It is always wise to check if a new version has been released and to use the latest available release when possible. +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gpu-feature-discovery + namespace: kube-system + labels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/version: 0.8.2 + app.kubernetes.io/part-of: nvidia-gpu +spec: + selector: + matchLabels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/part-of: nvidia-gpu + template: + metadata: + labels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/version: 0.8.2 + app.kubernetes.io/part-of: nvidia-gpu + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + # The following set of node selector match expressions restrict the nodes the service's pods + # can be deployed to, to node which meet one or more of the following criteria: + # * Nodes with NVIDIA PCIE devices attached (10DE is NVIDIA's PCIE device number). + # * Nodes with NVIDIA CPUs. + # * Nodes with NVIDIA GPUs. + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - "true" + - matchExpressions: + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - "NVIDIA" + - matchExpressions: + - key: "nvidia.com/gpu" + operator: In + values: + - "true" + - present + containers: + - image: nvcr.io/nvidia/gpu-feature-discovery:v0.8.2 + name: gpu-feature-discovery + volumeMounts: + - name: output-dir + mountPath: "/etc/kubernetes/node-feature-discovery/features.d" + - name: host-sys + mountPath: "/sys" + env: + - name: MIG_STRATEGY + value: none + securityContext: + privileged: true + # Enables the service's pods to be deployed on nodes with GPUs. 
+ tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + volumes: + - name: output-dir + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d" + - name: host-sys + hostPath: + path: "/sys" diff --git a/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml new file mode 100644 index 00000000..8bf110f9 --- /dev/null +++ b/Deployment/Kubernetes/TensorRT-LLM_Multi-Node_Distributed_Models/pvc.yaml @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: model-volume +spec: + accessModes: + # The PVC must support multiple, concurrent readers and writers. + # This is because multiple pods will be mapped to the PVC as each worker pod needs access to the model's data. + # Additionally, multiple models could be converted in parallel by concurrent conversion jobs. + - ReadWriteMany + resources: + requests: + # This size does not need to match the PV's `spec.capacity.storage` value, but not doing so will prevent utilization of the entire PV. + storage: 512Gi + # Depending on your storage class provider, this value should be empty or the value specified by the provider. + # Please read your provider's documentation when determining this value. + storageClassName: "" + # This value must be an exact match for the PV's `metadata.name` property. + volumeName: model-volume
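+
+# A sketch of how this claim might be created and inspected (assumes this file is saved as pvc.yaml):
+#   kubectl apply -f ./pvc.yaml
+#   kubectl get persistentvolumeclaim model-volume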