diff --git a/Popular_Models_Guide/StableDiffusion/README.md b/Popular_Models_Guide/StableDiffusion/README.md index 04ecfd77..5002d7cf 100644 --- a/Popular_Models_Guide/StableDiffusion/README.md +++ b/Popular_Models_Guide/StableDiffusion/README.md @@ -29,7 +29,7 @@ # Deploying Stable Diffusion Models with Triton and TensorRT This example demonstrates how to deploy Stable Diffusion models in -Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion) +Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion) pipeline and utilities. Using the TensorRT demo as a base this example contains a reusable @@ -38,9 +38,9 @@ suitable for deploying multiple versions and configurations of Diffusion models. For more information on Stable Diffusion please visit -[stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), -[stable-diffusion-xl](https://huggingface.co/docs/diffusers/en/using-diffusers/sdxl). For -more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion). +[stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5), +[stable-diffusion-xl](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0). For +more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion). > [!Note] > This example is given as sample code and should be reviewed before use in production settings. @@ -57,7 +57,7 @@ support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/i ## Building the Triton Inference Server Image The example is designed based on the -`nvcr.io/nvidia/tritonserver:24.01-py3` docker image and [TensorRT OSS v9.2.0](https://github.com/NVIDIA/TensorRT/releases/tag/v9.2.0). 
+`nvcr.io/nvidia/tritonserver:24.08-py3` docker image and [TensorRT OSS v10.4](https://github.com/NVIDIA/TensorRT/releases/tag/v10.4). A set of convenience scripts are provided to create a docker image based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the @@ -99,6 +99,15 @@ directory as `workspace`. ### Build Stable Diffusion v 1.5 Engine +> [!Note] +> +> The model +> [stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5) +> requires logging in to Hugging Face and accepting its terms and +> conditions of use. Please set the environment variable `HF_TOKEN` +> accordingly. +> + ```bash ./scripts/build_models.sh --model stable_diffusion_1_5 ``` @@ -285,27 +294,13 @@ python3 client.py --model stable_diffusion_xl --requests 10 --clients 10 ## Known Issues and Limitations -1. When shutting down the server, an invalid memory operation occurs: - - > [!Note] - > This error is also seen in standalone applications outside of the Triton Inference Server - > and we believe this is due to an interaction between imported python modules. Further - > we haven't seen any issues related to this error and believe it can be safely - > ignored. - - - ``` - free(): invalid pointer - ``` - - -2. The diffusion backend doesn't yet support using an optional refiner +1. The diffusion backend doesn't yet support using an optional refiner model unlike the [demo][demo_reference] it's based on. 
See also [demo_txt2img_xl.py][demo_code] -[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/9.2/demo/Diffusion/demo_txt2img_xl.py +[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/10.4/demo/Diffusion/demo_txt2img_xl.py -[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion#text-to-image-using-sdxl-stable-diffusion-xl +[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion#generate-an-image-with-stable-diffusion-xl-guided-by-a-single-text-prompt diff --git a/Popular_Models_Guide/StableDiffusion/build.sh b/Popular_Models_Guide/StableDiffusion/build.sh index b2507d77..86e37a34 100755 --- a/Popular_Models_Guide/StableDiffusion/build.sh +++ b/Popular_Models_Guide/StableDiffusion/build.sh @@ -39,7 +39,7 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile # Base Images BASE_IMAGE=nvcr.io/nvidia/tritonserver -BASE_IMAGE_TAG_DIFFUSION=24.01-py3 +BASE_IMAGE_TAG_DIFFUSION=24.08-py3 get_options() { while :; do @@ -141,7 +141,7 @@ get_options() { fi if [ -z "$TAG" ]; then - TAG="tritonserver:r24.01" + TAG="tritonserver:r24.08" if [[ $FRAMEWORK == "DIFFUSION" ]]; then TAG+="-diffusion" @@ -211,7 +211,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then set -x fi $RUN_PREFIX mkdir -p $PWD/backend/diffusion - $RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion" + $RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion" { set +x; } 2>/dev/null @@ -221,7 +221,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then set -x fi - $RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model" + $RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model" { set +x; } 2>/dev/null done diff --git 
a/Popular_Models_Guide/StableDiffusion/docker/Dockerfile b/Popular_Models_Guide/StableDiffusion/docker/Dockerfile index f4daace6..f499b4dd 100644 --- a/Popular_Models_Guide/StableDiffusion/docker/Dockerfile +++ b/Popular_Models_Guide/StableDiffusion/docker/Dockerfile @@ -29,9 +29,9 @@ ARG BASE_IMAGE_TAG=24.01-py3 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as tritonserver-stable-diffusion -RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt==9.2.0.post12.dev5 +RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt-cu12==10.4.0 -RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/9.2 --single-branch /tmp/TensorRT +RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/10.4 --single-branch /tmp/TensorRT RUN pip3 install -r /tmp/TensorRT/demo/Diffusion/requirements.txt diff --git a/Popular_Models_Guide/StableDiffusion/run.sh b/Popular_Models_Guide/StableDiffusion/run.sh index be47a600..3e377e06 100755 --- a/Popular_Models_Guide/StableDiffusion/run.sh +++ b/Popular_Models_Guide/StableDiffusion/run.sh @@ -99,7 +99,7 @@ get_options() { fi if [ -z "$IMAGE" ]; then - IMAGE="tritonserver:r24.01" + IMAGE="tritonserver:r24.08" if [[ $FRAMEWORK == "DIFFUSION" ]]; then IMAGE+="-diffusion" diff --git a/Triton_Inference_Server_Python_API/README.md b/Triton_Inference_Server_Python_API/README.md index e3b27df6..06d58c0e 100644 --- a/Triton_Inference_Server_Python_API/README.md +++ b/Triton_Inference_Server_Python_API/README.md @@ -54,14 +54,14 @@ https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html ## Installation The tutorial and Python API package are designed to be installed and -run within the `nvcr.io/nvidia/tritonserver:24.01-py3` docker image. +run within the `nvcr.io/nvidia/tritonserver:24.08-py3` docker image. 
A set of convenience scripts are provided to create a docker image -based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the +based on the `nvcr.io/nvidia/tritonserver:24.08-py3` image with the Python API installed plus additional dependencies required for the examples. -### Triton Inference Server 24.01 + Python API +### Triton Inference Server 24.08 + Python API #### Clone Repository ```bash @@ -69,7 +69,7 @@ git clone https://github.com/triton-inference-server/tutorials.git cd tutorials/Triton_Inference_Server_Python_API ``` -#### Build `triton-python-api:r24.01` Image +#### Build `triton-python-api:r24.08` Image ```bash ./build.sh ``` @@ -77,7 +77,7 @@ cd tutorials/Triton_Inference_Server_Python_API #### Supported Backends The built image includes all the backends shipped by default in the -tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container. +tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container. ``` dali fil identity onnxruntime openvino python pytorch repeat square tensorflow tensorrt @@ -95,7 +95,7 @@ different data types. The `identity` model copies provided inputs of ## Hello World -### Start `triton-python-api:r24.01` Container +### Start `triton-python-api:r24.08` Container The following command starts a container and volume mounts the current directory as `workspace`. @@ -163,7 +163,7 @@ This example is based on the tutorial. -#### Build `triton-python-api:r24.01-diffusion` Image and Stable Diffusion Models +#### Build `triton-python-api:r24.08-diffusion` Image and Stable Diffusion Models Please note the following command will take many minutes depending on your hardware configuration and network connection. @@ -175,7 +175,7 @@ your hardware configuration and network connection. #### Supported Backends The built image includes all the backends shipped by default in the -tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container. +tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container. 
``` dali fil identity onnxruntime openvino python pytorch repeat square tensorflow tensorrt @@ -223,13 +223,13 @@ server.models() #### Example Output ```python -{('stable_diffusion', 1): {'name': 'stable_diffusion', 'version': 1, 'state': 'READY'}, ('text_encoder', 1): {'name': 'text_encoder', 'version': 1, 'state': 'READY'}, ('vae', 1): {'name': 'vae', 'version': 1, 'state': 'READY'}} +{('stable_diffusion_1_5', 1): {'name': 'stable_diffusion_1_5', 'version': 1, 'state': 'READY'}, ('stable_diffusion_xl', 1): {'name': 'stable_diffusion_xl', 'version': 1, 'state': 'READY'}} ``` ### Send an Inference Request ```python -model = server.model("stable_diffusion") +model = server.model("stable_diffusion_xl") responses = model.infer(inputs={"prompt":[["butterfly in new york, realistic, 4k, photograph"]]}) ``` diff --git a/Triton_Inference_Server_Python_API/build.sh b/Triton_Inference_Server_Python_API/build.sh index cdfa725d..4d787cb0 100755 --- a/Triton_Inference_Server_Python_API/build.sh +++ b/Triton_Inference_Server_Python_API/build.sh @@ -30,7 +30,7 @@ RUN_PREFIX= BUILD_MODELS= # Frameworks -declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["TRT_LLM"]=2 ["IDENTITY"]=3) +declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["IDENTITY"]=3) DEFAULT_FRAMEWORK=IDENTITY SOURCE_DIR=$(dirname "$(readlink -f "$0")") @@ -39,9 +39,8 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile # Base Images BASE_IMAGE=nvcr.io/nvidia/tritonserver -BASE_IMAGE_TAG_IDENTITY=24.01-py3 -BASE_IMAGE_TAG_DIFFUSION=24.01-py3 -BASE_IMAGE_TAG_TRT_LLM=24.01-trtllm-python-py3 +BASE_IMAGE_TAG_IDENTITY=24.08-py3 +BASE_IMAGE_TAG_DIFFUSION=24.08-py3 get_options() { while :; do @@ -138,11 +137,7 @@ get_options() { fi if [ -z "$TAG" ]; then - TAG="triton-python-api:r24.01" - - if [[ $FRAMEWORK == "TRT_LLM" ]]; then - TAG+="-trt-llm" - fi + TAG="triton-python-api:r24.08" if [[ $FRAMEWORK == "DIFFUSION" ]]; then TAG+="-diffusion" @@ -186,7 +181,7 @@ get_options "$@" if [[ $FRAMEWORK == DIFFUSION ]]; then BASE_IMAGE="tritonserver" - 
BASE_IMAGE_TAG="r24.01-diffusion" + BASE_IMAGE_TAG="r24.08-diffusion" fi # BUILD RUN TIME IMAGE @@ -207,17 +202,18 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then if [ -z "$RUN_PREFIX" ]; then set -x fi - $RUN_PREFIX mkdir -p backend/diffusion - $RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.01-diffusion - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py backend/diffusion/model.py - $RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_1_5/1 - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt diffusion-models/stable_diffusion_1_5/config.pbtxt - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep diffusion-models/stable_diffusion_1_5/1/.gitkeep - $RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_xl/1 - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt diffusion-models/stable_diffusion_xl/config.pbtxt - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep diffusion-models/stable_diffusion_xl/1/.gitkeep - $RUN_PREFIX mkdir -p scripts/stable_diffusion - $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* scripts/stable_diffusion/ + $RUN_PREFIX mkdir -p ${SOURCE_DIR}/backend/diffusion + $RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.08-diffusion + $RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace tritonserver:r24.08-diffusion /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion" + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py ${SOURCE_DIR}/backend/diffusion/model.py + $RUN_PREFIX mkdir -p 
${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1 + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/config.pbtxt + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1/.gitkeep + $RUN_PREFIX mkdir -p ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1 + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/config.pbtxt + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1/.gitkeep + $RUN_PREFIX mkdir -p ${SOURCE_DIR}/scripts/stable_diffusion + $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* ${SOURCE_DIR}/scripts/stable_diffusion/ fi @@ -231,17 +227,6 @@ $RUN_PREFIX docker build -f $DOCKERFILE $BUILD_OPTIONS $BUILD_ARGS -t $TAG $SOUR { set +x; } 2>/dev/null -if [[ $FRAMEWORK == TRT_LLM ]]; then - if [ -z "$RUN_PREFIX" ]; then - set -x - fi - - $RUN_PREFIX docker build -f $SOURCE_DIR/docker/Dockerfile.trt-llm-engine-builder $BUILD_OPTIONS $BUILD_ARGS -t trt-llm-engine-builder $SOURCE_DIR $NO_CACHE - - { set +x; } 2>/dev/null - -fi; - if [[ $FRAMEWORK == IDENTITY ]] || [[ $BUILD_MODELS == TRUE ]]; then if [[ $FRAMEWORK == DIFFUSION ]]; then @@ -249,7 +234,7 @@ if [[ $FRAMEWORK == IDENTITY ]] || [[ $BUILD_MODELS == TRUE ]]; then set -x fi - $RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_1_5" + $RUN_PREFIX docker run --gpus all --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_xl" { 
set +x; } 2>/dev/null fi diff --git a/Triton_Inference_Server_Python_API/deps/requirements.txt b/Triton_Inference_Server_Python_API/deps/requirements.txt index f48e8673..7226481a 100644 --- a/Triton_Inference_Server_Python_API/deps/requirements.txt +++ b/Triton_Inference_Server_Python_API/deps/requirements.txt @@ -24,14 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -awscli -fastapi==0.97.0 -ftfy -mypy pyright pytest -ray[all]==2.9 -scipy -sphinx -sphinx-markdown-builder -starlette==0.27.0 +ray[all]==2.36.0 diff --git a/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl b/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl deleted file mode 100644 index 59c79d15..00000000 Binary files a/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl and /dev/null differ diff --git a/Triton_Inference_Server_Python_API/docker/Dockerfile b/Triton_Inference_Server_Python_API/docker/Dockerfile index da0611d6..d74c105f 100644 --- a/Triton_Inference_Server_Python_API/docker/Dockerfile +++ b/Triton_Inference_Server_Python_API/docker/Dockerfile @@ -25,37 +25,27 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_IMAGE_TAG=24.01-py3 +ARG BASE_IMAGE_TAG=24.08-py3 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as triton-python-api RUN apt-get update; apt-get install -y gdb -COPY ./deps/requirements.txt /tmp/requirements.txt - -RUN pip install --timeout=2000 -r /tmp/requirements.txt +RUN --mount=type=bind,source=./deps/requirements.txt,target=/tmp/requirements.txt \ + pip install --timeout=2000 --requirement /tmp/requirements.txt # Finish pyright install RUN pyright --help -COPY ./deps/tritonserver-2.41.0.dev0-py3-none-any.whl /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl - RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \ - "tritonserver-*.whl" | xargs -I {} pip3 install --force-reinstall --upgrade {}[all] + "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all] -RUN pip3 show tritonserver 1>/dev/null || \ - if [ $? != 0 ]; then \ - pip3 install /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl[all] ;\ - fi +# grafana +RUN apt-get install -y adduser libfontconfig1 musl && \ + wget https://dl.grafana.com/enterprise/release/grafana-enterprise_11.2.0_amd64.deb && \ + dpkg -i grafana-enterprise_11.2.0_amd64.deb && \ + rm -rf grafana-enterprise_11.2.0_amd64.deb RUN ln -sf /bin/bash /bin/sh -COPY . /workspace - -ARG RUN_TESTS=FALSE - -RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then cd /tmp && git clone -b r23.12-python-api https://github.com/triton-inference-server/core.git && cp -rf /tmp/core/python/test /workspace/deps/ ; fi - -RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then pytest /workspace/deps ; fi - diff --git a/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder b/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder deleted file mode 100644 index 83044799..00000000 --- a/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_IMAGE_TAG=23.12-trtllm-python-py3 - -FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as trt-llm-engine-builder - -ARG TRT_LLM_BACKEND_REPO=https://github.com/triton-inference-server/tensorrtllm_backend.git -ARG TRT_LLM_BACKEND_TAG=r23.12 - -# Update the submodule TensorRT-LLM repository -RUN git clone -b $TRT_LLM_BACKEND_TAG $TRT_LLM_BACKEND_REPO -WORKDIR tensorrtllm_backend -RUN apt-get update; apt-get install -y git-lfs -RUN git lfs install && git lfs pull -RUN git submodule update --init --recursive - - -# TensorRT-LLM is required for generating engines. You can skip this step if -# you already have the package installed. If you are generating engines within -# the Triton container, you have to install the TRT-LLM package. -RUN (cd tensorrt_llm && \ - bash docker/common/install_cmake.sh && \ - export PATH=/usr/local/cmake/bin:$PATH && \ - python3 ./scripts/build_wheel.py --trt_root="/usr/local/tensorrt" && \ - pip3 install ./build/tensorrt_llm*.whl) - -# # Go to the tensorrt_llm/examples/gpt directory -# cd tensorrt_llm/examples/gpt - -# # Download weights from HuggingFace Transformers -# rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 -# pushd gpt2 && rm pytorch_model.bin model.safetensors && wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin && popd - -# # Convert weights from HF Tranformers to FT format -# python3 hf_gpt_convert.py -p 8 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism 4 --storage-type float16 - -# # Build TensorRT engines -# python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \ -# --world_size=4 \ -# --dtype float16 \ -# --use_inflight_batching \ -# --use_gpt_attention_plugin float16 \ -# --paged_kv_cache \ -# --use_gemm_plugin float16 \ -# --remove_input_padding \ -# --use_layernorm_plugin float16 \ -# --hidden_act gelu \ -# --parallel_build \ -# --output_dir=engines/fp16/4-gpu diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/README.md 
b/Triton_Inference_Server_Python_API/examples/rayserve/README.md index 506d7cd4..595c4c4f 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/README.md +++ b/Triton_Inference_Server_Python_API/examples/rayserve/README.md @@ -68,13 +68,23 @@ directory as `workspace`. ```bash ./run.sh --framework diffusion +cd examples/rayserve +``` + +### Start Local Ray Cluster +The following command starts a local Ray cluster. It also starts +prometheus and grafana instances with default Ray and Ray Serve +metrics and dashboards enabled. + +``` +./start_ray.sh ``` ### Run Deployment + ```bash -cd examples/rayserve -serve run tritonserver_deployment:tritonserver_deployment +serve run tritonserver_deployment:deployment ``` ## Send Requests to Deployment @@ -95,14 +105,13 @@ curl --request GET "http://127.0.0.1:8000/identity?string_input=hello_world!" "hello_world!" ``` - ### `/generate` The generate endpoint accepts a prompt, generates an image based on the prompt using stable diffusion, and saves the image to a file. #### Example Request ``` -curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic,4k&filename=car_sample.jpg" +curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic,4k&filename=/workspace/examples/rayserve/car_sample.jpg" ``` #### Example Output @@ -110,4 +119,21 @@ curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic, ![car_sample](../../docs/car_sample.jpg) +## View Ray and Ray Serve Dashboards + +The Ray and Ray Serve dashboards are hosted on the default port and +can be used to visualize various metrics: +``` +:8265 +``` + +## Stop the Ray Serve Cluster + +The following command stops the local Ray cluster and also stops +prometheus and grafana instances. 
+ + +```bash +./stop_ray.sh +``` diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/client.py b/Triton_Inference_Server_Python_API/examples/rayserve/client.py index b906182f..be65a0c8 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/client.py +++ b/Triton_Inference_Server_Python_API/examples/rayserve/client.py @@ -52,7 +52,7 @@ def client(endpoint, request_count, prompt, save_image, index): request_start = time.time() requests.get( f"http://127.0.0.1:8000/{endpoint}?prompt={prompt_input}{filename_input}", - timeout=60, + timeout=300, ) latencies.append(time.time() - request_start) print( diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh b/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh new file mode 100755 index 00000000..ea0f6b35 --- /dev/null +++ b/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh @@ -0,0 +1,40 @@ +#!/bin/bash -e +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +ip_address=$(hostname -I | awk '{print $1}') + +echo $ip_address + +mkdir -p /tmp/rayserve-demo; cd /tmp/rayserve-demo + +ray metrics launch-prometheus + +export RAY_GRAFANA_HOST=http://${ip_address}:3000 + +ray start --head --dashboard-host 0.0.0.0 --metrics-export-port 8080 --disable-usage-stats + +/usr/share/grafana/bin/grafana-server --homepath /usr/share/grafana --config /tmp/ray/session_latest/metrics/grafana/grafana.ini web >grafana.stdout.log 2>&1 & diff --git a/Triton_Inference_Server_Python_API/deps/requirements_torch.txt b/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh old mode 100644 new mode 100755 similarity index 94% rename from Triton_Inference_Server_Python_API/deps/requirements_torch.txt rename to Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh index f52f87ef..eadfd231 --- a/Triton_Inference_Server_Python_API/deps/requirements_torch.txt +++ b/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh @@ -1,3 +1,4 @@ +#!/bin/bash -e # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -24,10 +25,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-accelerate -diffusers==0.9.0 -torch -torchaudio -torchvision -transformers -transformers[onnxruntime] +ray stop +pkill "prometheus.*" || true +pkill "grafana.*" || true diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py index 1610b583..257dcffd 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py +++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py @@ -51,18 +51,50 @@ def _print_heading(message): print("-" * len(message)) -@serve.deployment(ray_actor_options={"num_gpus": 1}) +@serve.deployment( + ray_actor_options={"num_gpus": 1}, + max_ongoing_requests=1, + autoscaling_config={ + "min_replicas": 1, + "max_replicas": 8, + "max_ongoing_requests": 1, + "target_ongoing_requests": 1, + "upscale_delay_s": 2, + "downscale_delay_s": 120, + "upscaling_factor": 1, + "downscaling_factor": 1, + "metrics_interval_s": 2, + "look_back_period_s": 4, + }, +) @serve.ingress(app) class BaseDeployment: - def __init__(self): - self._image_size = 512 - self._model_id = "runwayml/stable-diffusion-v1-5" - from diffusers import StableDiffusionPipeline + def __init__(self, use_torch_compile=False): + self._image_size = 1024 + self._model_id = "stabilityai/stable-diffusion-xl-base-1.0" + from diffusers import AutoencoderKL, DiffusionPipeline - self._pipeline = StableDiffusionPipeline.from_pretrained( - self._model_id, revision="fp16", torch_dtype=torch.float16 + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 + ) + self._pipeline = DiffusionPipeline.from_pretrained( + self._model_id, + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True, + vae=vae, ) self._pipeline = self._pipeline.to("cuda") + if use_torch_compile: + print("compiling") + print(torch._dynamo.list_backends()) + self._pipeline.unet = torch.compile( + self._pipeline.unet, + 
fullgraph=True, + mode="reduce-overhead", + dynamic=False, + ) + self.generate("temp") @app.get("/generate") def generate(self, prompt: str, filename: Optional[str] = None) -> None: @@ -71,13 +103,28 @@ def generate(self, prompt: str, filename: Optional[str] = None) -> None: prompt, height=self._image_size, width=self._image_size, - num_inference_steps=50, + num_inference_steps=30, ).images[0] if filename: image_.save(filename) -@serve.deployment(ray_actor_options={"num_gpus": 1}) +@serve.deployment( + ray_actor_options={"num_gpus": 1}, + max_ongoing_requests=1, + autoscaling_config={ + "min_replicas": 1, + "max_replicas": 8, + "max_ongoing_requests": 1, + "target_ongoing_requests": 1, + "upscale_delay_s": 2, + "downscale_delay_s": 120, + "upscaling_factor": 1, + "downscaling_factor": 1, + "metrics_interval_s": 2, + "look_back_period_s": 4, + }, +) @serve.ingress(app) class TritonDeployment: def __init__(self): @@ -104,11 +151,9 @@ def __init__(self): self._stable_diffusion = None self._test_model = None - if not self._triton_server.model("stable_diffusion_1_5").ready(): + if not self._triton_server.model("stable_diffusion_xl").ready(): try: - self._stable_diffusion = self._triton_server.load( - "stable_diffusion_1_5" - ) + self._stable_diffusion = self._triton_server.load("stable_diffusion_xl") if not self._stable_diffusion.ready(): raise Exception("Model not ready") @@ -120,6 +165,7 @@ def __init__(self): return _print_heading("Models") pprint(self._triton_server.models()) + self.generate("temp") @app.get("/identity") def test(self, string_input: str) -> str: @@ -148,12 +194,15 @@ def generate(self, prompt: str, filename: Optional[str] = None) -> None: image_.save(filename) -def tritonserver_deployment(_args): +def deployment(_args): return TritonDeployment.bind() -def base_deployment(_args): - return BaseDeployment.bind() +def baseline(_args): + if "use-torch-compile" in _args: + return BaseDeployment.bind(use_torch_compile=True) + else: + return 
BaseDeployment.bind(use_torch_compile=False) if __name__ == "__main__": diff --git a/Triton_Inference_Server_Python_API/run.sh b/Triton_Inference_Server_Python_API/run.sh index c465e7f5..c4229cf0 100755 --- a/Triton_Inference_Server_Python_API/run.sh +++ b/Triton_Inference_Server_Python_API/run.sh @@ -29,7 +29,7 @@ TAG= RUN_PREFIX= # Frameworks -declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["TRT_LLM"]=2 ["IDENTITY"]=3) +declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["IDENTITY"]=3) DEFAULT_FRAMEWORK=IDENTITY SOURCE_DIR=$(dirname "$(readlink -f "$0")") @@ -37,7 +37,6 @@ SOURCE_DIR=$(dirname "$(readlink -f "$0")") # Base Images IMAGE= IMAGE_TAG_DIFFUSERS=diffusion -IMAGE_TAG_TRT_LLM=trt-llm get_options() { while :; do @@ -56,10 +55,10 @@ get_options() { ;; --image) if [ "$2" ]; then - BASE_IMAGE=$2 + IMAGE=$2 shift else - error 'ERROR: "--base" requires an argument.' + error 'ERROR: "--image" requires an argument.' fi ;; --dry-run) @@ -100,11 +99,7 @@ get_options() { fi if [ -z "$IMAGE" ]; then - IMAGE="triton-python-api:r24.01" - - if [[ $FRAMEWORK == "TRT_LLM" ]]; then - IMAGE+="-trt-llm" - fi + IMAGE="triton-python-api:r24.08" if [[ $FRAMEWORK == "DIFFUSION" ]]; then IMAGE+="-diffusion" @@ -137,7 +132,7 @@ fi $RUN_PREFIX mkdir -p backend/diffusion -$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion $IMAGE +$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace 
-v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/backend/diffusion:/opt/tritonserver/backends/diffusion -v/tmp:/tmp $IMAGE { set +x; } 2>/dev/null