diff --git a/Popular_Models_Guide/StableDiffusion/README.md b/Popular_Models_Guide/StableDiffusion/README.md
index 04ecfd77..5002d7cf 100644
--- a/Popular_Models_Guide/StableDiffusion/README.md
+++ b/Popular_Models_Guide/StableDiffusion/README.md
@@ -29,7 +29,7 @@
 # Deploying Stable Diffusion Models with Triton and TensorRT
 
 This example demonstrates how to deploy Stable Diffusion models in
-Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion)
+Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion)
 pipeline and utilities.
 
 Using the TensorRT demo as a base this example contains a reusable
@@ -38,9 +38,9 @@ suitable for deploying multiple versions and configurations of
 Diffusion models.
 
 For more information on Stable Diffusion please visit
-[stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5),
-[stable-diffusion-xl](https://huggingface.co/docs/diffusers/en/using-diffusers/sdxl). For
-more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion).
+[stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5),
+[stable-diffusion-xl](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0). For
+more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion).
 
 > [!Note]
 > This example is given as sample code and should be reviewed before use in production settings.
@@ -57,7 +57,7 @@ support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/i
 ## Building the Triton Inference Server Image
 
 The example is designed based on the
-`nvcr.io/nvidia/tritonserver:24.01-py3` docker image and [TensorRT OSS v9.2.0](https://github.com/NVIDIA/TensorRT/releases/tag/v9.2.0).
+`nvcr.io/nvidia/tritonserver:24.08-py3` docker image and [TensorRT OSS v10.4](https://github.com/NVIDIA/TensorRT/releases/tag/v10.4.0).
 
 A set of convenience scripts are provided to create a docker image
 based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the
@@ -99,6 +99,15 @@ directory as `workspace`.
 
 ### Build Stable Diffusion v 1.5 Engine
 
+> [!Note]
+>
+> The model
+> [stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5)
+> requires logging in to Hugging Face and acceptance of the terms and
+> conditions of use. Please set the environment variable HF_TOKEN
+> accordingly.
+>
+
 ```bash
 ./scripts/build_models.sh --model stable_diffusion_1_5
 ```
@@ -285,27 +294,13 @@ python3 client.py --model stable_diffusion_xl --requests 10 --clients 10
 
 ## Known Issues and Limitations
 
-1. When shutting down the server, an invalid memory operation occurs:
-
-   > [!Note]
-   > This error is also seen in standalone applications outside of the Triton Inference Server
-   > and we believe this is due to an interaction between imported python modules. Further
-   > we haven't seen any issues related to this error and believe it can be safely
-   > ignored.
-
-
-   ```
-   free(): invalid pointer
-   ```
-
-
-2. The diffusion backend doesn't yet support using an optional refiner
+1. The diffusion backend doesn't yet support using an optional refiner
    model unlike the [demo][demo_reference] it's based on. See also
    [demo_txt2img_xl.py][demo_code]
 
 
-[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/9.2/demo/Diffusion/demo_txt2img_xl.py
+[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/10.4/demo/Diffusion/demo_txt2img_xl.py
 
 
-[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion#text-to-image-using-sdxl-stable-diffusion-xl
+[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion#generate-an-image-with-stable-diffusion-xl-guided-by-a-single-text-prompt
 
diff --git a/Popular_Models_Guide/StableDiffusion/build.sh b/Popular_Models_Guide/StableDiffusion/build.sh
index b2507d77..86e37a34 100755
--- a/Popular_Models_Guide/StableDiffusion/build.sh
+++ b/Popular_Models_Guide/StableDiffusion/build.sh
@@ -39,7 +39,7 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile
 
 # Base Images
 BASE_IMAGE=nvcr.io/nvidia/tritonserver
-BASE_IMAGE_TAG_DIFFUSION=24.01-py3
+BASE_IMAGE_TAG_DIFFUSION=24.08-py3
 
 get_options() {
     while :; do
@@ -141,7 +141,7 @@ get_options() {
     fi
 
     if [ -z "$TAG" ]; then
-        TAG="tritonserver:r24.01"
+        TAG="tritonserver:r24.08"
 
 	if [[ $FRAMEWORK == "DIFFUSION" ]]; then
 	    TAG+="-diffusion"
@@ -211,7 +211,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
 	set -x
     fi
     $RUN_PREFIX mkdir -p $PWD/backend/diffusion
-    $RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"
+    $RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"
 
     { set +x; } 2>/dev/null
 
@@ -221,7 +221,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
 	    set -x
 	fi
 
-	$RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model"
+	$RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model"  # mount the script's own directory as /workspace
 
 	{ set +x; } 2>/dev/null
     done
diff --git a/Popular_Models_Guide/StableDiffusion/docker/Dockerfile b/Popular_Models_Guide/StableDiffusion/docker/Dockerfile
index f4daace6..f499b4dd 100644
--- a/Popular_Models_Guide/StableDiffusion/docker/Dockerfile
+++ b/Popular_Models_Guide/StableDiffusion/docker/Dockerfile
@@ -29,9 +29,9 @@ ARG BASE_IMAGE_TAG=24.01-py3
 
 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as tritonserver-stable-diffusion
 
-RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt==9.2.0.post12.dev5
+RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt-cu12==10.4.0
 
-RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/9.2 --single-branch /tmp/TensorRT
+RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/10.4 --single-branch /tmp/TensorRT
 
 RUN pip3 install -r /tmp/TensorRT/demo/Diffusion/requirements.txt
 
diff --git a/Popular_Models_Guide/StableDiffusion/run.sh b/Popular_Models_Guide/StableDiffusion/run.sh
index be47a600..3e377e06 100755
--- a/Popular_Models_Guide/StableDiffusion/run.sh
+++ b/Popular_Models_Guide/StableDiffusion/run.sh
@@ -99,7 +99,7 @@ get_options() {
     fi
 
     if [ -z "$IMAGE" ]; then
-        IMAGE="tritonserver:r24.01"
+        IMAGE="tritonserver:r24.08"
 
 	if [[ $FRAMEWORK == "DIFFUSION" ]]; then
 	    IMAGE+="-diffusion"
diff --git a/Triton_Inference_Server_Python_API/README.md b/Triton_Inference_Server_Python_API/README.md
index e3b27df6..06d58c0e 100644
--- a/Triton_Inference_Server_Python_API/README.md
+++ b/Triton_Inference_Server_Python_API/README.md
@@ -54,14 +54,14 @@ https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
 ## Installation
 
 The tutorial and Python API package are designed to be installed and
-run within the `nvcr.io/nvidia/tritonserver:24.01-py3` docker image.
+run within the `nvcr.io/nvidia/tritonserver:24.08-py3` docker image.
 
 A set of convenience scripts are provided to create a docker image
-based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the
+based on the `nvcr.io/nvidia/tritonserver:24.08-py3` image with the
 Python API installed plus additional dependencies required for the
 examples.
 
-### Triton Inference Server 24.01 + Python API
+### Triton Inference Server 24.08 + Python API
 
 #### Clone Repository
 ```bash
@@ -69,7 +69,7 @@ git clone https://github.com/triton-inference-server/tutorials.git
 cd tutorials/Triton_Inference_Server_Python_API
 ```
 
-#### Build `triton-python-api:r24.01` Image
+#### Build `triton-python-api:r24.08` Image
 ```bash
 ./build.sh
 ```
@@ -77,7 +77,7 @@ cd tutorials/Triton_Inference_Server_Python_API
 #### Supported Backends
 
 The built image includes all the backends shipped by default in the
-tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container.
+tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container.
 
 ```
 dali  fil  identity  onnxruntime  openvino  python  pytorch  repeat  square  tensorflow  tensorrt
@@ -95,7 +95,7 @@ different data types. The `identity` model copies provided inputs of
 
 ## Hello World
 
-### Start `triton-python-api:r24.01` Container
+### Start `triton-python-api:r24.08` Container
 
 The following command starts a container and volume mounts the current
 directory as `workspace`.
@@ -163,7 +163,7 @@ This example is based on the
 tutorial.
 
 
-#### Build `triton-python-api:r24.01-diffusion` Image and Stable Diffusion Models
+#### Build `triton-python-api:r24.08-diffusion` Image and Stable Diffusion Models
 
 Please note the following command will take many minutes depending on
 your hardware configuration and network connection.
@@ -175,7 +175,7 @@ your hardware configuration and network connection.
 #### Supported Backends
 
 The built image includes all the backends shipped by default in the
-tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container.
+tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container.
 
 ```
 dali  fil  identity  onnxruntime  openvino  python  pytorch  repeat  square  tensorflow  tensorrt
@@ -223,13 +223,13 @@ server.models()
 
 #### Example Output
 ```python
-{('stable_diffusion', 1): {'name': 'stable_diffusion', 'version': 1, 'state': 'READY'}, ('text_encoder', 1): {'name': 'text_encoder', 'version': 1, 'state': 'READY'}, ('vae', 1): {'name': 'vae', 'version': 1, 'state': 'READY'}}
+{('stable_diffusion_1_5', 1): {'name': 'stable_diffusion_1_5', 'version': 1, 'state': 'READY'}, ('stable_diffusion_xl', 1): {'name': 'stable_diffusion_xl', 'version': 1, 'state': 'READY'}}
 ```
 
 ### Send an Inference Request
 
 ```python
-model = server.model("stable_diffusion")
+model = server.model("stable_diffusion_xl")
 responses = model.infer(inputs={"prompt":[["butterfly in new york, realistic, 4k, photograph"]]})
 ```
 
diff --git a/Triton_Inference_Server_Python_API/build.sh b/Triton_Inference_Server_Python_API/build.sh
index cdfa725d..4d787cb0 100755
--- a/Triton_Inference_Server_Python_API/build.sh
+++ b/Triton_Inference_Server_Python_API/build.sh
@@ -30,7 +30,7 @@ RUN_PREFIX=
 BUILD_MODELS=
 
 # Frameworks
-declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["TRT_LLM"]=2 ["IDENTITY"]=3)
+declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["IDENTITY"]=3)
 DEFAULT_FRAMEWORK=IDENTITY
 
 SOURCE_DIR=$(dirname "$(readlink -f "$0")")
@@ -39,9 +39,8 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile
 
 # Base Images
 BASE_IMAGE=nvcr.io/nvidia/tritonserver
-BASE_IMAGE_TAG_IDENTITY=24.01-py3
-BASE_IMAGE_TAG_DIFFUSION=24.01-py3
-BASE_IMAGE_TAG_TRT_LLM=24.01-trtllm-python-py3
+BASE_IMAGE_TAG_IDENTITY=24.08-py3
+BASE_IMAGE_TAG_DIFFUSION=24.08-py3
 
 get_options() {
     while :; do
@@ -138,11 +137,7 @@ get_options() {
     fi
 
     if [ -z "$TAG" ]; then
-        TAG="triton-python-api:r24.01"
-
-	if [[ $FRAMEWORK == "TRT_LLM" ]]; then
-	    TAG+="-trt-llm"
-	fi
+        TAG="triton-python-api:r24.08"
 
 	if [[ $FRAMEWORK == "DIFFUSION" ]]; then
 	    TAG+="-diffusion"
@@ -186,7 +181,7 @@ get_options "$@"
 
 if [[ $FRAMEWORK == DIFFUSION ]]; then
     BASE_IMAGE="tritonserver"
-    BASE_IMAGE_TAG="r24.01-diffusion"
+    BASE_IMAGE_TAG="r24.08-diffusion"
 fi
 
 # BUILD RUN TIME IMAGE
@@ -207,17 +202,18 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
     if [ -z "$RUN_PREFIX" ]; then
 	set -x
     fi
-    $RUN_PREFIX mkdir -p backend/diffusion
-    $RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.01-diffusion
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py backend/diffusion/model.py
-    $RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_1_5/1
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt  diffusion-models/stable_diffusion_1_5/config.pbtxt
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep  diffusion-models/stable_diffusion_1_5/1/.gitkeep
-    $RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_xl/1
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt  diffusion-models/stable_diffusion_xl/config.pbtxt
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep  diffusion-models/stable_diffusion_xl/1/.gitkeep
-    $RUN_PREFIX mkdir -p scripts/stable_diffusion
-    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* scripts/stable_diffusion/
+    $RUN_PREFIX mkdir -p ${SOURCE_DIR}/backend/diffusion
+    $RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.08-diffusion
+    $RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace tritonserver:r24.08-diffusion /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py ${SOURCE_DIR}/backend/diffusion/model.py
+    $RUN_PREFIX mkdir -p ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt  ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/config.pbtxt
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep  ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1/.gitkeep
+    $RUN_PREFIX mkdir -p ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt  ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/config.pbtxt
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep  ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1/.gitkeep
+    $RUN_PREFIX mkdir -p ${SOURCE_DIR}/scripts/stable_diffusion
+    $RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* ${SOURCE_DIR}/scripts/stable_diffusion/
 
 fi
 
@@ -231,17 +227,6 @@ $RUN_PREFIX docker build -f $DOCKERFILE $BUILD_OPTIONS $BUILD_ARGS -t $TAG $SOUR
 { set +x; } 2>/dev/null
 
 
-if [[ $FRAMEWORK == TRT_LLM ]]; then
-    if [ -z "$RUN_PREFIX" ]; then
-	set -x
-    fi
-
-    $RUN_PREFIX docker build -f $SOURCE_DIR/docker/Dockerfile.trt-llm-engine-builder  $BUILD_OPTIONS $BUILD_ARGS -t trt-llm-engine-builder  $SOURCE_DIR $NO_CACHE
-
-    { set +x; } 2>/dev/null
-
-fi;
-
 if [[ $FRAMEWORK == IDENTITY ]] || [[ $BUILD_MODELS == TRUE ]]; then
 
     if [[ $FRAMEWORK == DIFFUSION ]]; then
@@ -249,7 +234,7 @@ if [[ $FRAMEWORK == IDENTITY ]] || [[ $BUILD_MODELS == TRUE ]]; then
 	    set -x
 	fi
 
-	$RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_1_5"
+	$RUN_PREFIX docker run --gpus all --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_xl"
 
 	{ set +x; } 2>/dev/null
     fi
diff --git a/Triton_Inference_Server_Python_API/deps/requirements.txt b/Triton_Inference_Server_Python_API/deps/requirements.txt
index f48e8673..7226481a 100644
--- a/Triton_Inference_Server_Python_API/deps/requirements.txt
+++ b/Triton_Inference_Server_Python_API/deps/requirements.txt
@@ -24,14 +24,6 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-awscli
-fastapi==0.97.0
-ftfy
-mypy
 pyright
 pytest
-ray[all]==2.9
-scipy
-sphinx
-sphinx-markdown-builder
-starlette==0.27.0
+ray[all]==2.36.0
diff --git a/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl b/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl
deleted file mode 100644
index 59c79d15..00000000
Binary files a/Triton_Inference_Server_Python_API/deps/tritonserver-2.41.0.dev0-py3-none-any.whl and /dev/null differ
diff --git a/Triton_Inference_Server_Python_API/docker/Dockerfile b/Triton_Inference_Server_Python_API/docker/Dockerfile
index da0611d6..d74c105f 100644
--- a/Triton_Inference_Server_Python_API/docker/Dockerfile
+++ b/Triton_Inference_Server_Python_API/docker/Dockerfile
@@ -25,37 +25,27 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_IMAGE_TAG=24.01-py3
+ARG BASE_IMAGE_TAG=24.08-py3
 
 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as triton-python-api
 
 RUN apt-get update; apt-get install -y gdb
 
-COPY ./deps/requirements.txt /tmp/requirements.txt
-
-RUN pip install --timeout=2000 -r /tmp/requirements.txt
+RUN --mount=type=bind,source=./deps/requirements.txt,target=/tmp/requirements.txt \
+    pip install --timeout=2000 --requirement /tmp/requirements.txt
 
 # Finish pyright install
 
 RUN pyright --help
 
-COPY ./deps/tritonserver-2.41.0.dev0-py3-none-any.whl /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl
-
 RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
-    "tritonserver-*.whl" | xargs -I {} pip3 install --force-reinstall --upgrade {}[all]
+    "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all]
 
-RUN pip3 show tritonserver 1>/dev/null || \
-    if [ $? != 0 ]; then \
-       pip3 install /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl[all] ;\
-    fi
+# grafana
+RUN apt-get install -y adduser libfontconfig1 musl && \
+    wget https://dl.grafana.com/enterprise/release/grafana-enterprise_11.2.0_amd64.deb && \
+    dpkg -i grafana-enterprise_11.2.0_amd64.deb && \
+    rm -rf grafana-enterprise_11.2.0_amd64.deb
 
 RUN ln -sf /bin/bash /bin/sh
 
-COPY . /workspace
-
-ARG RUN_TESTS=FALSE
-
-RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then cd /tmp && git clone -b r23.12-python-api https://github.com/triton-inference-server/core.git && cp -rf /tmp/core/python/test /workspace/deps/ ; fi
-
-RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then pytest /workspace/deps ; fi
-
diff --git a/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder b/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder
deleted file mode 100644
index 83044799..00000000
--- a/Triton_Inference_Server_Python_API/docker/Dockerfile.trt-llm-engine-builder
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_IMAGE_TAG=23.12-trtllm-python-py3
-
-FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as trt-llm-engine-builder
-
-ARG TRT_LLM_BACKEND_REPO=https://github.com/triton-inference-server/tensorrtllm_backend.git
-ARG TRT_LLM_BACKEND_TAG=r23.12
-
-# Update the submodule TensorRT-LLM repository
-RUN git clone -b $TRT_LLM_BACKEND_TAG $TRT_LLM_BACKEND_REPO
-WORKDIR tensorrtllm_backend
-RUN apt-get update; apt-get install -y git-lfs
-RUN git lfs install &&  git lfs pull
-RUN git submodule update --init --recursive
-
-
-# TensorRT-LLM is required for generating engines. You can skip this step if
-# you already have the package installed. If you are generating engines within
-# the Triton container, you have to install the TRT-LLM package.
-RUN (cd tensorrt_llm && \
-    bash docker/common/install_cmake.sh && \
-    export PATH=/usr/local/cmake/bin:$PATH && \
-    python3 ./scripts/build_wheel.py --trt_root="/usr/local/tensorrt" && \
-    pip3 install ./build/tensorrt_llm*.whl)
-
-# # Go to the tensorrt_llm/examples/gpt directory
-# cd tensorrt_llm/examples/gpt
-
-# # Download weights from HuggingFace Transformers
-# rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2
-# pushd gpt2 && rm pytorch_model.bin model.safetensors && wget -q https://huggingface.co/gpt2-medium/resolve/main/pytorch_model.bin && popd
-
-# # Convert weights from HF Tranformers to FT format
-# python3 hf_gpt_convert.py -p 8 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism 4 --storage-type float16
-
-# # Build TensorRT engines
-# python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \
-#                  --world_size=4 \
-#                  --dtype float16 \
-#                  --use_inflight_batching \
-#                  --use_gpt_attention_plugin float16 \
-#                  --paged_kv_cache \
-#                  --use_gemm_plugin float16 \
-#                  --remove_input_padding \
-#                  --use_layernorm_plugin float16 \
-#                  --hidden_act gelu \
-#                  --parallel_build \
-#                  --output_dir=engines/fp16/4-gpu
diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/README.md b/Triton_Inference_Server_Python_API/examples/rayserve/README.md
index 506d7cd4..595c4c4f 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/README.md
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/README.md
@@ -68,13 +68,23 @@ directory as `workspace`.
 
 ```bash
 ./run.sh --framework diffusion
+cd examples/rayserve
+```
+
+### Start Local Ray Cluster
 
+The following command starts a local Ray cluster. It also starts
+prometheus and grafana instances with default Ray and Ray Serve
+metrics and dashboards enabled.
+
+```
+./start_ray.sh
 ```
 
 ### Run Deployment
+
 ```bash
-cd examples/rayserve
-serve run tritonserver_deployment:tritonserver_deployment
+serve run tritonserver_deployment:deployment
 ```
 
 ## Send Requests to Deployment
@@ -95,14 +105,13 @@ curl --request GET "http://127.0.0.1:8000/identity?string_input=hello_world!"
 "hello_world!"
 ```
 
-
 ### `/generate`
 The generate endpoint accepts a prompt, generates an image based on
 the prompt using stable diffusion, and saves the image to a file.
 
 #### Example Request
 ```
-curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic,4k&filename=car_sample.jpg"
+curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic,4k&filename=/workspace/examples/rayserve/car_sample.jpg"
 ```
 
 #### Example Output
@@ -110,4 +119,21 @@ curl --request GET "http://127.0.0.1:8000/generate?prompt=car,model-t,realistic,
 ![car_sample](../../docs/car_sample.jpg)
 
 
+## View Ray and Ray Serve Dashboards
+
+The Ray and Ray Serve dashboards are hosted on the default port and
+can be used to visualize various metrics:
 
+```
+<IP_ADDRESS>:8265
+```
+
+## Stop the Ray Serve Cluster
+
+The following command stops the local Ray cluster and also stops
+prometheus and grafana instances.
+
+
+```bash
+./stop_ray.sh
+```
diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/client.py b/Triton_Inference_Server_Python_API/examples/rayserve/client.py
index b906182f..be65a0c8 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/client.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/client.py
@@ -52,7 +52,7 @@ def client(endpoint, request_count, prompt, save_image, index):
         request_start = time.time()
         requests.get(
             f"http://127.0.0.1:8000/{endpoint}?prompt={prompt_input}{filename_input}",
-            timeout=60,
+            timeout=300,
         )
         latencies.append(time.time() - request_start)
     print(
diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh b/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh
new file mode 100755
index 00000000..ea0f6b35
--- /dev/null
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/start_ray.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -e
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ip_address=$(hostname -I | awk '{print $1}')
+
+echo $ip_address
+
+mkdir -p /tmp/rayserve-demo; cd /tmp/rayserve-demo
+
+ray metrics launch-prometheus
+
+export RAY_GRAFANA_HOST=http://${ip_address}:3000
+
+ray start --head --dashboard-host 0.0.0.0 --metrics-export-port 8080 --disable-usage-stats
+
+/usr/share/grafana/bin/grafana-server --homepath /usr/share/grafana --config /tmp/ray/session_latest/metrics/grafana/grafana.ini web >grafana.stdout.log 2>&1 &
diff --git a/Triton_Inference_Server_Python_API/deps/requirements_torch.txt b/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh
old mode 100644
new mode 100755
similarity index 94%
rename from Triton_Inference_Server_Python_API/deps/requirements_torch.txt
rename to Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh
index f52f87ef..eadfd231
--- a/Triton_Inference_Server_Python_API/deps/requirements_torch.txt
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/stop_ray.sh
@@ -1,3 +1,4 @@
+#!/bin/bash -e
 # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -24,10 +25,6 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-accelerate
-diffusers==0.9.0
-torch
-torchaudio
-torchvision
-transformers
-transformers[onnxruntime]
+ray stop  # stop the local Ray cluster started by start_ray.sh
+pkill "prometheus.*" || true  # quoted so the shell cannot glob it; pkill exits 1 when nothing matches, which must not abort under -e
+pkill "grafana.*" || true  # still attempted even if no prometheus process was found
diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
index 1610b583..257dcffd 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
@@ -51,18 +51,50 @@ def _print_heading(message):
     print("-" * len(message))
 
 
-@serve.deployment(ray_actor_options={"num_gpus": 1})
+@serve.deployment(
+    ray_actor_options={"num_gpus": 1},
+    max_ongoing_requests=1,
+    autoscaling_config={
+        "min_replicas": 1,
+        "max_replicas": 8,
+        "max_ongoing_requests": 1,
+        "target_ongoing_requests": 1,
+        "upscale_delay_s": 2,
+        "downscale_delay_s": 120,
+        "upscaling_factor": 1,
+        "downscaling_factor": 1,
+        "metrics_interval_s": 2,
+        "look_back_period_s": 4,
+    },
+)
 @serve.ingress(app)
 class BaseDeployment:
-    def __init__(self):
-        self._image_size = 512
-        self._model_id = "runwayml/stable-diffusion-v1-5"
-        from diffusers import StableDiffusionPipeline
+    def __init__(self, use_torch_compile=False):
+        self._image_size = 1024
+        self._model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+        from diffusers import AutoencoderKL, DiffusionPipeline
 
-        self._pipeline = StableDiffusionPipeline.from_pretrained(
-            self._model_id, revision="fp16", torch_dtype=torch.float16
+        vae = AutoencoderKL.from_pretrained(
+            "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+        )
+        self._pipeline = DiffusionPipeline.from_pretrained(
+            self._model_id,
+            torch_dtype=torch.float16,
+            variant="fp16",
+            use_safetensors=True,
+            vae=vae,
         )
         self._pipeline = self._pipeline.to("cuda")
+        if use_torch_compile:
+            print("compiling")
+            print(torch._dynamo.list_backends())
+            self._pipeline.unet = torch.compile(
+                self._pipeline.unet,
+                fullgraph=True,
+                mode="reduce-overhead",
+                dynamic=False,
+            )
+        self.generate("temp")
 
     @app.get("/generate")
     def generate(self, prompt: str, filename: Optional[str] = None) -> None:
@@ -71,13 +103,28 @@ def generate(self, prompt: str, filename: Optional[str] = None) -> None:
                 prompt,
                 height=self._image_size,
                 width=self._image_size,
-                num_inference_steps=50,
+                num_inference_steps=30,
             ).images[0]
             if filename:
                 image_.save(filename)
 
 
-@serve.deployment(ray_actor_options={"num_gpus": 1})
+@serve.deployment(
+    ray_actor_options={"num_gpus": 1},
+    max_ongoing_requests=1,
+    autoscaling_config={
+        "min_replicas": 1,
+        "max_replicas": 8,
+        "max_ongoing_requests": 1,
+        "target_ongoing_requests": 1,
+        "upscale_delay_s": 2,
+        "downscale_delay_s": 120,
+        "upscaling_factor": 1,
+        "downscaling_factor": 1,
+        "metrics_interval_s": 2,
+        "look_back_period_s": 4,
+    },
+)
 @serve.ingress(app)
 class TritonDeployment:
     def __init__(self):
@@ -104,11 +151,9 @@ def __init__(self):
         self._stable_diffusion = None
         self._test_model = None
 
-        if not self._triton_server.model("stable_diffusion_1_5").ready():
+        if not self._triton_server.model("stable_diffusion_xl").ready():
             try:
-                self._stable_diffusion = self._triton_server.load(
-                    "stable_diffusion_1_5"
-                )
+                self._stable_diffusion = self._triton_server.load("stable_diffusion_xl")
 
                 if not self._stable_diffusion.ready():
                     raise Exception("Model not ready")
@@ -120,6 +165,7 @@ def __init__(self):
                 return
         _print_heading("Models")
         pprint(self._triton_server.models())
+        self.generate("temp")
 
     @app.get("/identity")
     def test(self, string_input: str) -> str:
@@ -148,12 +194,15 @@ def generate(self, prompt: str, filename: Optional[str] = None) -> None:
                 image_.save(filename)
 
 
-def tritonserver_deployment(_args):
+def deployment(_args):
     return TritonDeployment.bind()
 
 
-def base_deployment(_args):
-    return BaseDeployment.bind()
+def baseline(_args):
+    if "use-torch-compile" in _args:
+        return BaseDeployment.bind(use_torch_compile=True)
+    else:
+        return BaseDeployment.bind(use_torch_compile=False)
 
 
 if __name__ == "__main__":
diff --git a/Triton_Inference_Server_Python_API/run.sh b/Triton_Inference_Server_Python_API/run.sh
index c465e7f5..c4229cf0 100755
--- a/Triton_Inference_Server_Python_API/run.sh
+++ b/Triton_Inference_Server_Python_API/run.sh
@@ -29,7 +29,7 @@ TAG=
 RUN_PREFIX=
 
 # Frameworks
-declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["TRT_LLM"]=2 ["IDENTITY"]=3)
+declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["IDENTITY"]=3)
 DEFAULT_FRAMEWORK=IDENTITY
 
 SOURCE_DIR=$(dirname "$(readlink -f "$0")")
@@ -37,7 +37,6 @@ SOURCE_DIR=$(dirname "$(readlink -f "$0")")
 # Base Images
 IMAGE=
 IMAGE_TAG_DIFFUSERS=diffusion
-IMAGE_TAG_TRT_LLM=trt-llm
 
 get_options() {
     while :; do
@@ -56,10 +55,10 @@ get_options() {
             ;;
         --image)
             if [ "$2" ]; then
-                BASE_IMAGE=$2
+                IMAGE=$2
                 shift
             else
-                error 'ERROR: "--base" requires an argument.'
+                error 'ERROR: "--image" requires an argument.'
             fi
             ;;
         --dry-run)
@@ -100,11 +99,7 @@ get_options() {
     fi
 
     if [ -z "$IMAGE" ]; then
-        IMAGE="triton-python-api:r24.01"
-
-	if [[ $FRAMEWORK == "TRT_LLM" ]]; then
-	    IMAGE+="-trt-llm"
-	fi
+        IMAGE="triton-python-api:r24.08"
 
 	if [[ $FRAMEWORK == "DIFFUSION" ]]; then
 	    IMAGE+="-diffusion"
@@ -137,7 +132,7 @@ fi
 
 $RUN_PREFIX mkdir -p backend/diffusion
 
-$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion $IMAGE
+$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/backend/diffusion:/opt/tritonserver/backends/diffusion -v/tmp:/tmp $IMAGE
 
 { set +x; } 2>/dev/null