diff --git a/default.nix b/default.nix
index c3a72ee..63dabf2 100644
--- a/default.nix
+++ b/default.nix
@@ -15,36 +15,29 @@ in
     cog_version = "0.10.0-alpha16";
     cuda = "12.1"; # todo: 12.2
     gpu = true;
-    # inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
+    # inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line
     python_packages = [
       "--extra-index-url"
       "https://pypi.nvidia.com"
-      "tensorrt_llm==0.9.0"
+      "tensorrt_llm==0.10.0"
+      "tensorrt-cu12==10.1.0"
       "torch==2.2.2"
-      "tensorrt==9.3.0.post12.dev1"
-      "tensorrt-bindings==9.3.0.post12.dev1"
-      "tensorrt-libs==9.3.0.post12.dev1"
-      "nvidia-pytriton==0.5.2" # corresponds to 2.42.0
-      "httpx"
-      "nvidia-cublas-cu12<12.2"
-      "nvidia-cuda-nvrtc-cu12<12.2"
-      "nvidia-cuda-runtime-cu12<12.2"
+      "nvidia-pytriton==0.5.6" # corresponds to 2.44.0
       "omegaconf"
       "hf-transfer"
-      "tokenizers"
+      "tokenizers>=0.19.0"
     ];
     # don't ask why it needs ssh
     system_packages = [ "pget" "openssh" "openmpi" ];
   };
   python-env.pip = {
     uv.enable = true;
-    # todo: add some constraints to match cudaPackages
     constraintsList = [
-      "nvidia-cudnn-cu12<9"
+      # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}"
+      "nvidia-cublas-cu12==${cudaPackages.libcublas.version}"
     ];
     overridesList = [
-      "tokenizers==0.19.0"
-      "transformers==4.40.0"
+      "pydantic==1.10.16"
     ];
   };
   cognix.includeNix = true;
@@ -56,27 +49,31 @@ in
     # tensorrt likes doing a pip invocation from it's setup.py
     # circumvent by manually depending on tensorrt_libs, tensorrt_bindings
     # and setting this env variable
-    tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
-    # TODO remove upon next rebuild:
-    tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [
-      tensorrt-libs.public
-      tensorrt-bindings.public
+    tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true;
+    tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ];
+    tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [
+      pythonDrvs.tensorrt-cu12-libs.public
     ];
-    tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ];
-    tensorrt-libs.mkDerivation.postFixup = ''
+    # add unversioned .so symlinks for libnvinfer/libnvonnxparser; fixes tensorrt-llm build
+    tensorrt-cu12-libs.mkDerivation.postFixup = ''
       pushd $out/${site}/tensorrt_libs
-      ln -s libnvinfer.so.9 libnvinfer.so
-      ln -s libnvonnxparser.so.9 libnvonnxparser.so
+      ln -s libnvinfer.so.10 libnvinfer.so
+      ln -s libnvonnxparser.so.10 libnvonnxparser.so
       popd
     '';
-    tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
+    tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
     tensorrt-llm = {
       mkDerivation.buildInputs = [ cudaPackages.nccl ];
       mkDerivation.propagatedBuildInputs = with pythonDrvs; [
-        tensorrt-libs.public # libnvinfer, onnxparse
+        tensorrt-cu12-libs.public # libnvinfer, onnxparse
       ];
       env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ];
-      env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"];
+      env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ];
+      mkDerivation.postInstall = ''
+        pushd $out/${site}/tensorrt_llm/bin
+        patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker
+        popd
+      '';
     };
     # has some binaries that want cudart
     tritonclient.mkDerivation.postInstall = "rm -r $out/bin";
@@ -131,8 +128,8 @@ in
   deps.tensorrt-src = pkgs.fetchFromGitHub {
     owner = "NVIDIA";
     repo = "TensorRT";
-    rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch
-    hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE=";
+    rev = "v10.0.1";
+    hash = "sha256-lSEw0GM0eW2BHNBq/wTQA8v3aNueE3FT+k9F5nH1OgA=";
   };
   # todo: replace with lockfile
   deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec {
diff --git a/flake.lock b/flake.lock
index 4f4dd20..dfab778 100644
--- a/flake.lock
+++ b/flake.lock
@@ -12,11 +12,11 @@
         "rust-overlay": "rust-overlay"
       },
       "locked": {
-        "lastModified": 1718288375,
-        "narHash": "sha256-EDZYEqYzYa97bk4bJxcnsHVHr8OmrGki6o7yVCjXQGk=",
+        "lastModified": 1721228311,
+        "narHash": "sha256-EEe5Kcno5FMFSd2aYVB2ONHFpe/9k0CX1gIFjNQgV+A=",
         "owner": "datakami",
         "repo": "cognix",
-        "rev": "8eaff212007474e510af24e51bb22a009a0361a5",
+        "rev": "8c28f745d7339c495265a85fb65da3ce5592f0ef",
         "type": "github"
       },
       "original": {
@@ -32,15 +32,15 @@
         "pyproject-nix": "pyproject-nix"
       },
       "locked": {
-        "lastModified": 1710167744,
-        "narHash": "sha256-z78iB1ckRQuJluM82iCuQNjN5hqsNpd1om0q75ncza4=",
-        "owner": "yorickvp",
+        "lastModified": 1719513340,
+        "narHash": "sha256-on3zRua52KZ8G5kBOXMQOzrsA07ywVMNdcIWJEeotfo=",
+        "owner": "nix-community",
         "repo": "dream2nix",
-        "rev": "3bfbbbb19471b60cf1bb7f7c476588a36ac3fb04",
+        "rev": "4d441820e0d0916c97d7af6c4d4f6843d676e242",
         "type": "github"
       },
       "original": {
-        "owner": "yorickvp",
+        "owner": "nix-community",
         "repo": "dream2nix",
         "type": "github"
       }
@@ -83,11 +83,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709780214,
-        "narHash": "sha256-p4iDKdveHMhfGAlpxmkCtfQO3WRzmlD11aIcThwPqhk=",
+        "lastModified": 1719436386,
+        "narHash": "sha256-NBGYaic5FLRg8AWSj6yr4g2IlMPUxNCVjRK6+RNuQBc=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "f945939fd679284d736112d3d5410eb867f3b31c",
+        "rev": "c66e984bda09e7230ea7b364e677c5ba4f0d36d0",
         "type": "github"
       },
       "original": {
diff --git a/flake.nix b/flake.nix
index 9471f5f..a6f9f11 100644
--- a/flake.nix
+++ b/flake.nix
@@ -26,7 +26,7 @@
 
         cog-triton.architectures = architectures;
         # don't need this file in a runner
-        python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter ''
+        python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter ''
           rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource*
         '';
       });
diff --git a/lock.json b/lock.json
index 7207c4c..067b583 100644
--- a/lock.json
+++ b/lock.json
@@ -2,10 +2,10 @@
   "fetchPipMetadata": {
     "sources": {
       "accelerate": {
-        "sha256": "c7bb817eb974bba0ff3ea1ba0f24d55afb86d50e3d4fe98d6922dc69cf2ccff1",
+        "sha256": "71fcf4be00872194071de561634268b71417d7f5b16b178e2fa76b6f117c52b0",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/f7/fc/c55e5a2da345c9a24aa2e1e0f60eb2ca290b6a41be82da03a6d4baec4f99/accelerate-0.25.0-py3-none-any.whl",
-        "version": "0.25.0"
+        "url": "https://files.pythonhosted.org/packages/e4/74/564f621699b049b0358f7ad83d7437f8219a5d6efb69bbfcca328b60152f/accelerate-0.32.1-py3-none-any.whl",
+        "version": "0.32.1"
       },
       "aiohttp": {
         "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72",
@@ -73,6 +73,12 @@
         "url": "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl",
         "version": "8.1.7"
       },
+      "cloudpickle": {
+        "sha256": "246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/96/43/dae06432d0c4b1dc9e9149ad37b4ca8384cf6eb7700cd9215b177b914f0a/cloudpickle-3.0.0-py3-none-any.whl",
+        "version": "3.0.0"
+      },
       "cog": {
         "sha256": "0f658f2da28e37da8040d073af4f4e7a91b567a8d169f077d5afddc33793a62f",
         "type": "url",
@@ -104,10 +110,10 @@
         "version": "2.14.4"
       },
       "diffusers": {
-        "sha256": "ca258d8141a9faa85b3ce60805fc4898c91d0e73fd5b1576413dfe3b8502a8ec",
+        "sha256": "d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/13/43/d4ae69ba5f503d58c7aef13f0f93d9c84694652dc2a16f8ea3d8246ebe95/diffusers-0.15.0-py3-none-any.whl",
-        "version": "0.15.0"
+        "url": "https://files.pythonhosted.org/packages/ee/22/2e6e90c87e718e63b1a860cb627bcf27ac4998edb5f190561b5c6cde6c62/diffusers-0.29.2-py3-none-any.whl",
+        "version": "0.29.2"
       },
       "dill": {
         "sha256": "76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e",
@@ -139,12 +145,6 @@
         "url": "https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl",
         "version": "3.15.4"
       },
-      "flatbuffers": {
-        "sha256": "8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812",
-        "type": "url",
-        "url": "https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl",
-        "version": "24.3.25"
-      },
       "frozenlist": {
         "sha256": "a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a",
         "type": "url",
@@ -277,12 +277,24 @@
         "url": "https://files.pythonhosted.org/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl",
         "version": "1.1.9"
       },
+      "markdown-it-py": {
+        "sha256": "355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl",
+        "version": "3.0.0"
+      },
       "markupsafe": {
         "sha256": "2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f",
         "type": "url",
         "url": "https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
         "version": "2.1.5"
       },
+      "mdurl": {
+        "sha256": "84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl",
+        "version": "0.1.2"
+      },
       "mpi4py": {
         "sha256": "c8fa625e0f92b082ef955bfb52f19fa6691d29273d7d71135d295aa143dee6cb",
         "type": "url",
@@ -331,12 +343,6 @@
         "url": "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
         "version": "1.26.4"
       },
-      "nvidia-ammo": {
-        "sha256": "ed6b0aa3748e735923ce3825c0044a130400fcd040a2bb54580e4bcd7ef605d3",
-        "type": "url",
-        "url": "https://pypi.nvidia.com/nvidia-ammo/nvidia_ammo-0.7.4-cp310-cp310-linux_x86_64.whl",
-        "version": "0.7.4"
-      },
       "nvidia-cublas-cu12": {
         "sha256": "ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728",
         "type": "url",
@@ -391,6 +397,12 @@
         "url": "https://pypi.nvidia.com/nvidia-cusparse-cu12/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl",
         "version": "12.1.0.106"
       },
+      "nvidia-modelopt": {
+        "sha256": "f56f04280fef23727a49decf13ff8269c9cc47b95fc304fcefed79fbe8e6ef5f",
+        "type": "url",
+        "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.11.2-cp310-cp310-manylinux_2_28_x86_64.whl",
+        "version": "0.11.2"
+      },
       "nvidia-nccl-cu12": {
         "sha256": "802756f02c43c0613dc83f48a76f702462b0f1f618411768748bba9c805fce19",
         "type": "url",
@@ -410,10 +422,10 @@
         "version": "12.1.105"
       },
       "nvidia-pytriton": {
-        "sha256": "810531f752f7bdc4308b8821056ce2d5a456e6cb62966f2e07f65cff0053e42a",
+        "sha256": "6403e65c2bbab0ab2fe2b737ad612e2b88f3edf20d41aadd1d544ffb309a701c",
         "type": "url",
-        "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.2-py3-none-manylinux_2_35_x86_64.whl",
-        "version": "0.5.2"
+        "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.6-py3-none-manylinux_2_35_x86_64.whl",
+        "version": "0.5.6"
       },
       "omegaconf": {
         "sha256": "7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b",
@@ -427,23 +439,11 @@
         "url": "https://files.pythonhosted.org/packages/c6/7e/5031717c0636e6074764a2f61a459a3ecd46c20d8b83a1f1cd2513a76160/onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
         "version": "1.16.1"
       },
-      "onnx-graphsurgeon": {
-        "sha256": "10c130d6129fdeee02945f8103b5b112e6fd4d9b356e2dd3e80f53e0ebee7b5c",
-        "type": "url",
-        "url": "https://pypi.nvidia.com/onnx-graphsurgeon/onnx_graphsurgeon-0.5.2-py2.py3-none-any.whl",
-        "version": "0.5.2"
-      },
-      "onnxruntime": {
-        "sha256": "ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889",
-        "type": "url",
-        "url": "https://files.pythonhosted.org/packages/7a/cf/6aa8c56fd63f53c2c485921e411269c7b501a2b4e634bd02f226ab2d5d8e/onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
-        "version": "1.16.3"
-      },
       "optimum": {
-        "sha256": "1354dd1081179b7c490d135c7f380cee672125e17c0bfef143e616c5b756b1db",
+        "sha256": "8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/13/6d/6b03ffb8df1ab2b43d461f7cace2af5f20092f0767f53a3e9331df00e8a2/optimum-1.21.1-py3-none-any.whl",
-        "version": "1.21.1"
+        "url": "https://files.pythonhosted.org/packages/fa/e4/f832e42a1eb9d5ac4fa6379295e05aebeae507d171babc1786bfa0210299/optimum-1.21.2-py3-none-any.whl",
+        "version": "1.21.2"
       },
       "packaging": {
         "sha256": "5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124",
@@ -494,10 +494,16 @@
         "version": "16.1.0"
       },
       "pydantic": {
-        "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e",
+        "sha256": "4660dd697de1ae2d4305a85161312611f64d5360663a9ba026cd6ad9e3fe14c3",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/ae/d8/3ffbdeccf252d56c8e0b6f1f30798d3aa0ad5afaa541908207881855beeb/pydantic-1.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "version": "1.10.16"
+      },
+      "pygments": {
+        "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
-        "version": "1.10.17"
+        "url": "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl",
+        "version": "2.18.0"
       },
       "pynvml": {
         "sha256": "5cce014ac01b098d08f06178f86c37be409b80b2e903a5a03ce15eed60f55e25",
@@ -542,10 +548,10 @@
         "version": "6.0.1"
       },
       "pyzmq": {
-        "sha256": "7e0113d70b095339e99bb522fe7294f5ae6a7f3b2b8f52f659469a74b5cc7661",
+        "sha256": "ba6e5e6588e49139a0979d03a7deb9c734bde647b9a8808f26acf9c547cab1bf",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/b7/ac/18b75626cede66295a27e94d7cfe301d2d35120b200a6a46f205a171a20e/pyzmq-23.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
-        "version": "23.2.1"
+        "url": "https://files.pythonhosted.org/packages/40/4f/088d0fe18b188a0754483b7d632a97ef608dce80c2648219d071c9f1715c/pyzmq-26.0.3-cp310-cp310-manylinux_2_28_x86_64.whl",
+        "version": "26.0.3"
       },
       "regex": {
         "sha256": "1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5",
@@ -559,6 +565,12 @@
         "url": "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl",
         "version": "2.32.3"
       },
+      "rich": {
+        "sha256": "4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl",
+        "version": "13.7.1"
+      },
       "safetensors": {
         "sha256": "d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376",
         "type": "url",
@@ -578,16 +590,22 @@
         "version": "0.2.0"
       },
       "setuptools": {
-        "sha256": "b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05",
+        "sha256": "fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/42/54/2a8ecfcc9a714a6fbf86559a4b0f50b126a4ac4269ea8134f2c75c3e73de/setuptools-70.2.0-py3-none-any.whl",
-        "version": "70.2.0"
+        "url": "https://files.pythonhosted.org/packages/ef/15/88e46eb9387e905704b69849618e699dc2f54407d8953cc4ec4b8b46528d/setuptools-70.3.0-py3-none-any.whl",
+        "version": "70.3.0"
       },
       "sh": {
-        "sha256": "e4045b6c732d9ce75d571c79f5ac2234edd9ae4f5fa9d59b09705082bdca18c7",
+        "sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/15/c2/79f9dea6fc544c0eb79ed5018a38860c52d597c4be66c2cf2029bea5b3fd/sh-2.0.7-py3-none-any.whl",
+        "version": "2.0.7"
+      },
+      "shellingham": {
+        "sha256": "7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/b7/09/89c28aaf2a49f226fef8587c90c6386bd2cc03a0295bc4ff7fc6ee43c01d/sh-1.14.3.tar.gz",
-        "version": "1.14.3"
+        "url": "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl",
+        "version": "1.5.4"
       },
       "six": {
         "sha256": "8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254",
@@ -620,40 +638,46 @@
         "version": "24.2.0"
       },
       "sympy": {
-        "sha256": "9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515",
+        "sha256": "6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/61/53/e18c8c97d0b2724d85c9830477e3ebea3acf1dcdc6deb344d5d9c93a9946/sympy-1.12.1-py3-none-any.whl",
-        "version": "1.12.1"
+        "url": "https://files.pythonhosted.org/packages/62/74/7e6c65ee89ff43942bffffdbb238634f16967bf327aee3c76efcf6e49587/sympy-1.13.0-py3-none-any.whl",
+        "version": "1.13.0"
       },
       "tensorrt": {
-        "sha256": "24aea5376cb8440afe2b0a22ee83f9748e586aa27303d4f80091ad48a56552a4",
+        "sha256": "7e9c8666f5bee86771451f007e25f81d65a411a26e6ea0b41faa5ec83ab863af",
+        "type": "url",
+        "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.0.1.tar.gz",
+        "version": "10.0.1"
+      },
+      "tensorrt-cu12": {
+        "sha256": "a549e2fe472eb03b2737a708c0aef0cac9cb0be1ae46bc7dad72ec1dfc81bd19",
         "type": "url",
-        "url": "https://pypi.nvidia.com/tensorrt/tensorrt-9.3.0.post12.dev1.tar.gz",
-        "version": "9.3.0.post12.dev1"
+        "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.1.0.tar.gz",
+        "version": "10.1.0"
       },
-      "tensorrt-bindings": {
-        "sha256": "c1619e4a9b23b077717af7635489cd1a12a8b4d97477088fc3c5d3a81e36bf65",
+      "tensorrt-cu12-bindings": {
+        "sha256": "91e1bd0eb348524ff209ef6b235d329983ea704b5d16f9a7ba747c08cc3c2495",
         "type": "url",
-        "url": "https://pypi.nvidia.com/tensorrt-bindings/tensorrt_bindings-9.3.0.post12.dev1-cp310-none-manylinux_2_17_x86_64.whl",
-        "version": "9.3.0.post12.dev1"
+        "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.1.0-cp310-none-manylinux_2_17_x86_64.whl",
+        "version": "10.1.0"
       },
-      "tensorrt-libs": {
-        "sha256": "ab0b6ee6cd41503273d44892cb92b92c75d046a5e468b73884978f59cca4b8d9",
+      "tensorrt-cu12-libs": {
+        "sha256": "1ad13c26b3f441267a746df6859e44eb0e8da78d4382458d1fd2eb7675abd49f",
         "type": "url",
-        "url": "https://pypi.nvidia.com/tensorrt-libs/tensorrt_libs-9.3.0.post12.dev1-py2.py3-none-manylinux_2_17_x86_64.whl",
-        "version": "9.3.0.post12.dev1"
+        "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.1.0-py2.py3-none-manylinux_2_17_x86_64.whl",
+        "version": "10.1.0"
       },
       "tensorrt-llm": {
-        "sha256": "2f60b6f8d0afee5f52a5160a44815b0af3e9cd4c46b53cc7a252377ed6cec670",
+        "sha256": "c7975326fa10b56079e0febf7c52a65ccf5b37760cd1c79d5aa3e8c7d85ce69c",
         "type": "url",
-        "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.9.0-cp310-cp310-linux_x86_64.whl",
-        "version": "0.9.0"
+        "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.10.0-cp310-cp310-linux_x86_64.whl",
+        "version": "0.10.0"
       },
       "tokenizers": {
-        "sha256": "06a56acdfe6c5d51c03ebfc6838f727fcf231c035b94f2460cca68947f6799dc",
+        "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/11/f9/8c77a471469ea7d1b52f2a25607385109c954d6444a9b0df19796beba461/tokenizers-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
-        "version": "0.19.0"
+        "url": "https://files.pythonhosted.org/packages/40/4f/eb78de4af3b17b589f43a369cbf0c3a7173f25c3d2cd93068852c07689aa/tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "version": "0.19.1"
       },
       "tomli": {
         "sha256": "939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc",
@@ -674,10 +698,10 @@
         "version": "4.66.4"
       },
       "transformers": {
-        "sha256": "92797ec3368ed4476a053529a4039a12ad09167d9e371981dda4afb4bdf590ac",
+        "sha256": "71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl",
-        "version": "4.40.0"
+        "url": "https://files.pythonhosted.org/packages/05/23/ba02efa28518557e0cfe0ce5c1170000dd7501ed02ac865fc90cbe3daa93/transformers-4.40.2-py3-none-any.whl",
+        "version": "4.40.2"
       },
       "triton": {
         "sha256": "a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5",
@@ -691,6 +715,12 @@
         "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.47.0-py3-none-manylinux1_x86_64.whl",
         "version": "2.47.0"
       },
+      "typer": {
+        "sha256": "070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914",
+        "type": "url",
+        "url": "https://files.pythonhosted.org/packages/20/b5/11cf2e34fbb11b937e006286ab5b8cfd334fde1c8fa4dd7f491226931180/typer-0.12.3-py3-none-any.whl",
+        "version": "0.12.3"
+      },
       "typing-extensions": {
         "sha256": "04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d",
         "type": "url",
@@ -698,10 +728,10 @@
         "version": "4.12.2"
       },
       "typing-inspect": {
-        "sha256": "3b98390df4d999a28cf5b35d8b333425af5da2ece8a4ea9e98f71e7591347b4f",
+        "sha256": "9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f",
         "type": "url",
-        "url": "https://files.pythonhosted.org/packages/42/1c/66402db44184904a2f14722d317a4da0b5c8c78acfc3faf74362566635c5/typing_inspect-0.6.0-py3-none-any.whl",
-        "version": "0.6.0"
+        "url": "https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl",
+        "version": "0.9.0"
       },
       "tzdata": {
         "sha256": "9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252",
@@ -791,8 +821,7 @@
           "psutil",
           "pyyaml",
           "safetensors",
-          "torch",
-          "transformers"
+          "torch"
         ],
         "aiohttp": [
           "aiosignal",
@@ -823,6 +852,7 @@
         "certifi": [],
         "charset-normalizer": [],
         "click": [],
+        "cloudpickle": [],
         "cog": [
           "attrs",
           "fastapi",
@@ -852,7 +882,6 @@
           "pyyaml",
           "requests",
           "tqdm",
-          "transformers",
           "xxhash"
         ],
         "diffusers": [
@@ -863,7 +892,7 @@
           "pillow",
           "regex",
           "requests",
-          "transformers"
+          "safetensors"
         ],
         "dill": [],
         "evaluate": [
@@ -877,7 +906,6 @@
           "pandas",
           "requests",
           "tqdm",
-          "transformers",
           "xxhash"
         ],
         "exceptiongroup": [],
@@ -886,7 +914,6 @@
           "starlette"
         ],
         "filelock": [],
-        "flatbuffers": [],
         "frozenlist": [],
         "fsspec": [
           "aiohttp"
@@ -949,7 +976,11 @@
           "markupsafe"
         ],
         "lark": [],
+        "markdown-it-py": [
+          "mdurl"
+        ],
         "markupsafe": [],
+        "mdurl": [],
         "mpi4py": [],
         "mpmath": [],
         "multidict": [],
@@ -960,18 +991,6 @@
         "networkx": [],
         "ninja": [],
         "numpy": [],
-        "nvidia-ammo": [
-          "networkx",
-          "ninja",
-          "numpy",
-          "onnx",
-          "onnx-graphsurgeon",
-          "onnxruntime",
-          "scipy",
-          "torch",
-          "tqdm",
-          "transformers"
-        ],
         "nvidia-cublas-cu12": [],
         "nvidia-cuda-cupti-cu12": [],
         "nvidia-cuda-nvrtc-cu12": [],
@@ -989,15 +1008,28 @@
         "nvidia-cusparse-cu12": [
           "nvidia-nvjitlink-cu12"
         ],
+        "nvidia-modelopt": [
+          "cloudpickle",
+          "ninja",
+          "numpy",
+          "packaging",
+          "pydantic",
+          "rich",
+          "scipy",
+          "tqdm"
+        ],
         "nvidia-nccl-cu12": [],
         "nvidia-nvjitlink-cu12": [],
         "nvidia-nvtx-cu12": [],
         "nvidia-pytriton": [
+          "grpcio",
+          "importlib-metadata",
           "numpy",
           "protobuf",
           "pyzmq",
           "sh",
           "tritonclient",
+          "typer",
           "typing-inspect",
           "wrapt"
         ],
@@ -1009,18 +1041,6 @@
           "numpy",
           "protobuf"
         ],
-        "onnx-graphsurgeon": [
-          "numpy",
-          "onnx"
-        ],
-        "onnxruntime": [
-          "coloredlogs",
-          "flatbuffers",
-          "numpy",
-          "packaging",
-          "protobuf",
-          "sympy"
-        ],
         "optimum": [
           "coloredlogs",
           "datasets",
@@ -1049,6 +1069,7 @@
         "pydantic": [
           "typing-extensions"
         ],
+        "pygments": [],
         "pynvml": [],
         "pyproject-hooks": [],
         "python-dateutil": [
@@ -1066,6 +1087,10 @@
           "idna",
           "urllib3"
         ],
+        "rich": [
+          "markdown-it-py",
+          "pygments"
+        ],
         "safetensors": [],
         "scipy": [
           "numpy"
@@ -1073,6 +1098,7 @@
         "sentencepiece": [],
         "setuptools": [],
         "sh": [],
+        "shellingham": [],
         "six": [],
         "sniffio": [],
         "starlette": [
@@ -1084,14 +1110,15 @@
           "mpmath"
         ],
         "tensorrt": [
-          "tensorrt-bindings",
-          "tensorrt-libs"
+          "tensorrt-cu12"
         ],
-        "tensorrt-bindings": [],
-        "tensorrt-libs": [
-          "nvidia-cublas-cu12",
-          "nvidia-cuda-runtime-cu12",
-          "nvidia-cudnn-cu12"
+        "tensorrt-cu12": [
+          "tensorrt-cu12-bindings",
+          "tensorrt-cu12-libs"
+        ],
+        "tensorrt-cu12-bindings": [],
+        "tensorrt-cu12-libs": [
+          "nvidia-cuda-runtime-cu12"
         ],
         "tensorrt-llm": [
           "accelerate",
@@ -1106,17 +1133,16 @@
           "mpi4py",
           "mpmath",
           "numpy",
-          "nvidia-ammo",
-          "nvidia-cudnn-cu12",
+          "nvidia-modelopt",
           "onnx",
           "optimum",
           "pandas",
           "polygraphy",
           "psutil",
           "pulp",
+          "pydantic",
           "pynvml",
           "sentencepiece",
-          "setuptools",
           "strenum",
           "tensorrt",
           "torch",
@@ -1153,10 +1179,13 @@
           "huggingface-hub",
           "numpy",
           "packaging",
+          "protobuf",
+          "pydantic",
           "pyyaml",
           "regex",
           "requests",
           "safetensors",
+          "sentencepiece",
           "tokenizers",
           "tqdm"
         ],
@@ -1165,7 +1194,6 @@
         ],
         "tritonclient": [
           "aiohttp",
-          "cuda-python",
           "geventhttpclient",
           "grpcio",
           "numpy",
@@ -1174,6 +1202,12 @@
           "python-rapidjson",
           "urllib3"
         ],
+        "typer": [
+          "click",
+          "rich",
+          "shellingham",
+          "typing-extensions"
+        ],
         "typing-extensions": [],
         "typing-inspect": [
           "mypy-extensions",
@@ -1214,5 +1248,5 @@
       }
     }
   },
-  "invalidationHash": "aea5c24536de46921b0505e9f29e379558d83bbd76f08cf2f49f8ffe84243032"
+  "invalidationHash": "e7e207b87a9d99d7041d2be7edfba110533a437cc9cf984be13642572fa5f156"
 }
\ No newline at end of file
diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix
index 20f901d..5db8c87 100644
--- a/nix/tensorrt-llm.nix
+++ b/nix/tensorrt-llm.nix
@@ -17,14 +17,14 @@
 }:
 stdenv.mkDerivation (o: {
   pname = "tensorrt_llm";
-  version = "0.9.0";
+  version = "0.10.0";
   src = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "TensorRT-LLM";
     rev = "v${o.version}";
     fetchSubmodules = true;
     fetchLFS = true; # libtensorrt_llm_batch_manager_static.a
-    hash = "sha256-BGU56yI6yuTGHYhq5I3xYhrsKI8O4ykhDFeRP/JGCRo=";
+    hash = "sha256-eOAixXzOQRaySbUtpeAF9qMFOzwe1rosC0GOgy8CakU=";
   };
   outputs =
     if withPython then
@@ -41,6 +41,7 @@ stdenv.mkDerivation (o: {
     ninja
     python3
     cudaPackages.cuda_nvcc
+    rsync
   ];
   buildInputs =
     [
@@ -54,6 +55,10 @@ stdenv.mkDerivation (o: {
       # torch hates the split cuda, so only do it without torch
       cudaPackages.cuda_cudart
       cudaPackages.cuda_nvcc.dev
+      cudaPackages.cuda_nvrtc.dev
+      cudaPackages.cuda_nvrtc.lib
+      cudaPackages.cuda_nvml_dev.lib
+      cudaPackages.cuda_nvml_dev.dev
       cudaPackages.cuda_cccl
       cudaPackages.libcublas.lib
       cudaPackages.libcublas.dev
@@ -85,8 +90,8 @@ stdenv.mkDerivation (o: {
       pynvml # >=11.5.0
       sentencepiece # >=0.1.99
       tensorrt # ==9.2.0.post12.dev5
-      tensorrt-bindings # missed transitive dep
-      tensorrt-libs
+      tensorrt-cu12-bindings # missed transitive dep
+      tensorrt-cu12-libs
       torch # <=2.2.0a
       nvidia-ammo # ~=0.7.0; platform_machine=="x86_64"
       transformers # ==4.36.1
@@ -109,11 +114,16 @@ stdenv.mkDerivation (o: {
     "-DBUILD_PYBIND=${if withPython then "ON" else "OFF"}" # needs BUILD_PYT
     "-DBUILD_TESTS=OFF" # needs nvonnxparser.h
     # believe it or not, this is the actual binary distribution channel for tensorrt:
-    "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-libs.public}/${python3.sitePackages}/tensorrt_libs"
+    "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-cu12-libs.public}/${python3.sitePackages}/tensorrt_libs"
     "-DTRT_INCLUDE_DIR=${tensorrt-src}/include"
     "-DCMAKE_CUDA_ARCHITECTURES=${builtins.concatStringsSep ";" architectures}"
     # "-DFAST_BUILD=ON"
   ];
+  # add missing includes: <cstdint> after <mpi.h> in cpp/include/tensorrt_llm/common/mpiUtils.h, <cuda_runtime.h> after #pragma once in kernels/lruKernel.h
+  postPatch = ''
+    sed -i 's/#include <mpi.h>/#include <mpi.h>\n#include <cstdint>/' /build/source/cpp/include/tensorrt_llm/common/mpiUtils.h
+    sed -i 's/#pragma once/#pragma once\n#include <cuda_runtime.h>/' /build/source/cpp/tensorrt_llm/kernels/lruKernel.h
+  '';
   postBuild = lib.optionalString withPython ''
     pushd ../../
     chmod -R +w .
@@ -135,19 +145,24 @@ stdenv.mkDerivation (o: {
   installPhase =
     ''
       mkdir -p $out
-      ${rsync}/bin/rsync -a --exclude "tensorrt_llm/kernels" $src/cpp $out/
-      chmod -R u+w $out/cpp
-      mkdir -p $out/cpp/build/tensorrt_llm/plugins
+      rsync -a --chmod=u+w --include "tensorrt_llm/kernels/" --include "tensorrt_llm/kernels/kvCacheIndex.h" --exclude "tensorrt_llm/kernels/*" $src/cpp $out/
+      # rsync -a --chmod=u+w $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/
       pushd tensorrt_llm
-      cp ./libtensorrt_llm.so $out/cpp/build/tensorrt_llm/
+      mkdir -p $out/cpp/build/tensorrt_llm/
+      find . '(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/build/tensorrt_llm/
       patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so
-      cp ./plugins/libnvinfer_plugin_tensorrt_llm.so* $out/cpp/build/tensorrt_llm/plugins/
-      for f in $out/cpp/build/tensorrt_llm/plugins/*.so*; do
+      for f in $out/cpp/build/tensorrt_llm/plugins/*.so* $out/cpp/build/tensorrt_llm/executor_worker/executorWorker; do
         if [ ! -L "$f" ]; then
-          new_path=$(patchelf --print-rpath "$f" | sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#')
+          new_path=$(patchelf --print-rpath "$f" |
+            sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#g' |
+            sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../../tensorrt_llm#g'
+          )
           patchelf --set-rpath "$new_path" "$f"
         fi
       done
+      new_path=$(patchelf --print-rpath $out/cpp/build/tensorrt_llm/libtensorrt_llm.so |
+        sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#')
+      patchelf --set-rpath "$new_path" $out/cpp/build/tensorrt_llm/libtensorrt_llm.so
       popd
     ''
     + (lib.optionalString withPython ''
diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix
index 4dab9b2..b52d81c 100644
--- a/nix/trtllm-backend.nix
+++ b/nix/trtllm-backend.nix
@@ -28,12 +28,11 @@ let
     rev = "a06e9a1157d6b5b9b34b6d05a07bb84d517f17c9";
     hash = "sha256-Ju2zV/jHUuciTs6GbkqcPG8U0y2lkIWSdAsX78DrpV4=";
   };
-  # todo: update with trt-llm 0.9?
   deps.triton_repo_core = fetchFromGitHub {
     owner = "triton-inference-server";
     repo = "core";
-    rev = "5d4a99c285c729a349265ce8dd7a4535e59d29b1";
-    hash = "sha256-WP8bwplo98GmNulX+QA+IrQEc2+GMcTjV53K438vX1g=";
+    rev = "434e50313b80fdc7ef295fcb3baeeacf65b295e4";
+    hash = "sha256-kfDXQEYuMze4E53OHHJ1YjQHnNtAEt4lzNK27K6ttVE=";
   };
   deps.googletest = fetchFromGitHub {
     owner = "google";
@@ -43,18 +42,18 @@ let
   };
 
   inherit (python3) sitePackages;
-  trt_lib_dir = "${pythonDrvs.tensorrt-libs.public}/${sitePackages}/tensorrt_libs";
+  trt_lib_dir = "${pythonDrvs.tensorrt-cu12-libs.public}/${sitePackages}/tensorrt_libs";
   # this package wants gcc12
   oldGccStdenv = stdenvAdapters.useLibsFrom stdenv gcc12Stdenv;
 in
 oldGccStdenv.mkDerivation rec {
   pname = "tensorrtllm_backend";
-  version = "0.9.0";
+  version = "0.10.0";
   src = fetchFromGitHub {
     owner = "triton-inference-server";
     repo = "tensorrtllm_backend";
     rev = "v${version}";
-    hash = "sha256-aNjVYu7sDrIj/lse/wS3vYaR/vmjtZfxzBWYi3z3KqQ=";
+    hash = "sha256-6df9MbHPqBVxpdkTcEzf99OCPtgFrK0jjDJfvE/guyA=";
   };
   nativeBuildInputs = [
     cmake
@@ -70,6 +69,8 @@ oldGccStdenv.mkDerivation rec {
     cudaPackages.cuda_cccl
     cudaPackages.libcublas.lib
     cudaPackages.libcublas.dev
+    cudaPackages.cuda_nvml_dev.lib
+    cudaPackages.cuda_nvml_dev.dev
   ];
   sourceRoot = "source/inflight_batcher_llm";
   cmakeFlags = [
@@ -84,7 +85,7 @@ oldGccStdenv.mkDerivation rec {
   ];
   postInstall = ''
     mkdir -p $out/backends/tensorrtllm
-    cp libtriton_*.so triton_tensorrtllm_worker $out/backends/tensorrtllm
+    cp libtriton_*.so trtllmExecutorWorker $out/backends/tensorrtllm
     rm -r /build/source/inflight_batcher_llm/build/_deps/repo-core-build
     rm -r /build/source/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so
   '';
@@ -94,7 +95,7 @@ oldGccStdenv.mkDerivation rec {
       --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib'
     patchelf $out/backends/tensorrtllm/libtriton_tensorrtllm_common.so \
       --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib'
-    patchelf $out/backends/tensorrtllm/triton_tensorrtllm_worker \
+    patchelf $out/backends/tensorrtllm/trtllmExecutorWorker  \
       --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib'
   '';
 }
diff --git a/predict.py b/predict.py
index 1d4254f..480526b 100644
--- a/predict.py
+++ b/predict.py
@@ -372,6 +372,9 @@ async def predict(
                         f"E2104 TritonMalformedEvent: Triton returned malformed event (no output_ids or error key): {event_data}"
                     )
 
+                if token == []:
+                    continue
+
                 n_tokens += 1
                 if n_tokens == 1:
                     first_token_time = time.time()
@@ -447,10 +450,13 @@ def _process_args(
         pad_id = self.pad_id
         end_id = self.end_id
 
-        if top_k < 0:
-            top_k = 0
-        if min_tokens < 0:
-            min_tokens = 0
+        if top_k <= 0:
+            # workaround, unnecessary with trtllm > 0.10.0
+            top_k = None
+
+        if top_p <= 0.0:
+            # workaround, unnecessary with trtllm > 0.10.0
+            top_p = None
 
         if not seed:
             seed = int(np.random.randint(0, 100000))
@@ -460,7 +466,11 @@ def _process_args(
             max_tokens = min(max_tokens, token_budget)
             min_tokens = min(min_tokens, token_budget)
 
-        args = {
+        if min_tokens <= 0:
+            # workaround, unnecessary with trtllm > 0.10.0
+            min_tokens = None
+
+        args = {k: v for k, v in {
             "text_input": prompt,
             "max_tokens": max_tokens,
             "min_length": min_tokens,
@@ -474,7 +484,7 @@ def _process_args(
             "random_seed": seed,
             "pad_id": pad_id,
             "end_id": end_id,
-        }
+        }.items() if v is not None}
 
         return args
 
diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt
index 40d291d..6d54df6 100644
--- a/triton_model_repo/ensemble/config.pbtxt
+++ b/triton_model_repo/ensemble/config.pbtxt
@@ -442,4 +442,3 @@ ensemble_scheduling {
     }
   ]
 }
-
diff --git a/triton_model_repo/postprocessing/1/model.py b/triton_model_repo/postprocessing/1/model.py
index 5d5663b..ac42a0d 100644
--- a/triton_model_repo/postprocessing/1/model.py
+++ b/triton_model_repo/postprocessing/1/model.py
@@ -28,7 +28,7 @@
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
-from transformers import AutoTokenizer
+# from transformers import AutoTokenizer
 
 
 class TritonPythonModel:
@@ -53,19 +53,19 @@ def initialize(self, args):
         """
         # Parse model configs
         model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
-        self.skip_special_tokens = model_config['parameters'].get(
-            'skip_special_tokens',
-            {'string_value': "true"})['string_value'].lower() in [
-                'true', '1', 't', 'y', 'yes'
-            ]
-
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                       legacy=False,
-                                                       padding_side='left',
-                                                       trust_remote_code=True)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+        # tokenizer_dir = model_config['parameters']['tokenizer_dir'][
+        #     'string_value']
+        # self.skip_special_tokens = model_config['parameters'].get(
+        #     'skip_special_tokens',
+        #     {'string_value': "true"})['string_value'].lower() in [
+        #         'true', '1', 't', 'y', 'yes'
+        #     ]
+
+        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+        #                                                legacy=False,
+        #                                                padding_side='left',
+        #                                                trust_remote_code=True)
+        # self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
@@ -133,15 +133,9 @@ def execute(self, requests):
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
-
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
-                tokens_batch
-            )
-
-            # output_tensor = pb_utils.Tensor(
-            #     'OUTPUT',
-            #     np.array(outputs).astype(self.output_dtype))
+                tokens_batch)
 
             outputs = []
             outputs.append(output_tensor)
@@ -207,13 +201,13 @@ def finalize(self):
         """
         print('Cleaning up...')
 
-    def _postprocessing(self, tokens_batch, sequence_lengths):
-        outputs = []
-        for batch_idx, beam_tokens in enumerate(tokens_batch):
-            for beam_idx, tokens in enumerate(beam_tokens):
-                seq_len = sequence_lengths[batch_idx][beam_idx]
-                output = self.tokenizer.decode(
-                    tokens[:seq_len],
-                    skip_special_tokens=self.skip_special_tokens)
-                outputs.append(output.encode('utf8'))
-        return outputs
+    # def _postprocessing(self, tokens_batch, sequence_lengths):
+    #     outputs = []
+    #     for batch_idx, beam_tokens in enumerate(tokens_batch):
+    #         for beam_idx, tokens in enumerate(beam_tokens):
+    #             seq_len = sequence_lengths[batch_idx][beam_idx]
+    #             output = self.tokenizer.decode(
+    #                 tokens[:seq_len],
+    #                 skip_special_tokens=self.skip_special_tokens)
+    #             outputs.append(output.encode('utf8'))
+    #     return outputs
diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt
index 5e2e37a..df87aeb 100644
--- a/triton_model_repo/postprocessing/config.pbtxt
+++ b/triton_model_repo/postprocessing/config.pbtxt
@@ -111,4 +111,3 @@ instance_group [
         kind: KIND_CPU
     }
 ]
-
diff --git a/triton_model_repo/preprocessing/1/model.py b/triton_model_repo/preprocessing/1/model.py
index a109775..62ab243 100644
--- a/triton_model_repo/preprocessing/1/model.py
+++ b/triton_model_repo/preprocessing/1/model.py
@@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
 
         flat_ids = []
         offsets = []
-        arbitrary_start_sequence_token = "!"
-        arbitrary_start_sequence_id = self.tokenizer.encode(
-            "!", add_special_tokens=False
-        )[0]
-
         for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []
@@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
                 if isinstance(word, bytes):
                     word = word.decode()
 
-                word = arbitrary_start_sequence_token + word
                 ids = self.tokenizer.encode(word, add_special_tokens=False)
-                if ids[0] != arbitrary_start_sequence_id:
-                    raise ValueError(
-                        f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence. "
-                        "We then strip the corresponding first token from the stop sequence IDs. "
-                        f"However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggests there is a problem with the tokenizer that you are using."
-                    )
-                else:
-                    ids = ids[1:]
                 if len(ids) == 0:
                     continue
 
diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt
index b0fa8f2..e76fec5 100644
--- a/triton_model_repo/preprocessing/config.pbtxt
+++ b/triton_model_repo/preprocessing/config.pbtxt
@@ -138,4 +138,3 @@ instance_group [
         kind: KIND_CPU
     }
 ]
-
diff --git a/triton_model_repo/tensorrt_llm/1/.gitkeep b/triton_model_repo/tensorrt_llm/1/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt
index 911fdeb..14aab33 100644
--- a/triton_model_repo/tensorrt_llm/config.pbtxt
+++ b/triton_model_repo/tensorrt_llm/config.pbtxt
@@ -69,6 +69,13 @@ input [
     optional: true
     allow_ragged_batch: true
   },
+  {
+    name: "draft_acceptance_threshold"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "end_id"
     data_type: TYPE_INT32
@@ -132,6 +139,27 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "runtime_top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_reset_ids"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "len_penalty"
     data_type: TYPE_FP32
@@ -139,6 +167,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "early_stopping"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "repetition_penalty"
     data_type: TYPE_FP32
@@ -153,6 +188,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "presence_penalty"
     data_type: TYPE_FP32
@@ -323,7 +365,7 @@ parameters: {
 parameters: {
   key: "gpt_model_path"
   value: {
-    string_value: "/src/triton_model_repo/tensorrt_llm/1"
+    string_value: "/src/triton_model_repo/tensorrt_llm/1/"
   }
 }
 parameters: {
@@ -335,7 +377,13 @@ parameters: {
 parameters: {
   key: "max_attention_window_size"
   value: {
-    string_value: "${max_attention_window_size}"
+    string_value: "4096"
+  }
+}
+parameters: {
+  key: "sink_token_length"
+  value: {
+    string_value: "${sink_token_length}"
   }
 }
 parameters: {
@@ -347,21 +395,58 @@ parameters: {
 parameters: {
   key: "kv_cache_free_gpu_mem_fraction"
   value: {
-    string_value: "${kv_cache_free_gpu_mem_fraction}"
+    string_value: "0.95"
+  }
+}
+parameters: {
+  key: "kv_cache_host_memory_bytes"
+  value: {
+    string_value: "${kv_cache_host_memory_bytes}"
   }
 }
 parameters: {
-  key: "enable_trt_overlap"
+  key: "kv_cache_onboard_blocks"
   value: {
-    string_value: "${enable_trt_overlap}"
+    string_value: "${kv_cache_onboard_blocks}"
   }
 }
+# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
+# parameters: {
+#   key: "enable_trt_overlap"
+#   value: {
+#     string_value: "${enable_trt_overlap}"
+#   }
+# }
 parameters: {
   key: "exclude_input_in_output"
   value: {
     string_value: "${exclude_input_in_output}"
   }
 }
+parameters: {
+  key: "cancellation_check_period_ms"
+  value: {
+    string_value: "${cancellation_check_period_ms}"
+  }
+}
+parameters: {
+  key: "stats_check_period_ms"
+  value: {
+    string_value: "${stats_check_period_ms}"
+  }
+}
+parameters: {
+  key: "iter_stats_max_iterations"
+  value: {
+    string_value: "${iter_stats_max_iterations}"
+  }
+}
+parameters: {
+  key: "request_stats_max_iterations"
+  value: {
+    string_value: "${request_stats_max_iterations}"
+  }
+}
 parameters: {
   key: "enable_kv_cache_reuse"
   value: {
@@ -417,9 +502,9 @@ parameters: {
   }
 }
 parameters: {
-  key: "worker_path"
+  key: "executor_worker_path"
   value: {
-    string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
+    string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
   }
 }
 parameters: {
@@ -428,4 +513,9 @@ parameters: {
       string_value: "${medusa_choices}"
   }
 }
-
+parameters: {
+  key: "gpu_weights_percent"
+    value: {
+      string_value: "${gpu_weights_percent}"
+  }
+}
diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt
index 17989a9..e8c80e8 100644
--- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt
+++ b/triton_model_repo/tensorrt_llm_bls/config.pbtxt
@@ -245,4 +245,3 @@ instance_group [
     kind : KIND_CPU
   }
 ]
-
diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py
index 5d5663b..ac42a0d 100644
--- a/triton_templates/postprocessing/1/model.py
+++ b/triton_templates/postprocessing/1/model.py
@@ -28,7 +28,7 @@
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
-from transformers import AutoTokenizer
+# from transformers import AutoTokenizer
 
 
 class TritonPythonModel:
@@ -53,19 +53,19 @@ def initialize(self, args):
         """
         # Parse model configs
         model_config = json.loads(args['model_config'])
-        tokenizer_dir = model_config['parameters']['tokenizer_dir'][
-            'string_value']
-        self.skip_special_tokens = model_config['parameters'].get(
-            'skip_special_tokens',
-            {'string_value': "true"})['string_value'].lower() in [
-                'true', '1', 't', 'y', 'yes'
-            ]
-
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
-                                                       legacy=False,
-                                                       padding_side='left',
-                                                       trust_remote_code=True)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+        # tokenizer_dir = model_config['parameters']['tokenizer_dir'][
+        #     'string_value']
+        # self.skip_special_tokens = model_config['parameters'].get(
+        #     'skip_special_tokens',
+        #     {'string_value': "true"})['string_value'].lower() in [
+        #         'true', '1', 't', 'y', 'yes'
+        #     ]
+
+        # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+        #                                                legacy=False,
+        #                                                padding_side='left',
+        #                                                trust_remote_code=True)
+        # self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
         output_config = pb_utils.get_output_config_by_name(
@@ -133,15 +133,9 @@ def execute(self, requests):
 
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
-
             output_tensor = pb_utils.Tensor(
                 'OUTPUT',
-                tokens_batch
-            )
-
-            # output_tensor = pb_utils.Tensor(
-            #     'OUTPUT',
-            #     np.array(outputs).astype(self.output_dtype))
+                tokens_batch)
 
             outputs = []
             outputs.append(output_tensor)
@@ -207,13 +201,13 @@ def finalize(self):
         """
         print('Cleaning up...')
 
-    def _postprocessing(self, tokens_batch, sequence_lengths):
-        outputs = []
-        for batch_idx, beam_tokens in enumerate(tokens_batch):
-            for beam_idx, tokens in enumerate(beam_tokens):
-                seq_len = sequence_lengths[batch_idx][beam_idx]
-                output = self.tokenizer.decode(
-                    tokens[:seq_len],
-                    skip_special_tokens=self.skip_special_tokens)
-                outputs.append(output.encode('utf8'))
-        return outputs
+    # def _postprocessing(self, tokens_batch, sequence_lengths):
+    #     outputs = []
+    #     for batch_idx, beam_tokens in enumerate(tokens_batch):
+    #         for beam_idx, tokens in enumerate(beam_tokens):
+    #             seq_len = sequence_lengths[batch_idx][beam_idx]
+    #             output = self.tokenizer.decode(
+    #                 tokens[:seq_len],
+    #                 skip_special_tokens=self.skip_special_tokens)
+    #             outputs.append(output.encode('utf8'))
+    #     return outputs
diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py
index 0f561f7..62ab243 100644
--- a/triton_templates/preprocessing/1/model.py
+++ b/triton_templates/preprocessing/1/model.py
@@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
 
         flat_ids = []
         offsets = []
-        arbitrary_start_sequence_token = "!"
-        arbitrary_start_sequence_id = self.tokenizer.encode(
-            "!", add_special_tokens=False
-        )[0]
-
         for word_list in word_lists:
             item_flat_ids = []
             item_offsets = []
@@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]):
                 if isinstance(word, bytes):
                     word = word.decode()
 
-                word = arbitrary_start_sequence_token + word
                 ids = self.tokenizer.encode(word, add_special_tokens=False)
-                if ids[0] != arbitrary_start_sequence_id:
-                    raise ValueError(
-                        f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence."
-                        "We then strip the corresponding first token from the stop sequence IDs."
-                        "However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggestions there is a problem with the tokenizer that you are using."
-                    )
-                else:
-                    ids = ids[1:]
                 if len(ids) == 0:
                     continue
 
diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt
index 71d2b98..1c34f77 100644
--- a/triton_templates/tensorrt_llm/config.pbtxt
+++ b/triton_templates/tensorrt_llm/config.pbtxt
@@ -69,6 +69,13 @@ input [
     optional: true
     allow_ragged_batch: true
   },
+  {
+    name: "draft_acceptance_threshold"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "end_id"
     data_type: TYPE_INT32
@@ -132,6 +139,27 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "runtime_top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p_reset_ids"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "len_penalty"
     data_type: TYPE_FP32
@@ -139,6 +167,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "early_stopping"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "repetition_penalty"
     data_type: TYPE_FP32
@@ -153,6 +188,13 @@ input [
     reshape: { shape: [ ] }
     optional: true
   },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
   {
     name: "presence_penalty"
     data_type: TYPE_FP32
@@ -338,6 +380,12 @@ parameters: {
     string_value: "${max_attention_window_size}"
   }
 }
+parameters: {
+  key: "sink_token_length"
+  value: {
+    string_value: "${sink_token_length}"
+  }
+}
 parameters: {
   key: "batch_scheduler_policy"
   value: {
@@ -351,17 +399,54 @@ parameters: {
   }
 }
 parameters: {
-  key: "enable_trt_overlap"
+  key: "kv_cache_host_memory_bytes"
   value: {
-    string_value: "${enable_trt_overlap}"
+    string_value: "${kv_cache_host_memory_bytes}"
   }
 }
+parameters: {
+  key: "kv_cache_onboard_blocks"
+  value: {
+    string_value: "${kv_cache_onboard_blocks}"
+  }
+}
+# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
+# parameters: {
+#   key: "enable_trt_overlap"
+#   value: {
+#     string_value: "${enable_trt_overlap}"
+#   }
+# }
 parameters: {
   key: "exclude_input_in_output"
   value: {
     string_value: "${exclude_input_in_output}"
   }
 }
+parameters: {
+  key: "cancellation_check_period_ms"
+  value: {
+    string_value: "${cancellation_check_period_ms}"
+  }
+}
+parameters: {
+  key: "stats_check_period_ms"
+  value: {
+    string_value: "${stats_check_period_ms}"
+  }
+}
+parameters: {
+  key: "iter_stats_max_iterations"
+  value: {
+    string_value: "${iter_stats_max_iterations}"
+  }
+}
+parameters: {
+  key: "request_stats_max_iterations"
+  value: {
+    string_value: "${request_stats_max_iterations}"
+  }
+}
 parameters: {
   key: "enable_kv_cache_reuse"
   value: {
@@ -417,9 +502,9 @@ parameters: {
   }
 }
 parameters: {
-  key: "worker_path"
+  key: "executor_worker_path"
   value: {
-    string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker"
+    string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
   }
 }
 parameters: {
@@ -428,3 +513,9 @@ parameters: {
       string_value: "${medusa_choices}"
   }
 }
+parameters: {
+  key: "gpu_weights_percent"
+    value: {
+      string_value: "${gpu_weights_percent}"
+  }
+}