diff --git a/default.nix b/default.nix index c3a72ee..63dabf2 100644 --- a/default.nix +++ b/default.nix @@ -15,36 +15,29 @@ in cog_version = "0.10.0-alpha16"; cuda = "12.1"; # todo: 12.2 gpu = true; - # inspiration: echo tensorrt_llm==0.8.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line + # inspiration: echo tensorrt_llm==0.10.0 | uv pip compile - --extra-index-url https://pypi.nvidia.com -p 3.10 --prerelease=allow --annotation-style=line python_packages = [ "--extra-index-url" "https://pypi.nvidia.com" - "tensorrt_llm==0.9.0" + "tensorrt_llm==0.10.0" + "tensorrt-cu12==10.1.0" "torch==2.2.2" - "tensorrt==9.3.0.post12.dev1" - "tensorrt-bindings==9.3.0.post12.dev1" - "tensorrt-libs==9.3.0.post12.dev1" - "nvidia-pytriton==0.5.2" # corresponds to 2.42.0 - "httpx" - "nvidia-cublas-cu12<12.2" - "nvidia-cuda-nvrtc-cu12<12.2" - "nvidia-cuda-runtime-cu12<12.2" + "nvidia-pytriton==0.5.6" # corresponds to 2.44.0 "omegaconf" "hf-transfer" - "tokenizers" + "tokenizers>=0.19.0" ]; # don't ask why it needs ssh system_packages = [ "pget" "openssh" "openmpi" ]; }; python-env.pip = { uv.enable = true; - # todo: add some constraints to match cudaPackages constraintsList = [ - "nvidia-cudnn-cu12<9" + # "nvidia-cudnn-cu12==${cudaPackages.cudnn.version}" + "nvidia-cublas-cu12==${cudaPackages.libcublas.version}" ]; overridesList = [ - "tokenizers==0.19.0" - "transformers==4.40.0" + "pydantic==1.10.16" ]; }; cognix.includeNix = true; @@ -56,27 +49,31 @@ in # tensorrt likes doing a pip invocation from it's setup.py # circumvent by manually depending on tensorrt_libs, tensorrt_bindings # and setting this env variable - tensorrt.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true; - # TODO remove upon next rebuild: - tensorrt.mkDerivation.propagatedBuildInputs = with pythonDrvs; [ - tensorrt-libs.public - tensorrt-bindings.public + tensorrt-cu12.env.NVIDIA_TENSORRT_DISABLE_INTERNAL_PIP = true; + 
tensorrt-cu12.mkDerivation.buildInputs = [ python3.pkgs.pip ]; + tensorrt-cu12-bindings.mkDerivation.propagatedBuildInputs = [ + pythonDrvs.tensorrt-cu12-libs.public ]; - tensorrt-bindings.mkDerivation.propagatedBuildInputs = [ pythonDrvs.tensorrt-libs.public ]; - tensorrt-libs.mkDerivation.postFixup = '' + # fixes tensorrt-llm build + tensorrt-cu12-libs.mkDerivation.postFixup = '' pushd $out/${site}/tensorrt_libs - ln -s libnvinfer.so.9 libnvinfer.so - ln -s libnvonnxparser.so.9 libnvonnxparser.so + ln -s libnvinfer.so.10 libnvinfer.so + ln -s libnvonnxparser.so.10 libnvonnxparser.so popd ''; - tensorrt-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; + tensorrt-cu12-libs.env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; tensorrt-llm = { mkDerivation.buildInputs = [ cudaPackages.nccl ]; mkDerivation.propagatedBuildInputs = with pythonDrvs; [ - tensorrt-libs.public # libnvinfer, onnxparse + tensorrt-cu12-libs.public # libnvinfer, onnxparse ]; env.appendRunpaths = [ "/usr/lib64" "$ORIGIN" ]; - env.autoPatchelfIgnoreMissingDeps = ["libcuda.so.1"]; + env.autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" "libnvidia-ml.so.1" ]; + mkDerivation.postInstall = '' + pushd $out/${site}/tensorrt_llm/bin + patchelf --replace-needed libnvinfer_plugin_tensorrt_llm.so{.10,} executorWorker + popd + ''; }; # has some binaries that want cudart tritonclient.mkDerivation.postInstall = "rm -r $out/bin"; @@ -131,8 +128,8 @@ in deps.tensorrt-src = pkgs.fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT"; - rev = "6d1397ed4bb65933d02725623c122a157544a729"; # release/9.3 branch - hash = "sha256-XWFyMD7jjvgIihlqCJNyH5iSa1vZCDhv1maLJqMM3UE="; + rev = "v10.0.1"; + hash = "sha256-lSEw0GM0eW2BHNBq/wTQA8v3aNueE3FT+k9F5nH1OgA="; }; # todo: replace with lockfile deps.pybind11-stubgen = python3.pkgs.buildPythonPackage rec { diff --git a/flake.lock b/flake.lock index 4f4dd20..dfab778 100644 --- a/flake.lock +++ b/flake.lock @@ -12,11 +12,11 @@ "rust-overlay": "rust-overlay" }, "locked": { - 
"lastModified": 1718288375, - "narHash": "sha256-EDZYEqYzYa97bk4bJxcnsHVHr8OmrGki6o7yVCjXQGk=", + "lastModified": 1721228311, + "narHash": "sha256-EEe5Kcno5FMFSd2aYVB2ONHFpe/9k0CX1gIFjNQgV+A=", "owner": "datakami", "repo": "cognix", - "rev": "8eaff212007474e510af24e51bb22a009a0361a5", + "rev": "8c28f745d7339c495265a85fb65da3ce5592f0ef", "type": "github" }, "original": { @@ -32,15 +32,15 @@ "pyproject-nix": "pyproject-nix" }, "locked": { - "lastModified": 1710167744, - "narHash": "sha256-z78iB1ckRQuJluM82iCuQNjN5hqsNpd1om0q75ncza4=", - "owner": "yorickvp", + "lastModified": 1719513340, + "narHash": "sha256-on3zRua52KZ8G5kBOXMQOzrsA07ywVMNdcIWJEeotfo=", + "owner": "nix-community", "repo": "dream2nix", - "rev": "3bfbbbb19471b60cf1bb7f7c476588a36ac3fb04", + "rev": "4d441820e0d0916c97d7af6c4d4f6843d676e242", "type": "github" }, "original": { - "owner": "yorickvp", + "owner": "nix-community", "repo": "dream2nix", "type": "github" } @@ -83,11 +83,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709780214, - "narHash": "sha256-p4iDKdveHMhfGAlpxmkCtfQO3WRzmlD11aIcThwPqhk=", + "lastModified": 1719436386, + "narHash": "sha256-NBGYaic5FLRg8AWSj6yr4g2IlMPUxNCVjRK6+RNuQBc=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "f945939fd679284d736112d3d5410eb867f3b31c", + "rev": "c66e984bda09e7230ea7b364e677c5ba4f0d36d0", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 9471f5f..a6f9f11 100644 --- a/flake.nix +++ b/flake.nix @@ -26,7 +26,7 @@ cog-triton.architectures = architectures; # don't need this file in a runner - python-env.pip.drvs.tensorrt-libs.mkDerivation.postInstall = lib.mkAfter '' + python-env.pip.drvs.tensorrt-cu12-libs.mkDerivation.postInstall = lib.mkAfter '' rm $out/lib/python*/site-packages/tensorrt_libs/libnvinfer_builder_resource* ''; }); diff --git a/lock.json b/lock.json index 7207c4c..067b583 100644 --- a/lock.json +++ b/lock.json @@ -2,10 +2,10 @@ "fetchPipMetadata": { "sources": { "accelerate": { - "sha256": 
"c7bb817eb974bba0ff3ea1ba0f24d55afb86d50e3d4fe98d6922dc69cf2ccff1", + "sha256": "71fcf4be00872194071de561634268b71417d7f5b16b178e2fa76b6f117c52b0", "type": "url", - "url": "https://files.pythonhosted.org/packages/f7/fc/c55e5a2da345c9a24aa2e1e0f60eb2ca290b6a41be82da03a6d4baec4f99/accelerate-0.25.0-py3-none-any.whl", - "version": "0.25.0" + "url": "https://files.pythonhosted.org/packages/e4/74/564f621699b049b0358f7ad83d7437f8219a5d6efb69bbfcca328b60152f/accelerate-0.32.1-py3-none-any.whl", + "version": "0.32.1" }, "aiohttp": { "sha256": "c26959ca7b75ff768e2776d8055bf9582a6267e24556bb7f7bd29e677932be72", @@ -73,6 +73,12 @@ "url": "https://files.pythonhosted.org/packages/00/2e/d53fa4befbf2cfa713304affc7ca780ce4fc1fd8710527771b58311a3229/click-8.1.7-py3-none-any.whl", "version": "8.1.7" }, + "cloudpickle": { + "sha256": "246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7", + "type": "url", + "url": "https://files.pythonhosted.org/packages/96/43/dae06432d0c4b1dc9e9149ad37b4ca8384cf6eb7700cd9215b177b914f0a/cloudpickle-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "cog": { "sha256": "0f658f2da28e37da8040d073af4f4e7a91b567a8d169f077d5afddc33793a62f", "type": "url", @@ -104,10 +110,10 @@ "version": "2.14.4" }, "diffusers": { - "sha256": "ca258d8141a9faa85b3ce60805fc4898c91d0e73fd5b1576413dfe3b8502a8ec", + "sha256": "d5e9bb13c8097b4eed10df23d1294d2e5a418f53e3f89c7ef228b5b982970428", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/43/d4ae69ba5f503d58c7aef13f0f93d9c84694652dc2a16f8ea3d8246ebe95/diffusers-0.15.0-py3-none-any.whl", - "version": "0.15.0" + "url": "https://files.pythonhosted.org/packages/ee/22/2e6e90c87e718e63b1a860cb627bcf27ac4998edb5f190561b5c6cde6c62/diffusers-0.29.2-py3-none-any.whl", + "version": "0.29.2" }, "dill": { "sha256": "76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e", @@ -139,12 +145,6 @@ "url": 
"https://files.pythonhosted.org/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", "version": "3.15.4" }, - "flatbuffers": { - "sha256": "8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812", - "type": "url", - "url": "https://files.pythonhosted.org/packages/41/f0/7e988a019bc54b2dbd0ad4182ef2d53488bb02e58694cd79d61369e85900/flatbuffers-24.3.25-py2.py3-none-any.whl", - "version": "24.3.25" - }, "frozenlist": { "sha256": "a9b2de4cf0cdd5bd2dee4c4f63a653c61d2408055ab77b151c1957f221cabf2a", "type": "url", @@ -277,12 +277,24 @@ "url": "https://files.pythonhosted.org/packages/e7/9c/eef7c591e6dc952f3636cfe0df712c0f9916cedf317810a3bb53ccb65cdd/lark-1.1.9-py3-none-any.whl", "version": "1.1.9" }, + "markdown-it-py": { + "sha256": "355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", + "type": "url", + "url": "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", + "version": "3.0.0" + }, "markupsafe": { "sha256": "2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f", "type": "url", "url": "https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "2.1.5" }, + "mdurl": { + "sha256": "84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", + "type": "url", + "url": "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", + "version": "0.1.2" + }, "mpi4py": { "sha256": "c8fa625e0f92b082ef955bfb52f19fa6691d29273d7d71135d295aa143dee6cb", "type": "url", @@ -331,12 +343,6 @@ "url": "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "1.26.4" }, - "nvidia-ammo": { - "sha256": 
"ed6b0aa3748e735923ce3825c0044a130400fcd040a2bb54580e4bcd7ef605d3", - "type": "url", - "url": "https://pypi.nvidia.com/nvidia-ammo/nvidia_ammo-0.7.4-cp310-cp310-linux_x86_64.whl", - "version": "0.7.4" - }, "nvidia-cublas-cu12": { "sha256": "ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728", "type": "url", @@ -391,6 +397,12 @@ "url": "https://pypi.nvidia.com/nvidia-cusparse-cu12/nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", "version": "12.1.0.106" }, + "nvidia-modelopt": { + "sha256": "f56f04280fef23727a49decf13ff8269c9cc47b95fc304fcefed79fbe8e6ef5f", + "type": "url", + "url": "https://pypi.nvidia.com/nvidia-modelopt/nvidia_modelopt-0.11.2-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "0.11.2" + }, "nvidia-nccl-cu12": { "sha256": "802756f02c43c0613dc83f48a76f702462b0f1f618411768748bba9c805fce19", "type": "url", @@ -410,10 +422,10 @@ "version": "12.1.105" }, "nvidia-pytriton": { - "sha256": "810531f752f7bdc4308b8821056ce2d5a456e6cb62966f2e07f65cff0053e42a", + "sha256": "6403e65c2bbab0ab2fe2b737ad612e2b88f3edf20d41aadd1d544ffb309a701c", "type": "url", - "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.2-py3-none-manylinux_2_35_x86_64.whl", - "version": "0.5.2" + "url": "https://pypi.nvidia.com/nvidia-pytriton/nvidia_pytriton-0.5.6-py3-none-manylinux_2_35_x86_64.whl", + "version": "0.5.6" }, "omegaconf": { "sha256": "7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", @@ -427,23 +439,11 @@ "url": "https://files.pythonhosted.org/packages/c6/7e/5031717c0636e6074764a2f61a459a3ecd46c20d8b83a1f1cd2513a76160/onnx-1.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", "version": "1.16.1" }, - "onnx-graphsurgeon": { - "sha256": "10c130d6129fdeee02945f8103b5b112e6fd4d9b356e2dd3e80f53e0ebee7b5c", - "type": "url", - "url": "https://pypi.nvidia.com/onnx-graphsurgeon/onnx_graphsurgeon-0.5.2-py2.py3-none-any.whl", - "version": "0.5.2" - }, - "onnxruntime": { - "sha256": 
"ef2b1fc269cabd27f129fb9058917d6fdc89b188c49ed8700f300b945c81f889", - "type": "url", - "url": "https://files.pythonhosted.org/packages/7a/cf/6aa8c56fd63f53c2c485921e411269c7b501a2b4e634bd02f226ab2d5d8e/onnxruntime-1.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.16.3" - }, "optimum": { - "sha256": "1354dd1081179b7c490d135c7f380cee672125e17c0bfef143e616c5b756b1db", + "sha256": "8b3633b9312413ceac5156294a2a0cd221268baf5a2c593f4d54ec20bff296d8", "type": "url", - "url": "https://files.pythonhosted.org/packages/13/6d/6b03ffb8df1ab2b43d461f7cace2af5f20092f0767f53a3e9331df00e8a2/optimum-1.21.1-py3-none-any.whl", - "version": "1.21.1" + "url": "https://files.pythonhosted.org/packages/fa/e4/f832e42a1eb9d5ac4fa6379295e05aebeae507d171babc1786bfa0210299/optimum-1.21.2-py3-none-any.whl", + "version": "1.21.2" }, "packaging": { "sha256": "5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124", @@ -494,10 +494,16 @@ "version": "16.1.0" }, "pydantic": { - "sha256": "371dcf1831f87c9e217e2b6a0c66842879a14873114ebb9d0861ab22e3b5bb1e", + "sha256": "4660dd697de1ae2d4305a85161312611f64d5360663a9ba026cd6ad9e3fe14c3", + "type": "url", + "url": "https://files.pythonhosted.org/packages/ae/d8/3ffbdeccf252d56c8e0b6f1f30798d3aa0ad5afaa541908207881855beeb/pydantic-1.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "1.10.16" + }, + "pygments": { + "sha256": "b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a", "type": "url", - "url": "https://files.pythonhosted.org/packages/ef/a6/080cace699e89a94bd4bf34e8c12821d1f05fe4d56a0742f797b231d9a40/pydantic-1.10.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "1.10.17" + "url": "https://files.pythonhosted.org/packages/f7/3f/01c8b82017c199075f8f788d0d906b9ffbbc5a47dc9918a945e13d5a2bda/pygments-2.18.0-py3-none-any.whl", + "version": "2.18.0" }, "pynvml": { "sha256": "5cce014ac01b098d08f06178f86c37be409b80b2e903a5a03ce15eed60f55e25", @@ 
-542,10 +548,10 @@ "version": "6.0.1" }, "pyzmq": { - "sha256": "7e0113d70b095339e99bb522fe7294f5ae6a7f3b2b8f52f659469a74b5cc7661", + "sha256": "ba6e5e6588e49139a0979d03a7deb9c734bde647b9a8808f26acf9c547cab1bf", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/ac/18b75626cede66295a27e94d7cfe301d2d35120b200a6a46f205a171a20e/pyzmq-23.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "23.2.1" + "url": "https://files.pythonhosted.org/packages/40/4f/088d0fe18b188a0754483b7d632a97ef608dce80c2648219d071c9f1715c/pyzmq-26.0.3-cp310-cp310-manylinux_2_28_x86_64.whl", + "version": "26.0.3" }, "regex": { "sha256": "1337b7dbef9b2f71121cdbf1e97e40de33ff114801263b275aafd75303bd62b5", @@ -559,6 +565,12 @@ "url": "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", "version": "2.32.3" }, + "rich": { + "sha256": "4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222", + "type": "url", + "url": "https://files.pythonhosted.org/packages/87/67/a37f6214d0e9fe57f6ae54b2956d550ca8365857f42a1ce0392bb21d9410/rich-13.7.1-py3-none-any.whl", + "version": "13.7.1" + }, "safetensors": { "sha256": "d88b33980222085dd6001ae2cad87c6068e0991d4f5ccf44975d216db3b57376", "type": "url", @@ -578,16 +590,22 @@ "version": "0.2.0" }, "setuptools": { - "sha256": "b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05", + "sha256": "fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/54/2a8ecfcc9a714a6fbf86559a4b0f50b126a4ac4269ea8134f2c75c3e73de/setuptools-70.2.0-py3-none-any.whl", - "version": "70.2.0" + "url": "https://files.pythonhosted.org/packages/ef/15/88e46eb9387e905704b69849618e699dc2f54407d8953cc4ec4b8b46528d/setuptools-70.3.0-py3-none-any.whl", + "version": "70.3.0" }, "sh": { - "sha256": "e4045b6c732d9ce75d571c79f5ac2234edd9ae4f5fa9d59b09705082bdca18c7", + 
"sha256": "2f2f79a65abd00696cf2e9ad26508cf8abb6dba5745f40255f1c0ded2876926d", + "type": "url", + "url": "https://files.pythonhosted.org/packages/15/c2/79f9dea6fc544c0eb79ed5018a38860c52d597c4be66c2cf2029bea5b3fd/sh-2.0.7-py3-none-any.whl", + "version": "2.0.7" + }, + "shellingham": { + "sha256": "7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", "type": "url", - "url": "https://files.pythonhosted.org/packages/b7/09/89c28aaf2a49f226fef8587c90c6386bd2cc03a0295bc4ff7fc6ee43c01d/sh-1.14.3.tar.gz", - "version": "1.14.3" + "url": "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", + "version": "1.5.4" }, "six": { "sha256": "8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254", @@ -620,40 +638,46 @@ "version": "24.2.0" }, "sympy": { - "sha256": "9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515", + "sha256": "6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92", "type": "url", - "url": "https://files.pythonhosted.org/packages/61/53/e18c8c97d0b2724d85c9830477e3ebea3acf1dcdc6deb344d5d9c93a9946/sympy-1.12.1-py3-none-any.whl", - "version": "1.12.1" + "url": "https://files.pythonhosted.org/packages/62/74/7e6c65ee89ff43942bffffdbb238634f16967bf327aee3c76efcf6e49587/sympy-1.13.0-py3-none-any.whl", + "version": "1.13.0" }, "tensorrt": { - "sha256": "24aea5376cb8440afe2b0a22ee83f9748e586aa27303d4f80091ad48a56552a4", + "sha256": "7e9c8666f5bee86771451f007e25f81d65a411a26e6ea0b41faa5ec83ab863af", + "type": "url", + "url": "https://pypi.nvidia.com/tensorrt/tensorrt-10.0.1.tar.gz", + "version": "10.0.1" + }, + "tensorrt-cu12": { + "sha256": "a549e2fe472eb03b2737a708c0aef0cac9cb0be1ae46bc7dad72ec1dfc81bd19", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt/tensorrt-9.3.0.post12.dev1.tar.gz", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12/tensorrt-cu12-10.1.0.tar.gz", + "version": "10.1.0" }, 
- "tensorrt-bindings": { - "sha256": "c1619e4a9b23b077717af7635489cd1a12a8b4d97477088fc3c5d3a81e36bf65", + "tensorrt-cu12-bindings": { + "sha256": "91e1bd0eb348524ff209ef6b235d329983ea704b5d16f9a7ba747c08cc3c2495", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-bindings/tensorrt_bindings-9.3.0.post12.dev1-cp310-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-bindings/tensorrt_cu12_bindings-10.1.0-cp310-none-manylinux_2_17_x86_64.whl", + "version": "10.1.0" }, - "tensorrt-libs": { - "sha256": "ab0b6ee6cd41503273d44892cb92b92c75d046a5e468b73884978f59cca4b8d9", + "tensorrt-cu12-libs": { + "sha256": "1ad13c26b3f441267a746df6859e44eb0e8da78d4382458d1fd2eb7675abd49f", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-libs/tensorrt_libs-9.3.0.post12.dev1-py2.py3-none-manylinux_2_17_x86_64.whl", - "version": "9.3.0.post12.dev1" + "url": "https://pypi.nvidia.com/tensorrt-cu12-libs/tensorrt_cu12_libs-10.1.0-py2.py3-none-manylinux_2_17_x86_64.whl", + "version": "10.1.0" }, "tensorrt-llm": { - "sha256": "2f60b6f8d0afee5f52a5160a44815b0af3e9cd4c46b53cc7a252377ed6cec670", + "sha256": "c7975326fa10b56079e0febf7c52a65ccf5b37760cd1c79d5aa3e8c7d85ce69c", "type": "url", - "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.9.0-cp310-cp310-linux_x86_64.whl", - "version": "0.9.0" + "url": "https://pypi.nvidia.com/tensorrt-llm/tensorrt_llm-0.10.0-cp310-cp310-linux_x86_64.whl", + "version": "0.10.0" }, "tokenizers": { - "sha256": "06a56acdfe6c5d51c03ebfc6838f727fcf231c035b94f2460cca68947f6799dc", + "sha256": "8b01afb7193d47439f091cd8f070a1ced347ad0f9144952a30a41836902fe09e", "type": "url", - "url": "https://files.pythonhosted.org/packages/11/f9/8c77a471469ea7d1b52f2a25607385109c954d6444a9b0df19796beba461/tokenizers-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - "version": "0.19.0" + "url": 
"https://files.pythonhosted.org/packages/40/4f/eb78de4af3b17b589f43a369cbf0c3a7173f25c3d2cd93068852c07689aa/tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", + "version": "0.19.1" }, "tomli": { "sha256": "939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", @@ -674,10 +698,10 @@ "version": "4.66.4" }, "transformers": { - "sha256": "92797ec3368ed4476a053529a4039a12ad09167d9e371981dda4afb4bdf590ac", + "sha256": "71cb94301ec211a2e1d4b8c8d18dcfaa902dfa00a089dceca167a8aa265d6f2d", "type": "url", - "url": "https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl", - "version": "4.40.0" + "url": "https://files.pythonhosted.org/packages/05/23/ba02efa28518557e0cfe0ce5c1170000dd7501ed02ac865fc90cbe3daa93/transformers-4.40.2-py3-none-any.whl", + "version": "4.40.2" }, "triton": { "sha256": "a2294514340cfe4e8f4f9e5c66c702744c4a117d25e618bd08469d0bfed1e2e5", @@ -691,6 +715,12 @@ "url": "https://pypi.nvidia.com/tritonclient/tritonclient-2.47.0-py3-none-manylinux1_x86_64.whl", "version": "2.47.0" }, + "typer": { + "sha256": "070d7ca53f785acbccba8e7d28b08dcd88f79f1fbda035ade0aecec71ca5c914", + "type": "url", + "url": "https://files.pythonhosted.org/packages/20/b5/11cf2e34fbb11b937e006286ab5b8cfd334fde1c8fa4dd7f491226931180/typer-0.12.3-py3-none-any.whl", + "version": "0.12.3" + }, "typing-extensions": { "sha256": "04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", "type": "url", @@ -698,10 +728,10 @@ "version": "4.12.2" }, "typing-inspect": { - "sha256": "3b98390df4d999a28cf5b35d8b333425af5da2ece8a4ea9e98f71e7591347b4f", + "sha256": "9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f", "type": "url", - "url": "https://files.pythonhosted.org/packages/42/1c/66402db44184904a2f14722d317a4da0b5c8c78acfc3faf74362566635c5/typing_inspect-0.6.0-py3-none-any.whl", - "version": "0.6.0" + "url": 
"https://files.pythonhosted.org/packages/65/f3/107a22063bf27bdccf2024833d3445f4eea42b2e598abfbd46f6a63b6cb0/typing_inspect-0.9.0-py3-none-any.whl", + "version": "0.9.0" }, "tzdata": { "sha256": "9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", @@ -791,8 +821,7 @@ "psutil", "pyyaml", "safetensors", - "torch", - "transformers" + "torch" ], "aiohttp": [ "aiosignal", @@ -823,6 +852,7 @@ "certifi": [], "charset-normalizer": [], "click": [], + "cloudpickle": [], "cog": [ "attrs", "fastapi", @@ -852,7 +882,6 @@ "pyyaml", "requests", "tqdm", - "transformers", "xxhash" ], "diffusers": [ @@ -863,7 +892,7 @@ "pillow", "regex", "requests", - "transformers" + "safetensors" ], "dill": [], "evaluate": [ @@ -877,7 +906,6 @@ "pandas", "requests", "tqdm", - "transformers", "xxhash" ], "exceptiongroup": [], @@ -886,7 +914,6 @@ "starlette" ], "filelock": [], - "flatbuffers": [], "frozenlist": [], "fsspec": [ "aiohttp" @@ -949,7 +976,11 @@ "markupsafe" ], "lark": [], + "markdown-it-py": [ + "mdurl" + ], "markupsafe": [], + "mdurl": [], "mpi4py": [], "mpmath": [], "multidict": [], @@ -960,18 +991,6 @@ "networkx": [], "ninja": [], "numpy": [], - "nvidia-ammo": [ - "networkx", - "ninja", - "numpy", - "onnx", - "onnx-graphsurgeon", - "onnxruntime", - "scipy", - "torch", - "tqdm", - "transformers" - ], "nvidia-cublas-cu12": [], "nvidia-cuda-cupti-cu12": [], "nvidia-cuda-nvrtc-cu12": [], @@ -989,15 +1008,28 @@ "nvidia-cusparse-cu12": [ "nvidia-nvjitlink-cu12" ], + "nvidia-modelopt": [ + "cloudpickle", + "ninja", + "numpy", + "packaging", + "pydantic", + "rich", + "scipy", + "tqdm" + ], "nvidia-nccl-cu12": [], "nvidia-nvjitlink-cu12": [], "nvidia-nvtx-cu12": [], "nvidia-pytriton": [ + "grpcio", + "importlib-metadata", "numpy", "protobuf", "pyzmq", "sh", "tritonclient", + "typer", "typing-inspect", "wrapt" ], @@ -1009,18 +1041,6 @@ "numpy", "protobuf" ], - "onnx-graphsurgeon": [ - "numpy", - "onnx" - ], - "onnxruntime": [ - "coloredlogs", - "flatbuffers", - "numpy", - 
"packaging", - "protobuf", - "sympy" - ], "optimum": [ "coloredlogs", "datasets", @@ -1049,6 +1069,7 @@ "pydantic": [ "typing-extensions" ], + "pygments": [], "pynvml": [], "pyproject-hooks": [], "python-dateutil": [ @@ -1066,6 +1087,10 @@ "idna", "urllib3" ], + "rich": [ + "markdown-it-py", + "pygments" + ], "safetensors": [], "scipy": [ "numpy" @@ -1073,6 +1098,7 @@ "sentencepiece": [], "setuptools": [], "sh": [], + "shellingham": [], "six": [], "sniffio": [], "starlette": [ @@ -1084,14 +1110,15 @@ "mpmath" ], "tensorrt": [ - "tensorrt-bindings", - "tensorrt-libs" + "tensorrt-cu12" ], - "tensorrt-bindings": [], - "tensorrt-libs": [ - "nvidia-cublas-cu12", - "nvidia-cuda-runtime-cu12", - "nvidia-cudnn-cu12" + "tensorrt-cu12": [ + "tensorrt-cu12-bindings", + "tensorrt-cu12-libs" + ], + "tensorrt-cu12-bindings": [], + "tensorrt-cu12-libs": [ + "nvidia-cuda-runtime-cu12" ], "tensorrt-llm": [ "accelerate", @@ -1106,17 +1133,16 @@ "mpi4py", "mpmath", "numpy", - "nvidia-ammo", - "nvidia-cudnn-cu12", + "nvidia-modelopt", "onnx", "optimum", "pandas", "polygraphy", "psutil", "pulp", + "pydantic", "pynvml", "sentencepiece", - "setuptools", "strenum", "tensorrt", "torch", @@ -1153,10 +1179,13 @@ "huggingface-hub", "numpy", "packaging", + "protobuf", + "pydantic", "pyyaml", "regex", "requests", "safetensors", + "sentencepiece", "tokenizers", "tqdm" ], @@ -1165,7 +1194,6 @@ ], "tritonclient": [ "aiohttp", - "cuda-python", "geventhttpclient", "grpcio", "numpy", @@ -1174,6 +1202,12 @@ "python-rapidjson", "urllib3" ], + "typer": [ + "click", + "rich", + "shellingham", + "typing-extensions" + ], "typing-extensions": [], "typing-inspect": [ "mypy-extensions", @@ -1214,5 +1248,5 @@ } } }, - "invalidationHash": "aea5c24536de46921b0505e9f29e379558d83bbd76f08cf2f49f8ffe84243032" + "invalidationHash": "e7e207b87a9d99d7041d2be7edfba110533a437cc9cf984be13642572fa5f156" } \ No newline at end of file diff --git a/nix/tensorrt-llm.nix b/nix/tensorrt-llm.nix index 20f901d..5db8c87 100644 --- 
a/nix/tensorrt-llm.nix +++ b/nix/tensorrt-llm.nix @@ -17,14 +17,14 @@ }: stdenv.mkDerivation (o: { pname = "tensorrt_llm"; - version = "0.9.0"; + version = "0.10.0"; src = fetchFromGitHub { owner = "NVIDIA"; repo = "TensorRT-LLM"; rev = "v${o.version}"; fetchSubmodules = true; fetchLFS = true; # libtensorrt_llm_batch_manager_static.a - hash = "sha256-BGU56yI6yuTGHYhq5I3xYhrsKI8O4ykhDFeRP/JGCRo="; + hash = "sha256-eOAixXzOQRaySbUtpeAF9qMFOzwe1rosC0GOgy8CakU="; }; outputs = if withPython then @@ -41,6 +41,7 @@ stdenv.mkDerivation (o: { ninja python3 cudaPackages.cuda_nvcc + rsync ]; buildInputs = [ @@ -54,6 +55,10 @@ stdenv.mkDerivation (o: { # torch hates the split cuda, so only do it without torch cudaPackages.cuda_cudart cudaPackages.cuda_nvcc.dev + cudaPackages.cuda_nvrtc.dev + cudaPackages.cuda_nvrtc.lib + cudaPackages.cuda_nvml_dev.lib + cudaPackages.cuda_nvml_dev.dev cudaPackages.cuda_cccl cudaPackages.libcublas.lib cudaPackages.libcublas.dev @@ -85,8 +90,8 @@ stdenv.mkDerivation (o: { pynvml # >=11.5.0 sentencepiece # >=0.1.99 tensorrt # ==9.2.0.post12.dev5 - tensorrt-bindings # missed transitive dep - tensorrt-libs + tensorrt-cu12-bindings # missed transitive dep + tensorrt-cu12-libs torch # <=2.2.0a nvidia-ammo # ~=0.7.0; platform_machine=="x86_64" transformers # ==4.36.1 @@ -109,11 +114,16 @@ stdenv.mkDerivation (o: { "-DBUILD_PYBIND=${if withPython then "ON" else "OFF"}" # needs BUILD_PYT "-DBUILD_TESTS=OFF" # needs nvonnxparser.h # believe it or not, this is the actual binary distribution channel for tensorrt: - "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-libs.public}/${python3.sitePackages}/tensorrt_libs" + "-DTRT_LIB_DIR=${pythonDrvs.tensorrt-cu12-libs.public}/${python3.sitePackages}/tensorrt_libs" "-DTRT_INCLUDE_DIR=${tensorrt-src}/include" "-DCMAKE_CUDA_ARCHITECTURES=${builtins.concatStringsSep ";" architectures}" # "-DFAST_BUILD=ON" ]; + # include cstdint in cpp/tensorrt_llm/common/mpiUtils.h after pragma once + postPatch = '' + sed -i 's/#include 
/#include \n#include /' /build/source/cpp/include/tensorrt_llm/common/mpiUtils.h + sed -i 's/#pragma once/#pragma once\n#include /' /build/source/cpp/tensorrt_llm/kernels/lruKernel.h + ''; postBuild = lib.optionalString withPython '' pushd ../../ chmod -R +w . @@ -135,19 +145,24 @@ stdenv.mkDerivation (o: { installPhase = '' mkdir -p $out - ${rsync}/bin/rsync -a --exclude "tensorrt_llm/kernels" $src/cpp $out/ - chmod -R u+w $out/cpp - mkdir -p $out/cpp/build/tensorrt_llm/plugins + rsync -a --chmod=u+w --include "tensorrt_llm/kernels/" --include "tensorrt_llm/kernels/kvCacheIndex.h" --exclude "tensorrt_llm/kernels/*" $src/cpp $out/ + # rsync -a --chmod=u+w $src/cpp/tensorrt_llm/kernels $out/cpp/tensorrt_llm/ pushd tensorrt_llm - cp ./libtensorrt_llm.so $out/cpp/build/tensorrt_llm/ + mkdir -p $out/cpp/build/tensorrt_llm/ + find . '(' '(' -type f -executable ')' -or -type l ')' -print0 | rsync -av --chmod=u+w --files-from=- --from0 ./ $out/cpp/build/tensorrt_llm/ patchelf --add-needed 'libcudnn.so.8' --add-rpath ${cudaPackages.cudnn.lib}/lib $out/cpp/build/tensorrt_llm/libtensorrt_llm.so - cp ./plugins/libnvinfer_plugin_tensorrt_llm.so* $out/cpp/build/tensorrt_llm/plugins/ - for f in $out/cpp/build/tensorrt_llm/plugins/*.so*; do + for f in $out/cpp/build/tensorrt_llm/plugins/*.so* $out/cpp/build/tensorrt_llm/executor_worker/executorWorker; do if [ ! 
-L "$f" ]; then - new_path=$(patchelf --print-rpath "$f" | sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#') + new_path=$(patchelf --print-rpath "$f" | + sed 's#/build/source/cpp/build/tensorrt_llm#$ORIGIN/..#g' | + sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../../tensorrt_llm#g' + ) patchelf --set-rpath "$new_path" "$f" fi done + new_path=$(patchelf --print-rpath $out/cpp/build/tensorrt_llm/libtensorrt_llm.so | + sed 's#/build/source/cpp/tensorrt_llm#$ORIGIN/../../tensorrt_llm#') + patchelf --set-rpath "$new_path" $out/cpp/build/tensorrt_llm/libtensorrt_llm.so popd '' + (lib.optionalString withPython '' diff --git a/nix/trtllm-backend.nix b/nix/trtllm-backend.nix index 4dab9b2..b52d81c 100644 --- a/nix/trtllm-backend.nix +++ b/nix/trtllm-backend.nix @@ -28,12 +28,11 @@ let rev = "a06e9a1157d6b5b9b34b6d05a07bb84d517f17c9"; hash = "sha256-Ju2zV/jHUuciTs6GbkqcPG8U0y2lkIWSdAsX78DrpV4="; }; - # todo: update with trt-llm 0.9? deps.triton_repo_core = fetchFromGitHub { owner = "triton-inference-server"; repo = "core"; - rev = "5d4a99c285c729a349265ce8dd7a4535e59d29b1"; - hash = "sha256-WP8bwplo98GmNulX+QA+IrQEc2+GMcTjV53K438vX1g="; + rev = "434e50313b80fdc7ef295fcb3baeeacf65b295e4"; + hash = "sha256-kfDXQEYuMze4E53OHHJ1YjQHnNtAEt4lzNK27K6ttVE="; }; deps.googletest = fetchFromGitHub { owner = "google"; @@ -43,18 +42,18 @@ let }; inherit (python3) sitePackages; - trt_lib_dir = "${pythonDrvs.tensorrt-libs.public}/${sitePackages}/tensorrt_libs"; + trt_lib_dir = "${pythonDrvs.tensorrt-cu12-libs.public}/${sitePackages}/tensorrt_libs"; # this package wants gcc12 oldGccStdenv = stdenvAdapters.useLibsFrom stdenv gcc12Stdenv; in oldGccStdenv.mkDerivation rec { pname = "tensorrtllm_backend"; - version = "0.9.0"; + version = "0.10.0"; src = fetchFromGitHub { owner = "triton-inference-server"; repo = "tensorrtllm_backend"; rev = "v${version}"; - hash = "sha256-aNjVYu7sDrIj/lse/wS3vYaR/vmjtZfxzBWYi3z3KqQ="; + hash = "sha256-6df9MbHPqBVxpdkTcEzf99OCPtgFrK0jjDJfvE/guyA="; 
}; nativeBuildInputs = [ cmake @@ -70,6 +69,8 @@ oldGccStdenv.mkDerivation rec { cudaPackages.cuda_cccl cudaPackages.libcublas.lib cudaPackages.libcublas.dev + cudaPackages.cuda_nvml_dev.lib + cudaPackages.cuda_nvml_dev.dev ]; sourceRoot = "source/inflight_batcher_llm"; cmakeFlags = [ @@ -84,7 +85,7 @@ oldGccStdenv.mkDerivation rec { ]; postInstall = '' mkdir -p $out/backends/tensorrtllm - cp libtriton_*.so triton_tensorrtllm_worker $out/backends/tensorrtllm + cp libtriton_*.so trtllmExecutorWorker $out/backends/tensorrtllm rm -r /build/source/inflight_batcher_llm/build/_deps/repo-core-build rm -r /build/source/inflight_batcher_llm/build/libtriton_tensorrtllm_common.so ''; @@ -94,7 +95,7 @@ oldGccStdenv.mkDerivation rec { --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' patchelf $out/backends/tensorrtllm/libtriton_tensorrtllm_common.so \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' - patchelf $out/backends/tensorrtllm/triton_tensorrtllm_worker \ + patchelf $out/backends/tensorrtllm/trtllmExecutorWorker \ --add-rpath '$ORIGIN:${trt_lib_dir}:${tensorrt-llm}/cpp/build/tensorrt_llm:${tensorrt-llm}/cpp/build/tensorrt_llm/plugins:${cudaPackages.cudnn.lib}/lib' ''; } diff --git a/predict.py b/predict.py index 1d4254f..480526b 100644 --- a/predict.py +++ b/predict.py @@ -372,6 +372,9 @@ async def predict( f"E2104 TritonMalformedEvent: Triton returned malformed event (no output_ids or error key): {event_data}" ) + if token == []: + continue + n_tokens += 1 if n_tokens == 1: first_token_time = time.time() @@ -447,10 +450,13 @@ def _process_args( pad_id = self.pad_id end_id = self.end_id - if top_k < 0: - top_k = 0 - if min_tokens < 0: - min_tokens = 0 + if top_k <= 0: + # workaround, unneccesary with with trtllm > 0.10.0 + top_k = None + + if top_p <= 0.0: + # 
workaround, unnecessary with trtllm > 0.10.0 + top_p = None if not seed: seed = int(np.random.randint(0, 100000)) @@ -460,7 +466,11 @@ def _process_args( max_tokens = min(max_tokens, token_budget) min_tokens = min(min_tokens, token_budget) - args = { + if min_tokens <= 0: + # workaround, unnecessary with trtllm > 0.10.0 + min_tokens = None + + args = {k: v for k, v in { "text_input": prompt, "max_tokens": max_tokens, "min_length": min_tokens, @@ -474,7 +484,7 @@ def _process_args( "random_seed": seed, "pad_id": pad_id, "end_id": end_id, - } + }.items() if v is not None} return args diff --git a/triton_model_repo/ensemble/config.pbtxt b/triton_model_repo/ensemble/config.pbtxt index 40d291d..6d54df6 100644 --- a/triton_model_repo/ensemble/config.pbtxt +++ b/triton_model_repo/ensemble/config.pbtxt @@ -442,4 +442,3 @@ ensemble_scheduling { } ] } - diff --git a/triton_model_repo/postprocessing/1/model.py b/triton_model_repo/postprocessing/1/model.py index 5d5663b..ac42a0d 100644 --- a/triton_model_repo/postprocessing/1/model.py +++ b/triton_model_repo/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -53,19 +53,19 @@ def initialize(self, args): """ # Parse model configs model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ + # 'string_value'] + # self.skip_special_tokens = 
model_config['parameters'].get( + # 'skip_special_tokens', + # {'string_value': "true"})['string_value'].lower() in [ + # 'true', '1', 't', 'y', 'yes' + # ] + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( @@ -133,15 +133,9 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch - ) - - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) + tokens_batch) outputs = [] outputs.append(output_tensor) @@ -207,13 +201,13 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # output = self.tokenizer.decode( + # tokens[:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_model_repo/postprocessing/config.pbtxt b/triton_model_repo/postprocessing/config.pbtxt index 5e2e37a..df87aeb 100644 --- a/triton_model_repo/postprocessing/config.pbtxt +++ b/triton_model_repo/postprocessing/config.pbtxt @@ -111,4 +111,3 @@ instance_group [ kind: KIND_CPU } ] - diff --git 
a/triton_model_repo/preprocessing/1/model.py b/triton_model_repo/preprocessing/1/model.py index a109775..62ab243 100644 --- a/triton_model_repo/preprocessing/1/model.py +++ b/triton_model_repo/preprocessing/1/model.py @@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" - arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence. " - "We then strip the corresponding first token from the stop sequence IDs. " - f"However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggests there is a problem with the tokenizer that you are using." 
- ) - else: - ids = ids[1:] if len(ids) == 0: continue diff --git a/triton_model_repo/preprocessing/config.pbtxt b/triton_model_repo/preprocessing/config.pbtxt index b0fa8f2..e76fec5 100644 --- a/triton_model_repo/preprocessing/config.pbtxt +++ b/triton_model_repo/preprocessing/config.pbtxt @@ -138,4 +138,3 @@ instance_group [ kind: KIND_CPU } ] - diff --git a/triton_model_repo/tensorrt_llm/1/.gitkeep b/triton_model_repo/tensorrt_llm/1/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/triton_model_repo/tensorrt_llm/config.pbtxt b/triton_model_repo/tensorrt_llm/config.pbtxt index 911fdeb..14aab33 100644 --- a/triton_model_repo/tensorrt_llm/config.pbtxt +++ b/triton_model_repo/tensorrt_llm/config.pbtxt @@ -69,6 +69,13 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +139,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +167,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +188,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -323,7 +365,7 @@ parameters: { parameters: { key: "gpt_model_path" value: { - 
string_value: "/src/triton_model_repo/tensorrt_llm/1" + string_value: "/src/triton_model_repo/tensorrt_llm/1/" } } parameters: { @@ -335,7 +377,13 @@ parameters: { parameters: { key: "max_attention_window_size" value: { - string_value: "${max_attention_window_size}" + string_value: "4096" + } +} +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" } } parameters: { @@ -347,21 +395,58 @@ parameters: { parameters: { key: "kv_cache_free_gpu_mem_fraction" value: { - string_value: "${kv_cache_free_gpu_mem_fraction}" + string_value: "0.95" + } +} +parameters: { + key: "kv_cache_host_memory_bytes" + value: { + string_value: "${kv_cache_host_memory_bytes}" } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_onboard_blocks" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_onboard_blocks}" } } +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" } } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +502,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ -428,4 +513,9 @@ 
parameters: { string_value: "${medusa_choices}" } } - +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +} diff --git a/triton_model_repo/tensorrt_llm_bls/config.pbtxt b/triton_model_repo/tensorrt_llm_bls/config.pbtxt index 17989a9..e8c80e8 100644 --- a/triton_model_repo/tensorrt_llm_bls/config.pbtxt +++ b/triton_model_repo/tensorrt_llm_bls/config.pbtxt @@ -245,4 +245,3 @@ instance_group [ kind : KIND_CPU } ] - diff --git a/triton_templates/postprocessing/1/model.py b/triton_templates/postprocessing/1/model.py index 5d5663b..ac42a0d 100644 --- a/triton_templates/postprocessing/1/model.py +++ b/triton_templates/postprocessing/1/model.py @@ -28,7 +28,7 @@ import numpy as np import triton_python_backend_utils as pb_utils -from transformers import AutoTokenizer +# from transformers import AutoTokenizer class TritonPythonModel: @@ -53,19 +53,19 @@ def initialize(self, args): """ # Parse model configs model_config = json.loads(args['model_config']) - tokenizer_dir = model_config['parameters']['tokenizer_dir'][ - 'string_value'] - self.skip_special_tokens = model_config['parameters'].get( - 'skip_special_tokens', - {'string_value': "true"})['string_value'].lower() in [ - 'true', '1', 't', 'y', 'yes' - ] - - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - legacy=False, - padding_side='left', - trust_remote_code=True) - self.tokenizer.pad_token = self.tokenizer.eos_token + # tokenizer_dir = model_config['parameters']['tokenizer_dir'][ + # 'string_value'] + # self.skip_special_tokens = model_config['parameters'].get( + # 'skip_special_tokens', + # {'string_value': "true"})['string_value'].lower() in [ + # 'true', '1', 't', 'y', 'yes' + # ] + + # self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + # legacy=False, + # padding_side='left', + # trust_remote_code=True) + # self.tokenizer.pad_token = self.tokenizer.eos_token # Parse model output configs output_config = pb_utils.get_output_config_by_name( 
@@ -133,15 +133,9 @@ def execute(self, requests): # Create output tensors. You need pb_utils.Tensor # objects to create pb_utils.InferenceResponse. - output_tensor = pb_utils.Tensor( 'OUTPUT', - tokens_batch - ) - - # output_tensor = pb_utils.Tensor( - # 'OUTPUT', - # np.array(outputs).astype(self.output_dtype)) + tokens_batch) outputs = [] outputs.append(output_tensor) @@ -207,13 +201,13 @@ def finalize(self): """ print('Cleaning up...') - def _postprocessing(self, tokens_batch, sequence_lengths): - outputs = [] - for batch_idx, beam_tokens in enumerate(tokens_batch): - for beam_idx, tokens in enumerate(beam_tokens): - seq_len = sequence_lengths[batch_idx][beam_idx] - output = self.tokenizer.decode( - tokens[:seq_len], - skip_special_tokens=self.skip_special_tokens) - outputs.append(output.encode('utf8')) - return outputs + # def _postprocessing(self, tokens_batch, sequence_lengths): + # outputs = [] + # for batch_idx, beam_tokens in enumerate(tokens_batch): + # for beam_idx, tokens in enumerate(beam_tokens): + # seq_len = sequence_lengths[batch_idx][beam_idx] + # output = self.tokenizer.decode( + # tokens[:seq_len], + # skip_special_tokens=self.skip_special_tokens) + # outputs.append(output.encode('utf8')) + # return outputs diff --git a/triton_templates/preprocessing/1/model.py b/triton_templates/preprocessing/1/model.py index 0f561f7..62ab243 100644 --- a/triton_templates/preprocessing/1/model.py +++ b/triton_templates/preprocessing/1/model.py @@ -268,11 +268,6 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): flat_ids = [] offsets = [] - arbitrary_start_sequence_token = "!" 
- arbitrary_start_sequence_id = self.tokenizer.encode( - "!", add_special_tokens=False - )[0] - for word_list in word_lists: item_flat_ids = [] item_offsets = [] @@ -281,16 +276,7 @@ def _to_word_list_format(self, word_lists: List[List[str | bytes]]): if isinstance(word, bytes): word = word.decode() - word = arbitrary_start_sequence_token + word ids = self.tokenizer.encode(word, add_special_tokens=False) - if ids[0] != arbitrary_start_sequence_id: - raise ValueError( - f"To standardize tokenizer behavior, we prepend '{arbitrary_start_sequence_token}' to the string representation of each stop sequence." - "We then strip the corresponding first token from the stop sequence IDs." - "However, the first token of the stop sequence IDs was not '{arbitrary_start_sequence_id}', which suggestions there is a problem with the tokenizer that you are using." - ) - else: - ids = ids[1:] if len(ids) == 0: continue diff --git a/triton_templates/tensorrt_llm/config.pbtxt b/triton_templates/tensorrt_llm/config.pbtxt index 71d2b98..1c34f77 100644 --- a/triton_templates/tensorrt_llm/config.pbtxt +++ b/triton_templates/tensorrt_llm/config.pbtxt @@ -69,6 +69,13 @@ input [ optional: true allow_ragged_batch: true }, + { + name: "draft_acceptance_threshold" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "end_id" data_type: TYPE_INT32 @@ -132,6 +139,27 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "runtime_top_p_min" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_decay" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, + { + name: "runtime_top_p_reset_ids" + data_type: TYPE_INT32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "len_penalty" data_type: TYPE_FP32 @@ -139,6 +167,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "early_stopping" + data_type: TYPE_BOOL + dims: [ 1 ] + reshape: { 
shape: [ ] } + optional: true + }, { name: "repetition_penalty" data_type: TYPE_FP32 @@ -153,6 +188,13 @@ input [ reshape: { shape: [ ] } optional: true }, + { + name: "beam_search_diversity_rate" + data_type: TYPE_FP32 + dims: [ 1 ] + reshape: { shape: [ ] } + optional: true + }, { name: "presence_penalty" data_type: TYPE_FP32 @@ -338,6 +380,12 @@ parameters: { string_value: "${max_attention_window_size}" } } +parameters: { + key: "sink_token_length" + value: { + string_value: "${sink_token_length}" + } +} parameters: { key: "batch_scheduler_policy" value: { @@ -351,17 +399,54 @@ parameters: { } } parameters: { - key: "enable_trt_overlap" + key: "kv_cache_host_memory_bytes" value: { - string_value: "${enable_trt_overlap}" + string_value: "${kv_cache_host_memory_bytes}" } } +parameters: { + key: "kv_cache_onboard_blocks" + value: { + string_value: "${kv_cache_onboard_blocks}" + } +} +# enable_trt_overlap is deprecated and doesn't have any effect on the runtime +# parameters: { +# key: "enable_trt_overlap" +# value: { +# string_value: "${enable_trt_overlap}" +# } +# } parameters: { key: "exclude_input_in_output" value: { string_value: "${exclude_input_in_output}" } } +parameters: { + key: "cancellation_check_period_ms" + value: { + string_value: "${cancellation_check_period_ms}" + } +} +parameters: { + key: "stats_check_period_ms" + value: { + string_value: "${stats_check_period_ms}" + } +} +parameters: { + key: "iter_stats_max_iterations" + value: { + string_value: "${iter_stats_max_iterations}" + } +} +parameters: { + key: "request_stats_max_iterations" + value: { + string_value: "${request_stats_max_iterations}" + } +} parameters: { key: "enable_kv_cache_reuse" value: { @@ -417,9 +502,9 @@ parameters: { } } parameters: { - key: "worker_path" + key: "executor_worker_path" value: { - string_value: "/opt/tritonserver/backends/tensorrtllm/triton_tensorrtllm_worker" + string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker" } } parameters: { @@ 
-428,3 +513,9 @@ parameters: { string_value: "${medusa_choices}" } } +parameters: { + key: "gpu_weights_percent" + value: { + string_value: "${gpu_weights_percent}" + } +}