diff --git a/.ci/scripts/check_gibberish b/.ci/scripts/check_gibberish index 5d9783b3b..912020a5a 100755 --- a/.ci/scripts/check_gibberish +++ b/.ci/scripts/check_gibberish @@ -24,6 +24,18 @@ else fi fi +####################################################################### +# +# check whether aspell spell check evailable + +if command -v aspell &> /dev/null; then + echo "Checking $TMPFILE for gibberish" +else + echo "Aspell is not installed or not in PATH." + echo "Gibberish unchecked in $TMPFILE" + exit 0 +fi + ####################################################################### # # run spell check on the extracted sequence diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index a09944ad5..71f074cef 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -1,93 +1,67 @@ -# /bin/bash -x +#!/bin/bash -x -if [ "X$1" == "X" ]; then +# Check if an argument was provided +if [ -z "$1" ]; then echo "Must specify document to run" exit 1 fi -if [ "$1" == "readme" ]; then - echo "::group::Create script to run README" - python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-readme.sh - echo "::endgroup::" - - echo "::group::Run README" - echo "*******************************************" - cat ./run-readme.sh - echo "*******************************************" - bash -x ./run-readme.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "quantization" ]; then - echo "::group::Create script to run quantization" - python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-quantization.sh - echo "::endgroup::" - - echo "::group::Run quantization" - echo "*******************************************" - cat ./run-quantization.sh - echo "*******************************************" - bash -x ./run-quantization.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "gguf" ]; then - echo "::group::Create script to run gguf" - python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-gguf.sh - echo "::endgroup::" - - echo "::group::Run gguf" - echo "*******************************************" - cat ./run-gguf.sh - echo "*******************************************" - bash -x ./run-gguf.sh - echo "::endgroup::" -fi - - -if [ "$1" == "advanced" ]; then - echo "::group::Create script to run advanced" - python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-advanced.sh - echo "::endgroup::" - - echo "::group::Run advanced" - echo "*******************************************" - cat ./run-advanced.sh - echo "*******************************************" - bash -x ./run-advanced.sh - echo "::endgroup::" -fi - 
-if [ "$1" == "evaluation" ]; then - - exit 0 - - echo "::group::Create script to run evaluation" - python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-evaluation.sh - echo "::endgroup::" - - echo "::group::Run evaluation" - echo "*******************************************" - cat ./run-evaluation.sh - echo "*******************************************" - bash -x ./run-evaluation.sh -fi +# Pre-initialize variables +filepath="" +parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" +script_name="./run-${1}.sh" # Dynamically initialize script name + +# Use a case statement to handle the $1 argument +case "$1" in + "readme") + filepath="README.md" + ;; + "quantization") + filepath="docs/quantization.md" + ;; + "gguf") + filepath="docs/GGUF.md" + ;; + "advanced") + filepath="docs/ADVANCED-USERS.md" + ;; + "evaluation") + filepath="torchchat/utils/docs/evaluation.md" + ;; + "multimodal") + filepath="docs/multimodal.md" + parameters="" # Clear parameters + ;; + "native") + filepath="docs/native-execution.md" + parameters="" # Clear parameters + ;; + "distributed") + filepath="docs/distributed.md" + parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication + ;; + "local") + filepath="docs/local-model.md" + parameters="" # Clear parameters + ;; + + *) + echo "Unknown option: $1" + exit 1 + ;; +esac + +# Generate the script +echo "::group::Create script to run $1" +python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name" +# if something happened to updown processor, and it did not error out, fail with an exit 1 +echo "exit 1" >> "$script_name" +echo "::endgroup::" + +# Run the script +echo "::group::Run $1" +echo "*******************************************" +cat "$script_name" +echo "*******************************************" +bash -x "$script_name" +echo "::endgroup::" diff --git a/.github/workflows/more-tests.yml b/.github/workflows/more-tests.yml index 1e0652c96..f772382d1 100644 --- a/.github/workflows/more-tests.yml +++ b/.github/workflows/more-tests.yml @@ -9,23 +9,20 @@ on: jobs: test-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements ./install/install_requirements.sh cuda diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index a9561e3e8..2e264e6cf 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -108,7 +108,10 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu" test-gpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + 
contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu secrets: inherit @@ -119,7 +122,7 @@ jobs: secrets-env: "HF_TOKEN_PERIODIC" runner: ${{ matrix.runner }} gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ee7270a5d..5dbafee9f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -215,7 +215,10 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu" test-gpu-compile: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -224,7 +227,7 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi @@ -250,7 +253,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-bfloat16: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -259,18 +265,13 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -291,7 +292,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-float32: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -300,17 +304,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -337,7 +336,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-float16: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -346,17 +348,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo 
"::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -384,7 +381,10 @@ jobs: echo "::endgroup::" test-gpu-eval-sanity-check: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -393,17 +393,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -731,6 +726,7 @@ jobs: git clone https://github.com/ggerganov/llama.cpp.git pushd llama.cpp + git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3 make popd @@ -941,7 +937,7 @@ jobs: path: | ./et-build ./torchchat/utils/scripts - key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh', '**/build_native.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: | @@ -1030,7 +1026,10 @@ jobs: echo "Tests complete." test-build-runner-et-android: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.4xlarge script: | @@ -1052,7 +1051,7 @@ jobs: # Pull submodules (re2, abseil) for Tiktoken git submodule sync - git submodule update --init + git submodule update --init --recursive ./runner/build_android.sh echo "Tests complete." @@ -1123,3 +1122,41 @@ jobs: echo "Generate AOTI" python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" echo "Tests complete." 
+ + test-torchao-experimental-mps: + strategy: + matrix: + runner: [macos-m1-stable] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Intalling pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Install torchao-ops-mps + id: install-torchao-ops-mps + run: | + bash torchchat/utils/scripts/build_torchao_ops.sh mps + - name: Run inference + run: | + python torchchat.py download stories110M + export PRMT="Once upon a time in a land far away" + echo "Generate eager" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 3, "groupsize": 32}}' diff --git a/.github/workflows/run-readme-periodic.yml b/.github/workflows/run-readme-periodic.yml index 6a933b5f1..2c49a975f 100644 --- a/.github/workflows/run-readme-periodic.yml +++ b/.github/workflows/run-readme-periodic.yml @@ -10,24 +10,22 @@ on: jobs: test-readme: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run README" python3 torchchat/utils/scripts/updown.py --create-sections --file README.md > ./run-readme.sh # for good measure, if something happened to updown processor, @@ -44,23 +42,21 @@ jobs: test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets: inherit gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run quantization" python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md > ./run-quantization.sh # for good measure, if something happened to updown processor, @@ -76,24 +72,22 @@ jobs: echo "::endgroup::" test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" 
uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run gguf" python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md > ./run-gguf.sh # for good measure, if something happened to updown processor, diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml new file mode 100644 index 000000000..1f22c4f2e --- /dev/null +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -0,0 +1,114 @@ +name: Run the README instructions - with stories - on Linux aarch64 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +jobs: + test-readme-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-quantization-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization + + test-gguf-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-advanced-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr-macos.yml 
b/.github/workflows/run-readme-pr-macos.yml index 64afe2247..ce84d3b50 100644 --- a/.github/workflows/run-readme-pr-macos.yml +++ b/.github/workflows/run-readme-pr-macos.yml @@ -33,7 +33,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs readme + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" echo "tests complete" @@ -68,7 +69,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs quantization + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization echo "::group::Completion" echo "tests complete" @@ -103,7 +105,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs gguf + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" echo "tests complete" @@ -137,9 +140,113 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs advanced + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-eval-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as 
system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 718d5cf9e..db16bc80e 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,12 +10,13 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout: 60 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos set -x - # NS: Remove previous installation of torch first - # as this script does not isntall anything into conda env but rather as system dep + # NS: Remove previous installation of torch first + # as this script does not install anything into conda env but rather as system dep pip3 uninstall -y torch || true set -eou pipefail @@ -35,7 +36,8 @@ jobs: test-quantization-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 + runner: macos-m1-14 + timeout: 60 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -62,7 +64,7 @@ jobs: test-gguf-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -89,7 +91,7 @@ jobs: test-advanced-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -112,3 +114,84 @@ jobs: echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-evaluation-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-evaluation-mps-macos python=3.10.11 + conda activate test-evaluation-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-multimodal-mps-macos python=3.10.11 + conda activate test-multimodal-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + 
.ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-native-mps-macos python=3.10.11 + conda activate test-native-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index cd6a95681..37c27822b 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -9,22 +9,20 @@ on: jobs: test-readme-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs readme echo "::group::Completion" @@ -33,22 +31,20 @@ jobs: echo "::endgroup::" test-readme-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" @@ -57,22 +53,20 @@ jobs: echo "::endgroup::" test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs quantization echo "::group::Completion" @@ -81,41 +75,37 @@ jobs: echo "::endgroup::" test-quantization-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda 
- gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs gguf echo "::group::Completion" @@ -124,22 +114,20 @@ jobs: echo "::endgroup::" test-gguf-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" @@ -149,22 +137,20 @@ jobs: test-advanced-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs advanced echo "::group::Completion" @@ -174,22 +160,20 @@ jobs: test-advanced-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" @@ -198,6 +182,89 @@ jobs: echo "::endgroup::" test-evaluation-any: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs 
evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-any: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-cpu: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal + + test-native-any: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -214,14 +281,14 @@ jobs: export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH echo "::endgroup::" - .ci/scripts/run-docs evaluation + .ci/scripts/run-docs native echo "::group::Completion" echo "tests complete" echo "*******************************************" echo "::endgroup::" - test-evaluation-cpu: + test-native-cpu: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -238,9 +305,26 @@ jobs: export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native + + test-distributed-cuda: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs distributed echo "::group::Completion" echo "tests complete" echo "*******************************************" - echo "::endgroup::" \ No newline at end of file + echo "::endgroup::" diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml index b83b9904b..0b4597942 100644 --- a/.github/workflows/runner-cuda-dtype.yml +++ b/.github/workflows/runner-cuda-dtype.yml @@ -9,24 +9,21 @@ on: jobs: test-runner-aot-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo 
"::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements @@ -58,7 +55,7 @@ jobs: python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2 - ./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" + ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" done diff --git a/.gitignore b/.gitignore index 74d0a28fa..61ab1ee4d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,10 @@ runner-et/cmake-out/* runner-aoti/cmake-out/* cmake-out/ +# Example project Android Studio ignore +torchchat/edge/android/torchchat/.idea/* + + # pte files *.pte diff --git a/.gitmodules b/.gitmodules index 7681823df..76bc1b9fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ -[submodule "tokenizer/third-party/abseil-cpp"] - path = tokenizer/third-party/abseil-cpp - url = https://github.com/abseil/abseil-cpp.git -[submodule "tokenizer/third-party/re2"] - path = tokenizer/third-party/re2 - url = https://github.com/google/re2.git -[submodule "tokenizer/third-party/sentencepiece"] - path = tokenizer/third-party/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "runner/third-party/tokenizers"] + path = runner/third-party/tokenizers + url = https://github.com/pytorch-labs/tokenizers diff --git a/CMakeLists.txt b/CMakeLists.txt index 61fd4d5a6..e004dbfcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,18 +7,21 @@ ELSE() ENDIF() project(Torchchat) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") # include tokenizer -add_subdirectory(tokenizer) +add_subdirectory(runner/third-party/tokenizers) # include et_run executable include(runner/et.cmake) if(TARGET et_run) - target_link_libraries(et_run PUBLIC tokenizer microkernels-prod) + target_link_libraries(et_run PUBLIC tokenizers microkernels-prod) + target_include_directories(et_run PUBLIC runner/third-party/tokenizers/include) endif() # include aoti_run executable include(runner/aoti.cmake) if(TARGET aoti_run) - target_link_libraries(aoti_run tokenizer) + target_link_libraries(aoti_run tokenizers) + target_include_directories(aoti_run PUBLIC runner/third-party/tokenizers/include) endif() diff --git a/README.md b/README.md index 4b910e575..51db1bfca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android. > [!IMPORTANT] -> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!! +> Update +> +> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**]( https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)! +> +> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**! > > To try it out, finish the [Installation](#Installation) section below, then hop > over to our [multimodal guide](docs/multimodal.md) to learn more. @@ -45,16 +49,16 @@ aliases. | Model | Mobile Friendly | Notes | |------------------|---|---------------------| -|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat` . 
Alias to `llama3.2-3b`.| +|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-3b`.| |[meta-llama/Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|✅|Best for `generate`. Alias to `llama3.2-3b-base`.| -|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification . Alias to `llama3-1b-guard`.| -|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.2-1b`.| +|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification. Alias to `llama3-1b-guard`.| +|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-1b`.| |[meta-llama/Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|✅|Best for `generate`. Alias to `llama3.2-1b-base`.| -|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat` . Alias to `llama3.2-11B`.| -|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate` . Alias to `llama3.2-11B-base`.| -|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.1`.| +|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat`. Alias to `llama3.2-11B`.| +|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate`. Alias to `llama3.2-11B-base`.| +|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.1`.| |[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)|✅|Best for `generate`. Alias to `llama3.1-base`.| -|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3`.| +|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat`. Alias to `llama3`.| |[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|✅|Best for `generate`. Alias to `llama3-base`.| |[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|✅|Tuned for `chat`. Alias to `llama2`.| |[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)||Tuned for `chat`. Alias to `llama2-13b-chat`.| @@ -69,6 +73,14 @@ aliases. |[tinyllamas/stories42M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories42M`.| |[tinyllamas/stories110M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories110M`.| |[openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b)|✅|Best for `generate`. 
Alias to `open-llama`.| +| [ibm-granite/granite-3b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) |✅| Alias to `granite-code` and `granite-code-3b`.| +| [ibm-granite/granite-8b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-8b-code-instruct-128k) |✅| Alias to `granite-code-8b`.| +| [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct) |✅| Alias to `granite3-2b` and `granite3`.| +| [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.| +| [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.| +| [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.| +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.| + ## Installation The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed. @@ -231,6 +243,8 @@ python3 torchchat.py server llama3.1 ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.1 & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests + In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. > [!NOTE] @@ -244,8 +258,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream **Example Input + Output** -[skip default]: begin - ``` curl http://127.0.0.1:5000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -265,12 +277,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \ ] }' ``` +[skip default]: begin ``` {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} ``` [skip default]: end +[shell default]: kill ${server_pid} @@ -332,7 +346,7 @@ torchchat/utils/scripts/build_native.sh aoti Then run the compiled executable, with the pt2. 
```bash -cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` ## Mobile Execution @@ -404,7 +418,7 @@ torchchat/utils/scripts/build_native.sh et Execute using the runner ```bash -cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` @@ -664,6 +678,6 @@ awesome libraries and tools you've built around local LLM inference. torchchat is released under the [BSD 3 license](LICENSE). (Additional code in this distribution is covered by the MIT and Apache Open Source -licenses.) However you may have other legal obligations that govern +licenses.) However, you may have other legal obligations that govern your use of content, such as the terms of service for third-party models. diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index 417a823f8..17958e790 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -1,27 +1,25 @@ > [!WARNING] > Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation. -# Torchchat is still in pre-release! - - -Torchchat is currently in a pre-release state and under extensive development. - # The Lost Manual: torchchat [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Download**](#download) | [**Chat**](#chat) | [**Generate**](#generate) | [**Eval**](#eval) | [**Export**](#export) | [**Supported Systems**](#supported-systems) | [**Contributing**](#contributing) | [**License**](#license) + -This is the advanced users guide, if you're looking to get started +This is the advanced users' guide, if you're looking to get started with LLMs, please refer to the README at the root directory of the torchchat distro. This is an advanced user guide, so we will have -many more concepts and options to discuss and taking advantage of them +many more concepts and options to discuss and take advantage of them may take some effort. We welcome community contributions of all kinds. If you find @@ -41,7 +39,7 @@ While we strive to support a broad range of models, we can't test them all. We classify supported models as tested ✅, work in progress 🚧 or some restrictions ❹. -We invite community contributions of new model suport and test results! +We invite community contributions of new model support and test results! | Model | Tested | Eager | torch.compile | AOT Inductor | ExecuTorch | Fits on Mobile | |-----|--------|-------|-----|-----|-----|-----| @@ -86,7 +84,7 @@ Server C++ runtime | n/a | run.cpp model.pte | ✅ | Mobile C++ runtime | n/a | app model.pte | ✅ | Mobile C++ runtime | n/a | app + AOTI | 🚧 | -**Getting help:** Each command implements the --help option to give addititonal information about available options: +**Getting help:** Each command implements the --help option to give additional information about available options: [skip default]: begin ``` @@ -96,8 +94,8 @@ python3 torchchat.py [ export | generate | chat | eval | ... 
 ] --help
 ```
 Exported models can be loaded back into torchchat for chat or text generation, letting you experiment with the exported model and valid
-model quality. The python interface is the same in all cases and is
-used for testing nad test harnesses too.
+model quality. The Python interface is the same in all cases and is
+used for testing and test harnesses, too.
 Torchchat comes with server C++ runtimes to execute AOT Inductor and ExecuTorch models. A mobile C++ runtimes allow you to deploy
@@ -115,7 +113,7 @@ Some common models are recognized by torchchat based on their filename through `Model.from_name()` to perform a fuzzy match against a table of known model architectures. Alternatively, you can specify the index into that table with the option `--params-table ${INDEX}` where
-the index is the lookup key key in the [the list of known
+the index is the lookup key in the [the list of known
 pconfigurations](https://github.com/pytorch/torchchat/tree/main/torchchat/model_params) For example, for the stories15M model, this would be expressed as `--params-table stories15M`. (We use the model constructor
@@ -237,7 +235,7 @@ which chooses the best 16-bit floating point type. The virtual device fast and virtual floating point data types fast and fast16 are best used for eager/torch.compiled execution. For export,
-specify the your device choice for the target system with --device for
+specify your device choice for the target system with --device for
 AOTI-exported DSO models, and using ExecuTorch delegate selection for ExecuTorch-exported PTE models.
@@ -250,8 +248,9 @@ python3 torchchat.py generate [--compile] --checkpoint-path ${MODEL_PATH} --prom To improve performance, you can compile the model with `--compile` trading off the time to first token processed with time per token. To improve performance further, you may also compile the prefill with
-`--compile_prefill`. This will increase further compilation times though. The
-`--compile-prefill` option is not compatible with `--prefill-prefill`.
+`--compile-prefill`. This will further increase compilation times, though.
+For CPU, you can use `--max-autotune` to further improve the performance
+with `--compile` and `--compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).
 Parallel prefill is not yet supported by exported models, and may be supported in a future release.
@@ -265,7 +264,7 @@ the introductory README. In addition to running eval on models in eager mode and JIT-compiled mode with `torch.compile()`, you can also load dso and pte models back into the PyTorch to evaluate the accuracy of exported model objects
-(e.g., after applying quantization or other traqnsformations to
+(e.g., after applying quantization or other transformations to
 improve speed or reduce model size). Loading exported models back into a Python-based Pytorch allows you to
@@ -297,14 +296,14 @@ for ExecuTorch.)
We export the stories15M model with the following command for execution with the ExecuTorch runtime (and enabling execution on a -wide range of community and vendor supported backends): +wide range of community and vendor-supported backends): ``` python3 torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_NAME}.pte ``` Alternatively, we may generate a native instruction stream binary -using AOT Inductor for CPU oor GPUs (the latter using Triton for +using AOT Inductor for CPU or GPUs (the latter using Triton for optimizations such as operator fusion): ``` @@ -319,10 +318,10 @@ the exported model artifact back into a model container with a compatible API surface for the `model.forward()` function. This enables users to test, evaluate and exercise the exported model artifact with familiar interfaces, and in conjunction with -pre-exiisting Python model unit tests and common environments such as +pre-existing Python model unit tests and common environments such as Jupyter notebooks and/or Google colab. -Here is how to load an exported model into the python environment on the example of using an exported model with `generate.oy`. +Here is how to load an exported model into the Python environment using an exported model with the `generate` command. ``` python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --pte-path ${MODEL_NAME}.pte --device cpu --prompt "Once upon a time" @@ -452,7 +451,7 @@ strategies: You can find instructions for quantizing models in [docs/quantization.md](file:///./quantization.md). Advantageously, quantization is available in eager mode as well as during export, -enabling you to do an early exploration of your quantization setttings +enabling you to do an early exploration of your quantization settings in eager mode. However, final accuracy should always be confirmed on the actual execution target, since all targets have different build processes, compilers, and kernel implementations with potentially @@ -464,9 +463,8 @@ significant impact on accuracy. ## Native (Stand-Alone) Execution of Exported Models -Refer to the [README](README.md] for an introduction toNative -execution on servers, desktops and laptops is described under -[runner-build.md]. Mobile and Edge executipon for Android and iOS are +Refer to the [README](README.md) for an introduction to native +execution on servers, desktops, and laptops. Mobile and Edge execution for Android and iOS are described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively. @@ -475,13 +473,13 @@ described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md PyTorch and ExecuTorch support a broad range of devices for running PyTorch with python (using either eager or eager + `torch.compile`) or -in a python-free environment with AOT Inductor and ExecuTorch. +in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | |-----|------|-----|-----|-----|-----| | x86 | Linux | ✅ | ✅ | ✅ | ✅ | -| aarch64 | Linux | n/t | n/t | n/t | n/t | +| aarch64 | Linux | ✅ | ✅ | ✅ | n/t | | aarch64 | macOS | ✅ | ✅ | ✅ | ✅ | | AMD GPU | Linux | ✅ | ✅ | ✅ | ❌| | Nvidia GPU | Linux | ✅ | ✅ | ✅ | ❌| @@ -492,65 +490,13 @@ in a python-free environment with AOT Inductor and ExecuTorch. 
| Mobile GPU (Vulkan) | Android | ❌|❌|❌| ✅ | | CoreML | iOS | ❌|❌|❌| ✅ | | Hexagon DSP | Android | ❌|❌|❌| ✅ | -| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t | ✅ | +| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ | ✅ | | Raspberry Pi 4/5 | Android | ❌ | ❌ | ❌ | n/t | | ARM 32b (up to v7) | any | ❌|❌|❌|❌| *Key*: n/t -- not tested -## Runtime performance with Llama 7B, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - -## Runtime performance with Llama3, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - - - -# CONTRIBUTING to torchchat - -We welcome any feature requests, bug reports, or pull requests from -the community. See the [CONTRIBUTING](CONTRIBUTING.md) for -instructions how to contribute to torchchat. - - - # LICENSE Torchchat is released under the [BSD 3 license](./LICENSE). However diff --git a/docs/distributed.md b/docs/distributed.md new file mode 100644 index 000000000..3d34d7672 --- /dev/null +++ b/docs/distributed.md @@ -0,0 +1,125 @@ +# Distributed Inference with torchchat + +torchchat supports distributed inference for large language models (LLMs) on GPUs seamlessly. +At present, torchchat supports distributed inference using Python only. + +## Installation +The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed. + +> [!TIP] +> torchchat uses the latest changes from various PyTorch projects so it's highly recommended that you use a venv (by using the commands below) or CONDA. + +[skip default]: begin +```bash +git clone https://github.com/pytorch/torchchat.git +cd torchchat +python3 -m venv .venv +source .venv/bin/activate +./install/install_requirements.sh +``` +[skip default]: end + +[shell default]: ./install/install_requirements.sh + +## Login to HF for Downloading Weights +Most models use Hugging Face as the distribution channel, so you will need to create a Hugging Face account. Create a Hugging Face user access token as documented here with the write role. 
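+
+If you are working in a non-interactive environment (for example CI), the token can also be
+supplied directly on the command line instead of via the interactive prompt. This is only a
+hedged sketch; `HF_TOKEN` is assumed to be whatever environment variable holds your access token:
+
+[skip default]: begin
+```
+huggingface-cli login --token "$HF_TOKEN"
+```
+[skip default]: end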
+
+Log into Hugging Face:
+
+[prefix default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}"
+
+```
+huggingface-cli login
+```
+
+## Enabling Distributed torchchat Inference
+
+To enable distributed inference, use the option `--distributed`. In addition, the `--tp` and `--pp` options
+allow users to specify the degree of each type of parallelism, where tp refers to tensor parallelism and pp to pipeline parallelism.
+
+
+## Generate Output with Distributed torchchat Inference
+
+To generate output using distributed inference with 4 GPUs, you can use:
+```
+python3 torchchat.py generate llama3.1 --distributed --tp 2 --pp 2 --prompt "write me a story about a boy and his bear"
+```
+
+
+## Chat with Distributed torchchat Inference
+
+This mode allows you to chat with an LLM in an interactive fashion with distributed inference. The following example uses 4 GPUs:
+
+[skip default]: begin
+```bash
+python3 torchchat.py chat llama3.1 --max-new-tokens 10 --distributed --tp 2 --pp 2
+```
+[skip default]: end
+
+
+## A Server with Distributed torchchat Inference
+
+This mode exposes a REST API for interacting with a model.
+The server follows the [OpenAI API specification](https://platform.openai.com/docs/api-reference/chat) for chat completions.
+
+To test out the REST API, **you'll need 2 terminals**: one to host the server, and one to send the request.
+
+In one terminal, start the server to run with 4 GPUs:
+
+[skip default]: begin
+
+```bash
+python3 torchchat.py server llama3.1 --distributed --tp 2 --pp 2
+```
+[skip default]: end
+
+
+In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.
+
+> [!NOTE]
+> Since this feature is under active development, not every parameter is consumed. See api/api.py for details on
+> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973).
+
+Example Query + +Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will await the full response from the server. + +**Example Input + Output** + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.1", + "stream": "true", + "max_tokens": 200, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ] + }' +``` +[skip default]: begin +``` +{"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} +``` + +[skip default]: end + + + +
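+
+**Scaling to more GPUs**
+
+The examples above assume 4 GPUs with `--tp 2 --pp 2`. As a sketch (assuming, as the examples
+suggest, that the product of the tensor-parallel and pipeline-parallel degrees should match the
+number of available GPUs), an 8-GPU host could instead use:
+
+[skip default]: begin
+```
+python3 torchchat.py generate llama3.1 --distributed --tp 4 --pp 2 --prompt "write me a story about a boy and his bear"
+```
+[skip default]: end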
+
+[end default]: end
diff --git a/docs/local-model.md b/docs/local-model.md
new file mode 100644
index 000000000..2d48e2438
--- /dev/null
+++ b/docs/local-model.md
@@ -0,0 +1,138 @@
+# Using Local Models in Torchchat
+Torchchat provides powerful capabilities for running large language models (LLMs) locally. This guide focuses on utilizing local copies of
+model checkpoints or models in GGUF format to create a chat application. It also highlights relevant options for advanced users.
+
+## Prerequisites
+To work with local models, you need:
+1. **Model Weights**: A checkpoint file (e.g., `.pth`, `.pt`) or a GGUF file (e.g., `.gguf`).
+2. **Tokenizer**: A tokenizer model file. This can either be in SentencePiece or TikToken format, depending on the tokenizer used with the model.
+3. **Parameter File**: (a) A custom parameter file in JSON format, or (b) a pre-existing parameter file with `--params-path`
+   or `--params-table`, or (c) a pathname that's matched against known models by longest substring in configuration name, using the same algorithm as GPT-fast.
+
+Ensure the tokenizer and parameter files are in the same directory as the checkpoint or GGUF file for automatic detection.
+Let's use a local download of the stories15M tinyllama model as an example:
+
+```
+mkdir stories15M
+cd stories15M
+wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+wget https://github.com/karpathy/llama2.c/raw/refs/heads/master/tokenizer.model
+cp ../torchchat/model_params/stories15M.json model.json
+cd ..
+```
+
+
+## Using Local Checkpoints
+Torchchat provides the CLI flag `--checkpoint-path` for specifying local model weights. The tokenizer is
+loaded from the same directory as the checkpoint with the name `tokenizer.model` unless separately specified.
+This example obtains the model parameters by name matching to known models because `stories15M` is one of the
+models known to torchchat, with a configuration stored in `torchchat/model_params`:
+
+
+### Example 1: Basic Text Generation
+
+
+```
+python3 torchchat.py generate \
+  --checkpoint-path stories15M/stories15M.pt \
+  --prompt "Hello, my name is"
+```
+
+
+### Example 2: Providing Additional Artifacts
+The following is an example of how to specify a local model checkpoint, the model architecture, and a tokenizer file:
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-path stories15M/model.json \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+Alternatively, we can specify the known architecture configuration for known models using `--params-table`
+to specify a particular architecture in `torchchat/model_params`:
+
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-table stories15M \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+## Using GGUF Models
+Torchchat supports loading models in GGUF format using the `--gguf-file` option. Refer to GGUF.md for additional
+documentation about using GGUF files in torchchat.
+
+The GGUF format is compatible with several quantization levels such as F16, F32, Q4_0, and Q6_K. Model
+configuration information is obtained directly from the GGUF file, simplifying setup and obviating the
+need for a separate `model.json` model architecture specification.
+
+
+## Using Local Models with Other Commands
+Torchchat supports all commands, such as chat, browser, server, and export, with local models. (In fact,
+known models simply download and populate the parameters described here for local models.)
+Here is an example setup for running a server with a local model:
+
+
+[skip default]: begin
+```
+python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt
+```
+[skip default]: end
+
+
+[shell default]: python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests
+
+
+In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.
+
+
+> [!NOTE]
+> Since this feature is under active development, not every parameter is consumed. See api/api.py for details on
+> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973).
+
+
+
+Example Query
+Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will
+await the full response from the server.
+
+
+**Example: using the server**
+A model server used with a local model works like any other torchchat server. You can test it by sending a request with `curl`:
+```
+curl http://127.0.0.1:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama3.1",
+    "stream": "true",
+    "max_tokens": 200,
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Hello!"
+      }
+    ]
+  }'
+```
+
+
+[shell default]: kill ${server_pid}
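+
+**Example: exporting a local model**
+As a brief sketch of the export path mentioned above, the same local checkpoint can be exported
+for the ExecuTorch runtime. This mirrors the `--output-pte-path` usage shown elsewhere in the
+docs and is assumed to apply unchanged to local checkpoints:
+
+[skip default]: begin
+```
+python3 torchchat.py export \
+  --checkpoint-path stories15M/stories15M.pt \
+  --output-pte-path stories15M.pte
+```
+[skip default]: end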
+ + +For more information about using different commands, see the root README.md and refer to the Advanced Users Guide for further details on advanced configurations and parameter tuning. + + +[end default]: end diff --git a/docs/model_customization.md b/docs/model_customization.md index 3c076fa71..7108b4ce2 100644 --- a/docs/model_customization.md +++ b/docs/model_customization.md @@ -34,6 +34,9 @@ prefill with `--compile_prefill`. To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/ +For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `compile-prefill`. + +See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html). ## Model Precision diff --git a/docs/multimodal.md b/docs/multimodal.md index f3e3f0fe2..cd249a1fb 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -14,9 +14,11 @@ This page goes over the different commands you can run with LLama 3.2 11B Vision While we strongly encourage you to use the Hugging Face checkpoint (which is the default for torchchat when utilizing the commands with the argument `llama3.2-11B`), we also provide support for manually providing the checkpoint. This can be done by replacing the `llama3.2-11B` argument in the commands below with the following: +[skip default]: begin ``` --checkpoint-path --tokenizer-path --params-path torchchat/model_params/Llama-3.2-11B-Vision.json ``` +[skip default]: end ## Generation This generates text output based on a text prompt and (optional) image prompt. @@ -39,6 +41,9 @@ python3 torchchat.py server llama3.2-11B ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.2-11B & server_pid=$! + + In another terminal, query the server using `curl`. This query might take a few minutes to respond.
@@ -71,10 +76,13 @@ curl http://127.0.0.1:5000/v1/chat/completions \ "max_tokens": 300 }' ``` - +[skip default]: begin ``` {"id": "chatcmpl-cb7b39af-a22e-4f71-94a8-17753fa0d00c", "choices": [{"message": {"role": "assistant", "content": "The image depicts a simple black and white cartoon-style drawing of an animal face. It features a profile view, complete with two ears, expressive eyes, and a partial snout. The animal looks to the left, with its eye and mouth implied, suggesting that the drawn face might belong to a rabbit, dog, or pig. The graphic face has a bold black outline and a smaller, solid black nose. A small circle, forming part of the face, has a white background with two black quirkly short and long curved lines forming an outline of what was likely a mouth, complete with two teeth. The presence of the curve lines give the impression that the animal is smiling or speaking. Grey and black shadows behind the right ear and mouth suggest that this face is looking left and upwards. Given the prominent outline of the head and the outline of the nose, it appears that the depicted face is most likely from the side profile of a pig, although the ears make it seem like a dog and the shape of the nose makes it seem like a rabbit. Overall, it seems that this image, possibly part of a character illustration, is conveying a playful or expressive mood through its design and positioning."}, "finish_reason": "stop"}], "created": 1727487574, "model": "llama3.2", "system_fingerprint": "cpu_torch.float16", "object": "chat.completion"}% ``` +[skip default]: end + +[shell default]: kill ${server_pid}
@@ -90,6 +98,8 @@ First, follow the steps in the Server section above to start a local server. The streamlit run torchchat/usages/browser.py ``` +[skip default]: end + --- # Future Work diff --git a/docs/native-execution.md b/docs/native-execution.md index 790547e21..c22d3c3ba 100644 --- a/docs/native-execution.md +++ b/docs/native-execution.md @@ -16,14 +16,14 @@ The 'llama runner' is a native standalone application capable of running a model exported and compiled ahead-of-time with either Executorch (ET) or AOT Inductor (AOTI). Which model format to use depends on your requirements and preferences. Executorch models are -optimized for portability across a range of decices, including mobile +optimized for portability across a range of devices, including mobile and edge devices. AOT Inductor models are optimized for a particular target architecture, which may result in better performance and efficiency. Building the runners is straightforward with the included cmake build files and is covered in the next sections. We will showcase the -runners using ~~stories15M~~ llama2 7B and llama3. +runners using llama2 7B and llama3. ## What can you do with torchchat's llama runner for native execution? @@ -160,7 +160,7 @@ and native execution environments, respectively. After exporting a model, you will want to verify that the model delivers output of high quality, and works as expected. Both can be -achieved with the Python environment. All torchchat Python comands +achieved with the Python environment. All torchchat Python commands can work with exported models. Instead of loading the model from a checkpoint or GGUF file, use the `--dso-path model.so` and `--pte-path model.pte` for loading both types of exported models. This diff --git a/docs/quantization.md b/docs/quantization.md index 3415d8cb8..56fd2182e 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -59,7 +59,7 @@ for valid `bitwidth` and `groupsize` values. | linear with dynamic activations (symmetric) | `'{"linear:a8w4dq" : {"groupsize" : }}'`| | embedding | `'{"embedding": {"bitwidth": , "groupsize":}}'` | -See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/main/torchchat/utils/quantize.py#L1260-L1266). +See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/b809b69e03f8f4b75a4b27b0778f0d3695ce94c2/torchchat/utils/quantize.py#L887-L894). In addition to quantization, the [accelerator](model_customization.md#device) and [precision](model_customization.md#model-precision) can also be specified. @@ -142,7 +142,7 @@ To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental From the torchchat root directory, run ``` -sh torchchat/utils/scripts/build_torchao_ops.sh +bash torchchat/utils/scripts/build_torchao_ops.sh ``` This should take about 10 seconds to complete. @@ -150,14 +150,14 @@ This should take about 10 seconds to complete. Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners. 
``` -sh torchchat/utils/scripts/build_native.sh aoti link_torchao_ops +bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops ``` ``` -sh torchchat/utils/scripts/build_native.sh et link_torchao_ops +bash torchchat/utils/scripts/build_native.sh et link_torchao_ops ``` -Note before running `sh torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `sh torchchat/utils/scripts/install_et.sh` if you have not done so already. +Note before running `bash torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `bash torchchat/utils/scripts/install_et.sh` if you have not done so already. ### Examples @@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: ``` -OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3 ``` #### ExecuTorch @@ -193,7 +193,33 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command. ``` -./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time," +``` + +## Experimental TorchAO MPS lowbit kernels + +WARNING: These kernels only work on devices with Apple Silicon. + +### Use + +#### linear:afpwx +The quantization scheme linear:afpwx quantizes only the weights in a groupwise manner with a specified bitwidth and groupsize. +It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize (32, 64, 128, 256). + +### Setup +To use linear:afpwx, you must set up the torchao mps experimental kernels. These will only work on device with Apple Silicon. +Currently, torchchat can only run them on Eager mode. 
+ +From the torchchat root directory, run +``` +bash torchchat/utils/scripts/build_torchao_ops.sh mps +``` + +### Examples + +#### Eager mode +``` +python3 torchchat.py generate stories110M --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 4, "groupsize": 256}}' --prompt "Once upon a time," --num-samples 5 ``` ## Quantization Profiles diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index e61fae3a5..e79e9c341 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -72b3bb3194c611f7c4861e6f3b24af5de868af72 +9c043290ad3944268290e015c3063bc411e6ef6b diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 40f083249..2da70769c 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -c8f1174a06dcc0102849c8348ca6573bde8847a9 +2e032c6b0de960dee554dcb08126ace718b14c6d diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 6344509d8..360ba1801 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -9,36 +9,40 @@ set -eou pipefail # Install required python dependencies for developing # Dependencies are defined in .pyproject.toml -PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python} -if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; +if [ -z "${PYTHON_EXECUTABLE:-}" ]; then - PYTHON_EXECUTABLE=python3 + if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; + then + PYTHON_EXECUTABLE=python3 + else + PYTHON_EXECUTABLE=python + fi fi - -# Check python version. Expect 3.10.x or 3.11.x -printf "import sys\nif sys.version_info.major != 3 or sys.version_info.minor < 10 :\n\tprint('Please use Python >=3.10');sys.exit(1)\n" | $PYTHON_EXECUTABLE -if [[ $? -ne 0 ]] +echo "Using python executable: $PYTHON_EXECUTABLE" + +PYTHON_SYS_VERSION="$($PYTHON_EXECUTABLE -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")" +# Check python version. Expect at least 3.10.x +if ! $PYTHON_EXECUTABLE -c " +import sys +if sys.version_info < (3, 10): + sys.exit(1) +"; then + echo "Python version must be at least 3.10.x. Detected version: $PYTHON_SYS_VERSION" exit 1 fi if [[ "$PYTHON_EXECUTABLE" == "python" ]]; then PIP_EXECUTABLE=pip -else +elif [[ "$PYTHON_EXECUTABLE" == "python3" ]]; +then PIP_EXECUTABLE=pip3 +else + PIP_EXECUTABLE=pip${PYTHON_SYS_VERSION} fi -# -# First install requirements in install/requirements.txt. Older torch may be -# installed from the dependency of other models. It will be overridden by -# newer version of torch nightly installed later in this script. -# - -( - set -x - $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -) +echo "Using pip executable: $PIP_EXECUTABLE" # Since torchchat often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. The PYTORCH_NIGHTLY_VERSION value should @@ -47,38 +51,60 @@ fi # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. 
-PYTORCH_NIGHTLY_VERSION=dev20241002 +PYTORCH_NIGHTLY_VERSION=dev20250124 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241002 +VISION_NIGHTLY_VERSION=dev20250124 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241010 - -# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same -( - set -x - $PIP_EXECUTABLE uninstall -y triton -) +TUNE_NIGHTLY_VERSION=dev20250124 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly # with cuda for faster execution on cuda GPUs. if [[ -x "$(command -v nvidia-smi)" ]]; then - TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121" + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu124" elif [[ -x "$(command -v rocminfo)" ]]; then TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2" +elif [[ -x "$(command -v xpu-smi)" ]]; +then + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/xpu" else TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" fi # pip packages needed by exir. -REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" - torchvision=="0.20.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.4.0.${TUNE_NIGHTLY_VERSION}" +if [[ -x "$(command -v xpu-smi)" ]]; +then + REQUIREMENTS_TO_INSTALL=( + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.6.0" + ) +else + REQUIREMENTS_TO_INSTALL=( + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}" + ) +fi + +# +# First install requirements in install/requirements.txt. Older torch may be +# installed from the dependency of other models. It will be overridden by +# newer version of torch nightly installed later in this script. +# +( + set -x + $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url "${TORCH_NIGHTLY_URL}" +) + +# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same +( + set -x + $PIP_EXECUTABLE uninstall -y triton ) # Install the requirements. --extra-index-url tells pip to look for package @@ -89,9 +115,11 @@ REQUIREMENTS_TO_INSTALL=( "${REQUIREMENTS_TO_INSTALL[@]}" ) +# For torchao need to install from github since nightly build doesn't have macos build. 
+# TODO: Remove this and install nightly build, once it supports macos ( set -x - $PIP_EXECUTABLE install torchao=="0.5.0" + $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d ) if [[ -x "$(command -v nvidia-smi)" ]]; then @@ -100,8 +128,6 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then $PYTHON_EXECUTABLE torchchat/utils/scripts/patch_triton.py ) fi - - ( set -x $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" diff --git a/install/requirements.txt b/install/requirements.txt index d051d29cd..bd1e09174 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -9,12 +9,14 @@ gguf # Tiktoken tokenizer for Llama 3 and other advanced models tiktoken +# Tokenizers and jinja2 for other non-llama models that use HF tokenizers +tokenizers +jinja2 + # Miscellaneous snakeviz sentencepiece -# numpy version range required by GGUF util -numpy >= 1.17, < 2.0 -gguf +numpy >= 1.17 blobfile tomli >= 1.1.0 ; python_version < "3.11" openai diff --git a/runner/run.cpp b/runner/run.cpp index abfbb4584..d64c636bb 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -7,20 +7,21 @@ LICENSE file in the root directory of this source tree. */ /* Inference for Llama-2 Transformer model in pure C++ */ +#include "sentencepiece.h" +#include "tiktoken.h" +#include +#include +#include +#include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include #include - +#include #ifdef DEBUG #include #include @@ -47,13 +48,25 @@ torch::Device aoti_device(torch::kCPU); #endif using exec_aten::ScalarType; -using torch::executor::EValue; -using executorch::extension::TensorPtr; using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using torch::executor::EValue; using torch::executor::Module; using torch::executor::Result; #endif +using tokenizers::SPTokenizer; +using tokenizers::Tiktoken; +using tokenizers::Tokenizer; + +#define UNWRAP(x) \ + ({ \ + if (!(x).ok()) { \ + fprintf(stderr, "Got error code % " PRIu32, x.error()); \ + exit(EXIT_FAILURE); \ + } \ + std::move(x.get()); \ + }) // ---------------------------------------------------------------------------- // Transformer model @@ -65,56 +78,57 @@ enum ModelType { ModelType get_model_type(int model_int) { switch (model_int) { - case 2: - return LLAMA2_MODEL; - break; - case 3: - return LLAMA3_MODEL; - break; - default: - return UNKNOWN_MODEL; + case 2: + return LLAMA2_MODEL; + break; + case 3: + return LLAMA3_MODEL; + break; + default: + return UNKNOWN_MODEL; } } typedef struct { int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length + int seq_len; // max sequence length } Config; typedef struct { - float* logits; // output logits - int64_t* toks; // tokens seen so far; no kv-cache :( + float *logits; // output logits + int64_t *toks; // tokens seen so far; no kv-cache :( } RunState; typedef struct { - Config config; // the hyperparameters of the architecture (the blueprint) + Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass + std::unordered_map metadata; #ifdef __AOTI_MODEL__ - torch::inductor::AOTIModelPackageLoader* runner; + torch::inductor::AOTIModelPackageLoader *runner; #else // __ET_MODEL__ - Module* runner; + Module *runner; #endif } Transformer; -void malloc_run_state(RunState* s, Config* p) { +void malloc_run_state(RunState *s, Config 
*p) { // we calloc instead of malloc to keep valgrind happy - s->logits = (float*)calloc(p->vocab_size, sizeof(float)); - s->toks = (int64_t*)calloc(p->seq_len, sizeof(int64_t)); + s->logits = (float *)calloc(p->vocab_size, sizeof(float)); + s->toks = (int64_t *)calloc(p->seq_len, sizeof(int64_t)); if (!s->logits || !s->toks) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); } } -void free_run_state(RunState* s) { +void free_run_state(RunState *s) { free(s->logits); free(s->toks); } -void read_checkpoint(char* checkpoint, Config* config) { - FILE* file = fopen(checkpoint, "rb"); +void read_checkpoint(char *checkpoint, Config *config) { + FILE *file = fopen(checkpoint, "rb"); if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); @@ -128,21 +142,9 @@ void read_checkpoint(char* checkpoint, Config* config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer( - Transformer* t, - char* model_path, - int vocab_size, - int seq_len) { - // read in the Config and the Weights from the model - // read_checkpoint(model_path, &t->config); - // allocate the RunState buffers - t->config.vocab_size = vocab_size; - t->config.seq_len = seq_len; - malloc_run_state(&t->state, &t->config); - +void build_transformer(Transformer *t, char *model_path) { #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -150,7 +152,7 @@ void build_transformer( #endif } -void free_transformer(Transformer* t) { +void free_transformer(Transformer *t) { // free the RunState buffers free_run_state(&t->state); delete t->runner; @@ -159,7 +161,7 @@ void free_transformer(Transformer* t) { // ---------------------------------------------------------------------------- // neural net blocks; the dynamics of the Transformer -void softmax(float* x, int size) { +void softmax(float *x, int size) { // find max value (for numerical stability) float max_val = x[0]; for (int i = 1; i < size; i++) { @@ -179,9 +181,9 @@ void softmax(float* x, int size) { } } -float* forward(Transformer* transformer, int token, int pos) { - Config* p = &transformer->config; - RunState* s = &transformer->state; +float *forward(Transformer *transformer, int token, int pos) { + Config *p = &transformer->config; + RunState *s = &transformer->state; s->toks[pos] = token; long token_buffer[1] = {token}; long pos_buffer[1] = {pos}; @@ -194,8 +196,8 @@ float* forward(Transformer* transformer, int token, int pos) { torch::Tensor token_tensor = torch::from_blob(token_buffer, {1, 1}, torch::kLong); torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong); - std::vector inputs{ - token_tensor.to(aoti_device), pos_tensor.to(aoti_device)}; + std::vector inputs{token_tensor.to(aoti_device), + pos_tensor.to(aoti_device)}; torch::Tensor result = transformer->runner->run(inputs)[0] .to(torch::dtype(torch::kFloat32)) @@ -204,7 +206,8 @@ float* forward(Transformer* transformer, int token, int pos) { memcpy(s->logits, logits, p->vocab_size * sizeof(float)); #else // __ET_MODEL__ TensorPtr pos_managed = make_tensor_ptr({1}, pos_buffer, ScalarType::Long); - TensorPtr tokens_managed = make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); + TensorPtr tokens_managed = + make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); std::vector inputs; auto tmp1 = 
EValue(tokens_managed); auto tmp2 = EValue(pos_managed); @@ -221,17 +224,12 @@ float* forward(Transformer* transformer, int token, int pos) { // HACK: the rest of this runner assumes that logits must be float, // so we simply convert them rather than plumbing // templating/switch-on-type through the rest of this file. - const auto& result_tensor = result[0].toTensor(); + const auto &result_tensor = result[0].toTensor(); ET_SWITCH_REALHBBF16_TYPES( - result_tensor.scalar_type(), - unused, - "forward", - CTYPE, - [&]() { - const CTYPE* logits = result_tensor.const_data_ptr(); - std::transform(logits, logits + p->vocab_size, s->logits, [](auto x) { - return static_cast(x); - }); + result_tensor.scalar_type(), unused, "forward", CTYPE, [&]() { + const CTYPE *logits = result_tensor.const_data_ptr(); + std::transform(logits, logits + p->vocab_size, s->logits, + [](auto x) { return static_cast(x); }); }); #endif @@ -249,13 +247,13 @@ typedef struct { typedef struct { int vocab_size; - ProbIndex* probindex; // buffer used in top-p sampling + ProbIndex *probindex; // buffer used in top-p sampling float temperature; float topp; unsigned long long rng_state; } Sampler; -int sample_argmax(float* probabilities, int n) { +int sample_argmax(float *probabilities, int n) { // return the index that has the highest probability int max_i = 0; float max_p = probabilities[0]; @@ -268,7 +266,7 @@ int sample_argmax(float* probabilities, int n) { return max_i; } -int sample_mult(float* probabilities, int n, float coin) { +int sample_mult(float *probabilities, int n, float coin) { // sample index from probabilities (they must sum to 1!) // coin is a random number in [0, 1), usually from random_f32() float cdf = 0.0f; @@ -281,9 +279,9 @@ int sample_mult(float* probabilities, int n, float coin) { return n - 1; // in case of rounding errors } -int compare(const void* a, const void* b) { - ProbIndex* a_ = (ProbIndex*)a; - ProbIndex* b_ = (ProbIndex*)b; +int compare(const void *a, const void *b) { + ProbIndex *a_ = (ProbIndex *)a; + ProbIndex *b_ = (ProbIndex *)b; if (a_->prob > b_->prob) return -1; if (a_->prob < b_->prob) @@ -291,12 +289,8 @@ int compare(const void* a, const void* b) { return 0; } -int sample_topp( - float* probabilities, - int n, - float topp, - ProbIndex* probindex, - float coin) { +int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, + float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of // tokens that exceed probability topp. This way we never sample tokens that // have very low probabilities and are less likely to go "off the rails". 
@@ -339,37 +333,31 @@ int sample_topp( return probindex[last_idx].index; // in case of rounding errors } -void build_sampler( - Sampler* sampler, - int vocab_size, - float temperature, - float topp, - unsigned long long rng_seed) { +void build_sampler(Sampler *sampler, int vocab_size, float temperature, + float topp, unsigned long long rng_seed) { sampler->vocab_size = vocab_size; sampler->temperature = temperature; sampler->topp = topp; sampler->rng_state = rng_seed; // buffer only used with nucleus sampling; may not need but it's ~small sampler->probindex = - (ProbIndex*)malloc(sampler->vocab_size * sizeof(ProbIndex)); + (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex)); } -void free_sampler(Sampler* sampler) { - free(sampler->probindex); -} +void free_sampler(Sampler *sampler) { free(sampler->probindex); } -unsigned int random_u32(unsigned long long* state) { +unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long* state) { // random float32 in [0,1) +float random_f32(unsigned long long *state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } -int sample(Sampler* sampler, float* logits) { +int sample(Sampler *sampler, float *logits) { // sample the token given the logits and some hyperparameters int next; if (sampler->temperature == 0.0f) { @@ -390,39 +378,37 @@ int sample(Sampler* sampler, float* logits) { next = sample_mult(logits, sampler->vocab_size, coin); } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - next = sample_topp( - logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + next = sample_topp(logits, sampler->vocab_size, sampler->topp, + sampler->probindex, coin); } } return next; } -Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) { - Tokenizer* tokenizer = NULL; +Tokenizer *build_tokenizer(const char *tokenizer_path, ModelType model_type) { + Tokenizer *tokenizer = NULL; switch (model_type) { - case LLAMA2_MODEL: - tokenizer = new SPTokenizer(); - tokenizer->load(tokenizer_path); - break; - case LLAMA3_MODEL: - tokenizer = new Tiktoken(); - tokenizer->load(tokenizer_path); - break; - default: - fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + tokenizer = new SPTokenizer(); + tokenizer->load(tokenizer_path); + break; + case LLAMA3_MODEL: + tokenizer = new Tiktoken(); + tokenizer->load(tokenizer_path); + break; + default: + fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); + exit(EXIT_FAILURE); } return tokenizer; } -void free_tokenizer(Tokenizer* tokenizer) { - delete tokenizer; -} +void free_tokenizer(Tokenizer *tokenizer) { delete tokenizer; } // ---------------------------------------------------------------------------- // utilities: time -void safe_printf(const char* piece) { +void safe_printf(const char *piece) { // piece might be a raw byte token, and we only want to print printable chars // or whitespace because some of the other bytes can be various control codes, // backspace, etc. @@ -454,21 +440,18 @@ long time_in_ms() { // Prints decoded tokens generated from the transformer. 
// The first token is not printed and is assumed to be a BOS or other similar // token -unsigned generate_from_prompt_tokens( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const std::vector& prompt_tokens, - unsigned pos, - const std::vector& stop_tokens, - int stop_pos, - bool print_prompt, - bool print_tok_per_sec) { +unsigned generate_from_prompt_tokens(Transformer *transformer, + Tokenizer *tokenizer, Sampler *sampler, + const std::vector &prompt_tokens, + unsigned pos, + const std::vector &stop_tokens, + int stop_pos, bool print_prompt, + bool print_tok_per_sec) { if (prompt_tokens.size() == 0) { return pos; } - uint64_t next; // will store the next token in the sequence + uint64_t next; // will store the next token in the sequence uint64_t token; // stores the current token to feed into the transformer bool done_with_prompt; // whether we are done processing prompt @@ -486,7 +469,7 @@ unsigned generate_from_prompt_tokens( if (pos_in_prompt < prompt_tokens.size()) { // Token comes from prompt token = prompt_tokens[pos_in_prompt++]; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); // Next token is either from prompt or if on last // prompt token, next is sampled @@ -498,29 +481,27 @@ unsigned generate_from_prompt_tokens( } else { // Token comes from next sampled from previous round. token = next; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); next = sample(sampler, logits); } done_with_prompt = (pos_in_prompt >= prompt_tokens.size()); // we terminate on finding the stop_token if we are done processing the // prompt (stop_tokens in the prompt do not terminate the loop) - if (done_with_prompt && - (std::find(stop_tokens.begin(), stop_tokens.end(), token) != - stop_tokens.end())) { + if (done_with_prompt && (std::find(stop_tokens.begin(), stop_tokens.end(), + token) != stop_tokens.end())) { found_stop_token = true; } // We print next in each iteration of the loop, not token if (!found_stop_token && (print_prompt || done_with_prompt)) { // The stop_token is printed as newline - bool next_is_stop = - std::find(stop_tokens.begin(), stop_tokens.end(), next) != - stop_tokens.end(); + bool next_is_stop = std::find(stop_tokens.begin(), stop_tokens.end(), + next) != stop_tokens.end(); if (next_is_stop) { printf("\n"); } else { - std::string piece = tokenizer->decode(token, next); + std::string piece = UNWRAP(tokenizer->decode(token, next)); safe_printf(piece.c_str()); // same as printf("%s", piece), but skips // "unsafe" bytes fflush(stdout); @@ -538,23 +519,16 @@ unsigned generate_from_prompt_tokens( // iteration) if (print_tok_per_sec && pos > 1) { long end = time_in_ms(); - fprintf( - stderr, - "\n\nachieved tok/s: %f\n", - (pos - 1) / (double)(end - start) * 1000); + fprintf(stderr, "\n\nachieved tok/s: %f\n", + (pos - 1) / (double)(end - start) * 1000); } return pos; } -void generate( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* prompt, - int steps, - ModelType model_type) { - const char* default_prompt = "Once upon a time"; +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *prompt, int steps, ModelType model_type) { + const char *default_prompt = "Once upon a time"; if (prompt == NULL) { prompt = default_prompt; } @@ -566,33 +540,30 @@ void generate( std::vector prompt_tokens; std::vector stop_tokens; switch (model_type) { - case LLAMA2_MODEL: - prompt_tokens = 
tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->eos_tok()); - break; - case LLAMA3_MODEL: - prompt_tokens = tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]); - stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]); - break; - default: - fprintf(stderr, "Generate does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); - } - - generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - /*pos=*/0, - /*stop_tokens=*/stop_tokens, - /*stop_pos=*/steps - 1, - /*print_prompt=*/true, - /*print_tok_per_sec=*/true); + case LLAMA2_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back(tokenizer->eos_tok()); + break; + case LLAMA3_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back( + UNWRAP(tokenizer->encode("<|end_of_text|>", 0, 0))[0]); + stop_tokens.push_back(UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]); + break; + default: + fprintf(stderr, "Generate does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); + } + + generate_from_prompt_tokens(transformer, tokenizer, sampler, prompt_tokens, + /*pos=*/0, + /*stop_tokens=*/stop_tokens, + /*stop_pos=*/steps - 1, + /*print_prompt=*/true, + /*print_tok_per_sec=*/true); } -void read_stdin(const char* guide, char* buffer, size_t bufsize) { +void read_stdin(const char *guide, char *buffer, size_t bufsize) { // read a line from stdin, up to but not including \n printf("%s", guide); if (fgets(buffer, bufsize, stdin) != NULL) { @@ -609,11 +580,10 @@ void read_stdin(const char* guide, char* buffer, size_t bufsize) { // python reference and that seemed ok, but this was not thoroughly tested and // is not safely implemented, it's more a proof of concept atm. 
-std::vector get_initial_prompt_tokens( - const char* cli_system_prompt, - const char* cli_user_prompt, - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_initial_prompt_tokens(const char *cli_system_prompt, + const char *cli_user_prompt, + Tokenizer *tokenizer, + ModelType model_type) { char system_prompt[512]; char user_prompt[512]; char rendered_prompt[512 * 2 + 200]; // the prompt template is ~170 @@ -622,10 +592,8 @@ std::vector get_initial_prompt_tokens( if (cli_system_prompt != NULL) { strcpy(system_prompt, cli_system_prompt); } else { - read_stdin( - "Enter system prompt (optional): ", - system_prompt, - sizeof(system_prompt)); + read_stdin("Enter system prompt (optional): ", system_prompt, + sizeof(system_prompt)); } if (cli_user_prompt != NULL) { @@ -637,48 +605,40 @@ std::vector get_initial_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] <>\n%s\n<>\n\n%s [/INST]", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - } + case LLAMA2_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] <>\n%s\n<>\n\n%s [/INST]", system_prompt, + user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", user_prompt); + } - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, 1, 0); - break; - - case LLAMA3_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - } - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 1, 0)); + break; + + case LLAMA3_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + "\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<" + "|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + system_prompt, user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%" + "s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + } + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -695,9 +655,8 @@ std::vector get_initial_prompt_tokens( return tokens; } -std::vector get_next_user_prompt_tokens( - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_next_user_prompt_tokens(Tokenizer 
*tokenizer, + ModelType model_type) { char user_prompt[512]; char rendered_prompt[512 + 150]; // the prompt template is ~100 characters. We // use 150 to be safe. @@ -706,30 +665,26 @@ std::vector get_next_user_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0); - break; - - case LLAMA3_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; - - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, "[INST] %s [/INST]", + user_prompt); + + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0)); + break; + + case LLAMA3_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_" + "header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; + + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -746,14 +701,9 @@ std::vector get_next_user_prompt_tokens( return tokens; } -void chat( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* cli_user_prompt, - const char* cli_system_prompt, - unsigned steps, - ModelType model_type) { +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *cli_user_prompt, const char *cli_system_prompt, + unsigned steps, ModelType model_type) { if (steps == 0) { return; } @@ -761,16 +711,16 @@ void chat( uint64_t eot_token; std::vector prompt_tokens; switch (model_type) { - case LLAMA2_MODEL: - // llama2 uses EOS as EOT token - eot_token = tokenizer->eos_tok(); - break; - case LLAMA3_MODEL: - eot_token = tokenizer->encode("<|eot_id|>", 0, 0)[0]; - break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + // llama2 uses EOS as EOT token + eot_token = tokenizer->eos_tok(); + break; + case LLAMA3_MODEL: + eot_token = UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]; + break; + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } std::vector stop_tokens{eot_token}; @@ -784,11 +734,7 @@ void chat( } printf("Assistant: "); pos = generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - pos, + transformer, tokenizer, sampler, prompt_tokens, pos, /*stop_tokens=*/stop_tokens, /*stop_pos=*/steps - 1, // We could pass in -1 here if we do not want // the model to stop mid-reply @@ -803,46 +749,40 @@ void chat( void error_usage() { fprintf(stderr, "Usage: run [options]\n"); - fprintf( - stderr, "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, + "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); fprintf(stderr, 
"Options:\n"); fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); - fprintf( - stderr, - " -p p value in top-p (nucleus) sampling in [0,1], default 0.9\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1], " + "default 0.9\n"); fprintf(stderr, " -s random seed, default time(NULL)\n"); - fprintf( - stderr, - " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = " + "max_seq_len\n"); fprintf(stderr, " -i input prompt\n"); fprintf(stderr, " -z path to tokenizer\n"); fprintf(stderr, " -m mode: generate|chat, default: generate\n"); fprintf(stderr, " -y (optional) system prompt in chat mode\n"); - fprintf( - stderr, - " -v (optional) vocab size, default is model-specific.\n"); - fprintf( - stderr, " -l (optional) llama version (2 or 3), default 2.\n"); - fprintf( - stderr, - " -d (optional) device(CUDA or CPU) model was exported for\n"); + fprintf(stderr, + " -v (optional) vocab size, default is model-specific.\n"); + fprintf(stderr, + " -l (optional) llama version (2 or 3), default 2.\n"); exit(EXIT_FAILURE); } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // default parameters - char* model_path = NULL; - char* tokenizer_path = NULL; + char *model_path = NULL; + char *tokenizer_path = NULL; float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, // but slower - int steps = 128; // number of steps to run for - const char* prompt = NULL; // prompt string + int steps = 128; // number of steps to run for + const char *prompt = NULL; // prompt string unsigned long long rng_seed = 0; // seed rng with time by default - const char* mode = "generate"; // generate|chat - char* system_prompt = + const char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode int vocab_size = -1; @@ -863,64 +803,53 @@ int main(int argc, char* argv[]) { } else { error_usage(); } - for (int i = 2; i < argc; i += 2) { + for (int i = 2; i < argc; i += 1) { // do some basic validation - if (i + 1 >= argc) { - error_usage(); - } // must have arg after flag + char *parm = argv[i+1]; + // uniarg means the arg comes right after the letter in accordance with posix + int uniarg = strlen(argv[i]) > 2; + if (argv[i][0] != '-') { error_usage(); } // must start with dash - if (strlen(argv[i]) != 2) { + + if (strlen(argv[i]) < 2) { + error_usage(); + } // must have at least dash '-' and option letter + + if (uniarg) { + parm=&argv[i][2]; + } else if (i + 1 >= argc) { error_usage(); - } // must be -x (one dash, one letter) + } // must have arg after option if flag is not contiguous to option + // read in the args if (argv[i][1] == 't') { - temperature = atof(argv[i + 1]); + temperature = atof(parm); } else if (argv[i][1] == 'p') { - topp = atof(argv[i + 1]); + topp = atof(parm); } else if (argv[i][1] == 's') { - rng_seed = atoi(argv[i + 1]); + rng_seed = atoi(parm); } else if (argv[i][1] == 'n') { - steps = atoi(argv[i + 1]); + steps = atoi(parm); } else if (argv[i][1] == 'v') { - vocab_size = atoi(argv[i + 1]); + vocab_size = atoi(parm); } else if (argv[i][1] == 'i') { - prompt = argv[i + 1]; + prompt = parm; } else if (argv[i][1] == 'z') { - tokenizer_path = argv[i + 1]; + tokenizer_path = parm; } else if (argv[i][1] == 'm') { - mode = argv[i + 1]; + mode = parm; } else if (argv[i][1] == 'y') { - system_prompt = argv[i + 1]; + 
system_prompt = parm; } else if (argv[i][1] == 'l') { - llama_ver = atoi(argv[i + 1]); -#ifdef __AOTI_MODEL__ - } else if (argv[i][1] == 'd') { -#ifdef USE_CUDA - if (strcasecmp(argv[i + 1], "CUDA") == 0) { - aoti_device = torch::Device(torch::kCUDA); - } else -#endif - if (strcasecmp(argv[i + 1], "CPU") == 0) { - aoti_device = torch::Device(torch::kCPU); - } else { - fprintf(stderr, "Unknown device %s", argv[i + 1]); - exit(1); - } -#endif + llama_ver = atoi(parm); } else { error_usage(); } - } - ModelType model_type = get_model_type(llama_ver); - if (model_type == UNKNOWN_MODEL) { - fprintf( - stderr, - "Unknown model type passed by -l argument. Received l=%d.", - llama_ver); - error_usage(); + // account for parameter + i += (uniarg)?0:1; } if (model_path == NULL) { @@ -928,6 +857,25 @@ int main(int argc, char* argv[]) { error_usage(); } + Transformer transformer; + build_transformer(&transformer, model_path); + +#ifdef __AOTI_MODEL__ + auto aoti_metadata = transformer.runner->get_metadata(); + aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu" + ? torch::Device(torch::kCPU) + : torch::Device(torch::kCUDA); + ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"])); +#else // __ET_MODEL__ + ModelType model_type = get_model_type(llama_ver); +#endif + + if (model_type == UNKNOWN_MODEL) { + fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.", + llama_ver); + error_usage(); + } + if (tokenizer_path == NULL) { fprintf(stderr, "No tokenizer_path provided."); error_usage(); @@ -943,15 +891,19 @@ int main(int argc, char* argv[]) { if (steps < 0) steps = 0; - Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type); + Tokenizer *tokenizer = build_tokenizer(tokenizer_path, model_type); // If no tokenizer path provided, get default for model_type if (vocab_size == -1) { vocab_size = tokenizer->vocab_size(); } - Transformer transformer; - build_transformer(&transformer, model_path, vocab_size, steps); + // read in the Config and the Weights from the model + // read_checkpoint(model_path, &t->config); + // allocate the RunState buffers + transformer.config.vocab_size = vocab_size; + transformer.config.seq_len = steps; + malloc_run_state(&transformer.state, &transformer.config); Sampler sampler; build_sampler(&sampler, vocab_size, temperature, topp, rng_seed); @@ -959,14 +911,8 @@ int main(int argc, char* argv[]) { if (strcmp(mode, "generate") == 0) { generate(&transformer, tokenizer, &sampler, prompt, steps, model_type); } else if (strcmp(mode, "chat") == 0) { - chat( - &transformer, - tokenizer, - &sampler, - prompt, - system_prompt, - steps, - model_type); + chat(&transformer, tokenizer, &sampler, prompt, system_prompt, steps, + model_type); } else { fprintf(stderr, "unknown mode: %s\n", mode); error_usage(); diff --git a/runner/third-party/tokenizers b/runner/third-party/tokenizers new file mode 160000 index 000000000..3f536fc01 --- /dev/null +++ b/runner/third-party/tokenizers @@ -0,0 +1 @@ +Subproject commit 3f536fc0139f7987940f69de2aef58eec1794f6a diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..c1580e27b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +""" +Global pytest config, fixtures, and helpers go here! 
+""" + +# Standard +import os +import sys + +# Make sure tests can import torchchat +sys.path.append( + os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) +) diff --git a/tests/test_chat_formatters.py b/tests/test_chat_formatters.py new file mode 100644 index 000000000..2f7f7a955 --- /dev/null +++ b/tests/test_chat_formatters.py @@ -0,0 +1,216 @@ +""" +Unit tests for chat formatters +""" + +# Third Party +import pytest + +# Local +from torchchat.generate import ( + HFTokenizerChatFormatter, + Llama2ChatFormatter, + Llama3ChatFormatter, +) + +## Helpers ##################################################################### + +class DummyTokenizer: + """Dummy tokenizer that encodes as strings so it's easy to check formatting""" + def encode(self, text, *_, **__): + return text + + +class DummySPTokenizer(DummyTokenizer): + """Emulated Sentencepiece tokenizer with bos/eos""" + bos = "" + eos = "" + + +class DummyLlama3Tokenizer(DummyTokenizer): + class _IdentityDict: + def __getitem__(self, key): + return key + special_tokens = _IdentityDict() + + +class DummyHFTokenizer(DummyTokenizer): + """Dummy made up chat template scheme""" + # Sequence + bos = "" + # Turn + bot = "" + eot = "" + # Role + bor = "" + eor = "" + def apply_chat_template(self, messages, add_generation_prompt): + out = [self.bos] + role = None + for msg in messages: + role = msg["role"] + content = msg["content"] + out.append(f"{self.bot}{self.bor}{role}{self.eor}{content}{self.eot}") + if add_generation_prompt and role != "assistant": + out.append(f"{self.bot}{self.bor}assistant{self.eor}") + return "\n".join(out) + + +def check_rendering(fmt, messages, expected, add_generation_prompt): + """Render messages and compare to expected output""" + assert "".join(fmt.encode_dialog_prompt(messages, add_generation_prompt)) == expected + + +def make_message(role, text): + return {"role": role, "content": text} + + +SYSTEM_PROMPT = "You are a helpful assistant, feel free to ask me anything." +USER1 = "Hello world!" +ASSISTANT1 = "Greetings! How can I help you?" +USER2 = "Why is the sky blue?" +ASSISTANT2 = "The sky appears blue because of a phenomenon called Rayleigh scattering." 
+ + +# Stock sets of messages to test +MSGS_NO_SYS= [ + make_message("user", USER1), +] +MSGS_SYS_USR = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), +] +MSGS_SYS_USR_ASST = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), +] +MSGS_MULTI_TURN = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), + make_message("user", USER2), + make_message("assistant", ASSISTANT2), +] + +## Llama2ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"[INST] {USER1} [/INST]"), + # sys, usr + (MSGS_SYS_USR, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST]"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +[INST] {USER2} [/INST] {ASSISTANT2} +"""), + ] +) +def test_llama2_chat_formatter(messages, expected): + """Tests for Llama2 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + tok = DummySPTokenizer() + fmt = Llama2ChatFormatter(tok) + # NOTE: add_generation_prompt not used by Llama2 + check_rendering(fmt, messages, expected, True) + +## Llama3ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr + (MSGS_SYS_USR, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|>"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER2}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT2}<|eot_id|>"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_llama3_chat_formatter(messages, expected, add_generation_prompt): + """Tests for Llama3 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ + """ + tok = DummyLlama3Tokenizer() + fmt = Llama3ChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += "<|start_header_id|>assistant<|end_header_id|>\n\n" + check_rendering(fmt, messages, expected, add_generation_prompt) + +## HFTokenizerChatFormatter #################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f""" +user{USER1}"""), + # sys, usr + (MSGS_SYS_USR, f""" +system{SYSTEM_PROMPT} 
+user{USER1}"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1}"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1} +user{USER2} +assistant{ASSISTANT2}"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_hf_chat_formatter(messages, expected, add_generation_prompt): + tok = DummyHFTokenizer() + fmt = HFTokenizerChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += f"\n{tok.bot}{tok.bor}assistant{tok.eor}" + check_rendering(fmt, messages, expected, add_generation_prompt) diff --git a/tokenizer/CMakeLists.txt b/tokenizer/CMakeLists.txt deleted file mode 100644 index 39c20885d..000000000 --- a/tokenizer/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.24) -set(CMAKE_CXX_STANDARD 17) -IF(DEFINED ENV{TORCHCHAT_ROOT}) - set(TORCHCHAT_ROOT $ENV{TORCHCHAT_ROOT}) -ELSE() - set(TORCHCHAT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) -ENDIF() - -# build tokenizer library -add_library( - tokenizer - tokenizer.h - sentencepiece.cpp - tiktoken.cpp) - -target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src) - -# add RE2 as subdirectory -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag -${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(third-party/abseil-cpp) -add_subdirectory(third-party/re2) -add_subdirectory(third-party/sentencepiece) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - -target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static) diff --git a/tokenizer/base64.h b/tokenizer/base64.h deleted file mode 100644 index dfeefef55..000000000 --- a/tokenizer/base64.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- *************************************************************************/ - -#pragma once - -#include -#include -#include - -namespace base64 { - -std::string decode(const std::string_view& input); - -namespace detail { - -constexpr uint32_t DECODE_TABLE[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, - 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; - -inline void validate(uint32_t v) { - if (v == 255) { - fprintf(stderr, "invalid char"); - exit(EXIT_FAILURE); - } -} - -inline void decode(const std::string_view& input, std::string& output) { - if (input.size() != 4) { - fprintf(stderr, "input length must be 4, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[3]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 16) & 0xFF)); - output.push_back(static_cast((val >> 8) & 0xFF)); - output.push_back(static_cast(val & 0xFF)); -} - -inline void decode_1_padding( - const std::string_view& input, - std::string& output) { - if (input.size() != 3) { - fprintf(stderr, "input length must be 3, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 10) & 0xFF)); - output.push_back(static_cast((val >> 2) & 0xFF)); -} - -inline void decode_2_padding( - const std::string_view& input, - std::string& output) { - assert(input.size() == 2); - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 4) & 0xFF)); -} - -} // namespace detail - -inline std::string decode(const std::string_view& input) { - if (input.empty()) { - fprintf(stderr, "empty input"); - exit(EXIT_FAILURE); - } - - // Faster than `input.size() % 4`. 
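Note: the removed `base64.h` implemented plain RFC 4648 decoding (with one- and two-character padding handled explicitly); tiktoken-style vocabulary files store each token as a base64-encoded byte string followed by its rank. For reference only, the same decode step via Python's standard library (the sample line is made up):

```python
import base64

line = "SGVsbG8= 31373"              # hypothetical "<base64 token> <rank>" entry
b64_token, rank = line.split(" ")
token_bytes = base64.b64decode(b64_token)
print(token_bytes, int(rank))        # b'Hello' 31373
```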
- if ((input.size() & 3) != 0 || input.size() < 4) { - fprintf( - stderr, - "input length must be larger than 4 and is multiple of 4, got %zu", - input.size()); - exit(EXIT_FAILURE); - } - - std::string output; - output.reserve(input.size() / 4 * 3); - auto idx = 0U; - for (; idx < input.size() - 4; idx += 4) { - detail::decode(input.substr(idx, 4), output); - } - - // Last 4 bytes. Might contain paddings. - if (input[idx + 3] == '=') { - if (input[idx + 2] == '=') { - // Tow paddings. - detail::decode_2_padding(input.substr(idx, 2), output); - } else { - // One padding. - detail::decode_1_padding(input.substr(idx, 3), output); - } - } else { - // No padding. - detail::decode(input.substr(idx, 4), output); - } - - return output; -} -} // namespace base64 diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index 7ad5807d1..b77ee43ea 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -5,11 +5,12 @@ # LICENSE file in the root directory of this source tree. # Standard -from typing import List, Optional +from typing import Dict, List, Optional import json import os # Third Party +import jinja2 from tokenizers import Tokenizer # Local @@ -37,17 +38,28 @@ def __init__(self, file_path: str): # Load the tokenizer itself self._tokenizer = Tokenizer.from_file(tokenizer_path) + # Load the chat template if we have a config path + self._chat_template: Optional[jinja2.Template] = None + # If available, parse bos/eos tokens from the tokenizer config self._bos_id, self._eos_id = None, None if tokenizer_config_path is not None: with open(tokenizer_config_path, "r") as handle: tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") + + def _extract_token(identifier: str) -> Optional[str]: + entry: Optional[Union[str, dict]] = tok_config.get(identifier) + return entry.get("content") if isinstance(entry, dict) else entry + + bos_token = _extract_token("bos_token") + eos_token = _extract_token("eos_token") + if bos_token is not None: self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: self._eos_id = self._tokenizer.token_to_id(eos_token) + if chat_template_str := tok_config.get("chat_template"): + self._chat_template = jinja2.Template(chat_template_str) # If no eos/bos tokens found, go looking for them! if None in [self._bos_id, self._eos_id]: @@ -70,6 +82,8 @@ def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optio if len(candidate_toks) == 1: return candidate_toks[0]["id"] + ## Interface ## + def encode( self, s: str, @@ -90,3 +104,21 @@ def bos_id(self) -> int: def eos_id(self) -> int: return self._eos_id + + ## Additional Public Methods ## + + def has_chat_template(self) -> bool: + return bool(self._chat_template) + + def apply_chat_template( + self, + dialog: List[Dict[str, str]], + add_generation_prompt: bool = False, + ) -> str: + """If configured with a chat template, apply it to the list of messages + """ + if not self._chat_template: + raise ValueError("No chat template configured!") + return self._chat_template.render( + messages=dialog, add_generation_prompt=add_generation_prompt + ) diff --git a/tokenizer/sentencepiece.cpp b/tokenizer/sentencepiece.cpp deleted file mode 100644 index 0cdfc7e30..000000000 --- a/tokenizer/sentencepiece.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// sentencepiece tokenizer - -#include -#include -#include -#include -#include "absl/strings/str_replace.h" - -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -SPTokenizer::SPTokenizer() - : Tokenizer(), - _processor(std::make_unique()) {} - -/** - * @brief Load the tokenizer from a file. The tokenizer file contains the - * vocabulary and scores. The format is: the first integer is the maximum - * token length, followed by a list of (word_len, word) pairs. Here we - * are reading all the vocabulary into memory and keep it sorted for fast - * lookup. - * - * @param tokenizer_path The path to the tokenizer file. - * @return void - */ -void SPTokenizer::load(const std::string& tokenizer_path) { - if (initialized_) { - fprintf(stderr, "Tokenizer already initialized.\n"); - return; - } - // read in the file - const auto status = _processor->Load(tokenizer_path); - if (!status.ok()) { - fprintf(stderr, "couldn't load %s\n. If this tokenizer artifact is for llama3, please pass `-l 3`.", tokenizer_path.c_str()); - exit(EXIT_FAILURE); - } - // load vocab_size, bos_tok, eos_tok - vocab_size_ = _processor->GetPieceSize(); - bos_tok_ = _processor->bos_id(); - eos_tok_ = _processor->eos_id(); - initialized_ = true; -} - -SPTokenizer::~SPTokenizer() {} - -/** - * @brief Decode a token into string. - * - * @param prev_token The previous token. - * @param token The current token. - * @return std::string A pointer to the string representation of the - * token. - */ -std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // get rid of the control ids and - if (_processor->IsControl(token)) { - // NB: returning empty string doesn't work for some reason. It causes - // free(): invalid pointer error. - return " "; - } - - std::string result = - absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}}); - - // following BOS token, sentencepiece decoder strips any leading - // whitespace - if (prev_token == bos_tok_ && result[0] == ' ') { - result = result.substr(1); - } - - // handle <0x0A> - result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}}); - - return result; -} - -/** - * @brief Encode a string into a sequence of tokens. - * - * @param text The string to be encoded. - * @param bos The number of BOS to prepend to the token list. - * @param eos The number of EOS to append to the token list. - * @return std::vector - */ -std::vector -SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // workaround a weird issue that text doesn't have correct size() - std::string input(text.c_str()); - // should we reserve memory? 
- std::vector res; - auto status = _processor->Encode(input, &res); - if (!status.ok()) { - fprintf(stderr, "couldn't encode %s\n", text.c_str()); - exit(EXIT_FAILURE); - } - - std::vector tokens; - for (auto i = 0; i < bos; ++i) { - tokens.push_back(bos_tok_); - } - - for (auto i = 0; i < res.size(); ++i) { - tokens.push_back(res[i]); - } - - for (auto i = 0; i < eos; ++i) { - tokens.push_back(eos_tok_); - } - return tokens; -} diff --git a/tokenizer/third-party/abseil-cpp b/tokenizer/third-party/abseil-cpp deleted file mode 160000 index 854193071..000000000 --- a/tokenizer/third-party/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/tokenizer/third-party/re2 b/tokenizer/third-party/re2 deleted file mode 160000 index ac82d4f62..000000000 --- a/tokenizer/third-party/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/tokenizer/third-party/sentencepiece b/tokenizer/third-party/sentencepiece deleted file mode 160000 index 7dcb54145..000000000 --- a/tokenizer/third-party/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7dcb541451b1862d73f473b3804ccf8f2a9e10f6 diff --git a/tokenizer/tiktoken.cpp b/tokenizer/tiktoken.cpp deleted file mode 100644 index 2f31f057a..000000000 --- a/tokenizer/tiktoken.cpp +++ /dev/null @@ -1,390 +0,0 @@ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
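Note: the deleted `sentencepiece.cpp` wrapper mirrors what the `sentencepiece` Python bindings already provide: load a `.model` file, query bos/eos ids, and encode/decode with the underscore space symbol handled by the library. A sketch using those bindings; the `tokenizer.model` path is an assumption for illustration:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")               # assumed local artifact

ids = sp.EncodeAsIds("Hello world")
ids = [sp.bos_id()] + ids                # prepend BOS, as the C++ encode() did
print(sp.DecodeIds(ids), sp.GetPieceSize())
```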
- *************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ------------------------------Util start------------------------------------ - -static uint64_t _max_size() { - return std::numeric_limits::max(); -} - -static Re2UPtr _create_regex(const std::string& pattern) { - assert(!pattern.empty()); - - return std::make_unique("(" + pattern + ")"); -} - -static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { - std::string special_pattern; - for (const auto& ele : special_encoder) { - if (!special_pattern.empty()) { - special_pattern += "|"; - } - special_pattern += re2::RE2::QuoteMeta(ele.first); - } - - if (special_pattern.empty()) { - return nullptr; - } - - return _create_regex(special_pattern); -} - -static std::pair _parse(const std::string& line) { - auto pos = line.find(" "); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid encoder line: " + line); - } - - auto token = base64::decode({line.data(), pos}); - uint64_t rank = 0; - try { - rank = std::stoul(line.substr(pos + 1)); - } catch (const std::exception&) { - throw std::invalid_argument("invalid encoder rank: " + line); - } - - return {std::move(token), rank}; -} - -static Encoder _load_encoder(const std::string& path) { - std::ifstream file(path); - if (!file) { - fprintf(stderr, "failed to open encoder file: %s\n", path.c_str()); - exit(EXIT_FAILURE); - } - - Encoder encoder; - std::string line; - while (std::getline(file, line)) { - auto [token, rank] = _parse(line); - - if (!encoder.emplace(std::move(token), rank).second) { - fprintf(stderr, "duplicate item: %s\n", line.c_str()); - } - } - return encoder; -} - -static Decoder _build_decoder(const Encoder& encoder) { - Decoder decoder; - for (const auto& [k, v] : encoder) { - decoder.emplace(v, k); - } - - if (encoder.size() != decoder.size()) { - fprintf(stderr, "duplicate items in encoder"); - exit(EXIT_FAILURE); - } - - return decoder; -} - -static std::vector _byte_pair_merge( - const std::string& piece, - const std::unordered_map& ranks, - std::function func) { - // This is a vector of (start, rank). - // The rank is of the byte pair starting at position start. - // The rank of the last item in the vector is not a valid value. - std::vector> parts; - parts.reserve(piece.size() + 1); - for (auto idx = 0U; idx < piece.size() + 1; ++idx) { - parts.emplace_back(idx, _max_size()); - } - - auto get_rank = [&piece, &ranks]( - const std::vector>& parts, - uint64_t start_idx, - uint64_t skip) -> std::optional { - if (start_idx + skip + 2 < parts.size()) { - auto s = parts[start_idx].first; - auto e = parts[start_idx + skip + 2].first; - auto key = piece.substr(s, e - s); - auto iter = ranks.find(key); - if (iter != ranks.end()) { - return iter->second; - } - } - return std::nullopt; - }; - - // We look up the ranks once in the beginning and iteratively update - // them during each merge, which reduces the number of rank lookups. - for (auto i = 0U; i < parts.size() - 2; ++i) { - auto rank = get_rank(parts, i, 0); - if (rank) { - // usize::MAX is a sentinel value and cannot be a valid rank - if (*rank == _max_size()) { - fprintf(stderr, "at %" PRIu32 " rank is too large\n", i); - } - parts[i].second = *rank; - } - } - - // If you have n parts and m merges, this does O(mn) work. - // We could do something with a heap and do O(m log n) work. 
- // It is important to consider that n is often small (<100), and as such - // the cache-locality benefits outweigh the algorithmic complexity downsides - // of the `parts` vector data structure above. - - // Note that we hash bytes, not token pairs. As long as we train BPE the way - // we currently do, this is equivalent. An easy way to break this would be - // to decouple merge priority from token index or to prevent specific token - // merges. - while (true) { - if (parts.size() == 1) { - break; - } - - // usize::MAX is a sentinel rank value allowing us to - // take the min more quickly - auto min_rank = std::make_pair(_max_size(), 0); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto rank = parts[i].second; - if (rank < min_rank.first) { - min_rank.first = rank; - min_rank.second = i; - } - } - - if (min_rank.first != _max_size()) { - auto i = min_rank.second; - - // NOTE: We are about to remove parts[i + 1]. We do not do it - // yet because there are cache-locality benefits to updating - // parts[i] and parts[i-1] before removing, which could thrash - // the cache. Thus, we update the rank calculation by skipping over - // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - auto rank = get_rank(parts, i, 1); - if (rank) { - parts[i].second = *rank; - } else { - parts[i].second = _max_size(); - } - if (i > 0) { - rank = get_rank(parts, i - 1, 1); - if (rank) { - parts[i - 1].second = *rank; - } else { - parts[i - 1].second = _max_size(); - } - } - - parts.erase(parts.begin() + (i + 1)); - } else { - break; - } - } - std::vector out; - out.reserve(parts.size() - 1); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto s = parts[i].first; - auto e = parts[i + 1].first; - out.push_back(func(s, e)); - } - return out; -} - -static std::vector _byte_pair_encode( - const std::string& piece, - const Encoder& encoder) { - if (piece.size() == 1) { - auto iter = encoder.find(piece); - if (iter != encoder.end()) { - return std::vector({iter->second}); - } else { - // TODO: is it possible? - return {}; - } - } - - return _byte_pair_merge( - piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { - std::string key = piece.substr(start, stop - start); - auto iter = encoder.find(key); - if (iter != encoder.end()) { - return iter->second; - } else { - // TODO: what if key does not exist? Should we return `unknown`? - // assert(false); // ?? - return uint64_t(0); - } - }); -} -// ------------------------------Util end------------------------------------ -// -------------------------private method start------------------------------- - -template -std::pair, re2::StringPiece> -Tiktoken::_split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special) { - if (!_special_token_regex) { - return std::make_pair(std::nullopt, input); - } - - auto start = input.begin(); - std::string special; - while (true) { - if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { - // No special token. - break; - } - - if (allowed_special.count(special) == 1) { - // Found an allowed special token, split the text with it. 
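Note: for readers who have not seen it before, the merge loop above is the standard greedy BPE procedure: repeatedly merge the adjacent pair with the lowest rank until nothing merges. A readable Python restatement, assuming every single byte has a rank as in tiktoken-style vocabularies; it omits the rank caching, so it is O(n^2) and for illustration only:

```python
def byte_pair_merge(piece: bytes, ranks: dict) -> list:
    """Greedy BPE: merge the lowest-ranked adjacent pair until none remain."""
    parts = [bytes([b]) for b in piece]
    while len(parts) > 1:
        candidates = [
            (ranks[parts[i] + parts[i + 1]], i)
            for i in range(len(parts) - 1)
            if parts[i] + parts[i + 1] in ranks
        ]
        if not candidates:
            break
        _, i = min(candidates)
        parts[i:i + 2] = [parts[i] + parts[i + 1]]
    return [ranks[p] for p in parts]
```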
- return std::make_pair( - special, - re2::StringPiece(start, input.begin() - start - special.size())); - } // else try to find the next special token - } - - return std::make_pair(std::nullopt, input); -} - -void Tiktoken::_encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len) { - std::string piece; - assert(_regex); - while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { - auto iter = _encoder.find(piece); - if (iter != _encoder.end()) { - last_piece_token_len = 1; - ret.push_back(iter->second); - continue; - } - auto tokens = _byte_pair_encode(piece, _encoder); - last_piece_token_len = tokens.size(); - ret.insert(ret.end(), tokens.begin(), tokens.end()); - } -} - -template -std::pair, uint64_t> Tiktoken::_encode_with_special_token( - const std::string& text, - const T& allowed_special) { - std::vector tokens; - uint64_t last_piece_token_len = 0; - re2::StringPiece input(text); - while (true) { - auto [special, sub_input] = - _split_with_allowed_special_token(input, allowed_special); - - _encode(sub_input, tokens, last_piece_token_len); - - if (special) { - uint64_t token = 0; - try { - token = _special_token_encoder.at(*special); - } catch (const std::out_of_range&) { - // Should never go here, since special pattern includes all special - // chars. - fprintf(stderr, "unknown special token: %s\n", special->c_str()); - exit(EXIT_FAILURE); - } - - tokens.push_back(token); - last_piece_token_len = 0; - } else { - break; - } - } - - // last_piece_token_len is how many tokens came from the last regex split. - // This is used for determining unstable tokens, since you can't merge - // across (stable) regex splits - return std::make_pair(tokens, last_piece_token_len); -} - -// -------------------------private method end------------------------------- -// -------------------------public method start------------------------------- - -Tiktoken::Tiktoken() : Tokenizer() {} - -void Tiktoken::load(const std::string& path) { - _encoder = _load_encoder(path); - _special_token_encoder = _get_special_tokens(_encoder.size()); - - _decoder = _build_decoder(_encoder); - _special_token_decoder = _build_decoder(_special_token_encoder); - - _regex = _create_regex(_pattern); - _special_token_regex = _build_special_token_regex(_special_token_encoder); - - // initialize vocab_size, bos_tok, eos_tok - vocab_size_ = _encoder.size() + _special_token_encoder.size(); - bos_tok_ = _encoder.size(); // hardcoded (see _get_special_tokens) - eos_tok_ = _encoder.size() + 1; // hardcoded (see _get_special_tokens) - initialized_ = true; -} - -std::vector -Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - exit(EXIT_FAILURE); - } - auto res = _encode_with_special_token(text, _special_token_encoder).first; - for (auto i = 0; i < bos; ++i) { - res.insert(res.begin(), bos_tok_); - } - for (auto i = 0; i < eos; ++i) { - res.push_back(eos_tok_); - } - return res; -} - -std::string Tiktoken::decode(uint64_t prev, uint64_t cur) { - (void)prev; - if (!initialized_) { - exit(EXIT_FAILURE); - } - std::string ret; - - std::string token_bytes; - auto iter = _decoder.find(cur); - if (iter != _decoder.end()) { - token_bytes = iter->second; - } else { - iter = _special_token_decoder.find(cur); - if (iter != _special_token_decoder.end()) { - token_bytes = iter->second; - } else { - fprintf(stderr, "unknown token: %" PRIu64 "\n", cur); - exit(EXIT_FAILURE); - } - } - ret += token_bytes; - - return ret; -} -// -------------------------public method 
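Note: before ordinary BPE encoding, the removed Tiktoken implementation first splits the input on the allowed special tokens so that each one maps to a single id. A Python sketch of that pre-split; the function name and the `("bpe", ...)` placeholder are illustrative:

```python
import re

def split_on_special(text: str, special_tokens: dict) -> list:
    """Emit allowed special tokens as single ids; the text in between would
    go through ordinary BPE encoding (a placeholder stands in for it here)."""
    if not special_tokens:
        return [("bpe", text)]
    pattern = "(" + "|".join(re.escape(t) for t in special_tokens) + ")"
    out = []
    for chunk in re.split(pattern, text):
        if not chunk:
            continue
        out.append(special_tokens[chunk] if chunk in special_tokens else ("bpe", chunk))
    return out
```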
end------------------------------- diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h deleted file mode 100644 index 9e1977b71..000000000 --- a/tokenizer/tokenizer.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple Tokenizer interface. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sentencepiece_processor.h" - -class Tokenizer { - public: - explicit Tokenizer() {} - virtual ~Tokenizer() {} - - virtual void load(const std::string& tokenizer_path) = 0; - - virtual std::vector - encode(const std::string& input, int8_t bos, int8_t eos) = 0; - - virtual std::string decode(uint64_t prev_token, uint64_t token) = 0; - - // getters - int32_t vocab_size() const { - return vocab_size_; - } - - uint64_t bos_tok() const { - return bos_tok_; - } - - uint64_t eos_tok() const { - return eos_tok_; - } - - protected: - bool initialized_ = false; - int32_t vocab_size_; - uint64_t bos_tok_, eos_tok_; -}; - -// ----------------------- SPTokenizer ----------------------- -// Used by sentencepiece. Adapted from llama2.c. -struct TokenIndex { - const char* str; - int32_t id; -}; - -class SPTokenizer : public Tokenizer { - public: - explicit SPTokenizer(); - ~SPTokenizer() override; - - void load(const std::string& tokenizer_path) override; - - std::vector encode(const std::string& input, int8_t bos, int8_t eos) - override; - - std::string decode(uint64_t prev_token, uint64_t token) override; - - private: - std::unique_ptr _processor; -}; - -// ----------------------- Tiktoken ----------------------- -// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer - -using Encoder = std::unordered_map; -using Decoder = std::unordered_map; -using Re2UPtr = std::unique_ptr; - -class Tiktoken : public Tokenizer { - public: - explicit Tiktoken(); - ~Tiktoken(){}; - - void load(const std::string& tokenizer_path); - - std::vector - encode(const std::string& input, int8_t bos, int8_t eos); - - std::string decode(uint64_t prev_token, uint64_t token); - - private: - static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { - Encoder special_tokens; - special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); - special_tokens.emplace("<|end_of_text|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); - special_tokens.emplace("<|start_header_id|>", num_base_tokens++); - special_tokens.emplace("<|end_header_id|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); - special_tokens.emplace("<|eot_id|>", num_base_tokens++); - for (auto i = 5; i < 251; ++i) { - special_tokens.emplace( - "<|reserved_special_token_" + std::to_string(i) + "|>", - num_base_tokens++); - } - return special_tokens; - } - - template - std::pair, re2::StringPiece> - _split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special); - - void _encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len); - - template - 
std::pair, uint64_t> _encode_with_special_token( - const std::string& text, - const T& allowed_special); - - // Removed negative lookahead \s+(?!\S) since it's not supported by RE2. - const std::string _pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; - Encoder _encoder; - Encoder _special_token_encoder; - Decoder _decoder; - Decoder _special_token_decoder; - - Re2UPtr _regex; - Re2UPtr _special_token_regex; -}; diff --git a/torchchat.py b/torchchat.py index 35cdcabae..1eeee0120 100644 --- a/torchchat.py +++ b/torchchat.py @@ -6,7 +6,7 @@ import argparse import logging -import subprocess +import signal import sys # MPS ops missing with Multimodal torchtune @@ -25,7 +25,15 @@ default_device = "cpu" +def signal_handler(sig, frame): + print("\nInterrupted by user. Bye!\n") + sys.exit(0) + + if __name__ == "__main__": + # Set the signal handler for SIGINT + signal.signal(signal.SIGINT, signal_handler) + # Initialize the top-level parser parser = argparse.ArgumentParser( prog="torchchat", diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index fb2bfb299..1e04800ab 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -14,16 +14,17 @@ import torch import torch._dynamo.config import torch._inductor.config -import torch.nn as nn +import torch.distributed as dist -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.elastic.utils.distributed import get_free_port - -from torchchat.distributed import launch_distributed, ParallelDims, parallelize_llama - -from torchchat.model import Model, ModelArgs, ModelType +from torchchat.distributed.utils import( + Color as color, + CUDATrackTime, + init_distributed, + GPUMemoryMonitor, +) +from torchchat.distributed.logging_utils import SingletonLogger +from torchchat.model import Model, ModelArgs, ModelType, Transformer, TransformerArgs from torchchat.model_config.model_config import resolve_model_config from torchchat.utils.build_utils import ( device_sync, @@ -34,6 +35,7 @@ from torchchat.utils.measure_time import measure_time from torchchat.utils.quantize import quantize_model + from torchtune.models.convert_weights import meta_to_tune from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE @@ -54,6 +56,7 @@ class BuilderArgs: gguf_kwargs: Optional[Dict[str, Any]] = None dso_path: Optional[Union[Path, str]] = None aoti_package_path: Optional[Union[Path, str]] = None + snapshot_path: Optional[Union[Path, str]] = None pte_path: Optional[Union[Path, str]] = None device: Optional[str] = None precision: torch.dtype = torch.float32 @@ -62,14 +65,21 @@ class BuilderArgs: pp: int = 1 tp: int = 1 chpt_from: str = "hf" + distribution_path: Optional[str] = None is_chat_model: bool = False prefill_possible: bool = False dynamic_shapes: bool = False max_seq_length: Optional[int] = None + attention_backend: str = "math" def __post_init__(self): if self.device is None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" + if torch.cuda.is_available(): + self.device = "cuda" + elif torch.xpu.is_available(): + self.device = "xpu" + else: + self.device = "cpu" if not ( (self.checkpoint_path and self.checkpoint_path.is_file()) @@ -78,9 +88,10 @@ def __post_init__(self): or (self.dso_path and Path(self.dso_path).is_file()) or (self.aoti_package_path and Path(self.aoti_package_path).is_file()) or (self.pte_path and Path(self.pte_path).is_file()) + or 
(self.snapshot_path and Path(self.snapshot_path).is_file()) ): raise RuntimeError( - "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" + "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" ) if self.aoti_package_path and self.pte_path: @@ -97,7 +108,7 @@ def __post_init__(self): for param, param_msg in ignored_params: if param: print( - f"Warning: {param_msg} ignored because an exported DSO or PTE path was specified" + f"Warning: {param_msg} ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument" ) else: self.prefill_possible = True @@ -113,6 +124,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": checkpoint_path = args.checkpoint_path params_table = args.params_table + distribution_path = None if args.model: # Using a named, well-known model model_config = resolve_model_config(args.model) @@ -127,9 +139,12 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": model_config.transformer_params_key or model_config.name.split("/")[-1] ) + distribution_path = model_config.distribution_path + dso_path = getattr(args, "dso_path", None) pte_path = getattr(args, "pte_path", None) aoti_package_path = getattr(args, "aoti_package_path", None) + snapshot_path = getattr(args, "snapshot_path", None) is_chat_model = False if args.is_chat_model: @@ -157,6 +172,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": output_pte_path = getattr(args, "output_pte_path", None) output_aoti_package_path = getattr(args, "output_aoti_package_path", None) output_dso_path = getattr(args, "output_dso_path", None) + output_snapshot_path = getattr(args, "output_snapshot_path", None) if output_pte_path and args.dtype.startswith("fast"): if args.dtype == "fast": # As per Kimish, float32 should be faster on ET XNNPACK @@ -172,6 +188,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp = getattr(args, "pp", 1) tp = getattr(args, "tp", 1) chpt_from = getattr(args, "chpt_from", "hf") + sdp_backend_dict = { + 'math': torch.nn.attention.SDPBackend.MATH, + 'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION, + 'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + 'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + } + attention_backend = sdp_backend_dict[args.attention_backend] + if args.device == "cpu" and (args.attention_backend == "efficient_attention" + or args.attention_backend == "cudnn_attention"): + print(f"Warning: {args.attention_backend} is not supported on CPU. 
Using math instead.") + attention_backend = torch.nn.attention.SDPBackend.MATH return cls( checkpoint_dir=checkpoint_dir, checkpoint_path=checkpoint_path, @@ -183,6 +210,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path=dso_path, aoti_package_path=aoti_package_path, pte_path=pte_path, + snapshot_path=snapshot_path, device=args.device, precision=dtype, setup_caches=( @@ -192,9 +220,11 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp=pp, tp=tp, chpt_from=chpt_from, + distribution_path=distribution_path, is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), + attention_backend=attention_backend, ) @classmethod @@ -379,6 +409,8 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model: kwargs = {} else: kwargs = builder_args.gguf_kwargs + + kwargs.setdefault("device", builder_args.device) model = Model.from_gguf(builder_args.gguf_path, **kwargs) return model @@ -402,6 +434,7 @@ def _load_checkpoint(builder_args: BuilderArgs): os.path.join(builder_args.checkpoint_dir, cp_name), map_location=builder_args.device, mmap=True, + weights_only=False, ) ) checkpoint = {} @@ -464,77 +497,11 @@ def _load_model_default(builder_args: BuilderArgs) -> Model: return model -def _maybe_init_distributed( - builder_args: BuilderArgs, -) -> Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - """ - Initialize distributed related setups if the user specified - using distributed inference. If not, this is a no-op. - - Args: - builder_args (:class:`BuilderArgs`): - Command args for model building. - Returns: - Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - - The first element is an optional DeviceMesh object, - which which describes the mesh topology of devices for the DTensor. - - The second element is an optional ParallelDims object, - which represents the parallel dimensions configuration. - """ - if not builder_args.use_distributed: - return None, None - dist_config = "llama3_8B.toml" # TODO - integrate with chat cmd line - - world_mesh, parallel_dims = launch_distributed(dist_config) - - assert ( - world_mesh is not None and parallel_dims is not None - ), f"failed to launch distributed using {dist_config}" - - return world_mesh, parallel_dims - - -def _maybe_parallelize_model( - model: nn.Module, - builder_args: BuilderArgs, - world_mesh: DeviceMesh, - parallel_dims: ParallelDims, -) -> nn.Module: - """ - We parallelize the module and load the distributed checkpoint to the model - if the user specifies using distributed inference. If not, this is a no-op. - - Args: - model (:class:`nn.Module`): - Module to be parallelized. - builder_args (:class:`BuilderArgs`): - Command args for model building. - world_mesh (:class:`DeviceMesh`): - Object which describes the mesh topology - of devices for the DTensor. - parallel_dims (:class:`ParallelDims`): - Object which represents the parallel dimensions configuration. - Returns: - A :class:`nn.Module` object which is parallelized and checkpoint loaded - if the user specifies using distributed inference. 
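Note: the `--attention-backend` plumbing above resolves a string to a `torch.nn.attention.SDPBackend` value, falling back to MATH on CPU. For context, a sketch of how such a backend is typically applied around scaled dot-product attention; the tensor shapes are arbitrary and this call site is not part of the diff:

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

backend = SDPBackend.MATH                 # e.g. the value resolved above
q = k = v = torch.randn(1, 8, 16, 64)     # (batch, heads, seq, head_dim)
with sdpa_kernel(backend):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
```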
- """ - if world_mesh is None: - return model - assert parallel_dims is not None - print("Applying model parallel to model ...") - parallelize_llama(model, world_mesh, parallel_dims) - return load_checkpoints_to_model(model, builder_args, world_mesh) - - def _load_model(builder_args: BuilderArgs) -> Model: - # world_mesh, parallel_dims = _maybe_init_distributed(builder_args) if builder_args.gguf_path: model = _load_model_gguf(builder_args) - # elif builder_args.use_distributed: - # model = _init_model_on_meta_device(builder_args) else: model = _load_model_default(builder_args) - # model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims) if builder_args.dso_path or builder_args.aoti_package_path: # AOTI-compoiled model will load its own weights. @@ -627,9 +594,8 @@ def do_nothing(max_batch_size, max_seq_length): # attributes will NOT be seen on by AOTI-compiled forward # function, e.g. calling model.setup_cache will NOT touch # AOTI compiled and maintained model buffers such as kv_cache. - from torch._inductor.package import load_package - aoti_compiled_model = load_package( + aoti_compiled_model = torch._inductor.aoti_load_package( str(builder_args.aoti_package_path.absolute()) ) @@ -670,6 +636,128 @@ def do_nothing(max_batch_size, max_seq_length): model = PTEModel(config, builder_args.pte_path) except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") + elif builder_args.snapshot_path: + # Resolve ModelArgs for constructing the PTEModel + # If a manual params_path is provided, use that + if builder_args.params_path: + config: ModelArgs = ModelArgs.from_params(builder_args.params_path) + else: + # TODO: Instead of loading the whole model, refactor to call a + # helper that generate just model.config + with measure_time("Time to load model: {time:.02f} seconds"): + model = _load_model(builder_args) + device_sync(device=builder_args.device) + config = model.config + model = None + try: + model = torch.load(builder_args.snapshot_path, weights_only=False) + except Exception: + raise RuntimeError(f"Failed to load torchchat snapshot {builder_args.snapshot_path}") + # _active_backend() does not allow DSO & AOTI to be true. + # Choose either. 
+ from torchchat.utils.build_utils import set_backend + set_backend (dso=True, pte=False, aoti_package=False) + if (model.config != config): + raise RuntimeError("loaded model architecture mismatch") + ## + ## import all libraries with custom kernels ans custom operators + ## that quantize may be pulling in + ## + + elif builder_args.distributed: + pp_degree = builder_args.pp + tp_degree = builder_args.tp + + init_distributed() + rank = dist.get_rank() + torch.cuda.set_device(rank % torch.cuda.device_count()) + + logger = SingletonLogger.get_logger() + + gpu_memory_monitor = GPUMemoryMonitor("cuda") + logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}") + + # Model-level config + if builder_args.params_table: + model_config = ModelArgs.from_table(builder_args.params_table) + else: + raise NotImplementedError() + # Transformer-level config + config = TransformerArgs.from_params(model_config.transformer_args["text"]) + logger.info(f"Transformer Config: {config}") + + #TODO: Move into head of file after solving circular import + from torchchat.distributed.checkpoint_utils import ( + load_model_weights, + ) + + # Validate pipeline degree + assert config.n_layers % pp_degree == 0 + + # Create device mesh + device_mesh = dist.init_device_mesh( + "cuda", + (pp_degree, tp_degree), + mesh_dim_names=("pp", "tp") + ) + tp_mesh = device_mesh["tp"] + pp_mesh = device_mesh["pp"] + logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") + + pp_rank = pp_mesh.get_local_rank() + logger.info(f"{pp_degree=}, {tp_degree=}") + + # Assuming same number of GPUs per node + device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") + + # Fill in PP configs + config.stage_idx = pp_rank + config.n_stages = pp_degree + + with torch.device("meta"): + # TODO: we should create model instead of Transformer + model = Transformer(config) + + # Distribute model on TP mesh + # (Surprisingly, this works even though model is on meta device and mesh is of + # cuda devices) + model.distribute(tp_mesh) + if rank == 0: + logger.info(f"Model: {model}") + + # Load weights + logger.info(f"Loading weights for {pp_rank=} on {device=}") + with CUDATrackTime() as timer: + load_model_weights(model, builder_args.distribution_path, device, config, builder_args.chpt_from) + + logger.info( + f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" + ) + + # Setup KV caches (after model distribution) + # The number of cache lanes is the same as the maximum number of + # micro-batches that can be "in flight" in parallel -- imagine each + # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. + # When decoding is done for certain micro-batches, we can reuse the KV cache + # lanes. 
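Note: the distributed branch above builds a 2-D device mesh with a pipeline-parallel and a tensor-parallel dimension. A condensed sketch of that setup, assuming 4 GPUs with pp=2 and tp=2 and execution under torchrun; it mirrors the calls in the diff rather than introducing new APIs:

```python
import torch
import torch.distributed as dist

# Run with: torchrun --nproc-per-node 4 this_script.py
dist.init_process_group("nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank % torch.cuda.device_count())

mesh = dist.init_device_mesh("cuda", (2, 2), mesh_dim_names=("pp", "tp"))
tp_mesh, pp_mesh = mesh["tp"], mesh["pp"]
pp_rank = pp_mesh.get_local_rank()    # which pipeline stage this rank belongs to
```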
+ # TODO: bump up the lane count + pipeline_lanes = 1 + seqlen_prefill=1024 + with device: + model.setup_caches(1, seqlen_prefill, cache_lanes=pipeline_lanes) + + # info on stage size and params + # stage_size = get_module_size(model) + # stage_size_formatted = bytes_to_readable(stage_size) + # stage_num_params = get_num_params(model) + # logger.info( + # f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" + # ) + model.eval() + + model.text_transformer_args = None + model.config.model_type = model_config.model_type + model.device_mesh = device_mesh else: with measure_time("Time to load model: {time:.02f} seconds"): model = _load_model(builder_args) @@ -706,4 +794,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str: return "TikToken" if tokenizers: return "Tokenizers" - return "SentencePiece" \ No newline at end of file + return "SentencePiece" diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index a8a2c7da8..1d531c709 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -17,10 +17,20 @@ allowable_params_table, ) -logging.basicConfig(level=logging.INFO, format="%(message)s") +_log_level_env = os.getenv("LOG_LEVEL", "INFO") +try: + _log_level = getattr(logging, _log_level_env.upper()) +except AttributeError: + print(f"Invalid log level: {_log_level_env}", file=sys.stderr) + _log_level = logging.INFO + + +logging.basicConfig(level=_log_level, format="%(message)s") logger = logging.getLogger(__name__) default_device = os.getenv("TORCHCHAT_DEVICE", "fast") +default_dtype = os.getenv("TORCHCHAT_PRECISION", "fast") + default_model_dir = Path( os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache") ).expanduser() @@ -149,9 +159,9 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--dtype", - default="fast", + default=None, choices=allowable_dtype_names(), - help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32, fast16, fast", + help="Override the dtype of the model. Options: bf16, fp16, fp32, fast16, fast", ) model_config_parser.add_argument( "--quantize", @@ -165,9 +175,16 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--device", type=str, - default=default_device, - choices=["fast", "cpu", "cuda", "mps"], - help="Hardware device to use. Options: cpu, cuda, mps", + default=None, + choices=["fast", "cpu", "cuda", "mps", "xpu"], + help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu", + ) + model_config_parser.add_argument( + "--attention-backend", + type=str, + default="math", + choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"], + help="SDPBackend to use. 
Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION", ) @@ -190,6 +207,12 @@ def _add_export_output_path_args(parser) -> None: default=None, help="Output to the specified AOT Inductor .dso model file", ) + exclusive_parser.add_argument( + "--output-snapshot-path", + type=str, + default=None, + help="Output to the specified PyTorch model and sha256 file", + ) exclusive_parser.add_argument( "--output-aoti-package-path", type=str, @@ -237,7 +260,13 @@ def _add_exported_input_path_args(parser) -> None: default=None, help="Use the specified ExecuTorch .pte model file", ) - + exclusive_parser.add_argument( + "--snapshot-path", + type=Path, + default=None, + help="Use the specified torchchat snaphot .tc model file", + ) + # Add CLI Args related to JIT downloading of model artifacts def _add_jit_downloading_args(parser) -> None: @@ -513,20 +542,34 @@ def arg_init(args): if isinstance(args.quantize, str): args.quantize = json.loads(args.quantize) - # if we specify dtype in quantization recipe, replicate it as args.dtype - args.dtype = args.quantize.get("precision", {}).get("dtype", args.dtype) + # if we specify dtype in quantization recipe, allow args.dtype top override if specified + if args.dtype is None: + args.dtype = args.quantize.get("precision", {}).get("dtype", default_dtype) + else: + precision_handler = args.quantize.get("precision", None) + if precision_handler: + if precision_handler["dtype"] != args.dtype: + print('overriding json-specified dtype {precision_handler["dtype"]} with cli dtype {args.dtype}') + precision_handler["dtype"] = args.dtype if getattr(args, "output_pte_path", None): - if args.device not in ["cpu", "fast"]: + if args.device not in [None, "cpu", "fast"]: raise RuntimeError("Device not supported by ExecuTorch") args.device = "cpu" else: # Localized import to minimize expensive imports from torchchat.utils.build_utils import get_device_str - args.device = get_device_str( - args.quantize.get("executor", {}).get("accelerator", args.device) - ) + if args.device is None: + args.device = get_device_str( + args.quantize.get("executor", {}).get("accelerator", default_device) + ) + else: + args.device = get_device_str(args.device) + executor_handler = args.quantize.get("executor", None) + if executor_handler and executor_handler["accelerator"] != args.device: + print(f'overriding json-specified device {executor_handler["accelerator"]} with cli device {args.device}') + executor_handler["accelerator"] = args.device if "mps" in args.device: if getattr(args, "compile", False) or getattr(args, "compile_prefill", False): diff --git a/torchchat/cli/convert_hf_checkpoint.py b/torchchat/cli/convert_hf_checkpoint.py index f428e4cc6..122ab0f28 100644 --- a/torchchat/cli/convert_hf_checkpoint.py +++ b/torchchat/cli/convert_hf_checkpoint.py @@ -39,19 +39,14 @@ def convert_hf_checkpoint( config = TransformerArgs.from_params(config_args) print(f"Model config {config.__dict__}") - # Load the json file containing weight mapping + # Find all candidate weight mapping index files model_map_json_matches = [Path(m) for m in glob.glob(str(model_dir / "*.index.json"))] - assert len(model_map_json_matches) <= 1, "Found multiple weight mapping files" - if len(model_map_json_matches): - model_map_json = model_map_json_matches[0] - else: - model_map_json = model_dir / "pytorch_model.bin.index.json" # If there is no weight mapping, check for a consolidated model and # tokenizer we can move. Llama 2 and Mistral have weight mappings, while # Llama 3 has a consolidated model and tokenizer. 
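Note: the `arg_init` changes above establish a precedence for dtype (and, analogously, device): an explicit CLI value wins over the quantize recipe, and the recipe only fills in when the flag was left unset. A hedged restatement of that rule; the function and defaults are illustrative:

```python
def resolve_dtype(cli_dtype, quantize_cfg, default_dtype="fast"):
    """CLI flag > quantize recipe "precision" entry > environment default."""
    recipe_dtype = quantize_cfg.get("precision", {}).get("dtype")
    if cli_dtype is None:
        return recipe_dtype or default_dtype
    if recipe_dtype and recipe_dtype != cli_dtype:
        # mirror the warning above: the CLI value overrides the recipe
        print(f"overriding json-specified dtype {recipe_dtype} with cli dtype {cli_dtype}")
    return cli_dtype
```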
# Otherwise raise an error. - if not model_map_json.is_file(): + if not model_map_json_matches: consolidated_pth = model_dir / "original" / "consolidated.00.pth" tokenizer_pth = model_dir / "original" / "tokenizer.model" if consolidated_pth.is_file() and tokenizer_pth.is_file(): @@ -68,11 +63,30 @@ def convert_hf_checkpoint( return else: raise RuntimeError( - f"Could not find {model_map_json} or {consolidated_pth} plus {tokenizer_pth}" + f"Could not find a valid model weight map or {consolidated_pth} plus {tokenizer_pth}" ) - with open(model_map_json) as json_map: - bin_index = json.load(json_map) + # Load the json file(s) containing weight mapping + # + # NOTE: If there are multiple index files, there are two possibilities: + # 1. The files could be mapped to different weight format files (e.g. .bin + # vs .safetensors) + # 2. The files could be split subsets of the mappings that need to be + # merged + # + # In either case, we can simply keep the mappings where the target file is + # valid in the model dir. + bin_index = {} + for weight_map_file in model_map_json_matches: + with open(weight_map_file, "r") as handle: + weight_map = json.load(handle) + valid_mappings = { + k: model_dir / v + for (k, v) in weight_map.get("weight_map", {}).items() + if (model_dir / v).is_file() + } + bin_index.update(valid_mappings) + bin_files = set(bin_index.values()) weight_map = { "model.embed_tokens.weight": "tok_embeddings.weight", @@ -96,7 +110,6 @@ def convert_hf_checkpoint( "model.norm.weight": "norm.weight", "lm_head.weight": "output.weight", } - bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()} def permute(w, n_heads): return ( diff --git a/torchchat/cli/download.py b/torchchat/cli/download.py index f145c93fb..4da2bc390 100644 --- a/torchchat/cli/download.py +++ b/torchchat/cli/download.py @@ -35,11 +35,12 @@ def _download_hf_snapshot( model_info = model_info(model_config.distribution_path, token=hf_token) model_fnames = [f.rfilename for f in model_info.siblings] - # Check the model config for preference between safetensors and pth + # Check the model config for preference between safetensors and pth/bin has_pth = any(f.endswith(".pth") for f in model_fnames) + has_bin = any(f.endswith(".bin") for f in model_fnames) has_safetensors = any(f.endswith(".safetensors") for f in model_fnames) - # If told to prefer safetensors, ignore pth files + # If told to prefer safetensors, ignore pth/bin files if model_config.prefer_safetensors: if not has_safetensors: print( @@ -47,10 +48,10 @@ def _download_hf_snapshot( file=sys.stderr, ) exit(1) - ignore_patterns = "*.pth" + ignore_patterns = ["*.pth", "*.bin"] # If the model has both, prefer pth files over safetensors - elif has_pth and has_safetensors: + elif (has_pth or has_bin) and has_safetensors: ignore_patterns = "*safetensors*" # Otherwise, download everything @@ -110,6 +111,8 @@ def _download_direct( def download_and_convert( model: str, models_dir: Path, hf_token: Optional[str] = None ) -> None: + if model is None: + raise ValueError("'download' command needs a model name or alias.") model_config = resolve_model_config(model) model_dir = models_dir / model_config.name @@ -234,4 +237,8 @@ def where_main(args) -> None: # Subcommand to download model artifacts. 
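Note: the checkpoint-conversion change above now merges every `*.index.json` it finds, keeping only entries whose target file actually exists in the model directory, which covers both split indices and mixed `.bin`/`.safetensors` layouts. A condensed sketch of that merge:

```python
import json
from pathlib import Path

def merged_weight_map(model_dir: Path) -> dict:
    """Union of all *.index.json weight maps, restricted to files present on disk."""
    merged = {}
    for index_file in model_dir.glob("*.index.json"):
        weight_map = json.loads(index_file.read_text()).get("weight_map", {})
        merged.update(
            {name: model_dir / fname
             for name, fname in weight_map.items()
             if (model_dir / fname).is_file()}
        )
    return merged
```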
def download_main(args) -> None: - download_and_convert(args.model, args.model_directory, args.hf_token) + try: + download_and_convert(args.model, args.model_directory, args.hf_token) + except ValueError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/torchchat/distributed/checkpoint.py b/torchchat/distributed/checkpoint.py index 1830e3a75..11e397469 100644 --- a/torchchat/distributed/checkpoint.py +++ b/torchchat/distributed/checkpoint.py @@ -96,6 +96,7 @@ def _load_checkpoints_from_storage( checkpoint_path, map_location=builder_args.device, mmap=True, + weights_only=False, ) diff --git a/torchchat/distributed/checkpoint_utils.py b/torchchat/distributed/checkpoint_utils.py index cf3206e4e..806855c4b 100644 --- a/torchchat/distributed/checkpoint_utils.py +++ b/torchchat/distributed/checkpoint_utils.py @@ -17,6 +17,7 @@ from torch.distributed._tensor import DTensor from torchchat.distributed.dtensor_utils import convert_to_dtensor from torchchat.cli.builder import BuilderArgs, _load_checkpoint +from torchchat.model import ModelArgs _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json" @@ -450,3 +451,34 @@ def load_weights_from_torchchat_format(stage_module, distribution, device, model # Fill state dict into stage module stage_module.load_state_dict(stage_state_dict, strict=False, assign=True) logger.info(f"Successfully loaded {len(updated_states)} weights into stage module") + + +def load_model_weights( + stage_module: torch.nn.Module, + distribution: str, + device: torch.device, + model_config: ModelArgs, + chpt_from: str, +): + """Load the weights from the safetensor file(s) into the model stage. + Model config is needed b/c we permute wq and wk weights based on attn heads. + + Args: + stage_module (torch.nn.Module): The model stage to load the weights into. + distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". + device (torch.device): The device to load the weights onto. + model_config (ModelArgs): The model config. + chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". + """ + if chpt_from == "hf": + # This format stands for: index file + multiple binary files + load_weights_from_hf_format(stage_module, distribution, device, model_config) + elif chpt_from == "torchchat": + # This format stands for: + # single binary file, OR + # multiple binary files without index files. + load_weights_from_torchchat_format( + stage_module, distribution, device, model_config + ) + else: + raise ValueError(f"Unknown checkpoint format: {chpt_from}") diff --git a/torchchat/distributed/dist_run.py b/torchchat/distributed/dist_run.py deleted file mode 100644 index 389ae41c1..000000000 --- a/torchchat/distributed/dist_run.py +++ /dev/null @@ -1,629 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -# Example run command: -# torchrun --nproc-per-node 4 dist_run.py llama2-7b-chat --pp 2 -# torchrun --nproc-per-node 4 dist_run.py llama3 --pp 2 - -import argparse -import os -from enum import auto, Enum -from pathlib import Path -from types import MethodType, SimpleNamespace -from typing import Any, Dict, List, Optional, Tuple - -import torch -import torch.distributed as dist -from torch.distributed.pipelining import PipelineStage, ScheduleGPipe -from torchchat.cli.builder import TokenizerArgs - -# TODO - these are not distributed specific, consider moving to new package -from torchchat.distributed.checkpoint_utils import ( - get_hf_config_file, - load_weights_from_hf_format, - load_weights_from_torchchat_format, -) - -from torchchat.distributed.logging_utils import SingletonLogger -from torchchat.distributed.utils import ( - bytes_to_readable, - Color as color, - CUDATrackTime, - get_module_size, - get_num_params, - GPUMemoryMonitor, -) -from torchchat.model import ModelArgs, Transformer, TransformerArgs -from torchchat.utils.build_utils import set_precision - -try: - from tokenizer.tiktoken import Tokenizer as TiktokenTokenizer -except ImportError: - TiktokenTokenizer = None -try: - from sentencepiece import SentencePieceProcessor -except ImportError: - SentencePieceProcessor = None - - -logger = SingletonLogger.get_logger() - -# Using model name to identify the model to load, for example "llama2-7b-chat". -# You can change it to other values listed below. -# For details on the name-to-distribution mapping, see README.md or models.json. -NAME_TO_DISTRIBUTION_AND_DTYPE = { - "llama2-7b-chat": ("meta-llama/Llama-2-7b-chat-hf", torch.float16), - "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16), - "llama3.1": ("meta-llama/Meta-Llama-3.1-8B-Instruct", torch.bfloat16), -} - - -def _init_distributed(): - dist.init_process_group("nccl") - rank = dist.get_rank() - world_size = dist.get_world_size() - # Assuming same number of GPUs per node - torch.cuda.set_device(rank % torch.cuda.device_count()) - return rank, world_size - - -def _create_device_mesh(pp_degree, tp_degree): - return dist.init_device_mesh( - "cuda", (pp_degree, tp_degree), mesh_dim_names=("pp", "tp") - ) - - -def dict_to_args(dictionary: Dict[str, Any]) -> SimpleNamespace: - return SimpleNamespace(**dictionary) - - -def _patch_tokenizer(tokenizer): - """Patch the tokenizer to support decoding of token ids.""" - if isinstance(tokenizer, TiktokenTokenizer): - # Patch tiktokenizer to allow a list of sequences. 
- # TODO: Upstream to tokenizer modules - old_decode = tokenizer.decode - - def decode( - self, token_ids: List[int | List[int]], *args, **kwargs - ) -> str | List[str]: - if len(token_ids) < 1: - return "" - if isinstance(token_ids[0], list): - return [old_decode(t, *args, **kwargs) for t in token_ids] - else: - return old_decode(token_ids, *args, **kwargs) - - tokenizer.decode = MethodType(decode, tokenizer) - return tokenizer - - -def _build_chat_tokenizer( - tokenizer_args: TokenizerArgs, -) -> SentencePieceProcessor | TiktokenTokenizer: - """Builds a tokenizer for the given model name""" - - tokenizer_args = TokenizerArgs.from_args(tokenizer_args) - tokenizer = tokenizer_args.t - assert tokenizer is not None, f"Failed to get tokenizer using {tokenconfig=}" - logger.info( - f"using tokenizer = {tokenizer.__class__.__module__}.{tokenizer.__class__.__name__}" - ) - - tokenizer = _patch_tokenizer(tokenizer) - - return tokenizer - - -def _load_model_weights( - stage_module: torch.nn.Module, - distribution: str, - device: torch.device, - model_config: ModelArgs, - chpt_from: str, -): - """Load the weights from the safetensor file(s) into the model stage. - Model config is needed b/c we permute wq and wk weights based on attn heads. - - Args: - stage_module (torch.nn.Module): The model stage to load the weights into. - distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". - device (torch.device): The device to load the weights onto. - model_config (ModelArgs): The model config. - chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". - """ - if chpt_from == "hf": - # This format stands for: index file + multiple binary files - load_weights_from_hf_format(stage_module, distribution, device, model_config) - elif chpt_from == "torchchat": - # This format stands for: - # single binary file, OR - # multiple binary files without index files. - load_weights_from_torchchat_format( - stage_module, distribution, device, model_config - ) - else: - raise ValueError(f"Unknown checkpoint format: {chpt_from}") - - -def _encode_strings( - strings: List[str], - tokenizer, - bos: bool, - device: torch.device, - dtype=torch.int64, -) -> List[torch.Tensor]: - """Encode a list of prompt strings into a list of tensor token ids.""" - encoded_list = [] - for string in strings: - tokens = tokenizer.encode(string) - if bos: - tokens = [tokenizer.bos_id()] + tokens - encoded_list.append(torch.tensor(tokens, dtype=dtype, device=device)) - return encoded_list - - -def _create_padded_prompts( - input_ids_list: List[torch.Tensor], - tokenizer, - seqlen: int, - start_pos: int, - device: torch.device, - pad_token_id: Optional[int] = None, -) -> Tuple[torch.Tensor, List[int]]: - """ - Create a padded tensor for multiple encoded input prompts. - - Returns: - Tuple[torch.Tensor, List[int]]: A tuple containing the padded tensor and a list of prompt lengths. 
- """ - pad_token_id = pad_token_id if pad_token_id is not None else tokenizer.eos_id() - - # Find the maximum prompt length - max_prompt_len = max(ids.size(0) for ids in input_ids_list) - - # Calculate the buffer size - max_new_tokens = max(0, min(seqlen - start_pos, seqlen - max_prompt_len)) - token_buffer_size = max_prompt_len + max_new_tokens - - # Create the padded batch tensor - batch_size = len(input_ids_list) - batch_seq = torch.full( - (batch_size, token_buffer_size), pad_token_id, dtype=torch.int64, device=device - ) - - prompt_lengths = [] - for i, input_ids in enumerate(input_ids_list): - prompt_len = input_ids.size(0) - batch_seq[i, :prompt_len] = input_ids - prompt_lengths.append(prompt_len) - - return batch_seq, prompt_lengths - - -def _batch_decode_next_tokens( - output: torch.Tensor, - pos: List[int] = None, - temperature: float = 1.0, - topk: int = 10, -) -> torch.Tensor: - """ - Decode the next token for each prompt in the batch. Adds temperature option for non-deterministic decoding. - - Args: - output (torch.Tensor): The output tensor to decode. - pos (List[int]): The positions of the `output` to decode in the sequence length dimension. - step (int): Step indicator. If -1, use positions from `pos`. Otherwise, use the first token. - temperature (float): Sampling temperature for non-deterministic decoding. - - Returns: - torch.Tensor: Decoded token ids. - """ - batch_size, seq_len, vocab_size = output.shape - - if pos is None: - # `pos` is not provided, so we can use the first token - next_token_logits = output[:, 0, :] - else: - # get the logits for each prompt at the specified positions - next_token_logits = output[torch.arange(batch_size), torch.tensor(pos) - 1] - - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - # Uses top-k sampling if temperature is not 1.0, otherwise use argmax - if temperature != 1.0: - top_k = min(topk, vocab_size) # Ensure top-k is not greater than vocab size - top_k_logits, top_k_indices = torch.topk(next_token_logits, k=top_k, dim=-1) - probs = torch.softmax(top_k_logits, dim=-1) - next_token_indices = torch.multinomial(probs, num_samples=1).squeeze(-1) - next_tokens = top_k_indices.gather( - -1, next_token_indices.unsqueeze(-1) - ).squeeze(-1) - else: - # Argmax (deterministic) - next_tokens = torch.argmax(next_token_logits, dim=-1, keepdim=True) - - # Token ids in int tensor form - return next_tokens - - -def _update_padded_sequence( - padded_sequence: torch.Tensor, - new_token: torch.Tensor, - prompt_lengths: List[int], -) -> None: - for i in range(len(prompt_lengths)): - padded_sequence[i, prompt_lengths[i]] = new_token[i, 0] - # logger.info(f"updated prompt {i} with new token {new_token[i, 0]}") - - -# Decode token id into string and print it -def _decode_in_flight(token, tokenizer, tp_rank): - """decode token ids for all prompts in the batch and log them""" - # `token` is a tensor of shape (batch_size, 1). - # For TiktokenTokenizer, we need to squeeze it to 1D. - # For SentencePieceProcessor, we don't. 
- token_str = tokenizer.decode(token.tolist()) - # print the token string on tp rank 0 - if tp_rank == 0: - logger.info( - f"{color.green} responses ====>>>> " - f"{color.blue} {token_str} {color.reset}" - ) - return token_str - - -def _cleanup(): - dist.barrier() - dist.destroy_process_group() - - -prompts = [ - "What is Snow?", - # "Can you explain what is the purpose of back propagation in neural networks?", - "Who is Santa Claus?", - "Where does Santa live?", - "Who is Abraham Lincoln?", - # "How are models trained?", -] - - -def main( - model_name, - builder_args, - tokenizer_args, - pipe, -): - pp_degree = builder_args.pp - - rank, world_size = _init_distributed() - logger.info(f"Worker started: {rank=}, {world_size=}") - - gpu_memory_monitor = GPUMemoryMonitor("cuda") - logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}") - - distribution, model_dtype = NAME_TO_DISTRIBUTION_AND_DTYPE[model_name] - logger.info(f"Using model weights from {distribution} and dtype {model_dtype}") - - # Model-level config - model_config = ModelArgs.from_name(distribution) - # Transformer-level config - config = TransformerArgs.from_params(model_config.transformer_args["text"]) - logger.info(f"Transformer Config: {config}") - - tokenizer = _build_chat_tokenizer(tokenizer_args) - - set_precision(model_dtype) - logger.info(f"Using cache precision {model_dtype}") - - hf_config = get_hf_config_file(distribution) - if hf_config is None: - raise ValueError(f"Config file not found for model id {distribution}") - - # Validate pipeline degree - assert world_size % pp_degree == 0 - assert config.n_layers % pp_degree == 0 - - # Tensor parallel is enabled in this program - tp_degree = world_size // pp_degree - - # Create device mesh - device_mesh = _create_device_mesh(pp_degree, tp_degree) - tp_mesh = device_mesh["tp"] - pp_mesh = device_mesh["pp"] - logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") - - tp_rank = tp_mesh.get_local_rank() - pp_rank = pp_mesh.get_local_rank() - tp_group = tp_mesh.get_group() - pp_group = pp_mesh.get_group() - logger.info(f"{pp_degree=}, {tp_degree=}") - - # Convenience variables - first_pp_rank = 0 - last_pp_rank = pp_degree - 1 - - # Assuming same number of GPUs per node - device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") - - # Fill in PP configs - config.stage_idx = pp_rank - config.n_stages = pp_degree - - with torch.device("meta"): - # TODO: we should create model instead of Transformer - model = Transformer(config) - - # Distribute model on TP mesh - # (Surprisingly, this works even though model is on meta device and mesh is of - # cuda devices) - model.distribute(tp_mesh) - if rank == 0: - logger.info(f"Model: {model}") - - # Load weights - logger.info(f"Loading weights for {pp_rank=} on {device=}") - with CUDATrackTime() as timer: - _load_model_weights(model, distribution, device, config, builder_args.chpt_from) - - logger.info( - f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Batch size. Since we push batches dynamically through the pipeline rather - # than chunking them, this is effectively micro-batch size in pipeline - # sense. Thus it is interchangeable with micro-batch size below. 
- batch_size = 1 # len(prompt) - seqlen_prefill = 1024 # sequence length - dim = 4096 # embedding dimension - - # Setup KV caches (after model distribution) - # The number of cache lanes is the same as the maximum number of - # micro-batches that can be "in flight" in parallel -- imagine each - # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. - # When decoding is done for certain micro-batches, we can reuse the KV cache - # lanes. - # TODO: bump up the lane count - pipeline_lanes = 1 - with device: - model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes) - - # info on stage size and params - stage_size = get_module_size(model) - stage_size_formatted = bytes_to_readable(stage_size) - stage_num_params = get_num_params(model) - logger.info( - f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" - ) - model.eval() - - # Helper function to get example inputs and outputs for the stages. - def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: - mb_ids = torch.randint( - 0, config.vocab_size, (batch_size, seqlen), device=device - ) - activation = torch.rand( - batch_size, seqlen, dim, device=device, dtype=model_dtype - ) - logits = torch.rand( - batch_size, seqlen, config.vocab_size, device=device, dtype=model_dtype - ) - example_inputs = (mb_ids if pp_rank == first_pp_rank else activation,) - example_outputs = (logits if pp_rank == last_pp_rank else activation,) - return example_inputs, example_outputs - - # Create prefill stage - logger.info(f"Creating pipeline stage for prefill {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_prefill) - prefill_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - - # Create schedule - # Number of micro-batches for the schedule is 1, because each step() call we - # only push 1 micro-batch into the pipeline. But we can continuously push - # new micro-batches into the pipeline as they arrive, achieving same - # pipelining effect. 
- prefiller = ScheduleGPipe(prefill_stage, 1) - - # Need these global ids due to the API definition of dist.send and recv - first_pp_rank_global_id = dist.get_global_rank(pp_group, first_pp_rank) - last_pp_rank_global_id = dist.get_global_rank(pp_group, last_pp_rank) - - pipe.send("ready") - - while True: - command = pipe.recv() - assert isinstance(command, (str, list)) - if isinstance(command, str): - if command == "stop": - break - else: - raise ValueError(f"Unknown command: {command}") - else: - prompt = command - assert ( - len(prompt) == batch_size - ), f"Expecting {batch_size=} prompts but got {len(prompt)=}" - logger.info(f"{color.green}Prompt: {prompt}{color.reset}") - - start_pos = 0 - # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen - input_pos = torch.arange(seqlen_prefill, device=device) - - # encode the prompt - input_ids = _encode_strings( - prompt, tokenizer, bos=True, device=device, dtype=torch.int64 - ) - - # create a padded tensor for the input prompt - padded_sequence, prompt_lengths = _create_padded_prompts( - input_ids, tokenizer, seqlen_prefill, start_pos, device - ) - - # New token generated each iteration - # need a row dimension for each prompt in the batch - new_token = torch.zeros(batch_size, 1, device=device, dtype=torch.int64) - # Store the generated tokens - res = [] - - # Prefill phase - # Run context input through pipeline - # TODO: we need to pass `input_pos` and `cache_lane` to each stage. - lane = 0 - kwargs = {"input_pos": input_pos, "cache_lane": lane} - with torch.no_grad(), CUDATrackTime() as timer: - if pp_rank == first_pp_rank: - output = prefiller.step(padded_sequence, **kwargs) - elif pp_rank == last_pp_rank: - output = prefiller.step(**kwargs) - else: # middle pp ranks - prefiller.step(**kwargs) - - logger.info( - f"{color.green}Prefilling time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Decode the output -- first generated token - if pp_rank == last_pp_rank: - logger.info(f"{color.green}Decoding...{prompt_lengths=}{color.reset}") - new_token = _batch_decode_next_tokens(output, prompt_lengths) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token.tolist())) - else: - pipe.send(None) - - # seqlen = 1 now - seqlen_decode = 1 - input_pos = torch.tensor([prompt_lengths[0]], device=device) - - # Create decode stage - logger.info(f"Creating pipeline stage for decode {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_decode) - decode_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - # create schedule - decoder = ScheduleGPipe(decode_stage, 1) - - # Decoding - with torch.no_grad(), CUDATrackTime() as timer: - while True: - command = pipe.recv() - assert isinstance(command, str) - if command == "stop": - break - elif command == "step": - pass - else: - raise ValueError(f"Unknown command: {command}") - - kwargs = {"input_pos": input_pos, "cache_lane": lane} - # sendrecv between last and first ranks, only if: - # first_pp_rank != last_pp_rank. 
- if pp_rank == last_pp_rank and pp_rank != first_pp_rank: - dist.send( - new_token, - dst=first_pp_rank_global_id, - group=pp_group, - ) - elif pp_rank == first_pp_rank and pp_rank != last_pp_rank: - dist.recv( - new_token, - src=last_pp_rank_global_id, - group=pp_group, - ) - - # Run data through pipeline - if pp_rank == first_pp_rank: - output = decoder.step(new_token, **kwargs) - elif pp_rank == last_pp_rank: - output = decoder.step(**kwargs) - else: # middle pp ranks - decoder.step(**kwargs) - - # Decode the output - if pp_rank == last_pp_rank: - new_token = _batch_decode_next_tokens(output) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token)) - else: - pipe.send(None) - - # Increment input position - input_pos += 1 - - logger.info( - f"{color.green}Decoding time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Display the decoding results - - # output formatted response via last pp group and tp rank 0 - if pp_rank == last_pp_rank and tp_rank == 0: - # `res` is a list of tensors, each being a batch of generated token ids. - # We need to concatenate them to get the full sequence of generated - # token ids. Thus cat'ing along dim 1. - res = torch.cat(res, dim=1) - res_list = res.tolist() - - responses = tokenizer.decode(res_list) - - # Show prompts and responses - for prompt_text, response_text in zip(prompt, responses): - logger.info(f"Prompt: {color.green}{prompt_text} {color.reset}") - logger.info(f"Response: {color.red}{response_text} {color.reset}") - - # Cleanup - _cleanup() - logger.info( - f"{color.green}Success{color.white} - {color.blue}Rank {rank} has completed.{color.reset}" - ) - -# TODO: remove or make it work again -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument( -# "model_name", -# type=str, -# default="llama3", -# help="Name of the model to load", -# choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(), -# ) -# parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel degree") -# parser.add_argument( -# "--ntokens", -# type=int, -# default=40, -# help="Number of tokens to generate", -# ) -# parser.add_argument( -# "--chpt-from", -# type=str, -# default="hf", # TODO: change to torchchat once we support it well -# help="Checkpoint format to load from", -# choices=["hf", "torchchat"], -# ) -# args = parser.parse_args() - -# main() diff --git a/torchchat/distributed/generate.py b/torchchat/distributed/generate.py deleted file mode 100644 index 51c472e4a..000000000 --- a/torchchat/distributed/generate.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-import asyncio -import atexit -import importlib.util -import subprocess -import threading -from abc import abstractmethod -from collections import deque -from dataclasses import dataclass -from functools import partial -from os import environ -from pathlib import Path -from typing import List, Optional -from uuid import uuid4 - -import torch.multiprocessing as mp -from torchchat.cli.builder import BuilderArgs, TokenizerArgs -from torchchat.distributed.dist_run import NAME_TO_DISTRIBUTION_AND_DTYPE -from torchchat.distributed.logging_utils import SingletonLogger - -logger = SingletonLogger.get_logger() - - -def _setup_env(world_size: int, rank: int, target: callable, *args, **kwargs): - environ["MASTER_ADDR"] = "localhost" - environ["MASTER_PORT"] = "29500" - environ["RDZV_BACKEND"] = "c10d" - environ["WORLD_SIZE"] = str(world_size) - environ["RANK"] = str(rank) - environ["LOCALRANK"] = str(rank) - - return target(*args, **kwargs) - - -def _launch_distributed_inference( - model_name: str, builder_args: BuilderArgs, tokenizer_args: TokenizerArgs -) -> tuple[List]: - # launch distributed inference worker, each worker gets a pipe to communicate with the main process - logger.info("Launching distributed inference ...") - - num_processes_per_node = builder_args.pp * builder_args.tp - - from torchchat.distributed.dist_run import main - - mp.set_start_method("spawn") - - pipes = [] - procs = [] - try: - for rank in range(num_processes_per_node): - server_pipe, client_pipe = mp.Pipe(duplex=True) - pipes.append(server_pipe) - procs.append( - mp.Process( - target=partial(_setup_env, num_processes_per_node, rank, main), - args=(model_name, builder_args, tokenizer_args, client_pipe), - ) - ) - procs[-1].start() - - for pipe in pipes: - assert pipe.recv() == "ready", "Starting the worker failed" - except Exception as e: - logger.error(f"Error during distributed inference: {str(e)}") - for p in procs: - p.kill() - raise e - - logger.info( - f"Done launching distributed inference on {num_processes_per_node} GPUs." 
- ) - return procs, pipes - - -@dataclass -class Output: - is_finished: bool = False - text: Optional[str] = None - token: Optional[list] = None - - -@dataclass -class Request: - request_id: int - prompt: str - - @classmethod - def new_request(cls, prompt): - return cls(request_id=uuid4().int, prompt=prompt) - - -class Scheduler(object): - def __init__( - self, - builder_args, - generator_args, - pipes, - loop, - ): - self.builder_args = builder_args - self.generator_args = generator_args - self.requests = {} - self.in_flight_requests = {} - self.in_flight_batch_order = [] - self.pipes = pipes - self.req_to_states = {} - self.req_to_results = {} - self.request_queue = mp.Queue() - self.loop = loop - - def schedule_request(self, req: Request): - # add request to queue and create deque and async event for response - self.req_to_states[req.request_id] = asyncio.Event() - self.req_to_results[req.request_id] = deque() - self.request_queue.put(req) - - def process_requests_loop(self): - # Continuously process requests (one at a time for now), results are routed into the requests deque - while True: - req = self.request_queue.get() - if req == "stop": - break - self.requests = {req.request_id: req.prompt} - - responses = {} - running = True - while running: - outputs = self.step() - self.req_to_results[req.request_id].append(outputs[0]) - - self.loop.call_soon_threadsafe(self.req_to_states[req.request_id].set) - - running &= not outputs[0].is_finished - - async def wait_for_request(self, req: Request) -> Output: - # Wait for request to deliver result, uses event to trigger and reads from left side of deque - is_finished = False - while not is_finished: - await self.req_to_states[req.request_id].wait() - while len(self.req_to_results[req.request_id]): - output = self.req_to_results[req.request_id].popleft() - is_finished |= output.is_finished - yield output - del self.req_to_states[req.request_id] - del self.req_to_results[req.request_id] - - def step(self) -> List[Output]: - # Make a prefill or decoding step and receive results - responses = [] - # TODO: Implement a scheduler to handle the requests - if len(self.in_flight_requests) > 0: - # Receive decoded token - for p in self.pipes: - p.send("step") - for p in self.pipes: - responses.append(p.recv()) - - else: - # Send requests to backend - self.in_flight_batch_order = list(self.requests.keys()) - prompts = [self.requests[k] for k in self.in_flight_batch_order] - for p in self.pipes: - p.send(prompts) - self.in_flight_requests = self.requests - self.requests = {} - self.current_step = 0 - # Receive first token - for p in self.pipes: - responses.append(p.recv()) - # Filter out None responses from in-between stages - responses = [r for r in responses if r is not None][0] - outputs = [] - for k, v in zip(self.in_flight_batch_order, zip(responses[0], responses[1])): - text, token_ids = v - outputs.append( - Output( - # TODO: Look for tokenizer.eos_id as well - is_finished=self.current_step >= self.generator_args.max_new_tokens, - text=text, - token=token_ids, - ) - ) - if self.current_step >= self.generator_args.max_new_tokens: - for p in self.pipes: - p.send("stop") - self.in_flight_requests = [] - - self.current_step += 1 - - return outputs - - -class DistributedGenerator(object): - def __init__( - self, - # TODO: switch this to torchchat method - model_name: str, - builder_args: BuilderArgs, - tokenizer_args: TokenizerArgs, - # TODO: move GeneratorArgs into a different module - generator_args, - profile: Optional[Path], - quantize: bool, - 
draft_quantize: bool, - ): - self.model_name = model_name - self.builder_args = builder_args - self.generate_args = generator_args - - self.check_args() - - self.procs, self.pipes = _launch_distributed_inference( - model_name, builder_args, tokenizer_args - ) - - self.loop = asyncio.new_event_loop() - asyncio.set_event_loop(self.loop) - - self.scheduler = Scheduler(builder_args, generator_args, self.pipes, self.loop) - - # TODO: Mode into process and use pipe or queue for comm - self.scheduler_thread = threading.Thread( - target=self.scheduler.process_requests_loop - ) - self.scheduler_thread.start() - - atexit.register(self.shutdown) - - def shutdown(self): - # Stop all processes and threads - self.scheduler.request_queue.put("stop") - self.scheduler_thread.join() - - for p in self.pipes: - p.send("stop") - for p in self.procs: - p.kill() - - def generate(self, text): - # Function to generate text from prompt - req = Request.new_request(text) - self.scheduler.schedule_request(req) - - generator = self.scheduler.wait_for_request(req) - - running = True - while running: - output = self.loop.run_until_complete(generator.__anext__()) - running &= not output.is_finished - - yield output - - def check_args(self): - if self.generate_args.chat_mode: - raise NotImplementedError( - "Currently we only support generate with --distributed" - ) - elif self.builder_args.tp < 2: - raise ValueError("TP degree must be at least 2 for distributed inference") - elif self.model_name not in NAME_TO_DISTRIBUTION_AND_DTYPE.keys(): - raise ValueError( - f"Distributed inference currently only supports then following models: {list(NAME_TO_DISTRIBUTION_AND_DTYPE.keys())}" - ) - elif self.builder_args.chpt_from == "torchchat": - raise ValueError( - f"Distributed inference currently only supports HF checkpoints" - ) diff --git a/torchchat/distributed/utils.py b/torchchat/distributed/utils.py index 46ea5d9a1..85bfe04fc 100644 --- a/torchchat/distributed/utils.py +++ b/torchchat/distributed/utils.py @@ -6,15 +6,15 @@ import itertools import os +import time from dataclasses import dataclass from datetime import timedelta -import time +from os import environ from typing import Optional import torch - from torchchat.distributed.logging_utils import SingletonLogger logger = SingletonLogger.get_logger() @@ -257,3 +257,13 @@ def get_device_info( f"with {self.device_capacity_gib:.2f}GiB memory" ) return device_info + +def run_in_dist_env(world_size: int, rank: int, target: callable): + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(rank) + environ["LOCALRANK"] = str(rank) + + return target() diff --git a/torchchat/edge/android/torchchat/app/build.gradle.kts b/torchchat/edge/android/torchchat/app/build.gradle.kts index e0c9c196b..a98a70cab 100644 --- a/torchchat/edge/android/torchchat/app/build.gradle.kts +++ b/torchchat/edge/android/torchchat/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/torchchat/export.py b/torchchat/export.py index 7c5243b68..e7cb32309 100644 
--- a/torchchat/export.py +++ b/torchchat/export.py @@ -5,13 +5,13 @@ # LICENSE file in the root directory of this source tree. import os -from typing import Optional +from typing import Dict, Optional import torch +import torch._inductor import torch.nn as nn from torch.export import Dim -import torch._inductor from torchchat.cli.builder import ( _initialize_model, @@ -28,6 +28,31 @@ default_device = "cpu" +""" +Export Snapshot +""" + + +def export_snapshot( + model: nn.Module, + device: Optional[str] = None, + output_path: str = "model-snapshot.tc", +) -> str: + """ + Export the model as snapshot. + + Args: + model: The model to be exported. + device: The device to run the model on. + output_path: The path to save the exported model. + Returns: + The path to the exported model. + """ + assert output_path.endswith(".tc"), "use .tc extension for snapshots" + torch.save(model, output_path) + return output_path + + """ Export for Server """ @@ -39,6 +64,7 @@ def export_for_server( output_path: str = "model.pt2", dynamic_shapes: bool = False, package: bool = True, + metadata: Optional[Dict[str, str]] = None, ) -> str: """ Export the model using AOT Compile to get a .dso for server use cases. @@ -67,21 +93,28 @@ def export_for_server( dynamic_shapes = None with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): - metadata = {} # TODO: put more metadata here - options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata} + options = { + "aot_inductor.package": package, + "aot_inductor.metadata": metadata or {}, + } + if not package: options = {"aot_inductor.output_path": output_path} - path = torch._export.aot_compile( + ep = torch.export.export( model, example_inputs, dynamic_shapes=dynamic_shapes, - options=options, ) if package: - from torch._inductor.package import package_aoti - path = package_aoti(output_path, path) + path = torch._inductor.aoti_compile_and_package( + ep, package_path=output_path, inductor_configs=options + ) + else: + path = torch._inductor.aot_compile( + ep.module(), example_inputs, options=options + ) print(f"The generated packaged model can be found at: {path}") return path @@ -102,13 +135,13 @@ def export_for_server( from typing import Any, Dict, Tuple, Union import executorch.exir as exir + from executorch.backends.xnnpack._passes.convert_to_linear import ( + ConvertToLinearPass, + ) from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, ) - from executorch.backends.xnnpack._passes.convert_to_linear import ( - ConvertToLinearPass, - ) from executorch.exir import EdgeProgramManager, to_edge from executorch.exir.capture._config import ( @@ -121,8 +154,7 @@ def export_for_server( ) from executorch.exir.tracer import Value - from torch._export import capture_pre_autograd_graph - from torch.export import export, ExportedProgram + from torch.export import export, export_for_training, ExportedProgram from torchchat.model import apply_rotary_emb, Attention from torchchat.utils.build_utils import get_precision @@ -166,18 +198,22 @@ def __init__(self, attention: Attention): self.wo = attention.wo - max_batch_size, n_heads, max_seq_length, head_dim = ( - attention.kv_cache[0].k_cache.shape - ) + max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[ + 0 + ].k_cache.shape cache_dtype = attention.kv_cache[0].k_cache.dtype # The `Attention` module being replaced can have multiple KV caches # (denoted by `cache_lanes`). 
Thus we follow the same setup format # as in `Attention.setup_cache`. cache_lanes = len(attention.kv_cache) - self.kv_cache = nn.ModuleList([ - CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype) - for _ in range(cache_lanes) - ]) + self.kv_cache = nn.ModuleList( + [ + CustomKVCache( + max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype + ) + for _ in range(cache_lanes) + ] + ) self.n_heads = attention.n_heads self.head_dim = attention.head_dim @@ -215,9 +251,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0): return self.wo(output) def replace_attention_with_custom_sdpa_attention(module: nn.Module): - from executorch.extension.llm.custom_ops import ( # noqa - sdpa_with_kv_cache, - ) + from executorch.extension.llm.custom_ops import custom_ops # noqa for name, child in module.named_children(): if isinstance(child, Attention): @@ -238,7 +272,9 @@ def _to_core_aten( raise ValueError( f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" ) - core_aten_ep = export(model, example_inputs, dynamic_shapes=dynamic_shapes) + core_aten_ep = export_for_training( + model, example_inputs, dynamic_shapes=dynamic_shapes + ) if verbose: logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") return core_aten_ep @@ -308,7 +344,7 @@ def export_for_et(model, device, output_path) -> str: with torch.nn.attention.sdpa_kernel( [torch.nn.attention.SDPBackend.MATH] ), torch.no_grad(): - m = capture_pre_autograd_graph(model, input, dynamic_shapes=dynamic_shapes) + m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module() edge_manager = export_to_edge( m, @@ -350,7 +386,11 @@ def main(args): print(f"Using device={builder_args.device}") set_precision(builder_args.precision) - set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path) + set_backend( + dso=args.output_dso_path, + pte=args.output_pte_path, + aoti_package=args.output_aoti_package_path, + ) builder_args.dso_path = None builder_args.pte_path = None @@ -359,6 +399,7 @@ def main(args): output_pte_path = args.output_pte_path output_dso_path = args.output_dso_path + output_snapshot_path = args.output_snapshot_path output_aoti_package_path = args.output_aoti_package_path if output_pte_path and builder_args.device != "cpu": @@ -366,12 +407,13 @@ def main(args): f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting." ) builder_args.device = "cpu" - elif "mps" in builder_args.device: + elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device: print("Warning! Device MPS not supported for export. 
Exporting for device CPU.") builder_args.device = "cpu" # TODO: clean this up # This mess is because ET does not support _weight_int4pack_mm right now + tokenizer_args = None if not builder_args.gguf_path: # tokenizer needed for quantization so get that here, try: @@ -382,9 +424,8 @@ def main(args): if builder_args.max_seq_length is None: if ( - (output_dso_path is not None or output_aoti_package_path is not None) - and not builder_args.dynamic_shapes - ): + output_dso_path is not None or output_aoti_package_path is not None + ) and not builder_args.dynamic_shapes: print("Setting max_seq_length to 300 for DSO export.") builder_args.max_seq_length = 300 elif output_pte_path is not None: @@ -397,11 +438,13 @@ def main(args): quantize, tokenizer, max_seq_length=builder_args.max_seq_length, - support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None, + support_tensor_subclass=output_dso_path is None + and output_aoti_package_path is None, ) model_to_pte = model model_to_dso = model model_to_aoti_package = model + model_to_snapshot = model else: if output_pte_path: _set_gguf_kwargs(builder_args, is_et=True, context="export") @@ -421,6 +464,15 @@ def main(args): model_to_dso = model_to_aoti_package _unset_gguf_kwargs(builder_args) + if output_snapshot_path: + _set_gguf_kwargs(builder_args, is_et=False, context="export") + model_to_snapshot = _initialize_model( + builder_args, + quantize, + support_tensor_subclass=False, + ) + _unset_gguf_kwargs(builder_args) + with torch.no_grad(): if output_pte_path: output_pte_path = str(os.path.abspath(output_pte_path)) @@ -435,7 +487,9 @@ def main(args): if output_dso_path: output_dso_path = str(os.path.abspath(output_dso_path)) print(f"Exporting model using AOT Inductor to {output_dso_path}") - print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.") + print( + "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead." + ) export_for_server( model_to_dso, builder_args.device, @@ -446,11 +500,33 @@ def main(args): if output_aoti_package_path: output_aoti_package_path = str(os.path.abspath(output_aoti_package_path)) - print(f"Exporting model using AOT Inductor to {output_aoti_package_path}") + + if tokenizer_args is None: + tokenizer_type = "0" + elif tokenizer_args.is_sentencepiece: + tokenizer_type = "2" # Corresponding to llama2 + else: + tokenizer_type = "3" # Corresponding to llama3 + + metadata = {"tokenizer_type": tokenizer_type} + print( + "Exporting model using AOT Inductor to " f"{output_aoti_package_path}." + ) export_for_server( model_to_aoti_package, builder_args.device, output_aoti_package_path, builder_args.dynamic_shapes, package=True, + metadata=metadata, + ) + + if output_snapshot_path: + output_snapshot_path = str(os.path.abspath(output_snapshot_path)) + print(f"Exporting model using Snapshot to {output_snapshot_path}") + export_snapshot( + model_to_snapshot, + builder_args.device, + output_snapshot_path, ) + diff --git a/torchchat/generate.py b/torchchat/generate.py index dd423b58a..7f37386ac 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -3,13 +3,15 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
-import argparse import base64 +import contextlib import itertools import logging import os import textwrap import time +from concurrent import futures +from functools import partial from abc import ABC, abstractmethod from dataclasses import dataclass @@ -21,6 +23,10 @@ import torch import torch._dynamo.config import torch._inductor.config +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.distributed.pipelining import PipelineStage, ScheduleGPipe +from torch._C import _SDPBackend as SDPBackend from PIL import Image @@ -28,7 +34,6 @@ from torchtune.data import Message, padded_collate_tiled_images_and_mask from torchtune.generation import sample as tune_sample -from torchtune.models.llama3 import llama3_tokenizer from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchtune.training import set_default_dtype @@ -39,19 +44,63 @@ BuilderArgs, TokenizerArgs, ) -from torchchat.distributed.generate import DistributedGenerator +from torchchat.distributed.utils import ( + Color as color, + run_in_dist_env, +) from torchchat.model import Model, ModelType from torchchat.utils.build_utils import device_sync, set_precision from torchchat.utils.device_info import get_device_info +logger = logging.getLogger(__name__) + + +# NOTE: Logging disabled by default here due to conflicts with torch._dynamo +class NoOpLogger: + def __no_op(self, *_, **__): + pass + def __getattr__(self, name): + return self.__no_op + + +logger = ( + NoOpLogger() if os.getenv("LOG_LEVEL") is None + else logging.getLogger(__name__) +) + +## Chat Formatters ############################################################# class _ChatFormatter(ABC): + + # Messages can arrive as a standard dict with "role" and "content" as + # strings, or where "content" is a list of objects with "text" fields. + MESSAGE_TYPE = Dict[str, Union[str, List[Dict[str, str]]]] + + # A dialog is a sequence of messages + DIALOG_TYPE = List[MESSAGE_TYPE] + def __init__(self, tokenizer): self.tokenizer = tokenizer @abstractmethod - def encode_dialog_prompt(self, dialog) -> List[int]: - raise NotImplementedError() + def encode_dialog_prompt( + self, + dialog: DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: + """Encode a sequence of messages into a sequence of token IDs, including + the chat template + + Args: + dialog (DIALOG_TYPE): The sequence of dialog messages to encode. + This will be the additional messages on top of those that have + already been processed. + add_generation_prompt (bool): Whether to include a generation prompt + at the end of the encoded sequence. + + Returns: + List[int]: A list of token IDs representing the encoded prompt. 
+ """ class Llama3ChatFormatter(_ChatFormatter): @@ -61,7 +110,7 @@ class Llama3ChatFormatter(_ChatFormatter): """ - def encode_header(self, role) -> List[int]: + def _encode_header(self, role) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) tokens.extend(self.tokenizer.encode(role, bos=False, eos=False)) @@ -69,8 +118,8 @@ def encode_header(self, role) -> List[int]: tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) return tokens - def encode_message(self, message) -> List[int]: - tokens = self.encode_header(message["role"]) + def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]: + tokens = self._encode_header(message["role"]) if isinstance(message["content"], str): tokens.extend( self.tokenizer.encode(message["content"], bos=False, eos=False) @@ -85,46 +134,80 @@ def encode_message(self, message) -> List[int]: tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) return tokens - def encode_dialog_prompt(self, dialog) -> List[int]: + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) for message in dialog: - tokens.extend(self.encode_message(message)) + tokens.extend(self._encode_message(message)) # Add the start of an assistant message for the model to complete. - tokens.extend(self.encode_header("assistant")) # Pass role directly as a string + if add_generation_prompt and dialog and dialog[-1]["role"] != "assistant": + tokens.extend(self._encode_header("assistant")) # Pass role directly as a string return tokens -B_INST, E_INST = "[INST]", "[/INST]" -B_SYS, E_SYS = "<>", "<>" +class Llama2ChatFormatter(_ChatFormatter): + """ + Chat formatting for Llama2 + CITE: https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + B_INST, E_INST = "[INST] ", " [/INST]" + B_SYS, E_SYS = "<>\n", "\n<>\n\n" -class Llama2ChatFormatter(_ChatFormatter): - def encode_dialog_prompt(self, dialog) -> List[int]: - tokens = self.tokenizer.encode(f"{B_INST} ") - first_message = True # Bool to handle placing the B_INST token. Behavior is weird - the system prompt should have the B_INST, but not the first user message. All following user messages *should* have it. Also, if there is no system prompt, then the user message should have it. 
+ @staticmethod + def _get_content_str(message: _ChatFormatter.MESSAGE_TYPE) -> str: + if isinstance(message["content"], list): + return message["content"][0]["text"] + return message["content"] + + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, # UNUSED + ) -> List[int]: + new_turn = True + tokens = [] for message in dialog: - if isinstance(message["content"], list): - content = message["content"][0]["text"] + if new_turn: + tokens += self.tokenizer.encode(f"{self.tokenizer.bos}{self.B_INST}") + content = self._get_content_str(message).strip() + role = message["role"] + if role == "system": + tokens += self.tokenizer.encode(f"{self.B_SYS}{content}{self.E_SYS}") + new_turn = False + elif role == "user": + tokens += self.tokenizer.encode(f"{content}{self.E_INST}") + new_turn = False + elif role == "assistant": + tokens += self.tokenizer.encode(f" {content} {self.tokenizer.eos}\n") + new_turn = True else: - content = message["content"] - content = content.strip() - if message["role"] == "system": - encoded = self.tokenizer.encode(f"{B_SYS}\n{content}\n{E_SYS}") - first_message = False - elif message["role"] == "user": - encoded = [self.tokenizer.bos_id()] + self.tokenizer.encode( - f"{B_INST if first_message else ''} {content} {E_INST} " - ) - first_message = True - elif message["role"] == "assistant": - encoded = self.tokenizer.encode(f"{content}\n\n") + [ - self.tokenizer.eos_id() - ] - tokens += encoded + raise ValueError("Invalid role in dialog.") return tokens + +class HFTokenizerChatFormatter(_ChatFormatter): + """Chat formatter that uses the built-in formatting capabilities of an HF + tokenizer instance + """ + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: + rendered = self.tokenizer.apply_chat_template( + dialog, add_generation_prompt=add_generation_prompt + ) + logger.debug("Formatted chat prompt:\n%s", rendered) + return self.tokenizer.encode(rendered) + +## Generation ################################################################## + @dataclass class GeneratorArgs: prompt: Optional[str] = ( @@ -214,7 +297,7 @@ def from_args(cls, args): ) -class Generator: +class LocalGenerator: """ Generates text samples based on a pre-trained Transformer model and tokenizer. Args: @@ -251,6 +334,7 @@ def __init__( self.draft_quantize = draft_quantize self.is_torchtune_model = generator_args.is_torchtune_model self.dtype = builder_args.precision + self.get_user_input : Callable = input self.rank: Optional[int] = None @@ -283,9 +367,13 @@ def __init__( if self.is_llama3_model: self.chat_formatter = Llama3ChatFormatter(self.tokenizer) if generator_args.chat_mode: - logging.debug( + logger.debug( "Llama3 model detected in chat mode. 
Using updated sentence schemas" ) + elif self.tokenizer_args.is_hf_tokenizer: + if not self.tokenizer.has_chat_template(): + raise ValueError("Tokenizer must have a chat template") + self.chat_formatter = HFTokenizerChatFormatter(self.tokenizer) else: self.chat_formatter = Llama2ChatFormatter(self.tokenizer) @@ -341,10 +429,12 @@ def sample( temperature: float = 0, top_k: Optional[int] = None, ): + logits = logits[0, -1] + logger.debug("Logits: %s", logits) if temperature == 0 and not need_probs: - _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + _, idx_next = torch.topk(logits, k=1, dim=-1) return (idx_next, None) - probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + probs = self.logits_to_probs(logits, temperature, top_k) idx_next = self.multinomial_sample_one_no_sync(probs) return idx_next, probs @@ -358,7 +448,7 @@ def prefill( sequential_prefill=True, **sampling_kwargs, ) -> torch.Tensor: - # logging.debug(f"x: {x}, input_pos: {input_pos}") + logger.debug("x: %s, input_pos: %s", x, input_pos) width = x.size(1) assert input_pos.size(0) == width @@ -394,14 +484,12 @@ def prefill( elif sequential_prefill: for i in range(width): x_sliced, ip_sliced = x[:, i].view(-1, 1), input_pos[i].view(-1) - # logging.debug(f" x: {x_sliced}, input_pos: {ip_sliced}") + logger.debug(" x: %s, input_pos: %s", x_sliced, ip_sliced) logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da else: # input_pos: [B, S] logits = model(x, input_pos) - # print(f"logits {logits.shape}") - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=False, **sampling_kwargs)[0] def decode_one_token( @@ -425,7 +513,6 @@ def decode_one_token( )[:, -1:] else: logits = model(x, input_pos) - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=need_probs, **sampling_kwargs) """ @@ -445,6 +532,7 @@ def decode_n_tokens( callback=lambda _: _, eos_token_id: int = 2, eot_id: Optional[int] = None, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, **sampling_kwargs, ): new_tokens, new_probs = [], [] @@ -453,7 +541,7 @@ def decode_n_tokens( num_new_tokens - 1 ): # -1 to save space to run an EoS if dont generate it naturally # Actually better for Inductor to codegen attention here - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): + with torch.nn.attention.sdpa_kernel([attention_backend]): out_token = cur_token.clone() next_token, next_prob = self.decode_one_token( @@ -591,11 +679,13 @@ def generate( Dict[str, Any] ] = None, # List of Image prompt tensors for multimodal models start_pos: int = 0, + skip_cache_setup: bool = False, draft_model: Model, speculate_k: Optional[int] = 8, sequential_prefill=True, callback=lambda x: x, max_seq_length: int, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: @@ -614,26 +704,27 @@ def generate( max_new_tokens = min(max_new_tokens, max_seq_length - start_pos - prompt_length) # set up caches only if first inference if start_pos == 0: - model = model.to(device=device) - with torch.device(device): - if ( - self.is_torchtune_model - or self.model.config.model_type == ModelType.Flamingo - ): - # 6404 is one-gpu affordable max_seq_length for single image input - model.setup_caches( - batch_size=1, - dtype=self.dtype, - encoder_max_seq_len=6404, - decoder_max_seq_len=max_seq_length, - ) - else: - model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) - if is_speculative and 
draft_model is not model: - draft_model.setup_caches( - max_batch_size=1, - max_seq_length=max_seq_length, - ) + if not skip_cache_setup: + model = model.to(device=device) + with torch.device(device): + if ( + self.is_torchtune_model + or self.model.config.model_type == ModelType.Flamingo + ): + # 6404 is one-gpu affordable max_seq_length for single image input + model.setup_caches( + batch_size=1, + dtype=self.dtype, + encoder_max_seq_len=6404, + decoder_max_seq_len=max_seq_length, + ) + else: + model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) + if is_speculative and draft_model is not model: + draft_model.setup_caches( + max_batch_size=1, + max_seq_length=max_seq_length, + ) if model.config.model_type == ModelType.Flamingo: model.reset_caches() @@ -711,6 +802,7 @@ def generate( if self.is_llama3_model else None ), + attention_backend=attention_backend, **sampling_kwargs, ): generated_tokens.append(generated_token.view(-1)) @@ -725,7 +817,8 @@ def encode_tokens(self, string, bos=True, device="cpu"): tokens = self.tokenizer.encode(string) if bos: tokens = [self.tokenizer.bos_id()] + tokens - logging.debug(f"Size after encode_tokens: {len(tokens)}") + logger.debug("Size after encode_tokens: %d", len(tokens)) + logger.debug("Token IDs: %s", tokens) return torch.tensor(tokens, dtype=torch.int, device=device) def _callback(self, x, *, buffer, done_generating): @@ -745,7 +838,6 @@ def _callback(self, x, *, buffer, done_generating): if len(buffer) == 4 or done_generating: print("".join(buffer), end="", flush=True) buffer.clear() - # print(, end='', flush=True) def _gen_model_input( self, @@ -774,7 +866,7 @@ def _gen_model_input( # Single String prompt if isinstance(prompt, str): encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device + prompt, bos=self.model.config.tokenizer_prepend_bos, device=self.builder_args.device ) # List of dialog else: @@ -783,7 +875,7 @@ def _gen_model_input( tokens, dtype=torch.int, device=self.builder_args.device ) - logging.debug(encoded) + logger.debug(encoded) return encoded, None # Llama 3.2 11B @@ -867,7 +959,7 @@ def _gen_model_input( if image_found: batch = padded_collate_tiled_images_and_mask( - [data], pad_direction="left", pad_max_images=1 + [data], pad_direction="left", pad_max_images=1, pad_max_tiles=transform.max_num_tiles ) encoded = batch.pop("tokens").to(device).view(-1) seq_len = encoded.size(0) @@ -898,7 +990,7 @@ def _gen_model_input( value=0, ) - logging.debug(encoded) + logger.debug(encoded) return encoded, batch def chat( @@ -914,14 +1006,14 @@ def chat( for p in itertools.chain(self.model.parameters(), self.model.buffers()) ] ) - if generator_args.compile: - if ( - self.is_speculative and self.builder_args.use_distributed - ): # and ("cuda" in builder_args.device): - torch._inductor.config.triton.cudagraph_trees = ( - False # Bug with cudagraph trees in this case - ) + if self.builder_args.distributed: + # During distributed inference the model gets sharded among the ranks + # So we need to all reduce the model size to get the total model size + model_size = torch.tensor(model_size, dtype=torch.int64, device=self.device) + dist.all_reduce(model_size) + model_size = model_size.item() + if generator_args.compile: if self.builder_args.device == "cpu": if generator_args.max_autotune: kwargs = {"mode": "max-autotune"} @@ -979,11 +1071,11 @@ def chat( print( f"Entering Chat Mode. 
Will continue chatting back and forth with the language model until the models max context length of {max_seq_length} tokens is hit or until the user says /bye" ) - get_system_prompt = input( + get_system_prompt = self.get_user_input( "Do you want to enter a system prompt? Enter y for yes and anything else for no. \n" ) if get_system_prompt == "y" or get_system_prompt == "Y": - self.system_prompt = input("What is your system prompt? \n") + self.system_prompt = self.get_user_input("What is your system prompt? \n") # `is_torchtune_model` is a misnomer since it doesn't capture all # torchtune models (i.e. Flamingo) @@ -1020,43 +1112,27 @@ def chat( ) for i in range(num_samples): device_sync(device=self.builder_args.device) + is_first_sample: bool = i == 0 if generator_args.chat_mode: - prompt = input("User: ") + prompt = self.get_user_input("User: ") if prompt == "/bye": print("Exiting Chat.\n") break - if not self.is_llama3_model: - if self.system_prompt: - prompt = f"{B_INST} {B_SYS}\n{self.system_prompt.strip()}\n{E_SYS}\n\n{prompt.strip()} {E_INST}" - self.system_prompt = ( - None # can only provide system prompt on first interaction - ) - else: - prompt = f"{B_INST} {prompt.strip()} {E_INST}" - encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device - ) - else: - if self.system_prompt: - encoded = self.chat_formatter.encode_dialog_prompt( - [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt}, - ] - ) - self.system_prompt = None - elif i == 0: - encoded = self.chat_formatter.encode_dialog_prompt( - [{"role": "user", "content": prompt}] - ) - else: - encoded = self.chat_formatter.encode_message( - {"role": "user", "content": prompt} - ) - encoded.extend(self.chat_formatter.encode_header("assistant")) - encoded = torch.tensor( - encoded, dtype=torch.int, device=self.builder_args.device + + # Encode the additional messages added in this dialog turn. If + # this is the first turn, that includes any system prompt. + messages_to_encode = [] + if is_first_sample and self.system_prompt: + messages_to_encode.append( + {"role": "system", "content": self.system_prompt} ) + messages_to_encode.append({"role": "user", "content": prompt}) + encoded = self.chat_formatter.encode_dialog_prompt( + messages_to_encode, add_generation_prompt=True, + ) + encoded = torch.tensor( + encoded, dtype=torch.int, device=self.builder_args.device + ) if encoded.size(0) + start_pos > max_seq_length: print( "This prompt would take us past the max_seq_length. Ending Conversation." 
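An illustrative sketch (not part of the patch) of the per-turn encoding flow the chat-mode hunk above switches to: only the messages added in the current turn go through the chat formatter, the system prompt is included solely on the first turn, and the formatter appends the assistant generation header. The formatter below is a stand-in with the same encode_dialog_prompt signature; the token IDs it returns are fake.

from typing import Dict, List

class StubFormatter:
    # Stand-in for Llama3ChatFormatter / HFTokenizerChatFormatter:
    # maps a list of {"role", "content"} messages to (fake) token IDs.
    def encode_dialog_prompt(self, dialog: List[Dict[str, str]], add_generation_prompt: bool = True) -> List[int]:
        ids: List[int] = []
        for message in dialog:
            ids.extend(range(len(message["content"])))  # fake per-character token IDs
        if add_generation_prompt:
            ids.append(-1)  # fake assistant-header token
        return ids

def encode_turn(formatter, prompt: str, system_prompt: str, is_first_sample: bool) -> List[int]:
    # Mirrors the hunk: system prompt only on the first turn, then the new user message.
    messages = []
    if is_first_sample and system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return formatter.encode_dialog_prompt(messages, add_generation_prompt=True)

formatter = StubFormatter()
first_turn = encode_turn(formatter, "hello", "be brief", is_first_sample=True)
later_turn = encode_turn(formatter, "and then?", "be brief", is_first_sample=False)
assert len(first_turn) > len(later_turn)  # the system prompt is only encoded once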
@@ -1091,11 +1167,7 @@ def callback(x, *, done_generating=False):
                 torch._inductor.config.profiler_mark_wrapper_call = True
                 torch._inductor.config.cpp.enable_kernel_profile = True
-            if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_distributed and self.rank != 0
-            ):
-                import contextlib
-
+            if i != generator_args.num_samples - 1 or not self.profile:
                 prof = contextlib.nullcontext()
             else:
                 torch.profiler._utils._init_for_cuda_graphs()
@@ -1116,8 +1188,12 @@ def callback(x, *, done_generating=False):
                     top_k=generator_args.top_k,
                     sequential_prefill=generator_args.sequential_prefill,
                     start_pos=start_pos,
+                    skip_cache_setup=not is_first_sample,
                     max_seq_length=max_seq_length,
+                    attention_backend=self.builder_args.attention_backend,
                 )
+                if generator_args.chat_mode:
+                    start_pos += encoded.size(0)
                 for token_tensor, metrics in generator_func:
                     if token_tensor is not None:
                         start_pos += token_tensor.size(0)
@@ -1125,7 +1201,7 @@ def callback(x, *, done_generating=False):
                     if metrics is not None:
                         aggregate_metrics.update(metrics)
                     yield token_tensor, metrics
-                jit_compile = (i == 0) and (
+                jit_compile = is_first_sample and (
                     generator_args.compile or generator_args.compile_prefill
                 )
                 compilation_time = time.perf_counter() - t0
@@ -1134,12 +1210,11 @@ def callback(x, *, done_generating=False):
             if hasattr(prof, "export_chrome_trace"):
                 if self.builder_args.device == "cpu":
                     print(prof.key_averages().table(sort_by="self_cpu_time_total"))
-                else:
+                elif self.builder_args.device == "cuda":
                     print(prof.key_averages().table(sort_by="self_cuda_time_total"))
-                if self.builder_args.use_distributed:
-                    prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                 else:
-                    prof.export_chrome_trace(f"{self.profile}.json")
+                    print(prof.key_averages().table(sort_by="self_xpu_time_total"))
+                prof.export_chrome_trace(f"{self.profile}.json")
             if start_pos >= max_seq_length:
                 print(
@@ -1157,9 +1232,11 @@ def callback(x, *, done_generating=False):
                 print(
                     f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds"
                 )
-            aggregate_metrics["tokens_per_sec"].append(tokens_sec)
-            aggregate_metrics["first_token_per_sec"].append(first_token_sec)
-            aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec)
+            else:
+                # aggregate_metrics are not appended for the jit_compile sample, since including its timings would skew the averages.
+ aggregate_metrics["tokens_per_sec"].append(tokens_sec) + aggregate_metrics["first_token_per_sec"].append(first_token_sec) + aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec) logging.info( f"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ @@ -1197,31 +1274,348 @@ def callback(x, *, done_generating=False): f"Mean Accepted: {sum([idx * i for idx, i in enumerate(counts_aggregated)])/sum(counts_aggregated)}" ) - print( - f"\n Average tokens/sec (total): {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f} \ - \nAverage tokens/sec (first token): {torch.mean(torch.tensor(aggregate_metrics['first_token_per_sec'])).item():.2f} \ - \nAverage tokens/sec (next tokens): {torch.mean(torch.tensor(aggregate_metrics['next_tokens_per_sec'])).item():.2f} \n\ + avg_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["tokens_per_sec"]) + ).item() + avg_first_token_sec = torch.mean( + torch.tensor(aggregate_metrics["first_token_per_sec"]) + ).item() + avg_next_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["next_tokens_per_sec"]) + ).item() + + if not ( + torch.isnan(torch.tensor(avg_tokens_sec)) + or torch.isnan(torch.tensor(avg_first_token_sec)) + or torch.isnan(torch.tensor(avg_next_tokens_sec)) + ): + print( + f"\nWarning: Excluding compile in calculations \ + \n Average tokens/sec (total): {avg_tokens_sec:.2f} \ + \nAverage tokens/sec (first token): {avg_first_token_sec:.2f} \ + \nAverage tokens/sec (next tokens): {avg_next_tokens_sec:.2f} \n\ " - ) + ) if torch.cuda.is_available(): print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + if torch.xpu.is_available(): + print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB") -def _launch_distributed_inference( - builder_args: BuilderArgs, -): - from torch.distributed import launcher - from torch.distributed.elastic.utils.distributed import get_free_port - print("Launching distributed inference within generator") +class DistributedGenerator(LocalGenerator): + def __init__( + self, + builder_args: BuilderArgs, + speculative_builder_args: BuilderArgs, + tokenizer_args: TokenizerArgs, + generator_args: GeneratorArgs, + profile: Optional[Path], + quantize: bool, + draft_quantize: bool, + ): + + is_speculative = speculative_builder_args.checkpoint_path is not None + assert is_speculative == False, "Distributed inference with pp > 1 does not support speculative inference yet." 
+ super().__init__( + builder_args, + speculative_builder_args, + tokenizer_args, + generator_args, + profile, + quantize, + draft_quantize, + ) + self.rank = dist.get_rank() + # Assuming same number of GPUs per node + self.device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + def distributed_input(prompt: str) -> str: + if dist.get_rank() == 0: + text = [input(prompt)] + else: + text = [None] + + dist.broadcast_object_list(text) + return text[0] -def main(args): + self.get_user_input: Callable = distributed_input + + if builder_args.pp > 1: + self.seqlen_prefill = 1024 # sequence length for prefill stage + + logger.warn(f"{color.yellow}Pipeline parallelism is still experimental and might be slow{color.reset}") + pp_mesh = self.model.device_mesh["pp"] + + self.pp_rank = pp_mesh.get_local_rank() + self.pp_group = pp_mesh.get_group() + + self.pp_degree = pp_mesh.size() + + # Convenience variables + self.first_pp_rank = 0 + self.last_pp_rank = self.pp_degree - 1 + + + self.first_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.first_pp_rank) + self.last_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.last_pp_rank) + + self.prefiller = self.create_prefill_stage() + self.decoder = self.create_decode_stage() + + def __del__(self): + dist.destroy_process_group() + + # Helper function to get example inputs and outputs for the stages. + def get_example_ins_outs(self, batch_size: int , seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + This function generates example inputs and outputs for the prefill and decode stages. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing the example inputs and outputs. + """ + model_dtype = torch.bfloat16 + mb_ids = torch.randint( + 0, self.model.config.vocab_size, (batch_size, seqlen), device=self.device + ) + activation = torch.rand( + batch_size, seqlen, self.model.config.dim, device=self.device, dtype=model_dtype + ) + logits = torch.rand( + batch_size, seqlen, self.model.config.vocab_size, device=self.device, dtype=model_dtype + ) + example_inputs = (mb_ids if self.pp_rank == self.first_pp_rank else activation,) + example_outputs = (logits if self.pp_rank == self.last_pp_rank else activation,) + return example_inputs, example_outputs + + def create_prefill_stage(self): + """ + Creates a pipeline stage for prefilling. + + Returns: + PipelineStage: The created pipeline stage. + """ + batch_size = 1 + + # Create prefill stage + logger.debug(f"Creating pipeline stage for prefill {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, self.seqlen_prefill) + prefill_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + + # Create schedule + # Number of micro-batches for the schedule is 1, because each step() call we + # only push 1 micro-batch into the pipeline. But we can continuously push + # new micro-batches into the pipeline as they arrive, achieving same + # pipelining effect. + prefiller = ScheduleGPipe(prefill_stage, 1) + return prefiller + + def create_decode_stage(self): + """ + Creates a decode stage for the pipeline parallelism. + + Returns: + ScheduleGPipe: The decode stage. 
+ """ + # seqlen = 1 now + seqlen_decode = 1 + batch_size = 1 + + # Create decode stage + # logger.info(f"Creating pipeline stage for decode {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, seqlen_decode) + decode_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + # create schedule + decoder = ScheduleGPipe(decode_stage, 1) + + return decoder + + def prefill( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + *, + sequential_prefill=True, + **sampling_kwargs, + ) -> torch.Tensor: + """ + This function is used to prefill the model with a given prompt. For pipeline parallelism we need to pad the input. + + Returns: + torch.Tensor: The prefilled tensor. + """ + if self.builder_args.pp == 1: + return super().prefill( + model, + x, + input_pos, + batch, + sequential_prefill=sequential_prefill, + **sampling_kwargs, + ) + + pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.eos_id + prompt_length = x.size(1) + + padded_seq = torch.full( + (1, self.seqlen_prefill), pad_token_id, dtype=torch.int64, device=self.device + ) + padded_seq[:,:prompt_length] = x + input_pos = torch.arange( + self.seqlen_prefill, + device=self.device, + dtype=torch.int, + ) + + # Prefill phase + # Run context input through pipeline + # TODO: we need to pass `input_pos` and `cache_lane` to each stage. + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + + if self.pp_rank == self.first_pp_rank: + logits = self.prefiller.step(padded_seq, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.prefiller.step(**kwargs) + else: # middle pp ranks + self.prefiller.step(**kwargs) + + if self.pp_rank == self.last_pp_rank: + new_token = self.sample(logits[:,:prompt_length], need_probs=False, **sampling_kwargs)[0] + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + + return new_token + + def decode_one_token( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + need_probs: bool, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + **sampling_kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Decodes a single token. + + # TODO: implement speculative decoding with pp>1 + Returns: + Tuple[torch.Tensor, None]: A tuple containing the decoded token and None. 
+ """ + if self.builder_args.pp == 1: + return super().decode_one_token( + model, + x, + input_pos, + need_probs, + batch=batch, + **sampling_kwargs, + ) + + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + + new_token = x.view(1, -1) + + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + # Run data through pipeline + if self.pp_rank == self.first_pp_rank: + logits = self.decoder.step(new_token, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.decoder.step(**kwargs) + else: # middle pp ranks + self.decoder.step(**kwargs) + + # Decode the output + if self.pp_rank == self.last_pp_rank: + new_token, _ = self.sample(logits, need_probs=need_probs, **sampling_kwargs) + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + #TODO: Why do we get 2d tensor here? + new_token=new_token[0] + return new_token, None + + def sample( + self, + logits, + need_probs: bool, + temperature: float = 0, + top_k: Optional[int] = None, + ): + if temperature == 0 and not need_probs: + _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + return (idx_next, None) + probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + idx_next = self.multinomial_sample_one_no_sync(probs) + + return idx_next, probs + + +def run_generator( + args, + rank: Optional[int] =None + ): + """ + This function creates and executes a generator + """ builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) - generator_args = GeneratorArgs.from_args(args) - if not builder_args.distributed: + generator_args = GeneratorArgs.from_args(args) + #Setup rank 1 and up to suppress log messages and print messages + if builder_args.distributed and rank != 0: + logger.setLevel(logging.CRITICAL) + context = contextlib.redirect_stdout(None) + else: + context = contextlib.nullcontext() + + with context: + Generator = DistributedGenerator if builder_args.distributed else LocalGenerator + logger.debug("GeneratorArgs: %s", generator_args) gen = Generator( builder_args, speculative_builder_args, @@ -1233,23 +1627,25 @@ def main(args): ) if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats() + if torch.xpu.is_available(): + torch.xpu.reset_peak_memory_stats() for _ in gen.chat(generator_args): pass - else: - dist_gen = DistributedGenerator( - args.model, - builder_args, - tokenizer_args, - generator_args, - args.profile, - args.quantize, - args.draft_quantize, - ) - - response = "" - for output in dist_gen.generate(generator_args.prompt): - response += output.text - print(f"Model output: {response}") - dist_gen.shutdown() +def main(args): + builder_args = BuilderArgs.from_args(args) + + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + + ctx = mp.get_context('spawn') + with futures.ProcessPoolExecutor(max_workers=world_size-1, mp_context=ctx) as executor: + for i in range(1,world_size): + fn = partial(run_generator, args, i) + executor.submit(run_in_dist_env, world_size, i, fn) + #Starting rank 0 + fn = partial(run_generator, args, 0) + run_in_dist_env(world_size, 0, fn) + else: + run_generator(args) diff --git a/torchchat/model.py b/torchchat/model.py index 2a3b9f12f..ce7dcb5e4 100644 --- 
a/torchchat/model.py +++ b/torchchat/model.py @@ -4,6 +4,7 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import json +import logging import os import warnings from abc import ABC, abstractmethod @@ -48,6 +49,8 @@ config_path = Path(f"{str(Path(__file__).parent)}/model_params") +logger = logging.getLogger(__name__) + class QuickGELUActivation(nn.Module): """ @@ -273,6 +276,7 @@ class TransformerArgs: # Select the desired tokenizer. Defaults to sentencepiece use_tiktoken: bool = False use_hf_tokenizer: bool = False + tokenizer_prepend_bos: bool = True max_seq_length: int = 8192 rope_scaling: Optional[Dict[str, Any]] = None # For pipeline parallel @@ -283,6 +287,11 @@ class TransformerArgs: feed_forward_bias: bool = False # Whether or not to tie the input word embeddings to the output tie_word_embeddings: bool = False + # Granite architecture multipliers + embedding_multiplier: Optional[float] = None + attention_multiplier: Optional[float] = None + residual_multiplier: Optional[float] = None + logits_scaling: Optional[float] = None def __post_init__(self): if self.n_local_heads == -1: @@ -330,6 +339,7 @@ class ModelArgs: transformer_args: Dict[str, Dict[str, Any]] use_tiktoken: bool use_hf_tokenizer: bool + tokenizer_prepend_bos: bool def __init__( self, @@ -337,6 +347,7 @@ def __init__( model_type: ModelType = ModelType.TextOnly, use_tiktoken: bool = False, use_hf_tokenizer: bool = False, + tokenizer_prepend_bos: bool = True, ) -> None: self._sanity_check(transformer_args, model_type) @@ -346,6 +357,7 @@ def __init__( # Model-level attributes self.use_tiktoken = use_tiktoken self.use_hf_tokenizer = use_hf_tokenizer + self.tokenizer_prepend_bos = tokenizer_prepend_bos def _sanity_check( self, @@ -373,7 +385,14 @@ def from_params(cls, params_path): use_tiktoken = loaded_params.get("use_tiktoken", False) use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False) - return cls(transformer_args, model_type, use_tiktoken, use_hf_tokenizer) + tokenizer_prepend_bos = loaded_params.get("tokenizer_prepend_bos", True) + return cls( + transformer_args=transformer_args, + model_type=model_type, + use_tiktoken=use_tiktoken, + use_hf_tokenizer=use_hf_tokenizer, + tokenizer_prepend_bos=tokenizer_prepend_bos, + ) @classmethod def from_table(cls, name: str): @@ -477,7 +496,9 @@ def build_model(self) -> nn.Module: for name, module_class in recipe.modules.items(): config_args = self.config.transformer_args[name] if module_class == Transformer: - modules[name] = module_class(TransformerArgs.from_params(config_args)) + transformer_args = TransformerArgs.from_params(config_args) + logger.debug("Transformer Args: %s", transformer_args) + modules[name] = module_class(transformer_args) else: modules[name] = module_class(**config_args) @@ -636,7 +657,7 @@ def __init__(self, config: TransformerArgs) -> None: self.layers[str(layer_id)] = TransformerBlock(config) if config.stage_idx == config.n_stages - 1: - self.norm = RMSNorm(config.dim, eps=config.norm_eps) + self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps) self.output = nn.Linear(config.dim, config.vocab_size, bias=False) if config.tie_word_embeddings: self.output.weight = self.tok_embeddings.weight @@ -707,6 +728,10 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int if self.tok_embeddings: x = self.tok_embeddings(x) + # For Granite architectures + if self.config.embedding_multiplier: + x = x * self.config.embedding_multiplier + for _, layer in 
self.layers.items(): x = layer(x, input_pos, freqs_cis, mask, cache_lane=cache_lane) @@ -714,6 +739,9 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int x = self.norm(x) if self.output: x = self.output(x) + # For granite architectures + if self.config.logits_scaling: + x = x / self.config.logits_scaling # print(f"output shape: {x.shape}") return x @@ -723,8 +751,14 @@ def __init__(self, config: TransformerArgs) -> None: super().__init__() self.attention = Attention(config) self.feed_forward = FeedForward(config) - self.ffn_norm = RMSNorm(config.dim, config.norm_eps) - self.attention_norm = RMSNorm(config.dim, config.norm_eps) + self.ffn_norm = nn.RMSNorm(config.dim, config.norm_eps) + self.attention_norm = nn.RMSNorm(config.dim, config.norm_eps) + # None for llama architecture, set for granite architectures + self.residual_multiplier = ( + config.residual_multiplier + if config.residual_multiplier is not None + else 1.0 + ) def distribute(self, device_mesh: DeviceMesh): self.attention.distribute(device_mesh) @@ -735,8 +769,8 @@ def forward( ) -> Tensor: h = x + self.attention( self.attention_norm(x), freqs_cis, mask, input_pos, cache_lane=cache_lane - ) - out = h + self.feed_forward(self.ffn_norm(h)) + ) * self.residual_multiplier + out = h + self.feed_forward(self.ffn_norm(h)) * self.residual_multiplier return out @@ -763,6 +797,7 @@ def __init__(self, config: TransformerArgs): self.head_dim = config.head_dim self.n_local_heads = config.n_local_heads self.dim = config.dim + self.attention_scale = config.attention_multiplier self._register_load_state_dict_pre_hook(self.load_hook) def setup_cache(self, max_batch_size, max_seq_length, cache_lanes: int = 1): @@ -859,7 +894,16 @@ def forward( k = k.repeat_interleave(self.n_heads // self.n_local_heads, dim=1) v = v.repeat_interleave(self.n_heads // self.n_local_heads, dim=1) - y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + y = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=mask, + dropout_p=0.0, + # This is None (default) for llama architecture and set for granite + # architectures + scale=self.attention_scale, + ) # -1 = self.dim y = y.transpose(1, 2).contiguous().view(bsz, seqlen, -1) @@ -884,20 +928,6 @@ def forward(self, x: Tensor) -> Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) -class RMSNorm(nn.Module): - def __init__(self, dim: int, eps: float = 1e-5): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) - - def forward(self, x: Tensor) -> Tensor: - output = self._norm(x.float()).type_as(x) - return output * self.weight - - def apply_scaling(freqs: torch.Tensor, rope_scaling: Dict[str, Any]): # Check for the presence of the required keys required_keys = { @@ -981,7 +1011,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: # For quantized_decomposed ops from executorch.kernels import quantized # no-qa # For llama::sdpa_with_kv_cache.out, preprocess ops - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa + from executorch.extension.llm.custom_ops import custom_ops # no-qa class PTEModel(nn.Module): def __init__(self, config, path) -> None: @@ -1018,5 +1048,6 @@ def forward(self, x, input_pos): def setup_caches(self, max_batch_size, max_seq_length): pass -except: +except Exception as e: + print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}") pass diff 
--git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index 2d3dfcbeb..3c2161b9b 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -51,6 +51,12 @@ "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "transformer_params_key": "Meta-Llama-3.1-8B" }, + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { + "aliases": ["deepseek-r1:8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "tokenizer_file": "tokenizer.json" + }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "aliases": ["llama3.1-70b"], "distribution_channel": "HuggingFaceSnapshot", @@ -164,5 +170,47 @@ "https://github.com/karpathy/llama2.c/raw/master/tokenizer.model" ], "checkpoint_file": "stories110M.pt" + }, + "ibm-granite/granite-3b-code-instruct-128k": { + "aliases": ["granite-code", "granite-code-3b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3b-code-instruct-128k", + "transformer_params_key": "Granite-3B-Code", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-8b-code-instruct-128k": { + "aliases": ["granite-code-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-8b-code-instruct-128k", + "transformer_params_key": "Granite-8B-Code", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-2b-instruct": { + "aliases": ["granite3-2b", "granite3"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-2b-instruct", + "transformer_params_key": "Granite-3.0-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-8b-instruct": { + "aliases": ["granite3-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-8b-instruct", + "transformer_params_key": "Granite-3.0-8B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-2b-instruct": { + "aliases": ["granite3.1-2b", "granite3.1"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-2b-instruct", + "transformer_params_key": "Granite-3.1-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-8b-instruct": { + "aliases": ["granite3.1-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-8b-instruct", + "transformer_params_key": "Granite-3.1-8B-Instruct", + "tokenizer_file": "tokenizer.json" } } diff --git a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json new file mode 100644 index 000000000..b9fa79cd2 --- /dev/null +++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json @@ -0,0 +1 @@ +{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}} diff --git a/torchchat/model_params/Granite-3.0-2B-Instruct.json b/torchchat/model_params/Granite-3.0-2B-Instruct.json new file mode 100644 index 000000000..1e9779cb3 --- /dev/null +++ b/torchchat/model_params/Granite-3.0-2B-Instruct.json @@ -0,0 +1,21 @@ +{ + "block_size": 8192, + "dim": 2048, + "hidden_dim": 8192, + "n_heads": 32, + 
"n_local_heads": 8, + "n_layers": 40, + "rope_base": 10000, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": false, + "feed_forward_bias": false, + "tie_word_embeddings": true, + "embedding_multiplier": 12.0, + "attention_multiplier": 0.015625, + "residual_multiplier": 0.22, + "logits_scaling": 8.0 +} diff --git a/torchchat/model_params/Granite-3.0-8B-Instruct.json b/torchchat/model_params/Granite-3.0-8B-Instruct.json new file mode 100644 index 000000000..35db0f90d --- /dev/null +++ b/torchchat/model_params/Granite-3.0-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-2B-Instruct.json b/torchchat/model_params/Granite-3.1-2B-Instruct.json new file mode 100644 index 000000000..1e82036ab --- /dev/null +++ b/torchchat/model_params/Granite-3.1-2B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "dim": 2048, + "block_size": 8192, + "hidden_dim": 8192, + "logits_scaling": 8.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 5000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-8B-Instruct.json b/torchchat/model_params/Granite-3.1-8B-Instruct.json new file mode 100644 index 000000000..646340580 --- /dev/null +++ b/torchchat/model_params/Granite-3.1-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3B-Code.json b/torchchat/model_params/Granite-3B-Code.json new file mode 100644 index 000000000..0654a8f2c --- /dev/null +++ b/torchchat/model_params/Granite-3B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 2560, + "hidden_dim": 10240, + "n_heads": 32, + "n_local_heads": 32, + "n_layers": 32, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file diff --git a/torchchat/model_params/Granite-8B-Code.json b/torchchat/model_params/Granite-8B-Code.json new file mode 100644 index 000000000..079a32070 --- /dev/null +++ b/torchchat/model_params/Granite-8B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 4096, + "hidden_dim": 14336, + "n_heads": 32, + "n_local_heads": 8, + "n_layers": 
36, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file diff --git a/torchchat/quant_config/cuda-32.json b/torchchat/quant_config/cuda-32.json new file mode 100644 index 000000000..90c37250a --- /dev/null +++ b/torchchat/quant_config/cuda-32.json @@ -0,0 +1,5 @@ +{ + "executor": {"accelerator": "cuda"}, + "precision": {"dtype": "bf16"}, + "linear:int4": {"groupsize" : 32} +} diff --git a/torchchat/quant_config/mobile-32.json b/torchchat/quant_config/mobile-32.json new file mode 100644 index 000000000..3afaa7542 --- /dev/null +++ b/torchchat/quant_config/mobile-32.json @@ -0,0 +1,4 @@ +{ + "embedding": {"bitwidth": 4, "groupsize" : 32}, + "linear:a8w4dq": {"groupsize" : 32} +} diff --git a/torchchat/usages/openai_api.py b/torchchat/usages/openai_api.py index 99fd82fe8..0d1d3dce7 100644 --- a/torchchat/usages/openai_api.py +++ b/torchchat/usages/openai_api.py @@ -13,7 +13,7 @@ from dataclasses import dataclass from io import BytesIO from pwd import getpwuid -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Type import torch @@ -24,7 +24,7 @@ from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchchat.cli.download import is_model_downloaded, load_model_configs -from torchchat.generate import Generator, GeneratorArgs +from torchchat.generate import LocalGenerator, DistributedGenerator, GeneratorArgs from torchchat.model import FlamingoModel from torchchat.utils.build_utils import device_sync @@ -180,7 +180,10 @@ class CompletionRequest: user: Optional[str] = None # unimplemented def __post_init__(self): - self.stream = bool(self.stream) + if isinstance(self.stream, str): + self.stream = self.stream.lower() != "false" + else: + self.stream = bool(self.stream) @dataclass @@ -267,7 +270,7 @@ class CompletionResponseChunk: usage: Optional[UsageStats] = None -class OpenAiApiGenerator(Generator): +class OpenAiApiGeneratorMixin: """A wrapper over the Generator class to interface with the OpenAI API. Implements endpoints for completion requests, both chunked and non-chunked using the dataclasses @@ -486,6 +489,15 @@ def _callback(self, x, *, buffer, done_generating): pass +def create_openai_api_generator(distributed: bool) -> Type: + """ + Factory method to create an OpenAiApiGenerator + """ + + # Base class order matters to make sure OpenAiApiGeneratorMixin overrides methods in DistributedGenerator and Generator + return type('OpenAiApiGenerator', (OpenAiApiGeneratorMixin, DistributedGenerator if distributed else LocalGenerator), {}) + + """ Helper functions for the OpenAI API Models endpoint. diff --git a/torchchat/usages/server.py b/torchchat/usages/server.py index 1fb76953b..550539a88 100644 --- a/torchchat/usages/server.py +++ b/torchchat/usages/server.py @@ -4,38 +4,89 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
+import atexit import json import logging logger = logging.getLogger(__name__) +from contextlib import nullcontext from dataclasses import asdict +from functools import partial +from os import environ from typing import Dict, List, Union import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from concurrent import futures from flask import Flask, request, Response from torchchat.cli.builder import BuilderArgs, TokenizerArgs +from torchchat.distributed.utils import run_in_dist_env from torchchat.generate import GeneratorArgs from torchchat.usages.openai_api import ( CompletionRequest, get_model_info_list, - OpenAiApiGenerator, + create_openai_api_generator, retrieve_model_info, ) OPENAI_API_VERSION = "v1" +def run_worker( + args, + rank, + queue, + ): + """ + This function creates and executes a generator + """ + gen = initialize_generator(args) + + while True: + try: + req = queue.get() + except KeyboardInterrupt: + break + + if req == "stop": + break + + for _ in gen.chunked_completion(req): + pass + def create_app(args): # noqa: C901 """ Creates a flask app that can be used to serve the model as a chat API. """ app = Flask(__name__) - gen: OpenAiApiGenerator = initialize_generator(args) + builder_args = BuilderArgs.from_args(args) + procs = [] + queue = None + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + mp_context = mp.get_context('spawn') + queue = mp_context.Queue() + + for i in range(1, world_size): + fn = partial(run_worker, args, i, queue) + mp_context = mp.get_context('spawn') + procs.append(mp_context.Process(target=run_in_dist_env, args=(world_size, i, fn))) + procs[-1].start() + + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(0) + environ["LOCALRANK"] = str(0) + + gen = initialize_generator(args) def _del_none(d: Union[Dict, List]) -> Union[Dict, List]: """Recursively delete None values from a dictionary.""" @@ -69,6 +120,10 @@ def chat_endpoint(): if req.stream: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + def chunk_processor(chunked_completion_generator): """Inline function for postprocessing CompletionResponseChunk objects. 
@@ -86,8 +141,11 @@ def chunk_processor(chunked_completion_generator): ) return resp else: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + response = gen.sync_completion(req) - print(response.choices[0].message.content) return json.dumps(_del_none(asdict(response))) @@ -102,16 +160,18 @@ def models_retrieve_endpoint(model_id): else: return "Model not found", 404 - return app + return app, (procs, queue) -def initialize_generator(args) -> OpenAiApiGenerator: +def initialize_generator(args) -> type: builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) generator_args = GeneratorArgs.from_args(args) generator_args.chat_mode = False + OpenAiApiGenerator = create_openai_api_generator(builder_args.distributed) + return OpenAiApiGenerator( builder_args=builder_args, speculative_builder_args=speculative_builder_args, @@ -124,5 +184,19 @@ def initialize_generator(args) -> OpenAiApiGenerator: def main(args): - app = create_app(args) + app, (procs, queue) = create_app(args) + + def shutdown_worker(): + while not queue.empty(): + queue.get(block=False) + for p in procs: + queue.put("stop") + for p in procs: + p.join(timeout=0.5) + for p in procs: + if p.is_alive(): + p.kill() + + atexit.register(shutdown_worker) + app.run() diff --git a/torchchat/utils/build_utils.py b/torchchat/utils/build_utils.py index 2685ec2f3..a0862ff94 100644 --- a/torchchat/utils/build_utils.py +++ b/torchchat/utils/build_utils.py @@ -231,6 +231,8 @@ def find_multiple(n: int, k: int) -> int: def device_sync(device="cpu"): if "cuda" in device: torch.cuda.synchronize(device) + elif "xpu" in device: + torch.xpu.synchronize(device) elif ("cpu" in device) or ("mps" in device): pass else: @@ -279,7 +281,8 @@ def get_device_str(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return device else: @@ -291,7 +294,8 @@ def get_device(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return torch.device(device) diff --git a/torchchat/utils/device_info.py b/torchchat/utils/device_info.py index 9c5953944..950c03002 100644 --- a/torchchat/utils/device_info.py +++ b/torchchat/utils/device_info.py @@ -14,7 +14,7 @@ def get_device_info(device: str) -> str: """Returns a human-readable description of the hardware based on a torch.device.type Args: - device: A torch.device.type string: one of {"cpu", "cuda"}. + device: A torch.device.type string: one of {"cpu", "cuda", "xpu"}. Returns: str: A human-readable description of the hardware or an empty string if the device type is unhandled. 
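Several of the hunks above extend torchchat's device handling to Intel GPUs: `device_sync()` gains an `xpu` branch, and the default device resolution now tries `cuda`, then `mps`, then `xpu`, before falling back to `cpu`. The condensed sketch below mirrors that selection order; `pick_device`, `sync`, and `has_mps` are illustrative names written for this note, not torchchat APIs, and the `hasattr` guard is an extra precaution for older PyTorch builds.

```python
# Sketch of the accelerator auto-selection and sync order implied by the
# build_utils.py changes above (cuda -> mps -> xpu -> cpu).
import torch


def has_mps() -> bool:
    # Stand-in for torchchat's is_mps_available() helper.
    return torch.backends.mps.is_available()


def pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if has_mps():
        return "mps"
    # XPU (Intel GPU) is the branch newly added in this patch.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"


def sync(device: str) -> None:
    """Synchronize the chosen accelerator; cpu and mps are no-ops, matching device_sync()."""
    if "cuda" in device:
        torch.cuda.synchronize(device)
    elif "xpu" in device:
        torch.xpu.synchronize(device)
    # cpu and mps: nothing to do


if __name__ == "__main__":
    dev = pick_device()
    print("fast device:", dev)
    sync(dev)
```

Keeping `cpu` and `mps` as no-ops in `sync` matches `device_sync()`, which only synchronizes the asynchronous cuda and xpu backends.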
@@ -37,4 +37,13 @@ def get_device_info(device: str) -> str: ) if device == "cuda": return torch.cuda.get_device_name(0) + if device == "xpu": + return ( + check_output( + ["xpu-smi discovery |grep 'Device Name:'"], shell=True + ) + .decode("utf-8") + .split("\n")[0] + .split("Device Name:")[1] + ) return "" diff --git a/torchchat/utils/docs/evaluation.md b/torchchat/utils/docs/evaluation.md index 490500223..77414eeb4 100644 --- a/torchchat/utils/docs/evaluation.md +++ b/torchchat/utils/docs/evaluation.md @@ -4,8 +4,13 @@ # Evaluation Features + Torchchat provides evaluation functionality for your language model on a variety of tasks using the @@ -14,26 +19,65 @@ library. ## Usage -The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, it will default to evaluating on "wikitext". +The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext". + +## Examples -**Examples** +### Evaluation example with model in Python environment Running wikitext for 10 iterations ``` python3 torchchat.py eval stories15M --tasks wikitext --limit 10 ``` -Running an exported model +Running wikitext with torch.compile for 10 iterations +``` +python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10 +``` + +Running multiple tasks with torch.compile for evaluation and prefill: +``` +python3 torchchat.py eval stories15M --compile --compile-prefill --tasks wikitext hellaswag +``` + +### Evaluation with model exported to PTE with ExecuTorch + +Running an exported model with ExecuTorch (as PTE). Advantageously, because you can +load an exported PTE model back into the Python environment with torchchat, +you can run evaluation on the exported model! ``` python3 torchchat.py export stories15M --output-pte-path stories15M.pte python3 torchchat.py eval stories15M --pte-path stories15M.pte ``` -Running multiple tasks and calling eval.py directly: +Running multiple tasks directly on the created PTE mobile model: +``` +python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag +``` + +Now let's evaluate the effect of quantization on evaluation results by exporting with quantization using `--quantize` and an exemplary quantization configuration: ``` +python3 torchchat.py export stories15M --output-pte-path stories15M.pte --quantize torchchat/quant_config/mobile.json python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag ``` +Now try your own export options to explore different trade-offs between model size, evaluation speed and accuracy using model quantization! + +### Evaluation with model exported to DSO with AOT Inductor (AOTI) + +Running an exported model with AOT Inductor (DSO model). Advantageously, because you can +load an exported DSO model back into the Python environment with torchchat, +you can run evaluation on the exported model! 
+``` +python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so +python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so +``` + +Running multiple tasks with AOTI: +``` +python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag +``` + For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). [end default]: end diff --git a/torchchat/utils/gguf_loader.py b/torchchat/utils/gguf_loader.py index 309ff807c..9e7b73b50 100644 --- a/torchchat/utils/gguf_loader.py +++ b/torchchat/utils/gguf_loader.py @@ -24,6 +24,8 @@ pack_scales_and_zeros, ) +from torchao.dtypes.utils import is_device + logger: logging.Logger = logging.getLogger(__name__) @@ -122,12 +124,13 @@ def linear_int4(input, weight_int4pack, scales_and_zeros, out_features, groupsiz input.dtype ) # cast back to input.dtype else: - c = torch.ops.aten._weight_int4pack_mm( + c = torch.ops.aten._weight_int4pack_mm_for_cpu( input, weight_int4pack, groupsize, scales_and_zeros, ) + new_shape = origin_input_size[:-1] + (out_features,) c = c.reshape(new_shape) return c @@ -178,16 +181,27 @@ def __init__( ), "must specify both weights and scales_and_zeros, or neither" if weight is None: - weight = torch.empty( - ( - out_features // 8, - in_features // (inner_k_tiles * 16), - 32, - inner_k_tiles // 2, - ), - dtype=torch.int32, - device=device, - ) + if is_device(device, "cpu"): + weight = torch.empty( + ( + out_features, + in_features // 2, + ), + dtype=torch.uint8, + device=device, + ) + else: + weight = torch.empty( + ( + out_features // 8, + in_features // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), + dtype=torch.int32, + device=device, + ) + scales_and_zeros = torch.empty( (in_features // groupsize, out_features, 2), dtype=get_precision(), @@ -223,12 +237,17 @@ def _prepare_weight_and_scales_and_zeros( weight_int32, scales_and_zeros = group_quantize_tensor( weight_bf16, n_bit=4, groupsize=groupsize ) - weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( - torch.uint8 - ) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - weight_uint8, inner_k_tiles - ) + if is_device(weight_int32.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + weight_int32, inner_k_tiles + ) + else: + weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( + torch.uint8 + ) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + weight_uint8, inner_k_tiles + ) return weight_int4pack, scales_and_zeros @classmethod @@ -570,6 +589,7 @@ def load_model_and_state_dict( load_state_dict: bool = True, load_as_quantized: bool = True, inner_k_tiles=8, + device="cpu", ) -> torch.nn.Module: """ Parses the GGUF file and returns an nn.Module on meta device along with a state_dict @@ -608,10 +628,15 @@ def load_model_and_state_dict( if load_state_dict: q, s, z = Q4_0.unpack(t) scales_and_zeros = pack_scales_and_zeros(s, z) - q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - q_uint8, inner_k_tiles - ) + if is_device(q.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + q, inner_k_tiles + ) + else: + q_tmp = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + q_tmp, inner_k_tiles + ) state_dict[f"{fqn}.weight"] = weight_int4pack 
state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros @@ -623,7 +648,7 @@ def load_model_and_state_dict( in_features=in_features, out_features=out_features, bias=False, - device="meta", + device="cpu", groupsize=Q4_0.groupsize, inner_k_tiles=inner_k_tiles, ), diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 31c639dfd..933bc1b9e 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -26,7 +26,7 @@ # from functools import reduce # from math import gcd -from typing import Dict, Optional, Callable, Any, List +from typing import Any, Callable, Dict, List, Optional import torch import torch.nn as nn @@ -37,6 +37,7 @@ from torchao.quantization.quant_api import ( int4_weight_only, Int4WeightOnlyQuantizer, + int8_weight_only, Int8DynActInt4WeightQuantizer, quantize_, ) @@ -45,8 +46,8 @@ find_multiple, get_device_str, get_precision, - set_precision, name_to_dtype, + set_precision, state_dict_device, use_et_backend, ) @@ -60,28 +61,36 @@ import inspect + def get_named_parameters(func: Callable) -> List[str]: # Get the signature of the function signature = inspect.signature(func) - + # Extract the parameters from the signature parameters = signature.parameters - + # Filter and return named parameters named_params = [ - name for name, param in parameters.items() - if param.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) + name + for name, param in parameters.items() + if param.kind + in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) ] return named_params -def validate_args(named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None) -> Dict[str, Any]: + +def validate_args( + named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None +) -> Dict[str, Any]: for key in q_kwargs.keys(): if key not in named_params: - print(f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring.") + print( + f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring." + ) del q_kwargs[key] return q_kwargs - - + + ######################################################################### ### torchchat quantization API ### @@ -110,21 +119,43 @@ def quantize_model( if quantizer not in quantizer_class_dict: raise RuntimeError(f"unknown quantizer {quantizer} specified") else: + ao_quant = True # Use tensor subclass API for int4 weight only. - if device == "cuda" and quantizer == "linear:int4": + if (device == "cuda" or device == "xpu") and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) + elif quantizer == "linear:int8": + print("quantizer is linear int8") + + # TODO: float16 quant via the AO quantize_() API seems broken. Remove this once the issue is resolved https://github.com/pytorch/ao/issues/1662 + if get_precision() == torch.float16: + print( + "model is float16 dtype - fallback to native implementation (see https://github.com/pytorch/ao/issues/1662)" + ) + ao_quant = False + else: + quantize_(model, int8_weight_only()) + else: + ao_quant = False + if ao_quant: if not support_tensor_subclass: unwrap_tensor_subclass(model) continue - + if quantizer in ["linear:a8wxdq", "embedding:wx"]: # These quantizers require float32 input weights. Note that after quantization, # the weights will no longer be float32, but lowbit integers if get_precision() != torch.float32: - print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. 
Note that after quantization, the weights will be lowbit integers, not float32.") + print( + f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32." + ) set_precision(torch.float32) - - # We set global precision from quantize options if it is specified at cli.py:485 + + if quantizer == "linear:afpwx" and device != "mps": + raise RuntimeError( + "linear:afpwx quantization can only run on mps device!" + ) + + # We set global precision from quantize options if it is specified at cli.py:485 # so the precision returned by get_precision() is always the authoritative precision/dtype in torchchat precision = get_precision() @@ -141,14 +172,19 @@ def quantize_model( model = quant_handler.quantize(model) - ######################################################################### ### QuantHandler API definition ### ### (unify with torchao in future) ### class QuantHandler: - def __init__(self, model: Optional[nn.Module] = None, device="cpu", precision=None, tokenizer=None): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -176,7 +212,15 @@ def quantize(self, model: nn.Module) -> nn.Module: class PrecisionHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, dtype): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + dtype, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -205,7 +249,15 @@ def quantized_model(self) -> nn.Module: class ExecutorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, accelerator): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + accelerator, + ): self.model_ = model if isinstance(accelerator, str): @@ -593,7 +645,7 @@ class WeightOnlyInt8QuantHandler(QuantHandler): def __init__( self, model: Optional[nn.Module] = None, - device = None, + device=None, precision=None, tokenizer=None, *, @@ -886,10 +938,10 @@ def quantized_model(self) -> nn.Module: # class references quantizer_class_dict = { "embedding": EmbeddingOnlyQuantHandler, - "linear:int8": WeightOnlyInt8QuantHandler, "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, + "linear:int8": WeightOnlyInt8QuantHandler, "linear:a8w4dq": Int8DynActInt4WeightQuantizer, } @@ -915,10 +967,12 @@ def quantized_model(self) -> nn.Module: from torchao_experimental_quant_api import ( Int8DynActIntxWeightLinearQuantizer, IntxWeightEmbeddingQuantizer, + UIntxWeightOnlyLinearQuantizer, ) quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer + quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op try: @@ -927,16 +981,19 @@ def quantized_model(self) -> nn.Module: libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*") libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) torch.ops.load_library(libs[0]) + print("Loaded torchao cpu ops.") + except Exception as e: + print( + "Unable to load torchao cpu ops library. Slow fallback kernels will be used." 
+ ) + + try: + libname = "libtorchao_ops_mps_aten.dylib" + libpath = f"{torchao_build_path}/cmake-out/lib/{libname}" + torch.ops.load_library(libpath) + print("Loaded torchao mps ops.") except Exception as e: - print("Failed to torchao ops library with error: ", e) - print("Slow fallback kernels will be used.") + print("Unable to load torchao mps ops library.") except Exception as e: - class ErrorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None): - global torchao_experimental_load_error - raise Exception(f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}") - - torchao_experimental_load_error = e - quantizer_class_dict["linear:a8wxdq"] = ErrorHandler - quantizer_class_dict["embedding:wx"] = ErrorHandler + print("Unable to import torchao experimental quant_api with error: ", e) diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 3c2c1c846..e2b8b4fc0 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -64,7 +64,7 @@ fi pushd ${TORCHCHAT_ROOT} -git submodule update --init +git submodule update --init --recursive git submodule sync if [[ "$TARGET" == "et" ]]; then if [ ! -d "${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install" ]; then @@ -93,7 +93,7 @@ popd if [[ "$TARGET" == "et" ]]; then cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja else - cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja + cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja fi cmake --build ./cmake-out --target "${TARGET}"_run diff --git a/torchchat/utils/scripts/build_torchao_ops.sh b/torchchat/utils/scripts/build_torchao_ops.sh index a8fd8bea2..46e2479ac 100644 --- a/torchchat/utils/scripts/build_torchao_ops.sh +++ b/torchchat/utils/scripts/build_torchao_ops.sh @@ -5,12 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +device=${1:-cpu} +if [[ "$device" != "cpu" && "$device" != "mps" ]]; then + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." 
>&2 + exit 1 +fi source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" pushd ${TORCHCHAT_ROOT} find_cmake_prefix_path clone_torchao -install_torchao_aten_ops +install_torchao_aten_ops "$device" popd diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 84966cc35..57dcc77bf 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -88,7 +88,7 @@ install_executorch_python_libs() { echo "Building and installing python libraries" if [ "${ENABLE_ET_PYBIND}" = false ]; then echo "Not installing pybind" - bash ./install_requirements.sh + bash ./install_requirements.sh --pybind off else echo "Installing pybind" bash ./install_requirements.sh --pybind xnnpack @@ -184,8 +184,18 @@ clone_torchao() { } install_torchao_aten_ops() { - echo "Building torchao custom ops for ATen" - pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + local device=${1:-cpu} + + if [[ "$device" == "cpu" ]]; then + echo "Building torchao custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + elif [[ "$device" == "mps" ]]; then + echo "Building torchao mps custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental/ops/mps + else + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." >&2 + return 1 + fi CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \ diff --git a/torchchat/utils/scripts/updown.py b/torchchat/utils/scripts/updown.py index 86ebf803f..306e5855b 100644 --- a/torchchat/utils/scripts/updown.py +++ b/torchchat/utils/scripts/updown.py @@ -267,6 +267,8 @@ def updown_processor( lines = file.readlines() print_flag = False + # Use bash; set it to fail on the first failing command + output("#! /bin/bash", replace_list=None, suppress_list=None) output("set -eou pipefail", replace_list=None, suppress_list=None) if create_sections:
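Tying the quantization pieces of this patch together: `quantize_model()` now routes some quantizers through torchao's tensor-subclass `quantize_()` API (int4 weight-only on cuda/xpu, int8 weight-only except for float16 models) and leaves the rest to torchchat's own handler classes. The sketch below condenses that dispatch decision; `apply_quantizer` is an illustrative helper written for this note, not a function in the patch.

```python
# Condensed view of the torchao-vs-handler dispatch in quantize_model().
import torch
import torch.nn as nn
from torchao.quantization.quant_api import (
    int4_weight_only,
    int8_weight_only,
    quantize_,
)


def apply_quantizer(
    model: nn.Module,
    quantizer: str,
    q_kwargs: dict,
    device: str,
    precision: torch.dtype,
) -> bool:
    """Return True if torchao's quantize_() handled the request, False if the
    caller should fall back to a torchchat QuantHandler class."""
    if device in ("cuda", "xpu") and quantizer == "linear:int4":
        # Tensor-subclass int4 weight-only quantization, as in the patch.
        quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
        return True
    if quantizer == "linear:int8":
        if precision == torch.float16:
            # float16 with int8_weight_only is currently broken in torchao
            # (pytorch/ao#1662), so the patch falls back to the native
            # WeightOnlyInt8QuantHandler in this case.
            return False
        quantize_(model, int8_weight_only())
        return True
    # Everything else (a8w4dq, embedding, precision, executor, ...) stays on
    # torchchat's own quantizer_class_dict handlers.
    return False
```

For example, the new `torchchat/quant_config/cuda-32.json` added above would reach this path with `quantizer="linear:int4"` and `q_kwargs={"groupsize": 32}`.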