diff --git a/.ci/scripts/check_gibberish b/.ci/scripts/check_gibberish index 5d9783b3b..912020a5a 100755 --- a/.ci/scripts/check_gibberish +++ b/.ci/scripts/check_gibberish @@ -24,6 +24,18 @@ else fi fi +####################################################################### +# +# check whether aspell spell check evailable + +if command -v aspell &> /dev/null; then + echo "Checking $TMPFILE for gibberish" +else + echo "Aspell is not installed or not in PATH." + echo "Gibberish unchecked in $TMPFILE" + exit 0 +fi + ####################################################################### # # run spell check on the extracted sequence diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index a09944ad5..71f074cef 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -1,93 +1,67 @@ -# /bin/bash -x +#!/bin/bash -x -if [ "X$1" == "X" ]; then +# Check if an argument was provided +if [ -z "$1" ]; then echo "Must specify document to run" exit 1 fi -if [ "$1" == "readme" ]; then - echo "::group::Create script to run README" - python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-readme.sh - echo "::endgroup::" - - echo "::group::Run README" - echo "*******************************************" - cat ./run-readme.sh - echo "*******************************************" - bash -x ./run-readme.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "quantization" ]; then - echo "::group::Create script to run quantization" - python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-quantization.sh - echo "::endgroup::" - - echo "::group::Run quantization" - echo "*******************************************" - cat ./run-quantization.sh - echo "*******************************************" - bash -x ./run-quantization.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "gguf" ]; then - echo "::group::Create script to run gguf" - python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-gguf.sh - echo "::endgroup::" - - echo "::group::Run gguf" - echo "*******************************************" - cat ./run-gguf.sh - echo "*******************************************" - bash -x ./run-gguf.sh - echo "::endgroup::" -fi - - -if [ "$1" == "advanced" ]; then - echo "::group::Create script to run advanced" - python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-advanced.sh - echo "::endgroup::" - - echo "::group::Run advanced" - echo "*******************************************" - cat ./run-advanced.sh - echo "*******************************************" - bash -x ./run-advanced.sh - echo "::endgroup::" -fi - 
-if [ "$1" == "evaluation" ]; then - - exit 0 - - echo "::group::Create script to run evaluation" - python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-evaluation.sh - echo "::endgroup::" - - echo "::group::Run evaluation" - echo "*******************************************" - cat ./run-evaluation.sh - echo "*******************************************" - bash -x ./run-evaluation.sh -fi +# Pre-initialize variables +filepath="" +parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" +script_name="./run-${1}.sh" # Dynamically initialize script name + +# Use a case statement to handle the $1 argument +case "$1" in + "readme") + filepath="README.md" + ;; + "quantization") + filepath="docs/quantization.md" + ;; + "gguf") + filepath="docs/GGUF.md" + ;; + "advanced") + filepath="docs/ADVANCED-USERS.md" + ;; + "evaluation") + filepath="torchchat/utils/docs/evaluation.md" + ;; + "multimodal") + filepath="docs/multimodal.md" + parameters="" # Clear parameters + ;; + "native") + filepath="docs/native-execution.md" + parameters="" # Clear parameters + ;; + "distributed") + filepath="docs/distributed.md" + parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication + ;; + "local") + filepath="docs/local-model.md" + parameters="" # Clear parameters + ;; + + *) + echo "Unknown option: $1" + exit 1 + ;; +esac + +# Generate the script +echo "::group::Create script to run $1" +python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name" +# if something happened to updown processor, and it did not error out, fail with an exit 1 +echo "exit 1" >> "$script_name" +echo "::endgroup::" + +# Run the script +echo "::group::Run $1" +echo "*******************************************" +cat "$script_name" +echo "*******************************************" +bash -x "$script_name" +echo "::endgroup::" diff --git a/.github/workflows/more-tests.yml b/.github/workflows/more-tests.yml index 1e0652c96..f772382d1 100644 --- a/.github/workflows/more-tests.yml +++ b/.github/workflows/more-tests.yml @@ -9,23 +9,20 @@ on: jobs: test-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements ./install/install_requirements.sh cuda diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index a9561e3e8..2e264e6cf 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -108,7 +108,10 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu" test-gpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + 
contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu secrets: inherit @@ -119,7 +122,7 @@ jobs: secrets-env: "HF_TOKEN_PERIODIC" runner: ${{ matrix.runner }} gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ee7270a5d..5dbafee9f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -215,7 +215,10 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu" test-gpu-compile: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -224,7 +227,7 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi @@ -250,7 +253,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-bfloat16: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -259,18 +265,13 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -291,7 +292,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-float32: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -300,17 +304,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -337,7 +336,10 @@ jobs: echo "::endgroup::" test-gpu-aoti-float16: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -346,17 +348,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo 
"::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -384,7 +381,10 @@ jobs: echo "::endgroup::" test-gpu-eval-sanity-check: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -393,17 +393,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -731,6 +726,7 @@ jobs: git clone https://github.com/ggerganov/llama.cpp.git pushd llama.cpp + git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3 make popd @@ -941,7 +937,7 @@ jobs: path: | ./et-build ./torchchat/utils/scripts - key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh', '**/build_native.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: | @@ -1030,7 +1026,10 @@ jobs: echo "Tests complete." test-build-runner-et-android: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.4xlarge script: | @@ -1052,7 +1051,7 @@ jobs: # Pull submodules (re2, abseil) for Tiktoken git submodule sync - git submodule update --init + git submodule update --init --recursive ./runner/build_android.sh echo "Tests complete." @@ -1123,3 +1122,41 @@ jobs: echo "Generate AOTI" python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" echo "Tests complete." 
+ + test-torchao-experimental-mps: + strategy: + matrix: + runner: [macos-m1-stable] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Intalling pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Install torchao-ops-mps + id: install-torchao-ops-mps + run: | + bash torchchat/utils/scripts/build_torchao_ops.sh mps + - name: Run inference + run: | + python torchchat.py download stories110M + export PRMT="Once upon a time in a land far away" + echo "Generate eager" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 3, "groupsize": 32}}' diff --git a/.github/workflows/run-readme-periodic.yml b/.github/workflows/run-readme-periodic.yml index 6a933b5f1..2c49a975f 100644 --- a/.github/workflows/run-readme-periodic.yml +++ b/.github/workflows/run-readme-periodic.yml @@ -10,24 +10,22 @@ on: jobs: test-readme: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run README" python3 torchchat/utils/scripts/updown.py --create-sections --file README.md > ./run-readme.sh # for good measure, if something happened to updown processor, @@ -44,23 +42,21 @@ jobs: test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets: inherit gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run quantization" python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md > ./run-quantization.sh # for good measure, if something happened to updown processor, @@ -76,24 +72,22 @@ jobs: echo "::endgroup::" test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" 
uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run gguf" python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md > ./run-gguf.sh # for good measure, if something happened to updown processor, diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml new file mode 100644 index 000000000..1f22c4f2e --- /dev/null +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -0,0 +1,114 @@ +name: Run the README instructions - with stories - on Linux aarch64 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +jobs: + test-readme-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-quantization-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization + + test-gguf-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-advanced-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + with: + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr-macos.yml 
b/.github/workflows/run-readme-pr-macos.yml index 64afe2247..ce84d3b50 100644 --- a/.github/workflows/run-readme-pr-macos.yml +++ b/.github/workflows/run-readme-pr-macos.yml @@ -33,7 +33,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs readme + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" echo "tests complete" @@ -68,7 +69,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs quantization + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization echo "::group::Completion" echo "tests complete" @@ -103,7 +105,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs gguf + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" echo "tests complete" @@ -137,9 +140,113 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs advanced + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-eval-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as 
system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 718d5cf9e..db16bc80e 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,12 +10,13 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout: 60 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos set -x - # NS: Remove previous installation of torch first - # as this script does not isntall anything into conda env but rather as system dep + # NS: Remove previous installation of torch first + # as this script does not install anything into conda env but rather as system dep pip3 uninstall -y torch || true set -eou pipefail @@ -35,7 +36,8 @@ jobs: test-quantization-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 + runner: macos-m1-14 + timeout: 60 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -62,7 +64,7 @@ jobs: test-gguf-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -89,7 +91,7 @@ jobs: test-advanced-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -112,3 +114,84 @@ jobs: echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-evaluation-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-evaluation-mps-macos python=3.10.11 + conda activate test-evaluation-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-multimodal-mps-macos python=3.10.11 + conda activate test-multimodal-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + 
.ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-native-mps-macos python=3.10.11 + conda activate test-native-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index cd6a95681..37c27822b 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -9,22 +9,20 @@ on: jobs: test-readme-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs readme echo "::group::Completion" @@ -33,22 +31,20 @@ jobs: echo "::endgroup::" test-readme-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" @@ -57,22 +53,20 @@ jobs: echo "::endgroup::" test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs quantization echo "::group::Completion" @@ -81,41 +75,37 @@ jobs: echo "::endgroup::" test-quantization-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda 
- gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs gguf echo "::group::Completion" @@ -124,22 +114,20 @@ jobs: echo "::endgroup::" test-gguf-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" @@ -149,22 +137,20 @@ jobs: test-advanced-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs advanced echo "::group::Completion" @@ -174,22 +160,20 @@ jobs: test-advanced-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" @@ -198,6 +182,89 @@ jobs: echo "::endgroup::" test-evaluation-any: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs 
evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-any: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-cpu: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal + + test-native-any: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -214,14 +281,14 @@ jobs: export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH echo "::endgroup::" - .ci/scripts/run-docs evaluation + .ci/scripts/run-docs native echo "::group::Completion" echo "tests complete" echo "*******************************************" echo "::endgroup::" - test-evaluation-cpu: + test-native-cpu: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -238,9 +305,26 @@ jobs: export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native + + test-distributed-cuda: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs distributed echo "::group::Completion" echo "tests complete" echo "*******************************************" - echo "::endgroup::" \ No newline at end of file + echo "::endgroup::" diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml index b83b9904b..0b4597942 100644 --- a/.github/workflows/runner-cuda-dtype.yml +++ b/.github/workflows/runner-cuda-dtype.yml @@ -9,24 +9,21 @@ on: jobs: test-runner-aot-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo 
"::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements @@ -58,7 +55,7 @@ jobs: python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2 - ./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" + ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" done diff --git a/.gitignore b/.gitignore index 74d0a28fa..61ab1ee4d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,10 @@ runner-et/cmake-out/* runner-aoti/cmake-out/* cmake-out/ +# Example project Android Studio ignore +torchchat/edge/android/torchchat/.idea/* + + # pte files *.pte diff --git a/.gitmodules b/.gitmodules index 7681823df..76bc1b9fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ -[submodule "tokenizer/third-party/abseil-cpp"] - path = tokenizer/third-party/abseil-cpp - url = https://github.com/abseil/abseil-cpp.git -[submodule "tokenizer/third-party/re2"] - path = tokenizer/third-party/re2 - url = https://github.com/google/re2.git -[submodule "tokenizer/third-party/sentencepiece"] - path = tokenizer/third-party/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "runner/third-party/tokenizers"] + path = runner/third-party/tokenizers + url = https://github.com/pytorch-labs/tokenizers diff --git a/CMakeLists.txt b/CMakeLists.txt index 61fd4d5a6..e004dbfcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,18 +7,21 @@ ELSE() ENDIF() project(Torchchat) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") # include tokenizer -add_subdirectory(tokenizer) +add_subdirectory(runner/third-party/tokenizers) # include et_run executable include(runner/et.cmake) if(TARGET et_run) - target_link_libraries(et_run PUBLIC tokenizer microkernels-prod) + target_link_libraries(et_run PUBLIC tokenizers microkernels-prod) + target_include_directories(et_run PUBLIC runner/third-party/tokenizers/include) endif() # include aoti_run executable include(runner/aoti.cmake) if(TARGET aoti_run) - target_link_libraries(aoti_run tokenizer) + target_link_libraries(aoti_run tokenizers) + target_include_directories(aoti_run PUBLIC runner/third-party/tokenizers/include) endif() diff --git a/README.md b/README.md index 4b910e575..51db1bfca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android. > [!IMPORTANT] -> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!! +> Update +> +> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**]( https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)! +> +> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**! > > To try it out, finish the [Installation](#Installation) section below, then hop > over to our [multimodal guide](docs/multimodal.md) to learn more. @@ -45,16 +49,16 @@ aliases. | Model | Mobile Friendly | Notes | |------------------|---|---------------------| -|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat` . 
Alias to `llama3.2-3b`.| +|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-3b`.| |[meta-llama/Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|✅|Best for `generate`. Alias to `llama3.2-3b-base`.| -|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification . Alias to `llama3-1b-guard`.| -|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.2-1b`.| +|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification. Alias to `llama3-1b-guard`.| +|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-1b`.| |[meta-llama/Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|✅|Best for `generate`. Alias to `llama3.2-1b-base`.| -|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat` . Alias to `llama3.2-11B`.| -|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate` . Alias to `llama3.2-11B-base`.| -|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.1`.| +|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat`. Alias to `llama3.2-11B`.| +|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate`. Alias to `llama3.2-11B-base`.| +|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.1`.| |[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)|✅|Best for `generate`. Alias to `llama3.1-base`.| -|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3`.| +|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat`. Alias to `llama3`.| |[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|✅|Best for `generate`. Alias to `llama3-base`.| |[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|✅|Tuned for `chat`. Alias to `llama2`.| |[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)||Tuned for `chat`. Alias to `llama2-13b-chat`.| @@ -69,6 +73,14 @@ aliases. |[tinyllamas/stories42M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories42M`.| |[tinyllamas/stories110M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories110M`.| |[openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b)|✅|Best for `generate`. 
Alias to `open-llama`.| +| [ibm-granite/granite-3b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) |✅| Alias to `granite-code` and `granite-code-3b`.| +| [ibm-granite/granite-8b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-8b-code-instruct-128k) |✅| Alias to `granite-code-8b`.| +| [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct) |✅| Alias to `granite3-2b` and `granite3`.| +| [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.| +| [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.| +| [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.| +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.| + ## Installation The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed. @@ -231,6 +243,8 @@ python3 torchchat.py server llama3.1 ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.1 & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests + In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. > [!NOTE] @@ -244,8 +258,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream **Example Input + Output** -[skip default]: begin - ``` curl http://127.0.0.1:5000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -265,12 +277,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \ ] }' ``` +[skip default]: begin ``` {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} ``` [skip default]: end +[shell default]: kill ${server_pid} @@ -332,7 +346,7 @@ torchchat/utils/scripts/build_native.sh aoti Then run the compiled executable, with the pt2. 
```bash -cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` ## Mobile Execution @@ -404,7 +418,7 @@ torchchat/utils/scripts/build_native.sh et Execute using the runner ```bash -cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` @@ -664,6 +678,6 @@ awesome libraries and tools you've built around local LLM inference. torchchat is released under the [BSD 3 license](LICENSE). (Additional code in this distribution is covered by the MIT and Apache Open Source -licenses.) However you may have other legal obligations that govern +licenses.) However, you may have other legal obligations that govern your use of content, such as the terms of service for third-party models. diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index 417a823f8..17958e790 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -1,27 +1,25 @@ > [!WARNING] > Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation. -# Torchchat is still in pre-release! - - -Torchchat is currently in a pre-release state and under extensive development. - # The Lost Manual: torchchat [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Download**](#download) | [**Chat**](#chat) | [**Generate**](#generate) | [**Eval**](#eval) | [**Export**](#export) | [**Supported Systems**](#supported-systems) | [**Contributing**](#contributing) | [**License**](#license) + -This is the advanced users guide, if you're looking to get started +This is the advanced users' guide, if you're looking to get started with LLMs, please refer to the README at the root directory of the torchchat distro. This is an advanced user guide, so we will have -many more concepts and options to discuss and taking advantage of them +many more concepts and options to discuss and take advantage of them may take some effort. We welcome community contributions of all kinds. If you find @@ -41,7 +39,7 @@ While we strive to support a broad range of models, we can't test them all. We classify supported models as tested ✅, work in progress 🚧 or some restrictions ❹. -We invite community contributions of new model suport and test results! +We invite community contributions of new model support and test results! | Model | Tested | Eager | torch.compile | AOT Inductor | ExecuTorch | Fits on Mobile | |-----|--------|-------|-----|-----|-----|-----| @@ -86,7 +84,7 @@ Server C++ runtime | n/a | run.cpp model.pte | ✅ | Mobile C++ runtime | n/a | app model.pte | ✅ | Mobile C++ runtime | n/a | app + AOTI | 🚧 | -**Getting help:** Each command implements the --help option to give addititonal information about available options: +**Getting help:** Each command implements the --help option to give additional information about available options: [skip default]: begin ``` @@ -96,8 +94,8 @@ python3 torchchat.py [ export | generate | chat | eval | ... 
 ] --help
 ```
 Exported models can be loaded back into torchchat for chat or text generation, letting you experiment with the exported model and valid
-model quality. The python interface is the same in all cases and is
-used for testing nad test harnesses too.
+model quality. The Python interface is the same in all cases and is
+used for testing and test harnesses, too.
 Torchchat comes with server C++ runtimes to execute AOT Inductor and ExecuTorch models. A mobile C++ runtimes allow you to deploy
@@ -115,7 +113,7 @@ Some common models are recognized by torchchat based on their filename through `Model.from_name()` to perform a fuzzy match against a table of known model architectures. Alternatively, you can specify the index into that table with the option `--params-table ${INDEX}` where
-the index is the lookup key key in the [the list of known
+the index is the lookup key in the [the list of known
 pconfigurations](https://github.com/pytorch/torchchat/tree/main/torchchat/model_params) For example, for the stories15M model, this would be expressed as `--params-table stories15M`. (We use the model constructor
@@ -237,7 +235,7 @@ which chooses the best 16-bit floating point type. The virtual device fast and virtual floating point data types fast and fast16 are best used for eager/torch.compiled execution. For export,
-specify the your device choice for the target system with --device for
+specify your device choice for the target system with --device for
 AOTI-exported DSO models, and using ExecuTorch delegate selection for ExecuTorch-exported PTE models.
@@ -250,8 +248,9 @@ python3 torchchat.py generate [--compile] --checkpoint-path ${MODEL_PATH} --prom To improve performance, you can compile the model with `--compile` trading off the time to first token processed with time per token. To improve performance further, you may also compile the prefill with
-`--compile_prefill`. This will increase further compilation times though. The
-`--compile-prefill` option is not compatible with `--prefill-prefill`.
+`--compile-prefill`. This will further increase compilation times, though.
+For CPU, you can use `--max-autotune` to further improve the performance
+with `--compile` and `--compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html).
 Parallel prefill is not yet supported by exported models, and may be supported in a future release.
@@ -265,7 +264,7 @@ the introductory README. In addition to running eval on models in eager mode and JIT-compiled mode with `torch.compile()`, you can also load dso and pte models back into the PyTorch to evaluate the accuracy of exported model objects
-(e.g., after applying quantization or other traqnsformations to
+(e.g., after applying quantization or other transformations to
 improve speed or reduce model size). Loading exported models back into a Python-based Pytorch allows you to
@@ -297,14 +296,14 @@ for ExecuTorch.)
We export the stories15M model with the following command for execution with the ExecuTorch runtime (and enabling execution on a -wide range of community and vendor supported backends): +wide range of community and vendor-supported backends): ``` python3 torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_NAME}.pte ``` Alternatively, we may generate a native instruction stream binary -using AOT Inductor for CPU oor GPUs (the latter using Triton for +using AOT Inductor for CPU or GPUs (the latter using Triton for optimizations such as operator fusion): ``` @@ -319,10 +318,10 @@ the exported model artifact back into a model container with a compatible API surface for the `model.forward()` function. This enables users to test, evaluate and exercise the exported model artifact with familiar interfaces, and in conjunction with -pre-exiisting Python model unit tests and common environments such as +pre-existing Python model unit tests and common environments such as Jupyter notebooks and/or Google colab. -Here is how to load an exported model into the python environment on the example of using an exported model with `generate.oy`. +Here is how to load an exported model into the Python environment using an exported model with the `generate` command. ``` python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --pte-path ${MODEL_NAME}.pte --device cpu --prompt "Once upon a time" @@ -452,7 +451,7 @@ strategies: You can find instructions for quantizing models in [docs/quantization.md](file:///./quantization.md). Advantageously, quantization is available in eager mode as well as during export, -enabling you to do an early exploration of your quantization setttings +enabling you to do an early exploration of your quantization settings in eager mode. However, final accuracy should always be confirmed on the actual execution target, since all targets have different build processes, compilers, and kernel implementations with potentially @@ -464,9 +463,8 @@ significant impact on accuracy. ## Native (Stand-Alone) Execution of Exported Models -Refer to the [README](README.md] for an introduction toNative -execution on servers, desktops and laptops is described under -[runner-build.md]. Mobile and Edge executipon for Android and iOS are +Refer to the [README](README.md) for an introduction to native +execution on servers, desktops, and laptops. Mobile and Edge execution for Android and iOS are described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively. @@ -475,13 +473,13 @@ described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md PyTorch and ExecuTorch support a broad range of devices for running PyTorch with python (using either eager or eager + `torch.compile`) or -in a python-free environment with AOT Inductor and ExecuTorch. +in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | |-----|------|-----|-----|-----|-----| | x86 | Linux | ✅ | ✅ | ✅ | ✅ | -| aarch64 | Linux | n/t | n/t | n/t | n/t | +| aarch64 | Linux | ✅ | ✅ | ✅ | n/t | | aarch64 | macOS | ✅ | ✅ | ✅ | ✅ | | AMD GPU | Linux | ✅ | ✅ | ✅ | ❌| | Nvidia GPU | Linux | ✅ | ✅ | ✅ | ❌| @@ -492,65 +490,13 @@ in a python-free environment with AOT Inductor and ExecuTorch. 
| Mobile GPU (Vulkan) | Android | ❌|❌|❌| ✅ | | CoreML | iOS | ❌|❌|❌| ✅ | | Hexagon DSP | Android | ❌|❌|❌| ✅ | -| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t | ✅ | +| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ | ✅ | | Raspberry Pi 4/5 | Android | ❌ | ❌ | ❌ | n/t | | ARM 32b (up to v7) | any | ❌|❌|❌|❌| *Key*: n/t -- not tested -## Runtime performance with Llama 7B, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - -## Runtime performance with Llama3, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - - - -# CONTRIBUTING to torchchat - -We welcome any feature requests, bug reports, or pull requests from -the community. See the [CONTRIBUTING](CONTRIBUTING.md) for -instructions how to contribute to torchchat. - - - # LICENSE Torchchat is released under the [BSD 3 license](./LICENSE). However diff --git a/docs/distributed.md b/docs/distributed.md new file mode 100644 index 000000000..3d34d7672 --- /dev/null +++ b/docs/distributed.md @@ -0,0 +1,125 @@ +# Distributed Inference with torchchat + +torchchat supports distributed inference for large language models (LLMs) on GPUs seamlessly. +At present, torchchat supports distributed inference using Python only. + +## Installation +The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed. + +> [!TIP] +> torchchat uses the latest changes from various PyTorch projects so it's highly recommended that you use a venv (by using the commands below) or CONDA. + +[skip default]: begin +```bash +git clone https://github.com/pytorch/torchchat.git +cd torchchat +python3 -m venv .venv +source .venv/bin/activate +./install/install_requirements.sh +``` +[skip default]: end + +[shell default]: ./install/install_requirements.sh + +## Login to HF for Downloading Weights +Most models use Hugging Face as the distribution channel, so you will need to create a Hugging Face account. Create a Hugging Face user access token as documented here with the write role. 
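+
+If you are working in a non-interactive environment (for example CI), the token can also be
+supplied directly on the command line instead of via the interactive prompt. This is only a
+hedged sketch; `HF_TOKEN` is assumed to be whatever environment variable holds your access token:
+
+[skip default]: begin
+```
+huggingface-cli login --token "$HF_TOKEN"
+```
+[skip default]: end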
+
+Log into Hugging Face:
+
+[prefix default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}"
+
+```
+huggingface-cli login
+```
+
+## Enabling Distributed torchchat Inference
+
+To enable distributed inference, use the option `--distributed`. In addition, the `--tp` and `--pp` options
+allow users to specify the degree of each type of parallelism, where tp refers to tensor parallelism and pp to pipeline parallelism.
+
+
+## Generate Output with Distributed torchchat Inference
+
+To generate output using distributed inference with 4 GPUs, you can use:
+```
+python3 torchchat.py generate llama3.1 --distributed --tp 2 --pp 2 --prompt "write me a story about a boy and his bear"
+```
+
+
+## Chat with Distributed torchchat Inference
+
+This mode allows you to chat with an LLM in an interactive fashion with distributed inference. The following example uses 4 GPUs:
+
+[skip default]: begin
+```bash
+python3 torchchat.py chat llama3.1 --max-new-tokens 10 --distributed --tp 2 --pp 2
+```
+[skip default]: end
+
+
+## A Server with Distributed torchchat Inference
+
+This mode exposes a REST API for interacting with a model.
+The server follows the [OpenAI API specification](https://platform.openai.com/docs/api-reference/chat) for chat completions.
+
+To test out the REST API, **you'll need 2 terminals**: one to host the server, and one to send the request.
+
+In one terminal, start the server to run with 4 GPUs:
+
+[skip default]: begin
+
+```bash
+python3 torchchat.py server llama3.1 --distributed --tp 2 --pp 2
+```
+[skip default]: end
+
+
+In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.
+
+> [!NOTE]
+> Since this feature is under active development, not every parameter is consumed. See api/api.py for details on
+> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973).
+
+Example Query + +Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will await the full response from the server. + +**Example Input + Output** + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.1", + "stream": "true", + "max_tokens": 200, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ] + }' +``` +[skip default]: begin +``` +{"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} +``` + +[skip default]: end + + + +
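+
+**Scaling to more GPUs**
+
+The examples above assume 4 GPUs with `--tp 2 --pp 2`. As a sketch (assuming, as the examples
+suggest, that the product of the tensor-parallel and pipeline-parallel degrees should match the
+number of available GPUs), an 8-GPU host could instead use:
+
+[skip default]: begin
+```
+python3 torchchat.py generate llama3.1 --distributed --tp 4 --pp 2 --prompt "write me a story about a boy and his bear"
+```
+[skip default]: end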
+
+[end default]: end
diff --git a/docs/local-model.md b/docs/local-model.md
new file mode 100644
index 000000000..2d48e2438
--- /dev/null
+++ b/docs/local-model.md
@@ -0,0 +1,138 @@
+# Using Local Models in Torchchat
+Torchchat provides powerful capabilities for running large language models (LLMs) locally. This guide focuses on utilizing local copies of
+model checkpoints or models in GGUF format to create a chat application. It also highlights relevant options for advanced users.
+
+## Prerequisites
+To work with local models, you need:
+1. **Model Weights**: A checkpoint file (e.g., `.pth`, `.pt`) or a GGUF file (e.g., `.gguf`).
+2. **Tokenizer**: A tokenizer model file. This can either be in SentencePiece or TikToken format, depending on the tokenizer used with the model.
+3. **Parameter File**: (a) A custom parameter file in JSON format, or (b) a pre-existing parameter file with `--params-path`
+   or `--params-table`, or (c) a pathname that's matched against known models by longest substring in configuration name, using the same algorithm as GPT-fast.
+
+Ensure the tokenizer and parameter files are in the same directory as the checkpoint or GGUF file for automatic detection.
+Let's use a local download of the stories15M tinyllama model as an example:
+
+```
+mkdir stories15M
+cd stories15M
+wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+wget https://github.com/karpathy/llama2.c/raw/refs/heads/master/tokenizer.model
+cp ../torchchat/model_params/stories15M.json model.json
+cd ..
+```
+
+
+## Using Local Checkpoints
+Torchchat provides the CLI flag `--checkpoint-path` for specifying local model weights. The tokenizer is
+loaded from the same directory as the checkpoint with the name `tokenizer.model` unless separately specified.
+This example obtains the model parameters by name matching to known models because `stories15M` is one of the
+models known to torchchat, with a configuration stored in `torchchat/model_params`:
+
+
+### Example 1: Basic Text Generation
+
+
+```
+python3 torchchat.py generate \
+  --checkpoint-path stories15M/stories15M.pt \
+  --prompt "Hello, my name is"
+```
+
+
+### Example 2: Providing Additional Artifacts
+The following is an example of how to specify a local model checkpoint, the model architecture, and a tokenizer file:
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-path stories15M/model.json \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+Alternatively, we can specify the known architecture configuration for known models using `--params-table`
+to specify a particular architecture in `torchchat/model_params`:
+
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-table stories15M \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+## Using GGUF Models
+Torchchat supports loading models in GGUF format using the `--gguf-file` option. Refer to GGUF.md for additional
+documentation about using GGUF files in torchchat.
+
+The GGUF format is compatible with several quantization levels such as F16, F32, Q4_0, and Q6_K. Model
+configuration information is obtained directly from the GGUF file, simplifying setup and obviating the
+need for a separate `model.json` model architecture specification.
+
+
+## Using Local Models with Other Commands
+Torchchat supports all commands, such as chat, browser, server, and export, with local models. (In fact,
+known models simply download and populate the parameters described here for local models.)
+Here is an example setup for running a server with a local model:
+
+
+[skip default]: begin
+```
+python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt
+```
+[skip default]: end
+
+
+[shell default]: python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests
+
+
+In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.
+
+
+> [!NOTE]
+> Since this feature is under active development, not every parameter is consumed. See api/api.py for details on
+> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973).
+
+
+
+Example Query
+Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will
+await the full response from the server.
+
+
+**Example: using the server**
+A model server used with a local model works like any other torchchat server. You can test it by sending a request with `curl`:
+```
+curl http://127.0.0.1:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama3.1",
+    "stream": "true",
+    "max_tokens": 200,
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Hello!"
+      }
+    ]
+  }'
+```
+
+
+[shell default]: kill ${server_pid}
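+
+**Example: exporting a local model**
+As a brief sketch of the export path mentioned above, the same local checkpoint can be exported
+for the ExecuTorch runtime. This mirrors the `--output-pte-path` usage shown elsewhere in the
+docs and is assumed to apply unchanged to local checkpoints:
+
+[skip default]: begin
+```
+python3 torchchat.py export \
+  --checkpoint-path stories15M/stories15M.pt \
+  --output-pte-path stories15M.pte
+```
+[skip default]: end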
+ + +For more information about using different commands, see the root README.md and refer to the Advanced Users Guide for further details on advanced configurations and parameter tuning. + + +[end default]: end diff --git a/docs/model_customization.md b/docs/model_customization.md index 3c076fa71..7108b4ce2 100644 --- a/docs/model_customization.md +++ b/docs/model_customization.md @@ -34,6 +34,9 @@ prefill with `--compile_prefill`. To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/ +For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `compile-prefill`. + +See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html). ## Model Precision diff --git a/docs/multimodal.md b/docs/multimodal.md index f3e3f0fe2..cd249a1fb 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -14,9 +14,11 @@ This page goes over the different commands you can run with LLama 3.2 11B Vision While we strongly encourage you to use the Hugging Face checkpoint (which is the default for torchchat when utilizing the commands with the argument `llama3.2-11B`), we also provide support for manually providing the checkpoint. This can be done by replacing the `llama3.2-11B` argument in the commands below with the following: +[skip default]: begin ``` --checkpoint-path --tokenizer-path --params-path torchchat/model_params/Llama-3.2-11B-Vision.json ``` +[skip default]: end ## Generation This generates text output based on a text prompt and (optional) image prompt. @@ -39,6 +41,9 @@ python3 torchchat.py server llama3.2-11B ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.2-11B & server_pid=$! + + In another terminal, query the server using `curl`. This query might take a few minutes to respond.
@@ -71,10 +76,13 @@ curl http://127.0.0.1:5000/v1/chat/completions \ "max_tokens": 300 }' ``` - +[skip default]: begin ``` {"id": "chatcmpl-cb7b39af-a22e-4f71-94a8-17753fa0d00c", "choices": [{"message": {"role": "assistant", "content": "The image depicts a simple black and white cartoon-style drawing of an animal face. It features a profile view, complete with two ears, expressive eyes, and a partial snout. The animal looks to the left, with its eye and mouth implied, suggesting that the drawn face might belong to a rabbit, dog, or pig. The graphic face has a bold black outline and a smaller, solid black nose. A small circle, forming part of the face, has a white background with two black quirkly short and long curved lines forming an outline of what was likely a mouth, complete with two teeth. The presence of the curve lines give the impression that the animal is smiling or speaking. Grey and black shadows behind the right ear and mouth suggest that this face is looking left and upwards. Given the prominent outline of the head and the outline of the nose, it appears that the depicted face is most likely from the side profile of a pig, although the ears make it seem like a dog and the shape of the nose makes it seem like a rabbit. Overall, it seems that this image, possibly part of a character illustration, is conveying a playful or expressive mood through its design and positioning."}, "finish_reason": "stop"}], "created": 1727487574, "model": "llama3.2", "system_fingerprint": "cpu_torch.float16", "object": "chat.completion"}% ``` +[skip default]: end + +[shell default]: kill ${server_pid}
@@ -90,6 +98,8 @@ First, follow the steps in the Server section above to start a local server. The streamlit run torchchat/usages/browser.py ``` +[skip default]: end + --- # Future Work diff --git a/docs/native-execution.md b/docs/native-execution.md index 790547e21..c22d3c3ba 100644 --- a/docs/native-execution.md +++ b/docs/native-execution.md @@ -16,14 +16,14 @@ The 'llama runner' is a native standalone application capable of running a model exported and compiled ahead-of-time with either Executorch (ET) or AOT Inductor (AOTI). Which model format to use depends on your requirements and preferences. Executorch models are -optimized for portability across a range of decices, including mobile +optimized for portability across a range of devices, including mobile and edge devices. AOT Inductor models are optimized for a particular target architecture, which may result in better performance and efficiency. Building the runners is straightforward with the included cmake build files and is covered in the next sections. We will showcase the -runners using ~~stories15M~~ llama2 7B and llama3. +runners using llama2 7B and llama3. ## What can you do with torchchat's llama runner for native execution? @@ -160,7 +160,7 @@ and native execution environments, respectively. After exporting a model, you will want to verify that the model delivers output of high quality, and works as expected. Both can be -achieved with the Python environment. All torchchat Python comands +achieved with the Python environment. All torchchat Python commands can work with exported models. Instead of loading the model from a checkpoint or GGUF file, use the `--dso-path model.so` and `--pte-path model.pte` for loading both types of exported models. This diff --git a/docs/quantization.md b/docs/quantization.md index 3415d8cb8..56fd2182e 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -59,7 +59,7 @@ for valid `bitwidth` and `groupsize` values. | linear with dynamic activations (symmetric) | `'{"linear:a8w4dq" : {"groupsize" : }}'`| | embedding | `'{"embedding": {"bitwidth": , "groupsize":}}'` | -See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/main/torchchat/utils/quantize.py#L1260-L1266). +See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/b809b69e03f8f4b75a4b27b0778f0d3695ce94c2/torchchat/utils/quantize.py#L887-L894). In addition to quantization, the [accelerator](model_customization.md#device) and [precision](model_customization.md#model-precision) can also be specified. @@ -142,7 +142,7 @@ To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental From the torchchat root directory, run ``` -sh torchchat/utils/scripts/build_torchao_ops.sh +bash torchchat/utils/scripts/build_torchao_ops.sh ``` This should take about 10 seconds to complete. @@ -150,14 +150,14 @@ This should take about 10 seconds to complete. Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners. 
``` -sh torchchat/utils/scripts/build_native.sh aoti link_torchao_ops +bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops ``` ``` -sh torchchat/utils/scripts/build_native.sh et link_torchao_ops +bash torchchat/utils/scripts/build_native.sh et link_torchao_ops ``` -Note before running `sh torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `sh torchchat/utils/scripts/install_et.sh` if you have not done so already. +Note before running `bash torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `bash torchchat/utils/scripts/install_et.sh` if you have not done so already. ### Examples @@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: ``` -OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3 ``` #### ExecuTorch @@ -193,7 +193,33 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command. ``` -./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time," +``` + +## Experimental TorchAO MPS lowbit kernels + +WARNING: These kernels only work on devices with Apple Silicon. + +### Use + +#### linear:afpwx +The quantization scheme linear:afpwx quantizes only the weights in a groupwise manner with a specified bitwidth and groupsize. +It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize (32, 64, 128, 256). + +### Setup +To use linear:afpwx, you must set up the torchao mps experimental kernels. These will only work on device with Apple Silicon. +Currently, torchchat can only run them on Eager mode. 
+ +From the torchchat root directory, run +``` +bash torchchat/utils/scripts/build_torchao_ops.sh mps +``` + +### Examples + +#### Eager mode +``` +python3 torchchat.py generate stories110M --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 4, "groupsize": 256}}' --prompt "Once upon a time," --num-samples 5 ``` ## Quantization Profiles diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index e61fae3a5..e79e9c341 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -72b3bb3194c611f7c4861e6f3b24af5de868af72 +9c043290ad3944268290e015c3063bc411e6ef6b diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 40f083249..2da70769c 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -c8f1174a06dcc0102849c8348ca6573bde8847a9 +2e032c6b0de960dee554dcb08126ace718b14c6d diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 6344509d8..360ba1801 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -9,36 +9,40 @@ set -eou pipefail # Install required python dependencies for developing # Dependencies are defined in .pyproject.toml -PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python} -if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; +if [ -z "${PYTHON_EXECUTABLE:-}" ]; then - PYTHON_EXECUTABLE=python3 + if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; + then + PYTHON_EXECUTABLE=python3 + else + PYTHON_EXECUTABLE=python + fi fi - -# Check python version. Expect 3.10.x or 3.11.x -printf "import sys\nif sys.version_info.major != 3 or sys.version_info.minor < 10 :\n\tprint('Please use Python >=3.10');sys.exit(1)\n" | $PYTHON_EXECUTABLE -if [[ $? -ne 0 ]] +echo "Using python executable: $PYTHON_EXECUTABLE" + +PYTHON_SYS_VERSION="$($PYTHON_EXECUTABLE -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")" +# Check python version. Expect at least 3.10.x +if ! $PYTHON_EXECUTABLE -c " +import sys +if sys.version_info < (3, 10): + sys.exit(1) +"; then + echo "Python version must be at least 3.10.x. Detected version: $PYTHON_SYS_VERSION" exit 1 fi if [[ "$PYTHON_EXECUTABLE" == "python" ]]; then PIP_EXECUTABLE=pip -else +elif [[ "$PYTHON_EXECUTABLE" == "python3" ]]; +then PIP_EXECUTABLE=pip3 +else + PIP_EXECUTABLE=pip${PYTHON_SYS_VERSION} fi -# -# First install requirements in install/requirements.txt. Older torch may be -# installed from the dependency of other models. It will be overridden by -# newer version of torch nightly installed later in this script. -# - -( - set -x - $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -) +echo "Using pip executable: $PIP_EXECUTABLE" # Since torchchat often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. The PYTORCH_NIGHTLY_VERSION value should @@ -47,38 +51,60 @@ fi # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. 
-PYTORCH_NIGHTLY_VERSION=dev20241002 +PYTORCH_NIGHTLY_VERSION=dev20250124 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241002 +VISION_NIGHTLY_VERSION=dev20250124 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241010 - -# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same -( - set -x - $PIP_EXECUTABLE uninstall -y triton -) +TUNE_NIGHTLY_VERSION=dev20250124 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly # with cuda for faster execution on cuda GPUs. if [[ -x "$(command -v nvidia-smi)" ]]; then - TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121" + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu124" elif [[ -x "$(command -v rocminfo)" ]]; then TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2" +elif [[ -x "$(command -v xpu-smi)" ]]; +then + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/xpu" else TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" fi # pip packages needed by exir. -REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" - torchvision=="0.20.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.4.0.${TUNE_NIGHTLY_VERSION}" +if [[ -x "$(command -v xpu-smi)" ]]; +then + REQUIREMENTS_TO_INSTALL=( + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.6.0" + ) +else + REQUIREMENTS_TO_INSTALL=( + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}" + ) +fi + +# +# First install requirements in install/requirements.txt. Older torch may be +# installed from the dependency of other models. It will be overridden by +# newer version of torch nightly installed later in this script. +# +( + set -x + $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url "${TORCH_NIGHTLY_URL}" +) + +# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same +( + set -x + $PIP_EXECUTABLE uninstall -y triton ) # Install the requirements. --extra-index-url tells pip to look for package @@ -89,9 +115,11 @@ REQUIREMENTS_TO_INSTALL=( "${REQUIREMENTS_TO_INSTALL[@]}" ) +# For torchao need to install from github since nightly build doesn't have macos build. 
+# TODO: Remove this and install nightly build, once it supports macos ( set -x - $PIP_EXECUTABLE install torchao=="0.5.0" + $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d ) if [[ -x "$(command -v nvidia-smi)" ]]; then @@ -100,8 +128,6 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then $PYTHON_EXECUTABLE torchchat/utils/scripts/patch_triton.py ) fi - - ( set -x $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" diff --git a/install/requirements.txt b/install/requirements.txt index d051d29cd..bd1e09174 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -9,12 +9,14 @@ gguf # Tiktoken tokenizer for Llama 3 and other advanced models tiktoken +# Tokenizers and jinja2 for other non-llama models that use HF tokenizers +tokenizers +jinja2 + # Miscellaneous snakeviz sentencepiece -# numpy version range required by GGUF util -numpy >= 1.17, < 2.0 -gguf +numpy >= 1.17 blobfile tomli >= 1.1.0 ; python_version < "3.11" openai diff --git a/runner/run.cpp b/runner/run.cpp index abfbb4584..d64c636bb 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -7,20 +7,21 @@ LICENSE file in the root directory of this source tree. */ /* Inference for Llama-2 Transformer model in pure C++ */ +#include "sentencepiece.h" +#include "tiktoken.h" +#include +#include +#include +#include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include #include - +#include #ifdef DEBUG #include #include @@ -47,13 +48,25 @@ torch::Device aoti_device(torch::kCPU); #endif using exec_aten::ScalarType; -using torch::executor::EValue; -using executorch::extension::TensorPtr; using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using torch::executor::EValue; using torch::executor::Module; using torch::executor::Result; #endif +using tokenizers::SPTokenizer; +using tokenizers::Tiktoken; +using tokenizers::Tokenizer; + +#define UNWRAP(x) \ + ({ \ + if (!(x).ok()) { \ + fprintf(stderr, "Got error code % " PRIu32, x.error()); \ + exit(EXIT_FAILURE); \ + } \ + std::move(x.get()); \ + }) // ---------------------------------------------------------------------------- // Transformer model @@ -65,56 +78,57 @@ enum ModelType { ModelType get_model_type(int model_int) { switch (model_int) { - case 2: - return LLAMA2_MODEL; - break; - case 3: - return LLAMA3_MODEL; - break; - default: - return UNKNOWN_MODEL; + case 2: + return LLAMA2_MODEL; + break; + case 3: + return LLAMA3_MODEL; + break; + default: + return UNKNOWN_MODEL; } } typedef struct { int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length + int seq_len; // max sequence length } Config; typedef struct { - float* logits; // output logits - int64_t* toks; // tokens seen so far; no kv-cache :( + float *logits; // output logits + int64_t *toks; // tokens seen so far; no kv-cache :( } RunState; typedef struct { - Config config; // the hyperparameters of the architecture (the blueprint) + Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass + std::unordered_map metadata; #ifdef __AOTI_MODEL__ - torch::inductor::AOTIModelPackageLoader* runner; + torch::inductor::AOTIModelPackageLoader *runner; #else // __ET_MODEL__ - Module* runner; + Module *runner; #endif } Transformer; -void malloc_run_state(RunState* s, Config* p) { +void malloc_run_state(RunState *s, Config 
*p) { // we calloc instead of malloc to keep valgrind happy - s->logits = (float*)calloc(p->vocab_size, sizeof(float)); - s->toks = (int64_t*)calloc(p->seq_len, sizeof(int64_t)); + s->logits = (float *)calloc(p->vocab_size, sizeof(float)); + s->toks = (int64_t *)calloc(p->seq_len, sizeof(int64_t)); if (!s->logits || !s->toks) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); } } -void free_run_state(RunState* s) { +void free_run_state(RunState *s) { free(s->logits); free(s->toks); } -void read_checkpoint(char* checkpoint, Config* config) { - FILE* file = fopen(checkpoint, "rb"); +void read_checkpoint(char *checkpoint, Config *config) { + FILE *file = fopen(checkpoint, "rb"); if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); @@ -128,21 +142,9 @@ void read_checkpoint(char* checkpoint, Config* config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer( - Transformer* t, - char* model_path, - int vocab_size, - int seq_len) { - // read in the Config and the Weights from the model - // read_checkpoint(model_path, &t->config); - // allocate the RunState buffers - t->config.vocab_size = vocab_size; - t->config.seq_len = seq_len; - malloc_run_state(&t->state, &t->config); - +void build_transformer(Transformer *t, char *model_path) { #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -150,7 +152,7 @@ void build_transformer( #endif } -void free_transformer(Transformer* t) { +void free_transformer(Transformer *t) { // free the RunState buffers free_run_state(&t->state); delete t->runner; @@ -159,7 +161,7 @@ void free_transformer(Transformer* t) { // ---------------------------------------------------------------------------- // neural net blocks; the dynamics of the Transformer -void softmax(float* x, int size) { +void softmax(float *x, int size) { // find max value (for numerical stability) float max_val = x[0]; for (int i = 1; i < size; i++) { @@ -179,9 +181,9 @@ void softmax(float* x, int size) { } } -float* forward(Transformer* transformer, int token, int pos) { - Config* p = &transformer->config; - RunState* s = &transformer->state; +float *forward(Transformer *transformer, int token, int pos) { + Config *p = &transformer->config; + RunState *s = &transformer->state; s->toks[pos] = token; long token_buffer[1] = {token}; long pos_buffer[1] = {pos}; @@ -194,8 +196,8 @@ float* forward(Transformer* transformer, int token, int pos) { torch::Tensor token_tensor = torch::from_blob(token_buffer, {1, 1}, torch::kLong); torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong); - std::vector inputs{ - token_tensor.to(aoti_device), pos_tensor.to(aoti_device)}; + std::vector inputs{token_tensor.to(aoti_device), + pos_tensor.to(aoti_device)}; torch::Tensor result = transformer->runner->run(inputs)[0] .to(torch::dtype(torch::kFloat32)) @@ -204,7 +206,8 @@ float* forward(Transformer* transformer, int token, int pos) { memcpy(s->logits, logits, p->vocab_size * sizeof(float)); #else // __ET_MODEL__ TensorPtr pos_managed = make_tensor_ptr({1}, pos_buffer, ScalarType::Long); - TensorPtr tokens_managed = make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); + TensorPtr tokens_managed = + make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); std::vector inputs; auto tmp1 = 
EValue(tokens_managed); auto tmp2 = EValue(pos_managed); @@ -221,17 +224,12 @@ float* forward(Transformer* transformer, int token, int pos) { // HACK: the rest of this runner assumes that logits must be float, // so we simply convert them rather than plumbing // templating/switch-on-type through the rest of this file. - const auto& result_tensor = result[0].toTensor(); + const auto &result_tensor = result[0].toTensor(); ET_SWITCH_REALHBBF16_TYPES( - result_tensor.scalar_type(), - unused, - "forward", - CTYPE, - [&]() { - const CTYPE* logits = result_tensor.const_data_ptr(); - std::transform(logits, logits + p->vocab_size, s->logits, [](auto x) { - return static_cast(x); - }); + result_tensor.scalar_type(), unused, "forward", CTYPE, [&]() { + const CTYPE *logits = result_tensor.const_data_ptr(); + std::transform(logits, logits + p->vocab_size, s->logits, + [](auto x) { return static_cast(x); }); }); #endif @@ -249,13 +247,13 @@ typedef struct { typedef struct { int vocab_size; - ProbIndex* probindex; // buffer used in top-p sampling + ProbIndex *probindex; // buffer used in top-p sampling float temperature; float topp; unsigned long long rng_state; } Sampler; -int sample_argmax(float* probabilities, int n) { +int sample_argmax(float *probabilities, int n) { // return the index that has the highest probability int max_i = 0; float max_p = probabilities[0]; @@ -268,7 +266,7 @@ int sample_argmax(float* probabilities, int n) { return max_i; } -int sample_mult(float* probabilities, int n, float coin) { +int sample_mult(float *probabilities, int n, float coin) { // sample index from probabilities (they must sum to 1!) // coin is a random number in [0, 1), usually from random_f32() float cdf = 0.0f; @@ -281,9 +279,9 @@ int sample_mult(float* probabilities, int n, float coin) { return n - 1; // in case of rounding errors } -int compare(const void* a, const void* b) { - ProbIndex* a_ = (ProbIndex*)a; - ProbIndex* b_ = (ProbIndex*)b; +int compare(const void *a, const void *b) { + ProbIndex *a_ = (ProbIndex *)a; + ProbIndex *b_ = (ProbIndex *)b; if (a_->prob > b_->prob) return -1; if (a_->prob < b_->prob) @@ -291,12 +289,8 @@ int compare(const void* a, const void* b) { return 0; } -int sample_topp( - float* probabilities, - int n, - float topp, - ProbIndex* probindex, - float coin) { +int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, + float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of // tokens that exceed probability topp. This way we never sample tokens that // have very low probabilities and are less likely to go "off the rails". 
@@ -339,37 +333,31 @@ int sample_topp( return probindex[last_idx].index; // in case of rounding errors } -void build_sampler( - Sampler* sampler, - int vocab_size, - float temperature, - float topp, - unsigned long long rng_seed) { +void build_sampler(Sampler *sampler, int vocab_size, float temperature, + float topp, unsigned long long rng_seed) { sampler->vocab_size = vocab_size; sampler->temperature = temperature; sampler->topp = topp; sampler->rng_state = rng_seed; // buffer only used with nucleus sampling; may not need but it's ~small sampler->probindex = - (ProbIndex*)malloc(sampler->vocab_size * sizeof(ProbIndex)); + (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex)); } -void free_sampler(Sampler* sampler) { - free(sampler->probindex); -} +void free_sampler(Sampler *sampler) { free(sampler->probindex); } -unsigned int random_u32(unsigned long long* state) { +unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long* state) { // random float32 in [0,1) +float random_f32(unsigned long long *state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } -int sample(Sampler* sampler, float* logits) { +int sample(Sampler *sampler, float *logits) { // sample the token given the logits and some hyperparameters int next; if (sampler->temperature == 0.0f) { @@ -390,39 +378,37 @@ int sample(Sampler* sampler, float* logits) { next = sample_mult(logits, sampler->vocab_size, coin); } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - next = sample_topp( - logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + next = sample_topp(logits, sampler->vocab_size, sampler->topp, + sampler->probindex, coin); } } return next; } -Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) { - Tokenizer* tokenizer = NULL; +Tokenizer *build_tokenizer(const char *tokenizer_path, ModelType model_type) { + Tokenizer *tokenizer = NULL; switch (model_type) { - case LLAMA2_MODEL: - tokenizer = new SPTokenizer(); - tokenizer->load(tokenizer_path); - break; - case LLAMA3_MODEL: - tokenizer = new Tiktoken(); - tokenizer->load(tokenizer_path); - break; - default: - fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + tokenizer = new SPTokenizer(); + tokenizer->load(tokenizer_path); + break; + case LLAMA3_MODEL: + tokenizer = new Tiktoken(); + tokenizer->load(tokenizer_path); + break; + default: + fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); + exit(EXIT_FAILURE); } return tokenizer; } -void free_tokenizer(Tokenizer* tokenizer) { - delete tokenizer; -} +void free_tokenizer(Tokenizer *tokenizer) { delete tokenizer; } // ---------------------------------------------------------------------------- // utilities: time -void safe_printf(const char* piece) { +void safe_printf(const char *piece) { // piece might be a raw byte token, and we only want to print printable chars // or whitespace because some of the other bytes can be various control codes, // backspace, etc. @@ -454,21 +440,18 @@ long time_in_ms() { // Prints decoded tokens generated from the transformer. 
// The first token is not printed and is assumed to be a BOS or other similar // token -unsigned generate_from_prompt_tokens( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const std::vector& prompt_tokens, - unsigned pos, - const std::vector& stop_tokens, - int stop_pos, - bool print_prompt, - bool print_tok_per_sec) { +unsigned generate_from_prompt_tokens(Transformer *transformer, + Tokenizer *tokenizer, Sampler *sampler, + const std::vector &prompt_tokens, + unsigned pos, + const std::vector &stop_tokens, + int stop_pos, bool print_prompt, + bool print_tok_per_sec) { if (prompt_tokens.size() == 0) { return pos; } - uint64_t next; // will store the next token in the sequence + uint64_t next; // will store the next token in the sequence uint64_t token; // stores the current token to feed into the transformer bool done_with_prompt; // whether we are done processing prompt @@ -486,7 +469,7 @@ unsigned generate_from_prompt_tokens( if (pos_in_prompt < prompt_tokens.size()) { // Token comes from prompt token = prompt_tokens[pos_in_prompt++]; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); // Next token is either from prompt or if on last // prompt token, next is sampled @@ -498,29 +481,27 @@ unsigned generate_from_prompt_tokens( } else { // Token comes from next sampled from previous round. token = next; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); next = sample(sampler, logits); } done_with_prompt = (pos_in_prompt >= prompt_tokens.size()); // we terminate on finding the stop_token if we are done processing the // prompt (stop_tokens in the prompt do not terminate the loop) - if (done_with_prompt && - (std::find(stop_tokens.begin(), stop_tokens.end(), token) != - stop_tokens.end())) { + if (done_with_prompt && (std::find(stop_tokens.begin(), stop_tokens.end(), + token) != stop_tokens.end())) { found_stop_token = true; } // We print next in each iteration of the loop, not token if (!found_stop_token && (print_prompt || done_with_prompt)) { // The stop_token is printed as newline - bool next_is_stop = - std::find(stop_tokens.begin(), stop_tokens.end(), next) != - stop_tokens.end(); + bool next_is_stop = std::find(stop_tokens.begin(), stop_tokens.end(), + next) != stop_tokens.end(); if (next_is_stop) { printf("\n"); } else { - std::string piece = tokenizer->decode(token, next); + std::string piece = UNWRAP(tokenizer->decode(token, next)); safe_printf(piece.c_str()); // same as printf("%s", piece), but skips // "unsafe" bytes fflush(stdout); @@ -538,23 +519,16 @@ unsigned generate_from_prompt_tokens( // iteration) if (print_tok_per_sec && pos > 1) { long end = time_in_ms(); - fprintf( - stderr, - "\n\nachieved tok/s: %f\n", - (pos - 1) / (double)(end - start) * 1000); + fprintf(stderr, "\n\nachieved tok/s: %f\n", + (pos - 1) / (double)(end - start) * 1000); } return pos; } -void generate( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* prompt, - int steps, - ModelType model_type) { - const char* default_prompt = "Once upon a time"; +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *prompt, int steps, ModelType model_type) { + const char *default_prompt = "Once upon a time"; if (prompt == NULL) { prompt = default_prompt; } @@ -566,33 +540,30 @@ void generate( std::vector prompt_tokens; std::vector stop_tokens; switch (model_type) { - case LLAMA2_MODEL: - prompt_tokens = 
tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->eos_tok()); - break; - case LLAMA3_MODEL: - prompt_tokens = tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]); - stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]); - break; - default: - fprintf(stderr, "Generate does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); - } - - generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - /*pos=*/0, - /*stop_tokens=*/stop_tokens, - /*stop_pos=*/steps - 1, - /*print_prompt=*/true, - /*print_tok_per_sec=*/true); + case LLAMA2_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back(tokenizer->eos_tok()); + break; + case LLAMA3_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back( + UNWRAP(tokenizer->encode("<|end_of_text|>", 0, 0))[0]); + stop_tokens.push_back(UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]); + break; + default: + fprintf(stderr, "Generate does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); + } + + generate_from_prompt_tokens(transformer, tokenizer, sampler, prompt_tokens, + /*pos=*/0, + /*stop_tokens=*/stop_tokens, + /*stop_pos=*/steps - 1, + /*print_prompt=*/true, + /*print_tok_per_sec=*/true); } -void read_stdin(const char* guide, char* buffer, size_t bufsize) { +void read_stdin(const char *guide, char *buffer, size_t bufsize) { // read a line from stdin, up to but not including \n printf("%s", guide); if (fgets(buffer, bufsize, stdin) != NULL) { @@ -609,11 +580,10 @@ void read_stdin(const char* guide, char* buffer, size_t bufsize) { // python reference and that seemed ok, but this was not thoroughly tested and // is not safely implemented, it's more a proof of concept atm. 
-std::vector get_initial_prompt_tokens( - const char* cli_system_prompt, - const char* cli_user_prompt, - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_initial_prompt_tokens(const char *cli_system_prompt, + const char *cli_user_prompt, + Tokenizer *tokenizer, + ModelType model_type) { char system_prompt[512]; char user_prompt[512]; char rendered_prompt[512 * 2 + 200]; // the prompt template is ~170 @@ -622,10 +592,8 @@ std::vector get_initial_prompt_tokens( if (cli_system_prompt != NULL) { strcpy(system_prompt, cli_system_prompt); } else { - read_stdin( - "Enter system prompt (optional): ", - system_prompt, - sizeof(system_prompt)); + read_stdin("Enter system prompt (optional): ", system_prompt, + sizeof(system_prompt)); } if (cli_user_prompt != NULL) { @@ -637,48 +605,40 @@ std::vector get_initial_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] <>\n%s\n<>\n\n%s [/INST]", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - } + case LLAMA2_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] <>\n%s\n<>\n\n%s [/INST]", system_prompt, + user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", user_prompt); + } - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, 1, 0); - break; - - case LLAMA3_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - } - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 1, 0)); + break; + + case LLAMA3_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + "\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<" + "|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + system_prompt, user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%" + "s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + } + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -695,9 +655,8 @@ std::vector get_initial_prompt_tokens( return tokens; } -std::vector get_next_user_prompt_tokens( - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_next_user_prompt_tokens(Tokenizer 
*tokenizer, + ModelType model_type) { char user_prompt[512]; char rendered_prompt[512 + 150]; // the prompt template is ~100 characters. We // use 150 to be safe. @@ -706,30 +665,26 @@ std::vector get_next_user_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0); - break; - - case LLAMA3_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; - - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, "[INST] %s [/INST]", + user_prompt); + + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0)); + break; + + case LLAMA3_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_" + "header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; + + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -746,14 +701,9 @@ std::vector get_next_user_prompt_tokens( return tokens; } -void chat( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* cli_user_prompt, - const char* cli_system_prompt, - unsigned steps, - ModelType model_type) { +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *cli_user_prompt, const char *cli_system_prompt, + unsigned steps, ModelType model_type) { if (steps == 0) { return; } @@ -761,16 +711,16 @@ void chat( uint64_t eot_token; std::vector prompt_tokens; switch (model_type) { - case LLAMA2_MODEL: - // llama2 uses EOS as EOT token - eot_token = tokenizer->eos_tok(); - break; - case LLAMA3_MODEL: - eot_token = tokenizer->encode("<|eot_id|>", 0, 0)[0]; - break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + // llama2 uses EOS as EOT token + eot_token = tokenizer->eos_tok(); + break; + case LLAMA3_MODEL: + eot_token = UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]; + break; + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } std::vector stop_tokens{eot_token}; @@ -784,11 +734,7 @@ void chat( } printf("Assistant: "); pos = generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - pos, + transformer, tokenizer, sampler, prompt_tokens, pos, /*stop_tokens=*/stop_tokens, /*stop_pos=*/steps - 1, // We could pass in -1 here if we do not want // the model to stop mid-reply @@ -803,46 +749,40 @@ void chat( void error_usage() { fprintf(stderr, "Usage: run [options]\n"); - fprintf( - stderr, "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, + "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); fprintf(stderr, 
"Options:\n"); fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); - fprintf( - stderr, - " -p p value in top-p (nucleus) sampling in [0,1], default 0.9\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1], " + "default 0.9\n"); fprintf(stderr, " -s random seed, default time(NULL)\n"); - fprintf( - stderr, - " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = " + "max_seq_len\n"); fprintf(stderr, " -i input prompt\n"); fprintf(stderr, " -z path to tokenizer\n"); fprintf(stderr, " -m mode: generate|chat, default: generate\n"); fprintf(stderr, " -y (optional) system prompt in chat mode\n"); - fprintf( - stderr, - " -v (optional) vocab size, default is model-specific.\n"); - fprintf( - stderr, " -l (optional) llama version (2 or 3), default 2.\n"); - fprintf( - stderr, - " -d (optional) device(CUDA or CPU) model was exported for\n"); + fprintf(stderr, + " -v (optional) vocab size, default is model-specific.\n"); + fprintf(stderr, + " -l (optional) llama version (2 or 3), default 2.\n"); exit(EXIT_FAILURE); } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // default parameters - char* model_path = NULL; - char* tokenizer_path = NULL; + char *model_path = NULL; + char *tokenizer_path = NULL; float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, // but slower - int steps = 128; // number of steps to run for - const char* prompt = NULL; // prompt string + int steps = 128; // number of steps to run for + const char *prompt = NULL; // prompt string unsigned long long rng_seed = 0; // seed rng with time by default - const char* mode = "generate"; // generate|chat - char* system_prompt = + const char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode int vocab_size = -1; @@ -863,64 +803,53 @@ int main(int argc, char* argv[]) { } else { error_usage(); } - for (int i = 2; i < argc; i += 2) { + for (int i = 2; i < argc; i += 1) { // do some basic validation - if (i + 1 >= argc) { - error_usage(); - } // must have arg after flag + char *parm = argv[i+1]; + // uniarg means the arg comes right after the letter in accordance with posix + int uniarg = strlen(argv[i]) > 2; + if (argv[i][0] != '-') { error_usage(); } // must start with dash - if (strlen(argv[i]) != 2) { + + if (strlen(argv[i]) < 2) { + error_usage(); + } // must have at least dash '-' and option letter + + if (uniarg) { + parm=&argv[i][2]; + } else if (i + 1 >= argc) { error_usage(); - } // must be -x (one dash, one letter) + } // must have arg after option if flag is not contiguous to option + // read in the args if (argv[i][1] == 't') { - temperature = atof(argv[i + 1]); + temperature = atof(parm); } else if (argv[i][1] == 'p') { - topp = atof(argv[i + 1]); + topp = atof(parm); } else if (argv[i][1] == 's') { - rng_seed = atoi(argv[i + 1]); + rng_seed = atoi(parm); } else if (argv[i][1] == 'n') { - steps = atoi(argv[i + 1]); + steps = atoi(parm); } else if (argv[i][1] == 'v') { - vocab_size = atoi(argv[i + 1]); + vocab_size = atoi(parm); } else if (argv[i][1] == 'i') { - prompt = argv[i + 1]; + prompt = parm; } else if (argv[i][1] == 'z') { - tokenizer_path = argv[i + 1]; + tokenizer_path = parm; } else if (argv[i][1] == 'm') { - mode = argv[i + 1]; + mode = parm; } else if (argv[i][1] == 'y') { - system_prompt = argv[i + 1]; + 
system_prompt = parm; } else if (argv[i][1] == 'l') { - llama_ver = atoi(argv[i + 1]); -#ifdef __AOTI_MODEL__ - } else if (argv[i][1] == 'd') { -#ifdef USE_CUDA - if (strcasecmp(argv[i + 1], "CUDA") == 0) { - aoti_device = torch::Device(torch::kCUDA); - } else -#endif - if (strcasecmp(argv[i + 1], "CPU") == 0) { - aoti_device = torch::Device(torch::kCPU); - } else { - fprintf(stderr, "Unknown device %s", argv[i + 1]); - exit(1); - } -#endif + llama_ver = atoi(parm); } else { error_usage(); } - } - ModelType model_type = get_model_type(llama_ver); - if (model_type == UNKNOWN_MODEL) { - fprintf( - stderr, - "Unknown model type passed by -l argument. Received l=%d.", - llama_ver); - error_usage(); + // account for parameter + i += (uniarg)?0:1; } if (model_path == NULL) { @@ -928,6 +857,25 @@ int main(int argc, char* argv[]) { error_usage(); } + Transformer transformer; + build_transformer(&transformer, model_path); + +#ifdef __AOTI_MODEL__ + auto aoti_metadata = transformer.runner->get_metadata(); + aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu" + ? torch::Device(torch::kCPU) + : torch::Device(torch::kCUDA); + ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"])); +#else // __ET_MODEL__ + ModelType model_type = get_model_type(llama_ver); +#endif + + if (model_type == UNKNOWN_MODEL) { + fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.", + llama_ver); + error_usage(); + } + if (tokenizer_path == NULL) { fprintf(stderr, "No tokenizer_path provided."); error_usage(); @@ -943,15 +891,19 @@ int main(int argc, char* argv[]) { if (steps < 0) steps = 0; - Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type); + Tokenizer *tokenizer = build_tokenizer(tokenizer_path, model_type); // If no tokenizer path provided, get default for model_type if (vocab_size == -1) { vocab_size = tokenizer->vocab_size(); } - Transformer transformer; - build_transformer(&transformer, model_path, vocab_size, steps); + // read in the Config and the Weights from the model + // read_checkpoint(model_path, &t->config); + // allocate the RunState buffers + transformer.config.vocab_size = vocab_size; + transformer.config.seq_len = steps; + malloc_run_state(&transformer.state, &transformer.config); Sampler sampler; build_sampler(&sampler, vocab_size, temperature, topp, rng_seed); @@ -959,14 +911,8 @@ int main(int argc, char* argv[]) { if (strcmp(mode, "generate") == 0) { generate(&transformer, tokenizer, &sampler, prompt, steps, model_type); } else if (strcmp(mode, "chat") == 0) { - chat( - &transformer, - tokenizer, - &sampler, - prompt, - system_prompt, - steps, - model_type); + chat(&transformer, tokenizer, &sampler, prompt, system_prompt, steps, + model_type); } else { fprintf(stderr, "unknown mode: %s\n", mode); error_usage(); diff --git a/runner/third-party/tokenizers b/runner/third-party/tokenizers new file mode 160000 index 000000000..3f536fc01 --- /dev/null +++ b/runner/third-party/tokenizers @@ -0,0 +1 @@ +Subproject commit 3f536fc0139f7987940f69de2aef58eec1794f6a diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..c1580e27b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +""" +Global pytest config, fixtures, and helpers go here! 
+""" + +# Standard +import os +import sys + +# Make sure tests can import torchchat +sys.path.append( + os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) +) diff --git a/tests/test_chat_formatters.py b/tests/test_chat_formatters.py new file mode 100644 index 000000000..2f7f7a955 --- /dev/null +++ b/tests/test_chat_formatters.py @@ -0,0 +1,216 @@ +""" +Unit tests for chat formatters +""" + +# Third Party +import pytest + +# Local +from torchchat.generate import ( + HFTokenizerChatFormatter, + Llama2ChatFormatter, + Llama3ChatFormatter, +) + +## Helpers ##################################################################### + +class DummyTokenizer: + """Dummy tokenizer that encodes as strings so it's easy to check formatting""" + def encode(self, text, *_, **__): + return text + + +class DummySPTokenizer(DummyTokenizer): + """Emulated Sentencepiece tokenizer with bos/eos""" + bos = "" + eos = "" + + +class DummyLlama3Tokenizer(DummyTokenizer): + class _IdentityDict: + def __getitem__(self, key): + return key + special_tokens = _IdentityDict() + + +class DummyHFTokenizer(DummyTokenizer): + """Dummy made up chat template scheme""" + # Sequence + bos = "" + # Turn + bot = "" + eot = "" + # Role + bor = "" + eor = "" + def apply_chat_template(self, messages, add_generation_prompt): + out = [self.bos] + role = None + for msg in messages: + role = msg["role"] + content = msg["content"] + out.append(f"{self.bot}{self.bor}{role}{self.eor}{content}{self.eot}") + if add_generation_prompt and role != "assistant": + out.append(f"{self.bot}{self.bor}assistant{self.eor}") + return "\n".join(out) + + +def check_rendering(fmt, messages, expected, add_generation_prompt): + """Render messages and compare to expected output""" + assert "".join(fmt.encode_dialog_prompt(messages, add_generation_prompt)) == expected + + +def make_message(role, text): + return {"role": role, "content": text} + + +SYSTEM_PROMPT = "You are a helpful assistant, feel free to ask me anything." +USER1 = "Hello world!" +ASSISTANT1 = "Greetings! How can I help you?" +USER2 = "Why is the sky blue?" +ASSISTANT2 = "The sky appears blue because of a phenomenon called Rayleigh scattering." 
+ + +# Stock sets of messages to test +MSGS_NO_SYS= [ + make_message("user", USER1), +] +MSGS_SYS_USR = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), +] +MSGS_SYS_USR_ASST = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), +] +MSGS_MULTI_TURN = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), + make_message("user", USER2), + make_message("assistant", ASSISTANT2), +] + +## Llama2ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"[INST] {USER1} [/INST]"), + # sys, usr + (MSGS_SYS_USR, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST]"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +[INST] {USER2} [/INST] {ASSISTANT2} +"""), + ] +) +def test_llama2_chat_formatter(messages, expected): + """Tests for Llama2 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + tok = DummySPTokenizer() + fmt = Llama2ChatFormatter(tok) + # NOTE: add_generation_prompt not used by Llama2 + check_rendering(fmt, messages, expected, True) + +## Llama3ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr + (MSGS_SYS_USR, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|>"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER2}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT2}<|eot_id|>"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_llama3_chat_formatter(messages, expected, add_generation_prompt): + """Tests for Llama3 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ + """ + tok = DummyLlama3Tokenizer() + fmt = Llama3ChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += "<|start_header_id|>assistant<|end_header_id|>\n\n" + check_rendering(fmt, messages, expected, add_generation_prompt) + +## HFTokenizerChatFormatter #################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f""" +user{USER1}"""), + # sys, usr + (MSGS_SYS_USR, f""" +system{SYSTEM_PROMPT} 
+user{USER1}"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1}"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1} +user{USER2} +assistant{ASSISTANT2}"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_hf_chat_formatter(messages, expected, add_generation_prompt): + tok = DummyHFTokenizer() + fmt = HFTokenizerChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += f"\n{tok.bot}{tok.bor}assistant{tok.eor}" + check_rendering(fmt, messages, expected, add_generation_prompt) diff --git a/tokenizer/CMakeLists.txt b/tokenizer/CMakeLists.txt deleted file mode 100644 index 39c20885d..000000000 --- a/tokenizer/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.24) -set(CMAKE_CXX_STANDARD 17) -IF(DEFINED ENV{TORCHCHAT_ROOT}) - set(TORCHCHAT_ROOT $ENV{TORCHCHAT_ROOT}) -ELSE() - set(TORCHCHAT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) -ENDIF() - -# build tokenizer library -add_library( - tokenizer - tokenizer.h - sentencepiece.cpp - tiktoken.cpp) - -target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src) - -# add RE2 as subdirectory -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag -${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(third-party/abseil-cpp) -add_subdirectory(third-party/re2) -add_subdirectory(third-party/sentencepiece) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - -target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static) diff --git a/tokenizer/base64.h b/tokenizer/base64.h deleted file mode 100644 index dfeefef55..000000000 --- a/tokenizer/base64.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- *************************************************************************/ - -#pragma once - -#include -#include -#include - -namespace base64 { - -std::string decode(const std::string_view& input); - -namespace detail { - -constexpr uint32_t DECODE_TABLE[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, - 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; - -inline void validate(uint32_t v) { - if (v == 255) { - fprintf(stderr, "invalid char"); - exit(EXIT_FAILURE); - } -} - -inline void decode(const std::string_view& input, std::string& output) { - if (input.size() != 4) { - fprintf(stderr, "input length must be 4, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[3]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 16) & 0xFF)); - output.push_back(static_cast((val >> 8) & 0xFF)); - output.push_back(static_cast(val & 0xFF)); -} - -inline void decode_1_padding( - const std::string_view& input, - std::string& output) { - if (input.size() != 3) { - fprintf(stderr, "input length must be 3, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 10) & 0xFF)); - output.push_back(static_cast((val >> 2) & 0xFF)); -} - -inline void decode_2_padding( - const std::string_view& input, - std::string& output) { - assert(input.size() == 2); - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 4) & 0xFF)); -} - -} // namespace detail - -inline std::string decode(const std::string_view& input) { - if (input.empty()) { - fprintf(stderr, "empty input"); - exit(EXIT_FAILURE); - } - - // Faster than `input.size() % 4`. 
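Note: the removed `base64.h` implemented plain RFC 4648 decoding (with one- and two-character padding handled explicitly); tiktoken-style vocabulary files store each token as a base64-encoded byte string followed by its rank. For reference only, the same decode step via Python's standard library (the sample line is made up):

```python
import base64

line = "SGVsbG8= 31373"              # hypothetical "<base64 token> <rank>" entry
b64_token, rank = line.split(" ")
token_bytes = base64.b64decode(b64_token)
print(token_bytes, int(rank))        # b'Hello' 31373
```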
- if ((input.size() & 3) != 0 || input.size() < 4) { - fprintf( - stderr, - "input length must be larger than 4 and is multiple of 4, got %zu", - input.size()); - exit(EXIT_FAILURE); - } - - std::string output; - output.reserve(input.size() / 4 * 3); - auto idx = 0U; - for (; idx < input.size() - 4; idx += 4) { - detail::decode(input.substr(idx, 4), output); - } - - // Last 4 bytes. Might contain paddings. - if (input[idx + 3] == '=') { - if (input[idx + 2] == '=') { - // Tow paddings. - detail::decode_2_padding(input.substr(idx, 2), output); - } else { - // One padding. - detail::decode_1_padding(input.substr(idx, 3), output); - } - } else { - // No padding. - detail::decode(input.substr(idx, 4), output); - } - - return output; -} -} // namespace base64 diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index 7ad5807d1..b77ee43ea 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -5,11 +5,12 @@ # LICENSE file in the root directory of this source tree. # Standard -from typing import List, Optional +from typing import Dict, List, Optional import json import os # Third Party +import jinja2 from tokenizers import Tokenizer # Local @@ -37,17 +38,28 @@ def __init__(self, file_path: str): # Load the tokenizer itself self._tokenizer = Tokenizer.from_file(tokenizer_path) + # Load the chat template if we have a config path + self._chat_template: Optional[jinja2.Template] = None + # If available, parse bos/eos tokens from the tokenizer config self._bos_id, self._eos_id = None, None if tokenizer_config_path is not None: with open(tokenizer_config_path, "r") as handle: tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") + + def _extract_token(identifier: str) -> Optional[str]: + entry: Optional[Union[str, dict]] = tok_config.get(identifier) + return entry.get("content") if isinstance(entry, dict) else entry + + bos_token = _extract_token("bos_token") + eos_token = _extract_token("eos_token") + if bos_token is not None: self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: self._eos_id = self._tokenizer.token_to_id(eos_token) + if chat_template_str := tok_config.get("chat_template"): + self._chat_template = jinja2.Template(chat_template_str) # If no eos/bos tokens found, go looking for them! if None in [self._bos_id, self._eos_id]: @@ -70,6 +82,8 @@ def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optio if len(candidate_toks) == 1: return candidate_toks[0]["id"] + ## Interface ## + def encode( self, s: str, @@ -90,3 +104,21 @@ def bos_id(self) -> int: def eos_id(self) -> int: return self._eos_id + + ## Additional Public Methods ## + + def has_chat_template(self) -> bool: + return bool(self._chat_template) + + def apply_chat_template( + self, + dialog: List[Dict[str, str]], + add_generation_prompt: bool = False, + ) -> str: + """If configured with a chat template, apply it to the list of messages + """ + if not self._chat_template: + raise ValueError("No chat template configured!") + return self._chat_template.render( + messages=dialog, add_generation_prompt=add_generation_prompt + ) diff --git a/tokenizer/sentencepiece.cpp b/tokenizer/sentencepiece.cpp deleted file mode 100644 index 0cdfc7e30..000000000 --- a/tokenizer/sentencepiece.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// sentencepiece tokenizer - -#include -#include -#include -#include -#include "absl/strings/str_replace.h" - -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -SPTokenizer::SPTokenizer() - : Tokenizer(), - _processor(std::make_unique()) {} - -/** - * @brief Load the tokenizer from a file. The tokenizer file contains the - * vocabulary and scores. The format is: the first integer is the maximum - * token length, followed by a list of (word_len, word) pairs. Here we - * are reading all the vocabulary into memory and keep it sorted for fast - * lookup. - * - * @param tokenizer_path The path to the tokenizer file. - * @return void - */ -void SPTokenizer::load(const std::string& tokenizer_path) { - if (initialized_) { - fprintf(stderr, "Tokenizer already initialized.\n"); - return; - } - // read in the file - const auto status = _processor->Load(tokenizer_path); - if (!status.ok()) { - fprintf(stderr, "couldn't load %s\n. If this tokenizer artifact is for llama3, please pass `-l 3`.", tokenizer_path.c_str()); - exit(EXIT_FAILURE); - } - // load vocab_size, bos_tok, eos_tok - vocab_size_ = _processor->GetPieceSize(); - bos_tok_ = _processor->bos_id(); - eos_tok_ = _processor->eos_id(); - initialized_ = true; -} - -SPTokenizer::~SPTokenizer() {} - -/** - * @brief Decode a token into string. - * - * @param prev_token The previous token. - * @param token The current token. - * @return std::string A pointer to the string representation of the - * token. - */ -std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // get rid of the control ids and - if (_processor->IsControl(token)) { - // NB: returning empty string doesn't work for some reason. It causes - // free(): invalid pointer error. - return " "; - } - - std::string result = - absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}}); - - // following BOS token, sentencepiece decoder strips any leading - // whitespace - if (prev_token == bos_tok_ && result[0] == ' ') { - result = result.substr(1); - } - - // handle <0x0A> - result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}}); - - return result; -} - -/** - * @brief Encode a string into a sequence of tokens. - * - * @param text The string to be encoded. - * @param bos The number of BOS to prepend to the token list. - * @param eos The number of EOS to append to the token list. - * @return std::vector - */ -std::vector -SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // workaround a weird issue that text doesn't have correct size() - std::string input(text.c_str()); - // should we reserve memory? 
- std::vector res; - auto status = _processor->Encode(input, &res); - if (!status.ok()) { - fprintf(stderr, "couldn't encode %s\n", text.c_str()); - exit(EXIT_FAILURE); - } - - std::vector tokens; - for (auto i = 0; i < bos; ++i) { - tokens.push_back(bos_tok_); - } - - for (auto i = 0; i < res.size(); ++i) { - tokens.push_back(res[i]); - } - - for (auto i = 0; i < eos; ++i) { - tokens.push_back(eos_tok_); - } - return tokens; -} diff --git a/tokenizer/third-party/abseil-cpp b/tokenizer/third-party/abseil-cpp deleted file mode 160000 index 854193071..000000000 --- a/tokenizer/third-party/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/tokenizer/third-party/re2 b/tokenizer/third-party/re2 deleted file mode 160000 index ac82d4f62..000000000 --- a/tokenizer/third-party/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/tokenizer/third-party/sentencepiece b/tokenizer/third-party/sentencepiece deleted file mode 160000 index 7dcb54145..000000000 --- a/tokenizer/third-party/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7dcb541451b1862d73f473b3804ccf8f2a9e10f6 diff --git a/tokenizer/tiktoken.cpp b/tokenizer/tiktoken.cpp deleted file mode 100644 index 2f31f057a..000000000 --- a/tokenizer/tiktoken.cpp +++ /dev/null @@ -1,390 +0,0 @@ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
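Note: the deleted `sentencepiece.cpp` wrapper mirrors what the `sentencepiece` Python bindings already provide: load a `.model` file, query bos/eos ids, and encode/decode with the underscore space symbol handled by the library. A sketch using those bindings; the `tokenizer.model` path is an assumption for illustration:

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")               # assumed local artifact

ids = sp.EncodeAsIds("Hello world")
ids = [sp.bos_id()] + ids                # prepend BOS, as the C++ encode() did
print(sp.DecodeIds(ids), sp.GetPieceSize())
```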
- *************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ------------------------------Util start------------------------------------ - -static uint64_t _max_size() { - return std::numeric_limits::max(); -} - -static Re2UPtr _create_regex(const std::string& pattern) { - assert(!pattern.empty()); - - return std::make_unique("(" + pattern + ")"); -} - -static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { - std::string special_pattern; - for (const auto& ele : special_encoder) { - if (!special_pattern.empty()) { - special_pattern += "|"; - } - special_pattern += re2::RE2::QuoteMeta(ele.first); - } - - if (special_pattern.empty()) { - return nullptr; - } - - return _create_regex(special_pattern); -} - -static std::pair _parse(const std::string& line) { - auto pos = line.find(" "); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid encoder line: " + line); - } - - auto token = base64::decode({line.data(), pos}); - uint64_t rank = 0; - try { - rank = std::stoul(line.substr(pos + 1)); - } catch (const std::exception&) { - throw std::invalid_argument("invalid encoder rank: " + line); - } - - return {std::move(token), rank}; -} - -static Encoder _load_encoder(const std::string& path) { - std::ifstream file(path); - if (!file) { - fprintf(stderr, "failed to open encoder file: %s\n", path.c_str()); - exit(EXIT_FAILURE); - } - - Encoder encoder; - std::string line; - while (std::getline(file, line)) { - auto [token, rank] = _parse(line); - - if (!encoder.emplace(std::move(token), rank).second) { - fprintf(stderr, "duplicate item: %s\n", line.c_str()); - } - } - return encoder; -} - -static Decoder _build_decoder(const Encoder& encoder) { - Decoder decoder; - for (const auto& [k, v] : encoder) { - decoder.emplace(v, k); - } - - if (encoder.size() != decoder.size()) { - fprintf(stderr, "duplicate items in encoder"); - exit(EXIT_FAILURE); - } - - return decoder; -} - -static std::vector _byte_pair_merge( - const std::string& piece, - const std::unordered_map& ranks, - std::function func) { - // This is a vector of (start, rank). - // The rank is of the byte pair starting at position start. - // The rank of the last item in the vector is not a valid value. - std::vector> parts; - parts.reserve(piece.size() + 1); - for (auto idx = 0U; idx < piece.size() + 1; ++idx) { - parts.emplace_back(idx, _max_size()); - } - - auto get_rank = [&piece, &ranks]( - const std::vector>& parts, - uint64_t start_idx, - uint64_t skip) -> std::optional { - if (start_idx + skip + 2 < parts.size()) { - auto s = parts[start_idx].first; - auto e = parts[start_idx + skip + 2].first; - auto key = piece.substr(s, e - s); - auto iter = ranks.find(key); - if (iter != ranks.end()) { - return iter->second; - } - } - return std::nullopt; - }; - - // We look up the ranks once in the beginning and iteratively update - // them during each merge, which reduces the number of rank lookups. - for (auto i = 0U; i < parts.size() - 2; ++i) { - auto rank = get_rank(parts, i, 0); - if (rank) { - // usize::MAX is a sentinel value and cannot be a valid rank - if (*rank == _max_size()) { - fprintf(stderr, "at %" PRIu32 " rank is too large\n", i); - } - parts[i].second = *rank; - } - } - - // If you have n parts and m merges, this does O(mn) work. - // We could do something with a heap and do O(m log n) work. 
- // It is important to consider that n is often small (<100), and as such - // the cache-locality benefits outweigh the algorithmic complexity downsides - // of the `parts` vector data structure above. - - // Note that we hash bytes, not token pairs. As long as we train BPE the way - // we currently do, this is equivalent. An easy way to break this would be - // to decouple merge priority from token index or to prevent specific token - // merges. - while (true) { - if (parts.size() == 1) { - break; - } - - // usize::MAX is a sentinel rank value allowing us to - // take the min more quickly - auto min_rank = std::make_pair(_max_size(), 0); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto rank = parts[i].second; - if (rank < min_rank.first) { - min_rank.first = rank; - min_rank.second = i; - } - } - - if (min_rank.first != _max_size()) { - auto i = min_rank.second; - - // NOTE: We are about to remove parts[i + 1]. We do not do it - // yet because there are cache-locality benefits to updating - // parts[i] and parts[i-1] before removing, which could thrash - // the cache. Thus, we update the rank calculation by skipping over - // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - auto rank = get_rank(parts, i, 1); - if (rank) { - parts[i].second = *rank; - } else { - parts[i].second = _max_size(); - } - if (i > 0) { - rank = get_rank(parts, i - 1, 1); - if (rank) { - parts[i - 1].second = *rank; - } else { - parts[i - 1].second = _max_size(); - } - } - - parts.erase(parts.begin() + (i + 1)); - } else { - break; - } - } - std::vector out; - out.reserve(parts.size() - 1); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto s = parts[i].first; - auto e = parts[i + 1].first; - out.push_back(func(s, e)); - } - return out; -} - -static std::vector _byte_pair_encode( - const std::string& piece, - const Encoder& encoder) { - if (piece.size() == 1) { - auto iter = encoder.find(piece); - if (iter != encoder.end()) { - return std::vector({iter->second}); - } else { - // TODO: is it possible? - return {}; - } - } - - return _byte_pair_merge( - piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { - std::string key = piece.substr(start, stop - start); - auto iter = encoder.find(key); - if (iter != encoder.end()) { - return iter->second; - } else { - // TODO: what if key does not exist? Should we return `unknown`? - // assert(false); // ?? - return uint64_t(0); - } - }); -} -// ------------------------------Util end------------------------------------ -// -------------------------private method start------------------------------- - -template -std::pair, re2::StringPiece> -Tiktoken::_split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special) { - if (!_special_token_regex) { - return std::make_pair(std::nullopt, input); - } - - auto start = input.begin(); - std::string special; - while (true) { - if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { - // No special token. - break; - } - - if (allowed_special.count(special) == 1) { - // Found an allowed special token, split the text with it. 
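Note: for readers who have not seen it before, the merge loop above is the standard greedy BPE procedure: repeatedly merge the adjacent pair with the lowest rank until nothing merges. A readable Python restatement, assuming every single byte has a rank as in tiktoken-style vocabularies; it omits the rank caching, so it is O(n^2) and for illustration only:

```python
def byte_pair_merge(piece: bytes, ranks: dict) -> list:
    """Greedy BPE: merge the lowest-ranked adjacent pair until none remain."""
    parts = [bytes([b]) for b in piece]
    while len(parts) > 1:
        candidates = [
            (ranks[parts[i] + parts[i + 1]], i)
            for i in range(len(parts) - 1)
            if parts[i] + parts[i + 1] in ranks
        ]
        if not candidates:
            break
        _, i = min(candidates)
        parts[i:i + 2] = [parts[i] + parts[i + 1]]
    return [ranks[p] for p in parts]
```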
- return std::make_pair( - special, - re2::StringPiece(start, input.begin() - start - special.size())); - } // else try to find the next special token - } - - return std::make_pair(std::nullopt, input); -} - -void Tiktoken::_encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len) { - std::string piece; - assert(_regex); - while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { - auto iter = _encoder.find(piece); - if (iter != _encoder.end()) { - last_piece_token_len = 1; - ret.push_back(iter->second); - continue; - } - auto tokens = _byte_pair_encode(piece, _encoder); - last_piece_token_len = tokens.size(); - ret.insert(ret.end(), tokens.begin(), tokens.end()); - } -} - -template -std::pair, uint64_t> Tiktoken::_encode_with_special_token( - const std::string& text, - const T& allowed_special) { - std::vector tokens; - uint64_t last_piece_token_len = 0; - re2::StringPiece input(text); - while (true) { - auto [special, sub_input] = - _split_with_allowed_special_token(input, allowed_special); - - _encode(sub_input, tokens, last_piece_token_len); - - if (special) { - uint64_t token = 0; - try { - token = _special_token_encoder.at(*special); - } catch (const std::out_of_range&) { - // Should never go here, since special pattern includes all special - // chars. - fprintf(stderr, "unknown special token: %s\n", special->c_str()); - exit(EXIT_FAILURE); - } - - tokens.push_back(token); - last_piece_token_len = 0; - } else { - break; - } - } - - // last_piece_token_len is how many tokens came from the last regex split. - // This is used for determining unstable tokens, since you can't merge - // across (stable) regex splits - return std::make_pair(tokens, last_piece_token_len); -} - -// -------------------------private method end------------------------------- -// -------------------------public method start------------------------------- - -Tiktoken::Tiktoken() : Tokenizer() {} - -void Tiktoken::load(const std::string& path) { - _encoder = _load_encoder(path); - _special_token_encoder = _get_special_tokens(_encoder.size()); - - _decoder = _build_decoder(_encoder); - _special_token_decoder = _build_decoder(_special_token_encoder); - - _regex = _create_regex(_pattern); - _special_token_regex = _build_special_token_regex(_special_token_encoder); - - // initialize vocab_size, bos_tok, eos_tok - vocab_size_ = _encoder.size() + _special_token_encoder.size(); - bos_tok_ = _encoder.size(); // hardcoded (see _get_special_tokens) - eos_tok_ = _encoder.size() + 1; // hardcoded (see _get_special_tokens) - initialized_ = true; -} - -std::vector -Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - exit(EXIT_FAILURE); - } - auto res = _encode_with_special_token(text, _special_token_encoder).first; - for (auto i = 0; i < bos; ++i) { - res.insert(res.begin(), bos_tok_); - } - for (auto i = 0; i < eos; ++i) { - res.push_back(eos_tok_); - } - return res; -} - -std::string Tiktoken::decode(uint64_t prev, uint64_t cur) { - (void)prev; - if (!initialized_) { - exit(EXIT_FAILURE); - } - std::string ret; - - std::string token_bytes; - auto iter = _decoder.find(cur); - if (iter != _decoder.end()) { - token_bytes = iter->second; - } else { - iter = _special_token_decoder.find(cur); - if (iter != _special_token_decoder.end()) { - token_bytes = iter->second; - } else { - fprintf(stderr, "unknown token: %" PRIu64 "\n", cur); - exit(EXIT_FAILURE); - } - } - ret += token_bytes; - - return ret; -} -// -------------------------public method 
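Note: before ordinary BPE encoding, the removed Tiktoken implementation first splits the input on the allowed special tokens so that each one maps to a single id. A Python sketch of that pre-split; the function name and the `("bpe", ...)` placeholder are illustrative:

```python
import re

def split_on_special(text: str, special_tokens: dict) -> list:
    """Emit allowed special tokens as single ids; the text in between would
    go through ordinary BPE encoding (a placeholder stands in for it here)."""
    if not special_tokens:
        return [("bpe", text)]
    pattern = "(" + "|".join(re.escape(t) for t in special_tokens) + ")"
    out = []
    for chunk in re.split(pattern, text):
        if not chunk:
            continue
        out.append(special_tokens[chunk] if chunk in special_tokens else ("bpe", chunk))
    return out
```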
end------------------------------- diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h deleted file mode 100644 index 9e1977b71..000000000 --- a/tokenizer/tokenizer.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple Tokenizer interface. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sentencepiece_processor.h" - -class Tokenizer { - public: - explicit Tokenizer() {} - virtual ~Tokenizer() {} - - virtual void load(const std::string& tokenizer_path) = 0; - - virtual std::vector - encode(const std::string& input, int8_t bos, int8_t eos) = 0; - - virtual std::string decode(uint64_t prev_token, uint64_t token) = 0; - - // getters - int32_t vocab_size() const { - return vocab_size_; - } - - uint64_t bos_tok() const { - return bos_tok_; - } - - uint64_t eos_tok() const { - return eos_tok_; - } - - protected: - bool initialized_ = false; - int32_t vocab_size_; - uint64_t bos_tok_, eos_tok_; -}; - -// ----------------------- SPTokenizer ----------------------- -// Used by sentencepiece. Adapted from llama2.c. -struct TokenIndex { - const char* str; - int32_t id; -}; - -class SPTokenizer : public Tokenizer { - public: - explicit SPTokenizer(); - ~SPTokenizer() override; - - void load(const std::string& tokenizer_path) override; - - std::vector encode(const std::string& input, int8_t bos, int8_t eos) - override; - - std::string decode(uint64_t prev_token, uint64_t token) override; - - private: - std::unique_ptr _processor; -}; - -// ----------------------- Tiktoken ----------------------- -// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer - -using Encoder = std::unordered_map; -using Decoder = std::unordered_map; -using Re2UPtr = std::unique_ptr; - -class Tiktoken : public Tokenizer { - public: - explicit Tiktoken(); - ~Tiktoken(){}; - - void load(const std::string& tokenizer_path); - - std::vector - encode(const std::string& input, int8_t bos, int8_t eos); - - std::string decode(uint64_t prev_token, uint64_t token); - - private: - static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { - Encoder special_tokens; - special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); - special_tokens.emplace("<|end_of_text|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); - special_tokens.emplace("<|start_header_id|>", num_base_tokens++); - special_tokens.emplace("<|end_header_id|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); - special_tokens.emplace("<|eot_id|>", num_base_tokens++); - for (auto i = 5; i < 251; ++i) { - special_tokens.emplace( - "<|reserved_special_token_" + std::to_string(i) + "|>", - num_base_tokens++); - } - return special_tokens; - } - - template - std::pair, re2::StringPiece> - _split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special); - - void _encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len); - - template - 
std::pair, uint64_t> _encode_with_special_token( - const std::string& text, - const T& allowed_special); - - // Removed negative lookahead \s+(?!\S) since it's not supported by RE2. - const std::string _pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; - Encoder _encoder; - Encoder _special_token_encoder; - Decoder _decoder; - Decoder _special_token_decoder; - - Re2UPtr _regex; - Re2UPtr _special_token_regex; -}; diff --git a/torchchat.py b/torchchat.py index 35cdcabae..1eeee0120 100644 --- a/torchchat.py +++ b/torchchat.py @@ -6,7 +6,7 @@ import argparse import logging -import subprocess +import signal import sys # MPS ops missing with Multimodal torchtune @@ -25,7 +25,15 @@ default_device = "cpu" +def signal_handler(sig, frame): + print("\nInterrupted by user. Bye!\n") + sys.exit(0) + + if __name__ == "__main__": + # Set the signal handler for SIGINT + signal.signal(signal.SIGINT, signal_handler) + # Initialize the top-level parser parser = argparse.ArgumentParser( prog="torchchat", diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index fb2bfb299..1e04800ab 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -14,16 +14,17 @@ import torch import torch._dynamo.config import torch._inductor.config -import torch.nn as nn +import torch.distributed as dist -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.elastic.utils.distributed import get_free_port - -from torchchat.distributed import launch_distributed, ParallelDims, parallelize_llama - -from torchchat.model import Model, ModelArgs, ModelType +from torchchat.distributed.utils import( + Color as color, + CUDATrackTime, + init_distributed, + GPUMemoryMonitor, +) +from torchchat.distributed.logging_utils import SingletonLogger +from torchchat.model import Model, ModelArgs, ModelType, Transformer, TransformerArgs from torchchat.model_config.model_config import resolve_model_config from torchchat.utils.build_utils import ( device_sync, @@ -34,6 +35,7 @@ from torchchat.utils.measure_time import measure_time from torchchat.utils.quantize import quantize_model + from torchtune.models.convert_weights import meta_to_tune from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE @@ -54,6 +56,7 @@ class BuilderArgs: gguf_kwargs: Optional[Dict[str, Any]] = None dso_path: Optional[Union[Path, str]] = None aoti_package_path: Optional[Union[Path, str]] = None + snapshot_path: Optional[Union[Path, str]] = None pte_path: Optional[Union[Path, str]] = None device: Optional[str] = None precision: torch.dtype = torch.float32 @@ -62,14 +65,21 @@ class BuilderArgs: pp: int = 1 tp: int = 1 chpt_from: str = "hf" + distribution_path: Optional[str] = None is_chat_model: bool = False prefill_possible: bool = False dynamic_shapes: bool = False max_seq_length: Optional[int] = None + attention_backend: str = "math" def __post_init__(self): if self.device is None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" + if torch.cuda.is_available(): + self.device = "cuda" + elif torch.xpu.is_available(): + self.device = "xpu" + else: + self.device = "cpu" if not ( (self.checkpoint_path and self.checkpoint_path.is_file()) @@ -78,9 +88,10 @@ def __post_init__(self): or (self.dso_path and Path(self.dso_path).is_file()) or (self.aoti_package_path and Path(self.aoti_package_path).is_file()) or (self.pte_path and Path(self.pte_path).is_file()) + or 
(self.snapshot_path and Path(self.snapshot_path).is_file()) ): raise RuntimeError( - "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" + "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" ) if self.aoti_package_path and self.pte_path: @@ -97,7 +108,7 @@ def __post_init__(self): for param, param_msg in ignored_params: if param: print( - f"Warning: {param_msg} ignored because an exported DSO or PTE path was specified" + f"Warning: {param_msg} ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument" ) else: self.prefill_possible = True @@ -113,6 +124,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": checkpoint_path = args.checkpoint_path params_table = args.params_table + distribution_path = None if args.model: # Using a named, well-known model model_config = resolve_model_config(args.model) @@ -127,9 +139,12 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": model_config.transformer_params_key or model_config.name.split("/")[-1] ) + distribution_path = model_config.distribution_path + dso_path = getattr(args, "dso_path", None) pte_path = getattr(args, "pte_path", None) aoti_package_path = getattr(args, "aoti_package_path", None) + snapshot_path = getattr(args, "snapshot_path", None) is_chat_model = False if args.is_chat_model: @@ -157,6 +172,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": output_pte_path = getattr(args, "output_pte_path", None) output_aoti_package_path = getattr(args, "output_aoti_package_path", None) output_dso_path = getattr(args, "output_dso_path", None) + output_snapshot_path = getattr(args, "output_snapshot_path", None) if output_pte_path and args.dtype.startswith("fast"): if args.dtype == "fast": # As per Kimish, float32 should be faster on ET XNNPACK @@ -172,6 +188,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp = getattr(args, "pp", 1) tp = getattr(args, "tp", 1) chpt_from = getattr(args, "chpt_from", "hf") + sdp_backend_dict = { + 'math': torch.nn.attention.SDPBackend.MATH, + 'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION, + 'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + 'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + } + attention_backend = sdp_backend_dict[args.attention_backend] + if args.device == "cpu" and (args.attention_backend == "efficient_attention" + or args.attention_backend == "cudnn_attention"): + print(f"Warning: {args.attention_backend} is not supported on CPU. 
Using math instead.") + attention_backend = torch.nn.attention.SDPBackend.MATH return cls( checkpoint_dir=checkpoint_dir, checkpoint_path=checkpoint_path, @@ -183,6 +210,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path=dso_path, aoti_package_path=aoti_package_path, pte_path=pte_path, + snapshot_path=snapshot_path, device=args.device, precision=dtype, setup_caches=( @@ -192,9 +220,11 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp=pp, tp=tp, chpt_from=chpt_from, + distribution_path=distribution_path, is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), + attention_backend=attention_backend, ) @classmethod @@ -379,6 +409,8 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model: kwargs = {} else: kwargs = builder_args.gguf_kwargs + + kwargs.setdefault("device", builder_args.device) model = Model.from_gguf(builder_args.gguf_path, **kwargs) return model @@ -402,6 +434,7 @@ def _load_checkpoint(builder_args: BuilderArgs): os.path.join(builder_args.checkpoint_dir, cp_name), map_location=builder_args.device, mmap=True, + weights_only=False, ) ) checkpoint = {} @@ -464,77 +497,11 @@ def _load_model_default(builder_args: BuilderArgs) -> Model: return model -def _maybe_init_distributed( - builder_args: BuilderArgs, -) -> Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - """ - Initialize distributed related setups if the user specified - using distributed inference. If not, this is a no-op. - - Args: - builder_args (:class:`BuilderArgs`): - Command args for model building. - Returns: - Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - - The first element is an optional DeviceMesh object, - which which describes the mesh topology of devices for the DTensor. - - The second element is an optional ParallelDims object, - which represents the parallel dimensions configuration. - """ - if not builder_args.use_distributed: - return None, None - dist_config = "llama3_8B.toml" # TODO - integrate with chat cmd line - - world_mesh, parallel_dims = launch_distributed(dist_config) - - assert ( - world_mesh is not None and parallel_dims is not None - ), f"failed to launch distributed using {dist_config}" - - return world_mesh, parallel_dims - - -def _maybe_parallelize_model( - model: nn.Module, - builder_args: BuilderArgs, - world_mesh: DeviceMesh, - parallel_dims: ParallelDims, -) -> nn.Module: - """ - We parallelize the module and load the distributed checkpoint to the model - if the user specifies using distributed inference. If not, this is a no-op. - - Args: - model (:class:`nn.Module`): - Module to be parallelized. - builder_args (:class:`BuilderArgs`): - Command args for model building. - world_mesh (:class:`DeviceMesh`): - Object which describes the mesh topology - of devices for the DTensor. - parallel_dims (:class:`ParallelDims`): - Object which represents the parallel dimensions configuration. - Returns: - A :class:`nn.Module` object which is parallelized and checkpoint loaded - if the user specifies using distributed inference. 
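Note: the `--attention-backend` plumbing above resolves a string to a `torch.nn.attention.SDPBackend` value, falling back to MATH on CPU. For context, a sketch of how such a backend is typically applied around scaled dot-product attention; the tensor shapes are arbitrary and this call site is not part of the diff:

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

backend = SDPBackend.MATH                 # e.g. the value resolved above
q = k = v = torch.randn(1, 8, 16, 64)     # (batch, heads, seq, head_dim)
with sdpa_kernel(backend):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
```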
- """ - if world_mesh is None: - return model - assert parallel_dims is not None - print("Applying model parallel to model ...") - parallelize_llama(model, world_mesh, parallel_dims) - return load_checkpoints_to_model(model, builder_args, world_mesh) - - def _load_model(builder_args: BuilderArgs) -> Model: - # world_mesh, parallel_dims = _maybe_init_distributed(builder_args) if builder_args.gguf_path: model = _load_model_gguf(builder_args) - # elif builder_args.use_distributed: - # model = _init_model_on_meta_device(builder_args) else: model = _load_model_default(builder_args) - # model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims) if builder_args.dso_path or builder_args.aoti_package_path: # AOTI-compoiled model will load its own weights. @@ -627,9 +594,8 @@ def do_nothing(max_batch_size, max_seq_length): # attributes will NOT be seen on by AOTI-compiled forward # function, e.g. calling model.setup_cache will NOT touch # AOTI compiled and maintained model buffers such as kv_cache. - from torch._inductor.package import load_package - aoti_compiled_model = load_package( + aoti_compiled_model = torch._inductor.aoti_load_package( str(builder_args.aoti_package_path.absolute()) ) @@ -670,6 +636,128 @@ def do_nothing(max_batch_size, max_seq_length): model = PTEModel(config, builder_args.pte_path) except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") + elif builder_args.snapshot_path: + # Resolve ModelArgs for constructing the PTEModel + # If a manual params_path is provided, use that + if builder_args.params_path: + config: ModelArgs = ModelArgs.from_params(builder_args.params_path) + else: + # TODO: Instead of loading the whole model, refactor to call a + # helper that generate just model.config + with measure_time("Time to load model: {time:.02f} seconds"): + model = _load_model(builder_args) + device_sync(device=builder_args.device) + config = model.config + model = None + try: + model = torch.load(builder_args.snapshot_path, weights_only=False) + except Exception: + raise RuntimeError(f"Failed to load torchchat snapshot {builder_args.snapshot_path}") + # _active_backend() does not allow DSO & AOTI to be true. + # Choose either. 
+ from torchchat.utils.build_utils import set_backend + set_backend (dso=True, pte=False, aoti_package=False) + if (model.config != config): + raise RuntimeError("loaded model architecture mismatch") + ## + ## import all libraries with custom kernels ans custom operators + ## that quantize may be pulling in + ## + + elif builder_args.distributed: + pp_degree = builder_args.pp + tp_degree = builder_args.tp + + init_distributed() + rank = dist.get_rank() + torch.cuda.set_device(rank % torch.cuda.device_count()) + + logger = SingletonLogger.get_logger() + + gpu_memory_monitor = GPUMemoryMonitor("cuda") + logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}") + + # Model-level config + if builder_args.params_table: + model_config = ModelArgs.from_table(builder_args.params_table) + else: + raise NotImplementedError() + # Transformer-level config + config = TransformerArgs.from_params(model_config.transformer_args["text"]) + logger.info(f"Transformer Config: {config}") + + #TODO: Move into head of file after solving circular import + from torchchat.distributed.checkpoint_utils import ( + load_model_weights, + ) + + # Validate pipeline degree + assert config.n_layers % pp_degree == 0 + + # Create device mesh + device_mesh = dist.init_device_mesh( + "cuda", + (pp_degree, tp_degree), + mesh_dim_names=("pp", "tp") + ) + tp_mesh = device_mesh["tp"] + pp_mesh = device_mesh["pp"] + logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") + + pp_rank = pp_mesh.get_local_rank() + logger.info(f"{pp_degree=}, {tp_degree=}") + + # Assuming same number of GPUs per node + device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") + + # Fill in PP configs + config.stage_idx = pp_rank + config.n_stages = pp_degree + + with torch.device("meta"): + # TODO: we should create model instead of Transformer + model = Transformer(config) + + # Distribute model on TP mesh + # (Surprisingly, this works even though model is on meta device and mesh is of + # cuda devices) + model.distribute(tp_mesh) + if rank == 0: + logger.info(f"Model: {model}") + + # Load weights + logger.info(f"Loading weights for {pp_rank=} on {device=}") + with CUDATrackTime() as timer: + load_model_weights(model, builder_args.distribution_path, device, config, builder_args.chpt_from) + + logger.info( + f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" + ) + + # Setup KV caches (after model distribution) + # The number of cache lanes is the same as the maximum number of + # micro-batches that can be "in flight" in parallel -- imagine each + # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. + # When decoding is done for certain micro-batches, we can reuse the KV cache + # lanes. 
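Note: the distributed branch above builds a 2-D device mesh with a pipeline-parallel and a tensor-parallel dimension. A condensed sketch of that setup, assuming 4 GPUs with pp=2 and tp=2 and execution under torchrun; it mirrors the calls in the diff rather than introducing new APIs:

```python
import torch
import torch.distributed as dist

# Run with: torchrun --nproc-per-node 4 this_script.py
dist.init_process_group("nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank % torch.cuda.device_count())

mesh = dist.init_device_mesh("cuda", (2, 2), mesh_dim_names=("pp", "tp"))
tp_mesh, pp_mesh = mesh["tp"], mesh["pp"]
pp_rank = pp_mesh.get_local_rank()    # which pipeline stage this rank belongs to
```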
+ # TODO: bump up the lane count + pipeline_lanes = 1 + seqlen_prefill=1024 + with device: + model.setup_caches(1, seqlen_prefill, cache_lanes=pipeline_lanes) + + # info on stage size and params + # stage_size = get_module_size(model) + # stage_size_formatted = bytes_to_readable(stage_size) + # stage_num_params = get_num_params(model) + # logger.info( + # f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" + # ) + model.eval() + + model.text_transformer_args = None + model.config.model_type = model_config.model_type + model.device_mesh = device_mesh else: with measure_time("Time to load model: {time:.02f} seconds"): model = _load_model(builder_args) @@ -706,4 +794,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str: return "TikToken" if tokenizers: return "Tokenizers" - return "SentencePiece" \ No newline at end of file + return "SentencePiece" diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index a8a2c7da8..1d531c709 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -17,10 +17,20 @@ allowable_params_table, ) -logging.basicConfig(level=logging.INFO, format="%(message)s") +_log_level_env = os.getenv("LOG_LEVEL", "INFO") +try: + _log_level = getattr(logging, _log_level_env.upper()) +except AttributeError: + print(f"Invalid log level: {_log_level_env}", file=sys.stderr) + _log_level = logging.INFO + + +logging.basicConfig(level=_log_level, format="%(message)s") logger = logging.getLogger(__name__) default_device = os.getenv("TORCHCHAT_DEVICE", "fast") +default_dtype = os.getenv("TORCHCHAT_PRECISION", "fast") + default_model_dir = Path( os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache") ).expanduser() @@ -149,9 +159,9 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--dtype", - default="fast", + default=None, choices=allowable_dtype_names(), - help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32, fast16, fast", + help="Override the dtype of the model. Options: bf16, fp16, fp32, fast16, fast", ) model_config_parser.add_argument( "--quantize", @@ -165,9 +175,16 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--device", type=str, - default=default_device, - choices=["fast", "cpu", "cuda", "mps"], - help="Hardware device to use. Options: cpu, cuda, mps", + default=None, + choices=["fast", "cpu", "cuda", "mps", "xpu"], + help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu", + ) + model_config_parser.add_argument( + "--attention-backend", + type=str, + default="math", + choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"], + help="SDPBackend to use. 
Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION", ) @@ -190,6 +207,12 @@ def _add_export_output_path_args(parser) -> None: default=None, help="Output to the specified AOT Inductor .dso model file", ) + exclusive_parser.add_argument( + "--output-snapshot-path", + type=str, + default=None, + help="Output to the specified PyTorch model and sha256 file", + ) exclusive_parser.add_argument( "--output-aoti-package-path", type=str, @@ -237,7 +260,13 @@ def _add_exported_input_path_args(parser) -> None: default=None, help="Use the specified ExecuTorch .pte model file", ) - + exclusive_parser.add_argument( + "--snapshot-path", + type=Path, + default=None, + help="Use the specified torchchat snaphot .tc model file", + ) + # Add CLI Args related to JIT downloading of model artifacts def _add_jit_downloading_args(parser) -> None: @@ -513,20 +542,34 @@ def arg_init(args): if isinstance(args.quantize, str): args.quantize = json.loads(args.quantize) - # if we specify dtype in quantization recipe, replicate it as args.dtype - args.dtype = args.quantize.get("precision", {}).get("dtype", args.dtype) + # if we specify dtype in quantization recipe, allow args.dtype top override if specified + if args.dtype is None: + args.dtype = args.quantize.get("precision", {}).get("dtype", default_dtype) + else: + precision_handler = args.quantize.get("precision", None) + if precision_handler: + if precision_handler["dtype"] != args.dtype: + print('overriding json-specified dtype {precision_handler["dtype"]} with cli dtype {args.dtype}') + precision_handler["dtype"] = args.dtype if getattr(args, "output_pte_path", None): - if args.device not in ["cpu", "fast"]: + if args.device not in [None, "cpu", "fast"]: raise RuntimeError("Device not supported by ExecuTorch") args.device = "cpu" else: # Localized import to minimize expensive imports from torchchat.utils.build_utils import get_device_str - args.device = get_device_str( - args.quantize.get("executor", {}).get("accelerator", args.device) - ) + if args.device is None: + args.device = get_device_str( + args.quantize.get("executor", {}).get("accelerator", default_device) + ) + else: + args.device = get_device_str(args.device) + executor_handler = args.quantize.get("executor", None) + if executor_handler and executor_handler["accelerator"] != args.device: + print(f'overriding json-specified device {executor_handler["accelerator"]} with cli device {args.device}') + executor_handler["accelerator"] = args.device if "mps" in args.device: if getattr(args, "compile", False) or getattr(args, "compile_prefill", False): diff --git a/torchchat/cli/convert_hf_checkpoint.py b/torchchat/cli/convert_hf_checkpoint.py index f428e4cc6..122ab0f28 100644 --- a/torchchat/cli/convert_hf_checkpoint.py +++ b/torchchat/cli/convert_hf_checkpoint.py @@ -39,19 +39,14 @@ def convert_hf_checkpoint( config = TransformerArgs.from_params(config_args) print(f"Model config {config.__dict__}") - # Load the json file containing weight mapping + # Find all candidate weight mapping index files model_map_json_matches = [Path(m) for m in glob.glob(str(model_dir / "*.index.json"))] - assert len(model_map_json_matches) <= 1, "Found multiple weight mapping files" - if len(model_map_json_matches): - model_map_json = model_map_json_matches[0] - else: - model_map_json = model_dir / "pytorch_model.bin.index.json" # If there is no weight mapping, check for a consolidated model and # tokenizer we can move. Llama 2 and Mistral have weight mappings, while # Llama 3 has a consolidated model and tokenizer. 
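Note: the `arg_init` changes above establish a precedence for dtype (and, analogously, device): an explicit CLI value wins over the quantize recipe, and the recipe only fills in when the flag was left unset. A hedged restatement of that rule; the function and defaults are illustrative:

```python
def resolve_dtype(cli_dtype, quantize_cfg, default_dtype="fast"):
    """CLI flag > quantize recipe "precision" entry > environment default."""
    recipe_dtype = quantize_cfg.get("precision", {}).get("dtype")
    if cli_dtype is None:
        return recipe_dtype or default_dtype
    if recipe_dtype and recipe_dtype != cli_dtype:
        # mirror the warning above: the CLI value overrides the recipe
        print(f"overriding json-specified dtype {recipe_dtype} with cli dtype {cli_dtype}")
    return cli_dtype
```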
# Otherwise raise an error. - if not model_map_json.is_file(): + if not model_map_json_matches: consolidated_pth = model_dir / "original" / "consolidated.00.pth" tokenizer_pth = model_dir / "original" / "tokenizer.model" if consolidated_pth.is_file() and tokenizer_pth.is_file(): @@ -68,11 +63,30 @@ def convert_hf_checkpoint( return else: raise RuntimeError( - f"Could not find {model_map_json} or {consolidated_pth} plus {tokenizer_pth}" + f"Could not find a valid model weight map or {consolidated_pth} plus {tokenizer_pth}" ) - with open(model_map_json) as json_map: - bin_index = json.load(json_map) + # Load the json file(s) containing weight mapping + # + # NOTE: If there are multiple index files, there are two possibilities: + # 1. The files could be mapped to different weight format files (e.g. .bin + # vs .safetensors) + # 2. The files could be split subsets of the mappings that need to be + # merged + # + # In either case, we can simply keep the mappings where the target file is + # valid in the model dir. + bin_index = {} + for weight_map_file in model_map_json_matches: + with open(weight_map_file, "r") as handle: + weight_map = json.load(handle) + valid_mappings = { + k: model_dir / v + for (k, v) in weight_map.get("weight_map", {}).items() + if (model_dir / v).is_file() + } + bin_index.update(valid_mappings) + bin_files = set(bin_index.values()) weight_map = { "model.embed_tokens.weight": "tok_embeddings.weight", @@ -96,7 +110,6 @@ def convert_hf_checkpoint( "model.norm.weight": "norm.weight", "lm_head.weight": "output.weight", } - bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()} def permute(w, n_heads): return ( diff --git a/torchchat/cli/download.py b/torchchat/cli/download.py index f145c93fb..4da2bc390 100644 --- a/torchchat/cli/download.py +++ b/torchchat/cli/download.py @@ -35,11 +35,12 @@ def _download_hf_snapshot( model_info = model_info(model_config.distribution_path, token=hf_token) model_fnames = [f.rfilename for f in model_info.siblings] - # Check the model config for preference between safetensors and pth + # Check the model config for preference between safetensors and pth/bin has_pth = any(f.endswith(".pth") for f in model_fnames) + has_bin = any(f.endswith(".bin") for f in model_fnames) has_safetensors = any(f.endswith(".safetensors") for f in model_fnames) - # If told to prefer safetensors, ignore pth files + # If told to prefer safetensors, ignore pth/bin files if model_config.prefer_safetensors: if not has_safetensors: print( @@ -47,10 +48,10 @@ def _download_hf_snapshot( file=sys.stderr, ) exit(1) - ignore_patterns = "*.pth" + ignore_patterns = ["*.pth", "*.bin"] # If the model has both, prefer pth files over safetensors - elif has_pth and has_safetensors: + elif (has_pth or has_bin) and has_safetensors: ignore_patterns = "*safetensors*" # Otherwise, download everything @@ -110,6 +111,8 @@ def _download_direct( def download_and_convert( model: str, models_dir: Path, hf_token: Optional[str] = None ) -> None: + if model is None: + raise ValueError("'download' command needs a model name or alias.") model_config = resolve_model_config(model) model_dir = models_dir / model_config.name @@ -234,4 +237,8 @@ def where_main(args) -> None: # Subcommand to download model artifacts. 
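Note: the checkpoint-conversion change above now merges every `*.index.json` it finds, keeping only entries whose target file actually exists in the model directory, which covers both split indices and mixed `.bin`/`.safetensors` layouts. A condensed sketch of that merge:

```python
import json
from pathlib import Path

def merged_weight_map(model_dir: Path) -> dict:
    """Union of all *.index.json weight maps, restricted to files present on disk."""
    merged = {}
    for index_file in model_dir.glob("*.index.json"):
        weight_map = json.loads(index_file.read_text()).get("weight_map", {})
        merged.update(
            {name: model_dir / fname
             for name, fname in weight_map.items()
             if (model_dir / fname).is_file()}
        )
    return merged
```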
def download_main(args) -> None: - download_and_convert(args.model, args.model_directory, args.hf_token) + try: + download_and_convert(args.model, args.model_directory, args.hf_token) + except ValueError as e: + print(e, file=sys.stderr) + sys.exit(1) diff --git a/torchchat/distributed/checkpoint.py b/torchchat/distributed/checkpoint.py index 1830e3a75..11e397469 100644 --- a/torchchat/distributed/checkpoint.py +++ b/torchchat/distributed/checkpoint.py @@ -96,6 +96,7 @@ def _load_checkpoints_from_storage( checkpoint_path, map_location=builder_args.device, mmap=True, + weights_only=False, ) diff --git a/torchchat/distributed/checkpoint_utils.py b/torchchat/distributed/checkpoint_utils.py index cf3206e4e..806855c4b 100644 --- a/torchchat/distributed/checkpoint_utils.py +++ b/torchchat/distributed/checkpoint_utils.py @@ -17,6 +17,7 @@ from torch.distributed._tensor import DTensor from torchchat.distributed.dtensor_utils import convert_to_dtensor from torchchat.cli.builder import BuilderArgs, _load_checkpoint +from torchchat.model import ModelArgs _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json" @@ -450,3 +451,34 @@ def load_weights_from_torchchat_format(stage_module, distribution, device, model # Fill state dict into stage module stage_module.load_state_dict(stage_state_dict, strict=False, assign=True) logger.info(f"Successfully loaded {len(updated_states)} weights into stage module") + + +def load_model_weights( + stage_module: torch.nn.Module, + distribution: str, + device: torch.device, + model_config: ModelArgs, + chpt_from: str, +): + """Load the weights from the safetensor file(s) into the model stage. + Model config is needed b/c we permute wq and wk weights based on attn heads. + + Args: + stage_module (torch.nn.Module): The model stage to load the weights into. + distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". + device (torch.device): The device to load the weights onto. + model_config (ModelArgs): The model config. + chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". + """ + if chpt_from == "hf": + # This format stands for: index file + multiple binary files + load_weights_from_hf_format(stage_module, distribution, device, model_config) + elif chpt_from == "torchchat": + # This format stands for: + # single binary file, OR + # multiple binary files without index files. + load_weights_from_torchchat_format( + stage_module, distribution, device, model_config + ) + else: + raise ValueError(f"Unknown checkpoint format: {chpt_from}") diff --git a/torchchat/distributed/dist_run.py b/torchchat/distributed/dist_run.py deleted file mode 100644 index 389ae41c1..000000000 --- a/torchchat/distributed/dist_run.py +++ /dev/null @@ -1,629 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -# Example run command: -# torchrun --nproc-per-node 4 dist_run.py llama2-7b-chat --pp 2 -# torchrun --nproc-per-node 4 dist_run.py llama3 --pp 2 - -import argparse -import os -from enum import auto, Enum -from pathlib import Path -from types import MethodType, SimpleNamespace -from typing import Any, Dict, List, Optional, Tuple - -import torch -import torch.distributed as dist -from torch.distributed.pipelining import PipelineStage, ScheduleGPipe -from torchchat.cli.builder import TokenizerArgs - -# TODO - these are not distributed specific, consider moving to new package -from torchchat.distributed.checkpoint_utils import ( - get_hf_config_file, - load_weights_from_hf_format, - load_weights_from_torchchat_format, -) - -from torchchat.distributed.logging_utils import SingletonLogger -from torchchat.distributed.utils import ( - bytes_to_readable, - Color as color, - CUDATrackTime, - get_module_size, - get_num_params, - GPUMemoryMonitor, -) -from torchchat.model import ModelArgs, Transformer, TransformerArgs -from torchchat.utils.build_utils import set_precision - -try: - from tokenizer.tiktoken import Tokenizer as TiktokenTokenizer -except ImportError: - TiktokenTokenizer = None -try: - from sentencepiece import SentencePieceProcessor -except ImportError: - SentencePieceProcessor = None - - -logger = SingletonLogger.get_logger() - -# Using model name to identify the model to load, for example "llama2-7b-chat". -# You can change it to other values listed below. -# For details on the name-to-distribution mapping, see README.md or models.json. -NAME_TO_DISTRIBUTION_AND_DTYPE = { - "llama2-7b-chat": ("meta-llama/Llama-2-7b-chat-hf", torch.float16), - "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16), - "llama3.1": ("meta-llama/Meta-Llama-3.1-8B-Instruct", torch.bfloat16), -} - - -def _init_distributed(): - dist.init_process_group("nccl") - rank = dist.get_rank() - world_size = dist.get_world_size() - # Assuming same number of GPUs per node - torch.cuda.set_device(rank % torch.cuda.device_count()) - return rank, world_size - - -def _create_device_mesh(pp_degree, tp_degree): - return dist.init_device_mesh( - "cuda", (pp_degree, tp_degree), mesh_dim_names=("pp", "tp") - ) - - -def dict_to_args(dictionary: Dict[str, Any]) -> SimpleNamespace: - return SimpleNamespace(**dictionary) - - -def _patch_tokenizer(tokenizer): - """Patch the tokenizer to support decoding of token ids.""" - if isinstance(tokenizer, TiktokenTokenizer): - # Patch tiktokenizer to allow a list of sequences. 
- # TODO: Upstream to tokenizer modules - old_decode = tokenizer.decode - - def decode( - self, token_ids: List[int | List[int]], *args, **kwargs - ) -> str | List[str]: - if len(token_ids) < 1: - return "" - if isinstance(token_ids[0], list): - return [old_decode(t, *args, **kwargs) for t in token_ids] - else: - return old_decode(token_ids, *args, **kwargs) - - tokenizer.decode = MethodType(decode, tokenizer) - return tokenizer - - -def _build_chat_tokenizer( - tokenizer_args: TokenizerArgs, -) -> SentencePieceProcessor | TiktokenTokenizer: - """Builds a tokenizer for the given model name""" - - tokenizer_args = TokenizerArgs.from_args(tokenizer_args) - tokenizer = tokenizer_args.t - assert tokenizer is not None, f"Failed to get tokenizer using {tokenconfig=}" - logger.info( - f"using tokenizer = {tokenizer.__class__.__module__}.{tokenizer.__class__.__name__}" - ) - - tokenizer = _patch_tokenizer(tokenizer) - - return tokenizer - - -def _load_model_weights( - stage_module: torch.nn.Module, - distribution: str, - device: torch.device, - model_config: ModelArgs, - chpt_from: str, -): - """Load the weights from the safetensor file(s) into the model stage. - Model config is needed b/c we permute wq and wk weights based on attn heads. - - Args: - stage_module (torch.nn.Module): The model stage to load the weights into. - distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". - device (torch.device): The device to load the weights onto. - model_config (ModelArgs): The model config. - chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". - """ - if chpt_from == "hf": - # This format stands for: index file + multiple binary files - load_weights_from_hf_format(stage_module, distribution, device, model_config) - elif chpt_from == "torchchat": - # This format stands for: - # single binary file, OR - # multiple binary files without index files. - load_weights_from_torchchat_format( - stage_module, distribution, device, model_config - ) - else: - raise ValueError(f"Unknown checkpoint format: {chpt_from}") - - -def _encode_strings( - strings: List[str], - tokenizer, - bos: bool, - device: torch.device, - dtype=torch.int64, -) -> List[torch.Tensor]: - """Encode a list of prompt strings into a list of tensor token ids.""" - encoded_list = [] - for string in strings: - tokens = tokenizer.encode(string) - if bos: - tokens = [tokenizer.bos_id()] + tokens - encoded_list.append(torch.tensor(tokens, dtype=dtype, device=device)) - return encoded_list - - -def _create_padded_prompts( - input_ids_list: List[torch.Tensor], - tokenizer, - seqlen: int, - start_pos: int, - device: torch.device, - pad_token_id: Optional[int] = None, -) -> Tuple[torch.Tensor, List[int]]: - """ - Create a padded tensor for multiple encoded input prompts. - - Returns: - Tuple[torch.Tensor, List[int]]: A tuple containing the padded tensor and a list of prompt lengths. 
- """ - pad_token_id = pad_token_id if pad_token_id is not None else tokenizer.eos_id() - - # Find the maximum prompt length - max_prompt_len = max(ids.size(0) for ids in input_ids_list) - - # Calculate the buffer size - max_new_tokens = max(0, min(seqlen - start_pos, seqlen - max_prompt_len)) - token_buffer_size = max_prompt_len + max_new_tokens - - # Create the padded batch tensor - batch_size = len(input_ids_list) - batch_seq = torch.full( - (batch_size, token_buffer_size), pad_token_id, dtype=torch.int64, device=device - ) - - prompt_lengths = [] - for i, input_ids in enumerate(input_ids_list): - prompt_len = input_ids.size(0) - batch_seq[i, :prompt_len] = input_ids - prompt_lengths.append(prompt_len) - - return batch_seq, prompt_lengths - - -def _batch_decode_next_tokens( - output: torch.Tensor, - pos: List[int] = None, - temperature: float = 1.0, - topk: int = 10, -) -> torch.Tensor: - """ - Decode the next token for each prompt in the batch. Adds temperature option for non-deterministic decoding. - - Args: - output (torch.Tensor): The output tensor to decode. - pos (List[int]): The positions of the `output` to decode in the sequence length dimension. - step (int): Step indicator. If -1, use positions from `pos`. Otherwise, use the first token. - temperature (float): Sampling temperature for non-deterministic decoding. - - Returns: - torch.Tensor: Decoded token ids. - """ - batch_size, seq_len, vocab_size = output.shape - - if pos is None: - # `pos` is not provided, so we can use the first token - next_token_logits = output[:, 0, :] - else: - # get the logits for each prompt at the specified positions - next_token_logits = output[torch.arange(batch_size), torch.tensor(pos) - 1] - - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - # Uses top-k sampling if temperature is not 1.0, otherwise use argmax - if temperature != 1.0: - top_k = min(topk, vocab_size) # Ensure top-k is not greater than vocab size - top_k_logits, top_k_indices = torch.topk(next_token_logits, k=top_k, dim=-1) - probs = torch.softmax(top_k_logits, dim=-1) - next_token_indices = torch.multinomial(probs, num_samples=1).squeeze(-1) - next_tokens = top_k_indices.gather( - -1, next_token_indices.unsqueeze(-1) - ).squeeze(-1) - else: - # Argmax (deterministic) - next_tokens = torch.argmax(next_token_logits, dim=-1, keepdim=True) - - # Token ids in int tensor form - return next_tokens - - -def _update_padded_sequence( - padded_sequence: torch.Tensor, - new_token: torch.Tensor, - prompt_lengths: List[int], -) -> None: - for i in range(len(prompt_lengths)): - padded_sequence[i, prompt_lengths[i]] = new_token[i, 0] - # logger.info(f"updated prompt {i} with new token {new_token[i, 0]}") - - -# Decode token id into string and print it -def _decode_in_flight(token, tokenizer, tp_rank): - """decode token ids for all prompts in the batch and log them""" - # `token` is a tensor of shape (batch_size, 1). - # For TiktokenTokenizer, we need to squeeze it to 1D. - # For SentencePieceProcessor, we don't. 
- token_str = tokenizer.decode(token.tolist()) - # print the token string on tp rank 0 - if tp_rank == 0: - logger.info( - f"{color.green} responses ====>>>> " - f"{color.blue} {token_str} {color.reset}" - ) - return token_str - - -def _cleanup(): - dist.barrier() - dist.destroy_process_group() - - -prompts = [ - "What is Snow?", - # "Can you explain what is the purpose of back propagation in neural networks?", - "Who is Santa Claus?", - "Where does Santa live?", - "Who is Abraham Lincoln?", - # "How are models trained?", -] - - -def main( - model_name, - builder_args, - tokenizer_args, - pipe, -): - pp_degree = builder_args.pp - - rank, world_size = _init_distributed() - logger.info(f"Worker started: {rank=}, {world_size=}") - - gpu_memory_monitor = GPUMemoryMonitor("cuda") - logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}") - - distribution, model_dtype = NAME_TO_DISTRIBUTION_AND_DTYPE[model_name] - logger.info(f"Using model weights from {distribution} and dtype {model_dtype}") - - # Model-level config - model_config = ModelArgs.from_name(distribution) - # Transformer-level config - config = TransformerArgs.from_params(model_config.transformer_args["text"]) - logger.info(f"Transformer Config: {config}") - - tokenizer = _build_chat_tokenizer(tokenizer_args) - - set_precision(model_dtype) - logger.info(f"Using cache precision {model_dtype}") - - hf_config = get_hf_config_file(distribution) - if hf_config is None: - raise ValueError(f"Config file not found for model id {distribution}") - - # Validate pipeline degree - assert world_size % pp_degree == 0 - assert config.n_layers % pp_degree == 0 - - # Tensor parallel is enabled in this program - tp_degree = world_size // pp_degree - - # Create device mesh - device_mesh = _create_device_mesh(pp_degree, tp_degree) - tp_mesh = device_mesh["tp"] - pp_mesh = device_mesh["pp"] - logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") - - tp_rank = tp_mesh.get_local_rank() - pp_rank = pp_mesh.get_local_rank() - tp_group = tp_mesh.get_group() - pp_group = pp_mesh.get_group() - logger.info(f"{pp_degree=}, {tp_degree=}") - - # Convenience variables - first_pp_rank = 0 - last_pp_rank = pp_degree - 1 - - # Assuming same number of GPUs per node - device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") - - # Fill in PP configs - config.stage_idx = pp_rank - config.n_stages = pp_degree - - with torch.device("meta"): - # TODO: we should create model instead of Transformer - model = Transformer(config) - - # Distribute model on TP mesh - # (Surprisingly, this works even though model is on meta device and mesh is of - # cuda devices) - model.distribute(tp_mesh) - if rank == 0: - logger.info(f"Model: {model}") - - # Load weights - logger.info(f"Loading weights for {pp_rank=} on {device=}") - with CUDATrackTime() as timer: - _load_model_weights(model, distribution, device, config, builder_args.chpt_from) - - logger.info( - f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Batch size. Since we push batches dynamically through the pipeline rather - # than chunking them, this is effectively micro-batch size in pipeline - # sense. Thus it is interchangeable with micro-batch size below. 
- batch_size = 1 # len(prompt) - seqlen_prefill = 1024 # sequence length - dim = 4096 # embedding dimension - - # Setup KV caches (after model distribution) - # The number of cache lanes is the same as the maximum number of - # micro-batches that can be "in flight" in parallel -- imagine each - # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. - # When decoding is done for certain micro-batches, we can reuse the KV cache - # lanes. - # TODO: bump up the lane count - pipeline_lanes = 1 - with device: - model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes) - - # info on stage size and params - stage_size = get_module_size(model) - stage_size_formatted = bytes_to_readable(stage_size) - stage_num_params = get_num_params(model) - logger.info( - f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" - ) - model.eval() - - # Helper function to get example inputs and outputs for the stages. - def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: - mb_ids = torch.randint( - 0, config.vocab_size, (batch_size, seqlen), device=device - ) - activation = torch.rand( - batch_size, seqlen, dim, device=device, dtype=model_dtype - ) - logits = torch.rand( - batch_size, seqlen, config.vocab_size, device=device, dtype=model_dtype - ) - example_inputs = (mb_ids if pp_rank == first_pp_rank else activation,) - example_outputs = (logits if pp_rank == last_pp_rank else activation,) - return example_inputs, example_outputs - - # Create prefill stage - logger.info(f"Creating pipeline stage for prefill {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_prefill) - prefill_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - - # Create schedule - # Number of micro-batches for the schedule is 1, because each step() call we - # only push 1 micro-batch into the pipeline. But we can continuously push - # new micro-batches into the pipeline as they arrive, achieving same - # pipelining effect. 
- prefiller = ScheduleGPipe(prefill_stage, 1) - - # Need these global ids due to the API definition of dist.send and recv - first_pp_rank_global_id = dist.get_global_rank(pp_group, first_pp_rank) - last_pp_rank_global_id = dist.get_global_rank(pp_group, last_pp_rank) - - pipe.send("ready") - - while True: - command = pipe.recv() - assert isinstance(command, (str, list)) - if isinstance(command, str): - if command == "stop": - break - else: - raise ValueError(f"Unknown command: {command}") - else: - prompt = command - assert ( - len(prompt) == batch_size - ), f"Expecting {batch_size=} prompts but got {len(prompt)=}" - logger.info(f"{color.green}Prompt: {prompt}{color.reset}") - - start_pos = 0 - # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen - input_pos = torch.arange(seqlen_prefill, device=device) - - # encode the prompt - input_ids = _encode_strings( - prompt, tokenizer, bos=True, device=device, dtype=torch.int64 - ) - - # create a padded tensor for the input prompt - padded_sequence, prompt_lengths = _create_padded_prompts( - input_ids, tokenizer, seqlen_prefill, start_pos, device - ) - - # New token generated each iteration - # need a row dimension for each prompt in the batch - new_token = torch.zeros(batch_size, 1, device=device, dtype=torch.int64) - # Store the generated tokens - res = [] - - # Prefill phase - # Run context input through pipeline - # TODO: we need to pass `input_pos` and `cache_lane` to each stage. - lane = 0 - kwargs = {"input_pos": input_pos, "cache_lane": lane} - with torch.no_grad(), CUDATrackTime() as timer: - if pp_rank == first_pp_rank: - output = prefiller.step(padded_sequence, **kwargs) - elif pp_rank == last_pp_rank: - output = prefiller.step(**kwargs) - else: # middle pp ranks - prefiller.step(**kwargs) - - logger.info( - f"{color.green}Prefilling time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Decode the output -- first generated token - if pp_rank == last_pp_rank: - logger.info(f"{color.green}Decoding...{prompt_lengths=}{color.reset}") - new_token = _batch_decode_next_tokens(output, prompt_lengths) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token.tolist())) - else: - pipe.send(None) - - # seqlen = 1 now - seqlen_decode = 1 - input_pos = torch.tensor([prompt_lengths[0]], device=device) - - # Create decode stage - logger.info(f"Creating pipeline stage for decode {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_decode) - decode_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - # create schedule - decoder = ScheduleGPipe(decode_stage, 1) - - # Decoding - with torch.no_grad(), CUDATrackTime() as timer: - while True: - command = pipe.recv() - assert isinstance(command, str) - if command == "stop": - break - elif command == "step": - pass - else: - raise ValueError(f"Unknown command: {command}") - - kwargs = {"input_pos": input_pos, "cache_lane": lane} - # sendrecv between last and first ranks, only if: - # first_pp_rank != last_pp_rank. 
- if pp_rank == last_pp_rank and pp_rank != first_pp_rank: - dist.send( - new_token, - dst=first_pp_rank_global_id, - group=pp_group, - ) - elif pp_rank == first_pp_rank and pp_rank != last_pp_rank: - dist.recv( - new_token, - src=last_pp_rank_global_id, - group=pp_group, - ) - - # Run data through pipeline - if pp_rank == first_pp_rank: - output = decoder.step(new_token, **kwargs) - elif pp_rank == last_pp_rank: - output = decoder.step(**kwargs) - else: # middle pp ranks - decoder.step(**kwargs) - - # Decode the output - if pp_rank == last_pp_rank: - new_token = _batch_decode_next_tokens(output) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token)) - else: - pipe.send(None) - - # Increment input position - input_pos += 1 - - logger.info( - f"{color.green}Decoding time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Display the decoding results - - # output formatted response via last pp group and tp rank 0 - if pp_rank == last_pp_rank and tp_rank == 0: - # `res` is a list of tensors, each being a batch of generated token ids. - # We need to concatenate them to get the full sequence of generated - # token ids. Thus cat'ing along dim 1. - res = torch.cat(res, dim=1) - res_list = res.tolist() - - responses = tokenizer.decode(res_list) - - # Show prompts and responses - for prompt_text, response_text in zip(prompt, responses): - logger.info(f"Prompt: {color.green}{prompt_text} {color.reset}") - logger.info(f"Response: {color.red}{response_text} {color.reset}") - - # Cleanup - _cleanup() - logger.info( - f"{color.green}Success{color.white} - {color.blue}Rank {rank} has completed.{color.reset}" - ) - -# TODO: remove or make it work again -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument( -# "model_name", -# type=str, -# default="llama3", -# help="Name of the model to load", -# choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(), -# ) -# parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel degree") -# parser.add_argument( -# "--ntokens", -# type=int, -# default=40, -# help="Number of tokens to generate", -# ) -# parser.add_argument( -# "--chpt-from", -# type=str, -# default="hf", # TODO: change to torchchat once we support it well -# help="Checkpoint format to load from", -# choices=["hf", "torchchat"], -# ) -# args = parser.parse_args() - -# main() diff --git a/torchchat/distributed/generate.py b/torchchat/distributed/generate.py deleted file mode 100644 index 51c472e4a..000000000 --- a/torchchat/distributed/generate.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-import asyncio -import atexit -import importlib.util -import subprocess -import threading -from abc import abstractmethod -from collections import deque -from dataclasses import dataclass -from functools import partial -from os import environ -from pathlib import Path -from typing import List, Optional -from uuid import uuid4 - -import torch.multiprocessing as mp -from torchchat.cli.builder import BuilderArgs, TokenizerArgs -from torchchat.distributed.dist_run import NAME_TO_DISTRIBUTION_AND_DTYPE -from torchchat.distributed.logging_utils import SingletonLogger - -logger = SingletonLogger.get_logger() - - -def _setup_env(world_size: int, rank: int, target: callable, *args, **kwargs): - environ["MASTER_ADDR"] = "localhost" - environ["MASTER_PORT"] = "29500" - environ["RDZV_BACKEND"] = "c10d" - environ["WORLD_SIZE"] = str(world_size) - environ["RANK"] = str(rank) - environ["LOCALRANK"] = str(rank) - - return target(*args, **kwargs) - - -def _launch_distributed_inference( - model_name: str, builder_args: BuilderArgs, tokenizer_args: TokenizerArgs -) -> tuple[List]: - # launch distributed inference worker, each worker gets a pipe to communicate with the main process - logger.info("Launching distributed inference ...") - - num_processes_per_node = builder_args.pp * builder_args.tp - - from torchchat.distributed.dist_run import main - - mp.set_start_method("spawn") - - pipes = [] - procs = [] - try: - for rank in range(num_processes_per_node): - server_pipe, client_pipe = mp.Pipe(duplex=True) - pipes.append(server_pipe) - procs.append( - mp.Process( - target=partial(_setup_env, num_processes_per_node, rank, main), - args=(model_name, builder_args, tokenizer_args, client_pipe), - ) - ) - procs[-1].start() - - for pipe in pipes: - assert pipe.recv() == "ready", "Starting the worker failed" - except Exception as e: - logger.error(f"Error during distributed inference: {str(e)}") - for p in procs: - p.kill() - raise e - - logger.info( - f"Done launching distributed inference on {num_processes_per_node} GPUs." 
- ) - return procs, pipes - - -@dataclass -class Output: - is_finished: bool = False - text: Optional[str] = None - token: Optional[list] = None - - -@dataclass -class Request: - request_id: int - prompt: str - - @classmethod - def new_request(cls, prompt): - return cls(request_id=uuid4().int, prompt=prompt) - - -class Scheduler(object): - def __init__( - self, - builder_args, - generator_args, - pipes, - loop, - ): - self.builder_args = builder_args - self.generator_args = generator_args - self.requests = {} - self.in_flight_requests = {} - self.in_flight_batch_order = [] - self.pipes = pipes - self.req_to_states = {} - self.req_to_results = {} - self.request_queue = mp.Queue() - self.loop = loop - - def schedule_request(self, req: Request): - # add request to queue and create deque and async event for response - self.req_to_states[req.request_id] = asyncio.Event() - self.req_to_results[req.request_id] = deque() - self.request_queue.put(req) - - def process_requests_loop(self): - # Continuously process requests (one at a time for now), results are routed into the requests deque - while True: - req = self.request_queue.get() - if req == "stop": - break - self.requests = {req.request_id: req.prompt} - - responses = {} - running = True - while running: - outputs = self.step() - self.req_to_results[req.request_id].append(outputs[0]) - - self.loop.call_soon_threadsafe(self.req_to_states[req.request_id].set) - - running &= not outputs[0].is_finished - - async def wait_for_request(self, req: Request) -> Output: - # Wait for request to deliver result, uses event to trigger and reads from left side of deque - is_finished = False - while not is_finished: - await self.req_to_states[req.request_id].wait() - while len(self.req_to_results[req.request_id]): - output = self.req_to_results[req.request_id].popleft() - is_finished |= output.is_finished - yield output - del self.req_to_states[req.request_id] - del self.req_to_results[req.request_id] - - def step(self) -> List[Output]: - # Make a prefill or decoding step and receive results - responses = [] - # TODO: Implement a scheduler to handle the requests - if len(self.in_flight_requests) > 0: - # Receive decoded token - for p in self.pipes: - p.send("step") - for p in self.pipes: - responses.append(p.recv()) - - else: - # Send requests to backend - self.in_flight_batch_order = list(self.requests.keys()) - prompts = [self.requests[k] for k in self.in_flight_batch_order] - for p in self.pipes: - p.send(prompts) - self.in_flight_requests = self.requests - self.requests = {} - self.current_step = 0 - # Receive first token - for p in self.pipes: - responses.append(p.recv()) - # Filter out None responses from in-between stages - responses = [r for r in responses if r is not None][0] - outputs = [] - for k, v in zip(self.in_flight_batch_order, zip(responses[0], responses[1])): - text, token_ids = v - outputs.append( - Output( - # TODO: Look for tokenizer.eos_id as well - is_finished=self.current_step >= self.generator_args.max_new_tokens, - text=text, - token=token_ids, - ) - ) - if self.current_step >= self.generator_args.max_new_tokens: - for p in self.pipes: - p.send("stop") - self.in_flight_requests = [] - - self.current_step += 1 - - return outputs - - -class DistributedGenerator(object): - def __init__( - self, - # TODO: switch this to torchchat method - model_name: str, - builder_args: BuilderArgs, - tokenizer_args: TokenizerArgs, - # TODO: move GeneratorArgs into a different module - generator_args, - profile: Optional[Path], - quantize: bool, - 
draft_quantize: bool, - ): - self.model_name = model_name - self.builder_args = builder_args - self.generate_args = generator_args - - self.check_args() - - self.procs, self.pipes = _launch_distributed_inference( - model_name, builder_args, tokenizer_args - ) - - self.loop = asyncio.new_event_loop() - asyncio.set_event_loop(self.loop) - - self.scheduler = Scheduler(builder_args, generator_args, self.pipes, self.loop) - - # TODO: Mode into process and use pipe or queue for comm - self.scheduler_thread = threading.Thread( - target=self.scheduler.process_requests_loop - ) - self.scheduler_thread.start() - - atexit.register(self.shutdown) - - def shutdown(self): - # Stop all processes and threads - self.scheduler.request_queue.put("stop") - self.scheduler_thread.join() - - for p in self.pipes: - p.send("stop") - for p in self.procs: - p.kill() - - def generate(self, text): - # Function to generate text from prompt - req = Request.new_request(text) - self.scheduler.schedule_request(req) - - generator = self.scheduler.wait_for_request(req) - - running = True - while running: - output = self.loop.run_until_complete(generator.__anext__()) - running &= not output.is_finished - - yield output - - def check_args(self): - if self.generate_args.chat_mode: - raise NotImplementedError( - "Currently we only support generate with --distributed" - ) - elif self.builder_args.tp < 2: - raise ValueError("TP degree must be at least 2 for distributed inference") - elif self.model_name not in NAME_TO_DISTRIBUTION_AND_DTYPE.keys(): - raise ValueError( - f"Distributed inference currently only supports then following models: {list(NAME_TO_DISTRIBUTION_AND_DTYPE.keys())}" - ) - elif self.builder_args.chpt_from == "torchchat": - raise ValueError( - f"Distributed inference currently only supports HF checkpoints" - ) diff --git a/torchchat/distributed/utils.py b/torchchat/distributed/utils.py index 46ea5d9a1..85bfe04fc 100644 --- a/torchchat/distributed/utils.py +++ b/torchchat/distributed/utils.py @@ -6,15 +6,15 @@ import itertools import os +import time from dataclasses import dataclass from datetime import timedelta -import time +from os import environ from typing import Optional import torch - from torchchat.distributed.logging_utils import SingletonLogger logger = SingletonLogger.get_logger() @@ -257,3 +257,13 @@ def get_device_info( f"with {self.device_capacity_gib:.2f}GiB memory" ) return device_info + +def run_in_dist_env(world_size: int, rank: int, target: callable): + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(rank) + environ["LOCALRANK"] = str(rank) + + return target() diff --git a/torchchat/edge/android/torchchat/app/build.gradle.kts b/torchchat/edge/android/torchchat/app/build.gradle.kts index e0c9c196b..a98a70cab 100644 --- a/torchchat/edge/android/torchchat/app/build.gradle.kts +++ b/torchchat/edge/android/torchchat/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/torchchat/export.py b/torchchat/export.py index 7c5243b68..e7cb32309 100644 
--- a/torchchat/export.py +++ b/torchchat/export.py @@ -5,13 +5,13 @@ # LICENSE file in the root directory of this source tree. import os -from typing import Optional +from typing import Dict, Optional import torch +import torch._inductor import torch.nn as nn from torch.export import Dim -import torch._inductor from torchchat.cli.builder import ( _initialize_model, @@ -28,6 +28,31 @@ default_device = "cpu" +""" +Export Snapshot +""" + + +def export_snapshot( + model: nn.Module, + device: Optional[str] = None, + output_path: str = "model-snapshot.tc", +) -> str: + """ + Export the model as snapshot. + + Args: + model: The model to be exported. + device: The device to run the model on. + output_path: The path to save the exported model. + Returns: + The path to the exported model. + """ + assert output_path.endswith(".tc"), "use .tc extension for snapshots" + torch.save(model, output_path) + return output_path + + """ Export for Server """ @@ -39,6 +64,7 @@ def export_for_server( output_path: str = "model.pt2", dynamic_shapes: bool = False, package: bool = True, + metadata: Optional[Dict[str, str]] = None, ) -> str: """ Export the model using AOT Compile to get a .dso for server use cases. @@ -67,21 +93,28 @@ def export_for_server( dynamic_shapes = None with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): - metadata = {} # TODO: put more metadata here - options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata} + options = { + "aot_inductor.package": package, + "aot_inductor.metadata": metadata or {}, + } + if not package: options = {"aot_inductor.output_path": output_path} - path = torch._export.aot_compile( + ep = torch.export.export( model, example_inputs, dynamic_shapes=dynamic_shapes, - options=options, ) if package: - from torch._inductor.package import package_aoti - path = package_aoti(output_path, path) + path = torch._inductor.aoti_compile_and_package( + ep, package_path=output_path, inductor_configs=options + ) + else: + path = torch._inductor.aot_compile( + ep.module(), example_inputs, options=options + ) print(f"The generated packaged model can be found at: {path}") return path @@ -102,13 +135,13 @@ def export_for_server( from typing import Any, Dict, Tuple, Union import executorch.exir as exir + from executorch.backends.xnnpack._passes.convert_to_linear import ( + ConvertToLinearPass, + ) from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, ) - from executorch.backends.xnnpack._passes.convert_to_linear import ( - ConvertToLinearPass, - ) from executorch.exir import EdgeProgramManager, to_edge from executorch.exir.capture._config import ( @@ -121,8 +154,7 @@ def export_for_server( ) from executorch.exir.tracer import Value - from torch._export import capture_pre_autograd_graph - from torch.export import export, ExportedProgram + from torch.export import export, export_for_training, ExportedProgram from torchchat.model import apply_rotary_emb, Attention from torchchat.utils.build_utils import get_precision @@ -166,18 +198,22 @@ def __init__(self, attention: Attention): self.wo = attention.wo - max_batch_size, n_heads, max_seq_length, head_dim = ( - attention.kv_cache[0].k_cache.shape - ) + max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[ + 0 + ].k_cache.shape cache_dtype = attention.kv_cache[0].k_cache.dtype # The `Attention` module being replaced can have multiple KV caches # (denoted by `cache_lanes`). 
Thus we follow the same setup format # as in `Attention.setup_cache`. cache_lanes = len(attention.kv_cache) - self.kv_cache = nn.ModuleList([ - CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype) - for _ in range(cache_lanes) - ]) + self.kv_cache = nn.ModuleList( + [ + CustomKVCache( + max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype + ) + for _ in range(cache_lanes) + ] + ) self.n_heads = attention.n_heads self.head_dim = attention.head_dim @@ -215,9 +251,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0): return self.wo(output) def replace_attention_with_custom_sdpa_attention(module: nn.Module): - from executorch.extension.llm.custom_ops import ( # noqa - sdpa_with_kv_cache, - ) + from executorch.extension.llm.custom_ops import custom_ops # noqa for name, child in module.named_children(): if isinstance(child, Attention): @@ -238,7 +272,9 @@ def _to_core_aten( raise ValueError( f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" ) - core_aten_ep = export(model, example_inputs, dynamic_shapes=dynamic_shapes) + core_aten_ep = export_for_training( + model, example_inputs, dynamic_shapes=dynamic_shapes + ) if verbose: logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") return core_aten_ep @@ -308,7 +344,7 @@ def export_for_et(model, device, output_path) -> str: with torch.nn.attention.sdpa_kernel( [torch.nn.attention.SDPBackend.MATH] ), torch.no_grad(): - m = capture_pre_autograd_graph(model, input, dynamic_shapes=dynamic_shapes) + m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module() edge_manager = export_to_edge( m, @@ -350,7 +386,11 @@ def main(args): print(f"Using device={builder_args.device}") set_precision(builder_args.precision) - set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path) + set_backend( + dso=args.output_dso_path, + pte=args.output_pte_path, + aoti_package=args.output_aoti_package_path, + ) builder_args.dso_path = None builder_args.pte_path = None @@ -359,6 +399,7 @@ def main(args): output_pte_path = args.output_pte_path output_dso_path = args.output_dso_path + output_snapshot_path = args.output_snapshot_path output_aoti_package_path = args.output_aoti_package_path if output_pte_path and builder_args.device != "cpu": @@ -366,12 +407,13 @@ def main(args): f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting." ) builder_args.device = "cpu" - elif "mps" in builder_args.device: + elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device: print("Warning! Device MPS not supported for export. 
Exporting for device CPU.") builder_args.device = "cpu" # TODO: clean this up # This mess is because ET does not support _weight_int4pack_mm right now + tokenizer_args = None if not builder_args.gguf_path: # tokenizer needed for quantization so get that here, try: @@ -382,9 +424,8 @@ def main(args): if builder_args.max_seq_length is None: if ( - (output_dso_path is not None or output_aoti_package_path is not None) - and not builder_args.dynamic_shapes - ): + output_dso_path is not None or output_aoti_package_path is not None + ) and not builder_args.dynamic_shapes: print("Setting max_seq_length to 300 for DSO export.") builder_args.max_seq_length = 300 elif output_pte_path is not None: @@ -397,11 +438,13 @@ def main(args): quantize, tokenizer, max_seq_length=builder_args.max_seq_length, - support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None, + support_tensor_subclass=output_dso_path is None + and output_aoti_package_path is None, ) model_to_pte = model model_to_dso = model model_to_aoti_package = model + model_to_snapshot = model else: if output_pte_path: _set_gguf_kwargs(builder_args, is_et=True, context="export") @@ -421,6 +464,15 @@ def main(args): model_to_dso = model_to_aoti_package _unset_gguf_kwargs(builder_args) + if output_snapshot_path: + _set_gguf_kwargs(builder_args, is_et=False, context="export") + model_to_snapshot = _initialize_model( + builder_args, + quantize, + support_tensor_subclass=False, + ) + _unset_gguf_kwargs(builder_args) + with torch.no_grad(): if output_pte_path: output_pte_path = str(os.path.abspath(output_pte_path)) @@ -435,7 +487,9 @@ def main(args): if output_dso_path: output_dso_path = str(os.path.abspath(output_dso_path)) print(f"Exporting model using AOT Inductor to {output_dso_path}") - print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.") + print( + "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead." + ) export_for_server( model_to_dso, builder_args.device, @@ -446,11 +500,33 @@ def main(args): if output_aoti_package_path: output_aoti_package_path = str(os.path.abspath(output_aoti_package_path)) - print(f"Exporting model using AOT Inductor to {output_aoti_package_path}") + + if tokenizer_args is None: + tokenizer_type = "0" + elif tokenizer_args.is_sentencepiece: + tokenizer_type = "2" # Corresponding to llama2 + else: + tokenizer_type = "3" # Corresponding to llama3 + + metadata = {"tokenizer_type": tokenizer_type} + print( + "Exporting model using AOT Inductor to " f"{output_aoti_package_path}." + ) export_for_server( model_to_aoti_package, builder_args.device, output_aoti_package_path, builder_args.dynamic_shapes, package=True, + metadata=metadata, + ) + + if output_snapshot_path: + output_snapshot_path = str(os.path.abspath(output_snapshot_path)) + print(f"Exporting model using Snapshot to {output_snapshot_path}") + export_snapshot( + model_to_snapshot, + builder_args.device, + output_snapshot_path, ) + diff --git a/torchchat/generate.py b/torchchat/generate.py index dd423b58a..7f37386ac 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -3,13 +3,15 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
-import argparse import base64 +import contextlib import itertools import logging import os import textwrap import time +from concurrent import futures +from functools import partial from abc import ABC, abstractmethod from dataclasses import dataclass @@ -21,6 +23,10 @@ import torch import torch._dynamo.config import torch._inductor.config +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.distributed.pipelining import PipelineStage, ScheduleGPipe +from torch._C import _SDPBackend as SDPBackend from PIL import Image @@ -28,7 +34,6 @@ from torchtune.data import Message, padded_collate_tiled_images_and_mask from torchtune.generation import sample as tune_sample -from torchtune.models.llama3 import llama3_tokenizer from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchtune.training import set_default_dtype @@ -39,19 +44,63 @@ BuilderArgs, TokenizerArgs, ) -from torchchat.distributed.generate import DistributedGenerator +from torchchat.distributed.utils import ( + Color as color, + run_in_dist_env, +) from torchchat.model import Model, ModelType from torchchat.utils.build_utils import device_sync, set_precision from torchchat.utils.device_info import get_device_info +logger = logging.getLogger(__name__) + + +# NOTE: Logging disabled by default here due to conflicts with torch._dynamo +class NoOpLogger: + def __no_op(self, *_, **__): + pass + def __getattr__(self, name): + return self.__no_op + + +logger = ( + NoOpLogger() if os.getenv("LOG_LEVEL") is None + else logging.getLogger(__name__) +) + +## Chat Formatters ############################################################# class _ChatFormatter(ABC): + + # Messages can arrive as a standard dict with "role" and "content" as + # strings, or where "content" is a list of objects with "text" fields. + MESSAGE_TYPE = Dict[str, Union[str, List[Dict[str, str]]]] + + # A dialog is a sequence of messages + DIALOG_TYPE = List[MESSAGE_TYPE] + def __init__(self, tokenizer): self.tokenizer = tokenizer @abstractmethod - def encode_dialog_prompt(self, dialog) -> List[int]: - raise NotImplementedError() + def encode_dialog_prompt( + self, + dialog: DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: + """Encode a sequence of messages into a sequence of token IDs, including + the chat template + + Args: + dialog (DIALOG_TYPE): The sequence of dialog messages to encode. + This will be the additional messages on top of those that have + already been processed. + add_generation_prompt (bool): Whether to include a generation prompt + at the end of the encoded sequence. + + Returns: + List[int]: A list of token IDs representing the encoded prompt. 
+ """ class Llama3ChatFormatter(_ChatFormatter): @@ -61,7 +110,7 @@ class Llama3ChatFormatter(_ChatFormatter): """ - def encode_header(self, role) -> List[int]: + def _encode_header(self, role) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) tokens.extend(self.tokenizer.encode(role, bos=False, eos=False)) @@ -69,8 +118,8 @@ def encode_header(self, role) -> List[int]: tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) return tokens - def encode_message(self, message) -> List[int]: - tokens = self.encode_header(message["role"]) + def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]: + tokens = self._encode_header(message["role"]) if isinstance(message["content"], str): tokens.extend( self.tokenizer.encode(message["content"], bos=False, eos=False) @@ -85,46 +134,80 @@ def encode_message(self, message) -> List[int]: tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) return tokens - def encode_dialog_prompt(self, dialog) -> List[int]: + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) for message in dialog: - tokens.extend(self.encode_message(message)) + tokens.extend(self._encode_message(message)) # Add the start of an assistant message for the model to complete. - tokens.extend(self.encode_header("assistant")) # Pass role directly as a string + if add_generation_prompt and dialog and dialog[-1]["role"] != "assistant": + tokens.extend(self._encode_header("assistant")) # Pass role directly as a string return tokens -B_INST, E_INST = "[INST]", "[/INST]" -B_SYS, E_SYS = "<>", "<>" +class Llama2ChatFormatter(_ChatFormatter): + """ + Chat formatting for Llama2 + CITE: https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + B_INST, E_INST = "[INST] ", " [/INST]" + B_SYS, E_SYS = "<>\n", "\n<>\n\n" -class Llama2ChatFormatter(_ChatFormatter): - def encode_dialog_prompt(self, dialog) -> List[int]: - tokens = self.tokenizer.encode(f"{B_INST} ") - first_message = True # Bool to handle placing the B_INST token. Behavior is weird - the system prompt should have the B_INST, but not the first user message. All following user messages *should* have it. Also, if there is no system prompt, then the user message should have it. 
+ @staticmethod + def _get_content_str(message: _ChatFormatter.MESSAGE_TYPE) -> str: + if isinstance(message["content"], list): + return message["content"][0]["text"] + return message["content"] + + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, # UNUSED + ) -> List[int]: + new_turn = True + tokens = [] for message in dialog: - if isinstance(message["content"], list): - content = message["content"][0]["text"] + if new_turn: + tokens += self.tokenizer.encode(f"{self.tokenizer.bos}{self.B_INST}") + content = self._get_content_str(message).strip() + role = message["role"] + if role == "system": + tokens += self.tokenizer.encode(f"{self.B_SYS}{content}{self.E_SYS}") + new_turn = False + elif role == "user": + tokens += self.tokenizer.encode(f"{content}{self.E_INST}") + new_turn = False + elif role == "assistant": + tokens += self.tokenizer.encode(f" {content} {self.tokenizer.eos}\n") + new_turn = True else: - content = message["content"] - content = content.strip() - if message["role"] == "system": - encoded = self.tokenizer.encode(f"{B_SYS}\n{content}\n{E_SYS}") - first_message = False - elif message["role"] == "user": - encoded = [self.tokenizer.bos_id()] + self.tokenizer.encode( - f"{B_INST if first_message else ''} {content} {E_INST} " - ) - first_message = True - elif message["role"] == "assistant": - encoded = self.tokenizer.encode(f"{content}\n\n") + [ - self.tokenizer.eos_id() - ] - tokens += encoded + raise ValueError("Invalid role in dialog.") return tokens + +class HFTokenizerChatFormatter(_ChatFormatter): + """Chat formatter that uses the built-in formatting capabilities of an HF + tokenizer instance + """ + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool = True, + ) -> List[int]: + rendered = self.tokenizer.apply_chat_template( + dialog, add_generation_prompt=add_generation_prompt + ) + logger.debug("Formatted chat prompt:\n%s", rendered) + return self.tokenizer.encode(rendered) + +## Generation ################################################################## + @dataclass class GeneratorArgs: prompt: Optional[str] = ( @@ -214,7 +297,7 @@ def from_args(cls, args): ) -class Generator: +class LocalGenerator: """ Generates text samples based on a pre-trained Transformer model and tokenizer. Args: @@ -251,6 +334,7 @@ def __init__( self.draft_quantize = draft_quantize self.is_torchtune_model = generator_args.is_torchtune_model self.dtype = builder_args.precision + self.get_user_input : Callable = input self.rank: Optional[int] = None @@ -283,9 +367,13 @@ def __init__( if self.is_llama3_model: self.chat_formatter = Llama3ChatFormatter(self.tokenizer) if generator_args.chat_mode: - logging.debug( + logger.debug( "Llama3 model detected in chat mode. 
Using updated sentence schemas" ) + elif self.tokenizer_args.is_hf_tokenizer: + if not self.tokenizer.has_chat_template(): + raise ValueError("Tokenizer must have a chat template") + self.chat_formatter = HFTokenizerChatFormatter(self.tokenizer) else: self.chat_formatter = Llama2ChatFormatter(self.tokenizer) @@ -341,10 +429,12 @@ def sample( temperature: float = 0, top_k: Optional[int] = None, ): + logits = logits[0, -1] + logger.debug("Logits: %s", logits) if temperature == 0 and not need_probs: - _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + _, idx_next = torch.topk(logits, k=1, dim=-1) return (idx_next, None) - probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + probs = self.logits_to_probs(logits, temperature, top_k) idx_next = self.multinomial_sample_one_no_sync(probs) return idx_next, probs @@ -358,7 +448,7 @@ def prefill( sequential_prefill=True, **sampling_kwargs, ) -> torch.Tensor: - # logging.debug(f"x: {x}, input_pos: {input_pos}") + logger.debug("x: %s, input_pos: %s", x, input_pos) width = x.size(1) assert input_pos.size(0) == width @@ -394,14 +484,12 @@ def prefill( elif sequential_prefill: for i in range(width): x_sliced, ip_sliced = x[:, i].view(-1, 1), input_pos[i].view(-1) - # logging.debug(f" x: {x_sliced}, input_pos: {ip_sliced}") + logger.debug(" x: %s, input_pos: %s", x_sliced, ip_sliced) logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da else: # input_pos: [B, S] logits = model(x, input_pos) - # print(f"logits {logits.shape}") - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=False, **sampling_kwargs)[0] def decode_one_token( @@ -425,7 +513,6 @@ def decode_one_token( )[:, -1:] else: logits = model(x, input_pos) - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=need_probs, **sampling_kwargs) """ @@ -445,6 +532,7 @@ def decode_n_tokens( callback=lambda _: _, eos_token_id: int = 2, eot_id: Optional[int] = None, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, **sampling_kwargs, ): new_tokens, new_probs = [], [] @@ -453,7 +541,7 @@ def decode_n_tokens( num_new_tokens - 1 ): # -1 to save space to run an EoS if dont generate it naturally # Actually better for Inductor to codegen attention here - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): + with torch.nn.attention.sdpa_kernel([attention_backend]): out_token = cur_token.clone() next_token, next_prob = self.decode_one_token( @@ -591,11 +679,13 @@ def generate( Dict[str, Any] ] = None, # List of Image prompt tensors for multimodal models start_pos: int = 0, + skip_cache_setup: bool = False, draft_model: Model, speculate_k: Optional[int] = 8, sequential_prefill=True, callback=lambda x: x, max_seq_length: int, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: @@ -614,26 +704,27 @@ def generate( max_new_tokens = min(max_new_tokens, max_seq_length - start_pos - prompt_length) # set up caches only if first inference if start_pos == 0: - model = model.to(device=device) - with torch.device(device): - if ( - self.is_torchtune_model - or self.model.config.model_type == ModelType.Flamingo - ): - # 6404 is one-gpu affordable max_seq_length for single image input - model.setup_caches( - batch_size=1, - dtype=self.dtype, - encoder_max_seq_len=6404, - decoder_max_seq_len=max_seq_length, - ) - else: - model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) - if is_speculative and 
draft_model is not model: - draft_model.setup_caches( - max_batch_size=1, - max_seq_length=max_seq_length, - ) + if not skip_cache_setup: + model = model.to(device=device) + with torch.device(device): + if ( + self.is_torchtune_model + or self.model.config.model_type == ModelType.Flamingo + ): + # 6404 is one-gpu affordable max_seq_length for single image input + model.setup_caches( + batch_size=1, + dtype=self.dtype, + encoder_max_seq_len=6404, + decoder_max_seq_len=max_seq_length, + ) + else: + model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) + if is_speculative and draft_model is not model: + draft_model.setup_caches( + max_batch_size=1, + max_seq_length=max_seq_length, + ) if model.config.model_type == ModelType.Flamingo: model.reset_caches() @@ -711,6 +802,7 @@ def generate( if self.is_llama3_model else None ), + attention_backend=attention_backend, **sampling_kwargs, ): generated_tokens.append(generated_token.view(-1)) @@ -725,7 +817,8 @@ def encode_tokens(self, string, bos=True, device="cpu"): tokens = self.tokenizer.encode(string) if bos: tokens = [self.tokenizer.bos_id()] + tokens - logging.debug(f"Size after encode_tokens: {len(tokens)}") + logger.debug("Size after encode_tokens: %d", len(tokens)) + logger.debug("Token IDs: %s", tokens) return torch.tensor(tokens, dtype=torch.int, device=device) def _callback(self, x, *, buffer, done_generating): @@ -745,7 +838,6 @@ def _callback(self, x, *, buffer, done_generating): if len(buffer) == 4 or done_generating: print("".join(buffer), end="", flush=True) buffer.clear() - # print(, end='', flush=True) def _gen_model_input( self, @@ -774,7 +866,7 @@ def _gen_model_input( # Single String prompt if isinstance(prompt, str): encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device + prompt, bos=self.model.config.tokenizer_prepend_bos, device=self.builder_args.device ) # List of dialog else: @@ -783,7 +875,7 @@ def _gen_model_input( tokens, dtype=torch.int, device=self.builder_args.device ) - logging.debug(encoded) + logger.debug(encoded) return encoded, None # Llama 3.2 11B @@ -867,7 +959,7 @@ def _gen_model_input( if image_found: batch = padded_collate_tiled_images_and_mask( - [data], pad_direction="left", pad_max_images=1 + [data], pad_direction="left", pad_max_images=1, pad_max_tiles=transform.max_num_tiles ) encoded = batch.pop("tokens").to(device).view(-1) seq_len = encoded.size(0) @@ -898,7 +990,7 @@ def _gen_model_input( value=0, ) - logging.debug(encoded) + logger.debug(encoded) return encoded, batch def chat( @@ -914,14 +1006,14 @@ def chat( for p in itertools.chain(self.model.parameters(), self.model.buffers()) ] ) - if generator_args.compile: - if ( - self.is_speculative and self.builder_args.use_distributed - ): # and ("cuda" in builder_args.device): - torch._inductor.config.triton.cudagraph_trees = ( - False # Bug with cudagraph trees in this case - ) + if self.builder_args.distributed: + # During distributed inference the model gets sharded among the ranks + # So we need to all reduce the model size to get the total model size + model_size = torch.tensor(model_size, dtype=torch.int64, device=self.device) + dist.all_reduce(model_size) + model_size = model_size.item() + if generator_args.compile: if self.builder_args.device == "cpu": if generator_args.max_autotune: kwargs = {"mode": "max-autotune"} @@ -979,11 +1071,11 @@ def chat( print( f"Entering Chat Mode. 
Will continue chatting back and forth with the language model until the models max context length of {max_seq_length} tokens is hit or until the user says /bye" ) - get_system_prompt = input( + get_system_prompt = self.get_user_input( "Do you want to enter a system prompt? Enter y for yes and anything else for no. \n" ) if get_system_prompt == "y" or get_system_prompt == "Y": - self.system_prompt = input("What is your system prompt? \n") + self.system_prompt = self.get_user_input("What is your system prompt? \n") # `is_torchtune_model` is a misnomer since it doesn't capture all # torchtune models (i.e. Flamingo) @@ -1020,43 +1112,27 @@ def chat( ) for i in range(num_samples): device_sync(device=self.builder_args.device) + is_first_sample: bool = i == 0 if generator_args.chat_mode: - prompt = input("User: ") + prompt = self.get_user_input("User: ") if prompt == "/bye": print("Exiting Chat.\n") break - if not self.is_llama3_model: - if self.system_prompt: - prompt = f"{B_INST} {B_SYS}\n{self.system_prompt.strip()}\n{E_SYS}\n\n{prompt.strip()} {E_INST}" - self.system_prompt = ( - None # can only provide system prompt on first interaction - ) - else: - prompt = f"{B_INST} {prompt.strip()} {E_INST}" - encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device - ) - else: - if self.system_prompt: - encoded = self.chat_formatter.encode_dialog_prompt( - [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt}, - ] - ) - self.system_prompt = None - elif i == 0: - encoded = self.chat_formatter.encode_dialog_prompt( - [{"role": "user", "content": prompt}] - ) - else: - encoded = self.chat_formatter.encode_message( - {"role": "user", "content": prompt} - ) - encoded.extend(self.chat_formatter.encode_header("assistant")) - encoded = torch.tensor( - encoded, dtype=torch.int, device=self.builder_args.device + + # Encode the additional messages added in this dialog turn. If + # this is the first turn, that includes any system prompt. + messages_to_encode = [] + if is_first_sample and self.system_prompt: + messages_to_encode.append( + {"role": "system", "content": self.system_prompt} ) + messages_to_encode.append({"role": "user", "content": prompt}) + encoded = self.chat_formatter.encode_dialog_prompt( + messages_to_encode, add_generation_prompt=True, + ) + encoded = torch.tensor( + encoded, dtype=torch.int, device=self.builder_args.device + ) if encoded.size(0) + start_pos > max_seq_length: print( "This prompt would take us past the max_seq_length. Ending Conversation." 
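An illustrative sketch (not part of the patch) of the per-turn encoding flow the chat-mode hunk above switches to: only the messages added in the current turn go through the chat formatter, the system prompt is included solely on the first turn, and the formatter appends the assistant generation header. The formatter below is a stand-in with the same encode_dialog_prompt signature; the token IDs it returns are fake.

from typing import Dict, List

class StubFormatter:
    # Stand-in for Llama3ChatFormatter / HFTokenizerChatFormatter:
    # maps a list of {"role", "content"} messages to (fake) token IDs.
    def encode_dialog_prompt(self, dialog: List[Dict[str, str]], add_generation_prompt: bool = True) -> List[int]:
        ids: List[int] = []
        for message in dialog:
            ids.extend(range(len(message["content"])))  # fake per-character token IDs
        if add_generation_prompt:
            ids.append(-1)  # fake assistant-header token
        return ids

def encode_turn(formatter, prompt: str, system_prompt: str, is_first_sample: bool) -> List[int]:
    # Mirrors the hunk: system prompt only on the first turn, then the new user message.
    messages = []
    if is_first_sample and system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    return formatter.encode_dialog_prompt(messages, add_generation_prompt=True)

formatter = StubFormatter()
first_turn = encode_turn(formatter, "hello", "be brief", is_first_sample=True)
later_turn = encode_turn(formatter, "and then?", "be brief", is_first_sample=False)
assert len(first_turn) > len(later_turn)  # the system prompt is only encoded once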
@@ -1091,11 +1167,7 @@ def callback(x, *, done_generating=False):
                 torch._inductor.config.profiler_mark_wrapper_call = True
                 torch._inductor.config.cpp.enable_kernel_profile = True
-            if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_distributed and self.rank != 0
-            ):
-                import contextlib
-
+            if i != generator_args.num_samples - 1 or not self.profile:
                 prof = contextlib.nullcontext()
             else:
                 torch.profiler._utils._init_for_cuda_graphs()
@@ -1116,8 +1188,12 @@ def callback(x, *, done_generating=False):
                     top_k=generator_args.top_k,
                     sequential_prefill=generator_args.sequential_prefill,
                     start_pos=start_pos,
+                    skip_cache_setup=not is_first_sample,
                     max_seq_length=max_seq_length,
+                    attention_backend=self.builder_args.attention_backend,
                 )
+                if generator_args.chat_mode:
+                    start_pos += encoded.size(0)
                 for token_tensor, metrics in generator_func:
                     if token_tensor is not None:
                         start_pos += token_tensor.size(0)
@@ -1125,7 +1201,7 @@ def callback(x, *, done_generating=False):
                     if metrics is not None:
                         aggregate_metrics.update(metrics)
                     yield token_tensor, metrics
-                jit_compile = (i == 0) and (
+                jit_compile = is_first_sample and (
                     generator_args.compile or generator_args.compile_prefill
                 )
                 compilation_time = time.perf_counter() - t0
@@ -1134,12 +1210,11 @@ def callback(x, *, done_generating=False):
             if hasattr(prof, "export_chrome_trace"):
                 if self.builder_args.device == "cpu":
                     print(prof.key_averages().table(sort_by="self_cpu_time_total"))
-                else:
+                elif self.builder_args.device == "cuda":
                     print(prof.key_averages().table(sort_by="self_cuda_time_total"))
-                if self.builder_args.use_distributed:
-                    prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                 else:
-                    prof.export_chrome_trace(f"{self.profile}.json")
+                    print(prof.key_averages().table(sort_by="self_xpu_time_total"))
+                prof.export_chrome_trace(f"{self.profile}.json")
             if start_pos >= max_seq_length:
                 print(
@@ -1157,9 +1232,11 @@ def callback(x, *, done_generating=False):
                 print(
                     f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds"
                 )
-            aggregate_metrics["tokens_per_sec"].append(tokens_sec)
-            aggregate_metrics["first_token_per_sec"].append(first_token_sec)
-            aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec)
+            else:
+                # aggregate_metrics are not appended for the jit_compile sample, since including its timings would skew the averages.
+ aggregate_metrics["tokens_per_sec"].append(tokens_sec) + aggregate_metrics["first_token_per_sec"].append(first_token_sec) + aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec) logging.info( f"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ @@ -1197,31 +1274,348 @@ def callback(x, *, done_generating=False): f"Mean Accepted: {sum([idx * i for idx, i in enumerate(counts_aggregated)])/sum(counts_aggregated)}" ) - print( - f"\n Average tokens/sec (total): {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f} \ - \nAverage tokens/sec (first token): {torch.mean(torch.tensor(aggregate_metrics['first_token_per_sec'])).item():.2f} \ - \nAverage tokens/sec (next tokens): {torch.mean(torch.tensor(aggregate_metrics['next_tokens_per_sec'])).item():.2f} \n\ + avg_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["tokens_per_sec"]) + ).item() + avg_first_token_sec = torch.mean( + torch.tensor(aggregate_metrics["first_token_per_sec"]) + ).item() + avg_next_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["next_tokens_per_sec"]) + ).item() + + if not ( + torch.isnan(torch.tensor(avg_tokens_sec)) + or torch.isnan(torch.tensor(avg_first_token_sec)) + or torch.isnan(torch.tensor(avg_next_tokens_sec)) + ): + print( + f"\nWarning: Excluding compile in calculations \ + \n Average tokens/sec (total): {avg_tokens_sec:.2f} \ + \nAverage tokens/sec (first token): {avg_first_token_sec:.2f} \ + \nAverage tokens/sec (next tokens): {avg_next_tokens_sec:.2f} \n\ " - ) + ) if torch.cuda.is_available(): print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + if torch.xpu.is_available(): + print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB") -def _launch_distributed_inference( - builder_args: BuilderArgs, -): - from torch.distributed import launcher - from torch.distributed.elastic.utils.distributed import get_free_port - print("Launching distributed inference within generator") +class DistributedGenerator(LocalGenerator): + def __init__( + self, + builder_args: BuilderArgs, + speculative_builder_args: BuilderArgs, + tokenizer_args: TokenizerArgs, + generator_args: GeneratorArgs, + profile: Optional[Path], + quantize: bool, + draft_quantize: bool, + ): + + is_speculative = speculative_builder_args.checkpoint_path is not None + assert is_speculative == False, "Distributed inference with pp > 1 does not support speculative inference yet." 
+ super().__init__( + builder_args, + speculative_builder_args, + tokenizer_args, + generator_args, + profile, + quantize, + draft_quantize, + ) + self.rank = dist.get_rank() + # Assuming same number of GPUs per node + self.device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") + def distributed_input(prompt: str) -> str: + if dist.get_rank() == 0: + text = [input(prompt)] + else: + text = [None] + + dist.broadcast_object_list(text) + return text[0] -def main(args): + self.get_user_input: Callable = distributed_input + + if builder_args.pp > 1: + self.seqlen_prefill = 1024 # sequence length for prefill stage + + logger.warn(f"{color.yellow}Pipeline parallelism is still experimental and might be slow{color.reset}") + pp_mesh = self.model.device_mesh["pp"] + + self.pp_rank = pp_mesh.get_local_rank() + self.pp_group = pp_mesh.get_group() + + self.pp_degree = pp_mesh.size() + + # Convenience variables + self.first_pp_rank = 0 + self.last_pp_rank = self.pp_degree - 1 + + + self.first_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.first_pp_rank) + self.last_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.last_pp_rank) + + self.prefiller = self.create_prefill_stage() + self.decoder = self.create_decode_stage() + + def __del__(self): + dist.destroy_process_group() + + # Helper function to get example inputs and outputs for the stages. + def get_example_ins_outs(self, batch_size: int , seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + This function generates example inputs and outputs for the prefill and decode stages. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing the example inputs and outputs. + """ + model_dtype = torch.bfloat16 + mb_ids = torch.randint( + 0, self.model.config.vocab_size, (batch_size, seqlen), device=self.device + ) + activation = torch.rand( + batch_size, seqlen, self.model.config.dim, device=self.device, dtype=model_dtype + ) + logits = torch.rand( + batch_size, seqlen, self.model.config.vocab_size, device=self.device, dtype=model_dtype + ) + example_inputs = (mb_ids if self.pp_rank == self.first_pp_rank else activation,) + example_outputs = (logits if self.pp_rank == self.last_pp_rank else activation,) + return example_inputs, example_outputs + + def create_prefill_stage(self): + """ + Creates a pipeline stage for prefilling. + + Returns: + PipelineStage: The created pipeline stage. + """ + batch_size = 1 + + # Create prefill stage + logger.debug(f"Creating pipeline stage for prefill {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, self.seqlen_prefill) + prefill_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + + # Create schedule + # Number of micro-batches for the schedule is 1, because each step() call we + # only push 1 micro-batch into the pipeline. But we can continuously push + # new micro-batches into the pipeline as they arrive, achieving same + # pipelining effect. + prefiller = ScheduleGPipe(prefill_stage, 1) + return prefiller + + def create_decode_stage(self): + """ + Creates a decode stage for the pipeline parallelism. + + Returns: + ScheduleGPipe: The decode stage. 
+ """ + # seqlen = 1 now + seqlen_decode = 1 + batch_size = 1 + + # Create decode stage + # logger.info(f"Creating pipeline stage for decode {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, seqlen_decode) + decode_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + # create schedule + decoder = ScheduleGPipe(decode_stage, 1) + + return decoder + + def prefill( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + *, + sequential_prefill=True, + **sampling_kwargs, + ) -> torch.Tensor: + """ + This function is used to prefill the model with a given prompt. For pipeline parallelism we need to pad the input. + + Returns: + torch.Tensor: The prefilled tensor. + """ + if self.builder_args.pp == 1: + return super().prefill( + model, + x, + input_pos, + batch, + sequential_prefill=sequential_prefill, + **sampling_kwargs, + ) + + pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.eos_id + prompt_length = x.size(1) + + padded_seq = torch.full( + (1, self.seqlen_prefill), pad_token_id, dtype=torch.int64, device=self.device + ) + padded_seq[:,:prompt_length] = x + input_pos = torch.arange( + self.seqlen_prefill, + device=self.device, + dtype=torch.int, + ) + + # Prefill phase + # Run context input through pipeline + # TODO: we need to pass `input_pos` and `cache_lane` to each stage. + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + + if self.pp_rank == self.first_pp_rank: + logits = self.prefiller.step(padded_seq, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.prefiller.step(**kwargs) + else: # middle pp ranks + self.prefiller.step(**kwargs) + + if self.pp_rank == self.last_pp_rank: + new_token = self.sample(logits[:,:prompt_length], need_probs=False, **sampling_kwargs)[0] + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + + return new_token + + def decode_one_token( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + need_probs: bool, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + **sampling_kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Decodes a single token. + + # TODO: implement speculative decoding with pp>1 + Returns: + Tuple[torch.Tensor, None]: A tuple containing the decoded token and None. 
+ """ + if self.builder_args.pp == 1: + return super().decode_one_token( + model, + x, + input_pos, + need_probs, + batch=batch, + **sampling_kwargs, + ) + + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + + new_token = x.view(1, -1) + + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + # Run data through pipeline + if self.pp_rank == self.first_pp_rank: + logits = self.decoder.step(new_token, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.decoder.step(**kwargs) + else: # middle pp ranks + self.decoder.step(**kwargs) + + # Decode the output + if self.pp_rank == self.last_pp_rank: + new_token, _ = self.sample(logits, need_probs=need_probs, **sampling_kwargs) + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + #TODO: Why do we get 2d tensor here? + new_token=new_token[0] + return new_token, None + + def sample( + self, + logits, + need_probs: bool, + temperature: float = 0, + top_k: Optional[int] = None, + ): + if temperature == 0 and not need_probs: + _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + return (idx_next, None) + probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + idx_next = self.multinomial_sample_one_no_sync(probs) + + return idx_next, probs + + +def run_generator( + args, + rank: Optional[int] =None + ): + """ + This function creates and executes a generator + """ builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) - generator_args = GeneratorArgs.from_args(args) - if not builder_args.distributed: + generator_args = GeneratorArgs.from_args(args) + #Setup rank 1 and up to suppress log messages and print messages + if builder_args.distributed and rank != 0: + logger.setLevel(logging.CRITICAL) + context = contextlib.redirect_stdout(None) + else: + context = contextlib.nullcontext() + + with context: + Generator = DistributedGenerator if builder_args.distributed else LocalGenerator + logger.debug("GeneratorArgs: %s", generator_args) gen = Generator( builder_args, speculative_builder_args, @@ -1233,23 +1627,25 @@ def main(args): ) if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats() + if torch.xpu.is_available(): + torch.xpu.reset_peak_memory_stats() for _ in gen.chat(generator_args): pass - else: - dist_gen = DistributedGenerator( - args.model, - builder_args, - tokenizer_args, - generator_args, - args.profile, - args.quantize, - args.draft_quantize, - ) - - response = "" - for output in dist_gen.generate(generator_args.prompt): - response += output.text - print(f"Model output: {response}") - dist_gen.shutdown() +def main(args): + builder_args = BuilderArgs.from_args(args) + + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + + ctx = mp.get_context('spawn') + with futures.ProcessPoolExecutor(max_workers=world_size-1, mp_context=ctx) as executor: + for i in range(1,world_size): + fn = partial(run_generator, args, i) + executor.submit(run_in_dist_env, world_size, i, fn) + #Starting rank 0 + fn = partial(run_generator, args, 0) + run_in_dist_env(world_size, 0, fn) + else: + run_generator(args) diff --git a/torchchat/model.py b/torchchat/model.py index 2a3b9f12f..ce7dcb5e4 100644 --- 
a/torchchat/model.py +++ b/torchchat/model.py @@ -4,6 +4,7 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import json +import logging import os import warnings from abc import ABC, abstractmethod @@ -48,6 +49,8 @@ config_path = Path(f"{str(Path(__file__).parent)}/model_params") +logger = logging.getLogger(__name__) + class QuickGELUActivation(nn.Module): """ @@ -273,6 +276,7 @@ class TransformerArgs: # Select the desired tokenizer. Defaults to sentencepiece use_tiktoken: bool = False use_hf_tokenizer: bool = False + tokenizer_prepend_bos: bool = True max_seq_length: int = 8192 rope_scaling: Optional[Dict[str, Any]] = None # For pipeline parallel @@ -283,6 +287,11 @@ class TransformerArgs: feed_forward_bias: bool = False # Whether or not to tie the input word embeddings to the output tie_word_embeddings: bool = False + # Granite architecture multipliers + embedding_multiplier: Optional[float] = None + attention_multiplier: Optional[float] = None + residual_multiplier: Optional[float] = None + logits_scaling: Optional[float] = None def __post_init__(self): if self.n_local_heads == -1: @@ -330,6 +339,7 @@ class ModelArgs: transformer_args: Dict[str, Dict[str, Any]] use_tiktoken: bool use_hf_tokenizer: bool + tokenizer_prepend_bos: bool def __init__( self, @@ -337,6 +347,7 @@ def __init__( model_type: ModelType = ModelType.TextOnly, use_tiktoken: bool = False, use_hf_tokenizer: bool = False, + tokenizer_prepend_bos: bool = True, ) -> None: self._sanity_check(transformer_args, model_type) @@ -346,6 +357,7 @@ def __init__( # Model-level attributes self.use_tiktoken = use_tiktoken self.use_hf_tokenizer = use_hf_tokenizer + self.tokenizer_prepend_bos = tokenizer_prepend_bos def _sanity_check( self, @@ -373,7 +385,14 @@ def from_params(cls, params_path): use_tiktoken = loaded_params.get("use_tiktoken", False) use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False) - return cls(transformer_args, model_type, use_tiktoken, use_hf_tokenizer) + tokenizer_prepend_bos = loaded_params.get("tokenizer_prepend_bos", True) + return cls( + transformer_args=transformer_args, + model_type=model_type, + use_tiktoken=use_tiktoken, + use_hf_tokenizer=use_hf_tokenizer, + tokenizer_prepend_bos=tokenizer_prepend_bos, + ) @classmethod def from_table(cls, name: str): @@ -477,7 +496,9 @@ def build_model(self) -> nn.Module: for name, module_class in recipe.modules.items(): config_args = self.config.transformer_args[name] if module_class == Transformer: - modules[name] = module_class(TransformerArgs.from_params(config_args)) + transformer_args = TransformerArgs.from_params(config_args) + logger.debug("Transformer Args: %s", transformer_args) + modules[name] = module_class(transformer_args) else: modules[name] = module_class(**config_args) @@ -636,7 +657,7 @@ def __init__(self, config: TransformerArgs) -> None: self.layers[str(layer_id)] = TransformerBlock(config) if config.stage_idx == config.n_stages - 1: - self.norm = RMSNorm(config.dim, eps=config.norm_eps) + self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps) self.output = nn.Linear(config.dim, config.vocab_size, bias=False) if config.tie_word_embeddings: self.output.weight = self.tok_embeddings.weight @@ -707,6 +728,10 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int if self.tok_embeddings: x = self.tok_embeddings(x) + # For Granite architectures + if self.config.embedding_multiplier: + x = x * self.config.embedding_multiplier + for _, layer in 
self.layers.items(): x = layer(x, input_pos, freqs_cis, mask, cache_lane=cache_lane) @@ -714,6 +739,9 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int x = self.norm(x) if self.output: x = self.output(x) + # For granite architectures + if self.config.logits_scaling: + x = x / self.config.logits_scaling # print(f"output shape: {x.shape}") return x @@ -723,8 +751,14 @@ def __init__(self, config: TransformerArgs) -> None: super().__init__() self.attention = Attention(config) self.feed_forward = FeedForward(config) - self.ffn_norm = RMSNorm(config.dim, config.norm_eps) - self.attention_norm = RMSNorm(config.dim, config.norm_eps) + self.ffn_norm = nn.RMSNorm(config.dim, config.norm_eps) + self.attention_norm = nn.RMSNorm(config.dim, config.norm_eps) + # None for llama architecture, set for granite architectures + self.residual_multiplier = ( + config.residual_multiplier + if config.residual_multiplier is not None + else 1.0 + ) def distribute(self, device_mesh: DeviceMesh): self.attention.distribute(device_mesh) @@ -735,8 +769,8 @@ def forward( ) -> Tensor: h = x + self.attention( self.attention_norm(x), freqs_cis, mask, input_pos, cache_lane=cache_lane - ) - out = h + self.feed_forward(self.ffn_norm(h)) + ) * self.residual_multiplier + out = h + self.feed_forward(self.ffn_norm(h)) * self.residual_multiplier return out @@ -763,6 +797,7 @@ def __init__(self, config: TransformerArgs): self.head_dim = config.head_dim self.n_local_heads = config.n_local_heads self.dim = config.dim + self.attention_scale = config.attention_multiplier self._register_load_state_dict_pre_hook(self.load_hook) def setup_cache(self, max_batch_size, max_seq_length, cache_lanes: int = 1): @@ -859,7 +894,16 @@ def forward( k = k.repeat_interleave(self.n_heads // self.n_local_heads, dim=1) v = v.repeat_interleave(self.n_heads // self.n_local_heads, dim=1) - y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + y = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=mask, + dropout_p=0.0, + # This is None (default) for llama architecture and set for granite + # architectures + scale=self.attention_scale, + ) # -1 = self.dim y = y.transpose(1, 2).contiguous().view(bsz, seqlen, -1) @@ -884,20 +928,6 @@ def forward(self, x: Tensor) -> Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) -class RMSNorm(nn.Module): - def __init__(self, dim: int, eps: float = 1e-5): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) - - def forward(self, x: Tensor) -> Tensor: - output = self._norm(x.float()).type_as(x) - return output * self.weight - - def apply_scaling(freqs: torch.Tensor, rope_scaling: Dict[str, Any]): # Check for the presence of the required keys required_keys = { @@ -981,7 +1011,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: # For quantized_decomposed ops from executorch.kernels import quantized # no-qa # For llama::sdpa_with_kv_cache.out, preprocess ops - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa + from executorch.extension.llm.custom_ops import custom_ops # no-qa class PTEModel(nn.Module): def __init__(self, config, path) -> None: @@ -1018,5 +1048,6 @@ def forward(self, x, input_pos): def setup_caches(self, max_batch_size, max_seq_length): pass -except: +except Exception as e: + print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}") pass diff 
--git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index 2d3dfcbeb..3c2161b9b 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -51,6 +51,12 @@ "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "transformer_params_key": "Meta-Llama-3.1-8B" }, + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { + "aliases": ["deepseek-r1:8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "tokenizer_file": "tokenizer.json" + }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "aliases": ["llama3.1-70b"], "distribution_channel": "HuggingFaceSnapshot", @@ -164,5 +170,47 @@ "https://github.com/karpathy/llama2.c/raw/master/tokenizer.model" ], "checkpoint_file": "stories110M.pt" + }, + "ibm-granite/granite-3b-code-instruct-128k": { + "aliases": ["granite-code", "granite-code-3b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3b-code-instruct-128k", + "transformer_params_key": "Granite-3B-Code", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-8b-code-instruct-128k": { + "aliases": ["granite-code-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-8b-code-instruct-128k", + "transformer_params_key": "Granite-8B-Code", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-2b-instruct": { + "aliases": ["granite3-2b", "granite3"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-2b-instruct", + "transformer_params_key": "Granite-3.0-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-8b-instruct": { + "aliases": ["granite3-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-8b-instruct", + "transformer_params_key": "Granite-3.0-8B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-2b-instruct": { + "aliases": ["granite3.1-2b", "granite3.1"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-2b-instruct", + "transformer_params_key": "Granite-3.1-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-8b-instruct": { + "aliases": ["granite3.1-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-8b-instruct", + "transformer_params_key": "Granite-3.1-8B-Instruct", + "tokenizer_file": "tokenizer.json" } } diff --git a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json new file mode 100644 index 000000000..b9fa79cd2 --- /dev/null +++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json @@ -0,0 +1 @@ +{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}} diff --git a/torchchat/model_params/Granite-3.0-2B-Instruct.json b/torchchat/model_params/Granite-3.0-2B-Instruct.json new file mode 100644 index 000000000..1e9779cb3 --- /dev/null +++ b/torchchat/model_params/Granite-3.0-2B-Instruct.json @@ -0,0 +1,21 @@ +{ + "block_size": 8192, + "dim": 2048, + "hidden_dim": 8192, + "n_heads": 32, + 
"n_local_heads": 8, + "n_layers": 40, + "rope_base": 10000, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": false, + "feed_forward_bias": false, + "tie_word_embeddings": true, + "embedding_multiplier": 12.0, + "attention_multiplier": 0.015625, + "residual_multiplier": 0.22, + "logits_scaling": 8.0 +} diff --git a/torchchat/model_params/Granite-3.0-8B-Instruct.json b/torchchat/model_params/Granite-3.0-8B-Instruct.json new file mode 100644 index 000000000..35db0f90d --- /dev/null +++ b/torchchat/model_params/Granite-3.0-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-2B-Instruct.json b/torchchat/model_params/Granite-3.1-2B-Instruct.json new file mode 100644 index 000000000..1e82036ab --- /dev/null +++ b/torchchat/model_params/Granite-3.1-2B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "dim": 2048, + "block_size": 8192, + "hidden_dim": 8192, + "logits_scaling": 8.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 5000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-8B-Instruct.json b/torchchat/model_params/Granite-3.1-8B-Instruct.json new file mode 100644 index 000000000..646340580 --- /dev/null +++ b/torchchat/model_params/Granite-3.1-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3B-Code.json b/torchchat/model_params/Granite-3B-Code.json new file mode 100644 index 000000000..0654a8f2c --- /dev/null +++ b/torchchat/model_params/Granite-3B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 2560, + "hidden_dim": 10240, + "n_heads": 32, + "n_local_heads": 32, + "n_layers": 32, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file diff --git a/torchchat/model_params/Granite-8B-Code.json b/torchchat/model_params/Granite-8B-Code.json new file mode 100644 index 000000000..079a32070 --- /dev/null +++ b/torchchat/model_params/Granite-8B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 4096, + "hidden_dim": 14336, + "n_heads": 32, + "n_local_heads": 8, + "n_layers": 
36, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file diff --git a/torchchat/quant_config/cuda-32.json b/torchchat/quant_config/cuda-32.json new file mode 100644 index 000000000..90c37250a --- /dev/null +++ b/torchchat/quant_config/cuda-32.json @@ -0,0 +1,5 @@ +{ + "executor": {"accelerator": "cuda"}, + "precision": {"dtype": "bf16"}, + "linear:int4": {"groupsize" : 32} +} diff --git a/torchchat/quant_config/mobile-32.json b/torchchat/quant_config/mobile-32.json new file mode 100644 index 000000000..3afaa7542 --- /dev/null +++ b/torchchat/quant_config/mobile-32.json @@ -0,0 +1,4 @@ +{ + "embedding": {"bitwidth": 4, "groupsize" : 32}, + "linear:a8w4dq": {"groupsize" : 32} +} diff --git a/torchchat/usages/openai_api.py b/torchchat/usages/openai_api.py index 99fd82fe8..0d1d3dce7 100644 --- a/torchchat/usages/openai_api.py +++ b/torchchat/usages/openai_api.py @@ -13,7 +13,7 @@ from dataclasses import dataclass from io import BytesIO from pwd import getpwuid -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Type import torch @@ -24,7 +24,7 @@ from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchchat.cli.download import is_model_downloaded, load_model_configs -from torchchat.generate import Generator, GeneratorArgs +from torchchat.generate import LocalGenerator, DistributedGenerator, GeneratorArgs from torchchat.model import FlamingoModel from torchchat.utils.build_utils import device_sync @@ -180,7 +180,10 @@ class CompletionRequest: user: Optional[str] = None # unimplemented def __post_init__(self): - self.stream = bool(self.stream) + if isinstance(self.stream, str): + self.stream = self.stream.lower() != "false" + else: + self.stream = bool(self.stream) @dataclass @@ -267,7 +270,7 @@ class CompletionResponseChunk: usage: Optional[UsageStats] = None -class OpenAiApiGenerator(Generator): +class OpenAiApiGeneratorMixin: """A wrapper over the Generator class to interface with the OpenAI API. Implements endpoints for completion requests, both chunked and non-chunked using the dataclasses @@ -486,6 +489,15 @@ def _callback(self, x, *, buffer, done_generating): pass +def create_openai_api_generator(distributed: bool) -> Type: + """ + Factory method to create an OpenAiApiGenerator + """ + + # Base class order matters to make sure OpenAiApiGeneratorMixin overrides methods in DistributedGenerator and Generator + return type('OpenAiApiGenerator', (OpenAiApiGeneratorMixin, DistributedGenerator if distributed else LocalGenerator), {}) + + """ Helper functions for the OpenAI API Models endpoint. diff --git a/torchchat/usages/server.py b/torchchat/usages/server.py index 1fb76953b..550539a88 100644 --- a/torchchat/usages/server.py +++ b/torchchat/usages/server.py @@ -4,38 +4,89 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
+import atexit import json import logging logger = logging.getLogger(__name__) +from contextlib import nullcontext from dataclasses import asdict +from functools import partial +from os import environ from typing import Dict, List, Union import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from concurrent import futures from flask import Flask, request, Response from torchchat.cli.builder import BuilderArgs, TokenizerArgs +from torchchat.distributed.utils import run_in_dist_env from torchchat.generate import GeneratorArgs from torchchat.usages.openai_api import ( CompletionRequest, get_model_info_list, - OpenAiApiGenerator, + create_openai_api_generator, retrieve_model_info, ) OPENAI_API_VERSION = "v1" +def run_worker( + args, + rank, + queue, + ): + """ + This function creates and executes a generator + """ + gen = initialize_generator(args) + + while True: + try: + req = queue.get() + except KeyboardInterrupt: + break + + if req == "stop": + break + + for _ in gen.chunked_completion(req): + pass + def create_app(args): # noqa: C901 """ Creates a flask app that can be used to serve the model as a chat API. """ app = Flask(__name__) - gen: OpenAiApiGenerator = initialize_generator(args) + builder_args = BuilderArgs.from_args(args) + procs = [] + queue = None + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + mp_context = mp.get_context('spawn') + queue = mp_context.Queue() + + for i in range(1, world_size): + fn = partial(run_worker, args, i, queue) + mp_context = mp.get_context('spawn') + procs.append(mp_context.Process(target=run_in_dist_env, args=(world_size, i, fn))) + procs[-1].start() + + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(0) + environ["LOCALRANK"] = str(0) + + gen = initialize_generator(args) def _del_none(d: Union[Dict, List]) -> Union[Dict, List]: """Recursively delete None values from a dictionary.""" @@ -69,6 +120,10 @@ def chat_endpoint(): if req.stream: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + def chunk_processor(chunked_completion_generator): """Inline function for postprocessing CompletionResponseChunk objects. 
@@ -86,8 +141,11 @@ def chunk_processor(chunked_completion_generator): ) return resp else: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + response = gen.sync_completion(req) - print(response.choices[0].message.content) return json.dumps(_del_none(asdict(response))) @@ -102,16 +160,18 @@ def models_retrieve_endpoint(model_id): else: return "Model not found", 404 - return app + return app, (procs, queue) -def initialize_generator(args) -> OpenAiApiGenerator: +def initialize_generator(args) -> type: builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) generator_args = GeneratorArgs.from_args(args) generator_args.chat_mode = False + OpenAiApiGenerator = create_openai_api_generator(builder_args.distributed) + return OpenAiApiGenerator( builder_args=builder_args, speculative_builder_args=speculative_builder_args, @@ -124,5 +184,19 @@ def initialize_generator(args) -> OpenAiApiGenerator: def main(args): - app = create_app(args) + app, (procs, queue) = create_app(args) + + def shutdown_worker(): + while not queue.empty(): + queue.get(block=False) + for p in procs: + queue.put("stop") + for p in procs: + p.join(timeout=0.5) + for p in procs: + if p.is_alive(): + p.kill() + + atexit.register(shutdown_worker) + app.run() diff --git a/torchchat/utils/build_utils.py b/torchchat/utils/build_utils.py index 2685ec2f3..a0862ff94 100644 --- a/torchchat/utils/build_utils.py +++ b/torchchat/utils/build_utils.py @@ -231,6 +231,8 @@ def find_multiple(n: int, k: int) -> int: def device_sync(device="cpu"): if "cuda" in device: torch.cuda.synchronize(device) + elif "xpu" in device: + torch.xpu.synchronize(device) elif ("cpu" in device) or ("mps" in device): pass else: @@ -279,7 +281,8 @@ def get_device_str(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return device else: @@ -291,7 +294,8 @@ def get_device(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return torch.device(device) diff --git a/torchchat/utils/device_info.py b/torchchat/utils/device_info.py index 9c5953944..950c03002 100644 --- a/torchchat/utils/device_info.py +++ b/torchchat/utils/device_info.py @@ -14,7 +14,7 @@ def get_device_info(device: str) -> str: """Returns a human-readable description of the hardware based on a torch.device.type Args: - device: A torch.device.type string: one of {"cpu", "cuda"}. + device: A torch.device.type string: one of {"cpu", "cuda", "xpu"}. Returns: str: A human-readable description of the hardware or an empty string if the device type is unhandled. 
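Several of the hunks above extend torchchat's device handling to Intel GPUs: `device_sync()` gains an `xpu` branch, and the default device resolution now tries `cuda`, then `mps`, then `xpu`, before falling back to `cpu`. The condensed sketch below mirrors that selection order; `pick_device`, `sync`, and `has_mps` are illustrative names written for this note, not torchchat APIs, and the `hasattr` guard is an extra precaution for older PyTorch builds.

```python
# Sketch of the accelerator auto-selection and sync order implied by the
# build_utils.py changes above (cuda -> mps -> xpu -> cpu).
import torch


def has_mps() -> bool:
    # Stand-in for torchchat's is_mps_available() helper.
    return torch.backends.mps.is_available()


def pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if has_mps():
        return "mps"
    # XPU (Intel GPU) is the branch newly added in this patch.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"


def sync(device: str) -> None:
    """Synchronize the chosen accelerator; cpu and mps are no-ops, matching device_sync()."""
    if "cuda" in device:
        torch.cuda.synchronize(device)
    elif "xpu" in device:
        torch.xpu.synchronize(device)
    # cpu and mps: nothing to do


if __name__ == "__main__":
    dev = pick_device()
    print("fast device:", dev)
    sync(dev)
```

Keeping `cpu` and `mps` as no-ops in `sync` matches `device_sync()`, which only synchronizes the asynchronous cuda and xpu backends.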
@@ -37,4 +37,13 @@ def get_device_info(device: str) -> str: ) if device == "cuda": return torch.cuda.get_device_name(0) + if device == "xpu": + return ( + check_output( + ["xpu-smi discovery |grep 'Device Name:'"], shell=True + ) + .decode("utf-8") + .split("\n")[0] + .split("Device Name:")[1] + ) return "" diff --git a/torchchat/utils/docs/evaluation.md b/torchchat/utils/docs/evaluation.md index 490500223..77414eeb4 100644 --- a/torchchat/utils/docs/evaluation.md +++ b/torchchat/utils/docs/evaluation.md @@ -4,8 +4,13 @@ # Evaluation Features + Torchchat provides evaluation functionality for your language model on a variety of tasks using the @@ -14,26 +19,65 @@ library. ## Usage -The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, it will default to evaluating on "wikitext". +The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext". + +## Examples -**Examples** +### Evaluation example with model in Python environment Running wikitext for 10 iterations ``` python3 torchchat.py eval stories15M --tasks wikitext --limit 10 ``` -Running an exported model +Running wikitext with torch.compile for 10 iterations +``` +python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10 +``` + +Running multiple tasks with torch.compile for evaluation and prefill: +``` +python3 torchchat.py eval stories15M --compile --compile-prefill --tasks wikitext hellaswag +``` + +### Evaluation with model exported to PTE with ExecuTorch + +Running an exported model with ExecuTorch (as PTE). Advantageously, because you can +load an exported PTE model back into the Python environment with torchchat, +you can run evaluation on the exported model! ``` python3 torchchat.py export stories15M --output-pte-path stories15M.pte python3 torchchat.py eval stories15M --pte-path stories15M.pte ``` -Running multiple tasks and calling eval.py directly: +Running multiple tasks directly on the created PTE mobile model: +``` +python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag +``` + +Now let's evaluate the effect of quantization on evaluation results by exporting with quantization using `--quantize` and an exemplary quantization configuration: ``` +python3 torchchat.py export stories15M --output-pte-path stories15M.pte --quantize torchchat/quant_config/mobile.json python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag ``` +Now try your own export options to explore different trade-offs between model size, evaluation speed and accuracy using model quantization! + +### Evaluation with model exported to DSO with AOT Inductor (AOTI) + +Running an exported model with AOT Inductor (DSO model). Advantageously, because you can +load an exported DSO model back into the Python environment with torchchat, +you can run evaluation on the exported model! 
+``` +python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so +python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so +``` + +Running multiple tasks with AOTI: +``` +python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag +``` + For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). [end default]: end diff --git a/torchchat/utils/gguf_loader.py b/torchchat/utils/gguf_loader.py index 309ff807c..9e7b73b50 100644 --- a/torchchat/utils/gguf_loader.py +++ b/torchchat/utils/gguf_loader.py @@ -24,6 +24,8 @@ pack_scales_and_zeros, ) +from torchao.dtypes.utils import is_device + logger: logging.Logger = logging.getLogger(__name__) @@ -122,12 +124,13 @@ def linear_int4(input, weight_int4pack, scales_and_zeros, out_features, groupsiz input.dtype ) # cast back to input.dtype else: - c = torch.ops.aten._weight_int4pack_mm( + c = torch.ops.aten._weight_int4pack_mm_for_cpu( input, weight_int4pack, groupsize, scales_and_zeros, ) + new_shape = origin_input_size[:-1] + (out_features,) c = c.reshape(new_shape) return c @@ -178,16 +181,27 @@ def __init__( ), "must specify both weights and scales_and_zeros, or neither" if weight is None: - weight = torch.empty( - ( - out_features // 8, - in_features // (inner_k_tiles * 16), - 32, - inner_k_tiles // 2, - ), - dtype=torch.int32, - device=device, - ) + if is_device(device, "cpu"): + weight = torch.empty( + ( + out_features, + in_features // 2, + ), + dtype=torch.uint8, + device=device, + ) + else: + weight = torch.empty( + ( + out_features // 8, + in_features // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), + dtype=torch.int32, + device=device, + ) + scales_and_zeros = torch.empty( (in_features // groupsize, out_features, 2), dtype=get_precision(), @@ -223,12 +237,17 @@ def _prepare_weight_and_scales_and_zeros( weight_int32, scales_and_zeros = group_quantize_tensor( weight_bf16, n_bit=4, groupsize=groupsize ) - weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( - torch.uint8 - ) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - weight_uint8, inner_k_tiles - ) + if is_device(weight_int32.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + weight_int32, inner_k_tiles + ) + else: + weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( + torch.uint8 + ) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + weight_uint8, inner_k_tiles + ) return weight_int4pack, scales_and_zeros @classmethod @@ -570,6 +589,7 @@ def load_model_and_state_dict( load_state_dict: bool = True, load_as_quantized: bool = True, inner_k_tiles=8, + device="cpu", ) -> torch.nn.Module: """ Parses the GGUF file and returns an nn.Module on meta device along with a state_dict @@ -608,10 +628,15 @@ def load_model_and_state_dict( if load_state_dict: q, s, z = Q4_0.unpack(t) scales_and_zeros = pack_scales_and_zeros(s, z) - q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - q_uint8, inner_k_tiles - ) + if is_device(q.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + q, inner_k_tiles + ) + else: + q_tmp = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + q_tmp, inner_k_tiles + ) state_dict[f"{fqn}.weight"] = weight_int4pack 
state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros @@ -623,7 +648,7 @@ def load_model_and_state_dict( in_features=in_features, out_features=out_features, bias=False, - device="meta", + device="cpu", groupsize=Q4_0.groupsize, inner_k_tiles=inner_k_tiles, ), diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 31c639dfd..933bc1b9e 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -26,7 +26,7 @@ # from functools import reduce # from math import gcd -from typing import Dict, Optional, Callable, Any, List +from typing import Any, Callable, Dict, List, Optional import torch import torch.nn as nn @@ -37,6 +37,7 @@ from torchao.quantization.quant_api import ( int4_weight_only, Int4WeightOnlyQuantizer, + int8_weight_only, Int8DynActInt4WeightQuantizer, quantize_, ) @@ -45,8 +46,8 @@ find_multiple, get_device_str, get_precision, - set_precision, name_to_dtype, + set_precision, state_dict_device, use_et_backend, ) @@ -60,28 +61,36 @@ import inspect + def get_named_parameters(func: Callable) -> List[str]: # Get the signature of the function signature = inspect.signature(func) - + # Extract the parameters from the signature parameters = signature.parameters - + # Filter and return named parameters named_params = [ - name for name, param in parameters.items() - if param.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) + name + for name, param in parameters.items() + if param.kind + in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) ] return named_params -def validate_args(named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None) -> Dict[str, Any]: + +def validate_args( + named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None +) -> Dict[str, Any]: for key in q_kwargs.keys(): if key not in named_params: - print(f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring.") + print( + f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring." + ) del q_kwargs[key] return q_kwargs - - + + ######################################################################### ### torchchat quantization API ### @@ -110,21 +119,43 @@ def quantize_model( if quantizer not in quantizer_class_dict: raise RuntimeError(f"unknown quantizer {quantizer} specified") else: + ao_quant = True # Use tensor subclass API for int4 weight only. - if device == "cuda" and quantizer == "linear:int4": + if (device == "cuda" or device == "xpu") and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) + elif quantizer == "linear:int8": + print("quantizer is linear int8") + + # TODO: float16 quant via the AO quantize_() API seems broken. Remove this once the issue is resolved https://github.com/pytorch/ao/issues/1662 + if get_precision() == torch.float16: + print( + "model is float16 dtype - fallback to native implementation (see https://github.com/pytorch/ao/issues/1662)" + ) + ao_quant = False + else: + quantize_(model, int8_weight_only()) + else: + ao_quant = False + if ao_quant: if not support_tensor_subclass: unwrap_tensor_subclass(model) continue - + if quantizer in ["linear:a8wxdq", "embedding:wx"]: # These quantizers require float32 input weights. Note that after quantization, # the weights will no longer be float32, but lowbit integers if get_precision() != torch.float32: - print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. 
Note that after quantization, the weights will be lowbit integers, not float32.") + print( + f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32." + ) set_precision(torch.float32) - - # We set global precision from quantize options if it is specified at cli.py:485 + + if quantizer == "linear:afpwx" and device != "mps": + raise RuntimeError( + "linear:afpwx quantization can only run on mps device!" + ) + + # We set global precision from quantize options if it is specified at cli.py:485 # so the precision returned by get_precision() is always the authoritative precision/dtype in torchchat precision = get_precision() @@ -141,14 +172,19 @@ def quantize_model( model = quant_handler.quantize(model) - ######################################################################### ### QuantHandler API definition ### ### (unify with torchao in future) ### class QuantHandler: - def __init__(self, model: Optional[nn.Module] = None, device="cpu", precision=None, tokenizer=None): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -176,7 +212,15 @@ def quantize(self, model: nn.Module) -> nn.Module: class PrecisionHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, dtype): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + dtype, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -205,7 +249,15 @@ def quantized_model(self) -> nn.Module: class ExecutorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, accelerator): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + accelerator, + ): self.model_ = model if isinstance(accelerator, str): @@ -593,7 +645,7 @@ class WeightOnlyInt8QuantHandler(QuantHandler): def __init__( self, model: Optional[nn.Module] = None, - device = None, + device=None, precision=None, tokenizer=None, *, @@ -886,10 +938,10 @@ def quantized_model(self) -> nn.Module: # class references quantizer_class_dict = { "embedding": EmbeddingOnlyQuantHandler, - "linear:int8": WeightOnlyInt8QuantHandler, "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, + "linear:int8": WeightOnlyInt8QuantHandler, "linear:a8w4dq": Int8DynActInt4WeightQuantizer, } @@ -915,10 +967,12 @@ def quantized_model(self) -> nn.Module: from torchao_experimental_quant_api import ( Int8DynActIntxWeightLinearQuantizer, IntxWeightEmbeddingQuantizer, + UIntxWeightOnlyLinearQuantizer, ) quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer + quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op try: @@ -927,16 +981,19 @@ def quantized_model(self) -> nn.Module: libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*") libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) torch.ops.load_library(libs[0]) + print("Loaded torchao cpu ops.") + except Exception as e: + print( + "Unable to load torchao cpu ops library. Slow fallback kernels will be used." 
+ ) + + try: + libname = "libtorchao_ops_mps_aten.dylib" + libpath = f"{torchao_build_path}/cmake-out/lib/{libname}" + torch.ops.load_library(libpath) + print("Loaded torchao mps ops.") except Exception as e: - print("Failed to torchao ops library with error: ", e) - print("Slow fallback kernels will be used.") + print("Unable to load torchao mps ops library.") except Exception as e: - class ErrorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None): - global torchao_experimental_load_error - raise Exception(f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}") - - torchao_experimental_load_error = e - quantizer_class_dict["linear:a8wxdq"] = ErrorHandler - quantizer_class_dict["embedding:wx"] = ErrorHandler + print("Unable to import torchao experimental quant_api with error: ", e) diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 3c2c1c846..e2b8b4fc0 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -64,7 +64,7 @@ fi pushd ${TORCHCHAT_ROOT} -git submodule update --init +git submodule update --init --recursive git submodule sync if [[ "$TARGET" == "et" ]]; then if [ ! -d "${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install" ]; then @@ -93,7 +93,7 @@ popd if [[ "$TARGET" == "et" ]]; then cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja else - cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja + cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja fi cmake --build ./cmake-out --target "${TARGET}"_run diff --git a/torchchat/utils/scripts/build_torchao_ops.sh b/torchchat/utils/scripts/build_torchao_ops.sh index a8fd8bea2..46e2479ac 100644 --- a/torchchat/utils/scripts/build_torchao_ops.sh +++ b/torchchat/utils/scripts/build_torchao_ops.sh @@ -5,12 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +device=${1:-cpu} +if [[ "$device" != "cpu" && "$device" != "mps" ]]; then + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." 
>&2 + exit 1 +fi source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" pushd ${TORCHCHAT_ROOT} find_cmake_prefix_path clone_torchao -install_torchao_aten_ops +install_torchao_aten_ops "$device" popd diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 84966cc35..57dcc77bf 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -88,7 +88,7 @@ install_executorch_python_libs() { echo "Building and installing python libraries" if [ "${ENABLE_ET_PYBIND}" = false ]; then echo "Not installing pybind" - bash ./install_requirements.sh + bash ./install_requirements.sh --pybind off else echo "Installing pybind" bash ./install_requirements.sh --pybind xnnpack @@ -184,8 +184,18 @@ clone_torchao() { } install_torchao_aten_ops() { - echo "Building torchao custom ops for ATen" - pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + local device=${1:-cpu} + + if [[ "$device" == "cpu" ]]; then + echo "Building torchao custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + elif [[ "$device" == "mps" ]]; then + echo "Building torchao mps custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental/ops/mps + else + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." >&2 + return 1 + fi CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \ diff --git a/torchchat/utils/scripts/updown.py b/torchchat/utils/scripts/updown.py index 86ebf803f..306e5855b 100644 --- a/torchchat/utils/scripts/updown.py +++ b/torchchat/utils/scripts/updown.py @@ -267,6 +267,8 @@ def updown_processor( lines = file.readlines() print_flag = False + # Use bash; set it to fail on the first failing command + output("#! /bin/bash", replace_list=None, suppress_list=None) output("set -eou pipefail", replace_list=None, suppress_list=None) if create_sections:
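Tying the quantization pieces of this patch together: `quantize_model()` now routes some quantizers through torchao's tensor-subclass `quantize_()` API (int4 weight-only on cuda/xpu, int8 weight-only except for float16 models) and leaves the rest to torchchat's own handler classes. The sketch below condenses that dispatch decision; `apply_quantizer` is an illustrative helper written for this note, not a function in the patch.

```python
# Condensed view of the torchao-vs-handler dispatch in quantize_model().
import torch
import torch.nn as nn
from torchao.quantization.quant_api import (
    int4_weight_only,
    int8_weight_only,
    quantize_,
)


def apply_quantizer(
    model: nn.Module,
    quantizer: str,
    q_kwargs: dict,
    device: str,
    precision: torch.dtype,
) -> bool:
    """Return True if torchao's quantize_() handled the request, False if the
    caller should fall back to a torchchat QuantHandler class."""
    if device in ("cuda", "xpu") and quantizer == "linear:int4":
        # Tensor-subclass int4 weight-only quantization, as in the patch.
        quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
        return True
    if quantizer == "linear:int8":
        if precision == torch.float16:
            # float16 with int8_weight_only is currently broken in torchao
            # (pytorch/ao#1662), so the patch falls back to the native
            # WeightOnlyInt8QuantHandler in this case.
            return False
        quantize_(model, int8_weight_only())
        return True
    # Everything else (a8w4dq, embedding, precision, executor, ...) stays on
    # torchchat's own quantizer_class_dict handlers.
    return False
```

For example, the new `torchchat/quant_config/cuda-32.json` added above would reach this path with `quantizer="linear:int4"` and `q_kwargs={"groupsize": 32}`.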