From ce994b39275a3d5e1031f6afc068f0d32a747c01 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:10:31 -0800 Subject: [PATCH 1/7] Create run-readme-pr-linuxaarch64 (#1350) * Create run-readme-pr-linuxaarch64 Test torchchat on aarch64 linux * Rename run-readme-pr-linuxaarch64 to run-readme-pr-linuxaarch64.yml add yml extension. * Update ADVANCED-USERS.md Update doc to indicate testing for ARMv8/aarch64 on Linux/raspbian is introduced by this PR --------- Co-authored-by: Jack-Khuu --- .../workflows/run-readme-pr-linuxaarch64.yml | 124 ++++++++++++++++++ docs/ADVANCED-USERS.md | 4 +- 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/run-readme-pr-linuxaarch64.yml diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml new file mode 100644 index 000000000..1f920a12f --- /dev/null +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -0,0 +1,124 @@ +name: Run the README instructions - with stories - on Linux aarch64 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +jobs: + test-readme-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-quantization-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization + + test-gguf-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-advanced-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + 
echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index a8d02c2f9..17958e790 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -479,7 +479,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | |-----|------|-----|-----|-----|-----| | x86 | Linux | ✅ | ✅ | ✅ | ✅ | -| aarch64 | Linux | n/t | n/t | n/t | n/t | +| aarch64 | Linux | ✅ | ✅ | ✅ | n/t | | aarch64 | macOS | ✅ | ✅ | ✅ | ✅ | | AMD GPU | Linux | ✅ | ✅ | ✅ | ❌| | Nvidia GPU | Linux | ✅ | ✅ | ✅ | ❌| @@ -490,7 +490,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Mobile GPU (Vulkan) | Android | ❌|❌|❌| ✅ | | CoreML | iOS | ❌|❌|❌| ✅ | | Hexagon DSP | Android | ❌|❌|❌| ✅ | -| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t | ✅ | +| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ | ✅ | | Raspberry Pi 4/5 | Android | ❌ | ❌ | ❌ | n/t | | ARM 32b (up to v7) | any | ❌|❌|❌|❌| From 2b555b1bb41e76d8b5d01501fa762da69d14b16d Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:26:57 -0800 Subject: [PATCH 2/7] Bump test-readme-mps-macos timeout (#1451) --- .github/workflows/run-readme-pr-mps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 0d70a4c1d..7ab5b1558 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,7 +10,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 - timeout: 50 + timeout: 60 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos From 4c7183c2c348593c09818db6270bca1bb659b34f Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:02:49 -0800 Subject: [PATCH 3/7] Update torch/tune/vision pins to 1/19/25 (#1467) * Update install_requirements.sh * Update pytorch minor version * Update install_requirements.sh --- install/install_requirements.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 146e11096..264c3496d 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. 
-PYTORCH_NIGHTLY_VERSION=dev20241218 +PYTORCH_NIGHTLY_VERSION=dev20250119 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241218 +VISION_NIGHTLY_VERSION=dev20250119 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241218 +TUNE_NIGHTLY_VERSION=dev20250119 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly @@ -79,15 +79,15 @@ fi if [[ -x "$(command -v xpu-smi)" ]]; then REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0" + torchtune=="0.6.0" ) else REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" + torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}" ) fi From 24eb642bdd13d0a9ea9eb0055bd556a0b9640cd6 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:21:58 -0800 Subject: [PATCH 4/7] Add warning in PTEModel when not defined (#1468) * Add warning in PTEModel when not defined * Add missing parans --- torchchat/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index f50d2a8be..28429370c 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1062,5 +1062,6 @@ def forward(self, x, input_pos): def setup_caches(self, max_batch_size, max_seq_length): pass -except: +except Exception as e: + print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}") pass From 45cd239cb360663c2728e46df35841e0196de588 Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Wed, 22 Jan 2025 09:55:30 +0800 Subject: [PATCH 5/7] Add attention_backend as a configurable option (#1456) bump this into the constructor of BuilderArgs Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 13 +++++++++++++ torchchat/cli/cli.py | 7 +++++++ torchchat/generate.py | 7 ++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 69db14a4b..755817d1e 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -69,6 +69,7 @@ class BuilderArgs: prefill_possible: bool = False dynamic_shapes: bool = False max_seq_length: Optional[int] = None + attention_backend: str = "math" def __post_init__(self): if self.device is None: @@ -183,6 +184,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp = getattr(args, "pp", 1) tp = getattr(args, "tp", 1) chpt_from = getattr(args, "chpt_from", "hf") + sdp_backend_dict = { + 'math': torch.nn.attention.SDPBackend.MATH, + 'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION, + 'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + 'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + } + attention_backend = sdp_backend_dict[args.attention_backend] + if args.device == "cpu" and (args.attention_backend == "efficient_attention" + or args.attention_backend == "cudnn_attention"): + print(f"Warning: {args.attention_backend} is not supported on CPU. 
Using math instead.") + attention_backend = torch.nn.attention.SDPBackend.MATH return cls( checkpoint_dir=checkpoint_dir, checkpoint_path=checkpoint_path, @@ -207,6 +219,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), + attention_backend=attention_backend, ) @classmethod diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 723f25ea4..70f404635 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -179,6 +179,13 @@ def _add_model_config_args(parser, verb: str) -> None: choices=["fast", "cpu", "cuda", "mps", "xpu"], help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu", ) + model_config_parser.add_argument( + "--attention-backend", + type=str, + default="math", + choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"], + help="SDPBackend to use. Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION", + ) # Add CLI Args representing output paths of exported model files diff --git a/torchchat/generate.py b/torchchat/generate.py index 8ec4d4d5d..a596187f5 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -26,6 +26,7 @@ import torch.distributed as dist import torch.multiprocessing as mp from torch.distributed.pipelining import PipelineStage, ScheduleGPipe +from torch._C import _SDPBackend as SDPBackend from PIL import Image @@ -531,6 +532,7 @@ def decode_n_tokens( callback=lambda _: _, eos_token_id: int = 2, eot_id: Optional[int] = None, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, **sampling_kwargs, ): new_tokens, new_probs = [], [] @@ -539,7 +541,7 @@ def decode_n_tokens( num_new_tokens - 1 ): # -1 to save space to run an EoS if dont generate it naturally # Actually better for Inductor to codegen attention here - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): + with torch.nn.attention.sdpa_kernel([attention_backend]): out_token = cur_token.clone() next_token, next_prob = self.decode_one_token( @@ -683,6 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, + attention_backend: str = "math", seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: @@ -799,6 +802,7 @@ def generate( if self.is_llama3_model else None ), + attention_backend=attention_backend, **sampling_kwargs, ): generated_tokens.append(generated_token.view(-1)) @@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False): start_pos=start_pos, skip_cache_setup=not is_first_sample, max_seq_length=max_seq_length, + attention_backend=self.builder_args.attention_backend, ) for token_tensor, metrics in generator_func: if token_tensor is not None: From b2d8f2a501c39a038547d9f92982df3aa439e5e7 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:58:34 -0800 Subject: [PATCH 6/7] Update import of sdpa_with_kv_cache to custom_ops (#1470) --- torchchat/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index 28429370c..c01ff1262 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1025,7 +1025,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: # For quantized_decomposed ops from executorch.kernels import quantized # no-qa # For llama::sdpa_with_kv_cache.out, preprocess ops - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa + from executorch.extension.llm.custom_ops import 
custom_ops # no-qa class PTEModel(nn.Module): def __init__(self, config, path) -> None: From 025d41220ee505e6eb2346ab479db3e8402f92b3 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 18:56:16 -0800 Subject: [PATCH 7/7] Typo: Fix generate signature type hint for attention_backend (#1471) `attention_backend` is a SDPBackend, not a string --- torchchat/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index a596187f5..a06e215f4 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -685,7 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, - attention_backend: str = "math", + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor:
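
For reference, a minimal standalone sketch of how the attention-backend selection introduced in [PATCH 5/7] behaves. It mirrors the sdp_backend_dict added to BuilderArgs.from_args and the sdpa_kernel context manager now used in decode_n_tokens; the attend() helper and the tensor shapes are hypothetical and exist only for this illustration, and the public torch.nn.attention.SDPBackend alias is used here in place of the torch._C import that the patch adds to torchchat/generate.py.

# Illustrative sketch only (not part of the patches above); assumes a PyTorch
# build new enough to provide torch.nn.attention.sdpa_kernel, as the nightly
# pins in [PATCH 3/7] do.
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Same mapping as the sdp_backend_dict added in torchchat/cli/builder.py.
sdp_backend_dict = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

def attend(q, k, v, backend_name: str = "math"):
    # Hypothetical helper: restrict scaled_dot_product_attention to the chosen
    # kernel, the way Generator.decode_n_tokens now wraps its decode step with
    # the configurable attention_backend instead of a hard-coded MATH backend.
    with sdpa_kernel([sdp_backend_dict[backend_name]]):
        return F.scaled_dot_product_attention(q, k, v)

# Example shapes: (batch, num_heads, seq_len, head_dim). MATH always works on
# CPU, which is why builder.py falls back to it when efficient_attention or
# cudnn_attention is requested with --device cpu.
q = k = v = torch.randn(1, 8, 16, 64)
out = attend(q, k, v, "math")
print(out.shape)  # torch.Size([1, 8, 16, 64])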