From ce994b39275a3d5e1031f6afc068f0d32a747c01 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:10:31 -0800 Subject: [PATCH 1/7] Create run-readme-pr-linuxaarch64 (#1350) * Create run-readme-pr-linuxaarch64 Test torchchat on aarch64 linux * Rename run-readme-pr-linuxaarch64 to run-readme-pr-linuxaarch64.yml add yml extension. * Update ADVANCED-USERS.md Update doc to indicate testing for ARMv8/aarch64 on Linux/raspbian is introduced by this PR --------- Co-authored-by: Jack-Khuu --- .../workflows/run-readme-pr-linuxaarch64.yml | 124 ++++++++++++++++++ docs/ADVANCED-USERS.md | 4 +- 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/run-readme-pr-linuxaarch64.yml diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml new file mode 100644 index 000000000..1f920a12f --- /dev/null +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -0,0 +1,124 @@ +name: Run the README instructions - with stories - on Linux aarch64 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +jobs: + test-readme-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-quantization-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization + + test-gguf-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-advanced-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + 
echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index a8d02c2f9..17958e790 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -479,7 +479,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | |-----|------|-----|-----|-----|-----| | x86 | Linux | ✅ | ✅ | ✅ | ✅ | -| aarch64 | Linux | n/t | n/t | n/t | n/t | +| aarch64 | Linux | ✅ | ✅ | ✅ | n/t | | aarch64 | macOS | ✅ | ✅ | ✅ | ✅ | | AMD GPU | Linux | ✅ | ✅ | ✅ | ❌| | Nvidia GPU | Linux | ✅ | ✅ | ✅ | ❌| @@ -490,7 +490,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Mobile GPU (Vulkan) | Android | ❌|❌|❌| ✅ | | CoreML | iOS | ❌|❌|❌| ✅ | | Hexagon DSP | Android | ❌|❌|❌| ✅ | -| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t | ✅ | +| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ | ✅ | | Raspberry Pi 4/5 | Android | ❌ | ❌ | ❌ | n/t | | ARM 32b (up to v7) | any | ❌|❌|❌|❌| From 2b555b1bb41e76d8b5d01501fa762da69d14b16d Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:26:57 -0800 Subject: [PATCH 2/7] Bump test-readme-mps-macos timeout (#1451) --- .github/workflows/run-readme-pr-mps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 0d70a4c1d..7ab5b1558 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,7 +10,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 - timeout: 50 + timeout: 60 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos From 4c7183c2c348593c09818db6270bca1bb659b34f Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:02:49 -0800 Subject: [PATCH 3/7] Update torch/tune/vision pins to 1/19/25 (#1467) * Update install_requirements.sh * Update pytorch minor version * Update install_requirements.sh --- install/install_requirements.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 146e11096..264c3496d 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. 
-PYTORCH_NIGHTLY_VERSION=dev20241218 +PYTORCH_NIGHTLY_VERSION=dev20250119 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241218 +VISION_NIGHTLY_VERSION=dev20250119 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241218 +TUNE_NIGHTLY_VERSION=dev20250119 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly @@ -79,15 +79,15 @@ fi if [[ -x "$(command -v xpu-smi)" ]]; then REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0" + torchtune=="0.6.0" ) else REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" + torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}" ) fi From 24eb642bdd13d0a9ea9eb0055bd556a0b9640cd6 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:21:58 -0800 Subject: [PATCH 4/7] Add warning in PTEModel when not defined (#1468) * Add warning in PTEModel when not defined * Add missing parans --- torchchat/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index f50d2a8be..28429370c 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1062,5 +1062,6 @@ def forward(self, x, input_pos): def setup_caches(self, max_batch_size, max_seq_length): pass -except: +except Exception as e: + print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}") pass From 45cd239cb360663c2728e46df35841e0196de588 Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Wed, 22 Jan 2025 09:55:30 +0800 Subject: [PATCH 5/7] Add attention_backend as a configurable option (#1456) bump this into the constructor of BuilderArgs Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 13 +++++++++++++ torchchat/cli/cli.py | 7 +++++++ torchchat/generate.py | 7 ++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 69db14a4b..755817d1e 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -69,6 +69,7 @@ class BuilderArgs: prefill_possible: bool = False dynamic_shapes: bool = False max_seq_length: Optional[int] = None + attention_backend: str = "math" def __post_init__(self): if self.device is None: @@ -183,6 +184,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp = getattr(args, "pp", 1) tp = getattr(args, "tp", 1) chpt_from = getattr(args, "chpt_from", "hf") + sdp_backend_dict = { + 'math': torch.nn.attention.SDPBackend.MATH, + 'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION, + 'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + 'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + } + attention_backend = sdp_backend_dict[args.attention_backend] + if args.device == "cpu" and (args.attention_backend == "efficient_attention" + or args.attention_backend == "cudnn_attention"): + print(f"Warning: {args.attention_backend} is not supported on CPU. 
Using math instead.") + attention_backend = torch.nn.attention.SDPBackend.MATH return cls( checkpoint_dir=checkpoint_dir, checkpoint_path=checkpoint_path, @@ -207,6 +219,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), + attention_backend=attention_backend, ) @classmethod diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 723f25ea4..70f404635 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -179,6 +179,13 @@ def _add_model_config_args(parser, verb: str) -> None: choices=["fast", "cpu", "cuda", "mps", "xpu"], help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu", ) + model_config_parser.add_argument( + "--attention-backend", + type=str, + default="math", + choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"], + help="SDPBackend to use. Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION", + ) # Add CLI Args representing output paths of exported model files diff --git a/torchchat/generate.py b/torchchat/generate.py index 8ec4d4d5d..a596187f5 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -26,6 +26,7 @@ import torch.distributed as dist import torch.multiprocessing as mp from torch.distributed.pipelining import PipelineStage, ScheduleGPipe +from torch._C import _SDPBackend as SDPBackend from PIL import Image @@ -531,6 +532,7 @@ def decode_n_tokens( callback=lambda _: _, eos_token_id: int = 2, eot_id: Optional[int] = None, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, **sampling_kwargs, ): new_tokens, new_probs = [], [] @@ -539,7 +541,7 @@ def decode_n_tokens( num_new_tokens - 1 ): # -1 to save space to run an EoS if dont generate it naturally # Actually better for Inductor to codegen attention here - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): + with torch.nn.attention.sdpa_kernel([attention_backend]): out_token = cur_token.clone() next_token, next_prob = self.decode_one_token( @@ -683,6 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, + attention_backend: str = "math", seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: @@ -799,6 +802,7 @@ def generate( if self.is_llama3_model else None ), + attention_backend=attention_backend, **sampling_kwargs, ): generated_tokens.append(generated_token.view(-1)) @@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False): start_pos=start_pos, skip_cache_setup=not is_first_sample, max_seq_length=max_seq_length, + attention_backend=self.builder_args.attention_backend, ) for token_tensor, metrics in generator_func: if token_tensor is not None: From b2d8f2a501c39a038547d9f92982df3aa439e5e7 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:58:34 -0800 Subject: [PATCH 6/7] Update import of sdpa_with_kv_cache to custom_ops (#1470) --- torchchat/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index 28429370c..c01ff1262 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1025,7 +1025,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: # For quantized_decomposed ops from executorch.kernels import quantized # no-qa # For llama::sdpa_with_kv_cache.out, preprocess ops - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa + from executorch.extension.llm.custom_ops import 
custom_ops # no-qa class PTEModel(nn.Module): def __init__(self, config, path) -> None: From 025d41220ee505e6eb2346ab479db3e8402f92b3 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 18:56:16 -0800 Subject: [PATCH 7/7] Typo: Fix generate signature type hint for attention_backend (#1471) `attention_backend` is a SDPBackend, not a string --- torchchat/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index a596187f5..a06e215f4 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -685,7 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, - attention_backend: str = "math", + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor:
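
For reference, a minimal standalone sketch of how the attention-backend selection introduced in [PATCH 5/7] behaves. It mirrors the sdp_backend_dict added to BuilderArgs.from_args and the sdpa_kernel context manager now used in decode_n_tokens; the attend() helper and the tensor shapes are hypothetical and exist only for this illustration, and the public torch.nn.attention.SDPBackend alias is used here in place of the torch._C import that the patch adds to torchchat/generate.py.

# Illustrative sketch only (not part of the patches above); assumes a PyTorch
# build new enough to provide torch.nn.attention.sdpa_kernel, as the nightly
# pins in [PATCH 3/7] do.
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Same mapping as the sdp_backend_dict added in torchchat/cli/builder.py.
sdp_backend_dict = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}

def attend(q, k, v, backend_name: str = "math"):
    # Hypothetical helper: restrict scaled_dot_product_attention to the chosen
    # kernel, the way Generator.decode_n_tokens now wraps its decode step with
    # the configurable attention_backend instead of a hard-coded MATH backend.
    with sdpa_kernel([sdp_backend_dict[backend_name]]):
        return F.scaled_dot_product_attention(q, k, v)

# Example shapes: (batch, num_heads, seq_len, head_dim). MATH always works on
# CPU, which is why builder.py falls back to it when efficient_attention or
# cudnn_attention is requested with --device cpu.
q = k = v = torch.randn(1, 8, 16, 64)
out = attend(q, k, v, "math")
print(out.shape)  # torch.Size([1, 8, 16, 64])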