
Commit

Merge branch 'main' into patch-24
mikekgfb authored Jan 22, 2025
2 parents 8efcdff + 025d412 commit 62d8583
Showing 7 changed files with 162 additions and 12 deletions.
124 changes: 124 additions & 0 deletions .github/workflows/run-readme-pr-linuxaarch64.yml
@@ -0,0 +1,124 @@
name: Run the README instructions - with stories - on Linux aarch64

on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  test-readme-cpu:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux-aarch64
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      timeout: 60
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"
        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme
        echo "::group::Completion"
        echo "tests complete"
        echo "*******************************************"
        echo "::endgroup::"
  test-quantization-cpu:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux-aarch64
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      timeout: 60
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"
        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization
  test-gguf-cpu:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux-aarch64
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      timeout: 60
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"
        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf
        echo "::group::Completion"
        echo "tests complete"
        echo "*******************************************"
        echo "::endgroup::"
  test-advanced-cpu:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux-aarch64
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      timeout: 60
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"
        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced
        echo "::group::Completion"
        echo "tests complete"
        echo "*******************************************"
        echo "::endgroup::"
  test-evaluation-cpu:
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux-aarch64
      gpu-arch-type: cuda
      gpu-arch-version: "12.1"
      timeout: 60
      script: |
        echo "::group::Print machine info"
        uname -a
        echo "::endgroup::"
        echo "::group::Install newer objcopy that supports --set-section-alignment"
        yum install -y devtoolset-10-binutils
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        echo "::endgroup::"
        TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation
        echo "::group::Completion"
        echo "tests complete"
        echo "*******************************************"
        echo "::endgroup::"
4 changes: 2 additions & 2 deletions docs/ADVANCED-USERS.md
@@ -479,7 +479,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch.
| Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime |
|-----|------|-----|-----|-----|-----|
| x86 | Linux |||||
| aarch64 | Linux | n/t | n/t | n/t | n/t |
| aarch64 | Linux | ✅ | ✅ | ✅ | n/t |
| aarch64 | macOS |||||
| AMD GPU | Linux |||||
| Nvidia GPU | Linux |||||
@@ -490,7 +490,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch.
| Mobile GPU (Vulkan) | Android |||||
| CoreML | iOS |||||
| Hexagon DSP | Android |||||
| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t ||
| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ ||
| Raspberry Pi 4/5 | Android |||| n/t |
| ARM 32b (up to v7) | any |||||

14 changes: 7 additions & 7 deletions install/install_requirements.sh
@@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE"
# NOTE: If a newly-fetched version of the executorch repo changes the value of
# PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary
# package versions.
PYTORCH_NIGHTLY_VERSION=dev20241218
PYTORCH_NIGHTLY_VERSION=dev20250119

# Nightly version for torchvision
VISION_NIGHTLY_VERSION=dev20241218
VISION_NIGHTLY_VERSION=dev20250119

# Nightly version for torchtune
TUNE_NIGHTLY_VERSION=dev20241218
TUNE_NIGHTLY_VERSION=dev20250119

# The pip repository that hosts nightly torch packages. cpu by default.
# If cuda is available, based on presence of nvidia-smi, install the pytorch nightly
@@ -79,15 +79,15 @@ fi
if [[ -x "$(command -v xpu-smi)" ]];
then
REQUIREMENTS_TO_INSTALL=(
torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
torchtune=="0.5.0"
torchtune=="0.6.0"
)
else
REQUIREMENTS_TO_INSTALL=(
torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}"
torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}"
torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}"
torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}"
torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}"
)
fi

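For context, the three nightly pins above expand into pip requirement strings of the form shown below. The snippet is an illustrative sketch, not part of the patch; it only mirrors how install_requirements.sh composes the version strings.

```python
# Illustrative only: mirrors how install_requirements.sh composes its pins.
PYTORCH_NIGHTLY_VERSION = "dev20250119"
VISION_NIGHTLY_VERSION = "dev20250119"
TUNE_NIGHTLY_VERSION = "dev20250119"

requirements_to_install = [
    f"torch==2.7.0.{PYTORCH_NIGHTLY_VERSION}",
    f"torchvision==0.22.0.{VISION_NIGHTLY_VERSION}",
    f"torchtune==0.6.0.{TUNE_NIGHTLY_VERSION}",
]
print(requirements_to_install)
# ['torch==2.7.0.dev20250119', 'torchvision==0.22.0.dev20250119', 'torchtune==0.6.0.dev20250119']
```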
13 changes: 13 additions & 0 deletions torchchat/cli/builder.py
@@ -69,6 +69,7 @@ class BuilderArgs:
    prefill_possible: bool = False
    dynamic_shapes: bool = False
    max_seq_length: Optional[int] = None
    attention_backend: str = "math"

    def __post_init__(self):
        if self.device is None:
@@ -183,6 +184,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
        pp = getattr(args, "pp", 1)
        tp = getattr(args, "tp", 1)
        chpt_from = getattr(args, "chpt_from", "hf")
        sdp_backend_dict = {
            'math': torch.nn.attention.SDPBackend.MATH,
            'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION,
            'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
            'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
        }
        attention_backend = sdp_backend_dict[args.attention_backend]
        if args.device == "cpu" and (args.attention_backend == "efficient_attention"
                                     or args.attention_backend == "cudnn_attention"):
            print(f"Warning: {args.attention_backend} is not supported on CPU. Using math instead.")
            attention_backend = torch.nn.attention.SDPBackend.MATH
        return cls(
            checkpoint_dir=checkpoint_dir,
            checkpoint_path=checkpoint_path,
@@ -207,6 +219,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs":
            is_chat_model=is_chat_model,
            dynamic_shapes=getattr(args, "dynamic_shapes", False),
            max_seq_length=getattr(args, "max_seq_length", None),
            attention_backend=attention_backend,
        )

    @classmethod
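A minimal standalone sketch of the name-to-backend resolution that this hunk adds to BuilderArgs.from_args, with the same CPU fallback; the helper name resolve_attention_backend is illustrative and not part of the patch.

```python
from torch.nn.attention import SDPBackend

# Same mapping as the sdp_backend_dict introduced in BuilderArgs.from_args.
_SDP_BACKENDS = {
    "math": SDPBackend.MATH,
    "flash_attention": SDPBackend.FLASH_ATTENTION,
    "efficient_attention": SDPBackend.EFFICIENT_ATTENTION,
    "cudnn_attention": SDPBackend.CUDNN_ATTENTION,
}


def resolve_attention_backend(name: str, device: str) -> SDPBackend:
    """Map a CLI backend name to an SDPBackend, falling back to math on CPU."""
    backend = _SDP_BACKENDS[name]
    # efficient_attention and cudnn_attention target CUDA kernels, so the
    # patch downgrades them to the math backend when running on CPU.
    if device == "cpu" and name in ("efficient_attention", "cudnn_attention"):
        print(f"Warning: {name} is not supported on CPU. Using math instead.")
        backend = SDPBackend.MATH
    return backend
```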
7 changes: 7 additions & 0 deletions torchchat/cli/cli.py
@@ -179,6 +179,13 @@ def _add_model_config_args(parser, verb: str) -> None:
        choices=["fast", "cpu", "cuda", "mps", "xpu"],
        help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu",
    )
    model_config_parser.add_argument(
        "--attention-backend",
        type=str,
        default="math",
        choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"],
        help="SDPBackend to use. Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION",
    )


# Add CLI Args representing output paths of exported model files
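As a quick check of how the new flag behaves, here is a self-contained argparse sketch (not part of the patch) with the same default and choices.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention-backend",
    type=str,
    default="math",
    choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"],
)

# Unspecified -> "math"; an unknown value exits with an argparse error.
print(parser.parse_args([]).attention_backend)  # math
args = parser.parse_args(["--attention-backend", "flash_attention"])
print(args.attention_backend)  # flash_attention
```

In torchchat itself the flag would presumably accompany the usual verbs, along the lines of `python3 torchchat.py generate llama3.1 --attention-backend flash_attention`, with the model name here being purely illustrative.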
7 changes: 6 additions & 1 deletion torchchat/generate.py
@@ -26,6 +26,7 @@
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
from torch._C import _SDPBackend as SDPBackend

from PIL import Image

@@ -531,6 +532,7 @@ def decode_n_tokens(
        callback=lambda _: _,
        eos_token_id: int = 2,
        eot_id: Optional[int] = None,
        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
        **sampling_kwargs,
    ):
        new_tokens, new_probs = [], []
@@ -539,7 +541,7 @@
            num_new_tokens - 1
        ): # -1 to save space to run an EoS if dont generate it naturally
            # Actually better for Inductor to codegen attention here
            with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
            with torch.nn.attention.sdpa_kernel([attention_backend]):

                out_token = cur_token.clone()
                next_token, next_prob = self.decode_one_token(
@@ -683,6 +685,7 @@ def generate(
        sequential_prefill=True,
        callback=lambda x: x,
        max_seq_length: int,
        attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH,
        seed: Optional[int] = None,
        **sampling_kwargs,
    ) -> torch.Tensor:
@@ -799,6 +802,7 @@
                    if self.is_llama3_model
                    else None
                ),
                attention_backend=attention_backend,
                **sampling_kwargs,
            ):
                generated_tokens.append(generated_token.view(-1))
@@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False):
                start_pos=start_pos,
                skip_cache_setup=not is_first_sample,
                max_seq_length=max_seq_length,
                attention_backend=self.builder_args.attention_backend,
            )
            for token_tensor, metrics in generator_func:
                if token_tensor is not None:
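The selected backend ultimately takes effect through torch.nn.attention.sdpa_kernel, which decode_n_tokens now enters with the user-selected backend instead of a hard-coded MATH. A minimal sketch using only stock PyTorch, with placeholder tensor shapes, is below.

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# Placeholder (batch, heads, seq, head_dim) tensors; shapes are arbitrary.
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Only the listed backend may be used for SDPA inside this block.
with sdpa_kernel([SDPBackend.MATH]):
    out = F.scaled_dot_product_attention(q, k, v)

print(out.shape)  # torch.Size([1, 8, 16, 64])
```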
5 changes: 3 additions & 2 deletions torchchat/model.py
@@ -1025,7 +1025,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
    # For quantized_decomposed ops
    from executorch.kernels import quantized # no-qa
    # For llama::sdpa_with_kv_cache.out, preprocess ops
    from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa
    from executorch.extension.llm.custom_ops import custom_ops # no-qa

    class PTEModel(nn.Module):
        def __init__(self, config, path) -> None:
@@ -1062,5 +1062,6 @@ def forward(self, x, input_pos):
        def setup_caches(self, max_batch_size, max_seq_length):
            pass

except:
except Exception as e:
print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}")
pass
