From bdac616154682650f184d01c1876df9fd72d5916 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 15 Jan 2025 11:55:09 -0800 Subject: [PATCH 01/24] update experimental kernels in torchchat --- .github/workflows/pull.yml | 55 +++++++++++++++++++++---- docs/quantization.md | 16 +++++--- install/install_requirements.sh | 4 +- torchchat/utils/quantize.py | 71 +++++++++++++++++++++++++-------- 4 files changed, 116 insertions(+), 30 deletions(-)
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5dbafee9f..a6300211f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml
@@ -1055,7 +1055,54 @@ jobs: ./runner/build_android.sh echo "Tests complete." - test-torchao-experimental: + test-torchao-experimental-python: + strategy: + matrix: + runner: [macos-14-xlarge] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Installing pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Run inference + run: | + python torchchat.py download stories110M + wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model + export PRMT="Once upon a time in a land far away" + echo "Generate eager" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' + echo "Generate compile" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile + echo "Export AOTI" + python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' + echo "Generate AOTI" + python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" + echo "Tests complete." + + test-torchao-experimental-cpp: strategy: matrix: runner: [macos-14-xlarge] runs-on: ${{matrix.runner}} steps:
@@ -1109,18 +1156,12 @@ jobs: python torchchat.py download stories110M wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model export PRMT="Once upon a time in a land far away" - echo "Generate eager" - python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' - echo "Generate compile" - python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile echo "Export and run ET (C++ runner)" python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}" echo "Export and run AOTI (C++ runner)" python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}" - echo "Generate AOTI" - python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" echo "Tests complete." test-torchao-experimental-mps:
diff --git a/docs/quantization.md b/docs/quantization.md index 704a7ed6a..3eaeae36b 100644 --- a/docs/quantization.md +++ b/docs/quantization.md
@@ -120,13 +120,15 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n ## Experimental TorchAO lowbit kernels -WARNING: These kernels only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon. +If you are on a Mac with Apple Silicon, we have 1- to 8-bit quantization available for embedding and linear layers, backed by CPU and MPS kernels. + +The CPU kernels are installed automatically by the torchchat install script and can be used out of the box. To use the MPS kernels, follow the setup instructions below. ### Use #### linear:a8wxdq The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize. -It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false). +It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7, 8), groupsize (use -1 for channelwise quantization), and has_weight_zeros (true, false). The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true). Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
@@ -138,7 +140,9 @@ The quantization scheme embedding:wx quantizes embeddings in a groupwise manner You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization. ### Setup -To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon. +If you are using the torchao ops from python, they are available out of the box on a Mac with Apple Silicon, and you can skip these setup steps. + +If you plan to use the kernels from the AOTI/ExecuTorch C++ runners, follow the setup steps below. From the torchchat root directory, run ``` bash torchchat/utils/scripts/build_torchao_ops.sh ``` This should take about 10 seconds to complete. -Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts that build the runners. +When building the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts that build the runners. ``` bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops ```
@@ -175,8 +179,8 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype fl #### AOTI ``` -OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so -OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5 +OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-aoti-package-path llama3_1.pt2 +OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --aoti-package-path llama3_1.pt2 --prompt "Once upon a time," --num-samples 5 ``` If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:
diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 360ba1801..35a6967a9 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh
@@ -117,9 +117,11 @@ fi # For torchao need to install from github since nightly build doesn't have macos build. # TODO: Remove this and install nightly build, once it supports macos +# USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded +# if on Mac with Apple Silicon ( set -x - $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d + USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@11333ba2cb5c4e792bc4f5c0d70c12991f972008 ) if [[ -x "$(command -v nvidia-smi)" ]]; then
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index b90d098b3..3f7ed1b66 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py
@@ -50,6 +50,18 @@ state_dict_device, use_et_backend, ) +from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import ( + PackedLinearInt8DynamicActivationIntxWeightLayout, +) +from torchao.experimental.quant_api import ( + int8_dynamic_activation_intx_weight, + IntxWeightEmbeddingQuantizer, +) +from torchao.quantization.granularity import ( + PerGroup, + PerRow, +) +from torchao.dtypes import PlainLayout # Flag for whether the a8wxdq quantizer is available.
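For readers following the quantize.py changes, the hunk below wires these imports into quantize_model. The same flow can be exercised standalone; this is a minimal sketch, assuming a torchao build that ships these experimental modules and a torch build that exposes sub-byte dtypes such as torch.int3, with a hypothetical toy model:

```
import torch
import torch.nn as nn
from torchao.quantization.quant_api import quantize_
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import (
    PackedLinearInt8DynamicActivationIntxWeightLayout,
)

# Hypothetical toy model; quantize_ rewrites its Linear layers in place.
model = nn.Sequential(nn.Linear(256, 256)).to(torch.float32)

# Mirrors the q_kwargs handling in the hunk below: groupsize == -1 selects
# channelwise (PerRow) granularity, otherwise groupwise (PerGroup).
bit_width, group_size, has_weight_zeros = 3, 128, False
granularity = PerRow() if group_size == -1 else PerGroup(group_size)
weight_dtype = getattr(torch, f"int{bit_width}")

quantize_(
    model,
    int8_dynamic_activation_intx_weight(
        weight_dtype=weight_dtype,
        granularity=granularity,
        has_weight_zeros=has_weight_zeros,
        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
    ),
)
```

As in the hunk below, retrying with PlainLayout() keeps the same quantization scheme but skips the packed weight format when packing is not supported on the host platform.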
@@ -117,7 +129,47 @@ def quantize_model( unwrap_tensor_subclass(model) continue - if quantizer in ["linear:a8wxdq", "embedding:wx"]: + if quantizer == "linear:a8wxdq": + if get_precision() != torch.float32: + print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32.") + set_precision(torch.float32) + + group_size = q_kwargs["groupsize"] + bit_width = q_kwargs["bitwidth"] + has_weight_zeros = q_kwargs["has_weight_zeros"] + granularity = PerRow() + if group_size != -1: + granularity = PerGroup(group_size) + weight_dtype = getattr(torch, f"int{bit_width}") + + try: + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), + ), + ) + except Exception as e: + print(f"Encountered error during quantization: {e}") + print("Trying with PlainLayout") + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=PlainLayout(), + ), + ) + + if not support_tensor_subclass: + unwrap_tensor_subclass(model) + continue + + if quantizer == "embedding:wx": # These quantizers require float32 input weights. Note that after quantization, # the weights will no longer be float32, but lowbit integers if get_precision() != torch.float32:
@@ -889,10 +941,12 @@ def quantized_model(self) -> nn.Module: # class references quantizer_class_dict = { "embedding": EmbeddingOnlyQuantHandler, + "embedding:wx": IntxWeightEmbeddingQuantizer, "linear:int8": WeightOnlyInt8QuantHandler, "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, + "linear:a8wxdq": None, # uses quantize_ API "linear:a8w4dq": Int8DynActInt4WeightQuantizer, }
@@ -916,26 +970,11 @@ def quantized_model(self) -> nn.Module: torchao_experimental_quant_api ) from torchao_experimental_quant_api import ( - Int8DynActIntxWeightLinearQuantizer, - IntxWeightEmbeddingQuantizer, UIntxWeightOnlyLinearQuantizer, ) - - quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer - quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op - try: - import glob - - libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*") - libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) - torch.ops.load_library(libs[0]) - print("Loaded torchao cpu ops.") - except Exception as e: - print("Unable to load torchao cpu ops library. Slow fallback kernels will be used.") - try: libname = "libtorchao_ops_mps_aten.dylib" libpath = f"{torchao_build_path}/cmake-out/lib/{libname}"
From 74363e432ff12684fc6aa04fc8ff76d944900e58 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:09:54 -0800 Subject: [PATCH 02/24] Update docs/quantization.md Co-authored-by: Jack-Khuu --- docs/quantization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/quantization.md b/docs/quantization.md index 3eaeae36b..d1de63b14 100644 --- a/docs/quantization.md +++ b/docs/quantization.md
@@ -140,7 +140,7 @@ The quantization scheme embedding:wx quantizes embeddings in a groupwise manner You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization. ### Setup -If you are using the torchao ops from python, they are available out of the box on a Mac with Apple Silicon, and you can skip these setup steps. +If you are using the torchao ops from python (i.e., not with a C++ runner), they are available out of the box on a Mac with Apple Silicon, and you can skip these setup steps. If you plan to use the kernels from the AOTI/ExecuTorch C++ runners, follow the setup steps below.
From 48f568d98187dcea90c81b06ef03ac4725fbd49c Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:10:03 -0800 Subject: [PATCH 03/24] Update torchchat/utils/quantize.py Co-authored-by: Jack-Khuu --- torchchat/utils/quantize.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 3f7ed1b66..70a2651ca 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py
@@ -137,9 +137,7 @@ def quantize_model( group_size = q_kwargs["groupsize"] bit_width = q_kwargs["bitwidth"] has_weight_zeros = q_kwargs["has_weight_zeros"] - granularity = PerRow() - if group_size != -1: - granularity = PerGroup(group_size) + granularity = PerRow() if group_size == -1 else PerGroup(group_size) weight_dtype = getattr(torch, f"int{bit_width}") try:
From 525701de15d8748573fc88b936c0b8e33ac124a3 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:10:11 -0800 Subject: [PATCH 04/24] Update torchchat/utils/quantize.py Co-authored-by: Jack-Khuu --- torchchat/utils/quantize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 70a2651ca..499b9507c 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py
@@ -968,8 +968,7 @@ def quantized_model(self) -> nn.Module: torchao_experimental_quant_api ) from torchao_experimental_quant_api import ( - UIntxWeightOnlyLinearQuantizer, - ) + from torchao_experimental_quant_api import UIntxWeightOnlyLinearQuantizer quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op
From f9a7bb9a628537e5ce4f6b75f675c76e4a5ea647 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 17 Jan 2025 14:04:49 -0800 Subject: [PATCH 05/24] Fixing import typo in quantize.py --- torchchat/utils/quantize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 499b9507c..15736b035 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@
-967,8 +967,7 @@ def quantized_model(self) -> nn.Module: torchao_experimental_quant_api_spec.loader.exec_module( torchao_experimental_quant_api ) - from torchao_experimental_quant_api import ( - from torchao_experimental_quant_api import UIntxWeightOnlyLinearQuantizer + from torchao_experimental_quant_api import UIntxWeightOnlyLinearQuantizer quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op From 0abe175e01a330315d78c5859916fc31aca24f59 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 17 Jan 2025 17:47:24 -0800 Subject: [PATCH 06/24] Bump ET pin to pick up AO changes --- install/.pins/et-pin.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index e79e9c341..640cd889c 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -9c043290ad3944268290e015c3063bc411e6ef6b +9836b39fe690e1906f133b4a233863149c30d499 From 76e8ec53bb6ad85d11a62d8faebd33b941993d88 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 11 Feb 2025 11:32:37 -0800 Subject: [PATCH 07/24] Bump torchao-pin to match ET and torchchat --- install/.pins/torchao-pin.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 2da70769c..48cc62670 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -2e032c6b0de960dee554dcb08126ace718b14c6d +11333ba2cb5c4e792bc4f5c0d70c12991f972008 From 3e04645446d80ddedf55a5706aaaaa18e152c62b Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 26 Feb 2025 16:13:12 -0800 Subject: [PATCH 08/24] Update torchao-pin.txt --- install/.pins/torchao-pin.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 48cc62670..4ef0f3788 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -11333ba2cb5c4e792bc4f5c0d70c12991f972008 +7d8794622f3ac7ffa98761314019a20fba06edef From 94fcd9af8d2ef97857e4668be17ac68c57050c10 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 26 Feb 2025 17:09:58 -0800 Subject: [PATCH 09/24] Split up AOTI and ET tests --- .github/workflows/pull.yml | 54 +++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 9c4e43957..3ca972a81 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1123,6 +1123,56 @@ jobs: echo "Tests complete." 
test-torchao-experimental-cpp: + strategy: + matrix: + runner: [macos-14-xlarge] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Installing pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Install torchao-ops + id: install-torchao-ops + run: | + bash torchchat/utils/scripts/build_torchao_ops.sh + - name: Install runner + run: | + echo "Installing runner" + bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops + - name: Run inference + run: | + python torchchat.py download stories110M + wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model + export PRMT="Once upon a time in a land far away" + echo "Export and run AOTI (C++ runner)" + python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' + ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}" + echo "Tests complete." + + test-torchao-experimental-et: strategy: matrix: runner: [macos-14-xlarge] runs-on: ${{matrix.runner}} steps:
@@ -1175,10 +1225,6 @@ jobs: python torchchat.py download stories110M wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model export PRMT="Once upon a time in a land far away" echo "Export and run ET (C++ runner)" python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}" - echo "Export and run AOTI (C++ runner)" - python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' - ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}" - echo "Tests complete."
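With the tests split, each runner flow is reproducible on its own. This is a sketch of the new AOTI job under the same assumptions as the workflow, a Mac with Apple Silicon and a torchchat checkout with requirements installed:

```
# Build the torchao aten ops, then an AOTI runner linked against them.
bash torchchat/utils/scripts/build_torchao_ops.sh
bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops

# Export a quantized model and run it through the C++ runner.
python torchchat.py download stories110M
wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "Once upon a time in a land far away"
```

The ET job keeps the same shape but exports with --output-pte-path and runs ./cmake-out/et_run instead.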
test-torchao-experimental-mps: strategy: From 7e56c553b4a07048a4e4571dcc7edbc3a733a267 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 26 Feb 2025 17:55:20 -0800 Subject: [PATCH 10/24] Bump ET pin to 2-26-25 with new AO pin --- install/.pins/et-pin.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index 5f0c55ac2..6664d4e72 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -791472d6706b027552f39f11b28d034e4839c9af \ No newline at end of file +68042847fd0eb6aac94ab2ffad8e1440fca865f4 From 77e8a62652d6619dd76b10c49ea09beda5f443ad Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 26 Feb 2025 18:04:28 -0800 Subject: [PATCH 11/24] Undo et pin bump; fails basic install --- install/.pins/et-pin.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index 6664d4e72..ecad1b9bb 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -68042847fd0eb6aac94ab2ffad8e1440fca865f4 +791472d6706b027552f39f11b28d034e4839c9af From 94ad51a620674d7a267226b9235c46038462f821 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 18:42:05 -0700 Subject: [PATCH 12/24] update --- install/.pins/torchao-pin.txt | 2 +- install/install_requirements.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 4ef0f3788..a18a37c8c 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -7d8794622f3ac7ffa98761314019a20fba06edef +1eb4d3354edb78d4651b28556200fa615c1b68c3 diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 11fa280b8..f4bcaba0a 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -130,9 +130,10 @@ fi # TODO: Remove this and install nightly build, once it supports macos # USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded # if on Mac with Apple Silicon +export TORCHAO_PIN=$(cat install/.pins/torchao-pin.txt) ( set -x - USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@7d8794622f3ac7ffa98761314019a20fba06edef + USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@${TORCHAO_PIN} ) if [[ -x "$(command -v nvidia-smi)" ]]; then From 34cb931617b76eb4bdd95ed176f0bed2d5dd3cc4 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 18:54:16 -0700 Subject: [PATCH 13/24] up --- torchchat/utils/scripts/install_utils.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 83b412be0..ac1df9d85 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -200,6 +200,7 @@ install_torchao_aten_ops() { CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \ -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \ + -DTORCHAO_BUILD_CPU_AARCH64=ON \ -DCMAKE_BUILD_TYPE="Release" \ -S . 
\ -B ${CMAKE_OUT_DIR} -G Ninja
@@ -217,6 +218,7 @@ install_torchao_executorch_ops() { -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \ -DCMAKE_BUILD_TYPE="Release" \ -DTORCHAO_BUILD_EXECUTORCH_OPS=ON \ + -DTORCHAO_BUILD_CPU_AARCH64=ON \ -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \ -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \ -S . \
From b564fc14a66e9c99dde795c89839ec3eae140de9 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 18:57:54 -0700 Subject: [PATCH 14/24] up --- torchchat/utils/scripts/build_native.sh | 3 --- 1 file changed, 3 deletions(-)
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index b8481b4cc..e2b8b4fc0 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh
@@ -86,9 +86,6 @@ if [[ "$TARGET" == "et" ]]; then EXECUTORCH_LIBRARIES="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libexecutorch_no_prim_ops.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libextension_threadpool.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libcpuinfo.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libpthreadpool.a" install_torchao_executorch_ops fi -elif [[ "$LINK_TORCHAO_OPS" == "ON" ]]; then - # Install OMP when using AOTI with linked torchao ops - brew install libomp fi popd
From 9eed5d1bd77ab7c8a9fcf1c334502f3acf3a938e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 21:54:05 -0700 Subject: [PATCH 15/24] up --- .github/workflows/pull.yml | 14 +++++++------- torchchat/utils/scripts/build_native.sh | 3 +++ 2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3ca972a81..ebdc295d5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml
@@ -292,7 +292,7 @@ jobs: echo "::endgroup::" echo "::group::Run inference with quantize file" - for DEVICE in cpu; do # cuda + for DEVICE in cpu; do # cuda # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'` # follow up with torchao as a separate PR echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
@@ -349,7 +349,7 @@ jobs: # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth" # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth" # echo "::endgroup::" - + test-gpu-aoti-float16: permissions: id-token: write
@@ -1198,6 +1198,11 @@ jobs: sysctl machdep.cpu.brand_string sysctl machdep.cpu.core_count fi + - name: Install ET + run: | + echo "Installing ExecuTorch" + export TORCHCHAT_ROOT=${PWD} + bash torchchat/utils/scripts/install_et.sh - name: Install torchchat run: | echo "Installing pip3 packages"
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index e2b8b4fc0..32ee59829 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh
@@ -86,6 +86,9 @@ if [[ "$TARGET" == "et" ]]; then EXECUTORCH_LIBRARIES="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libexecutorch_no_prim_ops.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libextension_threadpool.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libcpuinfo.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libpthreadpool.a" install_torchao_executorch_ops fi +elif [[ "$LINK_TORCHAO_OPS" == "ON" ]]; then + # Install OMP when using AOTI with linked torchao ops + brew install libomp fi popd
From 14365c424affd30f7535d18a41d3213ede64da56 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 22:10:08 -0700 Subject: [PATCH 16/24] up --- .github/workflows/pull.yml | 10 +++++----- runner/aoti.cmake | 2 +- torchchat/utils/scripts/build_native.sh | 7 +++---- torchchat/utils/scripts/install_utils.sh | 2 ++ 4 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ebdc295d5..ac664a95f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml
@@ -1198,17 +1198,17 @@ jobs: sysctl machdep.cpu.brand_string sysctl machdep.cpu.core_count fi - - name: Install ET - run: | - echo "Installing ExecuTorch" - export TORCHCHAT_ROOT=${PWD} - bash torchchat/utils/scripts/install_et.sh - name: Install torchchat run: | echo "Installing pip3 packages" ./install/install_requirements.sh pip3 list python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Install ET + run: | + echo "Installing ExecuTorch" + export TORCHCHAT_ROOT=${PWD} + bash torchchat/utils/scripts/install_et.sh - name: Install torchao-ops id: install-torchao-ops run: |
diff --git a/runner/aoti.cmake b/runner/aoti.cmake index ae907b391..3bfe294ea 100644 --- a/runner/aoti.cmake +++ b/runner/aoti.cmake
@@ -15,7 +15,7 @@ ENDIF() find_package(CUDA) -find_package(Torch 2.4.0) +find_package(Torch REQUIRED) if(Torch_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive")
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 32ee59829..e36de9290 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh
@@ -57,6 +57,7 @@ while (( "$#" )); do done source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" +find_cmake_prefix_path if [ -z "${ET_BUILD_DIR}" ]; then ET_BUILD_DIR="et-build"
@@ -80,8 +81,6 @@ if [[ "$TARGET" == "et" ]]; then exit 1 fi - source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" - find_cmake_prefix_path EXECUTORCH_INCLUDE_DIRS="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/include;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src" EXECUTORCH_LIBRARIES="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libexecutorch_no_prim_ops.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libextension_threadpool.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libcpuinfo.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libpthreadpool.a" install_torchao_executorch_ops
@@ -94,9 +93,9 @@ popd # CMake commands if [[ "$TARGET" == "et" ]]; then - cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja + cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH="${MY_CMAKE_PREFIX_PATH}" -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja else - cmake -S .
-B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja + cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH="${MY_CMAKE_PREFIX_PATH}" -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja fi cmake --build ./cmake-out --target "${TARGET}"_run diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index ac1df9d85..0c4c1f7b6 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -186,6 +186,8 @@ clone_torchao() { install_torchao_aten_ops() { local device=${1:-cpu} + USE_CPP=1 pip install "${TORCHCHAT_ROOT}/torchao-build/src/ao" + if [[ "$device" == "cpu" ]]; then echo "Building torchao custom ops for ATen" pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental From 66d90e14ccb8f9c0499e61f50f6f1844a62edb63 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Mon, 10 Mar 2025 22:29:58 -0700 Subject: [PATCH 17/24] up --- torchchat/export.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchchat/export.py b/torchchat/export.py index 997639ffe..bad97cd35 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -439,7 +439,8 @@ def main(args): tokenizer, max_seq_length=builder_args.max_seq_length, support_tensor_subclass=output_dso_path is None - and output_aoti_package_path is None, + and output_aoti_package_path is None + and output_pte_path is None, ) model_to_pte = model model_to_dso = model From 12cbd13371448f7be4a02817a031377597bbc2a8 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 09:23:54 -0700 Subject: [PATCH 18/24] up --- runner/aoti.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runner/aoti.cmake b/runner/aoti.cmake index 3bfe294ea..ae907b391 100644 --- a/runner/aoti.cmake +++ b/runner/aoti.cmake @@ -15,7 +15,7 @@ ENDIF() find_package(CUDA) -find_package(Torch REQUIRED) +find_package(Torch 2.4.0) if(Torch_FOUND) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g ${TORCH_CXX_FLAGS} -fpermissive") From 28d1a99c1bf76cce5e8683b77895d941b547bd1f Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 11:57:45 -0700 Subject: [PATCH 19/24] up --- install/.pins/torchao-pin.txt | 2 +- torchchat/utils/scripts/install_utils.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index a18a37c8c..c1b84754c 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -1eb4d3354edb78d4651b28556200fa615c1b68c3 +711fa0809f06fc97febd0c3fe72563c3fe227e51 diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 0c4c1f7b6..019309ef6 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -203,6 +203,7 @@ install_torchao_aten_ops() { cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \ -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \ -DTORCHAO_BUILD_CPU_AARCH64=ON \ + -DTORCHAO_PARALLEL_BACKEND=OPENMP \ -DCMAKE_BUILD_TYPE="Release" \ -S . 
\ -B ${CMAKE_OUT_DIR} -G Ninja From d2cc25aab67e5894f3de6c72b45214ad3c0647e7 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 13:01:01 -0700 Subject: [PATCH 20/24] up --- torchchat/utils/scripts/install_utils.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 019309ef6..478c0867d 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -204,6 +204,7 @@ install_torchao_aten_ops() { -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \ -DTORCHAO_BUILD_CPU_AARCH64=ON \ -DTORCHAO_PARALLEL_BACKEND=OPENMP \ + -DOpenMP_ROOT="/opt/homebrew/opt/libomp" \ -DCMAKE_BUILD_TYPE="Release" \ -S . \ -B ${CMAKE_OUT_DIR} -G Ninja From d79f870b24ae4fd032355112a4f72c528e1cad6a Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:20:24 -0700 Subject: [PATCH 21/24] up --- .github/workflows/pull.yml | 13 +++++++------ torchchat/utils/scripts/build_native.sh | 1 + torchchat/utils/scripts/build_torchao_ops.sh | 1 - torchchat/utils/scripts/clone_torchao.sh | 12 ++++++++++++ torchchat/utils/scripts/install_utils.sh | 2 +- 5 files changed, 21 insertions(+), 8 deletions(-) create mode 100644 torchchat/utils/scripts/clone_torchao.sh diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ac664a95f..e44d9d037 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1154,10 +1154,10 @@ jobs: ./install/install_requirements.sh pip3 list python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' - - name: Install torchao-ops - id: install-torchao-ops + - name: Clone torchao + id: clone-torchao run: | - bash torchchat/utils/scripts/build_torchao_ops.sh + bash torchchat/utils/scripts/clone_torchao.sh - name: Install runner run: | echo "Installing runner" @@ -1209,10 +1209,10 @@ jobs: echo "Installing ExecuTorch" export TORCHCHAT_ROOT=${PWD} bash torchchat/utils/scripts/install_et.sh - - name: Install torchao-ops - id: install-torchao-ops + - name: Clone torchao + id: clone-torchao run: | - bash torchchat/utils/scripts/build_torchao_ops.sh + bash torchchat/utils/scripts/clone_torchao.sh - name: Install runner run: | echo "Installing runner" @@ -1256,6 +1256,7 @@ jobs: - name: Install torchao-ops-mps id: install-torchao-ops-mps run: | + bash torchchat/utils/scripts/clone_torchao.sh bash torchchat/utils/scripts/build_torchao_ops.sh mps - name: Run inference run: | diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index e36de9290..d0e141678 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -88,6 +88,7 @@ if [[ "$TARGET" == "et" ]]; then elif [[ "$LINK_TORCHAO_OPS" == "ON" ]]; then # Install OMP when using AOTI with linked torchao ops brew install libomp + install_torchao_aten_ops cpu fi popd diff --git a/torchchat/utils/scripts/build_torchao_ops.sh b/torchchat/utils/scripts/build_torchao_ops.sh index 46e2479ac..a8388d8d7 100644 --- a/torchchat/utils/scripts/build_torchao_ops.sh +++ b/torchchat/utils/scripts/build_torchao_ops.sh @@ -16,6 +16,5 @@ source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" pushd ${TORCHCHAT_ROOT} find_cmake_prefix_path -clone_torchao install_torchao_aten_ops "$device" popd diff --git a/torchchat/utils/scripts/clone_torchao.sh b/torchchat/utils/scripts/clone_torchao.sh new file mode 100644 
index 000000000..834e9434a --- /dev/null +++ b/torchchat/utils/scripts/clone_torchao.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" + +pushd ${TORCHCHAT_ROOT} +clone_torchao +popd diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 478c0867d..9613fd740 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -204,7 +204,7 @@ install_torchao_aten_ops() { -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \ -DTORCHAO_BUILD_CPU_AARCH64=ON \ -DTORCHAO_PARALLEL_BACKEND=OPENMP \ - -DOpenMP_ROOT="/opt/homebrew/opt/libomp" \ + -DOpenMP_ROOT="$(brew --prefix)/opt/libomp" \ -DCMAKE_BUILD_TYPE="Release" \ -S . \ -B ${CMAKE_OUT_DIR} -G Ninja From a8106fd80ce1528ca972d85c9a03beb47f946ab5 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:50:08 -0700 Subject: [PATCH 22/24] up --- install/install_requirements.sh | 10 +--------- torchchat/utils/scripts/install_et.sh | 4 ++++ torchchat/utils/scripts/install_utils.sh | 2 -- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index f4bcaba0a..0e58409c8 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -126,15 +126,7 @@ then ) fi -# For torchao need to install from github since nightly build doesn't have macos build. -# TODO: Remove this and install nightly build, once it supports macos -# USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded -# if on Mac with Apple Silicon -export TORCHAO_PIN=$(cat install/.pins/torchao-pin.txt) -( - set -x - USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@${TORCHAO_PIN} -) +bash install/install_torchao.sh if [[ -x "$(command -v nvidia-smi)" ]]; then ( diff --git a/torchchat/utils/scripts/install_et.sh b/torchchat/utils/scripts/install_et.sh index 8062a8316..3fd7e2a1b 100755 --- a/torchchat/utils/scripts/install_et.sh +++ b/torchchat/utils/scripts/install_et.sh @@ -19,4 +19,8 @@ pushd ${TORCHCHAT_ROOT} find_cmake_prefix_path clone_executorch install_executorch_libs $ENABLE_ET_PYBIND + +# During installation, ET uninstalls torchchat's preferred version of torchao +# so we reinstall here +bash install/install/install_torchao.sh popd diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 9613fd740..68987e666 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -186,8 +186,6 @@ clone_torchao() { install_torchao_aten_ops() { local device=${1:-cpu} - USE_CPP=1 pip install "${TORCHCHAT_ROOT}/torchao-build/src/ao" - if [[ "$device" == "cpu" ]]; then echo "Building torchao custom ops for ATen" pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental From aa6fb7034e569504fc5d373a14deee702ab07c81 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:55:14 -0700 Subject: [PATCH 23/24] up --- install/install_torchao.sh | 39 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 install/install_torchao.sh diff --git a/install/install_torchao.sh 
b/install/install_torchao.sh new file mode 100644 index 000000000..84974040a --- /dev/null +++ b/install/install_torchao.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +# USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded +# if on Mac with Apple Silicon + +if [ -z "${PYTHON_EXECUTABLE:-}" ]; +then + if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; + then + PYTHON_EXECUTABLE=python3 + else + PYTHON_EXECUTABLE=python + fi +fi +echo "Using python executable: $PYTHON_EXECUTABLE" + +if [[ "$PYTHON_EXECUTABLE" == "python" ]]; +then + PIP_EXECUTABLE=pip +elif [[ "$PYTHON_EXECUTABLE" == "python3" ]]; +then + PIP_EXECUTABLE=pip3 +else + PIP_EXECUTABLE=pip${PYTHON_SYS_VERSION} +fi +echo "Using pip executable: $PIP_EXECUTABLE" + + +export TORCHAO_PIN=$(cat install/.pins/torchao-pin.txt) +( + set -x + USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@${TORCHAO_PIN} +) From 8a9a644a1ff83595fd12ab07930ae44a418ceb73 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 11 Mar 2025 15:16:19 -0700 Subject: [PATCH 24/24] up --- torchchat/utils/scripts/install_et.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/utils/scripts/install_et.sh b/torchchat/utils/scripts/install_et.sh index 3fd7e2a1b..531e80a6e 100755 --- a/torchchat/utils/scripts/install_et.sh +++ b/torchchat/utils/scripts/install_et.sh @@ -22,5 +22,5 @@ install_executorch_libs $ENABLE_ET_PYBIND # During installation, ET uninstalls torchchat's preferred version of torchao # so we reinstall here -bash install/install/install_torchao.sh +bash install/install_torchao.sh popd
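Taken together, the series leaves the following install flow. This is a sketch of the end state after patch 24, assuming macOS on Apple Silicon, where USE_CPP=1 builds and loads the experimental aten kernels:

```
# Python-only use: a single step. install_requirements.sh now delegates the
# torchao install to install/install_torchao.sh at the pinned commit.
./install/install_requirements.sh

# C++ runners: clone torchao, build the ops, then build a linked runner.
bash torchchat/utils/scripts/clone_torchao.sh
bash torchchat/utils/scripts/build_torchao_ops.sh     # pass "mps" to build the MPS kernels
bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops

# ExecuTorch path: install_et.sh re-runs install/install_torchao.sh at the end,
# since the ET install uninstalls the pinned torchao.
bash torchchat/utils/scripts/install_et.sh
```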