|
13 | 13 | import torch |
14 | 14 | from parameterized import parameterized |
15 | 15 |
|
16 | | -from torchao.utils import is_sm_at_least_90 |
| 16 | +from torchao.utils import is_sm_at_least_90, torch_version_at_least |
17 | 17 |
|
18 | 18 | logging.basicConfig(level=logging.INFO) |
19 | 19 |
|
@@ -96,5 +96,88 @@ def test_int_scaled_mm(self, device, dtype): |
96 | 96 | torch.testing.assert_allclose(out32_1, out32_2) |
97 | 97 |
|
98 | 98 |
|
class TestIntScaledMatmulCPUPaths(unittest.TestCase):
    """
    Tests for the CPU-specific paths inside _int_scaled_matmul_cpu.

    Because the u8s8 VNNI branch is gated on runtime CPU feature detection,
    CI machines are unlikely to exercise it naturally. We monkeypatch the
    two helper functions so each branch can be tested on any machine.
    """

    def _make_inputs(self, m=64, k=32, n=16, dtype=torch.bfloat16):
        """Build random int8 operands ``a`` (m x k), ``b`` (k x n) and per-row scales."""
        a = torch.randint(-128, 127, (m, k), dtype=torch.int8)
        b = torch.randint(-128, 127, (k, n), dtype=torch.int8)
        scales = torch.randn(m, 1, dtype=dtype)
        return a, b, scales

    def _reference(self, a, b, scales):
        """Reference result: exact int matmul, cast to the scales' dtype, then scaled."""
        from torchao.kernel.intmm import safe_int_mm

        return safe_int_mm(a, b).to(scales.dtype) * scales

    def _run_with_patched_flags(self, amx: bool, vnni: bool) -> None:
        """
        Run _int_scaled_matmul_cpu with the CPU-feature helpers forced to the
        given values and compare against the reference implementation.

        The module-level detection functions are swapped for lambdas returning
        the requested booleans and restored in a ``finally`` block so a failing
        test cannot leak patched state into other tests.
        """
        import torchao.kernel.intmm as intmm_mod

        a, b, scales = self._make_inputs()
        expected = self._reference(a, b, scales)

        orig_amx = intmm_mod._cpu_is_amx_tile_supported
        orig_vnni = intmm_mod._cpu_is_vnni_supported
        try:
            intmm_mod._cpu_is_amx_tile_supported = lambda: amx
            intmm_mod._cpu_is_vnni_supported = lambda: vnni
            result = intmm_mod._int_scaled_matmul_cpu(a, b, scales)
        finally:
            intmm_mod._cpu_is_amx_tile_supported = orig_amx
            intmm_mod._cpu_is_vnni_supported = orig_vnni

        torch.testing.assert_close(result, expected)

    @unittest.skipIf(not torch_version_at_least("2.12.0.dev"), "Need torch 2.12+")
    def test_vnni_path_via_monkeypatch(self):
        """Force the u8s8 VNNI branch (no AMX, VNNI present -> compensation path)."""
        self._run_with_patched_flags(amx=False, vnni=True)

    @unittest.skipIf(not torch_version_at_least("2.12.0.dev"), "Need torch 2.12+")
    def test_amx_path_via_monkeypatch(self):
        """Force the s8s8 AMX branch (AMX present -> direct path, no compensation)."""
        self._run_with_patched_flags(amx=True, vnni=False)

    @unittest.skipIf(not torch_version_at_least("2.12.0.dev"), "Need torch 2.12+")
    def test_no_simd_path_via_monkeypatch(self):
        """Force the no-AMX/no-VNNI branch (s8s8 reference path)."""
        self._run_with_patched_flags(amx=False, vnni=False)
| 181 | + |
99 | 182 | if __name__ == "__main__": |
100 | 183 | unittest.main() |
0 commit comments