pytorch
diff --git a/.github/workflows/float8nocompile_test.yaml b/.github/workflows/float8nocompile_test.yaml
Lines changed: 29 additions & 29 deletions
diff --git a/.github/workflows/regression_test_rocm.yml b/.github/workflows/regression_test_rocm.yml
Lines changed: 2 additions & 0 deletions
diff --git a/.github/workflows/torchao_experimental_test.yml b/.github/workflows/torchao_experimental_test.yml
Lines changed: 3 additions & 2 deletions
diff --git a/benchmarks/float8/float8_roofline.py b/benchmarks/float8/float8_roofline.py
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/float8/training/README.md b/benchmarks/float8/training/README.md
Lines changed: 1 addition & 0 deletions
diff --git a/benchmarks/float8/training/float8_training_benchmark.sh b/benchmarks/float8/training/float8_training_benchmark.sh
Lines changed: 2 additions & 1 deletion
diff --git a/benchmarks/mx_formats/cast_bench.py b/benchmarks/mx_formats/cast_bench.py
Lines changed: 199 additions & 0 deletions
diff --git a/examples/sam2_amg_server/compile_export_utils.py b/examples/sam2_amg_server/compile_export_utils.py
Lines changed: 3 additions & 2 deletions
diff --git a/examples/sam2_vos_example/compile_export_utils.py b/examples/sam2_vos_example/compile_export_utils.py
Lines changed: 3 additions & 2 deletions
diff --git a/setup.py b/setup.py
Lines changed: 0 additions & 12 deletions
@@ -21,33 +21,33 @@ concurrency:
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
 
-jobs:
-  test:
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - name: SM-89
-            runs-on: linux.g6.4xlarge.experimental.nvidia.gpu
-            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.1"
+# jobs:
+#   test:
+#     strategy:
+#       fail-fast: false
+#       matrix:
+#         include:
+#           - name: H100
+#             runs-on: linux.aws.h100
+#             torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124'
+#             gpu-arch-type: "cuda"
+#             gpu-arch-version: "12.4"
 
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    with:
-      timeout: 300
-      runner: ${{ matrix.runs-on }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
-      submodules: recursive
-      script: |
-        conda create -n venv python=3.9 -y
-        conda activate venv
-        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
-        python -m pip install --upgrade pip
-        pip install ${{ matrix.torch-spec }}
-        pip install -r dev-requirements.txt
-        pip install .
-        cd torchao/prototype/float8nocompile
-        pytest kernels/ --verbose -s
-        pytest test/train_test.py --verbose -s
+#     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+#     with:
+#       timeout: 300
+#       runner: ${{ matrix.runs-on }}
+#       gpu-arch-type: ${{ matrix.gpu-arch-type }}
+#       gpu-arch-version: ${{ matrix.gpu-arch-version }}
+#       submodules: recursive
+#       script: |
+#         conda create -n venv python=3.9 -y
+#         conda activate venv
+#         export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+#         python -m pip install --upgrade pip
+#         pip install ${{ matrix.torch-spec }}
+#         pip install -r dev-requirements.txt
+#         pip install .
+#         cd torchao/prototype/float8nocompile
+#         pytest kernels/ --verbose -s
+#         pytest test/train_test.py --verbose -s
@@ -43,6 +43,8 @@ jobs:
         python -m pip install --upgrade pip
         pip install ${{ matrix.torch-spec }}
         pip install -r dev-requirements.txt
+        pip uninstall -y bitsandbytes
+        pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
         pip install .
         export CONDA=$(dirname $(dirname $(which conda)))
         export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
 
@@ -36,11 +36,11 @@ jobs:
           # Install executorch first because it installs its own version
           # of torch and torchao, which we do not want to use
           pip install executorch
-          pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
+          pip install torch==2.7.0.dev20250311 --index-url "https://download.pytorch.org/whl/nightly/cpu" --force-reinstall
           pip install numpy
           pip install pytest
           pip install parameterized
-          USE_CPP=1 pip install .
+          USE_CPP=1 TOCHAO_BUILD_KLEIDIAI=1 pip install .
       - name: Run python tests
         run: |
           conda activate venv
@@ -103,6 +103,7 @@ jobs:
           pip install parameterized
           pip install pyyaml
           pip install numpy
+          pip install importlib-metadata
       - name: Print pip freeze
         run: |
           pip freeze
 
@@ -372,7 +372,7 @@ def run(
             ).requires_grad_()
 
             # get the gradient of the right shape
-            grad_output = torch.randn(N_val, K_val, dtype=torch.bfloat16, device="cuda")
+            grad_output = torch.randn(M_val, N_val, dtype=torch.bfloat16, device="cuda")
 
             # get the bf16 gpu kernel time
             torch._dynamo.reset()
 
@@ -14,5 +14,6 @@ Training parameters can be configured via environment variables.
     - `FLOAT8_RECIPE_WITH_BEST_SETTINGS`: "rowwise" or "tensorwise". Applies float8 training with the specified scaling recipe, as well as additional training configs which are optimal for that scaling recipe. See `float8_training_benchmark.sh` for more details.
     - `BATCH_SIZE`: Defaults to 1.
     - `STEPS`: Defaults to 100.
+    - `EXTRA_ARGS`: Extra arguments to pass to the torchtitan training script. See the [torchtitan](https://github.com/pytorch/torchtitan) docs for the full list of options.
 
 **NOTE**: `torch.compile` and FSDP2 are always used. Other forms of parallelism supported in torchtitan are not yet supported in this script.
@@ -22,6 +22,7 @@ if [ -z "${TORCHTITAN_ROOT}" ]; then
   echo " * FLOAT8_RECIPE_WITH_BEST_SETTINGS: "rowwise" or "tensorwise". if set, use float8 training in torchtitan with the specified recipe, including the additional settings which are optimal for that recipe. otherwise, use bf16 mixed precision training."
   echo " * BATCH_SIZE: defaults to 1."
   echo " * STEPS: defaults to 100."
+  echo " * EXTRA_ARGS: additional arguments to pass to the torchtitan training script."
   exit 1
 fi
 
@@ -44,7 +45,7 @@ cd ${TORCHTITAN_ROOT}
 echo "float8 args: ${FLOAT8_ARGS}"
 
 # run the command with the specified arguments
-CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.batch_size=${BATCH_SIZE} --training.compile ${FLOAT8_ARGS} 2>&1 | tee ${LOG_FILE}
+CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.batch_size=${BATCH_SIZE} --training.compile ${FLOAT8_ARGS} ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}
 
 # return to original working directory
 cd $original_dir
 
@@ -0,0 +1,199 @@
+from typing import Callable, Tuple
+
+import fire
+import torch
+import triton
+from torch._inductor.utils import do_bench_using_profiling
+
+from torchao.prototype.mx_formats.custom_cast import (
+    triton_to_mxfp8_dim1,
+)
+from torchao.prototype.mx_formats.mx_tensor import to_mx
+
+torch.manual_seed(0)  # fixed seed so the random benchmark input is reproducible
+
+bytes_per_el_bf16 = 2  # size in bytes of one bfloat16 element
+bytes_per_el_fp8 = 1  # size in bytes of one float8 element (also applied to the scales)
+
+
+def scale_dim0_reference(x_hp, block_size) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Abs-max normalize each `block_size`-element chunk of contiguous `x_hp`."""
+    assert x_hp.is_contiguous()
+    blocks = x_hp.reshape(-1, block_size)
+    # per-block abs-max, kept as a column so the division below broadcasts
+    amax_dim0 = blocks.abs().amax(dim=1).unsqueeze(1)
+    # returns (normalized tensor in the input shape, per-block amax)
+    return (blocks / amax_dim0).reshape(x_hp.shape), amax_dim0
+
+
+def scale_dim1_reference(x_hp, block_size) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Abs-max normalize `block_size`-element chunks of the transpose of `x_hp`."""
+    assert x_hp.is_contiguous()
+    # work on a contiguous copy of the transpose so reshape can form the blocks
+    x_t = x_hp.t().contiguous()
+    blocks = x_t.reshape(-1, block_size)
+    amax_dim1 = blocks.abs().amax(dim=1).unsqueeze(1)  # one amax per block row
+    # the normalized result keeps the transposed shape
+    return (blocks / amax_dim1).reshape(x_t.shape), amax_dim1
+
+
+def scale_dim0_dim1_reference(
+    x_hp: torch.Tensor, block_size
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Run both reference scalings; return (y_d0, y_d1, amax_d0, amax_d1)."""
+    x_hp_d0_normalized, amax_dim0 = scale_dim0_reference(x_hp, block_size)
+    x_hp_d1_normalized, amax_dim1 = scale_dim1_reference(x_hp, block_size)
+    # the dim1 result comes back transposed; .t() restores the input orientation
+    return x_hp_d0_normalized, x_hp_d1_normalized.t(), amax_dim0, amax_dim1
+
+
+def to_mx_dim0_reference(x_hp, block_size):
+    mx_scale, mx_data = to_mx(x_hp, torch.float8_e4m3fn, block_size)  # to_mx yields (scale, data)
+    return mx_data, mx_scale  # swapped so callers receive (data, scale)
+
+
+def to_mx_dim1_reference(x_hp, block_size):
+    # cast a contiguous copy of the transpose, then hand the data back un-transposed
+    mx_scale, mx_data = to_mx(x_hp.t().contiguous(), torch.float8_e4m3fn, block_size)
+    return mx_data.t(), mx_scale  # (data, scale) order, matching to_mx_dim0_reference
+
+
+def benchmark_cuda_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
+    """Time `func(*args, **kwargs)` via do_bench_using_profiling, scaled by 1e3."""
+    # bind the arguments so the profiler receives a zero-argument callable
+    elapsed = do_bench_using_profiling(lambda: func(*args, **kwargs))
+    return elapsed * 1e3
+
+
+def run(
+    M: int = 16384,
+    K: int = 16384,
+    BLOCK_SIZE: int = 32,
+    mode: str = "dim0",
+):
+    """Benchmark one cast/scale variant on a random (M, K) bfloat16 CUDA tensor.
+
+    Args:
+        M: size of the first dimension of the input tensor.
+        K: size of the second dimension of the input tensor.
+        BLOCK_SIZE: block size forwarded to the function under test.
+        mode: variant to time; one of "dim0", "dim1", "dim0_dim1", "dim0_mx",
+            "dim1_mx" or "dim1_mx_triton".
+
+    Prints the measured time ("time_us") and derived bandwidth ("mem_bw_gbps").
+    """
+    print(f"M {M} K {K} BLOCK_SIZE {BLOCK_SIZE}")
+    print(f"GPU: {torch.cuda.get_device_name(0)}")
+    print(f"torch version: {torch.__version__}")
+    print(f"triton version: {triton.__version__}")
+    print(f"mode: {mode}")
+    assert mode in ("dim0", "dim1", "dim0_dim1", "dim0_mx", "dim1_mx", "dim1_mx_triton")
+
+    x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda") * 1000
+
+    if mode == "dim0":
+        scale_dim0_reference_c = torch.compile(scale_dim0_reference)
+        y_d0, s_d0 = scale_dim0_reference_c(x, BLOCK_SIZE)
+
+        # extra untimed calls before measuring; the helper binds the arguments itself
+        for _ in range(2):
+            __ = scale_dim0_reference_c(x, BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            scale_dim0_reference_c, x, BLOCK_SIZE
+        )
+
+        assert y_d0.dtype == torch.bfloat16
+        assert s_d0.dtype == torch.bfloat16
+        bytes_rw = sum(t.numel() for t in [x, y_d0, s_d0]) * bytes_per_el_bf16
+        bps = bytes_rw / (time_us / 1e6)
+
+    elif mode == "dim1":
+        scale_dim1_reference_c = torch.compile(scale_dim1_reference)
+        y_d1, s_d1 = scale_dim1_reference_c(x, BLOCK_SIZE)
+
+        for _ in range(2):
+            __ = scale_dim1_reference_c(x, BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            scale_dim1_reference_c, x, BLOCK_SIZE
+        )
+
+        assert y_d1.dtype == torch.bfloat16
+        assert s_d1.dtype == torch.bfloat16
+        bytes_rw = sum(t.numel() for t in [x, y_d1, s_d1]) * bytes_per_el_bf16
+        bps = bytes_rw / (time_us / 1e6)
+
+    elif mode == "dim0_dim1":
+        scale_dim0_dim1_reference_c = torch.compile(scale_dim0_dim1_reference)
+        y_d0, y_d1, s_d0, s_d1 = scale_dim0_dim1_reference_c(x, BLOCK_SIZE)
+
+        for _ in range(2):
+            __ = scale_dim0_dim1_reference_c(x, BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            scale_dim0_dim1_reference_c, x, BLOCK_SIZE
+        )
+
+        assert y_d0.dtype == torch.bfloat16
+        assert s_d0.dtype == torch.bfloat16
+        assert y_d1.dtype == torch.bfloat16
+        assert s_d1.dtype == torch.bfloat16
+        bytes_rw = (
+            sum(t.numel() for t in [x, y_d0, y_d1, s_d0, s_d1]) * bytes_per_el_bf16
+        )
+        bps = bytes_rw / (time_us / 1e6)
+
+    elif mode == "dim0_mx":
+        to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
+        y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE)
+
+        for _ in range(2):
+            __ = to_mx_dim0_reference_c(x, BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            to_mx_dim0_reference_c, x, BLOCK_SIZE
+        )
+
+        assert y_d0.dtype == torch.float8_e4m3fn
+        assert s_d0.dtype == torch.uint8
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim1_mx":
+        to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
+        y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE)
+
+        for _ in range(2):
+            __ = to_mx_dim1_reference_c(x, BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            to_mx_dim1_reference_c, x, BLOCK_SIZE
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.uint8
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim1_mx_triton":
+        y_d1, s_d1 = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
+
+        for _ in range(2):
+            __ = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
+        time_us = benchmark_cuda_function_in_microseconds(
+            triton_to_mxfp8_dim1, x, inner_block_size=BLOCK_SIZE
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    else:
+        raise AssertionError(f"unknown mode {mode}")
+
+    print("time_us", time_us)
+    print("mem_bw_gbps", bps / 1e9)
+
+
+if __name__ == "__main__":
+    fire.Fire(run)  # hand `run` to python-fire so its parameters can be set from the command line
@@ -119,9 +119,10 @@ def aot_compile(
             "triton.cudagraphs": True,
         }
 
-    from torch.export import export_for_inference
+    from torch.export import export_for_training
 
-    exported = export_for_inference(fn, sample_args, sample_kwargs)
+    exported = export_for_training(fn, sample_args, sample_kwargs, strict=True)
+    exported.run_decompositions()
     output_path = torch._inductor.aoti_compile_and_package(
         exported,
         package_path=str(path),
 
@@ -82,9 +82,10 @@ def aot_compile(
             "triton.cudagraphs": True,
         }
 
-    from torch.export import export_for_inference
+    from torch.export import export_for_training
 
-    exported = export_for_inference(fn, sample_args, sample_kwargs)
+    exported = export_for_training(fn, sample_args, sample_kwargs, strict=True)
+    exported.run_decompositions()
     output_path = torch._inductor.aoti_compile_and_package(
         exported,
         package_path=str(path),
 
@@ -299,18 +299,6 @@ def get_extensions():
             extra_compile_args["nvcc"].append("-g")
             extra_link_args.append("/DEBUG")
 
-    curdir = os.path.dirname(os.path.curdir)
-    extensions_dir = os.path.join(curdir, "torchao", "csrc")
-    sources = list(glob.glob(os.path.join(extensions_dir, "**/*.cpp"), recursive=True))
-
-    extensions_cuda_dir = os.path.join(extensions_dir, "cuda")
-    cuda_sources = list(
-        glob.glob(os.path.join(extensions_cuda_dir, "**/*.cu"), recursive=True)
-    )
-
-    if use_cuda:
-        sources += cuda_sources
-
     # Get base directory and source paths
     curdir = os.path.dirname(os.path.curdir)
     extensions_dir = os.path.join(curdir, "torchao", "csrc")