Add benchmark script

Tcc0403 · Tcc0403 · commit cae59315e4df · 2025-03-22T01:34:37.000+08:00
diff --git a/benchmark/scripts/benchmark_dyt.py b/benchmark/scripts/benchmark_dyt.py
@@ -0,0 +1,147 @@
+import os
+import sys
+
+import torch
+import triton
+
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
+
+from liger_kernel.utils import infer_device
+
+# Device identifier returned by liger_kernel's ``infer_device()``; every
+# benchmark tensor and module below is placed on it.
+device = infer_device()
+
+# Put the directory two levels above this script on sys.path so that the
+# benchmark functions can import from the ``test`` package at call time.
+_script_dir = os.path.dirname(__file__)
+_repo_root = os.path.abspath(os.path.join(_script_dir, "../.."))
+sys.path.insert(0, _repo_root)
+
+
+def bench_speed_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    """Time the DyT forward, backward or forward+backward pass for one provider.
+
+    Args:
+        input: Benchmark point. ``input.x`` is used as the hidden size,
+            ``input.kernel_provider`` selects ``"liger"``, ``"torch"`` or
+            ``"torch_compile"``, ``input.kernel_operation_mode`` selects
+            ``"forward"``, ``"backward"`` or ``"full"``, and
+            ``input.extra_benchmark_config`` supplies ``"BT"`` (number of
+            rows of the input tensor) and ``"dtype"`` (its dtype).
+
+    Returns:
+        A ``SingleBenchmarkRunOutput`` whose ``y_20``/``y_50``/``y_80`` hold
+        the three values ``triton.testing.do_bench`` returns for
+        ``QUANTILES`` (unpacked as 50th, 20th, 80th -- assumes ``QUANTILES``
+        is ordered that way; confirm in ``utils``).
+
+    Raises:
+        ValueError: If the provider or the operation mode is not recognised.
+    """
+    # Imported lazily: the ``test`` package is only importable once this
+    # script has put the repository root on sys.path.
+    from test.transformers.test_dyt import LigerDyT
+    from test.transformers.test_dyt import TorchDyT
+
+    hidden_size = input.x
+    provider = input.kernel_provider
+    mode = input.kernel_operation_mode
+    extra_benchmark_config = input.extra_benchmark_config
+    BT = extra_benchmark_config["BT"]
+    dtype = extra_benchmark_config["dtype"]
+
+    # Build only the requested module, and build it once. Constructing the
+    # module (and re-wrapping it with torch.compile) inside the timed callable
+    # would add module construction, the parameter transfer to the device and
+    # possibly repeated compilation to every measured iteration.
+    if provider == "liger":
+        dyt = LigerDyT(hidden_size=hidden_size).to(device)
+    elif provider == "torch":
+        dyt = TorchDyT(hidden_size=hidden_size).to(device)
+    elif provider == "torch_compile":
+        dyt = torch.compile(TorchDyT(hidden_size=hidden_size).to(device))
+    else:
+        raise ValueError(f"Unsupported kernel provider: {provider!r}")
+
+    x = torch.randn((BT, hidden_size), dtype=dtype, device=device)
+    dy = torch.randn_like(x)
+    x.requires_grad_(True)
+
+    def fwd():
+        return dyt(x)
+
+    if mode == "forward":
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500
+        )
+    elif mode == "backward":
+        # Run the forward pass once up front so only backward is timed;
+        # retain_graph=True lets the same graph be replayed every iteration.
+        y = fwd()
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            lambda: y.backward(dy, retain_graph=True),
+            quantiles=QUANTILES,
+            grad_to_none=[x],
+            rep=500,
+        )
+    elif mode == "full":
+
+        def full():
+            y = fwd()
+            y.backward(dy)
+
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            full, quantiles=QUANTILES, grad_to_none=[x], rep=500
+        )
+    else:
+        raise ValueError(f"Unsupported kernel operation mode: {mode!r}")
+
+    return SingleBenchmarkRunOutput(
+        y_20=ms_20,
+        y_50=ms_50,
+        y_80=ms_80,
+    )
+
+
+def bench_memory_dyt(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    """Measure memory use of one DyT forward+backward pass for one provider.
+
+    Args:
+        input: Benchmark point. ``input.x`` is used as the hidden size,
+            ``input.kernel_provider`` selects ``"liger"``, ``"torch"`` or
+            ``"torch_compile"``, and ``input.extra_benchmark_config``
+            supplies ``"BT"`` (number of rows of the input tensor) and
+            ``"dtype"`` (its dtype).
+
+    Returns:
+        A ``SingleBenchmarkRunOutput`` whose ``y_20``/``y_50``/``y_80`` hold
+        the three values ``_test_memory`` returns for ``QUANTILES``
+        (unpacked as 50th, 20th, 80th -- assumes ``QUANTILES`` is ordered
+        that way; confirm in ``utils``).
+
+    Raises:
+        ValueError: If the provider is not recognised.
+    """
+    # Imported lazily: the ``test`` package is only importable once this
+    # script has put the repository root on sys.path.
+    from test.transformers.test_dyt import LigerDyT
+    from test.transformers.test_dyt import TorchDyT
+
+    hidden_size = input.x
+    provider = input.kernel_provider
+    extra_benchmark_config = input.extra_benchmark_config
+    BT = extra_benchmark_config["BT"]
+    dtype = extra_benchmark_config["dtype"]
+
+    # Build only the requested module, and build it once, so the measured
+    # callable does not re-create parameters (or re-wrap the module with
+    # torch.compile) on every invocation.
+    if provider == "liger":
+        dyt = LigerDyT(hidden_size=hidden_size).to(device)
+    elif provider == "torch":
+        dyt = TorchDyT(hidden_size=hidden_size).to(device)
+    elif provider == "torch_compile":
+        dyt = torch.compile(TorchDyT(hidden_size=hidden_size).to(device))
+    else:
+        raise ValueError(f"Unsupported kernel provider: {provider!r}")
+
+    x = torch.randn((BT, hidden_size), dtype=dtype, device=device)
+    dy = torch.randn_like(x)
+    x.requires_grad_(True)
+
+    def full():
+        y = dyt(x)
+        y.backward(dy, retain_graph=True)
+
+    mem_50, mem_20, mem_80 = _test_memory(full, quantiles=QUANTILES)
+    return SingleBenchmarkRunOutput(
+        y_20=mem_20,
+        y_50=mem_50,
+        y_80=mem_80,
+    )
+
+
+if __name__ == "__main__":
+    cli_args = parse_benchmark_script_args()
+
+    # Settings shared by the speed sweep and the memory sweep: hidden sizes
+    # 2**10 .. 2**14, all three providers, one (BT, dtype) configuration.
+    shared_settings = dict(
+        kernel_name="dyt",
+        x_name="hidden_size",
+        x_label="hidden size",
+        x_values=[2**exponent for exponent in range(10, 15)],
+        kernel_providers=["liger", "torch", "torch_compile"],
+        extra_benchmark_configs=[{"BT": 4096, "dtype": torch.float32}],
+        overwrite=cli_args.overwrite,
+    )
+
+    # (benchmark function, operation modes, metric name, metric unit),
+    # executed in this order.
+    sweeps = (
+        (bench_speed_dyt, ["forward", "backward", "full"], "speed", "ms"),
+        (bench_memory_dyt, ["full"], "memory", "MB"),
+    )
+    for bench_fn, operation_modes, metric, unit in sweeps:
+        run_benchmarks(
+            bench_test_fn=bench_fn,
+            kernel_operation_modes=operation_modes,
+            metric_name=metric,
+            metric_unit=unit,
+            **shared_settings,
+        )
diff --git a/test/transformers/test_dyt.py b/test/transformers/test_dyt.py
@@ -13,15 +13,14 @@
 
 
 class TorchDyT(nn.Module):
+    """Plain PyTorch DyT module computing ``gamma * tanh(alpha * x) + beta``.
+
+    ``alpha`` is a one-element learnable parameter initialised to
+    ``init_alpha``; ``gamma`` and ``beta`` are learnable vectors of length
+    ``hidden_size`` initialised to ones and zeros respectively.
+    """
+
-    def __init__(self, hidden_size, init_alpha, dtype):
+    def __init__(self, hidden_size, init_alpha=0.5):
         super().__init__()
         self.alpha = nn.Parameter(torch.ones(1) * init_alpha)
         self.gamma = nn.Parameter(torch.ones(hidden_size))
         self.beta = nn.Parameter(torch.zeros(hidden_size))
-        self.dtype = dtype
 
     def forward(self, x):
-        return (self.gamma * torch.tanh((self.alpha * x).to(torch.float32)) + self.beta).to(self.dtype)
+        return self.gamma * torch.tanh(self.alpha * x) + self.beta
 
 
 set_seed(42)
@@ -55,12 +54,16 @@ def test_liger_dyt_correctness(B, T, hidden_size, init_alpha, dtype, atol, rtol)
     gamma = torch.randn(hidden_size, device=device, dtype=dtype)
     beta = torch.randn(hidden_size, device=device, dtype=dtype)
 
-    torch_dyt = TorchDyT(hidden_size=hidden_size, init_alpha=init_alpha, dtype=dtype).to(device).to(dtype)
+    torch_dyt = (
+        TorchDyT(hidden_size=hidden_size, init_alpha=init_alpha).to(device).to(dtype)
+    )
     torch_dyt.alpha.data = alpha.clone()
     torch_dyt.gamma.data = gamma.clone()
     torch_dyt.beta.data = beta.clone()
 
-    liger_dyt = LigerDyT(hidden_size=hidden_size, init_alpha=init_alpha).to(device).to(dtype)
+    liger_dyt = (
+        LigerDyT(hidden_size=hidden_size, init_alpha=init_alpha).to(device).to(dtype)
+    )
     liger_dyt.alpha.data = alpha.clone()
     liger_dyt.gamma.data = gamma.clone()
     liger_dyt.beta.data = beta.clone()
@@ -75,9 +78,15 @@ def test_liger_dyt_correctness(B, T, hidden_size, init_alpha, dtype, atol, rtol)
     liger_output.backward(grad_output)
 
     assert_verbose_allclose(x1.grad, x2.grad, rtol=rtol, atol=atol)
-    assert_verbose_allclose(torch_dyt.alpha.grad, liger_dyt.alpha.grad, rtol=rtol, atol=atol)
-    assert_verbose_allclose(torch_dyt.gamma.grad, liger_dyt.gamma.grad, rtol=rtol, atol=atol)
-    assert_verbose_allclose(torch_dyt.beta.grad, liger_dyt.beta.grad, rtol=rtol, atol=atol)
+    assert_verbose_allclose(
+        torch_dyt.alpha.grad, liger_dyt.alpha.grad, rtol=rtol, atol=atol
+    )
+    assert_verbose_allclose(
+        torch_dyt.gamma.grad, liger_dyt.gamma.grad, rtol=rtol, atol=atol
+    )
+    assert_verbose_allclose(
+        torch_dyt.beta.grad, liger_dyt.beta.grad, rtol=rtol, atol=atol
+    )
 
 
 @pytest.mark.parametrize(
@@ -99,7 +108,9 @@ def test_liger_dyt_correctness(B, T, hidden_size, init_alpha, dtype, atol, rtol)
             torch.bfloat16,
             1e-8,
             5e-2,
-            marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+            marks=pytest.mark.skipif(
+                not supports_bfloat16(), reason="bfloat16 not supported on this GPU"
+            ),
         ),
     ],
 )
@@ -128,8 +139,8 @@ def test_liger_dyt_functional(B, T, hidden_size, dtype, atol, rtol):
     assert_verbose_allclose(output1, output2, rtol=rtol, atol=atol)
 
     grad_output = torch.randn_like(_input)
-    output1.backward(grad_output, retain_graph=True)
-    output2.backward(grad_output, retain_graph=True)
+    output1.backward(grad_output)
+    output2.backward(grad_output)
 
     assert_verbose_allclose(x1.grad, x2.grad, rtol=rtol, atol=atol)
     assert_verbose_allclose(alpha1.grad, alpha2.grad, rtol=rtol, atol=atol)