
# Commit 4854fd4: add xpu tuning to FLJSD (#647)
## Summary

Tuning on XPU: in fused linear JSD, if the device is XPU, set `MAX_FUSED_SIZE` to 4096 instead of the default `65536 // 2`. This gives slightly better performance on XPU. Very similar to #645.

## Testing Done

- Hardware Type: Intel(R) Data Center GPU Max 1550
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence
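For context, `MAX_FUSED_SIZE` is a cap on the Triton block size rather than the block size itself. A minimal sketch, assuming the usual liger_kernel pattern of rounding the row width up to a power of two and clamping to the cap (`pick_block_size` and `n_cols` are illustrative names, not from this commit):

```python
import triton

# XPU value introduced by this commit; the CUDA default stays 65536 // 2.
MAX_FUSED_SIZE = 4096

def pick_block_size(n_cols: int) -> int:
    # Round the row width (e.g. vocab size) up to a power of two, then clamp
    # to the cap so the kernel stays within the device's register budget.
    return min(MAX_FUSED_SIZE, triton.next_power_of_2(n_cols))
```

Because the constant is assigned at module import time (see the diff below), the XPU cap applies process-wide rather than per call.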
1 parent bebe030 · commit 4854fd4

1 file changed: `src/liger_kernel/ops/fused_linear_jsd.py` (2 additions, 1 deletion)
```diff
@@ -8,11 +8,12 @@
 from liger_kernel.ops.utils import amp_custom_fwd
 from liger_kernel.ops.utils import element_mul_kernel
 from liger_kernel.ops.utils import is_hip
+from liger_kernel.utils import infer_device
 
 # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
 # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
 # The optimal maximum block size depends on your hardware, your kernel, and your dtype
-MAX_FUSED_SIZE = 65536 // 2
+MAX_FUSED_SIZE = 4096 if infer_device() == "xpu" else 65536 // 2
 
 
 def fused_linear_jsd_forward(
```
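The gating relies on `liger_kernel.utils.infer_device`. As a plausible sketch only, assuming it follows the common pattern of probing torch backends in priority order (the function body below is illustrative, not verbatim library code):

```python
import torch

def infer_device_sketch() -> str:
    # Probe accelerator backends in priority order and fall back to CPU.
    if torch.cuda.is_available():  # NVIDIA (or AMD via the HIP build)
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel GPUs
        return "xpu"
    return "cpu"
```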
