
Commit 2e1ea33

Fix issue with fp64 constants (#506)
1 parent 2cae9ed commit 2e1ea33
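
This change stops Python float constants from forcing float64 in generated Triton code: Inductor's default expression printer renders a SymPy Float as a 0-D value via tl.full([], <val>, tl.float64), so any Helion kernel expression containing a Python float was silently upcast. The commit installs a Helion-specific printer that emits the bare literal and lets ordinary dtype promotion pick the precision, with a regression test for https://github.com/pytorch/helion/issues/493.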

File tree

4 files changed: +62 -4 lines changed

helion/_compiler/device_function.py

Lines changed: 20 additions & 1 deletion
@@ -14,7 +14,7 @@
 
 import sympy
 import torch
-from torch._inductor.codegen.triton import texpr
+from torch._inductor.codegen.triton import TritonPrinter
 from torch.fx.graph import _Namespace
 
 from .._compat import get_tensor_descriptor_fn_name

@@ -599,3 +599,22 @@ def current() -> DeviceFunction:
             return tls.functions[-1]
         except (AttributeError, IndexError):
             raise NoCurrentFunction from None
+
+
+class HelionTritonPrinter(TritonPrinter):
+    """Custom Triton printer that avoids wrapping float literals in tl.full().
+
+    Inductor's default TritonPrinter prints SymPy Float as a 0-D Triton value
+    via tl.full([], <val>, tl.float64). We override this to emit the raw numeric
+    literal, letting downstream type promotion and casts handle dtype.
+    """
+
+    def _print_Float(self, expr: sympy.Expr) -> str:
+        return str(expr)
+
+    def _print_ToFloat(self, expr: sympy.Expr) -> str:
+        return f"{expr} + 0.0"
+
+
+def texpr(expr: sympy.Expr) -> str:
+    return HelionTritonPrinter().doprint(expr)
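
To see the effect of the new printer in isolation, a hypothetical snippet (not part of this commit; it assumes torch and a post-commit helion checkout are importable) that prints the same SymPy constant through both printers:

import sympy
from torch._inductor.codegen.triton import TritonPrinter
from helion._compiler.device_function import HelionTritonPrinter

expr = sympy.Float(0.001953125)

# Inductor's default printer wraps the constant as a 0-D fp64 value,
# along the lines of: tl.full([], 0.001953125, tl.float64)
print(TritonPrinter().doprint(expr))

# The Helion override prints the bare literal, so the surrounding
# expression's dtype promotion decides the final precision.
print(HelionTritonPrinter().doprint(expr))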

test/test_broadcasting.expected

Lines changed: 21 additions & 0 deletions
@@ -246,3 +246,24 @@ def fn(a, b, *, _launcher=_default_launcher):
     _BLOCK_SIZE_1 = 16
     _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0) * triton.cdiv(a.size(1), _BLOCK_SIZE_1),), a, b, out, a.size(0), a.size(1), a.stride(0), a.stride(1), b.stride(0), out.stride(0), out.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
     return out
+
+--- assertExpectedJournal(TestBroadcasting.test_python_float_promotion)
+from __future__ import annotations
+
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _helion_fn(a, a_size_0, a_stride_0, beta, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    b = tl.load(tl.make_block_ptr(a, [a_size_0], [a_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), boundary_check=[0], padding_option='zero')
+    sub = 1.0 + -1 * beta
+    v_0 = b * sub
+    tl.store(tl.make_block_ptr(a, [a_size_0], [a_stride_0], [offset_0], [_BLOCK_SIZE_0], [0]), v_0, boundary_check=[0])
+
+def fn(a, beta, *, _launcher=_default_launcher):
+    _BLOCK_SIZE_0 = 16
+    _launcher(_helion_fn, (triton.cdiv(a.size(0), _BLOCK_SIZE_0),), a, a.size(0), a.stride(0), beta, _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return a
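
Note that in this expected journal the Python float appears as the plain scalar expression sub = 1.0 + -1 * beta rather than an fp64 tl.full constant, so v_0 keeps the input's float32 dtype.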

test/test_broadcasting.py

Lines changed: 20 additions & 0 deletions
@@ -106,6 +106,26 @@ def fn(a, b):
         torch.testing.assert_close(out, sum(args))
         self.assertExpectedJournal(code)
 
+    def test_python_float_promotion(self):
+        # Repro for https://github.com/pytorch/helion/issues/493
+        # Python floats should follow PyTorch type promotion (no unintended fp64 upcast)
+        @helion.kernel(config={"block_size": 16, "indexing": "block_ptr"})
+        def fn(a, beta):
+            for tile0 in hl.tile(a.shape[0]):
+                b = a[tile0]
+                a[tile0] = (1 - beta) * b
+            return a
+
+        a = torch.randn(1024, device=DEVICE)
+        beta = 1.5
+        args = (a, beta)
+
+        # Expected behavior matches PyTorch promotion rules on tensors
+        expected = (1 - beta) * a
+        code, out = code_and_output(fn, args)
+        torch.testing.assert_close(out, expected)
+        self.assertExpectedJournal(code)
+
 
 if __name__ == "__main__":
     unittest.main()
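
For reference, the promotion rule the test relies on can be checked with plain PyTorch (a standalone sketch, independent of Helion): Python scalars are "weak" in type promotion and do not upcast a float32 tensor.

import torch

a = torch.randn(4, dtype=torch.float32)
beta = 1.5

# (1 - beta) is a Python float; PyTorch treats it as a weak scalar,
# so the product stays float32 instead of being upcast to float64.
result = (1 - beta) * a
assert result.dtype == torch.float32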

test/test_specialize.expected

Lines changed: 1 addition & 3 deletions
@@ -79,9 +79,7 @@ def _helion_fn(x, out, x_size_0, out_stride_0, out_stride_1, x_stride_0, x_strid
     mask_0 = indices_0 < x_size_0
     indices_1 = tl.arange(0, _RDIM_SIZE_1).to(tl.int32)
     mask_1 = indices_1 < 500
-    sym_float = tl.full([], 512.0, tl.float64)
-    truediv = tl.full([], 0.001953125, tl.float64)
-    acc = tl.full([_BLOCK_SIZE_0, 512], truediv, tl.float32)
+    acc = tl.full([_BLOCK_SIZE_0, 512], 0.001953125, tl.float32)
     acc2 = tl.full([512, 512], 1.0, tl.float32)
     _mask_to = tl.where(tl.broadcast_to(mask_0[:, None], [_BLOCK_SIZE_0, 512]), acc, 0)
     acc_1 = tl.dot(_mask_to, acc2, input_precision='tf32')
