Skip to content

Commit 4ffd828

Browse files
Implement FMOD as LLK op (#37050)
### Ticket #35977 , #33746 ### Problem description Provide context for the problem. ### What's changed - Implement binary FMOD as LLK op as part of Migration - Replaced PCC checks with ULP / allclose checks - ULP Testing : [Link](https://docs.google.com/spreadsheets/d/1M-Z-DMHojp6AsCNpBAhljFOmw9t_n3p51agLnGc9hhM/edit?gid=1453420635#gid=1453420635) - Perf Comparison for L1 Interleaved config (Device Kernel Duration) <google-sheets-html-origin><!--td {border: 1px solid #cccccc;}br {mso-data-placement:same-cell;}--> Shape | Main branch DKD (ns) | Proposed branch DKD (ns) | Speedup ((main - proposed) / proposed) -- | -- | -- | -- Single tile (32, 32) | 19682 | 4614 | 326.57% 8 tile (64, 128) | 27016 | 4811 | 461.55% ### Checklist - [x] [![All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml/badge.svg?branch=virdhatchani/Migrate_LLK_FMOD)](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml?query=branch:virdhatchani/Migrate_LLK_FMOD) - [x] [![Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml/badge.svg?branch=virdhatchani/Migrate_LLK_FMOD)](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml?query=branch:virdhatchani/Migrate_LLK_FMOD) - Passed as in main <!-- [Main](https://github.com/tenstorrent/tt-metal/actions/runs/22015627012) --> - [x] [![cpp-unit-tests](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml/badge.svg?branch=virdhatchani/Migrate_LLK_FMOD)](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml?query=branch:virdhatchani/Migrate_LLK_FMOD) - Passed as in main <!-- [Main](https://github.com/tenstorrent/tt-metal/actions/runs/22015629780) --> - [x] New/Existing tests provide coverage for changes #### Model tests - [x] [![(Single) Choose your 
pipeline](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select.yaml/badge.svg?branch=virdhatchani/Migrate_LLK_FMOD)](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select.yaml?query=branch:virdhatchani/Migrate_LLK_FMOD) - Passed as in main <!-- [Main](https://github.com/tenstorrent/tt-metal/actions/runs/22015805040) --> - [x] [![(Galaxy) Choose your pipeline](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-galaxy.yaml/badge.svg?branch=virdhatchani/Migrate_LLK_FMOD)](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-galaxy.yaml?query=branch:virdhatchani/Migrate_LLK_FMOD) - Passed as in main <!-- [Main](https://github.com/tenstorrent/tt-metal/actions/runs/22015930568) -->
1 parent 72e5195 commit 4ffd828

File tree

22 files changed

+421
-154
lines changed

22 files changed

+421
-154
lines changed

tests/ttnn/unit_tests/operations/eltwise/test_binary_composite.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,8 @@ def test_binary_fmod_decimal_ttnn(input_shapes, device):
456456
golden_function = ttnn.get_golden_function(ttnn.fmod)
457457
golden_tensor = golden_function(in_data1, in_data2, device=device)
458458

459-
comp_pass = compare_pcc([output_tensor], [golden_tensor], 0.9999)
460-
assert comp_pass
459+
output_torch = ttnn.to_torch(output_tensor)
460+
assert torch.allclose(output_torch, golden_tensor, rtol=5e-2, atol=1e-5)
461461

462462

463463
@pytest.mark.parametrize(
@@ -476,7 +476,7 @@ def test_fmod_ttnn(input_shapes, device):
476476
golden_function = ttnn.get_golden_function(ttnn.fmod)
477477
golden_tensor = golden_function(in_data1, scalar, device=device)
478478

479-
comp_pass = compare_pcc([output_tensor], [golden_tensor])
479+
comp_pass = assert_with_ulp(golden_tensor, output_tensor, 1)
480480
assert comp_pass, f"Failed for scalar={scalar}"
481481

482482

tests/ttnn/unit_tests/operations/eltwise/test_div_ops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from models.common.utility_functions import torch_random
1010
from functools import partial
1111
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
12+
from tests.ttnn.utils_for_testing import assert_with_ulp
1213

1314
pytestmark = pytest.mark.use_module_device
1415

@@ -141,8 +142,7 @@ def test_binary_fmod_bf16(
141142
output = ttnn.fmod(input_tensor_a, input_tensor_b)
142143
output = ttnn.to_torch(output)
143144

144-
pcc = ttnn.pearson_correlation_coefficient(torch_output_tensor, output)
145-
assert pcc >= 0.99
145+
assert_with_ulp(torch_output_tensor, output, 1)
146146

147147

148148
# This test was added for #17362

tests/ttnn/unit_tests/operations/eltwise/test_fmod.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import torch
66
import pytest
77
import ttnn
8-
from tests.ttnn.nightly.unit_tests.operations.eltwise.backward.utility_funcs import compare_equal
8+
from tests.ttnn.utils_for_testing import assert_with_ulp
99

1010

1111
@pytest.mark.parametrize(
@@ -44,3 +44,24 @@ def test_fmod_nan(testing_dtype, device):
4444
output_tensor = ttnn.to_torch(tt_result)
4545

4646
assert torch.equal(torch.isnan(golden), torch.isnan(output_tensor))
47+
48+
49+
@pytest.mark.parametrize("dtype", ["bfloat16", "float32"])
50+
def test_fmod_binary_accuracy(device, dtype):
51+
"""Test fmod binary operation with specific values."""
52+
torch_dtype = getattr(torch, dtype)
53+
ttnn_dtype = getattr(ttnn, dtype)
54+
55+
torch_input_a = torch.tensor([[5.0, 7.0, -5.0, -7.0, 3.5, 10.0, 1.5, -1.5, 9.0, 15.0]], dtype=torch_dtype)
56+
torch_input_b = torch.tensor([[2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 0.5, 0.5, -2.0, -4.0]], dtype=torch_dtype)
57+
58+
golden_fn = ttnn.get_golden_function(ttnn.fmod)
59+
golden = golden_fn(torch_input_a, torch_input_b, device=device)
60+
61+
input_tensor_a = ttnn.from_torch(torch_input_a, dtype=ttnn_dtype, layout=ttnn.TILE_LAYOUT, device=device)
62+
input_tensor_b = ttnn.from_torch(torch_input_b, dtype=ttnn_dtype, layout=ttnn.TILE_LAYOUT, device=device)
63+
64+
output = ttnn.fmod(input_tensor_a, input_tensor_b)
65+
output = ttnn.to_torch(output)
66+
67+
assert_with_ulp(golden, output, 1)

tt_metal/hw/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ target_sources(
611611
inc/api/compute/eltwise_unary/where.h
612612
inc/api/compute/ema.h
613613
inc/api/compute/experimental/mul_reduce_scalar.h
614-
inc/api/compute/fmod_int32.h
614+
inc/api/compute/binary_fmod.h
615615
inc/api/compute/gcd.h
616616
inc/api/compute/layernorm.h
617617
inc/api/compute/lcm.h
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
2+
//
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "ckernel.h"
8+
#include "ckernel_defs.h"
9+
#include "ckernel_sfpu_remainder_int32.h"
10+
#include "sfpi.h"
11+
12+
namespace ckernel::sfpu {
13+
14+
// FMOD = a - trunc(a / b) * b
15+
// Implemented using 32-bit integer remainder kernel (see ckernel_sfpu_remainder_int32.h)
16+
sfpi_inline void calculate_fmod_int32_body(
17+
const uint dst_index_in0, const uint dst_index_in1, const uint dst_index_out) {
18+
// size of each tile in Dest is 64/SFP_DESTREG_STRIDE = 32 rows when using sfpi to load/store
19+
constexpr uint dst_tile_size_sfpi = 32;
20+
21+
// Read inputs
22+
sfpi::vInt a_signed = sfpi::dst_reg[dst_index_in0 * dst_tile_size_sfpi];
23+
sfpi::vInt b_signed = sfpi::dst_reg[dst_index_in1 * dst_tile_size_sfpi];
24+
25+
// Compute unsigned remainder
26+
sfpi::vInt r = compute_unsigned_remainder_int32(a_signed, b_signed);
27+
28+
// FMOD sign handling (result has the same sign as a)
29+
v_if(a_signed < 0) { r = -r; }
30+
v_endif;
31+
32+
sfpi::dst_reg[dst_index_out * dst_tile_size_sfpi] = r;
33+
}
34+
35+
template <bool is_fp32_dest_acc_en = false>
36+
sfpi_inline sfpi::vFloat _sfpu_binary_fmod_(sfpi::vFloat in0, sfpi::vFloat in1) {
37+
// fmod(a, b) = a - trunc(a/b) * b
38+
39+
sfpi::vFloat a = in0;
40+
sfpi::vFloat b = in1;
41+
sfpi::vFloat b_abs = sfpi::abs(b);
42+
43+
// Compute reciprocal 1/b
44+
sfpi::vFloat recip = ckernel::sfpu::_sfpu_reciprocal_<2>(b);
45+
46+
// Compute a/b = a * (1/b)
47+
sfpi::vFloat div_result = a * recip;
48+
49+
// Compute trunc(a/b)
50+
// Input in LReg0, output in LReg1. LReg2/LReg3 are clobbered by _trunc_body_(),
51+
// so we must read them to inform the SFPI register allocator they are not immediately available.
52+
sfpi::l_reg[sfpi::LRegs::LReg0] = div_result;
53+
_trunc_body_();
54+
sfpi::vFloat trunc_div = sfpi::l_reg[sfpi::LRegs::LReg1];
55+
sfpi::vFloat tmp2 = sfpi::l_reg[sfpi::LRegs::LReg2];
56+
sfpi::vFloat tmp3 = sfpi::l_reg[sfpi::LRegs::LReg3];
57+
58+
// Compute fmod = a - trunc(a/b) * b
59+
sfpi::vFloat result = a - trunc_div * b;
60+
61+
// Post-correction - fmod result must satisfy |result| < |b|
62+
// If |result| >= |b|, the truncation was wrong by 1
63+
sfpi::vFloat result_abs = sfpi::abs(result);
64+
65+
// If result >= b, we truncated too low, add/subtract b to correct
66+
v_if(result_abs >= b_abs) {
67+
// Determine correction direction based on sign of result
68+
v_if(result >= sfpi::vFloat(0.0f)) {
69+
result = result - b_abs; // result was positive and too big
70+
}
71+
v_else {
72+
result = result + b_abs; // result was negative and too big (magnitude)
73+
}
74+
v_endif;
75+
}
76+
v_endif;
77+
78+
// Sign correction - fmod result must have same sign as 'a' (or be zero)
79+
// If a > 0 and result < 0, the truncation was 1 too high, need to add b
80+
// If a < 0 and result > 0, the truncation was 1 too low, need to subtract b
81+
// This fixes cases where a/b ≈ 0.9999999 but rounds to 1 due to reciprocal error
82+
v_if(a >= sfpi::vFloat(0.0f)) {
83+
// a is positive, result should be >= 0
84+
v_if(result < sfpi::vFloat(0.0f)) {
85+
result = result + b_abs; // over-truncated
86+
}
87+
v_endif;
88+
}
89+
v_else {
90+
// a is negative, result should be <= 0
91+
v_if(result > sfpi::vFloat(0.0f)) {
92+
result = result - b_abs; // under-truncated
93+
}
94+
v_endif;
95+
}
96+
v_endif;
97+
98+
// Handle special cases using conditional assignment (NOT early return!)
99+
// When a == b, fmod(a, b) = 0
100+
v_if(a == b) { result = sfpi::vFloat(0.0f); }
101+
v_endif;
102+
103+
// Handle division by zero - return NaN
104+
v_if(b == sfpi::vFloat(0.0f)) { result = sfpi::vFloat(std::numeric_limits<float>::quiet_NaN()); }
105+
v_endif;
106+
107+
if constexpr (!is_fp32_dest_acc_en) {
108+
result = reinterpret<sfpi::vFloat>(sfpi::float_to_fp16b(result, 0));
109+
}
110+
111+
return result;
112+
}
113+
114+
template <bool APPROXIMATION_MODE, int ITERATIONS>
115+
inline void calculate_fmod_int32(const uint dst_index_in0, const uint dst_index_in1, const uint dst_index_out) {
116+
#pragma GCC unroll 8
117+
for (int d = 0; d < ITERATIONS; d++) {
118+
calculate_fmod_int32_body(dst_index_in0, dst_index_in1, dst_index_out);
119+
sfpi::dst_reg++;
120+
}
121+
}
122+
123+
template <bool APPROXIMATION_MODE, int ITERATIONS = 8, bool is_fp32_dest_acc_en = false>
124+
inline void calculate_sfpu_binary_fmod(const uint dst_index_in0, const uint dst_index_in1, const uint dst_index_out) {
125+
for (int d = 0; d < ITERATIONS; d++) {
126+
// size of each tile in Dest is 64/SFP_DESTREG_STRIDE = 32 rows when using sfpi to load/store
127+
constexpr uint dst_tile_size_sfpi = 32;
128+
sfpi::vFloat in0 = sfpi::dst_reg[dst_index_in0 * dst_tile_size_sfpi];
129+
sfpi::vFloat in1 = sfpi::dst_reg[dst_index_in1 * dst_tile_size_sfpi];
130+
131+
sfpi::vFloat result = _sfpu_binary_fmod_<is_fp32_dest_acc_en>(in0, in1);
132+
133+
sfpi::dst_reg[dst_index_out * dst_tile_size_sfpi] = result;
134+
sfpi::dst_reg++;
135+
}
136+
}
137+
138+
template <bool APPROXIMATION_MODE>
139+
inline void fmod_int32_init() {
140+
div_floor_init<APPROXIMATION_MODE>();
141+
}
142+
143+
template <bool APPROXIMATION_MODE>
144+
inline void fmod_binary_init() {
145+
_init_sfpu_reciprocal_<false>();
146+
}
147+
148+
} // namespace ckernel::sfpu

tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_fmod_int32.h

Lines changed: 0 additions & 49 deletions
This file was deleted.

tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_fmod_int32.h renamed to tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/llk_math_eltwise_binary_sfpu_binary_fmod.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
#include "llk_math_eltwise_binary_sfpu_init.h"
88
#include "llk_math_eltwise_binary_sfpu_params.h"
9-
#include "ckernel_sfpu_fmod_int32.h"
9+
#include "ckernel_sfpu_binary_fmod.h"
1010

1111
namespace ckernel {
1212

@@ -22,4 +22,20 @@ inline void llk_math_eltwise_binary_sfpu_fmod_int32(
2222
sfpu::calculate_fmod_int32<APPROXIMATE, 8>, dst_index0, dst_index1, odst, vector_mode);
2323
}
2424

25+
template <bool APPROXIMATE>
26+
inline void llk_math_eltwise_binary_sfpu_binary_fmod_init() {
27+
llk_math_eltwise_binary_sfpu_init<SfpuType::unused, APPROXIMATE>(sfpu::fmod_binary_init<APPROXIMATE>);
28+
}
29+
30+
template <bool APPROXIMATE, bool is_fp32_dest_acc_en = false>
31+
inline void llk_math_eltwise_binary_sfpu_binary_fmod(
32+
uint dst_index0, uint32_t dst_index1, uint32_t odst, int vector_mode = VectorMode::RC) {
33+
_llk_math_eltwise_binary_sfpu_params_<APPROXIMATE>(
34+
sfpu::calculate_sfpu_binary_fmod<APPROXIMATE, 8, is_fp32_dest_acc_en>,
35+
dst_index0,
36+
dst_index1,
37+
odst,
38+
vector_mode);
39+
}
40+
2541
} // namespace ckernel

0 commit comments

Comments
 (0)