
Commit 2190abc

Fused lerp operation & lerp_tile LLK (#37441)
### Ticket
#36763

### Problem description
For Wan2.2, we want to perform the following:
```
permuted_noise_pred = permuted_noise_uncond + current_guidance_scale * (
    permuted_noise_pred - permuted_noise_uncond
)
```
This is equivalent to `lerp(permuted_noise_uncond, permuted_noise_pred, current_guidance_scale)`. However, `ttnn.lerp` is defined as a composite and may be lossy due to `ttnn::add`, `ttnn::subtract` and `ttnn::multiply`, especially with bfloat16 data. Fusing the operations should also improve performance. (A host-side sanity check of this identity follows the checklist below.)

### What's changed
- Add fused lerp implementation
- Update `ttnn.lerp` to take `output_tensor`
- Add `lerp_tile` LLK
- Update tests for lerp; tests now use ULP instead of PCC
- Add shape for Wan2.2 in test

#### Performance
Performance (Wormhole N150) on a 9472 x 64 tensor in DRAM:

| dtype | Branch | Total Duration [ms] |
| -- | -- | -- |
| bfloat16 | main | 8813 |
| bfloat16 | new | 1495 |
| float32 | main | 15461 |
| float32 | new | 2761 |

### Checklist
- [x] [![All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml?query=branch:nmaurice/36763-fused-lerp)
- [x] [![Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml?query=branch:nmaurice/36763-fused-lerp) (unrelated failure, also in other [branches](https://github.com/tenstorrent/tt-metal/actions/runs/21864535471/job/63102851430))
- [x] [![cpp-unit-tests](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml?query=branch:nmaurice/36763-fused-lerp) (executes `test_lerp.py`)
- [x] New/Existing tests provide coverage for changes

#### Model tests
If your changes cover model-related code, you should run tests corresponding to the affected models and platforms (Single card, T3K, Galaxy). The "Choose your pipeline" workflows facilitate running multiple kinds of tests in a single run. Each offers `models-mandatory` and `models-extended` presets. The former includes a minimal set of tests, to be run always; the latter extends that with additional ones. Use your best judgement in deciding which is most appropriate for your PR.

- [ ] [![(Single) Choose your pipeline](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select.yaml?query=branch:nmaurice/36763-fused-lerp)
  - [ ] `models-mandatory` preset (runs: [Device perf regressions](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) and [Frequent model and ttnn tests](https://github.com/tenstorrent/tt-metal/actions/workflows/fast-dispatch-full-regressions-and-models.yaml))
  - [ ] `models-extended` preset (runs: the mandatory tests, plus [Demo](https://github.com/tenstorrent/tt-metal/actions/workflows/single-card-demo-tests.yaml) and [Model perf](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) tests)
  - [ ] other selection - specify runs
- [ ] [![(T3K) Choose your pipeline](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-t3k.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-t3k.yaml?query=branch:nmaurice/36763-fused-lerp)
  - [ ] `models-mandatory` preset (runs: [Unit tests](https://github.com/tenstorrent/tt-metal/actions/workflows/t3000-unit-tests.yaml))
  - [ ] `models-extended` preset (runs: the mandatory tests, plus [Demo](https://github.com/tenstorrent/tt-metal/actions/workflows/t3000-demo-tests.yaml) and [Model perf](https://github.com/tenstorrent/tt-metal/actions/workflows/t3000-model-perf-tests.yaml) tests)
  - [ ] other selection - specify runs
- [ ] [![(Galaxy) Choose your pipeline](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-galaxy.yaml/badge.svg?branch=nmaurice/36763-fused-lerp)](https://github.com/tenstorrent/tt-metal/actions/workflows/pipeline-select-galaxy.yaml?query=branch:nmaurice/36763-fused-lerp)
  - [ ] `models-mandatory` preset (runs: [Quick tests](https://github.com/tenstorrent/tt-metal/actions/workflows/galaxy-quick.yaml))
  - [ ] `models-extended` preset (runs: the mandatory tests, plus [Demo](https://github.com/tenstorrent/tt-metal/actions/workflows/galaxy-demo-tests.yaml) and [Model perf](https://github.com/tenstorrent/tt-metal/actions/workflows/galaxy-perf-tests.yaml) tests)
  - [ ] other selection - specify runs
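The fusion rests on the identity stated in the problem description; it can be sanity-checked on the host with plain torch. A minimal sketch (the tensors and guidance scale here are illustrative stand-ins, not values from the PR):

```python
import torch

# Illustrative stand-ins for the Wan2.2 tensors named in the snippet above.
permuted_noise_uncond = torch.randn(4, 8)
permuted_noise_pred = torch.randn(4, 8)
current_guidance_scale = 3.5

# Classifier-free-guidance update written out explicitly...
explicit = permuted_noise_uncond + current_guidance_scale * (
    permuted_noise_pred - permuted_noise_uncond
)
# ...equals linear interpolation from uncond to pred with the scale as weight.
fused = torch.lerp(permuted_noise_uncond, permuted_noise_pred, current_guidance_scale)

# torch.lerp may internally rearrange the formula for accuracy, so compare up
# to rounding rather than bit-exactly.
assert torch.allclose(explicit, fused)
```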
1 parent a4bcdc7 commit 2190abc

File tree: 21 files changed (+452, -63 lines)


tests/ttnn/nightly/unit_tests/operations/eltwise/test_lerp.py

Lines changed: 88 additions & 40 deletions
```diff
@@ -8,69 +8,117 @@
 
 import ttnn
 
-from math import pi
-from tests.ttnn.utils_for_testing import assert_with_pcc
+from tests.ttnn.utils_for_testing import assert_with_ulp
+
+
+def run_lerp_test(
+    device,
+    h,
+    w,
+    low,
+    high,
+    end,
+    weight,
+    ttnn_function,
+    use_scalar_weight=False,
+    ulp_threshold=1,
+    input_dtype="bfloat16",
+    output_dtype=None,
+):
+    torch_input_dtype = getattr(torch, input_dtype)
+
+    torch_input_tensor_a = torch.linspace(low, high, steps=h * w, dtype=torch_input_dtype).reshape((h, w))
+    torch_input_tensor_b = torch.full((h, w), end, dtype=torch_input_dtype)
+
+    golden_function = ttnn.get_golden_function(ttnn_function)
+
+    if use_scalar_weight:
+        torch_weight = weight
+        ttnn_weight = weight
+    else:
+        torch_weight = torch.full((h, w), weight, dtype=torch_input_dtype)
+        ttnn_weight = ttnn.from_torch(torch_weight, layout=ttnn.TILE_LAYOUT, device=device)
 
+    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
+    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
 
-def run_lerp_test_float(device, h, w, low, high, end, weight, ttnn_function, torch_function, pcc=0.9999):
-    torch_input_tensor_a = torch.linspace(low, high, steps=h * w, dtype=torch.bfloat16).reshape((h, w))
-    torch_input_tensor_b = torch.full((h, w), end, dtype=torch.bfloat16)
+    calculated_tensor = None
+    if output_dtype is not None:
+        torch_dtype = getattr(torch, output_dtype)
+        ttnn_output_dtype = getattr(ttnn, output_dtype)
+        torch_input_tensor_a = torch_input_tensor_a.to(torch_dtype)
+        torch_input_tensor_b = torch_input_tensor_b.to(torch_dtype)
+        calculated_tensor = ttnn.empty((h, w), dtype=ttnn_output_dtype, layout=ttnn.TILE_LAYOUT, device=device)
 
-    torch_output_tensor = torch_function(torch_input_tensor_a, torch_input_tensor_b, weight)
+    golden_output_tensor = golden_function(
+        torch_input_tensor_a,
+        torch_input_tensor_b,
+        torch_weight,
+    )
 
-    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
-    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
+    calculated_tensor = ttnn_function(input_tensor_a, input_tensor_b, ttnn_weight, output_tensor=calculated_tensor)
 
-    output_tensor = ttnn_function(input_tensor_a, input_tensor_b, weight)
-    output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
-    output_tensor = ttnn.from_device(output_tensor)
-    output_tensor = ttnn.to_torch(output_tensor)
+    if output_dtype is not None:
+        assert calculated_tensor.dtype == ttnn_output_dtype
 
-    assert_with_pcc(torch_output_tensor, output_tensor, pcc)
+    calculated_tensor = ttnn.to_torch(calculated_tensor)
+    assert_with_ulp(golden_output_tensor, calculated_tensor, ulp_threshold=ulp_threshold)
 
 
 @pytest.mark.parametrize("h", [64])
 @pytest.mark.parametrize("w", [128])
 @pytest.mark.parametrize("weight", [0.5])
-def test_lerp_float_a(device, h, w, weight):
-    run_lerp_test_float(device, h, w, 0, 90, 100, weight, ttnn.lerp, torch.lerp)
+@pytest.mark.parametrize("input_dtype", ["bfloat16", "float32"])
+def test_lerp_float_a(device, h, w, weight, input_dtype):
+    run_lerp_test(device, h, w, 0, 90, 100, weight, ttnn.lerp, use_scalar_weight=True, input_dtype=input_dtype)
 
 
 @pytest.mark.parametrize("h", [64])
 @pytest.mark.parametrize("w", [128])
 @pytest.mark.parametrize("weight", [0.75])
-def test_lerp_float_b(device, h, w, weight):
-    run_lerp_test_float(device, h, w, 1, 80, 99, weight, ttnn.lerp, torch.lerp, pcc=0.999)
-
-
-def run_lerp_test_tensor(device, h, w, low, high, end, weight, ttnn_function, torch_function, pcc=0.9999):
-    torch_input_tensor_a = torch.linspace(low, high, steps=h * w, dtype=torch.bfloat16).reshape((h, w))
-    torch_input_tensor_b = torch.full((h, w), end, dtype=torch.bfloat16)
-    torch_weight = torch.full((h, w), weight, dtype=torch.bfloat16)
-
-    torch_output_tensor = torch_function(torch_input_tensor_a, torch_input_tensor_b, torch_weight)
-
-    input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device)
-    input_tensor_b = ttnn.from_torch(torch_input_tensor_b, layout=ttnn.TILE_LAYOUT, device=device)
-    input_weight = ttnn.from_torch(torch_weight, layout=ttnn.TILE_LAYOUT, device=device)
-
-    output_tensor = ttnn_function(input_tensor_a, input_tensor_b, input_weight)
-    output_tensor = ttnn.to_layout(output_tensor, ttnn.ROW_MAJOR_LAYOUT)
-    output_tensor = ttnn.from_device(output_tensor)
-    output_tensor = ttnn.to_torch(output_tensor)
-
-    assert_with_pcc(torch_output_tensor, output_tensor, pcc)
+@pytest.mark.parametrize("input_dtype", ["bfloat16", "float32"])
+def test_lerp_float_b(device, h, w, weight, input_dtype):
+    run_lerp_test(
+        device, h, w, 1, 80, 99, weight, ttnn.lerp, use_scalar_weight=True, ulp_threshold=2, input_dtype=input_dtype
+    )
 
 
 @pytest.mark.parametrize("h", [64])
 @pytest.mark.parametrize("w", [128])
 @pytest.mark.parametrize("weight", [0.5])
-def test_lerp_tensor_a(device, h, w, weight):
-    run_lerp_test_tensor(device, h, w, 0, 90, 100, weight, ttnn.lerp, torch.lerp)
+@pytest.mark.parametrize("input_dtype", ["bfloat16", "float32"])
+def test_lerp_tensor_a(device, h, w, weight, input_dtype):
+    run_lerp_test(device, h, w, 0, 90, 100, weight, ttnn.lerp, use_scalar_weight=False, input_dtype=input_dtype)
 
 
 @pytest.mark.parametrize("h", [64])
 @pytest.mark.parametrize("w", [128])
 @pytest.mark.parametrize("weight", [0.75])
-def test_lerp_tensor_b(device, h, w, weight):
-    run_lerp_test_tensor(device, h, w, 1, 80, 99, weight, ttnn.lerp, torch.lerp, pcc=0.999)
+@pytest.mark.parametrize("input_dtype", ["bfloat16", "float32"])
+def test_lerp_tensor_b(device, h, w, weight, input_dtype):
+    run_lerp_test(
+        device, h, w, 1, 80, 99, weight, ttnn.lerp, use_scalar_weight=False, ulp_threshold=2, input_dtype=input_dtype
+    )
+
+
+@pytest.mark.parametrize("h", [64])
+@pytest.mark.parametrize("w", [9472])
+@pytest.mark.parametrize("weight", [0.75])
+@pytest.mark.parametrize("input_dtype", ["bfloat16", "float32"])
+def test_lerp_fp32_preallocated_output(device, h, w, weight, input_dtype):
+    """Lerp with bfloat16 inputs (two tensors + scalar weight) and preallocated float32 output.
+    Checks that output is correct within 1 ULP for float32."""
+    run_lerp_test(
+        device,
+        h,
+        w,
+        1,
+        80,
+        99,
+        weight,
+        ttnn.lerp,
+        use_scalar_weight=True,
+        ulp_threshold=1,
+        output_dtype="float32",
+        input_dtype=input_dtype,
+    )
```
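Distilled from `run_lerp_test` above, the new call pattern with a preallocated output looks roughly as follows; a sketch assuming an already-open `device` handle (e.g. from `ttnn.open_device`), with shapes and dtypes taken from the tests:

```python
import torch
import ttnn

# Inputs in bfloat16, as in test_lerp_fp32_preallocated_output above.
a = ttnn.from_torch(
    torch.linspace(1, 80, steps=64 * 128, dtype=torch.bfloat16).reshape((64, 128)),
    layout=ttnn.TILE_LAYOUT,
    device=device,
)
b = ttnn.from_torch(
    torch.full((64, 128), 99, dtype=torch.bfloat16),
    layout=ttnn.TILE_LAYOUT,
    device=device,
)

# Preallocate a float32 output; the fused op writes into it instead of
# allocating a new tensor (the new output_tensor argument).
out = ttnn.empty((64, 128), dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
out = ttnn.lerp(a, b, 0.75, output_tensor=out)
assert out.dtype == ttnn.float32
```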

tests/ttnn/unit_tests/operations/eltwise/test_ternary_composite.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -223,8 +223,8 @@ def test_lerp_overload_ttnn(input_shapes, value, device):
     golden_fn = ttnn.get_golden_function(ttnn.lerp)
     golden_tensor = golden_fn(in_data1, in_data2, value)
 
-    comp_pass = compare_pcc([output_tensor], [golden_tensor])
-    assert comp_pass
+    output_torch = output_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
+    assert_with_ulp(golden_tensor, output_torch, ulp_threshold=2)
 
 
 @pytest.mark.parametrize(
@@ -244,8 +244,8 @@ def test_lerp_ttnn(input_shapes, device):
     golden_fn = ttnn.get_golden_function(ttnn.lerp)
     golden_tensor = golden_fn(in_data1, in_data2, in_data3)
 
-    comp_pass = compare_pcc([output_tensor], [golden_tensor])
-    assert comp_pass
+    output_torch = output_tensor.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
+    assert_with_ulp(golden_tensor, output_torch, ulp_threshold=2)
 
 
 @pytest.mark.parametrize(
```
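The switch from `compare_pcc` to `assert_with_ulp` changes what is measured: instead of a global correlation coefficient, every element must land within a fixed number of representable floats of the golden value. For intuition, a rough host-side sketch of a ULP distance (illustrative only; the real `assert_with_ulp` in `tests.ttnn.utils_for_testing` may be implemented differently):

```python
import torch

def ulp_distance(golden: torch.Tensor, actual: torch.Tensor) -> torch.Tensor:
    """Element-wise ULP distance between two float32 tensors (illustrative)."""

    def to_ordered_int(t: torch.Tensor) -> torch.Tensor:
        # Reinterpret the float bits as integers; for same-sign IEEE-754 values
        # the bit patterns are ordered, so their difference counts the number
        # of representable floats between them.
        bits = t.to(torch.float32).view(torch.int32).to(torch.int64)
        # Remap negative floats so integer order matches float order.
        return torch.where(bits < 0, -(bits & 0x7FFFFFFF), bits)

    return (to_ordered_int(golden) - to_ordered_int(actual)).abs()

# Usage mirroring the assertions above: every element within 2 ULP.
# assert ulp_distance(golden_tensor, output_torch).max() <= 2
```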
Lines changed: 40 additions & 0 deletions
```cpp
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "llk_defs.h"
#include "sfpi.h"
#include "ckernel_sfpu_binary.h"

namespace ckernel::sfpu {

template <bool APPROXIMATION_MODE, bool is_fp32_dest_acc_en, DataFormat data_format, int ITERATIONS>
inline void calculate_lerp(
    const uint dst_index_in0,  // input (start)
    const uint dst_index_in1,  // end
    const uint dst_index_in2,  // weight
    const uint dst_index_out) {
    static_assert(
        data_format == DataFormat::Float32 || data_format == DataFormat::Float16_b,
        "Unsupported data format for calculate_lerp(). Supported data formats are: Float32, Float16_b.");

    // size of each tile in Dest is 64/SFP_DESTREG_STRIDE = 32 rows when using sfpi to load/store
    constexpr uint dst_tile_size_sfpi = 32;
    // lerp: out = input + weight * (end - input)
#pragma GCC unroll 8
    for (int d = 0; d < ITERATIONS; d++) {
        sfpi::vFloat in0 = sfpi::dst_reg[dst_index_in0 * dst_tile_size_sfpi];
        sfpi::vFloat in1 = sfpi::dst_reg[dst_index_in1 * dst_tile_size_sfpi];
        sfpi::vFloat in2 = sfpi::dst_reg[dst_index_in2 * dst_tile_size_sfpi];
        sfpi::vFloat result = in0 + in2 * (in1 - in0);
        if constexpr (!is_fp32_dest_acc_en) {
            result = float32_to_bf16_rne(result);
        }
        sfpi::dst_reg[dst_index_out * dst_tile_size_sfpi] = result;
        sfpi::dst_reg++;
    }
}

}  // namespace ckernel::sfpu
```
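One detail worth flagging in the kernel above: the SFPU computes in fp32 registers, and when fp32 dest accumulation is disabled the result is rounded to bfloat16 with round-to-nearest-even (`float32_to_bf16_rne`) before the store. A host-side numpy sketch of that rounding semantics (an illustration of the semantics only, not the SFPU implementation; NaN payloads and other corner cases are not specially handled):

```python
import numpy as np

def f32_to_bf16_rne(x: np.ndarray) -> np.ndarray:
    """Round float32 to bfloat16 (round-to-nearest-even), returned as float32."""
    bits = x.astype(np.float32).view(np.uint32)
    # bfloat16 keeps the top 16 bits of the float32 pattern, so rounding means
    # adding half of the dropped low 16 bits, with the kept LSB breaking ties
    # toward even, then truncating.
    lsb = (bits >> np.uint32(16)) & np.uint32(1)
    rounded = (bits + np.uint32(0x7FFF) + lsb) & np.uint32(0xFFFF0000)
    return rounded.view(np.float32)

# 1.00390625 sits exactly between two bf16 values; RNE picks the even mantissa.
print(f32_to_bf16_rne(np.array([1.00390625], dtype=np.float32)))  # prints [1.]
```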
Lines changed: 29 additions & 0 deletions
```cpp
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "llk_math_eltwise_ternary_sfpu_params.h"
#include "ckernel_sfpu_lerp.h"

namespace ckernel {

template <bool APPROXIMATE, bool is_fp32_dest_acc_en, DataFormat data_format, int ITERATIONS = 8>
inline void llk_math_eltwise_ternary_sfpu_lerp(
    uint dst_index0, uint dst_index1, uint dst_index2, uint odst, int vector_mode = (int)VectorMode::RC) {
    _llk_math_eltwise_ternary_sfpu_params_<APPROXIMATE>(
        sfpu::calculate_lerp<APPROXIMATE, is_fp32_dest_acc_en, data_format, ITERATIONS>,
        dst_index0,
        dst_index1,
        dst_index2,
        odst,
        vector_mode);
}

template <bool APPROXIMATE>
inline void llk_math_eltwise_ternary_sfpu_lerp_init() {
    _llk_math_eltwise_ternary_sfpu_init_<SfpuType::lerp>();
}

}  // namespace ckernel
```

tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu_types.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -151,4 +151,5 @@ enum class SfpuType {
     unary_max_uint32,
     unary_min_uint32,
     addcdiv,
+    lerp,
 };
```
Lines changed: 40 additions & 0 deletions
```cpp
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "llk_defs.h"
#include "sfpi.h"
#include "ckernel_sfpu_binary.h"

namespace ckernel::sfpu {

template <bool APPROXIMATION_MODE, bool is_fp32_dest_acc_en, DataFormat data_format, int ITERATIONS>
inline void calculate_lerp(
    const uint dst_index_in0,  // input (start)
    const uint dst_index_in1,  // end
    const uint dst_index_in2,  // weight
    const uint dst_index_out) {
    static_assert(
        data_format == DataFormat::Float32 || data_format == DataFormat::Float16_b,
        "Unsupported data format for calculate_lerp(). Supported data formats are: Float32, Float16_b.");

    // size of each tile in Dest is 64/SFP_DESTREG_STRIDE = 32 rows when using sfpi to load/store
    constexpr uint dst_tile_size_sfpi = 32;
    // lerp: out = input + weight * (end - input)
#pragma GCC unroll 8
    for (int d = 0; d < ITERATIONS; d++) {
        sfpi::vFloat in0 = sfpi::dst_reg[dst_index_in0 * dst_tile_size_sfpi];
        sfpi::vFloat in1 = sfpi::dst_reg[dst_index_in1 * dst_tile_size_sfpi];
        sfpi::vFloat in2 = sfpi::dst_reg[dst_index_in2 * dst_tile_size_sfpi];
        sfpi::vFloat result = in0 + in2 * (in1 - in0);
        if constexpr (!is_fp32_dest_acc_en) {
            result = float32_to_bf16_rne(result);
        }
        sfpi::dst_reg[dst_index_out * dst_tile_size_sfpi] = result;
        sfpi::dst_reg++;
    }
}

}  // namespace ckernel::sfpu
```
Lines changed: 29 additions & 0 deletions
```cpp
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "llk_math_eltwise_ternary_sfpu_params.h"
#include "ckernel_sfpu_lerp.h"

namespace ckernel {

template <bool APPROXIMATE, bool is_fp32_dest_acc_en, DataFormat data_format, int ITERATIONS = 8>
inline void llk_math_eltwise_ternary_sfpu_lerp(
    uint dst_index0, uint dst_index1, uint dst_index2, uint odst, int vector_mode = (int)VectorMode::RC) {
    _llk_math_eltwise_ternary_sfpu_params_<APPROXIMATE>(
        sfpu::calculate_lerp<APPROXIMATE, is_fp32_dest_acc_en, data_format, ITERATIONS>,
        dst_index0,
        dst_index1,
        dst_index2,
        odst,
        vector_mode);
}

template <bool APPROXIMATE>
inline void llk_math_eltwise_ternary_sfpu_lerp_init() {
    _llk_math_eltwise_ternary_sfpu_init_<SfpuType::lerp>();
}

}  // namespace ckernel
```

tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu_types.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -151,4 +151,5 @@ enum class SfpuType {
     unary_max_uint32,
     unary_min_uint32,
     addcdiv,
+    lerp,
 };
```
Lines changed: 36 additions & 0 deletions
```cpp
// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "api/compute/common_globals.h"
#ifdef TRISC_MATH
#include "llk_math_eltwise_ternary_sfpu_lerp.h"
#endif

namespace ckernel {

// clang-format off
/**
 * Performs elementwise linear interpolation (lerp): out = input + weight * (end - input)
 *
 * | Argument | Description                                             | Type     | Valid Range                                            | Required |
 * |----------|---------------------------------------------------------|----------|--------------------------------------------------------|----------|
 * | idst0    | Index of the tile in DST register buffer (input/start)  | uint32_t | Must be less than the size of the DST register buffer  | True     |
 * | idst1    | Index of the tile in DST register buffer (end)          | uint32_t | Must be less than the size of the DST register buffer  | True     |
 * | idst2    | Index of the tile in DST register buffer (weight)       | uint32_t | Must be less than the size of the DST register buffer  | True     |
 * | odst     | Index of the tile in DST register buffer (output)       | uint32_t | Must be less than the size of the DST register buffer  | True     |
 */
// clang-format on
template <DataFormat data_format>
ALWI void lerp_tile(uint32_t idst0, uint32_t idst1, uint32_t idst2, uint32_t odst) {
    MATH((llk_math_eltwise_ternary_sfpu_lerp<APPROX, DST_ACCUM_MODE, data_format>(idst0, idst1, idst2, odst)));
}

/**
 * Please refer to documentation for any_init.
 */
ALWI void lerp_tile_init() { MATH((llk_math_eltwise_ternary_sfpu_lerp_init<APPROX>())); }

}  // namespace ckernel
```

ttnn/cpp/ttnn/operations/eltwise/ternary/device/kernels/compute/ternary_sfpu_col_scalar_bcast_tts_tst.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@
 
 #include "api/compute/eltwise_unary/eltwise_unary.h"
 #include "api/compute/eltwise_unary/where.h"
+#include "api/compute/eltwise_unary/lerp.h"
 #include "api/compute/eltwise_unary/fill.h"
 #include "ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_utils_common.hpp"
 #include "ttnn/operations/eltwise/binary_ng/device/kernels/compute/eltwise_utils_sfpu.hpp"
```
