Skip to content

Commit ba5bac7

Browse files
#0: Update test
1 parent 280371f commit ba5bac7

File tree

10 files changed

+64
-43
lines changed

10 files changed

+64
-43
lines changed

tests/ttnn/unit_tests/operations/eltwise/test_binary_fp32.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -194,21 +194,19 @@ def test_squared_sum_fp32_activ(device):
194194
"shape",
195195
[
196196
[1, 1, 16, 16],
197-
[1, 1, 80, 80],
198-
[1, 1, 320, 384],
199197
[1, 3, 320, 384],
200198
],
201199
)
202200
def test_add_fp32_input_activ(device, ttnn_function, shape):
203201
x_torch = torch.ones(shape, dtype=torch.float32) * 2
204202
y_torch = torch.ones(shape, dtype=torch.float32) * 4
205-
z_torch = torch.square(torch.nn.functional.silu(x_torch) + y_torch)
203+
z_torch = torch.pow(torch.nn.functional.silu(x_torch) + y_torch, 4)
206204
x_tt = ttnn.from_torch(x_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
207205
y_tt = ttnn.from_torch(y_torch, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
208206
z_tt_add = ttnn.add(
209207
x_tt,
210208
y_tt,
211-
activations=[ttnn.UnaryWithParam(ttnn.UnaryOpType.SQUARE)],
209+
activations=[ttnn.UnaryWithParam(ttnn.UnaryOpType.POWER, 4)],
212210
input_tensor_a_activations=[ttnn.UnaryOpType.SILU],
213211
)
214212
tt_out = ttnn.to_torch(z_tt_add)

tests/ttnn/unit_tests/operations/eltwise/test_unary_pow.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
1-
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
1+
# SPDX-FileCopyrightText: © 2026 Tenstorrent Inc.
22

33
# SPDX-License-Identifier: Apache-2.0
44

55
import torch
66
import pytest
77
import ttnn
8-
from tests.ttnn.utils_for_testing import assert_with_ulp
9-
from tests.ttnn.unit_tests.operations.eltwise.test_expm1 import flush_subnormal_values
8+
from tests.ttnn.utils_for_testing import (
9+
assert_with_ulp,
10+
generate_all_bfloat16_bitpatterns,
11+
flush_subnormal_values_to_zero,
12+
)
1013

1114

1215
def generate_clean_bf16_tensor(dtype=torch.bfloat16):
13-
all_bitpatterns = torch.arange(0, 2**16, dtype=torch.int32).to(torch.uint16)
14-
input_tensor = all_bitpatterns.view(torch.bfloat16) # 65536 values
15-
fp32 = input_tensor.to(torch.float32)
16+
all_bf16 = generate_all_bfloat16_bitpatterns(torch.bfloat16).flatten()
17+
fp32 = all_bf16.to(torch.float32)
1618

1719
# Remove special values (NaN, -0.0, +inf, -inf, subnormals)
1820
neg_zero_mask = (fp32 == 0.0) & torch.signbit(fp32)
1921
tiny = torch.finfo(torch.bfloat16).tiny # 2**-126
2022
good_mask = torch.isfinite(fp32) & ~neg_zero_mask & (fp32.abs() >= tiny)
21-
fp32 = fp32[good_mask] # 65024 values
23+
fp32 = fp32[good_mask] # ~65024 clean values
2224

2325
return fp32.to(dtype)
2426

@@ -41,7 +43,7 @@ def test_pow_arange_masking(exponent, device):
4143
# Generate all possible bit pattern for bf16
4244
tt_input = generate_clean_bf16_tensor(torch.bfloat16)
4345
# If input is subnormal then we assume hardware will flush it to 0.0
44-
tt_input = flush_subnormal_values(tt_input)
46+
tt_input = flush_subnormal_values_to_zero(tt_input)
4547

4648
tt_in = ttnn.from_torch(
4749
tt_input,
@@ -57,7 +59,29 @@ def test_pow_arange_masking(exponent, device):
5759
tt_result = ttnn.pow(tt_in, exponent)
5860
result = ttnn.to_torch(tt_result)
5961
# If expected output is subnormal then its calculated value should be 0.0 (hardware assumed to flush to 0.0)
60-
result = flush_subnormal_values(result)
61-
golden = flush_subnormal_values(golden)
62+
result = flush_subnormal_values_to_zero(result)
63+
golden = flush_subnormal_values_to_zero(golden)
6264

6365
assert_with_ulp(golden, result, 1, allow_nonfinite=True)
66+
67+
68+
@pytest.mark.parametrize(
69+
"op_type,exponent",
70+
[
71+
(ttnn.UnaryOpType.POWER_ITERATIVE, 0),
72+
(ttnn.UnaryOpType.POWER_ITERATIVE, 2),
73+
(ttnn.UnaryOpType.POWER, 0),
74+
(ttnn.UnaryOpType.POWER, 2),
75+
(ttnn.UnaryOpType.POWER, 1.5),
76+
(ttnn.UnaryOpType.POWER, -1.9),
77+
],
78+
)
79+
def test_power_as_activation(device, op_type, exponent):
80+
x_torch = torch.rand([16, 16], dtype=torch.bfloat16) + 1.5
81+
z_torch = torch.pow(x_torch + x_torch, exponent)
82+
83+
x_tt = ttnn.from_torch(x_torch, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
84+
z_tt = ttnn.add(x_tt, x_tt, activations=[ttnn.UnaryWithParam(op_type, exponent)])
85+
tt_out = ttnn.to_torch(z_tt)
86+
87+
assert_with_ulp(z_torch, tt_out, 1)

tt_metal/hw/ckernels/blackhole/metal/llk_api/llk_sfpu/ckernel_sfpu_unary_power.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ sfpi_inline sfpi::vFloat _sfpu_unary_power_21f_(sfpi::vFloat base, sfpi::vFloat
5151
sfpi::vFloat x = sfpi::setexp(abs_base, 127); // set exp to exp bias (put base in range of 1-2)
5252

5353
// 3rd order polynomial approx - determined using rminimax over [1,2]
54-
sfpi::vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;
54+
vFloat series_result = PolynomialEvaluator::eval(x, -0x1.952992p+0f, 0x2.4f5388p+0f, -0xd.e712ap-4f, 0x2.44734p-4f);
5555

5656
// Convert exponent to float
5757
sfpi::vInt exp = sfpi::exexp(base);
@@ -192,14 +192,14 @@ inline void calculate_unary_power(const uint32_t exponent) {
192192
*/
193193
template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
194194
inline void calculate_unary_power_iterative(const uint32_t exponent) {
195-
// Old iterative approach for integer exponents 0, 1, 2, 3
196-
// exponent contains IEEE 754 float bits - convert to actual integer
195+
// iterative approach for positive integer exponents
196+
// exponent contains IEEE 754 float bits - convert to integer
197197
const float exp_float = Converter::as_float(exponent);
198198
const uint exp = (uint)exp_float;
199199
#pragma GCC unroll 8
200200
for (int d = 0; d < ITERATIONS; d++) {
201-
vFloat in = sfpi::dst_reg[0];
202-
vFloat result = 1.0f;
201+
sfpi::vFloat in = sfpi::dst_reg[0];
202+
sfpi::vFloat result = 1.0f;
203203
uint e = exp;
204204
while (e > 0) {
205205
if (e & 1) {

tt_metal/hw/ckernels/wormhole_b0/metal/llk_api/llk_sfpu/ckernel_sfpu_unary_power.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ sfpi_inline sfpi::vFloat _sfpu_unary_power_21f_(sfpi::vFloat base, sfpi::vFloat
5252
sfpi::vFloat x = sfpi::setexp(abs_base, 127); // set exp to exp bias (put base in range of 1-2)
5353

5454
// 3rd order polynomial approx - determined using rminimax over [1,2]
55-
sfpi::vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;
55+
vFloat series_result = PolynomialEvaluator::eval(x, -0x1.952992p+0f, 0x2.4f5388p+0f, -0xd.e712ap-4f, 0x2.44734p-4f);
5656

5757
// Convert exponent to float
5858
sfpi::vInt exp = sfpi::exexp(base);
@@ -193,14 +193,14 @@ inline void calculate_unary_power(const uint32_t exponent) {
193193
*/
194194
template <bool APPROXIMATION_MODE, int ITERATIONS = 8>
195195
inline void calculate_unary_power_iterative(const uint32_t exponent) {
196-
// Old iterative approach for integer exponents 0, 1, 2, 3
197-
// exponent contains IEEE 754 float bits - convert to actual integer
196+
// iterative approach for positive integer exponents
197+
// exponent contains IEEE 754 float bits - convert to integer
198198
const float exp_float = Converter::as_float(exponent);
199199
const uint exp = (uint)exp_float;
200200
#pragma GCC unroll 8
201201
for (int d = 0; d < ITERATIONS; d++) {
202-
vFloat in = sfpi::dst_reg[0];
203-
vFloat result = 1.0f;
202+
sfpi::vFloat in = sfpi::dst_reg[0];
203+
sfpi::vFloat result = 1.0f;
204204
uint e = exp;
205205
while (e > 0) {
206206
if (e & 1) {

tt_metal/include/compute_kernel_api.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ ALWI void power_tile_init() { MATH((llk_math_eltwise_unary_sfpu_power_init<APPRO
348348
* acquired state via *acquire_dst* call. This call is blocking and is only
349349
* available on the compute engine.
350350
*
351-
* Note: Uses iterative multiplication for positive integer exponents. Optimal for small exponents (0,1,2,3).
351+
* Note: Unlike power_tile, power_iterative_tile() only supports positive integer scalars. It uses an iterative multiplication loop to compute values, and is faster than power_tile for small exponents (e.g. 1, 2, 3)
352352
*
353353
* Return value: None
354354
*
@@ -358,14 +358,14 @@ ALWI void power_tile_init() { MATH((llk_math_eltwise_unary_sfpu_power_init<APPRO
358358
* | param0 | The exponent as IEEE 754 float bits | uint32_t | Must be a positive integer exponent | True |
359359
*/
360360
// clang-format on
361-
ALWI void power_tile_iterative(uint32_t idst, uint32_t param0) {
361+
ALWI void power_iterative_tile(uint32_t idst, uint32_t param0) {
362362
MATH((llk_math_eltwise_unary_sfpu_power_iterative<APPROX>(idst, param0)));
363363
}
364364

365365
/**
366366
* Please refer to documentation for any_init.
367367
*/
368-
ALWI void power_tile_iterative_init() { MATH((llk_math_eltwise_unary_sfpu_power_iterative_init<APPROX>())); }
368+
ALWI void power_iterative_tile_init() { MATH((llk_math_eltwise_unary_sfpu_power_iterative_init<APPROX>())); }
369369

370370
// clang-format off
371371
// exp2 : y = 2 ^ x ==> [y = exp(x * log(2))]

ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,8 +1029,8 @@ ALWI void power_tile_to_cb(
10291029
copy_tile_init_with_dt(cb_x);
10301030
copy_tile(cb_x, 0, dst0);
10311031

1032-
power_tile_iterative_init();
1033-
power_tile_iterative(dst0, p);
1032+
power_iterative_tile_init();
1033+
power_iterative_tile(dst0, p);
10341034

10351035
if (p_is_negative) {
10361036
recip_tile_init();
@@ -1124,8 +1124,8 @@ ALWI void power_tile_with_abs_x_to_cb(
11241124
abs_tile_init();
11251125
abs_tile(dst0);
11261126

1127-
power_tile_iterative_init();
1128-
power_tile_iterative(dst0, p);
1127+
power_iterative_tile_init();
1128+
power_iterative_tile(dst0, p);
11291129

11301130
if (p_is_negative) {
11311131
recip_tile_init();
@@ -1219,8 +1219,8 @@ ALWI void power_and_recip_tile_to_cb(
12191219
copy_tile_init_with_dt(cb_x);
12201220
copy_tile(cb_x, 0, dst0);
12211221

1222-
power_tile_iterative_init();
1223-
power_tile_iterative(dst0, p);
1222+
power_iterative_tile_init();
1223+
power_iterative_tile(dst0, p);
12241224

12251225
if (p_is_negative) {
12261226
recip_tile_init();

ttnn/cpp/ttnn/operations/eltwise/binary/binary_composite.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace operations::binary {
2929
struct ExecutePower {
3030
static Tensor invoke(
3131
const Tensor& input_tensor,
32-
uint32_t exponent,
32+
int32_t exponent,
3333
const std::optional<MemoryConfig>& output_mem_config = std::nullopt,
3434
const std::optional<Tensor>& optional_output_tensor = std::nullopt);
3535

ttnn/cpp/ttnn/operations/eltwise/binary/binary_nanobind.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2089,7 +2089,7 @@ void bind_power(nb::module_& mod, const binary_operation_t& /*operation*/, const
20892089
ttnn::nanobind_overload_t{
20902090
[](const binary_operation_t& self,
20912091
const Tensor& input_tensor,
2092-
uint32_t exponent,
2092+
int32_t exponent,
20932093
const std::optional<MemoryConfig>& memory_config,
20942094
const std::optional<Tensor>& output_tensor) -> ttnn::Tensor {
20952095
return self(input_tensor, exponent, memory_config, output_tensor);

ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_composite_op.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -809,10 +809,9 @@ Tensor ExecutePower::invoke(
809809
float exponent,
810810
const std::optional<MemoryConfig>& output_mem_config,
811811
const std::optional<Tensor>& output_tensor) {
812-
// For exponents 0, 1, 2, 3: use iterative approach
813-
if (exponent == 0.0f || exponent == 1.0f || exponent == 2.0f || exponent == 3.0f) {
814-
return ttnn::operations::unary::ExecuteUnaryTSVariant<ttnn::operations::unary::UnaryOpType::POWER_ITERATIVE>::
815-
invoke(input_a, exponent, output_mem_config, output_tensor);
812+
float exponent_floor = std::floor(exponent);
813+
if (static_cast<int32_t>(exponent_floor) == exponent) {
814+
return ExecutePower::invoke(input_a, static_cast<int32_t>(exponent), output_mem_config, output_tensor);
816815
}
817816
return ttnn::operations::unary::ExecuteUnaryTSVariant<ttnn::operations::unary::UnaryOpType::POWER>::invoke(
818817
input_a, exponent, output_mem_config, output_tensor);
@@ -821,11 +820,11 @@ Tensor ExecutePower::invoke(
821820
// power - integer exponent
822821
Tensor ExecutePower::invoke(
823822
const Tensor& input,
824-
uint32_t exponent,
823+
int32_t exponent,
825824
const std::optional<MemoryConfig>& output_mem_config,
826825
const std::optional<Tensor>& output_tensor) {
827826
// For exponents 0, 1, 2, 3: use iterative approach
828-
if (exponent <= 3) {
827+
if (exponent == 0 || exponent == 1 || exponent == 2 || exponent == 3) {
829828
return ttnn::operations::unary::ExecuteUnaryTSVariant<ttnn::operations::unary::UnaryOpType::POWER_ITERATIVE>::
830829
invoke(input, exponent, output_mem_config, output_tensor);
831830
}

ttnn/cpp/ttnn/operations/eltwise/unary/common/unary_op_utils.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,8 @@ std::pair<std::string, std::string> get_op_init_and_func_parameterized(
152152
case UnaryOpType::POWER_ITERATIVE:
153153
// For exponents 0, 1, 2, 3: use iterative approach
154154
return {
155-
"power_tile_iterative_init();",
156-
fmt::format("power_tile_iterative({}, {:#x}u);", idst, std::bit_cast<uint32_t>(param0))};
155+
"power_iterative_tile_init();",
156+
fmt::format("power_iterative_tile({}, {:#x}u);", idst, std::bit_cast<uint32_t>(param0))};
157157
case UnaryOpType::LEAKY_RELU:
158158
return {
159159
"leaky_relu_tile_init();",

0 commit comments

Comments (0)