
Commit 4350656

Add multi-output support for topk
1 parent f57530e commit 4350656

File tree

13 files changed: +348, -24 lines


forge/csrc/ops/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -87,6 +87,7 @@ add_library(ops
     op_unsqueeze.cpp
     op_update_cache.cpp
     op_upsample_2d.cpp
+    op_topk.cpp
     op_where.cpp
     python_bindings.cpp)

forge/csrc/ops/op.cpp

Lines changed: 14 additions & 0 deletions

@@ -125,6 +125,7 @@ class NewToOldOpType
         mapping_[OpType::Unsqueeze] = "unsqueeze";
         mapping_[OpType::UpdateCache] = "update_cache";
         mapping_[OpType::Upsample2d] = "upsample2d";
+        mapping_[OpType::TopK] = "topk";
         mapping_[OpType::Where] = "where";
     }

@@ -227,6 +228,7 @@ class OldToNewOpType
         mapping_["unsqueeze"] = OpType::Unsqueeze;
         mapping_["update_cache"] = OpType::UpdateCache;
         mapping_["upsample2d"] = OpType::Upsample2d;
+        mapping_["topk"] = OpType::TopK;
         mapping_["where"] = OpType::Where;
     }

@@ -394,6 +396,7 @@ at::Tensor Op::eval(const graphlib::OpType &old_op_type, const std::vector<at::T
         case OpType::Unsqueeze: return unsqueeze::eval(old_op_type, *this, tensors);
         case OpType::UpdateCache: return update_cache::eval(old_op_type, *this, tensors);
         case OpType::Upsample2d: return upsample_2d::eval(old_op_type, *this, tensors);
+        case OpType::TopK: return topk::eval(old_op_type, *this, tensors);
         case OpType::Where: return where::eval(old_op_type, *this, tensors);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -489,6 +492,7 @@ std::tuple<graphlib::Shape, std::vector<graphlib::DimBroadcast>> Op::shape(
         case OpType::Unsqueeze: return unsqueeze::shape(old_op_type, *this, inputs);
         case OpType::UpdateCache: return update_cache::shape(old_op_type, *this, inputs);
         case OpType::Upsample2d: return upsample_2d::shape(old_op_type, *this, inputs);
+        case OpType::TopK: return topk::shape(old_op_type, *this, inputs);
         case OpType::Where: return where::shape(old_op_type, *this, inputs);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -589,6 +593,7 @@ tt::graphlib::NodeContext Op::backward(
         case OpType::Unsqueeze: return unsqueeze::backward(old_op_type, *this, context, operand, inputs, output, gradient);
         case OpType::UpdateCache: return update_cache::backward(old_op_type, *this, context, operand, inputs, output, gradient);
         case OpType::Upsample2d: return upsample_2d::backward(old_op_type, *this, context, operand, inputs, output, gradient);
+        case OpType::TopK: return topk::backward(old_op_type, *this, context, operand, inputs, output, gradient);
         case OpType::Where: return where::backward(old_op_type, *this, context, operand, inputs, output, gradient);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -706,6 +711,7 @@ void Op::decompose_initial(
         case OpType::Unsqueeze: return;
         case OpType::UpdateCache: return;
         case OpType::Upsample2d: return;
+        case OpType::TopK: return;
         case OpType::Where: return where::decompose_initial(old_op_type, *this, dc, inputs);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -802,6 +808,7 @@ void Op::decompose_post_optimize(
         case OpType::Unsqueeze: return;
         case OpType::UpdateCache: return;
         case OpType::Upsample2d: return;
+        case OpType::TopK: return;
         case OpType::Where: return where::decompose_post_optimize(old_op_type, *this, dc, inputs);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -899,6 +906,7 @@ void Op::decompose_post_autograd(
         case OpType::Unsqueeze: return;
         case OpType::UpdateCache: return;
         case OpType::Upsample2d: return;
+        case OpType::TopK: return;
         case OpType::Where: return where::decompose_post_autograd(old_op_type, *this, dc, inputs);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -994,6 +1002,7 @@ long Op::initial_flops_estimate(
         case OpType::Unsqueeze: return 0;
         case OpType::UpdateCache: return 0;
         case OpType::Upsample2d: return 0;
+        case OpType::TopK: return 0;
         case OpType::Where: return where::initial_flops_estimate(old_op_type, *this, inputs);
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     } // clang-format on

@@ -1088,6 +1097,7 @@ bool Op::is_tm(const graphlib::OpType &old_op_type) const
         case OpType::Unsqueeze: return true;
         case OpType::UpdateCache: return false;
         case OpType::Upsample2d: return false;
+        case OpType::TopK: return false;
         case OpType::Where: return false;
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     }

@@ -1182,6 +1192,7 @@ bool Op::is_eltwise(const graphlib::OpType &old_op_type) const
         case OpType::Unsqueeze: return false;
         case OpType::UpdateCache: return false;
         case OpType::Upsample2d: return false;
+        case OpType::TopK: return false;
         case OpType::Where: return true;
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     }

@@ -1276,6 +1287,7 @@ bool Op::is_eltwise_unary(const graphlib::OpType &old_op_type) const
         case OpType::Unsqueeze: return false;
         case OpType::UpdateCache: return false;
         case OpType::Upsample2d: return false;
+        case OpType::TopK: return false;
         case OpType::Where: return false;
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     }

@@ -1370,6 +1382,7 @@ bool Op::is_eltwise_binary(const graphlib::OpType &old_op_type) const
         case OpType::Unsqueeze: return false;
         case OpType::UpdateCache: return false;
         case OpType::Upsample2d: return false;
+        case OpType::TopK: return false;
         case OpType::Where: return false;
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     }

@@ -1463,6 +1476,7 @@ bool Op::is_eltwise_nary(const graphlib::OpType &old_op_type) const
         case OpType::Unsqueeze: return false;
         case OpType::UpdateCache: return false;
         case OpType::Upsample2d: return false;
+        case OpType::TopK: return false;
         case OpType::Where: return true;
         default: TT_ASSERT(false, "Unknown OpType."); unreachable();
     }

forge/csrc/ops/op.hpp

Lines changed: 1 addition & 0 deletions

@@ -129,6 +129,7 @@ enum class OpType : uint32_t
     Unsqueeze,
     UpdateCache,
     Upsample2d,
+    TopK,
     Where,
 };

forge/csrc/ops/op_interface.hpp

Lines changed: 1 addition & 0 deletions

@@ -166,6 +166,7 @@ DECLARE_OP_INTERFACE(transpose);
 DECLARE_OP_INTERFACE(unsqueeze);
 DECLARE_OP_INTERFACE(update_cache);
 DECLARE_OP_INTERFACE(upsample_2d);
+DECLARE_OP_INTERFACE(topk);
 DECLARE_OP_INTERFACE(where);

 #undef DECLARE_OP_INTERFACE

forge/csrc/ops/op_topk.cpp

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+// SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <optional>
+#include <tuple>
+#include <vector>
+
+#include "autograd/autograd.hpp"
+#include "graph_lib/node_types.hpp"
+#include "graph_lib/shape.hpp"
+#include "op.hpp"
+#include "op_interface.hpp"
+#include "ops/op_common.hpp"
+#include "torch/extension.h" // Needed for c++ to/from python type conversion.
+#include "torch/torch.h"
+#include "utils/assert.hpp"
+
+namespace tt
+{
+namespace ops
+{
+namespace topk
+{
+using namespace graphlib;
+
+// Attributes expected:
+//  - k: int (required)
+//  - dim: int (required)
+//  - largest: bool (optional; default true)
+//  - sorted: bool (optional; default true)
+
+at::Tensor eval(const graphlib::OpType &old_op_type, const Op &op, const std::vector<at::Tensor> &tensors)
+{
+    TT_DBG_ASSERT(op.type() == OpType::TopK, "Wrong op type.");
+    TT_ASSERT(tensors.size() == 1, "TopK should have one input tensor");
+
+    const int64_t k = static_cast<int64_t>(op.attr_as<int>("k"));
+    const int64_t dim = static_cast<int64_t>(op.attr_as<int>("dim"));
+    const bool largest = op.has_attr("largest") ? op.attr_as<bool>("largest") : true;
+    const bool sorted = op.has_attr("sorted") ? op.attr_as<bool>("sorted") : true;
+
+    // torch::topk returns a tuple (values, indices). Our infra is single-output; return values for now.
+    auto result = torch::topk(tensors[0], k, dim, largest, sorted);
+    at::Tensor values = std::get<0>(result);
+    // at::Tensor indices = std::get<1>(result); // kept for future multi-output support
+
+    return values;
+}
+
+std::tuple<Shape, std::vector<DimBroadcast>> shape(
+    const graphlib::OpType &old_op_type, const Op &op, const std::vector<std::vector<std::uint32_t>> &in_shapes)
+{
+    TT_DBG_ASSERT(op.type() == OpType::TopK, "Wrong op type.");
+    TT_ASSERT(in_shapes.size() == 1, "TopK should have one input shape");
+
+    const auto &input = in_shapes[0];
+    TT_ASSERT(!input.empty(), "TopK input must have rank >= 1");
+
+    const int dim = op.attr_as<int>("dim");
+    TT_ASSERT(dim >= -static_cast<int>(input.size()) && dim < static_cast<int>(input.size()), "TopK dim out of range");
+
+    const int pos_dim = dim < 0 ? dim + static_cast<int>(input.size()) : dim;
+    std::vector<uint32_t> out_shape = input;
+    out_shape[pos_dim] = static_cast<uint32_t>(op.attr_as<int>("k"));
+
+    return {Shape::create(out_shape), {}};
+}
+
+// No autograd for now.
+
+tt::graphlib::NodeContext backward(
+    const graphlib::OpType &old_op_type,
+    const Op &op,
+    autograd::autograd_context &ac,
+    int operand,
+    const std::vector<NodeContext> &inputs,
+    const NodeContext &output,
+    const NodeContext &gradient)
+{
+    TT_DBG_ASSERT(op.type() == OpType::TopK, "Wrong op type.");
+    TT_THROW(false, "TopK does not have backward.");
+    unreachable();
+}
+
+} // namespace topk
+} // namespace ops
+} // namespace tt
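
For reference, eval() defers to torch::topk, which matches Python's torch.topk semantics: two outputs (values, indices), with the output shape equal to the input shape except that the selected dim is replaced by k, and negative dims counted from the back. That is exactly the rule topk::shape implements. A minimal sketch in plain PyTorch (the tensors and names here are illustrative, not part of the commit):

import torch

x = torch.tensor([[3.0, 1.0, 4.0, 1.0, 5.0],
                  [9.0, 2.0, 6.0, 5.0, 3.0]])

# Same defaults as the C++ attributes: largest=True, sorted=True.
values, indices = torch.topk(x, k=2, dim=-1, largest=True, sorted=True)
# values:  [[5., 4.], [9., 6.]]  (two largest per row, descending)
# indices: [[4, 2],   [0, 2]]    (their positions along dim=-1)

# Shape rule mirrored by topk::shape: dim=-1 normalizes to pos_dim=1 and
# out_shape[pos_dim] becomes k, so (2, 5) -> (2, 2).
assert values.shape == (2, 2) and indices.shape == (2, 2)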

forge/forge/compile.py

Lines changed: 28 additions & 11 deletions

@@ -1136,6 +1136,9 @@ def generate_graph(
     input_names_known = False
     inputs, _, _ = flatten_inputs(inputs)

+    # Track counts to ensure unique output names per base name
+    output_name_counts: Dict[str, int] = {}
+
     for out in all_subgraph_outputs:
         module = output_to_module_map[out]
         assert module is not None
@@ -1164,9 +1167,14 @@ def generate_graph(
             raise RuntimeError("Untraced output tensor encountered")

         else:
+            base_name = module_name + ".output_" + out.src_op.name
+            count = output_name_counts.get(base_name, 0)
+            unique_name = base_name if count == 0 else f"{base_name}_{count}"
+            output_name_counts[base_name] = count + 1
+
             outq = create_output(
                 graph,
-                module_name + ".output_" + out.src_op.name,
+                unique_name,
                 out.shape.get_pytorch_shape(),
                 out.data_format,
                 module.is_loss,
@@ -1177,8 +1185,10 @@ def generate_graph(

     recorded_parameters = {}

-    while pending_tensors:
+    # Map to ensure we create an op node only once per source op name
+    op_node_by_name: Dict[str, int] = {}

+    while pending_tensors:
         tensor, output, port_index, operand_broadcast, subgraph_idx = pending_tensors.popleft()

         if tensor in visited_tensors:
@@ -1333,15 +1343,21 @@ def generate_graph(
         tags = {}
         if tensor.src_layer is not None:
             tags["layer"] = tensor.src_layer
-        op = create_op_node(
-            graph,
-            tensor.src_op.name,
-            tensor.src_op.cpp_op_type,
-            tensor.shape.get_pytorch_shape(),
-            tensor.data_format,
-            subgraph_idx,
-            tags,
-        )
+        # Reuse the same op node if we already created one for this src_op.name
+        existing = op_node_by_name.get(tensor.src_op.name)
+        if existing is not None:
+            op = existing
+        else:
+            op = create_op_node(
+                graph,
+                tensor.src_op.name,
+                tensor.src_op.cpp_op_type,
+                tensor.shape.get_pytorch_shape(),
+                tensor.data_format,
+                subgraph_idx,
+                tags,
+            )
+            op_node_by_name[tensor.src_op.name] = op

         visited_tensors[tensor] = op
         if return_intermediate and tensor.has_value():
@@ -1364,6 +1380,7 @@ def generate_graph(
             if output_tensor in module_output_tensor_to_node
         ]
         module_targets = [module_target_tensor_to_node[target_tensor] for target_tensor in target_tensors]
+
         out_requires_grad = [
             output_tensor.requires_grad
            for output_tensor in all_subgraph_outputs
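
Distilled from the loop above, the output-naming scheme behaves like the following self-contained sketch (unique_output_name is an illustrative helper, not a function in the commit):

from typing import Dict

def unique_output_name(base_name: str, counts: Dict[str, int]) -> str:
    # The first output for a base name keeps it; repeats get _1, _2, ...
    count = counts.get(base_name, 0)
    counts[base_name] = count + 1
    return base_name if count == 0 else f"{base_name}_{count}"

counts: Dict[str, int] = {}
# Two outputs traced back to the same src_op (e.g. topk values and indices)
# no longer collide on the output queue name:
assert unique_output_name("model.output_topk0", counts) == "model.output_topk0"
assert unique_output_name("model.output_topk0", counts) == "model.output_topk0_1"

Together with the op_node_by_name map, which reuses a single op node per src_op.name, this lets several graph outputs hang off one source op, the multi-output case TopK introduces.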

forge/forge/op/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -72,3 +72,4 @@
 from .kv_cache import FillCache, UpdateCache
 from .misc import CumSum
 import forge.op.loss
+from .topk import TopK
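
The topk wrapper module itself (forge/forge/op/topk.py) is outside this excerpt, so its Python signature is not shown here. As context for why the generate_graph() changes are needed, a plain PyTorch module of the kind Forge traces can return both topk tensors from one source op; a minimal sketch:

import torch

class TopKModule(torch.nn.Module):
    # Returning both tensors yields two graph outputs from a single source
    # op, which is the case the unique-name and node-reuse changes handle.
    def forward(self, x: torch.Tensor):
        values, indices = torch.topk(x, k=3, dim=-1)
        return values, indices

values, indices = TopKModule()(torch.randn(2, 8))
assert values.shape == (2, 3) and indices.shape == (2, 3)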
