Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions shardy/dialect/sdy/ir/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1108,5 +1108,11 @@ FuncOp cloneFuncRecursively(FuncOp funcOp, SymbolTable& symbolTable) {
return clonedFuncOp;
}

// Returns the `FuncOp` named `funcSymName` in `symbolTable`. Dies if no such
// function exists.
FuncOp getFuncOpOrDie(StringRef funcSymName, const SymbolTable& symbolTable) {
  FuncOp funcOp = symbolTable.lookup<FuncOp>(funcSymName);
  // SDY_CHECK keeps this consistent with the identical lookup-failure check in
  // import_func_calls.cc ("Failed to lookup function: ...").
  SDY_CHECK(funcOp) << "Failed to lookup function: " << funcSymName.str();
  return funcOp;
}

} // namespace sdy
} // namespace mlir
5 changes: 5 additions & 0 deletions shardy/dialect/sdy/ir/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,11 @@ Operation* getCommonSupportedReductionOp(stablehlo::ScatterOp scatter);
mlir::func::FuncOp cloneFuncRecursively(func::FuncOp funcOp,
SymbolTable& symbolTable);

// Returns the `FuncOp` named `funcSymName`. Dies if the function does not
// exist in the `symbolTable`.
func::FuncOp getFuncOpOrDie(StringRef funcSymName,
const SymbolTable& symbolTable);

} // namespace sdy
} // namespace mlir

Expand Down
1 change: 1 addition & 0 deletions shardy/dialect/sdy/transforms/import/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ cc_library(
"add_data_flow_edges.cc",
"apply_sharding_constraints.cc",
"constant_or_scalar_splitter.cc",
"equilise_call_and_func_result_shardings.cc",
"import_func_calls.cc",
"import_pipeline.cc",
"inline_meshes.cc",
Expand Down
23 changes: 16 additions & 7 deletions shardy/dialect/sdy/transforms/import/add_data_flow_edges.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/PatternMatch.h"
Expand All @@ -30,6 +31,9 @@ namespace sdy {

namespace {

using func::CallOp;
using func::FuncOp;

struct AddDataFlowEdgesPass
: public impl::AddDataFlowEdgesPassBase<AddDataFlowEdgesPass> {
using AddDataFlowEdgesPassBase::AddDataFlowEdgesPassBase;
Expand All @@ -51,20 +55,25 @@ struct AddDataFlowEdgesPass
}

void runOnOperation() final {
func::FuncOp funcOp = getOperation();
IRRewriter rewriter(funcOp);
ModuleOp moduleOp = getOperation();
SymbolTable symbolTable(moduleOp);
IRRewriter rewriter(moduleOp);

funcOp.walk([&](ShardableDataFlowOpInterface op) {
moduleOp.walk([&](ShardableDataFlowOpInterface op) {
// Add the data flow edges for result owners and block argument owners.
addDataFlowEdges(op.getBlockArgumentEdgeOwners(), rewriter);
addDataFlowEdges(op.getOpResultEdgeOwners(), rewriter);
});

llvm::SmallDenseSet<StringRef> funcNames;
if (enableNativeNonFlatSupport) {
// TODO(enver): Do not create data flow edge if the func has no callers,
// such as the entry function.
addDataFlowEdges(funcOp.getArguments(), rewriter);
funcOp.walk([&](func::CallOp callOp) {
moduleOp.walk([&](CallOp callOp) {
addDataFlowEdges(callOp.getResults(), rewriter);
StringRef funcName = callOp.getCallee();
if (auto [_, inserted] = funcNames.insert(funcName); inserted) {
FuncOp funcOp = getFuncOpOrDie(funcName, symbolTable);
addDataFlowEdges(funcOp.getArguments(), rewriter);
}
});
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/* Copyright 2026 The OpenXLA Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <iterator>

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "shardy/dialect/sdy/ir/dialect.h"
#include "shardy/dialect/sdy/ir/utils.h"
#include "shardy/dialect/sdy/transforms/import/passes.h" // IWYU pragma: keep

namespace mlir {
namespace sdy {

#define GEN_PASS_DEF_EQUILISECALLANDFUNCRESULTSHARDINGSPASS
#include "shardy/dialect/sdy/transforms/import/passes.h.inc"

namespace {

using func::CallOp;
using func::FuncOp;

// Copies a callee's result shardings onto `func.call` ops that carry no result
// shardings of their own. A call op that already has a
// `TensorShardingPerValueAttr` is left untouched, even if every individual
// sharding in it is empty.
struct EquiliseCallAndFuncResultShardingsPass
    : public impl::EquiliseCallAndFuncResultShardingsPassBase<
          EquiliseCallAndFuncResultShardingsPass> {
  using EquiliseCallAndFuncResultShardingsPassBase::
      EquiliseCallAndFuncResultShardingsPassBase;

  void runOnOperation() override {
    ModuleOp module = getOperation();
    SymbolTable symbolTable(module);

    module.walk([&](CallOp callOp) {
      // Resolve the callee unconditionally so that a missing callee dies even
      // when the call already carries shardings (same as the original order).
      FuncOp callee = getFuncOpOrDie(callOp.getCallee(), symbolTable);
      if (getShardingPerValue(callOp)) {
        // Existing call result shardings win; nothing to do.
        return;
      }
      TensorShardingPerValueAttr calleeResultShardings =
          getFuncResultShardings(callee, symbolTable);
      if (calleeResultShardings) {
        setShardings(callOp, calleeResultShardings);
      }
    });
  }
};

} // namespace

} // namespace sdy
} // namespace mlir
6 changes: 1 addition & 5 deletions shardy/dialect/sdy/transforms/import/import_func_calls.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,13 @@ void importCallOp(
SDY_CHECK(funcOp) << "Failed to lookup function: " << calleeName.str();

rewriter.setInsertionPoint(callOp);
TensorShardingPerValueAttr callOpResultShardings =
getShardingPerValue(callOp);
auto namedCompOp = NamedComputationOp::create(
rewriter, callOp->getLoc(), callOp->getResultTypes(),
getOriginalFuncName(funcOp), callOp.getOperands(),
/*inShardings=*/getFuncArgShardings(funcOp, symbolTable),
// TODO(b/439018088): Take func result shardings if call op result
// shardings are empty.
/*outShardings=*/
callOpResultShardings ? callOpResultShardings
: getFuncResultShardings(funcOp, symbolTable));
/*outShardings=*/getShardingPerValue(callOp));
namedCompOp->setAttrs(namedCompAttrs);

Region& namedCompRegion = namedCompOp.getRegion();
Expand Down
3 changes: 2 additions & 1 deletion shardy/dialect/sdy/transforms/import/import_pipeline.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ void addImportPipeline(OpPassManager& pm, int& dumpIndex,
// of the propagation itself.
pm.addPass(mlir::sdy::createSaveModuleOpPass(
options.dumpDirectory, "before_propagation", dumpIndex++));
pm.addPass(createEquiliseCallAndFuncResultShardingsPass());
pm.addPass(createImportFuncCallsPass());

pm.addNestedPass<func::FuncOp>(createAddDataFlowEdgesPass(
pm.addPass(createAddDataFlowEdgesPass(
AddDataFlowEdgesPassOptions{options.enableNativeNonFlatSupport}));
pm.addPass(
createApplyShardingConstraintsPass(ApplyShardingConstraintsPassOptions{
Expand Down
13 changes: 12 additions & 1 deletion shardy/dialect/sdy/transforms/import/passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ def ImportFuncCallsPass : Pass<"sdy-import-func-calls", "ModuleOp"> {
let dependentDialects = ["mlir::sdy::SdyDialect"];
}

def EquiliseCallAndFuncResultShardingsPass : Pass<"sdy-equilise-call-and-func-result-shardings", "ModuleOp"> {
let summary = "Set call result shardings as the func result shardings, if empty.";
let description = [{
Creates a pass to equilise call and func result shardings if the call
result shardings are empty. Notably, it keeps call result sharding if the
call already has result shardings, even if all individual result shardings
are empty.
}];
let dependentDialects = ["mlir::sdy::SdyDialect"];
}

def LiftInlinedMeshesPass : Pass<"sdy-lift-inlined-meshes", "ModuleOp"> {
let summary = "Lifts inlined `MeshAttr`s in shardings as symbol `MeshOp`s.";
let description = [{
Expand All @@ -54,7 +65,7 @@ def InlineMeshesPass : Pass<"sdy-inline-meshes", "ModuleOp"> {
let dependentDialects = ["mlir::sdy::SdyDialect"];
}

def AddDataFlowEdgesPass : Pass<"sdy-add-data-flow-edges", "func::FuncOp"> {
def AddDataFlowEdgesPass : Pass<"sdy-add-data-flow-edges", "ModuleOp"> {
let summary = "Inserts `DataFlowEdgeOp` for every data-flow edge.";
let description = [{
Inserts `DataFlowEdgeOp` for every value that is the owner of a data-flow
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_on_func_with_single_argument(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_with_single_argument(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand All @@ -31,7 +31,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_on_func_multiple_users_on_func_result(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_multiple_users_on_func_result(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
// CHECK-NEXT: %[[NEGATE:.*]] = stablehlo.negate %[[EDGE]] : tensor<8xf32>
Expand All @@ -56,7 +56,7 @@ func.func @bar(%arg0: tensor<8xf32>) ->(tensor<8xf32>, tensor<8xf32>) {

// CHECK-LABEL: @simple_call_graph_on_func_with_multiple_results(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_with_multiple_results(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]]:2 = call @bar(%1) : (tensor<8xf32>) -> (tensor<8xf32>, tensor<8xf32>)
// CHECK: %[[CALL:.*]]:2 = call @bar(%0) : (tensor<8xf32>) -> (tensor<8xf32>, tensor<8xf32>)
// CHECK-NEXT: %[[EDGE0:.*]] = sdy.data_flow_edge %[[CALL]]#0 : tensor<8xf32>
// CHECK-NEXT: %[[EDGE1:.*]] = sdy.data_flow_edge %[[CALL]]#1 : tensor<8xf32>
// CHECK-NEXT: %[[ADD:.*]] = stablehlo.add %[[EDGE0]], %[[EDGE1]] : tensor<8xf32>
Expand All @@ -83,7 +83,7 @@ func.func @bar(%arg0: tensor<8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"a"}]

// CHECK-LABEL: @simple_call_graph_on_func_with_sharded_argument(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_with_sharded_argument(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand All @@ -104,7 +104,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @multiple_calls_on_same_func(%arg0: tensor<8xf32>)
func.func @multiple_calls_on_same_func(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL0:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL0:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE0:.*]] = sdy.data_flow_edge %[[CALL0]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS0:.*]] = stablehlo.abs %[[EDGE0]] : tensor<8xf32>
// CHECK-NEXT: %[[CALL1:.*]] = call @bar(%[[ABS0]]) : (tensor<8xf32>) -> tensor<8xf32>
Expand All @@ -131,7 +131,7 @@ func.func @bar(%arg0: tensor<8xf32>, %arg1: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_on_func_with_multiple_argument(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_with_multiple_argument(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1, %2) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0, %1) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand All @@ -154,7 +154,7 @@ func.func @bar(%arg0: tensor<8xf32>, %arg1: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_on_func_with_multiple_argument_same_operand(%arg0: tensor<8xf32>)
func.func @simple_call_graph_on_func_with_multiple_argument_same_operand(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1, %1) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0, %0) : (tensor<8xf32>, tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand Down Expand Up @@ -188,7 +188,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_chain_call_graph(%arg0: tensor<8xf32>)
func.func @simple_chain_call_graph(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]] : tensor<8xf32>
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand Down Expand Up @@ -222,8 +222,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_non_flat_call_graph(%arg0: tensor<8xf32>)
func.func @simple_non_flat_call_graph(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK-NEXT: %[[EDGE0:.*]] = sdy.data_flow_edge %arg0 : tensor<8xf32>
// CHECK-NEXT: %[[NEGATE:.*]] = stablehlo.negate %[[EDGE0]] : tensor<8xf32>
// CHECK-NEXT: %[[NEGATE:.*]] = stablehlo.negate %arg0 : tensor<8xf32>
// CHECK-NEXT: %[[CALL0:.*]] = call @bar(%[[NEGATE]]) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE1:.*]] = sdy.data_flow_edge %[[CALL0]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS0:.*]] = stablehlo.abs %[[EDGE1]] : tensor<8xf32>
Expand Down Expand Up @@ -264,7 +263,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_non_flat_call_graph_one_after_the_other(%arg0: tensor<8xf32>)
func.func @simple_non_flat_call_graph_one_after_the_other(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL0:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL0:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE0:.*]] = sdy.data_flow_edge %[[CALL0]] : tensor<8xf32>
// CHECK-NEXT: %[[CALL1:.*]] = call @foo(%[[EDGE0]]) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE1:.*]] = sdy.data_flow_edge %[[CALL1]] : tensor<8xf32>
Expand Down Expand Up @@ -301,7 +300,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @call_on_same_func_twice_input_of_one_is_output_of_the_other(%arg0: tensor<8xf32>)
func.func @call_on_same_func_twice_input_of_one_is_output_of_the_other(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL0:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL0:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE0:.*]] = sdy.data_flow_edge %[[CALL0]] : tensor<8xf32>
// CHECK-NEXT: %[[CALL1:.*]] = call @bar(%[[EDGE0]]) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE1:.*]] = sdy.data_flow_edge %[[CALL1]] : tensor<8xf32>
Expand All @@ -325,7 +324,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_argument_is_input_to_call(%arg0: tensor<8xf32>)
func.func @simple_call_graph_argument_is_input_to_call(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%arg0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: %[[ABS:.*]] = stablehlo.abs %[[EDGE]]
%0 = call @bar(%arg0) : (tensor<8xf32>) -> (tensor<8xf32>)
Expand All @@ -345,7 +344,7 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_result_is_the_output_of_call(%arg0: tensor<8xf32>)
func.func @simple_call_graph_result_is_the_output_of_call(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// CHECK: %[[CALL:.*]] = call @bar(%1) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK: %[[CALL:.*]] = call @bar(%0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: return %[[EDGE]]
%0 = stablehlo.abs %arg0 : tensor<8xf32>
Expand All @@ -366,10 +365,33 @@ func.func @bar(%arg0: tensor<8xf32>) -> tensor<8xf32> {

// CHECK-LABEL: @simple_call_graph_entry_contains_call_only(%arg0: tensor<8xf32>)
func.func @simple_call_graph_entry_contains_call_only(%arg0: tensor<8xf32>) -> tensor<8xf32> {
// The diff render kept both the stale `call @bar(%0)` CHECK line and its
// replacement; two consecutive CHECK: directives for one op break FileCheck,
// so only the updated expectation (operand stays `%arg0`, i.e. no edge is
// inserted for the entry function's arguments) is kept.
// CHECK: %[[CALL:.*]] = call @bar(%arg0) : (tensor<8xf32>) -> tensor<8xf32>
// CHECK-NEXT: %[[EDGE:.*]] = sdy.data_flow_edge %[[CALL]] : tensor<8xf32>
// CHECK-NEXT: return %[[EDGE]]
%0 = call @bar(%arg0) : (tensor<8xf32>) -> (tensor<8xf32>)
return %0 : tensor<8xf32>
}

// -----

// CHECK-LABEL: func private @top_k_gt_f32_comparator
// Comparator referenced by the PartialReduce custom call below via
// `called_computations`; it is not a `func.call` callee, and the CHECK-NEXT
// lines pin its body as unchanged (no data flow edges inserted into it).
func.func private @top_k_gt_f32_comparator(%arg0: tensor<f32>, %arg1: tensor<f32>, %arg2: tensor<i32>, %arg3: tensor<i32>) -> tensor<i1> {
// CHECK-NEXT: %0 = stablehlo.compare
// CHECK-NEXT: return %0
%0 = stablehlo.compare GT, %arg0, %arg1 : (tensor<f32>, tensor<f32>) -> tensor<i1>
return %0 : tensor<i1>
}

// CHECK-LABEL: func @custom_call_partial_reduce
// The pass's callee walk only visits `func.call` ops, so a function reached
// through a custom call's `called_computations` must be handled gracefully.
// NOTE(review): there are no CHECK lines on the custom call's results —
// presumably this test only guards against the pass dying on such a
// reference; confirm intent.
func.func @custom_call_partial_reduce(%arg0: tensor<16x4xf32>, %arg1: tensor<16x4xf32>, %arg2: tensor<f32>, %arg3: tensor<i32>) -> (tensor<16x2xf32>, tensor<16x2xf32>) {
%0:2 = stablehlo.custom_call @PartialReduce(%arg0, %arg1, %arg2, %arg3) {
mhlo.backend_config = {
aggregate_to_topk = true,
recall_target = 0.9 : f32,
reduction_dim = 1 : i64,
reduction_input_size_override = -1 : i64,
top_k = 2 : i64},
called_computations = [@top_k_gt_f32_comparator]} :
(tensor<16x4xf32>, tensor<16x4xf32>, tensor<f32>, tensor<i32>) -> (tensor<16x2xf32>, tensor<16x2xf32>)
return %0#0, %0#1 : tensor<16x2xf32>, tensor<16x2xf32>
}
Loading
Loading