// Custom-call target names for SPMD-internal ops handled by the partitioner.
constexpr char kSPMDOpRotateRight[] = "_SPMDInternalOp_RotateRight";
constexpr char kSPMDOpMultiRotate[] = "_SPMDInternalOp_MultiRotate";
constexpr char kSPMDOpWrap[] = "_SPMDInternalOp_Wrap";
8788
8889} // namespace
8990
@@ -579,6 +580,218 @@ absl::Status SpmdPartitioningVisitor::HandleCustomCallSPMDInternal_MultiRotate(
579580 return absl::OkStatus ();
580581}
581582
583+ absl::Status SpmdPartitioningVisitor::HandleCustomCallSPMDInternal_Wrap (
584+ HloInstruction* hlo) {
585+ TF_ASSIGN_OR_RETURN (auto attrs, ParseOpaqueAsAttributes (hlo));
586+ auto dim_it = attrs.find (" dimension" );
587+ TF_RET_CHECK (dim_it != attrs.end ())
588+ << " No dimension attribute in SPMD multi rotate op" ;
589+ int64_t dim = dim_it->second ;
590+
591+ auto left_amount_it = attrs.find (" left_amount" );
592+ TF_RET_CHECK (left_amount_it != attrs.end ())
593+ << " No left_amount attribute in SPMD multi rotate op" ;
594+ int64_t left_amount = left_amount_it->second ;
595+
596+ auto right_amount_it = attrs.find (" right_amount" );
597+ TF_RET_CHECK (right_amount_it != attrs.end ())
598+ << " No right_amount attribute in SPMD multi rotate op" ;
599+ int64_t right_amount = right_amount_it->second ;
600+
601+ int32_t totalResults = left_amount + right_amount + 1 ;
602+
603+ PartitionedHlo input = GetPartitionedHlo (hlo->operand (0 ));
604+ HloSharding element_sharding = hlo->sharding ();
605+
606+ TF_RET_CHECK (
607+ !(element_sharding.IsReplicated () || element_sharding.IsTileMaximal ()))
608+ << " MultiRotate op requires sharding along the rotate dimension." ;
609+
610+ input = input.Reshard (element_sharding);
611+
612+ const Shape& pre_wrap_shape = hlo->operand (0 )->shape ().tuple_shapes (0 );
613+ const int64_t full_pre_wrap_size = pre_wrap_shape.dimensions (dim);
614+ const int64_t shard_size = input.hlo ()->shape ().dimensions (dim);
615+
616+ const int64_t participating_shards =
617+ CeilOfRatio (full_pre_wrap_size, shard_size);
618+ const int64_t right_padding =
619+ participating_shards * shard_size - full_pre_wrap_size;
620+
621+ HloInstruction* local_input = input.hlo ();
622+ HloInstruction* padded_local_input = local_input;
623+
624+ if (right_padding > 0 ) {
625+ auto paddingConfig = MakeNoPaddingConfig (pre_wrap_shape.dimensions_size ());
626+ paddingConfig.mutable_dimensions (dim)->set_edge_padding_high (right_padding);
627+ auto zero = b_.AddInstruction (HloInstruction::CreateConstant (
628+ LiteralUtil::Zero (pre_wrap_shape.element_type ())));
629+ Shape padded_shape = local_input->shape ();
630+ padded_shape.set_dimensions (dim,
631+ padded_shape.dimensions (dim) + right_padding);
632+ padded_local_input = b_.AddInstruction (HloInstruction::CreatePad (
633+ padded_shape, local_input, zero, paddingConfig));
634+ }
635+
636+ HloInstruction* left_halo = nullptr ;
637+ if (left_amount > 0 ) {
638+ std::vector<std::pair<int64_t , int64_t >> pairs;
639+ element_sharding.tile_assignment ().Each (
640+ [&](absl::Span<const int64_t > indices, int64_t device) {
641+ if (indices[dim] >= participating_shards) {
642+ return ;
643+ }
644+ std::vector<int64_t > dst_idx (indices.begin (), indices.end ());
645+ dst_idx[dim] += 1 ;
646+ dst_idx[dim] %= participating_shards;
647+ pairs.emplace_back (device,
648+ element_sharding.tile_assignment ()(dst_idx));
649+ });
650+
651+ Shape slice_shape = padded_local_input->shape ();
652+ slice_shape.set_dimensions (dim, left_amount);
653+ std::vector<int64_t > slice_starts (full_pre_wrap_size.dimensions_size (), 0 );
654+ slice_starts[dim] = 0 ;
655+ std::vector<int64_t > slice_limits (
656+ padded_local_input->shape ().dimensions ().begin (),
657+ padded_local_input->shape ().dimensions ().end ());
658+ slice_limits[dim] = left_amount;
659+ HloInstruction* slice_to_send =
660+ b_.AddInstruction (HloInstruction::CreateSlice (
661+ slice_shape, padded_local_input, slice_starts, slice_limits,
662+ std::vector<int64_t >(full_pre_wrap_size.dimensions_size (), 1 )));
663+
664+ left_halo = collective_ops_creator_.create_collective_permute (
665+ &b_, slice_to_send, pairs, NewChannel ());
666+ }
667+
668+ HloInstruction* shard_offset = MakePartitionOffsets (
669+ pre_wrap_shape, element_sharding, MakePartitioningState ().partition_id ,
670+ &b_, {dim})[dim];
671+
672+ HloInstruction* zero_offset = b_.AddInstruction (
673+ HloInstruction::CreateConstant (LiteralUtil::CreateR0<int32_t >(0 )));
674+
675+ HloInstruction* right_halo = nullptr ;
676+ if (right_amount > 0 ) {
677+ std::vector<std::pair<int64_t , int64_t >> pairs;
678+ element_sharding.tile_assignment ().Each (
679+ [&](absl::Span<const int64_t > indices, int64_t device) {
680+ if (indices[dim] >= participating_shards) {
681+ return ;
682+ }
683+ std::vector<int64_t > dst_idx (indices.begin (), indices.end ());
684+ dst_idx[dim] += participating_shards - 1 ;
685+ dst_idx[dim] %= participating_shards;
686+ pairs.emplace_back (device,
687+ element_sharding.tile_assignment ()(dst_idx));
688+ });
689+
690+ HloInstruction* base_start =
691+ b_.AddInstruction (HloInstruction::CreateConstant (
692+ LiteralUtil::CreateR0<int32_t >(shard_size - right_amount)));
693+ HloInstruction* padding_val =
694+ b_.AddInstruction (HloInstruction::CreateConstant (
695+ LiteralUtil::CreateR0<int32_t >(right_padding)));
696+
697+ HloInstruction* total_mesh_size =
698+ b_.AddInstruction (HloInstruction::CreateConstant (
699+ LiteralUtil::CreateR0<int32_t >(participating_shards * shard_size)));
700+
701+ // shard_offset tells us the current start index.
702+ // The last shard is the one where shard_offset + shard_size ==
703+ // total_mesh_size.
704+ HloInstruction* shard_size_inst =
705+ b_.AddInstruction (HloInstruction::CreateConstant (
706+ LiteralUtil::CreateR0<int32_t >(shard_size)));
707+ HloInstruction* current_shard_end = b_.AddInstruction (
708+ HloInstruction::CreateBinary (shard_offset->shape (), HloOpcode::kAdd ,
709+ shard_offset, shard_size_inst));
710+
711+ HloInstruction* is_last = b_.AddInstruction (HloInstruction::CreateCompare (
712+ ShapeUtil::ChangeElementType (shard_offset->shape (), PRED),
713+ current_shard_end, total_mesh_size, Comparison::Direction::kEq ));
714+
715+ HloInstruction* offset = b_.AddInstruction (
716+ HloInstruction::CreateTernary (shard_offset->shape (), HloOpcode::kSelect ,
717+ is_last, padding_val, zero_offset));
718+
719+ HloInstruction* start_idx = b_.AddInstruction (HloInstruction::CreateBinary (
720+ base_start->shape (), HloOpcode::kSubtract , base_start, offset));
721+
722+ Shape dynamic_slice_shape = padded_local_input->shape ();
723+ dynamic_slice_shape.set_dimensions (dim, right_amount);
724+
725+ std::vector<HloInstruction*> start_indices (
726+ pre_wrap_shape.dimensions_size ());
727+ for (int i = 0 ; i < pre_wrap_shape.dimensions_size (); i++) {
728+ start_indices[i] = zero_offset;
729+ }
730+ start_indices[dim] = start_idx;
731+
732+ std::vector<int64_t > slice_sizes (
733+ padded_local_input->shape ().dimensions ().begin (),
734+ padded_local_input->shape ().dimensions ().end ());
735+ slice_sizes[dim] = right_amount;
736+
737+ HloInstruction* slice_to_send =
738+ b_.AddInstruction (HloInstruction::CreateDynamicSlice (
739+ dynamic_slice_shape, padded_local_input, start_indices,
740+ slice_sizes));
741+
742+ right_halo = collective_ops_creator_.create_collective_permute (
743+ &b_, slice_to_send, pairs, NewChannel ());
744+ }
745+
746+ std::vector<HloInstruction*> concat_ops;
747+ if (left_halo) {
748+ concat_ops.push_back (left_halo);
749+ }
750+ concat_ops.push_back (padded_local_input);
751+ if (right_halo) {
752+ concat_ops.push_back (right_halo);
753+ }
754+
755+ HloInstruction* super_shard = padded_local_input;
756+ if (concat_ops.size () > 1 ) {
757+ Shape concat_shape = local_input->shape ();
758+ concat_shape.set_dimensions (
759+ dim, shard_size + right_padding + left_amount + right_amount);
760+ super_shard = b_.AddInstruction (
761+ HloInstruction::CreateConcatenate (concat_shape, concat_ops, dim));
762+ }
763+
764+ int64_t post_wrap_shard_size =
765+ CeilOfRatio (hlo->shape ().dimensions (dim), participating_shards);
766+
767+ Shape slice_shape = super_shard->shape ();
768+ slice_shape.set_dimensions (dim, post_wrap_shard_size);
769+
770+ HloInstruction* shard_size_change =
771+ b_.AddInstruction (HloInstruction::CreateConstant (
772+ LiteralUtil::CreateR0<int32_t >(post_wrap_shard_size - shard_size)));
773+
774+ HloInstruction* start = b_.AddInstruction (
775+ HloInstruction::CreateBinary (shard_offset->shape (), HloOpcode::kMultiply ,
776+ shard_offset, shard_size_change));
777+
778+ std::vector<HloInstruction*> start_indices (pre_wrap_shape.dimensions_size ());
779+ for (int i = 0 ; i < pre_wrap_shape.dimensions_size (); i++) {
780+ start_indices[i] = zero_offset;
781+ }
782+ start_indices[dim] = start;
783+
784+ std::vector<int64_t > slice_sizes (super_shard->shape ().dimensions ().begin (),
785+ super_shard->shape ().dimensions ().end ());
786+ slice_sizes[dim] = post_wrap_shard_size;
787+
788+ HloInstruction* result = b_.AddInstruction (HloInstruction::CreateDynamicSlice (
789+ slice_shape, super_shard, start_indices, slice_sizes));
790+
791+ SetPartitionedHlo (hlo, [&] { return result; });
792+ return absl::OkStatus ();
793+ }
794+
582795std::unique_ptr<HloInstruction> CreateCustomCallSPMDInternal_RotateRight (
583796 HloInstruction* input, int64_t dim, int64_t amount) {
584797 std::string opaque = absl::StrCat (" dimension=" , dim, " ,amount=" , amount);
@@ -624,6 +837,9 @@ absl::Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) {
624837 if (hlo->custom_call_target () == kSPMDOpMultiRotate ) {
625838 return HandleCustomCallSPMDInternal_MultiRotate (hlo);
626839 }
840+ if (hlo->custom_call_target () == kSPMDOpWrap ) {
841+ return HandleCustomCallSPMDInternal_Wrap (hlo);
842+ }
627843
628844 if (hlo->sharding ().HasUniqueDevice ()) {
629845 return HandleSingleDevice (hlo);
0 commit comments