microsoft · Binyang2014 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026 · Feb 11, 2026
diff --git a/python/mscclpp/language/collectives.py b/python/mscclpp/language/collectives.py
@@ -236,3 +236,46 @@ def init_buffers(self):
             }
             rank_buffers.append(buffers)
         return rank_buffers
+
+
+class SendRecv(Collective):
+    """Point-to-point send/receive collective.
+
+    Every rank sends the contents of its input buffer to the next rank and
+    receives the previous rank's data into its own output buffer.
+
+    Since each rank sends exactly as much data as it receives, the input and
+    output buffers are both sized by chunk_factor.
+    """
+
+    def __init__(self, num_ranks, chunk_factor, inplace):
+        """Create a SendRecv collective.
+
+        Args:
+            num_ranks (int): Number of ranks taking part in the SendRecv.
+            chunk_factor (int): Size factor for the data chunks.
+            inplace (bool): Whether the operation is performed in-place.
+
+        Example:
+            >>> sendrecv = SendRecv(num_ranks=4, chunk_factor=1, inplace=False)
+        """
+        Collective.__init__(self, num_ranks, chunk_factor, inplace)
+        self.name = "sendrecv"
+
+    def init_buffers(self):
+        """Build the input and output buffers of every rank.
+
+        Both buffers of a rank start at offset 0 and are sized by
+        chunk_factor.
+
+        Returns:
+            list: One buffer dictionary per rank, keyed by BufferType.
+        """
+        return [
+            {
+                buffer_type: BaseBuffer(rank, buffer_type, 0, self.chunk_factor)
+                for buffer_type in (BufferType.input, BufferType.output)
+            }
+            for rank in range(self.num_ranks)
+        ]
diff --git a/python/mscclpp/language/rank.py b/python/mscclpp/language/rank.py
@@ -304,11 +304,16 @@ def __init__(self, rank: int, buffer_type: BufferType, offset: int, size: int):
         self.size = offset + size
 
     def __getitem__(self, key):
-        if self.offset + key.stop > self.size:
-            raise RuntimeError(
-                f"Index range from {self.offset + key.start} - {self.offset + key.stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
-            )
-        return Chunk(self.rank, self.buffer_type, self.offset + key.start, key.stop - key.start)
+        """Return the Chunk for slice ``key`` (bounds relative to this buffer; ``key.step`` is not read)."""
+        if not isinstance(key, slice):
+            raise TypeError(f"Buffer indices must be slices, not {type(key).__name__}")
+        start = 0 if key.start is None else key.start  # omitted start: beginning of this buffer
+        stop = self.size - self.offset if key.stop is None else key.stop  # omitted stop: end of this buffer
+        if not 0 <= start <= stop or self.offset + stop > self.size:  # also rejects negative/reversed bounds
+            raise RuntimeError(
+                f"Index range from {self.offset + start} - {self.offset + stop} is out of bounds for buffer {self.buffer_type}. Buffer size: {self.size}"
+            )
+        return Chunk(self.rank, self.buffer_type, self.offset + start, stop - start)
 
 
 class Buffer(BaseBuffer):

diff --git a/python/mscclpp/language/tests/multi_node/send_recv.py b/python/mscclpp/language/tests/multi_node/send_recv.py
@@ -0,0 +1,89 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def send_recv(name, nnodes, gpus_per_node, split_mask, instances):
+    """Print a JSON plan in which every rank sends its input buffer to a peer's output buffer.
+
+    Ranks form groups of ``split_mask + 1`` consecutive ranks. Each rank sends to the rank at the same
+    position in the next group and receives from the one in the previous group (groups wrap around).
+
+    Args:
+        name: Program name passed to CollectiveProgram.
+        nnodes: Number of nodes.
+        gpus_per_node: GPUs per node; the total rank count is ``nnodes * gpus_per_node``.
+        split_mask: Mask of the form ``2**k - 1`` giving a rank's position within its group.
+        instances: Instance count passed to CollectiveProgram.
+
+    Raises:
+        ValueError: If split_mask is not 2**k - 1, or the rank count is not a multiple of the group size.
+    """
+    gpu_size = nnodes * gpus_per_node
+    group_size = split_mask + 1
+    if split_mask < 0 or split_mask & group_size:  # rank & split_mask must equal rank % group_size
+        raise ValueError(f"split_mask must be of the form 2**k - 1, got {split_mask:#x}")
+    if gpu_size % group_size:
+        raise ValueError(f"rank count {gpu_size} must be a multiple of group size {group_size}")
+    num_groups = gpu_size // group_size
+    collective = SendRecv(gpu_size, 1, False)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="Simple",
+        num_threads_per_block=1024,
+        use_double_scratch_buffer=False,
+        min_message_size=0,
+        max_message_size=2**64 - 1,
+        instances=instances,
+    ):
+        # Each rank gets separate port channels for its next and prev peers. When both peers are the
+        # same rank (e.g. a ring of two groups) the two channels get distinct tags, so the higher rank
+        # creates them in the opposite order to make the tags cross-match:
+        #   Lower rank:  [next(tag0), prev(tag1)]    Higher rank: [prev(tag0), next(tag1)]
+        # With distinct peers each channel gets tag 0 and the creation order does not matter.
+        next_channels = {}  # rank -> channel used to send to the next rank
+        prev_channels = {}  # rank -> channel used to receive from the prev rank
+        next_ids = {}
+        for rank_id in range(gpu_size):  # same order as iterating nodes, then the GPUs of each node
+            position_in_group = rank_id & split_mask
+            group_id = rank_id // group_size
+            next_id = ((group_id + 1) % num_groups) * group_size + position_in_group
+            prev_id = ((group_id - 1) % num_groups) * group_size + position_in_group
+            if prev_id == next_id and rank_id > prev_id:
+                prev_channels[rank_id] = PortChannel(prev_id, rank_id)
+                next_channels[rank_id] = PortChannel(next_id, rank_id)
+            else:
+                next_channels[rank_id] = PortChannel(next_id, rank_id)
+                prev_channels[rank_id] = PortChannel(prev_id, rank_id)
+            next_ids[rank_id] = next_id
+
+        for rank_id, next_id in next_ids.items():
+            # Signal on the prev channel and wait on the next channel before putting the data.
+            prev_channels[rank_id].signal(tb=0, data_sync=SyncType.none)
+            next_channels[rank_id].wait(tb=0, data_sync=SyncType.after)
+            src_buffer = Rank(rank_id).get_input_buffer()
+            dst_buffer = Rank(next_id).get_output_buffer()
+            next_channels[rank_id].put_with_signal(dst_buffer[:], src_buffer[:], tb=0)
+            prev_channels[rank_id].wait(tb=0, data_sync=SyncType.none)
+        print(JSON())
+
+
+# Script entry point: parse the topology options and print the generated plan as JSON.
+parser = argparse.ArgumentParser()
+parser.add_argument("--name", type=str, help="name of the program")
+parser.add_argument("--nnodes", type=int, default=1, help="number of nodes")
+# Required: without it gpus_per_node is None and `nnodes * gpus_per_node` raises TypeError.
+parser.add_argument("--gpus_per_node", type=int, required=True, help="number of gpus per node")
+parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x3, help="split mask (e.g. 0x3)")
+parser.add_argument("--instances", type=int, default=4, help="number of instances")
+
+args = parser.parse_args()
+send_recv(args.name, args.nnodes, args.gpus_per_node, args.split_mask, args.instances)
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
@@ -14,6 +14,7 @@
 from mscclpp.utils import KernelBuilder, pack
 import os
 import struct
+from typing import Callable, Union
 
 import cupy as cp
 from mpi4py import MPI
@@ -34,13 +35,16 @@ def parse_dtype(dtype_str):
         raise ValueError(f"Unknown data type: {dtype_str}")
 
 
-def bench_time(n_iters: int, n_graph_iters: int, func):
-    # capture cuda graph for n_iters of the kernel launch
+def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Callable]]):
+    """Benchmark execution time. func can be a single callable or a list of 2 for double-buffer."""
     stream = cp.cuda.Stream(non_blocking=True)
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            func(stream)
+            if isinstance(func, list):
+                func[i % 2](stream)
+            else:
+                func(stream)
         graph = stream.end_capture()
 
     # now run a warm up round
@@ -61,16 +65,19 @@ def bench_time(n_iters: int, n_graph_iters: int, func):
 
 def bench_correctness(
     collective: str,
-    input_buf: cp.ndarray,
-    result_buf: cp.ndarray,
-    test_buf: cp.ndarray,
+    input_buf: Union[cp.ndarray, list[cp.ndarray]],
+    result_buf: Union[cp.ndarray, list[cp.ndarray]],
+    test_buf: Union[cp.ndarray, list[cp.ndarray]],
     dtype_str: str,
     rank: int,
     num_ranks: int,
     n_iters: int,
-    func,
+    func: Union[Callable, list[Callable]],
+    split_mask: int = 0,
 ):
+    """Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
     type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
+    double_buf = isinstance(input_buf, list)
 
     fill_data_kernel_name = "fill_data_%s" % dtype_str
     if "allgather" in collective:
@@ -79,8 +86,10 @@ def bench_correctness(
         coll = "reduce_scatter"
     elif "allreduce" in collective:
         coll = "all_reduce"
+    elif "sendrecv" in collective:
+        coll = "send_recv"
     else:
-        coll = "all_to_all"
+        raise ValueError(f"Unknown collective: {collective}")
     test_data_kernel_name = "test_data_%s_%s" % (coll, dtype_str)
 
     file_dir = os.path.dirname(os.path.abspath(__file__))
@@ -97,11 +106,27 @@ def bench_correctness(
     with stream:
         stream.begin_capture()
         for i in range(n_iters):
-            fill_data_params = pack(input_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(rank, i)
+            if double_buf:
+                idx = i % 2
+                cur_input = input_buf[idx]
+                cur_result = result_buf[idx]
+                cur_test = test_buf[idx]
+                cur_func = func[idx]
+            else:
+                cur_input = input_buf
+                cur_result = result_buf
+                cur_test = test_buf
+                cur_func = func
+
+            fill_data_params = (
+                pack(cur_input) + struct.pack("Q", cur_input.nbytes // type_size) + pack(rank, i, split_mask)
+            )
             fill_data_kernel.launch_kernel(fill_data_params, nblocks, nthreads, 0, stream)
-            func(stream)
+            cur_func(stream)
             test_data_params = (
-                pack(result_buf, test_buf) + struct.pack("Q", input_buf.nbytes // type_size) + pack(num_ranks, rank, i)
+                pack(cur_result, cur_test)
+                + struct.pack("Q", cur_input.nbytes // type_size)
+                + pack(num_ranks, rank, i, split_mask)
             )
             test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
         graph = stream.end_capture()
@@ -147,6 +172,13 @@ def build_bufs(
     assert (size % type_size) == 0, "size %d not multiple of type size %d" % (size, type_size)
     nelems = size // type_size
 
+    # Sendrecv uses double buffering: return lists of 2 buffers
+    if "sendrecv" in collective:
+        input_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
+        result_bufs = [GpuBuffer(nelems, dtype=dtype) for _ in range(2)]
+        test_bufs = [cp.zeros(nelems, dtype=dtype) for _ in range(2)]
+        return input_bufs, result_bufs, test_bufs, nelems
+
     if "allgather" in collective:
         assert (nelems % num_ranks) == 0, "nelems %d not multiple of num_ranks %d" % (nelems, num_ranks)
         nelems_input = nelems if in_place else nelems // num_ranks
@@ -173,7 +205,7 @@ def build_bufs(
 
     test_buf = cp.zeros(nelems, dtype=dtype)
 
-    return input_buf, result_buf, test_buf
+    return input_buf, result_buf, test_buf, nelems
 
 
 def main(
@@ -184,6 +216,7 @@ def main(
     packet_type: PacketType = PacketType.LL16,
     n_iters: int = 10,
     n_graph_iters: int = 10,
+    split_mask: int = 0,
 ):
     mscclpp_group = CommGroup(MPI.COMM_WORLD)
     cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
@@ -195,7 +228,7 @@ def main(
     collective = execution_plan.collective
 
     dtype = parse_dtype(dtype_str)
-    input_buf, result_buf, test_buf = build_bufs(
+    input_buf, result_buf, test_buf, nelem = build_bufs(
         collective,
         size,
         in_place,
@@ -204,17 +237,36 @@ def main(
         mscclpp_group.nranks,
     )
 
-    executor_func = lambda stream: executor.execute(
-        mscclpp_group.my_rank,
-        input_buf.data.ptr,
-        result_buf.data.ptr,
-        input_buf.nbytes,
-        result_buf.nbytes,
-        dtype_to_mscclpp_dtype(dtype_str),
-        execution_plan,
-        stream.ptr,
-        packet_type,
-    )
+    sendrecv_mode = "sendrecv" in collective
+
+    if sendrecv_mode:
+        # Double-buffer: create two executor funcs, one per buffer pair
+        executor_funcs = []
+        for idx in range(2):
+            func = lambda stream, i=idx: executor.execute(
+                mscclpp_group.my_rank,
+                input_buf[i].data.ptr,
+                result_buf[i].data.ptr,
+                input_buf[i].nbytes,
+                result_buf[i].nbytes,
+                dtype_to_mscclpp_dtype(dtype),
+                execution_plan,
+                stream.ptr,
+                packet_type,
+            )
+            executor_funcs.append(func)
+    else:
+        executor_func = lambda stream: executor.execute(
+            mscclpp_group.my_rank,
+            input_buf.data.ptr,
+            result_buf.data.ptr,
+            input_buf.nbytes,
+            result_buf.nbytes,
+            dtype_to_mscclpp_dtype(dtype),
+            execution_plan,
+            stream.ptr,
+            packet_type,
+        )
 
     mscclpp_group.barrier()
     bench_correctness(
@@ -226,17 +278,21 @@ def main(
         mscclpp_group.my_rank,
         mscclpp_group.nranks,
         n_iters,
-        executor_func,
+        executor_funcs if sendrecv_mode else executor_func,
+        split_mask=split_mask,
     )
 
     mscclpp_group.barrier()
-    execution_time = bench_time(n_iters, n_graph_iters, executor_func)
+    execution_time = bench_time(n_iters, n_graph_iters, executor_funcs if sendrecv_mode else executor_func)
     if npkit_dump_dir is not None:
         npkit.dump(npkit_dump_dir)
         npkit.shutdown()
+
+    result_nbytes = result_buf[0].nbytes if sendrecv_mode else result_buf.nbytes
     print(
         f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_buf.nbytes} bytes data type: {dtype_str} "
+        f"data size: {result_nbytes} bytes data type: {dtype().dtype.name} "
+        f"bandwidth: {result_nbytes / (execution_time * 1e-6) / (1024**3):.2f} GB/s, "
         f"packet type: {packet_type}"
     )
     executor = None
@@ -252,6 +308,9 @@ def main(
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
     parser.add_argument("--n_graph_iters", type=int, default=10)
+    parser.add_argument(
+        "--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)"
+    )
     args = parser.parse_args()
 
     packet_type = PacketType.LL16
@@ -267,4 +326,5 @@ def main(
         packet_type,
         args.n_iters,
         args.n_graph_iters,
+        args.split_mask,
     )