|
| 1 | +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import paddle |
| 16 | +import paddle.distributed as dist |
| 17 | +from paddle.distributed.communication.batch_isend_irecv import ( |
| 18 | + _coalescing_manager as batch_isend_irecv_coalescing_manager, |
| 19 | +) |
| 20 | + |
| 21 | + |
def gather_varlen(input, dst, group, all_shape_and_dtype):
    """Gather variable-length tensors from all ranks to *dst*.

    The destination rank pre-allocates a single contiguous buffer for all
    incoming data to avoid memory fragmentation from intermediate concat.
    Non-destination ranks send their local slice and return None.

    Args:
        input: Local tensor slice (may be None if this rank contributes nothing).
        dst: Global rank of the destination.
        group: The process group.
        all_shape_and_dtype: List of (shape, dtype) tuples, one per rank.
            shape is None (or shape[0] == 0) when a rank has no data.

    Returns:
        Concatenated 1-D tensor on the destination rank; None elsewhere.
    """
    tasks = []

    if group.ranks[group.rank] == dst:
        # Destination: allocate one contiguous buffer and receive all slices.
        total_len = sum(s[0] for s, _ in all_shape_and_dtype if s is not None)
        # Fix: rank 0 may contribute no data (its shape entry is None), in
        # which case its dtype entry cannot be trusted. Take the dtype from
        # the first rank that actually has data; fall back to rank 0's entry
        # only in the degenerate all-empty case.
        dtype = next(
            (d for s, d in all_shape_and_dtype if s is not None),
            all_shape_and_dtype[0][1],
        )
        output_tensor = paddle.empty([total_len], dtype=dtype)

        task_info_list = []
        current_offset = 0

        with batch_isend_irecv_coalescing_manager(group, tasks):
            for src in range(group.nranks):
                shape = all_shape_and_dtype[src][0]
                if shape is None or shape[0] == 0:
                    continue  # this rank contributes nothing
                length = shape[0]
                if src != group.rank:
                    # Remote slice: post an async recv into a staging tensor;
                    # it is copied into the big buffer after wait() below.
                    recv_tensor = paddle.empty(shape, dtype=all_shape_and_dtype[src][1])
                    task = dist.irecv(recv_tensor, group.ranks[src], group=group)
                    tasks.append(task)
                    task_info_list.append((task, recv_tensor, current_offset, length))
                else:
                    # Local slice: copy directly, no communication needed.
                    output_tensor[current_offset : current_offset + length] = input
                current_offset += length

        for task, recv_tensor, offset, length in task_info_list:
            task.wait()
            output_tensor[offset : offset + length] = recv_tensor
            del recv_tensor  # release the staging tensor promptly

        return output_tensor

    else:
        # Sender: push local slice to dst and return None.
        with batch_isend_irecv_coalescing_manager(group, tasks):
            if input is not None and input.shape[0] != 0:
                task = dist.isend(input, dst, group=group)
                tasks.append(task)

        for task in tasks:
            task.wait()

        return None
| 83 | + |
| 84 | + |
def get_sharding_info(buffer, param_name, world_size, rank):
    """Compute per-rank element counts and local offset for a sharded parameter.

    ShardingV2 splits the flat param storage evenly across ranks. This
    function intersects each rank's slice of that storage with the parameter's
    global range to produce the element count each rank owns.

    Args:
        buffer: The FusedCommBuffer that contains the parameter.
        param_name: Name of the parameter.
        world_size: Number of ranks in the sharding group.
        rank: Local rank in the sharding group.

    Returns:
        indices: List of element counts per rank (length == world_size).
        my_slice_offset: Offset of this rank's slice within the full flat param.
    """
    view = buffer._sharding_param_grad_view[param_name]

    # The parameter occupies [p_start, p_end) within the flat storage.
    p_start = view._index
    p_end = view._index + view._padded_size

    # ShardingV2 splits the storage buffer evenly across ranks, so rank r
    # owns [r * shard_size, (r + 1) * shard_size) of the flat storage.
    shard_size = buffer.param_storage.shape[0] // world_size

    # Length of the intersection of the parameter range with each rank's shard.
    indices = [
        max(0, min(p_end, (r + 1) * shard_size) - max(p_start, r * shard_size))
        for r in range(world_size)
    ]

    # This rank's slice starts right after the elements owned by lower ranks.
    my_slice_offset = sum(indices[:rank])

    return indices, my_slice_offset
| 126 | + |
| 127 | + |
def should_use_muon(name, shape):
    """Return True if a parameter should receive Muon (orthogonal) updates.

    Muon applies only to 2-D weight matrices. Embeddings, biases, and
    LM-head weights fall back to AdamW.
    """
    # Muon is only defined for matrices.
    if len(shape) != 2:
        return False
    # Exclude parameters whose (case-insensitive) name marks them as
    # embedding, bias, or LM-head weights.
    lowered = name.lower()
    return not any(token in lowered for token in ("embed", "bias", "lm_head"))
0 commit comments