17 changes: 14 additions & 3 deletions docs/source/advanced/epochized_blending.md
@@ -25,20 +25,24 @@ splits:
split_part: val
```

Now, the call to `get_train_dataset` requires the additional parameter `repeat=False` to interrupt iterating after one epoch:
Now, the call to `get_train_dataset` requires the additional parameter `repeat=False` to interrupt iterating after one epoch.
In addition, the `RedistributeLoader` is used to synchronize the last samples when some ranks have already exhausted their data:

```py
from megatron.energon import get_train_dataset, get_loader, WorkerConfig
from megatron.energon.sync_end import RedistributeLoader


loader = get_loader(get_train_dataset(
# Set repeat=False to avoid repeating the dataset.
# Use RedistributeLoader to synchronize ranks when some exhaust their data before others. Only works if torch distributed is initialized.
loader = RedistributeLoader(get_loader(get_train_dataset(
Collaborator commented:
I think it would be better if this was transparent to the user, so make get_loader handle this internally.
And since the choice of RedistributeLoader vs. StopFirstLoader actually changes the data that's being iterated, this choice should be made in the metadataset and not in the code, I think.
As a property of blend_epochized.

I.e. blend_epochized can either be a list as before (chooses default RedistributeLoader), or it can be a dict for more customization like

    blend_epochized:
      phase_out_behavior: stop_first_loader
      datasets:
        - repetitions: 5
          path: ./coco
          # ... Other parameters
        - repetitions: 2
          path: ./coyo
        - repetitions: 1
          path: ./coyo
          split_part: val

Collaborator (author) replied:

Okay, we can make this be handled in get_loader. I personally would prefer it to be separate though, as it's a feature on top?

Regarding moving the configuration to the metadataset:
I see your point that this slightly modifies how the data is iterated, but I'd also argue:

  1. So far we don't really rely on torch distributed, whereas this piece of code is tightly bound to it.
  2. This would also disable nesting of blend_epochized, because you cannot nest different (or unconfigured) phase_out_behavior.
  3. This depends on repeat=False and doesn't make sense if repeat=True, so it's based on what the user sets in the code.
  4. At least for RedistributeLoader it should not really change the data frequency (so far the settings in the metadataset mainly focus on data frequency / blend).
  5. If we move the boundary of metadataset to include this, then we should also have gradient accumulation, seeds, batch size, handling of incomplete batches, etc. in the config. I wouldn't want that, tbh.

Thus voting for keeping this in code, not in the metadataset config.

'metadataset.yaml',
batch_size=2,
shuffle_buffer_size=100,
max_samples_per_sequence=100,
worker_config=WorkerConfig.default_worker_config(),
repeat=False,
))
)))

# This will now stop iterating after the datasets have been iterated (coco 5 times, coyo-train 2
# times and coyo-val 1 time). Of course, the data is still being shuffled between all those
@@ -54,3 +58,10 @@ for batch in loader:

If used as a dataset for `get_val_dataset`, the `repetitions` are ignored.
The metadataset would also work without setting `repeat=False`, but then the shuffle buffer will shuffle samples across epoch boundaries.
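
As a minimal sketch, a validation loader could be built from the same metadataset (assuming `get_val_dataset` accepts the path, `batch_size` and `worker_config` arguments shown here):

```py
from megatron.energon import get_val_dataset, get_loader, WorkerConfig

# The `repetitions` from blend_epochized are ignored for validation.
val_loader = get_loader(get_val_dataset(
    'metadataset.yaml',
    batch_size=2,
    worker_config=WorkerConfig.default_worker_config(),
))
```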

## Sample Redistribution

There are two options for handling the end of the data iteration:

* `RedistributeLoader`: Redistributes samples when a rank is exhausted before the other ranks. On the next epoch, the samples from the incomplete batches are included.
* `StopFirstLoader`: Stops as soon as the first rank is exhausted. The next epoch will iterate until the next loader stops, restarting all ranks once (see the sketch below).
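
A minimal sketch of the `StopFirstLoader` variant, assuming it wraps the loader the same way `RedistributeLoader` does in the example above:

```py
from megatron.energon import get_train_dataset, get_loader, WorkerConfig
from megatron.energon.sync_end import StopFirstLoader

# Stop all ranks as soon as the first rank runs out of data.
# Assumes StopFirstLoader wraps a loader just like RedistributeLoader above.
loader = StopFirstLoader(get_loader(get_train_dataset(
    'metadataset.yaml',
    batch_size=2,
    shuffle_buffer_size=100,
    max_samples_per_sequence=100,
    worker_config=WorkerConfig.default_worker_config(),
    repeat=False,
)))
```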
21 changes: 21 additions & 0 deletions src/megatron/energon/sync_end/__init__.py
@@ -0,0 +1,21 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-License-Identifier: BSD-3-Clause
from megatron.energon.sync_end.redistribute import RedistributeDataLoaderState, RedistributeLoader
from megatron.energon.sync_end.stop_first_end import StopFirstDataLoaderState, StopFirstLoader

"""
Provides wrappers for the dataset loaders that allow for synchronization at the end of the dataset.
I.e. when running a training with repeat=False, the loaders on different ranks will typically be exhausted at different times,
which may require synchronization across ranks.

The wrappers are:
- RedistributeLoader: Redistributes the last samples to the ranks that are not exhausted.
- StopFirstLoader: Stops iterating as soon as the first rank is exhausted.
"""

__all__ = [
"RedistributeLoader",
"RedistributeDataLoaderState",
"StopFirstLoader",
"StopFirstDataLoaderState",
]
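
For illustration, a hypothetical helper mirroring the `phase_out_behavior` idea from the review discussion (the `wrap_loader` function and its argument values are assumptions, not part of the package; both wrappers are assumed to take the loader as their only required argument):

```py
from megatron.energon.sync_end import RedistributeLoader, StopFirstLoader


def wrap_loader(loader, phase_out_behavior="redistribute"):
    # RedistributeLoader hands leftover samples to ranks that still have data;
    # StopFirstLoader ends the pass as soon as any rank is exhausted.
    if phase_out_behavior == "redistribute":
        return RedistributeLoader(loader)
    elif phase_out_behavior == "stop_first":
        return StopFirstLoader(loader)
    raise ValueError(f"Unknown phase_out_behavior: {phase_out_behavior!r}")
```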