Skip to content

Commit 73840cc

Browse files
eric-czech and claude
committed
levanter: add randomize_epochs flag to MixtureDataset
When set, each pass through a finite mixture component uses an independent permutation of its samples instead of the natural-order cycle (`raw_idx % L`). The flag defaults to False so existing behavior is preserved. The permutation is built by a new module-level helper `_compute_epoch_assignment` that derives a per-(dataset_id, epoch) Feistel permutation via `fold_in(fold_in(key, dataset_id), epoch)`, cached per instance. `_remap_indices` now takes the dataset id and dispatches to the per-epoch permutation when the flag is on; under FIRST_STOP_STRATEGY the flag is a no-op since no component completes more than one pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e0d23b3 commit 73840cc

2 files changed

Lines changed: 72 additions & 9 deletions

File tree

lib/levanter/src/levanter/data/mixture.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from levanter.utils.jax_utils import local_cpu_mesh
1616

1717
from levanter.data import AsyncDataset
18+
from levanter.data._prp import Permutation
1819
from levanter.schedule import BatchSchedule
1920
from levanter.utils.index import Index
2021
from levanter.utils.thread_utils import blocking_wait, future_from_value
@@ -47,6 +48,11 @@ class MixtureDataset(AsyncDataset[T]):
4748
- FIRST_STOP_STRATEGY: stop when one dataset has been exhausted
4849
- ALL_STOP_STRATEGY: stop when all datasets have been exhausted
4950
- RESTART_STRATEGY: restart the dataset when it has been exhausted
51+
randomize_epochs: if True, each pass through a finite mixture component uses an
52+
independent permutation of its samples; if False, the component is accessed in
53+
natural order via ``raw_idx % length``. Takes effect only for finite components
54+
under ``RESTART_STRATEGY`` or ``ALL_STOP_STRATEGY``; under ``FIRST_STOP_STRATEGY``
55+
no component completes more than one pass, so there is no second epoch to permute.
5056
key: random key for datasets sampling
5157
"""
5258

@@ -57,6 +63,7 @@ def __init__(
5763
block_size: int,
5864
*,
5965
randomize_blocks: bool = True,
66+
randomize_epochs: bool = False,
6067
key: PRNGKeyArray | int,
6168
stop_strategy: str = StopStrategy.RESTART_STRATEGY,
6269
):
@@ -94,6 +101,7 @@ def __init__(
94101
raise ValueError(f"Block size must be at most 2^16, got {block_size}")
95102

96103
self.randomize_blocks = randomize_blocks
104+
self.randomize_epochs = randomize_epochs
97105

98106
# this stupid dance is to ensure that the key is on CPU so we don't end up with weird device placement issues
99107
# in recent JAX.
@@ -255,7 +263,7 @@ async def get_batch(self, indices: Sequence[int]) -> Sequence[T]:
255263
batch_futures.append(future_from_value([]))
256264
else:
257265
dataset = self._dataset_of_id(dataset_id)
258-
indices_for_dataset = await self._remap_indices(dataset, indices_for_dataset)
266+
indices_for_dataset = await self._remap_indices(dataset, indices_for_dataset, dataset_id)
259267
batch_futures.append(dataset.get_batch(indices_for_dataset))
260268

261269
batches = await asyncio.gather(*batch_futures)
@@ -279,14 +287,12 @@ async def getitem_async(self, index: int) -> T:
279287
dataset_id, dataset_index = self._index_into_dataset_for_id(permuted_ids[index], block_id)
280288

281289
dataset = self._dataset_of_id(dataset_id)
282-
dataset_index = (await self._remap_indices(dataset, [dataset_index]))[0]
290+
dataset_index = (await self._remap_indices(dataset, [dataset_index], dataset_id))[0]
283291

284292
return await dataset.getitem_async(dataset_index)
285293

286-
async def _remap_indices(self, ds, indices_into_ds):
287-
"""
288-
Handles wrap around for datasets that have finite length
289-
"""
294+
async def _remap_indices(self, ds, indices_into_ds, dataset_id: int):
295+
"""Handles wrap around for datasets that have finite length."""
290296
if self.stop_strategy in [StopStrategy.RESTART_STRATEGY, StopStrategy.ALL_STOP_STRATEGY]:
291297
if ds.is_finite():
292298
length_of_dataset = await ds.async_len()
@@ -295,7 +301,10 @@ async def _remap_indices(self, ds, indices_into_ds):
295301
"MixtureDataset in RESTART_STRATEGY encountered an empty finite dataset "
296302
"(`async_len()` returned 0). Restart strategy does not support empty datasets."
297303
)
298-
indices_into_ds = [idx % length_of_dataset for idx in indices_into_ds]
304+
if self.randomize_epochs:
305+
indices_into_ds = self._apply_epoch_permutation(dataset_id, length_of_dataset, indices_into_ds)
306+
else:
307+
indices_into_ds = [idx % length_of_dataset for idx in indices_into_ds]
299308

300309
return indices_into_ds
301310

@@ -304,6 +313,22 @@ async def _remap_indices(self, ds, indices_into_ds):
304313

305314
raise ValueError(f"Unknown stop strategy: {self.stop_strategy}")
306315

316+
def _apply_epoch_permutation(self, dataset_id: int, length: int, indices_into_ds: Sequence[int]) -> list[int]:
    """Remap raw (possibly multi-epoch) indices through per-epoch permutations.

    Each raw index decomposes as ``epoch * length + offset``; the offset is
    sent through the permutation belonging to that (dataset, epoch) pair.
    """
    flat = np.asarray(indices_into_ds, dtype=np.int64)
    epoch_ids = flat // length
    offsets = flat % length
    remapped = np.empty_like(offsets)
    # A batch may straddle an epoch boundary; each epoch uses its own permutation,
    # so process the indices grouped by the epoch they fall into.
    for epoch_id in np.unique(epoch_ids).tolist():
        selector = epoch_ids == epoch_id
        permutation = self._get_epoch_permutation(dataset_id, int(epoch_id), length)
        remapped[selector] = permutation(offsets[selector])
    return [int(v) for v in remapped]
327+
328+
def _get_epoch_permutation(self, dataset_id: int, epoch: int, length: int) -> Permutation:
    """Return the cached permutation for one (dataset_id, epoch) pair.

    Uses a lazily-created, per-instance dict instead of ``functools.lru_cache``
    on the method: an lru_cache'd method keys on ``self`` and stores entries in
    a class-level cache, which keeps every MixtureDataset instance alive for
    the lifetime of the process (ruff B019) — contrary to the intended
    per-instance caching. The dict is bounded (FIFO eviction) so an
    ever-growing epoch count cannot leak memory.
    """
    cache = getattr(self, "_epoch_perm_cache", None)
    if cache is None:
        cache = {}
        self._epoch_perm_cache = cache
    cache_key = (dataset_id, epoch, length)
    perm = cache.get(cache_key)
    if perm is None:
        # Bound the cache like the previous lru_cache(maxsize=128) did;
        # evict the oldest entry (insertion order) when full.
        if len(cache) >= 128:
            cache.pop(next(iter(cache)))
        perm = _compute_epoch_assignment(dataset_id, epoch, length, self.key)
        cache[cache_key] = perm
    return perm
331+
307332
def _set_finiteness_cache(self, finite_length: int | None) -> int | None:
308333
self._cached_finite_length = finite_length
309334
self._is_finite_cache = finite_length is not None
@@ -503,6 +528,14 @@ def _compute_block_assignment(base_ids, index, key):
503528
return permuted_ids
504529

505530

531+
def _compute_epoch_assignment(dataset_id: int, epoch: int, length: int, key: PRNGKeyArray) -> Permutation:
    """Derive a deterministic Feistel permutation of ``[0, length)`` for one
    (dataset_id, epoch) pair by folding both ids into the mixture's base key.
    """
    with local_cpu_mesh():
        # fold_in twice so distinct (dataset, epoch) pairs get independent keys
        epoch_key = jax.random.fold_in(jax.random.fold_in(key, dataset_id), epoch)
        # round-trip through host memory — presumably to pin the key to a CPU
        # device and avoid accelerator placement issues; TODO confirm
        epoch_key = jax.device_put(jax.device_get(epoch_key))
        return Permutation.make("feistel", length, epoch_key)
537+
538+
506539
def rescale_mixture_schedule_for_batch_schedule(
507540
mixture_schedule: Sequence[Tuple[int, dict[str, float]]], batch_schedule: BatchSchedule
508541
) -> List[Tuple[int, dict[str, float]]]:

lib/levanter/tests/test_mixture.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,13 +203,13 @@ async def test_mixture_dataset_remap_indices():
203203
dses = datasets()
204204
mixture_ds = MixtureDataset(dses, weights(), block_size=10, key=key())
205205

206-
remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [0, 1, 2])
206+
remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [0, 1, 2], 0)
207207
assert len(remapped_indices) == 3
208208
assert remapped_indices == [0, 1, 2]
209209

210210
# check wrap around
211211
len_ds1 = await dses["ds1"].async_len()
212-
remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [len_ds1 - 1, len_ds1, len_ds1 + 1])
212+
remapped_indices = await mixture_ds._remap_indices(dses["ds1"], [len_ds1 - 1, len_ds1, len_ds1 + 1], 0)
213213
assert len(remapped_indices) == 3
214214

215215
assert remapped_indices == [len_ds1 - 1, 0, 1]
@@ -266,6 +266,36 @@ async def test_mixture_dataset_randomizes_blocks():
266266
assert not np.all(block_assignment_1 == block_assignment_3), "Block assignments should be randomized"
267267

268268

269+
@pytest.mark.asyncio
async def test_mixture_dataset_randomize_epochs_permutes_finite_component():
    """Each pass through a finite component is its own permutation when randomize_epochs=True."""
    finite_length = 8
    finite = ListAsyncDataset(list(range(finite_length)))  # value at index k is k
    dses = {"finite": finite, "infinite": InfiniteCounterDataset()}
    bs = 2 * finite_length
    weights = {"finite": 0.5, "infinite": 0.5}

    # With randomize_blocks=False and the finite dataset registered first,
    # positions [0, L) of every block come from the finite component.
    async def finite_orderings(ds: MixtureDataset, num_epochs: int) -> list[list[int]]:
        orderings = []
        for e in range(num_epochs):
            start = e * bs
            batch = await ds.get_batch(list(range(start, start + finite_length)))
            orderings.append(list(batch))
        return orderings

    shuffled = MixtureDataset(dses, weights, block_size=bs, key=key(), randomize_blocks=False, randomize_epochs=True)
    epochs = await finite_orderings(shuffled, num_epochs=3)

    expected = list(range(finite_length))
    # Every epoch must still visit each sample exactly once.
    for i, ordering in enumerate(epochs):
        assert sorted(ordering) == expected, f"Epoch {i} is not a permutation of [0, L): {ordering}"

    # The per-epoch orderings should not all coincide, and epoch 0 itself
    # should already be shuffled away from natural order.
    distinct = {tuple(o) for o in epochs}
    assert len(distinct) >= 2, f"Expected per-epoch orderings to differ, got {epochs}"
    assert epochs[0] != expected, f"Epoch 0 should not be the identity order, got {epochs[0]}"

    # Control: with the flag off, every epoch is the natural order.
    baseline = MixtureDataset(dses, weights, block_size=bs, key=key(), randomize_blocks=False, randomize_epochs=False)
    baseline_epochs = await finite_orderings(baseline, num_epochs=3)
    for i, ordering in enumerate(baseline_epochs):
        assert ordering == expected, f"Default (randomize_epochs=False) epoch {i}: {ordering}"
297+
298+
269299
@pytest.mark.asyncio
270300
async def test_mixture_dataset_samples_all_elements():
271301
mixture_ds = MixtureDataset(datasets(), weights(), block_size=10, key=key())

0 commit comments

Comments
 (0)