levanter: parallelize build_caches over components (#5388)

ravwojdyla-agent · ravwojdyla · web-flow · commit 8a465b6ceded · 2026-05-14T01:06:25.000Z
* parallelize `LmDataConfig.build_caches` since sequential GCS
round-trips dominated startup (~40 min for ~100 components in the
Datakit Testbed before the first training step)
* run per-component work in a `ThreadPoolExecutor` with
`max_workers=min(32, len(items))`; work is GCS-metadata-bound (ledger
reads, per-shard `ShardedTreeCache.__init__`) so threads fit [^1]
* refactor the loop body into a `_load_or_defer` helper returning `(name,
cache_or_None, build_args_or_None)`; pre-filter eligible components (skip
zero-weight train, `DirectDatasetComponent`, raise on unsupported types)
before scheduling, then post-filter `None` results when keying the result
dict; cache misses that need a build are deferred and built serially after
the pool, in the original component order
* wrap the executor in `rigging.timing.log_time` so total wall time per
`build_caches[<split>]` lands in the logs
* skip and exception semantics unchanged — one bad component still fails
the whole build
* add unit tests in `lib/levanter/tests/test_text.py`
* `test_build_caches_returns_all_components_in_parallel` — 4-component
build, asserts the result dict is keyed by name with the right cache
contents
* `test_build_caches_propagates_exception_from_one_component` — mixed
good/bad pair must raise so errors aren't swallowed by `pool.map`

[^1]: cap of 32 avoids hammering GCS on very large component lists.

---------

Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
diff --git a/lib/levanter/src/levanter/data/text/datasets.py b/lib/levanter/src/levanter/data/text/datasets.py
@@ -7,6 +7,7 @@
 import logging
 import os
 from collections.abc import Callable, Mapping, Sequence
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from functools import cached_property
 from typing import Literal, NotRequired, TypeAlias, TypeVar, TypedDict
@@ -18,6 +19,7 @@
 from draccus import ChoiceRegistry, field
 from haliax import Axis
 from jaxtyping import PRNGKeyArray
+from rigging.timing import log_time
 
 import levanter
 from levanter.data import AsyncDataset
@@ -861,58 +863,78 @@ def validation_grug_sets(self, *, seq_len: int) -> Mapping[str, AsyncDataset[Gru
         return self._validation_datasets_unwrapped(Pos)
 
     def build_caches(self, split: str) -> dict[str, TreeCache[dict]]:
-        caches: dict[str, TreeCache[dict]] = {}
+        items: list[tuple[str, "DatasetComponent"]] = []
         for name, component in self.components.items():
             if split == "train" and not self._has_nonzero_weight(name):
                 continue
-
             if isinstance(component, DirectDatasetComponent):
                 continue
-
             if not isinstance(component, DatasetComponent):
                 raise ValueError(f"Unsupported component type for {name}: {type(component)}")
-
+            items.append((name, component))
+
+        if not items:
+            return {}
+
+        # Loads are pure GCS metadata reads and parallelize cleanly. Builds may
+        # enter `_distributed_build_cache`, which uses unidentified jax
+        # collectives paired across processes by dispatch order — running
+        # multiple of those concurrently can cross-wire status broadcasts or
+        # hang. Classify each component in the pool, then build any misses
+        # serially in the original component order.
+        def _load_or_defer(
+            item: tuple[str, "DatasetComponent"],
+        ) -> tuple[str, TreeCache[dict] | None, tuple[str, ShardedDataSource, LmDatasetFormatBase] | None]:
+            name, component = item
             cache_root = _component_cache_dir(name, component, self.cache_dir)
+            cache_path = os.path.join(cache_root, split)
             source = component.source
 
             if source is None:
                 try:
-                    caches[name] = load_lm_dataset_cache(
-                        os.path.join(cache_root, split), component.format, self.the_tokenizer, self.enforce_eos
-                    )
+                    cache = load_lm_dataset_cache(cache_path, component.format, self.the_tokenizer, self.enforce_eos)
                 except FileNotFoundError:
                     raise ValueError(f"No source and no cache found for component {name} split {split}")
-                continue
+                return name, cache, None
 
             shard_source = source.get_shard_source(split)
+            cache_exists = fsspec_utils.exists(cache_path)
+
             if shard_source is None:
-                cache_path = os.path.join(cache_root, split)
-                if not fsspec_utils.exists(cache_path):
+                if not cache_exists:
                     logger.warning(f"No source for {name} in {split} split and no cache at {cache_path}, skipping")
-                    continue
-                caches[name] = load_lm_dataset_cache(
-                    cache_path, component.format, self.the_tokenizer, self.enforce_eos
-                )
-                continue
+                    return name, None, None
+                cache = load_lm_dataset_cache(cache_path, component.format, self.the_tokenizer, self.enforce_eos)
+                return name, cache, None
 
-            cache_path = os.path.join(cache_root, split)
             if not self.auto_build_caches:
-                if not fsspec_utils.exists(cache_path):
+                if not cache_exists:
                     raise FileNotFoundError(f"Cache not found at {cache_path} and auto_build_caches is disabled")
-                caches[name] = load_lm_dataset_cache(
-                    cache_path, component.format, self.the_tokenizer, self.enforce_eos
-                )
-                continue
+                cache = load_lm_dataset_cache(cache_path, component.format, self.the_tokenizer, self.enforce_eos)
+                return name, cache, None
+
+            if cache_exists:
+                cache = load_lm_dataset_cache(cache_path, component.format, self.the_tokenizer, self.enforce_eos)
+                return name, cache, None
+            return name, None, (cache_path, shard_source, component.format)
 
+        caches: dict[str, TreeCache[dict]] = {}
+        to_build: list[tuple[str, tuple[str, ShardedDataSource, LmDatasetFormatBase]]] = []
+        max_workers = min(32, len(items))
+        with (
+            log_time(f"build_caches[{split}] over {len(items)} components"),
+            ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="build_caches") as pool,
+        ):
+            for name, cache, build_args in pool.map(_load_or_defer, items):
+                if cache is not None:
+                    caches[name] = cache
+                elif build_args is not None:
+                    to_build.append((name, build_args))
+
+        for name, (cache_path, shard_source, fmt) in to_build:
             caches[name] = build_lm_dataset_cache(
-                cache_path,
-                shard_source,
-                component.format,
-                self.the_tokenizer,
-                self.cache_options,
-                self.enforce_eos,
+                cache_path, shard_source, fmt, self.the_tokenizer, self.cache_options, self.enforce_eos
             )
-
         return caches
 
     @property
diff --git a/lib/levanter/tests/test_text.py b/lib/levanter/tests/test_text.py
@@ -779,3 +779,38 @@ def test_chat_dataset_build_and_pack(dummy_chat_data):
 
             # loss_weight should coincide with assistant tokens only
             assert_loss_weight_matches_all_assistants(ex, tokenizer)
+
+
+# --- LmDataConfig.build_caches ---------------------------------------------
+
+
+def _write_prebuilt_jsonl(path: Path, records: list[dict]) -> None:
+    with path.open("w") as f:
+        for record in records:
+            f.write(json.dumps(record) + "\n")
+
+
+def _prebuilt_train_component(jsonl_path: Path) -> DatasetComponent:
+    return DatasetComponent(
+        source=UrlDatasetSourceConfig(train_urls=[str(jsonl_path)], validation_urls=[]),
+        format=PrebuiltLmDatasetFormat(),
+    )
+
+
+def test_build_caches_propagates_exception_from_one_component(tmp_path):
+    p_good = tmp_path / "good.jsonl"
+    _write_prebuilt_jsonl(p_good, [{"input_ids": [1, 2, 3, 4]}])
+    good = _prebuilt_train_component(p_good)
+    bad = DatasetComponent(
+        source=None,
+        cache_dir=str(tmp_path / "bad_missing"),
+        format=PrebuiltLmDatasetFormat(),
+    )
+    config = LmDataConfig(
+        components={"good": good, "bad": bad},
+        cache_dir=str(tmp_path / "caches"),
+        tokenizer="passthrough",
+        vocab_size=16,
+    )
+    with pytest.raises(ValueError, match="No source and no cache"):
+        config.build_caches("train")