@@ -915,7 +915,7 @@ def build_lm_dataset_cache(
     tokenizer: HfTokenizer,
     options: CacheOptions = CacheOptions.default(),
     enforce_eos=True,
-):
+) -> TreeCache[dict]:
     """
     Creates a cache for a dataset. If the cache already exists, it will be loaded. Otherwise, it will be built.

@@ -943,10 +943,6 @@ def build_lm_dataset_cache(
     except FileNotFoundError:
         pass

-    if source is None:
-        logger.info(f"No data for {name}")
-        return None
-
     logger.info(f"Building cache for {name}...")
     return build_or_load_cache(
         cache_dir,
@@ -1339,7 +1335,7 @@ def validation_sets(self, Pos: Axis) -> Mapping[str, AsyncDataset[LmExample]]:
         return validation_datasets

     def build_caches(self, split: str) -> Dict[str, TreeCache[dict]]:
-        caches = {}
+        caches: dict[str, TreeCache[dict]] = {}
         for name, source_config in self.configs.items():
             # Skip datasets with zero weight in all stages
             if isinstance(self.train_weights, dict):
@@ -1387,13 +1383,6 @@ def build_caches(self, split: str) -> Dict[str, TreeCache[dict]]:
                 self.enforce_eos,
             )

-        # In practice, it works best if we block on validation caches
-        if split == "validation":
-            for cache in caches.values():
-                cache.await_finished()
-        else:
-            logger.info(f"Not waiting for {split} caches to finish building")
-
         return caches

     @property
0 commit comments