@@ -915,7 +915,7 @@ def build_lm_dataset_cache(
     tokenizer: HfTokenizer,
     options: CacheOptions = CacheOptions.default(),
     enforce_eos=True,
-):
+) -> TreeCache[dict]:
     """
     Creates a cache for a dataset. If the cache already exists, it will be loaded. Otherwise, it will be built.

@@ -943,10 +943,6 @@ def build_lm_dataset_cache(
     except FileNotFoundError:
         pass

-    if source is None:
-        logger.info(f"No data for {name}")
-        return None
-
     logger.info(f"Building cache for {name}...")
     return build_or_load_cache(
         cache_dir,
@@ -1339,7 +1335,7 @@ def validation_sets(self, Pos: Axis) -> Mapping[str, AsyncDataset[LmExample]]:
         return validation_datasets

     def build_caches(self, split: str) -> Dict[str, TreeCache[dict]]:
-        caches = {}
+        caches: dict[str, TreeCache[dict]] = {}
         for name, source_config in self.configs.items():
             # Skip datasets with zero weight in all stages
             if isinstance(self.train_weights, dict):
@@ -1387,13 +1383,6 @@ def build_caches(self, split: str) -> Dict[str, TreeCache[dict]]:
                 self.enforce_eos,
             )

-        # In practice, it works best if we block on validation caches
-        if split == "validation":
-            for cache in caches.values():
-                cache.await_finished()
-        else:
-            logger.info(f"Not waiting for {split} caches to finish building")
-
         return caches

     @property
0 commit comments