55
55
import pandas as pd
56
56
import requests
57
57
from datasets import (
58
+ DatasetBuilder ,
59
+ DownloadConfig ,
58
60
IterableDataset ,
59
61
IterableDatasetDict ,
60
- get_dataset_split_names ,
62
+ StreamingDownloadManager ,
63
+ load_dataset_builder ,
61
64
)
62
65
from datasets import load_dataset as hf_load_dataset
63
66
from huggingface_hub import HfApi
@@ -316,24 +319,59 @@ def load_iterables(
316
319
if self .get_limit () is not None :
317
320
self .log_limited_loading ()
318
321
319
- if self .split is not None :
320
- return {
321
- self .split : ReusableGenerator (
322
- self .split_generator , gen_kwargs = {"split" : self .split }
323
- )
324
- }
325
-
326
322
try :
327
- split_names = get_dataset_split_names (
323
+ dataset_builder = load_dataset_builder (
324
+ # split_names = get_dataset_split_names(
328
325
path = self .path ,
329
- config_name = self .name ,
326
+ name = self .name ,
330
327
trust_remote_code = settings .allow_unverified_code ,
331
328
)
329
+
330
+ if self .split is not None :
331
+ return {
332
+ self .split : ReusableGenerator (
333
+ self .split_generator ,
334
+ gen_kwargs = {
335
+ "split" : self .split ,
336
+ "dataset_builder" : dataset_builder ,
337
+ },
338
+ )
339
+ }
340
+
341
+ info = dataset_builder .info
342
+ if info .splits is None :
343
+ download_config = DownloadConfig ()
344
+ dataset_builder ._check_manual_download (
345
+ StreamingDownloadManager (
346
+ base_path = dataset_builder .base_path ,
347
+ download_config = download_config ,
348
+ )
349
+ )
350
+ # try:
351
+ info .splits = {
352
+ split_generator .name : {
353
+ "name" : split_generator .name ,
354
+ "dataset_name" : self .path ,
355
+ }
356
+ for split_generator in dataset_builder ._split_generators (
357
+ StreamingDownloadManager (
358
+ base_path = dataset_builder .base_path ,
359
+ download_config = download_config ,
360
+ )
361
+ )
362
+ }
363
+ # except Exception as err:
364
+ # raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
365
+
332
366
return {
333
367
split_name : ReusableGenerator (
334
- self .split_generator , gen_kwargs = {"split" : split_name }
368
+ self .split_generator ,
369
+ gen_kwargs = {
370
+ "split" : split_name ,
371
+ "dataset_builder" : dataset_builder ,
372
+ },
335
373
)
336
- for split_name in split_names
374
+ for split_name in info . splits
337
375
}
338
376
339
377
except :
@@ -360,13 +398,22 @@ def load_iterables(
360
398
361
399
return dataset
362
400
363
- def split_generator (self , split : str ) -> Generator :
401
+ def split_generator (self , split : str , dataset_builder : DatasetBuilder ) -> Generator :
364
402
dataset = self .__class__ ._loader_cache .get (str (self ) + "_" + split , None )
365
403
if dataset is None :
366
404
try :
367
- dataset = self . stream_dataset ( split )
405
+ dataset = dataset_builder . as_streaming_dataset ( split = split )
368
406
except NotImplementedError : # streaming is not supported for zipped files so we load without streaming
369
- dataset = self .load_dataset (split )
407
+ dataset_builder .download_and_prepare (
408
+ # download_config=download_config,
409
+ # download_mode=download_mode,
410
+ # verification_mode=verification_mode,
411
+ # num_proc=num_proc,
412
+ # storage_options=storage_options,
413
+ )
414
+
415
+ # Build dataset for splits
416
+ dataset = dataset_builder .as_dataset (split = split )
370
417
371
418
if self .filtering_lambda is not None :
372
419
dataset = self .filter_load (dataset )
0 commit comments