Commit 016faa4

try with dataset_builder

Signed-off-by: dafnapension <[email protected]>
1 parent a4121c7 · commit 016faa4

2 files changed: +64 −17

src/unitxt/loaders.py (+62 −15)
@@ -55,9 +55,12 @@
 import pandas as pd
 import requests
 from datasets import (
+    DatasetBuilder,
+    DownloadConfig,
     IterableDataset,
     IterableDatasetDict,
-    get_dataset_split_names,
+    StreamingDownloadManager,
+    load_dataset_builder,
 )
 from datasets import load_dataset as hf_load_dataset
 from huggingface_hub import HfApi
@@ -316,24 +319,59 @@ def load_iterables(
         if self.get_limit() is not None:
             self.log_limited_loading()
 
-        if self.split is not None:
-            return {
-                self.split: ReusableGenerator(
-                    self.split_generator, gen_kwargs={"split": self.split}
-                )
-            }
-
         try:
-            split_names = get_dataset_split_names(
+            dataset_builder = load_dataset_builder(
+                # split_names = get_dataset_split_names(
                 path=self.path,
-                config_name=self.name,
+                name=self.name,
                 trust_remote_code=settings.allow_unverified_code,
             )
+
+            if self.split is not None:
+                return {
+                    self.split: ReusableGenerator(
+                        self.split_generator,
+                        gen_kwargs={
+                            "split": self.split,
+                            "dataset_builder": dataset_builder,
+                        },
+                    )
+                }
+
+            info = dataset_builder.info
+            if info.splits is None:
+                download_config = DownloadConfig()
+                dataset_builder._check_manual_download(
+                    StreamingDownloadManager(
+                        base_path=dataset_builder.base_path,
+                        download_config=download_config,
+                    )
+                )
+                # try:
+                info.splits = {
+                    split_generator.name: {
+                        "name": split_generator.name,
+                        "dataset_name": self.path,
+                    }
+                    for split_generator in dataset_builder._split_generators(
+                        StreamingDownloadManager(
+                            base_path=dataset_builder.base_path,
+                            download_config=download_config,
+                        )
+                    )
+                }
+                # except Exception as err:
+                #     raise SplitsNotFoundError("The split names could not be parsed from the dataset config.") from err
+
             return {
                 split_name: ReusableGenerator(
-                    self.split_generator, gen_kwargs={"split": split_name}
+                    self.split_generator,
+                    gen_kwargs={
+                        "split": split_name,
+                        "dataset_builder": dataset_builder,
+                    },
                 )
-                for split_name in split_names
+                for split_name in info.splits
             }
 
         except:
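
The hunk above swaps get_dataset_split_names for a DatasetBuilder: split names are read from the builder's metadata when available, and otherwise enumerated via the builder's (private) _split_generators. A minimal standalone sketch of that pattern, assuming an illustrative dataset path "squad" that the commit itself does not use:

from datasets import DownloadConfig, StreamingDownloadManager, load_dataset_builder

builder = load_dataset_builder(path="squad")  # "squad" is illustrative only

if builder.info.splits is not None:
    # Most Hub datasets ship split metadata with the builder, so the
    # splits can be listed without downloading any data.
    split_names = list(builder.info.splits)
else:
    # Fallback mirroring the hunk above: enumerate split generators
    # through a streaming download manager (a private builder API).
    dl_manager = StreamingDownloadManager(
        base_path=builder.base_path,
        download_config=DownloadConfig(),
    )
    split_names = [sg.name for sg in builder._split_generators(dl_manager)]

print(split_names)  # e.g. ['train', 'validation']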
@@ -360,13 +398,22 @@ def load_iterables(
 
         return dataset
 
-    def split_generator(self, split: str) -> Generator:
+    def split_generator(self, split: str, dataset_builder: DatasetBuilder) -> Generator:
         dataset = self.__class__._loader_cache.get(str(self) + "_" + split, None)
         if dataset is None:
             try:
-                dataset = self.stream_dataset(split)
+                dataset = dataset_builder.as_streaming_dataset(split=split)
             except NotImplementedError:  # streaming is not supported for zipped files so we load without streaming
-                dataset = self.load_dataset(split)
+                dataset_builder.download_and_prepare(
+                    # download_config=download_config,
+                    # download_mode=download_mode,
+                    # verification_mode=verification_mode,
+                    # num_proc=num_proc,
+                    # storage_options=storage_options,
+                )
+
+                # Build dataset for splits
+                dataset = dataset_builder.as_dataset(split=split)
 
         if self.filtering_lambda is not None:
             dataset = self.filter_load(dataset)
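
Likewise, a minimal sketch of the streaming-first loading that the reworked split_generator performs, again with "squad" and the split name standing in as illustrative values:

from datasets import load_dataset_builder

builder = load_dataset_builder(path="squad")  # illustrative path and split
try:
    # Prefer streaming: records are yielded on the fly, no local copy.
    dataset = builder.as_streaming_dataset(split="train")
except NotImplementedError:
    # Streaming is unsupported for e.g. zipped files, so materialize the
    # dataset locally and then load the requested split from disk.
    builder.download_and_prepare()
    dataset = builder.as_dataset(split="train")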

utils/.secrets.baseline (+2 −2)
@@ -151,7 +151,7 @@
         "filename": "src/unitxt/loaders.py",
         "hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
         "is_verified": false,
-        "line_number": 599,
+        "line_number": 646,
         "is_secret": false
       }
     ],
@@ -184,5 +184,5 @@
       }
     ]
   },
-  "generated_at": "2025-01-26T10:03:47Z"
+  "generated_at": "2025-01-27T14:50:50Z"
 }
