@@ -259,9 +259,6 @@ def stream_dataset(self, split: str) -> Union[IterableDatasetDict, IterableDatas
259
259
)
260
260
except ValueError as e :
261
261
if "trust_remote_code" in str (e ):
262
- logger .critical (
263
- f"while raising trust_remote error, settings.allow_unverified_code = { settings .allow_unverified_code } "
264
- )
265
262
raise ValueError (
266
263
f"{ self .__class__ .__name__ } cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
267
264
) from e
@@ -319,30 +316,28 @@ def load_iterables(
319
316
if self .get_limit () is not None :
320
317
self .log_limited_loading ()
321
318
322
- if not isinstance (self , LoadFromHFSpace ):
323
- # try the following for LoadHF only
324
- if self .split is not None :
325
- return {
326
- self .split : ReusableGenerator (
327
- self .split_generator , gen_kwargs = {"split" : self .split }
328
- )
329
- }
319
+ if self .split is not None :
320
+ return {
321
+ self .split : ReusableGenerator (
322
+ self .split_generator , gen_kwargs = {"split" : self .split }
323
+ )
324
+ }
330
325
331
- try :
332
- split_names = get_dataset_split_names (
333
- path = self .path ,
334
- config_name = self .name ,
335
- trust_remote_code = settings .allow_unverified_code ,
326
+ try :
327
+ split_names = get_dataset_split_names (
328
+ path = self .path ,
329
+ config_name = self .name ,
330
+ trust_remote_code = settings .allow_unverified_code ,
331
+ )
332
+ return {
333
+ split_name : ReusableGenerator (
334
+ self .split_generator , gen_kwargs = {"split" : split_name }
336
335
)
337
- return {
338
- split_name : ReusableGenerator (
339
- self .split_generator , gen_kwargs = {"split" : split_name }
340
- )
341
- for split_name in split_names
342
- }
336
+ for split_name in split_names
337
+ }
343
338
344
- except :
345
- pass # do nothing, and just continue to the usual load dataset
339
+ except :
340
+ pass # do nothing, and just continue to the usual load dataset
346
341
# self.split is None and
347
342
# split names are not known before the splits themselves are loaded, and we need to load them here
348
343
@@ -473,14 +468,20 @@ def prepare(self):
473
468
self .downloader = getattr (sklearn_datatasets , f"fetch_{ self .dataset_name } " )
474
469
475
470
def load_iterables (self ):
476
- with TemporaryDirectory () as temp_directory :
477
- for split in self .splits :
478
- split_data = self .downloader (subset = split )
479
- targets = [split_data ["target_names" ][t ] for t in split_data ["target" ]]
480
- df = pd .DataFrame ([split_data ["data" ], targets ]).T
481
- df .columns = ["data" , "target" ]
482
- df .to_csv (os .path .join (temp_directory , f"{ split } .csv" ), index = None )
483
- return hf_load_dataset (temp_directory , streaming = False )
471
+ return {
472
+ split_name : ReusableGenerator (
473
+ self .split_generator , gen_kwargs = {"split" : split_name }
474
+ )
475
+ for split_name in self .splits
476
+ }
477
+
478
def split_generator(self, split: str) -> Generator:
    """Yield one ``{"data": ..., "target": ...}`` record per example of *split*.

    Fetches the split through the sklearn fetcher stored in
    ``self.downloader`` and maps each numeric label to its name via
    ``target_names``. Yields plain dicts directly instead of round-tripping
    through a transposed pandas DataFrame, which only added an O(n) copy
    without changing the emitted records.

    Args:
        split: name of the subset passed to the sklearn ``fetch_*`` function.

    Yields:
        dict with keys ``"data"`` (the raw example) and ``"target"``
        (the human-readable label name).
    """
    split_data = self.downloader(subset=split)
    target_names = split_data["target_names"]
    # Translate numeric class indices into their readable names.
    labels = [target_names[t] for t in split_data["target"]]
    for text, label in zip(split_data["data"], labels):
        yield {"data": text, "target": label}
484
485
485
486
486
487
class MissingKaggleCredentialsError (ValueError ):
0 commit comments