refresh loaders from just_lazy_loader

dafnapension · dafnapension · commit 586b4ae722af · 2025-02-09T18:11:45.000+02:00
Signed-off-by: dafnapension <dafnashein@yahoo.com>
diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py
@@ -67,7 +67,7 @@
 from tqdm import tqdm
 
 from .dataclass import OptionalField
-from .error_utils import UnitxtError
+from .error_utils import UnitxtError, UnitxtWarning
 from .fusion import FixedFusion
 from .generator_utils import ReusableGenerator
 from .logging_utils import get_logger
@@ -227,7 +227,7 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     revision: Optional[str] = None
-    streaming: bool = None
+    streaming = None
     filtering_lambda: Optional[str] = None
     num_proc: Optional[int] = None
     requirements_list: List[str] = OptionalField(default_factory=list)
@@ -314,27 +314,25 @@ def load_dataset(
                             next(iter(dataset[k]))
                             break
 
-                except:
-                    try:
-                        current_streaming = kwargs["streaming"]
-                        logger.info(
-                            f"needed to swap streaming from {current_streaming} to {not current_streaming} for path {self.path}"
-                        )
-                        # try the opposite way of streaming
-                        kwargs["streaming"] = not kwargs["streaming"]
-                        dataset = hf_load_dataset(**kwargs)
-                        if isinstance(dataset, (Dataset, IterableDataset)):
-                            next(iter(dataset))
-                        else:
-                            for k in dataset.keys():
-                                next(iter(dataset[k]))
-                                break
-
-                    except ValueError as e:
-                        if "trust_remote_code" in str(e):
-                            raise ValueError(
-                                f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
-                            ) from e
+                except Exception as e:
+                    if e is ValueError and "trust_remote_code" in str(e):
+                        raise ValueError(
+                            f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
+                        ) from e
+
+                    current_streaming = kwargs["streaming"]
+                    logger.info(
+                        f"needed to swap streaming from {current_streaming} to {not current_streaming} for path {self.path}"
+                    )
+                    # try the opposite way of streaming
+                    kwargs["streaming"] = not kwargs["streaming"]
+                    dataset = hf_load_dataset(**kwargs)
+                    if isinstance(dataset, (Dataset, IterableDataset)):
+                        next(iter(dataset))
+                    else:
+                        for k in dataset.keys():
+                            next(iter(dataset[k]))
+                            break
 
             if self.filtering_lambda is not None:
                 dataset = dataset.filter(eval(self.filtering_lambda))
@@ -373,6 +371,9 @@ def get_splits(self) -> List[str]:
                 # split names are known before the split themselves are pulled from HF,
                 # and we can postpone that pulling of the splits until actually demanded
                 return list(dataset_info.splits.keys())
+            UnitxtWarning(
+                f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
+            )
             return None
         except:
             return None
@@ -915,9 +916,9 @@ class LoadFromHFSpace(LoadHF):
             )
     """
 
+    path = None
     space_name: str
     data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
-    path: Optional[str] = None
     revision: Optional[str] = None
     use_token: Optional[bool] = None
     token_env: Optional[str] = None
@@ -1055,8 +1056,6 @@ def _maybe_set_classification_policy(self):
     def load_data(self):
         self._map_wildcard_path_to_full_paths()
         self.path = self._download_data()
-        if self.splits is None and isinstance(self.data_files, dict):
-            self.splits = sorted(self.data_files.keys())
 
         return super().load_data()
 
@@ -1091,7 +1090,7 @@ class LoadFromAPI(Loader):
 
     urls: Dict[str, str]
     chunksize: int = 100000
-    streaming: bool = False
+    streaming = False
     api_key_env_var: str = "SQL_API_KEY"
     headers: Optional[Dict[str, Any]] = None
     data_field: str = "data"
diff --git a/utils/.secrets.baseline b/utils/.secrets.baseline
@@ -151,7 +151,7 @@
         "filename": "src/unitxt/loaders.py",
         "hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
         "is_verified": false,
-        "line_number": 629,
+        "line_number": 630,
         "is_secret": false
       }
     ],
@@ -184,5 +184,5 @@
       }
     ]
   },
-  "generated_at": "2025-02-08T13:56:45Z"
+  "generated_at": "2025-02-09T12:07:07Z"
 }

Original file line number	Diff line number	Diff line change
`@@ -151,7 +151,7 @@`
`151`	`151`	`"filename": "src/unitxt/loaders.py",`
`152`	`152`	`"hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",`
`153`	`153`	`"is_verified": false,`
`154`		`- "line_number": 629,`
	`154`	`+ "line_number": 630,`
`155`	`155`	`"is_secret": false`
`156`	`156`	`}`
`157`	`157`	`],`
`@@ -184,5 +184,5 @@`
`184`	`184`	`}`
`185`	`185`	`]`
`186`	`186`	`},`
`187`		`- "generated_at": "2025-02-08T13:56:45Z"`
	`187`	`+ "generated_at": "2025-02-09T12:07:07Z"`
`188`	`188`	`}`