Revisit huggingface cache policy - BREAKING CHANGE #1564

Merged: 35 commits from hf-cache into main, Feb 2, 2025. The diff shown below covers changes from 2 of the 35 commits.

Commits
db2b74b - Revisit huggingface cache policy (elronbandel, Jan 29, 2025)
c863ee7 - Enable streaming for LoadFromHFSpace and clean up commented code (elronbandel, Jan 29, 2025)
0e36f1a - Disable Hugging Face datasets cache in CatalogPreparationTestCase (elronbandel, Jan 29, 2025)
e672ca1 - Enable streaming for wiki_bio loader in TaskCard and update JSON conf… (elronbandel, Jan 29, 2025)
81873b7 - Merge branch 'main' into hf-cache (elronbandel, Jan 29, 2025)
3d91e20 - Add conditional test card execution for 'doqa_travel' subset in chat_… (elronbandel, Jan 29, 2025)
0968633 - Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache (elronbandel, Jan 29, 2025)
119d07e - Enhance memory and performance logging in catalog preparation tests (elronbandel, Jan 29, 2025)
6b84f81 - Return parallel execution to 1 and adjust modulo for deterministic te… (elronbandel, Jan 29, 2025)
a43910c - Try 1 (elronbandel, Jan 30, 2025)
b5a5ff0 - try 1 fixed (elronbandel, Jan 30, 2025)
1a421af - trial 2 (elronbandel, Jan 30, 2025)
db75df8 - Stop testing social iqa until problem resolved (elronbandel, Jan 30, 2025)
412e90b - Update social iqa card to use specific revision and enable testing (elronbandel, Jan 30, 2025)
f6e5388 - Refactor translation card testing logic and remove unused dataset loa… (elronbandel, Jan 30, 2025)
a0e7d0d - Update head_qa card loader path and streamline dataset configuration (elronbandel, Jan 30, 2025)
a6fd3dd - Enable streaming for websrc card loader in configuration (elronbandel, Jan 30, 2025)
700b26a - Add revision reference to Winogrande card loaders (elronbandel, Jan 30, 2025)
4e5fd67 - Add revision reference to PIQA card loader (elronbandel, Jan 30, 2025)
edc0ae7 - Update (elronbandel, Jan 30, 2025)
5e3e4cf - Another trial (elronbandel, Jan 30, 2025)
a94be8f - Refactor dataset loading to support dynamic streaming and improve con… (elronbandel, Feb 2, 2025)
95db421 - Add streaming support to turl_col_type configuration (elronbandel, Feb 2, 2025)
f20529b - Remove unused skip files from test preparation (elronbandel, Feb 2, 2025)
2d53e07 - Merge branch 'main' into hf-cache (elronbandel, Feb 2, 2025)
1935ef0 - Refactor LoadHF class to improve dataset filtering and add streaming … (elronbandel, Feb 2, 2025)
c2ed1b6 - Merge branch 'hf-cache' of https://github.com/IBM/unitxt into hf-cache (elronbandel, Feb 2, 2025)
e94b812 - Update load_dataset function documentation to clarify caching behavio… (elronbandel, Feb 2, 2025)
4c0b494 - Update dataset loading to support caching and streaming options (elronbandel, Feb 2, 2025)
fdd1a2e - Import UnitxtDataset in load_dataset function for improved dataset ha… (elronbandel, Feb 2, 2025)
0c31189 - Remove unused load function import from __init__.py (elronbandel, Feb 2, 2025)
7f1f762 - Remove streaming option from SEED-Bench loader configuration (elronbandel, Feb 2, 2025)
060af54 - Refactor dataset loading to utilize caching and improve dataset handling (elronbandel, Feb 2, 2025)
937552b - Add missing imports for dataset module functionality (elronbandel, Feb 2, 2025)
4dfb45f - Increase loader cache size to improve performance and update test set… (elronbandel, Feb 2, 2025)
src/unitxt/api.py (5 changes: 1 addition & 4 deletions)

@@ -139,7 +139,7 @@ def load_dataset(
     dataset_query: Optional[str] = None,
     split: Optional[str] = None,
     streaming: bool = False,
-    disable_cache: Optional[bool] = None,
+    disable_cache: Optional[bool] = True,
     **kwargs,
 ) -> Union[DatasetDict, IterableDatasetDict, Dataset, IterableDataset]:
     """Loads dataset.
@@ -188,9 +188,6 @@ def load_dataset(
     if split is not None:
         stream = stream[split]

-    if disable_cache is None:
-        disable_cache = settings.disable_hf_datasets_cache
-
     if streaming:
         dataset = stream.to_iterable_dataset(
             features=UNITXT_DATASET_SCHEMA,
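The net effect of this hunk is that load_dataset no longer falls back to settings.disable_hf_datasets_cache; callers get disable_cache=True unless they override it. A minimal usage sketch under the new signature; the card query string is an illustrative placeholder, not something this diff defines:

from unitxt import load_dataset

# Hypothetical catalog card query for illustration; substitute any real card.
ds = load_dataset(
    "card=cards.wnli,template_card_index=0",
    split="train",        # return one split instead of a DatasetDict
    streaming=False,      # materialize examples rather than iterate lazily
    disable_cache=False,  # opt back in to caching; the new default is True
)
print(ds[0]["source"])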
src/unitxt/loaders.py (42 changes: 2 additions & 40 deletions)

@@ -210,7 +210,7 @@ class LoadHF(Loader):
         Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
     ] = None
     revision: Optional[str] = None
-    streaming: bool = True
+    streaming: bool = False
     filtering_lambda: Optional[str] = None
     num_proc: Optional[int] = None
     requirements_list: List[str] = OptionalField(default_factory=list)
@@ -823,6 +823,7 @@ class LoadFromHFSpace(LoadHF):
     use_token: Optional[bool] = None
     token_env: Optional[str] = None
     requirements_list: List[str] = ["huggingface_hub"]
+    streaming = True
Review comment from a Collaborator on the added "streaming = True" line:

    Why? Once the files are downloaded to the local file system, what benefit do you get here?

Reply from the Member author:

    You don't, it just doesn't work otherwise. We should base this loader on a dictionary loader.

     def _get_token(self) -> Optional[Union[bool, str]]:
         if self.token_env:
@@ -953,45 +954,6 @@ def load_data(self):
         self._map_wildcard_path_to_full_paths()
         self.path = self._download_data()
         return super().load_data()
-
-    # url: str
-
-    # _requirements_list: List[str] = ["opendatasets"]
-    # data_classification_policy = ["public"]
-
-    # def verify(self):
-    #     super().verify()
-    #     if not os.path.isfile("kaggle.json"):
-    #         raise MissingKaggleCredentialsError(
-    #             "Please obtain kaggle credentials https://christianjmills.com/posts/kaggle-obtain-api-key-tutorial/ and save them to local ./kaggle.json file"
-    #         )
-
-    #     if self.streaming:
-    #         raise NotImplementedError("LoadFromKaggle cannot load with streaming.")
-
-    # def prepare(self):
-    #     super().prepare()
-    #     from opendatasets import download
-
-    #     self.downloader = download
-
-    # def load_iterables(self):
-    #     with TemporaryDirectory() as temp_directory:
-    #         self.downloader(self.url, temp_directory)
-    #         return hf_load_dataset(temp_directory, streaming=False)
-
-    # class LoadFromAPI(Loader):
-    #     """Loads data from from API"""
-
-    # urls: Dict[str, str]
-    # chunksize: int = 100000
-    # loader_limit: Optional[int] = None
-    # streaming: bool = False
-
-    # def _maybe_set_classification_policy(self):
-    #     self.set_default_data_classification(["proprietary"], "when loading from API")
-
-    # def load_iterables(self):
         self.api_key = os.getenv("SQL_API_KEY", None)
         if not self.api_key:
             raise ValueError(
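Because LoadHF now defaults to streaming=False, any card that relied on the old streaming default has to opt in explicitly, which is what several commits above do for individual cards. A minimal sketch, assuming a hypothetical Hugging Face dataset path; the field names come from the LoadHF hunk above:

from unitxt.loaders import LoadHF

# "wikimedia/wikipedia" and its config name are illustrative placeholders.
loader = LoadHF(
    path="wikimedia/wikipedia",
    name="20231101.en",
    revision="main",  # pinning a revision, as the Winogrande/PIQA commits do
    streaming=True,   # restore the pre-PR default for this one loader
)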
src/unitxt/settings_utils.py (2 changes: 1 addition & 1 deletion)

@@ -149,7 +149,7 @@ def __getattr__(self, key):
     settings.skip_artifacts_prepare_and_verify = (bool, False)
     settings.data_classification_policy = None
     settings.mock_inference_mode = (bool, False)
-    settings.disable_hf_datasets_cache = (bool, True)
+    settings.disable_hf_datasets_cache = (bool, False)
     settings.loader_cache_size = (int, 1)
     settings.task_data_as_text = (bool, True)
     settings.default_provider = "watsonx"
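Flipping disable_hf_datasets_cache to False means the Hugging Face datasets cache is now used unless a user turns it back off. A sketch of how the old behavior might be restored, assuming unitxt's usual UNITXT_<SETTING_NAME> environment-variable convention for settings:

import os

# Assumed env-var mapping for the setting above; set before any data is loaded.
os.environ["UNITXT_DISABLE_HF_DATASETS_CACHE"] = "True"

# Alternatively, set the attribute on the global settings object directly.
from unitxt.settings_utils import get_settings

settings = get_settings()
settings.disable_hf_datasets_cache = True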