
Commit a3645ff

measure performance with/without loaded local file-system cache, and add documentation

Signed-off-by: dafnapension <[email protected]>
1 parent b677c30 commit a3645ff

File tree: 6 files changed, +106 -32 lines

.github/workflows/performance.yml (+15 -2)
@@ -43,12 +43,17 @@ jobs:
       - name: Prepare the dirs for performance evaluation in main
         run: |
           mkdir -p performance_action
+          mkdir -p performance_action/hf_fs_cache
           cp performance/bluebench_profiler.py performance_action/bluebench_profiler.py
           cp performance/compare_benchmark_performance_results.py performance_action/compare_benchmark_performance_results.py

-      - name: Run performance on PR just to warm the cache, output will be overwritten
+      - name: Run performance on PR just to fill the file systems cache
+        env:
+          UNITXT_HF_LOAD_FROM_OFFLINE: "False"
+          UNITXT_HF_SAVE_TO_OFFLINE: "True"
+          UNITXT_HF_OFFLINE_DATASETS_PATH: performance_action/hf_fs_cache
         run : |
-          python performance_action/bluebench_profiler.py --output_file performance_action/pr_results.json
+          python performance_action/bluebench_profiler.py --output_file performance_action/pr_results.json --populate_fs_cache >> $GITHUB_STEP_SUMMARY

       - name: Checkout main branch
         uses: actions/checkout@v4

@@ -57,6 +62,10 @@ jobs:
           clean: false

       - name: Run performance on main branch
+        env:
+          UNITXT_HF_SAVE_TO_OFFLINE: "False"
+          UNITXT_HF_LOAD_FROM_OFFLINE: "True"
+          UNITXT_HF_OFFLINE_DATASETS_PATH: performance_action/hf_fs_cache
         run: |
           python performance_action/bluebench_profiler.py --output_file performance_action/main_results.json

@@ -67,6 +76,10 @@ jobs:
           clean: false

       - name: Run performance on PR branch
+        env:
+          UNITXT_HF_SAVE_TO_OFFLINE: "False"
+          UNITXT_HF_LOAD_FROM_OFFLINE: "True"
+          UNITXT_HF_OFFLINE_DATASETS_PATH: performance_action/hf_fs_cache
         run: |
           python performance_action/bluebench_profiler.py --output_file performance_action/pr_results.json
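The workflow thus runs the profiler once with UNITXT_HF_SAVE_TO_OFFLINE enabled to fill performance_action/hf_fs_cache, and then times both branches with UNITXT_HF_LOAD_FROM_OFFLINE, so hub download time stays out of the comparison. A rough local equivalent of that two-phase flow; the subprocess usage and output file names here are illustrative, only the env variable names and the --populate_fs_cache flag come from this commit:

    import os
    import subprocess

    CACHE_DIR = "performance_action/hf_fs_cache"   # any writable directory
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Phase 1: download from the hubs and populate the local file-system cache.
    populate_env = {**os.environ,
                    "UNITXT_HF_LOAD_FROM_OFFLINE": "False",
                    "UNITXT_HF_SAVE_TO_OFFLINE": "True",
                    "UNITXT_HF_OFFLINE_DATASETS_PATH": CACHE_DIR}
    subprocess.run(["python", "performance/bluebench_profiler.py",
                    "--output_file", "performance_action/pr_results.json",
                    "--populate_fs_cache"],
                   env=populate_env, check=True)

    # Phase 2: time the pipeline while reading only from the local cache.
    measure_env = {**os.environ,
                   "UNITXT_HF_SAVE_TO_OFFLINE": "False",
                   "UNITXT_HF_LOAD_FROM_OFFLINE": "True",
                   "UNITXT_HF_OFFLINE_DATASETS_PATH": CACHE_DIR}
    subprocess.run(["python", "performance/bluebench_profiler.py",
                    "--output_file", "performance_action/pr_results.json"],
                   env=measure_env, check=True)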

docs/docs/adding_dataset.rst (+8 -1)
@@ -64,7 +64,7 @@ If a catalogued task fits your use case, you may reference it by name:
     task='tasks.translation.directed',

 Loading the Dataset
----------------------
+--------------------

 To load data from an external source, we use a loader.
 For example, to load the `wmt16` translation dataset from the HuggingFace hub:

@@ -75,6 +75,13 @@ For example, to load the `wmt16` translation dataset from the HuggingFace hub:

 More loaders for different sources are available in the :class:`loaders <unitxt.loaders>` section.

+Loading from (and saving to) local file-system
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Setting the env variable ``UNITXT_HF_LOAD_FROM_OFFLINE=true`` makes loaders fetch the data from the local file-system directory
+specified by the env variable ``UNITXT_HF_OFFLINE_DATASETS_PATH``. To have loaders save the data they fetched from an
+outside hub into that local file-system directory, set ``UNITXT_HF_SAVE_TO_OFFLINE=true``.
+
 The Preprocessing Pipeline
 ---------------------------
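In practice the variables are set before unitxt reads its settings, e.g. before the first import. A hedged usage sketch; the cache directory and the dataset query below are placeholders, not values taken from this commit:

    import os

    os.environ["UNITXT_HF_OFFLINE_DATASETS_PATH"] = "/tmp/unitxt_fs_cache"   # placeholder directory
    os.environ["UNITXT_HF_SAVE_TO_OFFLINE"] = "True"    # first run: fetch from the hub and keep a local copy

    from unitxt import load_dataset

    # placeholder dataset query; any card/template available in your catalog would do
    dataset = load_dataset("card=cards.wmt.en_de,template_card_index=0", split="test")

    # on later runs, read only from the local copy instead of the hub:
    #   UNITXT_HF_SAVE_TO_OFFLINE=false  UNITXT_HF_LOAD_FROM_OFFLINE=true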

performance/bluebench_profiler.py (+46 -12)
@@ -17,6 +17,7 @@
     CrossProviderInferenceEngine,
     InferenceEngine,
 )
+from unitxt.loaders import Loader
 from unitxt.logging_utils import get_logger
 from unitxt.operator import MultiStreamOperator
 from unitxt.settings_utils import get_settings

@@ -120,7 +121,11 @@ def collect_loaded_dataset_iterators(self, recipe: Union[DatasetRecipe, Benchmar
         if recipe.steps[1].generators:
             for stream_name in recipe.steps[1].generators:
                 if recipe.steps[1].generators[stream_name].water_mark > -1:
-                    to_ret[stream_name] = (recipe.steps[1].generators[stream_name].measured_stream.gen_kwargs["stream"].gen_kwargs["stream"], recipe.steps[1].generators[stream_name].water_mark)
+                    stream = recipe.steps[1].generators[stream_name].measured_stream
+                    while not isinstance(stream.generator.__self__, Loader):
+                        assert "stream" in stream.gen_kwargs
+                        stream = stream.gen_kwargs["stream"]
+                    to_ret[stream_name] = (stream, recipe.steps[1].generators[stream_name].water_mark)
         else:
             # recipe is a benchmark
             for subset_name in recipe.subsets:

@@ -163,14 +168,23 @@ def profiler_do_the_profiling(self, dataset_query: str, **kwargs):
         t0 = time()
         recipe = load_recipe(dataset_query, **kwargs)
         t0_25 = time()
-        recipe()
-        t0_5 = time()
         self.equip_with_watermarker(recipe)
+        t0_5 = time()
+        ms = recipe()
         t1 = time()
+        water_marks = self.collect_water_marks(recipe)
+        logger.critical(f"water marks for query {dataset_query} following recipe(): {water_marks}")
+        t1_5 = time()
         dataset = _source_to_dataset(source=recipe)
         t2 = time()
+        water_marks = self.collect_water_marks(recipe)
+        logger.critical(f"water marks for query {dataset_query} following _source_to_dataset(recipe): {water_marks}")
+        t2_5 = time()
         dataset = self.list_from_dataset(dataset)
         t3 = time()
+        water_marks = self.collect_water_marks(recipe)
+        logger.critical(f"water marks for query {dataset_query} following list out all from dataset: {water_marks}")
+        t3_5 = time()
         model = self.profiler_instantiate_model()
         t4 = time()
         if isinstance(dataset, dict):

@@ -181,31 +195,31 @@ def profiler_do_the_profiling(self, dataset_query: str, **kwargs):
             dataset = dataset[split_name]
         predictions = model.infer(dataset=dataset)
         t5 = time()
-        evaluation_result = evaluate(predictions=predictions, data=dataset)
+        evaluate(predictions=predictions, data=dataset)
         t6 = time()
         # now just streaming through recipe, without generating an HF dataset:
         ms = recipe()
         total_production_length_of_recipe = {k: len(list(ms[k])) for k in ms}
         t7 = time()
         # now just loading the specific instances actually loaded above, and listing right after recipe.loader(),
         # to report the loading time from the total processing time.
-        water_marks = self.collect_water_marks(recipe)
+        # water_marks = self.collect_water_marks(recipe)
         pulling_dict = self.collect_loaded_dataset_iterators(recipe)
         t8=time()
         self.enumerate_from_loaders(pulling_dict)
         t9 = time()
-        logger.critical(f"water marks = {water_marks}")
-        logger.critical(f"length of evaluation_result, over the returned dataset from Unitxt.load_dataset: {len(evaluation_result)}")
+        # logger.critical(f"water marks = {water_marks}")
+        # logger.critical(f"length of evaluation_result, over the returned dataset from Unitxt.load_dataset: {len(evaluation_result)}")
         logger.critical(f"lengths of total production of recipe: {total_production_length_of_recipe}")

         return {
             "load_recipe" : t0_25 - t0,
-            "recipe()": t0_5 - t0_25,
-            "source_to_dataset": t2-t1,
-            "list_out_dataset" : t3 - t2,
+            "recipe()": t1 - t0_5,
+            "source_to_dataset": t2-t1_5,
+            "list_out_dataset" : t3 - t2_5,
             "just_load_and_list": t9-t8,
             "just_stream_through_recipe": t7-t6,
-            "instantiate_model": t4 - t3,
+            "instantiate_model": t4 - t3_5,
             "inference_time" : t5 - t4,
             "evaluation_time" : t6 - t5,
         }

@@ -239,7 +253,6 @@ def profile_no_cprofile():
         res[k] += dsq_time[k]
     return {k: round(res[k], 3) for k in res}

-
 def find_cummtime_of(func_name: str, file_name: str, pst_printout: str) -> float:
     relevant_lines = list(
         filter(

@@ -312,13 +325,34 @@ def main():
         action="store_true",
         help="whether to employ cProfile or just time diffs.",
     )
+    parser.add_argument(
+        "--populate_fs_cache",
+        action="store_true",
+        help="whether to save the downloaded datasets to a file-system cache.",
+    )
     args = parser.parse_args()

     # Ensure the directory for the output file exists
     output_dir = os.path.dirname(args.output_file)
     if output_dir:
         os.makedirs(output_dir, exist_ok=True)

+    if args.populate_fs_cache:
+        assert os.path.exists(settings.hf_offline_datasets_path)
+        assert settings.hf_save_to_offline
+        t0 = time()
+        queries = dataset_query if isinstance(dataset_query, list) else [dataset_query]
+        for dsq in queries:
+            recipe = load_recipe(dsq)
+            ms = recipe()
+            for split in ms:
+                list(ms[split])
+        t1 = time()
+        print(f"Time to fetch the needed datasets from their hubs and save them in the local file-system: {round(t1-t0,3)} seconds")
+        return
+
+    if settings.hf_load_from_offline:
+        assert os.path.exists(settings.hf_offline_datasets_path)

     dict_to_print = profile_no_cprofile()
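The reworked collect_loaded_dataset_iterators no longer assumes the loader sits exactly two wrappers deep; it walks the chain of wrapped streams until the stream's generator is bound to a Loader. A toy, self-contained illustration of that unwrap-until-source pattern, using stand-in classes rather than unitxt's actual Stream and Loader types:

    class Source:                      # stands in for a unitxt Loader
        def __init__(self):
            self.generator = self.gen
            self.gen_kwargs = {}
        def gen(self):
            yield from range(3)

    class Wrapped:                     # stands in for a stream wrapping another stream
        def __init__(self, inner):
            self.generator = self.gen
            self.gen_kwargs = {"stream": inner}
        def gen(self, stream):
            yield from stream

    stream = Wrapped(Wrapped(Source()))    # arbitrary nesting depth

    # walk inward until the generator is bound to the source itself,
    # mirroring the while-loop added in collect_loaded_dataset_iterators
    while not isinstance(stream.generator.__self__, Source):
        assert "stream" in stream.gen_kwargs
        stream = stream.gen_kwargs["stream"]

    print(type(stream.generator.__self__).__name__)   # -> Source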

performance/compare_benchmark_performance_results.py (+2 -2)
@@ -24,11 +24,11 @@
 print(f"used_eager_mode in main = {main_perf['used_eager_mode']}")
 print(f"used_eager_mode in PR = {pr_perf['used_eager_mode']}")
 print(f"use Mocked inference = {os.environ['UNITXT_MOCK_INFERENCE_MODE']}")
-print("Raw datasets, that are loaded and processed here, are assumed to reside in local file ststem when the run starts.")
+print("Given the raw datasets stored in the local file system, their processing through the Unitxt pipeline lasts as detailed below.")

 ratios = {}
 for k in pr_perf:
-    if not isinstance(pr_perf, float):
+    if not isinstance(pr_perf[k], float):
         continue
     ratios[k] = pr_perf[k] / main_perf[k] if main_perf[k] > 0 else 1
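The second change fixes the filter in the ratio loop: isinstance(pr_perf, float) tested the whole dict, so every key was skipped and no ratios were ever computed; testing pr_perf[k] keeps the numeric entries. A small self-contained illustration with made-up timing values:

    # hypothetical timing dicts, shaped like the profiler output (values are illustrative)
    main_perf = {"used_eager_mode": True, "recipe()": 4.2, "inference_time": 1.1}
    pr_perf   = {"used_eager_mode": True, "recipe()": 3.9, "inference_time": 1.2}

    ratios = {}
    for k in pr_perf:
        if not isinstance(pr_perf[k], float):   # skip non-numeric entries such as flags
            continue
        ratios[k] = pr_perf[k] / main_perf[k] if main_perf[k] > 0 else 1

    print(ratios)   # e.g. {'recipe()': 0.93, 'inference_time': 1.09} (rounded)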

src/unitxt/loaders.py (+33 -15)
@@ -72,6 +72,7 @@
 from .logging_utils import get_logger
 from .operator import SourceOperator
 from .operators import Set
+from .random_utils import new_random_generator
 from .settings_utils import get_settings
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype

@@ -85,20 +86,20 @@ def __init__(self, path):
         super().__init__(f"Loader cannot load and run remote code from {path} in huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE.", Documentation.SETTINGS)

 def hf_load_dataset(path: str, *args, **kwargs):
-    if settings.hf_offline_datasets_path is not None:
-        path = os.path.join(settings.hf_offline_datasets_path, path)
     try:
         return _hf_load_dataset(
             path,
             *args, **kwargs,
             download_config=DownloadConfig(
                 max_retries=settings.loaders_max_retries,
+                cache_dir=settings.hf_offline_datasets_path if settings.hf_save_to_offline else None,
             ),
+            cache_dir=settings.hf_offline_datasets_path if settings.hf_load_from_offline else None,
             verification_mode="no_checks",
             trust_remote_code=settings.allow_unverified_code,
             download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
         )
-    except ValueError as e:
+    except Exception as e:
         if "trust_remote_code" in str(e):
             raise UnitxtUnverifiedCodeError(path) from e

@@ -307,8 +308,8 @@ def load_dataset(
             split=split,
             num_proc=self.num_proc,
         )
-        self.__class__._loader_cache.max_size = settings.loader_cache_size
         if not disable_memory_caching:
+            self.__class__._loader_cache.max_size = settings.loader_cache_size
             self.__class__._loader_cache[dataset_id] = dataset
         return dataset

@@ -334,6 +335,7 @@ def get_splits(self):
                 download_config=DownloadConfig(
                     max_retries=settings.loaders_max_retries,
                     extract_on_the_fly=True,
+                    cache_dir = settings.hf_offline_datasets_path if settings.hf_load_from_offline else None
                 ),
             )
         except Exception as e:

@@ -409,13 +411,25 @@ def _maybe_set_classification_policy(self):
             ["proprietary"], "when loading from local files"
         )

-    def get_reader(self):
+    def get_reader(self)->callable:
         if self.file_type == "csv":
             return pd.read_csv
         if self.file_type == "json":
             return pd.read_json
         raise ValueError()

+    def get_writer(self, df:pd.DataFrame)->callable:
+        if self.file_type == "csv":
+            return df.to_csv
+        if self.file_type == "json":
+            return df.to_json
+        raise ValueError()
+
+    def get_path_to_local(self, path_to_hub:str)->str:
+        rand = new_random_generator(sub_seed=path_to_hub)
+        file_path_to_simple_string = str(rand.randint(100000, 999999))
+        return os.path.join(settings.hf_offline_datasets_path, file_path_to_simple_string+"."+self.file_type)
+
     def get_args(self):
         args = {}
         if self.file_type == "csv":

@@ -438,29 +452,33 @@ def split_generator(self, split: str) -> Generator:
         if dataset is None:
             if self.get_limit() is not None:
                 self.log_limited_loading()
+            reader = self.get_reader()
+            file_path = self.files[split]
+            if settings.hf_load_from_offline:
+                file_path = self.get_path_to_local(file_path)
             for attempt in range(settings.loaders_max_retries):
                 try:
-                    reader = self.get_reader()
-                    if self.get_limit() is not None:
-                        self.log_limited_loading()
-
                     try:
-                        dataset = reader(self.files[split], **self.get_args()).to_dict(
-                            "records"
-                        )
+                        df = reader(file_path, **self.get_args())
                         break
                     except ValueError:
                         import fsspec
-
-                        with fsspec.open(self.files[split], mode="rt") as f:
-                            dataset = reader(f, **self.get_args()).to_dict("records")
+                        with fsspec.open(file_path, mode="rt") as f:
+                            df = reader(f, **self.get_args())
                         break
                 except Exception as e:
                     logger.debug(f"Attempt csv load {attempt + 1} failed: {e}")
                     if attempt < settings.loaders_max_retries - 1:
                         time.sleep(2)
                     else:
                         raise e
+            if settings.hf_save_to_offline:
+                file_path = self.get_path_to_local(self.files[split])
+                writer = self.get_writer(df)
+                writer (file_path, index=False)
+
+            dataset = df.to_dict("records")
+
         self.__class__._loader_cache.max_size = settings.loader_cache_size
         self.__class__._loader_cache[dataset_id] = dataset
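get_path_to_local derives a stable local file name from the remote path by seeding a random generator with that path, so a file saved under UNITXT_HF_SAVE_TO_OFFLINE is found again under UNITXT_HF_LOAD_FROM_OFFLINE. A stand-alone sketch of the idea, using Python's random.Random in place of unitxt's new_random_generator; the paths below are illustrative:

    import os
    import random

    def path_to_local(path_to_hub: str, cache_dir: str, file_type: str) -> str:
        # seeding with the remote path makes the mapping deterministic across runs
        rand = random.Random(path_to_hub)
        simple_name = str(rand.randint(100000, 999999))   # short name; small collision risk
        return os.path.join(cache_dir, simple_name + "." + file_type)

    p1 = path_to_local("https://example.org/data/train.csv", "/tmp/hf_fs_cache", "csv")
    p2 = path_to_local("https://example.org/data/train.csv", "/tmp/hf_fs_cache", "csv")
    assert p1 == p2    # the same source file always maps to the same cached file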

src/unitxt/settings_utils.py (+2)
@@ -156,6 +156,8 @@ def __getattr__(self, key):
 settings.task_data_as_text = (bool, True)
 settings.default_provider = "watsonx"
 settings.default_format = None
+settings.hf_load_from_offline = (bool, False)
+settings.hf_save_to_offline = (bool, False)
 settings.hf_offline_datasets_path = None
 settings.hf_offline_metrics_path = None
 settings.hf_offline_models_path = None