Commit ac7c198

increase loader cache

Signed-off-by: dafnapension <[email protected]>
1 parent 6c77678 commit ac7c198

6 files changed: +66 -52 lines


performance/bluebench_profiler.py (+13 -14)

@@ -59,26 +59,25 @@ def profiler_instantiate_benchmark_recipe(
     def profiler_generate_benchmark_dataset(
         self, benchmark_recipe: Benchmark, split: str, **kwargs
     ) -> List[Dict[str, Any]]:
+        stream = benchmark_recipe()[split]
+
+        # to charge here for the time of generating all instances of the split
+        return list(stream)
+
+    def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
         with settings.context(
             disable_hf_datasets_cache=False,
             allow_unverified_code=True,
-            mock_inference_mode=True,
         ):
-            stream = benchmark_recipe()[split]
-
-            # to charge here for the time of generating all instances
-            return list(stream)
+            benchmark_recipe = self.profiler_instantiate_benchmark_recipe(
+                dataset_query=dataset_query, **kwargs
+            )
 
-    def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
-        benchmark_recipe = self.profiler_instantiate_benchmark_recipe(
-            dataset_query=dataset_query, **kwargs
-        )
-
-        dataset = self.profiler_generate_benchmark_dataset(
-            benchmark_recipe=benchmark_recipe, split=split, **kwargs
-        )
+            dataset = self.profiler_generate_benchmark_dataset(
+                benchmark_recipe=benchmark_recipe, split=split, **kwargs
+            )
 
-        logger.critical(f"length of evaluation_result: {len(dataset)}")
+            logger.critical(f"length of bluegench generated dataset: {len(dataset)}")
 
 
 dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"

src/unitxt/fusion.py (+27 -20)

@@ -4,9 +4,12 @@
 from .dataclass import NonPositionalField
 from .operator import SourceOperator
 from .random_utils import new_random_generator
+from .settings_utils import get_settings
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype
 
+settings = get_settings()
+
 
 class BaseFusion(SourceOperator):
     """BaseFusion operator that combines multiple multistreams into one.
@@ -75,26 +78,30 @@ def prepare(self):
 
     # flake8: noqa: C901
     def fusion_generator(self, split) -> Generator:
-        for origin_name, origin in self.named_subsets.items():
-            multi_stream = origin()
-            if split not in multi_stream:
-                continue
-            emitted_from_this_split = 0
-            try:
-                for instance in multi_stream[split]:
-                    if (
-                        self.max_instances_per_subset is not None
-                        and emitted_from_this_split >= self.max_instances_per_subset
-                    ):
-                        break
-                    if isinstance(origin_name, str):
-                        if "subset" not in instance:
-                            instance["subset"] = []
-                        instance["subset"].insert(0, origin_name)
-                    emitted_from_this_split += 1
-                    yield instance
-            except Exception as e:
-                raise RuntimeError(f"Exception in subset: {origin_name}") from e
+        with settings.context(
+            disable_hf_datasets_cache=False,
+            allow_unverified_code=True,
+        ):
+            for origin_name, origin in self.named_subsets.items():
+                multi_stream = origin()
+                if split not in multi_stream:
+                    continue
+                emitted_from_this_split = 0
+                try:
+                    for instance in multi_stream[split]:
+                        if (
+                            self.max_instances_per_subset is not None
+                            and emitted_from_this_split >= self.max_instances_per_subset
+                        ):
+                            break
+                        if isinstance(origin_name, str):
+                            if "subset" not in instance:
+                                instance["subset"] = []
+                            instance["subset"].insert(0, origin_name)
+                        emitted_from_this_split += 1
+                        yield instance
+                except Exception as e:
+                    raise RuntimeError(f"Exception in subset: {origin_name}") from e
 
 
 class WeightedFusion(BaseFusion):
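For context, settings.context is used here (and in the profiler above) as a scoped override: inside the with block the listed flags take the given values, presumably with the previous values restored on exit, so the override does not leak beyond the fusion run. A minimal usage sketch, assuming the same settings object returned by get_settings(); the stream contents are placeholders kept only so the sketch is self-contained:

    from unitxt.settings_utils import get_settings

    settings = get_settings()

    # Placeholder stand-in for a fused stream, for illustration only.
    fused_stream = [{"subset": ["demo"], "source": "..."}]

    # Temporarily allow remote dataset code and re-enable the HF datasets cache
    # while iterating; outside the block the previous setting values apply again.
    with settings.context(
        disable_hf_datasets_cache=False,
        allow_unverified_code=True,
    ):
        for instance in fused_stream:
            print(instance["subset"])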

src/unitxt/loaders.py (+6 -1)

@@ -259,6 +259,9 @@ def stream_dataset(self, split: str) -> Union[IterableDatasetDict, IterableDatas
             )
         except ValueError as e:
             if "trust_remote_code" in str(e):
+                logger.critical(
+                    f"while raising trust_remote error, settings.allow_unverified_code = {settings.allow_unverified_code}"
+                )
                 raise ValueError(
                     f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
                 ) from e
@@ -327,7 +330,9 @@ def load_iterables(
 
         try:
             split_names = get_dataset_split_names(
-                path=self.path, config_name=self.name
+                path=self.path,
+                config_name=self.name,
+                trust_remote_code=settings.allow_unverified_code,
             )
             return {
                 split_name: ReusableGenerator(
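The second hunk forwards trust_remote_code to get_dataset_split_names, mirroring the allow_unverified_code check that stream_dataset already relies on. For reference, a standalone sketch of the same Hugging Face datasets call; the dataset path and config name are placeholders:

    from datasets import get_dataset_split_names

    # Placeholder dataset path and config, for illustration only.
    split_names = get_dataset_split_names(
        path="some_org/some_dataset",
        config_name="default",
        trust_remote_code=True,  # corresponds to settings.allow_unverified_code above
    )
    print(split_names)  # e.g. ["train", "validation", "test"]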

src/unitxt/settings_utils.py (+1 -1)

@@ -150,7 +150,7 @@ def __getattr__(self, key):
     settings.data_classification_policy = None
     settings.mock_inference_mode = (bool, False)
     settings.disable_hf_datasets_cache = (bool, True)
-    settings.loader_cache_size = (int, 1)
+    settings.loader_cache_size = (int, 25)
     settings.task_data_as_text = (bool, True)
     settings.default_provider = "watsonx"
     settings.default_format = None
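This is the change behind the commit title: the default loader cache size grows from 1 to 25 entries. If a different size is needed, the override mechanisms used elsewhere in this commit should apply; a sketch, where the environment-variable name is an assumption made by analogy with UNITXT_ALLOW_UNVERIFIED_CODE from loaders.py:

    import os

    from unitxt.settings_utils import get_settings

    settings = get_settings()

    # Scoped override for one block of work.
    with settings.context(loader_cache_size=50):
        pass  # loaders resolved here see the larger cache size

    # Or process-wide, set before unitxt loaders run (assumed variable name).
    os.environ["UNITXT_LOADER_CACHE_SIZE"] = "50"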

src/unitxt/test_utils/card.py (+17 -14)

@@ -291,18 +291,21 @@ def test_card(
     else:
         template_card_indices = range(len(card.templates))
 
-    for template_card_index in template_card_indices:
-        examples = load_examples_from_dataset_recipe(
-            card, template_card_index=template_card_index, debug=debug, **kwargs
-        )
-        if test_exact_match_score_when_predictions_equal_references:
-            test_correct_predictions(
-                examples=examples, strict=strict, exact_match_score=exact_match_score
-            )
-        if test_full_mismatch_score_with_full_mismatch_prediction_values:
-            test_wrong_predictions(
-                examples=examples,
-                strict=strict,
-                maximum_full_mismatch_score=maximum_full_mismatch_score,
-                full_mismatch_prediction_values=full_mismatch_prediction_values,
+    with settings.context(allow_unverified_code=True):
+        for template_card_index in template_card_indices:
+            examples = load_examples_from_dataset_recipe(
+                card, template_card_index=template_card_index, debug=debug, **kwargs
             )
+            if test_exact_match_score_when_predictions_equal_references:
+                test_correct_predictions(
+                    examples=examples,
+                    strict=strict,
+                    exact_match_score=exact_match_score,
+                )
+            if test_full_mismatch_score_with_full_mismatch_prediction_values:
+                test_wrong_predictions(
+                    examples=examples,
+                    strict=strict,
+                    maximum_full_mismatch_score=maximum_full_mismatch_score,
+                    full_mismatch_prediction_values=full_mismatch_prediction_values,
+                )
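test_card now wraps the per-template loop in settings.context(allow_unverified_code=True), so cards backed by Hub datasets that ship loading scripts can be tested without setting the flag externally. A sketch of a typical call from a card-preparation script; the card construction is elided, and the keyword names are taken from the variables visible in this hunk:

    from unitxt.test_utils.card import test_card

    # card = TaskCard(...)  # assumed to be built earlier in the preparation script
    test_card(
        card,
        strict=True,
        test_exact_match_score_when_predictions_equal_references=True,
        test_full_mismatch_score_with_full_mismatch_prediction_values=True,
    )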

utils/.secrets.baseline (+2 -2)

@@ -151,7 +151,7 @@
         "filename": "src/unitxt/loaders.py",
         "hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
         "is_verified": false,
-        "line_number": 560,
+        "line_number": 565,
         "is_secret": false
       }
     ],
@@ -184,5 +184,5 @@
       }
     ]
   },
-  "generated_at": "2025-01-22T08:29:46Z"
+  "generated_at": "2025-01-22T11:47:57Z"
 }
