Commit 2cf85db

log limit once per data and increase loader cache

Signed-off-by: dafnapension <[email protected]>
1 parent: 2d98da8

6 files changed (+75 -68)

performance/bluebench_profiler.py (+13 -15)

@@ -14,7 +14,6 @@
 
 logger = get_logger()
 settings = get_settings()
-settings.allow_unverified_code = True
 
 
 class BlueBenchProfiler:
@@ -59,26 +58,25 @@ def profiler_instantiate_benchmark_recipe(
     def profiler_generate_benchmark_dataset(
         self, benchmark_recipe: Benchmark, split: str, **kwargs
     ) -> List[Dict[str, Any]]:
+        stream = benchmark_recipe()[split]
+
+        # to charge here for the time of generating all instances of the split
+        return list(stream)
+
+    def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
         with settings.context(
             disable_hf_datasets_cache=False,
             allow_unverified_code=True,
-            mock_inference_mode=True,
         ):
-            stream = benchmark_recipe()[split]
-
-            # to charge here for the time of generating all instances
-            return list(stream)
+            benchmark_recipe = self.profiler_instantiate_benchmark_recipe(
+                dataset_query=dataset_query, **kwargs
+            )
 
-    def profiler_do_the_profiling(self, dataset_query: str, split: str, **kwargs):
-        benchmark_recipe = self.profiler_instantiate_benchmark_recipe(
-            dataset_query=dataset_query, **kwargs
-        )
-
-        dataset = self.profiler_generate_benchmark_dataset(
-            benchmark_recipe=benchmark_recipe, split=split, **kwargs
-        )
+            dataset = self.profiler_generate_benchmark_dataset(
+                benchmark_recipe=benchmark_recipe, split=split, **kwargs
+            )
 
-        logger.critical(f"length of evaluation_result: {len(dataset)}")
+        logger.critical(f"length of bluegench generated dataset: {len(dataset)}")
 
 
 dataset_query = "benchmarks.bluebench[loader_limit=30,max_samples_per_subset=30]"
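
Note on the change above: the module-level `settings.allow_unverified_code = True` is gone, the settings override now lives inside `profiler_do_the_profiling`, and `list(stream)` moved into `profiler_generate_benchmark_dataset` so the cost of materializing the split is charged to that call. A hedged sketch of driving the profiler under cProfile; the no-argument constructor and the "test" split name are assumptions for illustration, not taken from this commit:

    import cProfile
    import pstats

    prof = cProfile.Profile()
    prof.enable()
    # dataset_query is the module-level query defined above.
    BlueBenchProfiler().profiler_do_the_profiling(
        dataset_query=dataset_query, split="test"
    )
    prof.disable()

    # Because list(stream) now runs inside profiler_generate_benchmark_dataset,
    # the time to generate all instances of the split shows up under that frame.
    pstats.Stats(prof).sort_stats("cumulative").print_stats("profiler_")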

src/unitxt/fusion.py (+27 -20)

@@ -4,9 +4,12 @@
 from .dataclass import NonPositionalField
 from .operator import SourceOperator
 from .random_utils import new_random_generator
+from .settings_utils import get_settings
 from .stream import DynamicStream, MultiStream
 from .type_utils import isoftype
 
+settings = get_settings()
+
 
 class BaseFusion(SourceOperator):
     """BaseFusion operator that combines multiple multistreams into one.
@@ -75,26 +78,30 @@ def prepare(self):
 
     # flake8: noqa: C901
     def fusion_generator(self, split) -> Generator:
-        for origin_name, origin in self.named_subsets.items():
-            multi_stream = origin()
-            if split not in multi_stream:
-                continue
-            emitted_from_this_split = 0
-            try:
-                for instance in multi_stream[split]:
-                    if (
-                        self.max_instances_per_subset is not None
-                        and emitted_from_this_split >= self.max_instances_per_subset
-                    ):
-                        break
-                    if isinstance(origin_name, str):
-                        if "subset" not in instance:
-                            instance["subset"] = []
-                        instance["subset"].insert(0, origin_name)
-                    emitted_from_this_split += 1
-                    yield instance
-            except Exception as e:
-                raise RuntimeError(f"Exception in subset: {origin_name}") from e
+        with settings.context(
+            disable_hf_datasets_cache=False,
+            allow_unverified_code=True,
+        ):
+            for origin_name, origin in self.named_subsets.items():
+                multi_stream = origin()
+                if split not in multi_stream:
+                    continue
+                emitted_from_this_split = 0
+                try:
+                    for instance in multi_stream[split]:
+                        if (
+                            self.max_instances_per_subset is not None
+                            and emitted_from_this_split >= self.max_instances_per_subset
+                        ):
+                            break
+                        if isinstance(origin_name, str):
+                            if "subset" not in instance:
+                                instance["subset"] = []
+                            instance["subset"].insert(0, origin_name)
+                        emitted_from_this_split += 1
+                        yield instance
+                except Exception as e:
+                    raise RuntimeError(f"Exception in subset: {origin_name}") from e
 
 
 class WeightedFusion(BaseFusion):
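
With this change, every subset stream is pulled while `settings.context(disable_hf_datasets_cache=False, allow_unverified_code=True)` is active, so loaders reached through the fused subsets see those values too; and because `fusion_generator` is a generator, the override stays in effect between yields, for as long as the fused split is being consumed. Below is a minimal illustrative sketch of such a scoped-override context manager, written for this note only; it is not unitxt's actual `Settings` implementation.

    from contextlib import contextmanager

    class SketchSettings:
        """Attribute-style settings with a scoped override, for illustration only."""

        def __init__(self, **defaults):
            self.__dict__.update(defaults)

        @contextmanager
        def context(self, **overrides):
            previous = {key: getattr(self, key) for key in overrides}
            self.__dict__.update(overrides)
            try:
                yield self
            finally:
                # restore the values that were in place before the block
                self.__dict__.update(previous)

    settings = SketchSettings(disable_hf_datasets_cache=True, allow_unverified_code=False)

    with settings.context(disable_hf_datasets_cache=False, allow_unverified_code=True):
        assert settings.allow_unverified_code is True   # visible to nested code
    assert settings.allow_unverified_code is False      # restored on exit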

src/unitxt/loaders.py (+16 -17)

@@ -237,11 +237,6 @@ def filter_load(self, dataset):
         logger.info(f"\nLoading filtered by: {self.filtering_lambda};")
         return dataset.filter(eval(self.filtering_lambda))
 
-    def log_limited_loading(self, split: str):
-        logger.info(
-            f"\nLoading of split {split} limited to {self.get_limit()} instances by setting {self.get_limiter()};"
-        )
-
     # returns Dict when split names are not known in advance, and just the single split dataset - if known
     def stream_dataset(self, split: str) -> Union[IterableDatasetDict, IterableDataset]:
         with tempfile.TemporaryDirectory() as dir_to_be_deleted:
@@ -264,6 +259,9 @@ def stream_dataset(self, split: str) -> Union[IterableDatasetDict, IterableDataset]:
                 )
             except ValueError as e:
                 if "trust_remote_code" in str(e):
+                    logger.critical(
+                        f"while raising trust_remote error, settings.allow_unverified_code = {settings.allow_unverified_code}"
+                    )
                     raise ValueError(
                         f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
                     ) from e
@@ -317,6 +315,10 @@ def _maybe_set_classification_policy(self):
     def load_iterables(
         self
     ) -> Union[Dict[str, ReusableGenerator], IterableDatasetDict]:
+        # log limit once for the whole data
+        if self.get_limit() is not None:
+            self.log_limited_loading()
+
         if not isinstance(self, LoadFromHFSpace):
             # try the following for LoadHF only
             if self.split is not None:
@@ -328,7 +330,9 @@ def load_iterables(
 
             try:
                 split_names = get_dataset_split_names(
-                    path=self.path, config_name=self.name
+                    path=self.path,
+                    config_name=self.name,
+                    trust_remote_code=settings.allow_unverified_code,
                 )
                 return {
                     split_name: ReusableGenerator(
@@ -354,8 +358,6 @@
 
         limit = self.get_limit()
         if limit is not None:
-            for split_name in dataset:
-                self.log_limited_loading(split_name)
             result = {}
             for split_name in dataset:
                 result[split_name] = dataset[split_name].take(limit)
@@ -376,7 +378,6 @@ def split_generator(self, split: str) -> Generator:
 
         limit = self.get_limit()
        if limit is not None:
-            self.log_limited_loading(split)
             dataset = dataset.take(limit)
 
         self.__class__._loader_cache.max_size = settings.loader_cache_size
@@ -439,6 +440,9 @@ def get_args(self):
         return args
 
     def load_iterables(self):
+        # log once for the whole data
+        if self.get_limit() is not None:
+            self.log_limited_loading()
         iterables = {}
         for split_name in self.files.keys():
             iterables[split_name] = ReusableGenerator(
@@ -448,14 +452,9 @@ def load_iterables(self):
         return iterables
 
     def split_generator(self, split: str) -> Generator:
-        if self.get_limit() is not None:
-            self.log_limited_loading()
-            dataset = pd.read_csv(
-                self.files[split], nrows=self.get_limit(), sep=self.sep
-            ).to_dict("records")
-        else:
-            dataset = pd.read_csv(self.files[split], sep=self.sep).to_dict("records")
-
+        dataset = pd.read_csv(
+            self.files[split], nrows=self.get_limit(), sep=self.sep
+        ).to_dict("records")
         yield from dataset
 
 
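Two simplifications in loaders.py: the "loading limited to N instances" message is now logged once per loader, at the top of `load_iterables`, instead of once per split; and the CSV path always passes `nrows=self.get_limit()`, since pandas reads the whole file when `nrows` is None, which makes the old else-branch redundant. The removed per-split helper is presumably replaced by a no-argument variant defined outside this diff; a hypothetical sketch of what it could look like (wording assumed, not taken from this commit):

    # Hypothetical sketch only -- the real no-argument method is not shown in this diff.
    def log_limited_loading(self):
        logger.info(
            f"\nLoading limited to {self.get_limit()} instances "
            f"by setting {self.get_limiter()};"
        )
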
src/unitxt/settings_utils.py (+1 -1)

@@ -150,7 +150,7 @@ def __getattr__(self, key):
 settings.data_classification_policy = None
 settings.mock_inference_mode = (bool, False)
 settings.disable_hf_datasets_cache = (bool, True)
-settings.loader_cache_size = (int, 1)
+settings.loader_cache_size = (int, 25)
 settings.task_data_as_text = (bool, True)
 settings.default_provider = "watsonx"
 settings.default_format = None
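
The `(int, 25)` pair declares the setting's type and its new default: the per-class `_loader_cache.max_size` in loaders.py is set from `settings.loader_cache_size`, so a larger default lets a benchmark that fuses many subsets keep their loaded datasets cached instead of evicting them one by one. A hedged sketch of overriding the value; the environment-variable name follows the UNITXT_* convention referenced in loaders.py and is an assumption, not something this commit confirms:

    import os

    # Assumed env-var override, following the UNITXT_* naming convention;
    # it must be set before unitxt reads its settings.
    os.environ["UNITXT_LOADER_CACHE_SIZE"] = "50"

    # Programmatic override, assuming the singleton returned by get_settings()
    # accepts plain attribute assignment as settings_utils.py suggests.
    from unitxt.settings_utils import get_settings

    settings = get_settings()
    settings.loader_cache_size = 50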

src/unitxt/test_utils/card.py (+17 -14)

@@ -291,18 +291,21 @@ def test_card(
     else:
         template_card_indices = range(len(card.templates))
 
-    for template_card_index in template_card_indices:
-        examples = load_examples_from_dataset_recipe(
-            card, template_card_index=template_card_index, debug=debug, **kwargs
-        )
-        if test_exact_match_score_when_predictions_equal_references:
-            test_correct_predictions(
-                examples=examples, strict=strict, exact_match_score=exact_match_score
-            )
-        if test_full_mismatch_score_with_full_mismatch_prediction_values:
-            test_wrong_predictions(
-                examples=examples,
-                strict=strict,
-                maximum_full_mismatch_score=maximum_full_mismatch_score,
-                full_mismatch_prediction_values=full_mismatch_prediction_values,
+    with settings.context(allow_unverified_code=True):
+        for template_card_index in template_card_indices:
+            examples = load_examples_from_dataset_recipe(
+                card, template_card_index=template_card_index, debug=debug, **kwargs
             )
+            if test_exact_match_score_when_predictions_equal_references:
+                test_correct_predictions(
+                    examples=examples,
+                    strict=strict,
+                    exact_match_score=exact_match_score,
+                )
+            if test_full_mismatch_score_with_full_mismatch_prediction_values:
+                test_wrong_predictions(
+                    examples=examples,
+                    strict=strict,
+                    maximum_full_mismatch_score=maximum_full_mismatch_score,
+                    full_mismatch_prediction_values=full_mismatch_prediction_values,
+                )
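
Since the whole template loop now runs inside `settings.context(allow_unverified_code=True)`, a card preparation script no longer has to flip that flag itself before calling `test_card`. A hedged usage sketch; `build_my_card()` is a placeholder for whatever TaskCard construction the script does, not a unitxt function:

    from unitxt.test_utils.card import test_card

    # Placeholder: stands in for building a TaskCard whose loader needs
    # trust_remote_code; not part of unitxt.
    card = build_my_card()

    # No allow_unverified_code toggling is needed here anymore; test_card wraps
    # the template loop in settings.context(allow_unverified_code=True).
    test_card(card)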

utils/.secrets.baseline (+1 -1)

@@ -151,7 +151,7 @@
       "filename": "src/unitxt/loaders.py",
       "hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742",
       "is_verified": false,
-      "line_number": 566,
+      "line_number": 565,
       "is_secret": false
     }
   ],
