Skip to content

Commit fd89934

Browse files
Merge branch 'main' into fix-azure-openai
2 parents 41ef04a + 287a801 commit fd89934

File tree

5 files changed

+38
-35
lines changed

5 files changed

+38
-35
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,5 @@ benchmark_output/*
161161
src.lock
162162
docs/_static/data.js
163163
cache
164+
165+
inference_engine_cache/

pyproject.toml

+1-4
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ docs = [
6060
"datasets",
6161
"evaluate",
6262
"nltk",
63-
"sacrebleu",
64-
"absl-py",
6563
"rouge_score",
6664
"scikit-learn",
6765
"jiwer",
@@ -89,8 +87,7 @@ tests = [
8987
"editdistance",
9088
"rouge-score",
9189
"nltk",
92-
"mecab-python3",
93-
"sacrebleu[ko]",
90+
"sacrebleu[ko,ja]",
9491
"scikit-learn<=1.5.2",
9592
"jiwer",
9693
"conllu",

src/unitxt/error_utils.py

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class Documentation:
1818
BENCHMARKS = "docs/benchmark.html"
1919
DATA_CLASSIFICATION_POLICY = "docs/data_classification_policy.html"
2020
CATALOG = "docs/saving_and_loading_from_catalog.html"
21+
SETTINGS = "docs/settings.html"
2122

2223

2324
def additional_info(path: str) -> str:

src/unitxt/llm_as_judge.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class LLMJudge(BulkInstanceMetric):
5757
# option_selection_strategy: OptionSelectionStrategyEnum = (
5858
# OptionSelectionStrategyEnum.PARSE_OUTPUT_TEXT
5959
# )
60-
evaluator_name: EvaluatorNameEnum = None
60+
evaluator_name: Optional[Union[str,EvaluatorNameEnum]] = None
6161
check_positional_bias: bool = True
6262
context_fields: Union[str, List[str], Dict[str, str]] = ["context"]
6363
generate_summaries: bool = True
@@ -78,8 +78,6 @@ def prepare(self):
7878

7979
if self.evaluator_name is None:
8080
self.evaluator_name = self.inference_engine.get_engine_id()
81-
elif not isinstance(self.evaluator_name, EvaluatorNameEnum):
82-
self.evaluator_name = EvaluatorNameEnum[self.evaluator_name]
8381

8482
def before_process_multi_stream(self):
8583
super().before_process_multi_stream()

src/unitxt/loaders.py

+33-28
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
from tqdm import tqdm
6868

6969
from .dataclass import NonPositionalField
70-
from .error_utils import UnitxtError, UnitxtWarning
70+
from .error_utils import Documentation, UnitxtError, UnitxtWarning
7171
from .fusion import FixedFusion
7272
from .logging_utils import get_logger
7373
from .operator import SourceOperator
@@ -80,19 +80,27 @@
8080
logger = get_logger()
8181
settings = get_settings()
8282

83+
class UnitxtUnverifiedCodeError(UnitxtError):
84+
def __init__(self, path):
85+
super().__init__(f"Loader cannot load and run remote code from {path} in huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE.", Documentation.SETTINGS)
86+
8387
def hf_load_dataset(path: str, *args, **kwargs):
8488
if settings.hf_offline_datasets_path is not None:
8589
path = os.path.join(settings.hf_offline_datasets_path, path)
86-
return _hf_load_dataset(
87-
path,
88-
*args, **kwargs,
89-
download_config=DownloadConfig(
90-
max_retries=settings.loaders_max_retries,
91-
),
92-
verification_mode="no_checks",
93-
trust_remote_code=settings.allow_unverified_code,
94-
download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
95-
)
90+
try:
91+
return _hf_load_dataset(
92+
path,
93+
*args, **kwargs,
94+
download_config=DownloadConfig(
95+
max_retries=settings.loaders_max_retries,
96+
),
97+
verification_mode="no_checks",
98+
trust_remote_code=settings.allow_unverified_code,
99+
download_mode= "force_redownload" if settings.disable_hf_datasets_cache else "reuse_dataset_if_exists"
100+
)
101+
except ValueError as e:
102+
if "trust_remote_code" in str(e):
103+
raise UnitxtUnverifiedCodeError(path) from e
96104

97105
class Loader(SourceOperator):
98106
"""A base class for all loaders.
@@ -288,22 +296,17 @@ def load_dataset(
288296
if dataset is None:
289297
if streaming is None:
290298
streaming = self.is_streaming()
291-
try:
292-
dataset = hf_load_dataset(
293-
self.path,
294-
name=self.name,
295-
data_dir=self.data_dir,
296-
data_files=self.data_files,
297-
revision=self.revision,
298-
streaming=streaming,
299-
split=split,
300-
num_proc=self.num_proc,
301-
)
302-
except ValueError as e:
303-
if "trust_remote_code" in str(e):
304-
raise ValueError(
305-
f"{self.__class__.__name__} cannot run remote code from huggingface without setting unitxt.settings.allow_unverified_code=True or by setting environment variable: UNITXT_ALLOW_UNVERIFIED_CODE."
306-
) from e
299+
300+
dataset = hf_load_dataset(
301+
self.path,
302+
name=self.name,
303+
data_dir=self.data_dir,
304+
data_files=self.data_files,
305+
revision=self.revision,
306+
streaming=streaming,
307+
split=split,
308+
num_proc=self.num_proc,
309+
)
307310
self.__class__._loader_cache.max_size = settings.loader_cache_size
308311
if not disable_memory_caching:
309312
self.__class__._loader_cache[dataset_id] = dataset
@@ -333,7 +336,9 @@ def get_splits(self):
333336
extract_on_the_fly=True,
334337
),
335338
)
336-
except:
339+
except Exception as e:
340+
if "trust_remote_code" in str(e):
341+
raise UnitxtUnverifiedCodeError(self.path) from e
337342
UnitxtWarning(
338343
f'LoadHF(path="{self.path}", name="{self.name}") could not retrieve split names without loading the dataset. Consider defining "splits" in the LoadHF definition to improve loading time.'
339344
)

0 commit comments

Comments (0)