Commit 6c794b7

Lazy load vectara hhem model because it is gated in HF (#1946)
* Temporary(?) remove vectara hhem model because it became restricted.

* Revert "Temporary(?) remove vectara hhem model because it became restricted."
  This reverts commit 3c9ad94.

* Moved model loading in HHEM-based metrics from prepare to compute, because the model is gated (to allow catalog prep).

* Added model to HHEM metric class.

* Disabled HHEM metric test.

Signed-off-by: Yoav Katz <[email protected]>
1 parent 92367e6 commit 6c794b7

File tree

2 files changed: +16 -11 lines

  prepare/metrics/hhem.py (+7 -8)
  src/unitxt/metrics.py (+9 -3)

2 files changed

+16
-11
lines changed

prepare/metrics/hhem.py

Lines changed: 7 additions & 8 deletions
@@ -1,6 +1,5 @@
 from unitxt import add_to_catalog
 from unitxt.metrics import FaithfulnessHHEM
-from unitxt.test_utils.metrics import test_metric
 
 pairs = [
     ("The capital of France is Berlin.", "The capital of France is Paris."),
@@ -29,11 +28,11 @@
 
 references = [[p[0]] for p in pairs]
 metric = FaithfulnessHHEM()
-outputs = test_metric(
-    metric=metric,
-    predictions=predictions,
-    references=references,
-    instance_targets=instance_targets,
-    global_target=global_target,
-)
+# outputs = test_metric(
+#     metric=metric,
+#     predictions=predictions,
+#     references=references,
+#     instance_targets=instance_targets,
+#     global_target=global_target,
+# )
 add_to_catalog(metric, "metrics.vectara_groundedness_hhem_2_1", overwrite=True)
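
Note that the test_metric call is commented out rather than deleted: running it requires access to the gated HHEM checkpoint on Hugging Face. A minimal sketch of how one might re-enable it locally (the login call is the standard huggingface_hub API; the token value is a placeholder):

    # Hypothetical local setup before uncommenting the test_metric block above.
    # Requires a Hugging Face account that has been granted access to the
    # gated HHEM model repository.
    from huggingface_hub import login

    login(token="hf_...")  # placeholder; alternatively run `huggingface-cli login`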

src/unitxt/metrics.py

Lines changed: 9 additions & 3 deletions
@@ -5258,12 +5258,11 @@ class FaithfulnessHHEM(BulkInstanceMetric):
     # single_reference_per_prediction = True
     max_context_words = 4096
     reduction_map = {"mean": [main_score]}
+    model = None
 
     _requirements_list: List[str] = ["transformers", "torch"]
 
-    @retry_connection_with_exponential_backoff(backoff_factor=2)
-    def prepare(self):
-        super().prepare()
+    def load_model(self):
         import torch
 
         if torch.cuda.is_available():
@@ -5281,6 +5280,11 @@ def prepare(self):
             model_path, trust_remote_code=True
         ).to(device)
 
+    @retry_connection_with_exponential_backoff(backoff_factor=2)
+    def prepare(self):
+        super().prepare()
+        # load_model() moved from prepare() to compute() because model is gated in HF
+
     def compute(
         self,
         references: List[List[Any]],
@@ -5289,6 +5293,8 @@
     ) -> List[Dict[str, Any]]:
         from tqdm import tqdm
 
+        if self.model is None:
+            self.load_model()
         # treat the references as the contexts and the predictions as answers
         # concat references

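Taken together, the two hunks implement a simple lazy-loading pattern: the model handle lives on the class as None, prepare() stays network-free so catalog preparation succeeds without HF credentials, and the first compute() call triggers the actual download. A standalone sketch of the pattern (GatedModelMetric and MODEL_PATH are hypothetical stand-ins, not unitxt names; the from_pretrained call mirrors the diff, but the model class is an assumption):

    from typing import Any, List

    MODEL_PATH = "..."  # placeholder for the gated HF repo id

    class GatedModelMetric:
        model = None  # stays None until the first compute() call

        def load_model(self):
            import torch
            from transformers import AutoModelForSequenceClassification

            device = "cuda" if torch.cuda.is_available() else "cpu"
            self.model = AutoModelForSequenceClassification.from_pretrained(
                MODEL_PATH, trust_remote_code=True
            ).to(device)

        def prepare(self):
            # Deliberately no network access: catalog prep must succeed even
            # when the user has no credentials for the gated repo.
            pass

        def compute(self, references: List[List[Any]], predictions: List[Any]):
            if self.model is None:
                self.load_model()  # first call pays the one-time download cost
            # ... score each prediction against its references with self.model ...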