Add hf to cross provider inference engine (#1866)

yoavkatz · web-flow · commit 4e5433a39714 · 2025-07-14T12:53:50.000+03:00
* Added option to run HF inference in CrossProviderInferenceEngine

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Changed example to use hf in CrossProviderInferenceEngine

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Revert unintended deletion

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* More example changes

Signed-off-by: Yoav Katz <katz@il.ibm.com>

* Changed provider name from hf to hf-local

Fixed additional examples.

Signed-off-by: Yoav Katz <katz@il.ibm.com>

---------

Signed-off-by: Yoav Katz <katz@il.ibm.com>
diff --git a/examples/evaluate_rag_response_generation.py b/examples/evaluate_rag_response_generation.py
@@ -3,9 +3,7 @@
     TaskCard,
 )
 from unitxt.collections_operators import Wrap
-from unitxt.inference import (
-    HFPipelineBasedInferenceEngine,
-)
+from unitxt.inference import CrossProviderInferenceEngine
 from unitxt.loaders import LoadFromDictionary
 from unitxt.operators import Rename, Set
 from unitxt.templates import MultiReferenceTemplate, TemplatesDict
@@ -78,13 +76,8 @@
 )
 
 
-# Infer using Llama-3.2-1B base using HF API
-model = HFPipelineBasedInferenceEngine(
-    model_name="meta-llama/Llama-3.2-1B", max_new_tokens=32
-)
-# Change to this to infer with external APIs:
-# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
-# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]
+model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
+# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "hf-local"]
 
 predictions = model(dataset)
 results = evaluate(predictions=predictions, data=dataset)
diff --git a/examples/evaluate_using_metrics_ensemble.py b/examples/evaluate_using_metrics_ensemble.py
@@ -1,17 +1,15 @@
 from unitxt import get_logger
 from unitxt.api import evaluate, load_dataset
-from unitxt.inference import (
-    HFPipelineBasedInferenceEngine,
-)
+from unitxt.inference import CrossProviderInferenceEngine
 from unitxt.metrics import MetricsEnsemble
 
 logger = get_logger()
 
 # define the metrics ensemble
 ensemble_metric = MetricsEnsemble(
     metrics=[
-        "metrics.llm_as_judge.rating.llama_3_70b_instruct.generic_single_turn",
-        "metrics.llm_as_judge.rating.llama_3_8b_instruct_ibm_genai_template_mt_bench_single_turn",
+        "metrics.llm_as_judge.direct.watsonx.llama3_3_70b[criteria=metrics.llm_as_judge.direct.criteria.answer_relevance, context_fields=[question]]",
+        "metrics.llm_as_judge.direct.watsonx.llama3_3_70b[criteria=metrics.llm_as_judge.direct.criteria.correctness_based_on_ground_truth, context_fields=[question,answers]]",
     ],
     weights=[0.75, 0.25],
 )
@@ -27,13 +25,8 @@
     split="test",
 )
 
-# Infer using SmolLM2 using HF API
-model = HFPipelineBasedInferenceEngine(
-    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
-)
 # Change to this to infer with external APIs:
-# CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
-# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]
+model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
 
 predictions = model(dataset)
 
diff --git a/examples/inference_using_cross_provider.py b/examples/inference_using_cross_provider.py
@@ -2,7 +2,7 @@
 from unitxt.text_utils import print_dict
 
 if __name__ == "__main__":
-    for provider in ["watsonx", "rits", "watsonx-sdk"]:
+    for provider in ["watsonx", "rits", "watsonx-sdk", "hf-local"]:
         print()
         print("------------------------------------------------ ")
         print("PROVIDER:", provider)
diff --git a/examples/multiple_choice_qa_evaluation.py b/examples/multiple_choice_qa_evaluation.py
@@ -3,7 +3,7 @@
 from unitxt import get_logger, load_dataset
 from unitxt.api import LoadFromDictionary, TaskCard, evaluate
 from unitxt.blocks import Rename
-from unitxt.inference import HFPipelineBasedInferenceEngine
+from unitxt.inference import CrossProviderInferenceEngine
 from unitxt.operators import IndexOf, ListFieldValues
 from unitxt.templates import MultipleChoiceTemplate
 
@@ -61,14 +61,8 @@
     format="formats.chat_api",
 )
 
-# Infer using Llama-3.2-1B base using HF API
-model = HFPipelineBasedInferenceEngine(
-    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
-)
-# Change to this to infer with external APIs:
-# from unitxt.inference import CrossProviderInferenceEngine
-# model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
-# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]
+model = CrossProviderInferenceEngine(model="SmolLM2-1.7B-Instruct", provider="hf-local")
+# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama","hf-local"]
 
 
 predictions = model(dataset)
@@ -79,7 +73,7 @@
 
 
 print("Instance Results:")
-print(results.instance_scores)
+print(results.instance_scores.summary)
 
 print("Global Results:")
 print(results.global_scores.summary)
diff --git a/examples/qa_evaluation.py b/examples/qa_evaluation.py
@@ -1,8 +1,6 @@
 from unitxt import get_logger
 from unitxt.api import create_dataset, evaluate
-from unitxt.inference import (
-    HFPipelineBasedInferenceEngine,
-)
+from unitxt.inference import CrossProviderInferenceEngine
 
 logger = get_logger()
 
@@ -30,14 +28,9 @@
     format="formats.chat_api",
 )
 
-# Infer using SmolLM2 using HF API
-model = HFPipelineBasedInferenceEngine(
-    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
-)
-# Change to this to infer with external APIs:
-# from unitxt.inference import CrossProviderInferenceEngine
-# engine = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
-# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]
+model = CrossProviderInferenceEngine(model="SmolLM2-1.7B-Instruct", provider="hf-local")
+# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "hf-local"]
+# (model must be available in the provider service)
 
 
 predictions = model(dataset)
diff --git a/examples/standalone_qa_evaluation.py b/examples/standalone_qa_evaluation.py
@@ -1,7 +1,7 @@
 from unitxt import get_logger
 from unitxt.api import create_dataset, evaluate
 from unitxt.blocks import Task
-from unitxt.inference import HFPipelineBasedInferenceEngine
+from unitxt.inference import CrossProviderInferenceEngine
 from unitxt.templates import InputOutputTemplate
 
 logger = get_logger()
@@ -37,21 +37,17 @@
 )
 
 
-# Infer using SmolLM2 using HF API
-model = HFPipelineBasedInferenceEngine(
-    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct", max_new_tokens=32
+model = CrossProviderInferenceEngine(
+    model="SmolLM2-1.7B-Instruct", provider="hf-local", use_cache=False
 )
-# Change to this to infer with external APIs:
-# from unitxt.inference import CrossProviderInferenceEngine
-# model = CrossProviderInferenceEngine(model="llama-3-2-1b-instruct", provider="watsonx")
-# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam". "rits"]
-
+# The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "rits", "hf-local"]
+# (model must be available in the provider service)
 
 predictions = model(dataset)
 results = evaluate(predictions=predictions, data=dataset)
 
 print("Instance Results:")
-print(results.instance_scores)
+print(results.instance_scores.summary)
 
 print("Global Results:")
 print(results.global_scores.summary)
diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py
@@ -79,7 +79,7 @@ class StandardAPIParamsMixin(Artifact):
     n: Optional[int] = None
     parallel_tool_calls: Optional[bool] = None
     service_tier: Optional[Literal["auto", "default"]] = None
-    credentials: Optional[Dict[str, str]] = {}
+    credentials: Optional[Dict[str, str]] = None
     extra_headers: Optional[Dict[str, str]] = None
 
 
@@ -468,7 +468,7 @@ def _is_loaded(self):
 
 
 class HFGenerationParamsMixin(Artifact):
-    max_new_tokens: int
+    max_new_tokens: Optional[int] = None
     do_sample: bool = False
     temperature: Optional[float] = None
     top_p: Optional[float] = None
@@ -3362,6 +3362,8 @@ def get_engine_id(self):
         return get_model_and_label_id(self.model, self.label)
 
     def prepare_engine(self):
+        if self.credentials is None:
+            self.credentials = {}
         # Initialize the token bucket rate limiter
         self._rate_limiter = AsyncTokenBucket(
             rate=self.max_requests_per_second,
@@ -3477,7 +3479,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
     user requests.
 
     Current _supported_apis = ["watsonx", "together-ai", "open-ai", "aws", "ollama",
-    "bam", "watsonx-sdk", "rits", "vertex-ai"]
+    "bam", "watsonx-sdk", "rits", "vertex-ai","hf-local"]
 
     Args:
         provider (Optional):
@@ -3684,6 +3686,11 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
             "mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
             "gpt-4-1": "replicate/openai/gpt-4.1",
         },
+        "hf-local": {
+            "granite-3-3-8b-instruct": "ibm-granite/granite-3.3-8b-instruct",
+            "llama-3-3-8b-instruct": "meta-llama/Llama-3.3-8B-Instruct",
+            "SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+        },
     }
     provider_model_map["watsonx"] = {
         k: f"watsonx/{v}" for k, v in provider_model_map["watsonx-sdk"].items()
@@ -3701,12 +3708,14 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
         "azure": LiteLLMInferenceEngine,
         "vertex-ai": LiteLLMInferenceEngine,
         "replicate": LiteLLMInferenceEngine,
+        "hf-local": HFAutoModelInferenceEngine,
     }
 
     _provider_param_renaming = {
         "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
         "watsonx-sdk": {"model": "model_name"},
         "rits": {"model": "model_name"},
+        "hf-local": {"model": "model_name"},
     }
 
     def get_return_object(self, **kwargs):