@@ -13,6 +13,7 @@
 import uuid
 from collections import Counter
 from datetime import datetime
+from itertools import islice
 from multiprocessing.pool import ThreadPool
 from typing import (
     Any,
@@ -55,6 +56,11 @@
 logger = get_logger()
 
 
+def batched(lst, n):
+    it = iter(lst)
+    while batch := list(islice(it, n)):
+        yield batch
+
 class StandardAPIParamsMixin(Artifact):
     model: str
     frequency_penalty: Optional[float] = None
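A quick illustration (not part of the diff) of what the new batched() helper does: it lazily chunks any iterable via islice and the walrus operator, which is what lets infer() below drop the eager Dataset.to_list() conversion and the manual slicing.

# Illustrative only: expected behavior of the batched() helper above.
for batch in batched(range(7), 3):
    print(batch)  # [0, 1, 2], then [3, 4, 5], then [6]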
@@ -227,12 +233,8 @@ def infer(
             result = self._mock_infer(dataset)
         else:
             if self.use_cache:
-                if isinstance(dataset, Dataset):
-                    dataset = dataset.to_list()
-                dataset_batches = [dataset[i:i + self.cache_batch_size]
-                                   for i in range(0, len(dataset), self.cache_batch_size)]
                 result = []
-                for batch_num, batch in enumerate(dataset_batches):
+                for batch_num, batch in enumerate(batched(dataset, self.cache_batch_size)):
                     cached_results = []
                     missing_examples = []
                     for i, item in enumerate(batch):
@@ -243,16 +245,19 @@ def infer(
                         else:
                             missing_examples.append((i, item))  # each element is (index in batch, example)
                     # infer on the missing examples only, without their indices
-                    logger.info(f"Inferring batch {batch_num} / {len(dataset_batches)}")
-                    inferred_results = self._infer([e[1] for e in missing_examples], return_meta_data)
-                    # recombine into (index, prediction) pairs
-                    inferred_results = list(zip([e[0] for e in missing_examples], inferred_results))
-                    # add the missing examples to the cache
-                    for (_, item), (_, prediction) in zip(missing_examples, inferred_results):
-                        if prediction is None:
-                            continue
-                        cache_key = self._get_cache_key(item)
-                        self._cache[cache_key] = prediction
+                    logger.info(f"Inferring batch {batch_num} / {len(dataset) // self.cache_batch_size}")
+                    if len(missing_examples) > 0:
+                        inferred_results = self._infer([e[1] for e in missing_examples], return_meta_data)
+                        # recombine into (index, prediction) pairs
+                        inferred_results = list(zip([e[0] for e in missing_examples], inferred_results))
+                        # add the missing examples to the cache
+                        for (_, item), (_, prediction) in zip(missing_examples, inferred_results):
+                            if prediction is None:
+                                continue
+                            cache_key = self._get_cache_key(item)
+                            self._cache[cache_key] = prediction
+                    else:
+                        inferred_results = []
 
                     # Combine cached and inferred results in original order
                     batch_predictions = [p[1] for p in sorted(cached_results + inferred_results)]
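Both cached_results and inferred_results hold (index-in-batch, prediction) tuples, so a single sort restores the original example order before the indices are stripped. A minimal sketch of that invariant, with made-up values:

# Made-up values, illustrating the merge step above: cache hits and fresh
# inferences each remember their position in the batch, so sorting on the
# index restores the original order.
cached_results = [(0, "cat"), (2, "dog")]     # found in the cache
inferred_results = [(1, "fox"), (3, "owl")]   # just inferred
batch_predictions = [p[1] for p in sorted(cached_results + inferred_results)]
assert batch_predictions == ["cat", "fox", "dog", "owl"]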
@@ -1798,6 +1803,10 @@ class RITSInferenceEngine(
     label: str = "rits"
     data_classification_policy = ["public", "proprietary"]
 
+    model_names_dict = {
+        "microsoft/phi-4": "microsoft-phi-4"
+    }
+
     def get_default_headers(self):
         return {"RITS_API_KEY": self.credentials["api_key"]}
 
@@ -1818,8 +1827,10 @@ def get_base_url_from_model_name(model_name: str):
             RITSInferenceEngine._get_model_name_for_endpoint(model_name)
         )
 
-    @staticmethod
-    def _get_model_name_for_endpoint(model_name: str):
+    @classmethod
+    def _get_model_name_for_endpoint(cls, model_name: str):
+        if model_name in cls.model_names_dict:
+            return cls.model_names_dict[model_name]
         return (
             model_name.split("/")[-1]
             .lower()
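With the switch to a classmethod, explicit overrides in model_names_dict take precedence over the generic normalization (whose remaining steps are truncated in this diff). An illustrative call, assuming the mapping added above:

# Illustrative, assuming the model_names_dict entry added above; models
# without an override fall through to the generic split/lowercase chain.
RITSInferenceEngine._get_model_name_for_endpoint("microsoft/phi-4")
# -> "microsoft-phi-4"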
@@ -2959,15 +2970,12 @@ def prepare_engine(self):
             capacity=self.max_requests_per_second,
         )
         self.inference_type = "litellm"
-        import litellm
         from litellm import acompletion
-        from litellm.caching.caching import Cache
 
-        litellm.cache = Cache(type="disk")
 
         self._completion = acompletion
         # Initialize a semaphore to limit concurrency
-        self._semaphore = asyncio.Semaphore(self.max_requests_per_second)
+        self._semaphore = asyncio.Semaphore(round(self.max_requests_per_second))
 
     async def _infer_instance(
         self, index: int, instance: Dict[str, Any]
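asyncio.Semaphore expects an integer count, so a fractional max_requests_per_second now gets rounded before it is used as a concurrency cap. A minimal sketch of the pattern (illustrative, not from the PR):

import asyncio

# Illustrative: a float rate limit such as 2.6 becomes 3 semaphore slots.
max_requests_per_second = 2.6
semaphore = asyncio.Semaphore(round(max_requests_per_second))

async def call_with_limit(coro):
    # At most round(max_requests_per_second) coroutines enter at once.
    async with semaphore:
        return await coro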