Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/docs/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Related documentation: :ref:`Templates tutorial <adding_template>`, :ref:`Format
Evaluate the impact of different formats and system prompts
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This example demonstrates how different formats and system prompts affect the input provided to a llama3 chat model and evaluate their impact on the obtained scores.
This example demonstrates how different formats and system prompts affect the input provided to a granite chat model and evaluate their impact on the obtained scores.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_different_formats.py>`__

Expand Down
221 changes: 159 additions & 62 deletions examples/evaluate_different_formats.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,165 @@
import json
import time

import pandas as pd
from unitxt.api import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.inference import (
CrossProviderInferenceEngine,
WMLInferenceEngineChat,
WMLInferenceEngineGeneration,
)

print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="rits", temperature=0
)

model = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", max_tokens=32, provider="bam"
print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="watsonx", temperature=0
)
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(
model_name="ibm/granite-3-8b-instruct", max_new_tokens=32, temperature=0
)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(
model_name="ibm/granite-3-8b-instruct", max_tokens=32, temperature=0
)
"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws and more.

For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments the CrossProviderInferenceEngine follows.
"""

card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

df = pd.DataFrame(columns=["format", "system_prompt", "f1_micro", "ci_low", "ci_high"])

for format in [
"formats.llama3_instruct",
"formats.empty",
"formats.llama3_instruct_all_demos_in_one_turn",
]:
for system_prompt in [
"system_prompts.models.llama2",
"system_prompts.empty",

df = pd.DataFrame(
columns=[
"model",
"format",
"system_prompt",
"f1_micro",
"ci_low",
"ci_high",
"duration",
"num_instances",
"type_of_input",
]
)

model_list = [
(cross_provider_watsonx, "cross-provider-watsonx"),
(wml_chat, "wml-chat"),
(wml_gen, "wml-gen"),
]

# This example compares the impact of different formats on a classification dataset
#
# formats.chat_api - creates a list of OpenAI messages, where the instruction appears in the system prompt.
#
# [
# {
# "role": "system",
# "content": "Classify the contractual clauses of the following text to one of these options: Records, Warranties... "
# },
# {
# "role": "user",
# "content": "text: Each Credit Party shall maintain..."
# },
# {
# "role": "assistant",
# "content": "The contractual clauses is Records"
# },
# {
# "role": "user",
# "content": "text: Executive agrees to be employed with the Company...."
# }
# ]
#
# formats.chat_api[place_instruction_in_user_turns=True] - creates a list of OpenAI messages, where the instruction appears in each user turn prompt.
#
# [
# {
# "role": "user",
# "content": "Classify the contractual clauses of the following text to one of these options: ...
# text: Each Credit Party shall maintain...."
# },
# {
# "role": "assistant",
# "content": "The contractual clauses is Records"
# },
# {
# "role": "user",
# "content": "Classify the contractual clauses of the following text to one of these options: ...
# text: Executive agrees to be employed with the Company...
# }
# ]
#
# formats.empty - pass inputs as a single string
#
# "Classify the contractual clauses of the following text to one of these options: Records, Warranties,.
# text: Each Credit Party shall maintain...
# The contractual clauses is Records
#
# text: Executive agrees to be employed with the Company,...
# The contractual clauses is "

for model, model_name in model_list:
print(model_name)
card = "cards.ledgar"
template = "templates.classification.multi_class.instruction"
for format in [
"formats.chat_api[place_instruction_in_user_turns=True]",
"formats.chat_api",
"formats.empty",
]:
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=2,
demos_pool_size=50,
loader_limit=300,
max_test_instances=100,
split="test",
)

predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print(
results.instance_scores.to_df(
columns=[
"source",
"prediction",
]
for system_prompt in [
"system_prompts.empty",
]:
if model_name == "wml-gen" and "formats.chat_api" in format:
continue
if model_name == "wml-chat" and "formats.chat_api" not in format:
continue
dataset = load_dataset(
card=card,
format=format,
system_prompt=system_prompt,
template=template,
num_demos=5,
demos_pool_size=100,
loader_limit=1000,
max_test_instances=128,
split="test",
)
)

global_scores = results.global_scores
df.loc[len(df)] = [
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
]

df = df.round(decimals=2)
print(df.to_markdown())
type_of_input = type(dataset[0]["source"])

print("Starting inference...")
start = time.perf_counter()
predictions = model(dataset)
end = time.perf_counter()
duration = end - start
print("End of inference...")

results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print("Example prompt:")

print(json.dumps(results.instance_scores[0]["source"], indent=4))

print("Example prediction:")

print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

global_scores = results.global_scores
df.loc[len(df)] = [
model_name,
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
duration,
len(predictions),
type_of_input,
]

df = df.round(decimals=2)
print(df.to_markdown())
70 changes: 70 additions & 0 deletions examples/evaluate_granite_thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Compare a Granite chat model's QA accuracy with and without "thinking" mode.

Runs the same single-question arithmetic task twice -- once with the chat
template's ``thinking`` flag enabled and once disabled -- and prints the
per-instance scores for each run.  When thinking is enabled the model wraps
its final answer in ``<response>...</response>`` tags, so an extra regex
post-processor extracts that span before scoring.
"""

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Single question/answer pair used as the test split.
# (32 - 5) + 2 * (32 - 5) = 81
test_set = [
    {
        "question": "If I had 32 apples, I lost 5 apples, and gain twice more as many as I have. How many do I have at the end",
        "answer": "81",
    },
]


# Define the QA task: free-text question in, string answer out,
# scored by exact-match accuracy.
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)


for thinking in [True, False]:
    # Always lowercase predictions; when thinking is on, first pull the
    # final answer out of the <response>...</response> wrapper the model emits.
    postprocessors = ["processors.lower_case"]
    if thinking:
        postprocessors.append(
            PostProcess(
                # Fix: the closing tag previously lacked its final '>'
                # ("</response"), which under-anchored the match.
                ExtractWithRegex(regex="<response>(.*)</response>"),
                # Only predictions need extraction -- references are plain answers.
                process_references=False,
            )
        )

    # Simple template that formats the question and expects a bare answer.
    template = InputOutputTemplate(
        instruction="Answer the following question with the single numeric answer. Do not answer in complete sentences. Just return the answer.",
        input_format="{question}",
        output_format="{answer}",
        postprocessors=postprocessors,
    )
    dataset = create_dataset(
        task=task,
        test_set=test_set,
        template=template,
        split="test",
        # HFSystemFormat renders the prompt with the model's own HF chat
        # template, so the `thinking` kwarg is honored by the tokenizer.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
    )

    # Cache disabled so both thinking settings trigger fresh generations.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", use_cache=False
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)
    print(results.instance_scores)
58 changes: 58 additions & 0 deletions examples/evaluate_granite_thinking_mmlu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Compare Granite "thinking" mode on MMLU (abstract algebra) multiple choice.

Evaluates the same multiple-choice template with the chat template's
``thinking`` flag on and off.  With thinking enabled the model wraps its
final answer in ``<response>...</response>`` tags, so an extra regex
post-processor extracts that span before reducing the prediction to a
single option letter.  Predictions that fail to parse to A/B/C/D are
printed for inspection, followed by the global scores for each run.
"""

from unitxt.api import evaluate, load_dataset
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.templates import MultipleChoiceTemplate

for thinking in [True, False]:
    # Reduce the raw prediction to its first character (the option letter).
    postprocessors = ["processors.first_character"]
    if thinking:
        # With thinking on, first extract the final answer from the
        # <response>...</response> wrapper, then take its first character.
        postprocessors = [
            PostProcess(
                # Fix: the closing tag previously lacked its final '>'
                # ("</response"), which under-anchored the match.
                ExtractWithRegex(regex="<response>(.*)</response>"),
                process_references=False,
            ),
            "processors.first_character",
        ]

    template = MultipleChoiceTemplate(
        input_format="""The following are multiple choice questions (with answers) about {topic}.
{question}
Answers:
{choices}
The response should be returned as a single letter: A, B, C, or D. Do not answer in sentences. Only return the single letter answer.""",
        target_field="answer",
        choices_separator="\n",
        postprocessors=postprocessors,
    )
    dataset = load_dataset(
        card="cards.mmlu.abstract_algebra",
        template=template,
        split="test",
        # HFSystemFormat renders the prompt with the model's own HF chat
        # template, so the `thinking` kwarg is honored by the tokenizer.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
        loader_limit=100,
    )

    # Greedy decoding (temperature=0) for reproducible letter answers.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", temperature=0
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)

    # Surface raw predictions that did not post-process into a valid option.
    for instance in results.instance_scores:
        if instance["processed_prediction"] not in ["A", "B", "C", "D"]:
            print(
                # Fix: message previously read "a acceptable".
                "Problematic prediction (could not be parsed to an acceptable single letter answer):"
            )
            print(instance["prediction"])

    print("Global Results when Thinking=", thinking)
    print(results.global_scores.summary)
26 changes: 26 additions & 0 deletions examples/inference_using_cross_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Smoke-test CrossProviderInferenceEngine across several providers.

Sends one pre-rendered chat turn through each provider and prints the
source/prediction pair returned.
"""

from unitxt.inference import CrossProviderInferenceEngine
from unitxt.text_utils import print_dict

if __name__ == "__main__":
    providers = ["watsonx", "rits", "watsonx-sdk"]
    for provider_name in providers:
        print()
        print("------------------------------------------------ ")
        print("PROVIDER:", provider_name)

        # Greedy decoding so outputs are comparable across providers.
        engine = CrossProviderInferenceEngine(
            model="granite-3-3-8b-instruct", provider=provider_name, temperature=0
        )

        # Minimal pre-rendered dataset: a single chat turn, marked public.
        instances = [
            {
                "source": [{"content": "Hello, how are you?", "role": "user"}],
                "data_classification_policy": ["public"],
            }
        ]

        # Run inference and show each instance next to its prediction.
        outputs = engine(instances)
        for instance, output in zip(instances, outputs):
            record = dict(instance, prediction=output)

            print_dict(record, keys_to_print=["source", "prediction"])
Loading
Loading