Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 121 additions & 62 deletions examples/evaluate_different_formats.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,127 @@
import json
import time

import pandas as pd
from lh_eval_api import LakeHouseLoader
from unitxt.api import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.inference import (
CrossProviderInferenceEngine,
WMLInferenceEngineChat,
WMLInferenceEngineGeneration,
)

x = LakeHouseLoader # To avoid warnings, of unused imports.
print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="rits"
)

model = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", max_tokens=32, provider="bam"
print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="watsonx"
)
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(
model_name="ibm/granite-3-8b-instruct", max_new_tokens=32
)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(
model_name="ibm/granite-3-8b-instruct", max_tokens=32, top_logprobs=None
)
"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws, and more.

For the arguments these inference engines can receive, please refer to the class documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""

card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

df = pd.DataFrame(columns=["format", "system_prompt", "f1_micro", "ci_low", "ci_high"])

for format in [
"formats.llama3_instruct",
"formats.empty",
"formats.llama3_instruct_all_demos_in_one_turn",
]:
for system_prompt in [
"system_prompts.models.llama2",
"system_prompts.empty",

# wml_chat = WMLInferenceEngineChat(
# model_name="ibm/granite-vision-3-2-2b",max_tokens=32,top_logprobs=None
# )
# wml_gen= WMLInferenceEngineGeneration(model_name="ibm/granite-vision-3-2-2b",max_new_tokens=32)

df = pd.DataFrame(
columns=[
"model",
"format",
"system_prompt",
"f1_micro",
"ci_low",
"ci_high",
"duration",
"num_instances",
"type_of_input",
]
)

# model_list = [(cross_provider_rits,"cross_provider_rits"),(wml_chat,"wml-chat"),(cross_provider_watsonx, "cross-provider-watsonx")]
# model_list = [(cross_provider_watsonx, "cross-provider-watsonx"),(cross_provider_rits,"cross_provider_rits")]
model_list = [
(cross_provider_watsonx, "cross-provider-watsonx"),
(wml_chat, "wml-chat"),
(wml_gen, "wml-gen"),
]
# model_list = [(cross_provider_rits,"cross_provider_rits")]
for model, model_name in model_list:
print(model_name)
card = "cards.cat"
template = "templates.classification.multi_label.instruct_question_select"

for format in [
"formats.chat_api[place_instruction_in_user_turns=True,add_target_prefix=False]",
"formats.chat_api[place_instruction_in_user_turns=True]",
"formats.granite_instruct_custom",
"formats.chat_api",
# "formats.empty",
]:
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=2,
demos_pool_size=50,
loader_limit=300,
max_test_instances=100,
split="test",
)

predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print(
results.instance_scores.to_df(
columns=[
"source",
"prediction",
]
for system_prompt in [
"system_prompts.models.granite_instruct_classify",
# "system_prompts.empty",
]:
if model_name == "wml-gen" and "formats.chat_api" in format:
continue
if model_name == "wml-chat" and "formats.chat_api" not in format:
continue
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=5,
demos_pool_size=100,
loader_limit=1000,
max_test_instances=128,
split="test",
)
)

global_scores = results.global_scores
df.loc[len(df)] = [
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
]

df = df.round(decimals=2)
print(df.to_markdown())
type_of_input = type(dataset[0]["source"])

print("Starting inference...")
start = time.perf_counter()
predictions = model(dataset)
end = time.perf_counter()
duration = end - start
print("End of inference...")

results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print("Example prompt:")

print(json.dumps(results.instance_scores[0]["source"], indent=4))

print("Example prediction:")

print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

global_scores = results.global_scores
df.loc[len(df)] = [
model_name,
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
duration,
len(predictions),
type_of_input,
]

df = df.round(decimals=2)
print(df.to_markdown())
70 changes: 70 additions & 0 deletions examples/evaluate_granite_thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Compare Granite model answers with and without 'thinking' mode enabled.

Runs the same single-question QA task twice through a Granite chat template:
once with ``thinking=True`` (where the final answer is wrapped in
``<response>...</response>`` tags and must be extracted) and once with
``thinking=False``, then prints the per-instance scores for each run.
"""

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Single arithmetic question: 32 - 5 = 27 apples, then gaining twice as many
# adds 54, giving 81.
test_set = [
    {
        "question": "If I had 32 apples, I lost 5 apples, and gain twice more as many as I have. How many do I have at the end",
        "answer": "81",
    },
]


# Define the QA task: a free-text question in, a string answer out,
# scored with exact-match accuracy.
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)


# Create a simple template that formats the input.
# Add lowercase normalization as a post processor.


for thinking in [True, False]:
    postprocessors = ["processors.lower_case"]
    if thinking:
        # In thinking mode the model emits its reasoning followed by the final
        # answer wrapped in <response>...</response>; pull out only the answer.
        # NOTE: fixed the end-tag pattern — it previously read "</response"
        # (missing the closing ">"), which would also anchor on e.g.
        # "</responseful". References are kept as-is since they are plain answers.
        postprocessors.append(
            PostProcess(
                ExtractWithRegex(regex="<response>(.*)</response>"),
                process_references=False,
            )
        )

    template = InputOutputTemplate(
        instruction="Answer the following question with the single numeric answer.",
        input_format="{question}",
        output_format="{answer}",
        postprocessors=postprocessors,
    )
    # HFSystemFormat renders the prompt with the model's own chat template;
    # chat_kwargs_dict forwards the "thinking" switch to apply_chat_template.
    dataset = create_dataset(
        task=task,
        test_set=test_set,
        template=template,
        split="test",
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
    )

    # use_cache=False so both runs actually hit the model rather than
    # returning a cached completion from a previous (possibly non-thinking) run.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", use_cache=False
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)
    print(results.instance_scores)
31 changes: 25 additions & 6 deletions src/unitxt/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,9 @@ class ChatAPIFormat(BaseFormat):
The resulting `messages` is now a dictionary ready for sending to the OpenAI API.
"""

place_instruction_in_user_turns: bool = False
add_target_prefix: bool = True

def to_content(self, text: str, media: Dict[str, Any]) -> Union[str, List[Content]]:
# Regular expression to find <img> tags with src attribute
img_tag_pattern = re.compile(
Expand Down Expand Up @@ -422,9 +425,11 @@ def to_chat(
) -> List[Message]:
messages = []

if system_prompt or instruction:
if system_prompt or (instruction and not self.place_instruction_in_user_turns):
system_content = self.to_content(
system_prompt + ("\n" if system_prompt != "" else "") + instruction,
system_prompt
+ ("\n" if system_prompt != "" else "")
+ (instruction if not self.place_instruction_in_user_turns else ""),
media,
)
messages.append(
Expand All @@ -435,9 +440,15 @@ def to_chat(
)

for demo_instance in demos:
user_content = self.to_content(demo_instance["source"], media)
text = demo_instance["source"]
if instruction and self.place_instruction_in_user_turns:
text = f"{instruction}\n{text}"

user_content = self.to_content(text, media)
assistant_content = self.to_content(
target_prefix + demo_instance["target"], media
(target_prefix if self.add_target_prefix else "")
+ demo_instance["target"],
media,
)
messages.extend(
[
Expand All @@ -449,7 +460,11 @@ def to_chat(
]
)

last_user_content = self.to_content(source, media)
text = source
if instruction and self.place_instruction_in_user_turns:
text = f"{instruction}\n{text}"

last_user_content = self.to_content(text, media)

messages.extend([{"role": "user", "content": last_user_content}])

Expand Down Expand Up @@ -492,6 +507,7 @@ class HFSystemFormat(ChatAPIFormat):
"""

model_name: str
chat_kwargs_dict: Dict[str, str] = {}
_requirements_list = ["transformers", "Jinja2"]

@retry_connection_with_exponential_backoff(backoff_factor=2)
Expand All @@ -515,7 +531,10 @@ def _format_instance_to_source(
)
return (
self.tokenizer.apply_chat_template(
chat, tokenize=False, add_generation_prompt=True
chat,
tokenize=False,
add_generation_prompt=True,
**self.chat_kwargs_dict,
)
+ target_prefix
)
Loading