Skip to content

Commit b55532f

Browse files
yoavkatz, pawelknes, elronbandel
authored
Add option to store template instruction in user role and not system role and added granite thinking example (#1667)
* support for asynchronous requests in wml chat Signed-off-by: Paweł Knes <pawel.knes@ibm.com> * Test branch to evaluate impact of different format * Avoid removal of needed import Signed-off-by: Yoav Katz <katz@il.ibm.com> * Made inference_using_ibm_watsonx_ai work with env variables out of the box Signed-off-by: Yoav Katz <katz@il.ibm.com> * Renamed repeat_instruction_per_turn to place_instruction_in_user_turns to highlight where instructions are placed. Signed-off-by: Yoav Katz <katz@il.ibm.com> * Added example of granite thinking Signed-off-by: Yoav Katz <katz@il.ibm.com> * Added documentation for 'place_instruction_in_user_turns' Removed 'add_target_prefix' which is not related. Improved example. Signed-off-by: Yoav Katz <katz@il.ibm.com> * Added granite thinking with MMLU Signed-off-by: Yoav Katz <katz@il.ibm.com> * Updated prints Signed-off-by: Yoav Katz <katz@il.ibm.com> * Added example of inference with cross provider without load_dataset Signed-off-by: Yoav Katz <katz@il.ibm.com> * Improved example. Signed-off-by: Yoav Katz <katz@il.ibm.com> * Updated multi format example Signed-off-by: Yoav Katz <katz@il.ibm.com> * Simplfied doc Signed-off-by: Yoav Katz <katz@il.ibm.com> --------- Signed-off-by: Paweł Knes <pawel.knes@ibm.com> Signed-off-by: Yoav Katz <katz@il.ibm.com> Co-authored-by: Paweł Knes <pawel.knes@ibm.com> Co-authored-by: Elron Bandel <elronbandel@gmail.com>
1 parent 9895a3d commit b55532f

File tree

6 files changed

+345
-69
lines changed

6 files changed

+345
-69
lines changed

docs/docs/examples.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ Related documentation: :ref:`Templates tutorial <adding_template>`, :ref:`Format
134134
Evaluate the impact of different formats and system prompts
135135
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
136136

137-
This example demonstrates how different formats and system prompts affect the input provided to a llama3 chat model and evaluate their impact on the obtained scores.
137+
This example demonstrates how different formats and system prompts affect the input provided to a granite chat model and evaluate their impact on the obtained scores.
138138

139139
`Example code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_different_formats.py>`__
140140

"""Compare the impact of different formats and system prompts on scores.

Runs the same classification card (LEDGAR contractual-clause
classification) through several inference engines — a cross-provider
watsonx engine, a WML chat engine and a WML generation engine — with
three formats, and collects the f1 score, its confidence interval, the
inference duration and the type of the produced prompt ("source") into a
pandas DataFrame that is printed as a markdown table at the end.
"""

import json
import time

import pandas as pd
from unitxt.api import evaluate, load_dataset
from unitxt.inference import (
    CrossProviderInferenceEngine,
    WMLInferenceEngineChat,
    WMLInferenceEngineGeneration,
)

# NOTE(review): cross_provider_rits is constructed but never added to
# model_list below — presumably kept only as a construction example;
# confirm this is intentional.
print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(
    model="granite-3-8b-instruct", max_tokens=32, provider="rits", temperature=0
)

print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(
    model="granite-3-8b-instruct", max_tokens=32, provider="watsonx", temperature=0
)
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(
    model_name="ibm/granite-3-8b-instruct", max_new_tokens=32, temperature=0
)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(
    model_name="ibm/granite-3-8b-instruct", max_tokens=32, temperature=0
)

# One result row per (model, format, system_prompt) combination.
df = pd.DataFrame(
    columns=[
        "model",
        "format",
        "system_prompt",
        "f1_micro",
        "ci_low",
        "ci_high",
        "duration",
        "num_instances",
        "type_of_input",
    ]
)

model_list = [
    (cross_provider_watsonx, "cross-provider-watsonx"),
    (wml_chat, "wml-chat"),
    (wml_gen, "wml-gen"),
]

# This example compares the impact of different formats on a classification dataset
#
# formats.chat_api - creates a list of OpenAI messages, where the instruction appears in the system prompt.
#
# [
#   {
#       "role": "system",
#       "content": "Classify the contractual clauses of the following text to one of these options: Records, Warranties... "
#   },
#   {
#       "role": "user",
#       "content": "text: Each Credit Party shall maintain..."
#   },
#   {
#       "role": "assistant",
#       "content": "The contractual clauses is Records"
#   },
#   {
#       "role": "user",
#       "content": "text: Executive agrees to be employed with the Company...."
#   }
# ]
#
# formats.chat_api[place_instruction_in_user_turns=True] - creates a list of OpenAI messages, where the instruction appears in each user turn prompt.
#
# [
#   {
#       "role": "user",
#       "content": "Classify the contractual clauses of the following text to one of these options: ...
#                   text: Each Credit Party shall maintain...."
#   },
#   {
#       "role": "assistant",
#       "content": "The contractual clauses is Records"
#   },
#   {
#       "role": "user",
#       "content": "Classify the contractual clauses of the following text to one of these options: ...
#                   text: Executive agrees to be employed with the Company...
#   }
# ]
#
# formats.empty - pass inputs as a single string
#
# "Classify the contractual clauses of the following text to one of these options: Records, Warranties,.
#  text: Each Credit Party shall maintain...
#  The contractual clauses is Records
#
#  text: Executive agrees to be employed with the Company,...
#  The contractual clauses is "

for model, model_name in model_list:
    print(model_name)
    card = "cards.ledgar"
    template = "templates.classification.multi_class.instruction"
    # `format_name` (not `format`) to avoid shadowing the builtin format().
    for format_name in [
        "formats.chat_api[place_instruction_in_user_turns=True]",
        "formats.chat_api",
        "formats.empty",
    ]:
        for system_prompt in [
            "system_prompts.empty",
        ]:
            # The WML generation engine accepts only plain-string prompts,
            # and the WML chat engine accepts only chat-message prompts, so
            # skip the incompatible (engine, format) combinations.
            if model_name == "wml-gen" and "formats.chat_api" in format_name:
                continue
            if model_name == "wml-chat" and "formats.chat_api" not in format_name:
                continue
            dataset = load_dataset(
                card=card,
                format=format_name,
                system_prompt=system_prompt,
                template=template,
                num_demos=5,
                demos_pool_size=100,
                loader_limit=1000,
                max_test_instances=128,
                split="test",
            )
            # str for formats.empty, list of chat messages for chat formats.
            type_of_input = type(dataset[0]["source"])

            print("Starting inference...")
            start = time.perf_counter()
            predictions = model(dataset)
            end = time.perf_counter()
            duration = end - start
            print("End of inference...")

            results = evaluate(predictions=predictions, data=dataset)

            print(
                f"Sample input and output for format '{format_name}' and system prompt '{system_prompt}':"
            )

            print("Example prompt:")

            print(json.dumps(results.instance_scores[0]["source"], indent=4))

            print("Example prediction:")

            print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

            global_scores = results.global_scores
            df.loc[len(df)] = [
                model_name,
                format_name,
                system_prompt,
                global_scores["score"],
                global_scores["score_ci_low"],
                global_scores["score_ci_high"],
                duration,
                len(predictions),
                type_of_input,
            ]

df = df.round(decimals=2)
print(df.to_markdown())
"""Evaluate a QA task with and without granite "thinking" mode.

Builds a one-question arithmetic QA dataset, formats it with
HFSystemFormat for granite-3.3 (toggling the model's `thinking`
chat-template flag), runs inference via a cross-provider engine, and
prints per-instance results for both settings.
"""

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Set up question answer pairs in a dictionary
test_set = [
    {
        "question": "If I had 32 apples, I lost 5 apples, and gain twice more as many as I have. How many do I have at the end",
        "answer": "81",
    },
]


# define the QA task
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)

# The engine configuration does not depend on the loop variable, so it is
# created once. Caching is disabled so both runs hit the live model.
model = CrossProviderInferenceEngine(
    model="granite-3-3-8b-instruct", provider="rits", use_cache=False
)

# Create a simple template that formats the input.
# Add lowercase normalization as a post processor.

for thinking in [True, False]:
    postprocessors = ["processors.lower_case"]
    if thinking:
        # In thinking mode the final answer is wrapped in <response> tags,
        # so extract it before scoring. Fixed the truncated closing tag
        # (was "</response" without the ">").
        postprocessors.append(
            PostProcess(
                ExtractWithRegex(regex=r"<response>(.*)</response>"),
                process_references=False,
            )
        )

    template = InputOutputTemplate(
        instruction="Answer the following question with the single numeric answer. Do not answer in complete sentences. Just return the answer.",
        input_format="{question}",
        output_format="{answer}",
        postprocessors=postprocessors,
    )
    dataset = create_dataset(
        task=task,
        test_set=test_set,
        template=template,
        split="test",
        # place_instruction_in_user_turns=True puts the template instruction
        # in each user turn instead of the system role.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)
    print(results.instance_scores)
"""Evaluate MMLU (abstract algebra) with and without granite "thinking".

Loads the MMLU abstract-algebra card with a multiple-choice template,
formats it for granite-3.3 while toggling the `thinking` chat-template
flag, runs inference, reports predictions that could not be parsed to a
single A-D letter, and prints global scores for both settings.
"""

from unitxt.api import evaluate, load_dataset
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.templates import MultipleChoiceTemplate

for thinking in [True, False]:
    postprocessors = ["processors.first_character"]
    if thinking:
        # In thinking mode the final answer is wrapped in <response> tags;
        # extract it first, then take the first character as the letter.
        # Fixed the truncated closing tag (was "</response" without ">").
        postprocessors = [
            PostProcess(
                ExtractWithRegex(regex=r"<response>(.*)</response>"),
                process_references=False,
            ),
            "processors.first_character",
        ]

    template = MultipleChoiceTemplate(
        input_format="""The following are multiple choice questions (with answers) about {topic}.
{question}
Answers:
{choices}
The response should be returned as a single letter: A, B, C, or D. Do not answer in sentences. Only return the single letter answer.""",
        target_field="answer",
        choices_separator="\n",
        postprocessors=postprocessors,
    )
    dataset = load_dataset(
        card="cards.mmlu.abstract_algebra",
        template=template,
        split="test",
        # place_instruction_in_user_turns=True puts the template instruction
        # in each user turn instead of the system role.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
        loader_limit=100,
    )

    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", temperature=0
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)

    # Surface raw predictions whose post-processed value is not a valid
    # single-letter answer, to diagnose formatting failures.
    for instance in results.instance_scores:
        if instance["processed_prediction"] not in ["A", "B", "C", "D"]:
            print(
                "Problematic prediction (could not be parsed to a acceptable single letter answer):"
            )
            print(instance["prediction"])

    print("Global Results when Thinking=", thinking)
    print(results.global_scores.summary)
"""Run a cross-provider inference engine directly, without load_dataset.

Demonstrates feeding a hand-built OpenAI-style chat instance straight
into CrossProviderInferenceEngine for several providers and printing
each prediction alongside its source.
"""

from unitxt.inference import CrossProviderInferenceEngine
from unitxt.text_utils import print_dict

if __name__ == "__main__":
    for provider in ["watsonx", "rits", "watsonx-sdk"]:
        print()
        print("------------------------------------------------ ")
        print("PROVIDER:", provider)

        engine = CrossProviderInferenceEngine(
            model="granite-3-3-8b-instruct", provider=provider, temperature=0
        )

        # A single ready-made instance: one user chat turn plus the
        # data-classification metadata the engine requires.
        instances = [
            {
                "source": [{"content": "Hello, how are you?", "role": "user"}],
                "data_classification_policy": ["public"],
            }
        ]

        # Run inference and print each (input, prediction) pair.
        outputs = engine(instances)
        for instance, output in zip(instances, outputs):
            merged = dict(instance, prediction=output)

            print_dict(merged, keys_to_print=["source", "prediction"])

0 commit comments

Comments
 (0)