Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/docs/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ Related documentation: :ref:`Templates tutorial <adding_template>`, :ref:`Format
Evaluate the impact of different formats and system prompts
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

This example demonstrates how different formats and system prompts affect the input provided to a llama3 chat model and evaluate their impact on the obtained scores.
This example demonstrates how different formats and system prompts affect the input provided to a granite chat model and evaluate their impact on the obtained scores.

`Example code <https://github.com/IBM/unitxt/blob/main/examples/evaluate_different_formats.py>`__

Expand Down
221 changes: 159 additions & 62 deletions examples/evaluate_different_formats.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,165 @@
import json
import time

import pandas as pd
from unitxt.api import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.inference import (
CrossProviderInferenceEngine,
WMLInferenceEngineChat,
WMLInferenceEngineGeneration,
)

print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="rits", temperature=0
)

model = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", max_tokens=32, provider="bam"
print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="watsonx", temperature=0
)
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(
model_name="ibm/granite-3-8b-instruct", max_new_tokens=32, temperature=0
)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(
model_name="ibm/granite-3-8b-instruct", max_tokens=32, temperature=0
)
"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws and more.

For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments the CrossProviderInferenceEngine follows.
"""

card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

df = pd.DataFrame(columns=["format", "system_prompt", "f1_micro", "ci_low", "ci_high"])

for format in [
"formats.llama3_instruct",
"formats.empty",
"formats.llama3_instruct_all_demos_in_one_turn",
]:
for system_prompt in [
"system_prompts.models.llama2",
"system_prompts.empty",

df = pd.DataFrame(
columns=[
"model",
"format",
"system_prompt",
"f1_micro",
"ci_low",
"ci_high",
"duration",
"num_instances",
"type_of_input",
]
)

model_list = [
(cross_provider_watsonx, "cross-provider-watsonx"),
(wml_chat, "wml-chat"),
(wml_gen, "wml-gen"),
]

# This example compares the impact of different formats on a classification dataset
#
# formats.chat_api - creates a list of OpenAI messages, where the instruction appears in the system prompt.
#
# [
# {
# "role": "system",
# "content": "Classify the contractual clauses of the following text to one of these options: Records, Warranties... "
# },
# {
# "role": "user",
# "content": "text: Each Credit Party shall maintain..."
# },
# {
# "role": "assistant",
# "content": "The contractual clauses is Records"
# },
# {
# "role": "user",
# "content": "text: Executive agrees to be employed with the Company...."
# }
# ]
#
# formats.chat_api[place_instruction_in_user_turns=True] - creates a list of OpenAI messages, where the instruction appears in each user turn prompt.
#
# [
# {
# "role": "user",
# "content": "Classify the contractual clauses of the following text to one of these options: ...
# text: Each Credit Party shall maintain...."
# },
# {
# "role": "assistant",
# "content": "The contractual clauses is Records"
# },
# {
# "role": "user",
# "content": "Classify the contractual clauses of the following text to one of these options: ...
# text: Executive agrees to be employed with the Company...
# }
# ]
#
# formats.empty - pass inputs as a single string
#
# "Classify the contractual clauses of the following text to one of these options: Records, Warranties,.
# text: Each Credit Party shall maintain...
# The contractual clauses is Records
#
# text: Executive agrees to be employed with the Company,...
# The contractual clauses is "

for model, model_name in model_list:
print(model_name)
card = "cards.ledgar"
template = "templates.classification.multi_class.instruction"
for format in [
"formats.chat_api[place_instruction_in_user_turns=True]",
"formats.chat_api",
"formats.empty",
]:
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=2,
demos_pool_size=50,
loader_limit=300,
max_test_instances=100,
split="test",
)

predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print(
results.instance_scores.to_df(
columns=[
"source",
"prediction",
]
for system_prompt in [
"system_prompts.empty",
]:
if model_name == "wml-gen" and "formats.chat_api" in format:
continue
if model_name == "wml-chat" and "formats.chat_api" not in format:
continue
dataset = load_dataset(
card=card,
format=format,
system_prompt=system_prompt,
template=template,
num_demos=5,
demos_pool_size=100,
loader_limit=1000,
max_test_instances=128,
split="test",
)
)

global_scores = results.global_scores
df.loc[len(df)] = [
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
]

df = df.round(decimals=2)
print(df.to_markdown())
type_of_input = type(dataset[0]["source"])

print("Starting inference...")
start = time.perf_counter()
predictions = model(dataset)
end = time.perf_counter()
duration = end - start
print("End of inference...")

results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print("Example prompt:")

print(json.dumps(results.instance_scores[0]["source"], indent=4))

print("Example prediction:")

print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

global_scores = results.global_scores
df.loc[len(df)] = [
model_name,
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
duration,
len(predictions),
type_of_input,
]

df = df.round(decimals=2)
print(df.to_markdown())
70 changes: 70 additions & 0 deletions examples/evaluate_granite_thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Compare a Granite chat model's QA accuracy with and without "thinking" mode.

Runs the same single-question arithmetic task twice -- once with the chat
template's ``thinking`` flag enabled and once disabled -- and prints the
per-instance scores for each run.  When thinking is enabled the model wraps
its final answer in ``<response>...</response>`` tags, so an extra regex
post-processor extracts that span before scoring.
"""

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Single question/answer pair used as the test split.
# (32 - 5) + 2 * (32 - 5) = 81
test_set = [
    {
        "question": "If I had 32 apples, I lost 5 apples, and gain twice more as many as I have. How many do I have at the end",
        "answer": "81",
    },
]


# Define the QA task: free-text question in, string answer out,
# scored by exact-match accuracy.
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)


for thinking in [True, False]:
    # Always lowercase predictions; when thinking is on, first pull the
    # final answer out of the <response>...</response> wrapper the model emits.
    postprocessors = ["processors.lower_case"]
    if thinking:
        postprocessors.append(
            PostProcess(
                # Fix: the closing tag previously lacked its final '>'
                # ("</response"), which under-anchored the match.
                ExtractWithRegex(regex="<response>(.*)</response>"),
                # Only predictions need extraction -- references are plain answers.
                process_references=False,
            )
        )

    # Simple template that formats the question and expects a bare answer.
    template = InputOutputTemplate(
        instruction="Answer the following question with the single numeric answer. Do not answer in complete sentences. Just return the answer.",
        input_format="{question}",
        output_format="{answer}",
        postprocessors=postprocessors,
    )
    dataset = create_dataset(
        task=task,
        test_set=test_set,
        template=template,
        split="test",
        # HFSystemFormat renders the prompt with the model's own HF chat
        # template, so the `thinking` kwarg is honored by the tokenizer.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
    )

    # Cache disabled so both thinking settings trigger fresh generations.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", use_cache=False
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)
    print(results.instance_scores)
58 changes: 58 additions & 0 deletions examples/evaluate_granite_thinking_mmlu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Compare Granite "thinking" mode on MMLU (abstract algebra) multiple choice.

Evaluates the same multiple-choice template with the chat template's
``thinking`` flag on and off.  With thinking enabled the model wraps its
final answer in ``<response>...</response>`` tags, so an extra regex
post-processor extracts that span before reducing the prediction to a
single option letter.  Predictions that fail to parse to A/B/C/D are
printed for inspection, followed by the global scores for each run.
"""

from unitxt.api import evaluate, load_dataset
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.templates import MultipleChoiceTemplate

for thinking in [True, False]:
    # Reduce the raw prediction to its first character (the option letter).
    postprocessors = ["processors.first_character"]
    if thinking:
        # With thinking on, first extract the final answer from the
        # <response>...</response> wrapper, then take its first character.
        postprocessors = [
            PostProcess(
                # Fix: the closing tag previously lacked its final '>'
                # ("</response"), which under-anchored the match.
                ExtractWithRegex(regex="<response>(.*)</response>"),
                process_references=False,
            ),
            "processors.first_character",
        ]

    template = MultipleChoiceTemplate(
        input_format="""The following are multiple choice questions (with answers) about {topic}.
{question}
Answers:
{choices}
The response should be returned as a single letter: A, B, C, or D. Do not answer in sentences. Only return the single letter answer.""",
        target_field="answer",
        choices_separator="\n",
        postprocessors=postprocessors,
    )
    dataset = load_dataset(
        card="cards.mmlu.abstract_algebra",
        template=template,
        split="test",
        # HFSystemFormat renders the prompt with the model's own HF chat
        # template, so the `thinking` kwarg is honored by the tokenizer.
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
        loader_limit=100,
    )

    # Greedy decoding (temperature=0) for reproducible letter answers.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", temperature=0
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)

    # Surface raw predictions that did not post-process into a valid option.
    for instance in results.instance_scores:
        if instance["processed_prediction"] not in ["A", "B", "C", "D"]:
            print(
                # Fix: message previously read "a acceptable".
                "Problematic prediction (could not be parsed to an acceptable single letter answer):"
            )
            print(instance["prediction"])

    print("Global Results when Thinking=", thinking)
    print(results.global_scores.summary)
26 changes: 26 additions & 0 deletions examples/inference_using_cross_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Smoke-test CrossProviderInferenceEngine across several providers.

Sends one pre-rendered chat turn through each provider and prints the
source/prediction pair returned.
"""

from unitxt.inference import CrossProviderInferenceEngine
from unitxt.text_utils import print_dict

if __name__ == "__main__":
    providers = ["watsonx", "rits", "watsonx-sdk"]
    for provider_name in providers:
        print()
        print("------------------------------------------------ ")
        print("PROVIDER:", provider_name)

        # Greedy decoding so outputs are comparable across providers.
        engine = CrossProviderInferenceEngine(
            model="granite-3-3-8b-instruct", provider=provider_name, temperature=0
        )

        # Minimal pre-rendered dataset: a single chat turn, marked public.
        instances = [
            {
                "source": [{"content": "Hello, how are you?", "role": "user"}],
                "data_classification_policy": ["public"],
            }
        ]

        # Run inference and show each instance next to its prediction.
        outputs = engine(instances)
        for instance, output in zip(instances, outputs):
            record = dict(instance, prediction=output)

            print_dict(record, keys_to_print=["source", "prediction"])
Loading
Loading