Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 121 additions & 62 deletions examples/evaluate_different_formats.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,127 @@
import json
import time

import pandas as pd
from lh_eval_api import LakeHouseLoader
from unitxt.api import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.inference import (
CrossProviderInferenceEngine,
WMLInferenceEngineChat,
WMLInferenceEngineGeneration,
)

x = LakeHouseLoader # To avoid warnings, of unused imports.
print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="rits"
)

model = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", max_tokens=32, provider="bam"
print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(
model="granite-3-8b-instruct", max_tokens=32, provider="watsonx"
)
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(
model_name="ibm/granite-3-8b-instruct", max_new_tokens=32
)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(
model_name="ibm/granite-3-8b-instruct", max_tokens=32, top_logprobs=None
)
"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws, and more.

For the arguments these inference engines can receive, please refer to the class documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""

card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

df = pd.DataFrame(columns=["format", "system_prompt", "f1_micro", "ci_low", "ci_high"])

for format in [
"formats.llama3_instruct",
"formats.empty",
"formats.llama3_instruct_all_demos_in_one_turn",
]:
for system_prompt in [
"system_prompts.models.llama2",
"system_prompts.empty",

# wml_chat = WMLInferenceEngineChat(
# model_name="ibm/granite-vision-3-2-2b",max_tokens=32,top_logprobs=None
# )
# wml_gen= WMLInferenceEngineGeneration(model_name="ibm/granite-vision-3-2-2b",max_new_tokens=32)

df = pd.DataFrame(
columns=[
"model",
"format",
"system_prompt",
"f1_micro",
"ci_low",
"ci_high",
"duration",
"num_instances",
"type_of_input",
]
)

# model_list = [(cross_provider_rits,"cross_provider_rits"),(wml_chat,"wml-chat"),(cross_provider_watsonx, "cross-provider-watsonx")]
# model_list = [(cross_provider_watsonx, "cross-provider-watsonx"),(cross_provider_rits,"cross_provider_rits")]
model_list = [
(cross_provider_watsonx, "cross-provider-watsonx"),
(wml_chat, "wml-chat"),
(wml_gen, "wml-gen"),
]
# model_list = [(cross_provider_rits,"cross_provider_rits")]
for model, model_name in model_list:
print(model_name)
card = "cards.cat"
template = "templates.classification.multi_label.instruct_question_select"

for format in [
"formats.chat_api[place_instruction_in_user_turns=True,add_target_prefix=False]",
"formats.chat_api[place_instruction_in_user_turns=True]",
"formats.granite_instruct_custom",
"formats.chat_api",
# "formats.empty",
]:
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=2,
demos_pool_size=50,
loader_limit=300,
max_test_instances=100,
split="test",
)

predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print(
results.instance_scores.to_df(
columns=[
"source",
"prediction",
]
for system_prompt in [
"system_prompts.models.granite_instruct_classify",
# "system_prompts.empty",
]:
if model_name == "wml-gen" and "formats.chat_api" in format:
continue
if model_name == "wml-chat" and "formats.chat_api" not in format:
continue
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=5,
demos_pool_size=100,
loader_limit=1000,
max_test_instances=128,
split="test",
)
)

global_scores = results.global_scores
df.loc[len(df)] = [
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
]

df = df.round(decimals=2)
print(df.to_markdown())
type_of_input = type(dataset[0]["source"])

print("Starting inference...")
start = time.perf_counter()
predictions = model(dataset)
end = time.perf_counter()
duration = end - start
print("End of inference...")

results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print("Example prompt:")

print(json.dumps(results.instance_scores[0]["source"], indent=4))

print("Example prediction:")

print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

global_scores = results.global_scores
df.loc[len(df)] = [
model_name,
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
duration,
len(predictions),
type_of_input,
]

df = df.round(decimals=2)
print(df.to_markdown())
70 changes: 70 additions & 0 deletions examples/evaluate_granite_thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Compare Granite model answers with and without 'thinking' mode enabled.

Runs the same single-question QA task twice through a Granite chat template:
once with ``thinking=True`` (where the final answer is wrapped in
``<response>...</response>`` tags and must be extracted) and once with
``thinking=False``, then prints the per-instance scores for each run.
"""

from unitxt import get_logger
from unitxt.api import create_dataset, evaluate
from unitxt.formats import HFSystemFormat
from unitxt.inference import CrossProviderInferenceEngine
from unitxt.processors import ExtractWithRegex, PostProcess
from unitxt.task import Task
from unitxt.templates import InputOutputTemplate

logger = get_logger()

# Single arithmetic question: 32 - 5 = 27 apples, then gaining twice as many
# adds 54, giving 81.
test_set = [
    {
        "question": "If I had 32 apples, I lost 5 apples, and gain twice more as many as I have. How many do I have at the end",
        "answer": "81",
    },
]


# Define the QA task: a free-text question in, a string answer out,
# scored with exact-match accuracy.
task = Task(
    input_fields={"question": str},
    reference_fields={"answer": str},
    prediction_type=str,
    metrics=["metrics.accuracy"],
)


# Create a simple template that formats the input.
# Add lowercase normalization as a post processor.


for thinking in [True, False]:
    postprocessors = ["processors.lower_case"]
    if thinking:
        # In thinking mode the model emits its reasoning followed by the final
        # answer wrapped in <response>...</response>; pull out only the answer.
        # NOTE: fixed the end-tag pattern — it previously read "</response"
        # (missing the closing ">"), which would also anchor on e.g.
        # "</responseful". References are kept as-is since they are plain answers.
        postprocessors.append(
            PostProcess(
                ExtractWithRegex(regex="<response>(.*)</response>"),
                process_references=False,
            )
        )

    template = InputOutputTemplate(
        instruction="Answer the following question with the single numeric answer.",
        input_format="{question}",
        output_format="{answer}",
        postprocessors=postprocessors,
    )
    # HFSystemFormat renders the prompt with the model's own chat template;
    # chat_kwargs_dict forwards the "thinking" switch to apply_chat_template.
    dataset = create_dataset(
        task=task,
        test_set=test_set,
        template=template,
        split="test",
        format=HFSystemFormat(
            model_name="ibm-granite/granite-3.3-8b-instruct",
            chat_kwargs_dict={"thinking": thinking},
            place_instruction_in_user_turns=True,
        ),
    )

    # use_cache=False so both runs actually hit the model rather than
    # returning a cached completion from a previous (possibly non-thinking) run.
    model = CrossProviderInferenceEngine(
        model="granite-3-3-8b-instruct", provider="rits", use_cache=False
    )

    predictions = model(dataset)

    results = evaluate(predictions=predictions, data=dataset)

    print("Instance Results when Thinking=", thinking)
    print(results.instance_scores)
31 changes: 25 additions & 6 deletions src/unitxt/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,9 @@ class ChatAPIFormat(BaseFormat):
The resulting `messages` is now a dictionary ready for sending to the OpenAI API.
"""

place_instruction_in_user_turns: bool = False
add_target_prefix: bool = True

def to_content(self, text: str, media: Dict[str, Any]) -> Union[str, List[Content]]:
# Regular expression to find <img> tags with src attribute
img_tag_pattern = re.compile(
Expand Down Expand Up @@ -422,9 +425,11 @@ def to_chat(
) -> List[Message]:
messages = []

if system_prompt or instruction:
if system_prompt or (instruction and not self.place_instruction_in_user_turns):
system_content = self.to_content(
system_prompt + ("\n" if system_prompt != "" else "") + instruction,
system_prompt
+ ("\n" if system_prompt != "" else "")
+ (instruction if not self.place_instruction_in_user_turns else ""),
media,
)
messages.append(
Expand All @@ -435,9 +440,15 @@ def to_chat(
)

for demo_instance in demos:
user_content = self.to_content(demo_instance["source"], media)
text = demo_instance["source"]
if instruction and self.place_instruction_in_user_turns:
text = f"{instruction}\n{text}"

user_content = self.to_content(text, media)
assistant_content = self.to_content(
target_prefix + demo_instance["target"], media
(target_prefix if self.add_target_prefix else "")
+ demo_instance["target"],
media,
)
messages.extend(
[
Expand All @@ -449,7 +460,11 @@ def to_chat(
]
)

last_user_content = self.to_content(source, media)
text = source
if instruction and self.place_instruction_in_user_turns:
text = f"{instruction}\n{text}"

last_user_content = self.to_content(text, media)

messages.extend([{"role": "user", "content": last_user_content}])

Expand Down Expand Up @@ -492,6 +507,7 @@ class HFSystemFormat(ChatAPIFormat):
"""

model_name: str
chat_kwargs_dict: Dict[str, str] = {}
_requirements_list = ["transformers", "Jinja2"]

@retry_connection_with_exponential_backoff(backoff_factor=2)
Expand All @@ -515,7 +531,10 @@ def _format_instance_to_source(
)
return (
self.tokenizer.apply_chat_template(
chat, tokenize=False, add_generation_prompt=True
chat,
tokenize=False,
add_generation_prompt=True,
**self.chat_kwargs_dict,
)
+ target_prefix
)
Loading