Test branch to evaluate impact of different format #1667

Draft: wants to merge 6 commits into main
161 changes: 98 additions & 63 deletions examples/evaluate_different_formats.py
@@ -1,68 +1,103 @@
import json
import time

import pandas as pd
from lh_eval_api import LakeHouseLoader
from unitxt.api import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine

model = CrossProviderInferenceEngine(
model="llama-3-8b-instruct", max_tokens=32, provider="bam"
from unitxt.inference import (
CrossProviderInferenceEngine,
WMLInferenceEngineChat,
WMLInferenceEngineGeneration,
)
"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws and more.

For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments that the CrossProviderInferenceEngine follows.
"""

card = "cards.boolq.classification"
template = "templates.classification.multi_class.relation.default"

df = pd.DataFrame(columns=["format", "system_prompt", "f1_micro", "ci_low", "ci_high"])

for format in [
"formats.llama3_instruct",
"formats.empty",
"formats.llama3_instruct_all_demos_in_one_turn",
]:
for system_prompt in [
"system_prompts.models.llama2",
"system_prompts.empty",

x = LakeHouseLoader # To avoid warnings, of unused imports.
print("Creating cross_provider_rits ...")
cross_provider_rits = CrossProviderInferenceEngine(model="granite-3-8b-instruct", max_tokens=32, provider="rits")

print("Creating cross_provider_watsonx ...")
cross_provider_watsonx = CrossProviderInferenceEngine(model="granite-3-8b-instruct", max_tokens=32, provider="watsonx")
print("Creating wml_gen ...")
wml_gen = WMLInferenceEngineGeneration(model_name="ibm/granite-3-8b-instruct", max_new_tokens=32)
print("Creating wml_chat ...")
wml_chat = WMLInferenceEngineChat(model_name="ibm/granite-3-8b-instruct", max_tokens=32, top_logprobs=None)

#wml_chat = WMLInferenceEngineChat(
# model_name="ibm/granite-vision-3-2-2b",max_tokens=32,top_logprobs=None
#)
#wml_gen= WMLInferenceEngineGeneration(model_name="ibm/granite-vision-3-2-2b",max_new_tokens=32)

df = pd.DataFrame(columns=["model","format", "system_prompt", "f1_micro", "ci_low", "ci_high", "duration", "num_instances","type_of_input"])

#model_list = [(cross_provider_rits,"cross_provider_rits"),(wml_chat,"wml-chat"),(cross_provider_watsonx, "cross-provider-watsonx")]
#model_list = [(cross_provider_watsonx, "cross-provider-watsonx"),(cross_provider_rits,"cross_provider_rits")]
model_list = [(cross_provider_watsonx, "cross-provider-watsonx"), (wml_chat, "wml-chat"), (wml_gen, "wml-gen")]
#model_list = [(cross_provider_rits,"cross_provider_rits")]
for model, model_name in model_list:
print(model_name)
card = "cards.cat"
template = "templates.classification.multi_label.instruct_question_select"

for format in [
"formats.chat_api[repeat_instruction_per_turn=True,add_target_prefix=False]",
"formats.chat_api[repeat_instruction_per_turn=True]",
"formats.granite_instruct_custom",
"formats.chat_api",
# "formats.empty",
]:
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=2,
demos_pool_size=50,
loader_limit=300,
max_test_instances=100,
split="test",
)

predictions = model(dataset)
results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)

print(
results.instance_scores.to_df(
columns=[
"source",
"prediction",
]
for system_prompt in [
"system_prompts.models.granite_instruct_classify",
# "system_prompts.empty",
]:
if model_name == "wml-gen" and "formats.chat_api" in format:
continue
if model_name == "wml-chat" and "formats.chat_api" not in format:
continue
dataset = load_dataset(
card=card,
template=template,
format=format,
system_prompt=system_prompt,
num_demos=5,
demos_pool_size=100,
loader_limit=1000,
max_test_instances=128,
split="test",
)
type_of_input = type(dataset[0]["source"])

print("Starting inference...")
start = time.perf_counter()
predictions = model(dataset)
end = time.perf_counter()
duration = end - start
print("End of inference...")

results = evaluate(predictions=predictions, data=dataset)

print(
f"Sample input and output for format '{format}' and system prompt '{system_prompt}':"
)
)

global_scores = results.global_scores
df.loc[len(df)] = [
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
]

df = df.round(decimals=2)
print(df.to_markdown())

print("Example prompt:")

print(json.dumps(results.instance_scores[0]["source"], indent=4))

print("Example prediction:")

print(json.dumps(results.instance_scores[0]["prediction"], indent=4))

global_scores = results.global_scores
df.loc[len(df)] = [
model_name,
format,
system_prompt,
global_scores["score"],
global_scores["score_ci_low"],
global_scores["score_ci_high"],
duration,
len(predictions),
type_of_input
]

df = df.round(decimals=2)
print(df.to_markdown())
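The bracketed catalog references in the format loop above (for example formats.chat_api[repeat_instruction_per_turn=True,add_target_prefix=False]) pass keyword overrides to the format artifact they name. A minimal sketch of what such a reference is assumed to expand to, using the two ChatAPIFormat fields this PR adds (illustrative only, not part of the diff):

# Hypothetical expansion of the bracketed catalog reference above;
# the example script itself only ever uses the string form.
from unitxt.formats import ChatAPIFormat

fmt = ChatAPIFormat(repeat_instruction_per_turn=True, add_target_prefix=False)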
7 changes: 3 additions & 4 deletions examples/inference_using_ibm_watsonx_ai.py
@@ -1,14 +1,13 @@
import os

from unitxt.api import load_dataset
from unitxt.inference import WMLInferenceEngine
from unitxt.text_utils import print_dict

if __name__ == "__main__":
# Set required env variables using your WML credentials:
os.environ["WML_URL"] = ""
os.environ["WML_PROJECT_ID"] = ""
os.environ["WML_APIKEY"] = ""
# os.environ["WML_URL"] = ""
# os.environ["WML_PROJECT_ID"] = ""
# os.environ["WML_APIKEY"] = ""

# Preparing WML inference engine:
model_name = "google/flan-t5-xl"
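With the hard-coded assignments commented out, the WML credentials are expected to come from the environment instead. A small sketch (an assumption, not part of this change) that fails fast when they are missing:

# Sketch only: verify the WML credentials are present in the environment
# before constructing the inference engine.
import os

required = ["WML_URL", "WML_PROJECT_ID", "WML_APIKEY"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"Missing WML environment variables: {missing}")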
23 changes: 18 additions & 5 deletions src/unitxt/formats.py
@@ -351,6 +351,10 @@ class ChatAPIFormat(BaseFormat):
The resulting `messages` is now a dictionary ready for sending to the OpenAI API.
"""

repeat_instruction_per_turn: bool = False
add_target_prefix: bool = True


def to_content(self, text: str, media: Dict[str, Any]) -> Union[str, List[Content]]:
# Regular expression to find <img> tags with src attribute
img_tag_pattern = re.compile(
@@ -415,9 +419,10 @@ def to_chat(
) -> List[Message]:
messages = []

if system_prompt or instruction:
if system_prompt or (instruction and not self.repeat_instruction_per_turn):
system_content = self.to_content(
system_prompt + ("\n" if system_prompt != "" else "") + instruction,
system_prompt + ("\n" if system_prompt != "" else "") +
(instruction if not self.repeat_instruction_per_turn else ""),
media,
)
messages.append(
@@ -428,9 +433,13 @@
)

for demo_instance in demos:
user_content = self.to_content(demo_instance["source"], media)
text = demo_instance["source"]
if instruction and self.repeat_instruction_per_turn:
text = f"{instruction}\n{text}"

user_content = self.to_content(text, media)
assistant_content = self.to_content(
target_prefix + demo_instance["target"], media
(target_prefix if self.add_target_prefix else "") + demo_instance["target"], media
)
messages.extend(
[
@@ -442,7 +451,11 @@
]
)

last_user_content = self.to_content(source, media)
text = source
if instruction and self.repeat_instruction_per_turn:
text = f"{instruction}\n{text}"

last_user_content = self.to_content(text, media)

messages.extend([{"role": "user", "content": last_user_content}])

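For reference, a sketch of the chat layout the two new flags are meant to produce, derived from the to_chat logic above rather than from captured output: with repeat_instruction_per_turn=True the instruction is prepended to every user turn instead of being folded into the system message, and with add_target_prefix=False the demo answers are emitted without the target prefix.

# Illustrative message layout for one demo plus the test instance
# (placeholder strings, not real rendered content).
messages = [
    {"role": "system", "content": "<system prompt>"},
    {"role": "user", "content": "<instruction>\n<demo source>"},
    {"role": "assistant", "content": "<demo target>"},
    {"role": "user", "content": "<instruction>\n<test source>"},
]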
73 changes: 47 additions & 26 deletions src/unitxt/inference.py
@@ -2025,6 +2025,9 @@ class WMLInferenceEngineBase(
deployment_id (str, optional):
Deployment ID of a tuned model to be used for
inference. Mutually exclusive with 'model_name'.
concurrency_limit (int):
Number of concurrent requests sent to a model. Default is 10,
which is also the maximum value for the generation engine.
parameters (Union[WMLInferenceEngineParams, WMLGenerationParamsMixin, WMLChatParamsMixin], optional):
Defines inference parameters and their values. Deprecated attribute, please pass respective
parameters directly to the respective class instead.
@@ -2033,6 +2036,7 @@
credentials: Optional[CredentialsWML] = None
model_name: Optional[str] = None
deployment_id: Optional[str] = None
concurrency_limit: int = 10
label: str = "wml"
_requirements_list = {
"ibm_watsonx_ai": "Install ibm-watsonx-ai package using 'pip install --upgrade ibm-watsonx-ai'. "
@@ -2286,11 +2290,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMi

If you want to include images in your input, please use 'WMLInferenceEngineChat' instead.

Args:
concurrency_limit (int):
Number of concurrent requests sent to a model. Default is 10,
which is also the maximum value.

Examples:
.. code-block:: python

@@ -2314,8 +2313,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMi
results = wml_inference.infer(dataset["test"])
"""

concurrency_limit: int = 10

def verify(self):
super().verify()

@@ -2567,6 +2564,32 @@ def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]
# images as SDK allows sending only one image per message.
return [messages]

def _handle_async_requests(
self,
messages: List[List[Dict[str, Any]]],
params: Dict[str, Any],
) -> List[Dict[str, Any]]:
async def handle_async_requests(start_idx, end_idx):
coroutines = [
self._model.achat(messages=messages[idx], params=params)
for idx in range(start_idx, end_idx)
]
batch_results = await asyncio.gather(*coroutines)
return list(batch_results)

loop = asyncio.get_event_loop()
results = []

for batch_idx in range(0, len(messages), self.concurrency_limit):
batch_results = loop.run_until_complete(
handle_async_requests(
batch_idx, min(batch_idx + self.concurrency_limit, len(messages))
)
)
results.extend(batch_results)

return results

def _send_requests(
self,
dataset: Union[List[Dict[str, Any]], Dataset],
@@ -2582,27 +2605,25 @@ def _send_requests(
output_type = "message"
params["logprobs"] = False

final_results = []

for instance in dataset:
messages = self.to_messages(instance)

for message in messages:
result = self._model.chat(
messages=message,
params=params,
)
indexed_messages = [
(i, message)
for i in range(len(dataset))
for message in self.to_messages(dataset[i])
]

final_results.append(
self.get_return_object(
result["choices"][0][output_type]["content"],
result,
instance["source"],
return_meta_data,
)
)
results = self._handle_async_requests(
[msg[1] for msg in indexed_messages], params
)

return final_results
return [
self.get_return_object(
result["choices"][0][output_type]["content"],
result,
dataset[idx[0]]["source"],
return_meta_data,
)
for result, idx in zip(results, indexed_messages)
]

def get_return_object(self, predict_result, result, input_text, return_meta_data):
if return_meta_data:
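The new _handle_async_requests batches the chat calls: it takes slices of at most concurrency_limit messages, awaits each slice with asyncio.gather, and concatenates the results in order. A self-contained sketch of the same pattern (fake_chat and run_in_batches are made-up stand-ins, not unitxt code):

import asyncio

async def fake_chat(message: str) -> str:
    # Stand-in for an asynchronous chat call such as self._model.achat(...).
    await asyncio.sleep(0.01)
    return f"reply to: {message}"

def run_in_batches(messages, concurrency_limit=10):
    # Process the messages in slices of size concurrency_limit, awaiting
    # each slice concurrently and preserving the original order.
    async def run_slice(batch):
        return await asyncio.gather(*(fake_chat(m) for m in batch))

    results = []
    loop = asyncio.new_event_loop()
    try:
        for start in range(0, len(messages), concurrency_limit):
            batch = messages[start:start + concurrency_limit]
            results.extend(loop.run_until_complete(run_slice(batch)))
    finally:
        loop.close()
    return results

print(run_in_batches([f"msg {i}" for i in range(25)], concurrency_limit=10))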