Commit a8c9a6e

Fix llm judge artifacts
Signed-off-by: Martín Santillán Cooper <[email protected]>
Parent: 4aad1e0

50 files changed: +59 −6 lines. (Large commit: some changed files are hidden by default and are not reproduced below.)

prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py (+1 −1)

@@ -89,7 +89,7 @@
 # now = define the judge metric using the criteria
 adherence_metric = LLMJudgeDirect(
     inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
-        model="llama-3-3-70b-instruct", max_tokens=1024
+        model="llama-3-3-70b-instruct", max_tokens=1024, provider="watsonx"
     ),
     criteria=adherence_criteria,
     # the fields from the generation task to be presented to the judge. Those fields must be present
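
The change above is the whole fix for this prepare script: the judge's inference engine is pinned to an explicit provider. A standalone sketch of the resulting engine construction (illustrative only; the import path is assumed from unitxt's inference module, and the criteria/catalog-registration parts of the script are unchanged and omitted here):

from unitxt.inference import CrossProviderInferenceEngine

# Minimal sketch: construct the judge's engine with an explicit provider
# instead of relying on a default. Other provider ids appearing in this
# commit's regenerated artifacts are "rits", "open-ai", and "azure".
judge_engine = CrossProviderInferenceEngine(
    model="llama-3-3-70b-instruct",
    max_tokens=1024,
    provider="watsonx",
)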

prepare/metrics/llm_as_judge/llm_as_judge.py (+9 −3)

@@ -23,7 +23,7 @@ def get_evaluator(
     provider: ModelProviderEnum,
 ) -> Union[LLMJudgeDirect, LLMJudgePairwise]:
     evaluator_metadata = get_evaluator_metadata(name)
-    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0}
+    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0, "provider": provider.value}
     model_name = EVALUATOR_TO_MODEL_ID[name]
 
     if provider == ModelProviderEnum.AZURE_OPENAI:
@@ -86,8 +86,14 @@ def get_evaluator(
         .replace(".", "_")
         .replace(" ", "_")
     )
-
-    provider_name = provider.value.lower() if provider != ModelProviderEnum.AZURE_OPENAI else "azure_openai"
+    provider_name = ""
+    # for backward compatibility, ideally we would use cross inference engines provider ids
+    if provider == ModelProviderEnum.AZURE_OPENAI:
+        provider_name = "azure_openai"
+    elif provider == ModelProviderEnum.OPENAI:
+        provider_name = "openai"
+    else:
+        provider_name = provider.value.lower()
 
     add_to_catalog(
         evaluator,
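
For context: the special-casing is needed because the cross-provider ids now written into the artifacts ("azure", "open-ai", "rits", "watsonx" — see the catalog diffs below) do not all match the existing catalog directory names (azure_openai, openai, rits, watsonx). A small self-contained sketch of that mapping; the enum values are inferred from the "provider" fields in the regenerated artifacts and mirror, but are not, the project's actual ModelProviderEnum:

from enum import Enum


class ModelProviderEnum(str, Enum):
    # Values inferred from the "provider" fields written into the catalog
    # artifacts in this commit; this mirrors, but is not, the project's enum.
    WATSONX = "watsonx"
    RITS = "rits"
    OPENAI = "open-ai"
    AZURE_OPENAI = "azure"


def catalog_provider_dir(provider: ModelProviderEnum) -> str:
    """Map a provider to the catalog directory name kept for backward compatibility."""
    if provider == ModelProviderEnum.AZURE_OPENAI:
        return "azure_openai"
    if provider == ModelProviderEnum.OPENAI:
        return "openai"
    return provider.value.lower()


assert catalog_provider_dir(ModelProviderEnum.AZURE_OPENAI) == "azure_openai"
assert catalog_provider_dir(ModelProviderEnum.OPENAI) == "openai"
assert catalog_provider_dir(ModelProviderEnum.WATSONX) == "watsonx"
assert catalog_provider_dir(ModelProviderEnum.RITS) == "rits"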

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/gpt-4o-2024-08-06/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-mini/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-preview/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "gpt-4o-2024-08-06"
     },
     "evaluator_name": "GPT4",

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-mini"
     },
     "evaluator_name": "O1_MINI",

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-preview"
     },
     "evaluator_name": "O1_PREVIEW",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-1-8b-instruct"
     },
     "evaluator_name": "GRANITE3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/gpt-4o-2024-08-06/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-mini/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-preview/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "gpt-4o-2024-08-06"
     },
     "evaluator_name": "GPT4",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-mini"
     },
     "evaluator_name": "O1_MINI",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-preview"
     },
     "evaluator_name": "O1_PREVIEW",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-1-8b-instruct"
     },
     "evaluator_name": "GRANITE3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",
