
Commit 9d77d49

Unify LLM judges into a single prepare file
Signed-off-by: Martín Santillán Cooper <[email protected]>
1 parent: 3c9a29c


5 files changed (+160, -248 lines)


prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py

Lines changed: 0 additions & 129 deletions
This file was deleted.
Lines changed: 79 additions & 29 deletions
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Optional, Union
 
 from unitxt import add_to_catalog, get_logger
 from unitxt.inference import CrossProviderInferenceEngine
@@ -8,6 +8,7 @@
     EVALUATOR_TO_MODEL_ID,
     EVALUATORS_METADATA,
     PAIRWISE_CRITERIA,
+    EvaluatorMetadata,
     EvaluatorNameEnum,
     EvaluatorTypeEnum,
     ModelProviderEnum,
@@ -16,17 +17,24 @@
 
 logger = get_logger()
 
-
 def get_evaluator(
     name: EvaluatorNameEnum,
     evaluator_type: EvaluatorTypeEnum,
-    provider: ModelProviderEnum,
+    provider: Optional[ModelProviderEnum] = None,
+    evaluator_params: Optional[dict] = None,
 ) -> Union[LLMJudgeDirect, LLMJudgePairwise]:
     evaluator_metadata = get_evaluator_metadata(name)
-    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0, "provider": provider.value}
+    inference_params = {
+        "max_tokens": 1024,
+        "seed": 42,
+        "temperature": 0,
+    }
+    if provider is not None:
+        inference_params["provider"] = provider.value
+
     model_name = EVALUATOR_TO_MODEL_ID[name]
 
-    if provider == ModelProviderEnum.AZURE_OPENAI:
+    if provider is not None and provider == ModelProviderEnum.AZURE_OPENAI:
         inference_params["credentials"] = {}
         inference_params["credentials"]["api_base"] = (
             f"https://eteopenai.azure-api.net/openai/deployments/{model_name}/chat/completions?api-version=2024-08-01-preview"
@@ -42,6 +50,9 @@ def get_evaluator(
         "generate_summaries": False,
     }
 
+    if evaluator_params is not None:
+        params.update(evaluator_params)
+
     evaluator_klass = (
         LLMJudgeDirect
         if evaluator_type == EvaluatorTypeEnum.DIRECT
@@ -51,6 +62,28 @@ def get_evaluator(
     return evaluator_klass(**params)
 
 
+def get_evaluator_catalog_name(
+    evaluator_metadata: EvaluatorMetadata,
+    provider: ModelProviderEnum,
+    prefix: str = "",
+):
+    metric_name = (
+        evaluator_metadata.name.value.lower()
+        .replace("-", "_")
+        .replace(".", "_")
+        .replace(" ", "_")
+    )
+    provider_name = ""
+    # for backward compatibility, ideally we would use cross inference engines provider ids
+    if provider == ModelProviderEnum.AZURE_OPENAI:
+        provider_name = "azure_openai"
+    elif provider == ModelProviderEnum.OPENAI:
+        provider_name = "openai"
+    else:
+        provider_name = provider.value.lower()
+    return f"metrics.{prefix}.{provider_name}.{metric_name}"
+
+
 logger.debug("Registering criteria...")
 # Register all the predefined criterisa
 for criteria in DIRECT_CRITERIA:
@@ -67,36 +100,53 @@ def get_evaluator(
         overwrite=True,
     )
 
-logger.debug("Registering evaluators...")
+
+logger.debug("Registering generic judges (no criterion is set)...")
 for evaluator_metadata in EVALUATORS_METADATA:
     for provider in evaluator_metadata.providers:
         for evaluator_type in [
             EvaluatorTypeEnum.DIRECT,
             EvaluatorTypeEnum.PAIRWISE,
         ]:
-            evaluator = get_evaluator(
-                name=evaluator_metadata.name,
-                evaluator_type=evaluator_type,
-                provider=provider,
-            )
-
-            metric_name = (
-                evaluator_metadata.name.value.lower()
-                .replace("-", "_")
-                .replace(".", "_")
-                .replace(" ", "_")
-            )
-            provider_name = ""
-            # for backward compatibility, ideally we would use cross inference engines provider ids
-            if provider == ModelProviderEnum.AZURE_OPENAI:
-                provider_name = "azure_openai"
-            elif provider == ModelProviderEnum.OPENAI:
-                provider_name = "openai"
-            else:
-                provider_name = provider.value.lower()
-
             add_to_catalog(
-                evaluator,
-                f"metrics.llm_as_judge.{evaluator_type.value}.{provider_name}.{metric_name}",
+                get_evaluator(
+                    name=evaluator_metadata.name,
+                    evaluator_type=evaluator_type,
+                    provider=provider,
+                ),
+                get_evaluator_catalog_name(evaluator_metadata, provider, f"llm_as_judge.{evaluator_type.value}"),
                 overwrite=True,
             )
+
+logger.debug("Registering judges with a specific criterion...")
+add_to_catalog(
+    get_evaluator(
+        name=EvaluatorNameEnum.LLAMA3_3_70B,
+        evaluator_type=EvaluatorTypeEnum.DIRECT,
+        # provider=ModelProviderEnum.WATSONX,
+        evaluator_params={
+            "criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
+            "context_fields": {
+                "question": "question",
+                "instructions": "metadata/template/instruction",
+            },
+        },
+    ),
+    "metrics.rag.response_generation.adherence_with_format.llama_3_3_70b_instruct_judge",
+    overwrite=True,
+)
+
+
+add_to_catalog(
+    get_evaluator(
+        name=EvaluatorNameEnum.LLAMA3_3_70B,
+        evaluator_type=EvaluatorTypeEnum.DIRECT,
+        # provider=ModelProviderEnum.WATSONX,
+        evaluator_params={
+            "criteria": "metrics.llm_as_judge.direct.criteria.answer_completeness",
+            "context_fields": {"question": "question", "reference_answers": "reference_answers"},
+        },
+    ),
+    "metrics.rag.response_generation.answer_completeness.llama_3_3_70b_instruct_judge",
+    overwrite=True,
)

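Note: the new get_evaluator_catalog_name helper makes the catalog ids produced by the registration loop explicit. Below is a minimal, self-contained sketch of that naming scheme, not the unitxt API itself; the evaluator and provider strings passed in are illustrative assumptions.

def catalog_name(evaluator_name: str, provider: str, prefix: str) -> str:
    # Normalize the evaluator name the same way get_evaluator_catalog_name does.
    metric_name = (
        evaluator_name.lower()
        .replace("-", "_")
        .replace(".", "_")
        .replace(" ", "_")
    )
    # Legacy provider spellings are kept for backward compatibility.
    legacy = {"AZURE_OPENAI": "azure_openai", "OPENAI": "openai"}
    provider_name = legacy.get(provider, provider.lower())
    return f"metrics.{prefix}.{provider_name}.{metric_name}"

# Hypothetical inputs; the real values come from EVALUATORS_METADATA.
print(catalog_name("Llama3.3-70b", "WATSONX", "llm_as_judge.direct"))
# prints: metrics.llm_as_judge.direct.watsonx.llama3_3_70b
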
src/unitxt/catalog/metrics/rag/response_generation/adherence_with_format/llama_3_3_70b_instruct_judge.json

Lines changed: 6 additions & 45 deletions
@@ -2,55 +2,16 @@
     "__type__": "llm_judge_direct",
     "inference_engine": {
         "__type__": "cross_provider_inference_engine",
-        "model": "llama-3-3-70b-instruct",
         "max_tokens": 1024,
+        "seed": 42,
         "temperature": 0,
-        "provider": "watsonx"
-    },
-    "criteria": {
-        "__type__": "criteria_with_options",
-        "name": "adherence_with_format",
-        "description": "The response aligns with the requested structure, style, or format (e.g., bullet points, headings, specific phrasing).",
-        "options": [
-            {
-                "__type__": "criteria_option",
-                "name": "Excellent",
-                "description": "The response perfectly aligns with the requested structure, style, or format, with no deviations."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Good",
-                "description": "The response aligns well with the requested structure, style, or format, with minor deviations that do not affect clarity or usability."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "mediocre",
-                "description": "The response generally follows the requested structure, style, or format, but noticeable inconsistencies or omissions are present."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Bad",
-                "description": "The response only partially aligns with the requested structure, style, or format, with significant inconsistencies or a lack of adherence."
-            },
-            {
-                "__type__": "criteria_option",
-                "name": "Very Bad",
-                "description": "The response fails to align with the requested structure, style, or format."
-            }
-        ],
-        "option_map": {
-            "Excellent": 1.0,
-            "Good": 0.75,
-            "mediocre": 0.5,
-            "Bad": 0.25,
-            "Very Bad": 0
-        }
+        "model": "llama-3-3-70b-instruct"
     },
+    "evaluator_name": "LLAMA3_3_70B",
+    "generate_summaries": false,
+    "criteria": "metrics.llm_as_judge.direct.criteria.adherence_with_format",
     "context_fields": {
         "question": "question",
         "instructions": "metadata/template/instruction"
-    },
-    "criteria_field": "criteria",
-    "generate_summaries": false,
-    "check_positional_bias": false
+    }
 }

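Taken together, the unified prepare file and the slimmed catalog entry mean a new criterion-specific judge is registered with one more add_to_catalog call rather than a dedicated prepare script. A hedged sketch, reusing the functions defined in the prepare file above; the criterion id and catalog path here are hypothetical placeholders, not entries added by this commit.

add_to_catalog(
    get_evaluator(
        name=EvaluatorNameEnum.LLAMA3_3_70B,
        evaluator_type=EvaluatorTypeEnum.DIRECT,
        evaluator_params={
            # Placeholder criterion id; substitute one that exists in the catalog.
            "criteria": "metrics.llm_as_judge.direct.criteria.answer_relevance",
            "context_fields": {"question": "question"},
        },
    ),
    # Placeholder catalog path for the new judge.
    "metrics.rag.response_generation.answer_relevance.llama_3_3_70b_instruct_judge",
    overwrite=True,
)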