Commit 22e2dee

Adapt examples to use LLAMA 3.1 to 3.3

Signed-off-by: Martín Santillán Cooper <msantillancooper@ibm.com>

Committed Mar 19, 2025 · 1 parent: 04297d0
8 files changed: +12 -12 lines

docs/docs/llm_as_judge.rst (+5, -5)
@@ -46,7 +46,7 @@ An LLM as a Judge metric consists of several essential components:
 1. The judge model, such as *Llama-3-8B-Instruct* or *gpt-3.5-turbo*, which evaluates the performance of other models.
 2. The platform responsible for executing the judge model, such as Huggingface, OpenAI API and IBM's deployment platforms such as WatsonX and RITS.
 A lot of these model and catalog combinations are already predefined in our catalog. The models are prefixed by metrics.llm_as_judge.direct followed by the platform and the model name.
-For instance, *metrics.llm_as_judge.direct.rits.llama3_1_70b* refers to *llama3 70B* model that uses RITS deployment service.
+For instance, *metrics.llm_as_judge.direct.rits.llama3_3_70b* refers to *llama3 70B* model that uses RITS deployment service.
 
 3. The criteria to evaluate the model's response. There are predefined criteria in the catalog and the user can also define a custom criteria.
 Each criteria specifies fine-grained options that help steer the model to evaluate the response more precisely.
@@ -86,7 +86,7 @@ We pass the criteria to the judge model's metric as criteria and the question as
 
 criteria = "metrics.llm_as_judge.direct.criteria.answer_relevance"
 metrics = [
-f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criteria}, context_fields=[question]]"
+f"metrics.llm_as_judge.direct.rits.llama3_3_70b[criteria={criteria}, context_fields=[question]]"
 ]
 
 Once the metric is created, a dataset is created for the appropriate task.
@@ -155,13 +155,13 @@ Below is an example where the user mandates that the model respond with the temp
 End to end Direct example
 ----------------------------
 Unitxt can also obtain model's responses for a given dataset and then run LLM-as-a-judge evaluations on the model's responses.
-Here, we will get *llama-3.2 1B* instruct's responses and then evaluate them for answer relevance, coherence and conciseness using *llama3_1_70b* judge model
+Here, we will get *llama-3.2 1B* instruct's responses and then evaluate them for answer relevance, coherence and conciseness using *llama3_3_70b* judge model
 
 .. code-block:: python
 
 criteria = ["answer_relevance", "coherence", "conciseness"]
 metrics = [
-"metrics.llm_as_judge.direct.rits.llama3_1_70b"
+"metrics.llm_as_judge.direct.rits.llama3_3_70b"
 "[context_fields=[context,question],"
 f"criteria=metrics.llm_as_judge.direct.criteria.{criterion},"
 f"score_prefix={criterion}_]"
@@ -298,7 +298,7 @@ Below is an example where we compare the responses of three models for two quest
 reference_fields={"criteria": Any},
 prediction_type=List[str],
 metrics=[
-"metrics.llm_as_judge.pairwise.watsonx.llama3_1_70b[context_fields=question,criteria_field=criteria]"
+"metrics.llm_as_judge.pairwise.watsonx.llama3_3_70b[context_fields=question,criteria_field=criteria]"
 ],
 default_template=NullTemplate(),
 ),
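
The renamed judge metrics are used as ordinary unitxt metric identifiers. A rough, self-contained sketch of how the direct judge from the first hunk could be wired end to end is shown below; the task name, example data, prediction, and result-summary call are illustrative assumptions, not part of this commit.

    # Hedged sketch: evaluate one prediction with the llama3_3_70b direct judge.
    # Assumes unitxt is installed and RITS credentials are configured; the task
    # name, data row, and prediction are hypothetical placeholders.
    from unitxt import create_dataset, evaluate

    criteria = "metrics.llm_as_judge.direct.criteria.answer_relevance"
    metrics = [
        f"metrics.llm_as_judge.direct.rits.llama3_3_70b[criteria={criteria}, context_fields=[question]]"
    ]

    data = [{"question": "Who wrote 'Pride and Prejudice'?"}]  # hypothetical row
    predictions = ["Jane Austen."]                             # hypothetical model output

    dataset = create_dataset(
        task="tasks.qa.open",  # assumed task; pick whichever task fits the data
        test_set=data,
        metrics=metrics,
        split="test",
    )
    results = evaluate(predictions=predictions, data=dataset)
    print(results.global_scores)  # judge score for answer_relevance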

examples/evaluate_existing_dataset_by_llm_as_judge_direct.py (+1, -1)
@@ -14,7 +14,7 @@
 # We set loader_limit to 20 to reduce download time.
 criteria = ["answer_relevance", "coherence", "conciseness"]
 metrics = [
-"metrics.llm_as_judge.direct.rits.llama3_1_70b"
+"metrics.llm_as_judge.direct.rits.llama3_3_70b"
 "[context_fields=[context,question],"
 f"criteria=metrics.llm_as_judge.direct.criteria.{criterion}]"
 for criterion in criteria
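
Since the metric string in this example is assembled per criterion, the list comprehension in the hunk above expands to three fully qualified judge metrics; Python's implicit concatenation of adjacent string literals joins the pieces. A small illustration (plain Python, no unitxt calls involved):

    # Expansion of the list comprehension shown in the diff above.
    criteria = ["answer_relevance", "coherence", "conciseness"]
    metrics = [
        "metrics.llm_as_judge.direct.rits.llama3_3_70b"
        "[context_fields=[context,question],"
        f"criteria=metrics.llm_as_judge.direct.criteria.{criterion}]"
        for criterion in criteria
    ]
    # metrics[0] is:
    # "metrics.llm_as_judge.direct.rits.llama3_3_70b[context_fields=[context,question],criteria=metrics.llm_as_judge.direct.criteria.answer_relevance]"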

examples/evaluate_llm_as_judge_direct_criteria_from_dataset.py (+1, -1)
@@ -28,7 +28,7 @@
 reference_fields={"criteria": Any},
 prediction_type=str,
 metrics=[
-"metrics.llm_as_judge.direct.watsonx.llama3_1_70b[context_fields=question,criteria_field=criteria]"
+"metrics.llm_as_judge.direct.watsonx.llama3_3_70b[context_fields=question,criteria_field=criteria]"
 ],
 ),
 )

examples/evaluate_llm_as_judge_direct_predefined_criteria.py (+1, -1)
@@ -11,7 +11,7 @@
 
 criterion = "metrics.llm_as_judge.direct.criteria.answer_relevance"
 metrics = [
-f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criterion}, context_fields=[question]]"
+f"metrics.llm_as_judge.direct.rits.llama3_3_70b[criteria={criterion}, context_fields=[question]]"
 ]
 
 dataset = create_dataset(

examples/evaluate_llm_as_judge_pairwise_criteria_from_dataset.py (+1, -1)
@@ -29,7 +29,7 @@
 reference_fields={"criteria": Any},
 prediction_type=List[str],
 metrics=[
-"metrics.llm_as_judge.pairwise.rits.llama3_1_70b[context_fields=question,criteria_field=criteria]"
+"metrics.llm_as_judge.pairwise.rits.llama3_3_70b[context_fields=question,criteria_field=criteria]"
 ],
 default_template=NullTemplate(),
 ),

examples/evaluate_llm_as_judge_pairwise_predefined_criteria.py (+1, -1)
@@ -29,7 +29,7 @@
 reference_fields={"criteria": Any},
 prediction_type=List[str],
 metrics=[
-"metrics.llm_as_judge.pairwise.watsonx.llama3_1_70b[context_fields=question,criteria_field=criteria]"
+"metrics.llm_as_judge.pairwise.watsonx.llama3_3_70b[context_fields=question,criteria_field=criteria]"
 ],
 default_template=NullTemplate(),
 ),
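
The two pairwise examples above differ only in the provider segment of the metric name (rits vs. watsonx); the surrounding task definition is the same. Below is a hedged sketch of such a task declaration; the import paths and the input_fields entry are assumptions, while the remaining arguments mirror the diff.

    # Hedged sketch of the pairwise judge task shown in the two hunks above.
    # Import paths and the input_fields entry are assumed; reference_fields,
    # prediction_type, metrics, and default_template follow the diff.
    from typing import Any, List

    from unitxt.blocks import Task              # assumed import location
    from unitxt.templates import NullTemplate   # assumed import location

    pairwise_task = Task(
        input_fields={"question": str},  # assumed input field
        reference_fields={"criteria": Any},
        prediction_type=List[str],       # one response per compared model
        metrics=[
            "metrics.llm_as_judge.pairwise.watsonx.llama3_3_70b[context_fields=question,criteria_field=criteria]"
        ],
        default_template=NullTemplate(),
    )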

src/unitxt/assistant/assessment/assistant_inference_engine.py (+1, -1)
@@ -33,7 +33,7 @@ def _infer(self, dataset: Union[List[Dict[str, Any]], Dataset], return_meta_data
 
 criteria = "metrics.llm_as_judge.direct.criteria.answer_completeness"
 metrics = [
-f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criteria}, context_fields=[answers]]"
+f"metrics.llm_as_judge.direct.rits.llama3_3_70b[criteria={criteria}, context_fields=[answers]]"
 ]
 
 dataset = create_dataset(

tests/library/test_metrics.py (+1, -1)
@@ -2945,7 +2945,7 @@ def test_llm_as_judge(self):
 
 criterion = "metrics.llm_as_judge.direct.criteria.answer_relevance"
 metrics = [
-f"metrics.llm_as_judge.direct.rits.llama3_1_70b[criteria={criterion}, context_fields=[question]]"
+f"metrics.llm_as_judge.direct.rits.llama3_3_70b[criteria={criterion}, context_fields=[question]]"
 ]
 
 dataset = create_dataset(
