Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions tests/model_explainability/lm_eval/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# LMEvalJob request payload for a fully custom Unitxt task: one system prompt
# ("sp_0") and one serialized input/output template ("tp_0"), referenced by a
# single task recipe built on the WNLI card.
lmeval_hf_custom_task_data = {
    "task_list": {
        "custom": {
            "systemPrompts": [
                {
                    "name": "sp_0",
                    "value": "Be concise. At every point give the shortest acceptable answer.",
                },
            ],
            "templates": [
                {
                    "name": "tp_0",
                    # Serialized Unitxt input_output_template. "\\n" stays a JSON
                    # escape so the newline materializes only when the JSON is
                    # parsed downstream, not in this Python literal.
                    "value": '{ "__type__": "input_output_template", "input_format": '
                    '"{text_a_type}: {text_a}\\n{text_b_type}: {text_b}", '
                    '"output_format": "{label}", '
                    '"target_prefix": "The {type_of_relation} class is ", '
                    '"instruction": "Given a {text_a_type} and {text_b_type} classify the '
                    '{type_of_relation} of the {text_b_type} to one of {classes}.",'
                    ' "postprocessors": [ "processors.take_first_non_empty_line",'
                    ' "processors.lower_case_till_punc" ] }',
                },
            ],
        },
        # Recipe wiring: WNLI card + the prompt/template declared above, by ref.
        "taskRecipes": [
            {
                "card": {"name": "cards.wnli"},
                "systemPrompt": {"ref": "sp_0"},
                "template": {"ref": "tp_0"},
            },
        ],
    },
}

# LMEvalJob payload exercising the LLM-as-a-judge ("llmaaj") flow: a custom
# Unitxt judge template, a rating task, and an llm_as_judge metric, wired
# together by a task recipe whose card is itself a serialized custom task_card
# (MT-Bench single-turn GPT-4 judgements dataset).
# Every "value"/"custom" field below holds a serialized Unitxt JSON artifact;
# the \\n and \\" sequences are JSON escapes that are resolved only when the
# JSON is parsed downstream, so they must not be "simplified" in the Python
# source.
lmeval_hf_llmaaj_task_data = {
    "task_list": {
        "custom": {
            # Judge prompt template: asks the model to rate an answer on a
            # 1-10 scale and emit it as "[[rating]]", which the
            # extract_mt_bench_rating_judgment postprocessor pulls out.
            "templates": [
                {
                    "name": "response_assessment.rating.mt_bench_single_turn",
                    "value": '{\n "__type__": "input_output_template",\n "instruction":'
                    ' "Please act as an impartial judge and evaluate the quality of the '
                    "response provided by an AI assistant to the user question displayed below."
                    " Your evaluation should consider factors such as the helpfulness, relevance,"
                    " accuracy, depth, creativity, and level of detail of the response. Begin your"
                    " evaluation by providing a short explanation. Be as objective as possible. "
                    "After providing your explanation, you must rate the response on a scale of 1 to 10"
                    ' by strictly following this format: \\"[[rating]]\\", for example: \\"Rating: '
                    '[[5]]\\".\\n\\n",\n "input_format": "[Question]\\n{question}\\n\\n[The Start '
                    "of Assistant's Answer]\\n{answer}\\n[The End of Assistant's Answer]\",\n "
                    '"output_format": "[[{rating}]]",\n "postprocessors": [\n '
                    '"processors.extract_mt_bench_rating_judgment"\n ]\n}\n',
                }
            ],
            # Task definition: (question, answer) strings in, a float rating
            # out, scored with Spearman correlation against the references.
            "tasks": [
                {
                    "name": "response_assessment.rating.single_turn",
                    "value": '{\n "__type__": "task",\n "input_fields": {\n '
                    '"question": "str",\n "answer": "str"\n },\n '
                    '"outputs": {\n "rating": "float"\n },\n '
                    '"metrics": [\n "metrics.spearman"\n ]\n}\n',
                }
            ],
            # The judge itself: Mistral-7B-Instruct-v0.2 run through a HF
            # pipeline inference engine, using the template declared above.
            "metrics": [
                {
                    "name": "llmaaj_metric",
                    "value": '{\n "__type__": "llm_as_judge",\n "inference_model": {\n '
                    '"__type__": "hf_pipeline_based_inference_engine",\n '
                    '"model_name": "mistralai/Mistral-7B-Instruct-v0.2",\n '
                    '"max_new_tokens": 256,\n "use_fp16": true\n },\n '
                    '"template": "templates.response_assessment.rating.mt_bench_single_turn",\n '
                    '"task": "rating.single_turn",\n '
                    '"format": "formats.models.mistral.instruction",\n '
                    '"main_score": "mistral_7b_instruct_v0_2_huggingface_template_mt_bench_single_turn"\n}'
                    "\n",
                }
            ],
        },
        "taskRecipes": [
            {
                "card": {
                    # Serialized Unitxt task_card: loads the MT-Bench
                    # single-score GPT-4 judgement dataset from HF, keeps
                    # first-turn/no-reference rows, renames columns to the
                    # task's field names, and unwraps the single-element
                    # question/answer lists via literal_eval + copy.
                    "custom": '{\n "__type__": "task_card",\n "loader": '
                    '{\n "__type__": "load_hf",\n '
                    '"path": "OfirArviv/mt_bench_single_score_gpt4_judgement",\n '
                    '"split": "train"\n },\n "preprocess_steps": [\n '
                    '{\n "__type__": "rename_splits",\n '
                    '"mapper": {\n "train": "test"\n }\n },\n '
                    '{\n "__type__": "filter_by_condition",\n '
                    '"values": {\n "turn": 1\n },\n '
                    '"condition": "eq"\n },\n {\n '
                    '"__type__": "filter_by_condition",\n '
                    '"values": {\n "reference": "[]"\n },\n '
                    '"condition": "eq"\n },\n {\n '
                    '"__type__": "rename",\n "field_to_field": {\n '
                    '"model_input": "question",\n '
                    '"score": "rating",\n '
                    '"category": "group",\n '
                    '"model_output": "answer"\n }\n },\n '
                    '{\n "__type__": "literal_eval",\n '
                    '"field": "question"\n },\n '
                    '{\n "__type__": "copy",\n '
                    '"field": "question/0",\n '
                    '"to_field": "question"\n },\n '
                    '{\n "__type__": "literal_eval",\n '
                    '"field": "answer"\n },\n {\n '
                    '"__type__": "copy",\n '
                    '"field": "answer/0",\n '
                    '"to_field": "answer"\n }\n ],\n '
                    '"task": "tasks.response_assessment.rating.single_turn",\n '
                    '"templates": [\n '
                    '"templates.response_assessment.rating.mt_bench_single_turn"\n ]\n}\n',
                    # NOTE(review): "template", "format" and "metrics" are
                    # nested INSIDE the "card" object here, whereas the sibling
                    # custom-task constant places "template" at the recipe
                    # level — confirm this nesting against the LMEvalJob
                    # taskRecipes schema.
                    "template": {"ref": "response_assessment.rating.mt_bench_single_turn"},
                    "format": "formats.models.mistral.instruction",
                    "metrics": [{"ref": "llmaaj_metric"}],
                }
            }
        ],
    }
}
Empty file.
35 changes: 7 additions & 28 deletions tests/model_explainability/lm_eval/test_lm_eval.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest
from typing import List


from tests.model_explainability.lm_eval.constants import lmeval_hf_llmaaj_task_data, lmeval_hf_custom_task_data
from tests.model_explainability.utils import validate_tai_component_images

from tests.model_explainability.lm_eval.utils import get_lmeval_tasks, validate_lmeval_job_pod_and_logs
Expand Down Expand Up @@ -29,35 +29,14 @@
),
pytest.param(
{"name": "test-lmeval-hf-custom-task"},
{
"task_list": {
"custom": {
"systemPrompts": [
{"name": "sp_0", "value": "Be concise. At every point give the shortest acceptable answer."}
],
"templates": [
{
"name": "tp_0",
"value": '{ "__type__": "input_output_template", '
'"input_format": "{text_a_type}: {text_a}\\n'
'{text_b_type}: {text_b}", '
'"output_format": "{label}", '
'"target_prefix": '
'"The {type_of_relation} class is ", '
'"instruction": "Given a {text_a_type} and {text_b_type} '
'classify the {type_of_relation} of the {text_b_type} to one of {classes}.",'
' "postprocessors": [ "processors.take_first_non_empty_line",'
' "processors.lower_case_till_punc" ] }',
}
],
},
"taskRecipes": [
{"card": {"name": "cards.wnli"}, "systemPrompt": {"ref": "sp_0"}, "template": {"ref": "tp_0"}}
],
}
},
lmeval_hf_custom_task_data,
id="custom_task",
),
pytest.param(
{"name": "test-lmeval-hf-llmaaj"},
lmeval_hf_llmaaj_task_data,
id="llmaaj_task",
),
],
indirect=True,
)
Expand Down
2 changes: 1 addition & 1 deletion tests/model_explainability/lm_eval/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None:
)
lmevaljob_pod.wait_for_status(status=lmevaljob_pod.Status.RUNNING, timeout=Timeout.TIMEOUT_5MIN)
try:
lmevaljob_pod.wait_for_status(status=Pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_20MIN)
lmevaljob_pod.wait_for_status(status=Pod.Status.SUCCEEDED, timeout=Timeout.TIMEOUT_40MIN)
except TimeoutExpiredError as e:
raise UnexpectedFailureError("LMEval job pod failed from a running state.") from e
if not bool(re.search(pod_success_log_regex, lmevaljob_pod.log())):
Expand Down