Commit a8c9a6e

Fix llm judge artifacts
Signed-off-by: Martín Santillán Cooper <[email protected]>
Parent: 4aad1e0

50 files changed: +59 −6 lines. (Large commit: some changed files are hidden by default and are not reproduced below.)

prepare/metrics/llm_as_judge/direct/llama_3_3_70b_instruct_adherence_completeness.py (+1 −1)

@@ -89,7 +89,7 @@
 # now = define the judge metric using the criteria
 adherence_metric = LLMJudgeDirect(
     inference_engine=CrossProviderInferenceEngine(  # or your favorite inference model
-        model="llama-3-3-70b-instruct", max_tokens=1024
+        model="llama-3-3-70b-instruct", max_tokens=1024, provider="watsonx"
     ),
     criteria=adherence_criteria,
     # the fields from the generation task to be presented to the judge. Those fields must be present
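
The change above is the whole fix for this prepare script: the judge's inference engine is pinned to an explicit provider. A standalone sketch of the resulting engine construction (illustrative only; the import path is assumed from unitxt's inference module, and the criteria/catalog-registration parts of the script are unchanged and omitted here):

from unitxt.inference import CrossProviderInferenceEngine

# Minimal sketch: construct the judge's engine with an explicit provider
# instead of relying on a default. Other provider ids appearing in this
# commit's regenerated artifacts are "rits", "open-ai", and "azure".
judge_engine = CrossProviderInferenceEngine(
    model="llama-3-3-70b-instruct",
    max_tokens=1024,
    provider="watsonx",
)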

prepare/metrics/llm_as_judge/llm_as_judge.py (+9 −3)

@@ -23,7 +23,7 @@ def get_evaluator(
     provider: ModelProviderEnum,
 ) -> Union[LLMJudgeDirect, LLMJudgePairwise]:
     evaluator_metadata = get_evaluator_metadata(name)
-    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0}
+    inference_params = {"max_tokens": 1024, "seed": 42, "temperature": 0, "provider": provider.value}
     model_name = EVALUATOR_TO_MODEL_ID[name]
 
     if provider == ModelProviderEnum.AZURE_OPENAI:
@@ -86,8 +86,14 @@ def get_evaluator(
         .replace(".", "_")
         .replace(" ", "_")
     )
-
-    provider_name = provider.value.lower() if provider != ModelProviderEnum.AZURE_OPENAI else "azure_openai"
+    provider_name = ""
+    # for backward compatibility, ideally we would use cross inference engines provider ids
+    if provider == ModelProviderEnum.AZURE_OPENAI:
+        provider_name = "azure_openai"
+    elif provider == ModelProviderEnum.OPENAI:
+        provider_name = "openai"
+    else:
+        provider_name = provider.value.lower()
 
     add_to_catalog(
         evaluator,
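
For context: the special-casing is needed because the cross-provider ids now written into the artifacts ("azure", "open-ai", "rits", "watsonx" — see the catalog diffs below) do not all match the existing catalog directory names (azure_openai, openai, rits, watsonx). A small self-contained sketch of that mapping; the enum values are inferred from the "provider" fields in the regenerated artifacts and mirror, but are not, the project's actual ModelProviderEnum:

from enum import Enum


class ModelProviderEnum(str, Enum):
    # Values inferred from the "provider" fields written into the catalog
    # artifacts in this commit; this mirrors, but is not, the project's enum.
    WATSONX = "watsonx"
    RITS = "rits"
    OPENAI = "open-ai"
    AZURE_OPENAI = "azure"


def catalog_provider_dir(provider: ModelProviderEnum) -> str:
    """Map a provider to the catalog directory name kept for backward compatibility."""
    if provider == ModelProviderEnum.AZURE_OPENAI:
        return "azure_openai"
    if provider == ModelProviderEnum.OPENAI:
        return "openai"
    return provider.value.lower()


assert catalog_provider_dir(ModelProviderEnum.AZURE_OPENAI) == "azure_openai"
assert catalog_provider_dir(ModelProviderEnum.OPENAI) == "openai"
assert catalog_provider_dir(ModelProviderEnum.WATSONX) == "watsonx"
assert catalog_provider_dir(ModelProviderEnum.RITS) == "rits"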

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/gpt-4o-2024-08-06/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-mini/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/azure_openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-preview/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "gpt-4o-2024-08-06"
     },
     "evaluator_name": "GPT4",

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-mini"
     },
     "evaluator_name": "O1_MINI",

src/unitxt/catalog/metrics/llm_as_judge/direct/openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-preview"
     },
     "evaluator_name": "O1_PREVIEW",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-1-8b-instruct"
     },
     "evaluator_name": "GRANITE3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/direct/rits/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/direct/watsonx/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/gpt-4o-2024-08-06/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-mini/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/azure_openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "azure",
         "credentials": {
             "api_base": "https://eteopenai.azure-api.net/openai/deployments/o1-preview/chat/completions?api-version=2024-08-01-preview"
         },

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/gpt_4o.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "gpt-4o-2024-08-06"
     },
     "evaluator_name": "GPT4",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/o1_mini.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-mini"
     },
     "evaluator_name": "O1_MINI",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/openai/o1_preview.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "open-ai",
         "model": "o1-preview"
     },
     "evaluator_name": "O1_PREVIEW",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-1-8b-instruct"
     },
     "evaluator_name": "GRANITE3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/rits/mixtral_large.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "rits",
         "model": "mistral-large-instruct"
     },
     "evaluator_name": "MIXTRAL_LARGE",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/granite3_0_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-8b-instruct"
     },
     "evaluator_name": "GRANITE3_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/granite3_2_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "granite-3-2-8b-instruct"
     },
     "evaluator_name": "GRANITE3_2_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_405b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-405b-instruct"
     },
     "evaluator_name": "LLAMA3_1_405B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_1_8b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-1-70b-instruct"
     },
     "evaluator_name": "LLAMA3_1_8B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/llama3_3_70b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "llama-3-3-70b-instruct"
     },
     "evaluator_name": "LLAMA3_3_70B",

src/unitxt/catalog/metrics/llm_as_judge/pairwise/watsonx/mixtral8_7b.json (+1)

@@ -5,6 +5,7 @@
         "max_tokens": 1024,
         "seed": 42,
         "temperature": 0,
+        "provider": "watsonx",
         "model": "mixtral-8x7b-instruct-v01"
     },
     "evaluator_name": "MIXTRAL8_7b",
