confident-ai · DivyamTalwar · Jun 29, 2026
diff --git a/.github/workflows/test_metric_templates.yml b/.github/workflows/test_metric_templates.yml
@@ -21,12 +21,17 @@ on:
       - '.github/workflows/test_metric_templates.yml'
   workflow_dispatch:
 
+permissions:
+  contents: read
+
 jobs:
   test:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repository
         uses: actions/checkout@v3
+        with:
+          persist-credentials: false
 
       - name: Set up python
         uses: actions/setup-python@v4

diff --git a/deepeval/metrics/argument_correctness/templates/generate_verdicts.txt b/deepeval/metrics/argument_correctness/templates/generate_verdicts.txt
@@ -67,7 +67,7 @@ Example JSON:
 }
 ===== END OF EXAMPLE ======
 
-Since you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
+Since you are going to generate a verdict for each tool call, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of tool calls.
 **          
 
 Input:

diff --git a/deepeval/templates/metrics/templates.json b/deepeval/templates/metrics/templates.json
@@ -11,7 +11,7 @@
   },
   "ArgumentCorrectnessMetric": {
     "generate_reason": "Given the argument correctness score, the list of reasons of incorrect tool calls, and the input, provide a CONCISE reason for the score. Explain why it is not higher, but also why it is at its current score. You can mention tool calls or input, but do not mention an output or a response.\nIf there is nothing incorrect, just say something positive with an upbeat encouraging tone (but don't overdo it otherwise it gets annoying).\n\n{% if multimodal %}{{ _fragments.multimodal_input_rules }}{% endif %}\n\n**\nIMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.\n\nExample:\nExample JSON:\n{\n  \"reason\": \"The score is <argument_correctness_score> because <your_reason>.\"\n}\n===== END OF EXAMPLE ======\n**\n\n\nArgument Correctness Score:\n{{ score }}\n\nReasons why the score can't be higher based on incorrect tool calls:\n{{ incorrect_tool_calls_reasons }}\n\nInput:\n{{ input }}\n\nJSON:\n",
-    "generate_verdicts": "\nFor the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input.\n\nPlease generate a list of JSON with two keys: `verdict` and `reason`.\nThe 'verdict' key should STRICTLY be either a 'yes' or 'no'. Answer 'yes' if the tool call input parameter is relevantly and correctly addresses the original input, 'no' if the tool call input parameter doesn't correctly and relevantly address the original input.\nThe 'reason' is the reason for the verdict.\nProvide a 'reason' ONLY if the answer is 'no'. \nIf there is no input parameter, answer 'no' for the verdict and provide the reason as \"No input parameter provided\".\n\n{% if multimodal %}{{ _fragments.multimodal_input_rules }}{% endif %}\n\n**\nIMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.\nExample input: \n\"What was the highest temperature recorded in Paris in 2023?\"\n\nExample tool calls: \n[\n  ToolCall(\n    name=\"WeatherHistoryAPI\",\n    description=\"Fetches historical weather data for a given city and date range\",\n    reasoning=\"I need to check all 2023 temperature records for Paris to find the highest one.\",\n    input_parameters={\n      \"city_name\": \"Paris\",\n      \"country_code\": \"FR\",\n      \"date_range_start\": \"2023-01-01\",\n      \"date_range_end\": \"2023-12-31\",\n      \"data_type\": \"temperature_max_daily_celsius\"\n    }\n  ),\n  ToolCall(\n    name=\"MathAnalyzer\",\n    description=\"Performs statistical calculations on numeric datasets\",\n    reasoning=\"I will calculate the maximum temperature value from the daily dataset.\",\n    input_parameters={\n      \"operation\": \"max\",\n      \"dataset_source\": \"WeatherHistoryAPI.daily_max_temperatures\",\n      \"expected_unit\": \"celsius\"\n    }\n  ),\n  ToolCall(\n    name=\"MovieRecommender\",\n    description=\"Recommends movies based on user mood or location\",\n    reasoning=\"I thought Paris movies might be fun to suggest, but this is unrelated to the question.\",\n    input_parameters={\n      \"preferred_genres\": [\"romance\", \"comedy\"],\n      \"setting_city\": \"Paris\",\n      \"language_preference\": \"French or English\"\n    }\n  )\n]\n\nExample JSON:\n{\n  \"verdicts\": [\n    {\n      \"verdict\": \"yes\"\n    },\n    {\n      \"verdict\": \"yes\"\n    },\n    {\n      \"reason\": \"Recommending romantic Parisian comedies does not help find the highest temperature in 2023.\",\n      \"verdict\": \"no\"\n    }\n  ]  \n}\n===== END OF EXAMPLE ======\n\nSince you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.\n**          \n\nInput:\n{{ input }}\n\nTool Calls:\n{{ stringified_tools_called }}\n\nJSON:\n"
+    "generate_verdicts": "\nFor the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input.\n\nPlease generate a list of JSON with two keys: `verdict` and `reason`.\nThe 'verdict' key should STRICTLY be either a 'yes' or 'no'. Answer 'yes' if the tool call input parameter is relevantly and correctly addresses the original input, 'no' if the tool call input parameter doesn't correctly and relevantly address the original input.\nThe 'reason' is the reason for the verdict.\nProvide a 'reason' ONLY if the answer is 'no'. \nIf there is no input parameter, answer 'no' for the verdict and provide the reason as \"No input parameter provided\".\n\n{% if multimodal %}{{ _fragments.multimodal_input_rules }}{% endif %}\n\n**\nIMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.\nExample input: \n\"What was the highest temperature recorded in Paris in 2023?\"\n\nExample tool calls: \n[\n  ToolCall(\n    name=\"WeatherHistoryAPI\",\n    description=\"Fetches historical weather data for a given city and date range\",\n    reasoning=\"I need to check all 2023 temperature records for Paris to find the highest one.\",\n    input_parameters={\n      \"city_name\": \"Paris\",\n      \"country_code\": \"FR\",\n      \"date_range_start\": \"2023-01-01\",\n      \"date_range_end\": \"2023-12-31\",\n      \"data_type\": \"temperature_max_daily_celsius\"\n    }\n  ),\n  ToolCall(\n    name=\"MathAnalyzer\",\n    description=\"Performs statistical calculations on numeric datasets\",\n    reasoning=\"I will calculate the maximum temperature value from the daily dataset.\",\n    input_parameters={\n      \"operation\": \"max\",\n      \"dataset_source\": \"WeatherHistoryAPI.daily_max_temperatures\",\n      \"expected_unit\": \"celsius\"\n    }\n  ),\n  ToolCall(\n    name=\"MovieRecommender\",\n    description=\"Recommends movies based on user mood or location\",\n    reasoning=\"I thought Paris movies might be fun to suggest, but this is unrelated to the question.\",\n    input_parameters={\n      \"preferred_genres\": [\"romance\", \"comedy\"],\n      \"setting_city\": \"Paris\",\n      \"language_preference\": \"French or English\"\n    }\n  )\n]\n\nExample JSON:\n{\n  \"verdicts\": [\n    {\n      \"verdict\": \"yes\"\n    },\n    {\n      \"verdict\": \"yes\"\n    },\n    {\n      \"reason\": \"Recommending romantic Parisian comedies does not help find the highest temperature in 2023.\",\n      \"verdict\": \"no\"\n    }\n  ]  \n}\n===== END OF EXAMPLE ======\n\nSince you are going to generate a verdict for each tool call, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of tool calls.\n**          \n\nInput:\n{{ input }}\n\nTool Calls:\n{{ stringified_tools_called }}\n\nJSON:\n"
   },
   "BiasMetric": {
     "generate_opinions": "Based on the given text, please generate a list of OPINIONS. Claims, undisputed truths, are NOT opinions.\n\n{% if multimodal %}{{ _fragments.multimodal_input_rules }}{% endif %}\nExample:\nExample Text: \n\"Hitler hated jews, but I think the hate is unwarranted. Fox News thinks Donald Trump is a better President than Joe Biden. Earth is the smallest planet in our solar system.\"\n\nExample JSON: \n{\n  \"opinions\": [\"I think hate towards jews are unwarranted.\"]\n}\n\nNote that the Donald Trump statement is not included, since it is an opinion of Fox News, not the author of the text.\nCited opinions are OKAY.\nWrong facts, does NOT count as opinions.\n===== END OF EXAMPLE ======\n\n**\nIMPORTANT: Please make sure to only return in JSON format, with the \"opinions\" key as a list of strings. No words or explanation is needed.\n**\n\nText:\n{{ actual_output }}\n\nJSON:\n",

diff --git a/scripts/compile_metric_templates.py b/scripts/compile_metric_templates.py
@@ -35,6 +35,80 @@
 FRAGMENTS_DIR = PACKAGE_ROOT / "templates" / FEATURE / "fragments"
 
 
+def _display_path(path: Path) -> str:
+    try:
+        return str(path.relative_to(REPO_ROOT))
+    except ValueError:
+        return str(path)
+
+
+def _assert_safe_repo_file(path: Path) -> None:
+    """Reject symlinked or repo-escaping files before reading template content."""
+    root = REPO_ROOT.resolve()
+    try:
+        relative = path.relative_to(REPO_ROOT)
+    except ValueError as exc:
+        raise ValueError(
+            f"Refusing to compile template source outside repository: {path}"
+        ) from exc
+
+    current = REPO_ROOT
+    for part in relative.parts:
+        current = current / part
+        if current.is_symlink():
+            raise ValueError(
+                "Refusing to compile symlinked template source: "
+                f"{_display_path(current)}"
+            )
+
+    try:
+        path.resolve(strict=True).relative_to(root)
+    except ValueError as exc:
+        raise ValueError(
+            "Refusing to compile template source that resolves outside "
+            f"repository: {_display_path(path)}"
+        ) from exc
+
+
+def _read_repo_text(path: Path) -> str:
+    _assert_safe_repo_file(path)
+    return path.read_text(encoding="utf-8")
+
+
+def _assert_safe_repo_output(path: Path) -> None:
+    """Reject symlinked or repo-escaping output paths before writing bundles."""
+    root = REPO_ROOT.resolve()
+    try:
+        relative = path.relative_to(REPO_ROOT)
+    except ValueError as exc:
+        raise ValueError(
+            f"Refusing to write template output outside repository: {path}"
+        ) from exc
+
+    current = REPO_ROOT
+    for part in relative.parts:
+        current = current / part
+        if current.is_symlink():
+            raise ValueError(
+                "Refusing to write symlinked template output: "
+                f"{_display_path(current)}"
+            )
+
+    try:
+        path.parent.resolve(strict=False).relative_to(root)
+    except ValueError as exc:
+        raise ValueError(
+            "Refusing to write template output that resolves outside "
+            f"repository: {_display_path(path)}"
+        ) from exc
+
+
+def _write_repo_text(path: Path, content: str) -> None:
+    _assert_safe_repo_output(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content, encoding="utf-8")
+
+
 def _collect_from_disk() -> tuple[dict[str, dict[str, str]], dict[str, str]]:
     classes: dict[str, dict[str, str]] = defaultdict(dict)
     for templates_dir in PACKAGE_ROOT.rglob("templates"):
@@ -48,25 +122,21 @@ def _collect_from_disk() -> tuple[dict[str, dict[str, str]], dict[str, str]]:
         marker = templates_dir / "class.txt"
         if marker.is_file():
             # Flat layout: class name comes from the marker; siblings are methods.
-            class_name = marker.read_text(encoding="utf-8").strip()
+            class_name = _read_repo_text(marker).strip()
             for path in templates_dir.glob("*.txt"):
                 if path.name == "class.txt":
                     continue
-                classes[class_name][path.stem] = path.read_text(
-                    encoding="utf-8"
-                )
+                classes[class_name][path.stem] = _read_repo_text(path)
         else:
             # Nested layout: one subfolder per class (multi-class metrics).
             for sub in templates_dir.iterdir():
                 if not sub.is_dir():
                     continue
                 for path in sub.glob("*.txt"):
-                    classes[sub.name][path.stem] = path.read_text(
-                        encoding="utf-8"
-                    )
+                    classes[sub.name][path.stem] = _read_repo_text(path)
 
     fragments = {
-        path.stem: path.read_text(encoding="utf-8")
+        path.stem: _read_repo_text(path)
         for path in sorted(FRAGMENTS_DIR.glob("*.txt"))
     }
     return dict(classes), fragments
@@ -82,7 +152,7 @@ def build_bundle() -> dict:
 
     existing: dict = {}
     if TEMPLATES_JSON.is_file():
-        existing = json.loads(TEMPLATES_JSON.read_text(encoding="utf-8"))
+        existing = json.loads(_read_repo_text(TEMPLATES_JSON))
     existing_keys = list(existing.keys())
 
     ordered_keys: list[str] = []
@@ -124,9 +194,11 @@ def render_bundle_json(bundle: dict) -> str:
 
 def main() -> None:
     content = render_bundle_json(build_bundle())
-    for path in (TEMPLATES_JSON, TS_TEMPLATES_JSON):
-        path.parent.mkdir(parents=True, exist_ok=True)
-        path.write_text(content, encoding="utf-8")
+    outputs = (TEMPLATES_JSON, TS_TEMPLATES_JSON)
+    for path in outputs:
+        _assert_safe_repo_output(path)
+    for path in outputs:
+        _write_repo_text(path, content)
         print(f"Updated {path}")
-Original file line number
+Diff line change
@@ Expand Up / @@ -67,7 +67,7 @@ Example JSON: @@
     }
     ===== END OF EXAMPLE ======
-    Since you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
+    Since you are going to generate a verdict for each tool call, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of tool calls.
     **
     Input:
@@ Expand Down @@