confident-ai · Ruthwik-Data · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/examples/rag_evaluation/heterogeneous_document_eval.py b/examples/rag_evaluation/heterogeneous_document_eval.py
@@ -0,0 +1,224 @@
+"""Heterogeneous Document RAG Evaluation with DeepEval.
+
+This example demonstrates how to evaluate a financial RAG pipeline that
+retrieve chunks from mixed document types (10-K filings, earnings call
+transcripts, balance sheets) using document-type-specific thresholds.
+
+Key features shown:
+1. Using LLMTestCase metadata to tag chunks by document_type.
+2. Using threshold_overrides (per PR #2785) to set different pass/fail
+   thresholds per document type.
+3. Combining FaithfulnessMetric, ContextualPrecisionMetric, and
+   ContextualRecallMetric in a single heterogeneous test run.
+4. Interpreting results: why structured documents (balance sheets) need
+   higher thresholds than narrative documents (earnings calls).
+
+Requires: OPENAI_API_KEY environment variable.
+Install: pip install deepeval
+"""
+
+import os
+from deepeval import evaluate
+from deepeval.metrics import (
+    FaithfulnessMetric,
+    ContextualPrecisionMetric,
+    ContextualRecallMetric,
+)
+from deepeval.test_case import LLMTestCase
+
+
+# ---------------------------------------------------------------------------
+# Document-type threshold configuration
+# ---------------------------------------------------------------------------
+# Different document types warrant different pass/fail thresholds:
+#
+# - balance_sheet: High precision required (0.95). Structured numeric data
+#   must be retrieved exactly; hallucinated figures are dangerous.
+#
+# - annual_report (10-K narrative): Moderate threshold (0.80). Some
+#   paraphrase of narrative text is acceptable.
+#
+# - earnings_call: Lower threshold (0.70). Transcripts contain hedged
+#   language and forward-looking statements that LLMs may paraphrase.
+#
+# - default: Applied when document_type metadata is absent (0.75).
+
+THRESHOLD_OVERRIDES = {
+    "balance_sheet": 0.95,
+    "annual_report": 0.80,
+    "earnings_call": 0.70,
+    "default": 0.75,
+}
+
+
+# ---------------------------------------------------------------------------
+# Test cases
+# ---------------------------------------------------------------------------
+
+def build_test_cases():
+    """Build a mixed set of test cases representing a financial RAG pipeline.
+
+    Each test case includes a document_type metadata key that will be used
+    by the threshold_overrides parameter (available in DeepEval >= 2.0,
+    once PR #2785 is merged).
+    """
+    test_cases = []
+
+    # --- Balance sheet query ---
+    # High precision required: exact numeric retrieval from structured table.
+    balance_sheet_case = LLMTestCase(
+        input="What were total assets and total liabilities for FY2023?",
+        actual_output=(
+            "Total assets for FY2023 were $18.7 billion. "
+            "Total liabilities were $11.2 billion."
+        ),
+        expected_output=(
+            "Total assets: $18.7B. Total liabilities: $11.2B (FY2023)."
+        ),
+        retrieval_context=[
+            "| Metric | FY2023 | FY2022 |\n"
+            "|--------|--------|--------|\n"
+            "| Total Assets | $18.7B | $16.4B |\n"
+            "| Total Liabilities | $11.2B | $10.1B |\n"
+            "| Stockholders' Equity | $7.5B | $6.3B |",
+        ],
+        # Tag this test case with document type for threshold routing
+        additional_metadata={"document_type": "balance_sheet"},
+    )
+    test_cases.append(balance_sheet_case)
+
+    # --- Annual report (10-K narrative) query ---
+    # Moderate precision: narrative paraphrase acceptable.
+    annual_report_case = LLMTestCase(
+        input="What drove revenue growth in FY2023 according to the 10-K?",
+        actual_output=(
+            "Revenue growth in FY2023 was primarily driven by the cloud segment, "
+            "which grew 34% year-over-year and contributed $1.4B to the total "
+            "revenue increase."
+        ),
+        expected_output=(
+            "The cloud segment drove revenue growth, growing 34% YoY "
+            "and contributing $1.4B to overall revenue gains."
+        ),
+        retrieval_context=[
+            "Revenue for FY2023 was $4.2 billion, up 12% year-over-year. "
+            "The primary growth driver was the cloud segment, which grew 34% "
+            "year-over-year and contributed $1.4 billion to the revenue increase. "
+            "Enterprise software and services also grew 8%, while legacy hardware "
+            "revenue declined 5%.",
+        ],
+        additional_metadata={"document_type": "annual_report"},
+    )
+    test_cases.append(annual_report_case)
+
+    # --- Earnings call query ---
+    # Lower threshold: forward-looking statements and hedged language.
+    earnings_call_case = LLMTestCase(
+        input="What guidance did management provide for FY2024 revenue?",
+        actual_output=(
+            "Management guided for FY2024 revenue in the range of $4.6B to $4.8B, "
+            "representing 10-14% growth, subject to macroeconomic conditions."
+        ),
+        expected_output=(
+            "FY2024 revenue guidance: $4.6B–4.8B (10–14% growth), "
+            "conditional on macroeconomic environment."
+        ),
+        retrieval_context=[
+            "Looking ahead, we are guiding to fiscal 2024 revenue in the range of "
+            "$4.6 to $4.8 billion, which represents growth of approximately 10 to 14 "
+            "percent year-over-year. This outlook assumes stable macroeconomic "
+            "conditions and does not account for potential FX headwinds beyond "
+            "current rates. We feel confident in our pipeline but remain cautious "
+            "given the broader environment.",
+        ],
+        additional_metadata={"document_type": "earnings_call"},
+    )
+    test_cases.append(earnings_call_case)
+
+    return test_cases
+
+
+# ---------------------------------------------------------------------------
+# Metrics with threshold_overrides
+# ---------------------------------------------------------------------------
+# Note: threshold_overrides is available once PR #2785 is merged.
+# Until then, the metrics will use the default threshold.
+# The metadata["document_type"] key is used to select the override.
+
+def build_metrics():
+    """Build metrics with document-type threshold overrides.
+
+    threshold_overrides: dict mapping document_type values to thresholds.
+    The metric checks test_case.additional_metadata["document_type"] and
+    applies the matching threshold, falling back to the default threshold
+    if the key is absent or unrecognised.
+    """
+    faithfulness = FaithfulnessMetric(
+        threshold=THRESHOLD_OVERRIDES["default"],
+        # threshold_overrides will route to the right threshold per test case
+        # threshold_overrides=THRESHOLD_OVERRIDES,  # Uncomment post-PR #2785
+        verbose_mode=True,
+    )
+    precision = ContextualPrecisionMetric(
+        threshold=THRESHOLD_OVERRIDES["default"],
+        # threshold_overrides=THRESHOLD_OVERRIDES,  # Uncomment post-PR #2785
+        verbose_mode=True,
+    )
+    recall = ContextualRecallMetric(
+        threshold=THRESHOLD_OVERRIDES["default"],
+        # threshold_overrides=THRESHOLD_OVERRIDES,  # Uncomment post-PR #2785
+        verbose_mode=True,
+    )
+    return [faithfulness, precision, recall]
+
+
+# ---------------------------------------------------------------------------
+# Run evaluation
+# ---------------------------------------------------------------------------
+
+def run_heterogeneous_eval():
+    """Run the full heterogeneous document evaluation.
+
+    Results will show per-test-case scores. The key insight:
+    - Balance sheet cases should be evaluated at 0.95 threshold.
+    - Earnings call cases pass at 0.70 threshold even with hedged language.
+    - Without threshold_overrides, all cases use the same threshold,
+      leading to false failures on earnings calls or false passes on
+      balance sheets.
+    """
+    test_cases = build_test_cases()
+    metrics = build_metrics()
+
+    results = evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        # Optional: group results by document type for easier analysis
+        # run_async=False,  # Useful for debugging
+    )
+
+    print("\n--- Evaluation Results ---")
+    for result in results.test_results:
+        doc_type = result.additional_metadata.get("document_type", "unknown")
+        threshold_used = THRESHOLD_OVERRIDES.get(
+            doc_type, THRESHOLD_OVERRIDES["default"]
+        )
+        print(f"\nDocument type: {doc_type}")
+        print(f"Expected threshold: {threshold_used}")
+        for metric_result in result.metrics_data:
+            status = "✅" if metric_result.success else "❌"
+            print(
+                f"  {status} {metric_result.name}: "
+                f"{metric_result.score:.3f} "
+                f"(threshold: {metric_result.threshold})"
+            )
+
+    return results
+
+
+if __name__ == "__main__":
+    if not os.environ.get("OPENAI_API_KEY"):
+        raise EnvironmentError(
+            "OPENAI_API_KEY not set. Export your API key before running:\n"
+            "  export OPENAI_API_KEY=sk-..."
+        )
+    run_heterogeneous_eval()
diff --git a/typescript/src/models/gateways/openrouter-model.ts b/typescript/src/models/gateways/openrouter-model.ts
@@ -26,7 +26,9 @@ export class OpenRouterModel extends DeepEvalOpenAICompatibleModel {
         DEFAULT_OPENROUTER_MODEL,
       apiKey: options.apiKey ?? process.env.OPENROUTER_API_KEY,
       baseURL:
-        options.baseURL ?? process.env.OPENROUTER_BASE_URL ?? OPENROUTER_BASE_URL,
+        options.baseURL ??
+        process.env.OPENROUTER_BASE_URL ??
+        OPENROUTER_BASE_URL,
     });
   }
 }
diff --git a/typescript/src/models/gateways/portkey-model.ts b/typescript/src/models/gateways/portkey-model.ts
@@ -27,7 +27,8 @@ export class PortkeyModel extends DeepEvalOpenAICompatibleModel {
       ...options,
       model: options.model ?? process.env.PORTKEY_MODEL_NAME,
       apiKey,
-      baseURL: options.baseURL ?? process.env.PORTKEY_BASE_URL ?? PORTKEY_BASE_URL,
+      baseURL:
+        options.baseURL ?? process.env.PORTKEY_BASE_URL ?? PORTKEY_BASE_URL,
       defaultHeaders: {
         ...(options.defaultHeaders ?? {}),
         ...(apiKey ? { "x-portkey-api-key": apiKey } : {}),

diff --git a/typescript/src/models/providers/deepseek-model.ts b/typescript/src/models/providers/deepseek-model.ts
@@ -20,7 +20,9 @@ export class DeepSeekModel extends DeepEvalOpenAICompatibleModel {
     super({
       ...options,
       model:
-        options.model ?? process.env.DEEPSEEK_MODEL_NAME ?? DEFAULT_DEEPSEEK_MODEL,
+        options.model ??
+        process.env.DEEPSEEK_MODEL_NAME ??
+        DEFAULT_DEEPSEEK_MODEL,
       apiKey: options.apiKey ?? process.env.DEEPSEEK_API_KEY,
       baseURL: options.baseURL ?? DEEPSEEK_BASE_URL,
     });

diff --git a/typescript/src/models/providers/kimi-model.ts b/typescript/src/models/providers/kimi-model.ts
@@ -20,9 +20,11 @@ export class KimiModel extends DeepEvalOpenAICompatibleModel {
   constructor(options: KimiModelOptions = {}) {
     super({
       ...options,
-      model: options.model ?? process.env.MOONSHOT_MODEL_NAME ?? DEFAULT_KIMI_MODEL,
+      model:
+        options.model ?? process.env.MOONSHOT_MODEL_NAME ?? DEFAULT_KIMI_MODEL,
       apiKey: options.apiKey ?? process.env.MOONSHOT_API_KEY,
-      baseURL: options.baseURL ?? process.env.MOONSHOT_BASE_URL ?? MOONSHOT_BASE_URL,
+      baseURL:
+        options.baseURL ?? process.env.MOONSHOT_BASE_URL ?? MOONSHOT_BASE_URL,
     });
   }
 }
diff --git a/typescript/src/models/providers/openai-model.ts b/typescript/src/models/providers/openai-model.ts
@@ -19,7 +19,8 @@ export class OpenAIModel extends DeepEvalOpenAICompatibleModel {
   constructor(options: OpenAIModelOptions = {}) {
     super({
       ...options,
-      model: options.model ?? process.env.OPENAI_MODEL_NAME ?? DEFAULT_OPENAI_MODEL,
+      model:
+        options.model ?? process.env.OPENAI_MODEL_NAME ?? DEFAULT_OPENAI_MODEL,
       apiKey: options.apiKey ?? process.env.OPENAI_API_KEY,
     });
   }