Skip to content

Commit 9eb75d0

Browse files
authored
docs(examples): add heterogeneous financial document RAG evaluation example with threshold_overrides
This script evaluates a financial RAG pipeline using document-type-specific thresholds for various document types, including balance sheets and earnings calls. It implements metrics for faithfulness, precision, and recall, and demonstrates how to handle different thresholds based on document type.
1 parent 4aa1973 commit 9eb75d0

1 file changed

Lines changed: 224 additions & 0 deletions

File tree

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
"""Heterogeneous Document RAG Evaluation with DeepEval.
2+
3+
This example demonstrates how to evaluate a financial RAG pipeline that
4+
retrieves chunks from mixed document types (10-K filings, earnings call
5+
transcripts, balance sheets) using document-type-specific thresholds.
6+
7+
Key features shown:
8+
1. Using LLMTestCase metadata to tag chunks by document_type.
9+
2. Using threshold_overrides (per PR #2785) to set different pass/fail
10+
thresholds per document type.
11+
3. Combining FaithfulnessMetric, ContextualPrecisionMetric, and
12+
ContextualRecallMetric in a single heterogeneous test run.
13+
4. Interpreting results: why structured documents (balance sheets) need
14+
higher thresholds than narrative documents (earnings calls).
15+
16+
Requires: OPENAI_API_KEY environment variable.
17+
Install: pip install deepeval
18+
"""
19+
20+
import os
21+
from deepeval import evaluate
22+
from deepeval.metrics import (
23+
FaithfulnessMetric,
24+
ContextualPrecisionMetric,
25+
ContextualRecallMetric,
26+
)
27+
from deepeval.test_case import LLMTestCase
28+
29+
30+
# ---------------------------------------------------------------------------
31+
# Document-type threshold configuration
32+
# ---------------------------------------------------------------------------
33+
# Different document types warrant different pass/fail thresholds:
34+
#
35+
# - balance_sheet: High precision required (0.95). Structured numeric data
36+
# must be retrieved exactly; hallucinated figures are dangerous.
37+
#
38+
# - annual_report (10-K narrative): Moderate threshold (0.80). Some
39+
# paraphrase of narrative text is acceptable.
40+
#
41+
# - earnings_call: Lower threshold (0.70). Transcripts contain hedged
42+
# language and forward-looking statements that LLMs may paraphrase.
43+
#
44+
# - default: Applied when document_type metadata is absent (0.75).
45+
46+
# Pass/fail threshold per document type; "default" is the fallback used when
# a test case carries no recognised document_type.
THRESHOLD_OVERRIDES = dict(
    balance_sheet=0.95,
    annual_report=0.80,
    earnings_call=0.70,
    default=0.75,
)
52+
53+
54+
# ---------------------------------------------------------------------------
55+
# Test cases
56+
# ---------------------------------------------------------------------------
57+
58+
def build_test_cases():
    """Return the three sample test cases used by this example.

    The cases are returned in a fixed order: balance sheet, annual report
    (10-K narrative), earnings call. Each one is an ``LLMTestCase`` whose
    ``additional_metadata`` holds a ``"document_type"`` entry naming the
    kind of document its retrieval context came from.

    Returns:
        list: Three ``LLMTestCase`` instances.
    """
    case_specs = [
        # Balance sheet: exact numeric retrieval from a structured table.
        {
            "input": "What were total assets and total liabilities for FY2023?",
            "actual_output": (
                "Total assets for FY2023 were $18.7 billion. "
                "Total liabilities were $11.2 billion."
            ),
            "expected_output": (
                "Total assets: $18.7B. Total liabilities: $11.2B (FY2023)."
            ),
            "retrieval_context": [
                "| Metric | FY2023 | FY2022 |\n"
                "|--------|--------|--------|\n"
                "| Total Assets | $18.7B | $16.4B |\n"
                "| Total Liabilities | $11.2B | $10.1B |\n"
                "| Stockholders' Equity | $7.5B | $6.3B |",
            ],
            "additional_metadata": {"document_type": "balance_sheet"},
        },
        # Annual report (10-K narrative): paraphrase of prose is acceptable.
        {
            "input": "What drove revenue growth in FY2023 according to the 10-K?",
            "actual_output": (
                "Revenue growth in FY2023 was primarily driven by the cloud segment, "
                "which grew 34% year-over-year and contributed $1.4B to the total "
                "revenue increase."
            ),
            "expected_output": (
                "The cloud segment drove revenue growth, growing 34% YoY "
                "and contributing $1.4B to overall revenue gains."
            ),
            "retrieval_context": [
                "Revenue for FY2023 was $4.2 billion, up 12% year-over-year. "
                "The primary growth driver was the cloud segment, which grew 34% "
                "year-over-year and contributed $1.4 billion to the revenue increase. "
                "Enterprise software and services also grew 8%, while legacy hardware "
                "revenue declined 5%.",
            ],
            "additional_metadata": {"document_type": "annual_report"},
        },
        # Earnings call: hedged, forward-looking transcript language.
        {
            "input": "What guidance did management provide for FY2024 revenue?",
            "actual_output": (
                "Management guided for FY2024 revenue in the range of $4.6B to $4.8B, "
                "representing 10-14% growth, subject to macroeconomic conditions."
            ),
            "expected_output": (
                "FY2024 revenue guidance: $4.6B–4.8B (10–14% growth), "
                "conditional on macroeconomic environment."
            ),
            "retrieval_context": [
                "Looking ahead, we are guiding to fiscal 2024 revenue in the range of "
                "$4.6 to $4.8 billion, which represents growth of approximately 10 to 14 "
                "percent year-over-year. This outlook assumes stable macroeconomic "
                "conditions and does not account for potential FX headwinds beyond "
                "current rates. We feel confident in our pipeline but remain cautious "
                "given the broader environment.",
            ],
            "additional_metadata": {"document_type": "earnings_call"},
        },
    ]

    # One LLMTestCase per spec, preserving the order above.
    return [LLMTestCase(**spec) for spec in case_specs]
139+
140+
141+
# ---------------------------------------------------------------------------
142+
# Metrics with threshold_overrides
143+
# ---------------------------------------------------------------------------
144+
# Note: threshold_overrides is available once PR #2785 is merged.
145+
# Until then, the metrics will use the default threshold.
146+
# The additional_metadata["document_type"] key on each test case is used to select the override.
147+
148+
def build_metrics():
    """Return the faithfulness, contextual-precision and contextual-recall metrics.

    Every metric is constructed with ``THRESHOLD_OVERRIDES["default"]`` as its
    threshold and with ``verbose_mode=True``. The per-document-type mapping is
    NOT passed to the metrics here, so all test cases are currently judged
    against the same default threshold.

    Returns:
        list: ``[FaithfulnessMetric, ContextualPrecisionMetric,
        ContextualRecallMetric]`` instances, in that order.
    """
    default_threshold = THRESHOLD_OVERRIDES["default"]
    metric_classes = (
        FaithfulnessMetric,
        ContextualPrecisionMetric,
        ContextualRecallMetric,
    )
    # Post-PR #2785: also pass threshold_overrides=THRESHOLD_OVERRIDES to each
    # constructor so the threshold is routed per test case.
    return [
        metric_cls(threshold=default_threshold, verbose_mode=True)
        for metric_cls in metric_classes
    ]
173+
174+
175+
# ---------------------------------------------------------------------------
176+
# Run evaluation
177+
# ---------------------------------------------------------------------------
178+
179+
def run_heterogeneous_eval():
    """Run the heterogeneous document evaluation and print a per-case report.

    Builds the sample test cases and metrics, hands them to ``evaluate``, and
    then prints, for every test result, its document type, the threshold that
    ``THRESHOLD_OVERRIDES`` assigns to that type, and one line per metric with
    its pass/fail status, score and the threshold the metric actually used.

    Comparing the two printed thresholds shows the point of the example:
    without per-type overrides every case is judged at the same threshold,
    although balance sheets are meant to be held to 0.95 and earnings calls
    to 0.70.

    Returns:
        The object returned by ``evaluate``; its ``test_results`` attribute is
        what the report iterates over.
    """
    test_cases = build_test_cases()
    metrics = build_metrics()

    # NOTE(review): for debugging, the original example suggested disabling
    # async execution (run_async=False) -- confirm the exact keyword against
    # the installed DeepEval version before adding it.
    results = evaluate(
        test_cases=test_cases,
        metrics=metrics,
    )

    print("\n--- Evaluation Results ---")
    for result in results.test_results:
        # Guard against a result without metadata (presumably None when the
        # test case was built without additional_metadata -- TODO confirm).
        metadata = result.additional_metadata or {}
        doc_type = metadata.get("document_type", "unknown")
        threshold_used = THRESHOLD_OVERRIDES.get(
            doc_type, THRESHOLD_OVERRIDES["default"]
        )
        print(f"\nDocument type: {doc_type}")
        print(f"Expected threshold: {threshold_used}")

        # metrics_data may be missing for a result; treat that as "no metrics".
        for metric_result in result.metrics_data or []:
            status = "✅" if metric_result.success else "❌"
            # A metric that errored may carry no score; formatting None with
            # ":.3f" would raise TypeError, so print a placeholder instead.
            score = metric_result.score
            score_text = "n/a" if score is None else f"{score:.3f}"
            print(
                f" {status} {metric_result.name}: "
                f"{score_text} "
                f"(threshold: {metric_result.threshold})"
            )

    return results
216+
217+
218+
if __name__ == "__main__":
    # Fail fast with a helpful message when the API key is unset or empty.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # EnvironmentError is an alias of OSError in Python 3.
        raise OSError(
            "OPENAI_API_KEY not set. Export your API key before running:\n"
            " export OPENAI_API_KEY=sk-..."
        )
    run_heterogeneous_eval()

0 commit comments

Comments
 (0)