Skip to content

Commit 4e233d9

Browse files
jioffe502claude
and committed
Add BEIR metrics support for recall evaluation
- Add optional BEIR evaluation (NDCG, MAP, Precision) to recall tests
- Configurable via enable_beir in test_configs.yaml or ENABLE_BEIR env var
- Add beir>=2.0.0 dependency to harness
- Add nvidia/llama-nemotron-embed-vl-1b-v2 to known embedding models

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ef12337 commit 4e233d9

File tree

6 files changed

+116
-22
lines changed

6 files changed

+116
-22
lines changed

tools/harness/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ description = "Add your description here"
55
readme = "README.md"
66
requires-python = ">=3.12"
77
dependencies = [
8+
"beir>=2.0.0",
89
"datasets>=2.0.0",
910
"docker>=7.1.0",
1011
"milvus-lite==2.4.12",

tools/harness/src/nv_ingest_harness/cases/recall.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def evaluate_recall_with_reranker(
1818
evaluation_params: Dict,
1919
use_reranker: bool,
2020
log_path: str = "test_results",
21-
) -> Tuple[Dict[int, float], float]:
21+
) -> Tuple[Dict, float]:
2222
"""
2323
Run recall evaluation with specified reranker setting.
2424
@@ -30,32 +30,51 @@ def evaluate_recall_with_reranker(
3030
log_path: Path for logging output
3131
3232
Returns:
33-
Tuple of (scores_dict, elapsed_time)
33+
Tuple of (results_dict, elapsed_time)
34+
results_dict may be {k: score} or {"recall": {...}, "beir": {...}} if BEIR enabled
3435
"""
3536
mode_str = "with reranker" if use_reranker else "without reranker"
3637
print("\n" + "=" * 60)
3738
print(f"Running Recall Evaluation ({mode_str})")
3839
print("=" * 60)
3940

4041
eval_start = time.time()
41-
scores = evaluator(
42+
results = evaluator(
4243
collection_name=collection_name,
4344
nv_ranker=use_reranker,
4445
**evaluation_params,
4546
)
4647
eval_time = time.time() - eval_start
4748

48-
# Log results
49+
# Handle both old format {k: score} and new format {"recall": {...}, "beir": {...}}
50+
if isinstance(results, dict) and "recall" in results:
51+
recall_scores = results["recall"]
52+
beir_metrics = results.get("beir")
53+
else:
54+
recall_scores = results
55+
beir_metrics = None
56+
57+
# Log recall results
58+
reranker_suffix = "with" if use_reranker else "no"
4959
print(f"\nMultimodal Recall ({mode_str}):")
50-
for k in sorted(scores.keys()):
51-
score = scores[k]
60+
for k in sorted(recall_scores.keys()):
61+
score = recall_scores[k]
5262
print(f" - Recall @{k}: {score:.3f}")
53-
reranker_suffix = "with" if use_reranker else "no"
5463
kv_event_log(f"recall_multimodal_@{k}_{reranker_suffix}_reranker", score, log_path)
5564

56-
kv_event_log(f"recall_eval_time_s_{'with' if use_reranker else 'no'}_reranker", eval_time, log_path)
65+
# Log BEIR metrics if available
66+
if beir_metrics:
67+
print(f"\nBEIR Metrics ({mode_str}):")
68+
for metric_name, values in beir_metrics.items():
69+
for k_str, score in values.items():
70+
print(f" - {k_str}: {score:.5f}")
71+
# Log with format: ndcg_10_no_reranker
72+
k_num = k_str.split("@")[1] if "@" in k_str else k_str
73+
kv_event_log(f"{metric_name}_{k_num}_{reranker_suffix}_reranker", score, log_path)
74+
75+
kv_event_log(f"recall_eval_time_s_{reranker_suffix}_reranker", eval_time, log_path)
5776

58-
return scores, eval_time
77+
return results, eval_time
5978

6079

6180
def main(config=None, log_path: str = "test_results") -> int:
@@ -141,6 +160,7 @@ def main(config=None, log_path: str = "test_results") -> int:
141160
"vdb_backend": vdb_backend,
142161
"nv_ranker_endpoint": f"http://{hostname}:8020/v1/ranking",
143162
"nv_ranker_model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
163+
"enable_beir": config.enable_beir,
144164
}
145165
if vdb_backend == "lancedb":
146166
evaluation_params["table_path"] = lancedb_path

tools/harness/src/nv_ingest_harness/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ class TestConfig:
9898
recall_top_k: int = 10
9999
ground_truth_dir: Optional[str] = None
100100
recall_dataset: Optional[str] = None
101+
enable_beir: bool = False # Enable BEIR metrics (NDCG, MAP, Precision)
101102

102103
def validate(self) -> List[str]:
103104
"""Validate configuration and return list of errors"""
@@ -350,6 +351,7 @@ def parse_list(value: str) -> List[str]:
350351
"RECALL_TOP_K": ("recall_top_k", parse_int),
351352
"GROUND_TRUTH_DIR": ("ground_truth_dir", str),
352353
"RECALL_DATASET": ("recall_dataset", str),
354+
"ENABLE_BEIR": ("enable_beir", parse_bool),
353355
}
354356

355357
overrides = {}

tools/harness/src/nv_ingest_harness/utils/interact.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def embed_info(
4949
"nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1": 2048,
5050
"nvidia/llama-3.2-nv-embedqa-1b-v2": 2048,
5151
"nvidia/llama-3.2-nemoretriever-300m-embed-v1": 2048,
52+
"nvidia/llama-nemotron-embed-vl-1b-v2": 2048,
5253
"nvidia/nv-embedqa-e5-v5": 1024,
5354
}
5455

tools/harness/src/nv_ingest_harness/utils/recall.py

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,67 @@
1010
import pandas as pd
1111
from collections import defaultdict
1212
from functools import partial
13-
from typing import Dict, Optional, Callable
13+
from typing import Dict, List, Optional, Callable, Any, Union
1414

1515
from nv_ingest_client.util.milvus import nvingest_retrieval
1616

1717
from nv_ingest_harness.utils.cases import get_repo_root
1818

1919

20+
def _compute_beir_metrics(
21+
all_retrieved: List[List[Dict]],
22+
query_df: pd.DataFrame,
23+
k_values: List[int] = [1, 5, 10],
24+
) -> Optional[Dict[str, Dict[str, float]]]:
25+
"""
26+
Compute BEIR metrics from retrieval results.
27+
28+
Args:
29+
all_retrieved: List of retrieval results per query. Each result is a list of
30+
dicts with 'entity' containing source info.
31+
query_df: DataFrame with 'query' and 'expected_pdf' columns, optionally 'query_id'.
32+
k_values: Cutoff values for evaluation (default [1, 5, 10]).
33+
34+
Returns:
35+
Dict with keys 'ndcg', 'map', 'recall', 'precision', each containing
36+
metric values like {'NDCG@1': 0.17, 'NDCG@5': 0.35, ...}, or None if BEIR unavailable.
37+
"""
38+
try:
39+
from beir.retrieval.evaluation import EvaluateRetrieval
40+
except ImportError:
41+
return None
42+
43+
# Build results dict: {query_id: {doc_id: score}}
44+
results = {}
45+
for idx, answers in enumerate(all_retrieved):
46+
if "query_id" in query_df.columns:
47+
query_id = str(query_df.iloc[idx]["query_id"])
48+
else:
49+
query_id = str(idx)
50+
51+
results[query_id] = {}
52+
num_results = len(answers)
53+
for rank, r in enumerate(answers):
54+
source_id = r.get("entity", {}).get("source", {}).get("source_id", "")
55+
doc_id = os.path.basename(source_id).split(".")[0]
56+
score = (num_results - rank) / num_results if num_results > 0 else 0
57+
results[query_id][doc_id] = score
58+
59+
# Build qrels dict: {query_id: {doc_id: relevance}}
60+
qrels = {}
61+
for idx, row in query_df.iterrows():
62+
if "query_id" in query_df.columns:
63+
query_id = str(row["query_id"])
64+
else:
65+
query_id = str(idx)
66+
qrels[query_id] = {str(row["expected_pdf"]): 1}
67+
68+
# Evaluate
69+
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values, ignore_identical_ids=True)
70+
71+
return {"ndcg": ndcg, "map": _map, "recall": recall, "precision": precision}
72+
73+
2074
def _get_retrieval_func(
2175
vdb_backend: str,
2276
table_path: Optional[str] = None,
@@ -176,7 +230,8 @@ def get_recall_scores_pdf_only(
176230
batch_size: int = 100,
177231
vdb_backend: str = "milvus",
178232
table_path: Optional[str] = None,
179-
) -> Dict[int, float]:
233+
enable_beir: bool = False,
234+
) -> Union[Dict[int, float], Dict[str, Any]]:
180235
"""
181236
Calculate recall@k scores for queries against a VDB collection using PDF-only matching.
182237
@@ -199,11 +254,14 @@ def get_recall_scores_pdf_only(
199254
batch_size: Number of queries to process per batch (prevents gRPC size limit errors).
200255
vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus".
201256
table_path: Path to LanceDB database directory (required if vdb_backend="lancedb").
257+
enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision).
202258
203259
Returns:
204-
Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0).
260+
If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores.
261+
If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics.
205262
"""
206263
hits = defaultdict(list)
264+
all_retrieved = [] # Collect for BEIR computation
207265

208266
reranker_kwargs = {}
209267
if nv_ranker:
@@ -262,6 +320,10 @@ def get_recall_scores_pdf_only(
262320
**reranker_kwargs,
263321
)
264322

323+
# Collect results for BEIR if enabled
324+
if enable_beir:
325+
all_retrieved.extend(batch_answers)
326+
265327
for expected_pdf, retrieved_answers in zip(batch_expected_pdfs, batch_answers):
266328
# Extract PDF names only (no page numbers)
267329
retrieved_pdfs = [
@@ -276,6 +338,11 @@ def get_recall_scores_pdf_only(
276338

277339
recall_scores = {k: np.mean(hits[k]) for k in hits if len(hits[k]) > 0}
278340

341+
# Compute BEIR metrics if enabled
342+
if enable_beir:
343+
beir_metrics = _compute_beir_metrics(all_retrieved, query_df, k_values=[1, 5, 10])
344+
return {"recall": recall_scores, "beir": beir_metrics}
345+
279346
return recall_scores
280347

281348

@@ -781,7 +848,8 @@ def vidore_recall(
781848
nv_ranker_model_name: Optional[str] = None,
782849
vdb_backend: str = "milvus",
783850
table_path: Optional[str] = None,
784-
) -> Dict[int, float]:
851+
enable_beir: bool = False,
852+
) -> Union[Dict[int, float], Dict[str, Any]]:
785853
"""
786854
Evaluate recall@k for Vidore V3 dataset using PDF-only matching.
787855
@@ -803,9 +871,11 @@ def vidore_recall(
803871
nv_ranker_model_name: Optional custom reranker model name.
804872
vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus".
805873
table_path: Path to LanceDB database directory (required if vdb_backend="lancedb").
874+
enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision).
806875
807876
Returns:
808-
Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0).
877+
If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores.
878+
If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics.
809879
"""
810880
loader = partial(
811881
vidore_load_ground_truth,
@@ -829,6 +899,7 @@ def vidore_recall(
829899
nv_ranker_model_name=nv_ranker_model_name,
830900
vdb_backend=vdb_backend,
831901
table_path=table_path,
902+
enable_beir=enable_beir,
832903
)
833904

834905

tools/harness/test_configs.yaml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ active:
8282
enable_image_storage: false # Enable server-side image storage (defaults to MinIO; set IMAGE_STORAGE_URI=file://... to opt into disk)
8383

8484
# Storage configuration
85-
spill_dir: /raid/jioffe/tmp/spill
85+
spill_dir: /tmp/spill
8686
artifacts_dir: null # null = use default (tools/harness/artifacts)
8787
collection_name: null # null = auto-generated
8888

@@ -99,13 +99,14 @@ recall:
9999
# Recall evaluation settings
100100
recall_top_k: 10
101101
ground_truth_dir: null
102+
enable_beir: false # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package
102103

103104
# Pre-configured datasets
104105
# Each dataset includes path, extraction settings, and recall evaluator
105106
# Use: uv run nv-ingest-harness-run --case=e2e --dataset=bo767
106107
datasets:
107108
bo767:
108-
path: /raid/jioffe/bo767
109+
path: /path/to/bo767
109110
extract_text: true
110111
extract_tables: true
111112
extract_charts: true
@@ -114,7 +115,7 @@ datasets:
114115
recall_dataset: bo767
115116

116117
earnings:
117-
path: /raid/jioffe/earnings_consulting
118+
path: /path/to/earnings_consulting
118119
extract_text: true
119120
extract_tables: true
120121
extract_charts: true
@@ -123,7 +124,7 @@ datasets:
123124
recall_dataset: earnings
124125

125126
bo20:
126-
path: /raid/jioffe/bo20
127+
path: /path/to/bo20
127128
extract_text: true
128129
extract_tables: true
129130
extract_charts: true
@@ -132,7 +133,7 @@ datasets:
132133
recall_dataset: null
133134

134135
financebench:
135-
path: /raid/jioffe/financebench
136+
path: /path/to/financebench
136137
extract_text: true
137138
extract_tables: true
138139
extract_charts: true
@@ -281,15 +282,13 @@ dataset_groups:
281282
- vidore_v3_physics
282283
- vidore_v3_finance_fr
283284

284-
# Vidore English-only (excludes finance_fr)
285+
# Vidore English-only
285286
vidore_english:
286287
- vidore_v3_finance_en
287288
- vidore_v3_industrial
288289
- vidore_v3_computer_science
289290
- vidore_v3_pharmaceuticals
290291
- vidore_v3_hr
291-
- vidore_v3_energy
292-
- vidore_v3_physics
293292

294293
# Vidore quick test (smallest datasets)
295294
vidore_quick:

0 commit comments

Comments (0)