Skip to content

Commit 4e233d9

Browse files
jioffe502claude
and committed
Add BEIR metrics support for recall evaluation
- Add optional BEIR evaluation (NDCG, MAP, Precision) to recall tests
- Configurable via enable_beir in test_configs.yaml or ENABLE_BEIR env var
- Add beir>=2.0.0 dependency to harness
- Add nvidia/llama-nemotron-embed-vl-1b-v2 to known embedding models

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ef12337 commit 4e233d9

File tree

6 files changed

+116
-22
lines changed

6 files changed

+116
-22
lines changed

tools/harness/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ description = "Add your description here"
55
readme = "README.md"
66
requires-python = ">=3.12"
77
dependencies = [
8+
"beir>=2.0.0",
89
"datasets>=2.0.0",
910
"docker>=7.1.0",
1011
"milvus-lite==2.4.12",

tools/harness/src/nv_ingest_harness/cases/recall.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def evaluate_recall_with_reranker(
1818
evaluation_params: Dict,
1919
use_reranker: bool,
2020
log_path: str = "test_results",
21-
) -> Tuple[Dict[int, float], float]:
21+
) -> Tuple[Dict, float]:
2222
"""
2323
Run recall evaluation with specified reranker setting.
2424
@@ -30,32 +30,51 @@ def evaluate_recall_with_reranker(
3030
log_path: Path for logging output
3131
3232
Returns:
33-
Tuple of (scores_dict, elapsed_time)
33+
Tuple of (results_dict, elapsed_time)
34+
results_dict may be {k: score} or {"recall": {...}, "beir": {...}} if BEIR enabled
3435
"""
3536
mode_str = "with reranker" if use_reranker else "without reranker"
3637
print("\n" + "=" * 60)
3738
print(f"Running Recall Evaluation ({mode_str})")
3839
print("=" * 60)
3940

4041
eval_start = time.time()
41-
scores = evaluator(
42+
results = evaluator(
4243
collection_name=collection_name,
4344
nv_ranker=use_reranker,
4445
**evaluation_params,
4546
)
4647
eval_time = time.time() - eval_start
4748

48-
# Log results
49+
# Handle both old format {k: score} and new format {"recall": {...}, "beir": {...}}
50+
if isinstance(results, dict) and "recall" in results:
51+
recall_scores = results["recall"]
52+
beir_metrics = results.get("beir")
53+
else:
54+
recall_scores = results
55+
beir_metrics = None
56+
57+
# Log recall results
58+
reranker_suffix = "with" if use_reranker else "no"
4959
print(f"\nMultimodal Recall ({mode_str}):")
50-
for k in sorted(scores.keys()):
51-
score = scores[k]
60+
for k in sorted(recall_scores.keys()):
61+
score = recall_scores[k]
5262
print(f" - Recall @{k}: {score:.3f}")
53-
reranker_suffix = "with" if use_reranker else "no"
5463
kv_event_log(f"recall_multimodal_@{k}_{reranker_suffix}_reranker", score, log_path)
5564

56-
kv_event_log(f"recall_eval_time_s_{'with' if use_reranker else 'no'}_reranker", eval_time, log_path)
65+
# Log BEIR metrics if available
66+
if beir_metrics:
67+
print(f"\nBEIR Metrics ({mode_str}):")
68+
for metric_name, values in beir_metrics.items():
69+
for k_str, score in values.items():
70+
print(f" - {k_str}: {score:.5f}")
71+
# Log with format: ndcg_10_no_reranker
72+
k_num = k_str.split("@")[1] if "@" in k_str else k_str
73+
kv_event_log(f"{metric_name}_{k_num}_{reranker_suffix}_reranker", score, log_path)
74+
75+
kv_event_log(f"recall_eval_time_s_{reranker_suffix}_reranker", eval_time, log_path)
5776

58-
return scores, eval_time
77+
return results, eval_time
5978

6079

6180
def main(config=None, log_path: str = "test_results") -> int:
@@ -141,6 +160,7 @@ def main(config=None, log_path: str = "test_results") -> int:
141160
"vdb_backend": vdb_backend,
142161
"nv_ranker_endpoint": f"http://{hostname}:8020/v1/ranking",
143162
"nv_ranker_model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
163+
"enable_beir": config.enable_beir,
144164
}
145165
if vdb_backend == "lancedb":
146166
evaluation_params["table_path"] = lancedb_path

tools/harness/src/nv_ingest_harness/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ class TestConfig:
9898
recall_top_k: int = 10
9999
ground_truth_dir: Optional[str] = None
100100
recall_dataset: Optional[str] = None
101+
enable_beir: bool = False # Enable BEIR metrics (NDCG, MAP, Precision)
101102

102103
def validate(self) -> List[str]:
103104
"""Validate configuration and return list of errors"""
@@ -350,6 +351,7 @@ def parse_list(value: str) -> List[str]:
350351
"RECALL_TOP_K": ("recall_top_k", parse_int),
351352
"GROUND_TRUTH_DIR": ("ground_truth_dir", str),
352353
"RECALL_DATASET": ("recall_dataset", str),
354+
"ENABLE_BEIR": ("enable_beir", parse_bool),
353355
}
354356

355357
overrides = {}

tools/harness/src/nv_ingest_harness/utils/interact.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def embed_info(
4949
"nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1": 2048,
5050
"nvidia/llama-3.2-nv-embedqa-1b-v2": 2048,
5151
"nvidia/llama-3.2-nemoretriever-300m-embed-v1": 2048,
52+
"nvidia/llama-nemotron-embed-vl-1b-v2": 2048,
5253
"nvidia/nv-embedqa-e5-v5": 1024,
5354
}
5455

tools/harness/src/nv_ingest_harness/utils/recall.py

Lines changed: 76 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,67 @@
1010
import pandas as pd
1111
from collections import defaultdict
1212
from functools import partial
13-
from typing import Dict, Optional, Callable
13+
from typing import Dict, List, Optional, Callable, Any, Union
1414

1515
from nv_ingest_client.util.milvus import nvingest_retrieval
1616

1717
from nv_ingest_harness.utils.cases import get_repo_root
1818

1919

20+
def _compute_beir_metrics(
21+
all_retrieved: List[List[Dict]],
22+
query_df: pd.DataFrame,
23+
k_values: List[int] = [1, 5, 10],
24+
) -> Optional[Dict[str, Dict[str, float]]]:
25+
"""
26+
Compute BEIR metrics from retrieval results.
27+
28+
Args:
29+
all_retrieved: List of retrieval results per query. Each result is a list of
30+
dicts with 'entity' containing source info.
31+
query_df: DataFrame with 'query' and 'expected_pdf' columns, optionally 'query_id'.
32+
k_values: Cutoff values for evaluation (default [1, 5, 10]).
33+
34+
Returns:
35+
Dict with keys 'ndcg', 'map', 'recall', 'precision', each containing
36+
metric values like {'NDCG@1': 0.17, 'NDCG@5': 0.35, ...}, or None if BEIR unavailable.
37+
"""
38+
try:
39+
from beir.retrieval.evaluation import EvaluateRetrieval
40+
except ImportError:
41+
return None
42+
43+
# Build results dict: {query_id: {doc_id: score}}
44+
results = {}
45+
for idx, answers in enumerate(all_retrieved):
46+
if "query_id" in query_df.columns:
47+
query_id = str(query_df.iloc[idx]["query_id"])
48+
else:
49+
query_id = str(idx)
50+
51+
results[query_id] = {}
52+
num_results = len(answers)
53+
for rank, r in enumerate(answers):
54+
source_id = r.get("entity", {}).get("source", {}).get("source_id", "")
55+
doc_id = os.path.basename(source_id).split(".")[0]
56+
score = (num_results - rank) / num_results if num_results > 0 else 0
57+
results[query_id][doc_id] = score
58+
59+
# Build qrels dict: {query_id: {doc_id: relevance}}
60+
qrels = {}
61+
for idx, row in query_df.iterrows():
62+
if "query_id" in query_df.columns:
63+
query_id = str(row["query_id"])
64+
else:
65+
query_id = str(idx)
66+
qrels[query_id] = {str(row["expected_pdf"]): 1}
67+
68+
# Evaluate
69+
ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values, ignore_identical_ids=True)
70+
71+
return {"ndcg": ndcg, "map": _map, "recall": recall, "precision": precision}
72+
73+
2074
def _get_retrieval_func(
2175
vdb_backend: str,
2276
table_path: Optional[str] = None,
@@ -176,7 +230,8 @@ def get_recall_scores_pdf_only(
176230
batch_size: int = 100,
177231
vdb_backend: str = "milvus",
178232
table_path: Optional[str] = None,
179-
) -> Dict[int, float]:
233+
enable_beir: bool = False,
234+
) -> Union[Dict[int, float], Dict[str, Any]]:
180235
"""
181236
Calculate recall@k scores for queries against a VDB collection using PDF-only matching.
182237
@@ -199,11 +254,14 @@ def get_recall_scores_pdf_only(
199254
batch_size: Number of queries to process per batch (prevents gRPC size limit errors).
200255
vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus".
201256
table_path: Path to LanceDB database directory (required if vdb_backend="lancedb").
257+
enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision).
202258
203259
Returns:
204-
Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0).
260+
If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores.
261+
If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics.
205262
"""
206263
hits = defaultdict(list)
264+
all_retrieved = [] # Collect for BEIR computation
207265

208266
reranker_kwargs = {}
209267
if nv_ranker:
@@ -262,6 +320,10 @@ def get_recall_scores_pdf_only(
262320
**reranker_kwargs,
263321
)
264322

323+
# Collect results for BEIR if enabled
324+
if enable_beir:
325+
all_retrieved.extend(batch_answers)
326+
265327
for expected_pdf, retrieved_answers in zip(batch_expected_pdfs, batch_answers):
266328
# Extract PDF names only (no page numbers)
267329
retrieved_pdfs = [
@@ -276,6 +338,11 @@ def get_recall_scores_pdf_only(
276338

277339
recall_scores = {k: np.mean(hits[k]) for k in hits if len(hits[k]) > 0}
278340

341+
# Compute BEIR metrics if enabled
342+
if enable_beir:
343+
beir_metrics = _compute_beir_metrics(all_retrieved, query_df, k_values=[1, 5, 10])
344+
return {"recall": recall_scores, "beir": beir_metrics}
345+
279346
return recall_scores
280347

281348

@@ -781,7 +848,8 @@ def vidore_recall(
781848
nv_ranker_model_name: Optional[str] = None,
782849
vdb_backend: str = "milvus",
783850
table_path: Optional[str] = None,
784-
) -> Dict[int, float]:
851+
enable_beir: bool = False,
852+
) -> Union[Dict[int, float], Dict[str, Any]]:
785853
"""
786854
Evaluate recall@k for Vidore V3 dataset using PDF-only matching.
787855
@@ -803,9 +871,11 @@ def vidore_recall(
803871
nv_ranker_model_name: Optional custom reranker model name.
804872
vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus".
805873
table_path: Path to LanceDB database directory (required if vdb_backend="lancedb").
874+
enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision).
806875
807876
Returns:
808-
Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0).
877+
If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores.
878+
If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics.
809879
"""
810880
loader = partial(
811881
vidore_load_ground_truth,
@@ -829,6 +899,7 @@ def vidore_recall(
829899
nv_ranker_model_name=nv_ranker_model_name,
830900
vdb_backend=vdb_backend,
831901
table_path=table_path,
902+
enable_beir=enable_beir,
832903
)
833904

834905

tools/harness/test_configs.yaml

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ active:
8282
enable_image_storage: false # Enable server-side image storage (defaults to MinIO; set IMAGE_STORAGE_URI=file://... to opt into disk)
8383

8484
# Storage configuration
85-
spill_dir: /raid/jioffe/tmp/spill
85+
spill_dir: /tmp/spill
8686
artifacts_dir: null # null = use default (tools/harness/artifacts)
8787
collection_name: null # null = auto-generated
8888

@@ -99,13 +99,14 @@ recall:
9999
# Recall evaluation settings
100100
recall_top_k: 10
101101
ground_truth_dir: null
102+
enable_beir: false # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package
102103

103104
# Pre-configured datasets
104105
# Each dataset includes path, extraction settings, and recall evaluator
105106
# Use: uv run nv-ingest-harness-run --case=e2e --dataset=bo767
106107
datasets:
107108
bo767:
108-
path: /raid/jioffe/bo767
109+
path: /path/to/bo767
109110
extract_text: true
110111
extract_tables: true
111112
extract_charts: true
@@ -114,7 +115,7 @@ datasets:
114115
recall_dataset: bo767
115116

116117
earnings:
117-
path: /raid/jioffe/earnings_consulting
118+
path: /path/to/earnings_consulting
118119
extract_text: true
119120
extract_tables: true
120121
extract_charts: true
@@ -123,7 +124,7 @@ datasets:
123124
recall_dataset: earnings
124125

125126
bo20:
126-
path: /raid/jioffe/bo20
127+
path: /path/to/bo20
127128
extract_text: true
128129
extract_tables: true
129130
extract_charts: true
@@ -132,7 +133,7 @@ datasets:
132133
recall_dataset: null
133134

134135
financebench:
135-
path: /raid/jioffe/financebench
136+
path: /path/to/financebench
136137
extract_text: true
137138
extract_tables: true
138139
extract_charts: true
@@ -281,15 +282,13 @@ dataset_groups:
281282
- vidore_v3_physics
282283
- vidore_v3_finance_fr
283284

284-
# Vidore English-only (excludes finance_fr)
285+
# Vidore English-only
285286
vidore_english:
286287
- vidore_v3_finance_en
287288
- vidore_v3_industrial
288289
- vidore_v3_computer_science
289290
- vidore_v3_pharmaceuticals
290291
- vidore_v3_hr
291-
- vidore_v3_energy
292-
- vidore_v3_physics
293292

294293
# Vidore quick test (smallest datasets)
295294
vidore_quick:

0 commit comments

Comments (0)