diff --git a/tools/harness/README.md b/tools/harness/README.md
index 1edfbd237..efa78684b 100644
--- a/tools/harness/README.md
+++ b/tools/harness/README.md
@@ -143,6 +143,37 @@ uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20
 
 # Custom path still works (uses active section config)
 uv run nv-ingest-harness-run --case=e2e --dataset=/custom/path
+
+# List available datasets and groups
+uv run nv-ingest-harness-run --list-datasets
+```
+
+#### Dataset Groups
+
+Run multiple related datasets with a single command using dataset groups:
+
+```yaml
+# In test_configs.yaml
+dataset_groups:
+  vidore:          # All 8 Vidore V3 benchmark datasets
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - ...
+  vidore_quick:    # Quick test with the smallest datasets
+    - vidore_v3_hr
+    - vidore_v3_industrial
+```
+
+**Usage:**
+```bash
+# Run all Vidore datasets
+uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore
+
+# Run a quick test (smallest 2 datasets)
+uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_quick
+
+# Mix groups and individual datasets
+uv run nv-ingest-harness-run --case=e2e --dataset=vidore_quick,bo20
 ```
 
 **Dataset Extraction Settings:**
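Group expansion is order-preserving and de-duplicated, so overlapping inputs are safe. A hedged illustration of the intended behavior, per the `expand_dataset_names` helper added in `config.py` below (each dataset still runs exactly once):

```bash
# A group plus one of its own members: the duplicate is dropped during expansion
uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_quick,vidore_v3_industrial
```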
diff --git a/tools/harness/pyproject.toml b/tools/harness/pyproject.toml
index 6e2be71fb..7e5de7a4c 100644
--- a/tools/harness/pyproject.toml
+++ b/tools/harness/pyproject.toml
@@ -5,7 +5,9 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "beir>=2.0.0",
     "click>=8.1.8",
+    "datasets>=2.0.0",
     "docker>=7.1.0",
     "pyyaml>=6.0",
     "requests>=2.31.0",
diff --git a/tools/harness/src/nv_ingest_harness/cases/e2e.py b/tools/harness/src/nv_ingest_harness/cases/e2e.py
index 8e2eb4e4d..ee8e9337a 100644
--- a/tools/harness/src/nv_ingest_harness/cases/e2e.py
+++ b/tools/harness/src/nv_ingest_harness/cases/e2e.py
@@ -66,8 +66,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     extract_charts = config.extract_charts
     extract_images = config.extract_images
     extract_infographics = config.extract_infographics
+    extract_page_as_image = config.extract_page_as_image
+    extract_method = config.extract_method
     text_depth = config.text_depth
     table_output_format = config.table_output_format
+    image_elements_modality = config.image_elements_modality
 
     # Optional pipeline steps
     enable_caption = config.enable_caption
@@ -80,6 +83,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     model_name, dense_dim = embed_info()
 
+    # Deployment fingerprint - detect silent fallback to wrong model
+    if dense_dim == 1024:
+        print("WARNING: Embedding model returned dim=1024 (nv-embedqa-e5-v5 fallback)")
+        print("WARNING: Expected dim=2048 for multimodal embed. Check embedding NIM status.")
+
     # Log configuration for transparency
     print("=== Test Configuration ===")
     print(f"Dataset: {data_dir}")
@@ -155,15 +163,20 @@ def main(config=None, log_path: str = "test_results") -> int:
         ingestor = ingestor.pdf_split_config(pages_per_chunk=pdf_split_page_count)
 
     # Extraction step
-    ingestor = ingestor.extract(
-        extract_text=extract_text,
-        extract_tables=extract_tables,
-        extract_charts=extract_charts,
-        extract_images=extract_images,
-        text_depth=text_depth,
-        table_output_format=table_output_format,
-        extract_infographics=extract_infographics,
-    )
+    extract_kwargs = {
+        "extract_text": extract_text,
+        "extract_tables": extract_tables,
+        "extract_charts": extract_charts,
+        "extract_images": extract_images,
+        "text_depth": text_depth,
+        "table_output_format": table_output_format,
+        "extract_infographics": extract_infographics,
+    }
+    if extract_page_as_image:
+        extract_kwargs["extract_page_as_image"] = True
+    if extract_method:
+        extract_kwargs["extract_method"] = extract_method
+    ingestor = ingestor.extract(**extract_kwargs)
 
     # Optional pipeline steps
     if enable_caption:
@@ -181,7 +194,10 @@ def main(config=None, log_path: str = "test_results") -> int:
         )
 
     # Embed (must come before storage per pipeline ordering)
-    ingestor = ingestor.embed(model_name=model_name)
+    embed_kwargs = {"model_name": model_name}
+    if image_elements_modality:
+        embed_kwargs["image_elements_modality"] = image_elements_modality
+    ingestor = ingestor.embed(**embed_kwargs)
 
     # Store images to disk (server-side image storage) - optional
     # Note: Supports both MinIO (s3://) and local disk (file://) via storage_uri
@@ -245,6 +261,24 @@ def main(config=None, log_path: str = "test_results") -> int:
     # Optional: log chunk stats and per-type breakdown
     if vdb_backend != "lancedb":
         milvus_chunks(f"http://{hostname}:19530", collection_name)
+        # Verify collection vector dimension matches expected
+        try:
+            from pymilvus import MilvusClient
+
+            mc = MilvusClient(uri=f"http://{hostname}:19530")
+            col_info = mc.describe_collection(collection_name)
+            for field in col_info.get("fields", []):
+                params = field.get("params", {})
+                if "dim" in params:
+                    actual_dim = int(params["dim"])
+                    if actual_dim != dense_dim:
+                        print(f"WARNING: Collection vector dim={actual_dim} != expected dim={dense_dim}")
+                        print("WARNING: Collection may have been created with a different embedding model")
+                    else:
+                        print(f"Collection vector dim={actual_dim} matches expected dim={dense_dim}")
+            mc.close()
+        except Exception as e:
+            print(f"Could not verify collection schema: {e}")
 
     text_results, table_results, chart_results = segment_results(results)
     kv_event_log("text_chunks", sum(len(x) for x in text_results), log_path)
     kv_event_log("table_chunks", sum(len(x) for x in table_results), log_path)
diff --git a/tools/harness/src/nv_ingest_harness/cases/recall.py b/tools/harness/src/nv_ingest_harness/cases/recall.py
index a92f93ab2..a16959d5f 100644
--- a/tools/harness/src/nv_ingest_harness/cases/recall.py
+++ b/tools/harness/src/nv_ingest_harness/cases/recall.py
@@ -18,7 +18,7 @@ def evaluate_recall_with_reranker(
     evaluation_params: Dict,
     use_reranker: bool,
     log_path: str = "test_results",
-) -> Tuple[Dict[int, float], float]:
+) -> Tuple[Dict, float]:
     """
     Run recall evaluation with specified reranker setting.
 
@@ -30,7 +30,8 @@ def evaluate_recall_with_reranker(
         log_path: Path for logging output
 
     Returns:
-        Tuple of (scores_dict, elapsed_time)
+        Tuple of (results_dict, elapsed_time)
+        results_dict may be {k: score} or {"recall": {...}, "beir": {...}} if BEIR is enabled
     """
     mode_str = "with reranker" if use_reranker else "without reranker"
     print("\n" + "=" * 60)
@@ -38,24 +39,42 @@ def evaluate_recall_with_reranker(
     print("=" * 60)
 
     eval_start = time.time()
-    scores = evaluator(
+    results = evaluator(
         collection_name=collection_name,
        nv_ranker=use_reranker,
         **evaluation_params,
     )
    eval_time = time.time() - eval_start
 
-    # Log results
+    # Handle both old format {k: score} and new format {"recall": {...}, "beir": {...}}
+    if isinstance(results, dict) and "recall" in results:
+        recall_scores = results["recall"]
+        beir_metrics = results.get("beir")
+    else:
+        recall_scores = results
+        beir_metrics = None
+
+    # Log recall results
+    reranker_suffix = "with" if use_reranker else "no"
     print(f"\nMultimodal Recall ({mode_str}):")
-    for k in sorted(scores.keys()):
-        score = scores[k]
+    for k in sorted(recall_scores.keys()):
+        score = recall_scores[k]
         print(f"  - Recall @{k}: {score:.3f}")
-        reranker_suffix = "with" if use_reranker else "no"
         kv_event_log(f"recall_multimodal_@{k}_{reranker_suffix}_reranker", score, log_path)
-    kv_event_log(f"recall_eval_time_s_{'with' if use_reranker else 'no'}_reranker", eval_time, log_path)
 
+    # Log BEIR metrics if available
+    if beir_metrics:
+        print(f"\nBEIR Metrics ({mode_str}):")
+        for metric_name, values in beir_metrics.items():
+            for k_str, score in values.items():
+                print(f"  - {k_str}: {score:.5f}")
+                # Log with format: ndcg_10_no_reranker
+                k_num = k_str.split("@")[1] if "@" in k_str else k_str
+                kv_event_log(f"{metric_name}_{k_num}_{reranker_suffix}_reranker", score, log_path)
+
+    kv_event_log(f"recall_eval_time_s_{reranker_suffix}_reranker", eval_time, log_path)
 
-    return scores, eval_time
+    return results, eval_time
 
 
 def main(config=None, log_path: str = "test_results") -> int:
@@ -69,6 +88,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     gpu_search = config.gpu_search
     model_name, dense_dim = embed_info()
 
+    # Deployment fingerprint - detect silent fallback to wrong model
+    if dense_dim == 1024:
+        print("WARNING: Embedding model returned dim=1024 (nv-embedqa-e5-v5 fallback)")
+        print("WARNING: Expected dim=2048 for multimodal embed. Check embedding NIM status.")
+
     # Recall-specific configuration with defaults
     reranker_mode = getattr(config, "reranker_mode", "none")
     recall_top_k = getattr(config, "recall_top_k", 10)
@@ -126,6 +150,27 @@ def main(config=None, log_path: str = "test_results") -> int:
     if lancedb_path:
         print(f"Using LanceDB at: {lancedb_path}")
 
+    # Verify collection schema if using Milvus
+    if vdb_backend == "milvus":
+        try:
+            from pymilvus import MilvusClient
+
+            verify_uri = f"http://{hostname}:19530"
+            mc = MilvusClient(uri=verify_uri)
+            col_info = mc.describe_collection(collection_name)
+            for field in col_info.get("fields", []):
+                params = field.get("params", {})
+                if "dim" in params:
+                    actual_dim = int(params["dim"])
+                    if actual_dim != dense_dim:
+                        print(f"WARNING: Collection vector dim={actual_dim} != embed model dim={dense_dim}")
+                        print("WARNING: Collection may have been created with a different embedding model")
+                    else:
+                        print(f"Collection vector dim={actual_dim} matches embed model dim={dense_dim}")
+            mc.close()
+        except Exception as e:
+            print(f"Could not verify collection schema: {e}")
+
     try:
         recall_results = {}
 
@@ -141,7 +186,11 @@ def main(config=None, log_path: str = "test_results") -> int:
             "vdb_backend": vdb_backend,
             "nv_ranker_endpoint": f"http://{hostname}:8020/v1/ranking",
             "nv_ranker_model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+            "enable_beir": config.enable_beir,
         }
+        language_filter = getattr(config, "language_filter", None)
+        if language_filter and recall_dataset.startswith("vidore_"):
+            evaluation_params["language_filter"] = language_filter
 
         if vdb_backend == "lancedb":
             evaluation_params["table_path"] = lancedb_path
diff --git a/tools/harness/src/nv_ingest_harness/cli/nightly.py b/tools/harness/src/nv_ingest_harness/cli/nightly.py
index 68a41f9e1..513295ae9 100644
--- a/tools/harness/src/nv_ingest_harness/cli/nightly.py
+++ b/tools/harness/src/nv_ingest_harness/cli/nightly.py
@@ -6,6 +6,7 @@
 import os
 import subprocess
 import sys
+import time
 from pathlib import Path
 from typing import Any
 
@@ -348,7 +349,9 @@ def main(
         service_manager.stop()
         return 1
 
-    print("Services ready!")
+    # Warm-up: let services stabilize and connect before running tests
+    print("Services ready! Sleeping 60s for warm-up...")
+    time.sleep(60)
 
     all_results = []
diff --git a/tools/harness/src/nv_ingest_harness/cli/run.py b/tools/harness/src/nv_ingest_harness/cli/run.py
index 6dc4b155d..73f2e443d 100644
--- a/tools/harness/src/nv_ingest_harness/cli/run.py
+++ b/tools/harness/src/nv_ingest_harness/cli/run.py
@@ -1,6 +1,7 @@
 import json
 import os
 import sys
+import time
 from pathlib import Path
 
 import click
@@ -69,6 +70,10 @@ def run_datasets(
             service_manager.stop()
             return 1
 
+    # Warm-up: let services stabilize and connect before running tests
+    print("Services ready! Sleeping 60s for warm-up...")
+    time.sleep(60)
+
     # Run each dataset
     for dataset_name in dataset_list:
         print(f"\n{'='*60}")
@@ -279,14 +284,17 @@ def __init__(self, file_path, original_stream):
 
        def write(self, data):
            self.original.write(data)
-           self.file.write(data)
+           if not self.file.closed:
+               self.file.write(data)
 
        def flush(self):
            self.original.flush()
-           self.file.flush()
+           if not self.file.closed:
+               self.file.flush()
 
        def close(self):
-           self.file.close()
+           if not self.file.closed:
+               self.file.close()
 
     tee_stdout = TeeFile(stdout_path, sys.stdout)
     old_stdout = sys.stdout
@@ -369,6 +377,11 @@ def close(self):
     default=None,
     help="Path to test config YAML (default: tools/harness/test_configs.yaml)",
 )
+@click.option(
+    "--list-datasets",
+    is_flag=True,
+    help="List available datasets and groups, then exit",
+)
 def main(
     case,
     managed,
@@ -382,14 +395,54 @@ def main(
     sku,
     dump_logs,
     test_config_path,
+    list_datasets,
 ):
+    # Handle --list-datasets
+    if list_datasets:
+        from nv_ingest_harness.config import list_datasets as get_datasets
+
+        config_file = test_config_path or str(Path(__file__).resolve().parents[3] / "test_configs.yaml")
+        info = get_datasets(config_file=config_file)
+
+        print("Available Datasets:")
+        print("-" * 50)
+        for name, config in sorted(info["datasets"].items()):
+            if isinstance(config, dict):
+                path = config.get("path", "N/A")
+                recall = config.get("recall_dataset")
+                recall_str = f" [recall: {recall}]" if recall else ""
+            else:
+                path = config
+                recall_str = ""
+            print(f"  {name}: {path}{recall_str}")
+
+        if info.get("groups"):
+            print("\nDataset Groups:")
+            print("-" * 50)
+            for name, members in sorted(info["groups"].items()):
+                print(f"  {name} ({len(members)} datasets):")
+                for m in members:
+                    print(f"    - {m}")
+
+        return 0
     if not dataset:
         print("Error: --dataset is required. Use --dataset=<name> or --dataset=<name1>,<name2>", file=sys.stderr)
+        print("       Use --list-datasets to see available datasets and groups", file=sys.stderr)
         return 1
 
-    # Parse dataset(s) - handle both single and comma-separated
-    dataset_list = [d.strip() for d in dataset.split(",") if d.strip()]
+    # Parse dataset(s) - handle single, comma-separated, and groups
+    import yaml
+
+    config_path = (
+        Path(test_config_path) if test_config_path else Path(__file__).resolve().parents[3] / "test_configs.yaml"
+    )
+    with open(config_path) as f:
+        yaml_data = yaml.safe_load(f)
+
+    from nv_ingest_harness.config import expand_dataset_names
+
+    dataset_list = expand_dataset_names(yaml_data, dataset)
 
     if not dataset_list:
         print("Error: No valid datasets found", file=sys.stderr)
         return 1
diff --git a/tools/harness/src/nv_ingest_harness/config.py b/tools/harness/src/nv_ingest_harness/config.py
index 420533525..b34bb8b97 100644
--- a/tools/harness/src/nv_ingest_harness/config.py
+++ b/tools/harness/src/nv_ingest_harness/config.py
@@ -65,8 +65,11 @@ class TestConfig:
     extract_charts: bool = True
     extract_images: bool = False
     extract_infographics: bool = True
+    extract_page_as_image: bool = False
+    extract_method: Optional[str] = None
     text_depth: str = "page"
     table_output_format: str = "markdown"
+    image_elements_modality: Optional[str] = None
 
     # Optional pipeline steps
     enable_caption: bool = False
@@ -95,6 +98,8 @@ class TestConfig:
     recall_top_k: int = 10
     ground_truth_dir: Optional[str] = None
     recall_dataset: Optional[str] = None
+    enable_beir: bool = False  # Enable BEIR metrics (NDCG, MAP, Precision)
+    language_filter: Optional[str] = None  # Filter queries by language (e.g., "english")
 
     def validate(self) -> List[str]:
         """Validate configuration and return list of errors"""
@@ -329,7 +334,10 @@ def parse_list(value: str) -> List[str]:
         "EXTRACT_CHARTS": ("extract_charts", parse_bool),
         "EXTRACT_IMAGES": ("extract_images", parse_bool),
         "EXTRACT_INFOGRAPHICS": ("extract_infographics", parse_bool),
+        "EXTRACT_PAGE_AS_IMAGE": ("extract_page_as_image", parse_bool),
+        "EXTRACT_METHOD": ("extract_method", str),
         "TEXT_DEPTH": ("text_depth", str),
+        "IMAGE_ELEMENTS_MODALITY": ("image_elements_modality", str),
         "TABLE_OUTPUT_FORMAT": ("table_output_format", str),
         "ENABLE_CAPTION": ("enable_caption", parse_bool),
         "CAPTION_PROMPT": ("caption_prompt", str),
@@ -346,6 +354,8 @@ def parse_list(value: str) -> List[str]:
         "RECALL_TOP_K": ("recall_top_k", parse_int),
         "GROUND_TRUTH_DIR": ("ground_truth_dir", str),
         "RECALL_DATASET": ("recall_dataset", str),
+        "ENABLE_BEIR": ("enable_beir", parse_bool),
+        "LANGUAGE_FILTER": ("language_filter", str),
     }
 
     overrides = {}
@@ -359,14 +369,55 @@ def parse_list(value: str) -> List[str]:
     return overrides
 
 
+def expand_dataset_names(yaml_data: dict, dataset_input: str) -> List[str]:
+    """
+    Expand a dataset input string to a list of dataset names.
+
+    Handles:
+    - Single dataset name: "bo767" -> ["bo767"]
+    - Comma-separated: "bo767,earnings" -> ["bo767", "earnings"]
+    - Group name: "vidore" -> ["vidore_v3_finance_en", "vidore_v3_industrial", ...]
+    - Mixed: "vidore_quick,bo767" -> ["vidore_v3_hr", "vidore_v3_industrial", "bo767"]
+
+    Args:
+        yaml_data: Parsed YAML data containing datasets and dataset_groups
+        dataset_input: Raw dataset input string
+
+    Returns:
+        List of individual dataset names (expanded from groups)
+    """
+    dataset_groups = yaml_data.get("dataset_groups", {})
+
+    raw_names = [name.strip() for name in dataset_input.split(",") if name.strip()]
+
+    expanded = []
+    for name in raw_names:
+        if name in dataset_groups:
+            expanded.extend(dataset_groups[name])
+        else:
+            expanded.append(name)
+
+    seen = set()
+    result = []
+    for name in expanded:
+        if name not in seen:
+            seen.add(name)
+            result.append(name)
+
+    return result
+
+
 def list_datasets(config_file: str = "test_configs.yaml") -> dict:
-    """List available dataset shortcuts"""
-    config_path = Path(__file__).parent / config_file
+    """List available dataset shortcuts and groups"""
+    config_path = Path(__file__).resolve().parents[2] / config_file
 
     with open(config_path) as f:
         yaml_data = yaml.safe_load(f)
 
-    return yaml_data.get("datasets", {})
+    return {
+        "datasets": yaml_data.get("datasets", {}),
+        "groups": yaml_data.get("dataset_groups", {}),
+    }
 
 
 def list_presets(config_file: str = "test_configs.yaml") -> List[str]:
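A minimal usage sketch of the new `expand_dataset_names` helper; the inline `yaml_data` dict stands in for the parsed `test_configs.yaml`, and the group contents mirror the README example:

```python
from nv_ingest_harness.config import expand_dataset_names

yaml_data = {"dataset_groups": {"vidore_quick": ["vidore_v3_hr", "vidore_v3_industrial"]}}

# Groups expand in place, plain names pass through, duplicates are dropped
print(expand_dataset_names(yaml_data, "vidore_quick,bo767,vidore_v3_hr"))
# -> ['vidore_v3_hr', 'vidore_v3_industrial', 'bo767']
```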
+ """ + try: + from beir.retrieval.evaluation import EvaluateRetrieval + except ImportError: + return None + + # Build results dict: {query_id: {doc_id: score}} + results = {} + for idx, answers in enumerate(all_retrieved): + if "query_id" in query_df.columns: + query_id = str(query_df.iloc[idx]["query_id"]) + else: + query_id = str(idx) + + results[query_id] = {} + num_results = len(answers) + for rank, r in enumerate(answers): + source_id = r.get("entity", {}).get("source", {}).get("source_id", "") + doc_id = os.path.basename(source_id).split(".")[0] + score = (num_results - rank) / num_results if num_results > 0 else 0 + # Keep first (highest-ranked) occurrence per doc — dedup multiple chunks from same PDF + if doc_id not in results[query_id]: + results[query_id][doc_id] = score + + # Build qrels dict: use provided full qrels or fall back to single expected_pdf + if qrels_dict is not None: + qrels = qrels_dict + else: + qrels = {} + for idx, row in query_df.iterrows(): + if "query_id" in query_df.columns: + query_id = str(row["query_id"]) + else: + query_id = str(idx) + qrels[query_id] = {str(row["expected_pdf"]): 1} + + # Evaluate + ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values) + + return {"ndcg": ndcg, "map": _map, "recall": recall, "precision": precision} + + def _get_retrieval_func( vdb_backend: str, table_path: Optional[str] = None, @@ -176,7 +238,8 @@ def get_recall_scores_pdf_only( batch_size: int = 100, vdb_backend: str = "milvus", table_path: Optional[str] = None, -) -> Dict[int, float]: + enable_beir: bool = False, +) -> Union[Dict[int, float], Dict[str, Any]]: """ Calculate recall@k scores for queries against a VDB collection using PDF-only matching. @@ -199,11 +262,14 @@ def get_recall_scores_pdf_only( batch_size: Number of queries to process per batch (prevents gRPC size limit errors). vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus". table_path: Path to LanceDB database directory (required if vdb_backend="lancedb"). + enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision). Returns: - Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0). + If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores. + If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics. 
""" hits = defaultdict(list) + all_retrieved = [] # Collect for BEIR computation reranker_kwargs = {} if nv_ranker: @@ -213,7 +279,11 @@ def get_recall_scores_pdf_only( reranker_kwargs["nv_ranker_model_name"] = nv_ranker_model_name queries = query_df["query"].to_list() - expected_pdfs = query_df["expected_pdf"].to_list() + # Support multi-doc ground truth (expected_pdfs) for Vidore; fallback to expected_pdf + if "expected_pdfs" in query_df.columns: + expected_pdfs = query_df["expected_pdfs"].to_list() + else: + expected_pdfs = [[ep] for ep in query_df["expected_pdf"].to_list()] # Process queries in batches to avoid gRPC message size limits num_queries = len(queries) @@ -262,20 +332,33 @@ def get_recall_scores_pdf_only( **reranker_kwargs, ) - for expected_pdf, retrieved_answers in zip(batch_expected_pdfs, batch_answers): - # Extract PDF names only (no page numbers) - retrieved_pdfs = [ - os.path.basename(result.get("entity", {}).get("source", {}).get("source_id", "")).split(".")[0] - for result in retrieved_answers - ] + # Collect results for BEIR if enabled + if enable_beir: + all_retrieved.extend(batch_answers) + + for expected_pdfs_for_query, retrieved_answers in zip(batch_expected_pdfs, batch_answers): + # Extract PDF names only (no page numbers), deduplicated to unique PDFs + retrieved_pdfs = list( + dict.fromkeys( + os.path.basename(result.get("entity", {}).get("source", {}).get("source_id", "")).split(".")[0] + for result in retrieved_answers + ) + ) - # Finance_bench uses k values [1, 5, 10] + # Hit = any relevant doc in top-k unique PDFs for k in [1, 5, 10]: if k <= top_k: - hits[k].append(expected_pdf in retrieved_pdfs[:k]) + hit = any(exp in retrieved_pdfs[:k] for exp in expected_pdfs_for_query) + hits[k].append(hit) recall_scores = {k: np.mean(hits[k]) for k in hits if len(hits[k]) > 0} + # Compute BEIR metrics if enabled + if enable_beir: + qrels_dict = query_df.attrs.get("qrels") + beir_metrics = _compute_beir_metrics(all_retrieved, query_df, k_values=[1, 5, 10], qrels_dict=qrels_dict) + return {"recall": recall_scores, "beir": beir_metrics} + return recall_scores @@ -719,6 +802,141 @@ def bo10k_recall( ) +def vidore_load_ground_truth( + ground_truth_dir: Optional[str] = None, + dataset_name: str = "vidore_v3_finance_en", + language_filter: Optional[str] = None, +) -> pd.DataFrame: + """ + Load Vidore V3 ground truth from HuggingFace datasets. 
+ + Uses the industry-standard HuggingFace datasets API which provides: + - Automatic local caching (~/.cache/huggingface/datasets/) + - No redundant downloads on subsequent runs + - Always retrieves latest benchmark version + + Args: + ground_truth_dir: Unused (kept for API compatibility with other loaders) + dataset_name: Vidore dataset name (e.g., "vidore_v3_finance_en") + language_filter: Optional language filter ("english" or None for all) + + Returns: + DataFrame with columns: 'query', 'expected_pdf', 'expected_pdfs', 'query_id' + df.attrs["qrels"] contains full BEIR-compatible qrels dict with graded relevance + """ + from datasets import load_dataset + + hf_name = f"vidore/{dataset_name}" + + # Load queries and qrels from HuggingFace (cached automatically) + queries_ds = load_dataset(hf_name, data_dir="queries", split="test") + qrels_ds = load_dataset(hf_name, data_dir="qrels", split="test") + + # Build full qrels: {query_id: {corpus_id: score}} — matches notebook's format_qrels + full_qrels = defaultdict(dict) + for row in qrels_ds: + full_qrels[str(row["query_id"])][str(row["corpus_id"])] = row["score"] + full_qrels = dict(full_qrels) + + rows = [] + for row in queries_ds: + query_id = row["query_id"] + + # Apply language filter if specified + if language_filter and row.get("language", "").lower() != language_filter.lower(): + continue + + qid_str = str(query_id) + if qid_str in full_qrels: + relevant_docs = sorted(full_qrels[qid_str].items(), key=lambda x: x[1], reverse=True) + rows.append( + { + "query": row["query"], + "expected_pdf": relevant_docs[0][0], + "expected_pdfs": [doc_id for doc_id, _ in relevant_docs], + "query_id": qid_str, + } + ) + + if not rows: + raise ValueError(f"No valid queries found for {dataset_name}") + + df = pd.DataFrame(rows) + df.attrs["qrels"] = full_qrels + return df + + +def vidore_recall( + collection_name: str, + dataset_name: str = "vidore_v3_finance_en", + language_filter: Optional[str] = None, + hostname: str = "localhost", + sparse: bool = False, + hybrid: bool = False, + model_name: str = None, + top_k: int = 10, + gpu_search: bool = False, + nv_ranker: bool = False, + ground_truth_dir: Optional[str] = None, + nv_ranker_endpoint: Optional[str] = None, + nv_ranker_model_name: Optional[str] = None, + vdb_backend: str = "milvus", + table_path: Optional[str] = None, + enable_beir: bool = False, +) -> Union[Dict[int, float], Dict[str, Any]]: + """ + Evaluate recall@k for Vidore V3 dataset using PDF-only matching. + + Loads ground truth from HuggingFace datasets API and evaluates recall + against the specified VDB collection. + + Args: + collection_name: VDB collection/table name to query. + dataset_name: Vidore dataset name (e.g., "vidore_v3_finance_en"). + language_filter: Optional language filter ("english" or None for all). + hostname: Service hostname for embedding endpoint. + sparse: Enable hybrid sparse-dense retrieval if True (Milvus only). + model_name: Embedding model name for query encoding. + top_k: Maximum number of results to retrieve and evaluate. + gpu_search: Use GPU acceleration for Milvus search. + nv_ranker: Enable NVIDIA reranker for result reranking. + ground_truth_dir: Unused (kept for API compatibility). + nv_ranker_endpoint: Optional custom reranker endpoint URL. + nv_ranker_model_name: Optional custom reranker model name. + vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus". + table_path: Path to LanceDB database directory (required if vdb_backend="lancedb"). 
+ enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision). + + Returns: + If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores. + If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics. + """ + loader = partial( + vidore_load_ground_truth, + dataset_name=dataset_name, + language_filter=language_filter, + ) + + return evaluate_recall_orchestrator( + loader_func=loader, + scorer_func=get_recall_scores_pdf_only, + collection_name=collection_name, + hostname=hostname, + sparse=sparse, + hybrid=hybrid, + model_name=model_name, + top_k=top_k, + gpu_search=gpu_search, + nv_ranker=nv_ranker, + ground_truth_dir=ground_truth_dir, + nv_ranker_endpoint=nv_ranker_endpoint, + nv_ranker_model_name=nv_ranker_model_name, + vdb_backend=vdb_backend, + table_path=table_path, + enable_beir=enable_beir, + ) + + def jp20_recall( collection_name: str, hostname: str = "localhost", @@ -777,7 +995,7 @@ def get_dataset_evaluator(dataset_name: str) -> Optional[Callable]: Get the recall evaluator function for a given dataset. Args: - dataset_name: Name of the dataset (e.g., 'bo767', 'finance_bench') + dataset_name: Name of the dataset (e.g., 'bo767', 'finance_bench', 'vidore_v3_finance_en') Returns: Evaluator function or None if not found @@ -791,4 +1009,21 @@ def get_dataset_evaluator(dataset_name: str) -> Optional[Callable]: "jp20": jp20_recall, } - return evaluators.get(dataset_name.lower()) + # Vidore V3 benchmark datasets + vidore_datasets = [ + "vidore_v3_finance_en", + "vidore_v3_industrial", + "vidore_v3_computer_science", + "vidore_v3_pharmaceuticals", + "vidore_v3_hr", + "vidore_v3_energy", + "vidore_v3_physics", + "vidore_v3_finance_fr", + ] + + dataset_lower = dataset_name.lower() + if dataset_lower in evaluators: + return evaluators[dataset_lower] + if dataset_lower in vidore_datasets: + return partial(vidore_recall, dataset_name=dataset_lower) + return None diff --git a/tools/harness/test_configs.yaml b/tools/harness/test_configs.yaml index 00bfb374d..91c68e5e3 100644 --- a/tools/harness/test_configs.yaml +++ b/tools/harness/test_configs.yaml @@ -59,7 +59,7 @@ active: sparse: false # Use sparse embeddings (Milvus only) gpu_search: false # Use GPU for search embedding_model: auto # auto-detect or specify model name - vdb_backend: lancedb # milvus or lancedb + vdb_backend: milvus # milvus or lancedb hybrid: false # LanceDB hybrid retrieval (FTS + vector) # Extraction configuration @@ -94,11 +94,13 @@ active: # docker compose --profile reranker up -d recall: recall_dataset: null - reranker_mode: both # Options: "none", "with", "both" + reranker_mode: none # Options: "none", "with", "both" # Recall evaluation settings recall_top_k: 10 ground_truth_dir: null + enable_beir: false # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package + language_filter: null # Filter queries by language (e.g., "english") - Vidore only # Pre-configured datasets # Each dataset includes path, extraction settings, and recall evaluator @@ -177,3 +179,145 @@ datasets: extract_images: false extract_infographics: true recall_dataset: jp20 + + # Vidore V3 Benchmark Datasets + # See: https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d + vidore_v3_finance_en: + path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_en + extract_text: true + extract_tables: true + extract_charts: true + extract_images: false + extract_infographics: true + extract_page_as_image: true + text_depth: page + table_output_format: markdown + 
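The BEIR contract that `_compute_beir_metrics` relies on is small: graded qrels and scored results, both keyed by query and document IDs. A toy sketch with made-up IDs and scores:

```python
from beir.retrieval.evaluation import EvaluateRetrieval

# Ground truth: graded relevance per query_id
qrels = {"q1": {"doc_a": 2, "doc_b": 1}}
# Retrieval output: score per retrieved doc_id
results = {"q1": {"doc_a": 1.0, "doc_c": 0.5, "doc_b": 0.25}}

ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, [1, 5, 10])
print(ndcg)  # {'NDCG@1': ..., 'NDCG@5': ..., 'NDCG@10': ...}
```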
diff --git a/tools/harness/test_configs.yaml b/tools/harness/test_configs.yaml
index 00bfb374d..91c68e5e3 100644
--- a/tools/harness/test_configs.yaml
+++ b/tools/harness/test_configs.yaml
@@ -59,7 +59,7 @@ active:
   sparse: false              # Use sparse embeddings (Milvus only)
   gpu_search: false          # Use GPU for search
   embedding_model: auto      # auto-detect or specify model name
-  vdb_backend: lancedb       # milvus or lancedb
+  vdb_backend: milvus        # milvus or lancedb
   hybrid: false              # LanceDB hybrid retrieval (FTS + vector)
 
   # Extraction configuration
@@ -94,11 +94,13 @@ active:
   # docker compose --profile reranker up -d
   recall:
     recall_dataset: null
-    reranker_mode: both      # Options: "none", "with", "both"
+    reranker_mode: none      # Options: "none", "with", "both"
 
     # Recall evaluation settings
     recall_top_k: 10
     ground_truth_dir: null
+    enable_beir: false       # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package
+    language_filter: null    # Filter queries by language (e.g., "english") - Vidore only
 
 # Pre-configured datasets
 # Each dataset includes path, extraction settings, and recall evaluator
@@ -177,3 +179,145 @@ datasets:
     extract_images: false
     extract_infographics: true
     recall_dataset: jp20
+
+  # Vidore V3 Benchmark Datasets
+  # See: https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d
+  vidore_v3_finance_en:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_en
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_finance_en
+    enable_beir: true
+
+  vidore_v3_industrial:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_industrial
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_industrial
+    enable_beir: true
+
+  vidore_v3_computer_science:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_computer_science
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_computer_science
+    enable_beir: true
+
+  vidore_v3_pharmaceuticals:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_pharmaceuticals
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_pharmaceuticals
+    enable_beir: true
+
+  vidore_v3_hr:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_hr
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_hr
+    enable_beir: true
+
+  vidore_v3_energy:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_energy
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_energy
+    enable_beir: true
+
+  vidore_v3_physics:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_physics
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_physics
+    enable_beir: true
+
+  vidore_v3_finance_fr:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_fr
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_finance_fr
+    enable_beir: true
+
+# Dataset groups for running multiple datasets together
+# Use: uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore
+dataset_groups:
+  # All Vidore V3 datasets
+  vidore:
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - vidore_v3_computer_science
+    - vidore_v3_pharmaceuticals
+    - vidore_v3_hr
+    - vidore_v3_energy
+    - vidore_v3_physics
+    - vidore_v3_finance_fr
+
+  # Vidore English-only (set language_filter: english in recall section)
+  vidore_english:
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - vidore_v3_computer_science
+    - vidore_v3_pharmaceuticals
+    - vidore_v3_hr
+
+  # Vidore quick test (smallest datasets)
+  vidore_quick:
+    # - vidore_v3_hr
+    # - vidore_v3_industrial
+    - vidore_v3_computer_science
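Because `ENABLE_BEIR` and `LANGUAGE_FILTER` are wired into the env-var override map in `config.py`, both can be toggled per run without editing the YAML. A sketch, assuming the harness applies these overrides the same way it does for the existing variables:

```bash
# English-only Vidore queries with BEIR metrics, overriding the YAML defaults
ENABLE_BEIR=true LANGUAGE_FILTER=english \
  uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_english
```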