diff --git a/tools/harness/README.md b/tools/harness/README.md
index 1edfbd237..efa78684b 100644
--- a/tools/harness/README.md
+++ b/tools/harness/README.md
@@ -143,6 +143,37 @@ uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20
 
 # Custom path still works (uses active section config)
 uv run nv-ingest-harness-run --case=e2e --dataset=/custom/path
+
+# List available datasets and groups
+uv run nv-ingest-harness-run --list-datasets
+```
+
+#### Dataset Groups
+
+Run multiple related datasets with a single command using dataset groups:
+
+```yaml
+# In test_configs.yaml
+dataset_groups:
+  vidore:          # All 8 Vidore V3 benchmark datasets
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - ...
+  vidore_quick:    # Quick test with the smallest datasets
+    - vidore_v3_hr
+    - vidore_v3_industrial
+```
+
+**Usage:**
+```bash
+# Run all Vidore datasets
+uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore
+
+# Run a quick test (smallest 2 datasets)
+uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_quick
+
+# Mix groups and individual datasets
+uv run nv-ingest-harness-run --case=e2e --dataset=vidore_quick,bo20
 ```
 
 **Dataset Extraction Settings:**
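Group expansion is order-preserving and de-duplicated, so overlapping inputs are safe. A hedged illustration of the intended behavior, per the `expand_dataset_names` helper added in `config.py` below (each dataset still runs exactly once):

```bash
# A group plus one of its own members: the duplicate is dropped during expansion
uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_quick,vidore_v3_industrial
```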
diff --git a/tools/harness/pyproject.toml b/tools/harness/pyproject.toml
index 6e2be71fb..7e5de7a4c 100644
--- a/tools/harness/pyproject.toml
+++ b/tools/harness/pyproject.toml
@@ -5,7 +5,9 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "beir>=2.0.0",
     "click>=8.1.8",
+    "datasets>=2.0.0",
     "docker>=7.1.0",
     "pyyaml>=6.0",
     "requests>=2.31.0",
diff --git a/tools/harness/src/nv_ingest_harness/cases/e2e.py b/tools/harness/src/nv_ingest_harness/cases/e2e.py
index 8e2eb4e4d..ee8e9337a 100644
--- a/tools/harness/src/nv_ingest_harness/cases/e2e.py
+++ b/tools/harness/src/nv_ingest_harness/cases/e2e.py
@@ -66,8 +66,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     extract_charts = config.extract_charts
     extract_images = config.extract_images
     extract_infographics = config.extract_infographics
+    extract_page_as_image = config.extract_page_as_image
+    extract_method = config.extract_method
     text_depth = config.text_depth
     table_output_format = config.table_output_format
+    image_elements_modality = config.image_elements_modality
 
     # Optional pipeline steps
     enable_caption = config.enable_caption
@@ -80,6 +83,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     model_name, dense_dim = embed_info()
 
+    # Deployment fingerprint - detect silent fallback to wrong model
+    if dense_dim == 1024:
+        print("WARNING: Embedding model returned dim=1024 (nv-embedqa-e5-v5 fallback)")
+        print("WARNING: Expected dim=2048 for multimodal embed. Check embedding NIM status.")
+
     # Log configuration for transparency
     print("=== Test Configuration ===")
     print(f"Dataset: {data_dir}")
@@ -155,15 +163,20 @@ def main(config=None, log_path: str = "test_results") -> int:
         ingestor = ingestor.pdf_split_config(pages_per_chunk=pdf_split_page_count)
 
     # Extraction step
-    ingestor = ingestor.extract(
-        extract_text=extract_text,
-        extract_tables=extract_tables,
-        extract_charts=extract_charts,
-        extract_images=extract_images,
-        text_depth=text_depth,
-        table_output_format=table_output_format,
-        extract_infographics=extract_infographics,
-    )
+    extract_kwargs = {
+        "extract_text": extract_text,
+        "extract_tables": extract_tables,
+        "extract_charts": extract_charts,
+        "extract_images": extract_images,
+        "text_depth": text_depth,
+        "table_output_format": table_output_format,
+        "extract_infographics": extract_infographics,
+    }
+    if extract_page_as_image:
+        extract_kwargs["extract_page_as_image"] = True
+    if extract_method:
+        extract_kwargs["extract_method"] = extract_method
+    ingestor = ingestor.extract(**extract_kwargs)
 
     # Optional pipeline steps
     if enable_caption:
@@ -181,7 +194,10 @@ def main(config=None, log_path: str = "test_results") -> int:
         )
 
     # Embed (must come before storage per pipeline ordering)
-    ingestor = ingestor.embed(model_name=model_name)
+    embed_kwargs = {"model_name": model_name}
+    if image_elements_modality:
+        embed_kwargs["image_elements_modality"] = image_elements_modality
+    ingestor = ingestor.embed(**embed_kwargs)
 
     # Store images to disk (server-side image storage) - optional
     # Note: Supports both MinIO (s3://) and local disk (file://) via storage_uri
@@ -245,6 +261,24 @@ def main(config=None, log_path: str = "test_results") -> int:
     # Optional: log chunk stats and per-type breakdown
     if vdb_backend != "lancedb":
         milvus_chunks(f"http://{hostname}:19530", collection_name)
+        # Verify collection vector dimension matches expected
+        try:
+            from pymilvus import MilvusClient
+
+            mc = MilvusClient(uri=f"http://{hostname}:19530")
+            col_info = mc.describe_collection(collection_name)
+            for field in col_info.get("fields", []):
+                params = field.get("params", {})
+                if "dim" in params:
+                    actual_dim = int(params["dim"])
+                    if actual_dim != dense_dim:
+                        print(f"WARNING: Collection vector dim={actual_dim} != expected dim={dense_dim}")
+                        print("WARNING: Collection may have been created with a different embedding model")
+                    else:
+                        print(f"Collection vector dim={actual_dim} matches expected dim={dense_dim}")
+            mc.close()
+        except Exception as e:
+            print(f"Could not verify collection schema: {e}")
 
     text_results, table_results, chart_results = segment_results(results)
     kv_event_log("text_chunks", sum(len(x) for x in text_results), log_path)
     kv_event_log("table_chunks", sum(len(x) for x in table_results), log_path)
diff --git a/tools/harness/src/nv_ingest_harness/cases/recall.py b/tools/harness/src/nv_ingest_harness/cases/recall.py
index a92f93ab2..a16959d5f 100644
--- a/tools/harness/src/nv_ingest_harness/cases/recall.py
+++ b/tools/harness/src/nv_ingest_harness/cases/recall.py
@@ -18,7 +18,7 @@ def evaluate_recall_with_reranker(
     evaluation_params: Dict,
     use_reranker: bool,
     log_path: str = "test_results",
-) -> Tuple[Dict[int, float], float]:
+) -> Tuple[Dict, float]:
     """
     Run recall evaluation with specified reranker setting.
 
@@ -30,7 +30,8 @@ def evaluate_recall_with_reranker(
         log_path: Path for logging output
 
     Returns:
-        Tuple of (scores_dict, elapsed_time)
+        Tuple of (results_dict, elapsed_time)
+        results_dict may be {k: score} or {"recall": {...}, "beir": {...}} if BEIR is enabled
     """
     mode_str = "with reranker" if use_reranker else "without reranker"
     print("\n" + "=" * 60)
@@ -38,24 +39,42 @@ def evaluate_recall_with_reranker(
     print("=" * 60)
 
     eval_start = time.time()
-    scores = evaluator(
+    results = evaluator(
         collection_name=collection_name,
        nv_ranker=use_reranker,
         **evaluation_params,
     )
    eval_time = time.time() - eval_start
 
-    # Log results
+    # Handle both old format {k: score} and new format {"recall": {...}, "beir": {...}}
+    if isinstance(results, dict) and "recall" in results:
+        recall_scores = results["recall"]
+        beir_metrics = results.get("beir")
+    else:
+        recall_scores = results
+        beir_metrics = None
+
+    # Log recall results
+    reranker_suffix = "with" if use_reranker else "no"
     print(f"\nMultimodal Recall ({mode_str}):")
-    for k in sorted(scores.keys()):
-        score = scores[k]
+    for k in sorted(recall_scores.keys()):
+        score = recall_scores[k]
         print(f"  - Recall @{k}: {score:.3f}")
-        reranker_suffix = "with" if use_reranker else "no"
         kv_event_log(f"recall_multimodal_@{k}_{reranker_suffix}_reranker", score, log_path)
-    kv_event_log(f"recall_eval_time_s_{'with' if use_reranker else 'no'}_reranker", eval_time, log_path)
 
+    # Log BEIR metrics if available
+    if beir_metrics:
+        print(f"\nBEIR Metrics ({mode_str}):")
+        for metric_name, values in beir_metrics.items():
+            for k_str, score in values.items():
+                print(f"  - {k_str}: {score:.5f}")
+                # Log with format: ndcg_10_no_reranker
+                k_num = k_str.split("@")[1] if "@" in k_str else k_str
+                kv_event_log(f"{metric_name}_{k_num}_{reranker_suffix}_reranker", score, log_path)
+
+    kv_event_log(f"recall_eval_time_s_{reranker_suffix}_reranker", eval_time, log_path)
 
-    return scores, eval_time
+    return results, eval_time
 
 
 def main(config=None, log_path: str = "test_results") -> int:
@@ -69,6 +88,11 @@ def main(config=None, log_path: str = "test_results") -> int:
     gpu_search = config.gpu_search
     model_name, dense_dim = embed_info()
 
+    # Deployment fingerprint - detect silent fallback to wrong model
+    if dense_dim == 1024:
+        print("WARNING: Embedding model returned dim=1024 (nv-embedqa-e5-v5 fallback)")
+        print("WARNING: Expected dim=2048 for multimodal embed. Check embedding NIM status.")
+
     # Recall-specific configuration with defaults
     reranker_mode = getattr(config, "reranker_mode", "none")
     recall_top_k = getattr(config, "recall_top_k", 10)
@@ -126,6 +150,27 @@ def main(config=None, log_path: str = "test_results") -> int:
     if lancedb_path:
         print(f"Using LanceDB at: {lancedb_path}")
 
+    # Verify collection schema if using Milvus
+    if vdb_backend == "milvus":
+        try:
+            from pymilvus import MilvusClient
+
+            verify_uri = f"http://{hostname}:19530"
+            mc = MilvusClient(uri=verify_uri)
+            col_info = mc.describe_collection(collection_name)
+            for field in col_info.get("fields", []):
+                params = field.get("params", {})
+                if "dim" in params:
+                    actual_dim = int(params["dim"])
+                    if actual_dim != dense_dim:
+                        print(f"WARNING: Collection vector dim={actual_dim} != embed model dim={dense_dim}")
+                        print("WARNING: Collection may have been created with a different embedding model")
+                    else:
+                        print(f"Collection vector dim={actual_dim} matches embed model dim={dense_dim}")
+            mc.close()
+        except Exception as e:
+            print(f"Could not verify collection schema: {e}")
+
     try:
         recall_results = {}
 
@@ -141,7 +186,11 @@ def main(config=None, log_path: str = "test_results") -> int:
             "vdb_backend": vdb_backend,
             "nv_ranker_endpoint": f"http://{hostname}:8020/v1/ranking",
             "nv_ranker_model_name": "nvidia/llama-3.2-nv-rerankqa-1b-v2",
+            "enable_beir": config.enable_beir,
         }
+        language_filter = getattr(config, "language_filter", None)
+        if language_filter and recall_dataset.startswith("vidore_"):
+            evaluation_params["language_filter"] = language_filter
 
         if vdb_backend == "lancedb":
             evaluation_params["table_path"] = lancedb_path
diff --git a/tools/harness/src/nv_ingest_harness/cli/nightly.py b/tools/harness/src/nv_ingest_harness/cli/nightly.py
index 68a41f9e1..513295ae9 100644
--- a/tools/harness/src/nv_ingest_harness/cli/nightly.py
+++ b/tools/harness/src/nv_ingest_harness/cli/nightly.py
@@ -6,6 +6,7 @@
 import os
 import subprocess
 import sys
+import time
 from pathlib import Path
 from typing import Any
 
@@ -348,7 +349,9 @@ def main(
         service_manager.stop()
         return 1
 
-    print("Services ready!")
+    # Warm-up: let services stabilize and connect before running tests
+    print("Services ready! Sleeping 60s for warm-up...")
+    time.sleep(60)
 
     all_results = []
diff --git a/tools/harness/src/nv_ingest_harness/cli/run.py b/tools/harness/src/nv_ingest_harness/cli/run.py
index 6dc4b155d..73f2e443d 100644
--- a/tools/harness/src/nv_ingest_harness/cli/run.py
+++ b/tools/harness/src/nv_ingest_harness/cli/run.py
@@ -1,6 +1,7 @@
 import json
 import os
 import sys
+import time
 from pathlib import Path
 
 import click
@@ -69,6 +70,10 @@ def run_datasets(
             service_manager.stop()
             return 1
 
+    # Warm-up: let services stabilize and connect before running tests
+    print("Services ready! Sleeping 60s for warm-up...")
+    time.sleep(60)
+
     # Run each dataset
     for dataset_name in dataset_list:
         print(f"\n{'='*60}")
@@ -279,14 +284,17 @@ def __init__(self, file_path, original_stream):
 
        def write(self, data):
            self.original.write(data)
-           self.file.write(data)
+           if not self.file.closed:
+               self.file.write(data)
 
        def flush(self):
            self.original.flush()
-           self.file.flush()
+           if not self.file.closed:
+               self.file.flush()
 
        def close(self):
-           self.file.close()
+           if not self.file.closed:
+               self.file.close()
 
     tee_stdout = TeeFile(stdout_path, sys.stdout)
     old_stdout = sys.stdout
@@ -369,6 +377,11 @@ def close(self):
     default=None,
     help="Path to test config YAML (default: tools/harness/test_configs.yaml)",
 )
+@click.option(
+    "--list-datasets",
+    is_flag=True,
+    help="List available datasets and groups, then exit",
+)
 def main(
     case,
     managed,
@@ -382,14 +395,54 @@ def main(
     sku,
     dump_logs,
     test_config_path,
+    list_datasets,
 ):
+    # Handle --list-datasets
+    if list_datasets:
+        from nv_ingest_harness.config import list_datasets as get_datasets
+
+        config_file = test_config_path or str(Path(__file__).resolve().parents[3] / "test_configs.yaml")
+        info = get_datasets(config_file=config_file)
+
+        print("Available Datasets:")
+        print("-" * 50)
+        for name, config in sorted(info["datasets"].items()):
+            if isinstance(config, dict):
+                path = config.get("path", "N/A")
+                recall = config.get("recall_dataset")
+                recall_str = f" [recall: {recall}]" if recall else ""
+            else:
+                path = config
+                recall_str = ""
+            print(f"  {name}: {path}{recall_str}")
+
+        if info.get("groups"):
+            print("\nDataset Groups:")
+            print("-" * 50)
+            for name, members in sorted(info["groups"].items()):
+                print(f"  {name} ({len(members)} datasets):")
+                for m in members:
+                    print(f"    - {m}")
+
+        return 0
     if not dataset:
         print("Error: --dataset is required. Use --dataset=<name> or --dataset=<name1>,<name2>", file=sys.stderr)
+        print("       Use --list-datasets to see available datasets and groups", file=sys.stderr)
         return 1
 
-    # Parse dataset(s) - handle both single and comma-separated
-    dataset_list = [d.strip() for d in dataset.split(",") if d.strip()]
+    # Parse dataset(s) - handle single, comma-separated, and groups
+    import yaml
+
+    config_path = (
+        Path(test_config_path) if test_config_path else Path(__file__).resolve().parents[3] / "test_configs.yaml"
+    )
+    with open(config_path) as f:
+        yaml_data = yaml.safe_load(f)
+
+    from nv_ingest_harness.config import expand_dataset_names
+
+    dataset_list = expand_dataset_names(yaml_data, dataset)
 
     if not dataset_list:
         print("Error: No valid datasets found", file=sys.stderr)
         return 1
diff --git a/tools/harness/src/nv_ingest_harness/config.py b/tools/harness/src/nv_ingest_harness/config.py
index 420533525..b34bb8b97 100644
--- a/tools/harness/src/nv_ingest_harness/config.py
+++ b/tools/harness/src/nv_ingest_harness/config.py
@@ -65,8 +65,11 @@ class TestConfig:
     extract_charts: bool = True
     extract_images: bool = False
     extract_infographics: bool = True
+    extract_page_as_image: bool = False
+    extract_method: Optional[str] = None
     text_depth: str = "page"
     table_output_format: str = "markdown"
+    image_elements_modality: Optional[str] = None
 
     # Optional pipeline steps
     enable_caption: bool = False
@@ -95,6 +98,8 @@ class TestConfig:
     recall_top_k: int = 10
     ground_truth_dir: Optional[str] = None
     recall_dataset: Optional[str] = None
+    enable_beir: bool = False  # Enable BEIR metrics (NDCG, MAP, Precision)
+    language_filter: Optional[str] = None  # Filter queries by language (e.g., "english")
 
     def validate(self) -> List[str]:
         """Validate configuration and return list of errors"""
@@ -329,7 +334,10 @@ def parse_list(value: str) -> List[str]:
         "EXTRACT_CHARTS": ("extract_charts", parse_bool),
         "EXTRACT_IMAGES": ("extract_images", parse_bool),
         "EXTRACT_INFOGRAPHICS": ("extract_infographics", parse_bool),
+        "EXTRACT_PAGE_AS_IMAGE": ("extract_page_as_image", parse_bool),
+        "EXTRACT_METHOD": ("extract_method", str),
         "TEXT_DEPTH": ("text_depth", str),
+        "IMAGE_ELEMENTS_MODALITY": ("image_elements_modality", str),
         "TABLE_OUTPUT_FORMAT": ("table_output_format", str),
         "ENABLE_CAPTION": ("enable_caption", parse_bool),
         "CAPTION_PROMPT": ("caption_prompt", str),
@@ -346,6 +354,8 @@ def parse_list(value: str) -> List[str]:
         "RECALL_TOP_K": ("recall_top_k", parse_int),
         "GROUND_TRUTH_DIR": ("ground_truth_dir", str),
         "RECALL_DATASET": ("recall_dataset", str),
+        "ENABLE_BEIR": ("enable_beir", parse_bool),
+        "LANGUAGE_FILTER": ("language_filter", str),
     }
 
     overrides = {}
@@ -359,14 +369,55 @@ def parse_list(value: str) -> List[str]:
     return overrides
 
 
+def expand_dataset_names(yaml_data: dict, dataset_input: str) -> List[str]:
+    """
+    Expand a dataset input string to a list of dataset names.
+
+    Handles:
+    - Single dataset name: "bo767" -> ["bo767"]
+    - Comma-separated: "bo767,earnings" -> ["bo767", "earnings"]
+    - Group name: "vidore" -> ["vidore_v3_finance_en", "vidore_v3_industrial", ...]
+    - Mixed: "vidore_quick,bo767" -> ["vidore_v3_hr", "vidore_v3_industrial", "bo767"]
+
+    Args:
+        yaml_data: Parsed YAML data containing datasets and dataset_groups
+        dataset_input: Raw dataset input string
+
+    Returns:
+        List of individual dataset names (expanded from groups)
+    """
+    dataset_groups = yaml_data.get("dataset_groups", {})
+
+    raw_names = [name.strip() for name in dataset_input.split(",") if name.strip()]
+
+    expanded = []
+    for name in raw_names:
+        if name in dataset_groups:
+            expanded.extend(dataset_groups[name])
+        else:
+            expanded.append(name)
+
+    seen = set()
+    result = []
+    for name in expanded:
+        if name not in seen:
+            seen.add(name)
+            result.append(name)
+
+    return result
+
+
 def list_datasets(config_file: str = "test_configs.yaml") -> dict:
-    """List available dataset shortcuts"""
-    config_path = Path(__file__).parent / config_file
+    """List available dataset shortcuts and groups"""
+    config_path = Path(__file__).resolve().parents[2] / config_file
 
     with open(config_path) as f:
         yaml_data = yaml.safe_load(f)
 
-    return yaml_data.get("datasets", {})
+    return {
+        "datasets": yaml_data.get("datasets", {}),
+        "groups": yaml_data.get("dataset_groups", {}),
+    }
 
 
 def list_presets(config_file: str = "test_configs.yaml") -> List[str]:
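A minimal usage sketch of the new `expand_dataset_names` helper; the inline `yaml_data` dict stands in for the parsed `test_configs.yaml`, and the group contents mirror the README example:

```python
from nv_ingest_harness.config import expand_dataset_names

yaml_data = {"dataset_groups": {"vidore_quick": ["vidore_v3_hr", "vidore_v3_industrial"]}}

# Groups expand in place, plain names pass through, duplicates are dropped
print(expand_dataset_names(yaml_data, "vidore_quick,bo767,vidore_v3_hr"))
# -> ['vidore_v3_hr', 'vidore_v3_industrial', 'bo767']
```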
+ """ + try: + from beir.retrieval.evaluation import EvaluateRetrieval + except ImportError: + return None + + # Build results dict: {query_id: {doc_id: score}} + results = {} + for idx, answers in enumerate(all_retrieved): + if "query_id" in query_df.columns: + query_id = str(query_df.iloc[idx]["query_id"]) + else: + query_id = str(idx) + + results[query_id] = {} + num_results = len(answers) + for rank, r in enumerate(answers): + source_id = r.get("entity", {}).get("source", {}).get("source_id", "") + doc_id = os.path.basename(source_id).split(".")[0] + score = (num_results - rank) / num_results if num_results > 0 else 0 + # Keep first (highest-ranked) occurrence per doc — dedup multiple chunks from same PDF + if doc_id not in results[query_id]: + results[query_id][doc_id] = score + + # Build qrels dict: use provided full qrels or fall back to single expected_pdf + if qrels_dict is not None: + qrels = qrels_dict + else: + qrels = {} + for idx, row in query_df.iterrows(): + if "query_id" in query_df.columns: + query_id = str(row["query_id"]) + else: + query_id = str(idx) + qrels[query_id] = {str(row["expected_pdf"]): 1} + + # Evaluate + ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values) + + return {"ndcg": ndcg, "map": _map, "recall": recall, "precision": precision} + + def _get_retrieval_func( vdb_backend: str, table_path: Optional[str] = None, @@ -176,7 +238,8 @@ def get_recall_scores_pdf_only( batch_size: int = 100, vdb_backend: str = "milvus", table_path: Optional[str] = None, -) -> Dict[int, float]: + enable_beir: bool = False, +) -> Union[Dict[int, float], Dict[str, Any]]: """ Calculate recall@k scores for queries against a VDB collection using PDF-only matching. @@ -199,11 +262,14 @@ def get_recall_scores_pdf_only( batch_size: Number of queries to process per batch (prevents gRPC size limit errors). vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus". table_path: Path to LanceDB database directory (required if vdb_backend="lancedb"). + enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision). Returns: - Dictionary mapping k values (1, 5, 10) to recall scores (float 0.0-1.0). + If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores. + If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics. 
""" hits = defaultdict(list) + all_retrieved = [] # Collect for BEIR computation reranker_kwargs = {} if nv_ranker: @@ -213,7 +279,11 @@ def get_recall_scores_pdf_only( reranker_kwargs["nv_ranker_model_name"] = nv_ranker_model_name queries = query_df["query"].to_list() - expected_pdfs = query_df["expected_pdf"].to_list() + # Support multi-doc ground truth (expected_pdfs) for Vidore; fallback to expected_pdf + if "expected_pdfs" in query_df.columns: + expected_pdfs = query_df["expected_pdfs"].to_list() + else: + expected_pdfs = [[ep] for ep in query_df["expected_pdf"].to_list()] # Process queries in batches to avoid gRPC message size limits num_queries = len(queries) @@ -262,20 +332,33 @@ def get_recall_scores_pdf_only( **reranker_kwargs, ) - for expected_pdf, retrieved_answers in zip(batch_expected_pdfs, batch_answers): - # Extract PDF names only (no page numbers) - retrieved_pdfs = [ - os.path.basename(result.get("entity", {}).get("source", {}).get("source_id", "")).split(".")[0] - for result in retrieved_answers - ] + # Collect results for BEIR if enabled + if enable_beir: + all_retrieved.extend(batch_answers) + + for expected_pdfs_for_query, retrieved_answers in zip(batch_expected_pdfs, batch_answers): + # Extract PDF names only (no page numbers), deduplicated to unique PDFs + retrieved_pdfs = list( + dict.fromkeys( + os.path.basename(result.get("entity", {}).get("source", {}).get("source_id", "")).split(".")[0] + for result in retrieved_answers + ) + ) - # Finance_bench uses k values [1, 5, 10] + # Hit = any relevant doc in top-k unique PDFs for k in [1, 5, 10]: if k <= top_k: - hits[k].append(expected_pdf in retrieved_pdfs[:k]) + hit = any(exp in retrieved_pdfs[:k] for exp in expected_pdfs_for_query) + hits[k].append(hit) recall_scores = {k: np.mean(hits[k]) for k in hits if len(hits[k]) > 0} + # Compute BEIR metrics if enabled + if enable_beir: + qrels_dict = query_df.attrs.get("qrels") + beir_metrics = _compute_beir_metrics(all_retrieved, query_df, k_values=[1, 5, 10], qrels_dict=qrels_dict) + return {"recall": recall_scores, "beir": beir_metrics} + return recall_scores @@ -719,6 +802,141 @@ def bo10k_recall( ) +def vidore_load_ground_truth( + ground_truth_dir: Optional[str] = None, + dataset_name: str = "vidore_v3_finance_en", + language_filter: Optional[str] = None, +) -> pd.DataFrame: + """ + Load Vidore V3 ground truth from HuggingFace datasets. 
+ + Uses the industry-standard HuggingFace datasets API which provides: + - Automatic local caching (~/.cache/huggingface/datasets/) + - No redundant downloads on subsequent runs + - Always retrieves latest benchmark version + + Args: + ground_truth_dir: Unused (kept for API compatibility with other loaders) + dataset_name: Vidore dataset name (e.g., "vidore_v3_finance_en") + language_filter: Optional language filter ("english" or None for all) + + Returns: + DataFrame with columns: 'query', 'expected_pdf', 'expected_pdfs', 'query_id' + df.attrs["qrels"] contains full BEIR-compatible qrels dict with graded relevance + """ + from datasets import load_dataset + + hf_name = f"vidore/{dataset_name}" + + # Load queries and qrels from HuggingFace (cached automatically) + queries_ds = load_dataset(hf_name, data_dir="queries", split="test") + qrels_ds = load_dataset(hf_name, data_dir="qrels", split="test") + + # Build full qrels: {query_id: {corpus_id: score}} — matches notebook's format_qrels + full_qrels = defaultdict(dict) + for row in qrels_ds: + full_qrels[str(row["query_id"])][str(row["corpus_id"])] = row["score"] + full_qrels = dict(full_qrels) + + rows = [] + for row in queries_ds: + query_id = row["query_id"] + + # Apply language filter if specified + if language_filter and row.get("language", "").lower() != language_filter.lower(): + continue + + qid_str = str(query_id) + if qid_str in full_qrels: + relevant_docs = sorted(full_qrels[qid_str].items(), key=lambda x: x[1], reverse=True) + rows.append( + { + "query": row["query"], + "expected_pdf": relevant_docs[0][0], + "expected_pdfs": [doc_id for doc_id, _ in relevant_docs], + "query_id": qid_str, + } + ) + + if not rows: + raise ValueError(f"No valid queries found for {dataset_name}") + + df = pd.DataFrame(rows) + df.attrs["qrels"] = full_qrels + return df + + +def vidore_recall( + collection_name: str, + dataset_name: str = "vidore_v3_finance_en", + language_filter: Optional[str] = None, + hostname: str = "localhost", + sparse: bool = False, + hybrid: bool = False, + model_name: str = None, + top_k: int = 10, + gpu_search: bool = False, + nv_ranker: bool = False, + ground_truth_dir: Optional[str] = None, + nv_ranker_endpoint: Optional[str] = None, + nv_ranker_model_name: Optional[str] = None, + vdb_backend: str = "milvus", + table_path: Optional[str] = None, + enable_beir: bool = False, +) -> Union[Dict[int, float], Dict[str, Any]]: + """ + Evaluate recall@k for Vidore V3 dataset using PDF-only matching. + + Loads ground truth from HuggingFace datasets API and evaluates recall + against the specified VDB collection. + + Args: + collection_name: VDB collection/table name to query. + dataset_name: Vidore dataset name (e.g., "vidore_v3_finance_en"). + language_filter: Optional language filter ("english" or None for all). + hostname: Service hostname for embedding endpoint. + sparse: Enable hybrid sparse-dense retrieval if True (Milvus only). + model_name: Embedding model name for query encoding. + top_k: Maximum number of results to retrieve and evaluate. + gpu_search: Use GPU acceleration for Milvus search. + nv_ranker: Enable NVIDIA reranker for result reranking. + ground_truth_dir: Unused (kept for API compatibility). + nv_ranker_endpoint: Optional custom reranker endpoint URL. + nv_ranker_model_name: Optional custom reranker model name. + vdb_backend: VDB backend to use ("milvus" or "lancedb"). Default is "milvus". + table_path: Path to LanceDB database directory (required if vdb_backend="lancedb"). 
+ enable_beir: If True, also compute BEIR metrics (NDCG, MAP, Precision). + + Returns: + If enable_beir=False: Dictionary mapping k values (1, 5, 10) to recall scores. + If enable_beir=True: Dictionary with 'recall' and 'beir' keys containing metrics. + """ + loader = partial( + vidore_load_ground_truth, + dataset_name=dataset_name, + language_filter=language_filter, + ) + + return evaluate_recall_orchestrator( + loader_func=loader, + scorer_func=get_recall_scores_pdf_only, + collection_name=collection_name, + hostname=hostname, + sparse=sparse, + hybrid=hybrid, + model_name=model_name, + top_k=top_k, + gpu_search=gpu_search, + nv_ranker=nv_ranker, + ground_truth_dir=ground_truth_dir, + nv_ranker_endpoint=nv_ranker_endpoint, + nv_ranker_model_name=nv_ranker_model_name, + vdb_backend=vdb_backend, + table_path=table_path, + enable_beir=enable_beir, + ) + + def jp20_recall( collection_name: str, hostname: str = "localhost", @@ -777,7 +995,7 @@ def get_dataset_evaluator(dataset_name: str) -> Optional[Callable]: Get the recall evaluator function for a given dataset. Args: - dataset_name: Name of the dataset (e.g., 'bo767', 'finance_bench') + dataset_name: Name of the dataset (e.g., 'bo767', 'finance_bench', 'vidore_v3_finance_en') Returns: Evaluator function or None if not found @@ -791,4 +1009,21 @@ def get_dataset_evaluator(dataset_name: str) -> Optional[Callable]: "jp20": jp20_recall, } - return evaluators.get(dataset_name.lower()) + # Vidore V3 benchmark datasets + vidore_datasets = [ + "vidore_v3_finance_en", + "vidore_v3_industrial", + "vidore_v3_computer_science", + "vidore_v3_pharmaceuticals", + "vidore_v3_hr", + "vidore_v3_energy", + "vidore_v3_physics", + "vidore_v3_finance_fr", + ] + + dataset_lower = dataset_name.lower() + if dataset_lower in evaluators: + return evaluators[dataset_lower] + if dataset_lower in vidore_datasets: + return partial(vidore_recall, dataset_name=dataset_lower) + return None diff --git a/tools/harness/test_configs.yaml b/tools/harness/test_configs.yaml index 00bfb374d..91c68e5e3 100644 --- a/tools/harness/test_configs.yaml +++ b/tools/harness/test_configs.yaml @@ -59,7 +59,7 @@ active: sparse: false # Use sparse embeddings (Milvus only) gpu_search: false # Use GPU for search embedding_model: auto # auto-detect or specify model name - vdb_backend: lancedb # milvus or lancedb + vdb_backend: milvus # milvus or lancedb hybrid: false # LanceDB hybrid retrieval (FTS + vector) # Extraction configuration @@ -94,11 +94,13 @@ active: # docker compose --profile reranker up -d recall: recall_dataset: null - reranker_mode: both # Options: "none", "with", "both" + reranker_mode: none # Options: "none", "with", "both" # Recall evaluation settings recall_top_k: 10 ground_truth_dir: null + enable_beir: false # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package + language_filter: null # Filter queries by language (e.g., "english") - Vidore only # Pre-configured datasets # Each dataset includes path, extraction settings, and recall evaluator @@ -177,3 +179,145 @@ datasets: extract_images: false extract_infographics: true recall_dataset: jp20 + + # Vidore V3 Benchmark Datasets + # See: https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d + vidore_v3_finance_en: + path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_en + extract_text: true + extract_tables: true + extract_charts: true + extract_images: false + extract_infographics: true + extract_page_as_image: true + text_depth: page + table_output_format: markdown + 
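The BEIR contract that `_compute_beir_metrics` relies on is small: graded qrels and scored results, both keyed by query and document IDs. A toy sketch with made-up IDs and scores:

```python
from beir.retrieval.evaluation import EvaluateRetrieval

# Ground truth: graded relevance per query_id
qrels = {"q1": {"doc_a": 2, "doc_b": 1}}
# Retrieval output: score per retrieved doc_id
results = {"q1": {"doc_a": 1.0, "doc_c": 0.5, "doc_b": 0.25}}

ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(qrels, results, [1, 5, 10])
print(ndcg)  # {'NDCG@1': ..., 'NDCG@5': ..., 'NDCG@10': ...}
```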
diff --git a/tools/harness/test_configs.yaml b/tools/harness/test_configs.yaml
index 00bfb374d..91c68e5e3 100644
--- a/tools/harness/test_configs.yaml
+++ b/tools/harness/test_configs.yaml
@@ -59,7 +59,7 @@ active:
   sparse: false              # Use sparse embeddings (Milvus only)
   gpu_search: false          # Use GPU for search
   embedding_model: auto      # auto-detect or specify model name
-  vdb_backend: lancedb       # milvus or lancedb
+  vdb_backend: milvus        # milvus or lancedb
   hybrid: false              # LanceDB hybrid retrieval (FTS + vector)
 
   # Extraction configuration
@@ -94,11 +94,13 @@ active:
   # docker compose --profile reranker up -d
   recall:
     recall_dataset: null
-    reranker_mode: both      # Options: "none", "with", "both"
+    reranker_mode: none      # Options: "none", "with", "both"
 
     # Recall evaluation settings
     recall_top_k: 10
     ground_truth_dir: null
+    enable_beir: false       # Enable BEIR metrics (NDCG, MAP, Precision) - requires beir package
+    language_filter: null    # Filter queries by language (e.g., "english") - Vidore only
 
 # Pre-configured datasets
 # Each dataset includes path, extraction settings, and recall evaluator
@@ -177,3 +179,145 @@ datasets:
     extract_images: false
     extract_infographics: true
     recall_dataset: jp20
+
+  # Vidore V3 Benchmark Datasets
+  # See: https://huggingface.co/collections/vidore/vidore-benchmark-667173f98e70a1c0fa4db00d
+  vidore_v3_finance_en:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_en
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_finance_en
+    enable_beir: true
+
+  vidore_v3_industrial:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_industrial
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_industrial
+    enable_beir: true
+
+  vidore_v3_computer_science:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_computer_science
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_computer_science
+    enable_beir: true
+
+  vidore_v3_pharmaceuticals:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_pharmaceuticals
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_pharmaceuticals
+    enable_beir: true
+
+  vidore_v3_hr:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_hr
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_hr
+    enable_beir: true
+
+  vidore_v3_energy:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_energy
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_energy
+    enable_beir: true
+
+  vidore_v3_physics:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_physics
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_physics
+    enable_beir: true
+
+  vidore_v3_finance_fr:
+    path: /datasets/nv-ingest/vidore_v3_corpus_pdf/vidore_v3_finance_fr
+    extract_text: true
+    extract_tables: true
+    extract_charts: true
+    extract_images: false
+    extract_infographics: true
+    extract_page_as_image: true
+    text_depth: page
+    table_output_format: markdown
+    image_elements_modality: text_image
+    recall_dataset: vidore_v3_finance_fr
+    enable_beir: true
+
+# Dataset groups for running multiple datasets together
+# Use: uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore
+dataset_groups:
+  # All Vidore V3 datasets
+  vidore:
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - vidore_v3_computer_science
+    - vidore_v3_pharmaceuticals
+    - vidore_v3_hr
+    - vidore_v3_energy
+    - vidore_v3_physics
+    - vidore_v3_finance_fr
+
+  # Vidore English-only (set language_filter: english in recall section)
+  vidore_english:
+    - vidore_v3_finance_en
+    - vidore_v3_industrial
+    - vidore_v3_computer_science
+    - vidore_v3_pharmaceuticals
+    - vidore_v3_hr
+
+  # Vidore quick test (smallest datasets)
+  vidore_quick:
+    # - vidore_v3_hr
+    # - vidore_v3_industrial
+    - vidore_v3_computer_science
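Because `ENABLE_BEIR` and `LANGUAGE_FILTER` are wired into the env-var override map in `config.py`, both can be toggled per run without editing the YAML. A sketch, assuming the harness applies these overrides the same way it does for the existing variables:

```bash
# English-only Vidore queries with BEIR metrics, overriding the YAML defaults
ENABLE_BEIR=true LANGUAGE_FILTER=english \
  uv run nv-ingest-harness-run --case=e2e_recall --dataset=vidore_english
```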