# Source: NeMo-Retriever/tools/harness/test_configs.yaml at a520c5df41ba0e0b90153bf5c3f0f299f3c8f4d0 (NVIDIA/NeMo-Retriever, GitHub)
# nv-ingest Test Configuration
# Edit the 'active' section directly to configure your test runs

# Active configuration (used by default when running tests)
active:
  # --- Dataset ---
  dataset_dir: /path/to/dataset
  # Leave null to have the test name auto-generated.
  test_name: null

  # --- API ---
  # Accepted values: v1 or v2.
  api_version: v2
  # V2 only: pages per chunk (null = default of 32).
  pdf_split_page_count: null

  # --- Infrastructure ---
  hostname: localhost
  readiness_timeout: 600

  # --- Docker Compose ---
  compose:
    profiles:
      - retrieval
      # Required for recall evaluation.
      - reranker

  # --- Helm ---
  helm:
    # Helm binary command, e.g. "helm", "microk8s helm", "k3s helm".
    bin: 'microk8s helm'
    # Prepend sudo to helm commands (needed for microk8s/k3s when the user
    # is not in the group).
    sudo: true
    # kubectl binary command, e.g. "kubectl", "microk8s kubectl".
    kubectl_bin: 'microk8s kubectl'
    # Prepend sudo to kubectl commands (null = same as helm_sudo).
    # NOTE(review): helm_sudo presumably refers to the `sudo` key above - confirm.
    kubectl_sudo: null
    # Remote chart reference; set to null to use the local chart from ./helm.
    chart: nemo-microservices/nv-ingest
    # Chart version (required for remote charts). Quoted so it always stays a string.
    chart_version: '26.03.0-RC2'
    release: nv-ingest
    namespace: nv-ingest
    # Optional: path to a values file.
    values_file: '.helm-env'
    # Multiple port forwards for Helm services.
    port_forwards:
      - service: nv-ingest
        local_port: 7670
        remote_port: 7670
      - service: nv-ingest-milvus
        local_port: 19530
        remote_port: 19530
      - service: nv-ingest-milvus
        local_port: 9091
        remote_port: 9091
      - service: nv-ingest-minio
        local_port: 9000
        remote_port: 9000
      # Wildcard pattern to match embedding services.
      - service: '*embed*'
        local_port: 8012
        remote_port: 8000
      # Wildcard pattern to match reranker services.
      - service: '*rerank*'
        local_port: 8020
        remote_port: 8000
    # Inline Helm values.
    values:
      nimOperator.rerankqa.enabled: true

  # --- Runtime ---
  # Use sparse embeddings (Milvus only).
  sparse: false
  # Use GPU for search.
  gpu_search: false
  # "auto" to auto-detect, or an explicit model name.
  embedding_model: auto
  # Accepted values: milvus or lancedb.
  vdb_backend: lancedb
  # LanceDB hybrid retrieval (FTS + vector).
  hybrid: false

  # --- Extraction ---
  extract_text: true
  extract_tables: true
  extract_charts: true
  extract_images: false
  extract_infographics: true
  text_depth: page
  # table_output_format: markdown

  # --- Optional pipeline steps ---
  # Enable image captioning (ensure the vlm profile is enabled).
  enable_caption: false
  # Override the caption user prompt (null = service default).
  caption_prompt: null
  # Reasoning for captions: true = enable, false = disable,
  # null = service default (the default is false).
  caption_reasoning: null

  # Enable text chunking.
  enable_split: false
  # Chunk size for text splitting.
  split_chunk_size: 1024
  # Overlap for text splitting.
  split_chunk_overlap: 150
  # Enable server-side image storage (defaults to MinIO; set
  # IMAGE_STORAGE_URI=file://... to opt into disk).
  enable_image_storage: false

  # --- Storage ---
  spill_dir: /tmp/spill
  # null = use the default (tools/harness/artifacts).
  artifacts_dir: null
  # null = auto-generated.
  collection_name: null

  # --- Custom UDF configs: add specific UDF configs here if needed ---
  # LLM for summarization. This value does NOT propagate to the UDF, but it is
  # useful for artifact storage.
  llm_summarization_model: nvdev/nvidia/llama-3.1-nemotron-70b-instruct

# Recall configuration (only used when running recall or e2e_recall tests)
# Remember to start the reranker service before running recall tests:
# docker compose --profile reranker up -d
recall:
  # Name of the recall evaluator to use.
  # NOTE(review): presumably taken from the selected entry under `datasets`
  # when left null - confirm against the harness.
  recall_dataset: null
  reranker_mode: both  # Options: "none", "with", "both"

  # Recall evaluation settings
  recall_top_k: 10
  # NOTE(review): what null means here is not shown in this file - presumably a
  # harness default location; confirm.
  ground_truth_dir: null

# Pre-configured datasets
# Each dataset includes path, extraction settings, and recall evaluator
# Use: uv run nv-ingest-harness-run --case=e2e --dataset=bo767
datasets:
  # Every entry has the same shape: `path` (dataset location), five `extract_*`
  # toggles, and `recall_dataset` (recall evaluator name, or null for none).

  # Images and infographics are not extracted.
  bo767:
    path: /path/to/bo767
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: false
    recall_dataset: bo767

  # Images and infographics are not extracted.
  earnings:
    path: /path/to/earnings_consulting
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: false
    recall_dataset: earnings

  # All extraction types enabled; no recall evaluator.
  bo20:
    path: /path/to/bo20
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null

  # All extraction types enabled. Note the evaluator name is spelled
  # `finance_bench` (with an underscore), unlike the dataset key.
  financebench:
    path: /path/to/financebench
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: finance_bench

  # Single file example
  single:
    path: data/multimodal_test.pdf
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null  # No recall evaluator for single file

  # Example of a custom dataset configuration
  jensen_single:
    path: /path/to/jensen.pdf
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null  # No recall evaluator for single file

  # Images are not extracted; everything else is.
  bo10k:
    path: /path/to/bo10k
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: true
    recall_dataset: bo10k

  # Images are not extracted; everything else is.
  jp20:
    path: /path/to/jp20
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: true
    recall_dataset: jp20