-
Notifications
You must be signed in to change notification settings - Fork 308
Expand file tree
/
Copy pathtest_configs.yaml
More file actions
179 lines (158 loc) · 5.64 KB
/
test_configs.yaml
File metadata and controls
179 lines (158 loc) · 5.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# nv-ingest Test Configuration
# Edit the 'active' section directly to configure your test runs
# Active configuration (used by default when running tests)
active:
  # Dataset configuration
  dataset_dir: /path/to/dataset
  test_name: null  # Auto-generated if null

  # API configuration
  api_version: v2  # v1 or v2
  pdf_split_page_count: null  # V2 only: pages per chunk (null = default 32)

  # Infrastructure
  hostname: localhost
  readiness_timeout: 600

  # Docker Compose configuration
  compose:
    profiles:
      - retrieval
      - reranker  # Required for recall evaluation

  # Helm configuration
  helm:
    # Helm binary command (e.g., "helm", "microk8s helm", "k3s helm")
    bin: microk8s helm
    # Prepend sudo to helm commands (needed for microk8s/k3s without user in group)
    sudo: true
    # kubectl binary command (e.g., "kubectl", "microk8s kubectl")
    kubectl_bin: microk8s kubectl
    # Prepend sudo to kubectl commands (null = same as helm_sudo)
    kubectl_sudo: null
    # Remote chart reference (set to null to use local chart from ./helm)
    chart: nemo-microservices/nv-ingest
    # Chart version (required for remote charts).
    # Quoted so a version-like value is always loaded as a string.
    chart_version: '26.03.0-RC2'
    release: nv-ingest
    namespace: nv-ingest
    values_file: .helm-env  # Optional: path to values file

    # Multiple port forwards for Helm services
    port_forwards:
      - service: nv-ingest
        local_port: 7670
        remote_port: 7670
      - service: nv-ingest-milvus
        local_port: 19530
        remote_port: 19530
      - service: nv-ingest-milvus
        local_port: 9091
        remote_port: 9091
      - service: nv-ingest-minio
        local_port: 9000
        remote_port: 9000
      # Wildcard pattern to match embedding services.
      # Must stay quoted: a plain scalar starting with `*` is parsed as a YAML alias.
      - service: "*embed*"
        local_port: 8012
        remote_port: 8000
      # Wildcard pattern to match reranker services (quoted for the same reason).
      - service: "*rerank*"
        local_port: 8020
        remote_port: 8000

    # Inline Helm values
    # NOTE(review): the dotted name below is one literal YAML key, not a nested
    # mapping - presumably the harness forwards it to Helm as a value path; confirm.
    values:
      nimOperator.rerankqa.enabled: true

  # Runtime configuration
  sparse: false  # Use sparse embeddings (Milvus only)
  gpu_search: false  # Use GPU for search
  embedding_model: auto  # auto-detect or specify model name
  vdb_backend: lancedb  # milvus or lancedb
  hybrid: false  # LanceDB hybrid retrieval (FTS + vector)

  # Extraction configuration
  extract_text: true
  extract_tables: true
  extract_charts: true
  extract_images: false
  extract_infographics: true
  text_depth: page
  # table_output_format: markdown

  # Optional pipeline steps
  enable_caption: false  # Enable image captioning (ensure vlm profile is enabled)
  caption_prompt: null  # Override caption user prompt (null = service default)
  # Enable reasoning: true = enable, false = disable,
  # null = service default (default is false)
  caption_reasoning: null
  enable_split: false  # Enable text chunking
  split_chunk_size: 1024  # Chunk size for text splitting
  split_chunk_overlap: 150  # Overlap for text splitting
  # Enable server-side image storage
  # (defaults to MinIO; set IMAGE_STORAGE_URI=file://... to opt into disk)
  enable_image_storage: false

  # Storage configuration
  spill_dir: /tmp/spill
  artifacts_dir: null  # null = use default (tools/harness/artifacts)
  collection_name: null  # null = auto-generated

  # Custom UDF Configs - add specific udf configs here if needed.
  # LLM for summarization - this does NOT propagate to the UDF,
  # but useful for artifact storage.
  llm_summarization_model: nvdev/nvidia/llama-3.1-nemotron-70b-instruct
# Recall configuration (only used when running recall or e2e_recall tests)
# Remember to start the reranker service before running recall tests:
# docker compose --profile reranker up -d
recall:
  # Name of the recall evaluator to use (null = none selected here)
  recall_dataset: null
  reranker_mode: both  # Options: "none", "with", "both"

  # Recall evaluation settings
  recall_top_k: 10
  ground_truth_dir: null
# Pre-configured datasets
# Each dataset includes path, extraction settings, and recall evaluator
# Use: uv run nv-ingest-harness-run --case=e2e --dataset=bo767
datasets:
  # Fields per entry:
  #   path             - location of the dataset (directory or single file)
  #   extract_*        - extraction settings for this dataset
  #   recall_dataset   - recall evaluator name (null = no recall evaluator)
  bo767:
    path: /path/to/bo767
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: false
    recall_dataset: bo767

  earnings:
    path: /path/to/earnings_consulting
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: false
    recall_dataset: earnings

  bo20:
    path: /path/to/bo20
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null

  financebench:
    path: /path/to/financebench
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    # Note: the evaluator name has an underscore, unlike the dataset key above.
    recall_dataset: finance_bench

  # Single file example
  single:
    path: data/multimodal_test.pdf
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null  # No recall evaluator for single file

  # Example of a custom dataset configuration
  jensen_single:
    path: /path/to/jensen.pdf
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: true
    extract_infographics: true
    recall_dataset: null  # No recall evaluator for single file

  bo10k:
    path: /path/to/bo10k
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: true
    recall_dataset: bo10k

  jp20:
    path: /path/to/jp20
    extract_text: true
    extract_tables: true
    extract_charts: true
    extract_images: false
    extract_infographics: true
    recall_dataset: jp20