Skip to content

Commit 1835ba7

Browse files
committed
Add source_id column back to lancedb
1 parent fbd2e28 commit 1835ba7

File tree

4 files changed

+15
-1
lines changed

4 files changed

+15
-1
lines changed

nemo_retriever/src/nemo_retriever/ingest_modes/batch.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -295,6 +295,9 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "BatchI
295295
This does not run extraction yet; it records configuration so the batch
296296
executor can build a concrete pipeline later.
297297
298+
If all input files have a ``.txt`` extension, the pipeline automatically
299+
delegates to :meth:`extract_txt` with default :class:`TextChunkParams`.
300+
298301
Resource-tuning kwargs (auto-detected from available resources if omitted):
299302
300303
- ``pdf_split_batch_size``: Batch size for PDF split stage (default 1).
@@ -308,6 +311,13 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "BatchI
308311
- ``ocr_cpus_per_actor``: CPUs reserved per OCR actor (default 1).
309312
"""
310313

314+
if self._input_documents and all(f.lower().endswith(".txt") for f in self._input_documents):
315+
txt_params = TextChunkParams(
316+
max_tokens=kwargs.pop("max_tokens", 1024),
317+
overlap_tokens=kwargs.pop("overlap_tokens", 0),
318+
)
319+
return self.extract_txt(params=txt_params)
320+
311321
resolved = _coerce_params(params, ExtractParams, kwargs)
312322
if (
313323
any(

nemo_retriever/src/nemo_retriever/ingest_modes/lancedb_utils.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -197,7 +197,9 @@ def lancedb_schema(vector_dim: int = 2048) -> Any:
197197
pa.field("pdf_basename", pa.string()),
198198
pa.field("page_number", pa.int32()),
199199
pa.field("source", pa.string()),
200-
pa.field("source_id", pa.string()),
200+
pa.field(
201+
"source_id", pa.string()
202+
), # Different than the source. Field contains path+page_number for aggregation tasks
201203
pa.field("path", pa.string()),
202204
pa.field("text", pa.string()),
203205
pa.field("metadata", pa.string()),

nemo_retriever/src/nemo_retriever/utils/hf_model_registry.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
"nvidia/llama-nemotron-embed-vl-1b-v2": "859e1f2dac29c56c37a5279cf55f53f3e74efc6b",
2929
"meta-llama/Llama-3.2-1B": "4e20de362430cd3b72f300e6b0f18e50e7166e08",
3030
"intfloat/e5-large-unsupervised": "15af9288f69a6291f37bfb89b47e71abc747b206",
31+
"nvidia/llama-nemotron-rerank-1b-v2": "aee9a1be0bbd89489f8bd0ec5763614c8bb85878",
3132
}
3233

3334

nemo_retriever/tests/test_lancedb_utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -198,6 +198,7 @@ def test_returns_schema_with_correct_fields(self):
198198
assert "text" in names
199199
assert "metadata" in names
200200
assert "source" in names
201+
assert "source_id" in names
201202
assert len(names) == 10
202203

203204

0 commit comments

Comments
 (0)