Skip to content

Commit db7b306

Browse files
committed
Fix in-process extract to handle .txt inputs
1 parent 885978a commit db7b306

File tree

1 file changed

+6
-0
lines changed

1 file changed

+6
-0
lines changed

nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,12 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "InProc
10011001
# NOTE: `kwargs` passed to `.extract()` are intended primarily for PDF extraction
10021002
# (e.g. `extract_text`, `dpi`, etc.). Downstream model stages do NOT necessarily
10031003
# accept the same keyword arguments. Keep per-stage kwargs isolated.
1004+
if self._input_documents and all(f.lower().endswith(".txt") for f in self._input_documents):
1005+
txt_params = TextChunkParams(
1006+
max_tokens=kwargs.pop("max_tokens", 1024),
1007+
overlap_tokens=kwargs.pop("overlap_tokens", 0),
1008+
)
1009+
return self.extract_txt(params=txt_params)
10041010

10051011
resolved = _coerce_params(params, ExtractParams, kwargs)
10061012
if (

0 commit comments

Comments
 (0)