Skip to content

Commit 9d40553

Browse files
authored
Adapt to JSON Return Value of Parser.ingest (#2232)
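Adapt to the case where parser.ingest yields a JSON-style dict (with "content" and "metadata" keys) instead of a plain text string; plain text results are still handled as before.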
1 parent be0c050 commit 9d40553

File tree

1 file changed

+14
-2
lines changed
  • py/core/providers/ingestion/r2r

1 file changed

+14
-2
lines changed

py/core/providers/ingestion/r2r/base.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,9 +387,18 @@ async def parse(
387387
else:
388388
# Standard parsing for non-override cases
389389
async for text in self.parsers[document.document_type].ingest(
390-
file_content, **ingestion_config_override
390+
file_content,
391+
**ingestion_config_override,
392+
document=document,
391393
):
392-
if text is not None:
394+
if text is not None and isinstance(text, dict):
395+
contents.append(
396+
{
397+
"content": text.get("content", ""),
398+
"metadata": text.get("metadata", {}),
399+
}
400+
)
401+
elif text is not None:
393402
contents.append({"content": text})
394403

395404
if not contents:
@@ -401,12 +410,15 @@ async def parse(
401410
iteration = 0
402411
for content_item in contents:
403412
chunk_text = content_item["content"]
413+
parser_generated = content_item.get("metadata", {})
404414
chunks = self.chunk(chunk_text, ingestion_config_override)
405415

406416
for chunk in chunks:
407417
metadata = {**document.metadata, "chunk_order": iteration}
408418
if "page_number" in content_item:
409419
metadata["page_number"] = content_item["page_number"]
420+
if parser_generated:
421+
metadata["parser_generated"] = parser_generated
410422

411423
extraction = DocumentChunk(
412424
id=generate_extraction_id(document.id, iteration),

0 commit comments

Comments
 (0)