@@ -387,9 +387,18 @@ async def parse(
387387 else :
388388 # Standard parsing for non-override cases
389389 async for text in self .parsers [document .document_type ].ingest (
390- file_content , ** ingestion_config_override
390+ file_content ,
391+ ** ingestion_config_override ,
392+ document = document ,
391393 ):
392- if text is not None :
394+ if text is not None and isinstance (text , dict ):
395+ contents .append (
396+ {
397+ "content" : text .get ("content" , "" ),
398+ "metadata" : text .get ("metadata" , {}),
399+ }
400+ )
401+ elif text is not None :
393402 contents .append ({"content" : text })
394403
395404 if not contents :
@@ -401,12 +410,15 @@ async def parse(
401410 iteration = 0
402411 for content_item in contents :
403412 chunk_text = content_item ["content" ]
413+ parser_generated = content_item .get ("metadata" , {})
404414 chunks = self .chunk (chunk_text , ingestion_config_override )
405415
406416 for chunk in chunks :
407417 metadata = {** document .metadata , "chunk_order" : iteration }
408418 if "page_number" in content_item :
409419 metadata ["page_number" ] = content_item ["page_number" ]
420+ if parser_generated :
421+ metadata ["parser_generated" ] = parser_generated
410422
411423 extraction = DocumentChunk (
412424 id = generate_extraction_id (document .id , iteration ),
0 commit comments