@@ -203,35 +203,47 @@ async def parse_fallback(
203
203
ingestion_config : dict ,
204
204
parser_name : str ,
205
205
) -> AsyncGenerator [FallbackElement , None ]:
206
- context = ""
207
- async for text in self .parsers [parser_name ].ingest (
206
+ contents = []
207
+ async for chunk in self .parsers [parser_name ].ingest (
208
208
file_content , ** ingestion_config
209
209
): # type: ignore
210
- if text is not None :
211
- context += text + "\n \n "
212
- logging .info (f"Fallback ingestion with config = { ingestion_config } " )
210
+ if isinstance (chunk , dict ) and chunk .get ("content" ):
211
+ contents .append (chunk )
212
+ elif chunk : # Handle string output for backward compatibility
213
+ contents .append ({"content" : chunk })
213
214
214
- if not context . strip () :
215
+ if not contents :
215
216
logging .warning (
216
217
"No valid text content was extracted during parsing"
217
218
)
218
219
return
219
220
220
- loop = asyncio .get_event_loop ()
221
- splitter = RecursiveCharacterTextSplitter (
222
- chunk_size = ingestion_config ["new_after_n_chars" ],
223
- chunk_overlap = ingestion_config ["overlap" ],
224
- )
225
- chunks = await loop .run_in_executor (
226
- None , splitter .create_documents , [context ]
227
- )
221
+ logging .info (f"Fallback ingestion with config = { ingestion_config } " )
222
+
223
+ iteration = 0
224
+ for content_item in contents :
225
+ text = content_item ["content" ]
228
226
229
- for chunk_id , text_chunk in enumerate ( chunks ):
230
- yield FallbackElement (
231
- text = text_chunk . page_content ,
232
- metadata = { "chunk_id" : chunk_id } ,
227
+ loop = asyncio . get_event_loop ()
228
+ splitter = RecursiveCharacterTextSplitter (
229
+ chunk_size = ingestion_config [ "new_after_n_chars" ] ,
230
+ chunk_overlap = ingestion_config [ "overlap" ] ,
233
231
)
234
- await asyncio .sleep (0 )
232
+ chunks = await loop .run_in_executor (
233
+ None , splitter .create_documents , [text ]
234
+ )
235
+
236
+ for text_chunk in chunks :
237
+ metadata = {"chunk_id" : iteration }
238
+ if "page_number" in content_item :
239
+ metadata ["page_number" ] = content_item ["page_number" ]
240
+
241
+ yield FallbackElement (
242
+ text = text_chunk .page_content ,
243
+ metadata = metadata ,
244
+ )
245
+ iteration += 1
246
+ await asyncio .sleep (0 )
235
247
236
248
async def parse (
237
249
self ,
0 commit comments