Skip to content

Commit 3772ef5

Browse files
authored
Fix bug on unstructured fallback parsing (#1995)
* Fix bug on unstructured fallback parsing
* Bump release
1 parent 9140000 commit 3772ef5

File tree

2 files changed

+32
-20
lines changed

2 files changed

+32
-20
lines changed

py/core/providers/ingestion/unstructured/base.py

Lines changed: 31 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -203,35 +203,47 @@ async def parse_fallback(
203203
ingestion_config: dict,
204204
parser_name: str,
205205
) -> AsyncGenerator[FallbackElement, None]:
206-
context = ""
207-
async for text in self.parsers[parser_name].ingest(
206+
contents = []
207+
async for chunk in self.parsers[parser_name].ingest(
208208
file_content, **ingestion_config
209209
): # type: ignore
210-
if text is not None:
211-
context += text + "\n\n"
212-
logging.info(f"Fallback ingestion with config = {ingestion_config}")
210+
if isinstance(chunk, dict) and chunk.get("content"):
211+
contents.append(chunk)
212+
elif chunk: # Handle string output for backward compatibility
213+
contents.append({"content": chunk})
213214

214-
if not context.strip():
215+
if not contents:
215216
logging.warning(
216217
"No valid text content was extracted during parsing"
217218
)
218219
return
219220

220-
loop = asyncio.get_event_loop()
221-
splitter = RecursiveCharacterTextSplitter(
222-
chunk_size=ingestion_config["new_after_n_chars"],
223-
chunk_overlap=ingestion_config["overlap"],
224-
)
225-
chunks = await loop.run_in_executor(
226-
None, splitter.create_documents, [context]
227-
)
221+
logging.info(f"Fallback ingestion with config = {ingestion_config}")
222+
223+
iteration = 0
224+
for content_item in contents:
225+
text = content_item["content"]
228226

229-
for chunk_id, text_chunk in enumerate(chunks):
230-
yield FallbackElement(
231-
text=text_chunk.page_content,
232-
metadata={"chunk_id": chunk_id},
227+
loop = asyncio.get_event_loop()
228+
splitter = RecursiveCharacterTextSplitter(
229+
chunk_size=ingestion_config["new_after_n_chars"],
230+
chunk_overlap=ingestion_config["overlap"],
233231
)
234-
await asyncio.sleep(0)
232+
chunks = await loop.run_in_executor(
233+
None, splitter.create_documents, [text]
234+
)
235+
236+
for text_chunk in chunks:
237+
metadata = {"chunk_id": iteration}
238+
if "page_number" in content_item:
239+
metadata["page_number"] = content_item["page_number"]
240+
241+
yield FallbackElement(
242+
text=text_chunk.page_content,
243+
metadata=metadata,
244+
)
245+
iteration += 1
246+
await asyncio.sleep(0)
235247

236248
async def parse(
237249
self,

py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "r2r"
7-
version = "3.4.1"
7+
version = "3.4.2"
88
description = "SciPhi R2R"
99
readme = "README.md"
1010
license = {text = "MIT"}

0 commit comments

Comments
 (0)