Skip to content

Commit

Permalink
Fix bug on unstructured fallback parsing (#1995)
Browse files: browse the repository at this point in the history
* Fix bug on unstructured fallback parsing

* Bump release
  • Loading branch information
NolanTrem authored Feb 21, 2025
1 parent 9140000; commit 3772ef5
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 20 deletions.
50 changes: 31 additions & 19 deletions py/core/providers/ingestion/unstructured/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,35 +203,47 @@ async def parse_fallback(
ingestion_config: dict,
parser_name: str,
) -> AsyncGenerator[FallbackElement, None]:
context = ""
async for text in self.parsers[parser_name].ingest(
contents = []
async for chunk in self.parsers[parser_name].ingest(
file_content, **ingestion_config
): # type: ignore
if text is not None:
context += text + "\n\n"
logging.info(f"Fallback ingestion with config = {ingestion_config}")
if isinstance(chunk, dict) and chunk.get("content"):
contents.append(chunk)
elif chunk: # Handle string output for backward compatibility
contents.append({"content": chunk})

if not context.strip():
if not contents:
logging.warning(
"No valid text content was extracted during parsing"
)
return

loop = asyncio.get_event_loop()
splitter = RecursiveCharacterTextSplitter(
chunk_size=ingestion_config["new_after_n_chars"],
chunk_overlap=ingestion_config["overlap"],
)
chunks = await loop.run_in_executor(
None, splitter.create_documents, [context]
)
logging.info(f"Fallback ingestion with config = {ingestion_config}")

iteration = 0
for content_item in contents:
text = content_item["content"]

for chunk_id, text_chunk in enumerate(chunks):
yield FallbackElement(
text=text_chunk.page_content,
metadata={"chunk_id": chunk_id},
loop = asyncio.get_event_loop()
splitter = RecursiveCharacterTextSplitter(
chunk_size=ingestion_config["new_after_n_chars"],
chunk_overlap=ingestion_config["overlap"],
)
await asyncio.sleep(0)
chunks = await loop.run_in_executor(
None, splitter.create_documents, [text]
)

for text_chunk in chunks:
metadata = {"chunk_id": iteration}
if "page_number" in content_item:
metadata["page_number"] = content_item["page_number"]

yield FallbackElement(
text=text_chunk.page_content,
metadata=metadata,
)
iteration += 1
await asyncio.sleep(0)

async def parse(
self,
Expand Down
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "r2r"
version = "3.4.1"
version = "3.4.2"
description = "SciPhi R2R"
readme = "README.md"
license = {text = "MIT"}
Expand Down

0 comments on commit 3772ef5

Please sign in to comment.