
Commit b1277c0

AsiaCao and humpydonkey authored
fix: handle exceptions in parsing images gracefully (#10)
## Changes

1. Handle exceptions in parsing images gracefully
2. Set a cap on the max parallelism
3. Minor docs updates

---------

Co-authored-by: Yazhou Cao <[email protected]>
1 parent eb25693 · commit b1277c0

File tree: 3 files changed (+51 / -12 lines)

agentic_doc/common.py (+6)

```diff
@@ -21,6 +21,12 @@ class ChunkType(str, Enum):


 class ChunkGroundingBox(BaseModel):
+    """
+    A bounding box of a chunk.
+
+    The coordinates are in the format of [left, top, right, bottom].
+    """
+
     l: float  # noqa: E741
     t: float
     r: float
```
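
Below is a minimal sketch (not from the repo) of how the documented coordinate order maps onto the model's fields: `l`, `t`, `r` are left, top, right, and the fourth field is assumed to be `b: float` for bottom, since the hunk above is truncated after `r`. The pixel values are made up for illustration.

```python
from agentic_doc.common import ChunkGroundingBox

# [left, top, right, bottom] in the order described by the new docstring
box = ChunkGroundingBox(l=10.0, t=20.0, r=110.0, b=220.0)

# Width and height follow directly from the coordinate convention
width = box.r - box.l   # 100.0
height = box.b - box.t  # 200.0
```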

agentic_doc/config.py (+12)

```diff
@@ -1,11 +1,13 @@
 import json
+import logging
 from typing import Literal

 import structlog
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict

 _LOGGER = structlog.get_logger(__name__)
+_MAX_PARALLEL_TASKS = 100


 class Settings(BaseSettings):
@@ -36,3 +38,13 @@ def __str__(self) -> str:

 settings = Settings()
 _LOGGER.info(f"Settings loaded: {settings}")
+
+if settings.batch_size * settings.max_workers > _MAX_PARALLEL_TASKS:
+    raise ValueError(
+        f"Batch size * max workers must be less than {_MAX_PARALLEL_TASKS}."
+        " Please reduce the batch size or max workers."
+        f" Current settings: batch_size={settings.batch_size}, max_workers={settings.max_workers}"
+    )
+
+if settings.retry_logging_style == "inline_block":
+    logging.getLogger("httpx").setLevel(logging.WARNING)
```
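
The new guard caps total in-flight work: batch_size × max_workers must stay at or below 100, otherwise loading the settings raises a ValueError at import time. Here is a standalone sketch of the constraint (not the library's code; the function name is hypothetical and exists only for this example).

```python
# Illustrative re-statement of the cap check added above
_MAX_PARALLEL_TASKS = 100

def fits_parallelism_cap(batch_size: int, max_workers: int) -> bool:
    """Return True if the configuration would pass the new guard."""
    return batch_size * max_workers <= _MAX_PARALLEL_TASKS

print(fits_parallelism_cap(batch_size=4, max_workers=5))   # True: 20 parallel tasks
print(fits_parallelism_cap(batch_size=25, max_workers=8))  # False: 200 tasks would raise ValueError
```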

agentic_doc/parse.py (+33 / -12)

```diff
@@ -101,19 +101,9 @@ def parse_and_save_document(
     file_type = "pdf" if file_path.suffix.lower() == ".pdf" else "image"

     if file_type == "image":
-        result_raw = _send_parsing_request(str(file_path))
-        result_raw = {
-            **result_raw["data"],
-            "doc_type": "image",
-            "start_page_idx": 0,
-            "end_page_idx": 0,
-        }
-        result = ParsedDocument.model_validate(result_raw)
+        result = _parse_image(file_path)
     elif file_type == "pdf":
-        with tempfile.TemporaryDirectory() as temp_dir:
-            parts = split_pdf(file_path, temp_dir)
-            part_results = _parse_doc_in_parallel(parts, doc_name=file_path.name)
-            result = _merge_part_results(part_results)
+        result = _parse_pdf(file_path)
     else:
         raise ValueError(f"Unsupported file type: {file_type}")

@@ -130,6 +120,37 @@ def parse_and_save_document(
     return save_path


+def _parse_pdf(file_path: Union[str, Path]) -> ParsedDocument:
+    with tempfile.TemporaryDirectory() as temp_dir:
+        parts = split_pdf(file_path, temp_dir)
+        file_path = Path(file_path)
+        part_results = _parse_doc_in_parallel(parts, doc_name=file_path.name)
+        return _merge_part_results(part_results)
+
+
+def _parse_image(file_path: Union[str, Path]) -> ParsedDocument:
+    try:
+        result_raw = _send_parsing_request(str(file_path))
+        result_raw = {
+            **result_raw["data"],
+            "doc_type": "image",
+            "start_page_idx": 0,
+            "end_page_idx": 0,
+        }
+        return ParsedDocument.model_validate(result_raw)
+    except Exception as e:
+        error_msg = str(e)
+        _LOGGER.error(f"Error parsing image '{file_path}' due to: {error_msg}")
+        chunks = [Chunk.error_chunk(error_msg, 0)]
+        return ParsedDocument(
+            markdown="",
+            chunks=chunks,
+            start_page_idx=0,
+            end_page_idx=0,
+            doc_type="image",
+        )
+
+
 def _merge_part_results(results: list[ParsedDocument]) -> ParsedDocument:
     if not results:
         _LOGGER.warning(
```
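
With this change, a failed image parse no longer propagates the exception; the caller receives a ParsedDocument whose single error chunk records what went wrong. A minimal sketch of that behavior, assuming a hypothetical corrupt image file (in normal use you would call the public parse functions rather than the private helper):

```python
from agentic_doc.parse import _parse_image

# "corrupt_photo.png" is a hypothetical file that the parsing API rejects
result = _parse_image("corrupt_photo.png")

# Instead of raising, the helper returns a fallback ParsedDocument:
# empty markdown, doc_type "image", and one chunk built via Chunk.error_chunk(...)
print(result.markdown)   # ""
print(result.doc_type)   # "image"
print(result.chunks[0])  # the error chunk describing the failure
```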

Comments (0)