
Commit b1277c0

AsiaCao and humpydonkey authored
fix: handle exceptions in parsing images gracefully (#10)
## Changes

1. Handle exceptions in parsing images gracefully
2. Set a cap on the max parallelism
3. Minor docs updates

---------

Co-authored-by: Yazhou Cao <[email protected]>
1 parent eb25693 · commit b1277c0

File tree: 3 files changed (+51 / -12 lines)

agentic_doc/common.py (+6)

```diff
@@ -21,6 +21,12 @@ class ChunkType(str, Enum):


 class ChunkGroundingBox(BaseModel):
+    """
+    A bounding box of a chunk.
+
+    The coordinates are in the format of [left, top, right, bottom].
+    """
+
     l: float  # noqa: E741
     t: float
     r: float
```
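
Below is a minimal sketch (not from the repo) of how the documented coordinate order maps onto the model's fields: `l`, `t`, `r` are left, top, right, and the fourth field is assumed to be `b: float` for bottom, since the hunk above is truncated after `r`. The pixel values are made up for illustration.

```python
from agentic_doc.common import ChunkGroundingBox

# [left, top, right, bottom] in the order described by the new docstring
box = ChunkGroundingBox(l=10.0, t=20.0, r=110.0, b=220.0)

# Width and height follow directly from the coordinate convention
width = box.r - box.l   # 100.0
height = box.b - box.t  # 200.0
```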

agentic_doc/config.py (+12)

```diff
@@ -1,11 +1,13 @@
 import json
+import logging
 from typing import Literal

 import structlog
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict

 _LOGGER = structlog.get_logger(__name__)
+_MAX_PARALLEL_TASKS = 100


 class Settings(BaseSettings):
@@ -36,3 +38,13 @@ def __str__(self) -> str:

 settings = Settings()
 _LOGGER.info(f"Settings loaded: {settings}")
+
+if settings.batch_size * settings.max_workers > _MAX_PARALLEL_TASKS:
+    raise ValueError(
+        f"Batch size * max workers must be less than {_MAX_PARALLEL_TASKS}."
+        " Please reduce the batch size or max workers."
+        f" Current settings: batch_size={settings.batch_size}, max_workers={settings.max_workers}"
+    )
+
+if settings.retry_logging_style == "inline_block":
+    logging.getLogger("httpx").setLevel(logging.WARNING)
```
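
The new guard caps total in-flight work: batch_size × max_workers must stay at or below 100, otherwise loading the settings raises a ValueError at import time. Here is a standalone sketch of the constraint (not the library's code; the function name is hypothetical and exists only for this example).

```python
# Illustrative re-statement of the cap check added above
_MAX_PARALLEL_TASKS = 100

def fits_parallelism_cap(batch_size: int, max_workers: int) -> bool:
    """Return True if the configuration would pass the new guard."""
    return batch_size * max_workers <= _MAX_PARALLEL_TASKS

print(fits_parallelism_cap(batch_size=4, max_workers=5))   # True: 20 parallel tasks
print(fits_parallelism_cap(batch_size=25, max_workers=8))  # False: 200 tasks would raise ValueError
```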

agentic_doc/parse.py (+33 / -12)

```diff
@@ -101,19 +101,9 @@ def parse_and_save_document(
     file_type = "pdf" if file_path.suffix.lower() == ".pdf" else "image"

     if file_type == "image":
-        result_raw = _send_parsing_request(str(file_path))
-        result_raw = {
-            **result_raw["data"],
-            "doc_type": "image",
-            "start_page_idx": 0,
-            "end_page_idx": 0,
-        }
-        result = ParsedDocument.model_validate(result_raw)
+        result = _parse_image(file_path)
     elif file_type == "pdf":
-        with tempfile.TemporaryDirectory() as temp_dir:
-            parts = split_pdf(file_path, temp_dir)
-            part_results = _parse_doc_in_parallel(parts, doc_name=file_path.name)
-            result = _merge_part_results(part_results)
+        result = _parse_pdf(file_path)
     else:
         raise ValueError(f"Unsupported file type: {file_type}")

@@ -130,6 +120,37 @@ def parse_and_save_document(
     return save_path


+def _parse_pdf(file_path: Union[str, Path]) -> ParsedDocument:
+    with tempfile.TemporaryDirectory() as temp_dir:
+        parts = split_pdf(file_path, temp_dir)
+        file_path = Path(file_path)
+        part_results = _parse_doc_in_parallel(parts, doc_name=file_path.name)
+        return _merge_part_results(part_results)
+
+
+def _parse_image(file_path: Union[str, Path]) -> ParsedDocument:
+    try:
+        result_raw = _send_parsing_request(str(file_path))
+        result_raw = {
+            **result_raw["data"],
+            "doc_type": "image",
+            "start_page_idx": 0,
+            "end_page_idx": 0,
+        }
+        return ParsedDocument.model_validate(result_raw)
+    except Exception as e:
+        error_msg = str(e)
+        _LOGGER.error(f"Error parsing image '{file_path}' due to: {error_msg}")
+        chunks = [Chunk.error_chunk(error_msg, 0)]
+        return ParsedDocument(
+            markdown="",
+            chunks=chunks,
+            start_page_idx=0,
+            end_page_idx=0,
+            doc_type="image",
+        )
+
+
 def _merge_part_results(results: list[ParsedDocument]) -> ParsedDocument:
     if not results:
         _LOGGER.warning(
```
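
With this change, a failed image parse no longer propagates the exception; the caller receives a ParsedDocument whose single error chunk records what went wrong. A minimal sketch of that behavior, assuming a hypothetical corrupt image file (in normal use you would call the public parse functions rather than the private helper):

```python
from agentic_doc.parse import _parse_image

# "corrupt_photo.png" is a hypothetical file that the parsing API rejects
result = _parse_image("corrupt_photo.png")

# Instead of raising, the helper returns a fallback ParsedDocument:
# empty markdown, doc_type "image", and one chunk built via Chunk.error_chunk(...)
print(result.markdown)   # ""
print(result.doc_type)   # "image"
print(result.chunks[0])  # the error chunk describing the failure
```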

Comments (0)