@@ -101,19 +101,9 @@ def parse_and_save_document(
101
101
file_type = "pdf" if file_path .suffix .lower () == ".pdf" else "image"
102
102
103
103
if file_type == "image" :
104
- result_raw = _send_parsing_request (str (file_path ))
105
- result_raw = {
106
- ** result_raw ["data" ],
107
- "doc_type" : "image" ,
108
- "start_page_idx" : 0 ,
109
- "end_page_idx" : 0 ,
110
- }
111
- result = ParsedDocument .model_validate (result_raw )
104
+ result = _parse_image (file_path )
112
105
elif file_type == "pdf" :
113
- with tempfile .TemporaryDirectory () as temp_dir :
114
- parts = split_pdf (file_path , temp_dir )
115
- part_results = _parse_doc_in_parallel (parts , doc_name = file_path .name )
116
- result = _merge_part_results (part_results )
106
+ result = _parse_pdf (file_path )
117
107
else :
118
108
raise ValueError (f"Unsupported file type: { file_type } " )
119
109
@@ -130,6 +120,37 @@ def parse_and_save_document(
130
120
return save_path
131
121
132
122
123
def _parse_pdf(file_path: Union[str, Path]) -> ParsedDocument:
    """Parse a PDF by splitting it into parts and merging the part results.

    ``split_pdf`` is handed a temporary directory that is removed again when
    this call finishes. The resulting parts are passed to
    ``_parse_doc_in_parallel`` and the per-part documents are combined with
    ``_merge_part_results``.

    Args:
        file_path: Location of the PDF, given as a string or a ``Path``.

    Returns:
        The single ``ParsedDocument`` returned by ``_merge_part_results``.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_parts = split_pdf(file_path, scratch_dir)
        # Only the bare file name (no directories) is used as the doc name.
        pdf_name = Path(file_path).name
        per_part_docs = _parse_doc_in_parallel(pdf_parts, doc_name=pdf_name)
        # Merge while the temporary directory still exists.
        merged = _merge_part_results(per_part_docs)
        return merged
129
+
130
+
131
def _parse_image(file_path: Union[str, Path]) -> ParsedDocument:
    """Parse a single image file into a ``ParsedDocument``.

    The path is sent through ``_send_parsing_request`` and the ``"data"``
    entry of the response is validated as an image document whose start and
    end page index are both 0.

    This function is deliberately best-effort: any exception raised while
    requesting or validating is logged (with its traceback) and reported as a
    document holding a single error chunk instead of being propagated.

    Args:
        file_path: Location of the image, given as a string or a ``Path``.

    Returns:
        The validated ``ParsedDocument`` on success; otherwise a
        ``ParsedDocument`` with empty markdown and one error chunk carrying
        the exception message.
    """
    try:
        response = _send_parsing_request(str(file_path))
        # An image is always treated as a one-page document (page index 0).
        payload = {
            **response["data"],
            "doc_type": "image",
            "start_page_idx": 0,
            "end_page_idx": 0,
        }
        return ParsedDocument.model_validate(payload)
    except Exception as e:
        error_msg = str(e)
        # `exception` logs at ERROR level and keeps the traceback, so request
        # failures can be told apart from response-validation failures.
        _LOGGER.exception(
            f"Error parsing image '{file_path}' due to: {error_msg}"
        )
        return ParsedDocument(
            markdown="",
            chunks=[Chunk.error_chunk(error_msg, 0)],
            start_page_idx=0,
            end_page_idx=0,
            doc_type="image",
        )
152
+
153
+
133
154
def _merge_part_results (results : list [ParsedDocument ]) -> ParsedDocument :
134
155
if not results :
135
156
_LOGGER .warning (
0 commit comments