Skip to content

Commit 961f8d3

Browse files
authored
Merge pull request #238 from enoch3712/232-add-multiple-image-processing-on-extraction
Multiple image support added for extraction
2 parents b9e4090 + a469c24 commit 961f8d3

File tree

3 files changed

+111
-45
lines changed

3 files changed

+111
-45
lines changed

extract_thinker/concatenation_handler.py

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,24 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
161161
"text": f"##Content\n\n{item['content']}"
162162
})
163163

164-
# Add image if available
165-
if isinstance(item, dict) and "image" in item:
166-
if item["image"]:
167-
message_content.append({
168-
"type": "image_url",
169-
"image_url": {
170-
"url": f"data:image/jpeg;base64,{encode_image(item['image'])}"
171-
}
172-
})
164+
# Add images if available
165+
if isinstance(item, dict):
166+
images = []
167+
if "images" in item and isinstance(item["images"], list):
168+
images.extend(item["images"])
169+
if "image" in item and item["image"] is not None:
170+
images.append(item["image"])
171+
172+
for img in images:
173+
if img:
174+
message_content.append({
175+
"type": "image_url",
176+
"image_url": {
177+
"url": f"data:image/jpeg;base64,{encode_image(img)}"
178+
}
179+
})
173180
else:
174-
# Fallback to original single-item handling
181+
# Handle single item
175182
if isinstance(content, dict):
176183
# Add text content if available
177184
if "content" in content:
@@ -181,16 +188,20 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
181188
})
182189

183190
# Add images
184-
if "image" in content or "images" in content:
185-
images = content.get("images", [content.get("image")])
186-
for img in images:
187-
if img:
188-
message_content.append({
189-
"type": "image_url",
190-
"image_url": {
191-
"url": f"data:image/jpeg;base64,{encode_image(img)}"
192-
}
193-
})
191+
images = []
192+
if "images" in content and isinstance(content["images"], list):
193+
images.extend(content["images"])
194+
if "image" in content and content["image"] is not None:
195+
images.append(content["image"])
196+
197+
for img in images:
198+
if img:
199+
message_content.append({
200+
"type": "image_url",
201+
"image_url": {
202+
"url": f"data:image/jpeg;base64,{encode_image(img)}"
203+
}
204+
})
194205

195206
return message_content
196207

extract_thinker/extractor.py

Lines changed: 70 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(
4949
self.llm_interceptors: List[LlmInterceptor] = []
5050
self.is_classify_image: bool = False
5151
self._skip_loading: bool = False
52+
self.chunk_height: int = 1500
5253

5354
def add_interceptor(
5455
self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
@@ -184,16 +185,27 @@ def _map_to_universal_format(
184185
Maps loaded content to a universal format that _extract can process.
185186
The universal format is:
186187
{
187-
"content": str, # The text content
188-
"images": List[bytes], # Optional list of image bytes if vision=True
189-
"metadata": Dict[str, Any] # Optional metadata
188+
"content": str, # The text content (joined from pages)
189+
"images": List[bytes],
190+
# Optional list of image bytes if vision=True (can hold multiple)
191+
"metadata": {}
190192
}
191193
"""
192194
if content is None:
193195
return {"content": "", "images": [], "metadata": {}}
194196

195197
# If content is already in universal format, return as is
196198
if isinstance(content, dict) and "content" in content:
199+
# Ensure 'images' is a list
200+
if "image" in content and "images" not in content:
201+
# Merge single 'image' into 'images'
202+
content["images"] = [content["image"]] if content["image"] else []
203+
del content["image"]
204+
elif "images" in content and not isinstance(content["images"], list):
205+
# If 'images' is mistakenly a single byte blob, fix it
206+
content["images"] = [content["images"]] if content["images"] else []
207+
elif "images" not in content:
208+
content["images"] = []
197209
return content
198210

199211
# Handle list of pages from document loader
@@ -207,8 +219,13 @@ def _map_to_universal_format(
207219
if 'content' in page:
208220
text_content.append(page['content'])
209221
# Extract images if vision mode is enabled
210-
if vision and 'image' in page:
211-
images.append(page['image'])
222+
if vision:
223+
# If there's a list of images
224+
if 'images' in page and isinstance(page['images'], list):
225+
images.extend(page['images'])
226+
# Or just a single 'image'
227+
elif 'image' in page and page['image']:
228+
images.append(page['image'])
212229

213230
return {
214231
"content": "\n\n".join(text_content) if text_content else "",
@@ -230,11 +247,18 @@ def _map_to_universal_format(
230247
if isinstance(text_content, list):
231248
text_content = "\n".join(text_content)
232249

250+
images = []
251+
if vision:
252+
if "images" in content and isinstance(content["images"], list):
253+
images.extend(content["images"])
254+
elif "image" in content and content["image"]:
255+
images.append(content["image"])
256+
233257
return {
234258
"content": text_content,
235-
"images": content.get("images", []) if vision else [],
259+
"images": images,
236260
"metadata": {k: v for k, v in content.items()
237-
if k not in ["text", "images", "content"]}
261+
if k not in ["text", "images", "image", "content"]}
238262
}
239263

240264
raise ValueError(f"Unsupported content format: {type(content)}")
@@ -1067,7 +1091,7 @@ def _add_images_to_message_content(
10671091
elif isinstance(content, dict):
10681092
# Handle legacy format
10691093
image_data = content.get('image') or content.get('images')
1070-
self._append_images(image_data, message_content)
1094+
self._append_images(image_data[0], message_content)
10711095

10721096
def _append_images(
10731097
self,
@@ -1078,27 +1102,54 @@ def _append_images(
10781102
Append images to the message content.
10791103
10801104
Args:
1081-
image_data: The image data to process.
1105+
image_data: The image data to process. Can be:
1106+
- A dictionary with 'image' or 'images' keys
1107+
- A list of images
1108+
- A single image
10821109
message_content: The message content to append images to.
10831110
"""
10841111
if not image_data:
10851112
return
10861113

1114+
images_list = []
10871115
if isinstance(image_data, dict):
1088-
images_list = image_data.values()
1116+
# Handle dictionary format
1117+
if "images" in image_data:
1118+
# If "images" key exists, it should be a list of images
1119+
if isinstance(image_data["images"], list):
1120+
images_list.extend(image_data["images"])
1121+
else:
1122+
# Single image in "images" key
1123+
images_list.append(image_data["images"])
1124+
elif "image" in image_data and image_data["image"] is not None:
1125+
# Single image in "image" key
1126+
images_list.append(image_data["image"])
10891127
elif isinstance(image_data, list):
1090-
images_list = image_data
1128+
# Process list of images or image dictionaries
1129+
for item in image_data:
1130+
if isinstance(item, dict):
1131+
# Handle nested image dictionaries
1132+
if "images" in item and isinstance(item["images"], list):
1133+
images_list.extend(item["images"])
1134+
elif "image" in item and item["image"] is not None:
1135+
images_list.append(item["image"])
1136+
else:
1137+
# Raw image data
1138+
images_list.append(item)
10911139
else:
1092-
images_list = [image_data]
1140+
# Single raw image
1141+
images_list.append(image_data)
10931142

1143+
# Process all collected images
10941144
for img in images_list:
1095-
base64_image = encode_image(img)
1096-
message_content.append({
1097-
"type": "image_url",
1098-
"image_url": {
1099-
"url": f"data:image/jpeg;base64,{base64_image}"
1100-
}
1101-
})
1145+
if img is not None: # Skip None values
1146+
base64_image = encode_image(img)
1147+
message_content.append({
1148+
"type": "image_url",
1149+
"image_url": {
1150+
"url": f"data:image/jpeg;base64,{base64_image}"
1151+
}
1152+
})
11021153

11031154
def _build_messages(
11041155
self,

extract_thinker/pagination_handler.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -491,16 +491,21 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
491491
"""Build content for vision request."""
492492
message_content = []
493493

494-
# Add text content if available
494+
# If there's textual 'content', push it first
495495
if isinstance(content, dict) and "content" in content:
496496
message_content.append({
497497
"type": "text",
498498
"text": f"##Content\n\n{content['content']}"
499499
})
500-
501-
# Add images
502-
if isinstance(content, dict) and ("image" in content or "images" in content):
503-
images = content.get("images", [content.get("image")])
500+
501+
# Now handle multiple images
502+
if isinstance(content, dict):
503+
images = []
504+
if "images" in content and isinstance(content["images"], list):
505+
images.extend(content["images"])
506+
if "image" in content and content["image"] is not None:
507+
images.append(content["image"])
508+
504509
for img in images:
505510
if img:
506511
message_content.append({
@@ -509,7 +514,6 @@ def _build_vision_content(self, content: Any) -> List[Dict[str, Any]]:
509514
"url": f"data:image/jpeg;base64,{encode_image(img)}"
510515
}
511516
})
512-
513517
return message_content
514518

515519
def _build_text_content(self, content: Any) -> str:

0 commit comments

Comments
 (0)