Commit a3e052d

Merge pull request #239 from enoch3712/230-document-loader---image-generation-for-url
Image generation for URL in DocumentLoader
2 parents 961f8d3 + 060b547 commit a3e052d

12 files changed: +401 -82 lines changed

extract_thinker/document_loader/document_loader.py

+136 -8
@@ -3,25 +3,36 @@
 from io import BytesIO
 from PIL import Image
 import pypdfium2 as pdfium
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, List
 from cachetools import TTLCache
 import os
 import magic
 from extract_thinker.utils import get_file_extension, check_mime_type
+from playwright.sync_api import sync_playwright
+from urllib.parse import urlparse
+import base64
+import math


 class DocumentLoader(ABC):
-    def __init__(self, content: Any = None, cache_ttl: int = 300):
+    # SUPPORTED_FORMATS = [
+    #     "pdf", "jpg", "jpeg", "png", "tiff", "bmp"
+    # ]
+
+    def __init__(self, content: Any = None, cache_ttl: int = 300, screenshot_timeout: int = 1000):
         """Initialize loader.

         Args:
             content: Initial content
             cache_ttl: Cache time-to-live in seconds
+            screenshot_timeout: Timeout in milliseconds to wait for page content load when capturing a screenshot.
         """
         self.content = content
         self.file_path = None
         self.cache = TTLCache(maxsize=100, ttl=cache_ttl)
         self.vision_mode = False
         self.max_image_size = None  # Changed to None by default
+        self.is_url = False  # Indicates if the source is a URL
+        self.screenshot_timeout = screenshot_timeout

     def set_max_image_size(self, size: int) -> None:
         """Set the maximum image size."""
@@ -31,6 +42,10 @@ def set_vision_mode(self, enabled: bool = True) -> None:
         """Enable or disable vision mode processing."""
         self.vision_mode = enabled

+    def set_screenshot_timeout(self, timeout: int) -> None:
+        """Set the screenshot timeout in milliseconds for capturing a screenshot from a URL."""
+        self.screenshot_timeout = timeout
+
     def can_handle(self, source: Union[str, BytesIO]) -> bool:
         """
         Checks if the loader can handle the given source.
@@ -60,7 +75,6 @@ def _can_handle_file_path(self, file_path: str) -> bool:
     def _can_handle_stream(self, stream: BytesIO) -> bool:
         """Checks if the loader can handle the given BytesIO stream."""
         try:
-            # Read the first few bytes to determine file type
             mime = magic.from_buffer(stream.getvalue(), mime=True)
             stream.seek(0)  # Reset stream position
             return check_mime_type(mime, self.SUPPORTED_FORMATS)
@@ -85,19 +99,36 @@ def convert_to_images(self, file: Union[str, io.BytesIO, io.BufferedReader], sca
             raise TypeError("file must be a file path (str) or a file-like stream")

     def _convert_file_to_images(self, file_path: str, scale: float) -> Dict[int, bytes]:
-        # Check if the file is already an image
+        """Convert file to images, handling both URLs and local files."""
+        # Check if it's a URL
+        if self._is_url(file_path):
+            self.is_url = True  # Set the instance variable if the source is a URL
+            try:
+                screenshot = self._capture_screenshot_from_url(file_path)
+                # Convert screenshot to PIL Image for potential resizing
+                img = Image.open(BytesIO(screenshot))
+                img = self._resize_if_needed(img)

+                # Split into vertical chunks
+                chunks = self._split_image_vertically(img)

+                # Return dictionary with chunks as list
+                return {0: chunks}  # All chunks from URL are considered "page 0"

+            except Exception as e:
+                raise ValueError(f"Failed to capture screenshot from URL: {str(e)}")

+        # Existing code for local files...
         try:
             Image.open(file_path)
             is_image = True
         except IOError:
             is_image = False

         if is_image:
-            # If it is, return it as is
             with open(file_path, "rb") as f:
                 return {0: f.read()}

-        # If it's not an image, proceed with the conversion
         return self._convert_pdf_to_images(pdfium.PdfDocument(file_path), scale)

     def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Dict[int, bytes]:
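
Taken together, the URL branch above captures the page once, resizes it if needed, and returns a one-entry dict whose "page 0" value is a list of PNG chunks rather than a single bytes object. A minimal sketch of exercising that path (the subclass, URL, and stubbed abstract method are hypothetical; only the base-class API shown in this diff is assumed, and Playwright must be installed):

    from extract_thinker.document_loader.document_loader import DocumentLoader

    class StubLoader(DocumentLoader):  # hypothetical concrete subclass for illustration
        SUPPORTED_FORMATS = ["pdf", "jpg", "jpeg", "png", "tiff", "bmp"]

        def load(self, source):  # assumed abstract hook, stubbed out here
            return []

    loader = StubLoader(screenshot_timeout=2000)  # wait 2 s after 'networkidle'
    pages = loader.convert_to_images("https://example.com")
    chunks = pages[0]  # for a URL, "page 0" holds a list of PNG chunk bytes
    print(f"captured {len(chunks)} vertical chunks")
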
@@ -163,13 +194,15 @@ def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
         Checks if the loader can handle the source in vision mode.

         Args:
-            source: Either a file path (str) or a BytesIO stream
+            source: Either a file path (str), URL, or a BytesIO stream

         Returns:
             bool: True if the loader can handle the source in vision mode
         """
         try:
             if isinstance(source, str):
+                if self._is_url(source):
+                    return True  # URLs are always supported in vision mode
                 ext = get_file_extension(source).lower()
                 return ext in ['pdf', 'jpg', 'jpeg', 'png', 'tiff', 'bmp']
             elif isinstance(source, BytesIO):
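
With this change, any URL string passes the vision-mode check before the extension whitelist is consulted. Continuing the StubLoader sketch above (inputs illustrative):

    loader.can_handle_vision("https://example.com")  # True: URL short-circuit
    loader.can_handle_vision("scan.tiff")            # True: whitelisted extension
    loader.can_handle_vision("notes.docx")           # False: not in the whitelist
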
@@ -210,4 +243,99 @@ def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool:
         # List of extensions that support pagination
         return ext in ['pdf']
     except Exception:
-        return False
+        return False
+
+    @staticmethod
+    def _check_playwright_dependencies():
+        """
+        Check if the playwright dependency is installed.
+        Raises:
+            ImportError: If playwright is not installed.
+        """
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError:
+            raise ImportError(
+                "You are using vision with url. You need to install playwright."
+                "`pip install playwright` and run `playwright install`."
+            )
+
+    def _capture_screenshot_from_url(self, url: str) -> bytes:
+        """
+        Captures a full-page screenshot of a URL using Playwright.
+
+        Args:
+            url: The URL to capture
+
+        Returns:
+            bytes: The screenshot image data
+        """
+        # Optional: Check if playwright is installed before attempting to use it.
+        self._check_playwright_dependencies()
+
+        from playwright.sync_api import sync_playwright  # Import after the dependency check
+
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            page = browser.new_page()
+
+            try:
+                # Navigate to URL
+                page.goto(url, wait_until='networkidle')
+
+                # Optional: Handle cookie consent popups (customize selectors as needed)
+                try:
+                    page.click('button:has-text("Accept")', timeout=10000)
+                except Exception:
+                    pass  # Ignore if no cookie banner is found
+
+                # Wait for content to load with the configurable timeout
+                page.wait_for_timeout(self.screenshot_timeout)
+
+                # Capture full page screenshot
+                screenshot = page.screenshot(full_page=True)
+
+                return screenshot
+
+            finally:
+                browser.close()
+
+    def _split_image_vertically(self, img: Image.Image, chunk_height: int = 1000) -> List[bytes]:
+        """
+        Splits a tall PIL Image into vertical chunks of `chunk_height`.
+        Returns a list of bytes in PNG format, in top-to-bottom order.
+
+        Args:
+            img: PIL Image to split
+            chunk_height: Height of each chunk in pixels
+
+        Returns:
+            List of PNG-encoded bytes for each chunk
+        """
+        width, height = img.size
+        num_chunks = math.ceil(height / chunk_height)
+
+        chunks_bytes = []
+        for i in range(num_chunks):
+            top = i * chunk_height
+            bottom = min((i + 1) * chunk_height, height)
+            crop_box = (0, top, width, bottom)
+
+            # Crop the chunk
+            chunk_img = img.crop(crop_box)
+
+            # Convert chunk to bytes
+            chunk_bytes = io.BytesIO()
+            chunk_img.save(chunk_bytes, format="PNG", optimize=True)
+            chunk_bytes.seek(0)
+            chunks_bytes.append(chunk_bytes.read())
+
+        return chunks_bytes
+
+    def _is_url(self, source: str) -> bool:
+        """Check if the source string is a URL."""
+        try:
+            result = urlparse(source)
+            return bool(result.scheme and result.netloc)
+        except:
+            return False
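
Two small behaviors in the additions above are worth verifying: the chunk count follows ceil(height / chunk_height), and _is_url accepts only strings carrying both a scheme and a netloc. A quick standalone check (values illustrative):

    import math
    from urllib.parse import urlparse

    # 3500 px at the default 1000 px chunk height -> 3 full chunks + one 500 px remainder
    assert math.ceil(3500 / 1000) == 4

    def is_url(s: str) -> bool:  # mirrors the _is_url logic above
        r = urlparse(s)
        return bool(r.scheme and r.netloc)

    assert is_url("https://example.com/page")  # scheme and netloc present
    assert not is_url("report.pdf")            # bare file path: no scheme or netloc
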

extract_thinker/document_loader/document_loader_beautiful_soup.py

+6 -6
@@ -52,7 +52,9 @@ def __post_init__(self):
 class DocumentLoaderBeautifulSoup(CachedDocumentLoader):
     """Loader that uses BeautifulSoup4 to load HTML content."""

-    SUPPORTED_FORMATS = ['html', 'htm']
+    SUPPORTED_FORMATS = [
+        'html', 'htm', 'url'  # Add URL support
+    ]

     def __init__(
         self,
@@ -257,9 +259,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
             raise ValueError(f"Error loading HTML content: {str(e)}")

     def can_handle(self, source: Union[str, BytesIO]) -> bool:
-        """Check if the loader can handle this source."""
-        if isinstance(source, BytesIO):
+        """Override to add URL support."""
+        if isinstance(source, str) and self._is_url(source):
             return True
-        if self._is_url(source):
-            return True
-        return get_file_extension(source) in self.SUPPORTED_FORMATS
+        return super().can_handle(source)
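
The rewritten can_handle now short-circuits only for URL strings and defers everything else, BytesIO streams included, to the parent class, which also sees the extended SUPPORTED_FORMATS. A sketch of the resulting dispatch (constructor arguments and the parent's exact checks are assumed, not verified):

    loader = DocumentLoaderBeautifulSoup()
    loader.can_handle("https://example.com/docs")  # True: URL fast path
    loader.can_handle("page.html")                 # likely True via super(), 'html' is supported
    loader.can_handle("report.pdf")                # False: 'pdf' is not in SUPPORTED_FORMATS
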

extract_thinker/document_loader/document_loader_docling.py

+18 -9
@@ -1,6 +1,7 @@
 from io import BytesIO
 from typing import Any, Dict, List, Union, Optional
 from dataclasses import dataclass, field
+from urllib.parse import urlparse

 from cachetools import cachedmethod
 from cachetools.keys import hashkey
@@ -120,7 +121,9 @@ class DocumentLoaderDocling(CachedDocumentLoader):
         # XML (including PubMed .nxml)
         "xml", "nxml",
         # Plain text
-        "txt"
+        "txt",
+        # URL support
+        "url"
     ]

     def __init__(
@@ -212,37 +215,43 @@ def can_handle(self, source: Union[str, BytesIO]) -> bool:
         self.vision_mode
     ))
     def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
+        from docling.document_converter import ConversionResult
         """
         Load and parse the document using Docling.

         Returns:
             A list of dictionaries, each representing a "page" with:
             - "content": text from that page
             - "image": optional image bytes if vision_mode is True
-            - "markdown": Markdown string of that page
         """
         if not self.can_handle(source):
             raise ValueError(f"Cannot handle source: {source}")

         # Convert the source to a docling "ConversionResult"
-        conv_result = self._docling_convert(source)
-
-        test = conv_result.document.export_to_markdown()
-        print(test)
+        conv_result: ConversionResult = self._docling_convert(source)

-        # Build the output list of page data
+        # If the source is a URL, return a single page with all the content.
+        if isinstance(source, str) and self._is_url(source):
+            content = conv_result.document.export_to_markdown()
+            print(content)  # Log the exported markdown, if needed
+            page_output = {"content": content, "image": None}
+            # Handle image extraction if vision_mode is enabled
+            if self.vision_mode:
+                images_dict = self.convert_to_images(source)
+                page_output["images"] = images_dict.get(0)
+            return [page_output]
+
+        # Build the output list of page data for non-URL sources
         pages_output = []
         for p in conv_result.pages:
             page_dict = {
                 "content": conv_result.document.export_to_markdown(page_no=p.page_no+1),
                 "image": None
             }
-
             # Handle image extraction if vision_mode is enabled
             if self.vision_mode:
                 images_dict = self.convert_to_images(source)
                 page_dict["image"] = images_dict.get(p.page_no)
-
             pages_output.append(page_dict)

         # Fallback for documents without explicit pages
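
Note that the URL branch stores its screenshot chunks under the plural key "images" (page_output["images"]), while the per-page path uses the singular "image"; consumers of the page dicts need to read the matching key. A sketch of the shape the URL branch returns (construction and URL illustrative; Docling and Playwright assumed installed):

    loader = DocumentLoaderDocling()
    loader.set_vision_mode(True)
    pages = loader.load("https://example.com/article")

    assert len(pages) == 1     # a URL yields one aggregated "page"
    pages[0]["content"]        # the whole document exported as Markdown
    pages[0]["images"]         # list of PNG chunk bytes from the full-page screenshot
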
