3
3
from io import BytesIO
4
4
from PIL import Image
5
5
import pypdfium2 as pdfium
6
- from typing import Any , Dict , Union
6
+ from typing import Any , Dict , Union , List
7
7
from cachetools import TTLCache
8
8
import os
9
9
import magic
10
10
from extract_thinker .utils import get_file_extension , check_mime_type
11
+ from playwright .sync_api import sync_playwright
12
+ from urllib .parse import urlparse
13
+ import base64
14
+ import math
11
15
12
16
class DocumentLoader (ABC ):
13
- def __init__ (self , content : Any = None , cache_ttl : int = 300 ):
17
+ # SUPPORTED_FORMATS = [
18
+ # "pdf", "jpg", "jpeg", "png", "tiff", "bmp"
19
+ # ]
20
+
21
def __init__(self, content: Any = None, cache_ttl: int = 300, screenshot_timeout: int = 1000):
    """Set up the loader's initial state.

    Args:
        content: Initial content held by the loader.
        cache_ttl: Time-to-live, in seconds, of entries in the loader cache.
        screenshot_timeout: Time in milliseconds to wait for page content
            to load before a URL screenshot is captured.
    """
    # Source-related state.
    self.content = content
    self.file_path = None
    self.is_url = False  # becomes True when the processed source is a URL

    # Processing options.
    self.vision_mode = False
    self.max_image_size = None  # unset by default
    self.screenshot_timeout = screenshot_timeout

    # Cache of up to 100 entries whose lifetime is cache_ttl seconds.
    self.cache = TTLCache(maxsize=100, ttl=cache_ttl)
25
36
26
37
def set_max_image_size (self , size : int ) -> None :
27
38
"""Set the maximum image size."""
@@ -31,6 +42,10 @@ def set_vision_mode(self, enabled: bool = True) -> None:
31
42
"""Enable or disable vision mode processing."""
32
43
self .vision_mode = enabled
33
44
45
def set_screenshot_timeout(self, timeout: int) -> None:
    """Change how long to wait before capturing a screenshot from a URL.

    Args:
        timeout: New wait time in milliseconds.
    """
    self.screenshot_timeout = timeout
48
+
34
49
def can_handle (self , source : Union [str , BytesIO ]) -> bool :
35
50
"""
36
51
Checks if the loader can handle the given source.
@@ -60,7 +75,6 @@ def _can_handle_file_path(self, file_path: str) -> bool:
60
75
def _can_handle_stream (self , stream : BytesIO ) -> bool :
61
76
"""Checks if the loader can handle the given BytesIO stream."""
62
77
try :
63
- # Read the first few bytes to determine file type
64
78
mime = magic .from_buffer (stream .getvalue (), mime = True )
65
79
stream .seek (0 ) # Reset stream position
66
80
return check_mime_type (mime , self .SUPPORTED_FORMATS )
@@ -85,19 +99,36 @@ def convert_to_images(self, file: Union[str, io.BytesIO, io.BufferedReader], sca
85
99
raise TypeError ("file must be a file path (str) or a file-like stream" )
86
100
87
101
def _convert_file_to_images(
    self, file_path: str, scale: float
) -> Dict[int, Union[bytes, List[bytes]]]:
    """Convert a URL, an image file, or a PDF file into image data keyed by page.

    Args:
        file_path: URL or local path of the source.
        scale: Scale forwarded to the PDF conversion; unused for URLs and
            image files.

    Returns:
        For a URL, ``{0: [chunk, ...]}`` where the list holds the bytes of the
        screenshot's vertical chunks. For an image file, ``{0: data}`` with
        the raw file bytes. Otherwise the result of
        ``_convert_pdf_to_images``.

    Raises:
        ValueError: If capturing or processing the URL screenshot fails.
    """
    # Recompute the flag on every call so that a URL handled earlier does not
    # leave it stuck at True when a local file is processed afterwards.
    self.is_url = self._is_url(file_path)

    if self.is_url:
        try:
            screenshot = self._capture_screenshot_from_url(file_path)
            # Decode into a PIL image so it can be resized before splitting.
            img = self._resize_if_needed(Image.open(BytesIO(screenshot)))
            # All chunks from a URL are considered "page 0".
            return {0: self._split_image_vertically(img)}
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(f"Failed to capture screenshot from URL: {e}") from e

    # Probe whether PIL can open the file. The context manager closes the
    # file handle that a bare Image.open() call would leave open.
    try:
        with Image.open(file_path):
            is_image = True
    except IOError:
        is_image = False

    if is_image:
        # Image files are returned as-is under page 0.
        with open(file_path, "rb") as f:
            return {0: f.read()}

    # Anything else is handed to the PDF conversion.
    return self._convert_pdf_to_images(pdfium.PdfDocument(file_path), scale)
102
133
103
134
def _convert_stream_to_images (self , file_stream : io .BytesIO , scale : float ) -> Dict [int , bytes ]:
@@ -163,13 +194,15 @@ def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
163
194
Checks if the loader can handle the source in vision mode.
164
195
165
196
Args:
166
- source: Either a file path (str) or a BytesIO stream
197
+ source: Either a file path (str), URL, or a BytesIO stream
167
198
168
199
Returns:
169
200
bool: True if the loader can handle the source in vision mode
170
201
"""
171
202
try :
172
203
if isinstance (source , str ):
204
+ if self ._is_url (source ):
205
+ return True # URLs are always supported in vision mode
173
206
ext = get_file_extension (source ).lower ()
174
207
return ext in ['pdf' , 'jpg' , 'jpeg' , 'png' , 'tiff' , 'bmp' ]
175
208
elif isinstance (source , BytesIO ):
@@ -210,4 +243,99 @@ def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool:
210
243
# List of extensions that support pagination
211
244
return ext in ['pdf' ]
212
245
except Exception :
213
- return False
246
+ return False
247
+
248
+ @staticmethod
249
+ def _check_playwright_dependencies ():
250
+ """
251
+ Check if the playwright dependency is installed.
252
+ Raises:
253
+ ImportError: If playwright is not installed.
254
+ """
255
+ try :
256
+ from playwright .sync_api import sync_playwright
257
+ except ImportError :
258
+ raise ImportError (
259
+ "You are using vision with url. You need to install playwright."
260
+ "`pip install playwright` and run `playwright install`."
261
+ )
262
+
263
def _capture_screenshot_from_url(self, url: str) -> bytes:
    """Open a URL in headless Chromium via Playwright and screenshot the full page.

    Args:
        url: The URL to capture.

    Returns:
        bytes: The screenshot image data.
    """
    # Raise a helpful ImportError up front when playwright is unavailable.
    self._check_playwright_dependencies()

    from playwright.sync_api import sync_playwright  # imported after the dependency check

    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=True)
        tab = chromium.new_page()
        try:
            # Load the page, waiting for the 'networkidle' state.
            tab.goto(url, wait_until='networkidle')

            # Best-effort dismissal of a cookie-consent banner: any failure
            # (for example no matching button within the 10000 ms timeout)
            # is deliberately ignored.
            try:
                tab.click('button:has-text("Accept")', timeout=10000)
            except Exception:
                pass

            # Give the content extra time to load (configurable, in ms).
            tab.wait_for_timeout(self.screenshot_timeout)

            return tab.screenshot(full_page=True)
        finally:
            # Close the browser whether or not the capture succeeded.
            chromium.close()
+
303
def _split_image_vertically(self, img: Image.Image, chunk_height: int = 1000) -> List[bytes]:
    """Split a tall PIL Image into vertical chunks of ``chunk_height`` pixels.

    Args:
        img: PIL Image to split.
        chunk_height: Height of each chunk in pixels; must be positive. The
            last chunk is shorter when the image height is not an exact
            multiple of this value.

    Returns:
        List of PNG-encoded bytes for each chunk, in top-to-bottom order.

    Raises:
        ValueError: If ``chunk_height`` is not positive.
    """
    # Reject non-positive heights explicitly: zero would otherwise surface as
    # a ZeroDivisionError and a negative value would silently yield no chunks.
    if chunk_height <= 0:
        raise ValueError(f"chunk_height must be positive, got {chunk_height}")

    width, height = img.size
    num_chunks = math.ceil(height / chunk_height)

    chunks_bytes = []
    for i in range(num_chunks):
        top = i * chunk_height
        # Clamp the final chunk to the bottom edge of the image.
        bottom = min(top + chunk_height, height)

        buffer = BytesIO()
        img.crop((0, top, width, bottom)).save(buffer, format="PNG", optimize=True)
        chunks_bytes.append(buffer.getvalue())

    return chunks_bytes
334
+
335
+ def _is_url (self , source : str ) -> bool :
336
+ """Check if the source string is a URL."""
337
+ try :
338
+ result = urlparse (source )
339
+ return bool (result .scheme and result .netloc )
340
+ except :
341
+ return False
0 commit comments