2525 cast ,
2626)
2727
28+ import playa
2829from PIL import Image , ImageDraw , ImageFont
2930from playa .document import Document , PageList
3031from playa .page import ContentObject , Page , Annotation
3132from playa .structure import Element
32- from playa .utils import Rect , get_transformed_bound
33+ from playa .utils import Rect , transform_bbox
3334
3435if TYPE_CHECKING :
3536 import pypdfium2 # types: ignore
@@ -57,7 +58,15 @@ def make_poppler_args(dpi: int, width: int, height: int) -> List[str]:
5758
5859
5960@functools .singledispatch
60- def _popple (pdf , tempdir : Path , args : List [str ]) -> None :
61+ def _popple (pdf , tempdir : Path , args : List [str ]) -> List [Tuple [int , float , float ]]:
62+ raise NotImplementedError
63+
64+
65+ @_popple .register (str )
66+ @_popple .register (PathLike )
67+ def _popple_path (
68+ pdf : Union [str , PathLike ], tempdir : Path , args : List [str ]
69+ ) -> List [Tuple [int , float , float ]]:
6170 subprocess .run (
6271 [
6372 "pdftoppm" ,
@@ -67,11 +76,16 @@ def _popple(pdf, tempdir: Path, args: List[str]) -> None:
6776 ],
6877 check = True ,
6978 )
79+ with playa .open (pdf ) as doc :
80+ return [(page .page_idx , page .width , page .height ) for page in doc .pages ]
7081
7182
7283@_popple .register (Document )
73- def _popple_doc (pdf : Document , tempdir : Path , args : List [str ]) -> None :
84+ def _popple_doc (
85+ pdf : Document , tempdir : Path , args : List [str ]
86+ ) -> List [Tuple [int , float , float ]]:
7487 pdfpdf = tempdir / "pdf.pdf"
88+ # FIXME: This is... not great (can we popple in a pipeline please?)
7589 with open (pdfpdf , "wb" ) as outfh :
7690 outfh .write (pdf .buffer )
7791 subprocess .run (
@@ -83,10 +97,14 @@ def _popple_doc(pdf: Document, tempdir: Path, args: List[str]) -> None:
8397 ],
8498 check = True ,
8599 )
100+ pdfpdf .unlink ()
101+ return [(page .page_idx , page .width , page .height ) for page in pdf .pages ]
86102
87103
88104@_popple .register (Page )
89- def _popple_page (pdf : Page , tempdir : Path , args : List [str ]) -> None :
105+ def _popple_page (
106+ pdf : Page , tempdir : Path , args : List [str ]
107+ ) -> List [Tuple [int , float , float ]]:
90108 assert pdf .doc is not None # bug in PLAYA-PDF, oops, it cannot be None
91109 pdfpdf = tempdir / "pdf.pdf"
92110 with open (pdfpdf , "wb" ) as outfh :
@@ -105,10 +123,14 @@ def _popple_page(pdf: Page, tempdir: Path, args: List[str]) -> None:
105123 ],
106124 check = True ,
107125 )
126+ pdfpdf .unlink ()
127+ return [(pdf .page_idx , pdf .width , pdf .height )]
108128
109129
110130@_popple .register (PageList )
111- def _popple_pages (pdf : PageList , tempdir : Path , args : List [str ]) -> None :
131+ def _popple_pages (
132+ pdf : PageList , tempdir : Path , args : List [str ]
133+ ) -> List [Tuple [int , float , float ]]:
112134 pdfpdf = tempdir / "pdf.pdf"
113135 assert pdf [0 ].doc is not None # bug in PLAYA-PDF, oops, it cannot be None
114136 with open (pdfpdf , "wb" ) as outfh :
@@ -142,6 +164,8 @@ def _popple_pages(pdf: PageList, tempdir: Path, args: List[str]) -> None:
142164 ],
143165 check = True ,
144166 )
167+ pdfpdf .unlink ()
168+ return [(page .page_idx , page .width , page .height ) for page in pdf ]
145169
146170
147171def popple (
@@ -173,21 +197,28 @@ def popple(
173197 args = make_poppler_args (dpi , width , height )
174198 with tempfile .TemporaryDirectory () as tempdir :
175199 temppath = Path (tempdir )
176- _popple (pdf , temppath , args )
177- for ppm in sorted (temppath .iterdir ()):
178- if ppm .suffix == ".ppm" :
179- yield Image .open (ppm )
200+ # FIXME: Possible to Popple in a Parallel Pipeline
201+ page_sizes = _popple (pdf , temppath , args )
202+ for (page_idx , page_width , page_height ), ppm in zip (
203+ page_sizes ,
204+ (path for path in sorted (temppath .iterdir ()) if path .suffix == ".ppm" ),
205+ ):
206+ img = Image .open (ppm )
207+ img .info ["page_index" ] = page_idx
208+ img .info ["page_width" ] = page_width
209+ img .info ["page_height" ] = page_height
210+ yield img
180211
181212
@functools.singledispatch
def _get_pdfium_pages(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Yield `(index, page)` pairs for every page pypdfium2 finds in `pdf`.

    This is the single-dispatch fallback: `pdf` is handed directly to
    `pypdfium2.PdfDocument`.  The index is the position of the page in
    iteration order, starting at 0.

    Each page is closed once the consumer advances past it, and the
    document is closed when iteration ends.  This also holds when the
    consumer stops early or raises, because closing the generator raises
    at the `yield`.

    Args:
        pdf: Input passed straight to `pypdfium2.PdfDocument`.

    Yields:
        Tuples of the page index and the open `pypdfium2.PdfPage`.
    """
    # Imported lazily so that the module can be loaded without pypdfium2.
    import pypdfium2

    doc = pypdfium2.PdfDocument(pdf)
    try:
        for idx, page in enumerate(doc):
            try:
                yield idx, page
            finally:
                # Runs on normal advance and on early generator close.
                page.close()
    finally:
        # Pages are closed before the document that owns them.
        doc.close()
193224
@@ -209,31 +240,33 @@ def _get_pdfium_doc(pdf: Document) -> Iterator["pypdfium2.PdfDocument"]:
209240
210241
@_get_pdfium_pages.register(Document)
def _get_pdfium_pages_doc(pdf: Document) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Yield `(index, page)` pairs for every page of a PLAYA `Document`.

    The pypdfium2 document is obtained from the `_get_pdfium_doc` context
    manager.  The index is the position of the page in iteration order,
    starting at 0.  Each page is closed once the consumer is done with it,
    including when iteration is abandoned early.

    Args:
        pdf: The document whose pages should be opened with pypdfium2.

    Yields:
        Tuples of the page index and the open `pypdfium2.PdfPage`.
    """
    with _get_pdfium_doc(pdf) as doc:
        for idx, page in enumerate(doc):
            try:
                yield idx, page
            finally:
                # Close the page even if the generator is closed at the
                # yield (consumer broke out of its loop or raised).
                page.close()
217248
218249
@_get_pdfium_pages.register(Page)
def _get_pdfium_pages_page(page: Page) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Yield the single `(index, page)` pair for one PLAYA `Page`.

    The owning document is opened through the `_get_pdfium_doc` context
    manager and the pypdfium2 page is looked up by `page.page_idx`, which
    is also the index that is yielded.  The pypdfium2 page is closed when
    the consumer is done with it, including when the generator is closed
    early.

    Args:
        page: The page to open with pypdfium2.

    Yields:
        One tuple of `page.page_idx` and the open `pypdfium2.PdfPage`.
    """
    pdf = page.doc
    assert pdf is not None
    with _get_pdfium_doc(pdf) as doc:
        pdfium_page = doc[page.page_idx]
        try:
            yield page.page_idx, pdfium_page
        finally:
            # Runs whether the consumer resumes us or closes the generator.
            pdfium_page.close()
227258
228259
@_get_pdfium_pages.register(PageList)
def _get_pdfium_pages_pagelist(
    pages: PageList,
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
    """Yield `(index, page)` pairs for the pages in a PLAYA `PageList`.

    The owning document is opened once through the `_get_pdfium_doc`
    context manager; each pypdfium2 page is looked up by the PLAYA page's
    `page_idx`, which is also the index that is yielded (so indices follow
    the page list, not a 0..n-1 count).  Each pypdfium2 page is closed once
    the consumer is done with it, including when iteration is abandoned
    early.

    Args:
        pages: The pages to open with pypdfium2.

    Yields:
        Tuples of `page_idx` and the open `pypdfium2.PdfPage`.
    """
    pdf = pages.doc
    assert pdf is not None
    with _get_pdfium_doc(pdf) as doc:
        for page in pages:
            pdfium_page = doc[page.page_idx]
            try:
                yield page.page_idx, pdfium_page
            finally:
                # Close the page even if the generator is closed at the
                # yield (consumer broke out of its loop or raised).
                pdfium_page.close()
238271
239272
@@ -252,7 +285,8 @@ def pdfium(
252285 width: Render to this width in pixels.
253286 height: Render to this height in pixels.
254287 Yields:
255- Pillow `Image.Image` objects, one per page.
288+ Pillow `Image.Image` objects, one per page. Page width and height are
289+ available in the `info` property of the images.
256290 Raises:
257291 ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
258292 NotInstalledError: If PyPDFium2 is not installed.
@@ -263,24 +297,30 @@ def pdfium(
263297 import pypdfium2 # noqa: F401
264298 except ImportError as e :
265299 raise NotInstalledError ("PyPDFium2 does not seem to be installed" ) from e
266- for page in _get_pdfium_pages (pdf ):
300+ for idx , page in _get_pdfium_pages (pdf ):
301+ page_width = page .get_width ()
302+ page_height = page .get_height ()
267303 if width == 0 and height == 0 :
268304 scale = (dpi or 72 ) / 72
269- yield page .render (scale = scale ).to_pil ()
305+ img = page .render (scale = scale ).to_pil ()
270306 else :
271307 if width and height :
272308 # Scale to longest side (since pypdfium2 doesn't
273309 # appear to allow non-1:1 aspect ratio)
274- scale = max (width / page . get_width () , height / page . get_height () )
310+ scale = max (width / page_width , height / page_height )
275311 img = page .render (scale = scale ).to_pil ()
276312 # Resize down to desired size
277- yield img .resize (size = (width , height ))
313+ img = img .resize (size = (width , height ))
278314 elif width :
279315 scale = width / page .get_width ()
280- yield page .render (scale = scale ).to_pil ()
316+ img = page .render (scale = scale ).to_pil ()
281317 elif height :
282318 scale = height / page .get_height ()
283- yield page .render (scale = scale ).to_pil ()
319+ img = page .render (scale = scale ).to_pil ()
320+ img .info ["page_index" ] = idx
321+ img .info ["page_width" ] = page_width
322+ img .info ["page_height" ] = page_height
323+ yield img
284324
285325
286326METHODS = [popple , pdfium ]
@@ -301,10 +341,14 @@ def convert(
301341 width: Render to this width in pixels (0 to keep aspect ratio).
302342 height: Render to this height in pixels (0 to keep aspect ratio).
303343 Yields:
304- Pillow `Image.Image` objects, one per page.
344+ Pillow `Image.Image` objects, one per page. The original page
345+ width and height in default user space units are available in
346+ the `info` property of these images as `page_width` and
347+ `page_height`
305348 Raises:
306349 ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
307350 NotInstalledError: If no renderer is available
351+
308352 """
309353 for method in METHODS :
310354 try :
@@ -365,7 +409,7 @@ def get_box_content(obj: Union[ContentObject, Element]) -> Rect:
@get_box.register(Annotation)
def get_box_annotation(obj: Annotation) -> Rect:
    """Return the annotation's rectangle transformed by its page's CTM."""
    page_ctm = obj.page.ctm
    annotation_rect = obj.rect
    return transform_bbox(page_ctm, annotation_rect)
369413
370414
371415@functools .singledispatch
0 commit comments