33models and/or visualisation.`
44"""
55
6+ import itertools
67import functools
78import subprocess
89import tempfile
910from os import PathLike
1011from pathlib import Path
11- from typing import Dict , Iterable , Iterator , Union , List , TYPE_CHECKING
12+ from typing import TYPE_CHECKING , Callable , Dict , Iterable , Iterator , List , Union
13+
1214from PIL import Image , ImageDraw , ImageFont
1315from playa .document import Document , PageList
14- from playa .page import ContentObject , Page
15-
16+ from playa .page import ContentObject , Page , Annotation
17+ from playa .structure import Element
18+ from playa .utils import Rect , get_transformed_bound , get_bound
19+ from playa import resolve
1620
1721if TYPE_CHECKING :
1822 import pypdfium2 # types: ignore
@@ -301,6 +305,109 @@ def show(page: Page, dpi: int = 72) -> Image.Image:
301305 return next (convert (page , dpi = dpi ))
302306
303307
# Type of a custom labelling callback (the `labelfunc` parameter of
# `box` and `mark`): given an object, return the text to draw for it.
# NOTE(review): the default `get_label` also accepts Annotation, Element
# and Rect, while this alias only names ContentObject - confirm whether
# the alias should be widened.
LabelFunc = Callable[[ContentObject], str]
# Type of a custom bounding-box callback (the `boxfunc` parameter of
# `box` and `mark`): given an object, return its rectangle.
BoxFunc = Callable[[ContentObject], Rect]
310+
311+
@functools.singledispatch
def get_box(obj: Union[Annotation, ContentObject, Element, Rect]) -> Rect:
    """Fallback of the single-dispatch bounding-box lookup.

    Type-specific implementations are attached with `get_box.register`;
    this body only runs for objects whose type has none.

    Raises:
        RuntimeError: always, as the object's type is not supported.
    """
    message = f"Don't know how to get the box for {obj!r}"
    raise RuntimeError(message)
316+
317+
@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Rect:
    """Get the bounding box of a Rect, which is the Rect itself.

    Registered for `tuple`, so any tuple passed to `get_box` is returned
    unchanged (no validation of its length or contents is done here).
    """
    return obj
322+
323+
@get_box.register(ContentObject)
def get_box_content(obj: ContentObject) -> Rect:
    """Get the bounding box of a ContentObject.

    Returns the object's `bbox` attribute as-is.
    """
    # NOTE(review): unlike the Annotation and Element variants, no page
    # `ctm` is applied here - presumably `bbox` is already in the space
    # used for drawing; confirm against playa.
    return obj.bbox
328+
329+
@get_box.register(Annotation)
def get_box_annotation(obj: Annotation) -> Rect:
    """Get the bounding box of an Annotation.

    The annotation's `rect` is passed through `get_transformed_bound`
    together with the `ctm` of the page the annotation belongs to.
    """
    page_ctm = obj.page.ctm
    return get_transformed_bound(page_ctm, obj.rect)
334+
335+
@get_box.register(Element)
def get_box_element(obj: Element) -> Rect:
    """Get the bounding box for a structural Element.

    An explicit "BBox" property takes precedence and is transformed with
    the page's `ctm`; otherwise the box is derived from the element's
    marked content via `_get_marked_content_box`.

    Raises:
        ValueError: if the element is not attached to a page.
    """
    element_page = obj.page
    if element_page is None:
        raise ValueError("Has no page, has no content! No box for you!")
    # No explicit box: fall back to the marked content on the page.
    if "BBox" not in obj.props:
        return _get_marked_content_box(obj)
    # It might just *have* a BBox already.
    page_ctm = element_page.ctm
    explicit_box = resolve(obj.props["BBox"])
    return get_transformed_bound(page_ctm, explicit_box)
347+
348+
def _get_marked_content_box(el: Element) -> Rect:
    """Get the bounding box of an Element's marked content.

    The element's "K" entry is walked recursively to collect marked
    content IDs; the boxes of all objects on the element's page whose
    `mcid` is among them are then merged with `get_bound`.

    This will be superseded in PLAYA 0.3.x so do not use!

    Raises:
        ValueError: if the element has no page, has no "K" entry, or no
            object on its page carries one of the collected MCIDs.
    """
    page = el.page
    if page is None:
        raise ValueError("Has no page, has no content! No box for you!")
    # Report a missing "K" as ValueError rather than KeyError: box() and
    # mark() skip objects whose box function raises ValueError.
    if "K" not in el.props:
        raise ValueError("Has no content! No box for you!")

    def get_mcids(k) -> Iterator[int]:
        """Yield every integer found under a (possibly nested) "K" value."""
        k = resolve(k)
        if isinstance(k, int):
            yield k
        elif isinstance(k, list):
            for kk in k:
                yield from get_mcids(kk)
        elif isinstance(k, dict) and "K" in k:
            # NOTE(review): dictionaries without "K" (e.g. ones that
            # presumably carry "MCID" directly) are ignored - confirm
            # whether they should be followed too.
            yield from get_mcids(k["K"])

    mcids = set(get_mcids(el.props["K"]))
    # Two corner points per matching object; materialized so that the
    # "nothing matched" case can be detected before calling get_bound.
    pts = list(
        itertools.chain.from_iterable(
            ((x0, y0), (x1, y1))
            for x0, y0, x1, y1 in (obj.bbox for obj in page if obj.mcid in mcids)
        )
    )
    if not pts:
        raise ValueError("Has no marked content on its page! No box for you!")
    return get_bound(pts)
375+
376+
@functools.singledispatch
def get_label(obj: Union[Annotation, ContentObject, Element, Rect]) -> str:
    """Default function to get the label text for an object.

    Type-specific labels are attached with `get_label.register`; for any
    other type the label is simply `str(obj)`.
    """
    return str(obj)
381+
382+
@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Get the label text for a ContentObject.

    The label is the object's `object_type` attribute.
    """
    return obj.object_type
387+
388+
@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Get the default label text for an Annotation.

    The label is the annotation's `subtype` attribute.

    Note: This is just a default.
      It is only one of many possible options, so you may wish to
      define your own custom LabelFunc.
    """
    return obj.subtype
398+
399+
@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Get the default label text for an Element.

    The label is the element's `type` attribute.

    Note: This is just a default.
      It is only one of many possible options, so you may wish to
      define your own custom LabelFunc.
    """
    return obj.type
409+
410+
304411def box (
305412 objs : Iterable [ContentObject ],
306413 * ,
@@ -310,30 +417,41 @@ def box(
310417 label_size : int = 9 ,
311418 label_margin : int = 1 ,
312419 image : Union [Image .Image , None ] = None ,
420+ labelfunc : LabelFunc = get_label ,
421+ boxfunc : BoxFunc = get_box ,
313422) -> Union [Image .Image , None ]:
314423 """Draw boxes around things in a page of a PDF."""
315424 draw : ImageDraw .ImageDraw
316425 font = ImageFont .load_default (label_size )
317426 for obj in objs :
318427 if image is None :
319428 image = show (obj .page )
320- left , top , right , bottom = obj .bbox
429+ try :
430+ left , top , right , bottom = boxfunc (obj )
431+ except ValueError : # it has no content and no box
432+ continue
321433 draw = ImageDraw .ImageDraw (image )
322434 obj_color = (
323435 color if isinstance (color , str ) else color .get (obj .object_type , "red" )
324436 )
325437 draw .rectangle ((left , top , right , bottom ), outline = obj_color )
326438 if label :
327- text = obj . object_type
439+ text = labelfunc ( obj )
328440 tl , tt , tr , tb = font .getbbox (text )
441+ label_box = (
442+ left ,
443+ top - tb - label_margin * 2 ,
444+ left + tr + label_margin * 2 ,
445+ top ,
446+ )
329447 draw .rectangle (
330- ( left , top - tb - label_margin * 2 , left + tr + label_margin * 2 , top ) ,
448+ label_box ,
331449 outline = obj_color ,
332450 fill = obj_color ,
333451 )
334452 draw .text (
335453 xy = (left + label_margin , top - label_margin ),
336- text = obj . object_type ,
454+ text = text ,
337455 font = font ,
338456 fill = "white" ,
339457 anchor = "ld" ,
@@ -350,7 +468,10 @@ def mark(
350468 label_color : str = "white" ,
351469 label_size : int = 9 ,
352470 label_margin : int = 1 ,
471+ outline : bool = False ,
353472 image : Union [Image .Image , None ] = None ,
473+ labelfunc : LabelFunc = get_label ,
474+ boxfunc : BoxFunc = get_box ,
354475) -> Union [Image .Image , None ]:
355476 """Highlight things in a page of a PDF."""
356477 overlay : Image .Image
@@ -363,34 +484,69 @@ def mark(
363484 image = show (obj .page )
364485 overlay = Image .new ("RGB" , image .size )
365486 mask = Image .new ("L" , image .size , 255 )
366- left , top , right , bottom = obj .bbox
487+ try :
488+ left , top , right , bottom = boxfunc (obj )
489+ except ValueError : # it has no content and no box
490+ continue
367491 draw = ImageDraw .ImageDraw (overlay )
368492 obj_color = (
369493 color if isinstance (color , str ) else color .get (obj .object_type , "red" )
370494 )
371495 draw .rectangle ((left , top , right , bottom ), fill = obj_color )
496+ mask_draw = ImageDraw .ImageDraw (mask )
497+ mask_draw .rectangle ((left , top , right , bottom ), fill = alpha )
498+ if outline :
499+ draw .rectangle ((left , top , right , bottom ), outline = "black" )
500+ mask_draw .rectangle ((left , top , right , bottom ), outline = 0 )
372501 if label :
373- text = obj . object_type
502+ text = labelfunc ( obj )
374503 tl , tt , tr , tb = font .getbbox (text )
504+ label_box = (
505+ left ,
506+ top - tb - label_margin * 2 ,
507+ left + tr + label_margin * 2 ,
508+ top ,
509+ )
375510 draw .rectangle (
376- ( left , top - tb - label_margin * 2 , left + tr + label_margin * 2 , top ) ,
511+ label_box ,
377512 outline = obj_color ,
378513 fill = obj_color ,
379514 )
380- draw .text (
381- xy = (left + label_margin , top - label_margin ),
382- text = obj .object_type ,
383- font = font ,
384- fill = "white" ,
385- anchor = "ld" ,
386- )
387- draw = ImageDraw .ImageDraw (mask )
388- draw .rectangle ((left , top , right , bottom ), fill = alpha )
389- if label :
390- draw .rectangle (
391- (left , top - tb - label_margin * 2 , left + tr + label_margin * 2 , top ),
515+ mask_draw .rectangle (
516+ label_box ,
392517 fill = alpha ,
393518 )
519+ if outline :
520+ draw .rectangle (
521+ label_box ,
522+ outline = "black" ,
523+ )
524+ mask_draw .rectangle (
525+ label_box ,
526+ outline = 0 ,
527+ )
528+ draw .text (
529+ xy = (left + label_margin , top - label_margin ),
530+ text = text ,
531+ font = font ,
532+ fill = "black" ,
533+ anchor = "ld" ,
534+ )
535+ mask_draw .text (
536+ xy = (left + label_margin , top - label_margin ),
537+ text = text ,
538+ font = font ,
539+ fill = 0 ,
540+ anchor = "ld" ,
541+ )
542+ else :
543+ draw .text (
544+ xy = (left + label_margin , top - label_margin ),
545+ text = text ,
546+ font = font ,
547+ fill = "white" ,
548+ anchor = "ld" ,
549+ )
394550 if image is None :
395551 return None
396552 return Image .composite (image , overlay , mask )
0 commit comments