Skip to content

Commit 9772b54

Browse files
authored
Merge pull request #7 from dhdaines/more_boxes
More boxes
2 parents 173f709 + 0d1d69f commit 9772b54

File tree

7 files changed

+216
-24
lines changed

7 files changed

+216
-24
lines changed

README.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,27 @@ page = pdf.pages[3]
4444
pi.show(page)
4545
```
4646

47-
You can of course draw boxes around those individual PDF objects, or
47+
Something quite interesting to do is, if your PDF contains a logical
48+
structure tree, to look at the bounding boxes of the contents of those
49+
structure elements (FIXME: This is not a very efficient way to do
50+
this, and it will be optimized in an upcoming PLAYA release):
51+
52+
```python
53+
pi.box(pdf.structure.find_all(lambda el: el.page is page))
54+
```
55+
56+
![Structure Elements](./docs/page3-elements.png)
57+
58+
Alternatively, if you have annotations (such as links), you can look at
59+
those too:
60+
61+
```python
62+
pi.box(page.annotations)
63+
```
64+
65+
![Annotations](./docs/page2-annotations.png)
66+
67+
You can of course draw boxes around individual PDF objects, or
4868
one particular sort of object, or filter them with a generator
4969
expression:
5070

@@ -62,12 +82,28 @@ semi-transparent colour, which otherwise works the same way:
6282
pi.mark(page.images)
6383
```
6484

85+
![Images](./docs/page298-images.png)
86+
6587
If you wish you can give each type of object a different colour:
6688

6789
```python
68-
pi.mark(page, color={"text": "red", "image": "blue", "path": green"})
90+
pi.mark(page, color={"text": "red", "image": "blue", "path": "green"})
6991
```
7092

93+
![Colours](./docs/page298-colors.png)
94+
95+
You can also add outlines and labels around the highlighting:
96+
97+
```python
98+
pi.mark(page, outline=True, label=True,
99+
color={"text": "red", "image": "blue", "path": "green"})
100+
```
101+
102+
![Outlines and labels](./docs/page298-outlines.png)
103+
104+
There are even more options! For now you will need to look at the
105+
source code; documentation is coming soon.
106+
71107
## Working in the PDF mine
72108

73109
`pdfminer.six` is widely used for text extraction and layout analysis

docs/page2-annotations.png

122 KB
Loading

docs/page298-colors.png

275 KB
Loading

docs/page298-images.png

275 KB
Loading

docs/page298-outlines.png

290 KB
Loading

docs/page3-elements.png

107 KB
Loading

src/paves/image.py

Lines changed: 178 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,20 @@
33
models and/or visualisation.`
44
"""
55

6+
import itertools
67
import functools
78
import subprocess
89
import tempfile
910
from os import PathLike
1011
from pathlib import Path
11-
from typing import Dict, Iterable, Iterator, Union, List, TYPE_CHECKING
12+
from typing import TYPE_CHECKING, Callable, Dict, Iterable, Iterator, List, Union
13+
1214
from PIL import Image, ImageDraw, ImageFont
1315
from playa.document import Document, PageList
14-
from playa.page import ContentObject, Page
15-
16+
from playa.page import ContentObject, Page, Annotation
17+
from playa.structure import Element
18+
from playa.utils import Rect, get_transformed_bound, get_bound
19+
from playa import resolve
1620

1721
if TYPE_CHECKING:
1822
import pypdfium2 # types: ignore
@@ -301,6 +305,109 @@ def show(page: Page, dpi: int = 72) -> Image.Image:
301305
return next(convert(page, dpi=dpi))
302306

303307

308+
# Signature of a callable that produces the label text for an object;
# `box` and `mark` take one as their `labelfunc` argument.
LabelFunc = Callable[[ContentObject], str]
# Signature of a callable that produces the bounding box for an object;
# `box` and `mark` take one as their `boxfunc` argument.
BoxFunc = Callable[[ContentObject], Rect]
310+
311+
312+
@functools.singledispatch
def get_box(obj: Union[Annotation, ContentObject, Element, Rect]) -> Rect:
    """Fallback bounding-box getter for objects with no registered handler.

    This is the base of a ``functools.singledispatch`` function: the
    handlers registered on it are chosen by the type of `obj`, and this
    body only runs when none of them applies.

    Raises:
        RuntimeError: always, as there is no known way to get a box
            for `obj`.
    """
    message = f"Don't know how to get the box for {obj!r}"
    raise RuntimeError(message)
316+
317+
318+
@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Rect:
    """Get the bounding box of a Rect, which is the Rect itself.

    Registered for ``tuple``, so a tuple passed to `get_box` is handed
    back unchanged.
    """
    rect = obj
    return rect
322+
323+
324+
@get_box.register(ContentObject)
def get_box_content(obj: ContentObject) -> Rect:
    """Get the bounding box of a ContentObject from its `bbox` attribute."""
    bbox = obj.bbox
    return bbox
328+
329+
330+
@get_box.register(Annotation)
def get_box_annotation(obj: Annotation) -> Rect:
    """Get the bounding box of an Annotation.

    The annotation's `rect` is passed through `get_transformed_bound`
    together with the `ctm` of the page the annotation belongs to.
    """
    ctm = obj.page.ctm
    rect = obj.rect
    return get_transformed_bound(ctm, rect)
334+
335+
336+
@get_box.register(Element)
def get_box_element(obj: Element) -> Rect:
    """Get the bounding box for a structural Element.

    If the element's properties contain a "BBox" entry, that box is
    resolved and passed through `get_transformed_bound` with the page's
    `ctm`.  Otherwise the box is computed from the element's marked
    content by `_get_marked_content_box`.

    Raises:
        ValueError: if the element has no page.
    """
    page = obj.page
    if page is None:
        raise ValueError("Has no page, has no content! No box for you!")
    if "BBox" not in obj.props:
        return _get_marked_content_box(obj)
    # It might just *have* a BBox already
    ctm = page.ctm
    bbox = resolve(obj.props["BBox"])
    return get_transformed_bound(ctm, bbox)
347+
348+
349+
def _get_marked_content_box(el: Element) -> Rect:
    """Get the bounding box of an Element's marked content.

    This will be superseded in PLAYA 0.3.x so do not use!

    The element's "K" entry is walked recursively to collect marked
    content IDs (the integers found in it), then the boxes of every
    object on the element's page whose `mcid` is one of them are merged
    with `get_bound`.

    Raises:
        ValueError: if the element has no page, has no "K" entry, or
            none of its marked content is found on its page.
    """
    page = el.page
    if page is None:
        raise ValueError("Has no page, has no content! No box for you!")
    # An element with no kids has no content either.  Report it with the
    # same exception type as the other "no box" cases so that callers
    # skipping on ValueError are not aborted by a KeyError.
    if "K" not in el.props:
        raise ValueError("Has no kids, has no content! No box for you!")

    def get_mcids(k):
        """Yield every integer reachable from a (possibly nested) "K" entry."""
        k = resolve(k)
        if isinstance(k, int):
            yield k
        elif isinstance(k, list):
            for kk in k:
                yield from get_mcids(kk)
        elif isinstance(k, dict) and "K" in k:
            yield from get_mcids(k["K"])

    mcids = set(get_mcids(el.props["K"]))
    # Two opposite corners of each matching object's box are enough to
    # compute the overall bound.
    pts = []
    for obj in page:
        if obj.mcid in mcids:
            x0, y0, x1, y1 = obj.bbox
            pts.extend(((x0, y0), (x1, y1)))
    if not pts:
        # Nothing on the page carries these MCIDs, so there is no box.
        raise ValueError("Has no marked content on its page! No box for you!")
    return get_bound(pts)
375+
376+
377+
@functools.singledispatch
def get_label(obj: Union[Annotation, ContentObject, Element, Rect]) -> str:
    """Fallback label getter: the ``str()`` of the object.

    This is the base of a ``functools.singledispatch`` function and is
    used when no handler registered on it applies to the type of `obj`.
    """
    text = str(obj)
    return text
381+
382+
383+
@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Get the label text for a ContentObject from its `object_type`."""
    object_type = obj.object_type
    return object_type
387+
388+
389+
@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Get the default label text for an Annotation: its `subtype`.

    Note: This is only a default.
        Other labels are possible, so you may wish to define your own
        custom LabelFunc instead.
    """
    subtype = obj.subtype
    return subtype
398+
399+
400+
@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Get the default label text for an Element: its `type`.

    Note: This is only a default.
        Other labels are possible, so you may wish to define your own
        custom LabelFunc instead.
    """
    element_type = obj.type
    return element_type
409+
410+
304411
def box(
305412
objs: Iterable[ContentObject],
306413
*,
@@ -310,30 +417,41 @@ def box(
310417
label_size: int = 9,
311418
label_margin: int = 1,
312419
image: Union[Image.Image, None] = None,
420+
labelfunc: LabelFunc = get_label,
421+
boxfunc: BoxFunc = get_box,
313422
) -> Union[Image.Image, None]:
314423
"""Draw boxes around things in a page of a PDF."""
315424
draw: ImageDraw.ImageDraw
316425
font = ImageFont.load_default(label_size)
317426
for obj in objs:
318427
if image is None:
319428
image = show(obj.page)
320-
left, top, right, bottom = obj.bbox
429+
try:
430+
left, top, right, bottom = boxfunc(obj)
431+
except ValueError: # it has no content and no box
432+
continue
321433
draw = ImageDraw.ImageDraw(image)
322434
obj_color = (
323435
color if isinstance(color, str) else color.get(obj.object_type, "red")
324436
)
325437
draw.rectangle((left, top, right, bottom), outline=obj_color)
326438
if label:
327-
text = obj.object_type
439+
text = labelfunc(obj)
328440
tl, tt, tr, tb = font.getbbox(text)
441+
label_box = (
442+
left,
443+
top - tb - label_margin * 2,
444+
left + tr + label_margin * 2,
445+
top,
446+
)
329447
draw.rectangle(
330-
(left, top - tb - label_margin * 2, left + tr + label_margin * 2, top),
448+
label_box,
331449
outline=obj_color,
332450
fill=obj_color,
333451
)
334452
draw.text(
335453
xy=(left + label_margin, top - label_margin),
336-
text=obj.object_type,
454+
text=text,
337455
font=font,
338456
fill="white",
339457
anchor="ld",
@@ -350,7 +468,10 @@ def mark(
350468
label_color: str = "white",
351469
label_size: int = 9,
352470
label_margin: int = 1,
471+
outline: bool = False,
353472
image: Union[Image.Image, None] = None,
473+
labelfunc: LabelFunc = get_label,
474+
boxfunc: BoxFunc = get_box,
354475
) -> Union[Image.Image, None]:
355476
"""Highlight things in a page of a PDF."""
356477
overlay: Image.Image
@@ -363,34 +484,69 @@ def mark(
363484
image = show(obj.page)
364485
overlay = Image.new("RGB", image.size)
365486
mask = Image.new("L", image.size, 255)
366-
left, top, right, bottom = obj.bbox
487+
try:
488+
left, top, right, bottom = boxfunc(obj)
489+
except ValueError: # it has no content and no box
490+
continue
367491
draw = ImageDraw.ImageDraw(overlay)
368492
obj_color = (
369493
color if isinstance(color, str) else color.get(obj.object_type, "red")
370494
)
371495
draw.rectangle((left, top, right, bottom), fill=obj_color)
496+
mask_draw = ImageDraw.ImageDraw(mask)
497+
mask_draw.rectangle((left, top, right, bottom), fill=alpha)
498+
if outline:
499+
draw.rectangle((left, top, right, bottom), outline="black")
500+
mask_draw.rectangle((left, top, right, bottom), outline=0)
372501
if label:
373-
text = obj.object_type
502+
text = labelfunc(obj)
374503
tl, tt, tr, tb = font.getbbox(text)
504+
label_box = (
505+
left,
506+
top - tb - label_margin * 2,
507+
left + tr + label_margin * 2,
508+
top,
509+
)
375510
draw.rectangle(
376-
(left, top - tb - label_margin * 2, left + tr + label_margin * 2, top),
511+
label_box,
377512
outline=obj_color,
378513
fill=obj_color,
379514
)
380-
draw.text(
381-
xy=(left + label_margin, top - label_margin),
382-
text=obj.object_type,
383-
font=font,
384-
fill="white",
385-
anchor="ld",
386-
)
387-
draw = ImageDraw.ImageDraw(mask)
388-
draw.rectangle((left, top, right, bottom), fill=alpha)
389-
if label:
390-
draw.rectangle(
391-
(left, top - tb - label_margin * 2, left + tr + label_margin * 2, top),
515+
mask_draw.rectangle(
516+
label_box,
392517
fill=alpha,
393518
)
519+
if outline:
520+
draw.rectangle(
521+
label_box,
522+
outline="black",
523+
)
524+
mask_draw.rectangle(
525+
label_box,
526+
outline=0,
527+
)
528+
draw.text(
529+
xy=(left + label_margin, top - label_margin),
530+
text=text,
531+
font=font,
532+
fill="black",
533+
anchor="ld",
534+
)
535+
mask_draw.text(
536+
xy=(left + label_margin, top - label_margin),
537+
text=text,
538+
font=font,
539+
fill=0,
540+
anchor="ld",
541+
)
542+
else:
543+
draw.text(
544+
xy=(left + label_margin, top - label_margin),
545+
text=text,
546+
font=font,
547+
fill="white",
548+
anchor="ld",
549+
)
394550
if image is None:
395551
return None
396552
return Image.composite(image, overlay, mask)

0 commit comments

Comments
 (0)