Skip to content

Commit af05e4f

Browse files
authored
Merge pull request #8 from dhdaines/playa05
Update for PLAYA 0.5
2 parents f126cc7 + 87a081a commit af05e4f

File tree

5 files changed

+89
-72
lines changed

5 files changed

+89
-72
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ classifiers = [
3030
"Programming Language :: Python :: Implementation :: PyPy",
3131
]
3232
dependencies = [
33-
"playa-pdf>=0.3.0,<=0.4.2",
33+
"playa-pdf>=0.5.0",
3434
"pillow",
3535
]
3636

@@ -58,7 +58,7 @@ exclude = [
5858
extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
5959

6060
[tool.hatch.envs.default]
61-
dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
61+
dependencies = [ "pytest", "pytest-xdist", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
6262

6363
[tool.hatch.envs.hatch-static-analysis]
6464
config-path = "none" # Disable hatch's unreasonable ruff defaults

src/paves/bears.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
TextObject,
2323
XObjectObject,
2424
)
25-
from playa.utils import Point, apply_matrix_norm, apply_matrix_pt, get_bound
25+
from playa.utils import Point, apply_matrix_pt, get_bound
26+
from paves.compat import subpaths
2627

2728
# Stub out Polars if not present
2829
try:
@@ -251,7 +252,7 @@ def make_path(
251252

252253
@process_object.register
253254
def _(obj: PathObject) -> Iterator[LayoutDict]:
254-
for path in obj:
255+
for path in subpaths(obj):
255256
ops = []
256257
pts: List[Point] = []
257258
for seg in path.raw_segments:
@@ -363,18 +364,16 @@ def _(obj: ImageObject) -> Iterator[LayoutDict]:
363364
def _(obj: TextObject) -> Iterator[LayoutDict]:
364365
for glyph in obj:
365366
x0, y0, x1, y1 = glyph.bbox
366-
tstate = glyph.textstate
367367
gstate = glyph.gstate
368-
# apparently we can assert this?
369-
font = tstate.font
370-
assert font is not None
371-
glyph_x, glyph_y = apply_matrix_norm(glyph.ctm, tstate.glyph_offset)
368+
font = glyph.font
369+
glyph_origin_x, glyph_origin_y = glyph.origin
370+
line_origin_x, line_origin_y = obj.line_matrix[-2:]
372371
(a, b, c, d, e, f) = glyph.matrix
373372
if font.vertical:
374-
size = abs(tstate.fontsize * a)
373+
size = abs(gstate.fontsize * a)
375374
else:
376-
size = abs(tstate.fontsize * d)
377-
scaling = tstate.scaling * 0.01 # FIXME: unnecessary?
375+
size = abs(gstate.fontsize * d)
376+
scaling = gstate.scaling * 0.01 # FIXME: unnecessary?
378377
upright = a * d * scaling > 0 and b * c <= 0
379378

380379
yield LayoutDict(
@@ -388,9 +387,9 @@ def _(obj: TextObject) -> Iterator[LayoutDict]:
388387
text=glyph.text,
389388
cid=glyph.cid,
390389
fontname=font.fontname,
391-
glyph_offset_x=glyph_x,
392-
glyph_offset_y=glyph_y,
393-
render_mode=tstate.render_mode,
390+
glyph_offset_x=glyph_origin_x - line_origin_x,
391+
glyph_offset_y=glyph_origin_y - line_origin_y,
392+
render_mode=gstate.render_mode,
394393
dash_pattern=gstate.dash.dash,
395394
dash_phase=gstate.dash.phase,
396395
stroking_colorspace=gstate.scs.name,

src/paves/compat.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Compatibility functions.
3+
"""
4+
5+
from typing import Iterator, List
6+
7+
from playa.content import PathObject, PathSegment
8+
9+
10+
def subpaths(path: PathObject) -> Iterator[PathObject]:
    """Yield one `PathObject` per "subpath" of `path`.

    The raw segments of `path` are split into consecutive runs, with a
    new run beginning at every segment whose operator is "m" (except
    when no segments have been collected yet).  Each run is wrapped in
    a new `PathObject` that copies every other attribute from `path`.

    Note: the yielded subpaths carry the parent's `fill` and `evenodd`
    values, but those values are not meaningful on a single subpath,
    because winding rules have to be applied to the composite path as
    a whole.  This is not a bug; just do not rely on them to decide
    which regions are filled.
    """

    def with_segments(segments: List[PathSegment]) -> PathObject:
        # Copy of the parent path that differs only in its segments.
        return PathObject(
            _pageref=path._pageref,
            gstate=path.gstate,
            ctm=path.ctm,
            mcstack=path.mcstack,
            raw_segments=segments,
            stroke=path.stroke,
            fill=path.fill,
            evenodd=path.evenodd,
        )

    # FIXME: Is there an itertool or a more_itertool for this?
    pending: List[PathSegment] = []
    for segment in path.raw_segments:
        starts_new_run = segment.operator == "m"
        if starts_new_run and pending:
            yield with_segments(pending)
            # Rebind rather than clear: the yielded object keeps its list.
            pending = []
        pending.append(segment)
    # Flush whatever is left after the last segment.
    if pending:
        yield with_segments(pending)

src/paves/image.py

Lines changed: 3 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
models and/or visualisation.`
44
"""
55

6-
import itertools
76
import functools
87
import subprocess
98
import tempfile
@@ -24,8 +23,7 @@
2423
from playa.document import Document, PageList
2524
from playa.page import ContentObject, Page, Annotation
2625
from playa.structure import Element
27-
from playa.utils import Rect, get_transformed_bound, get_bound
28-
from playa import resolve
26+
from playa.utils import Rect, get_transformed_bound
2927

3028
if TYPE_CHECKING:
3129
import pypdfium2 # types: ignore
@@ -331,7 +329,8 @@ def get_box_rect(obj: Rect) -> Rect:
331329

332330

333331
@get_box.register(ContentObject)
334-
def get_box_content(obj: ContentObject) -> Rect:
332+
@get_box.register(Element)
333+
def get_box_content(obj: Union[ContentObject, Element]) -> Rect:
335334
"""Get the bounding box of a ContentObject"""
336335
return obj.bbox
337336

@@ -342,47 +341,6 @@ def get_box_annotation(obj: Annotation) -> Rect:
342341
return get_transformed_bound(obj.page.ctm, obj.rect)
343342

344343

345-
@get_box.register(Element)
346-
def get_box_element(obj: Element) -> Rect:
347-
"""Get the bounding box for a structural Element"""
348-
# It might just *have* a BBox already
349-
page = obj.page
350-
if page is None:
351-
raise ValueError("Has no page, has no content! No box for you!")
352-
if "BBox" in obj.props:
353-
return get_transformed_bound(page.ctm, resolve(obj.props["BBox"]))
354-
else:
355-
return _get_marked_content_box(obj)
356-
357-
358-
def _get_marked_content_box(el: Element) -> Rect:
359-
"""Get the bounding box of an Element's marked content.
360-
361-
This will be superseded in PLAYA 0.5 so do not use!
362-
"""
363-
page = el.page
364-
if page is None:
365-
raise ValueError("Has no page, has no content! No box for you!")
366-
367-
def get_mcids(k):
368-
k = resolve(k)
369-
if isinstance(k, int):
370-
yield k
371-
elif isinstance(k, list):
372-
for kk in k:
373-
yield from get_mcids(kk)
374-
elif isinstance(k, dict):
375-
if "K" in k:
376-
yield from get_mcids(k["K"])
377-
378-
mcids = set(get_mcids(el.props["K"]))
379-
pts = itertools.chain.from_iterable(
380-
((x0, y0), (x1, y1))
381-
for x0, y0, x1, y1 in (obj.bbox for obj in page if obj.mcid in mcids)
382-
)
383-
return get_bound(pts)
384-
385-
386344
@functools.singledispatch
387345
def get_label(obj: Union[Annotation, ContentObject, Element, Rect]) -> str:
388346
"""Default function to get the label text for an object."""

src/paves/miner.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from playa.data_structures import NameTree, NumberTree
3232
from playa.document import Document as PDFDocument
3333
from playa.exceptions import PDFException
34-
from playa.page import (
34+
from playa.content import (
3535
ContentObject,
3636
GlyphObject,
3737
GraphicState,
@@ -52,7 +52,9 @@
5252
apply_matrix_pt,
5353
decode_text,
5454
get_bound,
55+
transform_bbox,
5556
)
57+
from paves.compat import subpaths
5658

5759
PSException = Exception
5860
__all__ = [
@@ -519,33 +521,45 @@ def __init__(
519521
glyph: GlyphObject,
520522
) -> None:
521523
LTText.__init__(self)
522-
textstate = glyph.textstate
523524
gstate = glyph.gstate
524525
matrix = glyph.matrix
526+
font = glyph.font
525527
if glyph.text is None:
526-
logger.debug("undefined: %r, %r", textstate.font, glyph.cid)
528+
logger.debug("undefined: %r, %r", font, glyph.cid)
527529
# Horrible awful pdfminer.six behaviour
528530
self._text = "(cid:%d)" % glyph.cid
529531
else:
530532
self._text = glyph.text
531-
self.matrix = matrix
532533
self.mcstack = glyph.mcstack
533-
font = textstate.font
534-
assert font is not None
535534
self.fontname = font.fontname
536-
self.render_mode = textstate.render_mode
537535
self.graphicstate = gstate
536+
self.render_mode = gstate.render_mode
538537
self.stroking_color = gstate.scolor
539538
self.non_stroking_color = gstate.ncolor
540539
self.scs = gstate.scs
541540
self.ncs = gstate.ncs
542-
self.adv = glyph.adv
541+
scaling = gstate.scaling * 0.01
542+
fontsize = gstate.fontsize
543543
(a, b, c, d, e, f) = matrix
544-
scaling = textstate.scaling
545544
# FIXME: Still really not sure what this means
546545
self.upright = a * d * scaling > 0 and b * c <= 0
547-
LTComponent.__init__(self, glyph.bbox, glyph.mcstack)
548-
# FIXME: This is now quite wrong for rotated glyphs
546+
# Unscale the matrix to match pdfminer.six
547+
xscale = 1 / (fontsize * scaling)
548+
yscale = 1 / fontsize
549+
self.matrix = (a * xscale, b * yscale, c * xscale, d * yscale, e, f)
550+
# Recreate pdfminer.six's bogus bboxes
551+
if font.vertical:
552+
vdisp = font.vdisp(glyph.cid)
553+
self.adv = vdisp * fontsize
554+
vx, vy = font.position(glyph.cid)
555+
textbox = (-vx, vy + vdisp, -vx + 1, vy)
556+
else:
557+
textwidth = font.char_width(glyph.cid)
558+
self.adv = textwidth * fontsize * scaling
559+
textbox = (0, font.get_descent(), textwidth, font.get_descent() + 1)
560+
miner_box = transform_bbox(glyph.matrix, textbox)
561+
LTComponent.__init__(self, miner_box, glyph.mcstack)
562+
# FIXME: This is quite wrong for rotated glyphs, but so is pdfminer.six
549563
if font.vertical:
550564
self.size = self.width
551565
else:
@@ -1157,7 +1171,7 @@ def process_object(obj: ContentObject) -> Iterator[LTComponent]:
11571171

11581172
@process_object.register
11591173
def _(obj: PathObject) -> Iterator[LTComponent]:
1160-
for path in obj:
1174+
for path in subpaths(obj):
11611175
ops = []
11621176
pts: List[Point] = []
11631177
for seg in path.raw_segments:

0 commit comments

Comments
 (0)