Skip to content

Commit 173f709

Browse files
authored
Merge pull request #6 from dhdaines/seeing_stars
Basic visualization stuff
2 parents b4eb2dd + 6b1c026 commit 173f709

File tree

8 files changed

+576
-68
lines changed

8 files changed

+576
-68
lines changed

README.md

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,54 @@ There will be dependencies. Oh, there will be dependencies.
2121
pip install paves
2222
```
2323

24-
## Workin' in a PDF mine
24+
## Looking at Stuff in a PDF
25+
26+
When poking around in a PDF, it is useful not simply to read
27+
descriptions of objects (text, images, etc) but also to visualise them
28+
in the rendered document. `pdfplumber` is quite nice for this, though
29+
it is oriented towards the particular set of objects that it can
30+
extract from the PDF.
31+
32+
The primary goal of PLAYA-PDF is to give access to all the objects and
33+
particularly the metadata in a PDF. One goal of PAVÉS (because there
34+
are a few) is to give an easy way to visualise these objects and
35+
metadata.
36+
37+
First, maybe you want to just look at a page in your Jupyter notebook.
38+
Okay!
39+
40+
```python
41+
import playa, paves.image as pi
42+
pdf = playa.open("my_awesome.pdf")
43+
page = pdf.pages[3]
44+
pi.show(page)
45+
```
46+
47+
You can of course draw boxes around those individual PDF objects, or
48+
one particular sort of object, or filter them with a generator
49+
expression:
50+
51+
```python
52+
pi.box(page) # outlines everything
53+
pi.box(page.texts)
54+
pi.box(page.images)
55+
pi.box(t for t in page.texts if "spam" in t.chars)
56+
```
57+
58+
Alternately you can "highlight" objects by overlaying them with a
59+
semi-transparent colour, which otherwise works the same way:
60+
61+
```python
62+
pi.mark(page.images)
63+
```
64+
65+
If you wish you can give each type of object a different colour:
66+
67+
```python
68+
pi.mark(page, color={"text": "red", "image": "blue", "path": "green"})
69+
```
70+
71+
## Working in the PDF mine
2572

2673
`pdfminer.six` is widely used for text extraction and layout analysis
2774
due to its liberal licensing terms. Unfortunately it is quite slow

pyproject.toml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ classifiers = [
2525
"Programming Language :: Python :: Implementation :: PyPy",
2626
]
2727
dependencies = [
28-
"playa-pdf >= 0.2.8, < 0.3", # not considered harmful as we depend on internals
28+
"playa-pdf>=0.3.0",
2929
"pillow",
3030
]
3131

@@ -37,6 +37,9 @@ Source = "https://github.com/dhdaines/paves"
3737
[tool.hatch.version]
3838
source = "vcs"
3939

40+
[tool.hatch.metadata]
41+
allow-direct-references = true
42+
4043
[tool.hatch.build.hooks.vcs]
4144
version-file = "src/paves/__about__.py"
4245

@@ -47,10 +50,10 @@ exclude = [
4750
]
4851

4952
[tool.hatch.envs.hatch-test]
50-
extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu" ]
53+
extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
5154

5255
[tool.hatch.envs.default]
53-
dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pillow" ]
56+
dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
5457

5558
[tool.hatch.envs.hatch-static-analysis]
5659
config-path = "none" # Disable hatch's unreasonable ruff defaults
@@ -80,3 +83,6 @@ exclude_lines = [
8083
"if __name__ == .__main__.:",
8184
"if TYPE_CHECKING:",
8285
]
86+
87+
[tool.mypy]
88+
mypy_path = "stubs"

src/paves/bears.py

Lines changed: 185 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,205 @@
44
sorts (pandas or polars, your choice).
55
"""
66

7+
import logging
8+
import multiprocessing
79
from functools import singledispatch
810
from multiprocessing.context import BaseContext
911
from pathlib import Path
10-
import logging
11-
import multiprocessing
12+
from typing import Iterator, List, Tuple, TypedDict, Union, cast
1213

13-
from typing import cast, Iterator, List, Union
14+
import playa
15+
from playa import DeviceSpace
16+
from playa.color import ColorSpace
1417
from playa.page import (
15-
Page,
1618
ContentObject,
17-
PathObject,
1819
ImageObject,
20+
Page,
21+
PathObject,
1922
TextObject,
2023
XObjectObject,
2124
)
22-
from playa.utils import (
23-
apply_matrix_norm,
24-
apply_matrix_pt,
25-
Point,
26-
get_bound,
27-
)
28-
import playa
29-
from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA # noqa: F401
25+
from playa.utils import Point, apply_matrix_norm, apply_matrix_pt, get_bound
26+
27+
# Stub out Polars if not present
28+
try:
29+
import polars as pl
30+
except ImportError:
31+
32+
class pl: # type: ignore
33+
def Array(*args, **kwargs): ...
34+
def List(*args, **kwargs): ...
35+
def Object(*args, **kwargs): ...
36+
3037

3138
LOG = logging.getLogger(__name__)
3239

3340

41+
class LayoutDict(TypedDict, total=False):
42+
"""Dictionary-based layout objects.
43+
44+
These closely match the dictionaries returned by pdfplumber. The
45+
type of coordinates returned is determined by the `space`
46+
argument passed to `Document`. By default, `(0, 0)` is
47+
the top-left corner of the page, with 72 units per inch.
48+
49+
All values can be converted to strings in some meaningful fashion,
50+
such that you can simply write one of these to a CSV. You can access
51+
the field names through the `__annotations__` property:
52+
53+
writer = DictWriter(outfile, fieldnames=LayoutDict.__annotations__.keys())
54+
writer.writerows(layout_dicts)
55+
56+
Attributes:
57+
page_index: Index (0-based) of page.
58+
page_label: Page label if any.
59+
object_type: Type of object as a string.
60+
mcid: Containing marked content section ID (or None if marked
61+
content has no ID, such as artifacts or if there is no logical
62+
structure).
63+
tag: Containing marked content tag name (or None if not in a marked
64+
content section).
65+
xobjid: String name of containing Form XObject, if any.
66+
cid: Integer character ID of glyph, if `object_type == "char"`.
67+
text: Unicode mapping for glyph, if any.
68+
fontname: Name of the font.
69+
size: Font size in device space.
70+
glyph_offset_x: Horizontal offset (in device space) of glyph
71+
from start of line.
72+
glyph_offset_y: Vertical offset (in device space) of glyph from
73+
start of line.
74+
render_mode: Text rendering mode.
75+
upright: FIXME: Not really sure what this means. pdfminer.six?
76+
x0: Minimum x coordinate of bounding box (left or right
77+
depending on device space).
78+
x1: Maximum x coordinate of bounding box (left or right
79+
depending on device space).
80+
y0: Minimum y coordinate of bounding box (top or bottom
81+
depending on device space).
82+
y1: Maximum y coordinate of bounding box (top or bottom
83+
depending on device space).
84+
stroking_colorspace: String name of colour space for stroking
85+
operations.
86+
stroking_color: Numeric parameters for stroking color.
87+
stroking_pattern: Name of stroking pattern, if any.
88+
non_stroking_colorspace: String name of colour space for non-stroking
89+
operations.
90+
non_stroking_color: Numeric parameters for non-stroking color.
91+
non_stroking_pattern: Name of non-stroking pattern, if any.
92+
path_ops: Sequence of path operations (e.g. `"mllh"` for a
93+
triangle or `"mlllh"` for a quadrilateral)
94+
dash_pattern: Sequence of user space units for alternating
95+
stroke and non-stroke segments of dash pattern, `()` for solid
96+
line. (Cannot be in device space because this would depend on
97+
which direction the line or curve is drawn).
98+
dash_phase: Initial position in `dash_pattern` in user space
99+
units. (see above for why it's in user space)
100+
evenodd: Was this path filled with Even-Odd (if `True`) or
101+
Nonzero-Winding-Number rule (if `False`)? Note that this is
102+
**meaningless** for determining if a path is actually filled
103+
since subpaths have already been decomposed. If you really
104+
care then use the lazy API instead.
105+
stroke: Is this path stroked?
106+
fill: Is this path filled?
107+
linewidth: Line width in user space units (again, not possible
108+
to transform to device space).
109+
pts_x: X coordinates of path endpoints, one for each character
110+
in `path_ops`. This is optimized for CSV/DataFrame output, if
111+
you care about the control points then use the lazy API.
112+
pts_y: Y coordinates of path endpoints, one for each character
113+
in `path_ops`. This is optimized for CSV/DataFrame output, if
114+
you care about the control points then use the lazy API.
115+
stream: Object number and generation number for the content
116+
stream associated with an image, or `None` for inline images.
117+
If you want image data then use the lazy API.
118+
imagemask: Is this image a mask?
119+
image_colorspace: String description of image colour space, or
120+
`None` if irrelevant/forbidden.
121+
srcsize: Source dimensions of image in pixels.
122+
bits: Number of bits per channel of image.
123+
"""
124+
125+
page_index: int
126+
page_label: Union[str, None]
127+
object_type: str
128+
mcid: Union[int, None]
129+
tag: Union[str, None]
130+
xobjid: Union[str, None]
131+
cid: int
132+
text: Union[str, None]
133+
fontname: str
134+
size: float
135+
glyph_offset_x: float
136+
glyph_offset_y: float
137+
render_mode: int
138+
upright: bool
139+
x0: float
140+
y0: float
141+
x1: float
142+
y1: float
143+
stroking_colorspace: str
144+
stroking_color: Tuple[float, ...]
145+
stroking_pattern: Union[str, None]
146+
non_stroking_colorspace: str
147+
non_stroking_color: Tuple[float, ...]
148+
non_stroking_pattern: Union[str, None]
149+
path_ops: str
150+
dash_pattern: Tuple[float, ...]
151+
dash_phase: float
152+
evenodd: bool
153+
stroke: bool
154+
fill: bool
155+
linewidth: float
156+
pts_x: List[float]
157+
pts_y: List[float]
158+
stream: Union[Tuple[int, int], None]
159+
imagemask: bool
160+
image_colorspace: Union[ColorSpace, None]
161+
srcsize: Tuple[int, int]
162+
bits: int
163+
164+
165+
fieldnames = LayoutDict.__annotations__.keys()
166+
schema = {
167+
"page_index": int,
168+
"page_label": str,
169+
"object_type": str,
170+
"mcid": int,
171+
"tag": str,
172+
"xobjid": str,
173+
"text": str,
174+
"cid": int,
175+
"fontname": str,
176+
"size": float,
177+
"glyph_offset_x": float,
178+
"glyph_offset_y": float,
179+
"render_mode": int,
180+
"upright": bool,
181+
"x0": float,
182+
"x1": float,
183+
"y0": float,
184+
"y1": float,
185+
"stroking_colorspace": str,
186+
"non_stroking_colorspace": str,
187+
"stroking_color": pl.List(float),
188+
"non_stroking_color": pl.List(float),
189+
"path_ops": str,
190+
"dash_pattern": pl.List(float),
191+
"dash_phase": float,
192+
"evenodd": bool,
193+
"stroke": bool,
194+
"fill": bool,
195+
"linewidth": float,
196+
"pts_x": pl.List(float),
197+
"pts_y": pl.List(float),
198+
"stream": pl.Array(int, 2),
199+
"imagemask": bool,
200+
"image_colorspace": str,
201+
"srcsize": pl.Array(int, 2),
202+
"bits": int,
203+
}
204+
205+
34206
@singledispatch
35207
def process_object(obj: ContentObject) -> Iterator[LayoutDict]:
36208
"""Handle obj according to its type"""

0 commit comments

Comments
 (0)