Skip to content

Commit 921edc0

Browse files
authored
Merge pull request #13 from dhdaines/tables_and_text_oh_my
Tables and text, oh my!
2 parents 3ebc853 + 0d01fd2 commit 921edc0

File tree

8 files changed

+765
-34
lines changed

8 files changed

+765
-34
lines changed

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ classifiers = [
3030
"Programming Language :: Python :: Implementation :: PyPy",
3131
]
3232
dependencies = [
33-
"playa-pdf>=0.6.1",
33+
"playa-pdf @ git+https://github.com/dhdaines/playa.git",
3434
"pillow",
3535
]
3636

@@ -66,9 +66,14 @@ config-path = "none" # Disable hatch's unreasonable ruff defaults
6666
[tool.hatch.envs.types]
6767
extra-dependencies = [
6868
"mypy>=1.0.0",
69+
"transformers[torch]",
6970
]
7071
[tool.hatch.envs.types.scripts]
7172
check = "mypy --install-types --non-interactive {args:src/paves tests}"
73+
[tool.hatch.envs.types.env-vars]
74+
# Avoid downloading gigabytes of CUDA junk
75+
PIP_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu"
76+
UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu"
7277

7378
[tool.coverage.run]
7479
source_pkgs = ["paves", "tests"]

src/paves/image.py

Lines changed: 72 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,12 @@
2525
cast,
2626
)
2727

28+
import playa
2829
from PIL import Image, ImageDraw, ImageFont
2930
from playa.document import Document, PageList
3031
from playa.page import ContentObject, Page, Annotation
3132
from playa.structure import Element
32-
from playa.utils import Rect, get_transformed_bound
33+
from playa.utils import Rect, transform_bbox
3334

3435
if TYPE_CHECKING:
3536
import pypdfium2 # types: ignore
@@ -57,7 +58,15 @@ def make_poppler_args(dpi: int, width: int, height: int) -> List[str]:
5758

5859

5960
@functools.singledispatch
60-
def _popple(pdf, tempdir: Path, args: List[str]) -> None:
61+
def _popple(pdf, tempdir: Path, args: List[str]) -> List[Tuple[int, float, float]]:
62+
raise NotImplementedError
63+
64+
65+
@_popple.register(str)
66+
@_popple.register(PathLike)
67+
def _popple_path(
68+
pdf: Union[str, PathLike], tempdir: Path, args: List[str]
69+
) -> List[Tuple[int, float, float]]:
6170
subprocess.run(
6271
[
6372
"pdftoppm",
@@ -67,11 +76,16 @@ def _popple(pdf, tempdir: Path, args: List[str]) -> None:
6776
],
6877
check=True,
6978
)
79+
with playa.open(pdf) as doc:
80+
return [(page.page_idx, page.width, page.height) for page in doc.pages]
7081

7182

7283
@_popple.register(Document)
73-
def _popple_doc(pdf: Document, tempdir: Path, args: List[str]) -> None:
84+
def _popple_doc(
85+
pdf: Document, tempdir: Path, args: List[str]
86+
) -> List[Tuple[int, float, float]]:
7487
pdfpdf = tempdir / "pdf.pdf"
88+
# FIXME: Writing the whole document to a temporary file is not great (can we popple in a pipeline instead?)
7589
with open(pdfpdf, "wb") as outfh:
7690
outfh.write(pdf.buffer)
7791
subprocess.run(
@@ -83,10 +97,14 @@ def _popple_doc(pdf: Document, tempdir: Path, args: List[str]) -> None:
8397
],
8498
check=True,
8599
)
100+
pdfpdf.unlink()
101+
return [(page.page_idx, page.width, page.height) for page in pdf.pages]
86102

87103

88104
@_popple.register(Page)
89-
def _popple_page(pdf: Page, tempdir: Path, args: List[str]) -> None:
105+
def _popple_page(
106+
pdf: Page, tempdir: Path, args: List[str]
107+
) -> List[Tuple[int, float, float]]:
90108
assert pdf.doc is not None # bug in PLAYA-PDF, oops, it cannot be None
91109
pdfpdf = tempdir / "pdf.pdf"
92110
with open(pdfpdf, "wb") as outfh:
@@ -105,10 +123,14 @@ def _popple_page(pdf: Page, tempdir: Path, args: List[str]) -> None:
105123
],
106124
check=True,
107125
)
126+
pdfpdf.unlink()
127+
return [(pdf.page_idx, pdf.width, pdf.height)]
108128

109129

110130
@_popple.register(PageList)
111-
def _popple_pages(pdf: PageList, tempdir: Path, args: List[str]) -> None:
131+
def _popple_pages(
132+
pdf: PageList, tempdir: Path, args: List[str]
133+
) -> List[Tuple[int, float, float]]:
112134
pdfpdf = tempdir / "pdf.pdf"
113135
assert pdf[0].doc is not None # bug in PLAYA-PDF, oops, it cannot be None
114136
with open(pdfpdf, "wb") as outfh:
@@ -142,6 +164,8 @@ def _popple_pages(pdf: PageList, tempdir: Path, args: List[str]) -> None:
142164
],
143165
check=True,
144166
)
167+
pdfpdf.unlink()
168+
return [(page.page_idx, page.width, page.height) for page in pdf]
145169

146170

147171
def popple(
@@ -173,21 +197,28 @@ def popple(
173197
args = make_poppler_args(dpi, width, height)
174198
with tempfile.TemporaryDirectory() as tempdir:
175199
temppath = Path(tempdir)
176-
_popple(pdf, temppath, args)
177-
for ppm in sorted(temppath.iterdir()):
178-
if ppm.suffix == ".ppm":
179-
yield Image.open(ppm)
200+
# FIXME: Possible to Popple in a Parallel Pipeline
201+
page_sizes = _popple(pdf, temppath, args)
202+
for (page_idx, page_width, page_height), ppm in zip(
203+
page_sizes,
204+
(path for path in sorted(temppath.iterdir()) if path.suffix == ".ppm"),
205+
):
206+
img = Image.open(ppm)
207+
img.info["page_index"] = page_idx
208+
img.info["page_width"] = page_width
209+
img.info["page_height"] = page_height
210+
yield img
180211

181212

182213
@functools.singledispatch
183214
def _get_pdfium_pages(
184215
pdf: Union[str, PathLike, Document, Page, PageList],
185-
) -> Iterator["pypdfium2.PdfPage"]:
216+
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
186217
import pypdfium2
187218

188219
doc = pypdfium2.PdfDocument(pdf)
189-
for page in doc:
190-
yield page
220+
for idx, page in enumerate(doc):
221+
yield idx, page
191222
page.close()
192223
doc.close()
193224

@@ -209,31 +240,33 @@ def _get_pdfium_doc(pdf: Document) -> Iterator["pypdfium2.PdfDocument"]:
209240

210241

211242
@_get_pdfium_pages.register(Document)
212-
def _get_pdfium_pages_doc(pdf: Document) -> Iterator["pypdfium2.PdfPage"]:
243+
def _get_pdfium_pages_doc(pdf: Document) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
213244
with _get_pdfium_doc(pdf) as doc:
214-
for page in doc:
215-
yield page
245+
for idx, page in enumerate(doc):
246+
yield idx, page
216247
page.close()
217248

218249

219250
@_get_pdfium_pages.register(Page)
220-
def _get_pdfium_pages_page(page: Page) -> Iterator["pypdfium2.PdfPage"]:
251+
def _get_pdfium_pages_page(page: Page) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
221252
pdf = page.doc
222253
assert pdf is not None
223254
with _get_pdfium_doc(pdf) as doc:
224255
pdfium_page = doc[page.page_idx]
225-
yield pdfium_page
256+
yield page.page_idx, pdfium_page
226257
pdfium_page.close()
227258

228259

229260
@_get_pdfium_pages.register(PageList)
230-
def _get_pdfium_pages_pagelist(pages: PageList) -> Iterator["pypdfium2.PdfPage"]:
261+
def _get_pdfium_pages_pagelist(
262+
pages: PageList,
263+
) -> Iterator[Tuple[int, "pypdfium2.PdfPage"]]:
231264
pdf = pages.doc
232265
assert pdf is not None
233266
with _get_pdfium_doc(pdf) as doc:
234267
for page in pages:
235268
pdfium_page = doc[page.page_idx]
236-
yield pdfium_page
269+
yield page.page_idx, pdfium_page
237270
pdfium_page.close()
238271

239272

@@ -252,7 +285,8 @@ def pdfium(
252285
width: Render to this width in pixels.
253286
height: Render to this height in pixels.
254287
Yields:
255-
Pillow `Image.Image` objects, one per page.
288+
Pillow `Image.Image` objects, one per page. Page width and height are
289+
available in the `info` property of the images as `page_width` and `page_height`.
256290
Raises:
257291
ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
258292
NotInstalledError: If PyPDFium2 is not installed.
@@ -263,24 +297,30 @@ def pdfium(
263297
import pypdfium2 # noqa: F401
264298
except ImportError as e:
265299
raise NotInstalledError("PyPDFium2 does not seem to be installed") from e
266-
for page in _get_pdfium_pages(pdf):
300+
for idx, page in _get_pdfium_pages(pdf):
301+
page_width = page.get_width()
302+
page_height = page.get_height()
267303
if width == 0 and height == 0:
268304
scale = (dpi or 72) / 72
269-
yield page.render(scale=scale).to_pil()
305+
img = page.render(scale=scale).to_pil()
270306
else:
271307
if width and height:
272308
# Scale to longest side (since pypdfium2 doesn't
273309
# appear to allow non-1:1 aspect ratio)
274-
scale = max(width / page.get_width(), height / page.get_height())
310+
scale = max(width / page_width, height / page_height)
275311
img = page.render(scale=scale).to_pil()
276312
# Resize down to desired size
277-
yield img.resize(size=(width, height))
313+
img = img.resize(size=(width, height))
278314
elif width:
279315
scale = width / page.get_width()
280-
yield page.render(scale=scale).to_pil()
316+
img = page.render(scale=scale).to_pil()
281317
elif height:
282318
scale = height / page.get_height()
283-
yield page.render(scale=scale).to_pil()
319+
img = page.render(scale=scale).to_pil()
320+
img.info["page_index"] = idx
321+
img.info["page_width"] = page_width
322+
img.info["page_height"] = page_height
323+
yield img
284324

285325

286326
METHODS = [popple, pdfium]
@@ -301,10 +341,14 @@ def convert(
301341
width: Render to this width in pixels (0 to keep aspect ratio).
302342
height: Render to this height in pixels (0 to keep aspect ratio).
303343
Yields:
304-
Pillow `Image.Image` objects, one per page.
344+
Pillow `Image.Image` objects, one per page. The original page
345+
width and height in default user space units are available in
346+
the `info` property of these images as `page_width` and
347+
`page_height` (the page index is also stored there as `page_index`).
305348
Raises:
306349
ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
307350
NotInstalledError: If no renderer is available
351+
308352
"""
309353
for method in METHODS:
310354
try:
@@ -365,7 +409,7 @@ def get_box_content(obj: Union[ContentObject, Element]) -> Rect:
365409
@get_box.register(Annotation)
366410
def get_box_annotation(obj: Annotation) -> Rect:
367411
"""Get the bounding box of an Annotation"""
368-
return get_transformed_bound(obj.page.ctm, obj.rect)
412+
return transform_bbox(obj.page.ctm, obj.rect)
369413

370414

371415
@functools.singledispatch

0 commit comments

Comments
 (0)