Make tables 60-100x faster with PLAYA-PDF 0.7.0 and its good page.structure (#14)

dhdaines · David Huggins-Daines · web-flow · commit e0601fd13ffb · 2025-08-04T18:19:46.000-06:00
* feat: optimize table extraction with parent tree

* fix: elements will be hashable too

* feat: use new page structure and new playa

* docs: clarify and improve structure docs

---------

Co-authored-by: David Huggins-Daines <dhdaines@ecolingui.ca>
diff --git a/README.md b/README.md
@@ -51,16 +51,27 @@ structure tree, to look at the bounding boxes of the contents of those
 structure elements for a given page:
 
 ```python
-pi.box(pdf.structure.find_all(lambda el: el.page is page))
+pi.box(page.structure)
 ```
 
 ![Structure Elements](./docs/page3-elements.png)
 
-You can also look at the marked content sections, which are the
-leaf-nodes of the structure tree:
+Note however that this only gives you the elements associated with
+*marked content sections*, which are the leaf nodes of the structure
+tree.  So, you can also search up the structure tree to find things
+like tables, figures, or list items:
 
 ```python
-pi.box(page.structure)
+pi.box(page.structure.find_all("Table"))
+pi.box(page.structure.find_all("Figure"))
+pi.box(page.structure.find_all("LI"))
+```
+
+You can even search with regular expressions, for instance to find
+headings:
+
+```python
+pi.box(page.structure.find_all(re.compile(r"H\d+")))
 ```
 
 Alternately, if you have annotations (such as links), you can look at
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
     "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-    "playa-pdf @ git+https://github.com/dhdaines/playa.git",
+    "playa-pdf>=0.7.0",
     "pillow",
 ]
 
diff --git a/src/paves/tables.py b/src/paves/tables.py
@@ -1,6 +1,5 @@
 """
-Simple and not at all Java-damaged interface for table detection
-and structure prediction.
+Simple and not at all Java-damaged interface for table detection.
 """
 
 from copy import copy
@@ -16,7 +15,12 @@
 from playa.content import ContentObject, GraphicState, MarkedContent
 from playa.page import Annotation
 from playa.pdftypes import Matrix, Rect, BBOX_NONE
-from playa.structure import Element, ContentItem, ContentObject as StructContentObject
+from playa.structure import (
+    Element,
+    ContentItem,
+    ContentObject as StructContentObject,
+    Tree,
+)
 from playa.utils import get_bound_rects
 from playa.worker import _ref_page
 
@@ -139,7 +143,7 @@ def table_elements(
     pdf: Union[str, PathLike, Document, Page, PageList],
 ) -> Iterator[Element]:
     """Iterate over all text objects in a PDF, page, or pages"""
-    raise NotImplementedError
+    raise NotImplementedError(f"Not implemented for {type(pdf)}")
 
 
 @table_elements.register(str)
@@ -155,24 +159,23 @@ def table_elements_path(pdf: Union[str, PathLike]) -> Iterator[Element]:
 def table_elements_doc(pdf: Document) -> Iterator[Element]:
     structure = pdf.structure
     if structure is None:
-        raise TypeError
+        raise TypeError("Document has no logical structure")
     return structure.find_all("Table")
 
 
 @table_elements.register
 def table_elements_pagelist(pages: PageList) -> Iterator[Element]:
-    structure = pages.doc.structure
-    if structure is None:
-        raise TypeError
-    # FIXME: Accelerate this with the ParentTree too
-    return (table for table in structure.find_all("Table") if table.page in pages)
+    if pages.doc.structure is None:
+        raise TypeError("Document has no logical structure")
+    for page in pages:
+        yield from table_elements_page(page)
 
 
 @table_elements.register
 def table_elements_page(page: Page) -> Iterator[Element]:
-    # FIXME: Accelerate this with the ParentTree
-    pagelist = page.doc.pages[(page.page_idx,)]
-    return table_elements_pagelist(pagelist)
+    if page.structure is None:
+        raise TypeError("Page has no ParentTree")
+    return page.structure.find_all("Table")
 
 
 def table_elements_to_objects(

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ classifiers = [`
`30`	`30`	`"Programming Language :: Python :: Implementation :: PyPy",`
`31`	`31`	`]`
`32`	`32`	`dependencies = [`
`33`		`- "playa-pdf @ git+https://github.com/dhdaines/playa.git",`
	`33`	`+ "playa-pdf>=0.7.0",`
`34`	`34`	`"pillow",`
`35`	`35`	`]`
`36`	`36`