dhdaines
diff --git a/‎README.md‎
Lines changed: 48 additions & 1 deletion b/‎README.md‎
Lines changed: 48 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 9 additions & 3 deletions b/‎pyproject.toml‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎src/paves/bears.py‎
Lines changed: 185 additions & 13 deletions b/‎src/paves/bears.py‎
Lines changed: 185 additions & 13 deletions
@@ -21,7 +21,54 @@ There will be dependencies.  Oh, there will be dependencies.
 pip install paves
 ```
 
-## Workin' in a PDF mine
+## Looking at Stuff in a PDF
+
+When poking around in a PDF, it is useful not simply to read
+descriptions of objects (text, images, etc) but also to visualise them
+in the rendered document.  `pdfplumber` is quite nice for this, though
+it is oriented towards the particular set of objects that it can
+extract from the PDF.
+
+The primary goal of PLAYA-PDF is to give access to all the objects and
+particularly the metadata in a PDF.  One goal of PAVÉS (because there
+are a few) is to give an easy way to visualise these objects and
+metadata.
+
+First, maybe you want to just look at a page in your Jupyter notebook.
+Okay!
+
+```python
+import playa, paves.image as pi
+pdf = playa.open("my_awesome.pdf")
+page = pdf.pages[3]
+pi.show(page)
+```
+
+You can of course draw boxes around those individual PDF objects, or
+one particular sort of object, or filter them with a generator
+expression:
+
+```python
+pi.box(page)  # outlines everything
+pi.box(page.texts)
+pi.box(page.images)
+pi.box(t for t in page.texts if "spam" in t.chars)
+```
+
+Alternately you can "highlight" objects by overlaying them with a
+semi-transparent colour, which otherwise works the same way:
+
+```python
+pi.mark(page.images)
+```
+
+If you wish you can give each type of object a different colour:
+
+```python
+pi.mark(page, color={"text": "red", "image": "blue", "path": "green"})
+```
+
+## Working in the PDF mine
 
 `pdfminer.six` is widely used for text extraction and layout analysis
 due to its liberal licensing terms.  Unfortunately it is quite slow
 
@@ -25,7 +25,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "playa-pdf >= 0.2.8, < 0.3",  # not considered harmful as we depend on internals
+  "playa-pdf>=0.3.0",
   "pillow",
 ]
 
@@ -37,6 +37,9 @@ Source = "https://github.com/dhdaines/paves"
 [tool.hatch.version]
 source = "vcs"
 
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [tool.hatch.build.hooks.vcs]
 version-file = "src/paves/__about__.py"
 
@@ -47,10 +50,10 @@ exclude = [
 ]
 
 [tool.hatch.envs.hatch-test]
-extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu" ]
+extra-dependencies = [ "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
 
 [tool.hatch.envs.default]
-dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pillow" ]
+dependencies = [ "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu", "pypdfium2" ]
 
 [tool.hatch.envs.hatch-static-analysis]
 config-path = "none"  # Disable hatch's unreasonable ruff defaults
@@ -80,3 +83,6 @@ exclude_lines = [
   "if __name__ == .__main__.:",
   "if TYPE_CHECKING:",
 ]
+
+[tool.mypy]
+mypy_path = "stubs"
@@ -4,33 +4,205 @@
 sorts (pandas or polars, your choice).
 """
 
+import logging
+import multiprocessing
 from functools import singledispatch
 from multiprocessing.context import BaseContext
 from pathlib import Path
-import logging
-import multiprocessing
+from typing import Iterator, List, Tuple, TypedDict, Union, cast
 
-from typing import cast, Iterator, List, Union
+import playa
+from playa import DeviceSpace
+from playa.color import ColorSpace
 from playa.page import (
-    Page,
     ContentObject,
-    PathObject,
     ImageObject,
+    Page,
+    PathObject,
     TextObject,
     XObjectObject,
 )
-from playa.utils import (
-    apply_matrix_norm,
-    apply_matrix_pt,
-    Point,
-    get_bound,
-)
-import playa
-from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA  # noqa: F401
+from playa.utils import Point, apply_matrix_norm, apply_matrix_pt, get_bound
+
+# Stub out Polars if not present
+try:
+    import polars as pl
+except ImportError:
+
+    class pl:  # type: ignore
+        def Array(*args, **kwargs): ...
+        def List(*args, **kwargs): ...
+        def Object(*args, **kwargs): ...
+
 
 LOG = logging.getLogger(__name__)
 
 
+class LayoutDict(TypedDict, total=False):
+    """Dictionary-based layout objects.
+
+    These closely match the dictionaries returned by pdfplumber.  The
+    type of coordinates returned are determined by the `space`
+    argument passed to `Document`.  By default, `(0, 0)` is
+    the top-left corner of the page, with 72 units per inch.
+
+    All values can be converted to strings in some meaningful fashion,
+    such that you can simply write one of these to a CSV.  You can access
+    the field names through the `__annotations__` property:
+
+        writer = DictWriter(outfile, fieldnames=LayoutDict.__annotations__.keys())
+        writer.writerows(layout_dicts)
+
+    Attributes:
+      page_index: Index (0-based) of page.
+      page_label: Page label if any.
+      object_type: Type of object as a string.
+      mcid: Containing marked content section ID (or None if marked
+        content has no ID, such as artifacts or if there is no logical
+        structure).
+      tag: Containing marked content tag name (or None if not in a marked
+        content section).
+      xobjid: String name of containing Form XObject, if any.
+      cid: Integer character ID of glyph, if `object_type == "char"`.
+      text: Unicode mapping for glyph, if any.
+      fontname: Name of the font.
+      size: Font size in device space.
+      glyph_offset_x: Horizontal offset (in device space) of glyph
+        from start of line.
+      glyph_offset_y: Vertical offset (in device space) of glyph from
+        start of line.
+      render_mode: Text rendering mode.
+      upright: FIXME: Not really sure what this means.  pdfminer.six?
+      x0: Minimum x coordinate of bounding box (left or right
+        depending on device space).
+      x1: Maximum x coordinate of bounding box (left or right
+        depending on device space).
+      y0: Minimum y coordinate of bounding box (top or bottom
+        depending on device space).
+      y1: Maximum y coordinate of bounding box (top or bottom
+        depending on device space).
+      stroking_colorspace: String name of colour space for stroking
+        operations.
+      stroking_color: Numeric parameters for stroking color.
+      stroking_pattern: Name of stroking pattern, if any.
+      non_stroking_colorspace: String name of colour space for non-stroking
+        operations.
+      non_stroking_color: Numeric parameters for non-stroking color.
+      non_stroking_pattern: Name of non-stroking pattern, if any.
+      path_ops: Sequence of path operations (e.g. `"mllh"` for a
+        triangle or `"mlllh"` for a quadrilateral)
+      dash_pattern: Sequence of user space units for alternating
+        stroke and non-stroke segments of dash pattern, `()` for solid
+        line. (Cannot be in device space because this would depend on
+        which direction the line or curve is drawn).
+      dash_phase: Initial position in `dash_pattern` in user space
+        units.  (see above for why it's in user space)
+      evenodd: Was this path filled with Even-Odd (if `True`) or
+        Nonzero-Winding-Number rule (if `False`)?  Note that this is
+        **meaningless** for determining if a path is actually filled
+        since subpaths have already been decomposed.  If you really
+        care then use the lazy API instead.
+      stroke: Is this path stroked?
+      fill: Is this path filled?
+      linewidth: Line width in user space units (again, not possible
+        to transform to device space).
+      pts_x: X coordinates of path endpoints, one for each character
+        in `path_ops`.  This is optimized for CSV/DataFrame output, if
+        you care about the control points then use the lazy API.
+      pts_y: Y coordinates of path endpoints, one for each character
+        in `path_ops`.  This is optimized for CSV/DataFrame output, if
+        you care about the control points then use the lazy API.
+      stream: Object number and generation number for the content
+        stream associated with an image, or `None` for inline images.
+        If you want image data then use the lazy API.
+      imagemask: Is this image a mask?
+      image_colorspace: String description of image colour space, or
+        `None` if irrelevant/forbidden.
+      srcsize: Source dimensions of image in pixels.
+      bits: Number of bits per channel of image.
+    """
+
+    page_index: int
+    page_label: Union[str, None]
+    object_type: str
+    mcid: Union[int, None]
+    tag: Union[str, None]
+    xobjid: Union[str, None]
+    cid: int
+    text: Union[str, None]
+    fontname: str
+    size: float
+    glyph_offset_x: float
+    glyph_offset_y: float
+    render_mode: int
+    upright: bool
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+    stroking_colorspace: str
+    stroking_color: Tuple[float, ...]
+    stroking_pattern: Union[str, None]
+    non_stroking_colorspace: str
+    non_stroking_color: Tuple[float, ...]
+    non_stroking_pattern: Union[str, None]
+    path_ops: str
+    dash_pattern: Tuple[float, ...]
+    dash_phase: float
+    evenodd: bool
+    stroke: bool
+    fill: bool
+    linewidth: float
+    pts_x: List[float]
+    pts_y: List[float]
+    stream: Union[Tuple[int, int], None]
+    imagemask: bool
+    image_colorspace: Union[ColorSpace, None]
+    srcsize: Tuple[int, int]
+    bits: int
+
+
+fieldnames = LayoutDict.__annotations__.keys()
+schema = {
+    "page_index": int,
+    "page_label": str,
+    "object_type": str,
+    "mcid": int,
+    "tag": str,
+    "xobjid": str,
+    "text": str,
+    "cid": int,
+    "fontname": str,
+    "size": float,
+    "glyph_offset_x": float,
+    "glyph_offset_y": float,
+    "render_mode": int,
+    "upright": bool,
+    "x0": float,
+    "x1": float,
+    "y0": float,
+    "y1": float,
+    "stroking_colorspace": str,
+    "non_stroking_colorspace": str,
+    "stroking_color": pl.List(float),
+    "non_stroking_color": pl.List(float),
+    "path_ops": str,
+    "dash_pattern": pl.List(float),
+    "dash_phase": float,
+    "evenodd": bool,
+    "stroke": bool,
+    "fill": bool,
+    "linewidth": float,
+    "pts_x": pl.List(float),
+    "pts_y": pl.List(float),
+    "stream": pl.Array(int, 2),
+    "imagemask": bool,
+    "image_colorspace": str,
+    "srcsize": pl.Array(int, 2),
+    "bits": int,
+}
+
+
 @singledispatch
 def process_object(obj: ContentObject) -> Iterator[LayoutDict]:
     """Handle obj according to its type"""