|
4 | 4 | sorts (pandas or polars, your choice). |
5 | 5 | """ |
6 | 6 |
|
| 7 | +import logging |
| 8 | +import multiprocessing |
7 | 9 | from functools import singledispatch |
8 | 10 | from multiprocessing.context import BaseContext |
9 | 11 | from pathlib import Path |
10 | | -import logging |
11 | | -import multiprocessing |
| 12 | +from typing import Iterator, List, Tuple, TypedDict, Union, cast |
12 | 13 |
|
13 | | -from typing import cast, Iterator, List, Union |
| 14 | +import playa |
| 15 | +from playa import DeviceSpace |
| 16 | +from playa.color import ColorSpace |
14 | 17 | from playa.page import ( |
15 | | - Page, |
16 | 18 | ContentObject, |
17 | | - PathObject, |
18 | 19 | ImageObject, |
| 20 | + Page, |
| 21 | + PathObject, |
19 | 22 | TextObject, |
20 | 23 | XObjectObject, |
21 | 24 | ) |
22 | | -from playa.utils import ( |
23 | | - apply_matrix_norm, |
24 | | - apply_matrix_pt, |
25 | | - Point, |
26 | | - get_bound, |
27 | | -) |
28 | | -import playa |
29 | | -from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA # noqa: F401 |
| 25 | +from playa.utils import Point, apply_matrix_norm, apply_matrix_pt, get_bound |
| 26 | + |
| 27 | +# Stub out Polars if not present |
| 28 | +try: |
| 29 | + import polars as pl |
| 30 | +except ImportError: |
| 31 | + |
| 32 | + class pl: # type: ignore |
| 33 | + def Array(*args, **kwargs): ... |
| 34 | + def List(*args, **kwargs): ... |
| 35 | + def Object(*args, **kwargs): ... |
| 36 | + |
30 | 37 |
|
31 | 38 | LOG = logging.getLogger(__name__) |
32 | 39 |
|
33 | 40 |
|
| 41 | +class LayoutDict(TypedDict, total=False): |
| 42 | + """Dictionary-based layout objects. |
| 43 | +
|
| 44 | + These closely match the dictionaries returned by pdfplumber. The |
| 45 | + type of coordinates returned are determined by the `space` |
| 46 | + argument passed to `Document`. By default, `(0, 0)` is |
| 47 | + the top-left corner of the page, with 72 units per inch. |
| 48 | +
|
| 49 | + All values can be converted to strings in some meaningful fashion, |
| 50 | + such that you can simply write one of these to a CSV. You can access |
| 51 | + the field names through the `__annotations__` property: |
| 52 | +
|
| 53 | + writer = DictWriter(outfh, fieldnames=LayoutDict.__annotations__.keys()) |
| 54 | + writer.writerows(layout_dicts) |
| 55 | +
|
| 56 | + Attributes: |
| 57 | + page_index: Index (0-based) of page. |
| 58 | + page_label: Page label if any. |
| 59 | + object_type: Type of object as a string. |
| 60 | + mcid: Containing marked content section ID (or None if marked |
| 61 | + content has no ID, such as artifacts or if there is no logical |
| 62 | + structure). |
| 63 | + tag: Containing marked content tag name (or None if not in a marked |
| 64 | + content section). |
| 65 | + xobjid: String name of containing Form XObject, if any. |
| 66 | + cid: Integer character ID of glyph, if `object_type == "char"`. |
| 67 | + text: Unicode mapping for glyph, if any. |
| 68 | + fontname: Name of the font, as a string. |
| 69 | + size: Font size in device space. |
| 70 | + glyph_offset_x: Horizontal offset (in device space) of glyph |
| 71 | + from start of line. |
| 72 | + glyph_offset_y: Vertical offset (in device space) of glyph from |
| 73 | + start of line. |
| 74 | + render_mode: Text rendering mode. |
| 75 | + upright: FIXME: Not really sure what this means. pdfminer.six? |
| 76 | + x0: Minimum x coordinate of bounding box (left or right |
| 77 | + depending on device space). |
| 78 | + x1: Maximum x coordinate of bounding box (left or right |
| 79 | + depending on device space). |
| 80 | + y0: Minimum y coordinate of bounding box (top or bottom |
| 81 | + depending on device space). |
| 82 | + y1: Maximum y coordinate of bounding box (top or bottom |
| 83 | + depending on device space). |
| 84 | + stroking_colorspace: String name of colour space for stroking |
| 85 | + operations. |
| 86 | + stroking_color: Numeric parameters for stroking color. |
| 87 | + stroking_pattern: Name of stroking pattern, if any. |
| 88 | + non_stroking_colorspace: String name of colour space for non-stroking |
| 89 | + operations. |
| 90 | + non_stroking_color: Numeric parameters for non-stroking color. |
| 91 | + non_stroking_pattern: Name of non-stroking pattern, if any. |
| 92 | + path_ops: Sequence of path operations (e.g. `"mllh"` for a |
| 93 | + triangle or `"mlllh"` for a quadrilateral) |
| 94 | + dash_pattern: Sequence of user space units for alternating |
| 95 | + stroke and non-stroke segments of dash pattern, `()` for solid |
| 96 | + line. (Cannot be in device space because this would depend on |
| 97 | + which direction the line or curve is drawn). |
| 98 | + dash_phase: Initial position in `dash_pattern` in user space |
| 99 | + units. (see above for why it's in user space) |
| 100 | + evenodd: Was this path filled with Even-Odd (if `True`) or |
| 101 | + Nonzero-Winding-Number rule (if `False`)? Note that this is |
| 102 | + **meaningless** for determining if a path is actually filled |
| 103 | + since subpaths have already been decomposed. If you really |
| 104 | + care then use the lazy API instead. |
| 105 | + stroke: Is this path stroked? |
| 106 | + fill: Is this path filled? |
| 107 | + linewidth: Line width in user space units (again, not possible |
| 108 | + to transform to device space). |
| 109 | + pts_x: X coordinates of path endpoints, one for each character |
| 110 | + in `path_ops`. This is optimized for CSV/DataFrame output, if |
| 111 | + you care about the control points then use the lazy API. |
| 112 | + pts_y: Y coordinates of path endpoints, one for each character |
| 113 | + in `path_ops`. This is optimized for CSV/DataFrame output, if |
| 114 | + you care about the control points then use the lazy API. |
| 115 | + stream: Object number and generation number for the content |
| 116 | + stream associated with an image, or `None` for inline images. |
| 117 | + If you want image data then use the lazy API. |
| 118 | + imagemask: Is this image a mask? |
| 119 | + image_colorspace: String description of image colour space, or |
| 120 | + `None` if irrelevant/forbidden. |
| 121 | + srcsize: Source dimensions of image in pixels. |
| 122 | + bits: Number of bits per channel of image. |
| 123 | + """ |
| 124 | + |
| 125 | + page_index: int |
| 126 | + page_label: Union[str, None] |
| 127 | + object_type: str |
| 128 | + mcid: Union[int, None] |
| 129 | + tag: Union[str, None] |
| 130 | + xobjid: Union[str, None] |
| 131 | + cid: int |
| 132 | + text: Union[str, None] |
| 133 | + fontname: str |
| 134 | + size: float |
| 135 | + glyph_offset_x: float |
| 136 | + glyph_offset_y: float |
| 137 | + render_mode: int |
| 138 | + upright: bool |
| 139 | + x0: float |
| 140 | + y0: float |
| 141 | + x1: float |
| 142 | + y1: float |
| 143 | + stroking_colorspace: str |
| 144 | + stroking_color: Tuple[float, ...] |
| 145 | + stroking_pattern: Union[str, None] |
| 146 | + non_stroking_colorspace: str |
| 147 | + non_stroking_color: Tuple[float, ...] |
| 148 | + non_stroking_pattern: Union[str, None] |
| 149 | + path_ops: str |
| 150 | + dash_pattern: Tuple[float, ...] |
| 151 | + dash_phase: float |
| 152 | + evenodd: bool |
| 153 | + stroke: bool |
| 154 | + fill: bool |
| 155 | + linewidth: float |
| 156 | + pts_x: List[float] |
| 157 | + pts_y: List[float] |
| 158 | + stream: Union[Tuple[int, int], None] |
| 159 | + imagemask: bool |
| 160 | + image_colorspace: Union[ColorSpace, None] |
| 161 | + srcsize: Tuple[int, int] |
| 162 | + bits: int |
| 163 | + |
| 164 | + |
| 165 | +fieldnames = LayoutDict.__annotations__.keys() |
| 166 | +schema = { |
| 167 | + "page_index": int, |
| 168 | + "page_label": str, |
| 169 | + "object_type": str, |
| 170 | + "mcid": int, |
| 171 | + "tag": str, |
| 172 | + "xobjid": str, |
| 173 | + "text": str, |
| 174 | + "cid": int, |
| 175 | + "fontname": str, |
| 176 | + "size": float, |
| 177 | + "glyph_offset_x": float, |
| 178 | + "glyph_offset_y": float, |
| 179 | + "render_mode": int, |
| 180 | + "upright": bool, |
| 181 | + "x0": float, |
| 182 | + "x1": float, |
| 183 | + "y0": float, |
| 184 | + "y1": float, |
| 185 | + "stroking_colorspace": str, |
| 186 | + "non_stroking_colorspace": str, |
| 187 | + "stroking_color": pl.List(float), |
| 188 | + "non_stroking_color": pl.List(float), |
| 189 | + "path_ops": str, |
| 190 | + "dash_pattern": pl.List(float), |
| 191 | + "dash_phase": float, |
| 192 | + "evenodd": bool, |
| 193 | + "stroke": bool, |
| 194 | + "fill": bool, |
| 195 | + "linewidth": float, |
| 196 | + "pts_x": pl.List(float), |
| 197 | + "pts_y": pl.List(float), |
| 198 | + "stream": pl.Array(int, 2), |
| 199 | + "imagemask": bool, |
| 200 | + "image_colorspace": str, |
| 201 | + "srcsize": pl.Array(int, 2), |
| 202 | + "bits": int, |
| 203 | +} |
| 204 | + |
| 205 | + |
34 | 206 | @singledispatch |
35 | 207 | def process_object(obj: ContentObject) -> Iterator[LayoutDict]: |
36 | 208 | """Handle obj according to its type""" |
|
0 commit comments