Merge pull request #4 from dhdaines/no_ref

dhdaines · web-flow · commit e9b08b2ff23a · 2025-01-22T08:53:29.000-05:00
feat: no need to ref/unref with new PLAYA-PDF (fixes #1)
diff --git a/README.md b/README.md
@@ -39,8 +39,8 @@ This is generally faster than `pdfminer.six`.  You can often make it
 even faster on large documents by running in parallel with the
 `max_workers` argument, which is the same as the one you will find in
 `concurrent.futures.ProcessPoolExecutor`.  If you pass `None` it will
-use all your CPUs, but due to some unfortunate overhead (which will be
-fixed soon) this isn't so great, so 2-4 workers is best:
+use all your CPUs, but due to some unavoidable overhead, it usually
+doesn't help to use more than 2-4:
 
 ```
 for page in extract(path, laparams, max_workers=2):
@@ -116,9 +116,8 @@ from paves.bears import SCHEMA
 df = polars.DataFrame(extract(path), schema=SCHEMA)
 ```
 
-As above, you can use multiple CPUs with `max_workers`, though this
-will scale considerably better since the objects are (mostly) easily
-serializable.
+As above, you can use multiple CPUs with `max_workers`, and this will
+scale considerably better than `paves.miner`.
 
 ## License
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "playa-pdf >= 0.2.7, < 0.3"  # not considered harmful as we depend on internals
+  "playa-pdf >= 0.2.8, < 0.3"  # not considered harmful as we depend on internals
 ]
 
 [project.urls]
diff --git a/src/paves/bears.py b/src/paves/bears.py
@@ -27,7 +27,6 @@
 )
 import playa
 from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA  # noqa: F401
-from paves.miner import unref_colorspace, ref_colorspace
 
 LOG = logging.getLogger(__name__)
 
@@ -181,9 +180,7 @@ def _(obj: ImageObject) -> Iterator[LayoutDict]:
         srcsize=obj.srcsize,
         imagemask=obj.imagemask,
         bits=obj.bits,
-        image_colorspace=None
-        if obj.colorspace is None
-        else unref_colorspace(obj.colorspace),
+        image_colorspace=obj.colorspace,
         stream=stream_id,
         page_index=0,
         page_label="0",
@@ -272,8 +269,4 @@ def extract(
         mp_context=mp_context,
     ) as pdf:
         for page in pdf.pages.map(extract_page):
-            for dic in page:
-                cs = dic.get("image_colorspace")
-                if cs is not None:
-                    dic["image_colorspace"] = ref_colorspace(cs, pdf)
-                yield dic
+            yield from page
diff --git a/src/paves/miner.py b/src/paves/miner.py
@@ -8,7 +8,6 @@
 import heapq
 import logging
 import multiprocessing
-import weakref
 from typing import (
     Callable,
     Dict,
@@ -1211,117 +1210,6 @@ def _(obj: TextObject) -> Iterator[LTComponent]:
         yield LTChar(glyph)
 
 
-def unref_list(items: Iterable[PDFObject]) -> List[PDFObject]:
-    """Unlink object references if necessary for serialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    out: List[PDFObject] = []
-    for v in items:
-        if isinstance(v, dict):
-            out.append(unref_dict(v))
-        elif isinstance(v, list):
-            out.append(unref_list(v))
-        elif isinstance(v, PDFObjRef):
-            out.append(PDFObjRef(None, v.objid))
-        else:
-            out.append(v)
-    return out
-
-
-def ref_list(items: Iterable[PDFObject], doc: PDFDocument) -> List[PDFObject]:
-    """Relink object references if necessary after deserialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    out: List[PDFObject] = []
-    for v in items:
-        if isinstance(v, dict):
-            out.append(ref_dict(v, doc))
-        elif isinstance(v, list):
-            out.append(ref_list(v, doc))
-        elif isinstance(v, PDFObjRef):
-            out.append(PDFObjRef(weakref.ref(doc), v.objid))
-        else:
-            out.append(v)
-    return out
-
-
-def unref_dict(props: Dict[str, PDFObject]) -> Dict[str, PDFObject]:
-    """Unlink object references if necessary for serialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    return dict(zip(props.keys(), unref_list(props.values())))
-
-
-def ref_dict(props: Dict[str, PDFObject], doc: PDFDocument) -> Dict[str, PDFObject]:
-    """Relink object references if necessary after deserialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    return dict(zip(props.keys(), ref_list(props.values(), doc)))
-
-
-def unref_colorspace(cs: ColorSpace) -> ColorSpace:
-    """Unlink object references if necessary for serialization.
-
-    FIXME: This functionality should go into PLAYA soon.
-    """
-    if cs.spec is not None and isinstance(cs.spec, list):
-        return ColorSpace(
-            name=cs.name, ncomponents=cs.ncomponents, spec=unref_list(cs.spec)
-        )
-    return cs
-
-
-def ref_colorspace(cs: ColorSpace, doc: PDFDocument) -> ColorSpace:
-    """Relink object references if necessary after deserialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    if cs.spec is not None and isinstance(cs.spec, list):
-        return ColorSpace(
-            name=cs.name, ncomponents=cs.ncomponents, spec=ref_list(cs.spec, doc)
-        )
-    return cs
-
-
-def unref_gstate(gs: GraphicState) -> None:
-    """Unlink object references if necessary for serialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    gs.scs = unref_colorspace(gs.scs)
-    gs.ncs = unref_colorspace(gs.ncs)
-
-
-def ref_gstate(gs: GraphicState, doc: PDFDocument) -> None:
-    """Relink object references if necessary after deserialization.
-
-    FIXME: This functionality should go into PLAYA soon."""
-    gs.scs = ref_colorspace(gs.scs, doc)
-    gs.ncs = ref_colorspace(gs.ncs, doc)
-
-
-def unref_component(item: Union[LTContainer, LTItem]) -> None:
-    """Unlink object references if necessary for serialization."""
-    if isinstance(item, LTComponent):
-        for idx, mcs in enumerate(item.mcstack):
-            if mcs.props:
-                item.mcstack[idx] = MarkedContent(
-                    mcid=mcs.mcid, tag=mcs.tag, props=unref_dict(mcs.props)
-                )
-    if isinstance(item, LTChar):
-        unref_gstate(item.graphicstate)
-        item.ncs = item.graphicstate.ncs
-    if isinstance(item, LTImage):
-        if item.colorspace is not None:
-            item.colorspace = unref_colorspace(item.colorspace)
-        # Content streams should never be serialized, since it would
-        # copy their data unnecessarily (and also their attributes
-        # contain indirect object references)
-        # FIXME: What about the generation number?
-        item.stream = item.stream.objid  # type: ignore[assignment]
-    if isinstance(item, LTContainer):
-        for child in item:
-            unref_component(child)
-
-
 def extract_page(page: Page, laparams: Union[LAParams, None] = None) -> LTPage:
     """Extract an LTPage from a Page, and possibly do some layout analysis.
 
@@ -1362,33 +1250,9 @@ def extract_page(page: Page, laparams: Union[LAParams, None] = None) -> LTPage:
     if laparams is not None:
         ltpage.analyze(laparams)
 
-    # We do, however, need to "unreference" any indirect object
-    # references before serializing.
-    if playa.document.__pdf is not None:
-        unref_component(ltpage)
     return ltpage
 
 
-def ref_component(item: Union[LTContainer, LTItem], doc: PDFDocument) -> None:
-    """Relink object references if necessary after deserialization."""
-    if isinstance(item, LTComponent):
-        for idx, mcs in enumerate(item.mcstack):
-            if mcs.props:
-                item.mcstack[idx] = MarkedContent(
-                    mcid=mcs.mcid, tag=mcs.tag, props=ref_dict(mcs.props, doc)
-                )
-    if isinstance(item, LTChar):
-        ref_gstate(item.graphicstate, doc)
-        item.ncs = item.graphicstate.ncs
-    if isinstance(item, LTImage):
-        if item.colorspace is not None:
-            item.colorspace = ref_colorspace(item.colorspace, doc)
-        item.stream = doc[item.stream]  # type: ignore[assignment, index]
-    if isinstance(item, LTContainer):
-        for child in item:
-            ref_component(child, doc)
-
-
 def extract(
     path: Path,
     laparams: Union[LAParams, None] = None,
@@ -1408,7 +1272,4 @@ def extract(
             for page in pdf.pages:
                 yield extract_page(page, laparams)
         else:
-            # And "rereference" indirect object references after deserializing
-            for ltpage in pdf.pages.map(partial(extract_page, laparams=laparams)):
-                ref_component(ltpage, pdf)
-                yield ltpage
+            yield from pdf.pages.map(partial(extract_page, laparams=laparams))

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -25,7 +25,7 @@ classifiers = [` |
| `25` | `25` | `"Programming Language :: Python :: Implementation :: PyPy",` |
| `26` | `26` | `]` |
| `27` | `27` | `dependencies = [` |
| `28` |  | `- "playa-pdf >= 0.2.7, < 0.3" # not considered harmful as we depend on internals` |
|  | `28` | `+ "playa-pdf >= 0.2.8, < 0.3" # not considered harmful as we depend on internals` |
| `29` | `29` | `]` |
| `30` | `30` |  |
| `31` | `31` | `[project.urls]` |