Skip to content

Commit e9b08b2

Browse files
authored
Merge pull request #4 from dhdaines/no_ref
feat: no need to ref/unref with new PLAYA-PDF (fixes #1)
2 parents 3fc9bdd + 544441b commit e9b08b2

File tree

4 files changed

+8
-155
lines changed

4 files changed

+8
-155
lines changed

README.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ This is generally faster than `pdfminer.six`. You can often make it
3939
even faster on large documents by running in parallel with the
4040
`max_workers` argument, which is the same as the one you will find in
4141
`concurrent.futures.ProcessPoolExecutor`. If you pass `None` it will
42-
use all your CPUs, but due to some unfortunate overhead (which will be
43-
fixed soon) this isn't so great, so 2-4 workers is best:
42+
use all your CPUs, but due to some unavoidable overhead, it usually
43+
doesn't help to use more than 2-4 workers:
4444

4545
```
4646
for page in extract(path, laparams, max_workers=2):
@@ -116,9 +116,8 @@ from paves.bears import SCHEMA
116116
df = polars.DataFrame(extract(path), schema=SCHEMA)
117117
```
118118

119-
As above, you can use multiple CPUs with `max_workers`, though this
120-
will scale considerably better since the objects are (mostly) easily
121-
serializable.
119+
As above, you can use multiple CPUs with `max_workers`, and this will
120+
scale considerably better than `paves.miner`.
122121

123122
## License
124123

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ classifiers = [
2525
"Programming Language :: Python :: Implementation :: PyPy",
2626
]
2727
dependencies = [
28-
"playa-pdf >= 0.2.7, < 0.3" # not considered harmful as we depend on internals
28+
"playa-pdf >= 0.2.8, < 0.3" # not considered harmful as we depend on internals
2929
]
3030

3131
[project.urls]

src/paves/bears.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
)
2828
import playa
2929
from playa import DeviceSpace, LayoutDict, fieldnames as FIELDNAMES, schema as SCHEMA # noqa: F401
30-
from paves.miner import unref_colorspace, ref_colorspace
3130

3231
LOG = logging.getLogger(__name__)
3332

@@ -181,9 +180,7 @@ def _(obj: ImageObject) -> Iterator[LayoutDict]:
181180
srcsize=obj.srcsize,
182181
imagemask=obj.imagemask,
183182
bits=obj.bits,
184-
image_colorspace=None
185-
if obj.colorspace is None
186-
else unref_colorspace(obj.colorspace),
183+
image_colorspace=obj.colorspace,
187184
stream=stream_id,
188185
page_index=0,
189186
page_label="0",
@@ -272,8 +269,4 @@ def extract(
272269
mp_context=mp_context,
273270
) as pdf:
274271
for page in pdf.pages.map(extract_page):
275-
for dic in page:
276-
cs = dic.get("image_colorspace")
277-
if cs is not None:
278-
dic["image_colorspace"] = ref_colorspace(cs, pdf)
279-
yield dic
272+
yield from page

src/paves/miner.py

Lines changed: 1 addition & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import heapq
99
import logging
1010
import multiprocessing
11-
import weakref
1211
from typing import (
1312
Callable,
1413
Dict,
@@ -1211,117 +1210,6 @@ def _(obj: TextObject) -> Iterator[LTComponent]:
12111210
yield LTChar(glyph)
12121211

12131212

1214-
def unref_list(items: Iterable[PDFObject]) -> List[PDFObject]:
1215-
"""Unlink object references if necessary for serialization.
1216-
1217-
FIXME: This functionality should go into PLAYA soon."""
1218-
out: List[PDFObject] = []
1219-
for v in items:
1220-
if isinstance(v, dict):
1221-
out.append(unref_dict(v))
1222-
elif isinstance(v, list):
1223-
out.append(unref_list(v))
1224-
elif isinstance(v, PDFObjRef):
1225-
out.append(PDFObjRef(None, v.objid))
1226-
else:
1227-
out.append(v)
1228-
return out
1229-
1230-
1231-
def ref_list(items: Iterable[PDFObject], doc: PDFDocument) -> List[PDFObject]:
1232-
"""Relink object references if necessary after deserialization.
1233-
1234-
FIXME: This functionality should go into PLAYA soon."""
1235-
out: List[PDFObject] = []
1236-
for v in items:
1237-
if isinstance(v, dict):
1238-
out.append(ref_dict(v, doc))
1239-
elif isinstance(v, list):
1240-
out.append(ref_list(v, doc))
1241-
elif isinstance(v, PDFObjRef):
1242-
out.append(PDFObjRef(weakref.ref(doc), v.objid))
1243-
else:
1244-
out.append(v)
1245-
return out
1246-
1247-
1248-
def unref_dict(props: Dict[str, PDFObject]) -> Dict[str, PDFObject]:
1249-
"""Unlink object references if necessary for serialization.
1250-
1251-
FIXME: This functionality should go into PLAYA soon."""
1252-
return dict(zip(props.keys(), unref_list(props.values())))
1253-
1254-
1255-
def ref_dict(props: Dict[str, PDFObject], doc: PDFDocument) -> Dict[str, PDFObject]:
1256-
"""Relink object references if necessary after deserialization.
1257-
1258-
FIXME: This functionality should go into PLAYA soon."""
1259-
return dict(zip(props.keys(), ref_list(props.values(), doc)))
1260-
1261-
1262-
def unref_colorspace(cs: ColorSpace) -> ColorSpace:
1263-
"""Unlink object references if necessary for serialization.
1264-
1265-
FIXME: This functionality should go into PLAYA soon.
1266-
"""
1267-
if cs.spec is not None and isinstance(cs.spec, list):
1268-
return ColorSpace(
1269-
name=cs.name, ncomponents=cs.ncomponents, spec=unref_list(cs.spec)
1270-
)
1271-
return cs
1272-
1273-
1274-
def ref_colorspace(cs: ColorSpace, doc: PDFDocument) -> ColorSpace:
1275-
"""Relink object references if necessary after deserialization.
1276-
1277-
FIXME: This functionality should go into PLAYA soon."""
1278-
if cs.spec is not None and isinstance(cs.spec, list):
1279-
return ColorSpace(
1280-
name=cs.name, ncomponents=cs.ncomponents, spec=ref_list(cs.spec, doc)
1281-
)
1282-
return cs
1283-
1284-
1285-
def unref_gstate(gs: GraphicState) -> None:
1286-
"""Unlink object references if necessary for serialization.
1287-
1288-
FIXME: This functionality should go into PLAYA soon."""
1289-
gs.scs = unref_colorspace(gs.scs)
1290-
gs.ncs = unref_colorspace(gs.ncs)
1291-
1292-
1293-
def ref_gstate(gs: GraphicState, doc: PDFDocument) -> None:
1294-
"""Relink object references if necessary after deserialization.
1295-
1296-
FIXME: This functionality should go into PLAYA soon."""
1297-
gs.scs = ref_colorspace(gs.scs, doc)
1298-
gs.ncs = ref_colorspace(gs.ncs, doc)
1299-
1300-
1301-
def unref_component(item: Union[LTContainer, LTItem]) -> None:
1302-
"""Unlink object references if necessary for serialization."""
1303-
if isinstance(item, LTComponent):
1304-
for idx, mcs in enumerate(item.mcstack):
1305-
if mcs.props:
1306-
item.mcstack[idx] = MarkedContent(
1307-
mcid=mcs.mcid, tag=mcs.tag, props=unref_dict(mcs.props)
1308-
)
1309-
if isinstance(item, LTChar):
1310-
unref_gstate(item.graphicstate)
1311-
item.ncs = item.graphicstate.ncs
1312-
if isinstance(item, LTImage):
1313-
if item.colorspace is not None:
1314-
item.colorspace = unref_colorspace(item.colorspace)
1315-
# Content streams should never be serialized, since it would
1316-
# copy their data unnecessarily (and also their attributes
1317-
# contain indirect object references)
1318-
# FIXME: What about the generation number?
1319-
item.stream = item.stream.objid # type: ignore[assignment]
1320-
if isinstance(item, LTContainer):
1321-
for child in item:
1322-
unref_component(child)
1323-
1324-
13251213
def extract_page(page: Page, laparams: Union[LAParams, None] = None) -> LTPage:
13261214
"""Extract an LTPage from a Page, and possibly do some layout analysis.
13271215
@@ -1362,33 +1250,9 @@ def extract_page(page: Page, laparams: Union[LAParams, None] = None) -> LTPage:
13621250
if laparams is not None:
13631251
ltpage.analyze(laparams)
13641252

1365-
# We do, however, need to "unreference" any indirect object
1366-
# references before serializing.
1367-
if playa.document.__pdf is not None:
1368-
unref_component(ltpage)
13691253
return ltpage
13701254

13711255

1372-
def ref_component(item: Union[LTContainer, LTItem], doc: PDFDocument) -> None:
1373-
"""Relink object references if necessary after deserialization."""
1374-
if isinstance(item, LTComponent):
1375-
for idx, mcs in enumerate(item.mcstack):
1376-
if mcs.props:
1377-
item.mcstack[idx] = MarkedContent(
1378-
mcid=mcs.mcid, tag=mcs.tag, props=ref_dict(mcs.props, doc)
1379-
)
1380-
if isinstance(item, LTChar):
1381-
ref_gstate(item.graphicstate, doc)
1382-
item.ncs = item.graphicstate.ncs
1383-
if isinstance(item, LTImage):
1384-
if item.colorspace is not None:
1385-
item.colorspace = ref_colorspace(item.colorspace, doc)
1386-
item.stream = doc[item.stream] # type: ignore[assignment, index]
1387-
if isinstance(item, LTContainer):
1388-
for child in item:
1389-
ref_component(child, doc)
1390-
1391-
13921256
def extract(
13931257
path: Path,
13941258
laparams: Union[LAParams, None] = None,
@@ -1408,7 +1272,4 @@ def extract(
14081272
for page in pdf.pages:
14091273
yield extract_page(page, laparams)
14101274
else:
1411-
# And "rereference" indirect object references after deserializing
1412-
for ltpage in pdf.pages.map(partial(extract_page, laparams=laparams)):
1413-
ref_component(ltpage, pdf)
1414-
yield ltpage
1275+
yield from pdf.pages.map(partial(extract_page, laparams=laparams))

0 commit comments

Comments
 (0)