88import heapq
99import logging
1010import multiprocessing
11- import weakref
1211from typing import (
1312 Callable ,
1413 Dict ,
@@ -1211,117 +1210,6 @@ def _(obj: TextObject) -> Iterator[LTComponent]:
12111210 yield LTChar (glyph )
12121211
12131212
1214- def unref_list (items : Iterable [PDFObject ]) -> List [PDFObject ]:
1215- """Unlink object references if necessary for serialization.
1216-
1217- FIXME: This functionality should go into PLAYA soon."""
1218- out : List [PDFObject ] = []
1219- for v in items :
1220- if isinstance (v , dict ):
1221- out .append (unref_dict (v ))
1222- elif isinstance (v , list ):
1223- out .append (unref_list (v ))
1224- elif isinstance (v , PDFObjRef ):
1225- out .append (PDFObjRef (None , v .objid ))
1226- else :
1227- out .append (v )
1228- return out
1229-
1230-
1231- def ref_list (items : Iterable [PDFObject ], doc : PDFDocument ) -> List [PDFObject ]:
1232- """Relink object references if necessary after deserialization.
1233-
1234- FIXME: This functionality should go into PLAYA soon."""
1235- out : List [PDFObject ] = []
1236- for v in items :
1237- if isinstance (v , dict ):
1238- out .append (ref_dict (v , doc ))
1239- elif isinstance (v , list ):
1240- out .append (ref_list (v , doc ))
1241- elif isinstance (v , PDFObjRef ):
1242- out .append (PDFObjRef (weakref .ref (doc ), v .objid ))
1243- else :
1244- out .append (v )
1245- return out
1246-
1247-
1248- def unref_dict (props : Dict [str , PDFObject ]) -> Dict [str , PDFObject ]:
1249- """Unlink object references if necessary for serialization.
1250-
1251- FIXME: This functionality should go into PLAYA soon."""
1252- return dict (zip (props .keys (), unref_list (props .values ())))
1253-
1254-
1255- def ref_dict (props : Dict [str , PDFObject ], doc : PDFDocument ) -> Dict [str , PDFObject ]:
1256- """Relink object references if necessary after deserialization.
1257-
1258- FIXME: This functionality should go into PLAYA soon."""
1259- return dict (zip (props .keys (), ref_list (props .values (), doc )))
1260-
1261-
1262- def unref_colorspace (cs : ColorSpace ) -> ColorSpace :
1263- """Unlink object references if necessary for serialization.
1264-
1265- FIXME: This functionality should go into PLAYA soon.
1266- """
1267- if cs .spec is not None and isinstance (cs .spec , list ):
1268- return ColorSpace (
1269- name = cs .name , ncomponents = cs .ncomponents , spec = unref_list (cs .spec )
1270- )
1271- return cs
1272-
1273-
1274- def ref_colorspace (cs : ColorSpace , doc : PDFDocument ) -> ColorSpace :
1275- """Relink object references if necessary after deserialization.
1276-
1277- FIXME: This functionality should go into PLAYA soon."""
1278- if cs .spec is not None and isinstance (cs .spec , list ):
1279- return ColorSpace (
1280- name = cs .name , ncomponents = cs .ncomponents , spec = ref_list (cs .spec , doc )
1281- )
1282- return cs
1283-
1284-
1285- def unref_gstate (gs : GraphicState ) -> None :
1286- """Unlink object references if necessary for serialization.
1287-
1288- FIXME: This functionality should go into PLAYA soon."""
1289- gs .scs = unref_colorspace (gs .scs )
1290- gs .ncs = unref_colorspace (gs .ncs )
1291-
1292-
1293- def ref_gstate (gs : GraphicState , doc : PDFDocument ) -> None :
1294- """Relink object references if necessary after deserialization.
1295-
1296- FIXME: This functionality should go into PLAYA soon."""
1297- gs .scs = ref_colorspace (gs .scs , doc )
1298- gs .ncs = ref_colorspace (gs .ncs , doc )
1299-
1300-
1301- def unref_component (item : Union [LTContainer , LTItem ]) -> None :
1302- """Unlink object references if necessary for serialization."""
1303- if isinstance (item , LTComponent ):
1304- for idx , mcs in enumerate (item .mcstack ):
1305- if mcs .props :
1306- item .mcstack [idx ] = MarkedContent (
1307- mcid = mcs .mcid , tag = mcs .tag , props = unref_dict (mcs .props )
1308- )
1309- if isinstance (item , LTChar ):
1310- unref_gstate (item .graphicstate )
1311- item .ncs = item .graphicstate .ncs
1312- if isinstance (item , LTImage ):
1313- if item .colorspace is not None :
1314- item .colorspace = unref_colorspace (item .colorspace )
1315- # Content streams should never be serialized, since it would
1316- # copy their data unnecessarily (and also their attributes
1317- # contain indirect object references)
1318- # FIXME: What about the generation number?
1319- item .stream = item .stream .objid # type: ignore[assignment]
1320- if isinstance (item , LTContainer ):
1321- for child in item :
1322- unref_component (child )
1323-
1324-
13251213def extract_page (page : Page , laparams : Union [LAParams , None ] = None ) -> LTPage :
13261214 """Extract an LTPage from a Page, and possibly do some layout analysis.
13271215
@@ -1362,33 +1250,9 @@ def extract_page(page: Page, laparams: Union[LAParams, None] = None) -> LTPage:
13621250 if laparams is not None :
13631251 ltpage .analyze (laparams )
13641252
1365- # We do, however, need to "unreference" any indirect object
1366- # references before serializing.
1367- if playa .document .__pdf is not None :
1368- unref_component (ltpage )
13691253 return ltpage
13701254
13711255
1372- def ref_component (item : Union [LTContainer , LTItem ], doc : PDFDocument ) -> None :
1373- """Relink object references if necessary after deserialization."""
1374- if isinstance (item , LTComponent ):
1375- for idx , mcs in enumerate (item .mcstack ):
1376- if mcs .props :
1377- item .mcstack [idx ] = MarkedContent (
1378- mcid = mcs .mcid , tag = mcs .tag , props = ref_dict (mcs .props , doc )
1379- )
1380- if isinstance (item , LTChar ):
1381- ref_gstate (item .graphicstate , doc )
1382- item .ncs = item .graphicstate .ncs
1383- if isinstance (item , LTImage ):
1384- if item .colorspace is not None :
1385- item .colorspace = ref_colorspace (item .colorspace , doc )
1386- item .stream = doc [item .stream ] # type: ignore[assignment, index]
1387- if isinstance (item , LTContainer ):
1388- for child in item :
1389- ref_component (child , doc )
1390-
1391-
13921256def extract (
13931257 path : Path ,
13941258 laparams : Union [LAParams , None ] = None ,
@@ -1408,7 +1272,4 @@ def extract(
14081272 for page in pdf .pages :
14091273 yield extract_page (page , laparams )
14101274 else :
1411- # And "rereference" indirect object references after deserializing
1412- for ltpage in pdf .pages .map (partial (extract_page , laparams = laparams )):
1413- ref_component (ltpage , pdf )
1414- yield ltpage
1275+ yield from pdf .pages .map (partial (extract_page , laparams = laparams ))
0 commit comments