11import logging
22import os
3- import platform
3+ import re
44import shutil
55import subprocess
6- import tarfile
76import tempfile
8- import urllib .request
97from pathlib import Path
108
11- import pypdfium2 as pdfium
129from docling_core .types .doc .document import ImageRef
1310
1411from docling .backend .latex .engines .base import RenderEngine
12+ from docling .utils .locks import pypdfium2_lock
1513
1614_log = logging .getLogger (__name__ )
1715
1816
1917class TectonicEngine (RenderEngine ):
20- def __init__ (self ):
18+ _PDFTEX_ASSIGNMENT_PATTERN = re .compile (
19+ r"(?m)^([ \t]*)(\\(?:pdfcompresslevel|pdfminorversion|pdfobjcompresslevel)"
20+ r"\s*=\s*.*)$"
21+ )
22+ _INPUT_COMMAND_PATTERN = re .compile (
23+ r"""\\(?P<command>input|include)\s*\{(?P<path>[^{}\n]+)\}"""
24+ )
25+ _INCLUDEGRAPHICS_PATTERN = re .compile (
26+ r"""\\includegraphics(?:\s*\[[^\]]*\])?\s*\{(?P<path>[^{}\n]+)\}"""
27+ )
28+ _LATEX_GRAPHICS_EXTENSIONS = (".pdf" , ".png" , ".jpg" , ".jpeg" , ".eps" , ".svg" )
29+
def __init__(
    self,
    allow_download: bool = True,
    timeout: float = 60.0,
    allow_shell_escape: bool = True,
):
    """Store the engine options, then locate or install the Tectonic binary.

    Args:
        allow_download: Kept as ``self.allow_download``.
        timeout: Kept as ``self.timeout``.
        allow_shell_escape: Kept as ``self.allow_shell_escape``.
    """
    # Caller-supplied options; independent of the path setup below.
    self.allow_download = allow_download
    self.timeout = timeout
    self.allow_shell_escape = allow_shell_escape

    # Default location of a privately cached binary under the user's home.
    cache_root = Path.home() / ".cache" / "docling" / "tectonic"
    self.cache_dir = cache_root
    self.binary_path = cache_root / "tectonic"

    # Starts unavailable; install() is what flips this flag.
    self._is_available = False
    self.install()
2643
2744 def is_available (self ) -> bool :
@@ -35,69 +52,224 @@ def install(self):
3552 _log .info (f"Using system tectonic at { self .binary_path } " )
3653 return
3754
38- self .binary_path = self .cache_dir / "tectonic"
3955 if self .binary_path .exists () and os .access (self .binary_path , os .X_OK ):
4056 self ._is_available = True
4157 return
4258
43- system = platform .system ().lower ()
59+ if not self .allow_download :
60+ _log .warning (
61+ "Tectonic binary not found and automatic download is disabled; "
62+ "TikZ rendering will fall back."
63+ )
64+ return
4465
45- if system not in [ "linux" , "darwin" ] :
46- _log . warning ( f"Tectonic engine is not supported on { system } " )
66+ if self . _download_binary () :
67+ self . _is_available = True
4768 return
4869
70+ _log .warning ("Tectonic binary is unavailable; TikZ rendering will fall back." )
71+
def _download_binary(self) -> bool:
    """Fetch the Tectonic binary into ``self.cache_dir``.

    Runs the upstream installer script with ``self.cache_dir`` as the working
    directory and then checks for ``self.binary_path``.

    Returns:
        True when the binary exists after the installer ran (it is then made
        executable); False when it is missing or any step raised.
    """
    # NOTE(review): this pipes a remotely fetched script straight into ``sh``.
    # The command string is constant (no untrusted interpolation), but pinning
    # and verifying a release artifact would be safer.
    installer_command = (
        "curl --proto '=https' --tlsv1.2 -fsSL https://drop-sh.fullyjustified.net | sh"
    )
    try:
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        _log.info("Downloading Tectonic binary using the official script...")
        subprocess.run(
            installer_command,
            shell=True,
            cwd=self.cache_dir,
            check=True,
        )
        if not self.binary_path.exists():
            _log.warning("Tectonic binary not found after extraction.")
            return False
        self.binary_path.chmod(0o755)
        _log.info("Tectonic successfully installed at %s", self.binary_path)
        return True
    except Exception as e:
        # Installation is best-effort: report the failure and let callers fall back.
        _log.warning("Failed to install Tectonic: %s", e)
        return False
91+
92+ @classmethod
93+ def _sanitize_preamble_for_tectonic (cls , preamble : str ) -> str :
94+ """Drop assignment-only pdfTeX primitives Tectonic/XeTeX does not provide."""
95+ return cls ._PDFTEX_ASSIGNMENT_PATTERN .sub (
96+ r"\1% docling: removed for Tectonic compatibility: \2" , preamble
97+ )
98+
99+ @staticmethod
100+ def _strip_comments (text : str ) -> str :
101+ return re .sub (r"(?m)(?<!\\)%.*$" , "" , text )
102+
103+ @classmethod
104+ def _resolve_local_dependency (
105+ cls , source_root : Path , raw_path : str , * , is_tex : bool
106+ ) -> Path | None :
107+ raw_path = raw_path .strip ()
108+ if not raw_path :
109+ return None
110+
111+ candidate = Path (raw_path )
112+ if candidate .is_absolute ():
113+ _log .warning ("Absolute TikZ dependency paths are not staged: %s" , raw_path )
114+ return None
115+
116+ resolved = (source_root / candidate ).resolve ()
117+ try :
118+ if not resolved .is_relative_to (source_root ):
119+ _log .warning (
120+ "Path traversal attempt blocked for TikZ dependency: %s" , raw_path
121+ )
122+ return None
123+ except ValueError :
124+ _log .warning ("Invalid TikZ dependency path: %s" , raw_path )
125+ return None
126+
127+ if is_tex and not resolved .suffix :
128+ resolved = resolved .with_suffix (".tex" )
129+ return resolved
130+
131+ @classmethod
132+ def _find_existing_asset (cls , source_root : Path , raw_path : str ) -> Path | None :
133+ base_path = cls ._resolve_local_dependency (source_root , raw_path , is_tex = False )
134+ if base_path is None :
135+ return None
136+ if base_path .exists ():
137+ return base_path
138+ if base_path .suffix :
139+ return None
69140
70- def render (self , tikz_code : str , preamble : str = "" ) -> ImageRef | None :
141+ for suffix in cls ._LATEX_GRAPHICS_EXTENSIONS :
142+ candidate = base_path .with_suffix (suffix )
143+ if candidate .exists ():
144+ return candidate
145+ return None
146+
147+ @classmethod
148+ def _collect_local_dependencies (
149+ cls , text : str , source_root : Path , seen_tex_files : set [Path ] | None = None
150+ ) -> set [Path ]:
151+ if seen_tex_files is None :
152+ seen_tex_files = set ()
153+
154+ dependencies : set [Path ] = set ()
155+ stripped_text = cls ._strip_comments (text )
156+
157+ for match in cls ._INPUT_COMMAND_PATTERN .finditer (stripped_text ):
158+ source_path = cls ._resolve_local_dependency (
159+ source_root , match .group ("path" ), is_tex = True
160+ )
161+ if source_path is None :
162+ continue
163+ if not source_path .exists ():
164+ _log .warning ("TikZ dependency not found: %s" , match .group ("path" ))
165+ continue
166+
167+ dependencies .add (source_path )
168+ if source_path in seen_tex_files :
169+ continue
170+
171+ seen_tex_files .add (source_path )
172+ try :
173+ nested_text = source_path .read_text (encoding = "utf-8" )
174+ except Exception as exc :
175+ _log .warning ("Failed to read TikZ dependency %s: %s" , source_path , exc )
176+ continue
177+
178+ dependencies .update (
179+ cls ._collect_local_dependencies (
180+ nested_text , source_root , seen_tex_files = seen_tex_files
181+ )
182+ )
183+
184+ for match in cls ._INCLUDEGRAPHICS_PATTERN .finditer (stripped_text ):
185+ asset_path = cls ._find_existing_asset (source_root , match .group ("path" ))
186+ if asset_path is None :
187+ _log .warning ("TikZ asset not found: %s" , match .group ("path" ))
188+ continue
189+ dependencies .add (asset_path )
190+
191+ return dependencies
192+
193+ @classmethod
194+ def _stage_local_dependencies (
195+ cls , temp_path : Path , preamble : str , tikz_code : str , source_root : Path | None
196+ ) -> None :
197+ if source_root is None :
198+ return
199+
200+ source_root = source_root .resolve ()
201+ if not source_root .exists () or not source_root .is_dir ():
202+ _log .warning ("TikZ source root is not a directory: %s" , source_root )
203+ return
204+
205+ dependencies = cls ._collect_local_dependencies (
206+ preamble + "\n " + tikz_code , source_root
207+ )
208+ for source_path in dependencies :
209+ relative_path = source_path .relative_to (source_root )
210+ staged_path = temp_path / relative_path
211+ staged_path .parent .mkdir (parents = True , exist_ok = True )
212+ shutil .copy2 (source_path , staged_path )
213+
214+ def render (
215+ self , tikz_code : str , preamble : str = "" , source_root : Path | None = None
216+ ) -> ImageRef | None :
71217 if not self .is_available ():
72218 return None
73219
74220 # Fallback preamble if none provided
75221 if not preamble .strip ():
76- preamble = "\\ usepackage{tikz}\n \\ usepackage{pgfplots}\n \\ pgfplotsset{compat=newest}"
222+ preamble = (
223+ "\\ usepackage{tikz}\n "
224+ "\\ usepackage{pgfplots}\n "
225+ "\\ pgfplotsset{compat=newest}"
226+ )
227+ else :
228+ preamble = self ._sanitize_preamble_for_tectonic (preamble )
77229
78- # Minimal LaTeX document wrapping the Tikz code
79- latex_doc = f"""\\ documentclass[tikz, border=2pt]{{standalone}}
80- { preamble }
81- \\ begin{{document}}
82- { tikz_code }
83- \\ end{{document}}
84- """
230+ latex_doc = (
231+ "\\ documentclass[border=20pt]{standalone}\n "
232+ + preamble
233+ + "\n "
234+ + "\\ begin{document}\n "
235+ + tikz_code
236+ + "\n "
237+ + "\\ end{document}\n "
238+ )
85239
86240 with tempfile .TemporaryDirectory () as temp_dir :
87241 temp_path = Path (temp_dir )
242+ self ._stage_local_dependencies (temp_path , preamble , tikz_code , source_root )
88243 tex_file = temp_path / "diagram.tex"
89244 tex_file .write_text (latex_doc , encoding = "utf-8" )
90245
246+ cmd = [str (self .binary_path )]
247+ if self .allow_shell_escape :
248+ cmd .extend (["-Z" , "shell-escape" ])
249+ cmd .append ("--print" )
250+ cmd .append (str (tex_file ))
251+
91252 try :
92253 subprocess .run (
93- [ str ( self . binary_path ), str ( tex_file )] ,
254+ cmd ,
94255 cwd = temp_dir ,
95256 capture_output = True ,
96257 check = True ,
258+ timeout = self .timeout ,
97259 )
98260 except subprocess .CalledProcessError as e :
261+ stderr = e .stderr .decode ("utf-8" , errors = "replace" )
262+ stdout = e .stdout .decode ("utf-8" , errors = "replace" )
99263 _log .warning (
100- f"Tectonic compilation failed: { e .stderr .decode ('utf-8' , errors = 'replace' )} "
264+ "Tectonic compilation failed: %s\n STDOUT: %s" ,
265+ stderr ,
266+ stdout ,
267+ )
268+ return None
269+ except subprocess .TimeoutExpired :
270+ _log .warning (
271+ "Tectonic compilation timed out after %s seconds" ,
272+ self .timeout ,
101273 )
102274 return None
103275
@@ -107,10 +279,19 @@ def render(self, tikz_code: str, preamble: str = "") -> ImageRef | None:
107279 return None
108280
109281 try :
110- with pdfium .PdfDocument (pdf_file ) as pdf :
111- page = pdf [0 ]
112- pil_image = page .render (scale = 300 / 72 ).to_pil ()
113- page .close ()
282+ import pypdfium2 as pdfium
283+
284+ from docling .backend .docx .drawingml .utils import crop_whitespace
285+
286+ with pypdfium2_lock :
287+ with pdfium .PdfDocument (pdf_file ) as pdf :
288+ page = pdf [0 ]
289+ pil_image = page .render (scale = 300 / 72 ).to_pil ()
290+ page .close ()
291+
292+ # Auto-crop the generous border added by standalone,
293+ # keeping a small padding (10px) for clean margins.
294+ pil_image = crop_whitespace (pil_image , padding = 10 )
114295
115296 return ImageRef .from_pil (pil_image , dpi = 300 )
116297 except Exception as e :
0 commit comments