Skip to content

Commit ea2408d

Browse files
feat(latex): add optional Tectonic TikZ rendering with isolated dependency staging
Add opt-in TikZ image rendering for the LaTeX backend using Tectonic, while preserving stable fallback behavior when rendering fails. What this changes: - add optional `tikz_engine="tectonic"` backend support for TikZ diagrams - render `tikzpicture` environments asynchronously during LaTeX parsing - preserve raw TikZ code as `PictureMeta.code` whenever rendering fails, times out, or rasterization cannot complete - add Tectonic engine options for: - automatic binary download - per-diagram timeout - shell escape control - make shell escape explicit opt-in via CLI/backend config - sanitize known pdfTeX-only assignment lines in preambles for better Tectonic/XeTeX compatibility - restore file-backed relative TikZ compatibility by staging only explicit local dependencies (`\input`, `\include`, `\includegraphics`) into the temp render directory - block dependency path traversal and avoid ambient source-directory search - rasterize generated PDFs with locking and crop whitespace from output CLI / config updates: - add `--tikz-engine` / `-T` - add `--no-tikz-engine-download` - add `--tikz-engine-timeout` - add `--tikz-shell-escape` Tests: - add focused Tectonic engine tests for download behavior, timeout, preamble sanitization, shell escape toggling, dependency staging, and path traversal blocking - add backend tests for TikZ fallback behavior and file-backed source-root handling Signed-off-by: Aditya Sasidhar <telikicherlaadityasasidhar@gmail.com>
1 parent ceee2b1 commit ea2408d

6 files changed

Lines changed: 691 additions & 56 deletions

File tree

docling/backend/latex/engines/tectonic.py

Lines changed: 215 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,44 @@
11
import logging
22
import os
3-
import platform
3+
import re
44
import shutil
55
import subprocess
6-
import tarfile
76
import tempfile
8-
import urllib.request
97
from pathlib import Path
108

11-
import pypdfium2 as pdfium
129
from docling_core.types.doc.document import ImageRef
1310

1411
from docling.backend.latex.engines.base import RenderEngine
12+
from docling.utils.locks import pypdfium2_lock
1513

1614
_log = logging.getLogger(__name__)
1715

1816

1917
class TectonicEngine(RenderEngine):
20-
def __init__(self):
18+
# Matches, in multiline mode, a whole line made of optional spaces/tabs
# (group 1) followed by an assignment to \pdfcompresslevel, \pdfminorversion
# or \pdfobjcompresslevel running to the end of the line (group 2).
_PDFTEX_ASSIGNMENT_PATTERN = re.compile(
19+
r"(?m)^([ \t]*)(\\(?:pdfcompresslevel|pdfminorversion|pdfobjcompresslevel)"
20+
r"\s*=\s*.*)$"
21+
)
22+
# Matches \input{...} and \include{...}; the "path" group is the brace
# content, which may not contain braces or a newline.
_INPUT_COMMAND_PATTERN = re.compile(
23+
r"""\\(?P<command>input|include)\s*\{(?P<path>[^{}\n]+)\}"""
24+
)
25+
# Matches \includegraphics with an optional [...] argument; the "path" group
# is the brace content, which may not contain braces or a newline.
_INCLUDEGRAPHICS_PATTERN = re.compile(
26+
r"""\\includegraphics(?:\s*\[[^\]]*\])?\s*\{(?P<path>[^{}\n]+)\}"""
27+
)
28+
# Extensions probed, in this order, when a graphics path has no suffix.
_LATEX_GRAPHICS_EXTENSIONS = (".pdf", ".png", ".jpg", ".jpeg", ".eps", ".svg")
29+
30+
def __init__(
    self,
    allow_download: bool = True,
    timeout: float = 60.0,
    allow_shell_escape: bool = True,
):
    """Store the engine options and look for a usable Tectonic binary.

    Args:
        allow_download: Stored on the instance; ``install()`` consults it
            before attempting an automatic download.
        timeout: Stored on the instance and passed as the ``timeout`` of
            the Tectonic subprocess in ``render()``.
        allow_shell_escape: When true, ``render()`` adds
            ``-Z shell-escape`` to the Tectonic command line.
    """
    # NOTE(review): the commit description says shell escape is an explicit
    # opt-in, yet this default is True -- confirm every caller passes the
    # flag explicitly, or consider flipping the default.
    self.allow_download = allow_download
    self.timeout = timeout
    self.allow_shell_escape = allow_shell_escape

    # Default to the cached binary location; this must be set before
    # install() runs because install() reads self.binary_path.
    cache_root = Path.home() / ".cache" / "docling" / "tectonic"
    self.cache_dir = cache_root
    self.binary_path = cache_root / "tectonic"

    self._is_available = False
    self.install()
2643

2744
def is_available(self) -> bool:
@@ -35,69 +52,224 @@ def install(self):
3552
_log.info(f"Using system tectonic at {self.binary_path}")
3653
return
3754

38-
self.binary_path = self.cache_dir / "tectonic"
3955
if self.binary_path.exists() and os.access(self.binary_path, os.X_OK):
4056
self._is_available = True
4157
return
4258

43-
system = platform.system().lower()
59+
if not self.allow_download:
60+
_log.warning(
61+
"Tectonic binary not found and automatic download is disabled; "
62+
"TikZ rendering will fall back."
63+
)
64+
return
4465

45-
if system not in ["linux", "darwin"]:
46-
_log.warning(f"Tectonic engine is not supported on {system}")
66+
if self._download_binary():
67+
self._is_available = True
4768
return
4869

70+
_log.warning("Tectonic binary is unavailable; TikZ rendering will fall back.")
71+
72+
def _download_binary(self) -> bool:
    """Fetch the Tectonic binary into ``self.cache_dir`` via the official script.

    The installer script is downloaded completely before it is executed.
    With the previous ``curl ... | sh`` pipeline a failed or truncated
    download was hidden behind the exit status of ``sh``, and a partially
    received script could be run. Both steps are also bounded by a timeout
    so a stalled connection cannot block the constructor forever.

    Returns:
        True when ``self.binary_path`` exists after the installer ran (it
        is then made executable), False on any download, execution or
        filesystem failure.
    """
    installer_url = "https://drop-sh.fullyjustified.net"
    step_timeout = 300  # seconds, applied to each of the two steps

    try:
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        _log.info("Downloading Tectonic binary using the official script...")

        # Step 1: download the installer; check=True surfaces curl errors.
        installer = subprocess.run(
            ["curl", "--proto", "=https", "--tlsv1.2", "-fsSL", installer_url],
            capture_output=True,
            check=True,
            timeout=step_timeout,
        ).stdout

        # Step 2: run the fully downloaded script from the cache directory,
        # which is where the binary is expected to appear.
        subprocess.run(
            ["sh"],
            input=installer,
            cwd=self.cache_dir,
            check=True,
            timeout=step_timeout,
        )

        if self.binary_path.exists():
            self.binary_path.chmod(0o755)
            _log.info("Tectonic successfully installed at %s", self.binary_path)
            return True
        _log.warning("Tectonic binary not found after extraction.")
        return False
    except (OSError, subprocess.SubprocessError) as e:
        # Covers missing curl/sh, non-zero exit codes, timeouts and
        # filesystem errors; installation stays best-effort.
        _log.warning("Failed to install Tectonic: %s", e)
        return False
91+
92+
@classmethod
93+
def _sanitize_preamble_for_tectonic(cls, preamble: str) -> str:
94+
"""Drop assignment-only pdfTeX primitives Tectonic/XeTeX does not provide."""
95+
return cls._PDFTEX_ASSIGNMENT_PATTERN.sub(
96+
r"\1% docling: removed for Tectonic compatibility: \2", preamble
97+
)
98+
99+
@staticmethod
100+
def _strip_comments(text: str) -> str:
101+
return re.sub(r"(?m)(?<!\\)%.*$", "", text)
102+
103+
@classmethod
104+
def _resolve_local_dependency(
105+
cls, source_root: Path, raw_path: str, *, is_tex: bool
106+
) -> Path | None:
107+
raw_path = raw_path.strip()
108+
if not raw_path:
109+
return None
110+
111+
candidate = Path(raw_path)
112+
if candidate.is_absolute():
113+
_log.warning("Absolute TikZ dependency paths are not staged: %s", raw_path)
114+
return None
115+
116+
resolved = (source_root / candidate).resolve()
117+
try:
118+
if not resolved.is_relative_to(source_root):
119+
_log.warning(
120+
"Path traversal attempt blocked for TikZ dependency: %s", raw_path
121+
)
122+
return None
123+
except ValueError:
124+
_log.warning("Invalid TikZ dependency path: %s", raw_path)
125+
return None
126+
127+
if is_tex and not resolved.suffix:
128+
resolved = resolved.with_suffix(".tex")
129+
return resolved
130+
131+
@classmethod
132+
def _find_existing_asset(cls, source_root: Path, raw_path: str) -> Path | None:
133+
base_path = cls._resolve_local_dependency(source_root, raw_path, is_tex=False)
134+
if base_path is None:
135+
return None
136+
if base_path.exists():
137+
return base_path
138+
if base_path.suffix:
139+
return None
69140

70-
def render(self, tikz_code: str, preamble: str = "") -> ImageRef | None:
141+
for suffix in cls._LATEX_GRAPHICS_EXTENSIONS:
142+
candidate = base_path.with_suffix(suffix)
143+
if candidate.exists():
144+
return candidate
145+
return None
146+
147+
@classmethod
148+
def _collect_local_dependencies(
149+
cls, text: str, source_root: Path, seen_tex_files: set[Path] | None = None
150+
) -> set[Path]:
151+
if seen_tex_files is None:
152+
seen_tex_files = set()
153+
154+
dependencies: set[Path] = set()
155+
stripped_text = cls._strip_comments(text)
156+
157+
for match in cls._INPUT_COMMAND_PATTERN.finditer(stripped_text):
158+
source_path = cls._resolve_local_dependency(
159+
source_root, match.group("path"), is_tex=True
160+
)
161+
if source_path is None:
162+
continue
163+
if not source_path.exists():
164+
_log.warning("TikZ dependency not found: %s", match.group("path"))
165+
continue
166+
167+
dependencies.add(source_path)
168+
if source_path in seen_tex_files:
169+
continue
170+
171+
seen_tex_files.add(source_path)
172+
try:
173+
nested_text = source_path.read_text(encoding="utf-8")
174+
except Exception as exc:
175+
_log.warning("Failed to read TikZ dependency %s: %s", source_path, exc)
176+
continue
177+
178+
dependencies.update(
179+
cls._collect_local_dependencies(
180+
nested_text, source_root, seen_tex_files=seen_tex_files
181+
)
182+
)
183+
184+
for match in cls._INCLUDEGRAPHICS_PATTERN.finditer(stripped_text):
185+
asset_path = cls._find_existing_asset(source_root, match.group("path"))
186+
if asset_path is None:
187+
_log.warning("TikZ asset not found: %s", match.group("path"))
188+
continue
189+
dependencies.add(asset_path)
190+
191+
return dependencies
192+
193+
@classmethod
194+
def _stage_local_dependencies(
195+
cls, temp_path: Path, preamble: str, tikz_code: str, source_root: Path | None
196+
) -> None:
197+
if source_root is None:
198+
return
199+
200+
source_root = source_root.resolve()
201+
if not source_root.exists() or not source_root.is_dir():
202+
_log.warning("TikZ source root is not a directory: %s", source_root)
203+
return
204+
205+
dependencies = cls._collect_local_dependencies(
206+
preamble + "\n" + tikz_code, source_root
207+
)
208+
for source_path in dependencies:
209+
relative_path = source_path.relative_to(source_root)
210+
staged_path = temp_path / relative_path
211+
staged_path.parent.mkdir(parents=True, exist_ok=True)
212+
shutil.copy2(source_path, staged_path)
213+
214+
def render(
215+
self, tikz_code: str, preamble: str = "", source_root: Path | None = None
216+
) -> ImageRef | None:
71217
if not self.is_available():
72218
return None
73219

74220
# Fallback preamble if none provided
75221
if not preamble.strip():
76-
preamble = "\\usepackage{tikz}\n\\usepackage{pgfplots}\n\\pgfplotsset{compat=newest}"
222+
preamble = (
223+
"\\usepackage{tikz}\n"
224+
"\\usepackage{pgfplots}\n"
225+
"\\pgfplotsset{compat=newest}"
226+
)
227+
else:
228+
preamble = self._sanitize_preamble_for_tectonic(preamble)
77229

78-
# Minimal LaTeX document wrapping the Tikz code
79-
latex_doc = f"""\\documentclass[tikz, border=2pt]{{standalone}}
80-
{preamble}
81-
\\begin{{document}}
82-
{tikz_code}
83-
\\end{{document}}
84-
"""
230+
latex_doc = (
231+
"\\documentclass[border=20pt]{standalone}\n"
232+
+ preamble
233+
+ "\n"
234+
+ "\\begin{document}\n"
235+
+ tikz_code
236+
+ "\n"
237+
+ "\\end{document}\n"
238+
)
85239

86240
with tempfile.TemporaryDirectory() as temp_dir:
87241
temp_path = Path(temp_dir)
242+
self._stage_local_dependencies(temp_path, preamble, tikz_code, source_root)
88243
tex_file = temp_path / "diagram.tex"
89244
tex_file.write_text(latex_doc, encoding="utf-8")
90245

246+
cmd = [str(self.binary_path)]
247+
if self.allow_shell_escape:
248+
cmd.extend(["-Z", "shell-escape"])
249+
cmd.append("--print")
250+
cmd.append(str(tex_file))
251+
91252
try:
92253
subprocess.run(
93-
[str(self.binary_path), str(tex_file)],
254+
cmd,
94255
cwd=temp_dir,
95256
capture_output=True,
96257
check=True,
258+
timeout=self.timeout,
97259
)
98260
except subprocess.CalledProcessError as e:
261+
stderr = e.stderr.decode("utf-8", errors="replace")
262+
stdout = e.stdout.decode("utf-8", errors="replace")
99263
_log.warning(
100-
f"Tectonic compilation failed: {e.stderr.decode('utf-8', errors='replace')}"
264+
"Tectonic compilation failed: %s\nSTDOUT: %s",
265+
stderr,
266+
stdout,
267+
)
268+
return None
269+
except subprocess.TimeoutExpired:
270+
_log.warning(
271+
"Tectonic compilation timed out after %s seconds",
272+
self.timeout,
101273
)
102274
return None
103275

@@ -107,10 +279,19 @@ def render(self, tikz_code: str, preamble: str = "") -> ImageRef | None:
107279
return None
108280

109281
try:
110-
with pdfium.PdfDocument(pdf_file) as pdf:
111-
page = pdf[0]
112-
pil_image = page.render(scale=300 / 72).to_pil()
113-
page.close()
282+
import pypdfium2 as pdfium
283+
284+
from docling.backend.docx.drawingml.utils import crop_whitespace
285+
286+
with pypdfium2_lock:
287+
with pdfium.PdfDocument(pdf_file) as pdf:
288+
page = pdf[0]
289+
pil_image = page.render(scale=300 / 72).to_pil()
290+
page.close()
291+
292+
# Auto-crop the generous border added by standalone,
293+
# keeping a small padding (10px) for clean margins.
294+
pil_image = crop_whitespace(pil_image, padding=10)
114295

115296
return ImageRef.from_pil(pil_image, dpi=300)
116297
except Exception as e:

0 commit comments

Comments
 (0)