Skip to content

Commit d73abf3

Browse files
committed
fix(security): reject symlinks/hardlinks in BaseFileComponent TAR extraction (GHSA-ccv6-r384-xp75)
`BaseFileComponent._unpack_bundle._safe_extract_tar` accepted any TAR member type and only checked that `output_dir / member.name` did not escape the extract dir. That check was performed before extraction, so a symlink whose *target* was an absolute path (or a `../` escape) was extracted untouched. Once on disk, the link was iterated by `temp_dir_path.iterdir()` and handed to `process_files()`, whose concrete implementations (FileComponent, DoclingInline/Remote, NvidiaIngest, VideoFile, Unstructured) call `path.read_bytes()` and follow the link to read arbitrary host files.

The reporter's exploit chain leaks `~/.langflow/secret_key`, forges a JWT for an admin user, and then runs arbitrary code through the Python interpreter node, achieving RCE.

Python's `tarfile` only defaults to the safe `data` filter on Python 3.14, which langflow's `requires-python = ">=3.10,<3.14"` excludes — so every supported interpreter was vulnerable.

Fix:
- `_safe_extract_tar` now rejects symbolic-link, hard-link, FIFO, and device-node members with a `ValueError` and only extracts regular files and directories.
- `_unpack_and_collect_files` skips any `is_symlink()` entries from the extracted bundle directory and from recursive directory walks, as defense-in-depth in case a future bundle format slips a link through.
- New `tests/unit/base/data/test_base_file_unpack.py` covers symlink (absolute + relative escape), hardlink, and FIFO rejection, benign tar/zip extraction, the post-extraction symlink filter, and an end-to-end repro mirroring the advisory PoC (real filesystem symlink → `tarfile.add`).

Refs: https://github.com/langflow-ai/langflow/security/advisories/GHSA-ccv6-r384-xp75
1 parent 7640ce6 commit d73abf3

2 files changed

Lines changed: 260 additions & 4 deletions

File tree

src/lfx/src/lfx/base/data/base_file.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,10 @@ def _unpack_and_collect_files(self, files: list[BaseFile]) -> list[BaseFile]:
702702
data = file.data
703703

704704
if path.is_dir():
705-
# Recurse into directories
705+
# Recurse into directories. Skip symlinks defensively so that a
706+
# link planted in a previously-extracted bundle (or a directory
707+
# the user pointed at) cannot be dereferenced into an arbitrary
708+
# host file (GHSA-ccv6-r384-xp75).
706709
collected_files.extend(
707710
[
708711
BaseFileComponent.BaseFile(
@@ -711,7 +714,7 @@ def _unpack_and_collect_files(self, files: list[BaseFile]) -> list[BaseFile]:
711714
delete_after_processing=delete_after_processing,
712715
)
713716
for sub_path in path.rglob("*")
714-
if sub_path.is_file()
717+
if sub_path.is_file() and not sub_path.is_symlink()
715718
]
716719
)
717720
elif path.suffix[1:] in self.SUPPORTED_BUNDLE_EXTENSIONS:
@@ -720,7 +723,11 @@ def _unpack_and_collect_files(self, files: list[BaseFile]) -> list[BaseFile]:
720723
self._temp_dirs.append(temp_dir)
721724
temp_dir_path = Path(temp_dir.name)
722725
self._unpack_bundle(path, temp_dir_path)
723-
subpaths = list(temp_dir_path.iterdir())
726+
# Drop any symlink that may have slipped through extraction.
727+
# `_unpack_bundle` rejects link members for TAR archives, but
728+
# this guard keeps the contract in place for any future bundle
729+
# type added to SUPPORTED_BUNDLE_EXTENSIONS.
730+
subpaths = [p for p in temp_dir_path.iterdir() if not p.is_symlink()]
724731
self.log(f"Unpacked bundle {path.name} into {subpaths}")
725732
collected_files.extend(
726733
[
@@ -768,11 +775,24 @@ def _safe_extract_zip(bundle: ZipFile, output_dir: Path):
768775
bundle.extract(member, path=output_dir)
769776

770777
def _safe_extract_tar(bundle: tarfile.TarFile, output_dir: Path):
771-
"""Safely extract TAR files."""
778+
"""Safely extract TAR files.
779+
780+
Only regular files and directories are extracted. Symlinks, hardlinks,
781+
and device/FIFO members are rejected because they could be made to
782+
point at arbitrary locations on the host filesystem and lead to
783+
arbitrary file read once the extracted entries are subsequently
784+
ingested by `process_files()` (GHSA-ccv6-r384-xp75).
785+
"""
772786
for member in bundle.getmembers():
773787
# Filter out resource fork information for automatic production of mac
774788
if Path(member.name).name.startswith("._"):
775789
continue
790+
if member.issym() or member.islnk():
791+
msg = f"Refusing to extract link member from TAR File: {member.name!r} -> {member.linkname!r}"
792+
raise ValueError(msg)
793+
if not (member.isfile() or member.isdir()):
794+
msg = f"Refusing to extract non-regular TAR member: {member.name!r}"
795+
raise ValueError(msg)
776796
member_path = output_dir / member.name
777797
# Ensure no path traversal outside `output_dir`
778798
if not member_path.resolve().is_relative_to(output_dir.resolve()):
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
"""Security regression tests for BaseFileComponent bundle extraction.
2+
3+
Covers GHSA-ccv6-r384-xp75: a TAR member that is a symlink, hardlink, or device
4+
node could be made to point at an arbitrary host file. When the extracted entry
5+
is later read by `process_files()` the host file's contents would be ingested
6+
into the downstream sink. The fix in `_safe_extract_tar` rejects every member
7+
that is not a regular file or directory, and `_unpack_and_collect_files` skips
8+
any symlinks defensively before handing entries to `process_files()`.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import io
14+
import tarfile
15+
import zipfile
16+
from typing import TYPE_CHECKING
17+
18+
import pytest
19+
from lfx.base.data.base_file import BaseFileComponent
20+
21+
if TYPE_CHECKING:
22+
from pathlib import Path
23+
24+
25+
class _StubFileComponent(BaseFileComponent):
    """Minimal concrete subclass used to exercise the unpack helpers.

    ``process_files`` is an identity stub; the tests in this module call the
    unpack/collect helpers on the instance directly.
    """

    VALID_EXTENSIONS = ["txt"]

    def __init__(self, **data):
        super().__init__(**data)
        # Attribute values the stub is initialised with for these tests.
        self.set_attributes(
            {
                "path": [],
                "file_path": None,
                "separator": "\n\n",
                "silent_errors": False,
                "delete_server_file_after_processing": True,
                "ignore_unsupported_extensions": True,
                "ignore_unspecified_files": False,
            }
        )

    def process_files(self, file_list):  # pragma: no cover - not exercised here
        """Return ``file_list`` unchanged."""
        return file_list
46+
47+
48+
def _add_file(tar: tarfile.TarFile, name: str, payload: bytes) -> None:
    """Append a regular-file member called ``name`` holding ``payload`` to ``tar``."""
    info = tarfile.TarInfo(name=name)
    # The header size must match the payload, otherwise no data is written.
    info.size = len(payload)
    info.type = tarfile.REGTYPE
    tar.addfile(info, io.BytesIO(payload))
53+
54+
55+
def _add_symlink(tar: tarfile.TarFile, name: str, target: str) -> None:
    """Append a symbolic-link member ``name`` pointing at ``target`` to ``tar``.

    Only the archive header is written; nothing is created on the filesystem.
    """
    info = tarfile.TarInfo(name=name)
    info.type = tarfile.SYMTYPE
    info.linkname = target
    tar.addfile(info)
60+
61+
62+
def _add_hardlink(tar: tarfile.TarFile, name: str, target: str) -> None:
    """Append a hard-link member ``name`` pointing at ``target`` to ``tar``.

    Only the archive header is written; nothing is created on the filesystem.
    """
    info = tarfile.TarInfo(name=name)
    info.type = tarfile.LNKTYPE
    info.linkname = target
    tar.addfile(info)
67+
68+
69+
def _add_fifo(tar: tarfile.TarFile, name: str) -> None:
    """Append a FIFO (named pipe) member called ``name`` to ``tar``."""
    info = tarfile.TarInfo(name=name)
    info.type = tarfile.FIFOTYPE
    tar.addfile(info)
73+
74+
75+
def _build_tar(tmp_path: Path, name: str, populate) -> Path:
    """Create an uncompressed TAR archive at ``tmp_path / name`` and return its path.

    ``populate`` is called once with the open ``tarfile.TarFile`` and is
    responsible for adding the members.
    """
    bundle = tmp_path / name
    with tarfile.open(bundle, "w") as tar:
        populate(tar)
    return bundle
80+
81+
82+
def _build_zip(tmp_path: Path, name: str, files: dict[str, bytes]) -> Path:
    """Create a ZIP archive at ``tmp_path / name`` and return its path.

    ``files`` maps each member name to the bytes stored under that name.
    """
    bundle = tmp_path / name
    with zipfile.ZipFile(bundle, "w") as zf:
        for member, payload in files.items():
            zf.writestr(member, payload)
    return bundle
88+
89+
90+
@pytest.fixture
def component() -> _StubFileComponent:
    """Provide a fresh ``_StubFileComponent`` instance to each test."""
    return _StubFileComponent()
93+
94+
95+
def test_tar_with_absolute_symlink_is_rejected(tmp_path, component):
    """A symlink whose target is an absolute host path must be refused."""
    secret = tmp_path / "secret.txt"
    secret.write_bytes(b"jwt-signing-secret")

    bundle = _build_tar(
        tmp_path,
        "evil.tar",
        lambda tar: _add_symlink(tar, "leak", str(secret)),
    )

    extract_dir = tmp_path / "out_abs"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Refusing to extract link member"):
        component._unpack_bundle(bundle, extract_dir)
    # The only member was the link, so nothing may have been written.
    assert list(extract_dir.iterdir()) == []
111+
112+
113+
def test_tar_with_relative_escape_symlink_is_rejected(tmp_path, component):
    """A symlink that uses ../ to escape the extract dir must be refused."""
    bundle = _build_tar(
        tmp_path,
        "escape.tar",
        lambda tar: _add_symlink(tar, "leak", "../../etc/passwd"),
    )

    extract_dir = tmp_path / "out_rel"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Refusing to extract link member"):
        component._unpack_bundle(bundle, extract_dir)
    # The only member was the link, so nothing may have been written.
    assert list(extract_dir.iterdir()) == []
126+
127+
128+
def test_tar_with_hardlink_is_rejected(tmp_path, component):
    """Hardlinks have the same arbitrary-target risk as symlinks."""

    def populate(tar):
        _add_file(tar, "real.txt", b"ok")
        _add_hardlink(tar, "leak", "../etc/passwd")

    bundle = _build_tar(tmp_path, "hardlink.tar", populate)

    extract_dir = tmp_path / "out_hl"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Refusing to extract link member"):
        component._unpack_bundle(bundle, extract_dir)
    # Raising is not enough: the link itself must never reach the disk.
    # `real.txt` precedes the link in the archive, so whether it was written
    # before the rejection is deliberately left unasserted.
    assert "leak" not in {entry.name for entry in extract_dir.iterdir()}
141+
142+
143+
def test_tar_with_fifo_member_is_rejected(tmp_path, component):
    """Non-regular members (FIFO/device) must be refused."""
    bundle = _build_tar(tmp_path, "fifo.tar", lambda tar: _add_fifo(tar, "pipe"))

    extract_dir = tmp_path / "out_fifo"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Refusing to extract non-regular TAR member"):
        component._unpack_bundle(bundle, extract_dir)
    # The FIFO was the only member, so nothing may have been written.
    assert list(extract_dir.iterdir()) == []
151+
152+
153+
def test_tar_with_only_regular_files_extracts(tmp_path, component):
    """The fix must not regress benign archives."""

    def populate(tar):
        _add_file(tar, "a.txt", b"alpha")
        _add_file(tar, "nested/b.txt", b"beta")

    bundle = _build_tar(tmp_path, "ok.tar", populate)

    extract_dir = tmp_path / "out_ok"
    extract_dir.mkdir()
    component._unpack_bundle(bundle, extract_dir)

    # Both the top-level and the nested member keep their contents.
    assert (extract_dir / "a.txt").read_bytes() == b"alpha"
    assert (extract_dir / "nested" / "b.txt").read_bytes() == b"beta"
168+
169+
170+
def test_zip_with_only_regular_files_extracts(tmp_path, component):
    """ZIP path must remain working unchanged."""
    bundle = _build_zip(tmp_path, "ok.zip", {"a.txt": b"alpha", "nested/b.txt": b"beta"})

    extract_dir = tmp_path / "out_zip"
    extract_dir.mkdir()
    component._unpack_bundle(bundle, extract_dir)

    # Both the top-level and the nested member keep their contents.
    assert (extract_dir / "a.txt").read_bytes() == b"alpha"
    assert (extract_dir / "nested" / "b.txt").read_bytes() == b"beta"
180+
181+
182+
def test_collect_files_skips_symlinks_in_extracted_dir(tmp_path, component):
    """Defense-in-depth check for the post-extraction iteration.

    A symlink that somehow lands in an unpacked dir must not be passed to
    ``process_files()``. Simulated by manually planting one, since
    ``_safe_extract_tar`` would otherwise refuse it.
    """
    extract_root = tmp_path / "extracted"
    extract_root.mkdir()
    real = extract_root / "doc.txt"
    real.write_bytes(b"hello")
    secret = tmp_path / "secret.txt"
    secret.write_bytes(b"top-secret")
    (extract_root / "leak").symlink_to(secret)

    files = [BaseFileComponent.BaseFile(data=None, path=extract_root)]
    collected = component._unpack_and_collect_files(files)

    paths = {bf.path.name for bf in collected}
    assert "doc.txt" in paths
    assert "leak" not in paths
    # A name-only check would miss a link reported under its resolved name,
    # so also require that no collected entry resolves to the secret.
    assert secret.resolve() not in {bf.path.resolve() for bf in collected}
203+
204+
205+
def test_unpack_bundle_rejects_unsupported_format(tmp_path, component):
    """An input that is neither zip nor tar still raises clearly."""
    bogus = tmp_path / "not-a-bundle.bin"
    bogus.write_bytes(b"not an archive")

    extract_dir = tmp_path / "out_bogus"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Unsupported bundle format"):
        component._unpack_bundle(bogus, extract_dir)
214+
215+
216+
def test_real_filesystem_symlink_in_a_tar_via_tarfile_is_rejected(tmp_path, component):
    """End-to-end repro of the advisory's PoC archive shape.

    Builds the tar from a real filesystem symlink (the way the reporter's
    archive was produced) and confirms extraction is refused.
    """
    target = tmp_path / "host_secret"
    target.write_bytes(b"x")
    workdir = tmp_path / "src"
    workdir.mkdir()
    (workdir / "leak").symlink_to(target)

    bundle = tmp_path / "from_fs.tar"
    with tarfile.open(bundle, "w") as tar:
        # The test expects this on-disk symlink to be stored as a link member.
        tar.add(workdir / "leak", arcname="leak")

    extract_dir = tmp_path / "out_fs"
    extract_dir.mkdir()
    with pytest.raises(ValueError, match="Refusing to extract link member"):
        component._unpack_bundle(bundle, extract_dir)
    # The only member was the link, so nothing may have been written.
    assert list(extract_dir.iterdir()) == []

0 commit comments

Comments
 (0)