Skip to content

Commit aec4fce

Browse files
authored
Merge pull request nltk#3522 from ekaf/pathsec
Add central security sentinel for file and network access
2 parents eec4ee3 + 1b0e519 commit aec4fce

File tree

10 files changed

+646
-97
lines changed

10 files changed

+646
-97
lines changed

SECURITY.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,85 @@
33
## Reporting a Vulnerability
44

55
Please report security issues to `nltk.team@gmail.com`
6+
7+
## Security Hardening
8+
9+
NLTK includes a centralized I/O security module (`nltk.pathsec`) that
10+
validates file paths, network URLs, and zip archives. During the initial
11+
transition phase, it operates by default in **warn-only mode**, to avoid
12+
breaking existing workflows. In a later release, it will be switched to
13+
enforce the stricter security policy by default.
14+
15+
### Enabling strict enforcement
16+
17+
If you are running NLTK in a security-sensitive environment (web
18+
applications, multi-tenant pipelines, CI/CD systems, or any context
19+
where untrusted input may reach NLTK), you should enable strict
20+
enforcement:
21+
22+
```python
23+
import nltk.pathsec
24+
nltk.pathsec.ENFORCE = True
25+
```
26+
27+
With `ENFORCE = True`, unauthorized file access, SSRF attempts, and
28+
zip-slip attacks will raise `PermissionError` instead of emitting
29+
warnings.
30+
31+
32+
### Current Working Directory (CWD) Access
33+
34+
To maintain a "zero-friction" experience for students and researchers,
35+
NLTK permits access to resources located in the process's current
36+
working directory by default.
37+
38+
* **Standard Mode (`ENFORCE=False`):** Accessing data in the CWD is
39+
permitted but triggers a `RuntimeWarning` to alert users that this
40+
behavior may be insecure in shared or server-side environments.
41+
42+
* **Strict Mode (`ENFORCE=True`):** Implicit CWD access is **disabled**.
43+
To authorize the local directory in strict mode, users must explicitly
44+
append it to the search path:
45+
```python
46+
import nltk
47+
nltk.data.path.append('.')
48+
```
49+
50+
51+
### What is protected
52+
53+
- **Path traversal**: file access is validated against allowed NLTK
54+
data directories (`nltk.data.path`, `NLTK_DATA` environment
55+
variable, and standard system locations).
56+
- **SSRF prevention**: `urlopen` resolves hostnames via DNS and blocks
57+
requests to loopback, private, link-local, and multicast IP ranges,
58+
including obfuscated forms (e.g. decimal IP notation).
59+
- **Zip-slip protection**: zip extraction validates that member paths
60+
stay within the target directory.
61+
- **Pickle safety**: `nltk.data.load()` uses `RestrictedUnpickler`
62+
which blocks all class/function globals. Other pickle loading uses
63+
`pickle_load()` which emits a security warning.
64+
65+
### Configuring allowed data paths
66+
67+
NLTK determines allowed data directories from:
68+
69+
1. `nltk.data.path` (configurable at runtime)
70+
2. `NLTK_DATA` environment variable
71+
3. Standard locations (`~/nltk_data`, `/usr/share/nltk_data`, etc.)
72+
4. System temp directory
73+
74+
If you use a custom data location, add it to `nltk.data.path`:
75+
76+
```python
77+
import nltk
78+
nltk.data.path.append('/my/custom/data')
79+
```
80+
81+
### Note on symlinks
82+
83+
NLTK's corpus readers perform lexical path containment checks when
84+
joining file paths. These checks do not resolve symlinks. If your
85+
threat model includes attackers who can place symlinks inside your
86+
NLTK data directories, enable `ENFORCE = True` for full path
87+
resolution and validation.

nltk/corpus/reader/api.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,22 +221,27 @@ def raw(self, fileids=None):
221221

222222
def open(self, file):
223223
"""
224-
Return an open stream that can be used to read the given file.
225-
Security patched: prevents path traversal & absolute path access.
224+
Return an open stream for the given file.
225+
Security patched: prevents path traversal and scoped escapes.
226226
"""
227-
# -------- SECURITY PATCH START --------
228-
file = str(file)
227+
# Layer 1: Lexical guard
228+
if os.path.isabs(file) or ".." in file.replace("\\", "/"):
229+
raise ValueError(f"CorpusReader paths must be relative: {file}")
229230

230-
if os.path.isabs(file):
231-
raise ValueError("Absolute paths are not allowed")
231+
path = self._root.join(file)
232232

233-
if ".." in file.replace("\\", "/").split("/"):
234-
raise ValueError("Path traversal attempt blocked")
235-
# -------- SECURITY PATCH END --------
233+
# Layer 2: Scoped resolved guard (Fixes symlink escape test)
234+
from nltk.pathsec import validate_path
236235

237-
encoding = self.encoding(file)
238-
stream = self._root.join(file).open(encoding)
239-
return stream
236+
validate_path(path, context="CorpusReader", required_root=self._root)
237+
238+
# --- FIX: Handle dict-based encodings (e.g., UDHR corpus) ---
239+
encoding = self._encoding
240+
if isinstance(encoding, dict):
241+
encoding = encoding.get(file)
242+
243+
# Layer 3: Global sentinel check happens inside path.open()
244+
return path.open(encoding=encoding)
240245

241246
def encoding(self, file):
242247
"""

nltk/corpus/reader/childes.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -620,8 +620,11 @@ def demo(corpus_root=None):
620620
demo('/path/to/childes/data-xml/Eng-USA/")
621621
"""
622622
)
623-
# corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
624-
# corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
623+
624+
# To test remote fetching securely, use the pathsec wrapper:
625+
# from nltk.pathsec import urlopen, ZipFile
626+
# corpus_root_http = urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
627+
# corpus_root_http_bates = ZipFile(cStringIO.StringIO(corpus_root_http.read()))
625628
##this fails
626629
# childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
627630

nltk/corpus/reader/mte.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def words(self, fileids=None):
250250
"""
251251
return concat(
252252
[
253-
MTEFileReader(os.path.join(self._root, f)).words()
253+
MTEFileReader(os.path.join(str(self._root), f)).words()
254254
for f in self.__fileids(fileids)
255255
]
256256
)
@@ -264,7 +264,7 @@ def sents(self, fileids=None):
264264
"""
265265
return concat(
266266
[
267-
MTEFileReader(os.path.join(self._root, f)).sents()
267+
MTEFileReader(os.path.join(str(self._root), f)).sents()
268268
for f in self.__fileids(fileids)
269269
]
270270
)
@@ -278,7 +278,7 @@ def paras(self, fileids=None):
278278
"""
279279
return concat(
280280
[
281-
MTEFileReader(os.path.join(self._root, f)).paras()
281+
MTEFileReader(os.path.join(str(self._root), f)).paras()
282282
for f in self.__fileids(fileids)
283283
]
284284
)
@@ -292,7 +292,7 @@ def lemma_words(self, fileids=None):
292292
"""
293293
return concat(
294294
[
295-
MTEFileReader(os.path.join(self._root, f)).lemma_words()
295+
MTEFileReader(os.path.join(str(self._root), f)).lemma_words()
296296
for f in self.__fileids(fileids)
297297
]
298298
)
@@ -311,7 +311,7 @@ def tagged_words(self, fileids=None, tagset="msd", tags=""):
311311
if tagset == "universal" or tagset == "msd":
312312
return concat(
313313
[
314-
MTEFileReader(os.path.join(self._root, f)).tagged_words(
314+
MTEFileReader(os.path.join(str(self._root), f)).tagged_words(
315315
tagset, tags
316316
)
317317
for f in self.__fileids(fileids)
@@ -330,7 +330,7 @@ def lemma_sents(self, fileids=None):
330330
"""
331331
return concat(
332332
[
333-
MTEFileReader(os.path.join(self._root, f)).lemma_sents()
333+
MTEFileReader(os.path.join(str(self._root), f)).lemma_sents()
334334
for f in self.__fileids(fileids)
335335
]
336336
)
@@ -349,7 +349,7 @@ def tagged_sents(self, fileids=None, tagset="msd", tags=""):
349349
if tagset == "universal" or tagset == "msd":
350350
return concat(
351351
[
352-
MTEFileReader(os.path.join(self._root, f)).tagged_sents(
352+
MTEFileReader(os.path.join(str(self._root), f)).tagged_sents(
353353
tagset, tags
354354
)
355355
for f in self.__fileids(fileids)
@@ -368,7 +368,7 @@ def lemma_paras(self, fileids=None):
368368
"""
369369
return concat(
370370
[
371-
MTEFileReader(os.path.join(self._root, f)).lemma_paras()
371+
MTEFileReader(os.path.join(str(self._root), f)).lemma_paras()
372372
for f in self.__fileids(fileids)
373373
]
374374
)
@@ -388,7 +388,7 @@ def tagged_paras(self, fileids=None, tagset="msd", tags=""):
388388
if tagset == "universal" or tagset == "msd":
389389
return concat(
390390
[
391-
MTEFileReader(os.path.join(self._root, f)).tagged_paras(
391+
MTEFileReader(os.path.join(str(self._root), f)).tagged_paras(
392392
tagset, tags
393393
)
394394
for f in self.__fileids(fileids)

nltk/data.py

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,17 @@
3939
import re
4040
import sys
4141
import textwrap
42+
import urllib.request
4243
import zipfile
4344
from abc import ABCMeta, abstractmethod
4445
from gzip import WRITE as GZ_WRITE
4546
from gzip import GzipFile
4647
from io import BytesIO, TextIOWrapper
47-
from urllib.request import url2pathname, urlopen
48+
from urllib.request import url2pathname
49+
50+
from nltk.pathsec import ZipFile
51+
from nltk.pathsec import open as _secure_open
52+
from nltk.pathsec import urlopen as _secure_urlopen
4853

4954
# Reject unsafe no-protocol paths: traversal segments, trailing '..', absolute paths,
5055
# backslashes, Windows drive letters. Use a raw-string pattern and do not anchor only
@@ -374,20 +379,13 @@ def path(self):
374379
"""The absolute path identified by this path pointer."""
375380
return self._path
376381

377-
# ==============================
378-
# SECURITY PATCH ENFORCING SANDBOX
379-
# ==============================
380382
def open(self, encoding=None):
381383
"""
382384
Secure open — prevents absolute direct access outside pointer root.
385+
Path validation is enforced by pathsec.open() which checks the
386+
resolved path against allowed NLTK data roots.
383387
"""
384-
path = os.path.normpath(self._path)
385-
386-
# Block direct access when the path is a filesystem root (e.g. "/" or "C:\\").
387-
if os.path.isabs(path) and os.path.dirname(path) == path:
388-
raise ValueError(f"Direct absolute file access blocked: {path}")
389-
390-
stream = open(self._path, "rb")
388+
stream = _secure_open(self._path, "rb")
391389
if encoding is not None:
392390
stream = SeekableUnicodeStreamReader(stream, encoding)
393391
return stream
@@ -499,7 +497,7 @@ def __init__(self, zipfile, entry=""):
499497
@property
500498
def zipfile(self):
501499
"""
502-
The zipfile.ZipFile object used to access the zip file
500+
The ZipFile object used to access the zip file
503501
containing the entry identified by this path pointer.
504502
"""
505503
return self._zipfile
@@ -1116,16 +1114,29 @@ def _open(resource_url):
11161114
loaded from. The default protocol is "nltk:", which searches
11171115
for the file in the the NLTK data package.
11181116
"""
1119-
resource_url = normalize_resource_url(resource_url)
1120-
protocol, path_ = split_resource_url(resource_url)
1117+
# Restore "no protocol" handling for internal resilience
1118+
resource_url = str(resource_url)
1119+
if ":" not in resource_url:
1120+
resource_url = "nltk:" + resource_url
11211121

1122-
if protocol is None or protocol.lower() == "nltk":
1123-
return find(path_, path + [""]).open()
1124-
elif protocol.lower() == "file":
1125-
# urllib might not use mode='rb', so handle this one ourselves:
1126-
return find(path_, [""]).open()
1122+
protocol, path_ = resource_url.split(":", 1)
1123+
1124+
if protocol == "nltk":
1125+
# If find() or .open() raises a ValueError (security) or LookupError,
1126+
# let it bubble up or handle it based on load() logic.
1127+
return find(path_).open()
1128+
elif protocol == "file":
1129+
local_path = url2pathname(path_)
1130+
try:
1131+
# 1. Attempt to use NLTK's standard search paths (Safe/Normalized)
1132+
return find(local_path).open()
1133+
except (LookupError, ValueError):
1134+
# 2. Fallback for absolute paths (e.g., file:///etc/passwd)
1135+
# This ensures even direct file access hits the pathsec sentinel.
1136+
return _secure_open(local_path, "rb")
11271137
else:
1128-
return urlopen(resource_url)
1138+
# Network protocols (http, https, ftp)
1139+
return _secure_urlopen(resource_url)
11291140

11301141

11311142
######################################################################
@@ -1164,9 +1175,9 @@ def __repr__(self):
11641175
######################################################################
11651176

11661177

1167-
class OpenOnDemandZipFile(zipfile.ZipFile):
1178+
class OpenOnDemandZipFile(ZipFile):
11681179
"""
1169-
A subclass of ``zipfile.ZipFile`` that closes its file pointer
1180+
A subclass of ``ZipFile`` that closes its file pointer
11701181
whenever it is not using it; and re-opens it when it needs to read
11711182
data from the zipfile. This is useful for reducing the number of
11721183
open file handles when many zip files are being accessed at once.
@@ -1178,7 +1189,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
11781189
def __init__(self, filename):
11791190
if not isinstance(filename, str):
11801191
raise TypeError("ReopenableZipFile filename must be a string")
1181-
zipfile.ZipFile.__init__(self, filename)
1192+
ZipFile.__init__(self, filename)
11821193
assert self.filename == filename
11831194
self.close()
11841195
# After closing a ZipFile object, the _fileRefCnt needs to be cleared
@@ -1187,12 +1198,11 @@ def __init__(self, filename):
11871198

11881199
def read(self, name):
11891200
assert self.fp is None
1190-
self.fp = open(self.filename, "rb")
1201+
# This will be validated by pathsec.open
1202+
self.fp = _secure_open(self.filename, "rb")
11911203
value = zipfile.ZipFile.read(self, name)
1192-
# Ensure that _fileRefCnt needs to be set for Python2and3 compatible code.
1193-
# Since we only opened one file here, we add 1.
1194-
self._fileRefCnt += 1
1195-
self.close()
1204+
self.fp.close()
1205+
self.fp = None
11961206
return value
11971207

11981208
def write(self, *args, **kwargs):

nltk/downloader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,10 @@
171171
import zipfile
172172
from hashlib import md5, sha256
173173
from urllib.error import HTTPError, URLError
174-
from urllib.request import urlopen
175174
from xml.etree import ElementTree
176175

177176
import nltk
177+
from nltk.pathsec import ZipFile, urlopen
178178

179179
# urllib2 = nltk.internals.import_from_stdlib('urllib2')
180180

@@ -2392,7 +2392,7 @@ def _unzip_iter(filename, root, verbose=True):
23922392
sys.stdout.flush()
23932393

23942394
try:
2395-
zf = zipfile.ZipFile(filename)
2395+
zf = ZipFile(filename)
23962396
except Exception as e:
23972397
yield ErrorMessage(filename, e)
23982398
# Flush the "Unzipping ..." line here because the try/finally that
@@ -2593,7 +2593,7 @@ def _find_packages(root):
25932593
``(pkg_xml, zf, subdir)``, where:
25942594
- ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
25952595
package
2596-
- ``zf`` is a ``zipfile.ZipFile`` for the package's contents.
2596+
- ``zf`` is a ``ZipFile`` for the package's contents.
25972597
- ``subdir`` is the subdirectory (relative to ``root``) where
25982598
the package was found (e.g. 'corpora' or 'grammars').
25992599
"""
@@ -2608,7 +2608,7 @@ def _find_packages(root):
26082608
xmlfilename = os.path.join(dirname, filename)
26092609
zipfilename = xmlfilename[:-4] + ".zip"
26102610
try:
2611-
zf = zipfile.ZipFile(zipfilename)
2611+
zf = ZipFile(zipfilename)
26122612
except Exception as e:
26132613
raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e
26142614
try:

0 commit comments

Comments
 (0)