ekaf
diff --git a/SECURITY.md b/SECURITY.md
Lines changed: 82 additions & 0 deletions
diff --git a/nltk/corpus/reader/api.py b/nltk/corpus/reader/api.py
Lines changed: 17 additions & 12 deletions
diff --git a/nltk/corpus/reader/childes.py b/nltk/corpus/reader/childes.py
Lines changed: 5 additions & 2 deletions
diff --git a/nltk/corpus/reader/mte.py b/nltk/corpus/reader/mte.py
Lines changed: 9 additions & 9 deletions
diff --git a/nltk/data.py b/nltk/data.py
Lines changed: 38 additions & 28 deletions
diff --git a/nltk/downloader.py b/nltk/downloader.py
Lines changed: 4 additions & 4 deletions
@@ -3,3 +3,85 @@
 ## Reporting a Vulnerability
 
 Please report security issues to `nltk.team@gmail.com`
+
+## Security Hardening
+
+NLTK includes a centralized I/O security module (`nltk.pathsec`) that
+validates file paths, network URLs, and zip archives. During the initial
+transition phase, it operates by default in **warn-only mode**, to avoid
+breaking existing workflows. In a later release, it will be switched to
+enforce the stricter security policy by default.
+
+### Enabling strict enforcement
+
+If you are running NLTK in a security-sensitive environment (web
+applications, multi-tenant pipelines, CI/CD systems, or any context
+where untrusted input may reach NLTK), you should enable strict
+enforcement:
+
+```python
+import nltk.pathsec
+nltk.pathsec.ENFORCE = True
+```
+
+With `ENFORCE = True`, unauthorized file access, SSRF attempts, and
+zip-slip attacks will raise `PermissionError` instead of emitting
+warnings.
+
+
+### Current Working Directory (CWD) Access
+
+To maintain a "zero-friction" experience for students and researchers,
+NLTK permits access to resources located in the process's current
+working directory by default.
+
+* **Standard Mode (`ENFORCE=False`):** Accessing data in the CWD is
+permitted but triggers a `RuntimeWarning` to alert users that this
+behavior may be insecure in shared or server-side environments.
+
+* **Strict Mode (`ENFORCE=True`):** Implicit CWD access is **disabled**.
+To authorize the local directory in strict mode, users must explicitly
+append it to the search path:
+  ```python
+  import nltk
+  nltk.data.path.append('.')
+  ```
+
+
+### What is protected
+
+- **Path traversal**: file access is validated against allowed NLTK
+  data directories (`nltk.data.path`, `NLTK_DATA` environment
+  variable, and standard system locations).
+- **SSRF prevention**: `urlopen` resolves hostnames via DNS and blocks
+  requests to loopback, private, link-local, and multicast IP ranges,
+  including obfuscated forms (e.g. decimal IP notation).
+- **Zip-slip protection**: zip extraction validates that member paths
+  stay within the target directory.
+- **Pickle safety**: `nltk.data.load()` uses `RestrictedUnpickler`,
+  which blocks all class/function globals. Other pickle loading uses
+  `pickle_load()`, which emits a security warning.
+
+### Configuring allowed data paths
+
+NLTK determines allowed data directories from:
+
+1. `nltk.data.path` (configurable at runtime)
+2. `NLTK_DATA` environment variable
+3. Standard locations (`~/nltk_data`, `/usr/share/nltk_data`, etc.)
+4. System temp directory
+
+If you use a custom data location, add it to `nltk.data.path`:
+
+```python
+import nltk
+nltk.data.path.append('/my/custom/data')
+```
+
+### Note on symlinks
+
+NLTK's corpus readers perform lexical path containment checks when
+joining file paths. These checks do not resolve symlinks. If your
+threat model includes attackers who can place symlinks inside your
+NLTK data directories, set `nltk.pathsec.ENFORCE = True` for full
+path resolution and validation.
@@ -221,22 +221,27 @@ def raw(self, fileids=None):
 
     def open(self, file):
         """
-        Return an open stream that can be used to read the given file.
-        Security patched: prevents path traversal & absolute path access.
+        Return an open stream for the given file.
+        Security patched: prevents path traversal and scoped escapes.
         """
-        # -------- SECURITY PATCH START --------
-        file = str(file)
+        # Layer 1: Lexical guard
+        if os.path.isabs(file) or ".." in file.replace("\\", "/"):
+            raise ValueError(f"CorpusReader paths must be relative: {file}")
 
-        if os.path.isabs(file):
-            raise ValueError("Absolute paths are not allowed")
+        path = self._root.join(file)
 
-        if ".." in file.replace("\\", "/").split("/"):
-            raise ValueError("Path traversal attempt blocked")
-        # -------- SECURITY PATCH END --------
+        # Layer 2: Scoped resolved guard (Fixes symlink escape test)
+        from nltk.pathsec import validate_path
 
-        encoding = self.encoding(file)
-        stream = self._root.join(file).open(encoding)
-        return stream
+        validate_path(path, context="CorpusReader", required_root=self._root)
+
+        # --- FIX: Handle dict-based encodings (e.g., UDHR corpus) ---
+        encoding = self._encoding
+        if isinstance(encoding, dict):
+            encoding = encoding.get(file)
+
+        # Layer 3: Global sentinel check happens inside path.open()
+        return path.open(encoding=encoding)
 
     def encoding(self, file):
         """
 
@@ -620,8 +620,11 @@ def demo(corpus_root=None):
         demo('/path/to/childes/data-xml/Eng-USA/")
         """
         )
-        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
-        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
+
+        # To test remote fetching securely, use the pathsec wrapper:
+        # from nltk.pathsec import urlopen, ZipFile
+        # corpus_root_http = urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
+        # corpus_root_http_bates = ZipFile(io.BytesIO(corpus_root_http.read()))
         ##this fails
         # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())
 
 
@@ -250,7 +250,7 @@ def words(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).words()
+                MTEFileReader(os.path.join(str(self._root), f)).words()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -264,7 +264,7 @@ def sents(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).sents()
+                MTEFileReader(os.path.join(str(self._root), f)).sents()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -278,7 +278,7 @@ def paras(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).paras()
+                MTEFileReader(os.path.join(str(self._root), f)).paras()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -292,7 +292,7 @@ def lemma_words(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).lemma_words()
+                MTEFileReader(os.path.join(str(self._root), f)).lemma_words()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -311,7 +311,7 @@ def tagged_words(self, fileids=None, tagset="msd", tags=""):
         if tagset == "universal" or tagset == "msd":
             return concat(
                 [
-                    MTEFileReader(os.path.join(self._root, f)).tagged_words(
+                    MTEFileReader(os.path.join(str(self._root), f)).tagged_words(
                         tagset, tags
                     )
                     for f in self.__fileids(fileids)
@@ -330,7 +330,7 @@ def lemma_sents(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).lemma_sents()
+                MTEFileReader(os.path.join(str(self._root), f)).lemma_sents()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -349,7 +349,7 @@ def tagged_sents(self, fileids=None, tagset="msd", tags=""):
         if tagset == "universal" or tagset == "msd":
             return concat(
                 [
-                    MTEFileReader(os.path.join(self._root, f)).tagged_sents(
+                    MTEFileReader(os.path.join(str(self._root), f)).tagged_sents(
                         tagset, tags
                     )
                     for f in self.__fileids(fileids)
@@ -368,7 +368,7 @@ def lemma_paras(self, fileids=None):
         """
         return concat(
             [
-                MTEFileReader(os.path.join(self._root, f)).lemma_paras()
+                MTEFileReader(os.path.join(str(self._root), f)).lemma_paras()
                 for f in self.__fileids(fileids)
             ]
         )
@@ -388,7 +388,7 @@ def tagged_paras(self, fileids=None, tagset="msd", tags=""):
         if tagset == "universal" or tagset == "msd":
             return concat(
                 [
-                    MTEFileReader(os.path.join(self._root, f)).tagged_paras(
+                    MTEFileReader(os.path.join(str(self._root), f)).tagged_paras(
                         tagset, tags
                     )
                     for f in self.__fileids(fileids)
 
@@ -39,12 +39,17 @@
 import re
 import sys
 import textwrap
+import urllib.request
 import zipfile
 from abc import ABCMeta, abstractmethod
 from gzip import WRITE as GZ_WRITE
 from gzip import GzipFile
 from io import BytesIO, TextIOWrapper
-from urllib.request import url2pathname, urlopen
+from urllib.request import url2pathname
+
+from nltk.pathsec import ZipFile
+from nltk.pathsec import open as _secure_open
+from nltk.pathsec import urlopen as _secure_urlopen
 
 # Reject unsafe no-protocol paths: traversal segments, trailing '..', absolute paths,
 # backslashes, Windows drive letters. Use a raw-string pattern and do not anchor only
@@ -374,20 +379,13 @@ def path(self):
         """The absolute path identified by this path pointer."""
         return self._path
 
-    # ==============================
-    # SECURITY PATCH ENFORCING SANDBOX
-    # ==============================
     def open(self, encoding=None):
         """
         Secure open — prevents absolute direct access outside pointer root.
+        Path validation is enforced by pathsec.open() which checks the
+        resolved path against allowed NLTK data roots.
         """
-        path = os.path.normpath(self._path)
-
-        # Block direct access when the path is a filesystem root (e.g. "/" or "C:\\").
-        if os.path.isabs(path) and os.path.dirname(path) == path:
-            raise ValueError(f"Direct absolute file access blocked: {path}")
-
-        stream = open(self._path, "rb")
+        stream = _secure_open(self._path, "rb")
         if encoding is not None:
             stream = SeekableUnicodeStreamReader(stream, encoding)
         return stream
@@ -499,7 +497,7 @@ def __init__(self, zipfile, entry=""):
     @property
     def zipfile(self):
         """
-        The zipfile.ZipFile object used to access the zip file
+        The ZipFile object used to access the zip file
         containing the entry identified by this path pointer.
         """
         return self._zipfile
@@ -1116,16 +1114,29 @@ def _open(resource_url):
         loaded from.  The default protocol is "nltk:", which searches
         for the file in the NLTK data package.
     """
-    resource_url = normalize_resource_url(resource_url)
-    protocol, path_ = split_resource_url(resource_url)
+    # Restore "no protocol" handling for internal resilience
+    resource_url = str(resource_url)
+    if ":" not in resource_url:
+        resource_url = "nltk:" + resource_url
 
-    if protocol is None or protocol.lower() == "nltk":
-        return find(path_, path + [""]).open()
-    elif protocol.lower() == "file":
-        # urllib might not use mode='rb', so handle this one ourselves:
-        return find(path_, [""]).open()
+    protocol, path_ = resource_url.split(":", 1)
+
+    if protocol == "nltk":
+        # If find() or .open() raises a ValueError (security) or LookupError,
+        # let it bubble up or handle it based on load() logic.
+        return find(path_).open()
+    elif protocol == "file":
+        local_path = url2pathname(path_)
+        try:
+            # 1. Attempt to use NLTK's standard search paths (Safe/Normalized)
+            return find(local_path).open()
+        except (LookupError, ValueError):
+            # 2. Fallback for absolute paths (e.g., file:///etc/passwd)
+            # This ensures even direct file access hits the pathsec sentinel.
+            return _secure_open(local_path, "rb")
     else:
-        return urlopen(resource_url)
+        # Network protocols (http, https, ftp)
+        return _secure_urlopen(resource_url)
 
 
 ######################################################################
@@ -1164,9 +1175,9 @@ def __repr__(self):
 ######################################################################
 
 
-class OpenOnDemandZipFile(zipfile.ZipFile):
+class OpenOnDemandZipFile(ZipFile):
     """
-    A subclass of ``zipfile.ZipFile`` that closes its file pointer
+    A subclass of ``ZipFile`` that closes its file pointer
     whenever it is not using it; and re-opens it when it needs to read
     data from the zipfile.  This is useful for reducing the number of
     open file handles when many zip files are being accessed at once.
@@ -1178,7 +1189,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
     def __init__(self, filename):
         if not isinstance(filename, str):
             raise TypeError("ReopenableZipFile filename must be a string")
-        zipfile.ZipFile.__init__(self, filename)
+        ZipFile.__init__(self, filename)
         assert self.filename == filename
         self.close()
         # After closing a ZipFile object, the _fileRefCnt needs to be cleared
@@ -1187,12 +1198,11 @@ def __init__(self, filename):
 
     def read(self, name):
         assert self.fp is None
-        self.fp = open(self.filename, "rb")
+        # This will be validated by pathsec.open
+        self.fp = _secure_open(self.filename, "rb")
         value = zipfile.ZipFile.read(self, name)
-        # Ensure that _fileRefCnt needs to be set for Python2and3 compatible code.
-        # Since we only opened one file here, we add 1.
-        self._fileRefCnt += 1
-        self.close()
+        self.fp.close()
+        self.fp = None
         return value
 
     def write(self, *args, **kwargs):
 
@@ -171,10 +171,10 @@
 import zipfile
 from hashlib import md5, sha256
 from urllib.error import HTTPError, URLError
-from urllib.request import urlopen
 from xml.etree import ElementTree
 
 import nltk
+from nltk.pathsec import ZipFile, urlopen
 
 # urllib2 = nltk.internals.import_from_stdlib('urllib2')
 
@@ -2392,7 +2392,7 @@ def _unzip_iter(filename, root, verbose=True):
         sys.stdout.flush()
 
     try:
-        zf = zipfile.ZipFile(filename)
+        zf = ZipFile(filename)
     except Exception as e:
         yield ErrorMessage(filename, e)
         # Flush the "Unzipping ..." line here because the try/finally that
@@ -2593,7 +2593,7 @@ def _find_packages(root):
     ``(pkg_xml, zf, subdir)``, where:
       - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
         package
-      - ``zf`` is a ``zipfile.ZipFile`` for the package's contents.
+      - ``zf`` is a ``ZipFile`` for the package's contents.
       - ``subdir`` is the subdirectory (relative to ``root``) where
         the package was found (e.g. 'corpora' or 'grammars').
     """
@@ -2608,7 +2608,7 @@ def _find_packages(root):
                 xmlfilename = os.path.join(dirname, filename)
                 zipfilename = xmlfilename[:-4] + ".zip"
                 try:
-                    zf = zipfile.ZipFile(zipfilename)
+                    zf = ZipFile(zipfilename)
                 except Exception as e:
                     raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e
                 try:
Original file line number	Diff line number	Diff line change
`@@ -250,7 +250,7 @@ def words(self, fileids=None):`
`250`	`250`	`"""`
`251`	`251`	`return concat(`
`252`	`252`	`[`
`253`		`- MTEFileReader(os.path.join(self._root, f)).words()`
	`253`	`+ MTEFileReader(os.path.join(str(self._root), f)).words()`
`254`	`254`	`for f in self.__fileids(fileids)`
`255`	`255`	`]`
`256`	`256`	`)`
`@@ -264,7 +264,7 @@ def sents(self, fileids=None):`
`264`	`264`	`"""`
`265`	`265`	`return concat(`
`266`	`266`	`[`
`267`		`- MTEFileReader(os.path.join(self._root, f)).sents()`
	`267`	`+ MTEFileReader(os.path.join(str(self._root), f)).sents()`
`268`	`268`	`for f in self.__fileids(fileids)`
`269`	`269`	`]`
`270`	`270`	`)`
`@@ -278,7 +278,7 @@ def paras(self, fileids=None):`
`278`	`278`	`"""`
`279`	`279`	`return concat(`
`280`	`280`	`[`
`281`		`- MTEFileReader(os.path.join(self._root, f)).paras()`
	`281`	`+ MTEFileReader(os.path.join(str(self._root), f)).paras()`
`282`	`282`	`for f in self.__fileids(fileids)`
`283`	`283`	`]`
`284`	`284`	`)`
`@@ -292,7 +292,7 @@ def lemma_words(self, fileids=None):`
`292`	`292`	`"""`
`293`	`293`	`return concat(`
`294`	`294`	`[`
`295`		`- MTEFileReader(os.path.join(self._root, f)).lemma_words()`
	`295`	`+ MTEFileReader(os.path.join(str(self._root), f)).lemma_words()`
`296`	`296`	`for f in self.__fileids(fileids)`
`297`	`297`	`]`
`298`	`298`	`)`
`@@ -311,7 +311,7 @@ def tagged_words(self, fileids=None, tagset="msd", tags=""):`
`311`	`311`	`if tagset == "universal" or tagset == "msd":`
`312`	`312`	`return concat(`
`313`	`313`	`[`
`314`		`- MTEFileReader(os.path.join(self._root, f)).tagged_words(`
	`314`	`+ MTEFileReader(os.path.join(str(self._root), f)).tagged_words(`
`315`	`315`	`tagset, tags`
`316`	`316`	`)`
`317`	`317`	`for f in self.__fileids(fileids)`
`@@ -330,7 +330,7 @@ def lemma_sents(self, fileids=None):`
`330`	`330`	`"""`
`331`	`331`	`return concat(`
`332`	`332`	`[`
`333`		`- MTEFileReader(os.path.join(self._root, f)).lemma_sents()`
	`333`	`+ MTEFileReader(os.path.join(str(self._root), f)).lemma_sents()`
`334`	`334`	`for f in self.__fileids(fileids)`
`335`	`335`	`]`
`336`	`336`	`)`
`@@ -349,7 +349,7 @@ def tagged_sents(self, fileids=None, tagset="msd", tags=""):`
`349`	`349`	`if tagset == "universal" or tagset == "msd":`
`350`	`350`	`return concat(`
`351`	`351`	`[`
`352`		`- MTEFileReader(os.path.join(self._root, f)).tagged_sents(`
	`352`	`+ MTEFileReader(os.path.join(str(self._root), f)).tagged_sents(`
`353`	`353`	`tagset, tags`
`354`	`354`	`)`
`355`	`355`	`for f in self.__fileids(fileids)`
`@@ -368,7 +368,7 @@ def lemma_paras(self, fileids=None):`
`368`	`368`	`"""`
`369`	`369`	`return concat(`
`370`	`370`	`[`
`371`		`- MTEFileReader(os.path.join(self._root, f)).lemma_paras()`
	`371`	`+ MTEFileReader(os.path.join(str(self._root), f)).lemma_paras()`
`372`	`372`	`for f in self.__fileids(fileids)`
`373`	`373`	`]`
`374`	`374`	`)`
`@@ -388,7 +388,7 @@ def tagged_paras(self, fileids=None, tagset="msd", tags=""):`
`388`	`388`	`if tagset == "universal" or tagset == "msd":`
`389`	`389`	`return concat(`
`390`	`390`	`[`
`391`		`- MTEFileReader(os.path.join(self._root, f)).tagged_paras(`
	`391`	`+ MTEFileReader(os.path.join(str(self._root), f)).tagged_paras(`
`392`	`392`	`tagset, tags`
`393`	`393`	`)`
`394`	`394`	`for f in self.__fileids(fileids)`