3939import re
4040import sys
4141import textwrap
42+ import urllib .request
4243import zipfile
4344from abc import ABCMeta , abstractmethod
4445from gzip import WRITE as GZ_WRITE
4546from gzip import GzipFile
4647from io import BytesIO , TextIOWrapper
47- from urllib .request import url2pathname , urlopen
48+ from urllib .request import url2pathname
49+
50+ from nltk .pathsec import ZipFile
51+ from nltk .pathsec import open as _secure_open
52+ from nltk .pathsec import urlopen as _secure_urlopen
4853
4954# Reject unsafe no-protocol paths: traversal segments, trailing '..', absolute paths,
5055# backslashes, Windows drive letters. Use a raw-string pattern and do not anchor only
@@ -374,20 +379,13 @@ def path(self):
374379 """The absolute path identified by this path pointer."""
375380 return self ._path
376381
377- # ==============================
378- # SECURITY PATCH ENFORCING SANDBOX
379- # ==============================
380382 def open (self , encoding = None ):
381383 """
382384 Secure open — prevents absolute direct access outside pointer root.
385+ Path validation is enforced by pathsec.open() which checks the
386+ resolved path against allowed NLTK data roots.
383387 """
384- path = os .path .normpath (self ._path )
385-
386- # Block direct access when the path is a filesystem root (e.g. "/" or "C:\\").
387- if os .path .isabs (path ) and os .path .dirname (path ) == path :
388- raise ValueError (f"Direct absolute file access blocked: { path } " )
389-
390- stream = open (self ._path , "rb" )
388+ stream = _secure_open (self ._path , "rb" )
391389 if encoding is not None :
392390 stream = SeekableUnicodeStreamReader (stream , encoding )
393391 return stream
@@ -499,7 +497,7 @@ def __init__(self, zipfile, entry=""):
499497 @property
500498 def zipfile (self ):
501499 """
502- The zipfile. ZipFile object used to access the zip file
500+ The ZipFile object used to access the zip file
503501 containing the entry identified by this path pointer.
504502 """
505503 return self ._zipfile
@@ -1116,16 +1114,29 @@ def _open(resource_url):
11161114 loaded from. The default protocol is "nltk:", which searches
11171115 for the file in the NLTK data package.
11181116 """
1119- resource_url = normalize_resource_url (resource_url )
1120- protocol , path_ = split_resource_url (resource_url )
1117+ # Restore "no protocol" handling for internal resilience
1118+ resource_url = str (resource_url )
1119+ if ":" not in resource_url :
1120+ resource_url = "nltk:" + resource_url
11211121
1122- if protocol is None or protocol .lower () == "nltk" :
1123- return find (path_ , path + ["" ]).open ()
1124- elif protocol .lower () == "file" :
1125- # urllib might not use mode='rb', so handle this one ourselves:
1126- return find (path_ , ["" ]).open ()
1122+ protocol , path_ = resource_url .split (":" , 1 )
1123+
1124+ if protocol == "nltk" :
1125+ # If find() or .open() raises a ValueError (security) or LookupError,
1126+ # let it bubble up or handle it based on load() logic.
1127+ return find (path_ ).open ()
1128+ elif protocol == "file" :
1129+ local_path = url2pathname (path_ )
1130+ try :
1131+ # 1. Attempt to use NLTK's standard search paths (Safe/Normalized)
1132+ return find (local_path ).open ()
1133+ except (LookupError , ValueError ):
1134+ # 2. Fallback for absolute paths (e.g., file:///etc/passwd)
1135+ # This ensures even direct file access hits the pathsec sentinel.
1136+ return _secure_open (local_path , "rb" )
11271137 else :
1128- return urlopen (resource_url )
1138+ # Network protocols (http, https, ftp)
1139+ return _secure_urlopen (resource_url )
11291140
11301141
11311142######################################################################
@@ -1164,9 +1175,9 @@ def __repr__(self):
11641175######################################################################
11651176
11661177
1167- class OpenOnDemandZipFile (zipfile . ZipFile ):
1178+ class OpenOnDemandZipFile (ZipFile ):
11681179 """
1169- A subclass of ``zipfile. ZipFile`` that closes its file pointer
1180+ A subclass of ``ZipFile`` that closes its file pointer
11701181 whenever it is not using it; and re-opens it when it needs to read
11711182 data from the zipfile. This is useful for reducing the number of
11721183 open file handles when many zip files are being accessed at once.
@@ -1178,7 +1189,7 @@ class OpenOnDemandZipFile(zipfile.ZipFile):
11781189 def __init__ (self , filename ):
11791190 if not isinstance (filename , str ):
11801191 raise TypeError ("ReopenableZipFile filename must be a string" )
1181- zipfile . ZipFile .__init__ (self , filename )
1192+ ZipFile .__init__ (self , filename )
11821193 assert self .filename == filename
11831194 self .close ()
11841195 # After closing a ZipFile object, the _fileRefCnt needs to be cleared
@@ -1187,12 +1198,11 @@ def __init__(self, filename):
11871198
11881199 def read (self , name ):
11891200 assert self .fp is None
1190- self .fp = open (self .filename , "rb" )
1201+ # This will be validated by pathsec.open
1202+ self .fp = _secure_open (self .filename , "rb" )
11911203 value = zipfile .ZipFile .read (self , name )
1192- # Ensure that _fileRefCnt needs to be set for Python2and3 compatible code.
1193- # Since we only opened one file here, we add 1.
1194- self ._fileRefCnt += 1
1195- self .close ()
1204+ self .fp .close ()
1205+ self .fp = None
11961206 return value
11971207
11981208 def write (self , * args , ** kwargs ):
0 commit comments