Skip to content

Commit 0fb814d

Browse files
authored
Use native NLTK download (#3796)
This PR changes how we download NLTK data so that it uses the native NLTK downloader. We had moved to our own hosted NLTK dataset because of this CVE: https://nvd.nist.gov/vuln/detail/CVE-2024-39705 (ref: #3361). The latest versions of NLTK have fixed this issue; see the NLTK changelog: https://github.com/nltk/nltk/blob/develop/ChangeLog
1 parent 9445a2d commit 0fb814d

File tree

4 files changed

+12
-86
lines changed

4 files changed

+12
-86
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
## 0.16.9-dev0
1+
## 0.16.9
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
- **Fix NLTK Download** to not download from unstructured S3 Bucket
10+
911
## 0.16.8
1012

1113
### Enhancements

test_unstructured/nlp/test_tokenize.py

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99

1010
def test_nltk_packages_download_if_not_present():
11+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1112
with patch.object(nltk, "find", side_effect=LookupError):
1213
with patch.object(tokenize, "download_nltk_packages") as mock_download:
1314
tokenize._download_nltk_packages_if_not_present()
@@ -16,6 +17,7 @@ def test_nltk_packages_download_if_not_present():
1617

1718

1819
def test_nltk_packages_do_not_download_if():
20+
tokenize._download_nltk_packages_if_not_present.cache_clear()
1921
with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
2022
tokenize._download_nltk_packages_if_not_present()
2123

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.9-dev0" # pragma: no cover
1+
__version__ = "0.16.9" # pragma: no cover

unstructured/nlp/tokenize.py

+6-84
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
from __future__ import annotations
22

3-
import hashlib
43
import os
5-
import sys
6-
import tarfile
7-
import tempfile
8-
import urllib.request
94
from functools import lru_cache
105
from typing import Final, List, Tuple
116

@@ -16,86 +11,10 @@
1611

1712
CACHE_MAX_SIZE: Final[int] = 128
1813

19-
NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
20-
NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}"
21-
NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
22-
23-
24-
# NOTE(robinson) - mimic default dir logic from NLTK
25-
# https://github.com/nltk/nltk/
26-
# blob/8c233dc585b91c7a0c58f96a9d99244a379740d5/nltk/downloader.py#L1046
27-
def get_nltk_data_dir() -> str | None:
28-
"""Locates the directory the nltk data will be saved too. The directory
29-
set by the NLTK environment variable takes highest precedence. Otherwise
30-
the default is determined by the rules indicated below. Returns None when
31-
the directory is not writable.
32-
33-
On Windows, the default download directory is
34-
``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the
35-
directory containing Python, e.g. ``C:\\Python311``.
36-
37-
On all other platforms, the default directory is the first of
38-
the following which exists or which can be created with write
39-
permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
40-
``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
41-
"""
42-
# Check if we are on GAE where we cannot write into filesystem.
43-
if "APPENGINE_RUNTIME" in os.environ:
44-
return
45-
46-
# Check if we have sufficient permissions to install in a
47-
# variety of system-wide locations.
48-
for nltkdir in nltk.data.path:
49-
if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir):
50-
return nltkdir
51-
52-
# On Windows, use %APPDATA%
53-
if sys.platform == "win32" and "APPDATA" in os.environ:
54-
homedir = os.environ["APPDATA"]
55-
56-
# Otherwise, install in the user's home directory.
57-
else:
58-
homedir = os.path.expanduser("~/")
59-
if homedir == "~/":
60-
raise ValueError("Could not find a default download directory")
61-
62-
# NOTE(robinson) - NLTK appends nltk_data to the homedir. That's already
63-
# present in the tar file so we don't have to do that here.
64-
return homedir
65-
6614

6715
def download_nltk_packages():
68-
nltk_data_dir = get_nltk_data_dir()
69-
70-
if nltk_data_dir is None:
71-
raise OSError("NLTK data directory does not exist or is not writable.")
72-
73-
# Check if the path ends with "nltk_data" and remove it if it does
74-
if nltk_data_dir.endswith("nltk_data"):
75-
nltk_data_dir = os.path.dirname(nltk_data_dir)
76-
77-
def sha256_checksum(filename: str, block_size: int = 65536):
78-
sha256 = hashlib.sha256()
79-
with open(filename, "rb") as f:
80-
for block in iter(lambda: f.read(block_size), b""):
81-
sha256.update(block)
82-
return sha256.hexdigest()
83-
84-
with tempfile.TemporaryDirectory() as temp_dir_path:
85-
tgz_file_path = os.path.join(temp_dir_path, NLTK_DATA_FILENAME)
86-
urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file_path)
87-
88-
file_hash = sha256_checksum(tgz_file_path)
89-
if file_hash != NLTK_DATA_SHA256:
90-
os.remove(tgz_file_path)
91-
raise ValueError(f"SHA-256 mismatch: expected {NLTK_DATA_SHA256}, got {file_hash}")
92-
93-
# Extract the contents
94-
if not os.path.exists(nltk_data_dir):
95-
os.makedirs(nltk_data_dir)
96-
97-
with tarfile.open(tgz_file_path, "r:gz") as tar:
98-
tar.extractall(path=nltk_data_dir)
16+
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
17+
nltk.download("punkt_tab", quiet=True)
9918

10019

10120
def check_for_nltk_package(package_name: str, package_category: str) -> bool:
@@ -109,10 +28,13 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
10928
try:
11029
nltk.find(f"{package_category}/{package_name}", paths=paths)
11130
return True
112-
except LookupError:
31+
except (LookupError, OSError):
11332
return False
11433

11534

35+
# We cache this because we do not want to attempt
36+
# downloading the packages multiple times
37+
@lru_cache()
11638
def _download_nltk_packages_if_not_present():
11739
"""If required NLTK packages are not available, download them."""
11840

0 commit comments

Comments
 (0)