1
1
from __future__ import annotations
2
2
3
- import hashlib
4
3
import os
5
- import sys
6
- import tarfile
7
- import tempfile
8
- import urllib .request
9
4
from functools import lru_cache
10
5
from typing import Final , List , Tuple
11
6
16
11
17
12
CACHE_MAX_SIZE : Final [int ] = 128
18
13
19
- NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz"
20
- NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{ NLTK_DATA_FILENAME } "
21
- NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663"
22
-
23
-
24
- # NOTE(robinson) - mimic default dir logic from NLTK
25
- # https://github.com/nltk/nltk/
26
- # blob/8c233dc585b91c7a0c58f96a9d99244a379740d5/nltk/downloader.py#L1046
27
- def get_nltk_data_dir () -> str | None :
28
- """Locates the directory the nltk data will be saved to. The directory
29
- set by the NLTK environment variable takes highest precedence. Otherwise
30
- the default is determined by the rules indicated below. Returns None when
31
- the directory is not writable.
32
-
33
- On Windows, the default download directory is
34
- ``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the
35
- directory containing Python, e.g. ``C:\\Python311``.
36
-
37
- On all other platforms, the default directory is the first of
38
- the following which exists or which can be created with write
39
- permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
40
- ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
41
- """
42
- # Check if we are on GAE where we cannot write into filesystem.
43
- if "APPENGINE_RUNTIME" in os .environ :
44
- return
45
-
46
- # Check if we have sufficient permissions to install in a
47
- # variety of system-wide locations.
48
- for nltkdir in nltk .data .path :
49
- if os .path .exists (nltkdir ) and nltk .internals .is_writable (nltkdir ):
50
- return nltkdir
51
-
52
- # On Windows, use %APPDATA%
53
- if sys .platform == "win32" and "APPDATA" in os .environ :
54
- homedir = os .environ ["APPDATA" ]
55
-
56
- # Otherwise, install in the user's home directory.
57
- else :
58
- homedir = os .path .expanduser ("~/" )
59
- if homedir == "~/" :
60
- raise ValueError ("Could not find a default download directory" )
61
-
62
- # NOTE(robinson) - NLTK appends nltk_data to the homedir. That's already
63
- # present in the tar file so we don't have to do that here.
64
- return homedir
65
-
66
14
67
15
def download_nltk_packages ():
68
- nltk_data_dir = get_nltk_data_dir ()
69
-
70
- if nltk_data_dir is None :
71
- raise OSError ("NLTK data directory does not exist or is not writable." )
72
-
73
- # Check if the path ends with "nltk_data" and remove it if it does
74
- if nltk_data_dir .endswith ("nltk_data" ):
75
- nltk_data_dir = os .path .dirname (nltk_data_dir )
76
-
77
- def sha256_checksum (filename : str , block_size : int = 65536 ):
78
- sha256 = hashlib .sha256 ()
79
- with open (filename , "rb" ) as f :
80
- for block in iter (lambda : f .read (block_size ), b"" ):
81
- sha256 .update (block )
82
- return sha256 .hexdigest ()
83
-
84
- with tempfile .TemporaryDirectory () as temp_dir_path :
85
- tgz_file_path = os .path .join (temp_dir_path , NLTK_DATA_FILENAME )
86
- urllib .request .urlretrieve (NLTK_DATA_URL , tgz_file_path )
87
-
88
- file_hash = sha256_checksum (tgz_file_path )
89
- if file_hash != NLTK_DATA_SHA256 :
90
- os .remove (tgz_file_path )
91
- raise ValueError (f"SHA-256 mismatch: expected { NLTK_DATA_SHA256 } , got { file_hash } " )
92
-
93
- # Extract the contents
94
- if not os .path .exists (nltk_data_dir ):
95
- os .makedirs (nltk_data_dir )
96
-
97
- with tarfile .open (tgz_file_path , "r:gz" ) as tar :
98
- tar .extractall (path = nltk_data_dir )
16
+ nltk .download ("averaged_perceptron_tagger_eng" , quiet = True )
17
+ nltk .download ("punkt_tab" , quiet = True )
99
18
100
19
101
20
def check_for_nltk_package (package_name : str , package_category : str ) -> bool :
@@ -109,10 +28,13 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
109
28
try :
110
29
nltk .find (f"{ package_category } /{ package_name } " , paths = paths )
111
30
return True
112
- except LookupError :
31
+ except ( LookupError , OSError ) :
113
32
return False
114
33
115
34
35
+ # We cache this because we do not want to attempt
36
+ # downloading the packages multiple times
37
+ @lru_cache ()
116
38
def _download_nltk_packages_if_not_present ():
117
39
"""If required NLTK packages are not available, download them."""
118
40
0 commit comments