Commit a447b81

Added auto_download logic to download NLTK data at runtime (#3883)
- **Add auto-download for NLTK for Python Environment** When a user imports `tokenize`, NLTK data is downloaded automatically. Added an `AUTO_DOWNLOAD_NLTK` flag in `tokenize.py` to gate the `NLTK_DATA` download.
1 parent 9e5ff22 commit a447b81

File tree

8 files changed, +30 −14 lines


CHANGELOG.md

+2-2

```diff
@@ -1,17 +1,17 @@
-## 0.16.16-dev2
+## 0.16.16
 
 ### Enhancements
 
 ### Features
 - **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
 
 ### Fixes
+- **Add auto-download for NLTK for Python Environment** When a user imports `tokenize`, NLTK data is downloaded automatically. Added an `AUTO_DOWNLOAD_NLTK` flag in `tokenize.py` to gate the `NLTK_DATA` download.
 - **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid), resulting in unnecessary OCR fallback.
 
 * **Drop usage of ndjson dependency**
 
 ## 0.16.15
-
 ### Enhancements
 
 ### Features
```
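The new `AUTO_DOWNLOAD_NLTK` flag is read as a plain string from the environment. A minimal sketch of the gating predicate (the helper name and the explicit `env` parameter are illustrative; the actual check in `tokenize.py` reads `os.getenv` directly at import time):

```python
def auto_download_enabled(env: dict) -> bool:
    # Mirrors the check in tokenize.py: downloads run unless the flag is
    # set to something other than "true" (case-insensitive). Note that
    # values like "1" or "yes" do NOT enable the download under this rule.
    return env.get("AUTO_DOWNLOAD_NLTK", "True").lower() == "true"

print(auto_download_enabled({}))                               # unset: enabled
print(auto_download_enabled({"AUTO_DOWNLOAD_NLTK": "FALSE"}))  # disabled
```

Setting `AUTO_DOWNLOAD_NLTK=false` in the environment before importing `unstructured.nlp.tokenize` therefore skips the download entirely.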

requirements/base.txt

+1-1

```diff
@@ -64,7 +64,7 @@ langdetect==1.0.9
     # via -r ./base.in
 lxml==5.3.0
     # via -r ./base.in
-marshmallow==3.25.1
+marshmallow==3.26.0
     # via
     #   dataclasses-json
     #   unstructured-client
```

requirements/extra-paddleocr.txt

+1-1

```diff
@@ -32,7 +32,7 @@ exceptiongroup==1.2.2
     # via
     #   -c ./base.txt
     #   anyio
-fonttools==4.55.4
+fonttools==4.55.5
     # via matplotlib
 h11==0.14.0
     # via
```

requirements/extra-pdf-image.txt

+2-2

```diff
@@ -42,15 +42,15 @@ filelock==3.17.0
     #   transformers
 flatbuffers==25.1.21
     # via onnxruntime
-fonttools==4.55.4
+fonttools==4.55.5
     # via matplotlib
 fsspec==2024.12.0
     # via
     #   huggingface-hub
     #   torch
 google-api-core[grpc]==2.24.0
     # via google-cloud-vision
-google-auth==2.37.0
+google-auth==2.38.0
     # via
     #   google-api-core
     #   google-cloud-vision
```

requirements/extra-pptx.txt

+1-1

```diff
@@ -12,5 +12,5 @@ python-pptx==1.0.2
     # via -r ./extra-pptx.in
 typing-extensions==4.12.2
     # via python-pptx
-xlsxwriter==3.2.0
+xlsxwriter==3.2.1
     # via python-pptx
```

requirements/test.txt

+1-1

```diff
@@ -54,7 +54,7 @@ exceptiongroup==1.2.2
     #   -c ./base.txt
     #   anyio
     #   pytest
-faker==34.0.0
+faker==35.0.0
     # via jsf
 flake8==7.1.1
     # via
```

unstructured/__version__.py

+1-1

```diff
@@ -1 +1 @@
-__version__ = "0.16.16-dev2"  # pragma: no cover
+__version__ = "0.16.16"  # pragma: no cover
```

unstructured/nlp/tokenize.py

+21-5

```diff
@@ -12,11 +12,6 @@
 CACHE_MAX_SIZE: Final[int] = 128
 
 
-def download_nltk_packages():
-    nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-    nltk.download("punkt_tab", quiet=True)
-
-
 def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     """Checks to see if the specified NLTK package exists on the image."""
     paths: list[str] = []
@@ -32,6 +27,27 @@ def check_for_nltk_package(package_name: str, package_category: str) -> bool:
     return False
 
 
+def download_nltk_packages():
+    """If required NLTK packages are not available, download them."""
+
+    tagger_available = check_for_nltk_package(
+        package_category="taggers",
+        package_name="averaged_perceptron_tagger_eng",
+    )
+    tokenizer_available = check_for_nltk_package(
+        package_category="tokenizers", package_name="punkt_tab"
+    )
+
+    if (not tokenizer_available) or (not tagger_available):
+        nltk.download("averaged_perceptron_tagger_eng", quiet=True)
+        nltk.download("punkt_tab", quiet=True)
+
+
+# auto download nltk packages if the environment variable is set
+if os.getenv("AUTO_DOWNLOAD_NLTK", "True").lower() == "true":
+    download_nltk_packages()
+
+
 @lru_cache(maxsize=CACHE_MAX_SIZE)
 def sent_tokenize(text: str) -> List[str]:
     """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
```
