Skip to content

Commit 1c8b2b2

Browse files
authored
feat: add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR config parameters (#3014)
This PR introduces GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR controlling where temporary files are stored during partition flow, via tempfile.tempdir. #### Edit: Renamed prefixes from STORAGE_ to UNSTRUCTURED_CACHE_ #### Edit 2: Renamed prefixes from UNSTRUCTURED_CACHE_ to GLOBAL_WORKING_DIR_
1 parent ec987dc commit 1c8b2b2

File tree

9 files changed

+122
-7
lines changed

9 files changed

+122
-7
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.0-dev14
1+
## 0.14.0-dev15
22

33
### BREAKING CHANGES
44

@@ -9,6 +9,7 @@
99
* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
1010
* **Faster evaluation** Support for concurrent processing of documents during evaluation
1111
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
12+
* **Add GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR** configuration parameters to control temporary storage.
1213

1314
### Features
1415
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the ground work for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.

Diff for: test_unstructured/partition/pdf_image/test_pdf_image_utils.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,9 @@ def test_save_elements(
143143
assert not el.metadata.image_mime_type
144144

145145

146-
def test_save_elements_with_output_dir_path_none():
146+
@pytest.mark.parametrize("storage_enabled", [False, True])
147+
def test_save_elements_with_output_dir_path_none(monkeypatch, storage_enabled):
148+
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", storage_enabled)
147149
with (
148150
patch("PIL.Image.open"),
149151
patch("unstructured.partition.pdf_image.pdf_image_utils.write_image"),
@@ -161,7 +163,12 @@ def test_save_elements_with_output_dir_path_none():
161163
)
162164

163165
# Verify that the images are saved in the expected directory
164-
expected_output_dir = os.path.join(tmpdir, "figures")
166+
if storage_enabled:
167+
from unstructured.partition.utils.config import env_config
168+
169+
expected_output_dir = os.path.join(env_config.GLOBAL_WORKING_PROCESS_DIR, "figures")
170+
else:
171+
expected_output_dir = os.path.join(tmpdir, "figures")
165172
assert os.path.exists(expected_output_dir)
166173
assert os.path.isdir(expected_output_dir)
167174
os.chdir(original_cwd)

Diff for: test_unstructured/partition/utils/test_config.py

+47
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
import shutil
2+
import tempfile
3+
from pathlib import Path
4+
5+
import pytest
6+
7+
18
def test_default_config():
29
from unstructured.partition.utils.config import env_config
310

@@ -9,3 +16,43 @@ def test_env_override(monkeypatch):
916
from unstructured.partition.utils.config import env_config
1017

1118
assert env_config.IMAGE_CROP_PAD == 1
19+
20+
21+
@pytest.fixture()
22+
def _setup_tmpdir():
23+
from unstructured.partition.utils.config import env_config
24+
25+
_tmpdir = tempfile.tempdir
26+
_storage_tmpdir = env_config.GLOBAL_WORKING_PROCESS_DIR
27+
_storage_tmpdir_bak = f"{env_config.GLOBAL_WORKING_PROCESS_DIR}_bak"
28+
if Path(_storage_tmpdir).is_dir():
29+
shutil.move(_storage_tmpdir, _storage_tmpdir_bak)
30+
tempfile.tempdir = None
31+
yield
32+
if Path(_storage_tmpdir_bak).is_dir():
33+
if Path(_storage_tmpdir).is_dir():
34+
shutil.rmtree(_storage_tmpdir)
35+
shutil.move(_storage_tmpdir_bak, _storage_tmpdir)
36+
tempfile.tempdir = _tmpdir
37+
38+
39+
@pytest.mark.usefixtures("_setup_tmpdir")
40+
def test_env_storage_disabled(monkeypatch):
41+
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "false")
42+
from unstructured.partition.utils.config import env_config
43+
44+
assert not env_config.GLOBAL_WORKING_DIR_ENABLED
45+
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
46+
assert not Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
47+
assert tempfile.gettempdir() != env_config.GLOBAL_WORKING_PROCESS_DIR
48+
49+
50+
@pytest.mark.usefixtures("_setup_tmpdir")
51+
def test_env_storage_enabled(monkeypatch):
52+
monkeypatch.setenv("GLOBAL_WORKING_DIR_ENABLED", "true")
53+
from unstructured.partition.utils.config import env_config
54+
55+
assert env_config.GLOBAL_WORKING_DIR_ENABLED
56+
assert str(Path.home() / ".cache/unstructured") == env_config.GLOBAL_WORKING_DIR
57+
assert Path(env_config.GLOBAL_WORKING_PROCESS_DIR).is_dir()
58+
assert tempfile.gettempdir() == env_config.GLOBAL_WORKING_PROCESS_DIR

Diff for: unstructured/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .partition.utils.config import env_config
2+
3+
# init env_config
4+
env_config

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.0-dev14" # pragma: no cover
1+
__version__ = "0.14.0-dev15" # pragma: no cover

Diff for: unstructured/metrics/evaluate.py

-1
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,6 @@ def _try_process_document(self, doc: Path) -> Optional[list]:
160160
@abstractmethod
161161
def _process_document(self, doc: Path) -> list:
162162
"""Should return all metadata and metrics for a single document."""
163-
pass
164163

165164

166165
@dataclass

Diff for: unstructured/partition/pdf.py

+9
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import os
77
import re
88
import warnings
9+
from pathlib import Path
910
from typing import IO, TYPE_CHECKING, Any, Iterator, Optional, cast
1011

1112
import numpy as np
@@ -438,6 +439,14 @@ def _partition_pdf_or_image_local(
438439
)
439440

440441
if analysis:
442+
if not analyzed_image_output_dir_path:
443+
if env_config.GLOBAL_WORKING_DIR_ENABLED:
444+
analyzed_image_output_dir_path = str(
445+
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
446+
)
447+
else:
448+
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
449+
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
441450
annotate_layout_elements(
442451
inferred_document_layout=inferred_document_layout,
443452
extracted_layout=extracted_layout,

Diff for: unstructured/partition/pdf_image/pdf_image_utils.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import tempfile
55
from copy import deepcopy
66
from io import BytesIO
7-
from pathlib import PurePath
7+
from pathlib import Path, PurePath
88
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Tuple, Union, cast
99

1010
import cv2
@@ -131,7 +131,10 @@ def save_elements(
131131
"""
132132

133133
if not output_dir_path:
134-
output_dir_path = os.path.join(os.getcwd(), "figures")
134+
if env_config.GLOBAL_WORKING_DIR_ENABLED:
135+
output_dir_path = str(Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "figures")
136+
else:
137+
output_dir_path = str(Path.cwd() / "figures")
135138
os.makedirs(output_dir_path, exist_ok=True)
136139

137140
with tempfile.TemporaryDirectory() as temp_dir:

Diff for: unstructured/partition/utils/config.py

+45
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,28 @@
77
"""
88

99
import os
10+
import tempfile
1011
from dataclasses import dataclass
12+
from functools import lru_cache
13+
from pathlib import Path
1114

1215
from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT
1316

1417

18+
@lru_cache(maxsize=1)
19+
def get_tempdir(dir: str) -> str:
20+
tempdir = Path(dir) / f"tmp/{os.getpgid(0)}"
21+
return str(tempdir)
22+
23+
1524
@dataclass
1625
class ENVConfig:
1726
"""class for configuring enviorment parameters"""
1827

28+
def __post_init__(self):
29+
if self.GLOBAL_WORKING_DIR_ENABLED:
30+
self._setup_tmpdir(self.GLOBAL_WORKING_PROCESS_DIR)
31+
1932
def _get_string(self, var: str, default_value: str = "") -> str:
2033
"""attempt to get the value of var from the os environment; if not present return the
2134
default_value"""
@@ -31,6 +44,15 @@ def _get_float(self, var: str, default_value: float) -> float:
3144
return float(value)
3245
return default_value
3346

47+
def _get_bool(self, var: str, default_value: bool) -> bool:
48+
if value := self._get_string(var):
49+
return value.lower() in ("true", "1", "t")
50+
return default_value
51+
52+
def _setup_tmpdir(self, tmpdir: str) -> None:
53+
Path(tmpdir).mkdir(parents=True, exist_ok=True)
54+
tempfile.tempdir = tmpdir
55+
3456
@property
3557
def IMAGE_CROP_PAD(self) -> int:
3658
"""extra image content to add around an identified element region; measured in pixels"""
@@ -117,5 +139,28 @@ def PDF_ANNOTATION_THRESHOLD(self) -> float:
117139

118140
return self._get_float("PDF_ANNOTATION_THRESHOLD", 0.9)
119141

142+
@property
143+
def GLOBAL_WORKING_DIR_ENABLED(self) -> bool:
144+
"""Enable usage of GLOBAL_WORKING_DIR and GLOBAL_WORKING_PROCESS_DIR."""
145+
return self._get_bool("GLOBAL_WORKING_DIR_ENABLED", False)
146+
147+
@property
148+
def GLOBAL_WORKING_DIR(self) -> str:
149+
"""Path to Unstructured cache directory."""
150+
return self._get_string("GLOBAL_WORKING_DIR", str(Path.home() / ".cache/unstructured"))
151+
152+
@property
153+
def GLOBAL_WORKING_PROCESS_DIR(self) -> str:
154+
"""Path to Unstructured cache tempdir. Overrides TMPDIR, TEMP and TMP.
155+
Defaults to '{GLOBAL_WORKING_DIR}/tmp/{os.getpgid(0)}'.
156+
"""
157+
default_tmpdir = get_tempdir(dir=self.GLOBAL_WORKING_DIR)
158+
tmpdir = self._get_string("GLOBAL_WORKING_PROCESS_DIR", default_tmpdir)
159+
if tmpdir == "":
160+
tmpdir = default_tmpdir
161+
if self.GLOBAL_WORKING_DIR_ENABLED:
162+
self._setup_tmpdir(tmpdir)
163+
return tmpdir
164+
120165

121166
env_config = ENVConfig()

0 commit comments

Comments
 (0)