Skip to content

Commit f2e0bbb

Browse files
authored
fix: DATA_DIR_KEY (#214)
1 parent 62bb644 commit f2e0bbb

File tree

5 files changed

+69 -19 lines changed

5 files changed

+69 -19 lines changed

Diff for: .gitignore

+9
Original file line number | Diff line number | Diff line change
@@ -12,3 +12,12 @@ __pycache__/
1212
.idea/
1313
openapi.json
1414
openapi_client.json
15+
16+
# Environments
17+
.env
18+
.envrc
19+
.venv*
20+
venv*
21+
env/
22+
ENV/
23+
env.bak/

Diff for: _test_unstructured_client/integration/test_decorators.py

+14 -9
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import tempfile
44
from pathlib import Path
5+
from typing import Literal
56

67
import httpx
78
import json
@@ -15,6 +16,7 @@
1516
from unstructured_client import UnstructuredClient
1617
from unstructured_client.models import shared, operations
1718
from unstructured_client.models.errors import HTTPValidationError
19+
from unstructured_client.models.shared.partition_parameters import Strategy
1820
from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
1921
from unstructured_client._hooks.custom import form_utils
2022
from unstructured_client._hooks.custom import split_pdf_hook
@@ -105,19 +107,22 @@ def test_integration_split_pdf_has_same_output_as_non_split(
105107
)
106108
assert len(diff) == 0
107109

108-
@pytest.mark.parametrize( ("filename", "expected_ok", "strategy"), [
109-
("_sample_docs/layout-parser-paper.pdf", True, "hi_res"), # 16
110-
]# pages
111-
)
112-
@pytest.mark.parametrize( ("use_caching", "cache_dir"), [
110+
111+
@pytest.mark.parametrize(("filename", "expected_ok", "strategy"), [
112+
("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES), # 16 pages
113+
])
114+
@pytest.mark.parametrize(("use_caching", "cache_dir"), [
113115
(True, None), # Use default cache dir
114116
(True, Path(tempfile.gettempdir()) / "test_integration_unstructured_client1"), # Use custom cache dir
115117
(False, None), # Don't use caching
116118
(False, Path(tempfile.gettempdir()) / "test_integration_unstructured_client2"), # Don't use caching, use custom cache dir
117119
])
118120
def test_integration_split_pdf_with_caching(
119-
filename: str, expected_ok: bool, strategy: str, use_caching: bool,
120-
cache_dir: Path | None
121+
filename: str,
122+
expected_ok: bool,
123+
strategy: Literal[Strategy.HI_RES],
124+
use_caching: bool,
125+
cache_dir: Path | None,
121126
):
122127
try:
123128
response = requests.get("http://localhost:8000/general/docs")
@@ -140,10 +145,9 @@ def test_integration_split_pdf_with_caching(
140145
parameters = shared.PartitionParameters(
141146
files=files,
142147
strategy=strategy,
143-
languages=["eng"],
144148
split_pdf_page=True,
145149
split_pdf_cache_tmp_data=use_caching,
146-
split_pdf_cache_dir=cache_dir,
150+
split_pdf_cache_tmp_data_dir=str(cache_dir),
147151
)
148152

149153
req = operations.PartitionRequest(
@@ -185,6 +189,7 @@ def test_integration_split_pdf_with_caching(
185189
if cache_dir:
186190
assert not Path(cache_dir).exists()
187191

192+
188193
@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
189194
def test_long_pages_hi_res(filename):
190195
req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters(

Diff for: _test_unstructured_client/unit/test_split_pdf_hook.py

+32 -4
Original file line number | Diff line number | Diff line change
@@ -1,25 +1,26 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import io
5-
import logging
64
from asyncio import Task
75
from collections import Counter
86
from functools import partial
9-
from typing import Coroutine
7+
from pathlib import Path
8+
from unittest.mock import MagicMock, patch
109

1110
import httpx
1211
import pytest
1312
import requests
14-
from requests_toolbelt import MultipartDecoder, MultipartEncoder
13+
from requests_toolbelt import MultipartDecoder
1514

1615
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
1716
from unstructured_client._hooks.custom.form_utils import (
17+
FormData,
1818
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
1919
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
2020
PARTITION_FORM_PAGE_RANGE_KEY,
2121
)
2222
from unstructured_client._hooks.custom.split_pdf_hook import (
23+
DEFAULT_CACHE_TMP_DATA_DIR,
2324
DEFAULT_CONCURRENCY_LEVEL,
2425
DEFAULT_STARTING_PAGE_NUMBER,
2526
MAX_CONCURRENCY_LEVEL,
@@ -434,3 +435,30 @@ async def test_remaining_tasks_cancelled_when_fails_disallowed():
434435
await asyncio.sleep(1)
435436
print("Cancelled amount: ", cancelled_counter["cancelled"])
436437
assert len(tasks) > cancelled_counter["cancelled"] > 0
438+
439+
440+
@patch("unstructured_client._hooks.custom.form_utils.Path")
441+
def test_unit_get_split_pdf_cache_tmp_data_dir_uses_dir_from_form_data(mock_path: MagicMock):
442+
"""Test get_split_pdf_cache_tmp_data_dir uses the directory from the form data."""
443+
# -- Create the form_data
444+
dir_key = form_utils.PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY # -- "split_pdf_cache_tmp_data_dir"
445+
mock_dir = "/mock/dir"
446+
form_data: FormData = {dir_key: mock_dir}
447+
448+
# -- Mock the Path object in form_utils
449+
mock_path_instance = MagicMock()
450+
mock_path.return_value = mock_path_instance
451+
mock_path_instance.exists.return_value = True
452+
mock_path_instance.resolve.return_value = Path(mock_dir)
453+
454+
result = form_utils.get_split_pdf_cache_tmp_data_dir(
455+
form_data = form_data,
456+
key=dir_key,
457+
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR # -- tempfile.gettempdir()
458+
)
459+
460+
assert dir_key == "split_pdf_cache_tmp_data_dir"
461+
assert form_data.get(dir_key) == "/mock/dir"
462+
mock_path.assert_called_once_with(mock_dir)
463+
mock_path_instance.exists.assert_called_once()
464+
assert result == str(Path(mock_dir).resolve())

Diff for: _test_unstructured_client/unit_utils.py

+7
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import pathlib
56
from typing import Any
67
from unittest.mock import (
78
ANY,
@@ -31,6 +32,12 @@
3132
"property_mock",
3233
)
3334

35+
def sample_docs_path(file_name: str) -> str:
36+
"""Resolve the absolute-path to `file_name` in the sample-docs directory."""
37+
sample_docs_dir = pathlib.Path(__file__).parent.parent / "_sample_docs"
38+
file_path = sample_docs_dir / file_name
39+
return str(file_path.resolve())
40+
3441

3542
# ------------------------------------------------------------------------------------------------
3643
# MOCKING FIXTURES

Diff for: src/unstructured_client/_hooks/custom/split_pdf_hook.py

+7 -6
Original file line number | Diff line number | Diff line change
@@ -25,9 +25,11 @@
2525
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
2626
PARTITION_FORM_FILES_KEY,
2727
PARTITION_FORM_PAGE_RANGE_KEY,
28-
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
28+
PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
29+
PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
2930
PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY,
30-
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
31+
PARTITION_FORM_SPLIT_PDF_PAGE_KEY,
32+
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
3133
)
3234
from unstructured_client._hooks.types import (
3335
AfterErrorContext,
@@ -315,7 +317,7 @@ def before_request(
315317

316318
self.cache_tmp_data_dir = form_utils.get_split_pdf_cache_tmp_data_dir(
317319
form_data,
318-
key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_KEY,
320+
key=PARTITION_FORM_SPLIT_CACHE_TMP_DATA_DIR_KEY,
319321
fallback_value=DEFAULT_CACHE_TMP_DATA_DIR,
320322
)
321323

@@ -546,7 +548,7 @@ def _get_pdf_chunk_paths(
546548
return pdf_chunk_paths
547549

548550
def _get_pdf_chunk_files(
549-
self, pdf_chunks: list[Tuple[Path, int]]
551+
self, pdf_chunks: list[Tuple[Path, int]]
550552
) -> Generator[Tuple[BinaryIO, int], None, None]:
551553
"""Yields the file objects for the given pdf chunk paths.
552554
@@ -573,8 +575,7 @@ def _get_pdf_chunk_files(
573575
raise
574576
yield pdf_chunk_file, offset
575577

576-
def _await_elements(
577-
self, operation_id: str) -> Optional[list]:
578+
def _await_elements(self, operation_id: str) -> Optional[list]:
578579
"""
579580
Waits for the partition requests to complete and returns the flattened
580581
elements.

0 commit comments

Comments (0)