Skip to content

Commit 6bceac1

Browse files
feat: expose retry params in partition via api (#3724)
This PR: - adds parameters to control the retry-mechanism behaviour for `partition_via_api`: ``` retries_initial_interval: [int] = None, retries_max_interval: Optional[int] = None, retries_exponent: Optional[float] = None, retries_max_elapsed_time: Optional[int] = None, retries_connection_errors: Optional[bool] = None, ``` - adds tests that check using them according to defaults
1 parent a11ad22 commit 6bceac1

File tree

4 files changed

+242
-4
lines changed

4 files changed

+242
-4
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## 0.16.1-dev6
1+
## 0.16.1-dev7
22

33
### Enhancements
44

55
* **Bump `unstructured-inference` to 0.7.39** and upgrade other dependencies
66
* **Round coordinates** Round coordinates when computing bounding box overlaps in `pdfminer_processing.py` to nearest machine precision. This can help reduce nondeterministic behavior from machine precision that affects which bounding boxes to combine.
7+
* **Request retry parameters in `partition_via_api` function.** Expose retry-mechanism related parameters in the `partition_via_api` function to allow users to configure the retry behavior of the API requests.
78

89
### Features
910

Diff for: test_unstructured/partition/test_api.py

+107-1
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,24 @@
44
import os
55
import pathlib
66
from typing import Any
7+
from unittest.mock import Mock
78

89
import pytest
910
import requests
1011
from unstructured_client.general import General
1112
from unstructured_client.models import shared
1213
from unstructured_client.models.operations import PartitionRequest
1314
from unstructured_client.models.shared import PartitionParameters
15+
from unstructured_client.utils import retries
1416

1517
from unstructured.documents.elements import ElementType, NarrativeText
16-
from unstructured.partition.api import partition_multiple_via_api, partition_via_api
18+
from unstructured.partition.api import (
19+
DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC,
20+
DEFAULT_RETRIES_MAX_INTERVAL_SEC,
21+
get_retries_config,
22+
partition_multiple_via_api,
23+
partition_via_api,
24+
)
1725

1826
from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
1927

@@ -180,13 +188,110 @@ def test_partition_via_api_image_block_extraction():
180188
assert isinstance(image_data, bytes)
181189

182190

191+
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
def test_partition_via_api_retries_config():
    """Smoke-test a real API call with every retry parameter supplied explicitly."""
    pdf_path = example_doc_path("pdf/embedded-images-tables.pdf")
    retry_kwargs = {
        "retries_initial_interval": 5,
        "retries_max_interval": 15,
        "retries_max_elapsed_time": 100,
        "retries_connection_errors": True,
        "retries_exponent": 1.5,
    }

    # The url has changed since the 06/24 API release while the sdk defaults to the old url
    elements = partition_via_api(
        filename=pdf_path,
        strategy="fast",
        api_key=get_api_key(),
        api_url=API_URL,
        **retry_kwargs,
    )

    # Only checks that the request succeeded and produced some elements.
    assert len(elements) > 0
208+
209+
183210
# Note(austin) - This test is way too noisy against the hosted api
184211
# def test_partition_via_api_invalid_request_data_kwargs():
185212
# filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf")
186213
# with pytest.raises(SDKError):
187214
# partition_via_api(filename=filename, strategy="not_a_strategy")
188215

189216

217+
def test_retries_config_with_parameters_set():
    """Explicitly passed retry parameters end up unchanged in the resulting config."""
    config = get_retries_config(
        sdk=Mock(),
        retries_initial_interval=20,
        retries_max_interval=100,
        retries_exponent=1.75,
        retries_max_elapsed_time=1000,
        retries_connection_errors=True,
    )
    backoff = config.backoff

    assert config.retry_connection_errors
    assert backoff.exponent == 1.75
    assert backoff.initial_interval == 20
    assert backoff.max_elapsed_time == 1000
    assert backoff.max_interval == 100
233+
234+
235+
def test_retries_config_none_parameters_return_empty_config():
    """When every retry parameter is None, no config object is built."""
    all_unset = dict.fromkeys(
        (
            "retries_connection_errors",
            "retries_exponent",
            "retries_initial_interval",
            "retries_max_elapsed_time",
            "retries_max_interval",
        )
    )

    config = get_retries_config(sdk=Mock(), **all_unset)

    assert config is None
247+
248+
249+
def test_retries_config_with_no_parameters_set():
    """Backoff values not passed by the caller are taken from the SDK's own retry config."""
    sdk_backoff = retries.BackoffStrategy(
        initial_interval=3000,
        max_interval=720000,
        exponent=1.88,
        max_elapsed_time=1800000,
    )
    client = Mock()
    client.sdk_configuration.retry_config = retries.RetryConfig(
        strategy="backoff", backoff=sdk_backoff, retry_connection_errors=True
    )

    # Only the connection-errors flag is supplied; every backoff field is left as None.
    config = get_retries_config(
        sdk=client,
        retries_connection_errors=True,
        retries_exponent=None,
        retries_initial_interval=None,
        retries_max_elapsed_time=None,
        retries_max_interval=None,
    )
    backoff = config.backoff

    assert config.retry_connection_errors
    assert backoff.exponent == 1.88
    assert backoff.initial_interval == 3000
    assert backoff.max_elapsed_time == 1800000
    assert backoff.max_interval == 720000
269+
270+
271+
def test_retries_config_cascade():
    """Resolution order is: explicit argument, then SDK config value, then module default."""
    # notice max_interval is set to 0 which is incorrect - so the DEFAULT_RETRIES_MAX_INTERVAL_SEC
    # should be used; max_elapsed_time is None in the SDK config, so its default applies as well
    sdk_backoff = retries.BackoffStrategy(
        initial_interval=3000,
        max_interval=0,
        exponent=1.88,
        max_elapsed_time=None,
    )
    client = Mock()
    client.sdk_configuration.retry_config = retries.RetryConfig(
        strategy="backoff", backoff=sdk_backoff, retry_connection_errors=True
    )

    config = get_retries_config(
        sdk=client,
        retries_connection_errors=False,
        retries_exponent=1.75,
        retries_initial_interval=20,
        retries_max_elapsed_time=None,
        retries_max_interval=None,
    )
    backoff = config.backoff

    assert not config.retry_connection_errors
    assert backoff.exponent == 1.75
    assert backoff.initial_interval == 20
    assert backoff.max_elapsed_time == DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC
    assert backoff.max_interval == DEFAULT_RETRIES_MAX_INTERVAL_SEC
293+
294+
190295
def test_partition_multiple_via_api_with_single_filename(request: FixtureRequest):
191296
partition_mock_ = method_mock(
192297
request, requests, "post", return_value=FakeResponse(status_code=200)
@@ -522,4 +627,5 @@ def expected_call_():
522627
xml_keep_tags=False,
523628
)
524629
),
630+
None, # retries kwarg
525631
]

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.1-dev6" # pragma: no cover
1+
__version__ = "0.16.1-dev7" # pragma: no cover

Diff for: unstructured/partition/api.py

+132-1
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,20 @@
66
import requests
77
from unstructured_client import UnstructuredClient
88
from unstructured_client.models import operations, shared
9+
from unstructured_client.utils import retries
910

1011
from unstructured.documents.elements import Element
1112
from unstructured.logger import logger
1213
from unstructured.partition.common.common import exactly_one
1314
from unstructured.staging.base import elements_from_dicts, elements_from_json
1415

16+
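# Default retry configuration taken from the client code. NOTE(review): the `_SEC` suffix suggests seconds, but the SDK backoff presumably interprets these values as milliseconds - confirm.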
17+
DEFAULT_RETRIES_INITIAL_INTERVAL_SEC = 3000
18+
DEFAULT_RETRIES_MAX_INTERVAL_SEC = 720000
19+
DEFAULT_RETRIES_EXPONENT = 1.5
20+
DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC = 1800000
21+
DEFAULT_RETRIES_CONNECTION_ERRORS = True
22+
1523

1624
def partition_via_api(
1725
filename: Optional[str] = None,
@@ -21,6 +29,11 @@ def partition_via_api(
2129
api_url: str = "https://api.unstructured.io/general/v0/general",
2230
api_key: str = "",
2331
metadata_filename: Optional[str] = None,
32+
retries_initial_interval: [int] = None,
33+
retries_max_interval: Optional[int] = None,
34+
retries_exponent: Optional[float] = None,
35+
retries_max_elapsed_time: Optional[int] = None,
36+
retries_connection_errors: Optional[bool] = None,
2437
**request_kwargs: Any,
2538
) -> list[Element]:
2639
"""Partitions a document using the Unstructured REST API. This is equivalent to
@@ -44,6 +57,21 @@ def partition_via_api(
4457
The URL for the Unstructured API. Defaults to the hosted Unstructured API.
4558
api_key
4659
The API key to pass to the Unstructured API.
60+
retries_initial_interval
61+
Defines the time interval (in seconds) to wait before the first retry in case of a request
62+
failure. Defaults to 3000. If set should be > 0.
63+
retries_max_interval
64+
Defines the maximum time interval (in seconds) to wait between retries (the interval
65+
between retries is increased as using exponential increase algorithm
66+
- this setting limits it). Defaults to 720000. If set should be > 0.
67+
retries_exponent
68+
Defines the exponential factor to increase the interval between retries. Defaults to 1.5.
69+
If set should be > 0.0.
70+
retries_max_elapsed_time
71+
Defines the maximum time (in seconds) to wait for retries. If exceeded, the original
72+
exception is raised. Defaults to 1800000. If set should be > 0.
73+
retries_connection_errors
74+
Defines whether to retry on connection errors. Defaults to True.
4775
request_kwargs
4876
Additional parameters to pass to the data field of the request to the Unstructured API.
4977
For example the `strategy` parameter.
@@ -87,7 +115,19 @@ def partition_via_api(
87115
partition_parameters=shared.PartitionParameters(files=files, **request_kwargs)
88116
)
89117

90-
response = sdk.general.partition(request=req)
118+
retries_config = get_retries_config(
119+
retries_connection_errors=retries_connection_errors,
120+
retries_exponent=retries_exponent,
121+
retries_initial_interval=retries_initial_interval,
122+
retries_max_elapsed_time=retries_max_elapsed_time,
123+
retries_max_interval=retries_max_interval,
124+
sdk=sdk,
125+
)
126+
127+
response = sdk.general.partition(
128+
request=req,
129+
retries=retries_config,
130+
)
91131

92132
if response.status_code == 200:
93133
return elements_from_json(text=response.raw_response.text)
@@ -97,6 +137,97 @@ def partition_via_api(
97137
)
98138

99139

140+
def get_retries_config(
    retries_connection_errors: Optional[bool],
    retries_exponent: Optional[float],
    retries_initial_interval: Optional[int],
    retries_max_elapsed_time: Optional[int],
    retries_max_interval: Optional[int],
    sdk: UnstructuredClient,
) -> Optional[retries.RetryConfig]:
    """Constructs a RetryConfig object from the provided parameters.

    Each setting is resolved in this order: the explicitly passed argument, then the value
    found in the SDK configuration (``sdk.sdk_configuration.retry_config``), then the
    module-level ``DEFAULT_RETRIES_*`` constant.

    If all parameters are None, returns None (and the SDK-managed defaults are used within the
    client).

    The solution is not perfect as the RetryConfig object does not include the defaults by
    itself so we might need to construct it basing on our defaults.

    Parameters
    ----------
    retries_connection_errors
        Defines whether to retry on connection errors. If None, the SDK configuration value is
        used, and if that is missing too, the DEFAULT_RETRIES_CONNECTION_ERRORS constant.
    retries_exponent
        Defines the exponential factor to increase the interval between retries.
        If set, should be > 0.0 (a None or 0 value falls back to the SDK configuration and then
        to the DEFAULT_RETRIES_EXPONENT constant).
    retries_initial_interval
        Defines the time interval to wait before the first retry in case of a request failure.
        If set, should be > 0 (a None or 0 value falls back to the SDK configuration and then
        to the DEFAULT_RETRIES_INITIAL_INTERVAL_SEC constant).
    retries_max_elapsed_time
        Defines the maximum time to wait for retries. If exceeded, the original exception is
        raised. If set, should be > 0 (a None or 0 value falls back to the SDK configuration and
        then to the DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC constant).
    retries_max_interval
        Defines the maximum time interval to wait between retries. If set, should be > 0
        (a None or 0 value falls back to the SDK configuration and then to the
        DEFAULT_RETRIES_MAX_INTERVAL_SEC constant).
    sdk
        The UnstructuredClient object to take the default values from.

    Returns
    -------
    A RetryConfig using the "backoff" strategy, or None when no retry parameter was provided.
    """
    user_settings = (
        retries_initial_interval,
        retries_max_interval,
        retries_exponent,
        retries_max_elapsed_time,
        retries_connection_errors,
    )
    if all(setting is None for setting in user_settings):
        # Nothing was customized - let the client apply its own retry handling.
        return None

    # The SDK may carry no retry configuration at all (None or a falsy "unset" placeholder).
    # Normalize that to None so every lookup below can rely on `getattr` defaults instead of
    # raising AttributeError.
    sdk_retries_config = sdk.sdk_configuration.retry_config or None
    sdk_backoff = getattr(sdk_retries_config, "backoff", None)

    def resolve_backoff(user_value: Any, setting_name: str, default_value: Any) -> Any:
        """Pick the first truthy value among the argument, the SDK config and the default."""
        if user_value:
            return user_value
        return getattr(sdk_backoff, setting_name, None) or default_value

    if retries_connection_errors is not None:
        retry_connection_errors = retries_connection_errors
    else:
        # False is a meaningful value here, so compare against None instead of truthiness.
        sdk_connection_errors = getattr(sdk_retries_config, "retry_connection_errors", None)
        retry_connection_errors = (
            sdk_connection_errors
            if sdk_connection_errors is not None
            else DEFAULT_RETRIES_CONNECTION_ERRORS
        )

    backoff_strategy = retries.BackoffStrategy(
        initial_interval=resolve_backoff(
            retries_initial_interval, "initial_interval", DEFAULT_RETRIES_INITIAL_INTERVAL_SEC
        ),
        max_interval=resolve_backoff(
            retries_max_interval, "max_interval", DEFAULT_RETRIES_MAX_INTERVAL_SEC
        ),
        exponent=resolve_backoff(retries_exponent, "exponent", DEFAULT_RETRIES_EXPONENT),
        max_elapsed_time=resolve_backoff(
            retries_max_elapsed_time, "max_elapsed_time", DEFAULT_RETRIES_MAX_ELAPSED_TIME_SEC
        ),
    )
    return retries.RetryConfig(
        strategy="backoff",
        backoff=backoff_strategy,
        retry_connection_errors=retry_connection_errors,
    )
229+
230+
100231
def partition_multiple_via_api(
101232
filenames: Optional[list[str]] = None,
102233
content_types: Optional[list[str]] = None,

0 commit comments

Comments
 (0)