Skip to content

Commit 2c3fcbe

Browse files
authored
fix: Do not throw error when in a uvloop context (#142)
PDF page splitting uses asyncio, but the SDK is not async. Therefore, we had to manage our own event loop, which can lead to issues in other event loop contexts. Uvloop is one context that does not allow us to use nested event loops. When we find ourselves in a uvloop.Loop, we have to fall back to non-splitting mode. #135 will make the whole SDK async, so we won't have to hack around this. Closes #133
1 parent 22fb177 commit 2c3fcbe

File tree

3 files changed

+48
-5
lines changed

3 files changed

+48
-5
lines changed

Diff for: Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates
99

1010
.PHONY: install-test
1111
install-test:
12-
pip install pytest pytest-asyncio pytest-mock requests_mock pypdf deepdiff requests-toolbelt
12+
pip install pytest pytest-asyncio pytest-mock requests_mock pypdf deepdiff requests-toolbelt uvloop
1313

1414
.PHONY: install-dev
1515
install-dev:

Diff for: _test_unstructured_client/integration/test_integration_freemium.py

+29
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,32 @@ def test_partition_handling_server_error(error, split_pdf, monkeypatch, doc_path
9191

9292
with pytest.raises(sdk_raises):
9393
response = client.general.partition(req)
94+
95+
96+
def test_uvloop_partitions_without_errors(client, doc_path):
    """Check that partitioning with split_pdf_page=True works inside a uvloop event loop.

    The synchronous client is called from within a coroutine that runs on a
    uvloop-backed loop, and the test asserts that elements are returned.
    """
    async def call_api():
        filename = "layout-parser-paper-fast.pdf"
        with open(doc_path / filename, "rb") as f:
            files = shared.Files(
                content=f.read(),
                file_name=filename,
            )

        req = shared.PartitionParameters(
            files=files,
            strategy="fast",
            languages=["eng"],
            split_pdf_page=True,
        )

        resp = client.general.partition(req)

        if resp is not None:
            return resp.elements
        return []

    import uvloop

    # uvloop.install() swaps the process-wide event loop policy. Remember the
    # previous policy and put it back afterwards so that the uvloop policy
    # does not leak into tests that run later in the same session.
    previous_policy = asyncio.get_event_loop_policy()
    uvloop.install()
    try:
        elements = asyncio.run(call_api())
    finally:
        asyncio.set_event_loop_policy(previous_policy)

    assert len(elements) > 0

Diff for: src/unstructured_client/_hooks/custom/split_pdf_hook.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,15 @@ async def run_tasks(coroutines: list[Awaitable], allow_failed: bool = False) ->
7373
return sorted(results, key=lambda x: x[0])
7474

7575

76+
def context_is_uvloop() -> bool:
    """Return True if uvloop is installed and we're currently in a uvloop context.

    Our asyncio splitting code currently doesn't work under uvloop, so this
    check lets the caller detect that situation up front.

    Returns:
        True when ``asyncio.get_event_loop()`` yields a ``uvloop.Loop``.
        False when uvloop cannot be imported, or when the current context has
        no event loop at all.
    """
    try:
        import uvloop  # pylint: disable=import-outside-toplevel
    except ImportError:
        return False

    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # get_event_loop() raises RuntimeError when the calling thread has no
        # current event loop (e.g. a worker thread). Without any loop we
        # cannot be inside a uvloop context, so report False instead of
        # letting the error escape.
        return False

    return isinstance(loop, uvloop.Loop)
84+
7685
def get_optimal_split_size(num_pages: int, concurrency_level: int) -> int:
7786
"""Distributes pages to workers evenly based on the number of pages and desired concurrency level."""
7887
if num_pages < MAX_PAGES_PER_SPLIT * concurrency_level:
@@ -94,10 +103,6 @@ class SplitPdfHook(SDKInitHook, BeforeRequestHook, AfterSuccessHook, AfterErrorH
94103
"""
95104

96105
def __init__(self) -> None:
97-
# This allows us to use an event loop in an env with an existing loop
98-
# Temporary fix until we can improve the async splitting behavior
99-
nest_asyncio.apply()
100-
101106
self.client: Optional[requests.Session] = None
102107
self.coroutines_to_execute: dict[
103108
str, list[Coroutine[Any, Any, requests.Response]]
@@ -121,6 +126,8 @@ def sdk_init(
121126
self.client = client
122127
return base_url, client
123128

129+
130+
# pylint: disable=too-many-return-statements
124131
def before_request(
125132
self, hook_ctx: BeforeRequestContext, request: requests.PreparedRequest
126133
) -> Union[requests.PreparedRequest, Exception]:
@@ -143,6 +150,13 @@ def before_request(
143150
logger.warning("HTTP client not accessible! Continuing without splitting.")
144151
return request
145152

153+
if context_is_uvloop():
154+
logger.warning("Splitting is currently incompatible with uvloop. Continuing without splitting.")
155+
return request
156+
157+
# This allows us to use an event loop in an env with an existing loop
158+
# Temporary fix until we can improve the async splitting behavior
159+
nest_asyncio.apply()
146160
operation_id = hook_ctx.operation_id
147161
content_type = request.headers.get("Content-Type")
148162
body = request.body

0 commit comments

Comments
 (0)