Skip to content

Commit 5e30cda

Browse files
authored
feat: Parameter to send custom page range when splitting pdf (#125)
# New parameter

Add a client-side parameter called `split_pdf_page_range`, which takes a list of two integers, `[start_page, end_page]`. If `split_pdf_page` is `True` and a range is set, the document is sliced from `start_page` up to and including `end_page`, and only this page range is sent to the API. The selected subset of pages is still split up as needed.

# Other changes

Allow our custom hooks to properly access list parameters, so that we are able to intercept `split_pdf_page_range`. This needs extra handling to get list parameters out of the request in `parse_form_data`, and to rebuild the payload in `create_request_body`.

# Testing

Check out this branch and set up a request to your local API:

```
client = UnstructuredClient(api_key_auth="", server_url="localhost:8000")

filename = "_sample_docs/layout-parser-paper.pdf"
with open(filename, "rb") as f:
    files = shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    strategy="fast",
    split_pdf_page=True,
    split_pdf_page_range=[1, 16],
)

resp = client.general.partition(req)
```

Test out various page ranges and confirm that the returned elements are within the range. Invalid ranges should raise a `ValueError` (pages are out of bounds, or `end_page` < `start_page`).
1 parent 2d50bdf commit 5e30cda

File tree

11 files changed

+290
-44
lines changed

11 files changed

+290
-44
lines changed

Diff for: README.md

+12
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ res = s.general.partition(request=operations.PartitionRequest(
5555
content='0x2cC94b2FEF'.encode(),
5656
file_name='your_file_here',
5757
),
58+
split_pdf_page_range=[
59+
1,
60+
10,
61+
],
5862
strategy=shared.Strategy.AUTO,
5963
),
6064
))
@@ -110,6 +114,10 @@ res = s.general.partition(request=operations.PartitionRequest(
110114
content='0x2cC94b2FEF'.encode(),
111115
file_name='your_file_here',
112116
),
117+
split_pdf_page_range=[
118+
1,
119+
10,
120+
],
113121
strategy=shared.Strategy.AUTO,
114122
),
115123
),
@@ -139,6 +147,10 @@ res = s.general.partition(request=operations.PartitionRequest(
139147
content='0x2cC94b2FEF'.encode(),
140148
file_name='your_file_here',
141149
),
150+
split_pdf_page_range=[
151+
1,
152+
10,
153+
],
142154
strategy=shared.Strategy.AUTO,
143155
),
144156
))

Diff for: USAGE.md

+4
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ res = s.general.partition(request=operations.PartitionRequest(
1414
content='0x2cC94b2FEF'.encode(),
1515
file_name='your_file_here',
1616
),
17+
split_pdf_page_range=[
18+
1,
19+
10,
20+
],
1721
strategy=shared.Strategy.AUTO,
1822
),
1923
))

Diff for: _test_unstructured_client/integration/test_decorators.py

+70
Original file line numberDiff line numberDiff line change
@@ -110,3 +110,73 @@ def test_integration_split_pdf_for_file_with_no_name():
110110
)
111111

112112
pytest.raises(ValueError, client.general.partition, req)
113+
114+
115+
@pytest.mark.parametrize("starting_page_number", [1, 100])
@pytest.mark.parametrize(
    "page_range, expected_ok, expected_pages",
    [
        (["1", "14"], True, (1, 14)),  # Valid range, start on boundary
        (["4", "16"], True, (4, 16)),  # Valid range, end on boundary
        (["2", "5"], True, (2, 5)),  # Valid range within boundary
        # A 1 page doc wouldn't normally be split,
        # but this code still needs to return the page range
        (["6", "6"], True, (6, 6)),
        (["2", "100"], False, None),  # End page too high
        (["50", "100"], False, None),  # Range too high
        (["-50", "5"], False, None),  # Start page too low
        (["-50", "-2"], False, None),  # Range too low
        (["10", "2"], False, None),  # Backwards range
    ],
)
def test_integration_split_pdf_with_page_range(
    starting_page_number: int,
    page_range: "list[str]",
    expected_ok: bool,
    expected_pages: "tuple[int, int] | None",
    caplog,
):
    """Check that a PDF can be split using an arbitrary page range.

    The selected range is sent to the API and the page numbers in the returned
    element metadata must span exactly that range, shifted by
    ``starting_page_number``. Ranges flagged as invalid (``expected_ok`` is
    False) must be rejected with a ``ValueError`` and must never produce a
    response.

    Requires unstructured-api running in bg. See Makefile for how to run it.
    """
    # Fail fast with a readable message when the local API is not reachable.
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        pytest.fail("The unstructured-api is not running on localhost:8000")
    assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    filename = "_sample_docs/layout-parser-paper.pdf"
    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    req = shared.PartitionParameters(
        files=files,
        strategy="fast",
        split_pdf_page=True,
        split_pdf_page_range=page_range,
        starting_page_number=starting_page_number,
    )

    try:
        resp = client.general.partition(req)
    except ValueError as exc:
        assert not expected_ok, f"Valid page range {page_range} was rejected: {exc}"
        assert "is out of bounds." in caplog.text
        assert "is out of bounds." in str(exc)
        return

    # Reaching this point means the request went through. For an invalid
    # range that is itself the failure; report it explicitly instead of
    # crashing below on expected_pages being None.
    assert expected_ok, f"Invalid page range {page_range} should raise a ValueError"
    assert expected_pages is not None

    page_numbers = {element["metadata"]["page_number"] for element in resp.elements}
    assert page_numbers, "The API returned no elements for the requested page range"

    # Metadata page numbers are offset so that page 1 maps to starting_page_number.
    min_page_number = expected_pages[0] + starting_page_number - 1
    max_page_number = expected_pages[1] + starting_page_number - 1

    assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}"
    assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}"

Diff for: _test_unstructured_client/unit/test_split_pdf_hook.py

+81-15
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from unstructured_client._hooks.custom.form_utils import (
1010
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
1111
PARTITION_FORM_STARTING_PAGE_NUMBER_KEY,
12+
PARTITION_FORM_PAGE_RANGE_KEY,
1213
)
1314
from unstructured_client._hooks.custom.split_pdf_hook import (
1415
DEFAULT_CONCURRENCY_LEVEL,
@@ -122,7 +123,8 @@ def test_unit_create_response():
122123

123124

124125
def test_unit_create_request():
125-
"""Test create request method properly sets file, Content-Type and Content-Length headers."""
126+
"""Test create request method properly sets file, Content-Type and Content-Length headers.
127+
List parameters should be flattened in the body."""
126128

127129
# Prepare test data
128130
request = requests.PreparedRequest()
@@ -133,27 +135,27 @@ def test_unit_create_request():
133135
form_data = {
134136
"parameter_1": "value_1",
135137
"parameter_2": "value_2",
138+
"list_parameter": ["value_1", "value_2"],
136139
}
137140
page = (io.BytesIO(b"page_content"), 1)
138141
filename = "test_file.pdf"
139142

140143
# Expected results
141-
expected_payload = {
142-
"parameter_1": "value_1",
143-
"parameter_2": "value_2",
144-
"split_pdf_page": "false",
145-
"starting_page_number": "7",
146-
}
147144
expected_page_filename = "test_file.pdf"
148145
expected_body = MultipartEncoder(
149-
fields={
150-
**expected_payload,
151-
"files": (
146+
fields=[
147+
("parameter_1", "value_1"),
148+
("parameter_2", "value_2"),
149+
("list_parameter", "value_1"),
150+
("list_parameter", "value_2"),
151+
("split_pdf_page", "false"),
152+
("starting_page_number", "7"),
153+
("files", (
152154
expected_page_filename,
153155
page[0],
154156
"application/pdf",
155-
),
156-
}
157+
)),
158+
]
157159
)
158160
expected_url = ""
159161

@@ -164,7 +166,10 @@ def test_unit_create_request():
164166
# Assert the request object
165167
assert request_obj.method == "POST"
166168
assert request_obj.url == expected_url
167-
assert request_obj.data.fields == expected_body.fields
169+
170+
# Validate fields ignoring order
171+
assert set(request_obj.data.fields) == set(expected_body.fields)
172+
168173
assert request_content_type.startswith("multipart/form-data")
169174

170175

@@ -191,11 +196,37 @@ def test_unit_decode_content_disposition():
191196

192197

193198
def test_unit_parse_form_data():
194-
"""Test parse form data method properly parses the form data and returns dictionary."""
199+
"""Test parse form data method properly parses the form data and returns dictionary.
200+
Parameters with the same key should be consolidated to a list."""
195201

196202
# Prepare test data
203+
test_form_data = (
204+
b"--boundary\r\n"
205+
b"Content-Disposition: form-data; name=\"files\"; filename=\"test_file.pdf\"\r\n"
206+
b"\r\n"
207+
b"file_content\r\n"
208+
b"--boundary\r\n"
209+
b"Content-Disposition: form-data; name=\"parameter_1\"\r\n"
210+
b"\r\n"
211+
b"value_1\r\n"
212+
b"--boundary\r\n"
213+
b"Content-Disposition: form-data; name=\"parameter_2\"\r\n"
214+
b"\r\n"
215+
b"value_2\r\n"
216+
b"--boundary\r\n"
217+
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
218+
b"\r\n"
219+
b"value_1\r\n"
220+
b"--boundary\r\n"
221+
b"Content-Disposition: form-data; name=\"list_parameter\"\r\n"
222+
b"\r\n"
223+
b"value_2\r\n"
224+
b"--boundary--\r\n"
225+
)
226+
227+
197228
decoded_data = MultipartDecoder(
198-
b'--boundary\r\nContent-Disposition: form-data; name="files"; filename="test_file.pdf"\r\n\r\nfile_content\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_1"\r\n\r\nvalue_1\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_2"\r\n\r\nvalue_2\r\n--boundary--\r\n',
229+
test_form_data,
199230
"multipart/form-data; boundary=boundary",
200231
)
201232

@@ -204,6 +235,7 @@ def test_unit_parse_form_data():
204235
"files": shared.Files(b"file_content", "test_file.pdf"),
205236
"parameter_1": "value_1",
206237
"parameter_2": "value_2",
238+
"list_parameter": ["value_1", "value_2"],
207239
}
208240

209241
# Parse form data
@@ -212,6 +244,7 @@ def test_unit_parse_form_data():
212244
# Assert the parsed form data
213245
assert form_data.get("parameter_1") == expected_form_data.get("parameter_1")
214246
assert form_data.get("parameter_2") == expected_form_data.get("parameter_2")
247+
assert form_data.get("list_parameter") == expected_form_data.get("list_parameter")
215248
assert form_data.get("files").file_name == expected_form_data.get("files").file_name
216249

217250
assert form_data.get("files").content == expected_form_data.get("files").content
@@ -366,3 +399,36 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
366399
fallback_value=DEFAULT_STARTING_PAGE_NUMBER,
367400
)
368401
assert result == expected_result
402+
403+
404+
@pytest.mark.parametrize(
    "page_range, expected_result",
    [
        (["1", "14"], (1, 14)),  # Valid range, start on boundary
        (["4", "16"], (4, 16)),  # Valid range, end on boundary
        (None, (1, 20)),  # Range not specified, defaults to full range
        (["2", "5"], (2, 5)),  # Valid range within boundary
        (["2", "100"], None),  # End page too high
        (["50", "100"], None),  # Range too high
        (["-50", "5"], None),  # Start page too low
        (["-50", "-2"], None),  # Range too low
        (["10", "2"], None),  # Backwards range
        (["foo", "foo"], None),  # Parse error
    ],
)
def test_unit_get_page_range_returns_valid_range(
    page_range: "list[str] | None",
    expected_result: "tuple[int, int] | None",
):
    """Test get_page_range method with different inputs.

    A case whose ``expected_result`` is None describes a range that is invalid
    for a 20 page doc: the call must raise a ValueError. Any other case must
    return ``expected_result`` exactly.
    """
    form_data = {"split_pdf_page_range[]": page_range}

    if expected_result is None:
        # Require the exception. A try/except would also pass if the function
        # quietly returned None for an invalid range.
        with pytest.raises(ValueError) as exc_info:
            form_utils.get_page_range(
                form_data,
                key=PARTITION_FORM_PAGE_RANGE_KEY,
                max_pages=20,
            )
        message = str(exc_info.value)
        assert "is out of bounds." in message or "is not a valid page range." in message
        return

    result = form_utils.get_page_range(
        form_data,
        key=PARTITION_FORM_PAGE_RANGE_KEY,
        max_pages=20,
    )

    assert result == expected_result

0 commit comments

Comments
 (0)