Skip to content

Commit 7d5a11c

Browse files
Add PDF split/merge support with live tests
- Introduced `split_pdf` and `merge_pdfs` methods in both sync and async clients. - Added `PdfSplitPayload` and `PdfMergePayload` models for payload validation and serialization. - Created associated types and serializers for page groupings and merge sources. - Added comprehensive live and unit tests for splitting and merging functionality. Assisted-by: Codex
1 parent 08f4aad commit 7d5a11c

7 files changed

Lines changed: 1081 additions & 2 deletions

File tree

src/pdfrest/client.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,20 @@
4040
GifPdfRestPayload,
4141
JpegPdfRestPayload,
4242
PdfInfoPayload,
43+
PdfMergePayload,
4344
PdfRedactionApplyPayload,
4445
PdfRedactionPreviewPayload,
4546
PdfRestRawFileResponse,
47+
PdfSplitPayload,
4648
PngPdfRestPayload,
4749
TiffPdfRestPayload,
4850
UploadURLs,
4951
)
5052
from .types import (
5153
ALL_PDF_INFO_QUERIES,
5254
PdfInfoQuery,
55+
PdfMergeInput,
56+
PdfPageSelection,
5357
PdfRedactionInstruction,
5458
PdfRGBColor,
5559
)
@@ -1591,6 +1595,61 @@ def apply_redactions(
15911595
timeout=timeout,
15921596
)
15931597

1598+
def split_pdf(
    self,
    file: PdfRestFile | Sequence[PdfRestFile],
    *,
    page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None,
    output_prefix: str | None = None,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> PdfRestFileBasedResponse:
    """Split a PDF into one or more PDF files via the ``/split-pdf`` endpoint.

    Args:
        file: The PDF to split. A sequence is accepted, but the payload is
            validated by ``PdfSplitPayload``.
        page_groups: Page selections describing the output groups. Left out
            of the payload when ``None``.
        output_prefix: Optional output name prefix. Left out of the payload
            when ``None``.
        extra_query: Forwarded unchanged to ``_post_file_operation``.
        extra_headers: Forwarded unchanged to ``_post_file_operation``.
        extra_body: Forwarded unchanged to ``_post_file_operation``.
        timeout: Forwarded unchanged to ``_post_file_operation``.

    Returns:
        Whatever ``_post_file_operation`` returns for the request.
    """

    optional_fields = {"page_groups": page_groups, "output_prefix": output_prefix}
    request_fields: dict[str, Any] = {"files": file}
    # Only options the caller actually supplied are sent on for validation.
    request_fields.update(
        (key, value) for key, value in optional_fields.items() if value is not None
    )

    return self._post_file_operation(
        endpoint="/split-pdf",
        payload=request_fields,
        payload_model=PdfSplitPayload,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
1626+
1627+
def merge_pdfs(
    self,
    sources: Sequence[PdfMergeInput],
    *,
    output_prefix: str | None = None,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> PdfRestFileBasedResponse:
    """Merge multiple PDFs (or page subsets) via the ``/merged-pdf`` endpoint.

    Args:
        sources: The merge inputs, in the order given. The payload is
            validated by ``PdfMergePayload``.
        output_prefix: Optional output name prefix. Left out of the payload
            when ``None``.
        extra_query: Forwarded unchanged to ``_post_file_operation``.
        extra_headers: Forwarded unchanged to ``_post_file_operation``.
        extra_body: Forwarded unchanged to ``_post_file_operation``.
        timeout: Forwarded unchanged to ``_post_file_operation``.

    Returns:
        Whatever ``_post_file_operation`` returns for the request.
    """

    # The prefix key is only present when the caller supplied a value.
    request_fields: dict[str, Any] = (
        {"sources": sources}
        if output_prefix is None
        else {"sources": sources, "output_prefix": output_prefix}
    )

    return self._post_file_operation(
        endpoint="/merged-pdf",
        payload=request_fields,
        payload_model=PdfMergePayload,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
1652+
15941653
def convert_to_png(
15951654
self,
15961655
files: PdfRestFile | Sequence[PdfRestFile],
@@ -1963,6 +2022,61 @@ async def _convert_to_graphic(
19632022
timeout=timeout,
19642023
)
19652024

2025+
async def split_pdf(
    self,
    file: PdfRestFile | Sequence[PdfRestFile],
    *,
    page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None,
    output_prefix: str | None = None,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> PdfRestFileBasedResponse:
    """Asynchronously split a PDF via the ``/split-pdf`` endpoint.

    Args:
        file: The PDF to split. A sequence is accepted, but the payload is
            validated by ``PdfSplitPayload``.
        page_groups: Page selections describing the output groups. Left out
            of the payload when ``None``.
        output_prefix: Optional output name prefix. Left out of the payload
            when ``None``.
        extra_query: Forwarded unchanged to ``_post_file_operation``.
        extra_headers: Forwarded unchanged to ``_post_file_operation``.
        extra_body: Forwarded unchanged to ``_post_file_operation``.
        timeout: Forwarded unchanged to ``_post_file_operation``.

    Returns:
        The awaited result of ``_post_file_operation``.
    """

    optional_fields = {"page_groups": page_groups, "output_prefix": output_prefix}
    request_fields: dict[str, Any] = {"files": file}
    # Only options the caller actually supplied are sent on for validation.
    request_fields.update(
        (key, value) for key, value in optional_fields.items() if value is not None
    )

    return await self._post_file_operation(
        endpoint="/split-pdf",
        payload=request_fields,
        payload_model=PdfSplitPayload,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
2053+
2054+
async def merge_pdfs(
    self,
    sources: Sequence[PdfMergeInput],
    *,
    output_prefix: str | None = None,
    extra_query: Query | None = None,
    extra_headers: AnyMapping | None = None,
    extra_body: Body | None = None,
    timeout: TimeoutTypes | None = None,
) -> PdfRestFileBasedResponse:
    """Asynchronously merge PDFs (or page subsets) via ``/merged-pdf``.

    Args:
        sources: The merge inputs, in the order given. The payload is
            validated by ``PdfMergePayload``.
        output_prefix: Optional output name prefix. Left out of the payload
            when ``None``.
        extra_query: Forwarded unchanged to ``_post_file_operation``.
        extra_headers: Forwarded unchanged to ``_post_file_operation``.
        extra_body: Forwarded unchanged to ``_post_file_operation``.
        timeout: Forwarded unchanged to ``_post_file_operation``.

    Returns:
        The awaited result of ``_post_file_operation``.
    """

    # The prefix key is only present when the caller supplied a value.
    request_fields: dict[str, Any] = (
        {"sources": sources}
        if output_prefix is None
        else {"sources": sources, "output_prefix": output_prefix}
    )

    return await self._post_file_operation(
        endpoint="/merged-pdf",
        payload=request_fields,
        payload_model=PdfMergePayload,
        extra_query=extra_query,
        extra_headers=extra_headers,
        extra_body=extra_body,
        timeout=timeout,
    )
2079+
19662080
async def convert_to_png(
19672081
self,
19682082
files: PdfRestFile | Sequence[PdfRestFile],

src/pdfrest/models/_internal.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
Field,
1616
HttpUrl,
1717
PlainSerializer,
18+
model_serializer,
1819
model_validator,
1920
)
2021

@@ -122,6 +123,12 @@ def join_tuple(value: str | int | tuple[str | int, ...]) -> str:
122123
return ",".join(join_tuple(v) for v in value)
123124

124125

126+
def _serialize_grouped_page_ranges(
    value: list[list[str | int | tuple[str | int, ...]]] | None,
) -> list[str] | None:
    """Serialize each page group with ``_serialize_page_ranges``.

    Args:
        value: The page groups, one inner list of page selectors per group,
            or ``None`` when no groups were supplied.

    Returns:
        One serialized entry per group, in input order, or ``None`` when
        ``value`` is ``None``.
    """
    # The field this serializer is attached to is optional, so a dump that
    # does not exclude ``None`` values hands ``None`` to this function.
    if value is None:
        return None
    return [_serialize_page_ranges(group) for group in value]
130+
131+
125132
def _serialize_redactions(value: list[_PdfRedactionVariant]) -> str:
    """Encode the redaction entries as a single compact JSON array string."""
    # Each entry is dumped in JSON mode without its ``None`` fields; the
    # explicit separators suppress the spaces ``json.dumps`` adds by default.
    return json.dumps(
        [item.model_dump(mode="json", exclude_none=True) for item in value],
        separators=(",", ":"),
    )
@@ -182,6 +189,17 @@ def _ascending_page_range(
182189
return range
183190

184191

192+
# Two-element page range in which either end may be the literal "last".
# Raw input is passed through ``_split_page_range_tuple`` before the tuple
# shapes are validated (presumably turning a range string into a pair —
# confirm against that helper).
_PageRangeTupleWithLast = Annotated[
    tuple[PageNumber, PageNumber]
    | tuple[Literal["last"], PageNumber]
    | tuple[PageNumber, Literal["last"]],
    BeforeValidator(_split_page_range_tuple),
]

# One page selector as used by the split and merge payloads: a keyword
# ("even", "odd", "last"), a single page number, or a two-element range.
SplitMergePageRange = (
    Literal["even", "odd", "last"] | PageNumber | _PageRangeTupleWithLast
)
202+
185203
_AscendingPageRangeTuple = Annotated[
186204
tuple[PageNumber, PageNumber] | tuple[PageNumber, Literal["last"]],
187205
BeforeValidator(_split_page_range_tuple),
@@ -349,6 +367,121 @@ class PngPdfRestPayload(BasePdfRestGraphicPayload[Literal["rgb", "rgba", "gray"]
349367
color_model: Annotated[Literal["rgb", "rgba", "gray"], Field(default="rgb")]
350368

351369

370+
# Page selection used for a merge source that does not name any pages:
# first page through "last".
_DEFAULT_FULL_DOCUMENT_RANGE: list[str] = ["1-last"]
371+
372+
373+
class PdfSplitPayload(BaseModel):
    """Adapt caller options into a pdfRest-ready split request payload.

    Input keys may be given under either their Python names or the aliases
    listed on each field; serialization aliases are ``id``, ``pages`` and
    ``output``.
    """

    # Exactly one PDF (min_length == max_length == 1). Accepted as "file" or
    # "files", passed through ``_ensure_list`` before validation, restricted
    # to the "application/pdf" MIME type, and serialized through
    # ``_serialize_as_first_file_id`` under the alias "id".
    files: Annotated[
        list[PdfRestFile],
        Field(
            min_length=1,
            max_length=1,
            validation_alias=AliasChoices("file", "files"),
            serialization_alias="id",
        ),
        BeforeValidator(_ensure_list),
        AfterValidator(
            _allowed_mime_types("application/pdf", error_msg="Must be a PDF file")
        ),
        PlainSerializer(_serialize_as_first_file_id),
    ]
    # Optional list of page groups; each group is itself a list of page
    # selectors. Accepted as "pages" or "page_groups", must contain at least
    # one group when provided, and is serialized group by group through
    # ``_serialize_grouped_page_ranges`` under the alias "pages".
    # NOTE(review): the order of the BeforeValidator entries determines the
    # order in which the helpers see the raw input — do not reorder.
    page_groups: Annotated[
        list[
            Annotated[
                list[SplitMergePageRange],
                BeforeValidator(_ensure_list),
                BeforeValidator(_split_comma_string),
            ]
        ]
        | None,
        Field(
            default=None,
            validation_alias=AliasChoices("pages", "page_groups"),
            serialization_alias="pages",
            min_length=1,
        ),
        BeforeValidator(_ensure_list),
        BeforeValidator(_int_to_string),
        PlainSerializer(_serialize_grouped_page_ranges),
    ]
    # Optional, non-empty output prefix checked by ``_validate_output_prefix``
    # and serialized under the alias "output".
    output_prefix: Annotated[
        str | None,
        Field(serialization_alias="output", min_length=1, default=None),
        AfterValidator(_validate_output_prefix),
    ] = None
414+
415+
416+
class _PdfMergeItem(BaseModel):
    """One merge source: a PDF file plus the pages to take from it.

    Besides a mapping with ``file`` and optional ``pages`` keys, a bare
    ``PdfRestFile`` or a ``(file, pages)`` tuple is accepted and normalized
    by ``_transform_input``. When no pages are given (key absent or
    ``None``), the full-document default range is used.
    """

    # The source document; must have the "application/pdf" MIME type.
    file: Annotated[
        PdfRestFile,
        AfterValidator(
            _allowed_mime_types("application/pdf", error_msg="Must be a PDF file")
        ),
    ]
    # Non-empty list of page selectors, serialized via ``_serialize_page_ranges``.
    pages: Annotated[
        list[SplitMergePageRange],
        Field(
            min_length=1,
            # ``list(...)`` already yields a fresh list per instance, so the
            # shared module-level default is never aliased.
            default_factory=lambda: list(_DEFAULT_FULL_DOCUMENT_RANGE),
        ),
        BeforeValidator(_list_of_strings),
        BeforeValidator(_ensure_list),
        PlainSerializer(_serialize_page_ranges),
    ]

    @model_validator(mode="before")
    @classmethod
    def _transform_input(cls, data: Any) -> Any:
        """Normalize the accepted input shapes into a field mapping.

        Args:
            data: A ``(file, pages)`` tuple, a ``PdfRestFile``, or any other
                value (returned as-is for normal model validation).

        Returns:
            A mapping with ``file`` and, when pages were supplied, ``pages``;
            otherwise ``data`` unchanged.

        Raises:
            ValueError: If a tuple does not contain exactly two items.
        """
        if isinstance(data, tuple):
            if len(data) != 2:
                msg = (
                    "Tuple merge entries must contain exactly two items: (file, pages)."
                )
                raise ValueError(msg)
            file_candidate, pages = data
            data = {"file": file_candidate, "pages": pages}
        elif isinstance(data, PdfRestFile):
            return {"file": data}
        # The public mapping type allows ``pages`` to be ``None``. Treat that
        # like an absent key so the default range applies, and build a new
        # mapping rather than mutating the caller's.
        if isinstance(data, dict) and "pages" in data and data["pages"] is None:
            return {key: value for key, value in data.items() if key != "pages"}
        return data
448+
449+
450+
class PdfMergePayload(BaseModel):
    """Adapt caller options into a pdfRest-ready merge request payload.

    Serialization flattens the list of sources into the parallel ``type``,
    ``pages`` and ``id`` arrays that the request uses.
    """

    # At least two sources; accepted as "sources", "documents" or "files".
    sources: Annotated[
        list[_PdfMergeItem],
        Field(
            min_length=2,
            validation_alias=AliasChoices("sources", "documents", "files"),
        ),
        BeforeValidator(_ensure_list),
    ]
    # Optional, non-empty output prefix serialized under the alias "output".
    output_prefix: Annotated[
        str | None,
        Field(serialization_alias="output", min_length=1, default=None),
        AfterValidator(_validate_output_prefix),
    ] = None

    @model_serializer(mode="wrap")
    def _serialize_pdf_merge_payload(
        self, handler: Callable[[PdfMergePayload], dict[str, Any]]
    ) -> dict[str, Any]:
        """Serialize the fields, then replace ``sources`` with parallel arrays."""
        # Let the default handler run every field serializer first.
        serialized = handler(self)
        # Remove the nested entries; they are re-emitted below as three
        # parallel arrays with one element per source.
        entries = serialized.pop("sources")
        serialized["type"] = ["id"] * len(self.sources)
        serialized["pages"] = [
            entry.get("pages", _DEFAULT_FULL_DOCUMENT_RANGE[0]) for entry in entries
        ]
        serialized["id"] = [entry["file"]["id"] for entry in entries]
        return serialized
483+
484+
352485
class BmpPdfRestPayload(BasePdfRestGraphicPayload[Literal["rgb", "gray"]]):
353486
"""Adapt caller options into a pdfRest-ready BMP request payload."""
354487

src/pdfrest/types/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
from .public import (
44
ALL_PDF_INFO_QUERIES,
55
PdfInfoQuery,
6+
PdfMergeInput,
7+
PdfMergeSource,
8+
PdfPageSelection,
69
PdfRedactionInstruction,
710
PdfRedactionPreset,
811
PdfRedactionType,
@@ -12,6 +15,9 @@
1215
__all__ = [
1316
"ALL_PDF_INFO_QUERIES",
1417
"PdfInfoQuery",
18+
"PdfMergeInput",
19+
"PdfMergeSource",
20+
"PdfPageSelection",
1521
"PdfRGBColor",
1622
"PdfRedactionInstruction",
1723
"PdfRedactionPreset",

src/pdfrest/types/public.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,22 @@
22

33
from __future__ import annotations
44

5-
from typing import Literal, cast, get_args
5+
from collections.abc import Sequence
6+
from typing import TYPE_CHECKING, Any, Literal, cast, get_args
67

7-
from typing_extensions import TypedDict
8+
from typing_extensions import Required, TypedDict
9+
10+
if TYPE_CHECKING:
11+
from pdfrest.models import PdfRestFile
12+
else: # pragma: no cover - used only for typing at runtime
13+
PdfRestFile = Any
814

915
__all__ = (
1016
"ALL_PDF_INFO_QUERIES",
1117
"PdfInfoQuery",
18+
"PdfMergeInput",
19+
"PdfMergeSource",
20+
"PdfPageSelection",
1221
"PdfRGBColor",
1322
"PdfRedactionInstruction",
1423
"PdfRedactionPreset",
@@ -77,3 +86,13 @@ class PdfRedactionInstruction(TypedDict):
7786

7887

7988
PdfRGBColor = tuple[int, int, int]
89+
90+
# A page selection: a single string or integer, or a sequence of them.
PdfPageSelection = str | int | Sequence[str | int]
91+
92+
93+
class PdfMergeSource(TypedDict, total=False):
    """Mapping form of a merge source: a file plus an optional page selection."""

    # The source file; the only key that must always be present.
    file: Required[PdfRestFile]
    # Pages to take from ``file``; the key may be omitted or set to ``None``.
    pages: PdfPageSelection | None
96+
97+
98+
# Accepted shapes for one merge source: a bare file, a ``PdfMergeSource``
# mapping, or a ``(file, pages)`` tuple.
PdfMergeInput = PdfRestFile | PdfMergeSource | tuple[PdfRestFile, PdfPageSelection]

0 commit comments

Comments
 (0)