chore/change default split page behavior to true (#118)

awalker4 · web-flow · commit eabf1167fa38 · 2024-06-17T16:50:47.000-04:00
* Set the split_pdf_page default to true and run `make client-generate`
locally.
* Update the readme, add another reference back to our docs
* Change some warning logs to info. The user should not be warned about
default behavior for non-PDF files.

# Testing
Use the client locally and verify that split mode is the default, and
that the client behavior is consistent with older versions.

* Set up (or activate) your pyenv for the client: `pyenv virtualenv 3.12
unstructured-client; pyenv activate unstructured-client`
* Check out this branch and install: `pip install -e .`
* Run this sample script in the top level of the client repo. Try
different files in `_sample_docs` and verify that the logging and
results look acceptable.

```
# Manual smoke test: partition one sample document and print the returned
# elements as indented JSON. Run from the top level of the client repo.
import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import operations, shared

api_key = "free-api-key"
filename = "_sample_docs/layout-parser-paper.pdf"

client = UnstructuredClient(api_key_auth=api_key)

# Read the whole document up front; the upload payload carries raw bytes.
with open(filename, "rb") as sample_doc:
    raw_bytes = sample_doc.read()

files = shared.Files(content=raw_bytes, file_name=filename)

# split_pdf_page is deliberately not passed, so the client default applies.
params = shared.PartitionParameters(
    files=files,
    strategy=shared.Strategy.AUTO,
)
request = operations.PartitionRequest(params)

try:
    # Serialization stays inside the try so that any failure, whether from
    # the request or from json.dumps, is reported the same way.
    response = client.general.partition(request)
    print(json.dumps(response.elements, indent=4))
except Exception as err:
    print(err)
```
diff --git a/.speakeasy/gen.lock b/.speakeasy/gen.lock
@@ -1,12 +1,12 @@
 lockVersion: 2.0.0
 id: 8b5fa338-9106-4734-abf0-e30d67044a90
 management:
-  docChecksum: 5365c99c52e23b044ef9916ecf51b1a9
+  docChecksum: c7e23b3b8242eb21eccb2091bcc57c72
   docVersion: 1.0.35
   speakeasyVersion: 1.308.1
   generationVersion: 2.342.6
-  releaseVersion: 0.23.5
-  configChecksum: e210d7bff3afd386269cb7c6adeef630
+  releaseVersion: 0.23.6
+  configChecksum: 4e2e510c7f4b61e04b61acf7de2939a3
   repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
   repoSubDirectory: .
   installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
diff --git a/README.md b/README.md
@@ -72,7 +72,9 @@ Refer to the [API parameters page](https://docs.unstructured.io/api-reference/ap
 
 #### Splitting PDF by pages
 
-In order to speed up processing of long PDF files, `split_pdf_page` can be set to `True` (defaults to `False`). It will cause the PDF to be split at client side, before sending to API, and combining individual responses as single result. This parameter will affect only PDF files, no need to disable it for other filetypes.
+See [page splitting](https://docs.unstructured.io/api-reference/api-services/sdk#page-splitting) for more details.
+
+In order to speed up processing of large PDF files, the client splits up PDFs into smaller files, sends these to the API concurrently, and recombines the results. `split_pdf_page` can be set to `False` to disable this.
 
 The amount of workers utilized for splitting PDFs is dictated by the `split_pdf_concurrency_level` parameter, with a default of 5 and a maximum of 15 to keep resource usage and costs in check. The splitting process leverages `asyncio` to manage concurrency effectively.
 The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document. Because the splitting process uses `asyncio` the client can encouter event loop issues if it is nested in another async runner, like running in a `gevent` spawned task. Instead, this is safe to run in multiprocessing workers (e.g., using `multiprocessing.Pool` with `fork` context).
@@ -83,7 +85,6 @@ req = shared.PartitionParameters(
     files=files,
     strategy="fast",
     languages=["eng"],
-    split_pdf_page=True,
     split_pdf_concurrency_level=8
 )
 ```
diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py
@@ -276,7 +276,7 @@ def test_unit_is_pdf_invalid_extension(caplog):
     """Test is pdf method returns False for file with invalid extension."""
     file = shared.Files(b"txt_content", "test_file.txt")
 
-    with caplog.at_level(logging.WARNING):
+    with caplog.at_level(logging.INFO):
         result = pdf_utils.is_pdf(file)
 
     assert result is False
diff --git a/gen.yaml b/gen.yaml
@@ -10,7 +10,7 @@ generation:
   auth:
     oAuth2ClientCredentialsEnabled: false
 python:
-  version: 0.23.5
+  version: 0.23.6
   additionalDependencies:
     dependencies:
       deepdiff: '>=6.0'
diff --git a/overlay_client.yaml b/overlay_client.yaml
@@ -10,7 +10,7 @@ actions:
           "type": "boolean",
           "title": "Split Pdf Page",
           "description": "This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.",
-          "default": false,
+          "default": true,
         }
   - target: $["components"]["schemas"]["partition_parameters"]["properties"]
     update:
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 
 setuptools.setup(
     name='unstructured-client',
-    version='0.23.5',
+    version='0.23.6',
     author='Unstructured',
     description='Python Client SDK for Unstructured API',
     license = 'MIT',
diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py
@@ -59,7 +59,7 @@ def is_pdf(file: shared.Files) -> bool:
         True if the file is a PDF, False otherwise.
     """
     if not file.file_name.endswith(".pdf"):
-        logger.warning("Given file doesn't have '.pdf' extension. Continuing without splitting.")
+        logger.info("Given file doesn't have '.pdf' extension, so splitting is not enabled.")
         return False
 
     try:
diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py
@@ -135,7 +135,7 @@ def before_request(
             or not isinstance(file, shared.Files)
             or not pdf_utils.is_pdf(file)
         ):
-            logger.warning("File could not be split. Partitioning without split.")
+            logger.info("Partitioning without split.")
             return request
 
         starting_page_number = form_utils.get_starting_page_number(
@@ -160,7 +160,7 @@ def before_request(
         logger.info("Determined optimal split size of %d pages.", split_size)
 
         if split_size >= len(pdf.pages):
-            logger.warning(
+            logger.info(
                 "Document has too few pages (%d) to be split efficiently. Partitioning without split.",
                 len(pdf.pages),
             )
diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py
@@ -83,7 +83,7 @@ class PartitionParameters:
     r"""The document types that you want to skip table extraction with. Default: []"""
     split_pdf_concurrency_level: Optional[int] = dataclasses.field(default=5, metadata={'multipart_form': { 'field_name': 'split_pdf_concurrency_level' }})
     r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend."""
-    split_pdf_page: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }})
+    split_pdf_page: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }})
     r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend."""
     starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }})
     r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27."""
diff --git a/src/unstructured_client/sdkconfiguration.py b/src/unstructured_client/sdkconfiguration.py
@@ -29,9 +29,9 @@ class SDKConfiguration:
     server: Optional[str] = ''
     language: str = 'python'
     openapi_doc_version: str = '1.0.35'
-    sdk_version: str = '0.23.5'
+    sdk_version: str = '0.23.6'
     gen_version: str = '2.342.6'
-    user_agent: str = 'speakeasy-sdk/python 0.23.5 2.342.6 1.0.35 unstructured-client'
+    user_agent: str = 'speakeasy-sdk/python 0.23.6 2.342.6 1.0.35 unstructured-client'
     retry_config: Optional[RetryConfig] = None
 
     def __post_init__(self):

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ actions:`
`10`	`10`	`"type": "boolean",`
`11`	`11`	`"title": "Split Pdf Page",`
`12`	`12`	`"description": "This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.",`
`13`		`- "default": false,`
	`13`	`+ "default": true,`
`14`	`14`	`}`
`15`	`15`	`- target: $["components"]["schemas"]["partition_parameters"]["properties"]`
`16`	`16`	`update:`