Skip to content

Commit 5237ab3

Browse files
chore: extend logging (#77)
This PR is a clone of Unstructured-IO/unstructured-python-client#96. ### Summary: - Added logs to split pdf hook to improve transparency on the process. - Added retry counter to retry hook logs Tested on _sample_docs/layout-parser-paper-fast.pdf and _sample_docs/layout-parser-paper.pdf. First one will log that it cannot split if run with pdf split.
1 parent 874faf5 commit 5237ab3

File tree

8 files changed

+132
-22
lines changed

8 files changed

+132
-22
lines changed

.eslintrc.js

+1
Original file line numberDiff line numberDiff line change
@@ -24,5 +24,6 @@ module.exports = {
2424
"import/no-named-as-default-member": "off",
2525

2626
"import/no-default-export": "error",
27+
"object-curly-spacing": ["error", "always"],
2728
},
2829
};

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,5 @@ node_modules/
1414
# Custom ignores
1515
openapi.json
1616
openapi_client.json
17+
18+
.idea/

src/hooks/custom/LogRetryHook.ts

+24-5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,27 @@ import { AfterErrorContext, AfterErrorHook } from "../types";
55
* after encountering a 5xx error.
66
*/
77
export class LogRetryHook implements AfterErrorHook {
8+
private retriesCounter: Map<string, number> = new Map();
9+
10+
/**
11+
* Log retries to give users visibility into requests.
12+
* @param response - The response object received from the server.
13+
* @param operationID - The unique identifier for the operation being logged.
14+
*/
15+
private logRetries(response: Response | null, operationID: string): void {
16+
if (response && response.status >= 500) {
17+
console.warn(
18+
"Failed to process a request due to API server error with status code %d. " +
19+
"Attempting retry number %d after sleep.",
20+
response.status,
21+
this.retriesCounter.get(operationID)
22+
);
23+
if (response.statusText) {
24+
console.warn("Server message - %s", response.statusText);
25+
}
26+
}
27+
}
28+
829
/**
930
* Executes after an error occurs during a request.
1031
* @param _context - The context object containing information about the request.
@@ -17,11 +38,9 @@ export class LogRetryHook implements AfterErrorHook {
1738
response: Response | null,
1839
error: unknown
1940
): { response: Response | null; error: unknown } {
20-
if (response && response.status >= 500) {
21-
console.warn(
22-
`Got an error: '${response.status} ${response.statusText}'. Retrying request!`
23-
);
24-
}
41+
const currentCount = this.retriesCounter.get(_context.operationID) || 0;
42+
this.retriesCounter.set(_context.operationID, currentCount + 1);
43+
this.logRetries(response, _context.operationID);
2544
return { response, error };
2645
}
2746
}

src/hooks/custom/SplitPdfHook.ts

+48-4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
SDKInitOptions,
1313
} from "../types";
1414
import {
15+
getOptimalSplitSize,
1516
getSplitPdfConcurrencyLevel,
1617
getStartingPageNumber,
1718
loadPdf,
@@ -83,38 +84,76 @@ export class SplitPdfHook
8384
(formData.get(PARTITION_FORM_SPLIT_PDF_PAGE_KEY) as string) ?? "false"
8485
);
8586
const file = formData.get(PARTITION_FORM_FILES_KEY) as File | null;
86-
const startingPageNumber = getStartingPageNumber(formData);
8787

8888
if (!splitPdfPage) {
89+
console.info("Partitioning without split.")
8990
return request;
9091
}
9192

93+
console.info("Preparing to split document for partition.")
9294
if (!this.client) {
93-
console.warn("HTTP client not accessible! Continuing without splitting.");
95+
console.warn("HTTP client not accessible! Partitioning without split.");
9496
return request;
9597
}
9698

9799
const [error, pdf, pagesCount] = await loadPdf(file);
98100
if (file === null || pdf === null || error) {
101+
console.warn("File could not be split. Partitioning without split.")
99102
return request;
100103
}
101104

102105
if (pagesCount < MIN_PAGES_PER_THREAD) {
103106
console.warn(
104-
`PDF has less than ${MIN_PAGES_PER_THREAD} pages. Continuing without splitting.`
107+
`PDF has less than ${MIN_PAGES_PER_THREAD} pages. Partitioning without split.`
105108
);
106109
return request;
107110
}
108111

112+
const startingPageNumber = getStartingPageNumber(formData);
113+
console.info("Starting page number set to %d", startingPageNumber);
114+
109115
const concurrencyLevel = getSplitPdfConcurrencyLevel(formData);
110-
const splits = await splitPdf(pdf, concurrencyLevel);
116+
console.info("Concurrency level set to %d", concurrencyLevel)
117+
118+
const splitSize = await getOptimalSplitSize(pagesCount, concurrencyLevel);
119+
console.info("Determined optimal split size of %d pages.", splitSize)
120+
121+
if (splitSize >= pagesCount) {
122+
console.warn(
123+
"Document has too few pages (%d) to be split efficiently. Partitioning without split.",
124+
pagesCount,
125+
)
126+
return request;
127+
}
128+
129+
const splits = await splitPdf(pdf, splitSize);
130+
const numberOfSplits = splits.length
131+
console.info(
132+
"Document split into %d, %d-paged sets.",
133+
numberOfSplits,
134+
splitSize,
135+
)
136+
console.info(
137+
"Partitioning %d, %d-paged sets.",
138+
numberOfSplits,
139+
splitSize,
140+
)
141+
111142
const headers = prepareRequestHeaders(request);
112143

113144
const requests: Request[] = [];
114145

146+
let setIndex = 1
115147
for (const { content, startPage } of splits) {
116148
// Both startPage and startingPageNumber are 1-based, so we need to subtract 1
117149
const firstPageNumber = startPage + startingPageNumber - 1;
150+
console.info(
151+
"Partitioning set #%d (pages %d-%d).",
152+
setIndex,
153+
firstPageNumber,
154+
Math.min(firstPageNumber + splitSize - 1, pagesCount),
155+
);
156+
118157
const body = await prepareRequestBody(
119158
formData,
120159
content,
@@ -126,6 +165,7 @@ export class SplitPdfHook
126165
body,
127166
});
128167
requests.push(req);
168+
setIndex+=1;
129169
}
130170

131171
this.partitionResponses[operationID] = new Array(requests.length);
@@ -173,6 +213,8 @@ export class SplitPdfHook
173213

174214
this.clearOperation(operationID);
175215

216+
console.info("Successfully processed the request.")
217+
176218
return new Response(body, {
177219
headers: headers,
178220
status: response.status,
@@ -199,6 +241,7 @@ export class SplitPdfHook
199241
const responses = await this.awaitAllRequests(operationID);
200242

201243
if (!responses?.length) {
244+
console.error("Failed to process the request.");
202245
this.clearOperation(operationID);
203246
return { response, error };
204247
}
@@ -214,6 +257,7 @@ export class SplitPdfHook
214257
});
215258

216259
this.clearOperation(operationID);
260+
console.info("Successfully processed the request.");
217261

218262
return { response: finalResponse, error: null };
219263
}

src/hooks/custom/utils/pdf.ts

+24-9
Original file line numberDiff line numberDiff line change
@@ -38,28 +38,43 @@ export async function pdfPagesToBlob(
3838
});
3939
}
4040

41+
/**
42+
* Calculates the optimal split size for processing pages with a specified concurrency level.
43+
*
44+
* @param pagesCount - The total number of pages to process.
45+
* @param concurrencyLevel - The level of concurrency to be used.
46+
* @returns A promise that resolves to the optimal number of pages per split,
47+
* ensuring it does not exceed the maximum or fall below the minimum threshold.
48+
*/
49+
export async function getOptimalSplitSize(
50+
pagesCount: number,
51+
concurrencyLevel: number
52+
): Promise<number> {
53+
let splitSize = MAX_PAGES_PER_THREAD;
54+
if (pagesCount < MAX_PAGES_PER_THREAD * concurrencyLevel) {
55+
splitSize = Math.ceil(pagesCount / concurrencyLevel);
56+
}
57+
splitSize = Math.max(splitSize, MIN_PAGES_PER_THREAD);
58+
59+
return splitSize;
60+
}
61+
62+
4163
/**
4264
* Retrieves an array of splits, with the start and end page numbers, from a PDF file.
4365
* Distribution of pages per split is made in as much uniform manner as possible.
4466
*
4567
* @param pdf - The PDF file to extract pages from.
46-
* @param concurrencyLevel - Number of maximum parallel requests.
68+
* @param splitSize - The number of pages per split.
4769
* @returns A promise that resolves to an array of objects containing Blob files and
4870
* start and end page numbers from the original document.
4971
*/
5072
export async function splitPdf(
5173
pdf: PDFDocument,
52-
concurrencyLevel: number
74+
splitSize: number
5375
): Promise<PdfSplit[]> {
5476
const pdfSplits: PdfSplit[] = [];
5577
const pagesCount = pdf.getPages().length;
56-
57-
let splitSize = MAX_PAGES_PER_THREAD;
58-
if (pagesCount < MAX_PAGES_PER_THREAD * concurrencyLevel) {
59-
splitSize = Math.ceil(pagesCount / concurrencyLevel);
60-
}
61-
splitSize = Math.max(splitSize, MIN_PAGES_PER_THREAD);
62-
6378
const numberOfSplits = Math.ceil(pagesCount / splitSize);
6479

6580
for (let i = 0; i < numberOfSplits; ++i) {

src/hooks/custom/utils/request.ts

+8
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,17 @@ export async function prepareResponseBody(
2828
responses: Response[]
2929
): Promise<string> {
3030
const allElements: any[] = [];
31+
let index = 1;
3132
for (const res of responses) {
33+
if (res.status == 200) {
34+
console.info("Successfully partitioned set #%d, elements added to the final result.", index);
35+
} else {
36+
console.warn("Failed to partition set #%d, its elements will be omitted in the final result.", index);
37+
}
38+
3239
const resElements = await res.json();
3340
allElements.push(resElements);
41+
index++;
3442
}
3543
return JSON.stringify(allElements.flat());
3644
}

test/integration/SplitPdfHook.test.ts

+3-1
Original file line numberDiff line numberDiff line change
@@ -170,10 +170,12 @@ describe("SplitPdfHook integration tests check splitted file is same as not spli
170170
});
171171
} catch (e) {
172172
if (!expectedOk) {
173-
expect(consoleErrorSpy).toHaveBeenCalledTimes(1);
173+
expect(consoleErrorSpy).toHaveBeenCalledTimes(2);
174+
expect(consoleErrorSpy).toHaveBeenCalledWith("Failed to process the request.");
174175
expect(consoleWarnSpy).toHaveBeenCalledWith(
175176
"Attempted to interpret file as pdf, but error arose when splitting by pages. Reverting to non-split pdf handling path."
176177
);
178+
expect(consoleWarnSpy).toHaveBeenCalledWith("File could not be split. Partitioning without split.");
177179
return;
178180
} else {
179181
throw e;

test/unit/utils/pdf.test.ts

+22-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import {
66
pdfPagesToBlob,
77
splitPdf,
88
} from "../../../src/hooks/custom/utils";
9+
import { getOptimalSplitSize } from "../../../hooks/custom/utils";
10+
import { MAX_PAGES_PER_THREAD, MIN_PAGES_PER_THREAD } from "../../../hooks/custom/common";
911

1012
describe("Pdf utility functions", () => {
1113
const filename = "test/data/layout-parser-paper.pdf";
@@ -39,17 +41,34 @@ describe("Pdf utility functions", () => {
3941
});
4042
});
4143

44+
describe("getOptimalSplitSize", () => {
45+
it("should return the maximum pages per thread when pagesCount is high", async () => {
46+
const result = await getOptimalSplitSize(100, 4);
47+
expect(result).toBe(MAX_PAGES_PER_THREAD);
48+
});
49+
50+
it("should return the minimum pages per thread when pagesCount is low", async () => {
51+
const result = await getOptimalSplitSize(2, 4);
52+
expect(result).toBe(MIN_PAGES_PER_THREAD);
53+
});
54+
55+
it("should return an appropriate split size for a given pagesCount and concurrencyLevel", async () => {
56+
const result = await getOptimalSplitSize(10, 3);
57+
expect(result).toBe(Math.ceil(10 / 3));
58+
});
59+
});
60+
4261
describe("splitPdf", () => {
4362
it("should split the PDF into one batch", async () => {
44-
const result = await splitPdf(pdf, 1);
63+
const result = await splitPdf(pdf, 16);
4564

4665
expect(result).toHaveLength(1);
4766
expect(result[0]?.startPage).toBe(1);
4867
expect(result[0]?.endPage).toBe(16);
4968
});
5069

5170
it("should split the PDF into 3 batches", async () => {
52-
const result = await splitPdf(pdf, 3);
71+
const result = await splitPdf(pdf, 6);
5372

5473
// Verify the expected behavior
5574
expect(result).toHaveLength(3);
@@ -62,7 +81,7 @@ describe("Pdf utility functions", () => {
6281
});
6382

6483
it("should split the PDF into 4 batches", async () => {
65-
const result = await splitPdf(pdf, 5);
84+
const result = await splitPdf(pdf, 4);
6685

6786
// Verify the expected behavior
6887
expect(result).toHaveLength(4);

0 commit comments

Comments
 (0)