Skip to content

Commit 6452011

Browse files
committed
fix(pdf-server): handle separated field/widget trees in extractFormSchema
pdfjs getFieldObjects() returns, for each field name, the full field-tree array. For PDFs with a separated field/widget structure (pdf-lib, some authoring tools) the typed widget sits at fields[1+] behind a typeless container at fields[0]; the previous code inspected only fields[0], so every such field was skipped. Pick the first entry with a non-empty type instead, falling back to fields[0] when no entry has one. This lets the e2e forms.pdf fixture be fully generated (no checked-in third-party asset on the hot path); fw9.pdf stays as a unit-test fixture for the hierarchical/XFA case.
1 parent 9d54c70 commit 6452011

4 files changed

Lines changed: 107 additions & 18 deletions

File tree

examples/pdf-server/server.test.ts

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@ import os from "node:os";
44
import path from "node:path";
55
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
66
import { InMemoryTransport } from "@modelcontextprotocol/sdk/inMemory.js";
7+
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
8+
import { PDFDocument } from "pdf-lib";
79
import {
810
createPdfCache,
911
createServer,
12+
extractFormSchema,
1013
validateUrl,
1114
isAncestorDir,
1215
allowedLocalFiles,
@@ -289,6 +292,84 @@ describe("PDF Cache with Timeouts", () => {
289292
// through manual testing or E2E tests.
290293
});
291294

295+
describe("extractFormSchema field-tree handling", () => {
296+
async function schemaFor(bytes: Uint8Array) {
297+
const doc = await getDocument({ data: bytes }).promise;
298+
try {
299+
return await extractFormSchema(doc);
300+
} finally {
301+
doc.destroy();
302+
}
303+
}
304+
305+
it("handles pdf-lib separated field/widget structure", async () => {
306+
const d = await PDFDocument.create();
307+
const form = d.getForm();
308+
d.addPage([612, 792]);
309+
form
310+
.createTextField("alpha")
311+
.addToPage(d.getPage(0), { x: 50, y: 700, width: 200, height: 20 });
312+
form
313+
.createCheckBox("agree")
314+
.addToPage(d.getPage(0), { x: 50, y: 660, width: 20, height: 20 });
315+
form
316+
.createDropdown("choice")
317+
.addToPage(d.getPage(0), { x: 50, y: 620, width: 100, height: 20 });
318+
319+
const schema = await schemaFor(await d.save());
320+
expect(schema).not.toBeNull();
321+
expect(schema!.properties.alpha).toEqual({
322+
type: "string",
323+
title: "alpha",
324+
});
325+
expect(schema!.properties.agree).toEqual({
326+
type: "boolean",
327+
title: "agree",
328+
});
329+
expect(schema!.properties.choice.type).toBe("string");
330+
});
331+
332+
it("handles fields with multiple widgets across pages", async () => {
333+
const d = await PDFDocument.create();
334+
const form = d.getForm();
335+
d.addPage([612, 792]);
336+
d.addPage([612, 792]);
337+
const tf = form.createTextField("shared");
338+
tf.addToPage(d.getPage(0), { x: 50, y: 700, width: 200, height: 20 });
339+
tf.addToPage(d.getPage(1), { x: 50, y: 700, width: 200, height: 20 });
340+
341+
const schema = await schemaFor(await d.save());
342+
expect(schema?.properties.shared).toEqual({
343+
type: "string",
344+
title: "shared",
345+
});
346+
});
347+
348+
it("skips container nodes and finds leaf fields (W-9 style)", async () => {
349+
const bytes = fs.readFileSync(
350+
path.join(__dirname, "../../tests/helpers/assets/fw9.pdf"),
351+
);
352+
const doc = await getDocument({ data: new Uint8Array(bytes) }).promise;
353+
try {
354+
const fo = (await doc.getFieldObjects()) as Record<string, unknown[]>;
355+
// Container nodes (no leaf type) should not crash extraction
356+
expect(fo["topmostSubform[0]"]).toBeDefined();
357+
// Schema is null for W-9 (mechanical names), but extraction must not throw
358+
const schema = await extractFormSchema(doc);
359+
expect(schema).toBeNull();
360+
} finally {
361+
doc.destroy();
362+
}
363+
});
364+
365+
it("returns null when no AcroForm present", async () => {
366+
const d = await PDFDocument.create();
367+
d.addPage([612, 792]);
368+
const schema = await schemaFor(await d.save());
369+
expect(schema).toBeNull();
370+
});
371+
});
372+
292373
describe("validateUrl with MCP roots (allowedLocalDirs)", () => {
293374
const savedFiles = new Set(allowedLocalFiles);
294375
const savedDirs = new Set(allowedLocalDirs);

examples/pdf-server/server.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,7 @@ async function extractFormFieldInfo(
10391039
return fields;
10401040
}
10411041

1042-
async function extractFormSchema(
1042+
export async function extractFormSchema(
10431043
pdfDoc: PDFDocumentProxy,
10441044
fieldObjects?: Record<string, PdfJsFieldObject[]> | null,
10451045
): Promise<{
@@ -1063,7 +1063,11 @@ async function extractFormSchema(
10631063

10641064
const properties: Record<string, PrimitiveSchemaDefinition> = {};
10651065
for (const [name, fields] of Object.entries(fieldObjects)) {
1066-
const field = fields[0]; // first widget determines the type
1066+
// pdfjs returns the full field-tree array: for separated structures
1067+
// (pdf-lib) the typed widget is at [1+] behind a container at [0]; for
1068+
// merged/leaf entries (W-9, most authoring tools) it's at [0]. Pick the
1069+
// first entry that actually has a field type.
1070+
const field = fields.find((f) => f.type) ?? fields[0];
10671071
if (!field.editable) continue;
10681072

10691073
switch (field.type) {

tests/e2e/pdf-incremental-load.spec.ts

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,14 @@ test.describe("PDF Server — incremental loading", () => {
7979
}) => {
8080
await displayPdf(page, `${rangeServer.baseUrl}/forms.pdf`);
8181
const sc = await readStructuredContent(page);
82-
// formSchema may be null when field names are mechanical (W-9 uses
83-
// f1_01[0]-style names), but formFields (bounding boxes) is always
84-
// populated when the PDF has an AcroForm.
85-
const fields = sc.formFields as unknown[] | undefined;
86-
expect(fields).toBeDefined();
87-
expect(fields!.length).toBeGreaterThanOrEqual(10);
82+
const fields = sc.formFields as Array<{ name: string }> | undefined;
83+
expect(fields?.map((f) => f.name).sort()).toEqual([
84+
"city",
85+
"email",
86+
"name",
87+
"notes",
88+
"phone",
89+
]);
8890
});
8991

9092
test("display_pdf on a no-forms PDF fetches <30% of the file", async ({
@@ -93,7 +95,7 @@ test.describe("PDF Server — incremental loading", () => {
9395
const fileSize = rangeServer.fileSizes["/noforms.pdf"];
9496
await displayPdf(page, `${rangeServer.baseUrl}/noforms.pdf`);
9597
const sc = await readStructuredContent(page);
96-
expect(sc.formSchema ?? null).toBeNull();
98+
expect(sc.formFields).toBeUndefined();
9799

98100
const { totalBytesServed } = rangeServer.stats();
99101
// Guard against display_pdf downloading the whole file for form analysis.

tests/helpers/range-counting-server.ts

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@
55
*/
66
import https from "node:https";
77
import { execFileSync } from "node:child_process";
8-
import fs, { mkdtempSync, readFileSync } from "node:fs";
8+
import { mkdtempSync, readFileSync } from "node:fs";
99
import { tmpdir } from "node:os";
1010
import path from "node:path";
11-
import { fileURLToPath } from "node:url";
1211
import type { AddressInfo } from "node:net";
1312
import { PDFDocument, StandardFonts } from "pdf-lib";
1413

@@ -96,13 +95,16 @@ function makeRandomJpeg(len: number): Uint8Array {
9695
}
9796

9897
async function buildFormsPdf(): Promise<Uint8Array> {
99-
// pdf-lib generates a separated field/widget tree that pdfjs's
100-
// getFieldObjects() reports without type/editable, so extractFormSchema
101-
// skips them. Use a real-world form PDF (IRS W-9) instead — it's the same
102-
// asset the server is expected to handle in production.
103-
return fs.readFileSync(
104-
path.join(path.dirname(fileURLToPath(import.meta.url)), "assets/fw9.pdf"),
105-
);
98+
const doc = await PDFDocument.create();
99+
const form = doc.getForm();
100+
for (let p = 0; p < 2; p++) doc.addPage([612, 792]);
101+
const [page1] = doc.getPages();
102+
const fields = ["name", "email", "phone", "city", "notes"];
103+
fields.forEach((name, i) => {
104+
const f = form.createTextField(name);
105+
f.addToPage(page1, { x: 100, y: 650 - i * 60, width: 300, height: 24 });
106+
});
107+
return doc.save();
106108
}
107109

108110
function generateSelfSignedCert(): { key: Buffer; cert: Buffer } {

0 commit comments

Comments
 (0)