Skip to content

Commit 9d54c70

Browse files
committed
perf(pdf-server): lazy form extraction via range transport + incremental viewer scans
Server: display_pdf now opens the document via PDFDataRangeTransport (disableAutoFetch) and only runs the per-page form/annotation walk when getFieldObjects() is non-empty, so form-free PDFs are probed with ~10-25% of bytes instead of a full download. The unused viewFieldInfo Map is removed. Viewer: getDocument sets disableAutoFetch/disableStream; baseline annotation scan and field-name mapping run lazily per rendered page instead of walking every page after load, so first paint no longer schedules a full-file pull. E2E: new range-counting HTTPS fixture (W-9 for forms, generated text+image PDF for no-forms) with stallAfterBytes control, and four regression tests asserting form fields are returned, <30% served on no-forms display_pdf, first page renders while later ranges are stalled, and overlap stays bounded.
1 parent 30a78b6 commit 9d54c70

7 files changed

Lines changed: 541 additions & 82 deletions

File tree

examples/pdf-server/server.ts

Lines changed: 64 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import {
3131
import "./pdfjs-polyfill.js";
3232
import {
3333
getDocument,
34+
PDFDataRangeTransport,
3435
VerbosityLevel,
3536
version as PDFJS_VERSION,
3637
} from "pdfjs-dist/legacy/build/pdf.mjs";
@@ -368,9 +369,6 @@ export const viewSourcePaths = new Map<string, string>();
368369
/** Valid form field names per viewer UUID (populated during display_pdf) */
369370
const viewFieldNames = new Map<string, Set<string>>();
370371

371-
/** Detailed form field info per viewer UUID (populated during display_pdf) */
372-
const viewFieldInfo = new Map<string, FormFieldInfo[]>();
373-
374372
/**
375373
* Active fs.watch per view. Only created for local files when interact is
376374
* enabled (stdio). Watcher is re-established on `rename` events to survive
@@ -387,7 +385,7 @@ const viewFileWatches = new Map<string, ViewFileWatch>();
387385
/**
388386
* Per-view heartbeat. THIS is what the sweep iterates — not commandQueues.
389387
*
390-
* Why not commandQueues: display_pdf populates viewFieldNames/viewFieldInfo/
388+
* Why not commandQueues: display_pdf populates viewFieldNames/
391389
* viewFileWatches but never touches commandQueues (only enqueueCommand does,
392390
* and it's triply gated). And dequeueCommands deletes the entry on every poll,
393391
* so even when it exists the sweep's TTL window is ~200ms wide. Net effect:
@@ -409,7 +407,6 @@ function pruneStaleQueues(): void {
409407
viewLastActivity.delete(uuid);
410408
commandQueues.delete(uuid);
411409
viewFieldNames.delete(uuid);
412-
viewFieldInfo.delete(uuid);
413410
viewsPolled.delete(uuid);
414411
viewSourcePaths.delete(uuid);
415412
stopFileWatch(uuid);
@@ -882,6 +879,28 @@ export function createPdfCache(
882879
};
883880
}
884881

882+
/**
883+
* pdf.js range transport backed by {@link PdfCache.readPdfRange}. Lets
884+
* getDocument() fetch only the byte ranges it needs (xref, /AcroForm dict)
885+
* instead of the whole file. With disableAutoFetch, a PDF without form
886+
* fields is opened with only ~10-25% of bytes fetched.
887+
*/
888+
export class PdfCacheRangeTransport extends PDFDataRangeTransport {
889+
constructor(
890+
private url: string,
891+
length: number,
892+
private readPdfRange: PdfCache["readPdfRange"],
893+
) {
894+
super(length, null);
895+
}
896+
override requestDataRange(begin: number, end: number): void {
897+
this.readPdfRange(this.url, begin, end - begin).then(
898+
({ data }) => this.onDataRange(begin, data),
899+
() => this.abort(),
900+
);
901+
}
902+
}
903+
885904
// =============================================================================
886905
// MCP Roots
887906
// =============================================================================
@@ -1020,19 +1039,23 @@ async function extractFormFieldInfo(
10201039
return fields;
10211040
}
10221041

1023-
async function extractFormSchema(pdfDoc: PDFDocumentProxy): Promise<{
1042+
async function extractFormSchema(
1043+
pdfDoc: PDFDocumentProxy,
1044+
fieldObjects?: Record<string, PdfJsFieldObject[]> | null,
1045+
): Promise<{
10241046
type: "object";
10251047
properties: Record<string, PrimitiveSchemaDefinition>;
10261048
required?: string[];
10271049
} | null> {
1028-
let fieldObjects: Record<string, PdfJsFieldObject[]> | null;
1029-
try {
1030-
fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
1031-
string,
1032-
PdfJsFieldObject[]
1033-
> | null;
1034-
} catch {
1035-
return null;
1050+
if (fieldObjects === undefined) {
1051+
try {
1052+
fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
1053+
string,
1054+
PdfJsFieldObject[]
1055+
> | null;
1056+
} catch {
1057+
return null;
1058+
}
10361059
}
10371060
if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
10381061
return null;
@@ -1434,7 +1457,7 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
14341457
const { totalBytes } = await readPdfRange(normalized, 0, 1);
14351458
const uuid = randomUUID();
14361459
// Start the heartbeat now so the sweep can clean up viewFieldNames/
1437-
// viewFieldInfo/viewFileWatches even if no interact calls ever happen.
1460+
// viewFileWatches even if no interact calls ever happen.
14381461
if (!disableInteract) touchView(uuid);
14391462

14401463
// Check writability (governs save button; see isWritablePath doc).
@@ -1462,21 +1485,36 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
14621485
}
14631486
}
14641487

1465-
// Extract form field schema + detailed field info from a single
1466-
// download/parse pass.
1488+
// Extract form field schema + detailed field info via range transport so
1489+
// PDFs without forms only fetch the trailer/xref/catalog (~10-25% of bytes).
1490+
// PDFs with forms still pull most of the file once getAnnotations walks
1491+
// every page, but those are typically small.
14671492
let formSchema: Awaited<ReturnType<typeof extractFormSchema>> = null;
14681493
let fieldInfo: FormFieldInfo[] = [];
14691494
try {
1470-
const { data } = await readPdfRange(normalized, 0, totalBytes);
14711495
const pdfDoc = await getDocument({
1472-
data,
1496+
range: new PdfCacheRangeTransport(
1497+
normalized,
1498+
totalBytes,
1499+
readPdfRange,
1500+
),
1501+
length: totalBytes,
1502+
disableAutoFetch: true,
1503+
disableStream: true,
1504+
rangeChunkSize: 64 * 1024,
14731505
standardFontDataUrl: STANDARD_FONT_DATA_URL,
14741506
StandardFontDataFactory: FetchStandardFontDataFactory,
14751507
verbosity: VerbosityLevel.ERRORS,
14761508
}).promise;
14771509
try {
1478-
formSchema = await extractFormSchema(pdfDoc);
1479-
fieldInfo = await extractFormFieldInfo(pdfDoc);
1510+
const fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
1511+
string,
1512+
PdfJsFieldObject[]
1513+
> | null;
1514+
if (fieldObjects && Object.keys(fieldObjects).length > 0) {
1515+
formSchema = await extractFormSchema(pdfDoc, fieldObjects);
1516+
fieldInfo = await extractFormFieldInfo(pdfDoc);
1517+
}
14801518
} finally {
14811519
pdfDoc.destroy();
14821520
}
@@ -1486,14 +1524,11 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
14861524
if (formSchema) {
14871525
viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties)));
14881526
}
1489-
if (fieldInfo.length > 0) {
1490-
viewFieldInfo.set(uuid, fieldInfo);
1491-
if (!viewFieldNames.has(uuid)) {
1492-
viewFieldNames.set(
1493-
uuid,
1494-
new Set(fieldInfo.map((f) => f.name).filter(Boolean)),
1495-
);
1496-
}
1527+
if (fieldInfo.length > 0 && !viewFieldNames.has(uuid)) {
1528+
viewFieldNames.set(
1529+
uuid,
1530+
new Set(fieldInfo.map((f) => f.name).filter(Boolean)),
1531+
);
14971532
}
14981533

14991534
// Elicit form field values if requested and client supports it

examples/pdf-server/src/mcp-app.ts

Lines changed: 77 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,11 @@ const imageCache = new Map<string, HTMLImageElement>();
150150

151151
/** Annotations imported from the PDF file (baseline for diff computation). */
152152
let pdfBaselineAnnotations: PdfAnnotationDef[] = [];
153+
/** Pages whose native annotations have already been imported into the baseline. */
154+
const baselineScannedPages = new Set<number>();
155+
/** Native-annotation ids the user deleted (from restored localStorage diff) —
156+
* the lazy per-page scan must NOT re-add these to annotationMap. */
157+
const restoredRemovedIds = new Set<string>();
153158

154159
// Dirty flag — tracks unsaved local changes
155160
let isDirty = false;
@@ -2679,52 +2684,49 @@ function annotationStorageKey(): string | null {
26792684
}
26802685

26812686
/**
2682-
* Import annotations from the loaded PDF to establish the baseline.
2683-
* These are the annotations that exist in the PDF file itself.
2687+
* Import one page's native annotations into the baseline. Called lazily from
2688+
* renderPage() so we don't walk every page (and pull most of the file via
2689+
* range requests) before the user sees anything. Idempotent per page.
26842690
*/
2685-
async function loadBaselineAnnotations(
2686-
doc: pdfjsLib.PDFDocumentProxy,
2687-
): Promise<void> {
2688-
pdfBaselineAnnotations = [];
2689-
for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
2690-
try {
2691-
const page = await doc.getPage(pageNum);
2692-
const annotations = await page.getAnnotations();
2693-
for (let i = 0; i < annotations.length; i++) {
2694-
const ann = annotations[i];
2695-
const def = importPdfjsAnnotation(ann, pageNum, i);
2696-
if (def) {
2697-
pdfBaselineAnnotations.push(def);
2698-
// Add to annotationMap if not already present (from localStorage restore)
2699-
if (!annotationMap.has(def.id)) {
2700-
annotationMap.set(def.id, { def, elements: [] });
2701-
}
2702-
} else if (ann.annotationType !== 20) {
2703-
// Widget (type 20) is expected to be skipped; anything else we
2704-
// don't import will still be painted by page.render() onto the
2705-
// canvas as unselectable pixels. Log so we can diagnose
2706-
// "ghost annotations" (visible but not in panel, not clickable).
2707-
log.info(
2708-
`[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
2709-
`type=${ann.annotationType}`,
2710-
`subtype=${ann.subtype ?? "?"}`,
2711-
`name=${ann.name ?? "?"}`,
2712-
`rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
2713-
);
2714-
}
2691+
function scanPageBaselineAnnotations(
2692+
pageNum: number,
2693+
annotations: unknown[],
2694+
): void {
2695+
if (baselineScannedPages.has(pageNum)) return;
2696+
baselineScannedPages.add(pageNum);
2697+
let imported = 0;
2698+
for (let i = 0; i < annotations.length; i++) {
2699+
const ann = annotations[i] as {
2700+
annotationType?: number;
2701+
subtype?: string;
2702+
name?: string;
2703+
rect?: number[];
2704+
};
2705+
const def = importPdfjsAnnotation(ann, pageNum, i);
2706+
if (def) {
2707+
pdfBaselineAnnotations.push(def);
2708+
imported++;
2709+
if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
2710+
annotationMap.set(def.id, { def, elements: [] });
27152711
}
2716-
} catch (err) {
2717-
// Log the error — a thrown import for one annotation silently
2718-
// drops the REST of that page's annotations too.
2712+
} else if (ann.annotationType !== 20) {
2713+
// Widget (type 20) is expected to be skipped; anything else we
2714+
// don't import will still be painted by page.render() onto the
2715+
// canvas as unselectable pixels. Log so we can diagnose
2716+
// "ghost annotations" (visible but not in panel, not clickable).
27192717
log.info(
2720-
`[WARN] Baseline: page ${pageNum} annotation import failed:`,
2721-
err,
2718+
`[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
2719+
`type=${ann.annotationType}`,
2720+
`subtype=${ann.subtype ?? "?"}`,
2721+
`name=${ann.name ?? "?"}`,
2722+
`rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
27222723
);
27232724
}
27242725
}
2725-
log.info(
2726-
`Loaded ${pdfBaselineAnnotations.length} baseline annotations from PDF`,
2727-
);
2726+
if (imported > 0) {
2727+
updateAnnotationsBadge();
2728+
renderAnnotationPanel();
2729+
}
27282730
}
27292731

27302732
function persistAnnotations(): void {
@@ -2765,11 +2767,11 @@ function restoreAnnotations(): void {
27652767
const diff = deserializeDiff(raw);
27662768

27672769
// Merge baseline + diff. The loop below is add-only, so we MUST also
2768-
// delete: loadBaselineAnnotations() runs between the two restore calls
2769-
// and re-seeds annotationMap with every baseline id — including the
2770-
// ones in diff.removed. Without this, the zombie survives the restore,
2771-
// and the next persistAnnotations() sees it in currentIds → computeDiff
2772-
// produces removed=[] → the deletion is permanently lost from storage.
2770+
// delete: the per-page baseline scan re-seeds annotationMap with every
2771+
// native id it encounters — including ones in diff.removed. Without the
2772+
// deletes here AND the restoredRemovedIds tombstones below, the zombie
2773+
// survives, and the next persistAnnotations() sees it in currentIds →
2774+
// computeDiff produces removed=[] → the deletion is permanently lost.
27732775
const merged = mergeAnnotations(pdfBaselineAnnotations, diff);
27742776
for (const def of merged) {
27752777
if (!annotationMap.has(def.id)) {
@@ -2778,6 +2780,9 @@ function restoreAnnotations(): void {
27782780
}
27792781
for (const id of diff.removed) {
27802782
annotationMap.delete(id);
2783+
// Tombstone so the lazy per-page baseline scan (which runs AFTER this
2784+
// restore) doesn't resurrect it.
2785+
restoredRemovedIds.add(id);
27812786
}
27822787

27832788
// Restore form fields
@@ -2869,6 +2874,14 @@ async function buildFieldNameMap(
28692874
// getFieldObjects may fail on some PDFs
28702875
}
28712876

2877+
// No AcroForm → nothing to map. Skip the per-page widget walk so form-free
2878+
// PDFs (the common large case) don't pull every page after first paint.
2879+
// getFieldObjects() itself only reads the catalog/AcroForm dict via range
2880+
// transport, so this gate is cheap.
2881+
if (!cachedFieldObjects || Object.keys(cachedFieldObjects).length === 0) {
2882+
return false;
2883+
}
2884+
28722885
// Scan every page's widget annotations to collect the CORRECT storage keys,
28732886
// plus labels, pages, positions, AND fieldValue (what the widget renders
28742887
// — which can differ from getFieldObjects().value if the PDF is internally
@@ -3362,6 +3375,9 @@ async function renderPage() {
33623375
formLayerEl.style.setProperty("--total-scale-factor", `${scale}`);
33633376
try {
33643377
const annotations = await page.getAnnotations();
3378+
// Lazy baseline import — piggyback on the annotations we just fetched
3379+
// for this page instead of walking all pages upfront.
3380+
scanPageBaselineAnnotations(pageToRender, annotations);
33653381
if (annotations.length > 0) {
33663382
const linkService = {
33673383
getDestinationHash: () => "#",
@@ -4406,6 +4422,8 @@ async function reloadPdf(): Promise<void> {
44064422
undoStack.length = 0;
44074423
redoStack.length = 0;
44084424
pdfBaselineAnnotations = [];
4425+
baselineScannedPages.clear();
4426+
restoredRemovedIds.clear();
44094427
pdfBaselineFormValues.clear();
44104428
pageTextCache.clear();
44114429
pageTextItemsCache.clear();
@@ -4449,11 +4467,11 @@ async function reloadPdf(): Promise<void> {
44494467
log.info("PDF reloaded:", totalPages, "pages,", totalBytes, "bytes");
44504468

44514469
showViewer();
4452-
// Render immediately — annotation/form scans below are O(numPages) and
4453-
// do NOT block the canvas. See same pattern in the initial-load path.
4470+
// Render immediately — baseline-annotation scan now happens per-page
4471+
// inside renderPage(); buildFieldNameMap below early-returns when no
4472+
// AcroForm is present. See same pattern in the initial-load path.
44544473
await renderPage();
44554474

4456-
await loadBaselineAnnotations(document);
44574475
const seeded = await buildFieldNameMap(document);
44584476
syncFormValuesToStorage();
44594477
if (seeded) await renderPage();
@@ -4509,6 +4527,11 @@ async function loadPdfProgressively(urlToLoad: string): Promise<{
45094527
const loadingTask = pdfjsLib.getDocument({
45104528
range: transport,
45114529
standardFontDataUrl: STANDARD_FONT_DATA_URL,
4530+
// Only fetch ranges renderPage()/getFieldObjects() actually ask for.
4531+
// Without these, pdfjs background-prefetches the whole file regardless of
4532+
// the per-page lazy scans below.
4533+
disableAutoFetch: true,
4534+
disableStream: true,
45124535
});
45134536

45144537
try {
@@ -4673,12 +4696,13 @@ app.ontoolresult = async (result: CallToolResult) => {
46734696
scale = fitScale;
46744697
log.info("Initial fit scale:", scale);
46754698
}
4676-
await renderPage();
4677-
4678-
// Import annotations from the PDF to establish baseline
4679-
await loadBaselineAnnotations(document);
4680-
// Restore any persisted user diff
4699+
// Restore any persisted user diff BEFORE first render so the per-page
4700+
// baseline scan inside renderPage() can honour the removed-id tombstones
4701+
// and not resurrect annotations the user deleted last session.
4702+
// restoreAnnotations is sync (localStorage read) so first paint is not
4703+
// delayed.
46814704
restoreAnnotations();
4705+
await renderPage();
46824706

46834707
// Build field name → annotation ID mapping for form filling
46844708
const seeded = await buildFieldNameMap(document);

playwright.config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ export default defineConfig({
4747
env: {
4848
...process.env,
4949
EXAMPLE: process.env.EXAMPLE ?? "",
50+
// pdf-incremental-load.spec.ts serves test PDFs over self-signed HTTPS;
51+
// the pdf-server's upstream fetch must accept that cert.
52+
NODE_TLS_REJECT_UNAUTHORIZED: "0",
5053
},
5154
},
5255
// Snapshot configuration

0 commit comments

Comments
 (0)