modelcontextprotocol
diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts
Lines changed: 64 additions & 29 deletions
diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts
Lines changed: 77 additions & 53 deletions
diff --git a/playwright.config.ts b/playwright.config.ts
Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,7 @@ import {
 import "./pdfjs-polyfill.js";
 import {
   getDocument,
+  PDFDataRangeTransport,
   VerbosityLevel,
   version as PDFJS_VERSION,
 } from "pdfjs-dist/legacy/build/pdf.mjs";
@@ -368,9 +369,6 @@ export const viewSourcePaths = new Map<string, string>();
 /** Valid form field names per viewer UUID (populated during display_pdf) */
 const viewFieldNames = new Map<string, Set<string>>();
 
-/** Detailed form field info per viewer UUID (populated during display_pdf) */
-const viewFieldInfo = new Map<string, FormFieldInfo[]>();
-
 /**
  * Active fs.watch per view. Only created for local files when interact is
  * enabled (stdio). Watcher is re-established on `rename` events to survive
@@ -387,7 +385,7 @@ const viewFileWatches = new Map<string, ViewFileWatch>();
 /**
  * Per-view heartbeat. THIS is what the sweep iterates — not commandQueues.
  *
- * Why not commandQueues: display_pdf populates viewFieldNames/viewFieldInfo/
+ * Why not commandQueues: display_pdf populates viewFieldNames/
  * viewFileWatches but never touches commandQueues (only enqueueCommand does,
  * and it's triply gated). And dequeueCommands deletes the entry on every poll,
  * so even when it exists the sweep's TTL window is ~200ms wide. Net effect:
@@ -409,7 +407,6 @@ function pruneStaleQueues(): void {
       viewLastActivity.delete(uuid);
       commandQueues.delete(uuid);
       viewFieldNames.delete(uuid);
-      viewFieldInfo.delete(uuid);
       viewsPolled.delete(uuid);
       viewSourcePaths.delete(uuid);
       stopFileWatch(uuid);
@@ -882,6 +879,28 @@ export function createPdfCache(
   };
 }
 
+/**
+ * pdf.js range transport backed by {@link PdfCache.readPdfRange}. Lets
+ * getDocument() fetch only the byte ranges it needs (xref, /AcroForm dict)
+ * instead of the whole file. With disableAutoFetch, a PDF without form
+ * fields is opened with ~5% of bytes fetched.
+ *
+ * The constructor takes the PDF's URL/path (handed verbatim to
+ * `readPdfRange` as its first argument), the total byte length reported to
+ * pdf.js, and the range reader itself.
+ */
+export class PdfCacheRangeTransport extends PDFDataRangeTransport {
+  constructor(
+    private readonly url: string,
+    length: number,
+    private readonly readPdfRange: PdfCache["readPdfRange"],
+  ) {
+    // No initial data: every byte pdf.js needs arrives via requestDataRange.
+    super(length, null);
+  }
+
+  /**
+   * Serve one range request from pdf.js. `end` is treated as exclusive:
+   * `end - begin` bytes starting at `begin` are read and handed back through
+   * onDataRange().
+   *
+   * The rejection handler is chained with `.catch` (rather than passed as the
+   * second argument of `.then`) so that an exception thrown while delivering
+   * the chunk is handled too instead of surfacing as an unhandled rejection.
+   *
+   * NOTE(review): confirm that abort() actually fails the pending
+   * getDocument()/page request in the pdf.js version in use; if the base
+   * implementation is only a cancellation hook, a failed read would leave
+   * the caller waiting and needs an explicit loadingTask.destroy().
+   */
+  override requestDataRange(begin: number, end: number): void {
+    this.readPdfRange(this.url, begin, end - begin)
+      .then(({ data }) => this.onDataRange(begin, data))
+      .catch((err: unknown) => {
+        // stderr only — keeps stdout clean for a stdio transport.
+        console.error(
+          `PdfCacheRangeTransport: range ${begin}-${end} failed for ${this.url}:`,
+          err,
+        );
+        this.abort();
+      });
+  }
+}
+
 // =============================================================================
 // MCP Roots
 // =============================================================================
@@ -1020,19 +1039,23 @@ async function extractFormFieldInfo(
   return fields;
 }
 
-async function extractFormSchema(pdfDoc: PDFDocumentProxy): Promise<{
+async function extractFormSchema(
+  pdfDoc: PDFDocumentProxy,
+  fieldObjects?: Record<string, PdfJsFieldObject[]> | null,
+): Promise<{
   type: "object";
   properties: Record<string, PrimitiveSchemaDefinition>;
   required?: string[];
 } | null> {
-  let fieldObjects: Record<string, PdfJsFieldObject[]> | null;
-  try {
-    fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
-      string,
-      PdfJsFieldObject[]
-    > | null;
-  } catch {
-    return null;
+  if (fieldObjects === undefined) {
+    try {
+      fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
+        string,
+        PdfJsFieldObject[]
+      > | null;
+    } catch {
+      return null;
+    }
   }
   if (!fieldObjects || Object.keys(fieldObjects).length === 0) {
     return null;
@@ -1434,7 +1457,7 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
       const { totalBytes } = await readPdfRange(normalized, 0, 1);
       const uuid = randomUUID();
       // Start the heartbeat now so the sweep can clean up viewFieldNames/
-      // viewFieldInfo/viewFileWatches even if no interact calls ever happen.
+      // viewFileWatches even if no interact calls ever happen.
       if (!disableInteract) touchView(uuid);
 
       // Check writability (governs save button; see isWritablePath doc).
@@ -1462,21 +1485,36 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
         }
       }
 
-      // Extract form field schema + detailed field info from a single
-      // download/parse pass.
+      // Extract form field schema + detailed field info via range transport so
+      // PDFs without forms only fetch the trailer/xref/catalog (~5% of bytes).
+      // PDFs with forms still pull most of the file once getAnnotations walks
+      // every page, but those are typically small.
       let formSchema: Awaited<ReturnType<typeof extractFormSchema>> = null;
       let fieldInfo: FormFieldInfo[] = [];
       try {
-        const { data } = await readPdfRange(normalized, 0, totalBytes);
         const pdfDoc = await getDocument({
-          data,
+          range: new PdfCacheRangeTransport(
+            normalized,
+            totalBytes,
+            readPdfRange,
+          ),
+          length: totalBytes,
+          disableAutoFetch: true,
+          disableStream: true,
+          rangeChunkSize: 64 * 1024,
           standardFontDataUrl: STANDARD_FONT_DATA_URL,
           StandardFontDataFactory: FetchStandardFontDataFactory,
           verbosity: VerbosityLevel.ERRORS,
         }).promise;
         try {
-          formSchema = await extractFormSchema(pdfDoc);
-          fieldInfo = await extractFormFieldInfo(pdfDoc);
+          const fieldObjects = (await pdfDoc.getFieldObjects()) as Record<
+            string,
+            PdfJsFieldObject[]
+          > | null;
+          if (fieldObjects && Object.keys(fieldObjects).length > 0) {
+            formSchema = await extractFormSchema(pdfDoc, fieldObjects);
+            fieldInfo = await extractFormFieldInfo(pdfDoc);
+          }
         } finally {
           pdfDoc.destroy();
         }
@@ -1486,14 +1524,11 @@ Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before
       if (formSchema) {
         viewFieldNames.set(uuid, new Set(Object.keys(formSchema.properties)));
       }
-      if (fieldInfo.length > 0) {
-        viewFieldInfo.set(uuid, fieldInfo);
-        if (!viewFieldNames.has(uuid)) {
-          viewFieldNames.set(
-            uuid,
-            new Set(fieldInfo.map((f) => f.name).filter(Boolean)),
-          );
-        }
+      if (fieldInfo.length > 0 && !viewFieldNames.has(uuid)) {
+        viewFieldNames.set(
+          uuid,
+          new Set(fieldInfo.map((f) => f.name).filter(Boolean)),
+        );
       }
 
       // Elicit form field values if requested and client supports it
 
@@ -150,6 +150,11 @@ const imageCache = new Map<string, HTMLImageElement>();
 
 /** Annotations imported from the PDF file (baseline for diff computation). */
 let pdfBaselineAnnotations: PdfAnnotationDef[] = [];
+/**
+ * Pages whose native annotations have already been imported into the baseline.
+ * Written by scanPageBaselineAnnotations(), which returns early for a page
+ * that is already in the set; cleared by reloadPdf().
+ */
+const baselineScannedPages = new Set<number>();
+/**
+ * Native-annotation ids the user deleted (from restored localStorage diff) —
+ * the lazy per-page scan must NOT re-add these to annotationMap. Filled by
+ * restoreAnnotations() from diff.removed; cleared by reloadPdf().
+ */
+const restoredRemovedIds = new Set<string>();
 
 // Dirty flag — tracks unsaved local changes
 let isDirty = false;
@@ -2679,52 +2684,49 @@ function annotationStorageKey(): string | null {
 }
 
 /**
- * Import annotations from the loaded PDF to establish the baseline.
- * These are the annotations that exist in the PDF file itself.
+ * Import one page's native annotations into the baseline. Called lazily from
+ * renderPage() so we don't walk every page (and pull most of the file via
+ * range requests) before the user sees anything. Idempotent per page.
+ *
+ * Every importable annotation is appended to pdfBaselineAnnotations and,
+ * unless it is already in annotationMap or tombstoned in restoredRemovedIds,
+ * added to annotationMap. An exception while importing one annotation is
+ * logged and that annotation is skipped, so it can neither abort the caller's
+ * render nor drop the page's remaining annotations.
+ *
+ * @param pageNum Page the annotations belong to; also the idempotence key.
+ * @param annotations The page's annotations as returned by getAnnotations().
  */
-async function loadBaselineAnnotations(
-  doc: pdfjsLib.PDFDocumentProxy,
-): Promise<void> {
-  pdfBaselineAnnotations = [];
-  for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
-    try {
-      const page = await doc.getPage(pageNum);
-      const annotations = await page.getAnnotations();
-      for (let i = 0; i < annotations.length; i++) {
-        const ann = annotations[i];
-        const def = importPdfjsAnnotation(ann, pageNum, i);
-        if (def) {
-          pdfBaselineAnnotations.push(def);
-          // Add to annotationMap if not already present (from localStorage restore)
-          if (!annotationMap.has(def.id)) {
-            annotationMap.set(def.id, { def, elements: [] });
-          }
-        } else if (ann.annotationType !== 20) {
-          // Widget (type 20) is expected to be skipped; anything else we
-          // don't import will still be painted by page.render() onto the
-          // canvas as unselectable pixels. Log so we can diagnose
-          // "ghost annotations" (visible but not in panel, not clickable).
-          log.info(
-            `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
-            `type=${ann.annotationType}`,
-            `subtype=${ann.subtype ?? "?"}`,
-            `name=${ann.name ?? "?"}`,
-            `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
-          );
-        }
+function scanPageBaselineAnnotations(
+  pageNum: number,
+  annotations: unknown[],
+): void {
+  if (baselineScannedPages.has(pageNum)) return;
+  // Mark before importing: a page is scanned at most once, even if one of
+  // the imports below fails.
+  baselineScannedPages.add(pageNum);
+  let imported = 0;
   for (let i = 0; i < annotations.length; i++) {
+    const ann = annotations[i] as {
+      annotationType?: number;
+      subtype?: string;
+      name?: string;
+      rect?: number[];
+    };
+    let def: ReturnType<typeof importPdfjsAnnotation> | undefined;
+    try {
+      def = importPdfjsAnnotation(ann, pageNum, i);
+    } catch (err) {
+      // This runs inside renderPage()'s try block: letting the error escape
+      // would skip the rest of that block and lose the page's remaining
+      // annotations for good (the page is already marked as scanned).
+      log.info(
+        `[WARN] Baseline: page ${pageNum} annotation import failed:`,
+        err,
+      );
+      continue;
+    }
     if (def) {
       pdfBaselineAnnotations.push(def);
+      imported++;
+      // Skip ids already restored from storage and ids the user deleted in a
+      // previous session (tombstones) so the scan never resurrects them.
+      if (!annotationMap.has(def.id) && !restoredRemovedIds.has(def.id)) {
+        annotationMap.set(def.id, { def, elements: [] });
       }
-    } catch (err) {
-      // Log the error — a thrown import for one annotation silently
-      // drops the REST of that page's annotations too.
+    } else if (ann.annotationType !== 20) {
       // Widget (type 20) is expected to be skipped; anything else we
       // don't import will still be painted by page.render() onto the
       // canvas as unselectable pixels. Log so we can diagnose
       // "ghost annotations" (visible but not in panel, not clickable).
       log.info(
-        `[WARN] Baseline: page ${pageNum} annotation import failed:`,
-        err,
+        `[WARN] Baseline: skipped PDF annotation on page ${pageNum}`,
+        `type=${ann.annotationType}`,
+        `subtype=${ann.subtype ?? "?"}`,
+        `name=${ann.name ?? "?"}`,
+        `rect=${ann.rect ? JSON.stringify(ann.rect) : "none"}`,
       );
     }
   }
-  log.info(
-    `Loaded ${pdfBaselineAnnotations.length} baseline annotations from PDF`,
-  );
+  // Refresh the badge and panel only when this page contributed something.
+  if (imported > 0) {
+    updateAnnotationsBadge();
+    renderAnnotationPanel();
+  }
 }
 
 function persistAnnotations(): void {
@@ -2765,11 +2767,11 @@ function restoreAnnotations(): void {
     const diff = deserializeDiff(raw);
 
     // Merge baseline + diff. The loop below is add-only, so we MUST also
-    // delete: loadBaselineAnnotations() runs between the two restore calls
-    // and re-seeds annotationMap with every baseline id — including the
-    // ones in diff.removed. Without this, the zombie survives the restore,
-    // and the next persistAnnotations() sees it in currentIds → computeDiff
-    // produces removed=[] → the deletion is permanently lost from storage.
+    // delete: the per-page baseline scan re-seeds annotationMap with every
+    // native id it encounters — including ones in diff.removed. Without the
+    // deletes here AND the restoredRemovedIds tombstones below, the zombie
+    // survives, and the next persistAnnotations() sees it in currentIds →
+    // computeDiff produces removed=[] → the deletion is permanently lost.
     const merged = mergeAnnotations(pdfBaselineAnnotations, diff);
     for (const def of merged) {
       if (!annotationMap.has(def.id)) {
@@ -2778,6 +2780,9 @@ function restoreAnnotations(): void {
     }
     for (const id of diff.removed) {
       annotationMap.delete(id);
+      // Tombstone so the lazy per-page baseline scan (which runs AFTER this
+      // restore) doesn't resurrect it.
+      restoredRemovedIds.add(id);
     }
 
     // Restore form fields
@@ -2869,6 +2874,14 @@ async function buildFieldNameMap(
     // getFieldObjects may fail on some PDFs
   }
 
+  // No AcroForm → nothing to map. Skip the per-page widget walk so form-free
+  // PDFs (the common large case) don't pull every page after first paint.
+  // getFieldObjects() itself only reads the catalog/AcroForm dict via range
+  // transport, so this gate is cheap.
+  if (!cachedFieldObjects || Object.keys(cachedFieldObjects).length === 0) {
+    return false;
+  }
+
   // Scan every page's widget annotations to collect the CORRECT storage keys,
   // plus labels, pages, positions, AND fieldValue (what the widget renders
   // — which can differ from getFieldObjects().value if the PDF is internally
@@ -3362,6 +3375,9 @@ async function renderPage() {
     formLayerEl.style.setProperty("--total-scale-factor", `${scale}`);
     try {
       const annotations = await page.getAnnotations();
+      // Lazy baseline import — piggyback on the annotations we just fetched
+      // for this page instead of walking all pages upfront.
+      scanPageBaselineAnnotations(pageToRender, annotations);
       if (annotations.length > 0) {
         const linkService = {
           getDestinationHash: () => "#",
@@ -4406,6 +4422,8 @@ async function reloadPdf(): Promise<void> {
   undoStack.length = 0;
   redoStack.length = 0;
   pdfBaselineAnnotations = [];
+  baselineScannedPages.clear();
+  restoredRemovedIds.clear();
   pdfBaselineFormValues.clear();
   pageTextCache.clear();
   pageTextItemsCache.clear();
@@ -4449,11 +4467,11 @@ async function reloadPdf(): Promise<void> {
     log.info("PDF reloaded:", totalPages, "pages,", totalBytes, "bytes");
 
     showViewer();
-    // Render immediately — annotation/form scans below are O(numPages) and
-    // do NOT block the canvas. See same pattern in the initial-load path.
+    // Render immediately — baseline-annotation scan now happens per-page
+    // inside renderPage(); buildFieldNameMap below early-returns when no
+    // AcroForm is present. See same pattern in the initial-load path.
     await renderPage();
 
-    await loadBaselineAnnotations(document);
     const seeded = await buildFieldNameMap(document);
     syncFormValuesToStorage();
     if (seeded) await renderPage();
@@ -4509,6 +4527,11 @@ async function loadPdfProgressively(urlToLoad: string): Promise<{
   const loadingTask = pdfjsLib.getDocument({
     range: transport,
     standardFontDataUrl: STANDARD_FONT_DATA_URL,
+    // Only fetch ranges renderPage()/getFieldObjects() actually ask for.
+    // Without these pdfjs background-prefetches the whole file regardless of
+    // the per-page lazy scans below.
+    disableAutoFetch: true,
+    disableStream: true,
   });
 
   try {
@@ -4673,12 +4696,13 @@ app.ontoolresult = async (result: CallToolResult) => {
       scale = fitScale;
       log.info("Initial fit scale:", scale);
     }
-    await renderPage();
-
-    // Import annotations from the PDF to establish baseline
-    await loadBaselineAnnotations(document);
-    // Restore any persisted user diff
+    // Restore any persisted user diff BEFORE first render so the per-page
+    // baseline scan inside renderPage() can honour the removed-id tombstones
+    // and not resurrect annotations the user deleted last session.
+    // restoreAnnotations is sync (localStorage read) so first paint is not
+    // delayed.
     restoreAnnotations();
+    await renderPage();
 
     // Build field name → annotation ID mapping for form filling
     const seeded = await buildFieldNameMap(document);
 
@@ -47,6 +47,9 @@ export default defineConfig({
     env: {
       ...process.env,
       EXAMPLE: process.env.EXAMPLE ?? "",
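+      // pdf-incremental-load.spec.ts serves test PDFs over self-signed HTTPS,
+      // so the pdf-server's upstream fetch must accept that cert. Setting this
+      // to "0" turns off TLS certificate verification for every outbound
+      // connection of the process started with this env and, coming after the
+      // spread above, overrides any inherited value. Test-only: never carry
+      // this setting into a production environment.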
+      NODE_TLS_REJECT_UNAUTHORIZED: "0",
     },
   },
   // Snapshot configuration