mcp-document-reader/index.js at main · hannasdev/mcp-document-reader · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
#!/usr/bin/env node
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { SSEServerTransport } from "@modelcontextprotocol/sdk/server/sse.js";
import { extractText } from "unpdf";
import { readFileSync, readdirSync, statSync } from "node:fs";
import { join, resolve, extname, sep } from "node:path";
import http from "node:http";
import { z } from "zod";

// ---------------------------------------------------------------------------
// Config — CLI flags take precedence over env vars
// ---------------------------------------------------------------------------
// Command-line arguments with the node binary and script path stripped off.
const args = process.argv.slice(2);

/**
 * Look up the value of a command-line option in `args`.
 *
 * Both `--name value` and `--name=value` are accepted; the first occurrence
 * of either form wins.
 *
 * @param {string} name - Option name including its leading dashes, e.g. "--dir".
 * @returns {string | undefined} The option's value, or `undefined` when the
 *   option is absent or was given without a value, so callers can fall back
 *   to an environment variable or default with `??`.
 */
function getFlag(name) {
  const inlinePrefix = `${name}=`;
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg.startsWith(inlinePrefix)) {
      const inlineValue = arg.slice(inlinePrefix.length);
      return inlineValue === "" ? undefined : inlineValue;
    }
    if (arg === name) {
      const next = args[i + 1];
      // A following token that is itself an option (e.g. `--port --stdio`)
      // is not a value; treating it as one would yield a NaN port or a bogus dir.
      return next === undefined || next.startsWith("--") ? undefined : next;
    }
  }
  return undefined;
}

// Document root: --dir flag first, then the DOC_DIR and PDF_DIR environment
// variables, and finally the current working directory. Stored as an absolute path.
const requestedDir = getFlag("--dir") ?? process.env.DOC_DIR ?? process.env.PDF_DIR ?? ".";
const DOC_DIR = resolve(requestedDir);

// HTTP port: --port flag, then the HTTP_PORT environment variable, then 3000.
const requestedPort = getFlag("--port") ?? process.env.HTTP_PORT ?? "3000";
const HTTP_PORT = Number.parseInt(requestedPort, 10);

// stdio transport is selected by either the --stdio switch or MCP_TRANSPORT=stdio.
const USE_STDIO = process.env.MCP_TRANSPORT === "stdio" || args.includes("--stdio");

// ---------------------------------------------------------------------------
// Document helpers
// ---------------------------------------------------------------------------
// File extensions (lower-case, including the leading dot) that listDocs
// reports as documents; listDocs lower-cases each file's extension before the lookup.
const SUPPORTED_EXTS = new Set([".pdf"]);

/**
 * Recursively collect every supported document beneath `dir`.
 *
 * The listing is best-effort: a directory that cannot be read contributes
 * nothing, and a file that cannot be stat'ed (for example a dangling symlink,
 * or a file removed between the directory read and the stat) is skipped
 * instead of aborting the whole listing.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} [prefix=""] - Path of `dir` relative to the scan root,
 *   joined with "/"; used to build each result's `path`.
 * @returns {{ name: string, path: string, size_kb: number }[]} One entry per
 *   file whose extension is in SUPPORTED_EXTS; `size_kb` is the size rounded
 *   to the nearest KiB.
 */
function listDocs(dir, prefix = "") {
  let entries;
  try {
    entries = readdirSync(dir, { withFileTypes: true });
  } catch {
    // Missing or unreadable directory: nothing to report from here.
    return [];
  }

  const results = [];
  for (const entry of entries) {
    const rel = prefix ? `${prefix}/${entry.name}` : entry.name;
    const abs = join(dir, entry.name);

    if (entry.isDirectory()) {
      for (const doc of listDocs(abs, rel)) {
        results.push(doc);
      }
      continue;
    }
    if (!SUPPORTED_EXTS.has(extname(entry.name).toLowerCase())) {
      continue;
    }

    let size;
    try {
      ({ size } = statSync(abs));
    } catch {
      // The entry exists in the directory but cannot be stat'ed; it could
      // not be read as a document either, so leave it out of the listing.
      continue;
    }
    results.push({ name: entry.name, path: rel, size_kb: Math.round(size / 1024) });
  }
  return results;
}

/**
 * Turn a client-supplied document path into an absolute path, refusing
 * anything that falls outside the document directory.
 *
 * Relative input is anchored at DOC_DIR; absolute input is used as given.
 * Containment is checked on whole path segments, so a sibling directory that
 * merely shares DOC_DIR's name as a prefix (e.g. `/srv/docs-secret` when
 * DOC_DIR is `/srv/docs`) is rejected.
 *
 * @param {string} input - Filename or path supplied by the caller.
 * @returns {string} Normalized absolute path inside DOC_DIR.
 * @throws {Error} If the resolved path is not DOC_DIR itself or beneath it.
 */
function resolvePath(input) {
  const root = resolve(DOC_DIR);
  // resolve() keeps an absolute `input` and anchors a relative one at `root`,
  // collapsing any "." / ".." segments along the way.
  const normalized = resolve(root, input);
  // Compare against "root + separator" so only true descendants match;
  // a filesystem root already ends with the separator.
  const rootWithSep = root.endsWith(sep) ? root : root + sep;
  if (normalized !== root && !normalized.startsWith(rootWithSep)) {
    throw new Error(`Path must be within the document directory (${DOC_DIR})`);
  }
  return normalized;
}

/**
 * Extract a document's text as an array of strings.
 *
 * The bytes are copied into a plain Uint8Array before being handed to
 * `extractText`. If the extractor reports its text as a single value rather
 * than an array, it is wrapped in a one-element array so callers always get
 * an array back.
 *
 * @param {Buffer | Uint8Array} buf - Raw file contents.
 * @returns {Promise<string[]>} Extracted text, one element per entry returned
 *   by the extractor.
 */
async function extractPages(buf) {
  const bytes = new Uint8Array(buf);
  const { text } = await extractText(bytes);
  if (Array.isArray(text)) {
    return text;
  }
  return [text];
}

// ---------------------------------------------------------------------------
// MCP server factory — one instance per connection (SDK 1.29+ requirement)
// ---------------------------------------------------------------------------
// Version string passed to the McpServer constructor in createMcpServer.
// NOTE(review): hard-coded here — presumably meant to track the package's
// published version; confirm it is kept in sync on release.
const PACKAGE_VERSION = "1.0.0";

/**
 * Build a new McpServer with the four document tools registered:
 * `list_documents`, `get_document_info`, `read_document` and `search_document`.
 *
 * Every tool answers with a single text content item. Failures (bad path,
 * unreadable file, extraction error, invalid arguments) carry the same text
 * as before but are additionally flagged with `isError: true` so clients can
 * tell them apart from successful answers.
 *
 * @returns {McpServer} A server that is configured but not yet connected to
 *   any transport; the caller connects it.
 */
function createMcpServer() {
  const server = new McpServer({ name: "mcp-document-reader", version: PACKAGE_VERSION });

  // Wrap plain text in the tool-result envelope.
  const textResult = (text) => ({ content: [{ type: "text", text }] });
  // Same envelope, flagged as a tool-execution failure.
  const errorResult = (text) => ({ content: [{ type: "text", text }], isError: true });

  // ── list_documents ─────────────────────────────────────────────────────────
  server.tool(
    "list_documents",
    "List all documents available for reading. Call this first when you don't know the exact filename.",
    {},
    async () => {
      const files = listDocs(DOC_DIR);
      if (files.length === 0) {
        return textResult(`No documents found in ${DOC_DIR}.`);
      }
      const lines = files.map((f) => `${f.path}  (${f.size_kb} KB)`);
      return textResult(lines.join("\n"));
    }
  );

  // ── get_document_info ──────────────────────────────────────────────────────
  server.tool(
    "get_document_info",
    "Get metadata for a document without extracting all its text: total page count and file size. " +
    "Call this before read_document to plan how many chunks you will need.",
    {
      path: z.string().describe("Filename or path of the document."),
    },
    async ({ path: inputPath }) => {
      let absPath;
      try {
        absPath = resolvePath(inputPath);
      } catch (err) {
        return errorResult(err.message);
      }

      let buf;
      try {
        buf = readFileSync(absPath);
      } catch {
        return errorResult(`File not found: ${absPath}`);
      }

      // NOTE(review): the page count is obtained by running the full text
      // extraction and counting the result, so this call costs as much as a
      // read even though only the count is reported.
      let totalPages;
      try {
        const pages = await extractPages(buf);
        totalPages = pages.length;
      } catch (err) {
        return errorResult(`Failed to read document: ${err.message}`);
      }

      const size_kb = Math.round(buf.length / 1024);
      return textResult(`${totalPages} pages, ${size_kb} KB\nTo read: call read_document with start_page and end_page (50 pages per chunk recommended).`);
    }
  );

  // ── read_document ──────────────────────────────────────────────────────────
  server.tool(
    "read_document",
    "Extract text from a document in page-range chunks. " +
    "IMPORTANT: You MUST specify start_page to read past the first 50 pages — the default is always page 1. " +
    "Before reading, call get_document_info first to see the total page count and plan your reads. " +
    "Always read in chunks of 30–50 pages. Each response shows [Pages X–Y of Z total] and tells you the next start_page. " +
    "If you are unsure of the filename, call list_documents first. " +
    "To jump to a specific topic or resume from a known location, use search_document instead.",
    {
      path: z.string().describe("Filename or path of the document."),
      start_page: z
        .number().int().positive()
        .optional()
        .describe("First page to read (1-indexed). Default: 1."),
      end_page: z
        .number().int().positive()
        .optional()
        .describe("Last page to read, inclusive. Default: start_page + 49 (50-page chunk)."),
    },
    async ({ path: inputPath, start_page = 1, end_page }) => {
      // An inverted range would otherwise produce an empty chunk under a
      // "[Pages 10–5 …]" header plus a misleading "pages remaining" footer.
      if (end_page != null && end_page < start_page) {
        return errorResult(`end_page (${end_page}) must be greater than or equal to start_page (${start_page}).`);
      }

      let absPath;
      try {
        absPath = resolvePath(inputPath);
      } catch (err) {
        return errorResult(err.message);
      }

      let buf;
      try {
        buf = readFileSync(absPath);
      } catch {
        // Help the caller recover by listing what is actually available.
        const files = listDocs(DOC_DIR);
        const hint = files.length > 0
          ? `\n\nAvailable documents:\n${files.map((f) => f.path).join("\n")}`
          : `\n\nNo documents found in ${DOC_DIR}.`;
        return errorResult(`File not found: ${absPath}${hint}`);
      }

      let pages;
      try {
        pages = await extractPages(buf);
      } catch (err) {
        return errorResult(`Failed to extract text: ${err.message}`);
      }

      const totalPages = pages.length;
      if (totalPages === 0) {
        return textResult("(no text extracted — may be a scanned/image PDF)");
      }

      // Out-of-range page numbers are pulled back into 1..totalPages.
      const clamp = (n) => Math.max(1, Math.min(n, totalPages));
      const firstPage = clamp(start_page);
      const lastPage = clamp(end_page ?? firstPage + 49);

      const chunkText = pages.slice(firstPage - 1, lastPage).join("\n\n");
      const header = `[Pages ${firstPage}–${lastPage} of ${totalPages} total]\n\n`;
      const footer = lastPage < totalPages
        ? `\n\n[${totalPages - lastPage} pages remaining. Call read_document with start_page=${lastPage + 1} to continue.]`
        : "\n\n[End of document.]";

      return textResult(header + (chunkText || "(no text on these pages)") + footer);
    }
  );

  // ── search_document ────────────────────────────────────────────────────────
  server.tool(
    "search_document",
    "Search for a keyword or phrase across all pages of a document. " +
    "Returns matching page numbers and a short context snippet for each hit. " +
    "Use this to jump directly to a topic, character, or section without reading from page 1. " +
    "Also use this to resume from where you left off by searching for the last thing you read.",
    {
      path: z.string().describe("Filename or path of the document."),
      query: z.string().describe("Case-insensitive keyword or phrase to search for."),
      max_results: z.number().int().positive().optional().describe("Maximum number of matches to return. Default: 20."),
      context_chars: z.number().int().positive().optional().describe("Characters of surrounding text to include per match. Default: 200."),
    },
    async ({ path: inputPath, query, max_results = 20, context_chars = 200 }) => {
      // An empty needle matches at every offset, which would fill the result
      // list with meaningless snippets from the start of the document.
      if (query.trim() === "") {
        return errorResult("Query must not be empty.");
      }

      let absPath;
      try {
        absPath = resolvePath(inputPath);
      } catch (err) {
        return errorResult(err.message);
      }

      let buf;
      try {
        buf = readFileSync(absPath);
      } catch {
        return errorResult(`File not found: ${absPath}`);
      }

      let pages;
      try {
        pages = await extractPages(buf);
      } catch (err) {
        return errorResult(`Failed to read document: ${err.message}`);
      }

      const lowerQuery = query.toLowerCase();
      const halfContext = Math.floor(context_chars / 2);
      const matches = [];
      for (let i = 0; i < pages.length && matches.length < max_results; i++) {
        const pageText = pages[i];
        const lowerPage = pageText.toLowerCase();
        let idx = lowerPage.indexOf(lowerQuery);
        while (idx !== -1 && matches.length < max_results) {
          const start = Math.max(0, idx - halfContext);
          const end = Math.min(pageText.length, idx + query.length + halfContext);
          // Ellipses mark where the snippet was cut out of a longer page.
          const snippet = (start > 0 ? "…" : "") + pageText.slice(start, end).replace(/\n+/g, " ") + (end < pageText.length ? "…" : "");
          matches.push(`Page ${i + 1}: ${snippet}`);
          // Advance by one character so overlapping occurrences are reported too.
          idx = lowerPage.indexOf(lowerQuery, idx + 1);
        }
      }

      if (matches.length === 0) {
        return textResult(`No matches found for "${query}" in ${pages.length} pages.`);
      }

      const header = `${matches.length} match${matches.length === 1 ? "" : "es"} for "${query}" (${pages.length} pages total):\n\n`;
      return textResult(header + matches.join("\n\n"));
    }
  );

  return server;
}

// ---------------------------------------------------------------------------
// Transport — stdio (Claude Desktop / npx) or SSE (Docker / self-hosted)
// ---------------------------------------------------------------------------
if (USE_STDIO) {
  // stdio mode: one server instance connected to a stdio transport.
  const server = createMcpServer();
  const transport = new StdioServerTransport();
  await server.connect(transport);
} else {
  // HTTP mode: every GET /sse gets its own transport and server instance,
  // tracked by the transport's session id so POSTs can be routed back to it.
  const activeSessions = new Map();

  const httpServer = http.createServer(async (req, res) => {
    // The http module ignores the promise returned by an async listener, so
    // any rejection must be handled here; otherwise it surfaces as an
    // unhandled rejection instead of an error response for this request.
    try {
      if (req.method === "GET" && req.url === "/sse") {
        const transport = new SSEServerTransport("/message", res);
        const sessionId = transport.sessionId;
        const existing = activeSessions.get(sessionId);
        if (existing) {
          // Best-effort teardown of a stale session with the same id;
          // failures while closing it must not block the new connection.
          try { await existing.transport.close(); } catch {}
          try { await existing.server.close(); } catch {}
          activeSessions.delete(sessionId);
        }
        const sessionServer = createMcpServer();
        // Register before connecting so a POST can find the session right away.
        activeSessions.set(sessionId, { transport, server: sessionServer });
        res.on("close", () => activeSessions.delete(sessionId));
        await sessionServer.connect(transport);
        return;
      }

      if (req.method === "POST" && req.url.startsWith("/message")) {
        // req.url is path-only; the base exists solely to make it parseable.
        const url = new URL(req.url, "http://localhost");
        const session = activeSessions.get(url.searchParams.get("sessionId"));
        if (!session) { res.writeHead(404); res.end("Session not found"); return; }
        await session.transport.handlePostMessage(req, res);
        return;
      }

      if (req.method === "GET" && req.url === "/healthz") {
        res.writeHead(200); res.end("ok"); return;
      }

      res.writeHead(404); res.end("Not found");
    } catch (err) {
      process.stderr.write(`[mcp-document-reader] Request failed: ${err?.stack ?? err}\n`);
      if (!res.headersSent) {
        res.writeHead(500);
        res.end("Internal server error");
      } else if (!res.writableEnded) {
        // Headers (e.g. of an SSE stream) are already out; just end the response.
        res.end();
      }
    }
  });

  httpServer.listen(HTTP_PORT, "0.0.0.0", () => {
    process.stderr.write(`[mcp-document-reader] Listening on port ${HTTP_PORT} — DOC_DIR=${DOC_DIR}\n`);
  });
}