|
| 1 | +import fs from "node:fs/promises"; |
| 2 | +import path from "node:path"; |
| 3 | +import pdf from "pdf-parse"; |
| 4 | + |
// All paths are resolved against the current working directory, so the
// script must be run from the repository root (e.g. via an npm script).
const repoRoot = process.cwd();
// Source directory holding the PDF files (under public/ — presumably served
// statically at /papers; confirm against the hosting setup).
const papersDir = path.join(repoRoot, "public", "papers");
// Destination for the generated index consumed by the app.
const outDir = path.join(repoRoot, "app", "papers");
const outFile = path.join(outDir, "papers.generated.json");
| 9 | + |
| 10 | +const isPdf = (fileName) => fileName.toLowerCase().endsWith(".pdf"); |
| 11 | + |
// Normalizes a metadata string: strips NUL bytes and surrounding whitespace.
// Returns null for non-strings, empty results, and the placeholder "untitled".
const normalizeString = (value) => {
  if (typeof value !== "string" || !value) return null;
  const cleaned = value.replaceAll("\0", "").trim();
  if (!cleaned || cleaned.toLowerCase() === "untitled") return null;
  return cleaned;
};
| 19 | + |
// Derives a human-readable title from a PDF file name: drops the extension
// and treats runs of underscores/hyphens/whitespace as single word breaks.
const titleFromFilename = (fileName) => {
  const stem = fileName.replace(/\.pdf$/i, "");
  return stem.split(/[_\-\s]+/).filter(Boolean).join(" ");
};
| 24 | + |
// Heuristically picks an author line from extracted first-page text.
// Scans the first 30 non-blank lines for one that looks like a name list
// (contains a comma, "and", or "et al"), has letters, is 6-140 chars after
// whitespace collapsing, and has at least two words. Returns the collapsed
// line, or null when nothing qualifies.
const guessAuthorsFromText = (text) => {
  if (typeof text !== "string" || !text) return null;

  const candidates = text
    .split(/\r?\n/)
    .map((raw) => raw.trim())
    .filter((line) => line.length > 0)
    .slice(0, 30)
    .map((line) => line.replace(/\s+/g, " ").trim());

  const hit = candidates.find((clean) => {
    if (clean.length < 6 || clean.length > 140) return false;
    const hasSeparator =
      clean.includes(",") || /\band\b/i.test(clean) || /\bet\s+al\b/i.test(clean);
    return hasSeparator && /[A-Za-z]/.test(clean) && clean.split(" ").length >= 2;
  });

  return hit ?? null;
};
| 51 | + |
// Extracts title/author metadata from one PDF. Prefers the PDF info
// dictionary, falls back to XMP metadata, and — for authors only — falls
// back to a heuristic scan of the first page's text. Either field may be
// null. Throws if the file cannot be read or parsed.
const tryParsePdf = async (filePath) => {
  const buffer = await fs.readFile(filePath);

  // Keep parsing cheap: metadata plus the text of page 1 only.
  const parsed = await pdf(buffer, { max: 1 });

  // Info-dictionary value first; XMP is only consulted when it is missing.
  const pick = (infoValue, xmpKey) =>
    normalizeString(infoValue) ?? normalizeString(parsed?.metadata?.get?.(xmpKey));

  const title = pick(parsed?.info?.Title, "dc:title");
  const authors =
    pick(parsed?.info?.Author, "dc:creator") ?? guessAuthorsFromText(parsed?.text);

  return { title, authors };
};
| 68 | + |
// Lists the names of all regular .pdf files directly inside papersDir,
// in the order the directory listing returns them.
const listPdfFiles = async () => {
  const entries = await fs.readdir(papersDir, { withFileTypes: true });
  const names = [];
  for (const entry of entries) {
    if (entry.isFile() && isPdf(entry.name)) {
      names.push(entry.name);
    }
  }
  return names;
};
| 76 | + |
/**
 * Returns true when the generated index can be reused as-is: it exists,
 * lists exactly the current set of PDF files, and is at least as new as
 * every PDF. Any stat/read/parse failure is treated as "stale" so the
 * caller regenerates.
 */
const isIndexUpToDate = async (pdfFiles) => {
  try {
    const outStat = await fs.stat(outFile);

    // Mtimes alone miss deletions and renames: removing a PDF leaves every
    // remaining mtime untouched (and with zero PDFs the newest mtime is 0),
    // so ghost entries would persist in the index forever. Compare the
    // indexed file set against the directory listing as well.
    const existing = JSON.parse(await fs.readFile(outFile, "utf8"));
    const indexedNames = new Set((existing?.items ?? []).map((item) => item.fileName));
    const sameSet =
      indexedNames.size === pdfFiles.length &&
      pdfFiles.every((name) => indexedNames.has(name));
    if (!sameSet) return false;

    // Stat all PDFs in parallel; regenerate iff any one is newer than the index.
    const stats = await Promise.all(
      pdfFiles.map((name) => fs.stat(path.join(papersDir, name))),
    );
    return stats.every((stat) => outStat.mtimeMs >= stat.mtimeMs);
  } catch {
    // Missing or corrupt output file: fall through to generation.
    return false;
  }
};

/**
 * Entry point: scans public/papers for PDFs, extracts title/author metadata
 * (info dictionary, then XMP, then a first-page text heuristic), and writes
 * a title-sorted JSON index to app/papers/papers.generated.json.
 * Sets process.exitCode = 1 only when the papers directory cannot be read.
 */
const main = async () => {
  let pdfFiles;
  try {
    pdfFiles = await listPdfFiles();
  } catch (err) {
    console.error(`Failed to read papers directory: ${papersDir}`);
    console.error(err);
    process.exitCode = 1;
    return;
  }

  // Fast path: skip re-parsing when nothing relevant has changed.
  if (await isIndexUpToDate(pdfFiles)) {
    console.log(`Papers index already up to date -> ${path.relative(repoRoot, outFile)}`);
    return;
  }

  const items = [];
  // Parse sequentially: each PDF is read fully into memory, so fanning out
  // with Promise.all could spike memory on large collections.
  for (const fileName of pdfFiles) {
    const filePath = path.join(papersDir, fileName);

    try {
      const stat = await fs.stat(filePath);
      const meta = await tryParsePdf(filePath);

      items.push({
        fileName,
        href: `/papers/${encodeURIComponent(fileName)}`,
        title: meta.title ?? titleFromFilename(fileName),
        authors: meta.authors,
        bytes: stat.size,
      });
    } catch (err) {
      // One unreadable/corrupt PDF must not sink the whole index: emit a
      // filename-derived entry and record the error for inspection.
      items.push({
        fileName,
        href: `/papers/${encodeURIComponent(fileName)}`,
        title: titleFromFilename(fileName),
        authors: null,
        bytes: null,
        error: String(err?.message ?? err),
      });
    }
  }

  // Sort by title, case-insensitively, with filename as a stable tiebreaker.
  items.sort((a, b) => {
    const at = (a.title ?? a.fileName).toLowerCase();
    const bt = (b.title ?? b.fileName).toLowerCase();
    if (at < bt) return -1;
    if (at > bt) return 1;
    return a.fileName.toLowerCase().localeCompare(b.fileName.toLowerCase());
  });

  await fs.mkdir(outDir, { recursive: true });
  await fs.writeFile(
    outFile,
    JSON.stringify({ generatedAt: new Date().toISOString(), items }, null, 2) + "\n",
    "utf8",
  );

  console.log(`Generated ${items.length} paper entries -> ${path.relative(repoRoot, outFile)}`);
};
| 145 | + |
| 146 | +await main(); |
0 commit comments