Parse series info from PDF and CBZ files

chrox · chrox · commit 96dac68155e4 · 2026-05-06T00:20:26.000+08:00
diff --git a/comic-book.js b/comic-book.js
@@ -1,3 +1,55 @@
+// Read series metadata from a ComicInfo.xml entry, if present.
+// Spec: https://anansi-project.github.io/docs/comicinfo/intro
+// Resolves to null when the entry is missing, unreadable or malformed.
+// Otherwise resolves to an object holding ONLY the fields found in the
+// XML: a key explicitly set to undefined would still win an object-spread
+// merge and hide the value supplied by a lower-priority metadata source.
+const readComicInfoXML = async ({ entries, loadBlob }) => {
+    const entry = entries.find(e => e.filename.toLowerCase() === 'comicinfo.xml')
+    if (!entry) return null
+    let doc
+    try {
+        const text = await (await loadBlob(entry.filename)).text()
+        doc = new DOMParser().parseFromString(text, 'application/xml')
+    } catch {
+        return null // best effort: unreadable metadata is treated as absent
+    }
+    // a malformed document is reported through a <parsererror> element
+    if (!doc || doc.getElementsByTagName('parsererror').length) return null
+    const get = name => doc.getElementsByTagName(name).item(0)?.textContent?.trim()
+    const tags = {
+        title: 'Title', publisher: 'Publisher', language: 'LanguageISO',
+        author: 'Writer', series: 'Series',
+        seriesPosition: 'Number', seriesTotal: 'Count',
+    }
+    const found = Object.entries(tags)
+        .map(([key, tag]) => [key, get(tag)])
+        .filter(([, value]) => value) // drops absent and empty elements
+    return Object.fromEntries(found)
+}
+
+// Read ComicBookInfo JSON from the archive comment; null when absent or invalid.
+const readComicBookInfo = async ({ getComment }) => {
+    let info
+    try {
+        info = JSON.parse(await getComment() || '')['ComicBookInfo/1.0']
+    } catch {
+        return null
+    }
+    if (!info) return null
+    const { publicationYear: year, publicationMonth: month } = info
+    // credits is unvalidated JSON: a non-array or null entry must not throw
+    const credits = Array.isArray(info.credits) ? info.credits.filter(Boolean) : []
+    const mm = month && month >= 1 && month <= 12 ? String(month).padStart(2, '0') : null
+    return {
+        title: info.title, publisher: info.publisher,
+        language: info.language || info.lang, series: info.series,
+        author: credits.map(c => `${c.person} (${c.role})`).join(', '),
+        published: year && mm ? `${year}-${mm}` : undefined, // no date if month is invalid
+        seriesPosition: info.issue == null ? undefined : String(info.issue),
+    }
+}
+
 export const makeComicBook = async ({ entries, loadBlob, getSize, getComment }, file) => {
     const cache = new Map()
     const urls = new Map()
@@ -24,25 +76,23 @@ export const makeComicBook = async ({ entries, loadBlob, getSize, getComment },
     if (!files.length) throw new Error('No supported image files in archive')
 
     const book = {}
-    try {
-        const jsonComment = JSON.parse(await getComment() || '')
-        const info = jsonComment['ComicBookInfo/1.0']
-        if (info) {
-            const year = info.publicationYear
-            const month = info.publicationMonth
-            const mm = month && month >= 1 && month <= 12 ? String(month).padStart(2, '0') : null
-            book.metadata = {
-                title: info.title || file.name,
-                publisher: info.publisher,
-                language: info.language || info.lang,
-                author: info.credits ? info.credits.map(c => `${c.person} (${c.role})`).join(', ') : '',
-                published: year && month ? `${year}-${mm}` : undefined,
-            }
-        } else {
-            book.metadata = { title: file.name }
-        }
-    } catch {
-        book.metadata = { title: file.name }
+    // Prefer ComicInfo.xml (Anansi standard) over ComicBookInfo (JSON in zip comment).
+    // Fields missing from the preferred source fall through to the secondary one.
+    const xml = await readComicInfoXML({ entries, loadBlob })
+    const cbi = await readComicBookInfo({ getComment })
+    const merged = { ...(cbi || {}), ...(xml || {}) }
+    book.metadata = {
+        title: merged.title || file.name,
+        publisher: merged.publisher,
+        language: merged.language,
+        author: merged.author,
+        published: merged.published,
+    }
+    if (merged.series) {
+        const series = { name: merged.series }
+        if (merged.seriesPosition) series.position = merged.seriesPosition
+        if (merged.seriesTotal) series.total = merged.seriesTotal
+        book.metadata.belongsTo = { series }
     }
     book.getCover = () => loadBlob(files[0])
     book.sections = files.map(name => ({
diff --git a/pdf.js b/pdf.js
@@ -316,6 +316,35 @@ const makeTOCItem = async (item, pdf) => {
 
 const MAX_CACHED_PAGES = 8
 
+const CALIBRE_NS = 'http://calibre-ebook.com/xmp-namespace'
+const CALIBRE_SI_NS = 'http://calibre-ebook.com/xmp-namespace-series-index'
+const RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+
+// Extract Calibre series information from a raw XMP packet string.
+// Calibre stores it as
+// <calibre:series rdf:parseType="Resource">
+//   <rdf:value>Name</rdf:value>
+//   <calibreSI:series_index>1.00</calibreSI:series_index>
+// </calibre:series>
+// Returns { name } or { name, position }, or null if no series is found.
+const parseCalibreSeriesFromXMP = raw => {
+    if (typeof raw !== 'string' || !raw) return null
+    let xmp
+    try {
+        xmp = new DOMParser().parseFromString(raw, 'application/xml')
+    } catch {
+        return null
+    }
+    if (!xmp || xmp.getElementsByTagName('parsererror').length) return null
+    const series = xmp.getElementsByTagNameNS(CALIBRE_NS, 'series').item(0)
+    // trimmed text of the first matching descendant, undefined when absent
+    const textOf = (ns, tag) => series?.getElementsByTagNameNS(ns, tag).item(0)?.textContent?.trim()
+    const name = textOf(RDF_NS, 'value')
+    if (!name) return null
+    const position = textOf(CALIBRE_SI_NS, 'series_index')
+    return { name, ...(position && { position }) }
+}
+
 export const makePDF = async file => {
     const transport = new pdfjsLib.PDFDataRangeTransport(file.size, [])
     transport.requestDataRange = (begin, end) => {
@@ -354,6 +383,9 @@ export const makePDF = async file => {
         rights: metadata?.get('dc:rights'),
     }
 
+    const calibreSeries = parseCalibreSeriesFromXMP(metadata?.getRaw?.())
+    if (calibreSeries) book.metadata.belongsTo = { series: calibreSeries }
+
     const outline = await pdf.getOutline()
     book.toc = outline ? await Promise.all(outline.map(item => makeTOCItem(item, pdf))) : null