backend.ai-webui/packages/backend.ai-docs-toolkit/src/markdown-processor-web.ts at e79100ad56ca5a1f01cf9ebeb5bacba8b057d5e5 · lablup/backend.ai-webui · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
/**
 * Markdown processor for web preview with extended syntax support.
 * Supports: admonitions, code block titles, line highlighting, details/summary.
 */

import fs from "fs";
import path from "path";
import { Marked } from "marked";
import {
  slugify,
  slugFromNavPath,
  deduplicateH1,
  substituteTemplateVars,
  normalizeRstTables,
  convertIndentedNotes,
  resolveMarkdownPath,
  getImageDimensions,
  IMAGE_SCALE_FACTOR,
} from "./markdown-processor.js";
import type { Chapter, Heading } from "./markdown-processor.js";
import {
  processAdmonitions,
  processCodeBlockMeta,
  parseHighlightLines,
  parseShellSessionLines,
  escapeHtml,
  stripHtmlTags,
  getFigureLabel,
  parseImageSizeHint,
} from "./markdown-extensions.js";
import type { ResolvedDocConfig } from "./config.js";
import { DEFAULT_CODE_LIGHT_THEME } from "./config.js";
import { highlight as shikiHighlight } from "./shiki-highlighter.js";

export type { Chapter, Heading };

/**
 * One navigation item: a chapter title paired with the markdown file that
 * supplies the chapter's content.
 */
interface NavEntry {
  /** Chapter title; copied unchanged onto the rendered chapter. */
  title: string;
  /**
   * Path of the chapter's markdown file as listed in the navigation. It is
   * resolved to a file on disk and is also the input for the chapter slug.
   */
  path: string;
}

// ── Anchor resolution types ────────────────────────────────────

/**
 * A single link target found in a rendered chapter: either a heading or an
 * explicit `<a id="...">` tag.
 */
export interface AnchorEntry {
  /** Chapter slug that owns this anchor */
  chapterSlug: string;
  /** Nav path of the source file (e.g., "vfolder/vfolder.md") */
  filePath: string;
  /** Source: heading-derived or explicit <a id> tag */
  source: "heading" | "explicit";
  /**
   * The final ID as it appears in the rendered HTML. For headings this is the
   * renderer-generated heading id; for explicit anchors it is the `id`
   * attribute value itself.
   */
  resolvedId: string;
}

/**
 * Lookup tables describing every anchor found across the rendered chapters.
 * Built once after rendering and consulted when fragment links are rewritten.
 */
export interface AnchorRegistry {
  /**
   * Maps raw anchor ID (as written in markdown) to its entries. Heading
   * entries are keyed by the heading id with the owning chapter's
   * `<slug>-` prefix removed; explicit anchors are keyed by their id.
   */
  anchors: Map<string, AnchorEntry[]>;
  /** Set of all resolved IDs for quick "already resolved?" checks */
  resolvedIds: Set<string>;
}

/**
 * A problem found while registering anchors or resolving fragment links.
 */
export interface LinkDiagnostic {
  /**
   * Category of the finding: a link with no matching anchor, an explicit
   * anchor declared in more than one file, or a link whose anchor exists in
   * several chapters.
   */
  type: "broken-link" | "duplicate-anchor" | "ambiguous-link";
  /** The anchor id the finding is about (without the leading `#`). */
  anchorId: string;
  /**
   * File the finding was raised for. For `duplicate-anchor` this is a
   * comma-separated list of every file declaring the anchor.
   */
  sourceFile: string;
  /** Human-readable description of the finding. */
  message: string;
}

/**
 * Point relative markdown image references at URLs the preview server exposes.
 *
 * Every `![alt](target)` whose target ends in a known image extension is
 * resolved against the markdown file's own directory and re-expressed as a
 * root-relative URL measured from `<srcDir>/<lang>`. Targets that are already
 * absolute (`http(s)://`, `file://`, or a leading `/`) and targets that
 * resolve outside the language directory are emitted untouched.
 *
 * @param markdown Markdown source to scan.
 * @param mdFilePath Location of the markdown file on disk.
 * @param lang Language directory name under `srcDir`.
 * @param srcDir Root directory holding the per-language sources.
 * @returns The markdown with eligible image targets rewritten.
 */
function rewriteImagePathsForWeb(
  markdown: string,
  mdFilePath: string,
  lang: string,
  srcDir: string,
): string {
  const languageRoot = path.resolve(srcDir, lang);
  const sourceDir = path.dirname(mdFilePath);
  const imagePattern =
    /!\[([^\]]*)\]\(([^)]+\.(?:png|jpe?g|gif|svg|webp))\)/gi;

  const toWebReference = (
    original: string,
    altText: string,
    target: string,
  ): string => {
    // Absolute URLs and web-absolute paths are already servable as written.
    const hasScheme = /^(?:https?|file):\/\//.test(target);
    if (hasScheme || target.startsWith("/")) {
      return original;
    }

    // Anything that lands outside the language directory is left alone.
    const onDisk = path.resolve(sourceDir, target);
    if (!onDisk.startsWith(languageRoot + path.sep)) {
      return original;
    }

    // Express the location relative to the language root, using URL slashes.
    const urlSegments = path.relative(languageRoot, onDisk).split(path.sep);
    return `![${altText}](/${urlSegments.join("/")})`;
  };

  return markdown.replace(imagePattern, toWebReference);
}

/**
 * Repair hrefs in which an HTML tag ended up inside the fragment, e.g.
 * `href="#text<anchor>..."` (a leftover of the RST-to-Markdown migration).
 *
 * The content between `<` and `>` is treated as the intended anchor: it is
 * slugified and prefixed with the chapter slug, and the whole href is replaced
 * by `href="#<chapterSlug>-<slug>"`. Hrefs without a leaked tag are untouched.
 *
 * @param html Rendered chapter HTML.
 * @param chapterSlug Slug of the chapter the HTML belongs to.
 * @returns The HTML with malformed fragment hrefs rewritten.
 */
function fixMalformedCrossReferences(
  html: string,
  chapterSlug: string,
): string {
  const leakedTagHref = /href="#([^"]*)<([^>]+)>[^"]*"/g;

  const repair = (
    _whole: string,
    _leadingText: string,
    tagBody: string,
  ): string => {
    const fragment = slugify(tagBody);
    return `href="#${chapterSlug}-${fragment}"`;
  };

  return html.replace(leakedTagHref, repair);
}

// ── Anchor resolution functions ────────────────────────────────

/**
 * A chapter after the first rendering pass, paired with the navigation path
 * it was produced from so later diagnostics can name the source file.
 */
interface RenderedChapter {
  /** The rendered chapter (title, slug, HTML and collected headings). */
  chapter: Chapter;
  /** Navigation path of the markdown file this chapter came from. */
  filePath: string;
}

/**
 * Build a global anchor registry from rendered chapters.
 *
 * Two kinds of anchors are collected for every chapter:
 * - heading anchors, taken from `chapter.headings` (the IDs the Marked
 *   renderer actually emitted) and keyed by the heading id with the
 *   `<chapterSlug>-` prefix stripped;
 * - explicit `<a ... id="...">` anchors found in `chapter.htmlContent`,
 *   keyed by the id itself.
 *
 * An explicit id is skipped only when it equals a heading id seen so far or
 * was already recorded for the same chapter. The same explicit id appearing
 * in different chapters is recorded once per chapter; otherwise duplicate
 * detection and same-page link priority could never see the later chapters.
 *
 * @param rendered Chapters produced by the first rendering pass.
 * @returns The anchor map plus the set of every resolved ID.
 */
function buildAnchorRegistryFromRendered(
  rendered: RenderedChapter[],
): AnchorRegistry {
  const anchors = new Map<string, AnchorEntry[]>();
  const resolvedIds = new Set<string>();
  // Heading ids are tracked separately from `resolvedIds`: the latter also
  // accumulates explicit ids, and using it for the skip check below would
  // silently drop an explicit anchor that another chapter declared first.
  const headingIds = new Set<string>();

  const addEntry = (rawId: string, entry: AnchorEntry): void => {
    const bucket = anchors.get(rawId);
    if (bucket) {
      bucket.push(entry);
    } else {
      anchors.set(rawId, [entry]);
    }
    resolvedIds.add(entry.resolvedId);
  };

  for (const { chapter, filePath } of rendered) {
    // Register headings using IDs from the Marked renderer (already accurate)
    const prefix = chapter.slug + "-";
    for (const heading of chapter.headings) {
      headingIds.add(heading.id);

      // Derive the raw slug by stripping the chapter-slug prefix
      const rawSlug = heading.id.startsWith(prefix)
        ? heading.id.slice(prefix.length)
        : heading.id;

      addEntry(rawSlug, {
        chapterSlug: chapter.slug,
        filePath,
        source: "heading",
        resolvedId: heading.id,
      });
    }

    // Extract explicit <a ... id="..."> anchors from rendered HTML
    const explicitInChapter = new Set<string>();
    const explicitRegex = /<a\s+[^>]*?\bid="([^"]+)"[^>]*>/g;
    let match;
    while ((match = explicitRegex.exec(chapter.htmlContent)) !== null) {
      const anchorId = match[1];
      // Skip ids that are really heading ids, and repeats within this chapter
      if (headingIds.has(anchorId) || explicitInChapter.has(anchorId)) continue;
      explicitInChapter.add(anchorId);

      addEntry(anchorId, {
        chapterSlug: chapter.slug,
        filePath,
        source: "explicit",
        resolvedId: anchorId,
      });
    }
  }

  return { anchors, resolvedIds };
}

/**
 * Flag explicit anchors that are declared in more than one source file.
 *
 * For every anchor id in the registry, the distinct file paths of its
 * `explicit` entries are gathered; when there is more than one, a single
 * `duplicate-anchor` diagnostic listing all of them is appended. Heading
 * entries are ignored.
 *
 * @param registry Anchor registry to inspect.
 * @param diagnostics Output array that receives the findings.
 */
function detectDuplicateAnchors(
  registry: AnchorRegistry,
  diagnostics: LinkDiagnostic[],
): void {
  registry.anchors.forEach((entries, anchorId) => {
    // Insertion-ordered set of files declaring this id explicitly.
    const declaringFiles = new Set<string>();
    for (const entry of entries) {
      if (entry.source === "explicit") {
        declaringFiles.add(entry.filePath);
      }
    }
    if (declaringFiles.size <= 1) return;

    const fileList = Array.from(declaringFiles).join(", ");
    diagnostics.push({
      type: "duplicate-anchor",
      anchorId,
      sourceFile: fileList,
      message: `Duplicate explicit anchor <a id="${anchorId}"> in: ${fileList}`,
    });
  });
}

/**
 * Resolve fragment-only links (`href="#anchor"`) against the anchor registry.
 *
 * Decision order for each link:
 * 1. The fragment is already a resolved ID and either has no registry entry
 *    under that key or belongs to the current chapter: leave it alone.
 * 2. No registry entry at all: record a `broken-link` diagnostic (once per
 *    anchor per call) and leave the link alone.
 * 3. The current chapter owns a matching anchor: keep the link if that
 *    anchor is explicit, otherwise point it at the chapter-prefixed heading ID.
 * 4. Otherwise it is a cross-chapter link. An explicit target is preferred
 *    over the first registered entry, and an `ambiguous-link` diagnostic is
 *    recorded when several chapters provide the anchor. In multi-page mode
 *    the link becomes `./<chapter>.html#<id>`; in single-page mode explicit
 *    targets keep the original link and heading targets get `#<resolvedId>`.
 *
 * @param html Rendered chapter HTML.
 * @param currentChapterSlug Slug of the chapter `html` belongs to.
 * @param registry Global anchor registry.
 * @param diagnostics Output array that receives broken/ambiguous findings.
 * @param sourceFile File name recorded on emitted diagnostics.
 * @param multiPage Whether chapters are emitted as separate HTML pages.
 * @returns The HTML with fragment links rewritten.
 */
function rewriteCrossPageLinks(
  html: string,
  currentChapterSlug: string,
  registry: AnchorRegistry,
  diagnostics: LinkDiagnostic[],
  sourceFile: string,
  multiPage: boolean = false,
): string {
  // One diagnostic per anchor id per call, regardless of how often it is linked.
  const alreadyReported = new Set<string>();
  const reportOnce = (
    type: "broken-link" | "ambiguous-link",
    anchorId: string,
    message: string,
  ): void => {
    if (alreadyReported.has(anchorId)) return;
    alreadyReported.add(anchorId);
    diagnostics.push({ type, anchorId, sourceFile, message });
  };

  const ownedByCurrent = (entry: { chapterSlug: string }): boolean =>
    entry.chapterSlug === currentChapterSlug;

  const resolveFragment = (original: string, anchorId: string): string => {
    const candidates = registry.anchors.get(anchorId);

    // Renderer-generated hash-links and local explicit anchors already work.
    // A resolved ID owned only by other chapters continues to the
    // cross-chapter handling further down.
    if (
      registry.resolvedIds.has(anchorId) &&
      (!candidates || candidates.some(ownedByCurrent))
    ) {
      return original;
    }

    if (!candidates || candidates.length === 0) {
      reportOnce(
        "broken-link",
        anchorId,
        `No matching anchor found for #${anchorId}`,
      );
      return original;
    }

    // Same-page targets win over anything in other chapters.
    const local = candidates.filter(ownedByCurrent);
    if (local.length > 0) {
      const hasExplicit = local.some((e) => e.source === "explicit");
      // Explicit anchors keep their raw ID, which already resolves in-page;
      // heading-only matches need the chapter-prefixed ID.
      return hasExplicit ? original : `href="#${local[0].resolvedId}"`;
    }

    // Cross-chapter: pick the target before reporting so the message is exact.
    const target =
      candidates.find((e) => e.source === "explicit") ?? candidates[0];

    const owners = Array.from(new Set(candidates.map((e) => e.chapterSlug)));
    if (owners.length > 1) {
      reportOnce(
        "ambiguous-link",
        anchorId,
        `Ambiguous link #${anchorId} found in chapters: ${owners.join(", ")}. Resolved to: ${target.chapterSlug}`,
      );
    }

    if (multiPage) {
      return `href="./${target.chapterSlug}.html#${target.resolvedId}"`;
    }

    // Single page: every explicit <a id> lives in the same document.
    return target.source === "explicit"
      ? original
      : `href="#${target.resolvedId}"`;
  };

  return html.replace(/href="#([^"]+)"/g, resolveFragment);
}

/**
 * Print collected link diagnostics to the console, grouped by category.
 *
 * Groups are printed in a fixed order — duplicate anchors, broken links,
 * ambiguous links — and a group is omitted entirely when it has no entries.
 * Duplicates and broken links go to `console.warn`; ambiguous links are
 * informational and go to `console.log`.
 *
 * @param diagnostics Findings to print.
 */
function reportLinkDiagnostics(diagnostics: LinkDiagnostic[]): void {
  const ofType = (wanted: string) =>
    diagnostics.filter((d) => d.type === wanted);

  const broken = ofType("broken-link");
  const duplicates = ofType("duplicate-anchor");
  const ambiguous = ofType("ambiguous-link");

  if (duplicates.length > 0) {
    console.warn(`\n⚠ Duplicate anchors (${duplicates.length}):`);
    duplicates.forEach((d) => {
      console.warn(`  ${d.message}`);
    });
  }

  if (broken.length > 0) {
    console.warn(`\n⚠ Broken links (${broken.length}):`);
    broken.forEach((d) => {
      console.warn(`  [${d.sourceFile}] #${d.anchorId}`);
    });
  }

  if (ambiguous.length > 0) {
    console.log(`\nℹ Ambiguous links (${ambiguous.length}):`);
    ambiguous.forEach((d) => {
      console.log(`  [${d.sourceFile}] ${d.message}`);
    });
  }
}

/**
 * Documents `buildWebRenderer` (defined after `resolveWebImageDiskPath`).
 *
 * Build a custom Marked renderer for web preview.
 * Handles headings with anchor links, images with doc-image class,
 * and code blocks with title/line-highlighting support.
 *
 * Code blocks: F4 swaps the legacy `escapeHtml` body with Shiki-tokenized
 * inner HTML. Tokenization is async (Shiki loads grammars on demand) so a
 * pre-pass populates `highlightedCode` (keyed by `${lang}|||${source}`)
 * before `marked.parse()` runs. The renderer is purely sync here — it just
 * looks up the pre-rendered HTML, falling back to `escapeHtml` if the lookup
 * misses (defensive: should not happen because the pre-pass walks the
 * exact same tokens marked will render).
 */
/**
 * Map a web-absolute image URL (the `/dir/images/foo.png` form emitted by
 * `rewriteImagePathsForWeb`) back to its file under `<srcDir>/<lang>/…`, so
 * the image header can be read for natural-size capping.
 *
 * `null` is returned — and the caller must render the image unsized — when
 * `srcDir` or `lang` is missing, when the href carries an `http(s)://` or
 * `file://` scheme, when it does not start with `/`, or when the resolved
 * location falls outside the language root (for example via `..`).
 *
 * @param href Image URL as it appears in the rendered markdown.
 * @param srcDir Root directory holding the per-language sources.
 * @param lang Language directory name under `srcDir`.
 * @returns The absolute disk path, or `null` when it cannot be mapped safely.
 */
function resolveWebImageDiskPath(
  href: string,
  srcDir: string | undefined,
  lang: string | undefined,
): string | null {
  if (!srcDir || !lang) return null;

  const hasScheme = /^(?:https?|file):\/\//.test(href);
  if (hasScheme || !href.startsWith("/")) return null;

  const langRoot = path.resolve(srcDir, lang);
  const candidate = path.resolve(langRoot, `.${href}`);

  // Defense in depth: out-of-tree references are normally dropped earlier,
  // but a path such as `/../other-lang/foo.png` must never cause a read
  // outside the language root just to compute display dimensions.
  const insideRoot =
    candidate === langRoot || candidate.startsWith(`${langRoot}${path.sep}`);
  return insideRoot ? candidate : null;
}

/**
 * Create the Marked renderer overrides used for the web preview.
 *
 * - `heading` emits an id of the form `<chapterSlug>-<slug>` plus a hash-link,
 *   and records every heading in the caller-supplied `headings` array.
 * - `image` emits `<img class="doc-image">`, optionally sized, and wraps it in
 *   a numbered `<figure>` when a positive `chapterIndex` is given.
 * - `code` emits a `<pre><code>` block whose body comes from the pre-rendered
 *   Shiki map, from per-line highlight wrappers, or from escaped plain text.
 *
 * The renderer is synchronous: highlighted code must have been computed
 * beforehand and passed in through `options.highlightedCode`.
 *
 * @param chapterSlug Slug prefixed onto every heading id.
 * @param headings Output array that receives the rendered headings.
 * @param options Optional chapter context and pre-rendered code HTML.
 * @returns An object with `heading`, `image` and `code` renderer methods.
 */
function buildWebRenderer(
  chapterSlug: string,
  headings: Heading[],
  options?: {
    chapterIndex?: number;
    lang?: string;
    srcDir?: string;
    figureLabels?: Record<string, string>;
    /** Pre-rendered Shiki HTML keyed by `${lang}|||${code}`. */
    highlightedCode?: Map<string, string>;
  },
) {
  const chapterNumber = options?.chapterIndex ?? 0;
  const docLang = options?.lang;
  const sourceRoot = options?.srcDir;
  const shikiHtml = options?.highlightedCode;
  const figureLabel = getFigureLabel(options?.lang, options?.figureLabels);
  // Running figure number within the chapter.
  let figureCount = 0;

  return {
    heading(text: string, level: number, _raw: string): string {
      const label = stripHtmlTags(text);
      const anchorId = `${chapterSlug}-${slugify(label)}`;
      headings.push({ level, text: label, id: anchorId });
      const hashLink = `<a class="hash-link" href="#${anchorId}" aria-label="Direct link to ${escapeHtml(label)}">#</a>`;
      return `<h${level} id="${anchorId}">${text}${hashLink}</h${level}>\n`;
    },

    image(href: string, title: string | null, text: string): string {
      const titleAttr = title ? ` title="${title}"` : "";
      const { cleanAlt, sizeHint } = parseImageSizeHint(text || "");

      // Sizing precedence:
      //   1. An explicit size hint other than "auto" sets `width`.
      //   2. With no hint, the image header is read and the natural CSS
      //      width (pixel width × IMAGE_SCALE_FACTOR) becomes `max-width`.
      //   3. Otherwise (hint "auto", or no readable dimensions) no style is
      //      emitted and sizing is left to the stylesheet.
      let styleAttr = "";
      if (!sizeHint) {
        const diskPath = resolveWebImageDiskPath(href, sourceRoot, docLang);
        const dims = diskPath ? getImageDimensions(diskPath) : null;
        if (dims) {
          const naturalWidth = Math.round(dims.width * IMAGE_SCALE_FACTOR);
          styleAttr = ` style="max-width:${naturalWidth}px"`;
        }
      } else if (sizeHint !== "auto") {
        styleAttr = ` style="width:${sizeHint}"`;
      }

      const imgTag = `<img src="${href}" alt="${cleanAlt}" class="doc-image"${titleAttr}${styleAttr} />`;

      // Without a positive chapter index the image is emitted bare.
      if (!(chapterNumber > 0)) {
        return `${imgTag}\n`;
      }

      figureCount += 1;
      const figNum = `${figureLabel} ${chapterNumber}.${figureCount}`;
      const captionBody = cleanAlt
        ? `${figNum} &mdash; ${escapeHtml(cleanAlt)}`
        : figNum;
      return `<figure class="doc-figure">${imgTag}<figcaption>${captionBody}</figcaption></figure>\n`;
    },

    code(code: string, infostring: string | undefined): string {
      const info = infostring || "";
      const fenceLang = info.match(/^(\w+)/)?.[1] || "";
      const title = info.match(/data-title="([^"]*)"/)?.[1] || "";
      const highlightSpec = info.match(/data-highlight="([^"]*)"/)?.[1] || "";
      const highlightLines = parseHighlightLines(highlightSpec);
      const isShellSession =
        fenceLang === "shellsession" || fenceLang === "console";

      // Body selection:
      // - Shell-session fences always use the pre-rendered map, even when
      //   line highlighting is requested, because only that path strips the
      //   prompt characters from the markup.
      // - Other fences with highlighted lines get plain per-line wrappers;
      //   Shiki's token spans are not combined with them.
      // - Everything else uses the pre-rendered map, with escaped plain text
      //   as the fallback when the lookup misses.
      let codeHtml: string;
      if (!isShellSession && highlightLines.size > 0) {
        codeHtml = code
          .split("\n")
          .map((line, idx) => {
            const cls = highlightLines.has(idx + 1)
              ? "code-line highlighted"
              : "code-line";
            return `<span class="${cls}">${escapeHtml(line)}</span>`;
          })
          .join("\n");
      } else {
        codeHtml =
          shikiHtml?.get(`${fenceLang}|||${code}`) ?? escapeHtml(code);
      }

      // `language-{lang}` is kept as a CSS hook next to the `shiki-host` class.
      const langClass = fenceLang
        ? ` class="language-${fenceLang} shiki-host"`
        : ` class="shiki-host"`;
      const preBlock = `<pre${langClass}><code>${codeHtml}</code></pre>`;

      // Fences with neither language nor title are emitted as a bare <pre>.
      if (!fenceLang && !title) {
        return preBlock + "\n";
      }

      // Framed block: language pill, then the title slot (an empty,
      // aria-hidden placeholder when no title was given).
      const langPill = fenceLang
        ? `<span class="code-block-lang">${escapeHtml(fenceLang)}</span>`
        : "";
      const titleText = title
        ? `<span class="code-block-title-text">${escapeHtml(title)}</span>`
        : `<span class="code-block-title-text" aria-hidden="true"></span>`;
      return `<div class="code-block-wrapper"><div class="code-block-title">${langPill}${titleText}</div>${preBlock}</div>\n`;
    },
  };
}

/**
 * Optional switches for `processMarkdownFilesForWeb`.
 */
export interface WebProcessingOptions {
  /** Enable multi-page link resolution (default: false) */
  multiPage?: boolean;
  /**
   * If provided, the processor pushes every link diagnostic into this array
   * in addition to printing them as console warnings. The website generator
   * uses this to decide whether to fail under `--strict`.
   */
  diagnosticsSink?: LinkDiagnostic[];
}

/**
 * Remove the outer `<span class="line">…</span>` wrapper from a single
 * tokenized line so the caller can apply its own line wrapper without
 * nesting `.line` elements.
 *
 * The wrapper is removed only when the string both starts with the exact
 * opening tag and ends with `</span>`; any other shape is returned unchanged.
 *
 * @param html Inner HTML of one highlighted line.
 * @returns The HTML without the outer line span, or the input as-is.
 */
function unwrapShikiSingleLine(html: string): string {
  const prefix = '<span class="line">';
  const suffix = "</span>";
  const isWrapped = html.startsWith(prefix) && html.endsWith(suffix);
  return isWrapped ? html.slice(prefix.length, -suffix.length) : html;
}

/**
 * Walk the markdown's tokens and pre-tokenize every fenced code block via
 * Shiki. Returns a map keyed by `${lang}|||${rawCode}` whose value is the
 * pre-rendered inner HTML for `<code>`. The renderer in `buildWebRenderer`
 * reads from this map synchronously.
 *
 * Highlighting is awaited here so that the marked render pipeline itself can
 * stay synchronous.
 *
 * Shellsession blocks (`shellsession` / `console`, FR-2756) take a different
 * path: they are parsed line-by-line and the prompt is stripped from the
 * DOM. Cmd lines run through Shiki as `bash` in parallel (Promise.all) so a
 * long transcript doesn't serialize highlighter calls. Shellsession runs
 * even when `data-highlight=` is set on the fence — the line-highlight
 * overlay is not combined with shellsession (out of scope for v1), but the
 * prompt-stripping pass must not be skipped, otherwise literal `$` prompts
 * would leak into the DOM and clipboard.
 *
 * @param markdown Fully pre-processed markdown, exactly as it will be parsed.
 * @param theme Shiki theme name used for every block.
 * @returns Map from `${lang}|||${code}` to pre-rendered inner HTML.
 */
async function precomputeShikiBlocks(
  markdown: string,
  theme: string,
): Promise<Map<string, string>> {
  const map = new Map<string, string>();

  // Lex the same markdown the renderer will parse so we see the same `text`
  // marked passes to `code()`. Walking marked's lexer output (rather than
  // a regex over the raw markdown) handles list-indented code blocks, CRLF
  // normalization, and the trailing-newline trimming the lexer applies —
  // any of which would cause key mismatches if we tokenized by regex.
  const lexer = new Marked();
  const tokens = lexer.lexer(markdown);

  type AnyToken = {
    type: string;
    lang?: string;
    text?: string;
    tokens?: AnyToken[];
    items?: AnyToken[];
  };

  const visit = async (node: AnyToken | AnyToken[]): Promise<void> => {
    if (Array.isArray(node)) {
      for (const child of node) await visit(child);
      return;
    }
    if (node.type === "code" && typeof node.text === "string") {
      const info = (node.lang ?? "").trim();
      const langMatch = info.match(/^(\w+)/);
      const lang = langMatch?.[1] ?? "";
      const isShellSession = lang === "shellsession" || lang === "console";
      // NOTE(review): this regex only approximates the renderer's check,
      // which is `parseHighlightLines(spec).size > 0`. If the two ever
      // disagree the renderer's lookup misses and it falls back to escaped
      // plain text — confirm whether the predicates should be unified.
      const hasLineHighlight = /data-highlight="[^"]*\d+/.test(info);

      // For non-shellsession blocks with line-highlight, the renderer
      // takes its legacy line-wrapping path and ignores the Shiki map,
      // so we skip pre-tokenization. Shellsession blocks must NOT skip
      // — even with highlighting set, they need the prompt-stripping
      // pass or `$` characters would land in the DOM/clipboard.
      if (hasLineHighlight && !isShellSession) return;

      const key = `${lang}|||${node.text}`;
      // Blocks are visited strictly one after another, so a key present in
      // the map has already been fully rendered and can be skipped.
      if (!map.has(key)) {
        if (isShellSession) {
          // Terminal transcript: split into prompt/cmd/output rows. The
          // prompt is kept out of the element's text content and carried in
          // `data-prompt` instead, so copied text never includes it
          // (FR-2756).
          const lines = parseShellSessionLines(node.text);
          const segments = await Promise.all(
            lines.map(async (ln) => {
              if (ln.type === "cmd") {
                const tokenized = await shikiHighlight({
                  code: ln.text,
                  lang: "bash",
                  theme,
                });
                // Unwrap Shiki's own line span so our .cmd-line wrapper does
                // not end up with a nested .line element inside it.
                const inner = unwrapShikiSingleLine(tokenized.innerHtml);
                // The prompt goes into an attribute value, so it is escaped
                // like every other piece of text emitted by this function.
                const promptAttr = escapeHtml(String(ln.prompt));
                return `<span class="line cmd-line" data-prompt="${promptAttr}">${inner}</span>`;
              }
              return `<span class="line output-line">${escapeHtml(ln.text)}</span>`;
            }),
          );
          map.set(key, segments.join("\n"));
        } else {
          const result = await shikiHighlight({
            code: node.text,
            lang,
            theme,
          });
          map.set(key, result.innerHtml);
        }
      }
    }
    if (node.tokens) await visit(node.tokens);
    if (node.items) await visit(node.items);
  };

  await visit(tokens as unknown as AnyToken[]);
  return map;
}

/**
 * Render every navigation entry of one language into web-preview chapters.
 *
 * Pass 1 reads each markdown file, runs the pre-processing pipeline,
 * pre-tokenizes code blocks and renders HTML. An anchor registry is then
 * built from the rendered output, and pass 2 rewrites fragment links against
 * it. Link diagnostics are printed to the console and, when
 * `options.diagnosticsSink` is given, also appended to that array.
 *
 * Navigation entries whose markdown path cannot be resolved are skipped with
 * a console warning and do not consume a chapter index.
 *
 * @param lang Language directory name under `srcDir`.
 * @param navigation Ordered chapter list (title plus markdown path).
 * @param srcDir Root directory holding the per-language sources.
 * @param version Version string substituted into template variables.
 * @param config Optional toolkit configuration (path fallbacks, admonition
 *   titles, figure labels, code theme).
 * @param options Optional processing switches.
 * @returns The rendered chapters in navigation order.
 */
export async function processMarkdownFilesForWeb(
  lang: string,
  navigation: NavEntry[],
  srcDir: string,
  version: string,
  config?: ResolvedDocConfig,
  options?: WebProcessingOptions,
): Promise<Chapter[]> {
  const pathFallbacks = config?.pathFallbacks ?? {};
  const admonitionTitles = config?.admonitionTitles;
  const figureLabels = config?.figureLabels;
  // Without a config, use the default light theme for code highlighting.
  const shikiTheme = config?.code?.lightTheme ?? DEFAULT_CODE_LIGHT_THEME;

  const collected: LinkDiagnostic[] = [];
  const rendered: RenderedChapter[] = [];
  let chapterIndex = 0;

  // ── Pass 1: markdown → HTML, one chapter at a time ───────────
  for (const nav of navigation) {
    let mdPath: string;
    try {
      mdPath = resolveMarkdownPath(lang, nav.path, srcDir, pathFallbacks);
    } catch {
      console.warn(`Skipping missing file: ${nav.path} (${lang})`);
      continue;
    }

    chapterIndex += 1;
    const rawSource = fs.readFileSync(mdPath, "utf-8");
    const chapterSlug = slugFromNavPath(nav.path);

    // Pre-processing shared with the PDF processor; order is significant.
    const withSingleH1 = deduplicateH1(rawSource);
    const withVars = substituteTemplateVars(withSingleH1, version);
    const withWebImages = rewriteImagePathsForWeb(
      withVars,
      mdPath,
      lang,
      srcDir,
    );
    const withTables = normalizeRstTables(withWebImages);
    const withNotes = convertIndentedNotes(withTables);

    // Extended syntax: admonitions, then code-fence metadata.
    const withAdmonitions = processAdmonitions(
      withNotes,
      lang,
      admonitionTitles,
    );
    const source = processCodeBlockMeta(withAdmonitions);

    // Highlight code up front so the synchronous renderer only does lookups.
    const highlightedCode = await precomputeShikiBlocks(source, shikiTheme);

    const headings: Heading[] = [];
    const marked = new Marked();
    marked.use({
      renderer: buildWebRenderer(chapterSlug, headings, {
        chapterIndex,
        lang,
        srcDir,
        figureLabels,
        highlightedCode,
      }),
    });
    const htmlContent = await marked.parse(source);

    rendered.push({
      chapter: { title: nav.title, slug: chapterSlug, htmlContent, headings },
      filePath: nav.path,
    });
  }

  // ── Anchor registry from the rendered output ─────────────────
  const registry = buildAnchorRegistryFromRendered(rendered);
  detectDuplicateAnchors(registry, collected);

  // ── Pass 2: resolve fragment links against the registry ──────
  const multiPage = options?.multiPage ?? false;
  for (const item of rendered) {
    const repaired = fixMalformedCrossReferences(
      item.chapter.htmlContent,
      item.chapter.slug,
    );
    item.chapter.htmlContent = rewriteCrossPageLinks(
      repaired,
      item.chapter.slug,
      registry,
      collected,
      item.filePath,
      multiPage,
    );
  }

  // ── Reporting ────────────────────────────────────────────────
  if (collected.length > 0) {
    reportLinkDiagnostics(collected);
  }

  // Hand the findings to the caller as well (used by --strict).
  const sink = options?.diagnosticsSink;
  if (sink) {
    for (const finding of collected) sink.push(finding);
  }

  return rendered.map((item) => item.chapter);
}

/**
 * Render in-memory markdown samples (style catalog mode) into chapters.
 *
 * Each entry goes through the extended-syntax pre-processing, code-block
 * pre-tokenization and the web renderer. No file is read, and none of the
 * file-based steps (template variables, image path rewriting, link
 * resolution) are applied. Entries are processed one after another and the
 * result preserves their order.
 *
 * @param entries Samples to render; `title` also determines the chapter slug.
 * @returns One rendered chapter per entry.
 */
export async function processCatalogMarkdownForWeb(
  entries: Array<{ title: string; markdown: string }>,
): Promise<Chapter[]> {
  const chapters: Chapter[] = [];

  for (const entry of entries) {
    const sample = entry.markdown;
    const chapterSlug = slugify(entry.title);

    // Extended syntax: admonitions first, then code-fence metadata.
    const source = processCodeBlockMeta(processAdmonitions(sample));

    // No toolkit config exists in catalog mode, so the default light theme
    // is always used for code highlighting.
    const highlightedCode = await precomputeShikiBlocks(
      source,
      DEFAULT_CODE_LIGHT_THEME,
    );

    const headings: Heading[] = [];
    const marked = new Marked();
    marked.use({
      renderer: buildWebRenderer(chapterSlug, headings, { highlightedCode }),
    });
    const htmlContent = await marked.parse(source);

    chapters.push({
      title: entry.title,
      slug: chapterSlug,
      htmlContent,
      headings,
    });
  }

  return chapters;
}