riksdagsmonitor/scripts/render-lib/article.ts at main · Hack23/riksdagsmonitor · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
/**
 * @module Infrastructure/RenderLib/Article
 * @category Intelligence Operations / Supporting Infrastructure
 * @name End-to-end article composer: markdown + chrome + JSON-LD
 *
 * @description
 * Orchestrates the article pipeline's final stage. Given an aggregated
 * `article.md` (produced by the {@link ../aggregator.js | aggregator}
 * module) plus the target language + canonical path, produces a
 * complete `<!DOCTYPE html>` page ready to be written to
 * `news/$DATE-$SUBFOLDER-$LANG.html`.
 *
 * The composer does three things:
 * 1. Parse front-matter with `gray-matter` to pull `title` + `description`
 *    + `date` out of the aggregated markdown
 * 2. Call {@link renderMarkdownToHtml} on the body
 * 3. Build Schema.org `NewsArticle` JSON-LD, hand it to
 *    {@link buildChrome}, and concatenate the resulting head/header/footer
 *    around the rendered body + a footer "Analysis sources" block
 *    listing every artifact consumed by the aggregator
 *
 * Round-4 architecture split: extracted from the former monolithic
 * `render-lib/index.ts`. This module is the **single consumer** of the
 * markdown + chrome + aggregator modules taken together — keeping it in
 * its own file makes the orchestration logic obvious without
 * interleaving it with any of the building blocks.
 *
 * @author Hack23 AB (Infrastructure Team)
 * @license Apache-2.0
 */

import matter from 'gray-matter';
import path from 'path';

import type { Language } from '../types/language.js';
import { LANGUAGE_META, escapeHtml } from '../sitemap-html/index.js';
import { BASE_URL } from './constants.js';
import { buildGithubBlobUrl } from './url-helpers.js';
import { renderMarkdownToHtml } from './markdown/index.js';
import { buildChrome } from './chrome.js';
import { buildBreadcrumbListLd, buildNewsArticleLd, buildSpeakableWebPageLd, BREADCRUMB_TITLE_MAX_LENGTH, BREADCRUMB_ELLIPSIS_OVERHEAD } from './jsonld.js';

import { articleTypeIcon } from './article-type-i18n.js';
import { computeArticleHeadMetadata } from './article-head-metadata.js';
import {
  readFirstHeading,
  cleanArticleTitle,
  titleFromBluf,
  readHeadlineParagraph,
} from './aggregator/seo/title.js';
import {
  composeRichDescription,
  readBlufParagraph,
  readFirstParagraph,
  truncateToSentenceBoundary,
  descriptionWindowForLanguage,
} from './aggregator/seo/description.js';
import { extractLocalizedBriefSeo } from './aggregator/seo/localized-brief.js';
import {
  extractBriefEntities,
  flattenBriefEntities,
} from './aggregator/seo/brief-extractor.js';
import { titleWindowForLanguage } from './aggregator/seo/serp-budgets.js';
/**
 * @deprecated Re-exported from `article-head-metadata.ts`. The function
 * bodies (`parseFrontMatterDate`, `inferArticleType`) live there now so the
 * renderer, regenerator and QA tooling all call exactly one implementation.
 * This export only exists to preserve the historical
 * `import { parseFrontMatterDate } from './article.js'` import sites
 * (notably `tests/render-lib-architecture.test.ts`).
 */
export { parseFrontMatterDate, inferArticleType } from './article-head-metadata.js';
import {
  renderReaderNavigation,
  renderAnalysisArtifactsReference,
  renderMethodsReference,
  renderPoliticalContext,
} from './article-aside.js';
import { enrichArticleMarkdownWithPoliticalContext } from './political-context.js';
import { applyScannabilityTransforms, transformProgressiveDisclosure } from './article-scannability.js';
import { localizeExecutiveBriefLead } from './article-brief-lead.js';

/**
 * CSS selectors marking the parts of an article page that voice
 * assistants may read aloud. Handed to `buildSpeakableWebPageLd` by
 * `renderArticleHtml`; each selector has to keep matching a class name
 * emitted by the article template at the end of that function
 * (`rm-article-header`, `rm-article-dek`, `rm-article-body`).
 */
const ARTICLE_SPEAKABLE_SELECTORS: ReadonlyArray<string> = [
  '.rm-article-header h1',
  '.rm-article-dek',
  '.rm-article-body',
];

/**
 * Input bundle accepted by `renderArticleHtml`. Only `markdown`, `lang`
 * and `canonicalPath` are required; every other field is optional and
 * the renderer degrades gracefully when it is absent.
 */
export interface RenderArticleInput {
  /** Aggregated markdown (front-matter + body) produced by aggregateAnalysis. */
  readonly markdown: string;
  /** Language code. */
  readonly lang: Language;
  /** Canonical path (e.g. `news/2026-04-23/propositions-en.html`). */
  readonly canonicalPath: string;
  /** Hreflang alternates map (optional). Forwarded unchanged to `buildChrome`. */
  readonly hreflangAlternates?: Partial<Record<Language, string>>;
  /**
   * Repo-relative path of the analysis subfolder. Used to build GitHub
   * blob URLs for the artifacts in the analysis references block and the
   * JSON-LD `isBasedOn` list, and as the base against which relative
   * `.md` links in the rendered body are resolved.
   */
  readonly subfolderRepoRelPath?: string;
  /**
   * Ordered list of artifacts used (shown in the footer). Also mapped
   * into the `NewsArticle` JSON-LD `isBasedOn` entries and passed to the
   * reader-navigation block.
   */
  readonly artifactsUsed?: readonly string[];
  /**
   * Raw English `executive-brief.md` markdown adjacent to `article.md`.
   * When provided, the renderer derives `<title>` (from the brief H1
   * via {@link cleanArticleTitle}) and `<meta description>` (from the
   * BLUF via {@link composeRichDescription} / {@link readBlufParagraph}
   * → {@link truncateToSentenceBoundary}) **directly from the brief**,
   * bypassing the (now back-compat-only) `article.md` frontmatter
   * `title:` / `description:` lines.
   *
   * When omitted (the 278 pre-`2026-03-26` legacy `news/*-en.html`
   * articles whose `analysis/daily/<date>/` source directories have
   * been deleted), the renderer gracefully falls back to whatever
   * `article.md` frontmatter is available — keeps existing legacy
   * SEO intact without throwing.
   *
   * The subfolder slug (`propositions`, `committee-reports`, …) is
   * sourced from {@link subfolderSlug}; defaults to the empty string
   * which simply skips the article-type boilerplate scrub inside
   * {@link cleanArticleTitle}.
   */
  readonly englishBriefMarkdown?: string;
  /**
   * Raw localized `executive-brief_<lang>.md` markdown when one exists
   * for `input.lang`. When provided and `lang !== 'en'`, the renderer
   * derives title + description from the localized brief (per the
   * cascade-chain step #2 in `Article-Generation.md § "Per-language
   * precedence chain"`); when the localized brief has a banned /
   * missing H1 / BLUF, fields independently fall through to the
   * {@link englishBriefMarkdown} cascade.
   */
  readonly localizedBriefMarkdown?: string;
  /**
   * Subfolder slug (`propositions`, `committee-reports`, …). Forwarded
   * to {@link cleanArticleTitle} so brief H1s that simply repeat the
   * article-type label are scrubbed before truncation. Optional; an
   * empty string disables that scrub.
   */
  readonly subfolderSlug?: string;
}

/**
 * Map a single `.md` link target onto its canonical GitHub blob URL.
 *
 * Decision order (first rule that applies wins):
 *  1. Empty path part, pure `#fragment` or `mailto:` → returned as-is.
 *  2. `raw.githubusercontent.com` or `github.com/.../(blob|tree)/` URLs
 *     of this repository on `main`/`master` → blob URL of the captured
 *     repo-relative path.
 *  3. Any other absolute `http(s)` URL → returned as-is.
 *  4. A path that, after any leading `./` / `../` segments, starts with
 *     `analysis/` → blob URL of that `analysis/…` path.
 *  5. Otherwise the path is resolved against `subfolderRepoRelPath`;
 *     without a subfolder, or when the resolved path starts with `..`,
 *     the href is returned as-is.
 *
 * A fragment, when present, is re-attached to the blob URL. Only the
 * text between the first and second `#` is kept as the fragment.
 *
 * @param href - Raw `href` attribute value.
 * @param subfolderRepoRelPath - Repo-relative base directory for rule 5.
 * @returns The canonical href, or `href` itself when no rule rewrites it.
 */
function canonicalizeMarkdownHrefTarget(
  href: string,
  subfolderRepoRelPath: string | undefined,
): string {
  const pieces = href.split('#', 2);
  const target = pieces[0];
  const fragment = pieces[1];
  if (!target) return href;

  // Build the blob URL for a repo-relative path and put the fragment back.
  const blobUrlFor = (repoPath: string): string => {
    const url = buildGithubBlobUrl(repoPath.replace(/^\/+/, ''));
    return fragment ? `${url}#${fragment}` : url;
  };

  if (/^(#|mailto:)/i.test(href)) return href;

  // Absolute URLs that already point into this repository.
  const ownRepoUrlPatterns: readonly RegExp[] = [
    /^https:\/\/raw\.githubusercontent\.com\/Hack23\/riksdagsmonitor\/(?:main|master)\/(.+\.md)$/i,
    /^https:\/\/github\.com\/Hack23\/riksdagsmonitor\/(?:blob|tree)\/(?:main|master)\/(.+\.md)$/i,
  ];
  for (const pattern of ownRepoUrlPatterns) {
    const hit = pattern.exec(target);
    if (hit?.[1]) return blobUrlFor(hit[1]);
  }

  // Foreign absolute URLs are left alone.
  if (/^https?:\/\//i.test(target)) return href;

  const analysisHit = /^(?:\.\/|\.\.\/)*\/?(analysis\/.+\.md)$/i.exec(target);
  if (analysisHit?.[1]) return blobUrlFor(analysisHit[1]);

  if (!subfolderRepoRelPath) return href;

  const resolved = path.posix.normalize(path.posix.join(subfolderRepoRelPath, target));
  // A result starting with `..` would leave the repository root.
  return resolved.startsWith('..') ? href : blobUrlFor(resolved);
}

/**
 * Rewrite every `<a … href="….md">` (optionally with a `#fragment`) in
 * rendered body HTML so the link target is the value returned by
 * `canonicalizeMarkdownHrefTarget`. The text before the attribute value
 * and the original quote character are preserved.
 *
 * @param bodyHtml - Rendered article body HTML.
 * @param subfolderRepoRelPath - Base directory for resolving relative links.
 * @returns The HTML with markdown link targets canonicalized.
 */
export function rewriteMarkdownHrefsInHtml(
  bodyHtml: string,
  subfolderRepoRelPath: string | undefined,
): string {
  const markdownAnchorHref = /(<a\b[^>]*\bhref=)(['"])([^"']+\.md(?:#[^"']*)?)(\2)/gi;
  const rewriteOne = (
    _whole: string,
    prefix: string,
    quoteChar: string,
    target: string,
  ): string => {
    const canonical = canonicalizeMarkdownHrefTarget(target, subfolderRepoRelPath);
    return `${prefix}${quoteChar}${canonical}${quoteChar}`;
  };
  return bodyHtml.replace(markdownAnchorHref, rewriteOne);
}

/**
 * Matches one aggregator-injected H2 section: the heading line
 * (`## Reader Intelligence Guide…` or `## Article Sources…`, any case)
 * followed by every line up to, but not including, the next line that
 * starts with `## `, or up to end-of-string.
 *
 * The heading line is terminated by `\n` **or** end-of-string so a
 * heading that is the very last line of the body (no trailing newline)
 * is matched as well.
 */
const DUPLICATE_SECTION_RE =
  /^##\s+(?:Reader Intelligence Guide|Article Sources)[^\n]*(?:\n|$)(?:(?!^## )[^\n]*\n?)*/gim;

/**
 * Strip the markdown-based "Reader Intelligence Guide" table and the
 * "Article Sources" appendix from the article body. These sections are
 * injected by the aggregator in English-only; the renderer emits
 * properly localized, styled HTML versions via chrome, so the markdown
 * duplicates must be removed to avoid showing the same content twice
 * (once untranslated, once translated).
 *
 * Matches:
 * - `## Reader Intelligence Guide` (any case) + all content until the
 *   next H2 or end-of-string.
 * - `## Article Sources` + all content until the next H2 or end-of-string.
 *
 * H3 and deeper headings inside a stripped section are removed with it.
 *
 * Exported for testability.
 *
 * @param body - Article markdown body (front-matter already removed).
 * @returns The body without the two duplicate sections.
 */
export function stripBodyDuplicateSections(body: string): string {
  // `String.prototype.replace` resets `lastIndex` on a global regex, so
  // sharing the module-level instance between calls is safe.
  return body.replace(DUPLICATE_SECTION_RE, '');
}

/**
 * Cut rendered article body HTML in two at the second opening `<h2`
 * tag (case-insensitive, followed by whitespace or `>`):
 *
 *   - `lead` — everything before the second `<h2`. By aggregator
 *              contract the first H2 is **Executive Brief**, so this is
 *              the opening BLUF / executive summary.
 *   - `rest` — the second `<h2` and everything after it.
 *
 * The renderer lays the page out as
 * `header → lead → reader-guide → rest → sources`, so the reader guide
 * lands between the executive brief and the deep analysis.
 *
 * When the body has fewer than two `<h2` openers the whole body is
 * returned as `lead` and `rest` is the empty string.
 *
 * Exported for testability.
 *
 * @param bodyHtml - Rendered body HTML.
 * @returns The `lead` / `rest` pair; concatenating them yields `bodyHtml`.
 */
export function splitBodyAtSecondH2(bodyHtml: string): { lead: string; rest: string } {
  const h2Opener = /<h2[\s>]/i;
  const unsplit = { lead: bodyHtml, rest: '' };

  const firstAt = bodyHtml.search(h2Opener);
  if (firstAt < 0) return unsplit;

  // Resume scanning right after the 4-character opener just matched.
  const resumeAt = firstAt + 4;
  const secondOffset = bodyHtml.slice(resumeAt).search(h2Opener);
  if (secondOffset < 0) return unsplit;

  const boundary = resumeAt + secondOffset;
  return {
    lead: bodyHtml.slice(0, boundary),
    rest: bodyHtml.slice(boundary),
  };
}

/**
 * Derive `<title>` / `<meta description>` / keyword-entity overrides
 * from the executive-brief markdown adjacent to `article.md`. Pure
 * function — no I/O, no clock.
 *
 * Resolution order:
 *
 *  1. For non-EN, prefer {@link RenderArticleInput.localizedBriefMarkdown}
 *     via {@link extractLocalizedBriefSeo}. Title and description are
 *     resolved **independently**: a banned title with a clean BLUF
 *     still localizes the description, and a clean title with an
 *     empty BLUF still localizes the title.
 *  2. Whatever field is still `null` after step 1 falls through to the
 *     English brief — title via {@link readFirstHeading} →
 *     {@link cleanArticleTitle}, then the headline-section paragraph,
 *     then the BLUF / first paragraph (both via {@link titleFromBluf});
 *     description via {@link composeRichDescription} ∥
 *     {@link readBlufParagraph} ∥ {@link readFirstParagraph}, capped by
 *     the per-language SERP window in {@link truncateToSentenceBoundary}.
 *  3. Entities are mined from the brief (localized first, EN
 *     fallback) — universal-Swedish identifiers (HD03267, JuU/SfU)
 *     carry across locales.
 *  4. Title and description are passed through the admin-byline value
 *     scrubber and then capped at the per-language `hardMax`. A field
 *     that the scrubber reduces to the empty string is reported as
 *     `undefined` — never as `''` — so the caller's fallback applies.
 *
 * Returns `{ title: undefined, description: undefined, entities: [] }`
 * when no brief markdown is provided so the head-metadata helper falls
 * back to the legacy frontmatter-only audit path (covers the 278 pre-
 * `2026-03-26` `news/*-en.html` files whose source `analysis/daily/`
 * directories have been deleted).
 *
 * Exported for testability.
 *
 * @param input - Target language, the optional English / localized
 *   brief markdown, and the optional subfolder slug.
 * @returns Override values; `undefined` means "no override for this field".
 */
export function deriveBriefSeoOverrides(input: {
  readonly lang: Language;
  readonly englishBriefMarkdown?: string;
  readonly localizedBriefMarkdown?: string;
  readonly subfolderSlug?: string;
}): {
  readonly title: string | undefined;
  readonly description: string | undefined;
  readonly entities: readonly string[];
} {
  const subfolder = input.subfolderSlug ?? '';
  // Narrow each optional brief exactly once: a missing or whitespace-only
  // brief becomes `undefined`, anything else is kept verbatim. This
  // replaces the previous boolean flags + non-null assertions.
  const enBrief = input.englishBriefMarkdown?.trim()
    ? input.englishBriefMarkdown
    : undefined;
  const locBrief = input.localizedBriefMarkdown?.trim()
    ? input.localizedBriefMarkdown
    : undefined;
  if (enBrief === undefined && locBrief === undefined) {
    return { title: undefined, description: undefined, entities: [] };
  }

  let title: string | null = null;
  let description: string | null = null;
  let entities: readonly string[] = [];

  // Step 1 — localized brief (non-EN only).
  if (input.lang !== 'en' && locBrief !== undefined) {
    const briefSeo = extractLocalizedBriefSeo({
      briefMarkdown: locBrief,
      subfolder,
      lang: input.lang,
    });
    if (briefSeo.title) title = briefSeo.title;
    if (briefSeo.description) description = briefSeo.description;
    if (briefSeo.keywords.length > 0) entities = briefSeo.keywords;
  }

  // Step 2 — English brief fallback for any field still unresolved.
  if (enBrief !== undefined) {
    if (title === null) {
      const rawH1 = readFirstHeading(enBrief);
      const cleaned = cleanArticleTitle(rawH1, subfolder, 'en');
      if (cleaned && cleaned.length > 0) title = cleaned;
    }
    // Step 2a — Headline-section paragraph. Many briefs have a dedicated
    // `## Headline` or `## Intelligence Summary` section whose first
    // paragraph is a purpose-written headline sentence — far better than
    // the generic BLUF truncation for SERP titles.
    if (title === null) {
      const headlinePara = readHeadlineParagraph(enBrief);
      const synthesised = titleFromBluf(headlinePara);
      if (synthesised && synthesised.length > 0) title = synthesised;
    }
    // Step 2b — BLUF-synthesised title when H1 is boilerplate/missing.
    // The brief body (## Headline, ## Intelligence Summary, or BLUF para)
    // often contains story-specific first sentences that serve better as
    // SERP titles than the generic category-label fallback.
    if (title === null) {
      const bluf = readBlufParagraph(enBrief) ?? readFirstParagraph(enBrief);
      const synthesised = titleFromBluf(bluf);
      if (synthesised && synthesised.length > 0) title = synthesised;
    }
    if (description === null) {
      // Rich description (BLUF + headline-section bullets) — mirrors
      // the aggregator's English path. **Use `'en'` here** because we
      // are extracting from the English brief markdown:
      // `composeRichDescription` does language-specific section-heading
      // lookup (`extractHeadlineSection(..., lang)`), and passing a
      // non-EN lang would miss the English `## 60-Second Read` section
      // and degrade rich descriptions for non-EN pages that fall back
      // to EN content. The per-language SERP window is then enforced
      // by the `capByWordBoundary` hardMax cap below for `input.lang`.
      const composed = composeRichDescription(enBrief, 'en');
      if (composed && composed.length > 0) {
        description = composed;
      } else {
        const bluf = readBlufParagraph(enBrief) ?? readFirstParagraph(enBrief);
        if (bluf && bluf.trim().length > 0) {
          const { softMin, hardMax } = descriptionWindowForLanguage(input.lang);
          const truncated = truncateToSentenceBoundary(bluf, softMin, hardMax);
          if (truncated.length > 0) description = truncated;
        }
      }
    }
    if (entities.length === 0) {
      entities = flattenBriefEntities(extractBriefEntities(enBrief, 'en'));
    }
  }

  // Final defence-in-depth admin-byline VALUE scrubber. The localized
  // `executive-brief_<lang>.md` files use translated admin labels
  // (`**Författare**`, `**المؤلف**`, `**Kirjoittaja**`, `**Forfatter**`,
  // `**著者**`, `**Upphovsman**`, etc.) and not every translation is
  // present in `ADMIN_FIELD_NAMES`. When a label is unrecognised the
  // upstream extractors leak the VALUE (the editorial-byline name, the
  // run-ID digits, the classification banner, the confidence grade)
  // into the rendered SEO surface — a journalist searching for a
  // specific date then sees `James Pether Sörling…` as the SERP
  // snippet instead of the BLUF. This scrubber is label-agnostic: it
  // matches the known VALUES that should never ship and removes them
  // from the composed title / description.
  //
  // A field that consisted of nothing but admin values scrubs down to
  // the empty string; treat that as "unresolved" rather than shipping
  // an empty override, matching every assignment above which refuses
  // empty strings.
  if (title !== null) {
    const scrubbedTitle = scrubAdminBylineValues(title);
    title = scrubbedTitle.length > 0 ? scrubbedTitle : null;
  }
  if (description !== null) {
    const scrubbedDescription = scrubAdminBylineValues(description);
    description = scrubbedDescription.length > 0 ? scrubbedDescription : null;
  }

  // Final per-language ceiling enforcement. The localized-brief and
  // EN-fallback paths each have their own truncation logic but the
  // contract on this function is `length <= hardMax(lang)` for both
  // title and description. Defense-in-depth: any future extractor
  // upstream that forgets to truncate still ships within budget.
  if (title !== null) {
    const { hardMax: titleMax } = titleWindowForLanguage(input.lang);
    if (title.length > titleMax) {
      title = capByWordBoundary(title, titleMax);
    }
  }
  if (description !== null) {
    const { hardMax: descMax } = descriptionWindowForLanguage(input.lang);
    if (description.length > descMax) {
      description = capByWordBoundary(description, descMax);
    }
  }

  return {
    title: title ?? undefined,
    description: description ?? undefined,
    entities: entities.filter((e) => !isAdminLeakEntity(e)),
  };
}

/**
 * Detector for individual entity strings that encode admin-byline VALUES
 * (editorial byline name, run-ID, classification banner, confidence
 * grade, Admiralty grade, GDPR article). The entity miner picks up
 * capitalized phrases like `Confidence HIGH` / `Classification PUBLIC`
 * from the leading admin block when its label is not in
 * `ADMIN_FIELD_NAMES`; filtering them at the boundary of
 * `deriveBriefSeoOverrides` guarantees they never reach the keyword
 * line or JSON-LD `keywords` array.
 *
 * Note on `\b`: without the `u`/`v` flag JavaScript's `\b` only knows
 * ASCII word characters, so `\b` directly in front of a CJK label can
 * only succeed when the preceding character is an ASCII letter, digit
 * or `_` — never at the start of the string or after whitespace. The
 * run-ID pattern therefore applies the leading `\b` to the Latin-script
 * labels only; the CJK labels are matched wherever they occur.
 *
 * None of the patterns carries the `g` flag, so `RegExp.prototype.test`
 * is stateless here.
 */
const ADMIN_LEAK_ENTITY_PATTERNS: readonly RegExp[] = [
  /James\s+Pether\s+S(?:ö|o)rling/i,
  /\bHack23\s+AB\b/i,
  /(?:\b(?:Run[-\s]?ID|K[öo]rnings[-\s]?ID|Lauf[-\s]?ID|Ajo[-\s]?ID)|实行ID|実行ID|운영\s*ID|实例ID)\b/i,
  /(?:Confidence|Konfidens(?:nivå)?|Konfidenz|Luottamustaso|信頼度|信心度|Niveau de confiance|Nivel de confianza|Betrouwbaarheid)\s*[:：]?\s*(?:HIGH|HØJ|HØY|KORKEA|高い|高|HOCH|Élevé|Alto|Hoog|عالٍ?|גבוה)/i,
  /\bClassification\b\s*[:：]?\s*PUBLIC/i,
  /\bAdmiralty\s+(?:Range|Baseline|Code|Grade|Scale)\b/i,
  /\bGDPR\s+Art\b/i,
];

/**
 * Report whether an entity string matches any admin-leak pattern.
 *
 * @param entity - Candidate keyword / entity string.
 * @returns `true` when the entity must be dropped from the SEO keywords.
 */
function isAdminLeakEntity(entity: string): boolean {
  return ADMIN_LEAK_ENTITY_PATTERNS.some((pattern) => pattern.test(entity));
}

/**
 * Admin-byline VALUE patterns that must never appear in a shipped
 * SEO title or description regardless of source language. Each entry
 * captures a value that is generated by the brief pipeline itself
 * (editorial byline, classification banner, run-ID, OSINT Admiralty
 * grade) so removing them never destroys article-content meaning.
 *
 * Most patterns extend lazily from the matched value to the next
 * separator (`—`, `•`, `·`, `|`), the next `". "` sentence break, or
 * end-of-string, so trailing residue on the same segment is removed
 * together with the value.
 *
 * Note on `\b`: without the `u`/`v` flag `\b` is ASCII-only, so it
 * fails between two non-ASCII characters and between a non-ASCII
 * letter and punctuation/whitespace. The patterns therefore use `\b`
 * only next to ASCII letters:
 *  - run-ID: the leading `\b` guards the Latin-script labels only;
 *  - confidence: the leading `\b` guards the Latin-script labels only,
 *    there is no boundary between label and value (so `Konfidensnivå:`
 *    and `信頼度:` are recognised), and the trailing `\b` guards only
 *    the values that end in an ASCII letter (keeps `HIGHER` /
 *    `Altogether` from matching). A `[B2]`-style grade suffix after
 *    `HIGH` is consumed by the lazy tail.
 */
const ADMIN_VALUE_SCRUB_PATTERNS: readonly RegExp[] = [
  /James\s+Pether\s+S(?:ö|o)rling[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
  /\bHack23\s+AB\b[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
  /(?:\b(?:Run[-\s]?ID|K[öo]rnings[-\s]?ID|Lauf[-\s]?ID|Ajo[-\s]?ID)|实行ID|実行ID|운영\s*ID|实例ID)\b\s*[:：]?\s*\d{6,}/gi,
  /(?:\b(?:Confidence|Konfidens(?:nivå)?|Konfidenz|Luottamustaso|Niveau de confiance|Nivel de confianza|Betrouwbaarheid)|信頼度|信心度)\s*[:：]?\s*(?:(?:HIGH|HØJ|HØY|KORKEA|HOCH|Alto|Hoog)\b|高い|高|Élevé|عالٍ?|גבוה)[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
  /\bClassification\b\s*[:：]?\s*PUBLIC[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
  /\bAdmiralty\s+(?:Range|Baseline|Code|Grade|Scale)\b[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
  /\bGDPR\s+Art\.?\s*\d+(?:\(\d+\))?(?:\([a-z](?:[,;]\s*[a-z])*\))?[^\n]*?(?=(?:\s*[—•·|]\s*|\s*\.\s|$))/gi,
];

/**
 * Strip admin-byline VALUES (editorial name, run-ID, classification
 * banner, confidence grade) from the final SEO string. Pure
 * label-agnostic — works on any source language.
 *
 * @param text - Composed title or description.
 * @returns The text with every matched span replaced by a space, then
 *   tidied: orphaned separators removed, whitespace collapsed, no space
 *   before punctuation, trimmed. May be the empty string when the
 *   input consisted of admin values only.
 */
function scrubAdminBylineValues(text: string): string {
  let cleaned = text;
  // `replace` with a global regex always scans from index 0, so the
  // shared pattern instances carry no state between calls.
  for (const re of ADMIN_VALUE_SCRUB_PATTERNS) {
    cleaned = cleaned.replace(re, ' ');
  }
  // Collapse any " — " / "•" / "·" / "|" residue left behind by the
  // scrub plus tidy whitespace and dangling punctuation.
  return cleaned
    .replace(/\s*[—•·|]\s*(?=[—•·|])/g, ' ')
    .replace(/\s*[—•·|]\s*$/u, '')
    .replace(/^\s*[—•·|]\s*/u, '')
    .replace(/\s{2,}/g, ' ')
    .replace(/\s+([,.;:!?])/g, '$1')
    .trim();
}

/**
 * Word-boundary truncation with `…` ellipsis. Used as a defence-in-depth
 * cap on `deriveBriefSeoOverrides` output. The upstream extractors each
 * apply their own language-aware truncation; this only fires when the
 * result still exceeds the hard ceiling (e.g. an H1 that is itself
 * longer than the per-language `hardMax`).
 *
 * The function cuts at the last space when that space lies beyond 55%
 * of `maxLen`; otherwise it falls back to a hard slice. Lengths are
 * measured in UTF-16 code units, but the cut never lands between the
 * two halves of a surrogate pair: a trailing lone high surrogate is
 * dropped so the result stays a well-formed string.
 *
 * @param text - String to cap.
 * @param maxLen - Maximum length of the result in UTF-16 code units.
 * @returns `text` unchanged when it already fits; otherwise a shortened
 *   string ending in `…` with `length <= maxLen` (for `maxLen <= 0` the
 *   empty string).
 */
function capByWordBoundary(text: string, maxLen: number): string {
  if (text.length <= maxLen) return text;
  if (maxLen <= 1) return '…'.slice(0, maxLen);
  // Reserve one code unit for the ellipsis.
  let sliced = text.slice(0, maxLen - 1);
  // `slice` works on code units; if the cut fell inside an astral
  // character (emoji, rare CJK) the slice ends on a lone high surrogate.
  const lastUnit = sliced.charCodeAt(sliced.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    sliced = sliced.slice(0, -1);
  }
  const lastSpace = sliced.lastIndexOf(' ');
  const cut = lastSpace > Math.floor(maxLen * 0.55) ? sliced.slice(0, lastSpace) : sliced;
  // Strip dangling punctuation / connectors before adding ellipsis.
  const stripped = cut
    .replace(/[\s,;:\-—–]+$/u, '')
    .trim();
  return (stripped + '…').slice(0, maxLen);
}

/**
 * Compose the complete HTML page for one article.
 *
 * Pipeline, in order:
 *  1. Parse the front-matter of `input.markdown` with `gray-matter`.
 *  2. Derive brief-based SEO overrides (`deriveBriefSeoOverrides`) and
 *     hand them, together with the parsed front-matter, to
 *     `computeArticleHeadMetadata`, which yields title, date, article
 *     type and the `seo` bundle.
 *  3. Prepare the body: strip the duplicate aggregator sections,
 *     localize the executive-brief lead, enrich with political context,
 *     render markdown to HTML, and canonicalize `.md` link targets.
 *  4. Apply the scannability transforms, split the body at the second
 *     `<h2`, and apply progressive disclosure to the second part only.
 *  5. Build the `NewsArticle`, `BreadcrumbList` and speakable JSON-LD
 *     objects and pass them to `buildChrome`.
 *  6. Render the aside blocks and concatenate chrome head/header, the
 *     `<article>` element and chrome footer. When a TOC was produced the
 *     content sits in a two-column layout and an inline script that
 *     highlights the active TOC link is appended.
 *
 * `dateModified` is the wall-clock time of the call, so two renders of
 * the same input differ in that value.
 *
 * NOTE(review): `LANGUAGE_META[...]` fields `hreflang`, `flag` and
 * `nativeName`, and the return value of `articleTypeIcon`, are
 * interpolated without `escapeHtml` — presumably trusted static values;
 * confirm.
 *
 * @param input - See {@link RenderArticleInput}.
 * @returns The full page as a string.
 */
export async function renderArticleHtml(input: RenderArticleInput): Promise<string> {
  const parsed = matter(input.markdown);
  // Cascade-chain step #1+#2 — pull SEO directly from executive-brief.md
  // (localized brief beats EN brief for non-EN, EN brief is canonical
  // for EN). The brief is the single source of truth for `<title>` /
  // `<meta description>` / JSON-LD `headline` / JSON-LD `description`.
  // `article.md` frontmatter `title:` / `description:` / `keywords:`
  // lines are back-compat-only fallback for the 278 pre-`2026-03-26`
  // legacy `news/*-en.html` files whose source directories have been
  // deleted (see `deriveBriefSeoOverrides`).
  const briefOverrides = deriveBriefSeoOverrides({
    lang: input.lang,
    englishBriefMarkdown: input.englishBriefMarkdown,
    localizedBriefMarkdown: input.localizedBriefMarkdown,
    subfolderSlug: input.subfolderSlug,
  });
  // Delegate every `<head>`-relevant derivation to the shared helper so
  // the renderer and the `test-article-headers` CLI can never drift.
  const head = computeArticleHeadMetadata({
    markdown: input.markdown,
    lang: input.lang,
    canonicalPath: input.canonicalPath,
    // Pass the already-parsed front-matter data so `computeArticleHeadMetadata`
    // can skip a second `matter()` call on the same string.
    parsedData: parsed.data as Record<string, unknown>,
    briefDerivedTitle: briefOverrides.title,
    briefDerivedDescription: briefOverrides.description,
    briefDerivedEntities: briefOverrides.entities,
  });
  const { rawTitle: title, date, articleTypeId, articleTypeLabel: localizedArticleTypeLabel, seo } = head;
  // Publication time is pinned to midnight UTC of the article date;
  // modification time is "now", i.e. the moment of this render.
  const publishedIso = `${date}T00:00:00Z`;
  const modifiedIso = new Date().toISOString();
  const articleType = { type: articleTypeId, label: localizedArticleTypeLabel };

  // Remove the English-only duplicate sections first, then swap in the
  // localized executive-brief lead.
  const cleanedContent = localizeExecutiveBriefLead({
    content: stripBodyDuplicateSections(parsed.content),
    lang: input.lang,
    localizedBriefMarkdown: input.localizedBriefMarkdown,
    subfolderRepoRelPath: input.subfolderRepoRelPath,
  });

  const enrichedContent = enrichArticleMarkdownWithPoliticalContext(cleanedContent, input.lang);

  // Render to HTML, then canonicalize `.md` link targets in the result.
  const bodyHtml = rewriteMarkdownHrefsInHtml(
    await renderMarkdownToHtml(enrichedContent),
    input.subfolderRepoRelPath,
  );

  // Apply visual scannability transforms (confidence chips, admiralty
  // badges, timeline indicators). Progressive disclosure is applied after
  // splitting so that wrapping <h2> sections in <details> doesn't interfere
  // with the split logic.
  const { transformedBody, tocHtml, methodologyFooterHtml } = applyScannabilityTransforms(bodyHtml, input.lang);

  const { lead: leadHtml, rest: rawRestHtml } = splitBodyAtSecondH2(transformedBody);
  const restHtml = transformProgressiveDisclosure(rawRestHtml);

  const articleUrl = `${BASE_URL}/${input.canonicalPath}`;
  const langMeta = LANGUAGE_META[input.lang];

  const newsArticleLd = buildNewsArticleLd({
    headline: title,
    description: seo.description,
    datePublished: publishedIso,
    dateModified: modifiedIso,
    inLanguage: langMeta.hreflang,
    url: articleUrl,
    // Each artifact becomes an `isBasedOn` entry; without a subfolder
    // path the bare artifact name doubles as its URL.
    isBasedOn: (input.artifactsUsed ?? []).map((a) => ({
      url: input.subfolderRepoRelPath
        ? buildGithubBlobUrl(`${input.subfolderRepoRelPath}/${a}`)
        : a,
      name: a,
    })),
    // Mirror the page's `<meta keywords>` into NewsArticle.keywords (an
    // array per Schema.org), and surface the localized article-type label
    // as `articleSection` (Propositions / Motions / Interpellations / …).
    // Both fields are skipped when empty so the JSON-LD shape is stable.
    keywords: seo.keywords,
    articleSection: localizedArticleTypeLabel,
  });

  // Breadcrumb leaf: the title, shortened with an ellipsis when it
  // exceeds the breadcrumb length budget.
  const breadcrumbName = title.length > BREADCRUMB_TITLE_MAX_LENGTH
    ? title.substring(0, BREADCRUMB_TITLE_MAX_LENGTH - BREADCRUMB_ELLIPSIS_OVERHEAD) + '…'
    : title;
  const breadcrumbLd = buildBreadcrumbListLd([
    { name: langMeta.translations.home, item: `${BASE_URL}/` },
    { name: langMeta.translations.newsAnalysis, item: `${BASE_URL}/news/` },
    { name: breadcrumbName },
  ]);

  const speakableLd = buildSpeakableWebPageLd(
    articleUrl,
    langMeta.hreflang,
    ARTICLE_SPEAKABLE_SELECTORS,
  );

  const chrome = buildChrome({
    lang: input.lang,
    title: seo.title,
    description: seo.description,
    keywords: seo.keywords,
    canonicalPath: input.canonicalPath,
    hreflangAlternates: input.hreflangAlternates,
    publishedIso,
    modifiedIso,
    jsonLd: [newsArticleLd, breadcrumbLd, speakableLd],
    section: head.articleSection,
    heroBannerImage: 'images/riksdagsmonitornews-banner.webp',
    bodyClass: 'news-article',
  });

  const artifacts = input.artifactsUsed ?? [];
  const readerNavigationHtml = renderReaderNavigation({
    lang: input.lang,
    artifactsUsed: artifacts,
  });
  const politicalContextHtml = renderPoliticalContext({
    lang: input.lang,
    markdown: enrichedContent,
  });
  const analysisArtifactsHtml = renderAnalysisArtifactsReference({
    lang: input.lang,
    artifactsUsed: artifacts,
    subfolderRepoRelPath: input.subfolderRepoRelPath,
  });
  const methodsReferenceHtml = renderMethodsReference({
    lang: input.lang,
    canonicalPath: input.canonicalPath,
  });

  // Article content column (everything after the header). When a TOC is
  // present this is placed in the second grid column next to the sticky
  // sidebar; otherwise it sits directly in the article card.
  const contentHtml = `        <div class="rm-article-body">
${leadHtml}
        </div>
${readerNavigationHtml}${politicalContextHtml}${restHtml ? `
        <div class="rm-article-body rm-article-body-rest">
${restHtml}
        </div>` : ''}
${analysisArtifactsHtml}${methodsReferenceHtml}${methodologyFooterHtml}`;

  // With a TOC, lay out a two-column grid (sticky sidebar TOC + content).
  // Without one, keep the simple single-column flow.
  const bodyLayoutHtml = tocHtml
    ? `        <div class="rm-article-layout">
${tocHtml}
          <div class="rm-article-content">
${contentHtml}
          </div>
        </div>`
    : contentHtml;

  // Final page. The trailing inline script (TOC pages only) observes the
  // elements the TOC links point at and toggles `rm-toc-active` on the
  // link whose target is intersecting; it exits early when there is no
  // TOC, no links, or no `IntersectionObserver` support.
  return `${chrome.head}
${chrome.headerHtml}
      <article class="rm-article rm-article-type-${escapeHtml(articleType.type)}${tocHtml ? ' rm-article--with-toc' : ''}" data-article-type="${escapeHtml(articleType.type)}" lang="${LANGUAGE_META[input.lang].hreflang}">
        <header class="rm-article-header">
          <p class="rm-article-eyebrow"><span class="rm-icon" aria-hidden="true">${articleTypeIcon(articleType.type)}</span> ${escapeHtml(localizedArticleTypeLabel)}</p>
          <h1>${escapeHtml(title)}</h1>
          <p class="rm-article-dek">${escapeHtml(seo.description)}</p>
          <p class="rm-article-meta">
            <time datetime="${publishedIso}"><span class="rm-icon" aria-hidden="true">📅</span> ${escapeHtml(date)}</time>
            · <span class="rm-article-lang">${LANGUAGE_META[input.lang].flag} ${LANGUAGE_META[input.lang].nativeName}</span>
          </p>
          <ul class="rm-article-trust-badges" aria-label="${escapeHtml(LANGUAGE_META[input.lang].translations.articleTrustAriaLabel)}">
            <li><span class="rm-icon" aria-hidden="true">🏛️</span> ${escapeHtml(LANGUAGE_META[input.lang].translations.articleTrustPublicSources)}</li>
            <li><span class="rm-icon" aria-hidden="true">🤖</span> ${escapeHtml(LANGUAGE_META[input.lang].translations.articleTrustAiFirst)}</li>
            <li><span class="rm-icon" aria-hidden="true">🔗</span> ${escapeHtml(LANGUAGE_META[input.lang].translations.articleTrustTraceable)}</li>
          </ul>
        </header>
${bodyLayoutHtml}
      </article>${tocHtml ? `
      <script>
        (function(){var t=document.querySelector('.rm-article-toc');if(!t)return;var links=t.querySelectorAll('a[href^="#"]');var ids=[];links.forEach(function(a){ids.push(a.getAttribute('href').slice(1));});if(!ids.length||!('IntersectionObserver' in window))return;var obs=new IntersectionObserver(function(entries){entries.forEach(function(e){if(e.isIntersecting){links.forEach(function(a){a.classList.toggle('rm-toc-active',a.getAttribute('href')==='#'+e.target.id);});};});},{rootMargin:'-80px 0px -60% 0px',threshold:0});ids.forEach(function(id){var el=document.getElementById(id);if(el)obs.observe(el);});})();
      </script>` : ''}
${chrome.footerHtml}`;
}