-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy patharticle-merge.ts
More file actions
208 lines (192 loc) · 8.71 KB
/
article-merge.ts
File metadata and controls
208 lines (192 loc) · 8.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/**
* @module Infrastructure/RenderLib/ArticleMerge
* @category Intelligence Operations / Supporting Infrastructure
* @name Localized + English `article.md` merger
*
* @description
* The agent-translation step under each `news-*.md` workflow only has
* minutes per language to produce `article.<lang>.md`. The resulting file
* is therefore a short, hand-curated executive summary — not a full
* 14-language translation of the canonical English `article.md`
* (which is itself a 23-artifact aggregation, often >2 000 lines).
*
* Without merging, the renderer would publish a 47-line stub at
* `news/$DATE-$SUB-de.html` while the English sibling carries the full
* analytical depth — that is the regression this module fixes.
*
* `mergeLocalizedWithEnglish` produces a single Markdown document that:
*
* 1. **Carries the localized front-matter** (title, description, language,
* etc.) so SEO, JSON-LD `inLanguage` and the article header remain
* correctly localized. Front-matter fields the localized file omits
* fall back to the English values so canonical metadata
* (`date`, `subfolder`, `slug`, `source_folder`) stays stable.
* When a localized `executive-brief_<lang>.md` markdown is passed in
* (cascade chain step #2 per
* `Article-Generation.md § "Per-language precedence chain"`), its
* publishable H1 and BLUF override the localized article front-matter
* `title:` / `description:`. Banned-phrase H1s and empty BLUFs are
* rejected so the merger silently falls through to chain step #3
* (localized article front-matter) rather than shipping a template
* stub as the SERP `<title>`.
* 2. **Starts the body with the localized executive summary** so the
* reader gets a first-page experience in their own language.
* 3. **Appends the full English body** under a localized "Detailed
* analysis (in English)" H2 + an aside note explaining the
* fallback. This guarantees the published HTML contains *every*
* analysis section — Risk Assessment, Coalition Mathematics,
* Forward Indicators, Sources etc. — that an English reader sees.
*
* The function is pure (no I/O) and string-only — front-matter is parsed
* with `gray-matter`, re-serialised with `gray-matter`'s `stringify` so
* the output passes the existing aggregator validators unchanged.
*
* Used by `scripts/render-articles.ts` whenever a non-English,
* language-specific `article.<lang>.md` file exists alongside the
* canonical `article.md`.
*
* @author Hack23 AB (Infrastructure Team)
* @license Apache-2.0
*/
import matter from 'gray-matter';
import type { Language } from '../types/language.js';
import { LANGUAGE_META } from '../sitemap-html/index.js';
import { extractLocalizedBriefSeo } from './aggregator/seo/localized-brief.js';
/**
* Input bundle for `mergeLocalizedWithEnglish`. Every field is
* read-only; the merger only reads these values.
*/
export interface MergeLocalizedInput {
/**
* Canonical English `article.md` contents (front-matter + body).
* Returned unchanged when `lang` is `'en'`.
*/
readonly englishMarkdown: string;
/**
* Localized `article.<lang>.md` contents (front-matter + body). Its
* front-matter is overlaid on the English front-matter and its body,
* when non-empty after trimming, is placed ahead of the English body.
*/
readonly localizedMarkdown: string;
/**
* Target language. Used to pick the localized fallback heading + note,
* and written into the merged front-matter as `language`.
*/
readonly lang: Language;
/**
* Optional localized executive-brief markdown contents
* (`analysis/daily/$DATE/$SUB/executive-brief_<lang>.md`). When provided
* AND the brief yields a title / description, those values override
* the corresponding fields in the merged front-matter —
* this implements cascade chain step #2 documented in
* `Article-Generation.md § "Per-language precedence chain"`.
* Title and description are applied independently of each other.
*
* Pass `undefined`, `null` or an empty string when the file does not
* exist; a whitespace-only string is treated the same way. The merger
* then falls through to chain step #3 (localized article front-matter)
* without code-path changes.
*/
readonly localizedBriefMarkdown?: string | null;
/**
* The analysis subfolder slug (e.g. `propositions`). Forwarded to
* {@link extractLocalizedBriefSeo} together with the brief markdown and
* the target language. Optional — when absent (legacy callers) an
* empty string is forwarded instead.
* NOTE(review): how the extractor treats an empty slug is not visible
* from this file — confirm against `localized-brief.ts`.
*/
readonly subfolder?: string;
}
/**
 * Identity fields that are authoritative in the English `article.md`.
 *
 * The aggregator writes these once and they must stay identical across
 * every language variant, so a value for any of these keys found in an
 * agent-authored `article.<lang>.md` is ignored during the front-matter
 * overlay.
 */
const ENGLISH_ONLY_FRONT_MATTER_KEYS: ReadonlySet<string> = new Set<string>(
  ['date', 'subfolder', 'slug', 'source_folder', 'layout', 'generated_at'],
);
/**
 * Reader-facing fields for which the localized file wins whenever it
 * supplies a non-empty value; otherwise the English value is kept.
 *
 * Iteration order is the declaration order below.
 */
const LOCALIZED_FIRST_FRONT_MATTER_KEYS: ReadonlySet<string> = new Set<string>(
  ['title', 'description', 'language'],
);
/**
 * Render the divider that sits between the localized executive summary
 * and the appended English body in a merged document.
 *
 * The block consists of a horizontal rule, an H2 carrying the language's
 * `articleEnglishCoverageHeading` translation and a block-quote carrying
 * its `articleEnglishCoverageNote` translation. It begins and ends with
 * two newlines so it can be concatenated directly between the two bodies.
 *
 * Exported for testability.
 *
 * @param lang - Language whose `LANGUAGE_META` translations are used.
 * @returns The Markdown boundary block.
 */
export function buildEnglishCoverageBoundary(lang: Language): string {
  const { articleEnglishCoverageHeading, articleEnglishCoverageNote } =
    LANGUAGE_META[lang].translations;
  // Empty entries become the blank lines surrounding each element.
  const boundaryLines = [
    '',
    '',
    '---',
    '',
    `## ${articleEnglishCoverageHeading}`,
    '',
    `> ℹ️ ${articleEnglishCoverageNote}`,
    '',
    '',
  ];
  return boundaryLines.join('\n');
}
/**
 * True when a front-matter value carries nothing worth copying:
 * `undefined`, `null` or the empty string.
 */
function isBlankFrontMatterValue(value: unknown): boolean {
  return value === undefined || value === null || value === '';
}

/**
 * Combine a localized `article.<lang>.md` with the canonical English
 * `article.md` into one Markdown document (front-matter + body) that can
 * be handed to `renderArticleHtml`.
 *
 * Body:
 *
 * - `lang === 'en'`: the English source is returned untouched
 *   (defensive — callers normally take the English path themselves).
 * - Localized body empty after trimming: the English body is used on its
 *   own, still under the merged (localized) front-matter.
 * - Otherwise: `localizedBody + boundary + englishBody`, where `boundary`
 *   comes from `buildEnglishCoverageBoundary(lang)`.
 *
 * Front-matter, applied in this order:
 *
 * 1. Copy of the English front-matter.
 * 2. Every non-blank localized field except those listed in
 *    `ENGLISH_ONLY_FRONT_MATTER_KEYS`.
 * 3. Non-blank localized values for `LOCALIZED_FIRST_FRONT_MATTER_KEYS`.
 * 4. `title` / `description` extracted from the localized executive
 *    brief, each applied only when the extractor returns a truthy value
 *    (cascade chain step #2).
 * 5. `language` forced to `lang` so JSON-LD `inLanguage` and SEO match.
 *
 * NOTE(review): values are round-tripped through `gray-matter` parse and
 * `stringify`; confirm non-string scalars re-serialise in the form the
 * downstream validators expect.
 *
 * @param input - Markdown sources, target language and optional brief.
 * @returns The merged Markdown document.
 */
export function mergeLocalizedWithEnglish(input: MergeLocalizedInput): string {
  const { englishMarkdown, localizedMarkdown, lang, localizedBriefMarkdown, subfolder } = input;

  // English needs no merging; hand the canonical source straight back.
  if (lang === 'en') {
    return englishMarkdown;
  }

  const parsedEnglish = matter(englishMarkdown);
  const parsedLocalized = matter(localizedMarkdown);
  const canonicalFields = (parsedEnglish.data ?? {}) as Record<string, unknown>;
  const localizedFields = (parsedLocalized.data ?? {}) as Record<string, unknown>;

  // Work on a copy so the parsed English data object is never mutated.
  const frontMatter: Record<string, unknown> = { ...canonicalFields };

  // Pass 1 — overlay every usable localized field that is not a
  // canonical identity key.
  Object.entries(localizedFields)
    .filter(
      ([field, fieldValue]) =>
        !ENGLISH_ONLY_FRONT_MATTER_KEYS.has(field) && !isBlankFrontMatterValue(fieldValue),
    )
    .forEach(([field, fieldValue]) => {
      frontMatter[field] = fieldValue;
    });

  // Pass 2 — localized-first keys. With the current key sets this
  // re-applies what pass 1 already copied; it only changes the outcome
  // if a key is ever listed in both sets.
  LOCALIZED_FIRST_FRONT_MATTER_KEYS.forEach((field) => {
    const localizedValue = localizedFields[field];
    if (!isBlankFrontMatterValue(localizedValue)) {
      frontMatter[field] = localizedValue;
    }
  });

  // Cascade chain step #2 — a localized executive brief outranks the
  // article.<lang>.md front-matter for title and description. The two
  // fields are resolved independently, so one may apply without the other.
  if (localizedBriefMarkdown != null && localizedBriefMarkdown.trim() !== '') {
    const { title: briefTitle, description: briefDescription } = extractLocalizedBriefSeo({
      briefMarkdown: localizedBriefMarkdown,
      subfolder: subfolder ?? '',
      // The page language is forwarded so the description is truncated
      // using the per-language SERP window (RTL 120-170, CJK 70-120,
      // Latin LTR 140-200). See `seo-metadata-contract.md` §4 +
      // `descriptionWindowForLanguage`.
      lang,
    });
    if (briefTitle) {
      frontMatter.title = briefTitle;
    }
    if (briefDescription) {
      frontMatter.description = briefDescription;
    }
  }

  // Always wins, regardless of what either front-matter declared.
  frontMatter.language = lang;

  const englishBody = parsedEnglish.content.trimStart();
  const localizedBody = parsedLocalized.content.trim();

  // No localized body: publish the English body under localized metadata.
  const body =
    localizedBody.length === 0
      ? englishBody
      : localizedBody + buildEnglishCoverageBoundary(lang) + englishBody;

  return matter.stringify(body, frontMatter);
}