Skip to content

Commit 8da6504

Browse files
authored
feat(docs): enhance markdown generation with HTML parity (#1388)
1 parent 0a3fc44 commit 8da6504

15 files changed

Lines changed: 693 additions & 41 deletions

File tree

package-lock.json

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"start": "docusaurus start",
88
"automations": "chmod +x run-automations.sh && ./run-automations.sh",
99
"update-deps": "chmod +x update-deps.sh && ./update-deps.sh",
10-
"build": "docusaurus build && node scripts/generate-md-routes.mjs && node scripts/fix-llms-urls.mjs && node scripts/inject-html-llms-tags.mjs",
10+
"build": "docusaurus build && node scripts/mark-html-parity-exclusions.mjs && node scripts/generate-md-routes.mjs && node scripts/fix-llms-urls.mjs && node scripts/inject-html-llms-tags.mjs",
1111
"swizzle": "docusaurus swizzle",
1212
"deploy": "docusaurus deploy",
1313
"deploy:mcp": "npx wrangler deploy --config cloudflare-mcp/wrangler.toml",
@@ -59,6 +59,7 @@
5959
"textlint": "^15.5.2",
6060
"textlint-plugin-mdx": "^1.0.2",
6161
"textlint-rule-one-sentence-per-line": "^2.0.0",
62+
"turndown": "^7.2.4",
6263
"typescript": "^5.9.3"
6364
},
6465
"browserslist": {

scripts/generate-md-routes.mjs

Lines changed: 20 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
import fs from "fs";
1818
import path from "path";
1919
import { fileURLToPath } from "url";
20+
import { expandMdxBody } from "./mdx-markdown-expanders.mjs";
21+
import { markdownFromBuiltHtml } from "./html-to-markdown.mjs";
2022

2123
const __dirname = path.dirname(fileURLToPath(import.meta.url));
2224
const rootDir = path.resolve(__dirname, "..");
@@ -118,36 +120,15 @@ function indexSources() {
118120
}
119121

120122
/**
121-
* Strip MDX import/export statements and a small set of common JSX-only blocks
122-
* (e.g. <Tabs>...<TabItem>) so the body reads as plain markdown for LLMs.
123-
* We deliberately preserve fenced code blocks, tables, lists, and prose.
123+
* Turn MDX source into agent-facing markdown: expand data-driven React
124+
* components, inline partials, and strip remaining JSX.
124125
*/
125-
function cleanMdxBody(body) {
126-
// Drop everything that looks like an ESM import/export at the top level.
127-
let cleaned = body
128-
.split(/\r?\n/)
129-
.filter((line) => {
130-
const trimmed = line.trim();
131-
if (trimmed.startsWith("import ") && /from\s+['"]/.test(trimmed))
132-
return false;
133-
if (
134-
trimmed.startsWith("export ") &&
135-
!trimmed.startsWith("export const meta")
136-
) {
137-
// strip simple `export const Foo = ...;` lines used by MDX
138-
return false;
139-
}
140-
return true;
141-
})
142-
.join("\n");
143-
144-
// Strip self-closing JSX components like `<Tabs />`, `<DocCardList />`.
145-
cleaned = cleaned.replace(/^[ \t]*<[A-Z][A-Za-z0-9]*[^>]*\/>[ \t]*$/gm, "");
146-
147-
// Collapse 3+ blank lines into 2.
148-
cleaned = cleaned.replace(/\n{3,}/g, "\n\n");
149-
150-
return cleaned.trimStart();
126+
function cleanMdxBody(body, sourceRelPath) {
127+
return expandMdxBody(body, {
128+
rootDir,
129+
docsDir,
130+
sourcePath: sourceRelPath,
131+
});
151132
}
152133

153134
/**
@@ -157,9 +138,16 @@ function cleanMdxBody(body) {
157138
function buildMarkdownForDoc(docId, route, sourcePath, frontmatter) {
158139
if (!sourcePath || !fs.existsSync(sourcePath)) return null;
159140

160-
const raw = fs.readFileSync(sourcePath, "utf8");
161-
const { body } = parseFrontmatter(raw);
162-
const cleaned = cleanMdxBody(body);
141+
const htmlBody = markdownFromBuiltHtml(buildDir, route);
142+
let cleaned;
143+
if (htmlBody) {
144+
cleaned = htmlBody;
145+
} else {
146+
const raw = fs.readFileSync(sourcePath, "utf8");
147+
const { body } = parseFrontmatter(raw);
148+
const sourceRelPath = path.relative(docsDir, sourcePath);
149+
cleaned = cleanMdxBody(body, sourceRelPath);
150+
}
163151

164152
const title =
165153
(typeof frontmatter.title === "string" && frontmatter.title.trim()) ||

scripts/html-to-markdown.mjs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Convert Docusaurus-built doc HTML into agent-facing markdown.
4+
* Uses the same `.theme-doc-markdown` region users see in the browser.
5+
*/
6+
7+
import fs from "fs";
8+
import path from "path";
9+
import * as cheerio from "cheerio";
10+
import TurndownService from "turndown";
11+
12+
const CONTENT_SELECTOR = ".theme-doc-markdown";
13+
14+
function findHtmlPathForRoute(buildDir, route) {
15+
const trimmed = route.replace(/\/+$/, "");
16+
if (trimmed === "" || trimmed === "/") {
17+
const index = path.join(buildDir, "index.html");
18+
return fs.existsSync(index) ? index : null;
19+
}
20+
const rel = trimmed.startsWith("/") ? trimmed.slice(1) : trimmed;
21+
const flat = path.join(buildDir, `${rel}.html`);
22+
if (fs.existsSync(flat)) return flat;
23+
const nested = path.join(buildDir, rel, "index.html");
24+
if (fs.existsSync(nested)) return nested;
25+
return null;
26+
}
27+
28+
/**
 * Remove UI chrome from the doc region, in place, so that only readable
 * content reaches the HTML-to-markdown conversion.
 *
 * @param {*} $ Cheerio instance for the document. Unused; kept so existing
 *   callers do not need to change.
 * @param {*} root Cheerio selection whose descendants are pruned.
 * @returns {void}
 */
function stripNonContentNodes($, root) {
  const nonContentSelectors = [
    "button",
    "svg",
    ".tabs__item",
    '[class*="copy"]',
    '[class*="Copy"]',
    // Docusaurus heading permalinks: anchors with no visible text that would
    // otherwise be converted into empty markdown links inside every heading.
    "a.hash-link",
  ];

  root.find(nonContentSelectors.join(", ")).remove();
}
34+
35+
/**
 * Create the Turndown converter used for built doc pages.
 *
 * Output uses ATX headings, fenced code blocks, `*` for emphasis and `-` for
 * bullets. `<details>` elements are kept as raw HTML with their converted
 * body inside.
 *
 * @returns {TurndownService} Configured converter.
 */
function createTurndown() {
  const service = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    emDelimiter: "*",
    bulletListMarker: "-",
  });

  // The summary text is placed into raw HTML below, so markup-significant
  // characters must be escaped to keep the emitted <summary> well formed.
  const escapeHtml = (text) =>
    text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

  // The details rule writes the <summary> itself. Without this rule the
  // summary's converted text would also be part of `content`, so the label
  // would show up twice in the output.
  service.addRule("summary", {
    filter: "summary",
    replacement: () => "",
  });

  service.addRule("details", {
    filter: "details",
    replacement(content, node) {
      const label =
        node.querySelector("summary")?.textContent?.trim() || "Details";
      return `\n<details>\n<summary>${escapeHtml(label)}</summary>\n\n${content.trim()}\n\n</details>\n`;
    },
  });

  return service;
}
54+
55+
/**
 * Produce agent-facing markdown for a doc route from its built HTML page.
 *
 * Only the first element matching `CONTENT_SELECTOR` is converted, after
 * non-content nodes have been stripped from it. A leading H1 is removed
 * because generate-md-routes adds its own title block.
 *
 * @param {string} buildDir Directory containing the built site.
 * @param {string} route Doc route, e.g. "/ftso/feeds".
 * @returns {string | null} Markdown body, or null when the page or its
 *   content region is missing or the converted body is empty.
 */
export function markdownFromBuiltHtml(buildDir, route) {
  const pagePath = findHtmlPathForRoute(buildDir, route);
  if (!pagePath) return null;

  const $ = cheerio.load(fs.readFileSync(pagePath, "utf8"));
  const docRegion = $(CONTENT_SELECTOR).first();
  if (docRegion.length === 0) return null;

  stripNonContentNodes($, docRegion);

  const converted = createTurndown()
    .turndown(docRegion.html() || "")
    .trim();

  // Drop duplicate top-level H1; generate-md-routes adds its own title block.
  const withoutTitle = converted.replace(/^#\s+.+\n+/, "").trim();

  return withoutTitle || null;
}

scripts/mark-html-parity-exclusions.mjs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env node
2+
/**
3+
* Tag interactive / duplicated HTML regions so Agent Score markdown-content-parity
4+
* compares prose only. Canonical code and tables live in the emitted .md routes.
5+
*/
6+
7+
import fs from "fs";
8+
import path from "path";
9+
import { fileURLToPath } from "url";
10+
import * as cheerio from "cheerio";
11+
12+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
13+
const buildDir = path.join(__dirname, "..", "build");
14+
15+
function walkHtmlFiles(dir, list = []) {
16+
for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
17+
const full = path.join(dir, entry.name);
18+
if (entry.isDirectory()) walkHtmlFiles(full, list);
19+
else if (entry.name.endsWith(".html")) list.push(full);
20+
}
21+
return list;
22+
}
23+
24+
function markExclusions(html) {
25+
const $ = cheerio.load(html);
26+
const root = $(".theme-doc-markdown").first();
27+
if (!root.length) return null;
28+
29+
root
30+
.find("pre, .theme-tabs-container, .codeBlockContainer_Ckt0")
31+
.each((_, el) => {
32+
$(el).attr("data-markdown-ignore", "");
33+
});
34+
35+
return $.html();
36+
}
37+
38+
/**
 * Tag every built HTML page under `buildDir` and report how many files were
 * rewritten. Warns and does nothing when the build directory does not exist.
 */
function main() {
  const hasBuild = fs.existsSync(buildDir);
  if (!hasBuild) {
    console.warn("[mark-html-parity-exclusions] build/ not found, skipping.");
    return;
  }

  // Rewrites one file when tagging changed it; returns whether it was written.
  const rewriteIfChanged = (file) => {
    const before = fs.readFileSync(file, "utf8");
    const after = markExclusions(before);
    if (after && after !== before) {
      fs.writeFileSync(file, after, "utf8");
      return true;
    }
    return false;
  };

  let updated = 0;
  for (const file of walkHtmlFiles(buildDir)) {
    if (rewriteIfChanged(file)) updated += 1;
  }

  console.log(
    `[mark-html-parity-exclusions] Tagged code/tabs in ${updated} HTML files.`,
  );
}
57+
58+
main();

0 commit comments

Comments
 (0)