Skip to content

Commit 928eb81

Browse files
committed
fix: added new tests
1 parent 9aefc73 commit 928eb81

17 files changed

Lines changed: 260 additions & 17 deletions

File tree

schemas/yfm-schema.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,12 @@ properties:
367367
type: boolean
368368
description: Emit yfm-build-manifest.json with all documentation paths.
369369

370+
crawlerManifest:
371+
type: boolean
372+
description: >
373+
Emit crawler-manifest.json mapping each source file to its external links.
374+
Useful for link-health checks (detecting 404, 403, etc.).
375+
370376
changelogs:
371377
oneOf:
372378
- type: boolean

src/commands/build/features/crawler-manifest/index.spec.ts

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1-
import {describe, expect, it} from 'vitest';
1+
import {describe, expect, it, vi} from 'vitest';
22

3-
import {extractExternalLinks, extractIncludePaths} from './index';
3+
import type {Run} from '~/commands/build';
4+
5+
import {
6+
collectExternalLinksFromYaml,
7+
collectLinks,
8+
extractExternalLinks,
9+
extractIncludePaths,
10+
} from './index';
411

512
describe('CrawlerManifest feature', () => {
613
describe('extractExternalLinks', () => {
@@ -217,4 +224,145 @@ plain https://plain.example.com text
217224
);
218225
});
219226
});
227+
228+
describe('collectExternalLinksFromYaml', () => {
229+
it('extracts external link from href field', () => {
230+
expect(collectExternalLinksFromYaml('href: https://example.com')).toContain(
231+
'https://example.com',
232+
);
233+
});
234+
235+
it('extracts external link from nested url field', () => {
236+
expect(
237+
collectExternalLinksFromYaml('navigation:\n href: https://nav.example.com'),
238+
).toContain('https://nav.example.com');
239+
});
240+
241+
it('ignores relative href value', () => {
242+
expect(collectExternalLinksFromYaml('href: ./relative.md')).toEqual([]);
243+
});
244+
245+
it('returns empty on invalid YAML', () => {
246+
expect(collectExternalLinksFromYaml(': invalid: {unclosed')).toEqual([]);
247+
});
248+
249+
it('returns empty when YAML is null', () => {
250+
expect(collectExternalLinksFromYaml('')).toEqual([]);
251+
});
252+
253+
it('returns empty when YAML is a scalar', () => {
254+
expect(collectExternalLinksFromYaml('just a string')).toEqual([]);
255+
});
256+
});
257+
258+
describe('stripFencedBlocks (via extractExternalLinks)', () => {
259+
it('strips unclosed fenced block to end of content', () => {
260+
expect(extractExternalLinks('```\n[link](https://unclosed.example.com)')).toEqual([]);
261+
});
262+
263+
it('tilde fence does not close backtick fence', () => {
264+
const content = '```\n[inside](https://inside.example.com)\n~~~\nstill inside\n```';
265+
266+
expect(extractExternalLinks(content)).toEqual([]);
267+
});
268+
269+
it('shorter fence does not close longer opening fence', () => {
270+
const content = '````\n[inside](https://inside.example.com)\n```\nnot closed\n````';
271+
272+
expect(extractExternalLinks(content)).toEqual([]);
273+
});
274+
275+
it('handles CRLF line endings correctly', () => {
276+
const content =
277+
'```\r\n[inside](https://inside.example.com)\r\n```\r\n[outside](https://outside.example.com)';
278+
const links = extractExternalLinks(content);
279+
280+
expect(links).not.toContain('https://inside.example.com');
281+
expect(links).toContain('https://outside.example.com');
282+
});
283+
284+
it('content after closed fence is included', () => {
285+
const content =
286+
'```\n[inside](https://inside.example.com)\n```\n[outside](https://outside.example.com)';
287+
const links = extractExternalLinks(content);
288+
289+
expect(links).not.toContain('https://inside.example.com');
290+
expect(links).toContain('https://outside.example.com');
291+
});
292+
});
293+
294+
describe('collectLinks', () => {
295+
const makeRun = (files: Record<string, string>) =>
296+
({
297+
input: '/input',
298+
read: vi.fn(async (path: string) => {
299+
const relative = path.slice('/input/'.length);
300+
if (relative in files) return files[relative];
301+
throw new Error(`File not found: ${path}`);
302+
}),
303+
}) as unknown as Run;
304+
305+
it('extracts links from a markdown file', async () => {
306+
const run = makeRun({'page.md': '[link](https://example.com)'});
307+
308+
expect(await collectLinks(run, 'page.md')).toContain('https://example.com');
309+
});
310+
311+
it('returns empty array when file cannot be read', async () => {
312+
const run = makeRun({});
313+
314+
expect(await collectLinks(run, 'missing.md')).toEqual([]);
315+
});
316+
317+
it('returns empty array for already-visited file (circular protection)', async () => {
318+
const run = makeRun({'page.md': '[link](https://example.com)'});
319+
const visited = new Set(['page.md']);
320+
321+
expect(await collectLinks(run, 'page.md', visited)).toEqual([]);
322+
});
323+
324+
it('extracts links from YAML file including structured keys', async () => {
325+
const run = makeRun({
326+
'data.yaml': 'href: https://yaml.example.com',
327+
});
328+
329+
expect(await collectLinks(run, 'data.yaml')).toContain('https://yaml.example.com');
330+
});
331+
332+
it('extracts links from .yml file', async () => {
333+
const run = makeRun({
334+
'data.yml': 'href: https://yml.example.com',
335+
});
336+
337+
expect(await collectLinks(run, 'data.yml')).toContain('https://yml.example.com');
338+
});
339+
340+
it('recursively extracts links from included markdown files', async () => {
341+
const run = makeRun({
342+
'page.md': '{% include [snippet](./_includes/snippet.md) %}',
343+
'_includes/snippet.md': '[link](https://included.example.com)',
344+
});
345+
346+
expect(await collectLinks(run, 'page.md')).toContain('https://included.example.com');
347+
});
348+
349+
it('does not follow includes from YAML files', async () => {
350+
const run = makeRun({
351+
'data.yaml': 'href: https://yaml.example.com',
352+
'snippet.md': '[link](https://shouldnotappear.example.com)',
353+
});
354+
const links = await collectLinks(run, 'data.yaml');
355+
356+
expect(links).toContain('https://yaml.example.com');
357+
expect(links).not.toContain('https://shouldnotappear.example.com');
358+
});
359+
360+
it('skips unreadable include files gracefully', async () => {
361+
const run = makeRun({
362+
'page.md': '{% include [missing](./missing.md) %}\n[link](https://example.com)',
363+
});
364+
365+
expect(await collectLinks(run, 'page.md')).toContain('https://example.com');
366+
});
367+
});
220368
});

src/commands/build/features/crawler-manifest/index.ts

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,23 +24,52 @@ export type CrawlerManifestConfig = {
2424

2525
const MANIFEST_FILENAME = 'crawler-manifest.json';
2626

27-
const INLINE_LINK_RE = /\[[^\]]*\]\(([^)\s]+)[^)]*\)/g;
28-
const AUTOLINK_REGEX = /<([^>]+)>/g;
27+
const INLINE_LINK_REGEX = /\[[^\]]*\]\(([^)\s"']+)/g;
28+
const AUTOLINK_REGEX = /<([^>\s]+)>/g;
2929
const BARE_URL_REGEX = /\bhttps?:\/\/[^\s<>)[\]"']+/g;
30-
const FILE_BLOCK_REGEX = /{%\s*file\s[^%]*src="([^"]+)"[^%]*%}/g;
30+
const FILE_BLOCK_REGEX = /{%\s*file\s[^%]*?src="([^"]+)"/g;
31+
32+
/**
 * Removes fenced code blocks (``` or ~~~) from markdown content so that links
 * written inside code samples are not picked up by the link extractors.
 *
 * Fence detection follows the CommonMark rules that matter for stripping:
 * - a fence is a run of three or more backticks or tildes, optionally indented
 *   by up to three spaces;
 * - the info string of a backtick fence may not contain backticks, so a line
 *   such as "```code``` text" is an inline code span and is kept as content;
 * - a block is closed only by a fence of the same character that is at least
 *   as long as the opening fence and is followed by nothing but spaces/tabs;
 * - an unclosed block extends to the end of the content.
 *
 * Lines are split on "\n"; a trailing "\r" is ignored when matching fences but
 * is preserved on the lines that are kept.
 *
 * @param content - Raw markdown text.
 * @returns The content with every fenced block (fence lines included) removed.
 */
function stripFencedBlocks(content: string): string {
    // Hoisted so the patterns are not rebuilt for every line.
    const openFenceRe = /^ {0,3}(`{3,}|~{3,})/;
    const closeBacktickRe = /^ {0,3}(`{3,})[ \t]*$/;
    const closeTildeRe = /^ {0,3}(~{3,})[ \t]*$/;
    const trailingCrRe = /\r$/;

    const out: string[] = [];
    let fenceChar = '';
    let fenceLen = 0;

    for (const line of content.split('\n')) {
        // Match against the line without its CR so CRLF input behaves like LF input.
        const normalized = line.replace(trailingCrRe, '');

        if (fenceLen === 0) {
            const open = openFenceRe.exec(normalized);
            const marker = open ? open[1] ?? '' : '';
            const info = open ? normalized.slice(open[0].length) : '';

            // A backtick run followed by more backticks on the same line is an
            // inline code span, not a fence opener; treating it as one would
            // swallow all following content.
            const opensFence = marker !== '' && !(marker.charAt(0) === '`' && info.includes('`'));

            if (opensFence) {
                fenceChar = marker.charAt(0);
                fenceLen = marker.length;
            } else {
                out.push(line);
            }

            continue;
        }

        // Inside a block: drop the line, and leave the block only on a matching closer.
        const close = (fenceChar === '`' ? closeBacktickRe : closeTildeRe).exec(normalized);

        if (close && (close[1] ?? '').length >= fenceLen) {
            fenceChar = '';
            fenceLen = 0;
        }
    }

    return out.join('\n');
}
3163

3264
function stripNonContent(content: string): string {
33-
return content
34-
.replace(/<!--[\s\S]*?-->/g, '')
35-
.replace(/^(`{3,}|~{3,})[^\n]*\n[\s\S]*?\n\1[ \t]*$/gm, '')
36-
.replace(/`[^`\n]+`/g, '');
65+
return stripFencedBlocks(content.replace(/<!--[\s\S]*?-->/g, '')).replace(/`[^`\n]+`/g, '');
3766
}
3867

3968
export function extractExternalLinks(content: string): string[] {
4069
const links = new Set<string>();
4170
const stripped = stripNonContent(content);
4271

43-
for (const match of stripped.matchAll(INLINE_LINK_RE)) {
72+
for (const match of stripped.matchAll(INLINE_LINK_REGEX)) {
4473
if (isExternalHref(match[1])) {
4574
links.add(match[1]);
4675
}
@@ -65,13 +94,19 @@ export function extractExternalLinks(content: string): string[] {
6594
}
6695

6796
for (const match of stripped.matchAll(BARE_URL_REGEX)) {
68-
links.add(match[0].replace(/[.,;:!?]+$/, ''));
97+
let url = match[0];
98+
99+
while (url.length > 0 && '.,;:!?'.includes(url[url.length - 1])) {
100+
url = url.slice(0, -1);
101+
}
102+
103+
links.add(url);
69104
}
70105

71106
return [...links];
72107
}
73108

74-
function collectExternalLinksFromYaml(content: string): string[] {
109+
export function collectExternalLinksFromYaml(content: string): string[] {
75110
const links: string[] = [];
76111

77112
try {
@@ -103,7 +138,7 @@ export function extractIncludePaths(content: string): string[] {
103138
return paths;
104139
}
105140

106-
async function collectLinks(
141+
export async function collectLinks(
107142
run: Run,
108143
filePath: string,
109144
visited = new Set<string>(),

src/core/markdown/utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ export function getPcIconTitle(iconPath: string): string {
6868
}
6969

7070
export const PC_REGEX = /^([ \t]*):::\s*page-constructor[ \t]*\r?\n?/m;
71-
export const REF_DEF_REGEX = /^\s*\[(.*?)]:\s*([^\s]+)/gm;
72-
export const INCLUDE_REGEX = /{%\s*include\s*.+?%}/g;
71+
export const REF_DEF_REGEX = /^\s*\[([^\]]*)]:\s*([^\s]+)/gm;
72+
export const INCLUDE_REGEX = /{%\s*include\s+[^%]+%}/g;
7373

7474
export function findPcImages(content: string): AssetInfo[] {
7575
const pcImages: AssetInfo[] = [];

tests/e2e/crawler-manifest.spec.ts

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import {describe, expect, test} from 'vitest';
22
import {join} from 'node:path';
3-
import {readFile} from 'node:fs/promises';
3+
import {access, readFile} from 'node:fs/promises';
44

55
import {TestAdapter, getTestPaths} from '../fixtures';
66

77
describe('Crawler manifest', () => {
88
test('generates crawler-manifest.json', async () => {
9-
const {inputPath, outputPath} = getTestPaths('mocks/crawler-manifest');
9+
const {inputPath, outputPath} = getTestPaths('mocks/crawler-manifest/with-links');
1010

1111
await TestAdapter.testBuildPass(inputPath, outputPath, {
1212
md2md: true,
@@ -18,4 +18,41 @@ describe('Crawler manifest', () => {
1818

1919
expect(JSON.parse(manifestContent)).toMatchSnapshot();
2020
});
21+
22+
test('does not generate manifest when --crawler-manifest flag is absent', async () => {
23+
const {inputPath, outputPath} = getTestPaths('mocks/crawler-manifest/with-links');
24+
25+
await TestAdapter.testBuildPass(inputPath, outputPath, {
26+
md2md: true,
27+
md2html: false,
28+
});
29+
30+
await expect(access(join(outputPath, 'crawler-manifest.json'))).rejects.toThrow();
31+
});
32+
33+
test('does not generate manifest when there are no external links', async () => {
34+
const {inputPath, outputPath} = getTestPaths('mocks/crawler-manifest/no-links');
35+
36+
await TestAdapter.testBuildPass(inputPath, outputPath, {
37+
md2md: true,
38+
md2html: false,
39+
args: '--crawler-manifest',
40+
});
41+
42+
await expect(access(join(outputPath, 'crawler-manifest.json'))).rejects.toThrow();
43+
});
44+
45+
test('reads crawlerManifest setting from .yfm config file', async () => {
46+
const {inputPath, outputPath} = getTestPaths('mocks/crawler-manifest/config');
47+
48+
await TestAdapter.testBuildPass(inputPath, outputPath, {
49+
md2md: true,
50+
md2html: false,
51+
});
52+
53+
const manifestContent = await readFile(join(outputPath, 'crawler-manifest.json'), 'utf-8');
54+
const manifest = JSON.parse(manifestContent);
55+
56+
expect(manifest['index.md']).toContain('https://config.example.com');
57+
});
2158
});
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
crawlerManifest: true
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Index
2+
3+
[External link](https://config.example.com)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
title: Crawler manifest config test
2+
items:
3+
- name: Index
4+
href: index.md
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Index
2+
3+
Local link: [other page](./other.md).
4+
5+
Absolute local link: [absolute](/docs/page.md).
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
title: No external links test
2+
items:
3+
- name: Index
4+
href: index.md

0 commit comments

Comments
 (0)