-
Notifications
You must be signed in to change notification settings - Fork 750
Expand file tree
/
Copy pathcheck_links.ts
More file actions
248 lines (214 loc) · 6.51 KB
/
check_links.ts
File metadata and controls
248 lines (214 loc) · 6.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import { DOMParser } from "linkedom";
import * as path from "@std/path";
import { TextLineStream } from "@std/streams/text-line-stream";
import { mergeReadableStreams } from "@std/streams";
import * as colors from "@std/fmt/colors";
import { createBuilder } from "vite";
// Directory of the `www` project, located next to this script's directory.
const www = path.join(import.meta.dirname!, "..", "www");
// Reference point for the "Total time" figure printed at the end of the run.
const totalStart = performance.now();

// deno-lint-ignore no-console
console.log("Building www...");
// Reference point for the current step; reset again before crawling starts.
let stepStart = performance.now();

// Build the `www` project with Vite before serving it.
const builder = await createBuilder({ root: www, configLoader: "native" });
await builder.buildApp();

const buildSeconds = ((performance.now() - stepStart) / 1000).toFixed(1);
// deno-lint-ignore no-console
console.log(`Build completed in ${buildSeconds}s`);
/**
 * `href` prefixes that are skipped entirely: a link whose `href` starts with
 * any of these is neither resolved, crawled nor fetched.
 */
const EXCLUDED_PREFIXES: readonly string[] = [
  // Non-HTTP schemes that cannot be checked with a request.
  "mailto:",
  "javascript:",
  "vscode:",
  "data:",
  // GitHub edit links for the Fresh repository.
  "https://github.com/denoland/fresh/edit/",
];
/** A link that could not be verified, together with where it was found. */
type FailedLink = {
  /** Absolute URL that was requested. */
  url: string;
  /** HTTP status of the response; `0` when the request itself threw. */
  status: number;
  /** Pathname of the page that contained the link, or `"(start)"` for the entry page. */
  referrer: string;
};

/**
 * Status recorded for every URL handed to `checkUrl`. A value of `0` means
 * the check is still in flight or the request failed without a response.
 */
const checkedUrls: Map<string, number> = new Map();
/** Pathnames that `crawlPage` has already been asked to crawl. */
const visitedPages: Set<string> = new Set();
/** Every failure collected during the run; reported once crawling is done. */
const failedLinks: Array<FailedLink> = [];
/** Number of link checks grouped into one batch while processing a page. */
const CONCURRENCY = 10;
// deno-lint-ignore no-console
console.log("Starting server...");

// The production server is started as a child process rather than through
// test_utils.tsx, because importing that module launches a headless browser
// at module scope. Both output streams are piped so the listening address
// can be read from them afterwards.
const serverCommand = new Deno.Command(Deno.execPath(), {
  cwd: www,
  args: ["serve", "-A", "--cached-only", "--port", "0", "_fresh/server.js"],
  stdin: "null",
  stdout: "piped",
  stderr: "piped",
});
const cp = serverCommand.spawn();
// Read server output to find the address. stdout and stderr are decoded,
// split into lines and merged, so the address is picked up from whichever
// stream it is printed on.
const linesStdout = cp.stdout
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream());
const linesStderr = cp.stderr
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream());
const lines = mergeReadableStreams(linesStdout, linesStderr);

// First http(s) URL with an explicit port found in the output; stays empty
// if the output ends before one is seen.
let address = "";
// `preventCancel` leaves the merged stream (and therefore the child's pipes)
// open when the loop is left early via `break`.
// @ts-ignore yes it does
for await (const raw of lines.values({ preventCancel: true })) {
  const line = colors.stripAnsiCode(raw);
  const match = line.match(/https?:\/\/[^:]+:\d+(\/\w+[-\w]*)*/g);
  if (match) {
    address = match[0];
    break;
  }
}

if (!address) {
  // deno-lint-ignore no-console
  console.error("Could not find server address");
  cp.kill();
  Deno.exit(1);
}

// Keep consuming (and discarding) the server's output for the rest of the
// run. Both streams are piped, so if nobody reads them the child blocks on
// write once the pipe buffer is full, which would stall every later request.
// Errors are ignored: the stream is torn down when the server is killed.
void lines.pipeTo(new WritableStream()).catch(() => {});

// deno-lint-ignore no-console
console.log(`Server listening at ${address}`);
// Origin against which links are classified as internal or external.
const rootUrl = new URL(address);
/** Milliseconds one link check (HEAD plus an optional GET retry) may take before it is aborted. */
const LINK_CHECK_TIMEOUT_MS = 30000;

/**
 * Verify that `url` is reachable and record the outcome.
 *
 * The result is stored in `checkedUrls`, so each URL is requested at most
 * once; later calls return the stored status. While a check is in flight the
 * stored status is `0`, which is also what a caller arriving in that window
 * receives. Failures (status `>= 400`, or `0` when no response was obtained)
 * are appended to `failedLinks` with the given `referrer`.
 *
 * A HEAD request is tried first. Servers that answer 405 or 501 to HEAD are
 * retried with GET so they are not reported as broken. The whole check is
 * aborted after `LINK_CHECK_TIMEOUT_MS` so one stalled host cannot hang the
 * run.
 *
 * @param url Absolute URL to check.
 * @param referrer Where the link was found; only used for reporting.
 * @returns The final HTTP status, or `0` if the request failed or timed out.
 */
async function checkUrl(
  url: string,
  referrer: string,
): Promise<number> {
  const cached = checkedUrls.get(url);
  if (cached !== undefined) return cached;
  // Mark as in-flight to avoid duplicate checks
  checkedUrls.set(url, 0);

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), LINK_CHECK_TIMEOUT_MS);
  const request = (method: "HEAD" | "GET") =>
    fetch(url, {
      method,
      headers: { "User-Agent": "fresh-link-checker" },
      redirect: "follow",
      signal: controller.signal,
    });

  // Stays 0 unless a response is received.
  let status = 0;
  try {
    let res = await request("HEAD");
    if (res.status === 405 || res.status === 501) {
      // The server does not support HEAD; fall back to GET.
      await res.body?.cancel();
      res = await request("GET");
    }
    status = res.status;
    // Only the status is needed; release the connection without reading.
    await res.body?.cancel();
  } catch {
    // Network error, timeout, or a failed body cancel: keep whatever status
    // was obtained so far (0 if no response arrived).
  } finally {
    clearTimeout(timer);
  }

  checkedUrls.set(url, status);
  if (status === 0 || status >= 400) {
    failedLinks.push({ url, status, referrer });
  }
  return status;
}
/**
 * Crawl a single page and everything reachable from it under `/docs`.
 *
 * The page is fetched once per pathname (tracked in `visitedPages`). A failed
 * request or a status `>= 400` is recorded in `failedLinks`; non-HTML
 * responses are ignored. For an HTML page every `<a href>` is inspected:
 *
 * - fragment-only links and links matching `EXCLUDED_PREFIXES` are skipped,
 * - same-origin links under `/docs` are crawled recursively,
 * - all other links (same-origin non-docs pages and external URLs) are
 *   verified with `checkUrl`, at most `CONCURRENCY` requests at a time.
 *
 * @param pageUrl URL of the page to crawl.
 * @param referrer Where the link to this page was found; used for reporting.
 */
async function crawlPage(pageUrl: URL, referrer: string) {
  const pathname = pageUrl.pathname;
  if (visitedPages.has(pathname)) return;
  visitedPages.add(pathname);

  let res: Response;
  try {
    res = await fetch(pageUrl, {
      headers: {
        accept:
          "text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8",
      },
    });
  } catch {
    failedLinks.push({ url: pageUrl.href, status: 0, referrer });
    return;
  }

  if (res.status >= 400) {
    failedLinks.push({ url: pageUrl.href, status: res.status, referrer });
    await res.body?.cancel();
    return;
  }

  if (!res.headers.get("content-type")?.includes("text/html")) {
    await res.body?.cancel();
    return;
  }

  const text = await res.text();
  // Progress indicator: one dot per parsed page.
  Deno.stdout.writeSync(new TextEncoder().encode("."));

  // Relative links must be resolved against the URL the document was finally
  // served from, which differs from `pageUrl` when the request was redirected.
  const baseUrl = res.url || pageUrl.href;

  const doc = new DOMParser().parseFromString(text, "text/html");
  // URLs to verify for this page. They are only queued here, so that no
  // request starts before its batch is scheduled below.
  const pendingChecks = new Set<string>();
  const internalPages: Array<{ url: URL; referrer: string }> = [];

  for (const link of doc.querySelectorAll("a")) {
    const href = link.getAttribute("href")?.trim();
    if (!href) continue;
    if (EXCLUDED_PREFIXES.some((p) => href.startsWith(p))) continue;
    if (href.startsWith("#")) continue;

    let nextUrl: URL;
    try {
      nextUrl = new URL(href, baseUrl);
    } catch {
      continue;
    }

    // Strip fragment
    nextUrl.hash = "";
    const urlStr = nextUrl.href;

    const isInternal = nextUrl.origin === rootUrl.origin;
    // Internal pages that were already crawled need no further work.
    if (isInternal && visitedPages.has(nextUrl.pathname)) continue;

    if (isInternal && nextUrl.pathname.startsWith("/docs")) {
      // Internal docs page -- crawl it
      internalPages.push({ url: nextUrl, referrer: pathname });
    } else if (!checkedUrls.has(urlStr)) {
      // Non-docs internal page or external link -- just check it returns OK
      pendingChecks.add(urlStr);
    }
  }

  // Check external/non-docs links, at most CONCURRENCY at a time: a batch is
  // only started once the previous one has settled.
  const queue = Array.from(pendingChecks);
  for (let start = 0; start < queue.length; start += CONCURRENCY) {
    const batch = queue.slice(start, start + CONCURRENCY);
    await Promise.all(batch.map((target) => checkUrl(target, pathname)));
  }

  // Crawl internal docs pages
  for (const page of internalPages) {
    await crawlPage(page.url, page.referrer);
  }
}
// Crawl the docs section, starting at its entry page on the local server.
stepStart = performance.now();
// deno-lint-ignore no-console
console.log("Crawling docs pages...");
const docsUrl = new URL("/docs", rootUrl);
await crawlPage(docsUrl, "(start)");

// Finish the line of progress dots written while crawling.
// deno-lint-ignore no-console
console.log();

// Summary: crawl duration, number of pages and links, overall duration.
const crawlSeconds = ((performance.now() - stepStart) / 1000).toFixed(1);
// deno-lint-ignore no-console
console.log(`\nCrawl completed in ${crawlSeconds}s`);
// deno-lint-ignore no-console
console.log(`Docs pages crawled: ${visitedPages.size}`);
// deno-lint-ignore no-console
console.log(`Total links checked: ${checkedUrls.size}`);
const totalSeconds = ((performance.now() - totalStart) / 1000).toFixed(1);
// deno-lint-ignore no-console
console.log(`Total time: ${totalSeconds}s`);
// Stop the server and wait until the child process has exited.
cp.kill();
await cp.status;

// Nothing failed: report success with exit code 0.
if (failedLinks.length === 0) {
  // deno-lint-ignore no-console
  console.log("\nAll links OK!");
  Deno.exit(0);
}

// Otherwise list every failure with its status and referrer, then exit with
// code 1 so the calling process sees the run as failed.
// deno-lint-ignore no-console
console.error(`\nBroken links found: ${failedLinks.length}`);
for (const { status, url, referrer } of failedLinks) {
  // deno-lint-ignore no-console
  console.error(` ${status} ${url} (linked from ${referrer})`);
}
Deno.exit(1);