Skip to content

Commit 416ef96

Browse files
committed
Add warnings for missing or invalid URL replacements; rename function UrlWithoutAnchors to RemoveAnchorsFromUrl
1 parent eb5e964 commit 416ef96

File tree

4 files changed

+29
-18
lines changed

4 files changed

+29
-18
lines changed

apps/parser/src/helpers/url-handling.ts

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,9 @@ export function sanitizeUrlAsFilename(
1313
options?: SanitizeOptions,
1414
): string {
1515
if (!url) {
16+
console.warn(
17+
`Missing input url, sanitizing as default "${DEFAULT_REPLACEMENT}"`,
18+
);
1619
return DEFAULT_REPLACEMENT;
1720
}
1821
let filenameBase = url;
@@ -59,18 +62,24 @@ export function sanitizeUrlAsFilename(
5962

6063
function validReplacementOrDefault(candidate: string): string {
6164
if (!candidate) {
65+
console.warn(
66+
`Missing replacement character, using default "${DEFAULT_REPLACEMENT}"`,
67+
);
6268
return DEFAULT_REPLACEMENT;
6369
}
6470
if (
6571
/[\/\?<>\\:\*\|"]/u.test(candidate) ||
6672
/[\x00-\x1f\x80-\x9f]/u.test(candidate)
6773
) {
74+
console.warn(
75+
`Invalid replacement character: "${candidate}", using default "${DEFAULT_REPLACEMENT}"`,
76+
);
6877
return DEFAULT_REPLACEMENT;
6978
}
7079
return candidate;
7180
}
7281

73-
export const UrlWithoutAnchors = (rawUrl: string): string => {
82+
export const RemoveAnchorsFromUrl = (rawUrl: string): string => {
7483
try {
7584
const parsed = new URL(rawUrl);
7685
parsed.hash = "";
@@ -101,7 +110,7 @@ export function deriveSubPath(
101110
if (!relPath.startsWith("/")) relPath = "/" + relPath;
102111
}
103112
if (
104-
UrlWithoutAnchors(targetUrl) === sanitizedBaseUrl ||
113+
RemoveAnchorsFromUrl(targetUrl) === sanitizedBaseUrl ||
105114
relPath === "/" ||
106115
relPath === ""
107116
) {

apps/parser/src/modules/config.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import path from "node:path";
22
import { EnvConfig } from "./types";
33
import {
4-
UrlWithoutAnchors,
4+
RemoveAnchorsFromUrl,
55
sanitizeUrlAsFilename,
66
} from "../helpers/url-handling";
77
import * as dotenv from "dotenv";
@@ -27,7 +27,7 @@ export function resolveEnv(): EnvConfig {
2727
"Missing required URL. Set URL in environment or .env file.",
2828
);
2929
}
30-
const sanitizedBaseUrl = UrlWithoutAnchors(baseUrl);
30+
const sanitizedBaseUrl = RemoveAnchorsFromUrl(baseUrl);
3131
const parsedDepth = Number.parseInt(depth ?? `${DEFAULT_DEPTH}`, 10);
3232
const maxDepth = Number.isNaN(parsedDepth)
3333
? DEFAULT_DEPTH

apps/parser/src/modules/crawler.ts

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
11
import { Browser } from "puppeteer";
22
import { ParsedNode, ParsedMetadata } from "./types";
3-
import { UrlWithoutAnchors } from "../helpers/url-handling";
3+
import { RemoveAnchorsFromUrl } from "../helpers/url-handling";
44
import { expandInteractiveSections } from "./dom-actions";
55

6+
const PAGE_NAVIGATION_OPTIONS = {
7+
waitUntil: "networkidle2" as const,
8+
};
9+
610
export async function exploreAndParsePages(
711
browser: Browser,
812
node: ParsedNode,
@@ -25,7 +29,7 @@ export async function exploreAndParsePages(
2529
if (parsedPages.has(visitKey) || depth > maxDepth) {
2630
return;
2731
}
28-
const normalizedUrl = UrlWithoutAnchors(node.url);
32+
const normalizedUrl = RemoveAnchorsFromUrl(node.url);
2933
if (!isWithinScope(normalizedUrl, baseScope, validDomainVariants)) {
3034
return;
3135
}
@@ -43,7 +47,7 @@ export async function exploreAndParsePages(
4347
try {
4448
page = await browser.newPage();
4549
await page.goto(node.url, {
46-
waitUntil: "networkidle2",
50+
...PAGE_NAVIGATION_OPTIONS,
4751
timeout: navigationTimeout,
4852
});
4953
await expandInteractiveSections(page);
@@ -90,7 +94,7 @@ export async function exploreAndParsePages(
9094
const nextChildren: ParsedNode[] = [];
9195
let newLinksCount = 0;
9296
for (const href of anchors) {
93-
const normalized = UrlWithoutAnchors(href);
97+
const normalized = RemoveAnchorsFromUrl(href);
9498
const visitCandidate = buildVisitKey(normalized);
9599
if (parsedPages.has(visitCandidate) || scheduledPages.has(visitCandidate))
96100
continue;
@@ -172,8 +176,9 @@ export function buildVisitKey(rawUrl: string): string {
172176
const url = new URL(rawUrl);
173177
url.hash = "";
174178
url.hostname = url.hostname.replace(/^www\./, "");
175-
return UrlWithoutAnchors(url.toString());
176-
} catch (_error) {
179+
return RemoveAnchorsFromUrl(url.toString());
180+
} catch (error) {
181+
console.warn(`Failed to build visit key for URL: ${rawUrl}`, error);
177182
return rawUrl;
178183
}
179184
}

apps/parser/src/parser.ts

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ import { expandInteractiveSections } from "./modules/dom-actions";
99
import { ParsedNode, ParsedMetadata } from "./modules/types";
1010
import {
1111
sanitizeUrlAsFilename,
12-
UrlWithoutAnchors,
12+
RemoveAnchorsFromUrl,
1313
} from "./helpers/url-handling";
1414
import { assertReachable } from "./modules/network";
1515
import { toIsoOrNull } from "./helpers/date-format";
@@ -18,7 +18,7 @@ puppeteer.use(StealthPlugin());
1818

1919
const NAVIGATION_TIMEOUT_MS = 30_000;
2020
const REQUEST_TIMEOUT_MS = 10_000;
21-
const FILENAME_LENGTH_THRESHOLD = 255;
21+
const FILENAME_LENGTH_THRESHOLD = 250;
2222

2323
const env = resolveEnv();
2424
const parsedPages = new Map<string, ParsedMetadata>();
@@ -32,7 +32,7 @@ void (async () => {
3232
const root: ParsedNode = { url: env.baseUrl };
3333
const baseUrlObject = new URL(env.baseUrl);
3434
const baseOrigin = baseUrlObject.origin;
35-
const baseScope = UrlWithoutAnchors(env.baseUrl);
35+
const baseScope = RemoveAnchorsFromUrl(env.baseUrl);
3636
const baseHostToken = baseUrlObject.hostname
3737
.replace(/^www\./, "")
3838
.toLowerCase();
@@ -72,7 +72,7 @@ async function parsePageFn(
7272
await expandInteractiveSections(page);
7373
const rawMetadata = await page.evaluate(extractDocumentMetadata);
7474
const snapshot = serializeMetadata(rawMetadata);
75-
await persistSnapshot(snapshot, FILENAME_LENGTH_THRESHOLD);
75+
await persistSnapshot(snapshot);
7676
return snapshot;
7777
} catch (error) {
7878
console.error(`Error while parsing ${url}:`, (error as Error).message);
@@ -84,10 +84,7 @@ async function parsePageFn(
8484
}
8585
}
8686

87-
async function persistSnapshot(
88-
snapshot: ParsedMetadata,
89-
FILENAME_LENGTH_THRESHOLD: number,
90-
): Promise<void> {
87+
async function persistSnapshot(snapshot: ParsedMetadata): Promise<void> {
9188
const finalName = sanitizeUrlAsFilename(snapshot.url, {
9289
replacement: "-",
9390
lengthThreshold: FILENAME_LENGTH_THRESHOLD,

0 commit comments

Comments
 (0)