Skip to content

Commit 286325d

Browse files
committed
Enhance URL handling and configuration:
- Update sanitizeUrlAsFilename to support a length threshold and a hash suffix for long URLs.
- Modify resolveEnv to parse validDomainVariants from environment variables.
- Refactor parsePages to utilize validDomainVariants for scope checking.
- Update EnvConfig type to include validDomainVariants.
- Improve tests for sanitizeUrlAsFilename to cover new functionality.
1 parent 0c19cbb commit 286325d

File tree

6 files changed

+142
-74
lines changed

6 files changed

+142
-74
lines changed

apps/parser/src/helpers/url-handling.ts

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { SanitizeOptions } from "../modules/types";
2+
import crypto from "crypto";
23

34
const ILLEGAL_RE = /[\/\?<>\\:\*\|"]/g;
45
const CONTROL_RE = /[\x00-\x1f\x80-\x9f]/g;
@@ -8,16 +9,24 @@ const WINDOWS_TRAILING_RE = /[\. ]+$/;
89
const DEFAULT_REPLACEMENT = "-";
910

1011
export function sanitizeUrlAsFilename(
11-
input: string,
12+
url: string,
1213
options?: SanitizeOptions,
1314
): string {
14-
if (!input) {
15+
if (!url) {
1516
return DEFAULT_REPLACEMENT;
1617
}
18+
let filenameBase = url;
19+
try {
20+
const urlObj = new URL(url);
21+
filenameBase = `${urlObj.pathname}${urlObj.search}`;
22+
} catch (_error) {
23+
// If it's not a valid URL, use as-is
24+
}
1725
const replacement = validReplacementOrDefault(
1826
options?.replacement ?? DEFAULT_REPLACEMENT,
1927
);
20-
let sanitized = input
28+
let sanitized = filenameBase
29+
.replace(/\/$/, "")
2130
.replace(ILLEGAL_RE, replacement)
2231
.replace(CONTROL_RE, replacement)
2332
.replace(RESERVED_RE, replacement)
@@ -27,7 +36,20 @@ export function sanitizeUrlAsFilename(
2736
if (sanitized.length === 0) {
2837
return replacement;
2938
}
30-
return sanitized.slice(0, 255);
39+
const trimmedName = sanitized.replace(/^[-_]+/, "") || sanitized;
40+
if (
41+
options?.lengthThreshold &&
42+
trimmedName.length > options.lengthThreshold
43+
) {
44+
const hash = crypto
45+
.createHash("sha1")
46+
.update(url)
47+
.digest("hex")
48+
.slice(0, 9);
49+
const prefix = trimmedName.slice(0, options.lengthThreshold - 10);
50+
return `${prefix}_${hash}`;
51+
}
52+
return trimmedName;
3153
}
3254

3355
function validReplacementOrDefault(candidate: string): string {

apps/parser/src/modules/config.ts

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@ export function resolveEnv(): EnvConfig {
1212
let baseUrl = process.env.URL?.trim();
1313
let depth = process.env.DEPTH?.trim();
1414
let vectorIndexName = process.env.CHB_INDEX_ID?.trim();
15+
let validDomainVariants = process.env.validDomainVariants?.trim();
1516
if (!baseUrl || !depth || !vectorIndexName) {
1617
const parserHome = path.resolve(__dirname, "../../");
1718
dotenv.config({ path: path.join(parserHome, ".env") });
1819
baseUrl = baseUrl || process.env.URL?.trim();
1920
depth = depth || process.env.DEPTH?.trim();
2021
vectorIndexName = vectorIndexName || process.env.CHB_INDEX_ID?.trim();
22+
validDomainVariants =
23+
validDomainVariants || process.env.validDomainVariants?.trim();
2124
}
2225
if (!baseUrl) {
2326
throw new Error(
@@ -33,7 +36,24 @@ export function resolveEnv(): EnvConfig {
3336
vectorIndexName,
3437
sanitizedBaseUrl,
3538
);
36-
return { baseUrl, sanitizedBaseUrl, outputDirectory, maxDepth };
39+
let parsedValidDomainVariants: string[] = [];
40+
if (validDomainVariants) {
41+
try {
42+
parsedValidDomainVariants = JSON.parse(validDomainVariants);
43+
if (!Array.isArray(parsedValidDomainVariants)) {
44+
parsedValidDomainVariants = [];
45+
}
46+
} catch (_error) {
47+
parsedValidDomainVariants = [];
48+
}
49+
}
50+
return {
51+
baseUrl,
52+
sanitizedBaseUrl,
53+
outputDirectory,
54+
maxDepth,
55+
validDomainVariants: parsedValidDomainVariants,
56+
};
3757
}
3858

3959
function generateOutputDirectoryPath(

apps/parser/src/modules/crawler.ts

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export async function parsePages(
1717
baseOrigin: string,
1818
baseScope: string,
1919
baseHostToken: string,
20+
validDomainVariants: string[] = [],
2021
navigationTimeout = 30000,
2122
): Promise<void> {
2223
const visitKey = buildVisitKey(node.url);
@@ -25,7 +26,9 @@ export async function parsePages(
2526
return;
2627
}
2728
const normalizedUrl = UrlWithoutAnchors(node.url);
28-
if (!isWithinScope(normalizedUrl, baseScope, baseHostToken)) {
29+
if (
30+
!isWithinScope(normalizedUrl, baseScope, validDomainVariants)
31+
) {
2932
return;
3033
}
3134
const metadata = await parsePageFn(browser, node.url);
@@ -90,19 +93,21 @@ export async function parsePages(
9093
let newLinksCount = 0;
9194
for (const href of anchors) {
9295
const normalized = UrlWithoutAnchors(href);
93-
const visitCandidate = buildVisitKey(href);
96+
const visitCandidate = buildVisitKey(normalized);
9497
if (parsedPages.has(visitCandidate) || scheduledPages.has(visitCandidate))
9598
continue;
9699
const lowerNormalized = normalized.toLowerCase();
97100
if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
98101
continue;
99102
}
100-
if (!isWithinScope(normalized, baseScope, baseHostToken)) {
103+
if (
104+
!isWithinScope(normalized, baseScope, validDomainVariants)
105+
) {
101106
continue;
102107
}
103108
scheduledPages.add(visitCandidate);
104109
newLinksCount += 1;
105-
nextChildren.push({ url: href });
110+
nextChildren.push({ url: normalized });
106111
}
107112
node.children = nextChildren;
108113
const totalKnown = parsedPages.size + scheduledPages.size;
@@ -126,33 +131,50 @@ export async function parsePages(
126131
baseOrigin,
127132
baseScope,
128133
baseHostToken,
134+
validDomainVariants,
129135
);
130136
}
131137
}
132138

133-
function isWithinScope(url: string, scope: string, hostToken: string): boolean {
134-
if (hostToken && url.toLowerCase().includes(hostToken)) {
135-
return true;
136-
}
139+
function isWithinScope(
140+
url: string,
141+
scope: string,
142+
validDomainVariants: string[] = [],
143+
): boolean {
137144
if (!scope) {
138145
return true;
139146
}
140-
const lowerUrl = url.toLowerCase();
141-
const lowerScope = scope.toLowerCase();
142-
if (lowerUrl === lowerScope) {
143-
return true;
144-
}
145-
if (!lowerUrl.startsWith(lowerScope)) {
147+
try {
148+
const urlObj = new URL(url);
149+
const scopeObj = new URL(scope);
150+
const urlDomain = urlObj.hostname.replace(/^www\./, "");
151+
const scopeDomain = scopeObj.hostname.replace(/^www\./, "");
152+
if (urlDomain === scopeDomain) {
153+
return true;
154+
}
155+
const urlParts = urlDomain.split(".");
156+
const scopeParts = scopeDomain.split(".");
157+
if (urlParts.length > scopeParts.length) {
158+
const subdomain = urlParts[0];
159+
const domainWithoutSubdomain = urlParts.slice(1).join(".");
160+
if (
161+
domainWithoutSubdomain === scopeDomain &&
162+
validDomainVariants.includes(subdomain)
163+
) {
164+
return true;
165+
}
166+
}
167+
return false;
168+
} catch (_error) {
146169
return false;
147170
}
148-
const nextChar = lowerUrl.charAt(lowerScope.length);
149-
return nextChar === "/" || nextChar === "?" || nextChar === "#";
150171
}
151172

152173
export function buildVisitKey(rawUrl: string): string {
153174
try {
154175
const url = new URL(rawUrl);
155176
url.hash = "";
177+
url.hostname = url.hostname.replace(/^www\./, "");
156178
return UrlWithoutAnchors(url.toString());
157179
} catch (_error) {
158180
return rawUrl;

apps/parser/src/modules/types.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ export type EnvConfig = {
33
readonly sanitizedBaseUrl: string;
44
readonly outputDirectory: string;
55
readonly maxDepth: number;
6+
readonly validDomainVariants?: string[];
67
};
78

89
export type SanitizeOptions = {
910
readonly replacement?: string;
11+
readonly lengthThreshold?: number;
1012
};
1113

1214
export type ParsedMetadata = {

apps/parser/src/parser.ts

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@ import { ParsedNode, ParsedMetadata } from "./modules/types";
1010
import {
1111
sanitizeUrlAsFilename,
1212
UrlWithoutAnchors,
13-
deriveSubPath,
1413
} from "./helpers/url-handling";
1514
import { assertReachable } from "./modules/network";
1615
import { toIsoOrNull } from "./helpers/date-format";
17-
import crypto from "crypto";
1816

1917
puppeteer.use(StealthPlugin());
2018

@@ -50,6 +48,7 @@ void (async () => {
5048
baseOrigin,
5149
baseScope,
5250
baseHostToken,
51+
env.validDomainVariants || [],
5352
NAVIGATION_TIMEOUT_MS,
5453
);
5554
await browser.close();
@@ -89,27 +88,10 @@ async function persistSnapshot(
8988
snapshot: ParsedMetadata,
9089
FILENAME_LENGTH_THRESHOLD: number,
9190
): Promise<void> {
92-
const subPath = deriveSubPath(
93-
snapshot.url,
94-
env.baseUrl,
95-
env.sanitizedBaseUrl,
96-
);
97-
const preferredName = subPath === "/" ? "root" : subPath;
98-
const sanitizedName = sanitizeUrlAsFilename(preferredName, {
91+
const finalName = sanitizeUrlAsFilename(snapshot.url, {
9992
replacement: "-",
93+
lengthThreshold: FILENAME_LENGTH_THRESHOLD,
10094
});
101-
const trimmedName = sanitizedName.replace(/^[-_]+/, "") || sanitizedName;
102-
let finalName = trimmedName;
103-
if (trimmedName.length > FILENAME_LENGTH_THRESHOLD) {
104-
const normalizedUrl = UrlWithoutAnchors(snapshot.url);
105-
const hash = crypto
106-
.createHash("sha1")
107-
.update(normalizedUrl)
108-
.digest("hex")
109-
.slice(0, 10);
110-
const prefix = trimmedName.slice(0, 240);
111-
finalName = `${prefix}_${hash}`;
112-
}
11395
await saveMetadata(env.outputDirectory, `${finalName}.json`, snapshot);
11496
}
11597

Lines changed: 52 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
import { sanitizeUrlAsFilename } from "../src/helpers/url-handling";
22

3-
describe("sanitizeFilename", () => {
4-
it("replaces illegal characters", () => {
5-
expect(sanitizeUrlAsFilename("file/name?with*illegal|chars.txt")).toBe(
6-
"file-name-with-illegal-chars.txt",
3+
describe("sanitizeUrlAsFilename", () => {
4+
it("sanitizes URL paths with query strings and fragments", () => {
5+
expect(
6+
sanitizeUrlAsFilename("/docs/guide?version=1.0&lang=en#section"),
7+
).toBe("docs-guide-version=1.0&lang=en#section");
8+
});
9+
10+
it("sanitizes URL paths with special characters", () => {
11+
expect(sanitizeUrlAsFilename("/api/v1/users:profile")).toBe(
12+
"api-v1-users-profile",
713
);
814
});
915

10-
it("replaces control characters", () => {
11-
expect(sanitizeUrlAsFilename("file\u0000name\u0001.txt")).toBe(
12-
"file-name-.txt",
16+
it("sanitizes control characters in URL paths", () => {
17+
expect(sanitizeUrlAsFilename("/path\u0000with\u0001control.html")).toBe(
18+
"path-with-control.html",
1319
);
1420
});
1521

@@ -41,46 +47,60 @@ describe("sanitizeFilename", () => {
4147
expect(sanitizeUrlAsFilename(undefined as unknown as string)).toBe("-");
4248
});
4349

44-
it("uses the replacement option", () => {
45-
expect(sanitizeUrlAsFilename("file/name", { replacement: "-" })).toBe(
46-
"file-name",
50+
it("uses custom replacement option for URL paths", () => {
51+
expect(sanitizeUrlAsFilename("/api/users/123", { replacement: "_" })).toBe(
52+
"api_users_123",
4753
);
4854
});
4955

50-
it("falls back to default replacement for invalid replacement", () => {
51-
expect(sanitizeUrlAsFilename("file/name", { replacement: "/" })).toBe(
52-
"file-name",
53-
);
54-
expect(sanitizeUrlAsFilename("file/name", { replacement: "\u0000" })).toBe(
55-
"file-name",
56-
);
56+
it("falls back to default replacement for invalid replacement chars", () => {
57+
expect(
58+
sanitizeUrlAsFilename("/docs/guide?ref=x", { replacement: "/" }),
59+
).toBe("docs-guide-ref=x");
60+
expect(
61+
sanitizeUrlAsFilename("/docs/guide", { replacement: "\u0000" }),
62+
).toBe("docs-guide");
5763
});
5864

59-
it("limits output to 255 characters", () => {
60-
const longName = "a".repeat(300) + ".txt";
61-
expect(sanitizeUrlAsFilename(longName).length).toBe(255);
65+
it("appends hash suffix for URLs exceeding length threshold", () => {
66+
const longUrl = "/docs/" + "very-long-path-segment/".repeat(20);
67+
const result = sanitizeUrlAsFilename(longUrl, {
68+
lengthThreshold: 255
69+
});
70+
expect(result.length).toBe(255);
6271
});
6372

64-
it("returns replacement for input with only illegal/control chars", () => {
65-
expect(sanitizeUrlAsFilename("\u0000\u0001\u0002")).toBe("---");
66-
expect(sanitizeUrlAsFilename("////")).toBe("----");
73+
it("trims leading dashes from URL paths", () => {
74+
expect(sanitizeUrlAsFilename("/////path")).toBe("path");
6775
});
6876

69-
it("returns unchanged valid filename", () => {
70-
expect(sanitizeUrlAsFilename("valid-filename.txt")).toBe(
71-
"valid-filename.txt",
77+
it("returns unchanged valid URL path", () => {
78+
expect(sanitizeUrlAsFilename("api-reference.html")).toBe(
79+
"api-reference.html",
7280
);
7381
});
7482

75-
it("ignores illegal replacement chars", () => {
76-
expect(sanitizeUrlAsFilename("file/name", { replacement: "*" })).toBe(
77-
"file-name",
83+
it("ignores illegal characters in replacement option", () => {
84+
expect(sanitizeUrlAsFilename("/docs/guide", { replacement: "*" })).toBe(
85+
"docs-guide",
86+
);
87+
});
88+
89+
it("ignores control chars in replacement option", () => {
90+
expect(
91+
sanitizeUrlAsFilename("/docs/guide", { replacement: "\u0000" }),
92+
).toBe("docs-guide");
93+
});
94+
95+
it("sanitizes URL with subdomain path", () => {
96+
expect(sanitizeUrlAsFilename("showcase.example.com/docs-guide")).toBe(
97+
"showcase.example.com-docs-guide",
7898
);
7999
});
80100

81-
it("ignores control char replacement", () => {
82-
expect(sanitizeUrlAsFilename("file/name", { replacement: "\u0000" })).toBe(
83-
"file-name",
101+
it("handles URL with numbers and hyphens", () => {
102+
expect(sanitizeUrlAsFilename("/api/v2/resource-123")).toBe(
103+
"api-v2-resource-123",
84104
);
85105
});
86106
});

0 commit comments

Comments
 (0)