Skip to content

Commit 2a1b9a0

Browse files
committed
Refactor parser configuration: allow maxDepth to be null for unlimited depth, implement base scope handling, and enhance URL sanitization functions for directory names
1 parent 755fb6b commit 2a1b9a0

File tree

8 files changed

+66
-18
lines changed

8 files changed

+66
-18
lines changed

apps/parser/.env.default

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# Root URL to start parsing from
33
URL="https://example.com"
44

5-
# Maximum recursion depth (integer)
6-
DEPTH=2
5+
# Maximum recursion depth (an integer, or null for unlimited depth)
6+
DEPTH=null
77

88
# Name of the vector index bucket/folder where parsed artifacts are stored
99
CHB_INDEX_ID="parser-vector-index-name"

apps/parser/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Create a `.env` file in the `apps/parser` directory with the following content:
4141
```
4242
URL=https://example.com
4343
CHB_INDEX_ID=name_of_your_choice
44-
# DEPTH=2 # Optional, defaults to 2
44+
# DEPTH=2 # Optional, defaults to null (unlimited depth)
4545
```
4646

4747
#### b) Using command line variables

apps/parser/src/helpers/url-handling.ts

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ const WINDOWS_RESERVED_RE = /^(con|prn|aux|nul|com[0-9]|lpt[0-9])$/i;
88
const WINDOWS_TRAILING_RE = /[\. ]+$/;
99
const DEFAULT_REPLACEMENT = "-";
1010

11+
let BASE_SCOPE: string;
12+
13+
export function setBaseScope(scope: string): void {
14+
BASE_SCOPE = scope;
15+
}
16+
1117
export function sanitizeUrlAsFilename(
1218
url: string,
1319
options?: SanitizeOptions,
@@ -19,17 +25,53 @@ export function sanitizeUrlAsFilename(
1925
return DEFAULT_REPLACEMENT;
2026
}
2127
let filenameBase = url;
22-
try {
23-
const urlObj = new URL(url);
24-
const pathAndSearch = `${urlObj.pathname}${urlObj.search}`;
25-
if (pathAndSearch === "/" || pathAndSearch === "") {
26-
filenameBase = urlObj.hostname;
28+
if (filenameBase === BASE_SCOPE) {
29+
filenameBase = new URL(filenameBase).hostname.replace(/^www\./, "");
30+
} else {
31+
const pathAndSearch = url.replace(BASE_SCOPE, "").replace(/^\/+/, "");
32+
if (!pathAndSearch || pathAndSearch === "/") {
33+
filenameBase = url.split("/").filter(Boolean).pop() || url;
2734
} else {
2835
filenameBase = pathAndSearch;
2936
}
30-
} catch (_error) {
31-
// If it's not a valid URL, use as-is
3237
}
38+
39+
const replacement = validReplacementOrDefault(
40+
options?.replacement ?? DEFAULT_REPLACEMENT,
41+
);
42+
let sanitized = filenameBase
43+
.replace(/\/$/, "")
44+
.replace(ILLEGAL_RE, replacement)
45+
.replace(CONTROL_RE, replacement)
46+
.replace(RESERVED_RE, replacement)
47+
.replace(WINDOWS_RESERVED_RE, replacement)
48+
.replace(WINDOWS_TRAILING_RE, replacement)
49+
.trim();
50+
if (sanitized.length === 0) {
51+
return replacement;
52+
}
53+
const trimmedName = sanitized.replace(/^[-_]+/, "") || sanitized;
54+
if (
55+
options?.lengthThreshold &&
56+
trimmedName.length > options.lengthThreshold
57+
) {
58+
const hash = crypto
59+
.createHash("sha1")
60+
.update(url)
61+
.digest("hex")
62+
.slice(0, 9);
63+
const prefix = trimmedName.slice(0, options.lengthThreshold - 10);
64+
return `${prefix}_${hash}`;
65+
}
66+
return trimmedName;
67+
}
68+
69+
export function sanitizeUrlAsDirectoryName(
70+
url: string,
71+
options?: SanitizeOptions,
72+
): string {
73+
let filenameBase = url;
74+
filenameBase = filenameBase.replace(/^(https?:\/\/)?(www\.)?/, "");
3375
const replacement = validReplacementOrDefault(
3476
options?.replacement ?? DEFAULT_REPLACEMENT,
3577
);

apps/parser/src/main.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,11 @@ import { ensureDirectory } from "./modules/output";
55
import { handleError } from "./modules/errors";
66
import { exploreAndParsePages } from "./modules/parser";
77
import { ParsedNode, ParsedMetadata } from "./modules/types";
8-
import { RemoveAnchorsFromUrl, buildVisitKey } from "./helpers/url-handling";
8+
import {
9+
RemoveAnchorsFromUrl,
10+
buildVisitKey,
11+
setBaseScope,
12+
} from "./helpers/url-handling";
913
import { assertReachable } from "./modules/network";
1014

1115
puppeteer.use(StealthPlugin());
@@ -21,6 +25,8 @@ export const BASE_HOST_TOKEN = new URL(env.baseUrl).hostname
2125
.toLowerCase();
2226
export const VALID_DOMAIN_VARIANTS = env.validDomainVariants || [];
2327

28+
setBaseScope(BASE_SCOPE);
29+
2430
void (async () => {
2531
try {
2632
await assertReachable(env.baseUrl);

apps/parser/src/modules/config.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@ import path from "node:path";
22
import { EnvConfig } from "./types";
33
import {
44
RemoveAnchorsFromUrl,
5-
sanitizeUrlAsFilename,
5+
sanitizeUrlAsDirectoryName,
66
} from "../helpers/url-handling";
77
import * as dotenv from "dotenv";
88

9-
const DEFAULT_DEPTH = 2;
9+
const DEFAULT_DEPTH = null;
1010

1111
export function resolveEnv(): EnvConfig {
1212
let baseUrl = process.env.URL?.trim();
@@ -60,7 +60,7 @@ function generateOutputDirectoryPath(
6060
vectorIndexName: string | undefined,
6161
sanitizedBaseUrl: string,
6262
): string {
63-
const safeBaseSegment = sanitizeUrlAsFilename(sanitizedBaseUrl, {
63+
const safeBaseSegment = sanitizeUrlAsDirectoryName(sanitizedBaseUrl, {
6464
replacement: "-",
6565
});
6666
if (!vectorIndexName) {

apps/parser/src/modules/output.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export async function persistSnapshot(
1818
let finalName = sanitizeUrlAsFilename(snapshot.url, {
1919
lengthThreshold: FILENAME_LENGTH_THRESHOLD,
2020
});
21-
if (finalName === BASE_HOST_TOKEN) {
21+
if (finalName.replace(/^www\./, "") === BASE_HOST_TOKEN) {
2222
finalName = "index";
2323
}
2424
await saveMetadata(outputDirectory, `${finalName}.json`, snapshot);

apps/parser/src/modules/parser.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ export async function exploreAndParsePages(
3333
): Promise<void> {
3434
const visitKey = buildVisitKey(node.url);
3535
scheduledPages.delete(visitKey);
36-
if (parsedPages.has(visitKey) || depth > MAX_DEPTH) {
36+
if (parsedPages.has(visitKey) || (MAX_DEPTH !== null && depth > MAX_DEPTH)) {
3737
return;
3838
}
3939
const normalizedUrl = RemoveAnchorsFromUrl(node.url);
@@ -125,7 +125,7 @@ export async function exploreAndParsePages(
125125
parsedPages.size
126126
}/${totalKnown} (${((parsedPages.size / totalKnown) * 100).toFixed(2)}%)`,
127127
);
128-
if (!node.children || depth >= MAX_DEPTH) return;
128+
if (!node.children || (MAX_DEPTH !== null && depth >= MAX_DEPTH)) return;
129129
for (const child of node.children) {
130130
await exploreAndParsePages(
131131
browser,

apps/parser/src/modules/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ export type EnvConfig = {
22
readonly baseUrl: string;
33
readonly sanitizedBaseUrl: string;
44
readonly outputDirectory: string;
5-
readonly maxDepth: number;
5+
readonly maxDepth: number | null;
66
readonly validDomainVariants?: string[];
77
};
88

0 commit comments

Comments
 (0)