Skip to content

Commit eb5e964

Browse files
committed
Rename parsePages to exploreAndParsePages. Extend sanitizeUrlAsFilename so that it returns the hostname when the URL points at the site root (i.e. its path and query are empty or just "/").
1 parent 985363d commit eb5e964

File tree

4 files changed

+13
-12
lines changed

4 files changed

+13
-12
lines changed

apps/parser/src/helpers/url-handling.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,12 @@ export function sanitizeUrlAsFilename(
1818
let filenameBase = url;
1919
try {
2020
const urlObj = new URL(url);
21-
filenameBase = `${urlObj.pathname}${urlObj.search}`;
21+
const pathAndSearch = `${urlObj.pathname}${urlObj.search}`;
22+
if (pathAndSearch === "/" || pathAndSearch === "") {
23+
filenameBase = urlObj.hostname;
24+
} else {
25+
filenameBase = pathAndSearch;
26+
}
2227
} catch (_error) {
2328
// If it's not a valid URL, use as-is
2429
}

apps/parser/src/modules/config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ function generateOutputDirectoryPath(
6161
sanitizedBaseUrl: string,
6262
): string {
6363
const safeBaseSegment = sanitizeUrlAsFilename(sanitizedBaseUrl, {
64-
replacement: "_",
64+
replacement: "-",
6565
});
6666
if (!vectorIndexName) {
6767
return `output/${safeBaseSegment}`;

apps/parser/src/modules/crawler.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { ParsedNode, ParsedMetadata } from "./types";
33
import { UrlWithoutAnchors } from "../helpers/url-handling";
44
import { expandInteractiveSections } from "./dom-actions";
55

6-
export async function parsePages(
6+
export async function exploreAndParsePages(
77
browser: Browser,
88
node: ParsedNode,
99
depth: number,
@@ -26,9 +26,7 @@ export async function parsePages(
2626
return;
2727
}
2828
const normalizedUrl = UrlWithoutAnchors(node.url);
29-
if (
30-
!isWithinScope(normalizedUrl, baseScope, validDomainVariants)
31-
) {
29+
if (!isWithinScope(normalizedUrl, baseScope, validDomainVariants)) {
3230
return;
3331
}
3432
const metadata = await parsePageFn(browser, node.url);
@@ -100,9 +98,7 @@ export async function parsePages(
10098
if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
10199
continue;
102100
}
103-
if (
104-
!isWithinScope(normalized, baseScope, validDomainVariants)
105-
) {
101+
if (!isWithinScope(normalized, baseScope, validDomainVariants)) {
106102
continue;
107103
}
108104
scheduledPages.add(visitCandidate);
@@ -120,7 +116,7 @@ export async function parsePages(
120116
);
121117
if (!node.children || depth >= maxDepth) return;
122118
for (const child of node.children) {
123-
await parsePages(
119+
await exploreAndParsePages(
124120
browser,
125121
child,
126122
depth + 1,

apps/parser/src/parser.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { Browser, Page } from "puppeteer";
44
import { resolveEnv } from "./modules/config";
55
import { ensureDirectory, saveMetadata } from "./modules/output";
66
import { handleError } from "./modules/errors";
7-
import { buildVisitKey, parsePages } from "./modules/crawler";
7+
import { buildVisitKey, exploreAndParsePages } from "./modules/crawler";
88
import { expandInteractiveSections } from "./modules/dom-actions";
99
import { ParsedNode, ParsedMetadata } from "./modules/types";
1010
import {
@@ -37,7 +37,7 @@ void (async () => {
3737
.replace(/^www\./, "")
3838
.toLowerCase();
3939
scheduledPages.add(buildVisitKey(env.baseUrl));
40-
await parsePages(
40+
await exploreAndParsePages(
4141
browser,
4242
root,
4343
0,

0 commit comments

Comments
 (0)