Skip to content

Commit c8b1cab

Browse files
committed
Refactor code for consistency and readability; update imports, format code and add new helper for dates
1 parent 462c5e6 commit c8b1cab

File tree

13 files changed

+279
-255
lines changed

13 files changed

+279
-255
lines changed

apps/parser/jest.config.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
import type { Config } from 'jest';
1+
import type { Config } from "jest";
22

33
const config: Config = {
44
rootDir: __dirname,
5-
testRegex: 'tests/.*\\.test\\.ts$',
5+
testRegex: "tests/.*\\.test\\.ts$",
66
transform: {
7-
'^.+\\.ts$': ['ts-jest', { tsconfig: 'tsconfig.json' }],
7+
"^.+\\.ts$": ["ts-jest", { tsconfig: "tsconfig.json" }],
88
},
9-
testEnvironment: 'node',
9+
testEnvironment: "node",
1010
clearMocks: true,
1111
verbose: false,
1212
};
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
/**
 * Normalizes a date-like string to its ISO 8601 representation.
 *
 * @param value - Raw date string. `null`, `undefined`, empty and
 *   whitespace-only values are treated as "no date".
 * @returns The `Date.prototype.toISOString()` form of the parsed value, or
 *   `null` when the input is missing or `Date` cannot parse it.
 */
export function toIsoOrNull(value: string | null | undefined): string | null {
  // Trim first so surrounding whitespace can neither count as content nor
  // push the string onto an engine-specific fallback date parser.
  const trimmed = value?.trim();
  if (!trimmed) {
    return null;
  }
  const date = new Date(trimmed);
  // An unparseable string yields an Invalid Date whose time value is NaN;
  // calling toISOString() on it would throw a RangeError, so guard first.
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}
apps/parser/src/helpers/url-handling.ts

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,84 @@
1+
import { SanitizeOptions } from "../modules/types";
2+
13
const ILLEGAL_RE = /[\/\?<>\\:\*\|"]/g;
24
const CONTROL_RE = /[\x00-\x1f\x80-\x9f]/g;
35
const RESERVED_RE = /^\.+$/;
46
const WINDOWS_RESERVED_RE = /^(con|prn|aux|nul|com[0-9]|lpt[0-9])$/i;
57
const WINDOWS_TRAILING_RE = /[\. ]+$/;
8+
const DEFAULT_REPLACEMENT = "-";
69

7-
export type SanitizeOptions = {
8-
readonly replacement?: string;
9-
};
10-
11-
const DEFAULT_REPLACEMENT = '-';
12-
13-
export function sanitizeUrlAsFilename(input: string, options?: SanitizeOptions): string {
10+
export function sanitizeUrlAsFilename(
11+
input: string,
12+
options?: SanitizeOptions,
13+
): string {
1414
if (!input) {
1515
return DEFAULT_REPLACEMENT;
1616
}
17-
18-
const replacement = validReplacementOrDefault(options?.replacement ?? DEFAULT_REPLACEMENT);
17+
const replacement = validReplacementOrDefault(
18+
options?.replacement ?? DEFAULT_REPLACEMENT,
19+
);
1920
let sanitized = input
2021
.replace(ILLEGAL_RE, replacement)
2122
.replace(CONTROL_RE, replacement)
2223
.replace(RESERVED_RE, replacement)
2324
.replace(WINDOWS_RESERVED_RE, replacement)
2425
.replace(WINDOWS_TRAILING_RE, replacement)
2526
.trim();
26-
2727
if (sanitized.length === 0) {
2828
return replacement;
2929
}
30-
3130
return sanitized.slice(0, 255);
3231
}
3332

3433
function validReplacementOrDefault(candidate: string): string {
3534
if (!candidate) {
3635
return DEFAULT_REPLACEMENT;
3736
}
38-
39-
if (/[\/\?<>\\:\*\|"]/u.test(candidate) || /[\x00-\x1f\x80-\x9f]/u.test(candidate)) {
37+
if (
38+
/[\/\?<>\\:\*\|"]/u.test(candidate) ||
39+
/[\x00-\x1f\x80-\x9f]/u.test(candidate)
40+
) {
4041
return DEFAULT_REPLACEMENT;
4142
}
42-
4343
return candidate;
4444
}
4545

4646
export const UrlWithoutAnchors = (rawUrl: string): string => {
47-
try {
47+
try {
4848
const parsed = new URL(rawUrl);
49-
parsed.hash = '';
50-
if (parsed.pathname.length > 1 && parsed.pathname.endsWith('/')) {
51-
parsed.pathname = parsed.pathname.slice(0, -1);
49+
parsed.hash = "";
50+
if (parsed.pathname.length > 1 && parsed.pathname.endsWith("/")) {
51+
parsed.pathname = parsed.pathname.slice(0, -1);
5252
}
5353
const serialized = parsed.toString();
54-
if (parsed.pathname === '/' && !parsed.search) {
55-
return serialized.endsWith('/') ? serialized.slice(0, -1) : serialized;
54+
if (parsed.pathname === "/" && !parsed.search) {
55+
return serialized.endsWith("/") ? serialized.slice(0, -1) : serialized;
5656
}
5757
return serialized;
58-
} catch (error) {
58+
} catch (error) {
5959
console.warn(`Failed to parse URL: ${rawUrl}`, error);
6060
return rawUrl;
61-
}
61+
}
6262
};
63+
/**
 * Computes the path of `targetUrl` relative to the path of `baseUrl`.
 *
 * The base path is stripped only when it matches the target path on a
 * path-segment boundary, so a base of `/docs` applies to `/docs` and
 * `/docs/page` but not to the sibling `/docs-old/page`.
 *
 * @param targetUrl - Absolute URL of the page being located.
 * @param baseUrl - Absolute URL whose pathname is the prefix to strip.
 * @param sanitizedBaseUrl - Value compared against `UrlWithoutAnchors(targetUrl)`;
 *   when they are equal the target is the base page itself.
 * @returns `"/"` when the target is the base page or its relative path is
 *   empty or `"/"` (any query string or fragment is dropped in that case);
 *   otherwise the relative path followed by the target's query string and
 *   fragment.
 * @throws TypeError if `targetUrl` or `baseUrl` is not a valid absolute URL.
 */
export function deriveSubPath(
  targetUrl: string,
  baseUrl: string,
  sanitizedBaseUrl: string,
): string {
  const basePath = new URL(baseUrl).pathname;
  const target = new URL(targetUrl);
  let relPath = target.pathname;

  // Character right after the would-be prefix: "" means the paths are equal,
  // "/" means the prefix ends exactly at a segment boundary.
  const boundary = relPath.charAt(basePath.length);
  const isUnderBase =
    relPath.startsWith(basePath) &&
    (basePath.endsWith("/") || boundary === "" || boundary === "/");

  if (basePath !== "/" && isUnderBase) {
    relPath = relPath.slice(basePath.length);
    // A base path with a trailing slash consumes the separator; restore it.
    if (!relPath.startsWith("/")) relPath = "/" + relPath;
  }

  if (
    UrlWithoutAnchors(targetUrl) === sanitizedBaseUrl ||
    relPath === "/" ||
    relPath === ""
  ) {
    return "/";
  }

  // relPath is non-empty here, so the result can never be an empty string.
  return `${relPath}${target.search}${target.hash}`;
}

apps/parser/src/modules/config.ts

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,50 @@
1-
2-
import path from 'node:path';
3-
import { UrlWithoutAnchors, sanitizeUrlAsFilename } from '../helpers/url-handling';
4-
import * as dotenv from 'dotenv';
5-
6-
export type EnvConfig = {
7-
readonly baseUrl: string;
8-
readonly sanitizedBaseUrl: string;
9-
readonly outputDirectory: string;
10-
readonly maxDepth: number;
11-
};
12-
1+
import path from "node:path";
2+
import { EnvConfig } from "./types";
3+
import {
4+
UrlWithoutAnchors,
5+
sanitizeUrlAsFilename,
6+
} from "../helpers/url-handling";
7+
import * as dotenv from "dotenv";
138

149
const DEFAULT_DEPTH = 2;
1510

16-
1711
export function resolveEnv(): EnvConfig {
1812
let baseUrl = process.env.URL?.trim();
1913
let depth = process.env.DEPTH?.trim();
2014
let vectorIndexName = process.env.CHB_INDEX_ID?.trim();
21-
2215
if (!baseUrl || !depth || !vectorIndexName) {
23-
const parserHome = path.resolve(__dirname, '../../');
24-
dotenv.config({ path: path.join(parserHome, '.env') });
16+
const parserHome = path.resolve(__dirname, "../../");
17+
dotenv.config({ path: path.join(parserHome, ".env") });
2518
baseUrl = baseUrl || process.env.URL?.trim();
2619
depth = depth || process.env.DEPTH?.trim();
2720
vectorIndexName = vectorIndexName || process.env.CHB_INDEX_ID?.trim();
2821
}
29-
3022
if (!baseUrl) {
31-
throw new Error('Missing required URL. Set URL in environment or .env file.');
23+
throw new Error(
24+
"Missing required URL. Set URL in environment or .env file.",
25+
);
3226
}
33-
3427
const sanitizedBaseUrl = UrlWithoutAnchors(baseUrl);
3528
const parsedDepth = Number.parseInt(depth ?? `${DEFAULT_DEPTH}`, 10);
36-
const maxDepth = Number.isNaN(parsedDepth) ? DEFAULT_DEPTH : Math.max(parsedDepth, 0);
37-
const outputDirectory = generateOutputDirectoryPath(vectorIndexName, sanitizedBaseUrl);
38-
29+
const maxDepth = Number.isNaN(parsedDepth)
30+
? DEFAULT_DEPTH
31+
: Math.max(parsedDepth, 0);
32+
const outputDirectory = generateOutputDirectoryPath(
33+
vectorIndexName,
34+
sanitizedBaseUrl,
35+
);
3936
return { baseUrl, sanitizedBaseUrl, outputDirectory, maxDepth };
4037
}
4138

4239
function generateOutputDirectoryPath(
4340
vectorIndexName: string | undefined,
44-
sanitizedBaseUrl: string
41+
sanitizedBaseUrl: string,
4542
): string {
46-
const safeBaseSegment = sanitizeUrlAsFilename(sanitizedBaseUrl, { replacement: '_' });
43+
const safeBaseSegment = sanitizeUrlAsFilename(sanitizedBaseUrl, {
44+
replacement: "_",
45+
});
4746
if (!vectorIndexName) {
4847
return `output/${safeBaseSegment}`;
4948
}
50-
return path.join(vectorIndexName, 'parsing', safeBaseSegment);
49+
return path.join(vectorIndexName, "parsing", safeBaseSegment);
5150
}

apps/parser/src/modules/crawler.ts

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,58 @@
1-
import { Browser } from 'puppeteer';
2-
import { ParseNode, ParseMetadata } from './types';
3-
import { UrlWithoutAnchors } from '../helpers/url-handling';
4-
import { expandInteractiveSections } from './dom-actions';
1+
import { Browser } from "puppeteer";
2+
import { ParsedNode, ParsedMetadata } from "./types";
3+
import { UrlWithoutAnchors } from "../helpers/url-handling";
4+
import { expandInteractiveSections } from "./dom-actions";
55

66
export async function parsePages(
77
browser: Browser,
8-
node: ParseNode,
8+
node: ParsedNode,
99
depth: number,
1010
maxDepth: number,
11-
parsedPages: Map<string, ParseMetadata>,
12-
parsePageFn: (browser: Browser, url: string) => Promise<ParseMetadata | null>,
11+
parsedPages: Map<string, ParsedMetadata>,
12+
parsePageFn: (
13+
browser: Browser,
14+
url: string,
15+
) => Promise<ParsedMetadata | null>,
1316
baseOrigin: string,
1417
baseScope: string,
1518
baseHostToken: string,
16-
navigationTimeout = 30000
19+
navigationTimeout = 30000,
1720
): Promise<void> {
1821
const visitKey = buildVisitKey(node.url);
1922
if (parsedPages.has(visitKey) || depth > maxDepth) {
2023
return;
2124
}
22-
2325
const normalizedUrl = UrlWithoutAnchors(node.url);
2426
if (!isWithinScope(normalizedUrl, baseScope, baseHostToken)) {
2527
return;
2628
}
27-
2829
const metadata = await parsePageFn(browser, node.url);
2930
if (!metadata) return;
30-
3131
parsedPages.set(visitKey, metadata);
3232
node.title = metadata.title;
3333
node.bodyText = metadata.bodyText;
3434
node.lang = metadata.lang;
3535
node.keywords = metadata.keywords;
3636
node.datePublished = metadata.datePublished;
3737
node.lastModified = metadata.lastModified;
38-
3938
let page;
4039
let anchors: string[] = [];
4140
try {
4241
page = await browser.newPage();
43-
await page.goto(node.url, { waitUntil: 'networkidle2', timeout: navigationTimeout });
42+
await page.goto(node.url, {
43+
waitUntil: "networkidle2",
44+
timeout: navigationTimeout,
45+
});
4446
await expandInteractiveSections(page);
45-
anchors = await page.evaluate((allowedToken: string) => {
46-
const anchors = Array.from(document.querySelectorAll('a[href]'));
47-
const iframeSources = Array.from(document.querySelectorAll('iframe[src]'));
47+
anchors = (await page.evaluate((allowedToken: string) => {
48+
const anchors = Array.from(document.querySelectorAll("a[href]"));
49+
const iframeSources = Array.from(
50+
document.querySelectorAll("iframe[src]"),
51+
);
4852
const unique = new Set<string>();
4953
for (const anchor of anchors) {
5054
const href = (anchor as HTMLAnchorElement).href;
51-
if (!href || !href.startsWith('http')) continue;
55+
if (!href || !href.startsWith("http")) continue;
5256
try {
5357
const target = new URL(href, window.location.href);
5458
const normalizedHref = target.href.toLowerCase();
@@ -59,10 +63,9 @@ export async function parsePages(
5963
console.warn(`Failed to parse anchor href: ${href}`, error);
6064
}
6165
}
62-
6366
for (const frame of iframeSources) {
6467
const src = (frame as HTMLIFrameElement).src;
65-
if (!src || !src.startsWith('http')) {
68+
if (!src || !src.startsWith("http")) {
6669
continue;
6770
}
6871
try {
@@ -75,20 +78,19 @@ export async function parsePages(
7578
}
7679
}
7780
return Array.from(unique);
78-
}, baseHostToken) as string[];
81+
}, baseHostToken)) as string[];
7982
} catch (error) {
8083
console.warn(`Failed to extract anchors from ${node.url}`, error);
8184
} finally {
8285
if (page) await page.close();
8386
}
84-
85-
8687
const scheduled = new Set<string>();
87-
const nextChildren: ParseNode[] = [];
88+
const nextChildren: ParsedNode[] = [];
8889
for (const href of anchors) {
8990
const normalized = UrlWithoutAnchors(href);
9091
const visitCandidate = buildVisitKey(href);
91-
if (parsedPages.has(visitCandidate) || scheduled.has(visitCandidate)) continue;
92+
if (parsedPages.has(visitCandidate) || scheduled.has(visitCandidate))
93+
continue;
9294
const lowerNormalized = normalized.toLowerCase();
9395
if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
9496
continue;
@@ -100,7 +102,6 @@ export async function parsePages(
100102
nextChildren.push({ url: href });
101103
}
102104
node.children = nextChildren;
103-
104105
if (!node.children || depth >= maxDepth) return;
105106
for (const child of node.children) {
106107
await parsePages(
@@ -112,7 +113,7 @@ export async function parsePages(
112113
parsePageFn,
113114
baseOrigin,
114115
baseScope,
115-
baseHostToken
116+
baseHostToken,
116117
);
117118
}
118119
}
@@ -133,13 +134,13 @@ function isWithinScope(url: string, scope: string, hostToken: string): boolean {
133134
return false;
134135
}
135136
const nextChar = lowerUrl.charAt(lowerScope.length);
136-
return nextChar === '/' || nextChar === '?' || nextChar === '#';
137+
return nextChar === "/" || nextChar === "?" || nextChar === "#";
137138
}
138139

139140
export function buildVisitKey(rawUrl: string): string {
140141
try {
141142
const url = new URL(rawUrl);
142-
url.hash = '';
143+
url.hash = "";
143144
return UrlWithoutAnchors(url.toString());
144145
} catch (_error) {
145146
return rawUrl;

0 commit comments

Comments
 (0)