Skip to content

Commit 0c19cbb

Browse files
committed
Add progress log in recursive parsePages function
1 parent c8b1cab commit 0c19cbb

File tree

2 files changed

+93
-45
lines changed

2 files changed

+93
-45
lines changed

apps/parser/src/modules/crawler.ts

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export async function parsePages(
99
depth: number,
1010
maxDepth: number,
1111
parsedPages: Map<string, ParsedMetadata>,
12+
scheduledPages: Set<string>,
1213
parsePageFn: (
1314
browser: Browser,
1415
url: string,
@@ -19,6 +20,7 @@ export async function parsePages(
1920
navigationTimeout = 30000,
2021
): Promise<void> {
2122
const visitKey = buildVisitKey(node.url);
23+
scheduledPages.delete(visitKey);
2224
if (parsedPages.has(visitKey) || depth > maxDepth) {
2325
return;
2426
}
@@ -84,12 +86,12 @@ export async function parsePages(
8486
} finally {
8587
if (page) await page.close();
8688
}
87-
const scheduled = new Set<string>();
8889
const nextChildren: ParsedNode[] = [];
90+
let newLinksCount = 0;
8991
for (const href of anchors) {
9092
const normalized = UrlWithoutAnchors(href);
9193
const visitCandidate = buildVisitKey(href);
92-
if (parsedPages.has(visitCandidate) || scheduled.has(visitCandidate))
94+
if (parsedPages.has(visitCandidate) || scheduledPages.has(visitCandidate))
9395
continue;
9496
const lowerNormalized = normalized.toLowerCase();
9597
if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
@@ -98,10 +100,19 @@ export async function parsePages(
98100
if (!isWithinScope(normalized, baseScope, baseHostToken)) {
99101
continue;
100102
}
101-
scheduled.add(visitCandidate);
103+
scheduledPages.add(visitCandidate);
104+
newLinksCount += 1;
102105
nextChildren.push({ url: href });
103106
}
104107
node.children = nextChildren;
108+
const totalKnown = parsedPages.size + scheduledPages.size;
109+
console.log(
110+
`Completed parsing of page ${
111+
node.url
112+
}. Found ${newLinksCount} new links. Progress: ${
113+
parsedPages.size
114+
}/${totalKnown} (${((parsedPages.size / totalKnown) * 100).toFixed(2)}%)`,
115+
);
105116
if (!node.children || depth >= maxDepth) return;
106117
for (const child of node.children) {
107118
await parsePages(
@@ -110,6 +121,7 @@ export async function parsePages(
110121
depth + 1,
111122
maxDepth,
112123
parsedPages,
124+
scheduledPages,
113125
parsePageFn,
114126
baseOrigin,
115127
baseScope,

apps/parser/src/parser.ts

Lines changed: 78 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
1-
2-
import puppeteer from 'puppeteer-extra';
3-
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
4-
import { Browser, Page } from 'puppeteer';
5-
import { resolveEnv } from './modules/config';
6-
import { ensureDirectory, saveMetadata } from './modules/output';
7-
import { handleError } from './modules/errors';
8-
import { parsePages } from './modules/crawler';
9-
import { expandInteractiveSections } from './modules/dom-actions';
10-
import { ParsedNode, ParsedMetadata } from './modules/types';
11-
import { sanitizeUrlAsFilename, UrlWithoutAnchors, deriveSubPath } from './helpers/url-handling';
12-
import { assertReachable } from './modules/network';
13-
import { toIsoOrNull } from './helpers/date-format';
14-
import crypto from 'crypto';
1+
import puppeteer from "puppeteer-extra";
2+
import StealthPlugin from "puppeteer-extra-plugin-stealth";
3+
import { Browser, Page } from "puppeteer";
4+
import { resolveEnv } from "./modules/config";
5+
import { ensureDirectory, saveMetadata } from "./modules/output";
6+
import { handleError } from "./modules/errors";
7+
import { buildVisitKey, parsePages } from "./modules/crawler";
8+
import { expandInteractiveSections } from "./modules/dom-actions";
9+
import { ParsedNode, ParsedMetadata } from "./modules/types";
10+
import {
11+
sanitizeUrlAsFilename,
12+
UrlWithoutAnchors,
13+
deriveSubPath,
14+
} from "./helpers/url-handling";
15+
import { assertReachable } from "./modules/network";
16+
import { toIsoOrNull } from "./helpers/date-format";
17+
import crypto from "crypto";
1518

1619
puppeteer.use(StealthPlugin());
1720

@@ -21,6 +24,7 @@ const FILENAME_LENGTH_THRESHOLD = 255;
2124

2225
const env = resolveEnv();
2326
const parsedPages = new Map<string, ParsedMetadata>();
27+
const scheduledPages = new Set<string>();
2428

2529
void (async () => {
2630
try {
@@ -31,18 +35,22 @@ void (async () => {
3135
const baseUrlObject = new URL(env.baseUrl);
3236
const baseOrigin = baseUrlObject.origin;
3337
const baseScope = UrlWithoutAnchors(env.baseUrl);
34-
const baseHostToken = baseUrlObject.hostname.replace(/^www\./, '').toLowerCase();
38+
const baseHostToken = baseUrlObject.hostname
39+
.replace(/^www\./, "")
40+
.toLowerCase();
41+
scheduledPages.add(buildVisitKey(env.baseUrl));
3542
await parsePages(
3643
browser,
3744
root,
3845
0,
3946
env.maxDepth,
4047
parsedPages,
48+
scheduledPages,
4149
parsePageFn,
4250
baseOrigin,
4351
baseScope,
4452
baseHostToken,
45-
NAVIGATION_TIMEOUT_MS
53+
NAVIGATION_TIMEOUT_MS,
4654
);
4755
await browser.close();
4856
console.log(`Parsing complete! Data saved to ${env.outputDirectory}`);
@@ -51,11 +59,17 @@ void (async () => {
5159
}
5260
})();
5361

54-
async function parsePageFn(browser: Browser, url: string): Promise<ParsedMetadata | null> {
62+
async function parsePageFn(
63+
browser: Browser,
64+
url: string,
65+
): Promise<ParsedMetadata | null> {
5566
let page: Page | undefined;
5667
try {
5768
page = await browser.newPage();
58-
await page.goto(url, { waitUntil: 'networkidle2', timeout: NAVIGATION_TIMEOUT_MS });
69+
await page.goto(url, {
70+
waitUntil: "networkidle2",
71+
timeout: NAVIGATION_TIMEOUT_MS,
72+
});
5973
await expandInteractiveSections(page);
6074
const rawMetadata = await page.evaluate(extractDocumentMetadata);
6175
const snapshot = serializeMetadata(rawMetadata);
@@ -71,15 +85,28 @@ async function parsePageFn(browser: Browser, url: string): Promise<ParsedMetadat
7185
}
7286
}
7387

74-
async function persistSnapshot(snapshot: ParsedMetadata, FILENAME_LENGTH_THRESHOLD: number): Promise<void> {
75-
const subPath = deriveSubPath(snapshot.url, env.baseUrl, env.sanitizedBaseUrl);
76-
const preferredName = subPath === '/' ? 'root' : subPath;
77-
const sanitizedName = sanitizeUrlAsFilename(preferredName, { replacement: '-' });
78-
const trimmedName = sanitizedName.replace(/^[-_]+/, '') || sanitizedName;
88+
async function persistSnapshot(
89+
snapshot: ParsedMetadata,
90+
FILENAME_LENGTH_THRESHOLD: number,
91+
): Promise<void> {
92+
const subPath = deriveSubPath(
93+
snapshot.url,
94+
env.baseUrl,
95+
env.sanitizedBaseUrl,
96+
);
97+
const preferredName = subPath === "/" ? "root" : subPath;
98+
const sanitizedName = sanitizeUrlAsFilename(preferredName, {
99+
replacement: "-",
100+
});
101+
const trimmedName = sanitizedName.replace(/^[-_]+/, "") || sanitizedName;
79102
let finalName = trimmedName;
80103
if (trimmedName.length > FILENAME_LENGTH_THRESHOLD) {
81104
const normalizedUrl = UrlWithoutAnchors(snapshot.url);
82-
const hash = crypto.createHash('sha1').update(normalizedUrl).digest('hex').slice(0, 10);
105+
const hash = crypto
106+
.createHash("sha1")
107+
.update(normalizedUrl)
108+
.digest("hex")
109+
.slice(0, 10);
83110
const prefix = trimmedName.slice(0, 240);
84111
finalName = `${prefix}_${hash}`;
85112
}
@@ -101,42 +128,51 @@ function serializeMetadata(raw: ParsedMetadata): ParsedMetadata {
101128
const extractDocumentMetadata = (): ParsedMetadata => {
102129
const getMeta = (name: string): string | null => {
103130
return (
104-
document.querySelector(`meta[name="${name}"]`)?.getAttribute('content') ||
105-
document.querySelector(`meta[property="${name}"]`)?.getAttribute('content') ||
131+
document.querySelector(`meta[name="${name}"]`)?.getAttribute("content") ||
132+
document
133+
.querySelector(`meta[property="${name}"]`)
134+
?.getAttribute("content") ||
106135
null
107136
);
108137
};
109-
const metaTitle = getMeta('og:title') || getMeta('twitter:title');
138+
const metaTitle = getMeta("og:title") || getMeta("twitter:title");
110139
const documentTitle = document.title?.trim();
111-
const normalizedTitle = documentTitle?.length ? documentTitle : metaTitle || '';
140+
const normalizedTitle = documentTitle?.length
141+
? documentTitle
142+
: metaTitle || "";
112143
const normalizeText = (value: string | null | undefined): string => {
113-
return value ? value.replace(/\s+/g, ' ').trim() : '';
144+
return value ? value.replace(/\s+/g, " ").trim() : "";
114145
};
115-
const mainText = normalizeText(document.querySelector('main')?.innerText);
116-
const iframeTexts = Array.from(document.querySelectorAll('iframe'))
146+
const mainText = normalizeText(document.querySelector("main")?.innerText);
147+
const iframeTexts = Array.from(document.querySelectorAll("iframe"))
117148
.map((frame) => {
118149
try {
119-
return normalizeText(frame.contentDocument?.body?.innerText ?? '');
150+
return normalizeText(frame.contentDocument?.body?.innerText ?? "");
120151
} catch (_error) {
121-
return '';
152+
return "";
122153
}
123154
})
124155
.filter((text) => text.length > 0);
125-
const prioritizedTextParts = [mainText, ...iframeTexts].filter((text) => text.length > 0);
126-
const prioritizedText = prioritizedTextParts.join('\n\n').trim();
127-
const fallbackBody = normalizeText(document.body?.innerText ?? '');
128-
const bodyText = prioritizedText.length >= 120 ? prioritizedText : fallbackBody;
156+
const prioritizedTextParts = [mainText, ...iframeTexts].filter(
157+
(text) => text.length > 0,
158+
);
159+
const prioritizedText = prioritizedTextParts.join("\n\n").trim();
160+
const fallbackBody = normalizeText(document.body?.innerText ?? "");
161+
const bodyText =
162+
prioritizedText.length >= 120 ? prioritizedText : fallbackBody;
129163
return {
130164
title: normalizedTitle,
131165
url: window.location.href,
132166
bodyText,
133-
lang: document.documentElement.lang || getMeta('og:locale') || null,
134-
keywords: getMeta('keywords') || getMeta('news_keywords'),
167+
lang: document.documentElement.lang || getMeta("og:locale") || null,
168+
keywords: getMeta("keywords") || getMeta("news_keywords"),
135169
datePublished:
136-
getMeta('article:published_time') || getMeta('date') || getMeta('publish-date'),
170+
getMeta("article:published_time") ||
171+
getMeta("date") ||
172+
getMeta("publish-date"),
137173
lastModified:
138-
document.lastModified !== '01/01/1970 00:00:00'
174+
document.lastModified !== "01/01/1970 00:00:00"
139175
? document.lastModified
140-
: getMeta('article:modified_time'),
176+
: getMeta("article:modified_time"),
141177
};
142178
};

0 commit comments

Comments (0)