Skip to content

Commit c65fbee

Browse files
teng-linclaude
andauthored
refactor: decompose monolithic modules + sliding-window crawler (#2)
* refactor: decompose monolithic modules + sliding-window crawler - Decompose content-extractors.ts (1281 → ~400 lines) into 7 focused sub-modules: metadata, readability, selector, json-ld, next-data, text-density, and RSC extractors. Barrel re-exports preserve all existing import paths. - Decompose http-fetch.ts (1074 → ~700 lines) by extracting WP REST API logic into wp-rest-api.ts and Next.js data route logic into next-data-route.ts. - Replace crawler batch-wait (Promise.all) with sliding-window concurrency (Promise.race + Map) for better throughput with non-uniform fetch times. - Promote RequestContext interface to shared fetch/types.ts. - Eliminate duplicate htmlToText by importing shared utility. - Extract attachRawHtml helper to reduce repeated keepRawHtml patterns. All 683 tests pass unchanged — no test modifications needed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: sanitize HTML in JSON-LD articleBody extraction JSON-LD articleBody/text fields can contain raw HTML with script tags, event handlers, and dangerous URI schemes. Apply sanitizeHtml to strip dangerous elements and htmlToText to produce clean textContent, matching the pattern used by next-data-extractor. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 09eb3dd commit c65fbee

13 files changed

Lines changed: 1504 additions & 1365 deletions

src/crawl/crawler.ts

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -147,19 +147,6 @@ interface ProcessOptions {
147147
onBlocked: () => void;
148148
}
149149

150-
/**
151-
* Collect up to `count` entries from the frontier.
152-
*/
153-
function collectBatch(frontier: UrlFrontier, count: number): { url: string; depth: number }[] {
154-
const batch: { url: string; depth: number }[] = [];
155-
while (batch.length < count && frontier.hasMore()) {
156-
const entry = frontier.next();
157-
if (!entry) break;
158-
batch.push(entry);
159-
}
160-
return batch;
161-
}
162-
163150
/**
164151
* Check if a URL is allowed by robots.txt Disallow rules.
165152
* Returns false (blocked) if the URL cannot be parsed.
@@ -173,28 +160,36 @@ function isUrlAllowed(url: string, disallowPaths: string[]): boolean {
173160
}
174161

175162
/**
176-
* Process frontier entries with concurrency control.
177-
* When discoverLinks is true, extracts links from fetched pages for BFS crawling.
163+
* Process frontier entries with sliding-window concurrency control.
164+
* Uses a Map-based inflight tracker so that each completed request
165+
* immediately frees a slot for the next URL, rather than waiting
166+
* for the entire batch to finish.
178167
*/
179168
async function* processFrontier(
180169
frontier: UrlFrontier,
181170
opts: ProcessOptions
182171
): AsyncGenerator<CrawlResult> {
183172
const { concurrency, delay, disallowPaths, options, discoverLinks, onResult, onBlocked } = opts;
184173

185-
while (frontier.hasMore()) {
186-
const batch = collectBatch(frontier, concurrency);
187-
if (batch.length === 0) break;
174+
let nextId = 0;
175+
const inflight = new Map<number, Promise<{ id: number; result: CrawlResult | null }>>();
176+
177+
function enqueue(): void {
178+
while (inflight.size < concurrency && frontier.hasMore()) {
179+
const entry = frontier.next();
180+
if (!entry) break;
181+
const id = nextId++;
182+
183+
const promise = (async () => {
184+
if (delay > 0) await new Promise((r) => setTimeout(r, delay));
188185

189-
const results = await Promise.all(
190-
batch.map(async (entry) => {
191186
if (!isUrlAllowed(entry.url, disallowPaths)) {
192187
logger.debug({ url: entry.url }, 'Blocked by robots.txt');
193188
onBlocked();
194-
return null;
189+
return { id, result: null };
195190
}
196191

197-
const result = await httpFetch(entry.url, {
192+
const fetchResult = await httpFetch(entry.url, {
198193
preset: options.preset,
199194
timeout: options.timeout,
200195
targetSelector: options.targetSelector,
@@ -204,24 +199,32 @@ async function* processFrontier(
204199
cookies: options.cookies,
205200
});
206201

207-
if (discoverLinks && result.success && result.rawHtml) {
208-
frontier.addAll(extractLinks(result.rawHtml, entry.url), entry.depth + 1);
209-
result.rawHtml = null;
202+
if (discoverLinks && fetchResult.success && fetchResult.rawHtml) {
203+
frontier.addAll(extractLinks(fetchResult.rawHtml, entry.url), entry.depth + 1);
204+
fetchResult.rawHtml = null;
210205
}
211206

212-
return { ...result, depth: entry.depth } as CrawlResult;
213-
})
214-
);
207+
return { id, result: { ...fetchResult, depth: entry.depth } as CrawlResult };
208+
})();
215209

216-
for (const result of results) {
217-
if (result) {
218-
onResult(result.success);
219-
yield result;
220-
}
210+
inflight.set(id, promise);
221211
}
212+
}
213+
214+
// Fill initial window
215+
enqueue();
222216

223-
if (delay > 0) {
224-
await new Promise((resolve) => setTimeout(resolve, delay));
217+
while (inflight.size > 0) {
218+
// Race all inflight promises — settled includes the id for Map deletion
219+
const settled = await Promise.race(inflight.values());
220+
inflight.delete(settled.id);
221+
222+
if (settled.result) {
223+
onResult(settled.result.success);
224+
yield settled.result;
225225
}
226+
227+
// Refill window
228+
enqueue();
226229
}
227230
}

0 commit comments

Comments
 (0)