Skip to content

Commit 8d0259e

Browse files
committed
feat: implement wait for dynamic content timeout
1 parent 37e4b25 commit 8d0259e

File tree

4 files changed

+20
-1
lines changed

4 files changed

+20
-1
lines changed

code/src/configuration.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ const ajv = new Ajv.default();
1616
*/
1717
export const parseConfiguration = async (input: Input): Promise<Config> => {
1818
const {
19+
dynamicContentWaitSecs = 0,
1920
excludeUrlGlobs,
2021
includeUrlGlobs,
2122
initialCookies,
@@ -51,6 +52,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
5152
const maxCrawlingDepth = input.maxCrawlingDepth || Number.POSITIVE_INFINITY;
5253

5354
return {
55+
dynamicContentWaitSecs,
5456
excludeUrlGlobs,
5557
includeUrlGlobs,
5658
initialCookies,

code/src/routes/crawl-route.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import { Actor } from 'apify';
2-
import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
2+
import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, sleep, utils } from 'crawlee';
3+
import { Page } from 'playwright';
4+
35
import { validateInputCssSelectors } from '../configuration.js';
46
import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors.js';
57
import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors.js';
@@ -17,6 +19,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
1719

1820
const state = await crawler.useState<CrawlerState>();
1921
const {
22+
dynamicContentWaitSecs,
2023
excludeUrlGlobs,
2124
includeUrlGlobs,
2225
instructions,
@@ -64,6 +67,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
6467

6568
log.info(`Opening ${url}...`);
6669

70+
await waitForDynamicContent(page, dynamicContentWaitSecs);
6771
await closeCookieModals();
6872

6973
// Enqueue links
@@ -209,3 +213,14 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
209213
},
210214
});
211215
};
216+
217+
/**
218+
* Waits for dynamic content to load on the page before the caller continues.
219+
* - Races `page.waitForLoadState('networkidle')` against `sleep(timeoutS * 1000)` and settles with whichever settles first, so it returns early once the network is idle (all resources loaded) and otherwise stops waiting when the sleep ends. `timeoutS` is presumably in seconds (it is multiplied by 1000 before being passed to `sleep`) - TODO confirm. NOTE(review): because this is a `Promise.race`, a rejection of the network-idle wait that happens before the sleep ends also rejects the returned promise - confirm that is intended.
220+
*/
221+
const waitForDynamicContent = async (page: Page, timeoutS: number) => {
222+
const networkIdlePromise = page.waitForLoadState('networkidle');
223+
const timeoutPromise = sleep(timeoutS * 1000);
224+
225+
return Promise.race([networkIdlePromise, timeoutPromise]);
226+
};

code/src/types/config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { OpenAIModelSettings } from './models.js';
88
* Parsed input configuration.
99
*/
1010
export interface Config {
11+
dynamicContentWaitSecs: number;
1112
excludeUrlGlobs?: GlobInput[];
1213
includeUrlGlobs?: GlobInput[];
1314
initialCookies?: Cookie[];

code/src/types/input.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { ValuesOf } from './utils.js';
88
* Input schema in TypeScript format.
99
*/
1010
export type Input = (OpenAIModelSettings & DeprecatedInput) & {
11+
dynamicContentWaitSecs?: number;
1112
startUrls: RequestOptions[];
1213
includeUrlGlobs?: GlobInput[];
1314
excludeUrlGlobs?: GlobInput[];

0 commit comments

Comments
 (0)