Skip to content

Commit

Permalink
feat: implement wait for dynamic content timeout
Browse files Browse the repository at this point in the history
  • Loading branch information
Patai5 committed Mar 30, 2024
1 parent 37e4b25 commit 8d0259e
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 1 deletion.
2 changes: 2 additions & 0 deletions code/src/configuration.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const ajv = new Ajv.default();
*/
export const parseConfiguration = async (input: Input): Promise<Config> => {
const {
dynamicContentWaitSecs = 0,
excludeUrlGlobs,
includeUrlGlobs,
initialCookies,
Expand Down Expand Up @@ -51,6 +52,7 @@ export const parseConfiguration = async (input: Input): Promise<Config> => {
const maxCrawlingDepth = input.maxCrawlingDepth || Number.POSITIVE_INFINITY;

return {
dynamicContentWaitSecs,
excludeUrlGlobs,
includeUrlGlobs,
initialCookies,
Expand Down
17 changes: 16 additions & 1 deletion code/src/routes/crawl-route.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { Actor } from 'apify';
import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, utils } from 'crawlee';
import { Dataset, KeyValueStore, NonRetryableError, PlaywrightCrawlingContext, log, sleep, utils } from 'crawlee';
import { Page } from 'playwright';

import { validateInputCssSelectors } from '../configuration.js';
import { ERROR_OCCURRED_MESSAGE, NonRetryableOpenaiAPIError, OpenaiAPIErrorToExitActor } from '../errors.js';
import { getNumberOfTextTokens, htmlToMarkdown, maybeShortsTextByTokenLength, shrinkHtml } from '../processors.js';
Expand All @@ -17,6 +19,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {

const state = await crawler.useState<CrawlerState>();
const {
dynamicContentWaitSecs,
excludeUrlGlobs,
includeUrlGlobs,
instructions,
Expand Down Expand Up @@ -64,6 +67,7 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {

log.info(`Opening ${url}...`);

await waitForDynamicContent(page, dynamicContentWaitSecs);
await closeCookieModals();

// Enqueue links
Expand Down Expand Up @@ -209,3 +213,14 @@ export const crawlRoute = async (context: PlaywrightCrawlingContext) => {
},
});
};

/**
 * Waits for dynamic content to load on the page.
 * - Waits for up to `timeoutS` seconds, but returns early once the network is idle (loaded all resources).
 * - The wait is best-effort: a Playwright `TimeoutError` raised while waiting for the network to become idle
 *   is swallowed, so a page that never goes idle does not fail the request. Any other error is rethrown.
 *
 * @param page Playwright page whose load state is awaited.
 * @param timeoutS Maximum time to wait, in seconds. A value that is not positive skips the wait entirely.
 * @returns A promise that settles once the network is idle or the timeout has elapsed, whichever comes first.
 */
const waitForDynamicContent = async (page: Page, timeoutS: number): Promise<void> => {
    // Nothing to wait for. This also avoids handing `timeout: 0` to Playwright, which disables its timeout.
    if (!(timeoutS > 0)) return;

    const timeoutMs = timeoutS * 1000;

    // Bound Playwright's own wait by the same deadline; otherwise its default timeout could reject
    // before `timeoutS` has passed and abort the race (and the whole request) with a TimeoutError.
    const networkIdlePromise = page
        .waitForLoadState('networkidle', { timeout: timeoutMs })
        .catch((error: unknown) => {
            // Running out of time is the expected outcome for pages that never become idle.
            if (error instanceof Error && error.name === 'TimeoutError') return;
            throw error;
        });
    const timeoutPromise = sleep(timeoutMs);

    await Promise.race([networkIdlePromise, timeoutPromise]);
};
1 change: 1 addition & 0 deletions code/src/types/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { OpenAIModelSettings } from './models.js';
* Parsed input configuration.
*/
export interface Config {
dynamicContentWaitSecs: number;
excludeUrlGlobs?: GlobInput[];
includeUrlGlobs?: GlobInput[];
initialCookies?: Cookie[];
Expand Down
1 change: 1 addition & 0 deletions code/src/types/input.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { ValuesOf } from './utils.js';
* Input schema in TypeScript format.
*/
export type Input = (OpenAIModelSettings & DeprecatedInput) & {
dynamicContentWaitSecs?: number;
startUrls: RequestOptions[];
includeUrlGlobs?: GlobInput[];
excludeUrlGlobs?: GlobInput[];
Expand Down

0 comments on commit 8d0259e

Please sign in to comment.