Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"actorSpecification": 1,
"name": "project-stagehand-crawler-javascript",
"title": "Project Stagehand Crawler JavaScript",
"description": "Crawlee and Stagehand project in JavaScript.",
"version": "0.0",
"meta": {
"templateId": "js-crawlee-stagehand-chrome",
"generatedBy": "<FILL-IN-MODEL>"
},
"input": "./input_schema.json",
"output": "./output_schema.json",
"storages": {
"dataset": "./dataset_schema.json"
},
"dockerfile": "../Dockerfile",
"environmentVariables": {
"OPENAI_API_KEY": "@OPENAI_API_KEY"
}
}
33 changes: 33 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.actor/dataset_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"actorSpecification": 1,
"fields": {},
"views": {
"overview": {
"title": "Overview",
"transformation": {
"fields": ["title", "heading", "description", "url"]
},
"display": {
"component": "table",
"properties": {
"title": {
"label": "Page Title",
"format": "text"
},
"heading": {
"label": "Heading",
"format": "text"
},
"description": {
"label": "Description",
"format": "text"
},
"url": {
"label": "URL",
"format": "link"
}
}
}
}
}
}
18 changes: 18 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.actor/input_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"title": "StagehandCrawler Template",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "Start URLs",
"type": "array",
"description": "URLs to start with.",
"editor": "requestListSources",
"prefill": [
{
"url": "https://apify.com"
}
]
}
}
}
11 changes: 11 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.actor/output_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"actorOutputSchemaVersion": 1,
"title": "Output schema",
"properties": {
"overview": {
"type": "string",
"title": "Overview",
"template": "{{links.apiDefaultDatasetUrl}}/items?view=overview"
}
}
}
15 changes: 15 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# configurations
.idea
.vscode
.zed

# crawlee and apify storage folders
apify_storage
crawlee_storage
storage

# installed files
node_modules

# git folder
.git
10 changes: 10 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
root = true

[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
quote_type = single
10 changes: 10 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# This file tells Git which files shouldn't be added to source control

.DS_Store
.idea
.vscode
.zed
dist
node_modules
apify_storage
storage
5 changes: 5 additions & 0 deletions templates/js-crawlee-stagehand-chrome/.prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"printWidth": 120,
"tabWidth": 4,
"singleQuote": true
}
34 changes: 34 additions & 0 deletions templates/js-crawlee-stagehand-chrome/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Build recipe for the Actor image: start from a prebuilt Playwright + Chrome base,
# install production dependencies in a cached layer, then add the source code.

# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
# NOTE(review): the tag appears to encode Node.js 22 and Playwright 1.58.1; package.json
# pins "playwright" to the same 1.58.1, so keep the two in sync when upgrading.
FROM apify/actor-node-playwright-chrome:22-1.58.1

# Print which of these packages are already installed in the base image (build-log diagnostics).
RUN npm ls @crawlee/core apify puppeteer playwright

# Copy only package.json, package-lock.json and the Dockerfile itself for now,
# to speed up the build using Docker layer cache: the layers below are rebuilt
# only when one of these files changes, not on every source edit.
COPY --chown=myuser:myuser package*.json Dockerfile ./

# Check that the Playwright version is the same as the one from the base image.
# NOTE(review): check-playwright-version.mjs is not part of this template; presumably it is
# shipped with the base image and reads the files copied above. Confirm before changing base images.
RUN node check-playwright-version.mjs

# Install NPM packages, skipping optional and development dependencies to
# keep the image small. Avoid logging too much, and print the dependency
# tree plus the Node.js and NPM versions for debugging. `npm list` is allowed
# to fail (`|| true`) so that it cannot break the build, and the NPM cache
# directory is removed at the end of the same layer.
RUN npm --quiet set progress=false \
&& npm install --omit=dev --omit=optional \
&& echo "Installed NPM packages:" \
&& (npm list --omit=dev --all || true) \
&& echo "Node.js version:" \
&& node --version \
&& echo "NPM version:" \
&& npm --version \
&& rm -r ~/.npm

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick builds will be really fast
# for most source file changes.
COPY --chown=myuser:myuser . ./

# Start the Actor; this is the same command as the "start" script in package.json.
CMD ["node", "src/main.js"]
23 changes: 23 additions & 0 deletions templates/js-crawlee-stagehand-chrome/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
## StagehandCrawler template

<!-- This is an Apify template readme -->

This template is a production-ready boilerplate for developing an [Actor](https://apify.com/actors) with `StagehandCrawler`. It combines [Crawlee](https://crawlee.dev)'s powerful crawling infrastructure with [Stagehand](https://github.com/browserbase/stagehand)'s AI-powered browser automation, enabling you to interact with web pages and extract structured data using natural language instructions.

The template demonstrates two key Stagehand capabilities:
- **`act()`** — Perform actions on a page using natural language (e.g., closing cookie dialogs)
- **`extract()`** — Extract structured data from pages using Zod schemas and AI

## Resources

If you're looking for examples or want to learn more, visit:

- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform)
- [Stagehand documentation](https://docs.stagehand.dev)
- [Crawlee documentation](https://crawlee.dev/docs/quick-start)
- [Node.js tutorials](https://docs.apify.com/academy/node-js) in Academy
- [Integration with Zapier](https://apify.com/integrations), Make, GitHub, Google Drive and other apps
- [Video guide on getting data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM)
- A short guide on how to create Actors using code templates:

[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w)
6 changes: 6 additions & 0 deletions templates/js-crawlee-stagehand-chrome/eslint.config.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import prettier from 'eslint-config-prettier';

import apify from '@apify/eslint-config/js.js';

// Flat ESLint config, listed in the order it is applied: first ignore build output
// in any `dist` folder, then the shared Apify rule set, and finally the Prettier
// config, which comes last so that it can override the entries before it.
const eslintConfig = [{ ignores: ['**/dist'] }, ...apify, prettier];

// eslint-disable-next-line import/no-default-export
export default eslintConfig;
30 changes: 30 additions & 0 deletions templates/js-crawlee-stagehand-chrome/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"name": "crawlee-stagehand-javascript",
"version": "0.0.1",
"type": "module",
"description": "This is an example of an Apify Actor.",
"dependencies": {
"apify": "^3.5.2",
"@crawlee/stagehand": "^3.16.0",
"playwright": "1.58.1",
"openai": "^4.96.0",
"zod": "^3.25.0"
},
"devDependencies": {
"@apify/eslint-config": "^1.0.0",
"eslint": "^9.29.0",
"eslint-config-prettier": "^10.1.5",
"prettier": "^3.5.3"
},
"scripts": {
"start": "node src/main.js",
"format": "prettier --write .",
"format:check": "prettier --check .",
"lint": "eslint",
"lint:fix": "eslint --fix",
"test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1",
"postinstall": "npx crawlee install-playwright-browsers"
},
"author": "It's not you it's me",
"license": "ISC"
}
44 changes: 44 additions & 0 deletions templates/js-crawlee-stagehand-chrome/src/main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/**
* This template is a production-ready boilerplate for developing with `StagehandCrawler`.
* Use this to bootstrap your projects using the most up-to-date code.
* If you're looking for examples or want to learn more, see README.
*/

// For more information, see https://crawlee.dev
import { StagehandCrawler } from '@crawlee/stagehand';
// For more information, see https://docs.apify.com/sdk/js
import { Actor } from 'apify';

// This is an ESM project, and as such, it requires you to specify extensions in your relative imports.
// Read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions
import { router } from './routes.js';

// Boot the Apify SDK before reading input or creating the proxy configuration.
await Actor.init();

// Read the Actor input; when it is missing or has no `startUrls`, crawl the Apify homepage.
const input = (await Actor.getInput()) ?? {};
const { startUrls = ['https://apify.com'] } = input;

// The `checkAccess` flag verifies that the proxy credentials are valid. The check can take
// a few hundred milliseconds, so you may disable it for short runs if you are sure that
// your proxy configuration is correct.
const proxyConfiguration = await Actor.createProxyConfiguration({ checkAccess: true });

// Stagehand (AI) settings: the model to use and the API key, which is read from the
// OPENAI_API_KEY environment variable.
const stagehandOptions = {
    model: 'openai/gpt-4o-mini',
    apiKey: process.env.OPENAI_API_KEY,
};

// Browser launch settings.
const launchContext = {
    launchOptions: {
        // Mitigates the "crashing GPU process" issue in Docker containers
        args: ['--disable-gpu'],
    },
};

// Every request is dispatched to the handlers registered on the router from ./routes.js.
const crawler = new StagehandCrawler({
    proxyConfiguration,
    requestHandler: router,
    stagehandOptions,
    launchContext,
});

// Crawl, starting from the configured URLs.
await crawler.run(startUrls);

// Exit successfully
await Actor.exit();
35 changes: 35 additions & 0 deletions templates/js-crawlee-stagehand-chrome/src/routes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { createStagehandRouter, Dataset } from '@crawlee/stagehand';
import { z } from 'zod';

// Router shared with src/main.js, where it is passed to the crawler as `requestHandler`.
export const router = createStagehandRouter();

/**
 * Default route, used for requests that carry no label (such as the start URLs).
 * It does not scrape anything itself: it only enqueues links matching the
 * apify.com glob and labels them 'detail', so the 'detail' handler processes them.
 */
router.addDefaultHandler(async ({ log, enqueueLinks }) => {
    const detailLinkOptions = {
        globs: ['https://apify.com/*'],
        label: 'detail',
    };

    log.info('enqueueing new URLs');
    await enqueueLinks(detailLinkOptions);
});

// Shape of the structured data requested from Stagehand for every detail page.
const pageSummarySchema = z.object({
    heading: z.string().describe('The main heading of the page'),
    description: z.string().describe('A brief description or subtitle of the page'),
});

/**
 * Handler for requests labelled 'detail'. It logs the page title, asks Stagehand to
 * dismiss cookie dialogs or popups, extracts a heading and a description with the
 * schema above, and stores everything together with the loaded URL in the dataset.
 */
router.addHandler('detail', async ({ request, page, log }) => {
    const title = await page.title();
    log.info(title, { url: request.loadedUrl });

    // Stagehand act(): perform an action described in natural language.
    await page.act('Close any cookie consent dialogs or popups if present');

    // Stagehand extract(): pull structured data that conforms to the Zod schema.
    const summary = await page.extract({
        instruction: 'Extract the main heading and a brief description of this page',
        schema: pageSummarySchema,
    });

    // One dataset item per page: the URL and title plus the extracted fields.
    const record = {
        url: request.loadedUrl,
        title,
        ...summary,
    };
    await Dataset.pushData(record);
});
32 changes: 32 additions & 0 deletions templates/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,22 @@
"showcaseFiles": ["src/main.js", "src/routes.js"],
"useCases": ["WEB_SCRAPING"]
},
{
"id": "js-crawlee-stagehand-chrome",
"name": "project_stagehand_crawler_js",
"label": "Crawlee + Stagehand + Chrome",
"category": "javascript",
"technologies": ["nodejs", "crawlee", "stagehand", "chrome"],
"description": "AI-powered web scraper using Crawlee and Stagehand. Uses natural language to interact with pages and extract structured data using LLMs.",
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-crawlee-stagehand-chrome.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 4096,
"timeoutSecs": 3600
},
"showcaseFiles": ["src/main.js", "src/routes.js"],
"useCases": ["AI", "WEB_SCRAPING"]
},
{
"id": "js-crawlee-playwright-camoufox",
"name": "project_playwright_camoufox_crawler_js",
Expand Down Expand Up @@ -494,6 +510,22 @@
"showcaseFiles": ["src/main.ts", "src/routes.ts"],
"useCases": ["WEB_SCRAPING"]
},
{
"id": "ts-crawlee-stagehand-chrome",
"name": "project_stagehand_crawler_ts",
"label": "Crawlee + Stagehand + Chrome",
"category": "typescript",
"technologies": ["nodejs", "crawlee", "stagehand", "chrome"],
"description": "AI-powered web scraper using Crawlee and Stagehand. Uses natural language to interact with pages and extract structured data using LLMs.",
"archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-stagehand-chrome.zip?raw=true",
"defaultRunOptions": {
"build": "latest",
"memoryMbytes": 4096,
"timeoutSecs": 3600
},
"showcaseFiles": ["src/main.ts", "src/routes.ts"],
"useCases": ["AI", "WEB_SCRAPING"]
},
{
"id": "ts-crawlee-playwright-camoufox",
"name": "project_playwright_camoufox_crawler_ts",
Expand Down
20 changes: 20 additions & 0 deletions templates/ts-crawlee-stagehand-chrome/.actor/actor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"actorSpecification": 1,
"name": "project-stagehand-crawler-typescript",
"title": "Project Stagehand Crawler TypeScript",
"description": "Crawlee and Stagehand project in TypeScript.",
"version": "0.0",
"meta": {
"templateId": "ts-crawlee-stagehand-chrome",
"generatedBy": "<FILL-IN-MODEL>"
},
"input": "./input_schema.json",
"output": "./output_schema.json",
"storages": {
"dataset": "./dataset_schema.json"
},
"dockerfile": "../Dockerfile",
"environmentVariables": {
"OPENAI_API_KEY": "@OPENAI_API_KEY"
}
}
Loading
Loading