diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 0b6e95e..614f68f 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -21,7 +21,7 @@ "keywords": ["browser", "automation", "web-scraping", "stagehand", "screenshots"], "strict": false, "skills": [ - "./skills/browser-automation" + "./skills/browser" ] }, { diff --git a/README.md b/README.md index 6e4526b..86d6dfb 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ This plugin includes the following skills (see `skills/` for details): | Skill | Description | |-------|-------------| -| [browser-automation](skills/browser-automation/SKILL.md) | Browser control using natural language commands (navigate, click, extract data, screenshot) | +| [browser](skills/browser/SKILL.md) | Automate web browser interactions via CLI commands — supports remote Browserbase sessions with anti-bot stealth, CAPTCHA solving, and residential proxies | +| [agent-browser-remote](skills/agent-browser-remote/SKILL.md) | Use Agent Browser with Browserbase remote CDP sessions when local browsing hits CAPTCHAs, anti-bot checks, or JavaScript-heavy sites | +| [browse-fleet-subagents](skills/browse-fleet-subagents/SKILL.md) | Orchestrate high-volume browser work via sub-agent fanout with clear decomposition, retries, and cleanup patterns | | [functions](skills/functions/SKILL.md) | Deploy serverless browser automation to Browserbase cloud using the `bb` CLI | ## Installation @@ -60,4 +62,4 @@ rm -rf .chrome-profile ## Resources - [Stagehand Documentation](https://github.com/browserbase/stagehand) -- [Claude Code Skills](https://support.claude.com/en/articles/12512176-what-are-skills) \ No newline at end of file +- [Claude Code Skills](https://support.claude.com/en/articles/12512176-what-are-skills) diff --git a/skills/agent-browser-remote/LICENSE.txt b/skills/agent-browser-remote/LICENSE.txt new file mode 100644 index 0000000..f2f4397 --- /dev/null +++ 
b/skills/agent-browser-remote/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Browserbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/agent-browser-remote/SKILL.md b/skills/agent-browser-remote/SKILL.md new file mode 100644 index 0000000..8e2ce4f --- /dev/null +++ b/skills/agent-browser-remote/SKILL.md @@ -0,0 +1,94 @@ +--- +name: agent-browser-remote +description: Use Agent Browser with Browserbase remote CDP sessions. Trigger this skill when users are driving browser tasks with agent-browser and encounter CAPTCHAs, anti-bot pages, IP-based blocking, JavaScript-heavy sites, geo restrictions, or request remote cloud browser execution. +compatibility: "Requires `agent-browser` and Node.js. Requires Browserbase credentials (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`) for remote sessions." 
+license: MIT +allowed-tools: Bash +metadata: + capabilities: + - agent-browser + - remote-browser + - cdp-interop + - stealth + - captcha-solving + - proxies + openclaw: + requires: + bins: + - agent-browser + - node + install: + - kind: node + package: "agent-browser" + bins: [agent-browser] + homepage: https://github.com/browserbase/skills +--- + +# Agent Browser + Remote CDP + +Use this skill when `agent-browser` local mode struggles on protected sites, or when the user explicitly wants remote cloud browsers. + +## Setup check + +```bash +which agent-browser || npm install -g agent-browser +agent-browser install +``` + +Set Browserbase credentials (or run `openclaw browserbase setup`): + +```bash +export BROWSERBASE_API_KEY="..." +export BROWSERBASE_PROJECT_ID="..." +``` + +## Fast path + +Create a Browserbase session, then point `agent-browser` at the returned CDP URL. + +```bash +eval "$(node scripts/browserbase-session.mjs create --proxies true --advanced-stealth true --format shell)" +agent-browser --cdp "$BROWSERBASE_CDP_URL" open https://example.com +agent-browser --cdp "$BROWSERBASE_CDP_URL" snapshot -i --json +``` + +When done: + +```bash +node scripts/browserbase-session.mjs close --session-id "$BROWSERBASE_SESSION_ID" +``` + +## When to switch from local to remote + +Switch to remote when any of these appear: +- CAPTCHA or challenge pages (reCAPTCHA, hCaptcha, Turnstile) +- bot checks ("checking your browser", "verify you are human") +- repeated `403` / `429` from sites that should be accessible +- empty DOM/snapshot on JavaScript-heavy pages that should have content +- geo-specific content requirements + +Stay local for simple docs sites, localhost, and basic internal QA flows. 
/**
 * Browserbase session helper for `agent-browser` remote CDP workflows.
 *
 * Run as `node scripts/browserbase-session.mjs <command> [options]`:
 *   create  POST a new session and print its id / CDP connect URL.
 *   close   Request release of an existing session.
 *
 * Uses the runtime's global `fetch`; no third-party dependencies.
 */

const DEFAULT_API_BASE = process.env.BROWSERBASE_API_BASE_URL || "https://api.browserbase.com";

// Output formats accepted by `create --format`.
const OUTPUT_FORMATS = ["json", "shell", "url"];

// Spellings accepted by parseBool (compared case-insensitively).
const TRUE_WORDS = ["1", "true", "yes", "on"];
const FALSE_WORDS = ["0", "false", "no", "off"];

/** Print CLI usage to stderr. */
function usage() {
  console.error(
    [
      "Usage:",
      "  node scripts/browserbase-session.mjs create [options]",
      "  node scripts/browserbase-session.mjs close --session-id <id> [options]",
      "",
      "Create options:",
      "  --api-key <key>                  Browserbase API key (or BROWSERBASE_API_KEY)",
      "  --project-id <id>                Browserbase project ID (or BROWSERBASE_PROJECT_ID)",
      "  --proxies <true|false>           Enable proxies",
      "  --advanced-stealth <true|false>  Enable advanced stealth",
      "  --keep-alive <true|false>        Keep session alive on Browserbase",
      "  --format <json|shell|url>        Output format (default: json)",
      "  --api-base-url <url>             API base URL (default: https://api.browserbase.com)",
      "",
      "Close options:",
      "  --session-id <id>                Session ID to close (required)",
      "  --project-id <id>                Browserbase project ID (optional, or BROWSERBASE_PROJECT_ID)",
      "  --api-key <key>                  Browserbase API key (or BROWSERBASE_API_KEY)",
      "  --api-base-url <url>             API base URL",
    ].join("\n"),
  );
}

/**
 * Parse `--key value`, `--key=value` and bare `--flag` arguments.
 *
 * @param {string[]} argv - Arguments after the sub-command.
 * @returns {Record<string, string | true | string[]>} Map of option name to
 *   its value (`true` for a bare flag); positional arguments are collected
 *   under the `_` key.
 */
function parseArgs(argv) {
  const out = { _: [] };
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (!arg.startsWith("--")) {
      out._.push(arg);
      continue;
    }
    const body = arg.slice(2);
    const eq = body.indexOf("=");
    if (eq > 0) {
      // `--key=value`: everything after the first "=" is the value.
      out[body.slice(0, eq)] = body.slice(eq + 1);
      continue;
    }
    const next = argv[i + 1];
    // Compare against undefined (not falsiness) so an explicit empty-string
    // value is kept as a value instead of turning the option into a flag.
    if (next === undefined || next.startsWith("--")) {
      out[body] = true;
      continue;
    }
    out[body] = next;
    i += 1;
  }
  return out;
}

/**
 * Interpret a CLI value as a boolean.
 *
 * @param {unknown} value - Raw option value; `undefined` means "not given".
 * @param {string} name - Option name, used in the error message.
 * @returns {boolean | undefined} The parsed boolean, or `undefined` when the
 *   option was not supplied.
 * @throws {Error} When the value is not a recognised boolean spelling.
 */
function parseBool(value, name) {
  if (value === undefined) return undefined;
  if (value === true || value === false) return value;
  const normalized = String(value).trim().toLowerCase();
  if (TRUE_WORDS.includes(normalized)) return true;
  if (FALSE_WORDS.includes(normalized)) return false;
  throw new Error(`Invalid boolean for --${name}: ${value}`);
}

/**
 * Quote a value for safe use in POSIX shell `export` statements.
 *
 * @param {unknown} value - Value to quote (stringified first).
 * @returns {string} Single-quoted string with embedded quotes escaped.
 */
function shellQuote(value) {
  return `'${String(value).replace(/'/g, `'\\''`)}'`;
}

/**
 * Read a value-taking option, falling back when it is absent or empty.
 *
 * @param {object} args - Result of parseArgs.
 * @param {string} name - Option name without the leading dashes.
 * @param {string | undefined} fallback - Value used when the option is unset.
 * @returns {string | undefined}
 * @throws {Error} When the option was passed as a bare flag, which would
 *   otherwise leak the literal `true` into URLs and headers.
 */
function readOption(args, name, fallback) {
  const value = args[name];
  if (value === true) {
    throw new Error(`Missing value for --${name}.`);
  }
  return value || fallback;
}

/** Resolve the API base URL for a command, without trailing slashes. */
function resolveApiBaseUrl(args) {
  return String(readOption(args, "api-base-url", DEFAULT_API_BASE)).replace(/\/+$/, "");
}

/** Build the error message for a non-OK API response (consumes its body). */
async function describeFailure(action, response) {
  const text = await response.text();
  return `Failed to ${action} (${response.status}): ${text || response.statusText}`;
}

/**
 * Create a Browserbase session and print it in the requested format.
 *
 * @param {object} args - Parsed CLI options (see usage()).
 * @returns {Promise<void>}
 * @throws {Error} On missing credentials, invalid options, a non-OK API
 *   response, or a response lacking `id` / `connectUrl`.
 */
async function createSession(args) {
  const apiKey = readOption(args, "api-key", process.env.BROWSERBASE_API_KEY);
  const projectId = readOption(args, "project-id", process.env.BROWSERBASE_PROJECT_ID);
  const format = String(readOption(args, "format", "json")).toLowerCase();
  const apiBaseUrl = resolveApiBaseUrl(args);

  if (!apiKey) throw new Error("Missing API key. Set --api-key or BROWSERBASE_API_KEY.");
  if (!projectId) throw new Error("Missing project ID. Set --project-id or BROWSERBASE_PROJECT_ID.");
  if (!OUTPUT_FORMATS.includes(format)) {
    throw new Error(`Invalid --format: ${format}`);
  }

  const proxies = parseBool(args.proxies, "proxies");
  const advancedStealth = parseBool(args["advanced-stealth"], "advanced-stealth");
  const keepAlive = parseBool(args["keep-alive"], "keep-alive");

  // Only send settings the caller specified so API-side defaults still apply.
  const payload = { projectId };
  if (proxies !== undefined) payload.proxies = proxies;
  if (keepAlive !== undefined) payload.keepAlive = keepAlive;
  if (advancedStealth !== undefined) {
    payload.browserSettings = { advancedStealth };
  }

  const response = await fetch(`${apiBaseUrl}/v1/sessions`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-BB-API-Key": apiKey,
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    throw new Error(await describeFailure("create session", response));
  }

  const { id: sessionId, connectUrl } = await response.json();
  if (!sessionId || !connectUrl) {
    throw new Error("Browserbase response missing id or connectUrl.");
  }

  const output = {
    sessionId,
    connectUrl,
    debuggerUrl: `https://www.browserbase.com/sessions/${sessionId}`,
  };

  if (format === "url") {
    console.log(output.connectUrl);
    return;
  }

  if (format === "shell") {
    // Intended for `eval "$(... --format shell)"`, so every value is quoted.
    console.log(`export BROWSERBASE_SESSION_ID=${shellQuote(output.sessionId)}`);
    console.log(`export BROWSERBASE_CDP_URL=${shellQuote(output.connectUrl)}`);
    console.log(`export BROWSERBASE_DEBUGGER_URL=${shellQuote(output.debuggerUrl)}`);
    return;
  }

  console.log(JSON.stringify(output, null, 2));
}

/**
 * Request release of a Browserbase session and print the outcome as JSON.
 *
 * @param {object} args - Parsed CLI options (see usage()).
 * @returns {Promise<void>}
 * @throws {Error} On missing credentials/session id or a non-OK API response.
 */
async function closeSession(args) {
  const apiKey = readOption(args, "api-key", process.env.BROWSERBASE_API_KEY);
  const sessionId = readOption(args, "session-id", readOption(args, "sessionId", undefined));
  const projectId = readOption(args, "project-id", process.env.BROWSERBASE_PROJECT_ID);
  const apiBaseUrl = resolveApiBaseUrl(args);

  if (!apiKey) throw new Error("Missing API key. Set --api-key or BROWSERBASE_API_KEY.");
  if (!sessionId) throw new Error("Missing session ID. Set --session-id <id>.");

  const sessionUrl = `${apiBaseUrl}/v1/sessions/${encodeURIComponent(sessionId)}`;

  // Current Browserbase API supports session release via POST /v1/sessions/{id}.
  const releasePayload = { status: "REQUEST_RELEASE" };
  if (projectId) releasePayload.projectId = projectId;

  let response = await fetch(sessionUrl, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-BB-API-Key": apiKey,
    },
    body: JSON.stringify(releasePayload),
  });

  // Backward-compat fallback if the API still expects DELETE.
  if (!response.ok && [404, 405].includes(response.status)) {
    // Discard the unread body of the failed attempt before retrying.
    await response.body?.cancel();
    response = await fetch(sessionUrl, {
      method: "DELETE",
      headers: {
        "X-BB-API-Key": apiKey,
      },
    });
  }

  if (!response.ok) {
    throw new Error(await describeFailure("close session", response));
  }

  // The response may have an empty or non-JSON body; treat that as "no details".
  const data = await response.json().catch(() => ({}));
  console.log(
    JSON.stringify(
      {
        closed: true,
        sessionId,
        status: data?.status ?? "REQUESTED",
      },
      null,
      2,
    ),
  );
}

/**
 * CLI entry point: dispatch to `create` / `close`, or print usage.
 *
 * Sets `process.exitCode` instead of calling `process.exit()` so output that
 * has already been written is not cut short by a forced exit.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const argv = process.argv.slice(2);
  const wantsHelp = argv.includes("--help") || argv.includes("-h");
  if (argv.length === 0 || wantsHelp) {
    usage();
    // Explicit help request succeeds; running with no arguments is an error.
    process.exitCode = argv.length === 0 ? 1 : 0;
    return;
  }

  const [command, ...rest] = argv;
  const args = parseArgs(rest);

  if (command === "create") {
    await createSession(args);
    return;
  }
  if (command === "close") {
    await closeSession(args);
    return;
  }

  usage();
  process.exitCode = 1;
}

main().catch((error) => {
  console.error(String(error?.message || error));
  process.exitCode = 1;
});
diff --git a/skills/browse-fleet-subagents/SKILL.md b/skills/browse-fleet-subagents/SKILL.md new file mode 100644 index 0000000..9749076 --- /dev/null +++ b/skills/browse-fleet-subagents/SKILL.md @@ -0,0 +1,92 @@ +--- +name: browse-fleet-subagents +description: Orchestrate high-volume browser tasks by decomposing one objective into many independent units and fanning out execution through sub-agents, each owning its own browser workflow. Use when users need parallel browser work such as competitive monitoring, account sweeps, QA matrix checks, regression checks across many URLs, or load-style deterministic actions. +compatibility: "Requires the `browse` CLI. For protected targets, set Browserbase credentials (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`) to use remote mode." +license: MIT +allowed-tools: Bash +metadata: + capabilities: + - parallel-subagents + - task-decomposition + - subagent-orchestration + - retry-control + openclaw: + requires: + bins: + - browse + install: + - kind: node + package: "@browserbasehq/browse-cli" + bins: [browse] + homepage: https://github.com/browserbase/skills +--- + +# Browser Fleet Orchestration + +Use this skill for parallel browser operations, not single interactive tasks. + +## Core rule + +Treat "fleet" as an orchestration pattern, not a CLI primitive. +Run fanout through sub-agents. + +## Sub-agent fanout (default) + +Use this for multi-step tasks per target/account. + +1. Build a worklist of independent units (URLs, account IDs, vendors, claims). +2. Give each sub-agent exactly one unit. +3. Require strict structured output from each sub-agent (JSON object). +4. Aggregate results and retry only failed units. + +Suggested sub-agent prompt contract: + +```text +Use /browser for exactly one target. +Steps: +1) open target URL +2) snapshot -c -i --main-frame +3) perform required action(s) +4) return JSON: {target, success, key_data, evidence, error} +Do not process multiple targets in one run. 
+``` + +## Deterministic batch pattern + +1. Generate a normalized worklist (`[{id,url,goal}]`). +2. Spawn one sub-agent per work item. +3. Keep each sub-agent deterministic with strict step order. +4. Merge outputs and run retries on failures only. + +## Research/exploratory pattern + +1. Generate a coarse worklist. +2. Spawn sub-agents with bounded budgets (turns/timeouts). +3. Require each sub-agent to return confidence + evidence. +4. Escalate low-confidence items to a second pass. + +## Recommended hybrid pattern + +1. Run a broad first pass over all items. +2. Classify `ok / retry / escalate`. +3. Retry transient failures (timeouts, temporary blocks). +4. Escalate hard cases to sub-agents for deeper reasoning. + +This keeps cost low while preserving high success on messy targets. + +## Concurrency and reliability guardrails + +- Start with conservative concurrency (5-15 workers), then ramp. +- For anti-bot targets, switch to Browserbase remote mode before fanning out. +- Cap each unit by timeout and max retries. +- Keep result schema stable across all workers. + +## Cleanup + +Always clean up browser state after fanout: + +```bash +browse stop --force +pkill -f "browse.*daemon" || true +pkill -f "chrom(e|ium).*browse-" || true +``` diff --git a/skills/browser-automation/EXAMPLES.md b/skills/browser-automation/EXAMPLES.md deleted file mode 100644 index abc35fb..0000000 --- a/skills/browser-automation/EXAMPLES.md +++ /dev/null @@ -1,308 +0,0 @@ -# Browser Automation Examples - -This document provides detailed examples of common browser automation tasks using the CLI tool. - -## Example 1: Extract Product Information from E-commerce - -**User request**: "Go to example.com/product/123 and extract the product details" - -**Workflow**: - -1. **Navigate** to the product page: - ```bash - browser navigate https://example.com/product/123 - ``` - -2. 
**Extract** product data with schema: - ```bash - browser extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number", "reviewCount": "number"}' - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -**Expected result**: JSON object with product details that can be analyzed or stored. - ---- - -## Example 2: Fill Out and Submit a Contact Form - -**User request**: "Fill out the contact form on example.com with my information" - -**Workflow**: - -1. **Navigate** to contact page: - ```bash - browser navigate https://example.com/contact - ``` - -2. **Act**: Fill in name field: - ```bash - browser act "Fill in the name field with 'John Doe'" - ``` - -3. **Act**: Fill in email field: - ```bash - browser act "Fill in the email field with 'john.doe@example.com'" - ``` - -4. **Act**: Fill in message field: - ```bash - browser act "Fill in the message field with 'I would like to inquire about your services'" - ``` - -5. **Act**: Submit the form: - ```bash - browser act "Click the Submit button" - ``` - -6. **Screenshot** to capture confirmation: - ```bash - browser screenshot - ``` - -7. **Close** the browser: - ```bash - browser close - ``` - ---- - -## Example 3: Research and Summarize News Articles - -**User request**: "Check the latest tech news on techcrunch.com and summarize the top stories" - -**Workflow**: - -1. **Navigate** to news site: - ```bash - browser navigate https://techcrunch.com - ``` - -2. **Extract** article headlines and summaries: - ```bash - browser extract "Extract the top 5 article headlines and their summaries" '{"headlines": "string", "summary": "string", "author": "string", "publishedDate": "string"}' - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -4. Analyze and summarize the extracted data using Claude's text analysis capabilities. 
- ---- - -## Example 4: Login and Navigate Authenticated Area - -**User request**: "Log into example.com and navigate to my dashboard" - -**Workflow**: - -1. **Navigate** to login page: - ```bash - browser navigate https://example.com/login - ``` - -2. **Act**: Fill in username: - ```bash - browser act "Fill in the username field with 'myusername'" - ``` - -3. **Act**: Fill in password: - ```bash - browser act "Fill in the password field with 'mypassword'" - ``` - -4. **Act**: Click login button: - ```bash - browser act "Click the Login button" - ``` - -5. **Act**: Wait for page load: - ```bash - browser act "Wait for the page to fully load" - ``` - -6. **Navigate** to dashboard: - ```bash - browser navigate https://example.com/dashboard - ``` - -7. **Screenshot** the dashboard: - ```bash - browser screenshot - ``` - -8. **Close** the browser: - ```bash - browser close - ``` - -**Note**: This example uses Chrome's user profile (`.chrome-profile/`) which may preserve session cookies between runs. - ---- - -## Example 5: Search and Collect Results - -**User request**: "Search Google for 'best TypeScript practices' and get the top 5 results" - -**Workflow**: - -1. **Navigate** to Google: - ```bash - browser navigate https://www.google.com - ``` - -2. **Act**: Perform search: - ```bash - browser act "Type 'best TypeScript practices' in the search box and press Enter" - ``` - -3. **Act**: Wait for results: - ```bash - browser act "Wait for search results to load" - ``` - -4. **Extract** search results: - ```bash - browser extract "Extract the top 5 search results" '{"title": "string", "url": "string", "snippet": "string"}' - ``` - -5. **Close** the browser: - ```bash - browser close - ``` - ---- - -## Example 6: Download a File - -**User request**: "Download the PDF file from example.com/documents/report.pdf" - -**Workflow**: - -1. **Navigate** to the file URL: - ```bash - browser navigate https://example.com/documents/report.pdf - ``` - -2. 
**Act**: Wait for download to start: - ```bash - browser act "Wait for 5 seconds for the download to complete" - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -**Note**: Files are automatically downloaded to `./agent/downloads/` directory due to CDP configuration. - ---- - -## Example 7: Debugging a Page Issue - -**User request**: "Check why the submit button isn't working on example.com/form" - -**Workflow**: - -1. **Navigate** to the form page: - ```bash - browser navigate https://example.com/form - ``` - -2. **Screenshot** initial state: - ```bash - browser screenshot - ``` - -3. **Observe** available elements: - ```bash - browser observe "Find all buttons and their states" - ``` - -4. **Observe** form fields: - ```bash - browser observe "Find all form input fields and their required status" - ``` - -5. **Act**: Try filling required fields: - ```bash - browser act "Fill in all required fields with test data" - ``` - -6. **Screenshot** after filling: - ```bash - browser screenshot - ``` - -7. **Observe** button state again: - ```bash - browser observe "Check if the submit button is now enabled" - ``` - -8. **Close** the browser: - ```bash - browser close - ``` - -Analyze the screenshots and observations to determine the issue. - ---- - -## Example 8: Multi-Page Data Collection - -**User request**: "Extract product information from the first 3 pages of results on example.com/products" - -**Workflow**: - -1. **Navigate** to products page: - ```bash - browser navigate https://example.com/products - ``` - -2. **Extract** products from page 1: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -3. **Act**: Click next page: - ```bash - browser act "Click the Next Page button" - ``` - -4. **Extract** products from page 2: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -5. 
**Act**: Click next page: - ```bash - browser act "Click the Next Page button" - ``` - -6. **Extract** products from page 3: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -7. **Close** the browser: - ```bash - browser close - ``` - -Combine and process all extracted data. - ---- - -## Tips for Success - -- **Be specific with natural language**: "Click the blue Submit button in the footer" is better than "click submit". This is **extremely important** because there's much ambiguity in many websites. -- **Wait when needed**: After navigation or actions that trigger page changes, explicitly wait -- **Use observe for discovery**: When unsure what elements exist, use observe first -- **Take screenshots for debugging**: Visual confirmation helps understand what the browser sees -- **Handle errors gracefully**: If an action fails, try breaking it into smaller steps -- **Clean up resources**: Always close the browser when done to free up system resources diff --git a/skills/browser-automation/REFERENCE.md b/skills/browser-automation/REFERENCE.md deleted file mode 100644 index 7f3dff9..0000000 --- a/skills/browser-automation/REFERENCE.md +++ /dev/null @@ -1,535 +0,0 @@ -# Browser Automation CLI Reference - -This document provides detailed technical reference for the CLI browser automation tool. - -## Architecture Overview - -The browser automation system consists of: - -- **Stagehand**: TypeScript library wrapping Playwright for AI-driven browser control. 
Uses AI model to find and interact with the right elements, so be specific -- **Chrome CDP**: Chrome DevTools Protocol connection on port 9222 -- **CLI Tool**: Command-line interface in `src/cli.ts` for browser automation -- **Local Chrome**: Chrome browser launched with remote debugging enabled - -### File Locations - -- **Chrome Profile**: `.chrome-profile/` - Persistent browser profile directory -- **Screenshots**: `./agent/browser_screenshots/` - Screenshot output directory -- **Downloads**: `./agent/downloads/` - File download directory - -## CLI Command Reference - -### navigate - -Navigate to a URL in the browser. - -**Usage**: -```bash -browser navigate -``` - -**Parameters**: -- `url` (string, required): The URL to navigate to. Must include protocol (http:// or https://) - -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Successfully navigated to ", - "screenshot": "/path/to/screenshot.png" -} -``` - -**Implementation Details**: -- Uses Playwright's `page.goto()` under the hood -- Waits for network idle and DOM content loaded -- Automatically takes a screenshot after navigation -- Supports HTTPS upgrade for HTTP URLs - -**Example**: -```bash -browser navigate https://example.com -``` - -**Error Handling**: -- Invalid URLs return error with `success: false` -- Network timeouts return timeout error -- SSL certificate errors may fail navigation - ---- - -### act - -Perform an action on the page using natural language. - -**Usage**: -```bash -browser act "" -``` - -**Parameters**: -- `action` (string, required): Natural language description of the action to perform - -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Successfully performed action: ", - "screenshot": "/path/to/screenshot.png" -} -``` - -Note: Without specificity it might succeed on the wrong element! 
- -**Implementation Details**: -- Uses Stagehand's `page.act()` which leverages Claude Haiku 4.5 -- AI model interprets natural language and executes corresponding browser actions -- Supports: clicking, typing, selecting, scrolling, waiting, hovering, and more -- Automatically handles element location and interaction -- Automatically takes a screenshot after the action - -**Natural Language Examples**: -```bash -browser act "Click the login button" -browser act "Fill in email field with test@example.com" -browser act "Scroll to the bottom of the page" -browser act "Select 'California' from the state dropdown" -browser act "Hover over the menu icon" -browser act "Wait for 3 seconds" -browser act "Press the Enter key" -browser act "Double-click the file icon" -``` - -**Best Practices**: -- Be **specific** about which element to interact with -- Include visual descriptors ("button next to the form", "top menu", "form at bottom") -- For ambiguous elements, mention nearby context -- Break complex actions into multiple simple actions - -**Error Handling**: -- Element not found errors indicate selector couldn't be resolved -- Timeout errors occur when action takes too long -- Action not possible errors indicate element state prevents action -- All errors return JSON with `success: false` - ---- - -### extract - -Extract structured data from the current page using a schema. 
- -**Usage**: -```bash -browser extract "" '{"field": "type"}' -``` - -**Parameters**: -- `instruction` (string, required): Natural language description of what to extract -- `schema` (JSON string, required): Schema definition mapping field names to types - -**Schema Types**: -- `"string"`: Text content -- `"number"`: Numeric values (integers or floats) -- `"boolean"`: True/false values - -**Returns**: -JSON output: -```json -{ - "success": true, - "data": { - "field1": "value", - "field2": 123 - } -} -``` - -**Implementation Details**: -- Uses Stagehand's `page.extract()` with Zod schema validation -- AI model (Claude Haiku 4.5) identifies relevant page elements -- Automatically handles pagination and dynamic content -- Validates extracted data against schema - -**Schema Example**: -```bash -browser extract "Extract the product information" '{"productName": "string", "price": "number", "inStock": "boolean", "description": "string", "rating": "number"}' -``` - -**Complex Extraction Example**: -```bash -browser extract "Extract all items from the shopping cart" '{"itemName": "string", "quantity": "number", "unitPrice": "number", "totalPrice": "number", "imageUrl": "string"}' -``` - -**Best Practices**: -- Use clear, descriptive field names -- Match schema types to expected data types -- Provide specific extraction instructions -- Handle missing data by checking result properties - -**Error Handling**: -- Schema validation errors indicate type mismatch -- Extraction failures occur when data not found on page -- Timeout errors for pages that take too long to analyze -- All errors return JSON with `success: false` - ---- - -### observe - -Discover available actions on the page. 
- -**Usage**: -```bash -browser observe "" -``` - -**Parameters**: -- `query` (string, required): Natural language query to discover elements - -**Returns**: -JSON output: -```json -{ - "success": true, - "data": [ - { - "selector": "button.submit-btn", - "text": "Submit Form", - "type": "button", - "visible": true, - "enabled": true - } - ] -} -``` - -**Implementation Details**: -- Uses Stagehand's `page.observe()` to scan page elements -- Returns actionable elements matching the query -- Provides element properties, states, and available actions - -**Query Examples**: -```bash -browser observe "Find all buttons" -browser observe "Find clickable links in the navigation" -browser observe "Find form input fields" -browser observe "Find all submit buttons" -browser observe "Find elements with text 'Login'" -browser observe "Find all images" -``` - -**Use Cases**: -- Page exploration and discovery -- Debugging action failures -- Understanding page structure -- Finding dynamic element selectors - -**Error Handling**: -- Empty array returned when no elements match -- Timeout for pages that take too long to scan -- All errors return JSON with `success: false` - ---- - -### screenshot - -Take a screenshot of the current page. 
- -**Usage**: -```bash -browser screenshot -``` - -**Parameters**: None - -**Returns**: -JSON output: -```json -{ - "success": true, - "screenshot": "/path/to/screenshot.png" -} -``` - -**Implementation Details**: -- Captures full viewport at current scroll position -- Saves as PNG format with timestamp in filename -- Automatically resizes images larger than 2000x2000 pixels using Sharp -- Uses lossless PNG compression - -**Screenshot Path Format**: -``` -./agent/browser_screenshots/screenshot-YYYY-MM-DDTHH-MM-SS-mmmZ.png -``` - -**Example**: -```bash -browser screenshot -``` - -**Image Processing**: -- Original resolution preserved if ≤ 2000x2000 -- Larger images resized to fit within 2000x2000 while maintaining aspect ratio -- Uses Sharp library for high-quality image processing - -**Best Practices**: -- Take screenshots before and after important actions -- Use for visual debugging and verification -- Screenshot after navigation to confirm page loaded -- Capture error states for troubleshooting - -**Error Handling**: -- Directory creation errors if screenshots folder can't be created -- CDP errors if Chrome DevTools Protocol connection fails -- File write errors if disk space insufficient -- All errors return JSON with `success: false` - ---- - -### close - -Close the browser and cleanup resources. 
- -**Usage**: -```bash -browser close -``` - -**Parameters**: None - -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Browser closed" -} -``` - -**Implementation Details**: -- Calls `stagehand.close()` to clean up Playwright resources -- Kills Chrome process if it was started by the CLI tool -- Clears internal state variables -- Does NOT delete `.chrome-profile/` directory (preserved for reuse) - -**Resource Cleanup**: -- Closes all browser tabs and windows -- Terminates Chrome process (only if started by this tool) -- Releases CDP connection -- Clears Stagehand instance - -**Best Practices**: -- Always call at the end of browser automation tasks -- Call even if errors occurred during automation -- Don't call mid-workflow unless explicitly needed - -**Error Handling**: -- Continues cleanup even if some steps fail -- Safe to call multiple times -- Gracefully handles already-closed browser -- All errors return JSON with `success: false` - ---- - -## Configuration Details - -### Stagehand Initialization - -The Stagehand instance is configured in `src/cli.ts` with: - -```typescript -new Stagehand({ - env: "LOCAL", - verbose: 0, - enableCaching: true, - model: "anthropic/claude-haiku-4-5-20251001", - localBrowserLaunchOptions: { - cdpUrl: wsUrl, - }, -}) -``` - -**Configuration Options**: -- `env: "LOCAL"`: Uses local Chrome instead of remote browser -- `verbose: 0`: Minimal logging output -- `enableCaching: true`: Caches page analysis for better performance -- `modelName`: Claude Haiku 4.5 for AI-driven actions and extraction -- `cdpUrl`: Chrome DevTools Protocol endpoint - -### Chrome Launch Arguments - -Chrome is launched by `src/cli.ts` with: - -```bash ---remote-debugging-port=9222 ---user-data-dir=.chrome-profile ---window-position=-9999,-9999 ---window-size=1280,720 -``` - -**Arguments**: -- `--remote-debugging-port`: Enables CDP on port 9222 -- `--user-data-dir`: Persistent profile directory for session/cookie persistence -- 
`--window-position`: Launches minimized off-screen -- `--window-size`: Default window size - -### Download Configuration - -Downloads are configured via CDP: - -```typescript -await client.send("Browser.setDownloadBehavior", { - behavior: "allow", - downloadPath: "./agent/downloads", - eventsEnabled: true, -}) -``` - -**Behavior**: -- Downloads start automatically (no dialog) -- Files saved to `./agent/downloads/` -- Download events can be monitored via CDP - ---- - -## Error Messages Reference - -### Common Errors - -**"Could not find local Chrome installation"** -- Cause: Chrome/Chromium not installed or not in standard locations -- Solution: Install Chrome from https://www.google.com/chrome/ - -**"Chrome failed to start with remote debugging on port 9222"** -- Cause: Port 9222 already in use or Chrome can't bind to port -- Solution: Close other Chrome instances or change CDP port - -**"Browser failed to become ready within timeout"** -- Cause: Chrome launched but page context not ready -- Solution: Check Chrome version compatibility, restart system - -**"Error performing action: element not found"** -- Cause: Natural language description didn't match any page element -- Solution: Use more specific description or use observe to find elements - -**"Error extracting data: schema validation failed"** -- Cause: Extracted data type doesn't match schema -- Solution: Verify schema types match actual page data - -**"Error taking screenshot: directory not writable"** -- Cause: Insufficient permissions for screenshots directory -- Solution: Check file permissions on `./agent/browser_screenshots/` - ---- - -## Performance Considerations - -### Caching - -Stagehand caches page analysis to improve performance on repeated actions. 
Cache is maintained for: -- Element selectors -- Page structure analysis -- Vision model results - -### Timeouts - -Default timeouts: -- Navigation: 30 seconds -- Action execution: 30 seconds -- Extraction: 60 seconds -- CDP connection: 15 seconds (50 retries × 300ms) - -### Resource Usage - -Browser automation consumes: -- Memory: ~200-500MB for Chrome process -- CPU: Variable based on page complexity -- Disk: ~50-200MB for Chrome profile -- Network: Depends on pages visited - ---- - -## Security Considerations - -### Credential Handling - -- Browser uses persistent profile (`.chrome-profile/`) -- Saved passwords and cookies persist between sessions -- Consider using isolated profiles for sensitive operations - -### Download Safety - -- Downloads automatically saved to `./agent/downloads/` -- No file type restrictions enforced -- Verify downloaded file integrity before use - -### Network Access - -- Browser has full network access -- Respects system proxy settings -- Can access localhost and internal networks - ---- - -## Debugging Tips - -### Enable Verbose Logging - -Edit `src/cli.ts` and change verbose level in Stagehand configuration: - -```typescript -// Change verbose: 0 to verbose: 1 or 2 -verbose: 2, // Maximum verbosity -``` - -### View Chrome Console - -Connect to Chrome DevTools manually: -1. Open Chrome -2. Navigate to `chrome://inspect` -3. 
Click "inspect" under Remote Target - -### Check CDP Connection - -Test CDP endpoint: -```bash -curl http://localhost:9222/json/version -``` - -### Monitor Browser Process - -Check Chrome process: -```bash -ps aux | grep chrome -``` - -### View Screenshots - -Screenshots provide visual debugging: -```bash -ls -lh ./agent/browser_screenshots/ -open ./agent/browser_screenshots/screenshot-*.png -``` - -### Test CLI Commands - -Test individual commands: -```bash -browser navigate https://example.com -browser screenshot -browser close -``` - ---- - -## Version Information - -- **Stagehand**: Uses `@browserbasehq/stagehand` package v2.5.2+ -- **Model**: Claude Haiku 4.5 (claude-haiku-4-5-20251001) for browser actions -- **CLI Tool**: TypeScript CLI in `src/cli.ts` -- **Agent SDK**: `@anthropic-ai/claude-agent-sdk` for conversation framework -- **Browser**: Local Chrome/Chromium installation - -For updates and changelog, see the main project repository. diff --git a/skills/browser-automation/SKILL.md b/skills/browser-automation/SKILL.md deleted file mode 100644 index f44cf40..0000000 --- a/skills/browser-automation/SKILL.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -name: browser -description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. -allowed-tools: Bash ---- - -# Browser Automation - -Automate browser interactions using Stagehand CLI with Claude. 
- -### First: Environment Selection (Local vs Remote) - -The skill automatically selects between local and remote browser environments: -- **If Browserbase API keys exist** (BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID in .env file): Uses remote Browserbase environment -- **If no Browserbase API keys**: Falls back to local Chrome browser -- **No user prompting**: The selection happens automatically based on available configuration - -## Setup (First Time Only) - -Check `setup.json` in this directory. If `setupComplete: false`: - -```bash -npm install # Install dependencies -npm link # Create global 'browser' command -``` - -## Commands - -All commands work identically in both modes: - -```bash -browser navigate # Go to URL -browser act "" # Natural language action -browser extract "" ['{}'] # Extract data (optional schema) -browser observe "" # Discover elements -browser screenshot # Take screenshot -browser close # Close browser -``` - -## Quick Example - -```bash -browser navigate https://example.com -browser act "click the Sign In button" -browser extract "get the page title" -browser close -``` - -## Mode Comparison - -| Feature | Local | Browserbase | -|---------|-------|-------------| -| Speed | Faster | Slightly slower | -| Setup | Chrome required | API key required | -| Stealth mode | No | Yes | -| Proxy/CAPTCHA | No | Yes | -| Best for | Development | Production/scraping | - -## Best Practices - -1. **Always navigate first** before interacting -2. **View screenshots** after each command to verify -3. **Be specific** in action descriptions -4. **Close browser** when done - -## Troubleshooting - -- **Chrome not found**: Install Chrome or use Browserbase mode -- **Action fails**: Use `browser observe` to discover available elements -- **Browserbase fails**: Verify API key and project ID are set - -For detailed examples, see [EXAMPLES.md](EXAMPLES.md). -For API reference, see [REFERENCE.md](REFERENCE.md). 
diff --git a/skills/browser-automation/setup.json b/skills/browser-automation/setup.json deleted file mode 100644 index d9ac900..0000000 --- a/skills/browser-automation/setup.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "setupComplete": false, - "prerequisites": { - "chrome": { - "required": true, - "installed": false, - "description": "Google Chrome browser" - }, - "dependencies": { - "required": true, - "installed": false, - "description": "Node.js dependencies (npm install completed)" - }, - "apiKey": { - "required": true, - "configured": false, - "description": "ANTHROPIC_API_KEY exported (i.e $ANTHROPIC_API_KEY) or in .env file" - }, - "browserCommand": { - "required": true, - "installed": false, - "description": "Browser CLI command globally linked (npm link)" - } - }, - "setupInstructions": [ - "1. Run: npm install (this will automatically build TypeScript)", - "2. Run: npm link (this creates the global 'browser' command)", - "3. (RECOMMENDED) Export ANTHROPIC_API_KEY: export ANTHROPIC_API_KEY='your-api-key-here' (check if already exported)", - " OR alternatively create .env file: cp .env.example .env and edit it to add your API key", - "4. Ensure Google Chrome is installed on your system", - "5. Test installation: browser navigate https://example.com", - "6. Update this setup.json file: set all 'installed'/'configured' to true and 'setupComplete' to true" - ], - "verifySetup": "Run 'browser navigate https://example.com' from any directory to verify installation" -} diff --git a/skills/browser/EXAMPLES.md b/skills/browser/EXAMPLES.md new file mode 100644 index 0000000..56dd53c --- /dev/null +++ b/skills/browser/EXAMPLES.md @@ -0,0 +1,117 @@ +# Browser Automation Examples + +Common browser automation workflows using the `browse` CLI. Each example demonstrates a distinct pattern using real commands. 
+ +## Example 1: Extract Data from a Page + +**User request**: "Get the product details from example.com/product/123" + +```bash +browse open https://example.com/product/123 +browse snapshot # read page structure + element refs +browse get text "body" # extract all visible text content +browse stop +``` + +Parse the text output to extract structured data (name, price, description, etc.). + +For a specific section, use a CSS selector: + +```bash +browse get text ".product-details" # text from a specific container +``` + +**Note**: `browse get text` requires a CSS selector — use `"body"` for all page text. + +## Example 2: Fill and Submit a Form + +**User request**: "Fill out the contact form on example.com with my information" + +```bash +browse open https://example.com/contact +browse snapshot # find form fields and their refs +browse click @0-3 # click the Name input (ref from snapshot) +browse type "John Doe" +browse press Tab # move to next field +browse type "john@example.com" +browse fill "#message" "I would like to inquire about your services" --no-press-enter # fill presses Enter by default; skip it so the form is not submitted early +browse snapshot # verify fields are filled +browse click @0-8 # click Submit button (ref from snapshot) +browse snapshot # confirm submission result +browse stop +``` + +**Key pattern**: Use `browse snapshot` before interacting to discover element refs, then `browse click <ref>` and `browse type <text>` to interact.
+ +## Example 3: Multi-Step Navigation + +**User request**: "Get headlines from the first 3 pages of results on example.com/news" + +```bash +browse open https://example.com/news +browse snapshot # read page 1 content +browse get text ".headline" # extract headlines + +browse snapshot # find "Next" button ref +browse click @0-12 # click Next (ref from snapshot) +browse wait load # wait for page 2 to load +browse get text ".headline" # extract page 2 headlines + +browse snapshot # find Next again (ref may change) +browse click @0-15 # click Next +browse wait load +browse get text ".headline" # extract page 3 headlines + +browse stop +``` + +**Key pattern**: Re-run `browse snapshot` after each navigation because element refs change when the page updates. + +## Example 4: Escalate to Remote Mode + +**User request**: "Scrape pricing from competitor.com" (a site with Cloudflare protection) + +```bash +# Attempt 1: local mode +browse open https://competitor.com/pricing +browse snapshot +# Output shows: "Checking your browser..." (Cloudflare interstitial) +# or: page content is empty / access denied +browse stop +``` + +The agent detects bot protection and tells the user: + +> This site has Cloudflare bot detection. Browserbase remote mode can bypass this with anti-bot stealth and residential proxies. Want me to set it up? 
+ +If the user agrees: + +```bash +# Set up Browserbase credentials +openclaw browserbase setup +# User enters API key + project ID interactively + +# Retry — credentials are now in the environment +browse open https://competitor.com/pricing +browse snapshot # full page content now accessible +browse get text ".pricing-table" +browse stop +``` + +If the env vars aren't visible yet (setup was run outside OpenClaw): + +```bash +eval "$(openclaw browserbase env --format shell)" && browse open https://competitor.com/pricing +browse snapshot +browse get text ".pricing-table" +browse stop +``` + +## Tips + +- **Snapshot first**: Always run `browse snapshot` before interacting — it gives you the accessibility tree with element refs +- **Use refs to click**: `browse click @0-5` is more reliable than trying to describe elements +- **Re-snapshot after actions**: Element refs change when the page updates +- **`get text` for data extraction**: Use `browse get text <selector>` to pull text content from specific elements (the selector is required — use `"body"` for the whole page) +- **`stop` when done**: Always `browse stop` to clean up the browser session +- **Prefer snapshot over screenshot**: Snapshot is fast and structured; screenshot is slow and uses vision tokens. Only screenshot when you need visual context (layout, images, debugging) diff --git a/skills/browser/LICENSE.txt b/skills/browser/LICENSE.txt new file mode 100644 index 0000000..f2f4397 --- /dev/null +++ b/skills/browser/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Browserbase, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md new file mode 100644 index 0000000..7a23718 --- /dev/null +++ b/skills/browser/REFERENCE.md @@ -0,0 +1,452 @@ +# Browser Automation CLI Reference + +Technical reference for the `browse` CLI tool. + +## Table of Contents + +- [Architecture](#architecture) +- [Command Reference](#command-reference) + - [Navigation](#navigation) + - [Page State](#page-state) + - [Interaction](#interaction) + - [Session Management](#session-management) + - [JavaScript Evaluation](#javascript-evaluation) + - [Viewport](#viewport) + - [Network Capture](#network-capture) +- [Configuration](#configuration) + - [Global Flags](#global-flags) + - [Environment Variables](#environment-variables) +- [Error Messages](#error-messages) + +## Architecture + +The browse CLI is a **daemon-based** command-line tool: + +- **Daemon process**: A background process manages the browser instance. 
Auto-starts on the first command (e.g., `browse open`), persists across commands, and stops with `browse stop`. +- **Local mode** (default): Launches a local Chrome/Chromium instance. +- **Remote mode** (Browserbase): Connects to a Browserbase cloud browser session when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set. +- **Accessibility-first**: Use `browse snapshot` to get the page's accessibility tree with element refs, then interact using those refs. + +## Command Reference + +### Navigation + +#### `open ` + +Navigate to a URL. Alias: `goto`. Auto-starts the daemon if not running. + +```bash +browse open https://example.com +browse open https://example.com --wait networkidle # wait for all network requests to finish (useful for SPAs) +browse open https://example.com --wait domcontentloaded +``` + +The `--wait` flag controls when navigation is considered complete. Values: `load` (default), `domcontentloaded`, `networkidle`. Use `networkidle` for JavaScript-heavy pages that fetch data after initial load. + +#### `reload` + +Reload the current page. + +```bash +browse reload +``` + +#### `back` / `forward` + +Navigate browser history. + +```bash +browse back +browse forward +``` + +--- + +### Page State + +#### `snapshot` + +Get the accessibility tree with interactive element refs. This is the primary way to understand page structure. + +```bash +browse snapshot +browse snapshot --compact # tree only, no ref maps +browse snapshot -c -i --main-frame # focused refs (recommended on large pages) +browse snapshot -c --contains "price" # filter tree lines by text +browse snapshot -c --max-lines 200 # cap output size +``` + +Returns a text representation of the page with refs like `@0-5` that can be passed to `click`. Use `--compact` for shorter output when you only need the tree. +Use `--interactive` and `--main-frame` to reduce payload size and speed up agent loops on heavy pages. + +#### `screenshot [path]` + +Take a visual screenshot. 
Slower than snapshot and uses vision tokens. + +```bash +browse screenshot # auto-generated path +browse screenshot ./capture.png # custom path +browse screenshot --full-page # capture entire scrollable page +``` + +#### `get [selector]` + +Get page properties. Available properties: `url`, `title`, `text`, `html`, `value`, `box`, `visible`, `checked`. + +```bash +browse get url # current URL +browse get title # page title +browse get text "body" # all visible text (selector required) +browse get text ".product-info" # text within a CSS selector +browse get html "#main" # inner HTML of an element +browse get value "#email-input" # value of a form field +browse get box "#header" # bounding box (centroid coordinates) +browse get visible ".modal" # check if element is visible +browse get checked "#agree" # check if checkbox/radio is checked +``` + +**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. + +#### `refs` + +Show the cached ref map from the last `browse snapshot`. Useful for looking up element refs without re-running a full snapshot. + +```bash +browse refs +``` + +--- + +### Interaction + +#### `click ` + +Click an element by its ref from `browse snapshot` output. + +```bash +browse click @0-5 # click element with ref 0-5 +``` + +#### `click_xy ` + +Click at exact viewport coordinates. + +```bash +browse click_xy 500 300 +``` + +#### `hover ` + +Hover at viewport coordinates. + +```bash +browse hover 500 300 +``` + +#### `type ` + +Type text into the currently focused element. + +```bash +browse type "Hello, world!" +browse type "slow typing" --delay 100 # 100ms between keystrokes +browse type "human-like" --mistakes # simulate human typing with typos +``` + +#### `fill ` + +Fill an input element matching a CSS selector and press Enter. 
+ +```bash +browse fill "#search" "OpenClaw documentation" +browse fill "input[name=email]" "user@example.com" +browse fill "#search" "query" --no-press-enter # fill without pressing Enter +``` + +#### `select ` + +Select option(s) from a dropdown. + +```bash +browse select "#country" "United States" +browse select "#tags" "javascript" "typescript" # multi-select +``` + +#### `press ` + +Press a keyboard key or key combination. + +```bash +browse press Enter +browse press Tab +browse press Escape +browse press Cmd+A # select all (Mac) +browse press Ctrl+C # copy (Linux/Windows) +``` + +#### `scroll ` + +Scroll at a given position by a given amount. + +```bash +browse scroll 500 300 0 -300 # scroll up at (500, 300) +browse scroll 500 300 0 500 # scroll down +``` + +#### `drag ` + +Drag from one viewport coordinate to another. + +```bash +browse drag 80 80 310 100 # drag with default 10 steps +browse drag 80 80 310 100 --steps 20 # more intermediate steps +browse drag 80 80 310 100 --delay 50 # 50ms between steps +browse drag 80 80 310 100 --button right # use right mouse button +browse drag 80 80 310 100 --xpath # return source/target XPaths +``` + +#### `highlight ` + +Highlight an element on the page for visual debugging. + +```bash +browse highlight "#submit-btn" # highlight for 2 seconds (default) +browse highlight ".nav" -d 5000 # highlight for 5 seconds +``` + +#### `is ` + +Check element state. Available checks: `visible`, `checked`. + +```bash +browse is visible ".modal" # returns { visible: true/false } +browse is checked "#agree" # returns { checked: true/false } +``` + +#### `wait [arg]` + +Wait for a condition. + +```bash +browse wait load # wait for page load +browse wait "selector" ".results" # wait for element to appear +browse wait timeout 3000 # wait 3 seconds +``` + +--- + +### Session Management + +#### `start` + +Start the browser daemon manually. Usually not needed — the daemon auto-starts on first command. 
+ +```bash +browse start +``` + +#### `stop` + +Stop the browser daemon and close the browser. + +```bash +browse stop +browse stop --force # force kill if daemon is unresponsive +``` + +#### `status` + +Check whether the daemon is running, its connection details, and current environment. + +```bash +browse status +``` + +#### `env [local|remote]` + +Show or switch the browser environment. Without arguments, prints the current environment. With an argument, stops the running daemon and restarts in the specified environment. The switch is sticky — subsequent commands stay in the chosen environment until you switch again or run `browse stop`. + +```bash +browse env # print current environment +browse env local # switch to local Chrome +browse env remote # switch to Browserbase (requires API keys) +``` + +#### `newpage [url]` + +Create a new tab, optionally navigating to a URL. + +```bash +browse newpage # open blank tab +browse newpage https://example.com # open tab with URL +``` + +#### `pages` + +List all open tabs. + +```bash +browse pages +``` + +#### `tab_switch ` + +Switch to a tab by its index (from `browse pages`). + +```bash +browse tab_switch 1 +``` + +#### `tab_close [index]` + +Close a tab. Closes current tab if no index given. + +```bash +browse tab_close # close current tab +browse tab_close 2 # close tab at index 2 +``` + +--- + +### JavaScript Evaluation + +#### `eval ` + +Evaluate JavaScript in the page context. + +```bash +browse eval "document.title" +browse eval "document.querySelectorAll('a').length" +``` + +--- + +### Viewport + +#### `viewport ` + +Set the browser viewport size. + +```bash +browse viewport 1920 1080 +``` + +--- + +### Network Capture + +Capture network requests to the filesystem for inspection. + +#### `network on` + +Enable network request capture. Creates a temp directory where requests and responses are saved as JSON files. + +```bash +browse network on +``` + +#### `network off` + +Disable network capture. 
+ +```bash +browse network off +``` + +#### `network path` + +Show the capture directory path. + +```bash +browse network path +``` + +#### `network clear` + +Clear all captured requests. + +```bash +browse network clear +``` + +--- + +## Configuration + +### Global Flags + +#### `--json` + +Output as JSON for all commands. Useful for structured, parseable output. + +```bash +browse --json get url # returns {"url": "https://..."} +browse --json snapshot # returns JSON accessibility tree +``` + +#### `--session ` + +Run commands against a named session, enabling multiple concurrent browsers. + +```bash +browse --session work open https://a.com +browse --session personal open https://b.com +``` + +#### Parallel orchestration (recommended) + +Use sub-agents for parallel browser work. Assign each sub-agent one independent unit and let each sub-agent run a normal `browse` workflow in its own session. + +```bash +# Example session isolation pattern +browse --session job-1 open https://example.com/a +browse --session job-2 open https://example.com/b +``` + +This improves reliability for multi-step tasks and makes retries/debugging easier than single-command fanout patterns. + +### Environment Variables + +| Variable | Required | Description | +|----------|----------|-------------| +| `BROWSERBASE_API_KEY` | For remote mode | API key from https://browserbase.com/settings | +| `BROWSERBASE_PROJECT_ID` | For remote mode | Project ID from Browserbase dashboard | + +When both are set, the CLI uses Browserbase remote sessions. Otherwise, it falls back to local Chrome. + +The Browserbase OpenClaw plugin automatically bridges credentials from `~/.openclaw/openclaw.json` into these environment variables on startup. + +### Setting credentials + +```bash +# Via OpenClaw plugin (recommended) +openclaw browserbase setup + +# Via environment variables (manual) +export BROWSERBASE_API_KEY="bb_live_..." +export BROWSERBASE_PROJECT_ID="proj_..." 
+``` + +--- + +## Error Messages + +**"No active page"** +- The daemon is running but has no page open. +- Fix: Run `browse open <url>`. If the issue persists, run `browse stop` and retry. For zombie daemons: `pkill -f "browse.*daemon"`. + +**"Chrome not found"** / **"Could not find local Chrome installation"** +- Chrome/Chromium is not installed or not in a standard location. +- Fix: Install Chrome, or switch to remote with `browse env remote` (no local browser needed). + +**"Daemon not running"** +- No daemon process is active. Most commands auto-start the daemon, but `snapshot`, `click`, etc. require an active session. +- Fix: Run `browse open <url>` to start a session. + +**Element ref not found (e.g., "@0-5")** +- The ref from a previous snapshot is no longer valid (page changed). +- Fix: Run `browse snapshot` again to get fresh refs. + +**Timeout errors** +- The page took too long to load or an element didn't appear. +- Fix: Run `browse wait load` before interacting, or pause longer with `browse wait timeout 3000` (milliseconds) before retrying. diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md new file mode 100644 index 0000000..74cfa61 --- /dev/null +++ b/skills/browser/SKILL.md @@ -0,0 +1,177 @@ +--- +name: browser +description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. Supports remote Browserbase sessions with automatic CAPTCHA solving, anti-bot stealth mode, and residential proxies — ideal for scraping protected websites, bypassing bot detection, and interacting with JavaScript-heavy pages. +compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." 
+license: MIT +allowed-tools: Bash +metadata: + capabilities: + - remote-browser + - stealth + - captcha-solving + - residential-proxies + - parallel-subagents + - cdp-interop + openclaw: + requires: + bins: + - browse + install: + - kind: node + package: "@browserbasehq/browse-cli" + bins: [browse] + homepage: https://github.com/browserbase/skills +--- + +# Browser Automation + +Automate browser interactions using the browse CLI with Claude. + +## Setup check + +Before running any browser commands, verify the CLI is available: + +```bash +which browse || npm install -g @browserbasehq/browse-cli +``` + +## Environment Selection (Local vs Remote) + +The CLI automatically selects between local and remote browser environments based on available configuration: + +### Local mode (default) +- Uses local Chrome — no API keys needed +- Best for: development, simple pages, trusted sites with no bot protection + +### Remote mode (Browserbase) +- Activated when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set +- Provides: anti-bot stealth, automatic CAPTCHA solving, residential proxies, session persistence +- **Use remote mode when:** the target site has bot detection, CAPTCHAs, IP rate limiting, Cloudflare protection, or requires geo-specific access +- Get credentials at https://browserbase.com/settings + +### When to choose which +- **Simple browsing** (docs, wikis, public APIs): local mode is fine +- **Protected sites** (login walls, CAPTCHAs, anti-scraping): use remote mode +- **If local mode fails** with bot detection or access denied: switch to remote mode + +## Commands + +All commands work identically in both modes. The daemon auto-starts on first command. 
+ +### Navigation +```bash +browse open <url> # Go to URL (alias: goto) +browse reload # Reload current page +browse back # Go back in history +browse forward # Go forward in history +``` + +### Page state (prefer snapshot over screenshot) +```bash +browse snapshot # Full accessibility tree with refs +browse snapshot -c -i --main-frame # Focused tree (recommended on large pages) +browse screenshot [path] # Take visual screenshot (slow, uses vision tokens) +browse get url # Get current URL +browse get title # Get page title +browse get text <selector> # Get text content (use "body" for all text) +browse get html <selector> # Get HTML content of element +browse get value <selector> # Get form field value +``` + +Use focused snapshots by default on complex pages: `browse snapshot -c -i --main-frame`. +Add `--contains "<text>"` and `--max-lines <n>` when output is large. +Only use `browse screenshot` when you need visual context (layout, images, debugging). + +### Interaction +```bash +browse click <ref> # Click element by ref from snapshot (e.g., @0-5) +browse type <text> # Type text into focused element +browse fill <selector> <value> # Fill input and press Enter +browse select <selector> <values...> # Select dropdown option(s) +browse press <key> # Press key (Enter, Tab, Escape, Cmd+A, etc.) +browse drag <fromX> <fromY> <toX> <toY> # Drag from one point to another +browse scroll <x> <y> <deltaX> <deltaY> # Scroll at coordinates +browse highlight <selector> # Highlight element on page +browse is visible <selector> # Check if element is visible +browse is checked <selector> # Check if element is checked +browse wait <type> [arg] # Wait for: load, selector, timeout +``` + +### Session management +```bash +browse stop # Stop the browser daemon +browse status # Check daemon status (includes env) +browse env # Show current environment (local or remote) +browse env local # Switch to local Chrome +browse env remote # Switch to Browserbase (requires API keys) +browse pages # List all open tabs +browse tab_switch <index> # Switch to tab by index +browse tab_close [index] # Close tab +``` + +### Typical workflow +1. `browse open <url>` — navigate to the page +2. 
`browse snapshot -c -i --main-frame` — get focused refs with less output +3. `browse click <ref>` / `browse type <text>` / `browse fill <selector> <value>` — interact using refs from snapshot +4. `browse snapshot -c --contains "<text>" --max-lines 200` — confirm state changes +5. Repeat 3-4 as needed +6. `browse stop` — close the browser when done + +## Quick Example + +```bash +browse open https://example.com +browse snapshot -c -i --main-frame # focused refs +browse click @0-5 # click element with ref 0-5 +browse get title +browse stop +``` + +## Mode Comparison + +| Feature | Local | Browserbase | +|---------|-------|-------------| +| Speed | Faster | Slightly slower | +| Setup | Chrome required | API key and project ID required | +| Stealth mode | No | Yes (custom Chromium, anti-bot fingerprinting) | +| CAPTCHA solving | No | Yes (automatic reCAPTCHA/hCaptcha) | +| Residential proxies | No | Yes (201 countries, geo-targeting) | +| Session persistence | No | Yes (cookies/auth persist across sessions) | +| Best for | Development/simple pages | Protected sites, bot detection, production scraping | + +## Best Practices + +1. **Always `browse open` first** before interacting +2. **Use focused snapshots** first — `browse snapshot -c -i --main-frame` +3. **Only screenshot when visual context is needed** (layout checks, images, debugging) +4. **Use refs from snapshot** to click/interact — e.g., `browse click @0-5` +5. **`browse stop`** when done to clean up the browser session +6. **For parallel work, use sub-agents** and assign one unit of work per agent +7. **Quote URLs with query params** — e.g. `browse open "https://site.com/path?a=1&b=2"` so the shell does not glob `?` or treat `&` as a background operator + +## Troubleshooting + +- **"No active page"**: Run `browse stop`, then check `browse status`. 
If it still says running, kill the zombie daemon with `pkill -f "browse.*daemon"`, then retry `browse open` +- **Chrome not found**: Install Chrome or use `browse env remote` +- **Action fails**: Run `browse snapshot` to see available elements and their refs +- **Browserbase fails**: Verify API key and project ID are set + +## Switching to Remote Mode + +Switch to remote when you detect: CAPTCHAs (reCAPTCHA, hCaptcha, Turnstile), bot detection pages ("Checking your browser..."), HTTP 403/429, empty pages on sites that should have content, or the user asks for it. + +Don't switch for simple sites (docs, wikis, public APIs, localhost). + +```bash +browse env remote # switch to Browserbase +browse env local # switch back to local Chrome +``` + +The switch is sticky until you run `browse stop` or switch again. If API keys aren't set: + +```bash +openclaw browserbase setup # interactive — prompts for API key + project ID +``` + +For detailed examples, see [EXAMPLES.md](EXAMPLES.md). +For API reference, see [REFERENCE.md](REFERENCE.md). diff --git a/skills/functions/LICENSE.txt b/skills/functions/LICENSE.txt new file mode 100644 index 0000000..f2f4397 --- /dev/null +++ b/skills/functions/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Browserbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/functions/REFERENCE.md b/skills/functions/REFERENCE.md new file mode 100644 index 0000000..37237e4 --- /dev/null +++ b/skills/functions/REFERENCE.md @@ -0,0 +1,151 @@ +# Browserbase Functions Reference + +## Table of Contents + +- [Invoking Deployed Functions](#invoking-deployed-functions) +- [Common Patterns](#common-patterns) +- [Troubleshooting](#troubleshooting) + +## Invoking Deployed Functions + +### Via curl + +```bash +# Start invocation +curl -X POST "https://api.browserbase.com/v1/functions/FUNCTION_ID/invoke" \ + -H "Content-Type: application/json" \ + -H "x-bb-api-key: $BROWSERBASE_API_KEY" \ + -d '{"params": {"url": "https://example.com"}}' + +# Response: {"id": "INVOCATION_ID"} + +# Poll for result +curl "https://api.browserbase.com/v1/functions/invocations/INVOCATION_ID" \ + -H "x-bb-api-key: $BROWSERBASE_API_KEY" +``` + +### Via Code + +```typescript +async function invokeFunction(functionId: string, params: object) { + // Start invocation + const invokeRes = await fetch( + `https://api.browserbase.com/v1/functions/${functionId}/invoke`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-bb-api-key': process.env.BROWSERBASE_API_KEY!, + }, + body: JSON.stringify({ params }), + } + ); + const { id: invocationId } = await invokeRes.json(); + + // Poll until complete + while (true) { + await new Promise(r => setTimeout(r, 5000)); + + const statusRes = await fetch( + `https://api.browserbase.com/v1/functions/invocations/${invocationId}`, + { 
headers: { 'x-bb-api-key': process.env.BROWSERBASE_API_KEY! } } + ); + const result = await statusRes.json(); + + if (result.status === 'COMPLETED') return result.results; + if (result.status === 'FAILED') throw new Error(result.error); + } +} +``` + +## Common Patterns + +### Parameterized Scraping + +```typescript +defineFn("scrape", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + await page.goto(params.url); + await page.waitForSelector(params.selector); + + const items = await page.$$eval(params.selector, els => + els.map(el => el.textContent?.trim()) + ); + + return { url: params.url, items }; +}); +``` + +### With Authentication + +```typescript +defineFn("authenticated-action", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + // Login + await page.goto("https://example.com/login"); + await page.fill('[name="email"]', params.email); + await page.fill('[name="password"]', params.password); + await page.click('button[type="submit"]'); + await page.waitForURL('**/dashboard'); + + // Do authenticated work + const data = await page.textContent('.user-data'); + return { data }; +}); +``` + +### Error Handling + +```typescript +defineFn("safe-scrape", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + try { + await page.goto(params.url, { timeout: 30000 }); + await page.waitForSelector(params.selector, { timeout: 10000 }); + + const data = await page.textContent(params.selector); + return { success: true, data }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? 
error.message : 'Unknown error' + }; + } +}); +``` + +## Troubleshooting + +### "Missing API key" +```bash +# Check .env file has credentials +cat .env + +# Or set for current shell +export BROWSERBASE_API_KEY="your_key" +export BROWSERBASE_PROJECT_ID="your_project" +``` + +### Dev server won't start +```bash +# Make sure SDK is installed +pnpm add @browserbasehq/sdk-functions + +# Or use npx +npx @browserbasehq/sdk-functions dev index.ts +``` + +### Function times out +- Max execution time is 15 minutes +- Add specific timeouts to page operations +- Use `waitForSelector` instead of sleep + +### Can't connect to browser +- Check `session.connectUrl` is being used correctly +- Ensure you're using `chromium.connectOverCDP()` not `chromium.launch()` diff --git a/skills/functions/SKILL.md b/skills/functions/SKILL.md index 9711cb6..4089b59 100644 --- a/skills/functions/SKILL.md +++ b/skills/functions/SKILL.md @@ -1,29 +1,17 @@ --- name: functions -description: Guide Claude through deploying serverless browser automation using the official bb CLI +description: "Deploy serverless browser automation as cloud functions using Browserbase. Use when the user wants to deploy browser automation to run on a schedule or cron, create a webhook endpoint for browser tasks, run automation in the cloud instead of locally, or asks about Browserbase Functions." +license: MIT --- -# Browserbase Functions Skill +# Browserbase Functions -Guide Claude through deploying serverless browser automation using the official `bb` CLI. - -## When to Use - -Use this skill when: -- User wants to deploy automation to run on a schedule -- User needs a webhook endpoint for browser automation -- User wants to run automation in the cloud (not locally) -- User asks about Browserbase Functions +Deploy serverless browser automation using the official `bb` CLI. ## Prerequisites -### 1. Get Credentials - Get API key and Project ID from: https://browserbase.com/settings -### 2. 
Set Environment Variables - -Set directly: ```bash export BROWSERBASE_API_KEY="your_api_key" export BROWSERBASE_PROJECT_ID="your_project_id" @@ -31,7 +19,7 @@ export BROWSERBASE_PROJECT_ID="your_project_id" ## Creating a Function Project -### 1. Initialize with Official CLI +### 1. Initialize ```bash pnpm dlx @browserbasehq/sdk-functions init my-function @@ -49,17 +37,10 @@ my-function/ ### 2. Add Credentials to .env ```bash -# Copy from stored credentials echo "BROWSERBASE_API_KEY=$BROWSERBASE_API_KEY" >> .env echo "BROWSERBASE_PROJECT_ID=$BROWSERBASE_PROJECT_ID" >> .env ``` -Or manually edit `.env`: -``` -BROWSERBASE_API_KEY=your_api_key -BROWSERBASE_PROJECT_ID=your_project_id -``` - ### 3. Install Dependencies ```bash @@ -74,15 +55,15 @@ import { chromium } from "playwright-core"; defineFn("my-function", async (context) => { const { session, params } = context; - + // Connect to browser const browser = await chromium.connectOverCDP(session.connectUrl); const page = browser.contexts()[0]!.pages()[0]!; - + // Your automation await page.goto(params.url || "https://example.com"); const title = await page.title(); - + // Return JSON-serializable result return { success: true, title }; }); @@ -116,8 +97,6 @@ The dev server auto-reloads on file changes. Use `console.log()` for debugging - ## Deploying -### Publish to Browserbase - ```bash pnpm bb publish index.ts ``` @@ -131,121 +110,7 @@ Function ID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx **Save the Function ID** - you need it to invoke. 
-## Invoking Deployed Functions - -### Via curl - -```bash -# Start invocation -curl -X POST "https://api.browserbase.com/v1/functions/FUNCTION_ID/invoke" \ - -H "Content-Type: application/json" \ - -H "x-bb-api-key: $BROWSERBASE_API_KEY" \ - -d '{"params": {"url": "https://example.com"}}' - -# Response: {"id": "INVOCATION_ID"} - -# Poll for result -curl "https://api.browserbase.com/v1/functions/invocations/INVOCATION_ID" \ - -H "x-bb-api-key: $BROWSERBASE_API_KEY" -``` - -### Via Code - -```typescript -async function invokeFunction(functionId: string, params: object) { - // Start invocation - const invokeRes = await fetch( - `https://api.browserbase.com/v1/functions/${functionId}/invoke`, - { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-bb-api-key': process.env.BROWSERBASE_API_KEY!, - }, - body: JSON.stringify({ params }), - } - ); - const { id: invocationId } = await invokeRes.json(); - - // Poll until complete - while (true) { - await new Promise(r => setTimeout(r, 5000)); - - const statusRes = await fetch( - `https://api.browserbase.com/v1/functions/invocations/${invocationId}`, - { headers: { 'x-bb-api-key': process.env.BROWSERBASE_API_KEY! 
} } - ); - const result = await statusRes.json(); - - if (result.status === 'COMPLETED') return result.results; - if (result.status === 'FAILED') throw new Error(result.error); - } -} -``` - -## Common Patterns - -### Parameterized Scraping - -```typescript -defineFn("scrape", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - await page.goto(params.url); - await page.waitForSelector(params.selector); - - const items = await page.$$eval(params.selector, els => - els.map(el => el.textContent?.trim()) - ); - - return { url: params.url, items }; -}); -``` - -### With Authentication - -```typescript -defineFn("authenticated-action", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - // Login - await page.goto("https://example.com/login"); - await page.fill('[name="email"]', params.email); - await page.fill('[name="password"]', params.password); - await page.click('button[type="submit"]'); - await page.waitForURL('**/dashboard'); - - // Do authenticated work - const data = await page.textContent('.user-data'); - return { data }; -}); -``` - -### Error Handling - -```typescript -defineFn("safe-scrape", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - try { - await page.goto(params.url, { timeout: 30000 }); - await page.waitForSelector(params.selector, { timeout: 10000 }); - - const data = await page.textContent(params.selector); - return { success: true, data }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? 
error.message : 'Unknown error' - }; - } -}); -``` - -## CLI Reference +## Quick Reference | Command | Description | |---------|-------------| @@ -253,32 +118,4 @@ defineFn("safe-scrape", async ({ session, params }) => { | `pnpm bb dev ` | Start local dev server | | `pnpm bb publish ` | Deploy to Browserbase | -## Troubleshooting - -### "Missing API key" -```bash -# Check .env file has credentials -cat .env - -# Or set for current shell -export BROWSERBASE_API_KEY="your_key" -export BROWSERBASE_PROJECT_ID="your_project" -``` - -### Dev server won't start -```bash -# Make sure SDK is installed -pnpm add @browserbasehq/sdk-functions - -# Or use npx -npx @browserbasehq/sdk-functions dev index.ts -``` - -### Function times out -- Max execution time is 15 minutes -- Add specific timeouts to page operations -- Use `waitForSelector` instead of sleep - -### Can't connect to browser -- Check `session.connectUrl` is being used correctly -- Ensure you're using `chromium.connectOverCDP()` not `chromium.launch()` +For invocation examples, common patterns, and troubleshooting, see [REFERENCE.md](REFERENCE.md).