From ab68dc712e47e5d73f944032ecd8fe22bee79594 Mon Sep 17 00:00:00 2001 From: openclaw Date: Sat, 21 Feb 2026 17:10:25 -0800 Subject: [PATCH 01/24] Use standard skill frontmatter for setup instead of setup.json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No agent (Claude Code, OpenCode, OpenClaw, Cursor, Windsurf) reads setup.json — only SKILL.md is loaded into context. The setup.json pattern was invisible to all agents and therefore broken everywhere. Replace with standard frontmatter fields: - requires.bins: [browser] — agents skip/suppress the skill if the binary isn't installed, rather than always injecting it and hoping the agent notices setupComplete: false - install.kind/pkg — agents that support auto-install can invoke npm install @browserbasehq/stagehand-cli automatically Also remove the ".env file" reference for credentials — the env vars are read from the environment, not specifically from a .env file. Co-Authored-By: Claude Sonnet 4.6 --- skills/browser-automation/SKILL.md | 18 ++++++-------- skills/browser-automation/setup.json | 35 ---------------------------- 2 files changed, 7 insertions(+), 46 deletions(-) delete mode 100644 skills/browser-automation/setup.json diff --git a/skills/browser-automation/SKILL.md b/skills/browser-automation/SKILL.md index f44cf40..6b5a396 100644 --- a/skills/browser-automation/SKILL.md +++ b/skills/browser-automation/SKILL.md @@ -1,6 +1,11 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. +requires: + bins: [browser] +install: + kind: node + pkg: "@browserbasehq/stagehand-cli" allowed-tools: Bash --- @@ -8,22 +13,13 @@ allowed-tools: Bash Automate browser interactions using Stagehand CLI with Claude. 
-### First: Environment Selection (Local vs Remote) +### Environment Selection (Local vs Remote) The skill automatically selects between local and remote browser environments: -- **If Browserbase API keys exist** (BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID in .env file): Uses remote Browserbase environment +- **If Browserbase API keys exist** (`BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`): Uses remote Browserbase environment - **If no Browserbase API keys**: Falls back to local Chrome browser - **No user prompting**: The selection happens automatically based on available configuration -## Setup (First Time Only) - -Check `setup.json` in this directory. If `setupComplete: false`: - -```bash -npm install # Install dependencies -npm link # Create global 'browser' command -``` - ## Commands All commands work identically in both modes: diff --git a/skills/browser-automation/setup.json b/skills/browser-automation/setup.json deleted file mode 100644 index d9ac900..0000000 --- a/skills/browser-automation/setup.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "setupComplete": false, - "prerequisites": { - "chrome": { - "required": true, - "installed": false, - "description": "Google Chrome browser" - }, - "dependencies": { - "required": true, - "installed": false, - "description": "Node.js dependencies (npm install completed)" - }, - "apiKey": { - "required": true, - "configured": false, - "description": "ANTHROPIC_API_KEY exported (i.e $ANTHROPIC_API_KEY) or in .env file" - }, - "browserCommand": { - "required": true, - "installed": false, - "description": "Browser CLI command globally linked (npm link)" - } - }, - "setupInstructions": [ - "1. Run: npm install (this will automatically build TypeScript)", - "2. Run: npm link (this creates the global 'browser' command)", - "3. 
(RECOMMENDED) Export ANTHROPIC_API_KEY: export ANTHROPIC_API_KEY='your-api-key-here' (check if already exported)", - " OR alternatively create .env file: cp .env.example .env and edit it to add your API key", - "4. Ensure Google Chrome is installed on your system", - "5. Test installation: browser navigate https://example.com", - "6. Update this setup.json file: set all 'installed'/'configured' to true and 'setupComplete' to true" - ], - "verifySetup": "Run 'browser navigate https://example.com' from any directory to verify installation" -} From abe5f592979cf73f141ff48e1c752f72cfa23dd1 Mon Sep 17 00:00:00 2001 From: openclaw Date: Sat, 21 Feb 2026 17:19:13 -0800 Subject: [PATCH 02/24] Fix: use compatibility field and inline setup check instead of invented frontmatter requires.bins and install.kind/pkg are not part of any agent skills spec. Replace with: - compatibility field (valid per agentskills.io spec) to surface requirements - inline `which browser || npm install` check in the skill body so the agent can self-heal without relying on non-standard frontmatter fields Co-Authored-By: Claude Sonnet 4.6 --- skills/browser-automation/SKILL.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/skills/browser-automation/SKILL.md b/skills/browser-automation/SKILL.md index 6b5a396..8d01a90 100644 --- a/skills/browser-automation/SKILL.md +++ b/skills/browser-automation/SKILL.md @@ -1,11 +1,7 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. -requires: - bins: [browser] -install: - kind: node - pkg: "@browserbasehq/stagehand-cli" +compatibility: Requires the Stagehand browser CLI (`npm install -g @browserbasehq/stagehand-cli`). 
Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise. allowed-tools: Bash --- @@ -13,6 +9,14 @@ allowed-tools: Bash Automate browser interactions using Stagehand CLI with Claude. +### Setup check + +Before running any browser commands, verify the CLI is available: + +```bash +which browser || npm install -g @browserbasehq/stagehand-cli +``` + ### Environment Selection (Local vs Remote) The skill automatically selects between local and remote browser environments: From cf762a8f43d72fb669de7a89ae6b84d6a388de70 Mon Sep 17 00:00:00 2001 From: openclaw Date: Sat, 21 Feb 2026 17:25:33 -0800 Subject: [PATCH 03/24] Rename browser-automation/ to browser/ to match skill name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The agentskills.io spec requires the directory name to match the name field in SKILL.md frontmatter. Since the skill is named "browser" (invoked as /browser), the directory should be browser/ not browser-automation/. Also fix inconsistent heading levels (### → ##). 
Co-Authored-By: Claude Sonnet 4.6 --- skills/{browser-automation => browser}/EXAMPLES.md | 0 skills/{browser-automation => browser}/REFERENCE.md | 0 skills/{browser-automation => browser}/SKILL.md | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename skills/{browser-automation => browser}/EXAMPLES.md (100%) rename skills/{browser-automation => browser}/REFERENCE.md (100%) rename skills/{browser-automation => browser}/SKILL.md (97%) diff --git a/skills/browser-automation/EXAMPLES.md b/skills/browser/EXAMPLES.md similarity index 100% rename from skills/browser-automation/EXAMPLES.md rename to skills/browser/EXAMPLES.md diff --git a/skills/browser-automation/REFERENCE.md b/skills/browser/REFERENCE.md similarity index 100% rename from skills/browser-automation/REFERENCE.md rename to skills/browser/REFERENCE.md diff --git a/skills/browser-automation/SKILL.md b/skills/browser/SKILL.md similarity index 97% rename from skills/browser-automation/SKILL.md rename to skills/browser/SKILL.md index 8d01a90..5af748a 100644 --- a/skills/browser-automation/SKILL.md +++ b/skills/browser/SKILL.md @@ -9,7 +9,7 @@ allowed-tools: Bash Automate browser interactions using Stagehand CLI with Claude. 
-### Setup check +## Setup check Before running any browser commands, verify the CLI is available: @@ -17,7 +17,7 @@ Before running any browser commands, verify the CLI is available: which browser || npm install -g @browserbasehq/stagehand-cli ``` -### Environment Selection (Local vs Remote) +## Environment Selection (Local vs Remote) The skill automatically selects between local and remote browser environments: - **If Browserbase API keys exist** (`BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`): Uses remote Browserbase environment From 44444770f0b9730fadf525f40a200328c0ba4aa7 Mon Sep 17 00:00:00 2001 From: openclaw Date: Sat, 21 Feb 2026 17:56:42 -0800 Subject: [PATCH 04/24] Align skills with Agent Skills spec (agentskills.io) Validated both skills against the skills-ref reference library and fixed all issues found: - Quote compatibility YAML value to fix strictyaml parse error - Rewrite functions description with trigger keywords (schedule, webhook, cloud, cron, Browserbase Functions) -- the spec requires triggers in the description, not in the body - Split functions SKILL.md into SKILL.md + REFERENCE.md for progressive disclosure (invocation examples, common patterns, troubleshooting) - Remove "When to Use" body section from functions (redundant with description, invisible during skill discovery) - Add license: Apache-2.0 and LICENSE.txt to both skills - Add table of contents to browser REFERENCE.md (535 lines) - Condense browser EXAMPLES.md from 8 repetitive examples to 4 diverse ones Co-Authored-By: Claude Opus 4.6 --- skills/browser/EXAMPLES.md | 317 +++++----------------------------- skills/browser/LICENSE.txt | 190 ++++++++++++++++++++ skills/browser/REFERENCE.md | 16 ++ skills/browser/SKILL.md | 3 +- skills/functions/LICENSE.txt | 190 ++++++++++++++++++++ skills/functions/REFERENCE.md | 151 ++++++++++++++++ skills/functions/SKILL.md | 183 ++------------------ 7 files changed, 599 insertions(+), 451 deletions(-) create mode 100644 
skills/browser/LICENSE.txt create mode 100644 skills/functions/LICENSE.txt create mode 100644 skills/functions/REFERENCE.md diff --git a/skills/browser/EXAMPLES.md b/skills/browser/EXAMPLES.md index abc35fb..f74bc9d 100644 --- a/skills/browser/EXAMPLES.md +++ b/skills/browser/EXAMPLES.md @@ -1,306 +1,69 @@ # Browser Automation Examples -This document provides detailed examples of common browser automation tasks using the CLI tool. +Common browser automation workflows using the CLI tool. Each example demonstrates a distinct pattern. -## Example 1: Extract Product Information from E-commerce +## Example 1: Extract Structured Data **User request**: "Go to example.com/product/123 and extract the product details" -**Workflow**: +```bash +browser navigate https://example.com/product/123 +browser extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number"}' +browser close +``` -1. **Navigate** to the product page: - ```bash - browser navigate https://example.com/product/123 - ``` - -2. **Extract** product data with schema: - ```bash - browser extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number", "reviewCount": "number"}' - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -**Expected result**: JSON object with product details that can be analyzed or stored. - ---- - -## Example 2: Fill Out and Submit a Contact Form +## Example 2: Fill and Submit a Form **User request**: "Fill out the contact form on example.com with my information" -**Workflow**: - -1. **Navigate** to contact page: - ```bash - browser navigate https://example.com/contact - ``` - -2. **Act**: Fill in name field: - ```bash - browser act "Fill in the name field with 'John Doe'" - ``` - -3. 
**Act**: Fill in email field: - ```bash - browser act "Fill in the email field with 'john.doe@example.com'" - ``` - -4. **Act**: Fill in message field: - ```bash - browser act "Fill in the message field with 'I would like to inquire about your services'" - ``` - -5. **Act**: Submit the form: - ```bash - browser act "Click the Submit button" - ``` - -6. **Screenshot** to capture confirmation: - ```bash - browser screenshot - ``` - -7. **Close** the browser: - ```bash - browser close - ``` - ---- - -## Example 3: Research and Summarize News Articles - -**User request**: "Check the latest tech news on techcrunch.com and summarize the top stories" - -**Workflow**: - -1. **Navigate** to news site: - ```bash - browser navigate https://techcrunch.com - ``` - -2. **Extract** article headlines and summaries: - ```bash - browser extract "Extract the top 5 article headlines and their summaries" '{"headlines": "string", "summary": "string", "author": "string", "publishedDate": "string"}' - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -4. Analyze and summarize the extracted data using Claude's text analysis capabilities. - ---- - -## Example 4: Login and Navigate Authenticated Area - -**User request**: "Log into example.com and navigate to my dashboard" - -**Workflow**: - -1. **Navigate** to login page: - ```bash - browser navigate https://example.com/login - ``` - -2. **Act**: Fill in username: - ```bash - browser act "Fill in the username field with 'myusername'" - ``` - -3. **Act**: Fill in password: - ```bash - browser act "Fill in the password field with 'mypassword'" - ``` - -4. **Act**: Click login button: - ```bash - browser act "Click the Login button" - ``` - -5. **Act**: Wait for page load: - ```bash - browser act "Wait for the page to fully load" - ``` - -6. **Navigate** to dashboard: - ```bash - browser navigate https://example.com/dashboard - ``` - -7. **Screenshot** the dashboard: - ```bash - browser screenshot - ``` - -8. 
**Close** the browser: - ```bash - browser close - ``` - -**Note**: This example uses Chrome's user profile (`.chrome-profile/`) which may preserve session cookies between runs. - ---- +```bash +browser navigate https://example.com/contact +browser act "Fill in the name field with 'John Doe'" +browser act "Fill in the email field with 'john.doe@example.com'" +browser act "Fill in the message field with 'I would like to inquire about your services'" +browser act "Click the Submit button" +browser screenshot +browser close +``` -## Example 5: Search and Collect Results - -**User request**: "Search Google for 'best TypeScript practices' and get the top 5 results" - -**Workflow**: - -1. **Navigate** to Google: - ```bash - browser navigate https://www.google.com - ``` - -2. **Act**: Perform search: - ```bash - browser act "Type 'best TypeScript practices' in the search box and press Enter" - ``` - -3. **Act**: Wait for results: - ```bash - browser act "Wait for search results to load" - ``` - -4. **Extract** search results: - ```bash - browser extract "Extract the top 5 search results" '{"title": "string", "url": "string", "snippet": "string"}' - ``` - -5. **Close** the browser: - ```bash - browser close - ``` - ---- - -## Example 6: Download a File - -**User request**: "Download the PDF file from example.com/documents/report.pdf" - -**Workflow**: - -1. **Navigate** to the file URL: - ```bash - browser navigate https://example.com/documents/report.pdf - ``` - -2. **Act**: Wait for download to start: - ```bash - browser act "Wait for 5 seconds for the download to complete" - ``` - -3. **Close** the browser: - ```bash - browser close - ``` - -**Note**: Files are automatically downloaded to `./agent/downloads/` directory due to CDP configuration. 
- ---- - -## Example 7: Debugging a Page Issue +## Example 3: Debug a Page Issue **User request**: "Check why the submit button isn't working on example.com/form" -**Workflow**: +This example shows how to combine `observe` and `screenshot` for page inspection. -1. **Navigate** to the form page: - ```bash - browser navigate https://example.com/form - ``` - -2. **Screenshot** initial state: - ```bash - browser screenshot - ``` - -3. **Observe** available elements: - ```bash - browser observe "Find all buttons and their states" - ``` - -4. **Observe** form fields: - ```bash - browser observe "Find all form input fields and their required status" - ``` - -5. **Act**: Try filling required fields: - ```bash - browser act "Fill in all required fields with test data" - ``` - -6. **Screenshot** after filling: - ```bash - browser screenshot - ``` - -7. **Observe** button state again: - ```bash - browser observe "Check if the submit button is now enabled" - ``` - -8. **Close** the browser: - ```bash - browser close - ``` +```bash +browser navigate https://example.com/form +browser screenshot +browser observe "Find all buttons and their states" +browser observe "Find all form input fields and their required status" +browser act "Fill in all required fields with test data" +browser screenshot +browser observe "Check if the submit button is now enabled" +browser close +``` Analyze the screenshots and observations to determine the issue. ---- - -## Example 8: Multi-Page Data Collection +## Example 4: Multi-Page Data Collection **User request**: "Extract product information from the first 3 pages of results on example.com/products" -**Workflow**: - -1. **Navigate** to products page: - ```bash - browser navigate https://example.com/products - ``` - -2. **Extract** products from page 1: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -3. 
**Act**: Click next page: - ```bash - browser act "Click the Next Page button" - ``` - -4. **Extract** products from page 2: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -5. **Act**: Click next page: - ```bash - browser act "Click the Next Page button" - ``` - -6. **Extract** products from page 3: - ```bash - browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' - ``` - -7. **Close** the browser: - ```bash - browser close - ``` +```bash +browser navigate https://example.com/products +browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browser act "Click the Next Page button" +browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browser act "Click the Next Page button" +browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browser close +``` Combine and process all extracted data. ---- - ## Tips for Success -- **Be specific with natural language**: "Click the blue Submit button in the footer" is better than "click submit". This is **extremely important** because there's much ambiguity in many websites. +- **Be specific with natural language**: "Click the blue Submit button in the footer" is better than "click submit". This is **extremely important** because there's much ambiguity in many websites. 
- **Wait when needed**: After navigation or actions that trigger page changes, explicitly wait - **Use observe for discovery**: When unsure what elements exist, use observe first - **Take screenshots for debugging**: Visual confirmation helps understand what the browser sees diff --git a/skills/browser/LICENSE.txt b/skills/browser/LICENSE.txt new file mode 100644 index 0000000..7778163 --- /dev/null +++ b/skills/browser/LICENSE.txt @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2025 Browserbase, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 7f3dff9..636628a 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -2,6 +2,22 @@ This document provides detailed technical reference for the CLI browser automation tool. +## Table of Contents + +- [Architecture Overview](#architecture-overview) +- [CLI Command Reference](#cli-command-reference) + - [navigate](#navigate) + - [act](#act) + - [extract](#extract) + - [observe](#observe) + - [screenshot](#screenshot) + - [close](#close) +- [Configuration Details](#configuration-details) +- [Error Messages Reference](#error-messages-reference) +- [Performance Considerations](#performance-considerations) +- [Security Considerations](#security-considerations) +- [Debugging Tips](#debugging-tips) + ## Architecture Overview The browser automation system consists of: diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 5af748a..89b45a1 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -1,7 +1,8 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. -compatibility: Requires the Stagehand browser CLI (`npm install -g @browserbasehq/stagehand-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise. +compatibility: "Requires the Stagehand browser CLI (`npm install -g @browserbasehq/stagehand-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." 
+license: Apache-2.0 allowed-tools: Bash --- diff --git a/skills/functions/LICENSE.txt b/skills/functions/LICENSE.txt new file mode 100644 index 0000000..7778163 --- /dev/null +++ b/skills/functions/LICENSE.txt @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, 
in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2025 Browserbase, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/skills/functions/REFERENCE.md b/skills/functions/REFERENCE.md new file mode 100644 index 0000000..37237e4 --- /dev/null +++ b/skills/functions/REFERENCE.md @@ -0,0 +1,151 @@ +# Browserbase Functions Reference + +## Table of Contents + +- [Invoking Deployed Functions](#invoking-deployed-functions) +- [Common Patterns](#common-patterns) +- [Troubleshooting](#troubleshooting) + +## Invoking Deployed Functions + +### Via curl + +```bash +# Start invocation +curl -X POST "https://api.browserbase.com/v1/functions/FUNCTION_ID/invoke" \ + -H "Content-Type: application/json" \ + -H "x-bb-api-key: $BROWSERBASE_API_KEY" \ + -d '{"params": {"url": "https://example.com"}}' + +# Response: {"id": "INVOCATION_ID"} + +# Poll for result +curl "https://api.browserbase.com/v1/functions/invocations/INVOCATION_ID" \ + -H "x-bb-api-key: $BROWSERBASE_API_KEY" +``` + +### Via Code + +```typescript +async function invokeFunction(functionId: string, params: object) { + // Start invocation + const invokeRes = await fetch( + `https://api.browserbase.com/v1/functions/${functionId}/invoke`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-bb-api-key': process.env.BROWSERBASE_API_KEY!, + }, + body: JSON.stringify({ params }), + } + ); + const { id: invocationId } = await invokeRes.json(); + + // Poll until complete + while (true) { + await new Promise(r => 
setTimeout(r, 5000)); + + const statusRes = await fetch( + `https://api.browserbase.com/v1/functions/invocations/${invocationId}`, + { headers: { 'x-bb-api-key': process.env.BROWSERBASE_API_KEY! } } + ); + const result = await statusRes.json(); + + if (result.status === 'COMPLETED') return result.results; + if (result.status === 'FAILED') throw new Error(result.error); + } +} +``` + +## Common Patterns + +### Parameterized Scraping + +```typescript +defineFn("scrape", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + await page.goto(params.url); + await page.waitForSelector(params.selector); + + const items = await page.$$eval(params.selector, els => + els.map(el => el.textContent?.trim()) + ); + + return { url: params.url, items }; +}); +``` + +### With Authentication + +```typescript +defineFn("authenticated-action", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + // Login + await page.goto("https://example.com/login"); + await page.fill('[name="email"]', params.email); + await page.fill('[name="password"]', params.password); + await page.click('button[type="submit"]'); + await page.waitForURL('**/dashboard'); + + // Do authenticated work + const data = await page.textContent('.user-data'); + return { data }; +}); +``` + +### Error Handling + +```typescript +defineFn("safe-scrape", async ({ session, params }) => { + const browser = await chromium.connectOverCDP(session.connectUrl); + const page = browser.contexts()[0]!.pages()[0]!; + + try { + await page.goto(params.url, { timeout: 30000 }); + await page.waitForSelector(params.selector, { timeout: 10000 }); + + const data = await page.textContent(params.selector); + return { success: true, data }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? 
error.message : 'Unknown error' + }; + } +}); +``` + +## Troubleshooting + +### "Missing API key" +```bash +# Check .env file has credentials +cat .env + +# Or set for current shell +export BROWSERBASE_API_KEY="your_key" +export BROWSERBASE_PROJECT_ID="your_project" +``` + +### Dev server won't start +```bash +# Make sure SDK is installed +pnpm add @browserbasehq/sdk-functions + +# Or use npx +npx @browserbasehq/sdk-functions dev index.ts +``` + +### Function times out +- Max execution time is 15 minutes +- Add specific timeouts to page operations +- Use `waitForSelector` instead of sleep + +### Can't connect to browser +- Check `session.connectUrl` is being used correctly +- Ensure you're using `chromium.connectOverCDP()` not `chromium.launch()` diff --git a/skills/functions/SKILL.md b/skills/functions/SKILL.md index 9711cb6..f83567c 100644 --- a/skills/functions/SKILL.md +++ b/skills/functions/SKILL.md @@ -1,29 +1,17 @@ --- name: functions -description: Guide Claude through deploying serverless browser automation using the official bb CLI +description: "Deploy serverless browser automation as cloud functions using Browserbase. Use when the user wants to deploy browser automation to run on a schedule or cron, create a webhook endpoint for browser tasks, run automation in the cloud instead of locally, or asks about Browserbase Functions." +license: Apache-2.0 --- -# Browserbase Functions Skill +# Browserbase Functions -Guide Claude through deploying serverless browser automation using the official `bb` CLI. - -## When to Use - -Use this skill when: -- User wants to deploy automation to run on a schedule -- User needs a webhook endpoint for browser automation -- User wants to run automation in the cloud (not locally) -- User asks about Browserbase Functions +Deploy serverless browser automation using the official `bb` CLI. ## Prerequisites -### 1. Get Credentials - Get API key and Project ID from: https://browserbase.com/settings -### 2. 
Set Environment Variables - -Set directly: ```bash export BROWSERBASE_API_KEY="your_api_key" export BROWSERBASE_PROJECT_ID="your_project_id" @@ -31,7 +19,7 @@ export BROWSERBASE_PROJECT_ID="your_project_id" ## Creating a Function Project -### 1. Initialize with Official CLI +### 1. Initialize ```bash pnpm dlx @browserbasehq/sdk-functions init my-function @@ -49,17 +37,10 @@ my-function/ ### 2. Add Credentials to .env ```bash -# Copy from stored credentials echo "BROWSERBASE_API_KEY=$BROWSERBASE_API_KEY" >> .env echo "BROWSERBASE_PROJECT_ID=$BROWSERBASE_PROJECT_ID" >> .env ``` -Or manually edit `.env`: -``` -BROWSERBASE_API_KEY=your_api_key -BROWSERBASE_PROJECT_ID=your_project_id -``` - ### 3. Install Dependencies ```bash @@ -74,15 +55,15 @@ import { chromium } from "playwright-core"; defineFn("my-function", async (context) => { const { session, params } = context; - + // Connect to browser const browser = await chromium.connectOverCDP(session.connectUrl); const page = browser.contexts()[0]!.pages()[0]!; - + // Your automation await page.goto(params.url || "https://example.com"); const title = await page.title(); - + // Return JSON-serializable result return { success: true, title }; }); @@ -116,8 +97,6 @@ The dev server auto-reloads on file changes. Use `console.log()` for debugging - ## Deploying -### Publish to Browserbase - ```bash pnpm bb publish index.ts ``` @@ -131,121 +110,7 @@ Function ID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx **Save the Function ID** - you need it to invoke. 
-## Invoking Deployed Functions - -### Via curl - -```bash -# Start invocation -curl -X POST "https://api.browserbase.com/v1/functions/FUNCTION_ID/invoke" \ - -H "Content-Type: application/json" \ - -H "x-bb-api-key: $BROWSERBASE_API_KEY" \ - -d '{"params": {"url": "https://example.com"}}' - -# Response: {"id": "INVOCATION_ID"} - -# Poll for result -curl "https://api.browserbase.com/v1/functions/invocations/INVOCATION_ID" \ - -H "x-bb-api-key: $BROWSERBASE_API_KEY" -``` - -### Via Code - -```typescript -async function invokeFunction(functionId: string, params: object) { - // Start invocation - const invokeRes = await fetch( - `https://api.browserbase.com/v1/functions/${functionId}/invoke`, - { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-bb-api-key': process.env.BROWSERBASE_API_KEY!, - }, - body: JSON.stringify({ params }), - } - ); - const { id: invocationId } = await invokeRes.json(); - - // Poll until complete - while (true) { - await new Promise(r => setTimeout(r, 5000)); - - const statusRes = await fetch( - `https://api.browserbase.com/v1/functions/invocations/${invocationId}`, - { headers: { 'x-bb-api-key': process.env.BROWSERBASE_API_KEY! 
} } - ); - const result = await statusRes.json(); - - if (result.status === 'COMPLETED') return result.results; - if (result.status === 'FAILED') throw new Error(result.error); - } -} -``` - -## Common Patterns - -### Parameterized Scraping - -```typescript -defineFn("scrape", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - await page.goto(params.url); - await page.waitForSelector(params.selector); - - const items = await page.$$eval(params.selector, els => - els.map(el => el.textContent?.trim()) - ); - - return { url: params.url, items }; -}); -``` - -### With Authentication - -```typescript -defineFn("authenticated-action", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - // Login - await page.goto("https://example.com/login"); - await page.fill('[name="email"]', params.email); - await page.fill('[name="password"]', params.password); - await page.click('button[type="submit"]'); - await page.waitForURL('**/dashboard'); - - // Do authenticated work - const data = await page.textContent('.user-data'); - return { data }; -}); -``` - -### Error Handling - -```typescript -defineFn("safe-scrape", async ({ session, params }) => { - const browser = await chromium.connectOverCDP(session.connectUrl); - const page = browser.contexts()[0]!.pages()[0]!; - - try { - await page.goto(params.url, { timeout: 30000 }); - await page.waitForSelector(params.selector, { timeout: 10000 }); - - const data = await page.textContent(params.selector); - return { success: true, data }; - } catch (error) { - return { - success: false, - error: error instanceof Error ? 
error.message : 'Unknown error' - }; - } -}); -``` - -## CLI Reference +## Quick Reference | Command | Description | |---------|-------------| @@ -253,32 +118,4 @@ defineFn("safe-scrape", async ({ session, params }) => { | `pnpm bb dev ` | Start local dev server | | `pnpm bb publish ` | Deploy to Browserbase | -## Troubleshooting - -### "Missing API key" -```bash -# Check .env file has credentials -cat .env - -# Or set for current shell -export BROWSERBASE_API_KEY="your_key" -export BROWSERBASE_PROJECT_ID="your_project" -``` - -### Dev server won't start -```bash -# Make sure SDK is installed -pnpm add @browserbasehq/sdk-functions - -# Or use npx -npx @browserbasehq/sdk-functions dev index.ts -``` - -### Function times out -- Max execution time is 15 minutes -- Add specific timeouts to page operations -- Use `waitForSelector` instead of sleep - -### Can't connect to browser -- Check `session.connectUrl` is being used correctly -- Ensure you're using `chromium.connectOverCDP()` not `chromium.launch()` +For invocation examples, common patterns, and troubleshooting, see [REFERENCE.md](REFERENCE.md). From 2764231395af63bea76995c74ff95dbfe26e8542 Mon Sep 17 00:00:00 2001 From: openclaw Date: Sat, 21 Feb 2026 17:58:14 -0800 Subject: [PATCH 05/24] Switch skill license from Apache-2.0 to MIT Co-Authored-By: Claude Opus 4.6 --- skills/browser/LICENSE.txt | 211 ++++------------------------------- skills/browser/SKILL.md | 2 +- skills/functions/LICENSE.txt | 211 ++++------------------------------- skills/functions/SKILL.md | 2 +- 4 files changed, 44 insertions(+), 382 deletions(-) diff --git a/skills/browser/LICENSE.txt b/skills/browser/LICENSE.txt index 7778163..40a87cf 100644 --- a/skills/browser/LICENSE.txt +++ b/skills/browser/LICENSE.txt @@ -1,190 +1,21 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to the Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by the Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding any notices that do not - pertain to any part of the Derivative Works, in at least one - of the 
following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - Copyright 2025 Browserbase, Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +MIT License + +Copyright (c) 2025 Browserbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 89b45a1..5a41d39 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -2,7 +2,7 @@ name: browser description: Automate web browser interactions using natural language via CLI commands. 
Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. compatibility: "Requires the Stagehand browser CLI (`npm install -g @browserbasehq/stagehand-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." -license: Apache-2.0 +license: MIT allowed-tools: Bash --- diff --git a/skills/functions/LICENSE.txt b/skills/functions/LICENSE.txt index 7778163..40a87cf 100644 --- a/skills/functions/LICENSE.txt +++ b/skills/functions/LICENSE.txt @@ -1,190 +1,21 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to the Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by the Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding any notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - Copyright 2025 Browserbase, Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
+MIT License + +Copyright (c) 2025 Browserbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/functions/SKILL.md b/skills/functions/SKILL.md index f83567c..4089b59 100644 --- a/skills/functions/SKILL.md +++ b/skills/functions/SKILL.md @@ -1,7 +1,7 @@ --- name: functions description: "Deploy serverless browser automation as cloud functions using Browserbase. Use when the user wants to deploy browser automation to run on a schedule or cron, create a webhook endpoint for browser tasks, run automation in the cloud instead of locally, or asks about Browserbase Functions." 
-license: Apache-2.0 +license: MIT --- # Browserbase Functions From 5ea80521ef9936b5570c46263fbe09c73d753608 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Mon, 23 Feb 2026 13:35:50 -0800 Subject: [PATCH 06/24] Address review feedback: fix copyright year and package name - Update copyright year from 2025 to 2026 in both LICENSE.txt files - Fix package name from @browserbasehq/stagehand-cli (doesn't exist) to @browserbasehq/browse-cli (actual npm package) - Update CLI command from `browser` to `browse` to match the npm package binary name across SKILL.md, REFERENCE.md, and EXAMPLES.md Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/EXAMPLES.md | 50 +++++++++++++++---------------- skills/browser/LICENSE.txt | 2 +- skills/browser/REFERENCE.md | 58 ++++++++++++++++++------------------ skills/browser/SKILL.md | 28 ++++++++--------- skills/functions/LICENSE.txt | 2 +- 5 files changed, 70 insertions(+), 70 deletions(-) diff --git a/skills/browser/EXAMPLES.md b/skills/browser/EXAMPLES.md index f74bc9d..bd0f71e 100644 --- a/skills/browser/EXAMPLES.md +++ b/skills/browser/EXAMPLES.md @@ -7,9 +7,9 @@ Common browser automation workflows using the CLI tool. 
Each example demonstrate **User request**: "Go to example.com/product/123 and extract the product details" ```bash -browser navigate https://example.com/product/123 -browser extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number"}' -browser close +browse navigate https://example.com/product/123 +browse extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number"}' +browse close ``` ## Example 2: Fill and Submit a Form @@ -17,13 +17,13 @@ browser close **User request**: "Fill out the contact form on example.com with my information" ```bash -browser navigate https://example.com/contact -browser act "Fill in the name field with 'John Doe'" -browser act "Fill in the email field with 'john.doe@example.com'" -browser act "Fill in the message field with 'I would like to inquire about your services'" -browser act "Click the Submit button" -browser screenshot -browser close +browse navigate https://example.com/contact +browse act "Fill in the name field with 'John Doe'" +browse act "Fill in the email field with 'john.doe@example.com'" +browse act "Fill in the message field with 'I would like to inquire about your services'" +browse act "Click the Submit button" +browse screenshot +browse close ``` ## Example 3: Debug a Page Issue @@ -33,14 +33,14 @@ browser close This example shows how to combine `observe` and `screenshot` for page inspection. 
```bash -browser navigate https://example.com/form -browser screenshot -browser observe "Find all buttons and their states" -browser observe "Find all form input fields and their required status" -browser act "Fill in all required fields with test data" -browser screenshot -browser observe "Check if the submit button is now enabled" -browser close +browse navigate https://example.com/form +browse screenshot +browse observe "Find all buttons and their states" +browse observe "Find all form input fields and their required status" +browse act "Fill in all required fields with test data" +browse screenshot +browse observe "Check if the submit button is now enabled" +browse close ``` Analyze the screenshots and observations to determine the issue. @@ -50,13 +50,13 @@ Analyze the screenshots and observations to determine the issue. **User request**: "Extract product information from the first 3 pages of results on example.com/products" ```bash -browser navigate https://example.com/products -browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browser act "Click the Next Page button" -browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browser act "Click the Next Page button" -browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browser close +browse navigate https://example.com/products +browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browse act "Click the Next Page button" +browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browse act "Click the Next Page button" +browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' +browse close ``` Combine and process all extracted data. 
diff --git a/skills/browser/LICENSE.txt b/skills/browser/LICENSE.txt index 40a87cf..f2f4397 100644 --- a/skills/browser/LICENSE.txt +++ b/skills/browser/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 Browserbase, Inc. +Copyright (c) 2026 Browserbase, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 636628a..45bb343 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -41,7 +41,7 @@ Navigate to a URL in the browser. **Usage**: ```bash -browser navigate +browse navigate ``` **Parameters**: @@ -65,7 +65,7 @@ JSON output: **Example**: ```bash -browser navigate https://example.com +browse navigate https://example.com ``` **Error Handling**: @@ -81,7 +81,7 @@ Perform an action on the page using natural language. **Usage**: ```bash -browser act "" +browse act "" ``` **Parameters**: @@ -101,21 +101,21 @@ Note: Without specificity it might succeed on the wrong element! 
**Implementation Details**: - Uses Stagehand's `page.act()` which leverages Claude Haiku 4.5 -- AI model interprets natural language and executes corresponding browser actions +- AI model interprets natural language and executes corresponding browser actions - Supports: clicking, typing, selecting, scrolling, waiting, hovering, and more - Automatically handles element location and interaction - Automatically takes a screenshot after the action **Natural Language Examples**: ```bash -browser act "Click the login button" -browser act "Fill in email field with test@example.com" -browser act "Scroll to the bottom of the page" -browser act "Select 'California' from the state dropdown" -browser act "Hover over the menu icon" -browser act "Wait for 3 seconds" -browser act "Press the Enter key" -browser act "Double-click the file icon" +browse act "Click the login button" +browse act "Fill in email field with test@example.com" +browse act "Scroll to the bottom of the page" +browse act "Select 'California' from the state dropdown" +browse act "Hover over the menu icon" +browse act "Wait for 3 seconds" +browse act "Press the Enter key" +browse act "Double-click the file icon" ``` **Best Practices**: @@ -138,7 +138,7 @@ Extract structured data from the current page using a schema.
**Usage**: ```bash -browser extract "" '{"field": "type"}' +browse extract "" '{"field": "type"}' ``` **Parameters**: @@ -170,12 +170,12 @@ JSON output: **Schema Example**: ```bash -browser extract "Extract the product information" '{"productName": "string", "price": "number", "inStock": "boolean", "description": "string", "rating": "number"}' +browse extract "Extract the product information" '{"productName": "string", "price": "number", "inStock": "boolean", "description": "string", "rating": "number"}' ``` **Complex Extraction Example**: ```bash -browser extract "Extract all items from the shopping cart" '{"itemName": "string", "quantity": "number", "unitPrice": "number", "totalPrice": "number", "imageUrl": "string"}' +browse extract "Extract all items from the shopping cart" '{"itemName": "string", "quantity": "number", "unitPrice": "number", "totalPrice": "number", "imageUrl": "string"}' ``` **Best Practices**: @@ -198,7 +198,7 @@ Discover available actions on the page. **Usage**: ```bash -browser observe "" +browse observe "" ``` **Parameters**: @@ -228,12 +228,12 @@ JSON output: **Query Examples**: ```bash -browser observe "Find all buttons" -browser observe "Find clickable links in the navigation" -browser observe "Find form input fields" -browser observe "Find all submit buttons" -browser observe "Find elements with text 'Login'" -browser observe "Find all images" +browse observe "Find all buttons" +browse observe "Find clickable links in the navigation" +browse observe "Find form input fields" +browse observe "Find all submit buttons" +browse observe "Find elements with text 'Login'" +browse observe "Find all images" ``` **Use Cases**: @@ -255,7 +255,7 @@ Take a screenshot of the current page. **Usage**: ```bash -browser screenshot +browse screenshot ``` **Parameters**: None @@ -282,7 +282,7 @@ JSON output: **Example**: ```bash -browser screenshot +browse screenshot ``` **Image Processing**: @@ -310,7 +310,7 @@ Close the browser and cleanup resources. 
**Usage**: ```bash -browser close +browse close ``` **Parameters**: None @@ -533,9 +533,9 @@ open ./agent/browser_screenshots/screenshot-*.png Test individual commands: ```bash -browser navigate https://example.com -browser screenshot -browser close +browse navigate https://example.com +browse screenshot +browse close ``` --- @@ -543,7 +543,7 @@ browser close ## Version Information - **Stagehand**: Uses `@browserbasehq/stagehand` package v2.5.2+ -- **Model**: Claude Haiku 4.5 (claude-haiku-4-5-20251001) for browser actions +- **Model**: Claude Haiku 4.5 (claude-haiku-4-5-20251001) for browser actions - **CLI Tool**: TypeScript CLI in `src/cli.ts` - **Agent SDK**: `@anthropic-ai/claude-agent-sdk` for conversation framework - **Browser**: Local Chrome/Chromium installation diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 5a41d39..5bd434e 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -1,21 +1,21 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. -compatibility: "Requires the Stagehand browser CLI (`npm install -g @browserbasehq/stagehand-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." +compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." license: MIT allowed-tools: Bash --- # Browser Automation -Automate browser interactions using Stagehand CLI with Claude. +Automate browser interactions using the browse CLI with Claude.
## Setup check Before running any browser commands, verify the CLI is available: ```bash -which browser || npm install -g @browserbasehq/stagehand-cli +which browse || npm install -g @browserbasehq/browse-cli ``` ## Environment Selection (Local vs Remote) @@ -30,21 +30,21 @@ The skill automatically selects between local and remote browser environments: All commands work identically in both modes: ```bash -browser navigate # Go to URL -browser act "" # Natural language action -browser extract "" ['{}'] # Extract data (optional schema) -browser observe "" # Discover elements -browser screenshot # Take screenshot -browser close # Close browser +browse navigate # Go to URL +browse act "" # Natural language action +browse extract "" ['{}'] # Extract data (optional schema) +browse observe "" # Discover elements +browse screenshot # Take screenshot +browse close # Close browser ``` ## Quick Example ```bash -browser navigate https://example.com -browser act "click the Sign In button" -browser extract "get the page title" -browser close +browse navigate https://example.com +browse act "click the Sign In button" +browse extract "get the page title" +browse close ``` ## Mode Comparison @@ -67,7 +67,7 @@ browser close ## Troubleshooting - **Chrome not found**: Install Chrome or use Browserbase mode -- **Action fails**: Use `browser observe` to discover available elements +- **Action fails**: Use `browse observe` to discover available elements - **Browserbase fails**: Verify API key and project ID are set For detailed examples, see [EXAMPLES.md](EXAMPLES.md). diff --git a/skills/functions/LICENSE.txt b/skills/functions/LICENSE.txt index 40a87cf..f2f4397 100644 --- a/skills/functions/LICENSE.txt +++ b/skills/functions/LICENSE.txt @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2025 Browserbase, Inc. +Copyright (c) 2026 Browserbase, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From e75e7f89b46043bdaae14c07c0c30ba2be111233 Mon Sep 17 00:00:00 2001 From: openclaw Date: Mon, 23 Feb 2026 18:27:46 -0800 Subject: [PATCH 07/24] Improve skill description for agent discovery Add anti-bot stealth, CAPTCHA solving, residential proxy, and session persistence details to the browser skill description and mode comparison table. These trigger phrases help AI agents discover and select this skill when users need to interact with protected or JavaScript-heavy websites. Co-Authored-By: Claude Opus 4.6 --- skills/browser/SKILL.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 5bd434e..4d5a481 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -1,6 +1,6 @@ --- name: browser -description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. +description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. Supports remote Browserbase sessions with automatic CAPTCHA solving, anti-bot stealth mode, and residential proxies — ideal for scraping protected websites, bypassing bot detection, and interacting with JavaScript-heavy pages. compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." 
license: MIT allowed-tools: Bash @@ -53,9 +53,11 @@ browse close |---------|-------|-------------| | Speed | Faster | Slightly slower | | Setup | Chrome required | API key required | -| Stealth mode | No | Yes | -| Proxy/CAPTCHA | No | Yes | -| Best for | Development | Production/scraping | +| Stealth mode | No | Yes (custom Chromium, anti-bot fingerprinting) | +| CAPTCHA solving | No | Yes (automatic reCAPTCHA/hCaptcha) | +| Residential proxies | No | Yes (201 countries, geo-targeting) | +| Session persistence | No | Yes (cookies/auth persist across sessions) | +| Best for | Development/simple pages | Protected sites, bot detection, production scraping | ## Best Practices From 5965ecaf3085ead08c1be5d892fca7a84f924e35 Mon Sep 17 00:00:00 2001 From: openclaw Date: Mon, 23 Feb 2026 18:58:52 -0800 Subject: [PATCH 08/24] Add metadata.openclaw requires + improve local vs remote guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add requires.bins (browse CLI) and install spec so OpenClaw can gate the skill properly and auto-install the CLI - Add homepage for ClawHub trust score - Do NOT gate on env vars — local mode works without Browserbase keys - Rewrite "Environment Selection" section with clear guidance on when to use local mode (simple pages) vs remote mode (protected sites, CAPTCHAs, bot detection, Cloudflare, geo-restricted content) Co-Authored-By: Claude Opus 4.6 --- skills/browser/SKILL.md | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 4d5a481..e264510 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -4,6 +4,16 @@ description: Automate web browser interactions using natural language via CLI co compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). 
Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." license: MIT allowed-tools: Bash +metadata: + openclaw: + requires: + bins: + - browse + install: + - kind: node + package: "@browserbasehq/browse-cli" + bins: [browse] + homepage: https://github.com/browserbase/skills --- # Browser Automation @@ -20,10 +30,22 @@ which browse || npm install -g @browserbasehq/browse-cli ## Environment Selection (Local vs Remote) -The skill automatically selects between local and remote browser environments: -- **If Browserbase API keys exist** (`BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`): Uses remote Browserbase environment -- **If no Browserbase API keys**: Falls back to local Chrome browser -- **No user prompting**: The selection happens automatically based on available configuration +The CLI automatically selects between local and remote browser environments based on available configuration: + +### Local mode (default) +- Uses local Chrome — no API keys needed +- Best for: development, simple pages, trusted sites with no bot protection + +### Remote mode (Browserbase) +- Activated when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set +- Provides: anti-bot stealth, automatic CAPTCHA solving, residential proxies, session persistence +- **Use remote mode when:** the target site has bot detection, CAPTCHAs, IP rate limiting, Cloudflare protection, or requires geo-specific access +- Get credentials at https://browserbase.com/settings + +### When to choose which +- **Simple browsing** (docs, wikis, public APIs): local mode is fine +- **Protected sites** (login walls, CAPTCHAs, anti-scraping): use remote mode +- **If local mode fails** with bot detection or access denied: switch to remote mode ## Commands From 2cda0fad0e1d323c9af2a5407f0f4f9f12837e23 Mon Sep 17 00:00:00 2001 From: shrey150 Date: Tue, 24 Feb 2026 00:59:16 -0800 Subject: [PATCH 09/24] Guide agent toward snapshot-first 
workflow and high-level commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Session logs show the agent screenshots after every action (expensive, slow) and ignores browse act/observe in favor of manual snapshot → click ref loops. Update the skill to: - Document browse snapshot and recommend it as default over screenshot - Add guidance on when to use snapshot vs screenshot - Steer toward browse act/observe over low-level ref-based commands - Rewrite best practices to reflect snapshot-first, act-first workflow Co-Authored-By: Claude Opus 4.6 --- skills/browser/SKILL.md | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index e264510..8e6d8ed 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -53,13 +53,26 @@ All commands work identically in both modes: ```bash browse navigate # Go to URL -browse act "" # Natural language action -browse extract "" ['{}'] # Extract data (optional schema) -browse observe "" # Discover elements -browse screenshot # Take screenshot +browse act "" # Natural language action (click, type, scroll, etc.) +browse extract "" ['{}'] # Extract structured data (optional JSON schema) +browse observe "" # Discover interactive elements on the page +browse snapshot # Get page accessibility tree (fast, structured) +browse screenshot # Take visual screenshot (slow, uses vision tokens) browse close # Close browser ``` +### Choosing between snapshot and screenshot + +- **Use `browse snapshot` as your default** for understanding page state. It returns the accessibility tree with element refs — fast, structured, and gives you everything needed to find and interact with elements. +- **Use `browse screenshot` only when you need visual context** — verifying layout rendered correctly, reading images/charts, or debugging why an action didn't work as expected. 
+- **Do NOT screenshot after every action.** Screenshots are expensive (vision tokens) and slow. Use snapshot to confirm state changes. + +### Choosing between act/observe and low-level commands + +- **Prefer `browse act`** for interactions — it uses natural language so you don't need to find element refs first. Example: `browse act "click the Sign In button"` instead of snapshot → find ref → click ref. +- **Use `browse observe`** when you need to discover what interactive elements exist on the page before deciding what to do. +- **Fall back to `browse snapshot` + ref-based commands** only if `act`/`observe` fail to find the right element. + ## Quick Example ```bash @@ -83,10 +96,12 @@ browse close ## Best Practices -1. **Always navigate first** before interacting -2. **View screenshots** after each command to verify -3. **Be specific** in action descriptions -4. **Close browser** when done +1. **Always `browse navigate` first** before interacting +2. **Use `browse snapshot`** (not screenshot) to check page state after actions +3. **Use `browse act`** for interactions — describe what you want in natural language +4. **Only screenshot when visual context is needed** (layout checks, images, debugging) +5. **Be specific** in action descriptions — "click the blue Submit button" not "click submit" +6. **Close browser** when done ## Troubleshooting From 1ada63a384e2a896d4e4ee21ab48f73a5fa6bff6 Mon Sep 17 00:00:00 2001 From: shrey150 Date: Tue, 24 Feb 2026 01:05:11 -0800 Subject: [PATCH 10/24] Rewrite commands section to match actual browse CLI The SKILL.md documented commands (navigate, act, extract, observe) that don't exist in the CLI. The actual commands are open, click, type, fill, snapshot, etc. This caused agents to run nonexistent commands and fall back to guessing. 
- Replace all command docs with actual CLI syntax from browse --help - Document snapshot-first workflow with element refs - Add session management commands (stop, status, pages, tab_switch) - Add "No active page" to troubleshooting - Fix quick example to use real commands Co-Authored-By: Claude Opus 4.6 --- skills/browser/SKILL.md | 81 +++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 8e6d8ed..84d1099 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -49,37 +49,64 @@ The CLI automatically selects between local and remote browser environments base ## Commands -All commands work identically in both modes: +All commands work identically in both modes. The daemon auto-starts on first command. +### Navigation ```bash -browse navigate # Go to URL -browse act "" # Natural language action (click, type, scroll, etc.) -browse extract "" ['{}'] # Extract structured data (optional JSON schema) -browse observe "" # Discover interactive elements on the page -browse snapshot # Get page accessibility tree (fast, structured) -browse screenshot # Take visual screenshot (slow, uses vision tokens) -browse close # Close browser +browse open # Go to URL (aliases: goto) +browse reload # Reload current page +browse back # Go back in history +browse forward # Go forward in history ``` -### Choosing between snapshot and screenshot +### Page state (prefer snapshot over screenshot) +```bash +browse snapshot # Get accessibility tree with element refs (fast, structured) +browse screenshot [path] # Take visual screenshot (slow, uses vision tokens) +browse get url # Get current URL +browse get title # Get page title +browse get text [selector] # Get text content +browse get html [selector] # Get HTML content +``` -- **Use `browse snapshot` as your default** for understanding page state. 
It returns the accessibility tree with element refs — fast, structured, and gives you everything needed to find and interact with elements. -- **Use `browse screenshot` only when you need visual context** — verifying layout rendered correctly, reading images/charts, or debugging why an action didn't work as expected. -- **Do NOT screenshot after every action.** Screenshots are expensive (vision tokens) and slow. Use snapshot to confirm state changes. +Use `browse snapshot` as your default for understanding page state — it returns the accessibility tree with element refs you can use to interact. Only use `browse screenshot` when you need visual context (layout, images, debugging). -### Choosing between act/observe and low-level commands +### Interaction +```bash +browse click # Click element by ref from snapshot (e.g., @0-5) +browse type # Type text into focused element +browse fill # Fill input and press Enter +browse select # Select dropdown option(s) +browse press # Press key (Enter, Tab, Escape, Cmd+A, etc.) +browse scroll # Scroll at coordinates +browse wait [arg] # Wait for: load, selector, timeout +``` + +### Session management +```bash +browse stop # Stop the browser daemon +browse status # Check daemon status +browse pages # List all open tabs +browse tab_switch # Switch to tab by index +browse tab_close [index] # Close tab +``` -- **Prefer `browse act`** for interactions — it uses natural language so you don't need to find element refs first. Example: `browse act "click the Sign In button"` instead of snapshot → find ref → click ref. -- **Use `browse observe`** when you need to discover what interactive elements exist on the page before deciding what to do. -- **Fall back to `browse snapshot` + ref-based commands** only if `act`/`observe` fail to find the right element. +### Typical workflow +1. `browse open ` — navigate to the page +2. `browse snapshot` — read the accessibility tree to understand page structure and get element refs +3. 
`browse click <ref>` / `browse type <text>` / `browse fill <selector> <value>` — interact using refs from snapshot +4. `browse snapshot` — confirm the action worked +5. Repeat 3-4 as needed +6. `browse stop` — close the browser when done ## Quick Example ```bash -browse navigate https://example.com -browse act "click the Sign In button" -browse extract "get the page title" -browse close +browse open https://example.com +browse snapshot # see page structure + element refs +browse click @0-5 # click element with ref 0-5 +browse get title +browse stop ``` ## Mode Comparison @@ -96,17 +123,17 @@ browse close ## Best Practices -1. **Always `browse navigate` first** before interacting -2. **Use `browse snapshot`** (not screenshot) to check page state after actions -3. **Use `browse act`** for interactions — describe what you want in natural language -4. **Only screenshot when visual context is needed** (layout checks, images, debugging) -5. **Be specific** in action descriptions — "click the blue Submit button" not "click submit" -6. **Close browser** when done +1. **Always `browse open` first** before interacting +2. **Use `browse snapshot`** to check page state — it's fast and gives you element refs +3. **Only screenshot when visual context is needed** (layout checks, images, debugging) +4. **Use refs from snapshot** to click/interact — e.g., `browse click @0-5` +5. **`browse stop`** when done to clean up the browser session ## Troubleshooting +- **"No active page"**: Run `browse stop` then retry your `browse open` command - **Chrome not found**: Install Chrome or use Browserbase mode - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set For detailed examples, see [EXAMPLES.md](EXAMPLES.md). 
From f63168fa8453de040f05a8ae3773c040d8ae9e3b Mon Sep 17 00:00:00 2001 From: shrey150 Date: Tue, 24 Feb 2026 01:11:31 -0800 Subject: [PATCH 11/24] Fix troubleshooting for zombie daemon "No active page" error browse stop doesn't always kill the daemon process. Add pkill fallback for when the daemon is stuck with wsUrl: "unknown" after a SIGTERM. Co-Authored-By: Claude Opus 4.6 --- skills/browser/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 84d1099..94b7751 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -131,7 +131,7 @@ browse stop ## Troubleshooting -- **"No active page"**: Run `browse stop` then retry your `browse open` command +- **"No active page"**: Run `browse stop`, then check `browse status`. If it still says running, kill the zombie daemon with `pkill -f "browse.*daemon"`, then retry `browse open` - **Chrome not found**: Install Chrome or use Browserbase mode - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set From 3fb2f490f22a6b11c0afba422b920ea395cc1ef3 Mon Sep 17 00:00:00 2001 From: shrey150 Date: Tue, 24 Feb 2026 04:18:02 -0800 Subject: [PATCH 12/24] Fix stale browser-automation references; rewrite skill docs with real CLI commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename browser-automation → browser in README.md and marketplace.json - Rewrite EXAMPLES.md: replace nonexistent commands (navigate, act, extract, observe, close) with real browse CLI commands (open, snapshot, click, type, fill, get, stop). 4 concrete examples including remote mode escalation. - Rewrite REFERENCE.md: replace Stagehand/Playwright architecture with actual daemon-based CLI docs, all 20 real commands, env var config. 
- SKILL.md: add "Activating Remote Mode" progressive disclosure section, fix get text to require selector argument. Co-Authored-By: Claude Opus 4.6 --- .claude-plugin/marketplace.json | 2 +- README.md | 2 +- skills/browser/EXAMPLES.md | 130 ++++--- skills/browser/REFERENCE.md | 584 +++++++++----------------------- skills/browser/SKILL.md | 57 +++- 5 files changed, 306 insertions(+), 469 deletions(-) diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 0b6e95e..614f68f 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -21,7 +21,7 @@ "keywords": ["browser", "automation", "web-scraping", "stagehand", "screenshots"], "strict": false, "skills": [ - "./skills/browser-automation" + "./skills/browser" ] }, { diff --git a/README.md b/README.md index 6e4526b..a168b44 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This plugin includes the following skills (see `skills/` for details): | Skill | Description | |-------|-------------| -| [browser-automation](skills/browser-automation/SKILL.md) | Browser control using natural language commands (navigate, click, extract data, screenshot) | +| [browser](skills/browser/SKILL.md) | Automate web browser interactions via CLI commands — supports remote Browserbase sessions with anti-bot stealth, CAPTCHA solving, and residential proxies | | [functions](skills/functions/SKILL.md) | Deploy serverless browser automation to Browserbase cloud using the `bb` CLI | ## Installation diff --git a/skills/browser/EXAMPLES.md b/skills/browser/EXAMPLES.md index bd0f71e..56dd53c 100644 --- a/skills/browser/EXAMPLES.md +++ b/skills/browser/EXAMPLES.md @@ -1,71 +1,117 @@ # Browser Automation Examples -Common browser automation workflows using the CLI tool. Each example demonstrates a distinct pattern. +Common browser automation workflows using the `browse` CLI. Each example demonstrates a distinct pattern using real commands. 
-## Example 1: Extract Structured Data +## Example 1: Extract Data from a Page -**User request**: "Go to example.com/product/123 and extract the product details" +**User request**: "Get the product details from example.com/product/123" ```bash -browse navigate https://example.com/product/123 -browse extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number"}' -browse close +browse open https://example.com/product/123 +browse snapshot # read page structure + element refs +browse get text "body" # extract all visible text content +browse stop ``` +Parse the text output to extract structured data (name, price, description, etc.). + +For a specific section, use a CSS selector: + +```bash +browse get text ".product-details" # text from a specific container +``` + +**Note**: `browse get text` requires a CSS selector — use `"body"` for all page text. + ## Example 2: Fill and Submit a Form **User request**: "Fill out the contact form on example.com with my information" ```bash -browse navigate https://example.com/contact -browse act "Fill in the name field with 'John Doe'" -browse act "Fill in the email field with 'john.doe@example.com'" -browse act "Fill in the message field with 'I would like to inquire about your services'" -browse act "Click the Submit button" -browse screenshot -browse close +browse open https://example.com/contact +browse snapshot # find form fields and their refs +browse click @0-3 # click the Name input (ref from snapshot) +browse type "John Doe" +browse press Tab # move to next field +browse type "john@example.com" +browse fill "#message" "I would like to inquire about your services" +browse snapshot # verify fields are filled +browse click @0-8 # click Submit button (ref from snapshot) +browse snapshot # confirm submission result +browse stop +``` + +**Key pattern**: Use `browse snapshot` before interacting to discover element refs, then `browse click ` 
and `browse type` to interact. + +## Example 3: Multi-Step Navigation + +**User request**: "Get headlines from the first 3 pages of results on example.com/news" + +```bash +browse open https://example.com/news +browse snapshot # read page 1 content +browse get text ".headline" # extract headlines + +browse snapshot # find "Next" button ref +browse click @0-12 # click Next (ref from snapshot) +browse wait load # wait for page 2 to load +browse get text ".headline" # extract page 2 headlines + +browse snapshot # find Next again (ref may change) +browse click @0-15 # click Next +browse wait load +browse get text ".headline" # extract page 3 headlines + +browse stop ``` -## Example 3: Debug a Page Issue +**Key pattern**: Re-run `browse snapshot` after each navigation because element refs change when the page updates. -**User request**: "Check why the submit button isn't working on example.com/form" +## Example 4: Escalate to Remote Mode -This example shows how to combine `observe` and `screenshot` for page inspection. +**User request**: "Scrape pricing from competitor.com" (a site with Cloudflare protection) ```bash -browse navigate https://example.com/form -browse screenshot -browse observe "Find all buttons and their states" -browse observe "Find all form input fields and their required status" -browse act "Fill in all required fields with test data" -browse screenshot -browse observe "Check if the submit button is now enabled" -browse close +# Attempt 1: local mode +browse open https://competitor.com/pricing +browse snapshot +# Output shows: "Checking your browser..." (Cloudflare interstitial) +# or: page content is empty / access denied +browse stop ``` -Analyze the screenshots and observations to determine the issue. +The agent detects bot protection and tells the user: -## Example 4: Multi-Page Data Collection +> This site has Cloudflare bot detection. Browserbase remote mode can bypass this with anti-bot stealth and residential proxies. Want me to set it up? 
-**User request**: "Extract product information from the first 3 pages of results on example.com/products" +If the user agrees: ```bash -browse navigate https://example.com/products -browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browse act "Click the Next Page button" -browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browse act "Click the Next Page button" -browse extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}' -browse close +# Set up Browserbase credentials +openclaw browserbase setup +# User enters API key + project ID interactively + +# Retry — credentials are now in the environment +browse open https://competitor.com/pricing +browse snapshot # full page content now accessible +browse get text ".pricing-table" +browse stop ``` -Combine and process all extracted data. +If the env vars aren't visible yet (setup was run outside OpenClaw): + +```bash +eval "$(openclaw browserbase env --format shell)" && browse open https://competitor.com/pricing +browse snapshot +browse get text ".pricing-table" +browse stop +``` -## Tips for Success +## Tips -- **Be specific with natural language**: "Click the blue Submit button in the footer" is better than "click submit". This is **extremely important** because there's much ambiguity in many websites. 
-- **Wait when needed**: After navigation or actions that trigger page changes, explicitly wait -- **Use observe for discovery**: When unsure what elements exist, use observe first -- **Take screenshots for debugging**: Visual confirmation helps understand what the browser sees -- **Handle errors gracefully**: If an action fails, try breaking it into smaller steps -- **Clean up resources**: Always close the browser when done to free up system resources +- **Snapshot first**: Always run `browse snapshot` before interacting — it gives you the accessibility tree with element refs +- **Use refs to click**: `browse click @0-5` is more reliable than trying to describe elements +- **Re-snapshot after actions**: Element refs change when the page updates +- **`get text` for data extraction**: Use `browse get text [selector]` to pull text content from specific elements +- **`stop` when done**: Always `browse stop` to clean up the browser session +- **Prefer snapshot over screenshot**: Snapshot is fast and structured; screenshot is slow and uses vision tokens. Only screenshot when you need visual context (layout, images, debugging) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 45bb343..d4ff486 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -1,551 +1,287 @@ # Browser Automation CLI Reference -This document provides detailed technical reference for the CLI browser automation tool. +Technical reference for the `browse` CLI tool. 
-## Table of Contents +## Architecture -- [Architecture Overview](#architecture-overview) -- [CLI Command Reference](#cli-command-reference) - - [navigate](#navigate) - - [act](#act) - - [extract](#extract) - - [observe](#observe) - - [screenshot](#screenshot) - - [close](#close) -- [Configuration Details](#configuration-details) -- [Error Messages Reference](#error-messages-reference) -- [Performance Considerations](#performance-considerations) -- [Security Considerations](#security-considerations) -- [Debugging Tips](#debugging-tips) +The browse CLI is a **daemon-based** command-line tool: -## Architecture Overview +- **Daemon process**: A background process manages the browser instance. Auto-starts on the first command (e.g., `browse open`), persists across commands, and stops with `browse stop`. +- **Local mode** (default): Launches a local Chrome/Chromium instance. +- **Remote mode** (Browserbase): Connects to a Browserbase cloud browser session when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set. +- **Accessibility-first**: Use `browse snapshot` to get the page's accessibility tree with element refs, then interact using those refs. -The browser automation system consists of: +## Command Reference -- **Stagehand**: TypeScript library wrapping Playwright for AI-driven browser control. Uses AI model to find and interact with the right elements, so be specific -- **Chrome CDP**: Chrome DevTools Protocol connection on port 9222 -- **CLI Tool**: Command-line interface in `src/cli.ts` for browser automation -- **Local Chrome**: Chrome browser launched with remote debugging enabled +### Navigation -### File Locations +#### `open ` -- **Chrome Profile**: `.chrome-profile/` - Persistent browser profile directory -- **Screenshots**: `./agent/browser_screenshots/` - Screenshot output directory -- **Downloads**: `./agent/downloads/` - File download directory +Navigate to a URL. Alias: `goto`. Auto-starts the daemon if not running. 
-## CLI Command Reference - -### navigate - -Navigate to a URL in the browser. - -**Usage**: ```bash -browse navigate +browse open https://example.com ``` -**Parameters**: -- `url` (string, required): The URL to navigate to. Must include protocol (http:// or https://) - -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Successfully navigated to ", - "screenshot": "/path/to/screenshot.png" -} -``` +#### `reload` -**Implementation Details**: -- Uses Playwright's `page.goto()` under the hood -- Waits for network idle and DOM content loaded -- Automatically takes a screenshot after navigation -- Supports HTTPS upgrade for HTTP URLs +Reload the current page. -**Example**: ```bash -browse navigate https://example.com +browse reload ``` -**Error Handling**: -- Invalid URLs return error with `success: false` -- Network timeouts return timeout error -- SSL certificate errors may fail navigation +#### `back` / `forward` ---- +Navigate browser history. -### act - -Perform an action on the page using natural language. - -**Usage**: ```bash -browse act "" +browse back +browse forward ``` -**Parameters**: -- `action` (string, required): Natural language description of the action to perform - -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Successfully performed action: ", - "screenshot": "/path/to/screenshot.png" -} -``` +--- -Note: Without specificity it might succeed on the wrong element! +### Page State -**Implementation Details**: -- Uses Stagehand's `page.act()` which leverages Claude Haiku 4.5 -- AI model interprets natural language and executes corresponding browse actions -- Supports: clicking, typing, selecting, scrolling, waiting, hovering, and more -- Automatically handles element location and interaction -- Automatically takes a screenshot after the action +#### `snapshot` + +Get the accessibility tree with interactive element refs. This is the primary way to understand page structure. 
-**Natural Language Examples**: ```bash -browse act "Click the login button" -browse act "Fill in email field with test@example.com" -browse act "Scroll to the bottom of the page" -browse act "Select 'California' from the state dropdown" -browse act "Hover over the menu icon" -browse act "Wait for 3 seconds" -browse act "Press the Enter key" -browse act "Double-click the file icon" +browse snapshot ``` -**Best Practices**: -- Be **specific** about which element to interact with -- Include visual descriptors ("button next to the form", "top menu", "form at bottom") -- For ambiguous elements, mention nearby context -- Break complex actions into multiple simple actions - -**Error Handling**: -- Element not found errors indicate selector couldn't be resolved -- Timeout errors occur when action takes too long -- Action not possible errors indicate element state prevents action -- All errors return JSON with `success: false` +Returns a text representation of the page with refs like `@0-5` that can be passed to `click`. ---- +#### `screenshot [path]` -### extract +Take a visual screenshot. Slower than snapshot and uses vision tokens. -Extract structured data from the current page using a schema. 
- -**Usage**: ```bash -browse extract "" '{"field": "type"}' +browse screenshot # auto-generated path +browse screenshot ./capture.png # custom path ``` -**Parameters**: -- `instruction` (string, required): Natural language description of what to extract -- `schema` (JSON string, required): Schema definition mapping field names to types - -**Schema Types**: -- `"string"`: Text content -- `"number"`: Numeric values (integers or floats) -- `"boolean"`: True/false values - -**Returns**: -JSON output: -```json -{ - "success": true, - "data": { - "field1": "value", - "field2": 123 - } -} -``` +#### `get [selector]` -**Implementation Details**: -- Uses Stagehand's `page.extract()` with Zod schema validation -- AI model (Claude Haiku 4.5) identifies relevant page elements -- Automatically handles pagination and dynamic content -- Validates extracted data against schema +Get page properties. Available properties: `url`, `title`, `text`, `html`, `value`, `box`. -**Schema Example**: ```bash -browse extract "Extract the product information" '{"productName": "string", "price": "number", "inStock": "boolean", "description": "string", "rating": "number"}' +browse get url # current URL +browse get title # page title +browse get text "body" # all visible text (selector required) +browse get text ".product-info" # text within a CSS selector +browse get html "#main" # HTML of an element +browse get value "#email-input" # value of a form field +browse get box "#header" # bounding box of an element ``` -**Complex Extraction Example**: -```bash -browse extract "Extract all items from the shopping cart" '{"itemName": "string", "quantity": "number", "unitPrice": "number", "totalPrice": "number", "imageUrl": "string"}' -``` - -**Best Practices**: -- Use clear, descriptive field names -- Match schema types to expected data types -- Provide specific extraction instructions -- Handle missing data by checking result properties - -**Error Handling**: -- Schema validation errors indicate type 
mismatch -- Extraction failures occur when data not found on page -- Timeout errors for pages that take too long to analyze -- All errors return JSON with `success: false` +**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse-cli versions (v0.1.4); use `get text` or `snapshot` as alternatives. --- -### observe +### Interaction -Discover available actions on the page. +#### `click ` + +Click an element by its ref from `browse snapshot` output. -**Usage**: ```bash -browse observe "" +browse click @0-5 # click element with ref 0-5 ``` -**Parameters**: -- `query` (string, required): Natural language query to discover elements - -**Returns**: -JSON output: -```json -{ - "success": true, - "data": [ - { - "selector": "button.submit-btn", - "text": "Submit Form", - "type": "button", - "visible": true, - "enabled": true - } - ] -} -``` +#### `click_xy ` -**Implementation Details**: -- Uses Stagehand's `page.observe()` to scan page elements -- Returns actionable elements matching the query -- Provides element properties, states, and available actions +Click at exact viewport coordinates. -**Query Examples**: ```bash -browse observe "Find all buttons" -browse observe "Find clickable links in the navigation" -browse observe "Find form input fields" -browse observe "Find all submit buttons" -browse observe "Find elements with text 'Login'" -browse observe "Find all images" +browse click_xy 500 300 ``` -**Use Cases**: -- Page exploration and discovery -- Debugging action failures -- Understanding page structure -- Finding dynamic element selectors - -**Error Handling**: -- Empty array returned when no elements match -- Timeout for pages that take too long to scan -- All errors return JSON with `success: false` - ---- +#### `type ` -### screenshot +Type text into the currently focused element. -Take a screenshot of the current page. - -**Usage**: ```bash -browse screenshot +browse type "Hello, world!" 
``` -**Parameters**: None +#### `fill <selector> <value>` + +Fill an input element matching a CSS selector and press Enter. -**Returns**: -JSON output: -```json -{ - "success": true, - "screenshot": "/path/to/screenshot.png" -} +```bash +browse fill "#search" "OpenClaw documentation" +browse fill "input[name=email]" "user@example.com" ``` -**Implementation Details**: -- Captures full viewport at current scroll position -- Saves as PNG format with timestamp in filename -- Automatically resizes images larger than 2000x2000 pixels using Sharp -- Uses lossless PNG compression +#### `select <selector> <values...>` -**Screenshot Path Format**: -``` -./agent/browser_screenshots/screenshot-YYYY-MM-DDTHH-MM-SS-mmmZ.png -``` +Select option(s) from a dropdown. -**Example**: ```bash -browse screenshot +browse select "#country" "United States" +browse select "#tags" "javascript" "typescript" # multi-select ``` -**Image Processing**: -- Original resolution preserved if ≤ 2000x2000 -- Larger images resized to fit within 2000x2000 while maintaining aspect ratio -- Uses Sharp library for high-quality image processing - -**Best Practices**: -- Take screenshots before and after important actions -- Use for visual debugging and verification -- Screenshot after navigation to confirm page loaded -- Capture error states for troubleshooting +#### `press <key>` -**Error Handling**: -- Directory creation errors if screenshots folder can't be created -- CDP errors if Chrome DevTools Protocol connection fails -- File write errors if disk space insufficient -- All errors return JSON with `success: false` +Press a keyboard key or key combination. ---- +```bash +browse press Enter +browse press Tab +browse press Escape +browse press Cmd+A # select all (Mac) +browse press Ctrl+C # copy (Linux/Windows) +``` -### close +#### `scroll <x> <y> <deltaX> <deltaY>` -Close the browser and cleanup resources. +Scroll at a given position by a given amount. 
-**Usage**: ```bash -browse close +browse scroll 500 300 0 -300 # scroll up at (500, 300) +browse scroll 500 300 0 500 # scroll down ``` -**Parameters**: None +#### `wait <type> [arg]` -**Returns**: -JSON output: -```json -{ - "success": true, - "message": "Browser closed" -} -``` +Wait for a condition. -**Implementation Details**: -- Calls `stagehand.close()` to clean up Playwright resources -- Kills Chrome process if it was started by the CLI tool -- Clears internal state variables -- Does NOT delete `.chrome-profile/` directory (preserved for reuse) - -**Resource Cleanup**: -- Closes all browser tabs and windows -- Terminates Chrome process (only if started by this tool) -- Releases CDP connection -- Clears Stagehand instance - -**Best Practices**: -- Always call at the end of browser automation tasks -- Call even if errors occurred during automation -- Don't call mid-workflow unless explicitly needed - -**Error Handling**: -- Continues cleanup even if some steps fail -- Safe to call multiple times -- Gracefully handles already-closed browser -- All errors return JSON with `success: false` +```bash +browse wait load # wait for page load +browse wait selector ".results" # wait for element to appear +browse wait timeout 3000 # wait 3 seconds +``` --- -## Configuration Details +### Session Management -### Stagehand Initialization +#### `start` -The Stagehand instance is configured in `src/cli.ts` with: +Start the browser daemon manually. Usually not needed — the daemon auto-starts on first command. 
-```typescript -new Stagehand({ - env: "LOCAL", - verbose: 0, - enableCaching: true, - model: "anthropic/claude-haiku-4-5-20251001", - localBrowserLaunchOptions: { - cdpUrl: wsUrl, - }, -}) +```bash +browse start ``` -**Configuration Options**: -- `env: "LOCAL"`: Uses local Chrome instead of remote browser -- `verbose: 0`: Minimal logging output -- `enableCaching: true`: Caches page analysis for better performance -- `modelName`: Claude Haiku 4.5 for AI-driven actions and extraction -- `cdpUrl`: Chrome DevTools Protocol endpoint +#### `stop` -### Chrome Launch Arguments - -Chrome is launched by `src/cli.ts` with: +Stop the browser daemon and close the browser. ```bash ---remote-debugging-port=9222 ---user-data-dir=.chrome-profile ---window-position=-9999,-9999 ---window-size=1280,720 +browse stop ``` -**Arguments**: -- `--remote-debugging-port`: Enables CDP on port 9222 -- `--user-data-dir`: Persistent profile directory for session/cookie persistence -- `--window-position`: Launches minimized off-screen -- `--window-size`: Default window size - -### Download Configuration +#### `status` -Downloads are configured via CDP: +Check whether the daemon is running and its connection details. -```typescript -await client.send("Browser.setDownloadBehavior", { - behavior: "allow", - downloadPath: "./agent/downloads", - eventsEnabled: true, -}) +```bash +browse status ``` -**Behavior**: -- Downloads start automatically (no dialog) -- Files saved to `./agent/downloads/` -- Download events can be monitored via CDP - ---- +#### `pages` -## Error Messages Reference +List all open tabs. 
-### Common Errors +```bash +browse pages +``` -**"Could not find local Chrome installation"** -- Cause: Chrome/Chromium not installed or not in standard locations -- Solution: Install Chrome from https://www.google.com/chrome/ +#### `tab_switch ` -**"Chrome failed to start with remote debugging on port 9222"** -- Cause: Port 9222 already in use or Chrome can't bind to port -- Solution: Close other Chrome instances or change CDP port +Switch to a tab by its index (from `browse pages`). -**"Browser failed to become ready within timeout"** -- Cause: Chrome launched but page context not ready -- Solution: Check Chrome version compatibility, restart system +```bash +browse tab_switch 1 +``` -**"Error performing action: element not found"** -- Cause: Natural language description didn't match any page element -- Solution: Use more specific description or use observe to find elements +#### `tab_close [index]` -**"Error extracting data: schema validation failed"** -- Cause: Extracted data type doesn't match schema -- Solution: Verify schema types match actual page data +Close a tab. Closes current tab if no index given. -**"Error taking screenshot: directory not writable"** -- Cause: Insufficient permissions for screenshots directory -- Solution: Check file permissions on `./agent/browser_screenshots/` +```bash +browse tab_close # close current tab +browse tab_close 2 # close tab at index 2 +``` --- -## Performance Considerations - -### Caching - -Stagehand caches page analysis to improve performance on repeated actions. 
Cache is maintained for: -- Element selectors -- Page structure analysis -- Vision model results - -### Timeouts - -Default timeouts: -- Navigation: 30 seconds -- Action execution: 30 seconds -- Extraction: 60 seconds -- CDP connection: 15 seconds (50 retries × 300ms) - -### Resource Usage - -Browser automation consumes: -- Memory: ~200-500MB for Chrome process -- CPU: Variable based on page complexity -- Disk: ~50-200MB for Chrome profile -- Network: Depends on pages visited - ---- +## Configuration -## Security Considerations +### Environment Variables -### Credential Handling +| Variable | Required | Description | +|----------|----------|-------------| +| `BROWSERBASE_API_KEY` | For remote mode | API key from https://browserbase.com/settings | +| `BROWSERBASE_PROJECT_ID` | For remote mode | Project ID from Browserbase dashboard | -- Browser uses persistent profile (`.chrome-profile/`) -- Saved passwords and cookies persist between sessions -- Consider using isolated profiles for sensitive operations +When both are set, the CLI uses Browserbase remote sessions. Otherwise, it falls back to local Chrome. -### Download Safety +The Browserbase OpenClaw plugin automatically bridges credentials from `~/.openclaw/openclaw.json` into these environment variables on startup. -- Downloads automatically saved to `./agent/downloads/` -- No file type restrictions enforced -- Verify downloaded file integrity before use +### Setting credentials -### Network Access +```bash +# Via OpenClaw plugin (recommended) +openclaw browserbase setup -- Browser has full network access -- Respects system proxy settings -- Can access localhost and internal networks +# Via environment variables (manual) +export BROWSERBASE_API_KEY="bb_live_..." +export BROWSERBASE_PROJECT_ID="proj_..." +``` --- -## Debugging Tips +## Error Messages -### Enable Verbose Logging +**"No active page"** +- The daemon is running but has no page open. +- Fix: Run `browse open <url>`.
If the issue persists, run `browse stop` and retry. For zombie daemons: `pkill -f "browse.*daemon"`. -Edit `src/cli.ts` and change verbose level in Stagehand configuration: - -```typescript -// Change verbose: 0 to verbose: 1 or 2 -verbose: 2, // Maximum verbosity -``` +**"Chrome not found"** / **"Could not find local Chrome installation"** +- Chrome/Chromium is not installed or not in a standard location. +- Fix: Install Chrome, or use Browserbase remote mode (no local browser needed). -### View Chrome Console +**"Daemon not running"** +- No daemon process is active. Most commands auto-start the daemon, but `snapshot`, `click`, etc. require an active session. +- Fix: Run `browse open <url>` to start a session. -Connect to Chrome DevTools manually: -1. Open Chrome -2. Navigate to `chrome://inspect` -3. Click "inspect" under Remote Target - -### Check CDP Connection - -Test CDP endpoint: -```bash -curl http://localhost:9222/json/version -``` +**Element ref not found (e.g., "@0-5")** +- The ref from a previous snapshot is no longer valid (page changed). +- Fix: Run `browse snapshot` again to get fresh refs. -### Monitor Browser Process +**Timeout errors** +- The page took too long to load or an element didn't appear. +- Fix: Try `browse wait load` before interacting, or increase wait time. -Check Chrome process: -```bash -ps aux | grep chrome -``` +--- -### View Screenshots +## Typical Workflow -Screenshots provide visual debugging: -```bash -ls -lh ./agent/browser_screenshots/ -open ./agent/browser_screenshots/screenshot-*.png ``` - -### Test CLI Commands - -Test individual commands: -```bash -browse navigate https://example.com -browse screenshot -browse close +1. browse open <url> → navigate to the page +2. browse snapshot → read accessibility tree, get element refs +3. browse click/type/fill → interact using refs from step 2 +4. browse snapshot → verify action worked +5. repeat 3-4 as needed +6.
browse stop → clean up ``` --- -## Version Information - -- **Stagehand**: Uses `@browserbasehq/stagehand` package v2.5.2+ -- **Model**: Claude Haiku 4.5 (claude-haiku-4-5-20251001) for browse actions -- **CLI Tool**: TypeScript CLI in `src/cli.ts` -- **Agent SDK**: `@anthropic-ai/claude-agent-sdk` for conversation framework -- **Browser**: Local Chrome/Chromium installation - -For updates and changelog, see the main project repository. +## Local vs Remote Mode + +| Feature | Local | Remote (Browserbase) | +|---------|-------|----------------------| +| Speed | Faster | Slightly slower | +| Setup | Local Chrome required | API key required | +| Anti-bot stealth | No | Yes | +| CAPTCHA solving | No | Yes (reCAPTCHA, hCaptcha) | +| Residential proxies | No | Yes (201 countries) | +| Session persistence | No | Yes | +| Best for | Dev, simple pages | Protected sites, production scraping | diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 94b7751..f251607 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -65,7 +65,7 @@ browse snapshot # Get accessibility tree with element r browse screenshot [path] # Take visual screenshot (slow, uses vision tokens) browse get url # Get current URL browse get title # Get page title -browse get text [selector] # Get text content +browse get text # Get text content (use "body" for all text) browse get html [selector] # Get HTML content ``` @@ -136,5 +136,60 @@ browse stop - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set +## Activating Remote Mode + +The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. The Browserbase plugin bridges stored credentials into the environment on startup, so in most cases remote mode "just works" after initial setup. 
+ +### Checking current mode + +```bash +echo $BROWSERBASE_API_KEY | head -c4 # non-empty = remote mode active +browse status # shows connection details +``` + +### When to suggest remote mode + +Suggest Browserbase remote mode when you detect ANY of these signals: +- **CAPTCHA challenges**: reCAPTCHA, hCaptcha, Cloudflare Turnstile on the page +- **Bot detection pages**: "Checking your browser...", Cloudflare interstitial, Akamai block page +- **Access denied**: HTTP 403/429 responses, "access denied" or "bot detected" page content +- **Missing content**: repeated element-not-found on pages that should have content (bot detection may serve empty/different pages) +- **User request**: user says "use Browserbase", "bypass the CAPTCHA", "use a proxy", "use stealth mode", or similar + +Do NOT suggest remote mode for simple sites (docs, wikis, public APIs, localhost). + +### Setting up remote mode + +If remote mode is not active and the situation calls for it, tell the user WHY it would help (anti-bot stealth, CAPTCHA solving, residential proxies) and guide them: + +```bash +openclaw browserbase setup # interactive — prompts for API key + project ID +``` + +Credentials are saved to `~/.openclaw/openclaw.json`. On next OpenClaw startup, the plugin loads them into the environment automatically. + +### Mid-session activation + +If the user runs `openclaw browserbase setup` during an active session, the credentials are set in the environment immediately — no restart needed. Just retry the failed command: + +```bash +browse stop # stop any local session +browse open # now uses remote mode +``` + +If the env vars are not yet visible (e.g., setup was run outside OpenClaw), use the eval bridge for the first command: + +```bash +eval "$(openclaw browserbase env --format shell)" && browse open +``` + +The `eval` prefix is only needed once per session. Subsequent `browse` commands in the same shell inherit the exported variables. 
+ +### What NOT to do + +- Don't retry the same URL in remote mode without asking the user first +- Don't fall back to local silently if remote fails — tell the user what happened +- Don't suggest remote mode preemptively for simple, unprotected sites + For detailed examples, see [EXAMPLES.md](EXAMPLES.md). For API reference, see [REFERENCE.md](REFERENCE.md). From 6d020cf9023023b676d1c2090bfd9bd76102dbc6 Mon Sep 17 00:00:00 2001 From: shrey150 Date: Tue, 24 Feb 2026 08:40:50 -0800 Subject: [PATCH 13/24] Document browse mode command for on-the-fly local/remote switching Add browse mode to SKILL.md commands list and REFERENCE.md. Rewrite "Activating Remote Mode" as "Switching Between Local and Remote Mode" using browse mode as the primary mechanism. Co-Authored-By: Claude Opus 4.6 --- skills/browser/REFERENCE.md | 12 +++++++++- skills/browser/SKILL.md | 47 ++++++++++++++----------------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index d4ff486..6e88ff5 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -178,12 +178,22 @@ browse stop #### `status` -Check whether the daemon is running and its connection details. +Check whether the daemon is running, its connection details, and current mode. ```bash browse status ``` +#### `mode [local|remote]` + +Show or switch the daemon's execution mode. Without arguments, prints the current mode. With an argument, stops the running daemon and restarts in the specified mode. The switch is sticky — subsequent commands stay in the chosen mode until you switch again or run `browse stop`. + +```bash +browse mode # print current mode +browse mode local # switch to local Chrome +browse mode remote # switch to Browserbase (requires API keys) +``` + #### `pages` List all open tabs. 
diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index f251607..f1cdde6 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -85,7 +85,10 @@ browse wait [arg] # Wait for: load, selector, timeout ### Session management ```bash browse stop # Stop the browser daemon -browse status # Check daemon status +browse status # Check daemon status (includes mode) +browse mode # Show current mode (local or remote) +browse mode local # Switch to local Chrome +browse mode remote # Switch to Browserbase (requires API keys) browse pages # List all open tabs browse tab_switch # Switch to tab by index browse tab_close [index] # Close tab @@ -136,54 +139,38 @@ browse stop - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set -## Activating Remote Mode +## Switching Between Local and Remote Mode -The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. The Browserbase plugin bridges stored credentials into the environment on startup, so in most cases remote mode "just works" after initial setup. - -### Checking current mode +The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. Use `browse mode` to check or switch modes on the fly: ```bash -echo $BROWSERBASE_API_KEY | head -c4 # non-empty = remote mode active -browse status # shows connection details +browse mode # check current mode +browse mode remote # switch to Browserbase +browse mode local # switch to local Chrome ``` -### When to suggest remote mode +The switch is sticky — subsequent commands stay in the chosen mode until you switch again or run `browse stop` (which resets to env var detection). 
+ +### When to switch to remote mode -Suggest Browserbase remote mode when you detect ANY of these signals: +Switch to remote mode when you detect ANY of these signals: - **CAPTCHA challenges**: reCAPTCHA, hCaptcha, Cloudflare Turnstile on the page - **Bot detection pages**: "Checking your browser...", Cloudflare interstitial, Akamai block page - **Access denied**: HTTP 403/429 responses, "access denied" or "bot detected" page content - **Missing content**: repeated element-not-found on pages that should have content (bot detection may serve empty/different pages) - **User request**: user says "use Browserbase", "bypass the CAPTCHA", "use a proxy", "use stealth mode", or similar -Do NOT suggest remote mode for simple sites (docs, wikis, public APIs, localhost). +Do NOT switch to remote for simple sites (docs, wikis, public APIs, localhost). -### Setting up remote mode +### Setting up credentials -If remote mode is not active and the situation calls for it, tell the user WHY it would help (anti-bot stealth, CAPTCHA solving, residential proxies) and guide them: +If `browse mode remote` fails because API keys aren't set, guide the user: ```bash openclaw browserbase setup # interactive — prompts for API key + project ID ``` -Credentials are saved to `~/.openclaw/openclaw.json`. On next OpenClaw startup, the plugin loads them into the environment automatically. - -### Mid-session activation - -If the user runs `openclaw browserbase setup` during an active session, the credentials are set in the environment immediately — no restart needed. Just retry the failed command: - -```bash -browse stop # stop any local session -browse open # now uses remote mode -``` - -If the env vars are not yet visible (e.g., setup was run outside OpenClaw), use the eval bridge for the first command: - -```bash -eval "$(openclaw browserbase env --format shell)" && browse open -``` - -The `eval` prefix is only needed once per session. 
Subsequent `browse` commands in the same shell inherit the exported variables. +Credentials are saved to `~/.openclaw/openclaw.json`. On next startup, the plugin loads them into the environment automatically. ### What NOT to do From 5b743336a8352a3838f9cdc23b410154c1b12727 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 24 Feb 2026 19:21:00 +0000 Subject: [PATCH 14/24] browser(skill): remove auto-install for non-existent @browserbasehq/browse-cli; gate on 'browse' bin only\n\n- Drop OpenClaw install block to avoid failed installs\n- Make compatibility + setup check generic (no npm package name)\n- Generalize REFERENCE note (avoid browse-cli v0.1.4 mention) Co-authored-by: Kyle Jeong --- skills/browser/REFERENCE.md | 2 +- skills/browser/SKILL.md | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 6e88ff5..18f0e7d 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -77,7 +77,7 @@ browse get value "#email-input" # value of a form field browse get box "#header" # bounding box of an element ``` -**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse-cli versions (v0.1.4); use `get text` or `snapshot` as alternatives. +**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse CLI versions; use `get text` or `snapshot` as alternatives. --- diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index f1cdde6..cd14d3c 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -1,7 +1,7 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. 
Supports remote Browserbase sessions with automatic CAPTCHA solving, anti-bot stealth mode, and residential proxies — ideal for scraping protected websites, bypassing bot detection, and interacting with JavaScript-heavy pages. -compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." +compatibility: "Requires the 'browse' CLI on PATH. Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." license: MIT allowed-tools: Bash metadata: @@ -9,10 +9,6 @@ metadata: requires: bins: - browse - install: - - kind: node - package: "@browserbasehq/browse-cli" - bins: [browse] homepage: https://github.com/browserbase/skills --- @@ -25,7 +21,7 @@ Automate browser interactions using the browse CLI with Claude. Before running any browser commands, verify the CLI is available: ```bash -which browse || npm install -g @browserbasehq/browse-cli +which browse || echo "Missing 'browse' CLI. See homepage for install instructions." ``` ## Environment Selection (Local vs Remote) From a2fdaded0fa1a4f9eecce649a12ffd2e01354c85 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Wed, 25 Feb 2026 02:32:34 +0000 Subject: [PATCH 15/24] Revert "browser(skill): remove auto-install for non-existent @browserbasehq/browse-cli; gate on 'browse' bin only\n\n- Drop OpenClaw install block to avoid failed installs\n- Make compatibility + setup check generic (no npm package name)\n- Generalize REFERENCE note (avoid browse-cli v0.1.4 mention)" This reverts commit 5b743336a8352a3838f9cdc23b410154c1b12727. 
--- skills/browser/REFERENCE.md | 2 +- skills/browser/SKILL.md | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 18f0e7d..6e88ff5 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -77,7 +77,7 @@ browse get value "#email-input" # value of a form field browse get box "#header" # bounding box of an element ``` -**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse CLI versions; use `get text` or `snapshot` as alternatives. +**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse-cli versions (v0.1.4); use `get text` or `snapshot` as alternatives. --- diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index cd14d3c..f1cdde6 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -1,7 +1,7 @@ --- name: browser description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. Supports remote Browserbase sessions with automatic CAPTCHA solving, anti-bot stealth mode, and residential proxies — ideal for scraping protected websites, bypassing bot detection, and interacting with JavaScript-heavy pages. -compatibility: "Requires the 'browse' CLI on PATH. Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." +compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-cli`). Optional: set BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID for remote Browserbase sessions; falls back to local Chrome otherwise." 
license: MIT allowed-tools: Bash metadata: @@ -9,6 +9,10 @@ metadata: requires: bins: - browse + install: + - kind: node + package: "@browserbasehq/browse-cli" + bins: [browse] homepage: https://github.com/browserbase/skills --- @@ -21,7 +25,7 @@ Automate browser interactions using the browse CLI with Claude. Before running any browser commands, verify the CLI is available: ```bash -which browse || echo "Missing 'browse' CLI. See homepage for install instructions." +which browse || npm install -g @browserbasehq/browse-cli ``` ## Environment Selection (Local vs Remote) From 07ddb74e4f7b999e16e8d5b71fa62d1fd530fc84 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 17:05:55 -0800 Subject: [PATCH 16/24] docs(browser): remove broken commands, document undocumented features Remove: get html, drag, highlight, is, execute references. Add docs for: hover, newpage, eval, viewport, network capture, snapshot --compact, screenshot --full-page, type --delay/--mistakes, fill --no-press-enter, stop --force. Replace get html with get value in SKILL.md. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 95 +++++++++++++++++++++++++++++++++++-- skills/browser/SKILL.md | 2 +- 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 6e88ff5..1d5d407 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -50,9 +50,10 @@ Get the accessibility tree with interactive element refs. This is the primary wa ```bash browse snapshot +browse snapshot --compact # tree only, no ref maps ``` -Returns a text representation of the page with refs like `@0-5` that can be passed to `click`. +Returns a text representation of the page with refs like `@0-5` that can be passed to `click`. Use `--compact` for shorter output when you only need the tree. #### `screenshot [path]` @@ -61,23 +62,23 @@ Take a visual screenshot. Slower than snapshot and uses vision tokens. 
```bash browse screenshot # auto-generated path browse screenshot ./capture.png # custom path +browse screenshot --full-page # capture entire scrollable page ``` #### `get [selector]` -Get page properties. Available properties: `url`, `title`, `text`, `html`, `value`, `box`. +Get page properties. Available properties: `url`, `title`, `text`, `value`, `box`. ```bash browse get url # current URL browse get title # page title browse get text "body" # all visible text (selector required) browse get text ".product-info" # text within a CSS selector -browse get html "#main" # HTML of an element browse get value "#email-input" # value of a form field -browse get box "#header" # bounding box of an element +browse get box "#header" # bounding box (centroid coordinates) ``` -**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. `get html` may error on some browse-cli versions (v0.1.4); use `get text` or `snapshot` as alternatives. +**Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. --- @@ -99,12 +100,22 @@ Click at exact viewport coordinates. browse click_xy 500 300 ``` +#### `hover ` + +Hover at viewport coordinates. + +```bash +browse hover 500 300 +``` + #### `type ` Type text into the currently focused element. ```bash browse type "Hello, world!" +browse type "slow typing" --delay 100 # 100ms between keystrokes +browse type "human-like" --mistakes # simulate human typing with typos ``` #### `fill ` @@ -114,6 +125,7 @@ Fill an input element matching a CSS selector and press Enter. ```bash browse fill "#search" "OpenClaw documentation" browse fill "input[name=email]" "user@example.com" +browse fill "#search" "query" --no-press-enter # fill without pressing Enter ``` #### `select ` @@ -174,6 +186,7 @@ Stop the browser daemon and close the browser. 
```bash browse stop +browse stop --force # force kill if daemon is unresponsive ``` #### `status` @@ -194,6 +207,15 @@ browse mode local # switch to local Chrome browse mode remote # switch to Browserbase (requires API keys) ``` +#### `newpage [url]` + +Create a new tab, optionally navigating to a URL. + +```bash +browse newpage # open blank tab +browse newpage https://example.com # open tab with URL +``` + #### `pages` List all open tabs. @@ -221,6 +243,69 @@ browse tab_close 2 # close tab at index 2 --- +### JavaScript Evaluation + +#### `eval ` + +Evaluate JavaScript in the page context. + +```bash +browse eval "document.title" +browse eval "document.querySelectorAll('a').length" +``` + +--- + +### Viewport + +#### `viewport ` + +Set the browser viewport size. + +```bash +browse viewport 1920 1080 +``` + +--- + +### Network Capture + +Capture network requests to the filesystem for inspection. + +#### `network on` + +Enable network request capture. Creates a temp directory where requests and responses are saved as JSON files. + +```bash +browse network on +``` + +#### `network off` + +Disable network capture. + +```bash +browse network off +``` + +#### `network path` + +Show the capture directory path. + +```bash +browse network path +``` + +#### `network clear` + +Clear all captured requests. 
+ +```bash +browse network clear +``` + +--- + ## Configuration ### Environment Variables diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index f1cdde6..98f8852 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -66,7 +66,7 @@ browse screenshot [path] # Take visual screenshot (slow, uses vi browse get url # Get current URL browse get title # Get page title browse get text # Get text content (use "body" for all text) -browse get html [selector] # Get HTML content +browse get value # Get form field value ``` Use `browse snapshot` as your default for understanding page state — it returns the accessibility tree with element refs you can use to interact. Only use `browse screenshot` when you need visual context (layout, images, debugging). From 06fcff2dadf6ac2523be002c7ea6e6a3ce8f5967 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 19:48:37 -0800 Subject: [PATCH 17/24] rename `browse mode` to `browse env` in docs Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 14 +++++++------- skills/browser/SKILL.md | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 1d5d407..578ffd5 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -191,20 +191,20 @@ browse stop --force # force kill if daemon is unresponsive #### `status` -Check whether the daemon is running, its connection details, and current mode. +Check whether the daemon is running, its connection details, and current environment. ```bash browse status ``` -#### `mode [local|remote]` +#### `env [local|remote]` -Show or switch the daemon's execution mode. Without arguments, prints the current mode. With an argument, stops the running daemon and restarts in the specified mode. The switch is sticky — subsequent commands stay in the chosen mode until you switch again or run `browse stop`. 
+Show or switch the browser environment. Without arguments, prints the current environment. With an argument, stops the running daemon and restarts in the specified environment. The switch is sticky — subsequent commands stay in the chosen environment until you switch again or run `browse stop`. ```bash -browse mode # print current mode -browse mode local # switch to local Chrome -browse mode remote # switch to Browserbase (requires API keys) +browse env # print current environment +browse env local # switch to local Chrome +browse env remote # switch to Browserbase (requires API keys) ``` #### `newpage [url]` @@ -340,7 +340,7 @@ export BROWSERBASE_PROJECT_ID="proj_..." **"Chrome not found"** / **"Could not find local Chrome installation"** - Chrome/Chromium is not installed or not in a standard location. -- Fix: Install Chrome, or use Browserbase remote mode (no local browser needed). +- Fix: Install Chrome, or switch to remote with `browse env remote` (no local browser needed). **"Daemon not running"** - No daemon process is active. Most commands auto-start the daemon, but `snapshot`, `click`, etc. require an active session. 
diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 98f8852..37e2705 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -85,10 +85,10 @@ browse wait [arg] # Wait for: load, selector, timeout ### Session management ```bash browse stop # Stop the browser daemon -browse status # Check daemon status (includes mode) -browse mode # Show current mode (local or remote) -browse mode local # Switch to local Chrome -browse mode remote # Switch to Browserbase (requires API keys) +browse status # Check daemon status (includes env) +browse env # Show current environment (local or remote) +browse env local # Switch to local Chrome +browse env remote # Switch to Browserbase (requires API keys) browse pages # List all open tabs browse tab_switch # Switch to tab by index browse tab_close [index] # Close tab @@ -135,23 +135,23 @@ browse stop ## Troubleshooting - **"No active page"**: Run `browse stop`, then check `browse status`. If it still says running, kill the zombie daemon with `pkill -f "browse.*daemon"`, then retry `browse open` -- **Chrome not found**: Install Chrome or use Browserbase mode +- **Chrome not found**: Install Chrome or use `browse env remote` - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set -## Switching Between Local and Remote Mode +## Switching Between Local and Remote Environment -The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. Use `browse mode` to check or switch modes on the fly: +The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. 
Use `browse env` to check or switch environments on the fly: ```bash -browse mode # check current mode -browse mode remote # switch to Browserbase -browse mode local # switch to local Chrome +browse env # check current environment +browse env remote # switch to Browserbase +browse env local # switch to local Chrome ``` -The switch is sticky — subsequent commands stay in the chosen mode until you switch again or run `browse stop` (which resets to env var detection). +The switch is sticky — subsequent commands stay in the chosen environment until you switch again or run `browse stop` (which resets to env var detection). -### When to switch to remote mode +### When to switch to remote Switch to remote mode when you detect ANY of these signals: - **CAPTCHA challenges**: reCAPTCHA, hCaptcha, Cloudflare Turnstile on the page @@ -164,7 +164,7 @@ Do NOT switch to remote for simple sites (docs, wikis, public APIs, localhost). ### Setting up credentials -If `browse mode remote` fails because API keys aren't set, guide the user: +If `browse env remote` fails because API keys aren't set, guide the user: ```bash openclaw browserbase setup # interactive — prompts for API key + project ID From d1aa1f1456b8fdf4b6bb34f73cc58eef3e80e150 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 20:14:56 -0800 Subject: [PATCH 18/24] docs(browser): restore drag, highlight, is, and get html/visible/checked commands These commands are functional again after PR browserbase/stagent-cli#11 fixed the daemon startup (EPIPE crash) and restored selector command surfaces. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 35 ++++++++++++++++++++++++++++++++++- skills/browser/SKILL.md | 5 +++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 578ffd5..f6cc53b 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -67,15 +67,18 @@ browse screenshot --full-page # capture entire scrollable page #### `get [selector]` -Get page properties. Available properties: `url`, `title`, `text`, `value`, `box`. +Get page properties. Available properties: `url`, `title`, `text`, `html`, `value`, `box`, `visible`, `checked`. ```bash browse get url # current URL browse get title # page title browse get text "body" # all visible text (selector required) browse get text ".product-info" # text within a CSS selector +browse get html "#main" # inner HTML of an element browse get value "#email-input" # value of a form field browse get box "#header" # bounding box (centroid coordinates) +browse get visible ".modal" # check if element is visible +browse get checked "#agree" # check if checkbox/radio is checked ``` **Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. @@ -158,6 +161,36 @@ browse scroll 500 300 0 -300 # scroll up at (500, 300) browse scroll 500 300 0 500 # scroll down ``` +#### `drag ` + +Drag from one viewport coordinate to another. + +```bash +browse drag 80 80 310 100 # drag with default 10 steps +browse drag 80 80 310 100 --steps 20 # more intermediate steps +browse drag 80 80 310 100 --delay 50 # 50ms between steps +browse drag 80 80 310 100 --button right # use right mouse button +browse drag 80 80 310 100 --xpath # return source/target XPaths +``` + +#### `highlight ` + +Highlight an element on the page for visual debugging. 
+ +```bash +browse highlight "#submit-btn" # highlight for 2 seconds (default) +browse highlight ".nav" -d 5000 # highlight for 5 seconds +``` + +#### `is ` + +Check element state. Available checks: `visible`, `checked`. + +```bash +browse is visible ".modal" # returns { visible: true/false } +browse is checked "#agree" # returns { checked: true/false } +``` + #### `wait [arg]` Wait for a condition. diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 37e2705..2922eb7 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -66,6 +66,7 @@ browse screenshot [path] # Take visual screenshot (slow, uses vi browse get url # Get current URL browse get title # Get page title browse get text # Get text content (use "body" for all text) +browse get html # Get HTML content of element browse get value # Get form field value ``` @@ -78,7 +79,11 @@ browse type # Type text into focused element browse fill # Fill input and press Enter browse select # Select dropdown option(s) browse press # Press key (Enter, Tab, Escape, Cmd+A, etc.) 
+browse drag # Drag from one point to another browse scroll # Scroll at coordinates +browse highlight # Highlight element on page +browse is visible # Check if element is visible +browse is checked # Check if element is checked browse wait [arg] # Wait for: load, selector, timeout ``` From 068b5279781b917e23f235902f1abe2851e3ce68 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 23:16:09 -0800 Subject: [PATCH 19/24] docs(browser): add refs, open --wait, --json, and --session to REFERENCE.md These CLI features were verified working in browse-cli v0.1.5 but were missing from the reference documentation: - `refs` command for cached ref map lookup - `open --wait` flag (networkidle/domcontentloaded) for SPAs - `--json` global flag for structured output - `--session` global flag for concurrent browser sessions Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index f6cc53b..a0a7ce4 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -21,8 +21,12 @@ Navigate to a URL. Alias: `goto`. Auto-starts the daemon if not running. ```bash browse open https://example.com +browse open https://example.com --wait networkidle # wait for all network requests to finish (useful for SPAs) +browse open https://example.com --wait domcontentloaded ``` +The `--wait` flag controls when navigation is considered complete. Values: `load` (default), `domcontentloaded`, `networkidle`. Use `networkidle` for JavaScript-heavy pages that fetch data after initial load. + #### `reload` Reload the current page. @@ -83,6 +87,14 @@ browse get checked "#agree" # check if checkbox/radio is checked **Note**: `get text` requires a CSS selector argument — use `"body"` for full page text. +#### `refs` + +Show the cached ref map from the last `browse snapshot`. 
Useful for looking up element refs without re-running a full snapshot. + +```bash +browse refs +``` + --- ### Interaction @@ -341,6 +353,19 @@ browse network clear ## Configuration +### Global Flags + +| Flag | Description | +|------|-------------| +| `--json` | Output as JSON for all commands (structured, parseable output) | +| `--session ` | Run commands against a named session (enables multiple concurrent browsers) | + +```bash +browse --json get url # returns {"url": "https://..."} +browse --session work open https://a.com # open in "work" session +browse --session personal open https://b.com # open in separate "personal" session +``` + ### Environment Variables | Variable | Required | Description | From b4ea2f98914a8f88e68a75069d87e940a080c442 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 23:23:02 -0800 Subject: [PATCH 20/24] docs(browser): use consistent heading style for global flags Match the #### heading + description + code block pattern used throughout the rest of REFERENCE.md. Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index a0a7ce4..c3594cc 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -355,15 +355,22 @@ browse network clear ### Global Flags -| Flag | Description | -|------|-------------| -| `--json` | Output as JSON for all commands (structured, parseable output) | -| `--session ` | Run commands against a named session (enables multiple concurrent browsers) | +#### `--json` + +Output as JSON for all commands. Useful for structured, parseable output. 
```bash browse --json get url # returns {"url": "https://..."} -browse --session work open https://a.com # open in "work" session -browse --session personal open https://b.com # open in separate "personal" session +browse --json snapshot # returns JSON accessibility tree +``` + +#### `--session ` + +Run commands against a named session, enabling multiple concurrent browsers. + +```bash +browse --session work open https://a.com +browse --session personal open https://b.com ``` ### Environment Variables From acc6cc3d8608c1335673146e053df34977090917 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Wed, 25 Feb 2026 23:30:56 -0800 Subject: [PATCH 21/24] docs(browser): add TOC, remove duplicated sections, tighten env switching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Table of Contents to REFERENCE.md (matches functions/REFERENCE.md and top skills like terraform-skill, mcp-builder) - Remove Typical Workflow and Local vs Remote Mode sections from REFERENCE.md — these were near-identical copies of SKILL.md content - Condense "Switching Between Local and Remote Environment" in SKILL.md from 38 lines to 16 — keeps the signal detection list and credential setup, drops redundant env commands already shown in Commands section Co-Authored-By: Claude Opus 4.6 (1M context) --- skills/browser/REFERENCE.md | 43 ++++++++++++++----------------------- skills/browser/SKILL.md | 36 ++++++------------------------- 2 files changed, 23 insertions(+), 56 deletions(-) diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index c3594cc..1fbc989 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -2,6 +2,22 @@ Technical reference for the `browse` CLI tool. 
+## Table of Contents + +- [Architecture](#architecture) +- [Command Reference](#command-reference) + - [Navigation](#navigation) + - [Page State](#page-state) + - [Interaction](#interaction) + - [Session Management](#session-management) + - [JavaScript Evaluation](#javascript-evaluation) + - [Viewport](#viewport) + - [Network Capture](#network-capture) +- [Configuration](#configuration) + - [Global Flags](#global-flags) + - [Environment Variables](#environment-variables) +- [Error Messages](#error-messages) + ## Architecture The browse CLI is a **daemon-based** command-line tool: @@ -418,30 +434,3 @@ export BROWSERBASE_PROJECT_ID="proj_..." **Timeout errors** - The page took too long to load or an element didn't appear. - Fix: Try `browse wait load` before interacting, or increase wait time. - ---- - -## Typical Workflow - -``` -1. browse open → navigate to the page -2. browse snapshot → read accessibility tree, get element refs -3. browse click/type/fill → interact using refs from step 2 -4. browse snapshot → verify action worked -5. repeat 3-4 as needed -6. 
browse stop → clean up -``` - ---- - -## Local vs Remote Mode - -| Feature | Local | Remote (Browserbase) | -|---------|-------|----------------------| -| Speed | Faster | Slightly slower | -| Setup | Local Chrome required | API key required | -| Anti-bot stealth | No | Yes | -| CAPTCHA solving | No | Yes (reCAPTCHA, hCaptcha) | -| Residential proxies | No | Yes (201 countries) | -| Session persistence | No | Yes | -| Best for | Dev, simple pages | Protected sites, production scraping | diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index 2922eb7..ea1f35a 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -144,44 +144,22 @@ browse stop - **Action fails**: Run `browse snapshot` to see available elements and their refs - **Browserbase fails**: Verify API key and project ID are set -## Switching Between Local and Remote Environment +## Switching to Remote Mode -The browse CLI automatically uses remote Browserbase sessions when `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set in the environment. Use `browse env` to check or switch environments on the fly: +Switch to remote when you detect: CAPTCHAs (reCAPTCHA, hCaptcha, Turnstile), bot detection pages ("Checking your browser..."), HTTP 403/429, empty pages on sites that should have content, or the user asks for it. + +Don't switch for simple sites (docs, wikis, public APIs, localhost). ```bash -browse env # check current environment browse env remote # switch to Browserbase -browse env local # switch to local Chrome +browse env local # switch back to local Chrome ``` -The switch is sticky — subsequent commands stay in the chosen environment until you switch again or run `browse stop` (which resets to env var detection). 
- -### When to switch to remote - -Switch to remote mode when you detect ANY of these signals: -- **CAPTCHA challenges**: reCAPTCHA, hCaptcha, Cloudflare Turnstile on the page -- **Bot detection pages**: "Checking your browser...", Cloudflare interstitial, Akamai block page -- **Access denied**: HTTP 403/429 responses, "access denied" or "bot detected" page content -- **Missing content**: repeated element-not-found on pages that should have content (bot detection may serve empty/different pages) -- **User request**: user says "use Browserbase", "bypass the CAPTCHA", "use a proxy", "use stealth mode", or similar - -Do NOT switch to remote for simple sites (docs, wikis, public APIs, localhost). - -### Setting up credentials - -If `browse env remote` fails because API keys aren't set, guide the user: +The switch is sticky until you run `browse stop` or switch again. If API keys aren't set: ```bash -openclaw browserbase setup # interactive — prompts for API key + project ID +openclaw browserbase setup # interactive — prompts for API key + project ID ``` -Credentials are saved to `~/.openclaw/openclaw.json`. On next startup, the plugin loads them into the environment automatically. - -### What NOT to do - -- Don't retry the same URL in remote mode without asking the user first -- Don't fall back to local silently if remote fails — tell the user what happened -- Don't suggest remote mode preemptively for simple, unprotected sites - For detailed examples, see [EXAMPLES.md](EXAMPLES.md). For API reference, see [REFERENCE.md](REFERENCE.md). 
From 909f675e51e3db281436131692af9358fa299dc6 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Thu, 26 Feb 2026 17:39:38 -0800 Subject: [PATCH 22/24] Add agent-browser remote interop and sub-agent fleet skills --- README.md | 4 +- skills/agent-browser-remote/LICENSE.txt | 21 +++ skills/agent-browser-remote/SKILL.md | 94 ++++++++++ .../scripts/browserbase-session.mjs | 177 ++++++++++++++++++ skills/browser-fleet/LICENSE.txt | 21 +++ skills/browser-fleet/SKILL.md | 92 +++++++++ skills/browser/REFERENCE.md | 16 ++ skills/browser/SKILL.md | 24 ++- 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 skills/agent-browser-remote/LICENSE.txt create mode 100644 skills/agent-browser-remote/SKILL.md create mode 100755 skills/agent-browser-remote/scripts/browserbase-session.mjs create mode 100644 skills/browser-fleet/LICENSE.txt create mode 100644 skills/browser-fleet/SKILL.md diff --git a/README.md b/README.md index a168b44..e78ea5b 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ This plugin includes the following skills (see `skills/` for details): | Skill | Description | |-------|-------------| | [browser](skills/browser/SKILL.md) | Automate web browser interactions via CLI commands — supports remote Browserbase sessions with anti-bot stealth, CAPTCHA solving, and residential proxies | +| [agent-browser-remote](skills/agent-browser-remote/SKILL.md) | Use Agent Browser with Browserbase remote CDP sessions when local browsing hits CAPTCHAs, anti-bot checks, or JavaScript-heavy sites | +| [browser-fleet](skills/browser-fleet/SKILL.md) | Orchestrate high-volume browser work via sub-agent fanout with clear decomposition, retries, and cleanup patterns | | [functions](skills/functions/SKILL.md) | Deploy serverless browser automation to Browserbase cloud using the `bb` CLI | ## Installation @@ -60,4 +62,4 @@ rm -rf .chrome-profile ## Resources - [Stagehand Documentation](https://github.com/browserbase/stagehand) -- [Claude Code 
Skills](https://support.claude.com/en/articles/12512176-what-are-skills) \ No newline at end of file +- [Claude Code Skills](https://support.claude.com/en/articles/12512176-what-are-skills) diff --git a/skills/agent-browser-remote/LICENSE.txt b/skills/agent-browser-remote/LICENSE.txt new file mode 100644 index 0000000..f2f4397 --- /dev/null +++ b/skills/agent-browser-remote/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Browserbase, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/agent-browser-remote/SKILL.md b/skills/agent-browser-remote/SKILL.md new file mode 100644 index 0000000..8e2ce4f --- /dev/null +++ b/skills/agent-browser-remote/SKILL.md @@ -0,0 +1,94 @@ +--- +name: agent-browser-remote +description: Use Agent Browser with Browserbase remote CDP sessions. 
Trigger this skill when users are driving browser tasks with agent-browser and encounter CAPTCHAs, anti-bot pages, IP-based blocking, JavaScript-heavy sites, geo restrictions, or request remote cloud browser execution. +compatibility: "Requires `agent-browser` and Node.js. Requires Browserbase credentials (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`) for remote sessions." +license: MIT +allowed-tools: Bash +metadata: + capabilities: + - agent-browser + - remote-browser + - cdp-interop + - stealth + - captcha-solving + - proxies + openclaw: + requires: + bins: + - agent-browser + - node + install: + - kind: node + package: "agent-browser" + bins: [agent-browser] + homepage: https://github.com/browserbase/skills +--- + +# Agent Browser + Remote CDP + +Use this skill when `agent-browser` local mode struggles on protected sites, or when the user explicitly wants remote cloud browsers. + +## Setup check + +```bash +which agent-browser || npm install -g agent-browser +agent-browser install +``` + +Set Browserbase credentials (or run `openclaw browserbase setup`): + +```bash +export BROWSERBASE_API_KEY="..." +export BROWSERBASE_PROJECT_ID="..." +``` + +## Fast path + +Create a Browserbase session, then point `agent-browser` at the returned CDP URL. 
+ +```bash +eval "$(node scripts/browserbase-session.mjs create --proxies true --advanced-stealth true --format shell)" +agent-browser --cdp "$BROWSERBASE_CDP_URL" open https://example.com +agent-browser --cdp "$BROWSERBASE_CDP_URL" snapshot -i --json +``` + +When done: + +```bash +node scripts/browserbase-session.mjs close --session-id "$BROWSERBASE_SESSION_ID" +``` + +## When to switch from local to remote + +Switch to remote when any of these appear: +- CAPTCHA or challenge pages (reCAPTCHA, hCaptcha, Turnstile) +- bot checks ("checking your browser", "verify you are human") +- repeated `403` / `429` from sites that should be accessible +- empty DOM/snapshot on JavaScript-heavy pages that should have content +- geo-specific content requirements + +Stay local for simple docs sites, localhost, and basic internal QA flows. + +## Command patterns + +Per-command CDP (explicit, stateless): + +```bash +agent-browser --cdp "$BROWSERBASE_CDP_URL" open https://target.com +agent-browser --cdp "$BROWSERBASE_CDP_URL" snapshot -i --json +agent-browser --cdp "$BROWSERBASE_CDP_URL" click @e2 +``` + +Or connect once, then run normal commands: + +```bash +agent-browser connect "$BROWSERBASE_CDP_URL" +agent-browser open https://target.com +agent-browser snapshot -i --json +``` + +## Notes + +- `--proxies true` requires a Browserbase plan that includes proxies. +- `--advanced-stealth true` requires a plan that includes advanced stealth. +- Always close remote sessions explicitly when the task ends. 
diff --git a/skills/agent-browser-remote/scripts/browserbase-session.mjs b/skills/agent-browser-remote/scripts/browserbase-session.mjs new file mode 100755 index 0000000..499c30d --- /dev/null +++ b/skills/agent-browser-remote/scripts/browserbase-session.mjs @@ -0,0 +1,177 @@ +#!/usr/bin/env node + +const DEFAULT_API_BASE = process.env.BROWSERBASE_API_BASE_URL || "https://api.browserbase.com"; + +function usage() { + console.error( + [ + "Usage:", + " node scripts/browserbase-session.mjs create [options]", + " node scripts/browserbase-session.mjs close --session-id [options]", + "", + "Create options:", + " --api-key Browserbase API key (or BROWSERBASE_API_KEY)", + " --project-id Browserbase project ID (or BROWSERBASE_PROJECT_ID)", + " --proxies Enable proxies", + " --advanced-stealth Enable advanced stealth", + " --keep-alive Keep session alive on Browserbase", + " --format Output format (default: json)", + " --api-base-url API base URL (default: https://api.browserbase.com)", + "", + "Close options:", + " --session-id Session ID to close (required)", + " --api-key Browserbase API key (or BROWSERBASE_API_KEY)", + " --api-base-url API base URL", + ].join("\n"), + ); +} + +function parseArgs(argv) { + const out = { _: [] }; + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + if (!arg.startsWith("--")) { + out._.push(arg); + continue; + } + const key = arg.slice(2); + const next = argv[i + 1]; + if (!next || next.startsWith("--")) { + out[key] = true; + continue; + } + out[key] = next; + i += 1; + } + return out; +} + +function parseBool(value, name) { + if (value === undefined) return undefined; + if (value === true || value === false) return value; + const normalized = String(value).trim().toLowerCase(); + if (["1", "true", "yes", "on"].includes(normalized)) return true; + if (["0", "false", "no", "off"].includes(normalized)) return false; + throw new Error(`Invalid boolean for --${name}: ${value}`); +} + +function shellQuote(value) { + return 
`'${String(value).replace(/'/g, `'\\''`)}'`; +} + +async function createSession(args) { + const apiKey = args["api-key"] || process.env.BROWSERBASE_API_KEY; + const projectId = args["project-id"] || process.env.BROWSERBASE_PROJECT_ID; + const format = String(args.format || "json").toLowerCase(); + const apiBaseUrl = String(args["api-base-url"] || DEFAULT_API_BASE).replace(/\/$/, ""); + + if (!apiKey) throw new Error("Missing API key. Set --api-key or BROWSERBASE_API_KEY."); + if (!projectId) throw new Error("Missing project ID. Set --project-id or BROWSERBASE_PROJECT_ID."); + if (!["json", "shell", "url"].includes(format)) { + throw new Error(`Invalid --format: ${format}`); + } + + const proxies = parseBool(args.proxies, "proxies"); + const advancedStealth = parseBool(args["advanced-stealth"], "advanced-stealth"); + const keepAlive = parseBool(args["keep-alive"], "keep-alive"); + + const payload = { projectId }; + if (proxies !== undefined) payload.proxies = proxies; + if (keepAlive !== undefined) payload.keepAlive = keepAlive; + if (advancedStealth !== undefined) { + payload.browserSettings = { advancedStealth }; + } + + const response = await fetch(`${apiBaseUrl}/v1/sessions`, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-BB-API-Key": apiKey, + }, + body: JSON.stringify(payload), + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`Failed to create session (${response.status}): ${text || response.statusText}`); + } + + const data = await response.json(); + const sessionId = data.id; + const connectUrl = data.connectUrl; + + if (!sessionId || !connectUrl) { + throw new Error("Browserbase response missing id or connectUrl."); + } + + const output = { + sessionId, + connectUrl, + debuggerUrl: `https://www.browserbase.com/sessions/${sessionId}`, + }; + + if (format === "url") { + console.log(output.connectUrl); + return; + } + + if (format === "shell") { + console.log(`export 
BROWSERBASE_SESSION_ID=${shellQuote(output.sessionId)}`); + console.log(`export BROWSERBASE_CDP_URL=${shellQuote(output.connectUrl)}`); + console.log(`export BROWSERBASE_DEBUGGER_URL=${shellQuote(output.debuggerUrl)}`); + return; + } + + console.log(JSON.stringify(output, null, 2)); +} + +async function closeSession(args) { + const apiKey = args["api-key"] || process.env.BROWSERBASE_API_KEY; + const sessionId = args["session-id"] || args.sessionId; + const apiBaseUrl = String(args["api-base-url"] || DEFAULT_API_BASE).replace(/\/$/, ""); + + if (!apiKey) throw new Error("Missing API key. Set --api-key or BROWSERBASE_API_KEY."); + if (!sessionId) throw new Error("Missing session ID. Set --session-id ."); + + const response = await fetch(`${apiBaseUrl}/v1/sessions/${encodeURIComponent(sessionId)}`, { + method: "DELETE", + headers: { + "X-BB-API-Key": apiKey, + }, + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`Failed to close session (${response.status}): ${text || response.statusText}`); + } + + console.log(JSON.stringify({ closed: true, sessionId }, null, 2)); +} + +async function main() { + const argv = process.argv.slice(2); + if (argv.length === 0 || argv.includes("--help") || argv.includes("-h")) { + usage(); + process.exit(argv.length === 0 ? 1 : 0); + } + + const [command, ...rest] = argv; + const args = parseArgs(rest); + + if (command === "create") { + await createSession(args); + return; + } + if (command === "close") { + await closeSession(args); + return; + } + + usage(); + process.exit(1); +} + +main().catch((error) => { + console.error(String(error?.message || error)); + process.exit(1); +}); diff --git a/skills/browser-fleet/LICENSE.txt b/skills/browser-fleet/LICENSE.txt new file mode 100644 index 0000000..f2f4397 --- /dev/null +++ b/skills/browser-fleet/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Browserbase, Inc. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/skills/browser-fleet/SKILL.md b/skills/browser-fleet/SKILL.md new file mode 100644 index 0000000..5db457f --- /dev/null +++ b/skills/browser-fleet/SKILL.md @@ -0,0 +1,92 @@ +--- +name: browser-fleet +description: Orchestrate high-volume browser tasks by decomposing one objective into many independent units and fanning out execution through sub-agents, each owning its own browser workflow. Use when users need parallel browser work such as competitive monitoring, account sweeps, QA matrix checks, regression checks across many URLs, or load-style deterministic actions. +compatibility: "Requires the `browse` CLI. For protected targets, set Browserbase credentials (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`) to use remote mode." 
+license: MIT +allowed-tools: Bash +metadata: + capabilities: + - parallel-subagents + - task-decomposition + - subagent-orchestration + - retry-control + openclaw: + requires: + bins: + - browse + install: + - kind: node + package: "@browserbasehq/browse-cli" + bins: [browse] + homepage: https://github.com/browserbase/skills +--- + +# Browser Fleet Orchestration + +Use this skill for parallel browser operations, not single interactive tasks. + +## Core rule + +Treat "fleet" as an orchestration pattern, not a CLI primitive. +Run fanout through sub-agents. + +## Sub-agent fanout (default) + +Use this for multi-step tasks per target/account. + +1. Build a worklist of independent units (URLs, account IDs, vendors, claims). +2. Give each sub-agent exactly one unit. +3. Require strict structured output from each sub-agent (JSON object). +4. Aggregate results and retry only failed units. + +Suggested sub-agent prompt contract: + +```text +Use /browser for exactly one target. +Steps: +1) open target URL +2) snapshot -c -i --main-frame +3) perform required action(s) +4) return JSON: {target, success, key_data, evidence, error} +Do not process multiple targets in one run. +``` + +## Deterministic batch pattern + +1. Generate a normalized worklist (`[{id,url,goal}]`). +2. Spawn one sub-agent per work item. +3. Keep each sub-agent deterministic with strict step order. +4. Merge outputs and run retries on failures only. + +## Research/exploratory pattern + +1. Generate a coarse worklist. +2. Spawn sub-agents with bounded budgets (turns/timeouts). +3. Require each sub-agent to return confidence + evidence. +4. Escalate low-confidence items to a second pass. + +## Recommended hybrid pattern + +1. Run a broad first pass over all items. +2. Classify `ok / retry / escalate`. +3. Retry transient failures (timeouts, temporary blocks). +4. Escalate hard cases to sub-agents for deeper reasoning. + +This keeps cost low while preserving high success on messy targets. 
+ +## Concurrency and reliability guardrails + +- Start with conservative concurrency (5-15 workers), then ramp. +- For anti-bot targets, switch to Browserbase remote mode before fanning out. +- Cap each unit by timeout and max retries. +- Keep result schema stable across all workers. + +## Cleanup + +Always clean up browser state after fanout: + +```bash +browse stop --force +pkill -f "browse.*daemon" || true +pkill -f "chrom(e|ium).*browse-" || true +``` diff --git a/skills/browser/REFERENCE.md b/skills/browser/REFERENCE.md index 1fbc989..7a23718 100644 --- a/skills/browser/REFERENCE.md +++ b/skills/browser/REFERENCE.md @@ -71,9 +71,13 @@ Get the accessibility tree with interactive element refs. This is the primary wa ```bash browse snapshot browse snapshot --compact # tree only, no ref maps +browse snapshot -c -i --main-frame # focused refs (recommended on large pages) +browse snapshot -c --contains "price" # filter tree lines by text +browse snapshot -c --max-lines 200 # cap output size ``` Returns a text representation of the page with refs like `@0-5` that can be passed to `click`. Use `--compact` for shorter output when you only need the tree. +Use `--interactive` and `--main-frame` to reduce payload size and speed up agent loops on heavy pages. #### `screenshot [path]` @@ -389,6 +393,18 @@ browse --session work open https://a.com browse --session personal open https://b.com ``` +#### Parallel orchestration (recommended) + +Use sub-agents for parallel browser work. Assign each sub-agent one independent unit and let each sub-agent run a normal `browse` workflow in its own session. + +```bash +# Example session isolation pattern +browse --session job-1 open https://example.com/a +browse --session job-2 open https://example.com/b +``` + +This improves reliability for multi-step tasks and makes retries/debugging easier than single-command fanout patterns. 
+ ### Environment Variables | Variable | Required | Description | diff --git a/skills/browser/SKILL.md b/skills/browser/SKILL.md index ea1f35a..74cfa61 100644 --- a/skills/browser/SKILL.md +++ b/skills/browser/SKILL.md @@ -5,6 +5,13 @@ compatibility: "Requires the browse CLI (`npm install -g @browserbasehq/browse-c license: MIT allowed-tools: Bash metadata: + capabilities: + - remote-browser + - stealth + - captcha-solving + - residential-proxies + - parallel-subagents + - cdp-interop openclaw: requires: bins: @@ -61,7 +68,8 @@ browse forward # Go forward in history ### Page state (prefer snapshot over screenshot) ```bash -browse snapshot # Get accessibility tree with element refs (fast, structured) +browse snapshot # Full accessibility tree with refs +browse snapshot -c -i --main-frame # Focused tree (recommended on large pages) browse screenshot [path] # Take visual screenshot (slow, uses vision tokens) browse get url # Get current URL browse get title # Get page title @@ -70,7 +78,9 @@ browse get html # Get HTML content of element browse get value # Get form field value ``` -Use `browse snapshot` as your default for understanding page state — it returns the accessibility tree with element refs you can use to interact. Only use `browse screenshot` when you need visual context (layout, images, debugging). +Use focused snapshots by default on complex pages: `browse snapshot -c -i --main-frame`. +Add `--contains "<text>"` and `--max-lines <n>` when output is large. +Only use `browse screenshot` when you need visual context (layout, images, debugging). ### Interaction ```bash @@ -101,9 +111,9 @@ browse tab_close [index] # Close tab ### Typical workflow 1. `browse open ` — navigate to the page -2. `browse snapshot` — read the accessibility tree to understand page structure and get element refs +2. `browse snapshot -c -i --main-frame` — get focused refs with less output 3. `browse click ` / `browse type ` / `browse fill ` — interact using refs from snapshot -4.
`browse snapshot` — confirm the action worked +4. `browse snapshot -c --contains "<expected text>" --max-lines 200` — confirm state changes 5. Repeat 3-4 as needed 6. `browse stop` — close the browser when done @@ -111,7 +121,7 @@ browse tab_close [index] # Close tab ```bash browse open https://example.com -browse snapshot # see page structure + element refs +browse snapshot -c -i --main-frame # focused refs browse click @0-5 # click element with ref 0-5 browse get title browse stop @@ -132,10 +142,12 @@ browse stop ## Best Practices 1. **Always `browse open` first** before interacting -2. **Use `browse snapshot`** to check page state — it's fast and gives you element refs +2. **Use focused snapshots** first — `browse snapshot -c -i --main-frame` 3. **Only screenshot when visual context is needed** (layout checks, images, debugging) 4. **Use refs from snapshot** to click/interact — e.g., `browse click @0-5` 5. **`browse stop`** when done to clean up the browser session +6. **For parallel work, use sub-agents** and assign one unit of work per agent +7. **Quote URLs with query params** — e.g.
`browse open "https://site.com/path?a=1&b=2"` so the shell does not glob `?` or treat `&` as a background operator ## Troubleshooting From 63df3bcb87a3c56e8c3450f939b8b5ad5d8f9b47 Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Thu, 26 Feb 2026 17:55:36 -0800 Subject: [PATCH 23/24] Rename browser-fleet skill to browse-fleet-subagents --- README.md | 2 +- skills/{browser-fleet => browse-fleet-subagents}/LICENSE.txt | 0 skills/{browser-fleet => browse-fleet-subagents}/SKILL.md | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename skills/{browser-fleet => browse-fleet-subagents}/LICENSE.txt (100%) rename skills/{browser-fleet => browse-fleet-subagents}/SKILL.md (99%) diff --git a/README.md b/README.md index e78ea5b..86d6dfb 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This plugin includes the following skills (see `skills/` for details): |-------|-------------| | [browser](skills/browser/SKILL.md) | Automate web browser interactions via CLI commands — supports remote Browserbase sessions with anti-bot stealth, CAPTCHA solving, and residential proxies | | [agent-browser-remote](skills/agent-browser-remote/SKILL.md) | Use Agent Browser with Browserbase remote CDP sessions when local browsing hits CAPTCHAs, anti-bot checks, or JavaScript-heavy sites | -| [browser-fleet](skills/browser-fleet/SKILL.md) | Orchestrate high-volume browser work via sub-agent fanout with clear decomposition, retries, and cleanup patterns | +| [browse-fleet-subagents](skills/browse-fleet-subagents/SKILL.md) | Orchestrate high-volume browser work via sub-agent fanout with clear decomposition, retries, and cleanup patterns | | [functions](skills/functions/SKILL.md) | Deploy serverless browser automation to Browserbase cloud using the `bb` CLI | ## Installation diff --git a/skills/browser-fleet/LICENSE.txt b/skills/browse-fleet-subagents/LICENSE.txt similarity index 100% rename from skills/browser-fleet/LICENSE.txt rename to skills/browse-fleet-subagents/LICENSE.txt diff --git a/skills/browser-fleet/SKILL.md 
b/skills/browse-fleet-subagents/SKILL.md similarity index 99% rename from skills/browser-fleet/SKILL.md rename to skills/browse-fleet-subagents/SKILL.md index 5db457f..9749076 100644 --- a/skills/browser-fleet/SKILL.md +++ b/skills/browse-fleet-subagents/SKILL.md @@ -1,5 +1,5 @@ --- -name: browser-fleet +name: browse-fleet-subagents description: Orchestrate high-volume browser tasks by decomposing one objective into many independent units and fanning out execution through sub-agents, each owning its own browser workflow. Use when users need parallel browser work such as competitive monitoring, account sweeps, QA matrix checks, regression checks across many URLs, or load-style deterministic actions. compatibility: "Requires the `browse` CLI. For protected targets, set Browserbase credentials (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`) to use remote mode." license: MIT From 9e5c1a3e786defecc4b5ba88662862c68d5578ec Mon Sep 17 00:00:00 2001 From: Shrey Pandya Date: Thu, 26 Feb 2026 21:10:39 -0800 Subject: [PATCH 24/24] Fix Browserbase session close flow in agent-browser-remote script --- .../scripts/browserbase-session.mjs | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/skills/agent-browser-remote/scripts/browserbase-session.mjs b/skills/agent-browser-remote/scripts/browserbase-session.mjs index 499c30d..97f3e26 100755 --- a/skills/agent-browser-remote/scripts/browserbase-session.mjs +++ b/skills/agent-browser-remote/scripts/browserbase-session.mjs @@ -20,6 +20,7 @@ function usage() { "", "Close options:", " --session-id Session ID to close (required)", + " --project-id Browserbase project ID (optional, or BROWSERBASE_PROJECT_ID)", " --api-key Browserbase API key (or BROWSERBASE_API_KEY)", " --api-base-url API base URL", ].join("\n"), @@ -128,24 +129,52 @@ async function createSession(args) { async function closeSession(args) { const apiKey = args["api-key"] || process.env.BROWSERBASE_API_KEY; const sessionId = 
args["session-id"] || args.sessionId; + const projectId = args["project-id"] || process.env.BROWSERBASE_PROJECT_ID; const apiBaseUrl = String(args["api-base-url"] || DEFAULT_API_BASE).replace(/\/$/, ""); if (!apiKey) throw new Error("Missing API key. Set --api-key or BROWSERBASE_API_KEY."); if (!sessionId) throw new Error("Missing session ID. Set --session-id ."); - const response = await fetch(`${apiBaseUrl}/v1/sessions/${encodeURIComponent(sessionId)}`, { - method: "DELETE", + // Current Browserbase API supports session release via POST /v1/sessions/{id}. + const releasePayload = { status: "REQUEST_RELEASE" }; + if (projectId) releasePayload.projectId = projectId; + + let response = await fetch(`${apiBaseUrl}/v1/sessions/${encodeURIComponent(sessionId)}`, { + method: "POST", headers: { + "Content-Type": "application/json", "X-BB-API-Key": apiKey, }, + body: JSON.stringify(releasePayload), }); + // Backward-compat fallback if the API still expects DELETE. + if (!response.ok && [404, 405].includes(response.status)) { + response = await fetch(`${apiBaseUrl}/v1/sessions/${encodeURIComponent(sessionId)}`, { + method: "DELETE", + headers: { + "X-BB-API-Key": apiKey, + }, + }); + } + if (!response.ok) { const text = await response.text(); throw new Error(`Failed to close session (${response.status}): ${text || response.statusText}`); } - console.log(JSON.stringify({ closed: true, sessionId }, null, 2)); + const data = await response.json().catch(() => ({})); + console.log( + JSON.stringify( + { + closed: true, + sessionId, + status: data?.status ?? "REQUESTED", + }, + null, + 2, + ), + ); } async function main() {