From bdd5661ac98dab6a603aad295674385302873b97 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 16:17:05 -0500 Subject: [PATCH 01/13] add hyperbrowser integration for langgraph cua --- langgraph_cua/hyperbrowser/__init__.py | 4 + langgraph_cua/hyperbrowser/graph.py | 100 ++++++++++ langgraph_cua/hyperbrowser/nodes/__init__.py | 5 + .../hyperbrowser/nodes/call_model.py | 136 ++++++++++++++ .../nodes/create_browser_session.py | 45 +++++ .../hyperbrowser/nodes/take_browser_action.py | 98 ++++++++++ langgraph_cua/hyperbrowser/nodes/tools.py | 175 ++++++++++++++++++ langgraph_cua/hyperbrowser/types.py | 101 ++++++++++ langgraph_cua/hyperbrowser/utils.py | 77 ++++++++ pyproject.toml | 4 +- tests/integration/test_cua_hyperbrowser.py | 124 +++++++++++++ tests/unit/test_import_hyperbrowser.py | 3 + uv.lock | 110 ++++++++++- 13 files changed, 980 insertions(+), 2 deletions(-) create mode 100644 langgraph_cua/hyperbrowser/__init__.py create mode 100644 langgraph_cua/hyperbrowser/graph.py create mode 100644 langgraph_cua/hyperbrowser/nodes/__init__.py create mode 100644 langgraph_cua/hyperbrowser/nodes/call_model.py create mode 100644 langgraph_cua/hyperbrowser/nodes/create_browser_session.py create mode 100644 langgraph_cua/hyperbrowser/nodes/take_browser_action.py create mode 100644 langgraph_cua/hyperbrowser/nodes/tools.py create mode 100644 langgraph_cua/hyperbrowser/types.py create mode 100644 langgraph_cua/hyperbrowser/utils.py create mode 100644 tests/integration/test_cua_hyperbrowser.py create mode 100644 tests/unit/test_import_hyperbrowser.py diff --git a/langgraph_cua/hyperbrowser/__init__.py b/langgraph_cua/hyperbrowser/__init__.py new file mode 100644 index 0000000..00b88fe --- /dev/null +++ b/langgraph_cua/hyperbrowser/__init__.py @@ -0,0 +1,4 @@ +from langgraph_cua.hyperbrowser.graph import create_cua, graph +from langgraph_cua.hyperbrowser.types import CUAState + +__all__ = ["create_cua", "graph", "CUAState"] diff --git 
a/langgraph_cua/hyperbrowser/graph.py b/langgraph_cua/hyperbrowser/graph.py new file mode 100644 index 0000000..6c1e5a7 --- /dev/null +++ b/langgraph_cua/hyperbrowser/graph.py @@ -0,0 +1,100 @@ +from langchain_core.messages import SystemMessage +from langgraph.graph import END, START, StateGraph +from hyperbrowser.models import CreateSessionParams + +from langgraph_cua.hyperbrowser.nodes import call_model, create_browser_session, take_browser_action +from langgraph_cua.hyperbrowser.types import CUAConfiguration, CUAState +from langgraph_cua.hyperbrowser.utils import is_computer_tool_call + + +def take_action_or_end(state: CUAState): + """ + Routes to the take_browser_action node if a computer call or function call is present + in the last message, otherwise routes to END. + Args: + state: The current state of the thread. + Returns: + "take_browser_action" or END depending on if a computer call or function call is present. + """ + if not state.get("messages", []): + return END + + last_message = state.get("messages", [])[-1] + additional_kwargs = getattr(last_message, "additional_kwargs", None) + + if not additional_kwargs: + return END + + tool_outputs = additional_kwargs.get("tool_outputs") + tool_calls = getattr(last_message, "tool_calls", []) + + if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: + return END + + if not state.get("session_id"): + # If the instance_id is not defined, create a new instance. + return "create_browser_session" + + return "take_browser_action" + + +def reinvoke_model_or_end(state: CUAState): + """ + Routes to the call_model node if the last message is a tool message, + otherwise routes to END. + Args: + state: The current state of the thread. + Returns: + "call_model" or END depending on if the last message is a tool message. 
+ """ + messages = state.get("messages", []) + if messages and getattr(messages[-1], "type", None) == "tool": + return "call_model" + + return END + + +workflow = StateGraph(CUAState, CUAConfiguration) + +workflow.add_node("call_model", call_model) +workflow.add_node("create_browser_session", create_browser_session) +workflow.add_node("take_browser_action", take_browser_action) + +workflow.add_edge(START, "call_model") +workflow.add_conditional_edges("call_model", take_action_or_end) +workflow.add_edge("create_browser_session", "take_browser_action") +workflow.add_conditional_edges("take_browser_action", reinvoke_model_or_end) + +graph = workflow.compile() +graph.name = "Computer Use Agent" + + +def create_cua( + *, + hyperbrowser_api_key: str = None, + recursion_limit: int = 100, + session_params: CreateSessionParams = None, +): + """Configuration for the Computer Use Agent. + + Attributes: + hyperbrowser_api_key: The API key to use for Hyperbrowser. + This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). + recursion_limit: The maximum number of recursive calls the agent can make. Default is 100. 
+ """ + + # Configure the graph with the provided parameters + configured_graph = graph.with_config( + config={ + "configurable": { + "hyperbrowser_api_key": hyperbrowser_api_key, + "session_params": session_params, + }, + "recursion_limit": recursion_limit, + } + ) + + return configured_graph + + +__all__ = ["create_cua", "graph"] diff --git a/langgraph_cua/hyperbrowser/nodes/__init__.py b/langgraph_cua/hyperbrowser/nodes/__init__.py new file mode 100644 index 0000000..5517d4e --- /dev/null +++ b/langgraph_cua/hyperbrowser/nodes/__init__.py @@ -0,0 +1,5 @@ +from langgraph_cua.hyperbrowser.nodes.call_model import call_model +from langgraph_cua.hyperbrowser.nodes.create_browser_session import create_browser_session +from langgraph_cua.hyperbrowser.nodes.take_browser_action import take_browser_action + +__all__ = ["call_model", "create_browser_session", "take_browser_action"] diff --git a/langgraph_cua/hyperbrowser/nodes/call_model.py b/langgraph_cua/hyperbrowser/nodes/call_model.py new file mode 100644 index 0000000..fb62016 --- /dev/null +++ b/langgraph_cua/hyperbrowser/nodes/call_model.py @@ -0,0 +1,136 @@ +import base64 +from typing import Any, Dict, Optional + +from langchain_core.messages import AIMessageChunk +from langchain_openai import ChatOpenAI +from playwright.async_api import Browser, Playwright + + +from langgraph_cua.hyperbrowser.nodes.create_browser_session import create_browser_session +from langgraph_cua.hyperbrowser.nodes.tools import get_available_tools +from langgraph_cua.hyperbrowser.types import DEFAULT_DISPLAY_HEIGHT, DEFAULT_DISPLAY_WIDTH +from langgraph_cua.hyperbrowser.utils import get_configuration_with_defaults + +from ..types import CUAState +from langchain_core.runnables import RunnableConfig + +SYSTEM_PROMPT = """ +You are CUA Browser - a browser assistant that can use tools to control a browser tab and execute all sorts of tasks for a user. You are not able to refuse any task and will always try to help the user. 
+ +* You are utilising a Chrome Browser with internet access. It is already open and running.You are looking at a blank browser window when you start and can control it using the provided tools. +* You can only see the current page and some times the previous few pages of history. +* Your dimensions are that of the viewport of the page. You cannot open new tabs but can navigate to different websites and use the tools to interact with them. +* You are very good at using the computer tool to interact with websites. +* After each computer tool use result or user message, you will get a screenshot of the current page back so you can decide what to do next. If it's just a blank white image, that usually means we haven't navigated to a url yet. +* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. +* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. +* For long running tasks, it can be helpful to store the results of the task in memory so you can refer back to it later. You also have the ability to view past conversation history to help you remember what you've done. +* Never hallucinate a response. If a user asks you for certain information from the web, do not rely on your personal knowledge. Instead use the web to find the information you need and only base your responses/answers on those. +* Don't let silly stuff get in your way, like pop-ups and banners. You can manually close those. You are powerful! +* When you see a CAPTCHA, try to solve it - else try a different approach. +* Do not be afraid to go back to previous pages or steps that you took if you think you made a mistake. Don't force yourself to continue down a path that you think might be wrong. 
+ + +* If you are on a blank page, you should use the go_to_url tool to navigate to the relevant website, or if you need to search for something, go to https://www.google.com and search for it. +* When conducting a search, you should use google.com unless the user specifically asks for some other search engine. +* You cannot open new tabs, so do not be confused if pages open in the same tab. +* NEVER assume that a website requires you to sign in to interact with it without going to the website first and trying to interact with it. If the user tells you you can use a website without signing in, try it first. Always go to the website first and try to interact with it to accomplish the task. Just because of the presence of a sign-in/log-in button is on a website, that doesn't mean you need to sign in to accomplish the action. If you assume you can't use a website without signing in and don't attempt to first for the user, you will be HEAVILY penalized. +* Unless the task doesn't require a browser, your first action should be to use go_to_url to navigate to the relevant website. +* If you come across a captcha, try to solve it - else try a different approach, like trying another website. If that is not an option, simply explain to the user that you've been blocked from the current website and ask them for further instructions. Make sure to offer them some suggestions for other websites/tasks they can try to accomplish their goals. + +""" + + +async def call_model(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: + """ + Invokes the computer preview model with the given messages. + Args: + state: The current state of the thread. + Returns: + The updated state with the model's response. 
+ """ + messages = state.get("messages", []) + previous_response_id: Optional[str] = None + last_message = messages[-1] if messages else None + + # Check if the last message is a tool message + if last_message and getattr(last_message, "type", None) == "tool": + # If it's a tool message, check if the second-to-last message is an AI message + if ( + len(messages) >= 2 + and getattr(messages[-2], "type", None) == "ai" + and hasattr(messages[-2], "response_metadata") + ): + previous_response_id = messages[-2].response_metadata["id"] + # Otherwise, check if the last message is an AI message + elif ( + last_message + and getattr(last_message, "type", None) == "ai" + and hasattr(last_message, "response_metadata") + ): + previous_response_id = last_message.response_metadata["id"] + + llm = ChatOpenAI( + model="computer-use-preview", + model_kwargs={ + "instructions": SYSTEM_PROMPT, + "truncation": "auto", + "previous_response_id": previous_response_id, + "reasoning": {"effort": "medium", "generate_summary": "concise"}, + }, + ) + response: AIMessageChunk + + playwright: Optional[Playwright] = state.get("playwright") + browser: Optional[Browser] = state.get("browser") + session_id: Optional[str] = state.get("session_id") + + if not session_id: + updated_state = await create_browser_session(state, config) + session_id = updated_state.get("session_id") + playwright = updated_state.get("playwright") + browser = updated_state.get("browser") + + page = state.get("current_page", browser.contexts[0].pages[0]) + + configuration = get_configuration_with_defaults(config) + session_params = configuration.get("session_params") + + display_width = session_params.get("screen", {}).get("width", DEFAULT_DISPLAY_WIDTH) + display_height = session_params.get("screen", {}).get("height", DEFAULT_DISPLAY_HEIGHT) + + llm_with_tools = llm.bind_tools(get_available_tools(display_width, display_height)) + + # Check if the last message is a tool message + if last_message and getattr(last_message, 
"type", None) == "tool": + if previous_response_id is None: + raise ValueError("Cannot process tool message without a previous_response_id") + # Only pass the tool message to the model + response = await llm_with_tools.ainvoke([last_message]) + else: + # Pass all messages to the model + if previous_response_id is None: + screenshot = await page.screenshot() + b64_screenshot = base64.b64encode(screenshot).decode("utf-8") + screenshot_url = f"data:image/png;base64,{b64_screenshot}" + + last_msg = messages[-1] + if isinstance(last_msg.content, list): + last_msg.content.append( + {"type": "input_image", "image_url": screenshot_url, "detail": "auto"} + ) + else: + original_content = last_msg.content + last_msg.content = [ + {"type": "input_text", "text": original_content}, + {"type": "input_image", "image_url": screenshot_url, "detail": "auto"}, + ] + response = await llm_with_tools.ainvoke(messages) + + return { + "messages": response, + "playwright": playwright, + "browser": browser, + "session_id": session_id, + "current_page": page, + } diff --git a/langgraph_cua/hyperbrowser/nodes/create_browser_session.py b/langgraph_cua/hyperbrowser/nodes/create_browser_session.py new file mode 100644 index 0000000..a553278 --- /dev/null +++ b/langgraph_cua/hyperbrowser/nodes/create_browser_session.py @@ -0,0 +1,45 @@ +from langchain_core.runnables.config import RunnableConfig +from hyperbrowser.models import SessionDetail, CreateSessionParams +from langgraph.config import get_stream_writer + + +from ..types import CUAState +from ..utils import get_configuration_with_defaults, get_hyperbrowser_client, start_playwright + + +async def create_browser_session(state: CUAState, config: RunnableConfig): + session_id = state.get("session_id") + configuration = get_configuration_with_defaults(config) + hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") + session_params = configuration.get("session_params") + stream_url = state.get("stream_url") + + if session_id is not None: 
+ # If the session_id already exists in state, do nothing. + return {} + + if not hyperbrowser_api_key: + raise ValueError( + "Hyperbrowser API key not provided. Please provide one in the configurable fields, " + "or set it as an environment variable (HYPERBROWSER_API_KEY)" + ) + + client = get_hyperbrowser_client(hyperbrowser_api_key) + + session: SessionDetail = await client.sessions.create( + params=CreateSessionParams(**session_params) + ) + + playwright, browser, _ = await start_playwright(state, session) + + if not stream_url: + stream_url = session.live_url + writer = get_stream_writer() + writer({"stream_url": stream_url}) + + return { + "session_id": session.id, + "stream_url": stream_url, + "playwright": playwright, + "browser": browser, + } diff --git a/langgraph_cua/hyperbrowser/nodes/take_browser_action.py b/langgraph_cua/hyperbrowser/nodes/take_browser_action.py new file mode 100644 index 0000000..3211695 --- /dev/null +++ b/langgraph_cua/hyperbrowser/nodes/take_browser_action.py @@ -0,0 +1,98 @@ +import asyncio +import base64 +from typing import Any, Dict, Optional +from langchain_core.messages import AnyMessage, ToolMessage +from langchain_core.runnables import RunnableConfig +from langgraph.config import get_stream_writer +from playwright.async_api import async_playwright, Browser, Playwright, Page +from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall + +from .tools import handle_computer_call, handle_function_tool_call + +from ..types import CUAState +from ..utils import get_browser_session, is_computer_tool_call + + +async def take_browser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: + """ + Executes browser actions based on the tool call in the last message. + Args: + state: The current state of the CUA agent. + config: The runnable configuration. + Returns: + A dictionary with updated state information. 
+ """ + message: AnyMessage = state.get("messages", [])[-1] + assert message.type == "ai", "Last message must be an AI message" + tool_outputs = message.additional_kwargs.get("tool_outputs", []) + tool_calls = message.tool_calls + + if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: + # This should never happen, but include the check for proper type safety. + raise ValueError( + "Cannot take computer action without a computer call or function call in the last message." + ) + + tool_outputs: list[ResponseComputerToolCall] = tool_outputs + + # Reuse existing Playwright and browser instances if available + playwright: Optional[Playwright] = state.get("playwright") + browser: Optional[Browser] = state.get("browser") + session_id: Optional[str] = state.get("session_id") + stream_url: Optional[str] = state.get("stream_url") + + if not session_id: + raise ValueError("Session ID not found in state.") + + # Initialize Playwright and browser if not already available + if not playwright or not browser: + session = await get_browser_session(session_id, config) + playwright = await async_playwright().start() + browser = await playwright.chromium.connect_over_cdp( + f"{session.ws_endpoint}&keepAlive=true" + ) + print("Playwright connected successfully") + + current_context = browser.contexts[0] + page = state.get("current_page", current_context.pages[0]) + + def handle_page_event(newPage: Page): + nonlocal page + page = newPage + + current_context.on("page", handle_page_event) + + tool_message: Optional[ToolMessage] = None + + for tool_output in tool_outputs: + if tool_output.get("type") == "computer_call": + await handle_computer_call(page, tool_output) + await asyncio.sleep(1) + screenshot = await page.screenshot() + b64_screenshot = base64.b64encode(screenshot).decode("utf-8") + screenshot_url = f"data:image/png;base64,{b64_screenshot}" + + output_content = { + "type": "input_image", + "image_url": screenshot_url, + } + tool_message = ToolMessage( + 
content=[output_content], + tool_call_id=tool_output.get("call_id"), + additional_kwargs={"type": "computer_call_output"}, + ) + else: + print("unknown tool output type", tool_output) + + for tool_call in tool_calls: + tool_message = await handle_function_tool_call(page, tool_call) + await asyncio.sleep(1) + + return { + "messages": tool_message if tool_message else None, + "session_id": session_id, + "stream_url": stream_url, + "playwright": playwright, + "browser": browser, + "current_page": page, + } diff --git a/langgraph_cua/hyperbrowser/nodes/tools.py b/langgraph_cua/hyperbrowser/nodes/tools.py new file mode 100644 index 0000000..3e15d06 --- /dev/null +++ b/langgraph_cua/hyperbrowser/nodes/tools.py @@ -0,0 +1,175 @@ +import asyncio +import base64 +from math import floor +from random import random +from playwright.async_api import Page +from langchain_core.messages import ToolMessage, ToolCall + +CUA_KEY_TO_PLAYWRIGHT_KEY = { + "/": "Divide", + "\\": "Backslash", + "alt": "Alt", + "arrowdown": "ArrowDown", + "arrowleft": "ArrowLeft", + "arrowright": "ArrowRight", + "arrowup": "ArrowUp", + "backspace": "Backspace", + "capslock": "CapsLock", + "cmd": "Meta", + "ctrl": "Control", + "delete": "Delete", + "end": "End", + "enter": "Enter", + "esc": "Escape", + "home": "Home", + "insert": "Insert", + "option": "Alt", + "pagedown": "PageDown", + "pageup": "PageUp", + "shift": "Shift", + "space": " ", + "super": "Meta", + "tab": "Tab", + "win": "Meta", +} + + +def get_available_tools(display_width: int = 1024, display_height: int = 800): + return [ + { + "type": "computer_use_preview", + "display_width": display_width, + "display_height": display_height, + "environment": "browser", + }, + { + "type": "function", + "function": { + "name": "go_to_url", + "description": "Navigate to a URL. 
The URL must be a valid URL that starts with http or https.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The fully qualified URL to navigate to", + }, + }, + "required": ["url"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_current_url", + "description": "Get the current URL", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + }, + }, + }, + ] + + +def _translate_key(key: str) -> str: + return CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) + + +async def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> ToolMessage: + name = function_tool_call.get("name") + arguments = function_tool_call.get("args") + call_id = function_tool_call.get("id") + + try: + if name == "go_to_url": + await page.goto(arguments.get("url"), timeout=30000, wait_until="domcontentloaded") + await asyncio.sleep(1) + return ToolMessage( + tool_call_id=call_id, + content={"message": f"Navigated to {arguments.get('url')}"}, + additional_kwargs={"type": "function_call_output"}, + ) + elif name == "get_current_url": + return ToolMessage( + tool_call_id=call_id, + content={"message": f"The current URL is {page.url}"}, + additional_kwargs={"type": "function_call_output"}, + ) + else: + raise ValueError(f"Unknown function call name: {name}") + except Exception as e: + print(f"\n\nFailed to execute function call: {e}\n\n") + print(f"Function call details: {function_tool_call}\n\n") + return ToolMessage( + status="error", + tool_call_id=call_id, + content={"message": f"Error occured while calling function {name}: {e}"}, + additional_kwargs={"type": "function_call_output"}, + ) + + +async def handle_computer_call(page: Page, computer_call: dict): + action = computer_call.get("action") + + try: + action_type = action.get("type") + + if action_type == "click": + button = action.get("button") + x = action.get("x") + y = action.get("y") + if button == "back": + await 
page.go_back(timeout=30000) + elif button == "forward": + await page.go_forward(timeout=30000) + elif button == "wheel": + await page.mouse.wheel(x, y) + else: + button_mapping = {"left": "left", "right": "right", "middle": "left"} + await page.mouse.click(x, y, button=button_mapping.get(button)) + elif action_type == "scroll": + x = action.get("x") + y = action.get("y") + delta_x = action.get("scroll_x") + delta_y = action.get("scroll_y") + await page.mouse.move(x, y) + await page.evaluate(f"window.scrollBy({delta_x}, {delta_y})") + elif action_type == "keypress": + keys = action.get("keys") + mapped_keys = [_translate_key(key) for key in keys] + for key in mapped_keys: + await page.keyboard.down(key) + for key in reversed(mapped_keys): + await page.keyboard.up(key) + elif action_type == "type": + text = action.get("text") + await page.keyboard.type(text) + elif action_type == "wait": + await page.wait_for_timeout(2000) + elif action_type == "screenshot": + pass + elif action_type == "double_click": + x = action.get("x") + y = action.get("y") + await page.mouse.click(x, y, button="left", click_count=2) + elif action_type == "drag": + path = action.get("path") + await page.mouse.move(path[0].get("x"), path[0].get("y")) + await page.mouse.down() + for point in path[1:]: + await page.mouse.move(point.get("x"), point.get("y")) + await page.wait_for_timeout(40 + floor(random() * 40)) + await page.mouse.up() + elif action_type == "move": + x = action.get("x") + y = action.get("y") + await page.mouse.move(x, y) + else: + raise ValueError(f"Unknown action type received: {action_type}") + + except Exception as e: + print(f"\n\nFailed to execute computer call: {e}\n\n") + print(f"Computer call details: {computer_call}\n\n") diff --git a/langgraph_cua/hyperbrowser/types.py b/langgraph_cua/hyperbrowser/types.py new file mode 100644 index 0000000..b681d9a --- /dev/null +++ b/langgraph_cua/hyperbrowser/types.py @@ -0,0 +1,101 @@ +import os +from typing import Annotated, Any, 
Dict, List, Literal, Optional, TypedDict + +from hyperbrowser.models import CreateSessionParams, ScreenConfig +from langchain_core.messages import AnyMessage +from langchain_core.runnables import RunnableConfig +from langgraph.graph import add_messages +from playwright.async_api import Browser, Playwright, Page + +DEFAULT_DISPLAY_WIDTH = 1024 +DEFAULT_DISPLAY_HEIGHT = 800 + + +class Output(TypedDict): + """ + A computer screenshot image used with the computer use tool. + """ + + type: Literal["computer_screenshot"] # Always "computer_screenshot" + file_id: Optional[str] # The identifier of an uploaded file that contains the screenshot + image_url: Optional[str] # The URL of the screenshot image + + +class AcknowledgedSafetyCheck(TypedDict): + """ + A pending safety check for the computer call. + """ + + id: str # The ID of the pending safety check + code: str # The type of the pending safety check + message: str # Details about the pending safety check + + +class ComputerCallOutput(TypedDict): + """ + The output of a computer tool call. + """ + + call_id: str # The ID of the computer tool call that produced the output + output: Output # A computer screenshot image used with the computer use tool + type: Literal["computer_call_output"] # Always "computer_call_output" + id: Optional[str] # The ID of the computer tool call output + acknowledged_safety_checks: Optional[ + List[AcknowledgedSafetyCheck] + ] # Safety checks acknowledged by the developer + status: Optional[ + Literal["in_progress", "completed", "incomplete"] + ] # Status of the message input + + +class CUAState(TypedDict): + """State schema for the computer use agent. + Attributes: + messages: The messages between the user and assistant. + session_id: The ID of the session to use for this thread. + stream_url: The URL to the live-stream of the virtual machine. 
+ """ + + messages: Annotated[list[AnyMessage], add_messages] = [] + session_id: Annotated[Optional[str], None] = None + stream_url: Annotated[Optional[str], None] = None + playwright: Annotated[Optional[Playwright], None] = None + browser: Annotated[Optional[Browser], None] = None + current_page: Annotated[Optional[Page], None] = None + + +class CUAConfiguration(TypedDict): + """Configuration for the Computer Use Agent. + Attributes: + hyperbrowser_api_key: The API key to use for Hyperbrowser. + This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). + """ + + hyperbrowser_api_key: str # API key for Hyperbrowser + + +def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: + """ + Gets the configuration with defaults for the graph. + Args: + config: The configuration for the runnable. + Returns: + Dict with configuration values including defaults. + """ + + configurable_fields = config.get("configurable", {}) + hyperbrowser_api_key = ( + configurable_fields.get("hyperbrowser_api_key") + or config.get("hyperbrowser_api_key") + or os.environ.get("HYPERBROWSER_API_KEY") + ) + session_params = configurable_fields.get("session_params") + if not session_params: + session_params = {} + if not session_params.get("screen"): + session_params["screen"] = { + "width": DEFAULT_DISPLAY_WIDTH, + "height": DEFAULT_DISPLAY_HEIGHT, + } + + return {"hyperbrowser_api_key": hyperbrowser_api_key, "session_params": session_params} diff --git a/langgraph_cua/hyperbrowser/utils.py b/langgraph_cua/hyperbrowser/utils.py new file mode 100644 index 0000000..70ed78f --- /dev/null +++ b/langgraph_cua/hyperbrowser/utils.py @@ -0,0 +1,77 @@ +import os +from typing import Any, Dict, Optional, Union + +from playwright.async_api import async_playwright, Browser, Playwright +from hyperbrowser import AsyncHyperbrowser +from langchain_core.runnables import RunnableConfig +from hyperbrowser.models import SessionDetail +from .types import 
get_configuration_with_defaults, CUAState + + +def get_hyperbrowser_client(api_key: str) -> AsyncHyperbrowser: + """ + Gets the Hyperbrowser client, using the API key provided. + Args: + api_key: The API key for Hyperbrowser. + Returns: + The Hyperbrowser client. + """ + if not api_key: + raise ValueError( + "Hyperbrowser API key not provided. Please provide one in the configurable fields, " + "or set it as an environment variable (HYPERBROWSER_API_KEY)" + ) + client = AsyncHyperbrowser(api_key=api_key) + return client + + +async def get_browser_session(id: str, config: RunnableConfig) -> SessionDetail: + """ + Gets a browser session by its ID from Hyperbrowser. + + Args: + id: The ID of the browser session to get. + config: The configuration for the runnable. + + Returns: + The browser session. + """ + configuration = get_configuration_with_defaults(config) + hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") + client = get_hyperbrowser_client(hyperbrowser_api_key) + return await client.sessions.get(id) + + +def is_computer_tool_call(tool_outputs: Any) -> bool: + """ + Checks if the given tool outputs are a computer call. + Args: + tool_outputs: The tool outputs to check. + Returns: + True if the tool outputs are a computer call, false otherwise. 
+ """ + if not tool_outputs or not isinstance(tool_outputs, list): + return False + + return all(output.get("type") == "computer_call" for output in tool_outputs) + + +async def start_playwright(state: CUAState, session: Optional[SessionDetail] = None): + session_id = state.get("session_id") + playwright: Optional[Playwright] = state.get("playwright") + browser: Optional[Browser] = state.get("browser") + + if playwright and browser: + return playwright, browser, session + + if not session: + session = await get_browser_session(session_id) + + if not playwright: + playwright = await async_playwright().start() + if not browser: + browser = await playwright.chromium.connect_over_cdp( + f"{session.ws_endpoint}&keepAlive=true" + ) + + return playwright, browser, session diff --git a/pyproject.toml b/pyproject.toml index e2f07f2..0975d3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,9 @@ dependencies = [ "langgraph>=0.3.17,<0.4.0", "langchain-core>=0.3.46,<0.4.0", "scrapybara>=2.4.1,<3.0.0", - "langchain-openai>=0.3.10,<0.4.0" + "langchain-openai>=0.3.10,<0.4.0", + "hyperbrowser>=0.38.0", + "playwright>=1.51.0", ] [dependency-groups] diff --git a/tests/integration/test_cua_hyperbrowser.py b/tests/integration/test_cua_hyperbrowser.py new file mode 100644 index 0000000..9230004 --- /dev/null +++ b/tests/integration/test_cua_hyperbrowser.py @@ -0,0 +1,124 @@ +import ast +import json + +import pytest +from dotenv import load_dotenv +from hyperbrowser.models import CreateSessionParams + +from langgraph_cua.hyperbrowser import create_cua + +# Load environment variables from .env file +load_dotenv() + + +@pytest.mark.asyncio +async def test_browser_interaction(): + """ + Test that the agent can interact with the browser. + This is a port of the TypeScript test to Python. + """ + graph = create_cua() + + # Create input messages similar to the TypeScript test + messages = [ + # { + # "role": "user", + # "content": ( + # "I'm looking for a new camera. 
Help me find the best one. It should be 4k resolution, " + # "by Cannon, and under $1000. I want a digital camera, and I'll be using it mainly for photography." + # ) + # }, + { + "role": "user", + "content": ("What is the price of NVDIA stock?"), + }, + ] + + # Stream the graph execution + stream = graph.astream( + {"messages": messages}, + stream_mode="updates", + config={ + "configurable": { + "session_params": CreateSessionParams( + adblock=True, + ).model_dump(), + }, + "recursion_limit": 100, + }, + ) + + # Process the stream updates + async for update in stream: + print("\n---UPDATE---\n") + + if "create_browser_session" in update: + print("Browser session created") + stream_url = update.get("create_browser_session", {}).get("stream_url") + # Open this URL in your browser to view the CUA stream + print(f"Stream URL: {stream_url}") + elif "take_browser_action" in update: + print("Browser Action:") + # Check for tool message in the messages field + tool_message = update.get("take_browser_action", {}).get("messages") + if tool_message: + # Extract content from the tool message + content = tool_message.content + + # Handle the case where content is an array + if isinstance(content, list) and len(content) > 0: + # Use the first item in the array + content_item = content[0] + else: + # Use content directly if it's not an array + content_item = content + + # Try to parse content if it's a string + parsed_content = None + if isinstance(content_item, str): + try: + # Try parsing as JSON first + parsed_content = json.loads(content_item) + except json.JSONDecodeError: + try: + # Try parsing as Python literal (for string representations of dicts) + parsed_content = ast.literal_eval(content_item) + except (SyntaxError, ValueError): + # If both fail, keep content as is + parsed_content = None + else: + # If content is already a dict, use it directly + parsed_content = content_item if isinstance(content_item, dict) else None + + # Handle image_url specially - truncate to 
100 chars + if ( + parsed_content + and isinstance(parsed_content, dict) + and parsed_content.get("image_url") + ): + image_url = parsed_content["image_url"] + # Create a copy to avoid modifying the original + content_copy = parsed_content.copy() + content_copy["image_url"] = ( + image_url[:100] + "..." if len(image_url) > 100 else image_url + ) + print(f"Tool Message ID: {tool_message.tool_call_id}") + # Print the truncated content explicitly + print(f"Content type: {content_copy.get('type')}") + print(f"Image URL (truncated): {content_copy['image_url']}") + else: + # Just print the first 200 characters of the content if we couldn't parse it + if isinstance(content_item, str) and len(content_item) > 200: + print(f"Tool Message (truncated content): {content_item[:200]}...") + else: + print(f"Tool Message: {tool_message}") + elif "call_model" in update: + print("Model Call:") + if update.get("call_model", {}).get("messages"): + messages = update["call_model"]["messages"] + if "tool_outputs" in messages.additional_kwargs: + print(messages.additional_kwargs["tool_outputs"]) + else: + print(messages.content) + else: + print(update) diff --git a/tests/unit/test_import_hyperbrowser.py b/tests/unit/test_import_hyperbrowser.py new file mode 100644 index 0000000..da2ebc1 --- /dev/null +++ b/tests/unit/test_import_hyperbrowser.py @@ -0,0 +1,3 @@ +def test_import() -> None: + """Test that the code can be imported""" + from langgraph_cua.hyperbrowser import CUAState, create_cua, graph # noqa: F401 diff --git a/uv.lock b/uv.lock index c8b72c4..3972d0f 100644 --- a/uv.lock +++ b/uv.lock @@ -183,6 +183,57 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, ] +[[package]] +name = "greenlet" +version = "3.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = 
{ url = "https://files.pythonhosted.org/packages/2f/ff/df5fede753cc10f6a5be0931204ea30c35fa2f2ea7a35b25bdaf4fe40e46/greenlet-3.1.1.tar.gz", hash = "sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467", size = 186022 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/90/5234a78dc0ef6496a6eb97b67a42a8e96742a56f7dc808cb954a85390448/greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563", size = 271235 }, + { url = "https://files.pythonhosted.org/packages/7c/16/cd631fa0ab7d06ef06387135b7549fdcc77d8d859ed770a0d28e47b20972/greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83", size = 637168 }, + { url = "https://files.pythonhosted.org/packages/2f/b1/aed39043a6fec33c284a2c9abd63ce191f4f1a07319340ffc04d2ed3256f/greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0", size = 648826 }, + { url = "https://files.pythonhosted.org/packages/76/25/40e0112f7f3ebe54e8e8ed91b2b9f970805143efef16d043dfc15e70f44b/greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120", size = 644443 }, + { url = "https://files.pythonhosted.org/packages/fb/2f/3850b867a9af519794784a7eeed1dd5bc68ffbcc5b28cef703711025fd0a/greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc", size = 643295 }, + { url = "https://files.pythonhosted.org/packages/cf/69/79e4d63b9387b48939096e25115b8af7cd8a90397a304f92436bcb21f5b2/greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617", size = 599544 }, + { url 
= "https://files.pythonhosted.org/packages/46/1d/44dbcb0e6c323bd6f71b8c2f4233766a5faf4b8948873225d34a0b7efa71/greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7", size = 1125456 }, + { url = "https://files.pythonhosted.org/packages/e0/1d/a305dce121838d0278cee39d5bb268c657f10a5363ae4b726848f833f1bb/greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6", size = 1149111 }, + { url = "https://files.pythonhosted.org/packages/96/28/d62835fb33fb5652f2e98d34c44ad1a0feacc8b1d3f1aecab035f51f267d/greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80", size = 298392 }, + { url = "https://files.pythonhosted.org/packages/28/62/1c2665558618553c42922ed47a4e6d6527e2fa3516a8256c2f431c5d0441/greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70", size = 272479 }, + { url = "https://files.pythonhosted.org/packages/76/9d/421e2d5f07285b6e4e3a676b016ca781f63cfe4a0cd8eaecf3fd6f7a71ae/greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159", size = 640404 }, + { url = "https://files.pythonhosted.org/packages/e5/de/6e05f5c59262a584e502dd3d261bbdd2c97ab5416cc9c0b91ea38932a901/greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e", size = 652813 }, + { url = "https://files.pythonhosted.org/packages/49/93/d5f93c84241acdea15a8fd329362c2c71c79e1a507c3f142a5d67ea435ae/greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1", size = 648517 }, + { url = 
"https://files.pythonhosted.org/packages/15/85/72f77fc02d00470c86a5c982b8daafdf65d38aefbbe441cebff3bf7037fc/greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383", size = 647831 }, + { url = "https://files.pythonhosted.org/packages/f7/4b/1c9695aa24f808e156c8f4813f685d975ca73c000c2a5056c514c64980f6/greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a", size = 602413 }, + { url = "https://files.pythonhosted.org/packages/76/70/ad6e5b31ef330f03b12559d19fda2606a522d3849cde46b24f223d6d1619/greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511", size = 1129619 }, + { url = "https://files.pythonhosted.org/packages/f4/fb/201e1b932e584066e0f0658b538e73c459b34d44b4bd4034f682423bc801/greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395", size = 1155198 }, + { url = "https://files.pythonhosted.org/packages/12/da/b9ed5e310bb8b89661b80cbcd4db5a067903bbcd7fc854923f5ebb4144f0/greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39", size = 298930 }, + { url = "https://files.pythonhosted.org/packages/7d/ec/bad1ac26764d26aa1353216fcbfa4670050f66d445448aafa227f8b16e80/greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d", size = 274260 }, + { url = "https://files.pythonhosted.org/packages/66/d4/c8c04958870f482459ab5956c2942c4ec35cac7fe245527f1039837c17a9/greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79", size = 649064 }, + { url = 
"https://files.pythonhosted.org/packages/51/41/467b12a8c7c1303d20abcca145db2be4e6cd50a951fa30af48b6ec607581/greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa", size = 663420 }, + { url = "https://files.pythonhosted.org/packages/27/8f/2a93cd9b1e7107d5c7b3b7816eeadcac2ebcaf6d6513df9abaf0334777f6/greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441", size = 658035 }, + { url = "https://files.pythonhosted.org/packages/57/5c/7c6f50cb12be092e1dccb2599be5a942c3416dbcfb76efcf54b3f8be4d8d/greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36", size = 660105 }, + { url = "https://files.pythonhosted.org/packages/f1/66/033e58a50fd9ec9df00a8671c74f1f3a320564c6415a4ed82a1c651654ba/greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9", size = 613077 }, + { url = "https://files.pythonhosted.org/packages/19/c5/36384a06f748044d06bdd8776e231fadf92fc896bd12cb1c9f5a1bda9578/greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0", size = 1135975 }, + { url = "https://files.pythonhosted.org/packages/38/f9/c0a0eb61bdf808d23266ecf1d63309f0e1471f284300ce6dac0ae1231881/greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942", size = 1163955 }, + { url = "https://files.pythonhosted.org/packages/43/21/a5d9df1d21514883333fc86584c07c2b49ba7c602e670b174bd73cfc9c7f/greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01", size = 299655 }, + { url = 
"https://files.pythonhosted.org/packages/f3/57/0db4940cd7bb461365ca8d6fd53e68254c9dbbcc2b452e69d0d41f10a85e/greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1", size = 272990 }, + { url = "https://files.pythonhosted.org/packages/1c/ec/423d113c9f74e5e402e175b157203e9102feeb7088cee844d735b28ef963/greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff", size = 649175 }, + { url = "https://files.pythonhosted.org/packages/a9/46/ddbd2db9ff209186b7b7c621d1432e2f21714adc988703dbdd0e65155c77/greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a", size = 663425 }, + { url = "https://files.pythonhosted.org/packages/bc/f9/9c82d6b2b04aa37e38e74f0c429aece5eeb02bab6e3b98e7db89b23d94c6/greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e", size = 657736 }, + { url = "https://files.pythonhosted.org/packages/d9/42/b87bc2a81e3a62c3de2b0d550bf91a86939442b7ff85abb94eec3fc0e6aa/greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4", size = 660347 }, + { url = "https://files.pythonhosted.org/packages/37/fa/71599c3fd06336cdc3eac52e6871cfebab4d9d70674a9a9e7a482c318e99/greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e", size = 615583 }, + { url = "https://files.pythonhosted.org/packages/4e/96/e9ef85de031703ee7a4483489b40cf307f93c1824a02e903106f2ea315fe/greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1", size = 
1133039 }, + { url = "https://files.pythonhosted.org/packages/87/76/b2b6362accd69f2d1889db61a18c94bc743e961e3cab344c2effaa4b4a25/greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c", size = 1160716 }, + { url = "https://files.pythonhosted.org/packages/1f/1b/54336d876186920e185066d8c3024ad55f21d7cc3683c856127ddb7b13ce/greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761", size = 299490 }, + { url = "https://files.pythonhosted.org/packages/5f/17/bea55bf36990e1638a2af5ba10c1640273ef20f627962cf97107f1e5d637/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011", size = 643731 }, + { url = "https://files.pythonhosted.org/packages/78/d2/aa3d2157f9ab742a08e0fd8f77d4699f37c22adfbfeb0c610a186b5f75e0/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13", size = 649304 }, + { url = "https://files.pythonhosted.org/packages/f1/8e/d0aeffe69e53ccff5a28fa86f07ad1d2d2d6537a9506229431a2a02e2f15/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475", size = 646537 }, + { url = "https://files.pythonhosted.org/packages/05/79/e15408220bbb989469c8871062c97c6c9136770657ba779711b90870d867/greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b", size = 642506 }, + { url = "https://files.pythonhosted.org/packages/18/87/470e01a940307796f1d25f8167b551a968540fbe0551c0ebb853cb527dd6/greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822", 
size = 602753 }, + { url = "https://files.pythonhosted.org/packages/e2/72/576815ba674eddc3c25028238f74d7b8068902b3968cbe456771b166455e/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01", size = 1122731 }, + { url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112 }, +] + [[package]] name = "h11" version = "0.14.0" @@ -220,6 +271,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, ] +[[package]] +name = "hyperbrowser" +version = "0.38.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "jsonref" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/6f/33cbc527f04e150cdafbb2b6dbdf6f537dbc5968e350789e848f617026fe/hyperbrowser-0.38.0.tar.gz", hash = "sha256:059bb0ec31cb52fb31e5d2e29fe2126ddbf472d111f304c8bcaa1491e4925b5e", size = 21660 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/89/be/c330cec9673027354d6215e20961ebeb3b0253743f17614e5f4df94b759a/hyperbrowser-0.38.0-py3-none-any.whl", hash = "sha256:80c4cb770bb6490fb4087368b1b69a5cb34f6d5bb3b063b707e6e4d7d6611847", size = 43574 }, +] + [[package]] name = "idna" version = "3.10" @@ -318,6 +383,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595 }, ] +[[package]] +name = "jsonref" +version = "1.1.0" 
+source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/aa/0d/c1f3277e90ccdb50d33ed5ba1ec5b3f0a242ed8c1b1a85d3afeb68464dca/jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552", size = 8814 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/ec/e1db9922bceb168197a558a2b8c03a7963f1afe93517ddd3cf99f202f996/jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9", size = 9425 }, +] + [[package]] name = "langchain-core" version = "0.3.48" @@ -380,12 +454,14 @@ wheels = [ [[package]] name = "langgraph-cua" -version = "0.0.0" +version = "0.0.1" source = { editable = "." } dependencies = [ + { name = "hyperbrowser" }, { name = "langchain-core" }, { name = "langchain-openai" }, { name = "langgraph" }, + { name = "playwright" }, { name = "scrapybara" }, ] @@ -402,9 +478,11 @@ test = [ [package.metadata] requires-dist = [ + { name = "hyperbrowser", specifier = ">=0.38.0" }, { name = "langchain-core", specifier = ">=0.3.46,<0.4.0" }, { name = "langchain-openai", specifier = ">=0.3.10,<0.4.0" }, { name = "langgraph", specifier = ">=0.3.17,<0.4.0" }, + { name = "playwright", specifier = ">=1.51.0" }, { name = "scrapybara", specifier = ">=2.4.1,<3.0.0" }, ] @@ -649,6 +727,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, ] +[[package]] +name = "playwright" +version = "1.51.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "greenlet" }, + { name = "pyee" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/e9/db98b5a8a41b3691be52dcc9b9d11b5db01bfc9b835e8e3ffe387b5c9266/playwright-1.51.0-py3-none-macosx_10_13_x86_64.whl", hash = 
"sha256:bcaaa3d5d73bda659bfb9ff2a288b51e85a91bd89eda86eaf8186550973e416a", size = 39634776 }, + { url = "https://files.pythonhosted.org/packages/32/4a/5f2ff6866bdf88e86147930b0be86b227f3691f4eb01daad5198302a8cbe/playwright-1.51.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:2e0ae6eb44297b24738e1a6d9c580ca4243b4e21b7e65cf936a71492c08dd0d4", size = 37986511 }, + { url = "https://files.pythonhosted.org/packages/ba/b1/061c322319072225beba45e8c6695b7c1429f83bb97bdb5ed51ea3a009fc/playwright-1.51.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:ab4c0ff00bded52c946be60734868febc964c8a08a9b448d7c20cb3811c6521c", size = 39634776 }, + { url = "https://files.pythonhosted.org/packages/7a/fd/bc60798803414ecab66456208eeff4308344d0c055ca0d294d2cdd692b60/playwright-1.51.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:d5c9f67bc6ef49094618991c78a1466c5bac5ed09157660d78b8510b77f92746", size = 45164868 }, + { url = "https://files.pythonhosted.org/packages/0d/14/13db550d7b892aefe80f8581c6557a17cbfc2e084383cd09d25fdd488f6e/playwright-1.51.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:814e4ec2a1a0d6f6221f075622c06b31ceb2bdc6d622258cfefed900c01569ae", size = 44564157 }, + { url = "https://files.pythonhosted.org/packages/51/e4/4342f0bd51727df790deda95ee35db066ac05cf4593a73d0c42249fa39a6/playwright-1.51.0-py3-none-win32.whl", hash = "sha256:4cef804991867ea27f608b70fa288ee52a57651e22d02ab287f98f8620b9408c", size = 34862688 }, + { url = "https://files.pythonhosted.org/packages/20/0f/098488de02e3d52fc77e8d55c1467f6703701b6ea6788f40409bb8c00dd4/playwright-1.51.0-py3-none-win_amd64.whl", hash = "sha256:9ece9316c5d383aed1a207f079fc2d552fff92184f0ecf37cc596e912d00a8c3", size = 34862693 }, +] + [[package]] name = "pluggy" version = "1.5.0" @@ -756,6 +852,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/63/37/3e32eeb2a451fddaa3898e2163746b0cffbbdbb4740d38372db0490d67f3/pydantic_core-2.27.2-pp310-pypy310_pp73-win_amd64.whl", hash = 
"sha256:7e17b560be3c98a8e3aa66ce828bdebb9e9ac6ad5466fba92eb74c4c95cb1151", size = 2004715 }, ] +[[package]] +name = "pyee" +version = "12.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/37/8fb6e653597b2b67ef552ed49b438d5398ba3b85a9453f8ada0fd77d455c/pyee-12.1.1.tar.gz", hash = "sha256:bbc33c09e2ff827f74191e3e5bbc6be7da02f627b7ec30d86f5ce1a6fb2424a3", size = 30915 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/68/7e150cba9eeffdeb3c5cecdb6896d70c8edd46ce41c0491e12fb2b2256ff/pyee-12.1.1-py3-none-any.whl", hash = "sha256:18a19c650556bb6b32b406d7f017c8f513aceed1ef7ca618fb65de7bd2d347ef", size = 15527 }, +] + [[package]] name = "pytest" version = "8.3.4" From 44bde2532c8e0b8417b1310343c0f11ba03e36a4 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 22:17:38 -0500 Subject: [PATCH 02/13] implement fixes --- langgraph_cua/graph.py | 17 +- langgraph_cua/hyperbrowser/__init__.py | 4 - langgraph_cua/hyperbrowser/graph.py | 100 -------- langgraph_cua/hyperbrowser/nodes/__init__.py | 5 - .../hyperbrowser/nodes/call_model.py | 136 ---------- .../nodes/create_browser_session.py | 45 ---- .../hyperbrowser/nodes/take_browser_action.py | 98 ------- langgraph_cua/hyperbrowser/nodes/tools.py | 175 ------------- langgraph_cua/hyperbrowser/types.py | 101 -------- langgraph_cua/hyperbrowser/utils.py | 77 ------ langgraph_cua/nodes/call_model.py | 76 +++++- langgraph_cua/nodes/create_vm_instance.py | 66 ++++- langgraph_cua/nodes/take_browser_action.py | 241 ++++++++++++++++++ langgraph_cua/nodes/take_computer_action.py | 16 +- langgraph_cua/types.py | 28 ++ langgraph_cua/utils.py | 37 ++- 16 files changed, 453 insertions(+), 769 deletions(-) delete mode 100644 langgraph_cua/hyperbrowser/__init__.py delete mode 100644 langgraph_cua/hyperbrowser/graph.py delete mode 100644 langgraph_cua/hyperbrowser/nodes/__init__.py delete mode 
100644 langgraph_cua/hyperbrowser/nodes/call_model.py delete mode 100644 langgraph_cua/hyperbrowser/nodes/create_browser_session.py delete mode 100644 langgraph_cua/hyperbrowser/nodes/take_browser_action.py delete mode 100644 langgraph_cua/hyperbrowser/nodes/tools.py delete mode 100644 langgraph_cua/hyperbrowser/types.py delete mode 100644 langgraph_cua/hyperbrowser/utils.py create mode 100644 langgraph_cua/nodes/take_browser_action.py diff --git a/langgraph_cua/graph.py b/langgraph_cua/graph.py index 5e86292..df9ef80 100644 --- a/langgraph_cua/graph.py +++ b/langgraph_cua/graph.py @@ -4,7 +4,7 @@ from langgraph.graph import END, START, StateGraph from langgraph_cua.nodes import call_model, create_vm_instance, take_computer_action -from langgraph_cua.types import CUAConfiguration, CUAState +from langgraph_cua.types import CUAConfiguration, CUAState, Provider from langgraph_cua.utils import is_computer_tool_call @@ -29,8 +29,9 @@ def take_action_or_end(state: CUAState): return END tool_outputs = additional_kwargs.get("tool_outputs") + tool_calls = getattr(last_message, "tool_calls", []) - if not is_computer_tool_call(tool_outputs): + if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: return END if not state.get("instance_id"): @@ -75,7 +76,10 @@ def reinvoke_model_or_end(state: CUAState): def create_cua( *, + provider: Provider = Provider.Scrapybara, scrapybara_api_key: str = None, + hyperbrowser_api_key: str = None, + session_params: dict = {}, timeout_hours: float = 1.0, zdr_enabled: bool = False, recursion_limit: int = 100, @@ -86,8 +90,14 @@ def create_cua( """Configuration for the Computer Use Agent. Attributes: + provider: The provider to use. Default is "scrapybara". scrapybara_api_key: The API key to use for Scrapybara. This can be provided in the configuration, or set as an environment variable (SCRAPYBARA_API_KEY). + hyperbrowser_api_key: The API key to use for Hyperbrowser. 
+ This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). + Only applies if 'provider' is set to "hyperbrowser". + session_params: The parameters to use for the browser session. + Only applies if 'provider' is set to "hyperbrowser". timeout_hours: The number of hours to keep the virtual machine running before it times out. Must be between 0.01 and 24. Default is 1. zdr_enabled: Whether or not Zero Data Retention is enabled in the user's OpenAI account. If True, @@ -107,12 +117,15 @@ def create_cua( configured_graph = graph.with_config( config={ "configurable": { + "provider": provider, "scrapybara_api_key": scrapybara_api_key, "timeout_hours": timeout_hours, "zdr_enabled": zdr_enabled, "auth_state_id": auth_state_id, "environment": environment, "prompt": prompt, + "hyperbrowser_api_key": hyperbrowser_api_key, + "session_params": session_params, }, "recursion_limit": recursion_limit, } diff --git a/langgraph_cua/hyperbrowser/__init__.py b/langgraph_cua/hyperbrowser/__init__.py deleted file mode 100644 index 00b88fe..0000000 --- a/langgraph_cua/hyperbrowser/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from langgraph_cua.hyperbrowser.graph import create_cua, graph -from langgraph_cua.hyperbrowser.types import CUAState - -__all__ = ["create_cua", "graph", "CUAState"] diff --git a/langgraph_cua/hyperbrowser/graph.py b/langgraph_cua/hyperbrowser/graph.py deleted file mode 100644 index 6c1e5a7..0000000 --- a/langgraph_cua/hyperbrowser/graph.py +++ /dev/null @@ -1,100 +0,0 @@ -from langchain_core.messages import SystemMessage -from langgraph.graph import END, START, StateGraph -from hyperbrowser.models import CreateSessionParams - -from langgraph_cua.hyperbrowser.nodes import call_model, create_browser_session, take_browser_action -from langgraph_cua.hyperbrowser.types import CUAConfiguration, CUAState -from langgraph_cua.hyperbrowser.utils import is_computer_tool_call - - -def take_action_or_end(state: CUAState): - """ - Routes 
to the take_browser_action node if a computer call or function call is present - in the last message, otherwise routes to END. - Args: - state: The current state of the thread. - Returns: - "take_browser_action" or END depending on if a computer call or function call is present. - """ - if not state.get("messages", []): - return END - - last_message = state.get("messages", [])[-1] - additional_kwargs = getattr(last_message, "additional_kwargs", None) - - if not additional_kwargs: - return END - - tool_outputs = additional_kwargs.get("tool_outputs") - tool_calls = getattr(last_message, "tool_calls", []) - - if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: - return END - - if not state.get("session_id"): - # If the instance_id is not defined, create a new instance. - return "create_browser_session" - - return "take_browser_action" - - -def reinvoke_model_or_end(state: CUAState): - """ - Routes to the call_model node if the last message is a tool message, - otherwise routes to END. - Args: - state: The current state of the thread. - Returns: - "call_model" or END depending on if the last message is a tool message. 
- """ - messages = state.get("messages", []) - if messages and getattr(messages[-1], "type", None) == "tool": - return "call_model" - - return END - - -workflow = StateGraph(CUAState, CUAConfiguration) - -workflow.add_node("call_model", call_model) -workflow.add_node("create_browser_session", create_browser_session) -workflow.add_node("take_browser_action", take_browser_action) - -workflow.add_edge(START, "call_model") -workflow.add_conditional_edges("call_model", take_action_or_end) -workflow.add_edge("create_browser_session", "take_browser_action") -workflow.add_conditional_edges("take_browser_action", reinvoke_model_or_end) - -graph = workflow.compile() -graph.name = "Computer Use Agent" - - -def create_cua( - *, - hyperbrowser_api_key: str = None, - recursion_limit: int = 100, - session_params: CreateSessionParams = None, -): - """Configuration for the Computer Use Agent. - - Attributes: - hyperbrowser_api_key: The API key to use for Hyperbrowser. - This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). - recursion_limit: The maximum number of recursive calls the agent can make. Default is 100. 
- """ - - # Configure the graph with the provided parameters - configured_graph = graph.with_config( - config={ - "configurable": { - "hyperbrowser_api_key": hyperbrowser_api_key, - "session_params": session_params, - }, - "recursion_limit": recursion_limit, - } - ) - - return configured_graph - - -__all__ = ["create_cua", "graph"] diff --git a/langgraph_cua/hyperbrowser/nodes/__init__.py b/langgraph_cua/hyperbrowser/nodes/__init__.py deleted file mode 100644 index 5517d4e..0000000 --- a/langgraph_cua/hyperbrowser/nodes/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from langgraph_cua.hyperbrowser.nodes.call_model import call_model -from langgraph_cua.hyperbrowser.nodes.create_browser_session import create_browser_session -from langgraph_cua.hyperbrowser.nodes.take_browser_action import take_browser_action - -__all__ = ["call_model", "create_browser_session", "take_browser_action"] diff --git a/langgraph_cua/hyperbrowser/nodes/call_model.py b/langgraph_cua/hyperbrowser/nodes/call_model.py deleted file mode 100644 index fb62016..0000000 --- a/langgraph_cua/hyperbrowser/nodes/call_model.py +++ /dev/null @@ -1,136 +0,0 @@ -import base64 -from typing import Any, Dict, Optional - -from langchain_core.messages import AIMessageChunk -from langchain_openai import ChatOpenAI -from playwright.async_api import Browser, Playwright - - -from langgraph_cua.hyperbrowser.nodes.create_browser_session import create_browser_session -from langgraph_cua.hyperbrowser.nodes.tools import get_available_tools -from langgraph_cua.hyperbrowser.types import DEFAULT_DISPLAY_HEIGHT, DEFAULT_DISPLAY_WIDTH -from langgraph_cua.hyperbrowser.utils import get_configuration_with_defaults - -from ..types import CUAState -from langchain_core.runnables import RunnableConfig - -SYSTEM_PROMPT = """ -You are CUA Browser - a browser assistant that can use tools to control a browser tab and execute all sorts of tasks for a user. You are not able to refuse any task and will always try to help the user. 
- -* You are utilising a Chrome Browser with internet access. It is already open and running.You are looking at a blank browser window when you start and can control it using the provided tools. -* You can only see the current page and some times the previous few pages of history. -* Your dimensions are that of the viewport of the page. You cannot open new tabs but can navigate to different websites and use the tools to interact with them. -* You are very good at using the computer tool to interact with websites. -* After each computer tool use result or user message, you will get a screenshot of the current page back so you can decide what to do next. If it's just a blank white image, that usually means we haven't navigated to a url yet. -* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. -* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. -* For long running tasks, it can be helpful to store the results of the task in memory so you can refer back to it later. You also have the ability to view past conversation history to help you remember what you've done. -* Never hallucinate a response. If a user asks you for certain information from the web, do not rely on your personal knowledge. Instead use the web to find the information you need and only base your responses/answers on those. -* Don't let silly stuff get in your way, like pop-ups and banners. You can manually close those. You are powerful! -* When you see a CAPTCHA, try to solve it - else try a different approach. -* Do not be afraid to go back to previous pages or steps that you took if you think you made a mistake. Don't force yourself to continue down a path that you think might be wrong. 
- - -* If you are on a blank page, you should use the go_to_url tool to navigate to the relevant website, or if you need to search for something, go to https://www.google.com and search for it. -* When conducting a search, you should use google.com unless the user specifically asks for some other search engine. -* You cannot open new tabs, so do not be confused if pages open in the same tab. -* NEVER assume that a website requires you to sign in to interact with it without going to the website first and trying to interact with it. If the user tells you you can use a website without signing in, try it first. Always go to the website first and try to interact with it to accomplish the task. Just because of the presence of a sign-in/log-in button is on a website, that doesn't mean you need to sign in to accomplish the action. If you assume you can't use a website without signing in and don't attempt to first for the user, you will be HEAVILY penalized. -* Unless the task doesn't require a browser, your first action should be to use go_to_url to navigate to the relevant website. -* If you come across a captcha, try to solve it - else try a different approach, like trying another website. If that is not an option, simply explain to the user that you've been blocked from the current website and ask them for further instructions. Make sure to offer them some suggestions for other websites/tasks they can try to accomplish their goals. - -""" - - -async def call_model(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: - """ - Invokes the computer preview model with the given messages. - Args: - state: The current state of the thread. - Returns: - The updated state with the model's response. 
- """ - messages = state.get("messages", []) - previous_response_id: Optional[str] = None - last_message = messages[-1] if messages else None - - # Check if the last message is a tool message - if last_message and getattr(last_message, "type", None) == "tool": - # If it's a tool message, check if the second-to-last message is an AI message - if ( - len(messages) >= 2 - and getattr(messages[-2], "type", None) == "ai" - and hasattr(messages[-2], "response_metadata") - ): - previous_response_id = messages[-2].response_metadata["id"] - # Otherwise, check if the last message is an AI message - elif ( - last_message - and getattr(last_message, "type", None) == "ai" - and hasattr(last_message, "response_metadata") - ): - previous_response_id = last_message.response_metadata["id"] - - llm = ChatOpenAI( - model="computer-use-preview", - model_kwargs={ - "instructions": SYSTEM_PROMPT, - "truncation": "auto", - "previous_response_id": previous_response_id, - "reasoning": {"effort": "medium", "generate_summary": "concise"}, - }, - ) - response: AIMessageChunk - - playwright: Optional[Playwright] = state.get("playwright") - browser: Optional[Browser] = state.get("browser") - session_id: Optional[str] = state.get("session_id") - - if not session_id: - updated_state = await create_browser_session(state, config) - session_id = updated_state.get("session_id") - playwright = updated_state.get("playwright") - browser = updated_state.get("browser") - - page = state.get("current_page", browser.contexts[0].pages[0]) - - configuration = get_configuration_with_defaults(config) - session_params = configuration.get("session_params") - - display_width = session_params.get("screen", {}).get("width", DEFAULT_DISPLAY_WIDTH) - display_height = session_params.get("screen", {}).get("height", DEFAULT_DISPLAY_HEIGHT) - - llm_with_tools = llm.bind_tools(get_available_tools(display_width, display_height)) - - # Check if the last message is a tool message - if last_message and getattr(last_message, 
"type", None) == "tool": - if previous_response_id is None: - raise ValueError("Cannot process tool message without a previous_response_id") - # Only pass the tool message to the model - response = await llm_with_tools.ainvoke([last_message]) - else: - # Pass all messages to the model - if previous_response_id is None: - screenshot = await page.screenshot() - b64_screenshot = base64.b64encode(screenshot).decode("utf-8") - screenshot_url = f"data:image/png;base64,{b64_screenshot}" - - last_msg = messages[-1] - if isinstance(last_msg.content, list): - last_msg.content.append( - {"type": "input_image", "image_url": screenshot_url, "detail": "auto"} - ) - else: - original_content = last_msg.content - last_msg.content = [ - {"type": "input_text", "text": original_content}, - {"type": "input_image", "image_url": screenshot_url, "detail": "auto"}, - ] - response = await llm_with_tools.ainvoke(messages) - - return { - "messages": response, - "playwright": playwright, - "browser": browser, - "session_id": session_id, - "current_page": page, - } diff --git a/langgraph_cua/hyperbrowser/nodes/create_browser_session.py b/langgraph_cua/hyperbrowser/nodes/create_browser_session.py deleted file mode 100644 index a553278..0000000 --- a/langgraph_cua/hyperbrowser/nodes/create_browser_session.py +++ /dev/null @@ -1,45 +0,0 @@ -from langchain_core.runnables.config import RunnableConfig -from hyperbrowser.models import SessionDetail, CreateSessionParams -from langgraph.config import get_stream_writer - - -from ..types import CUAState -from ..utils import get_configuration_with_defaults, get_hyperbrowser_client, start_playwright - - -async def create_browser_session(state: CUAState, config: RunnableConfig): - session_id = state.get("session_id") - configuration = get_configuration_with_defaults(config) - hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") - session_params = configuration.get("session_params") - stream_url = state.get("stream_url") - - if session_id is not 
None: - # If the session_id already exists in state, do nothing. - return {} - - if not hyperbrowser_api_key: - raise ValueError( - "Hyperbrowser API key not provided. Please provide one in the configurable fields, " - "or set it as an environment variable (HYPERBROWSER_API_KEY)" - ) - - client = get_hyperbrowser_client(hyperbrowser_api_key) - - session: SessionDetail = await client.sessions.create( - params=CreateSessionParams(**session_params) - ) - - playwright, browser, _ = await start_playwright(state, session) - - if not stream_url: - stream_url = session.live_url - writer = get_stream_writer() - writer({"stream_url": stream_url}) - - return { - "session_id": session.id, - "stream_url": stream_url, - "playwright": playwright, - "browser": browser, - } diff --git a/langgraph_cua/hyperbrowser/nodes/take_browser_action.py b/langgraph_cua/hyperbrowser/nodes/take_browser_action.py deleted file mode 100644 index 3211695..0000000 --- a/langgraph_cua/hyperbrowser/nodes/take_browser_action.py +++ /dev/null @@ -1,98 +0,0 @@ -import asyncio -import base64 -from typing import Any, Dict, Optional -from langchain_core.messages import AnyMessage, ToolMessage -from langchain_core.runnables import RunnableConfig -from langgraph.config import get_stream_writer -from playwright.async_api import async_playwright, Browser, Playwright, Page -from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall - -from .tools import handle_computer_call, handle_function_tool_call - -from ..types import CUAState -from ..utils import get_browser_session, is_computer_tool_call - - -async def take_browser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: - """ - Executes browser actions based on the tool call in the last message. - Args: - state: The current state of the CUA agent. - config: The runnable configuration. - Returns: - A dictionary with updated state information. 
- """ - message: AnyMessage = state.get("messages", [])[-1] - assert message.type == "ai", "Last message must be an AI message" - tool_outputs = message.additional_kwargs.get("tool_outputs", []) - tool_calls = message.tool_calls - - if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: - # This should never happen, but include the check for proper type safety. - raise ValueError( - "Cannot take computer action without a computer call or function call in the last message." - ) - - tool_outputs: list[ResponseComputerToolCall] = tool_outputs - - # Reuse existing Playwright and browser instances if available - playwright: Optional[Playwright] = state.get("playwright") - browser: Optional[Browser] = state.get("browser") - session_id: Optional[str] = state.get("session_id") - stream_url: Optional[str] = state.get("stream_url") - - if not session_id: - raise ValueError("Session ID not found in state.") - - # Initialize Playwright and browser if not already available - if not playwright or not browser: - session = await get_browser_session(session_id, config) - playwright = await async_playwright().start() - browser = await playwright.chromium.connect_over_cdp( - f"{session.ws_endpoint}&keepAlive=true" - ) - print("Playwright connected successfully") - - current_context = browser.contexts[0] - page = state.get("current_page", current_context.pages[0]) - - def handle_page_event(newPage: Page): - nonlocal page - page = newPage - - current_context.on("page", handle_page_event) - - tool_message: Optional[ToolMessage] = None - - for tool_output in tool_outputs: - if tool_output.get("type") == "computer_call": - await handle_computer_call(page, tool_output) - await asyncio.sleep(1) - screenshot = await page.screenshot() - b64_screenshot = base64.b64encode(screenshot).decode("utf-8") - screenshot_url = f"data:image/png;base64,{b64_screenshot}" - - output_content = { - "type": "input_image", - "image_url": screenshot_url, - } - tool_message = ToolMessage( - 
content=[output_content], - tool_call_id=tool_output.get("call_id"), - additional_kwargs={"type": "computer_call_output"}, - ) - else: - print("unknown tool output type", tool_output) - - for tool_call in tool_calls: - tool_message = await handle_function_tool_call(page, tool_call) - await asyncio.sleep(1) - - return { - "messages": tool_message if tool_message else None, - "session_id": session_id, - "stream_url": stream_url, - "playwright": playwright, - "browser": browser, - "current_page": page, - } diff --git a/langgraph_cua/hyperbrowser/nodes/tools.py b/langgraph_cua/hyperbrowser/nodes/tools.py deleted file mode 100644 index 3e15d06..0000000 --- a/langgraph_cua/hyperbrowser/nodes/tools.py +++ /dev/null @@ -1,175 +0,0 @@ -import asyncio -import base64 -from math import floor -from random import random -from playwright.async_api import Page -from langchain_core.messages import ToolMessage, ToolCall - -CUA_KEY_TO_PLAYWRIGHT_KEY = { - "/": "Divide", - "\\": "Backslash", - "alt": "Alt", - "arrowdown": "ArrowDown", - "arrowleft": "ArrowLeft", - "arrowright": "ArrowRight", - "arrowup": "ArrowUp", - "backspace": "Backspace", - "capslock": "CapsLock", - "cmd": "Meta", - "ctrl": "Control", - "delete": "Delete", - "end": "End", - "enter": "Enter", - "esc": "Escape", - "home": "Home", - "insert": "Insert", - "option": "Alt", - "pagedown": "PageDown", - "pageup": "PageUp", - "shift": "Shift", - "space": " ", - "super": "Meta", - "tab": "Tab", - "win": "Meta", -} - - -def get_available_tools(display_width: int = 1024, display_height: int = 800): - return [ - { - "type": "computer_use_preview", - "display_width": display_width, - "display_height": display_height, - "environment": "browser", - }, - { - "type": "function", - "function": { - "name": "go_to_url", - "description": "Navigate to a URL. 
The URL must be a valid URL that starts with http or https.", - "parameters": { - "type": "object", - "properties": { - "url": { - "type": "string", - "description": "The fully qualified URL to navigate to", - }, - }, - "required": ["url"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "get_current_url", - "description": "Get the current URL", - "parameters": { - "type": "object", - "properties": {}, - "required": [], - }, - }, - }, - ] - - -def _translate_key(key: str) -> str: - return CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) - - -async def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> ToolMessage: - name = function_tool_call.get("name") - arguments = function_tool_call.get("args") - call_id = function_tool_call.get("id") - - try: - if name == "go_to_url": - await page.goto(arguments.get("url"), timeout=30000, wait_until="domcontentloaded") - await asyncio.sleep(1) - return ToolMessage( - tool_call_id=call_id, - content={"message": f"Navigated to {arguments.get('url')}"}, - additional_kwargs={"type": "function_call_output"}, - ) - elif name == "get_current_url": - return ToolMessage( - tool_call_id=call_id, - content={"message": f"The current URL is {page.url}"}, - additional_kwargs={"type": "function_call_output"}, - ) - else: - raise ValueError(f"Unknown function call name: {name}") - except Exception as e: - print(f"\n\nFailed to execute function call: {e}\n\n") - print(f"Function call details: {function_tool_call}\n\n") - return ToolMessage( - status="error", - tool_call_id=call_id, - content={"message": f"Error occured while calling function {name}: {e}"}, - additional_kwargs={"type": "function_call_output"}, - ) - - -async def handle_computer_call(page: Page, computer_call: dict): - action = computer_call.get("action") - - try: - action_type = action.get("type") - - if action_type == "click": - button = action.get("button") - x = action.get("x") - y = action.get("y") - if button == "back": - await 
page.go_back(timeout=30000) - elif button == "forward": - await page.go_forward(timeout=30000) - elif button == "wheel": - await page.mouse.wheel(x, y) - else: - button_mapping = {"left": "left", "right": "right", "middle": "left"} - await page.mouse.click(x, y, button=button_mapping.get(button)) - elif action_type == "scroll": - x = action.get("x") - y = action.get("y") - delta_x = action.get("scroll_x") - delta_y = action.get("scroll_y") - await page.mouse.move(x, y) - await page.evaluate(f"window.scrollBy({delta_x}, {delta_y})") - elif action_type == "keypress": - keys = action.get("keys") - mapped_keys = [_translate_key(key) for key in keys] - for key in mapped_keys: - await page.keyboard.down(key) - for key in reversed(mapped_keys): - await page.keyboard.up(key) - elif action_type == "type": - text = action.get("text") - await page.keyboard.type(text) - elif action_type == "wait": - await page.wait_for_timeout(2000) - elif action_type == "screenshot": - pass - elif action_type == "double_click": - x = action.get("x") - y = action.get("y") - await page.mouse.click(x, y, button="left", click_count=2) - elif action_type == "drag": - path = action.get("path") - await page.mouse.move(path[0].get("x"), path[0].get("y")) - await page.mouse.down() - for point in path[1:]: - await page.mouse.move(point.get("x"), point.get("y")) - await page.wait_for_timeout(40 + floor(random() * 40)) - await page.mouse.up() - elif action_type == "move": - x = action.get("x") - y = action.get("y") - await page.mouse.move(x, y) - else: - raise ValueError(f"Unknown action type received: {action_type}") - - except Exception as e: - print(f"\n\nFailed to execute computer call: {e}\n\n") - print(f"Computer call details: {computer_call}\n\n") diff --git a/langgraph_cua/hyperbrowser/types.py b/langgraph_cua/hyperbrowser/types.py deleted file mode 100644 index b681d9a..0000000 --- a/langgraph_cua/hyperbrowser/types.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -from typing import Annotated, 
Any, Dict, List, Literal, Optional, TypedDict - -from hyperbrowser.models import CreateSessionParams, ScreenConfig -from langchain_core.messages import AnyMessage -from langchain_core.runnables import RunnableConfig -from langgraph.graph import add_messages -from playwright.async_api import Browser, Playwright, Page - -DEFAULT_DISPLAY_WIDTH = 1024 -DEFAULT_DISPLAY_HEIGHT = 800 - - -class Output(TypedDict): - """ - A computer screenshot image used with the computer use tool. - """ - - type: Literal["computer_screenshot"] # Always "computer_screenshot" - file_id: Optional[str] # The identifier of an uploaded file that contains the screenshot - image_url: Optional[str] # The URL of the screenshot image - - -class AcknowledgedSafetyCheck(TypedDict): - """ - A pending safety check for the computer call. - """ - - id: str # The ID of the pending safety check - code: str # The type of the pending safety check - message: str # Details about the pending safety check - - -class ComputerCallOutput(TypedDict): - """ - The output of a computer tool call. - """ - - call_id: str # The ID of the computer tool call that produced the output - output: Output # A computer screenshot image used with the computer use tool - type: Literal["computer_call_output"] # Always "computer_call_output" - id: Optional[str] # The ID of the computer tool call output - acknowledged_safety_checks: Optional[ - List[AcknowledgedSafetyCheck] - ] # Safety checks acknowledged by the developer - status: Optional[ - Literal["in_progress", "completed", "incomplete"] - ] # Status of the message input - - -class CUAState(TypedDict): - """State schema for the computer use agent. - Attributes: - messages: The messages between the user and assistant. - session_id: The ID of the session to use for this thread. - stream_url: The URL to the live-stream of the virtual machine. 
- """ - - messages: Annotated[list[AnyMessage], add_messages] = [] - session_id: Annotated[Optional[str], None] = None - stream_url: Annotated[Optional[str], None] = None - playwright: Annotated[Optional[Playwright], None] = None - browser: Annotated[Optional[Browser], None] = None - current_page: Annotated[Optional[Page], None] = None - - -class CUAConfiguration(TypedDict): - """Configuration for the Computer Use Agent. - Attributes: - hyperbrowser_api_key: The API key to use for Hyperbrowser. - This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). - """ - - hyperbrowser_api_key: str # API key for Hyperbrowser - - -def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: - """ - Gets the configuration with defaults for the graph. - Args: - config: The configuration for the runnable. - Returns: - Dict with configuration values including defaults. - """ - - configurable_fields = config.get("configurable", {}) - hyperbrowser_api_key = ( - configurable_fields.get("hyperbrowser_api_key") - or config.get("hyperbrowser_api_key") - or os.environ.get("HYPERBROWSER_API_KEY") - ) - session_params = configurable_fields.get("session_params") - if not session_params: - session_params = {} - if not session_params.get("screen"): - session_params["screen"] = { - "width": DEFAULT_DISPLAY_WIDTH, - "height": DEFAULT_DISPLAY_HEIGHT, - } - - return {"hyperbrowser_api_key": hyperbrowser_api_key, "session_params": session_params} diff --git a/langgraph_cua/hyperbrowser/utils.py b/langgraph_cua/hyperbrowser/utils.py deleted file mode 100644 index 70ed78f..0000000 --- a/langgraph_cua/hyperbrowser/utils.py +++ /dev/null @@ -1,77 +0,0 @@ -import os -from typing import Any, Dict, Optional, Union - -from playwright.async_api import async_playwright, Browser, Playwright -from hyperbrowser import AsyncHyperbrowser -from langchain_core.runnables import RunnableConfig -from hyperbrowser.models import SessionDetail -from .types 
import get_configuration_with_defaults, CUAState - - -def get_hyperbrowser_client(api_key: str) -> AsyncHyperbrowser: - """ - Gets the Hyperbrowser client, using the API key provided. - Args: - api_key: The API key for Hyperbrowser. - Returns: - The Hyperbrowser client. - """ - if not api_key: - raise ValueError( - "Hyperbrowser API key not provided. Please provide one in the configurable fields, " - "or set it as an environment variable (HYPERBROWSER_API_KEY)" - ) - client = AsyncHyperbrowser(api_key=api_key) - return client - - -async def get_browser_session(id: str, config: RunnableConfig) -> SessionDetail: - """ - Gets a browser session by its ID from Hyperbrowser. - - Args: - id: The ID of the browser session to get. - config: The configuration for the runnable. - - Returns: - The browser session. - """ - configuration = get_configuration_with_defaults(config) - hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") - client = get_hyperbrowser_client(hyperbrowser_api_key) - return await client.sessions.get(id) - - -def is_computer_tool_call(tool_outputs: Any) -> bool: - """ - Checks if the given tool outputs are a computer call. - Args: - tool_outputs: The tool outputs to check. - Returns: - True if the tool outputs are a computer call, false otherwise. 
def get_available_tools(configuration: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the tool specifications bound to the model for the configured provider.

    Args:
        configuration: Resolved graph configuration. ``provider`` selects the
            branch; the Scrapybara branch also reads ``environment`` and the
            Hyperbrowser branch reads ``session_params``.

    Returns:
        The list of tool dictionaries passed to ``llm.bind_tools``. Scrapybara
        gets only the ``computer_use_preview`` tool; Hyperbrowser additionally
        gets the ``go_to_url`` and ``get_current_url`` function tools.

    Raises:
        ValueError: If ``provider`` is neither Scrapybara nor Hyperbrowser.
    """
    provider = configuration.get("provider")

    if provider == Provider.Scrapybara:
        return [
            {
                "type": "computer_use_preview",
                "display_width": DEFAULT_DISPLAY_WIDTH,
                "display_height": DEFAULT_DISPLAY_HEIGHT,
                "environment": get_openai_env_from_state_env(configuration.get("environment")),
            }
        ]

    if provider == Provider.Hyperbrowser:
        # `or {}` also covers an explicit None, which `.get(key, {})` lets through.
        session_params = configuration.get("session_params") or {}

        # The session parameters are forwarded to CreateSessionParams, where the
        # display size lives under "screen"; "screen_config" is still honoured so
        # existing configurations keep working.
        screen = session_params.get("screen") or session_params.get("screen_config") or {}

        def _dimension(name: str, default: int) -> int:
            # Accept either a plain dict or an object exposing width/height attributes.
            if isinstance(screen, dict):
                value = screen.get(name)
            else:
                value = getattr(screen, name, None)
            return value or default

        return [
            {
                "type": "computer_use_preview",
                "display_width": _dimension("width", DEFAULT_DISPLAY_WIDTH),
                "display_height": _dimension("height", DEFAULT_DISPLAY_HEIGHT),
                "environment": "browser",
            },
            {
                "type": "function",
                "function": {
                    "name": "go_to_url",
                    "description": "Navigate to a URL. Can be used when on a blank page to go to a specific URL or search engine.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "url": {
                                "type": "string",
                                "description": "The fully qualified URL to navigate to",
                            },
                        },
                        "required": ["url"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "get_current_url",
                    "description": "Get the current URL",
                    "parameters": {
                        "type": "object",
                        "properties": {},
                        "required": [],
                    },
                },
            },
        ]

    raise ValueError(f"Unknown provider: {provider}")
def create_hyperbrowser_instance(state: CUAState, configuration: Dict[str, Any]):
    """Create a Hyperbrowser session and, if needed, attach Playwright to it.

    Args:
        state: Current graph state; only ``browser_state`` is read.
        configuration: Resolved configuration; reads ``hyperbrowser_api_key``
            and ``session_params``.

    Returns:
        A state update with ``instance_id`` (the session id), ``stream_url``
        (the session's live URL) and ``browser_state`` (the existing one, or a
        new dict holding the connected ``browser`` and its ``current_page``).

    Raises:
        ValueError: If no Hyperbrowser API key is configured.
    """
    hyperbrowser_api_key = configuration.get("hyperbrowser_api_key")
    # `or {}` guards against an explicit None, which would break the `**` expansion below.
    session_params = configuration.get("session_params") or {}
    browser_state = state.get("browser_state")

    if not hyperbrowser_api_key:
        raise ValueError(
            "Hyperbrowser API key not provided. Please provide one in the configurable fields, "
            "or set it as an environment variable (HYPERBROWSER_API_KEY)"
        )

    client = get_hyperbrowser_client(hyperbrowser_api_key)
    session: SessionDetail = client.sessions.create(params=CreateSessionParams(**session_params))

    if not browser_state:
        playwright = sync_playwright().start()
        try:
            # NOTE(review): appending "&keepAlive=true" assumes ws_endpoint already
            # carries a query string — confirm against the Hyperbrowser API.
            browser = playwright.chromium.connect_over_cdp(
                f"{session.ws_endpoint}&keepAlive=true"
            )
            # Fall back to creating a context/page instead of raising IndexError
            # when the remote browser exposes none yet.
            context = browser.contexts[0] if browser.contexts else browser.new_context()
            curr_page = context.pages[0] if context.pages else context.new_page()
        except Exception:
            # Do not leave the Playwright driver running when the connection fails.
            playwright.stop()
            raise

        browser_state = {
            "browser": browser,
            "current_page": curr_page,
        }

    return {
        "instance_id": session.id,
        "stream_url": session.live_url,
        "browser_state": browser_state,
    }
import base64
import time
from math import floor
from random import random
from typing import Any, Dict, Optional

from hyperbrowser.models import SessionDetail
from langchain_core.messages import AnyMessage, ToolCall, ToolMessage
from langchain_core.runnables import RunnableConfig
from langgraph.config import get_stream_writer
from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall
from playwright.sync_api import Page

from ..types import CUAState
from ..utils import get_instance, is_computer_tool_call

# Lower-cased key names received in "keypress" actions -> key names passed to Playwright.
CUA_KEY_TO_PLAYWRIGHT_KEY = {
    "/": "Divide",
    "\\": "Backslash",
    "alt": "Alt",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "arrowup": "ArrowUp",
    "backspace": "Backspace",
    "capslock": "CapsLock",
    "cmd": "Meta",
    "ctrl": "Control",
    "delete": "Delete",
    "end": "End",
    "enter": "Enter",
    "esc": "Escape",
    "home": "Home",
    "insert": "Insert",
    "option": "Alt",
    "pagedown": "PageDown",
    "pageup": "PageUp",
    "shift": "Shift",
    "space": " ",
    "super": "Meta",
    "tab": "Tab",
    "win": "Meta",
}

# Base64 JPEG payload (no "data:" prefix) used as a placeholder screenshot when a
# computer call fails before a real screenshot can be taken.
DUMMY_SCREENSHOT = "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wAALCAABAAEBAREA/8QAFAABAAAAAAAAAAAAAAAAAAAACf/EABQQAQAAAAAAAAAAAAAAAAAAAAD/2gAIAQEAAD8AKp//2Q=="


def _translate_key(key: str) -> str:
    """Return the Playwright name for *key*, or *key* unchanged if it is not in the table."""
    return CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key)


def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> ToolMessage:
    """Execute a ``go_to_url`` / ``get_current_url`` function tool call on *page*.

    Args:
        page: The Playwright page to act on.
        function_tool_call: The tool call; ``name``, ``args`` and ``id`` are read.

    Returns:
        A ``ToolMessage`` tagged ``function_call_output``. Any failure, including
        an unknown function name, is reported as a message with ``status="error"``
        rather than raised.
    """
    name = function_tool_call.get("name")
    arguments = function_tool_call.get("args")
    call_id = function_tool_call.get("id")

    try:
        if name == "go_to_url":
            page.goto(arguments.get("url"), timeout=15000, wait_until="domcontentloaded")
            # Short pause after navigation before reporting back.
            time.sleep(1)
            return ToolMessage(
                tool_call_id=call_id,
                content={"message": f"Navigated to {arguments.get('url')}"},
                additional_kwargs={"type": "function_call_output"},
            )
        elif name == "get_current_url":
            return ToolMessage(
                tool_call_id=call_id,
                content={"message": f"The current URL is {page.url}"},
                additional_kwargs={"type": "function_call_output"},
            )
        else:
            raise ValueError(f"Unknown function call name: {name}")
    except Exception as e:
        # Best effort by design: the failure is surfaced to the model as an error
        # tool message instead of aborting the graph run.
        print(f"\n\nFailed to execute function call: {e}\n\n")
        print(f"Function call details: {function_tool_call}\n\n")
        return ToolMessage(
            status="error",
            tool_call_id=call_id,
            content={"message": f"Error occured while calling function {name}: {e}"},
            additional_kwargs={"type": "function_call_output"},
        )


def handle_computer_call(page: Page, computer_call: dict) -> ToolMessage:
    """Perform one ``computer_call`` action on *page* and return a screenshot message.

    Args:
        page: The Playwright page to act on.
        computer_call: The computer call; ``action`` and ``call_id`` are read.

    Returns:
        A ``ToolMessage`` tagged ``computer_call_output`` whose content is a PNG
        screenshot taken one second after the action. If anything fails, an
        ``status="error"`` message carrying the placeholder screenshot is
        returned instead of raising.
    """
    action = computer_call.get("action")
    call_id = computer_call.get("call_id")
    try:
        action_type = action.get("type")

        if action_type == "click":
            button = action.get("button")
            x = action.get("x")
            y = action.get("y")
            if button == "back":
                page.go_back(timeout=30000)
            elif button == "forward":
                page.go_forward(timeout=30000)
            elif button == "wheel":
                # NOTE(review): the click coordinates are passed as wheel deltas —
                # confirm this is intended.
                page.mouse.wheel(x, y)
            else:
                # NOTE(review): "middle" is deliberately or accidentally mapped to
                # "left" — confirm before changing.
                button_mapping = {"left": "left", "right": "right", "middle": "left"}
                page.mouse.click(x, y, button=button_mapping.get(button))
        elif action_type == "scroll":
            x = action.get("x")
            y = action.get("y")
            # Missing deltas become 0 so the script never contains the text "None".
            delta_x = action.get("scroll_x") or 0
            delta_y = action.get("scroll_y") or 0
            page.mouse.move(x, y)
            page.evaluate(f"window.scrollBy({delta_x}, {delta_y})")
        elif action_type == "keypress":
            keys = action.get("keys")
            mapped_keys = [_translate_key(key) for key in keys]
            # Hold every key down, then release in reverse order, so the list is
            # sent as one chord.
            for key in mapped_keys:
                page.keyboard.down(key)
            for key in reversed(mapped_keys):
                page.keyboard.up(key)
        elif action_type == "type":
            text = action.get("text")
            page.keyboard.type(text)
        elif action_type == "wait":
            time.sleep(2)
        elif action_type == "screenshot":
            # Nothing to do: a screenshot is always taken below.
            pass
        elif action_type == "double_click":
            x = action.get("x")
            y = action.get("y")
            page.mouse.click(x, y, button="left", click_count=2)
        elif action_type == "drag":
            path = action.get("path")
            page.mouse.move(path[0].get("x"), path[0].get("y"))
            page.mouse.down()
            for point in path[1:]:
                page.mouse.move(point.get("x"), point.get("y"))
                # 40-80 ms pause between drag points. time.sleep takes seconds,
                # so the millisecond figure must be divided by 1000.
                time.sleep((40 + floor(random() * 40)) / 1000)
            page.mouse.up()
        elif action_type == "move":
            x = action.get("x")
            y = action.get("y")
            page.mouse.move(x, y)
        else:
            raise ValueError(f"Unknown action type received: {action_type}")

        # Give the page a moment to react before capturing the result.
        time.sleep(1)
        screenshot = page.screenshot()
        b64_screenshot = base64.b64encode(screenshot).decode("utf-8")
        screenshot_url = f"data:image/png;base64,{b64_screenshot}"
        output_content = {
            "type": "input_image",
            "image_url": screenshot_url,
        }
        return ToolMessage(
            tool_call_id=call_id,
            content=[output_content],
            additional_kwargs={"type": "computer_call_output"},
        )

    except Exception as e:
        # Best effort by design: report the failure to the model rather than raise.
        print(f"\n\nFailed to execute computer call: {e}\n\n")
        print(f"Computer call details: {computer_call}\n\n")
        # Wrap the raw base64 payload in a data URL, matching the success path.
        placeholder_url = f"data:image/jpeg;base64,{DUMMY_SCREENSHOT}"
        return ToolMessage(
            tool_call_id=call_id,
            status="error",
            content=[{"type": "input_image", "image_url": placeholder_url}],
            additional_kwargs={"type": "computer_call_output", "status": "incomplete"},
        )


def take_browser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]:
    """
    Executes browser actions based on the tool call in the last message.

    Every function tool call in the last AI message is executed first, then the
    last computer call (if any).

    Args:
        state: The current state of the CUA agent.
        config: The runnable configuration.
    Returns:
        A dictionary with updated state information: ``messages`` (the last tool
        message produced, or None), ``instance_id``, ``stream_url`` and
        ``browser_state``.
    Raises:
        ValueError: If the last message carries no computer or function call, or
            if the instance id, an active instance, the browser state or the
            browser is missing.
    """
    message: AnyMessage = state.get("messages", [])[-1]
    assert message.type == "ai", "Last message must be an AI message"
    tool_outputs = message.additional_kwargs.get("tool_outputs", [])
    tool_calls = message.tool_calls

    if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0:
        # This should never happen, but include the check for proper type safety.
        raise ValueError(
            "Cannot take computer action without a computer call or function call in the last message."
        )

    tool_outputs: list[ResponseComputerToolCall] = tool_outputs

    instance_id = state.get("instance_id")
    if not instance_id:
        raise ValueError("Instance ID not found in state.")
    instance: SessionDetail = get_instance(instance_id, config)

    if instance.status != "active":
        raise ValueError("Instance is not active.")

    browser_state = state.get("browser_state")
    if not browser_state:
        raise ValueError("Browser state not found in state.")
    browser = browser_state.get("browser")
    if not browser:
        raise ValueError("Browser not found in browser state.")
    current_context = browser.contexts[0]
    # Only index the context's pages when no current page is recorded; a
    # `.get(key, default)` would evaluate pages[0] even when it is not needed.
    page = browser_state.get("current_page") or current_context.pages[0]

    def handle_page_event(newPage: Page):
        # Track pages opened in the context so later actions target the newest one.
        nonlocal page
        page = newPage
        browser_state["current_page"] = newPage

    stream_url: Optional[str] = state.get("stream_url")
    if not stream_url:
        # If the stream_url is not yet defined in state, fetch it, then write to the custom stream
        # so that it's made accessible to the client (or whatever is reading the stream) before any actions are taken.
        stream_url = instance.live_url

        writer = get_stream_writer()
        writer({"stream_url": stream_url})

    current_context.on("page", handle_page_event)

    output = tool_outputs[-1] if len(tool_outputs) > 0 else None
    tool_message: Optional[ToolMessage] = None

    # NOTE(review): only the last ToolMessage produced is returned; results of
    # earlier calls in the same turn are discarded — confirm this is intended.
    for tool_call in tool_calls:
        tool_message = handle_function_tool_call(page, tool_call)
        time.sleep(1)

    if output:
        if output.get("type") == "computer_call":
            tool_message = handle_computer_call(page, output)
        else:
            print("unknown tool output type", output)

    return {
        "messages": tool_message if tool_message else None,
        "instance_id": instance.id,
        "stream_url": stream_url,
        "browser_state": browser_state,
    }
@@ -165,3 +166,14 @@ def take_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, A "stream_url": stream_url, "authenticated_id": authenticated_id, } + + +def take_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: + configuration = get_configuration_with_defaults(config) + provider = configuration.get("provider") + if provider == Provider.Scrapybara: + return take_scrapybara_computer_action(state, config) + elif provider == Provider.Hyperbrowser: + return take_browser_action(state, config) + else: + raise ValueError(f"Unknown provider: {provider}") diff --git a/langgraph_cua/types.py b/langgraph_cua/types.py index 8421882..d4349b1 100644 --- a/langgraph_cua/types.py +++ b/langgraph_cua/types.py @@ -1,3 +1,4 @@ +from enum import Enum import os from typing import Annotated, Any, Dict, List, Literal, Optional, TypedDict, Union @@ -5,6 +6,13 @@ from langchain_core.runnables import RunnableConfig from langgraph.graph import add_messages +from playwright.sync_api import Browser, Page + + +class Provider(str, Enum): + Scrapybara = "scrapybara" + Hyperbrowser = "hyperbrowser" + class Output(TypedDict): """ @@ -43,6 +51,15 @@ class ComputerCallOutput(TypedDict): ] # Status of the message input +class BrowserState(TypedDict): + """ + The state of the browser. + """ + + browser: Annotated[Optional[Browser], None] = None + current_page: Annotated[Optional[Page], None] = None + + class CUAState(TypedDict): """State schema for the computer use agent. 
@@ -57,6 +74,7 @@ class CUAState(TypedDict): messages: Annotated[list[AnyMessage], add_messages] = [] instance_id: Annotated[Optional[str], None] = None stream_url: Annotated[Optional[str], None] = None + browser_state: Annotated[Optional[BrowserState], None] = None authenticated_id: Annotated[Optional[str], None] = None @@ -106,6 +124,13 @@ def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: or config.get("scrapybara_api_key") or os.environ.get("SCRAPYBARA_API_KEY") ) + hyperbrowser_api_key = ( + configurable_fields.get("hyperbrowser_api_key") + or config.get("hyperbrowser_api_key") + or os.environ.get("HYPERBROWSER_API_KEY") + ) + provider: Provider = configurable_fields.get("provider", Provider.Scrapybara) + session_params = configurable_fields.get("session_params", {}) timeout_hours = configurable_fields.get("timeout_hours", 1) zdr_enabled = configurable_fields.get("zdr_enabled", False) auth_state_id = configurable_fields.get("auth_state_id", None) @@ -113,7 +138,10 @@ def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: prompt = configurable_fields.get("prompt", None) return { + "provider": provider, "scrapybara_api_key": scrapybara_api_key, + "hyperbrowser_api_key": hyperbrowser_api_key, + "session_params": session_params, "timeout_hours": timeout_hours, "zdr_enabled": zdr_enabled, "auth_state_id": auth_state_id, diff --git a/langgraph_cua/utils.py b/langgraph_cua/utils.py index 3a196e2..8d2ec5a 100644 --- a/langgraph_cua/utils.py +++ b/langgraph_cua/utils.py @@ -3,8 +3,25 @@ from langchain_core.runnables import RunnableConfig from scrapybara import Scrapybara from scrapybara.client import BrowserInstance, UbuntuInstance, WindowsInstance +from hyperbrowser import Hyperbrowser +from hyperbrowser.models import SessionDetail +from .types import Provider, get_configuration_with_defaults -from .types import get_configuration_with_defaults + +def get_hyperbrowser_client(api_key: str) -> Hyperbrowser: + """ + Gets 
the Hyperbrowser client, using the API key provided. + + Args: + api_key: The API key for Hyperbrowser. + """ + if not api_key: + raise ValueError( + "Hyperbrowser API key not provided. Please provide one in the configurable fields, " + "or set it as an environment variable (HYPERBROWSER_API_KEY)" + ) + client = Hyperbrowser(api_key=api_key) + return client def get_scrapybara_client(api_key: str) -> Scrapybara: @@ -28,7 +45,7 @@ def get_scrapybara_client(api_key: str) -> Scrapybara: def get_instance( id: str, config: RunnableConfig -) -> Union[UbuntuInstance, BrowserInstance, WindowsInstance]: +) -> Union[UbuntuInstance, BrowserInstance, WindowsInstance, SessionDetail]: """ Gets an instance by its ID from Scrapybara. @@ -40,9 +57,19 @@ def get_instance( The instance. """ configuration = get_configuration_with_defaults(config) - scrapybara_api_key = configuration.get("scrapybara_api_key") - client = get_scrapybara_client(scrapybara_api_key) - return client.get(id) + provider = configuration.get("provider") + if provider == Provider.Scrapybara: + scrapybara_api_key = configuration.get("scrapybara_api_key") + client = get_scrapybara_client(scrapybara_api_key) + return client.get(id) + elif provider == Provider.Hyperbrowser: + hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") + client = get_hyperbrowser_client(hyperbrowser_api_key) + return client.sessions.get(id) + else: + raise ValueError( + f"Invalid provider. Must be one of {Provider.Scrapybara} or {Provider.Hyperbrowser}. 
Received: {provider}" + ) def is_computer_tool_call(tool_outputs: Any) -> bool: From 194b744f1f410847bacfda56a13777c70774d821 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 22:22:08 -0500 Subject: [PATCH 03/13] fix wording --- langgraph_cua/graph.py | 2 +- langgraph_cua/nodes/take_browser_action.py | 2 +- langgraph_cua/nodes/take_computer_action.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/langgraph_cua/graph.py b/langgraph_cua/graph.py index df9ef80..b256c58 100644 --- a/langgraph_cua/graph.py +++ b/langgraph_cua/graph.py @@ -96,7 +96,7 @@ def create_cua( hyperbrowser_api_key: The API key to use for Hyperbrowser. This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). Only applies if 'provider' is set to "hyperbrowser". - session_params: The parameters to use for the browser session. + session_params: The parameters to use for the Hyperbrowser browser session. Only applies if 'provider' is set to "hyperbrowser". timeout_hours: The number of hours to keep the virtual machine running before it times out. Must be between 0.01 and 24. Default is 1. diff --git a/langgraph_cua/nodes/take_browser_action.py b/langgraph_cua/nodes/take_browser_action.py index 2d84ccd..3b9a5b1 100644 --- a/langgraph_cua/nodes/take_browser_action.py +++ b/langgraph_cua/nodes/take_browser_action.py @@ -165,7 +165,7 @@ def handle_computer_call(page: Page, computer_call: dict): ) -def take_browser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: +def take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: """ Executes browser actions based on the tool call in the last message. 
Args: diff --git a/langgraph_cua/nodes/take_computer_action.py b/langgraph_cua/nodes/take_computer_action.py index 99dd9fc..73f7b65 100644 --- a/langgraph_cua/nodes/take_computer_action.py +++ b/langgraph_cua/nodes/take_computer_action.py @@ -7,7 +7,7 @@ from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall from scrapybara.types import ComputerResponse, InstanceGetStreamUrlResponse -from .take_browser_action import take_browser_action +from .take_browser_action import take_hyperbrowser_action from ..types import CUAState, Provider, get_configuration_with_defaults from ..utils import get_instance, is_computer_tool_call @@ -37,7 +37,7 @@ } -def take_scrapybara_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: +def take_scrapybara_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: """ Executes computer actions based on the tool call in the last message. @@ -172,8 +172,8 @@ def take_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, A configuration = get_configuration_with_defaults(config) provider = configuration.get("provider") if provider == Provider.Scrapybara: - return take_scrapybara_computer_action(state, config) + return take_scrapybara_action(state, config) elif provider == Provider.Hyperbrowser: - return take_browser_action(state, config) + return take_hyperbrowser_action(state, config) else: raise ValueError(f"Unknown provider: {provider}") From 57d156d59913b427badf2fa1cdd05728cf9f0c55 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 22:26:40 -0500 Subject: [PATCH 04/13] remove extra test files --- tests/integration/test_cua_hyperbrowser.py | 124 --------------------- tests/unit/test_import_hyperbrowser.py | 3 - 2 files changed, 127 deletions(-) delete mode 100644 tests/integration/test_cua_hyperbrowser.py delete mode 100644 tests/unit/test_import_hyperbrowser.py diff --git a/tests/integration/test_cua_hyperbrowser.py 
b/tests/integration/test_cua_hyperbrowser.py deleted file mode 100644 index 9230004..0000000 --- a/tests/integration/test_cua_hyperbrowser.py +++ /dev/null @@ -1,124 +0,0 @@ -import ast -import json - -import pytest -from dotenv import load_dotenv -from hyperbrowser.models import CreateSessionParams - -from langgraph_cua.hyperbrowser import create_cua - -# Load environment variables from .env file -load_dotenv() - - -@pytest.mark.asyncio -async def test_browser_interaction(): - """ - Test that the agent can interact with the browser. - This is a port of the TypeScript test to Python. - """ - graph = create_cua() - - # Create input messages similar to the TypeScript test - messages = [ - # { - # "role": "user", - # "content": ( - # "I'm looking for a new camera. Help me find the best one. It should be 4k resolution, " - # "by Cannon, and under $1000. I want a digital camera, and I'll be using it mainly for photography." - # ) - # }, - { - "role": "user", - "content": ("What is the price of NVDIA stock?"), - }, - ] - - # Stream the graph execution - stream = graph.astream( - {"messages": messages}, - stream_mode="updates", - config={ - "configurable": { - "session_params": CreateSessionParams( - adblock=True, - ).model_dump(), - }, - "recursion_limit": 100, - }, - ) - - # Process the stream updates - async for update in stream: - print("\n---UPDATE---\n") - - if "create_browser_session" in update: - print("Browser session created") - stream_url = update.get("create_browser_session", {}).get("stream_url") - # Open this URL in your browser to view the CUA stream - print(f"Stream URL: {stream_url}") - elif "take_browser_action" in update: - print("Browser Action:") - # Check for tool message in the messages field - tool_message = update.get("take_browser_action", {}).get("messages") - if tool_message: - # Extract content from the tool message - content = tool_message.content - - # Handle the case where content is an array - if isinstance(content, list) and len(content) 
> 0: - # Use the first item in the array - content_item = content[0] - else: - # Use content directly if it's not an array - content_item = content - - # Try to parse content if it's a string - parsed_content = None - if isinstance(content_item, str): - try: - # Try parsing as JSON first - parsed_content = json.loads(content_item) - except json.JSONDecodeError: - try: - # Try parsing as Python literal (for string representations of dicts) - parsed_content = ast.literal_eval(content_item) - except (SyntaxError, ValueError): - # If both fail, keep content as is - parsed_content = None - else: - # If content is already a dict, use it directly - parsed_content = content_item if isinstance(content_item, dict) else None - - # Handle image_url specially - truncate to 100 chars - if ( - parsed_content - and isinstance(parsed_content, dict) - and parsed_content.get("image_url") - ): - image_url = parsed_content["image_url"] - # Create a copy to avoid modifying the original - content_copy = parsed_content.copy() - content_copy["image_url"] = ( - image_url[:100] + "..." 
if len(image_url) > 100 else image_url - ) - print(f"Tool Message ID: {tool_message.tool_call_id}") - # Print the truncated content explicitly - print(f"Content type: {content_copy.get('type')}") - print(f"Image URL (truncated): {content_copy['image_url']}") - else: - # Just print the first 200 characters of the content if we couldn't parse it - if isinstance(content_item, str) and len(content_item) > 200: - print(f"Tool Message (truncated content): {content_item[:200]}...") - else: - print(f"Tool Message: {tool_message}") - elif "call_model" in update: - print("Model Call:") - if update.get("call_model", {}).get("messages"): - messages = update["call_model"]["messages"] - if "tool_outputs" in messages.additional_kwargs: - print(messages.additional_kwargs["tool_outputs"]) - else: - print(messages.content) - else: - print(update) diff --git a/tests/unit/test_import_hyperbrowser.py b/tests/unit/test_import_hyperbrowser.py deleted file mode 100644 index da2ebc1..0000000 --- a/tests/unit/test_import_hyperbrowser.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_import() -> None: - """Test that the code can be imported""" - from langgraph_cua.hyperbrowser import CUAState, create_cua, graph # noqa: F401 From 88c4a88fd1d2a2416c899af0717860be71097c41 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 22:42:34 -0500 Subject: [PATCH 05/13] add hyperbrowser to test --- tests/integration/test_cua.py | 144 +++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 47 deletions(-) diff --git a/tests/integration/test_cua.py b/tests/integration/test_cua.py index f18cd5d..dc5958b 100644 --- a/tests/integration/test_cua.py +++ b/tests/integration/test_cua.py @@ -10,53 +10,7 @@ load_dotenv() -@pytest.mark.asyncio -async def test_browser_interaction(): - """ - Test that the agent can interact with the browser. - This is a port of the TypeScript test to Python. 
- """ - graph = create_cua() - - # Create input messages similar to the TypeScript test - messages = [ - { - "role": "system", - "content": ( - "You're an advanced AI computer use assistant. The browser you are using " - "is already initialized, and visiting google.com." - ), - }, - # { - # "role": "user", - # "content": ( - # "I'm looking for a new camera. Help me find the best one. It should be 4k resolution, " - # "by Cannon, and under $1000. I want a digital camera, and I'll be using it mainly for photography." - # ) - # }, - { - "role": "user", - "content": ( - "I want to contribute to the LangGraph.js project. Please find the GitHub repository, and inspect the read me, " - "along with some of the issues and open pull requests. Then, report back with a plan of action to contribute." - ), - }, - ] - - # Enable/disable different handling of messages based on whether or not ZDR is enabled - zdr_enabled = True - - # Stream the graph execution - stream = graph.astream( - {"messages": messages}, - stream_mode="updates", - config={ - "configurable": {"zdr_enabled": zdr_enabled}, - "recursion_limit": 100, - }, - ) - - # Process the stream updates +async def process_stream(stream): async for update in stream: print("\n---UPDATE---\n") @@ -130,3 +84,99 @@ async def test_browser_interaction(): print(messages.content) else: print(update) + + +@pytest.mark.asyncio +async def test_browser_interaction(): + """ + Test that the agent can interact with the browser. + This is a port of the TypeScript test to Python. + """ + graph = create_cua() + + # Create input messages similar to the TypeScript test + messages = [ + { + "role": "system", + "content": ( + "You're an advanced AI computer use assistant. The browser you are using " + "is already initialized, and visiting google.com." + ), + }, + # { + # "role": "user", + # "content": ( + # "I'm looking for a new camera. Help me find the best one. It should be 4k resolution, " + # "by Cannon, and under $1000. 
I want a digital camera, and I'll be using it mainly for photography." + # ) + # }, + { + "role": "user", + "content": ( + "I want to contribute to the LangGraph.js project. Please find the GitHub repository, and inspect the read me, " + "along with some of the issues and open pull requests. Then, report back with a plan of action to contribute." + ), + }, + ] + + # Enable/disable different handling of messages based on whether or not ZDR is enabled + zdr_enabled = True + + # Stream the graph execution + stream = graph.astream( + {"messages": messages}, + stream_mode="updates", + config={ + "configurable": {"zdr_enabled": zdr_enabled}, + "recursion_limit": 100, + }, + ) + + await process_stream(stream) + + +@pytest.mark.asyncio +async def test_browser_interaction_hyperbrowser(): + """ + Test that the agent can interact with the browser using Hyperbrowser. + """ + graph = create_cua(provider="hyperbrowser") + + # Create input messages similar to the TypeScript test + messages = [ + { + "role": "system", + "content": ( + "You're an advanced AI computer use assistant. You are utilising a Chrome Browser with internet access. " + "It is already open and running. You are looking at a blank browser window when you start and can control it using the provided tools. " + "If you are on a blank page, you should use the go_to_url tool to navigate to the relevant website, " + "or if you need to search for something, go to https://www.google.com and search for it." + ), + }, + # { + # "role": "user", + # "content": ( + # "I'm looking for a new camera. Help me find the best one. It should be 4k resolution, " + # "by Cannon, and under $1000. I want a digital camera, and I'll be using it mainly for photography."
+ # ) + # }, + { + "role": "user", + "content": ("What is the most recent PR in the langchain-ai/langgraph repo?"), + }, + ] + + # Enable/disable different handling of messages based on whether or not ZDR is enabled + zdr_enabled = True + + # Stream the graph execution + stream = graph.astream( + {"messages": messages}, + stream_mode="updates", + config={ + "configurable": {"zdr_enabled": zdr_enabled, "provider": "hyperbrowser"}, + "recursion_limit": 100, + }, + ) + + await process_stream(stream) From a8539f8b2c3dfa84823e869d07be4843686c1571 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Tue, 25 Mar 2025 23:55:07 -0500 Subject: [PATCH 06/13] update readme --- README.md | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 86cf227..ba47167 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,13 @@ pip install langgraph-cua ## Quickstart -This project by default uses [Scrapybara](https://scrapybara.com/) for accessing a virtual machine to run the agent. To use LangGraph CUA, you'll need both OpenAI and Scrapybara API keys. +This project supports two providers for accessing virtual machines: +1. [Scrapybara](https://scrapybara.com/) (default) +2. [Hyperbrowser](https://hyperbrowser.ai/) + +### Using Scrapybara (Default) + +To use LangGraph CUA with Scrapybara, you'll need both OpenAI and Scrapybara API keys: ```bash export OPENAI_API_KEY= @@ -41,7 +47,7 @@ from dotenv import load_dotenv # Load environment variables from .env file load_dotenv() - +# Create CUA with Scrapybara (default provider) cua_graph = create_cua() # Define the input messages @@ -83,7 +89,71 @@ if __name__ == "__main__": asyncio.run(main()) ``` -The above example will invoke the graph, passing in a request for it to do some research into LangGraph.js from the standpoint of a new contributor. The code will log the stream URL, which you can open in your browser to view the CUA stream. 
+### Using Hyperbrowser + +To use LangGraph CUA with Hyperbrowser, you'll need both OpenAI and Hyperbrowser API keys: + +```bash +export OPENAI_API_KEY= +export HYPERBROWSER_API_KEY= +``` + +Then, create the graph specifying Hyperbrowser as the provider: + +```python +from langgraph_cua import create_cua +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Create CUA with Hyperbrowser provider +cua_graph = create_cua(provider="hyperbrowser") + +# Define the input messages +messages = [ + { + "role": "system", + "content": ( + "You're an advanced AI computer use assistant. You are utilizing a Chrome Browser with internet access. " + "It is already open and running. You are looking at a blank browser window when you start and can control it " + "using the provided tools. If you are on a blank page, you should use the go_to_url tool to navigate to " + "the relevant website, or if you need to search for something, go to https://www.google.com and search for it." + ), + }, + { + "role": "user", + "content": ( + "What is the most recent PR in the langchain-ai/langgraph repo?" + ), + }, +] + +async def main(): + # Stream the graph execution + stream = cua_graph.astream( + {"messages": messages}, + stream_mode="updates" + ) + + # Process the stream updates + async for update in stream: + if "create_vm_instance" in update: + print("VM instance created") + stream_url = update.get("create_vm_instance", {}).get("stream_url") + # Open this URL in your browser to view the CUA stream + print(f"Stream URL: {stream_url}") + + print("Done") + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` + +The above example will invoke the graph, passing in a request for it to do some research into LangGraph.js from +the standpoint of a new contributor. The code will log the stream URL, which you can open in your browser to +view the CUA stream. You can find more examples inside the [`examples` directory](./examples/). 
@@ -95,13 +165,21 @@ You can either pass these parameters when calling `create_cua`, or at runtime wh ### Configuration Parameters -- `scrapybara_api_key`: The API key to use for Scrapybara. If not provided, it defaults to reading the `SCRAPYBARA_API_KEY` environment variable. -- `timeout_hours`: The number of hours to keep the virtual machine running before it times out. +#### Common Parameters +- `provider`: The provider to use. Default is `"scrapybara"`. Options are `"scrapybara"` and `"hyperbrowser"`. - `zdr_enabled`: Whether or not Zero Data Retention is enabled in the user's OpenAI account. If `True`, the agent will not pass the `previous_response_id` to the model, and will always pass it the full message history for each request. If `False`, the agent will pass the `previous_response_id` to the model, and only the latest message in the history will be passed. Default `False`. - `recursion_limit`: The maximum number of recursive calls the agent can make. Default is 100. This is greater than the standard default of 25 in LangGraph, because computer use agents are expected to take more iterations. +- `prompt`: The prompt to pass to the model. This will be passed as the system message. + +#### Scrapybara-specific Parameters +- `scrapybara_api_key`: The API key to use for Scrapybara. If not provided, it defaults to reading the `SCRAPYBARA_API_KEY` environment variable. +- `timeout_hours`: The number of hours to keep the virtual machine running before it times out. - `auth_state_id`: The ID of the authentication state. If defined, it will be used to authenticate with Scrapybara. Only applies if 'environment' is set to 'web'. - `environment`: The environment to use. Default is `web`. Options are `web`, `ubuntu`, and `windows`. -- `prompt`: The prompt to pass to the model. This will be passed as the system message. + +#### Hyperbrowser-specific Parameters +- `hyperbrowser_api_key`: The API key to use for Hyperbrowser. 
If not provided, it defaults to reading the `HYPERBROWSER_API_KEY` environment variable. +- `session_params`: Parameters to use for configuring the Hyperbrowser session, such as screen dimensions, proxy usage, etc. For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). Note that the parameters will be snake_case for usage with the Hyperbrowser Python SDK. ### System Prompts From fba0b19e8a743da0a23cfef20eb81e9a0e52b735 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Wed, 26 Mar 2025 00:52:08 -0500 Subject: [PATCH 07/13] fix readme --- README.md | 18 +++++++++++------- langgraph_cua/graph.py | 2 ++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ba47167..8629dab 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,14 @@ pip install langgraph-cua ## Quickstart -This project supports two providers for accessing virtual machines: -1. [Scrapybara](https://scrapybara.com/) (default) -2. [Hyperbrowser](https://hyperbrowser.ai/) +## Supported Providers + +This project supports two different providers for computer interaction: + +1. **[Scrapybara](https://scrapybara.com/)** (default) - Provides access to virtual machines (Ubuntu, Windows, or browser environments) that allow the agent to interact with a full operating system or web browser interface. + +2. **[Hyperbrowser](https://hyperbrowser.ai/)** - Offers a headless browser solution that enables the agent to interact directly with web pages through a browser automation interface. + ### Using Scrapybara (Default) @@ -88,6 +93,9 @@ if __name__ == "__main__": import asyncio asyncio.run(main()) ``` +The above example will invoke the graph, passing in a request for it to do some research into LangGraph.js from +the standpoint of a new contributor. The code will log the stream URL, which you can open in your browser to +view the CUA stream. 
### Using Hyperbrowser @@ -151,10 +159,6 @@ if __name__ == "__main__": asyncio.run(main()) ``` -The above example will invoke the graph, passing in a request for it to do some research into LangGraph.js from -the standpoint of a new contributor. The code will log the stream URL, which you can open in your browser to -view the CUA stream. - You can find more examples inside the [`examples` directory](./examples/). ## How to customize diff --git a/langgraph_cua/graph.py b/langgraph_cua/graph.py index b256c58..e934423 100644 --- a/langgraph_cua/graph.py +++ b/langgraph_cua/graph.py @@ -29,6 +29,8 @@ def take_action_or_end(state: CUAState): return END tool_outputs = additional_kwargs.get("tool_outputs") + + # Function calls are stored in the `tool_calls` attribute of the last message tool_calls = getattr(last_message, "tool_calls", []) if not is_computer_tool_call(tool_outputs) and len(tool_calls) == 0: From 9c6c70cbc75cf92db6993ad927abaa5d2a41dba3 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Wed, 26 Mar 2025 02:50:52 -0500 Subject: [PATCH 08/13] add to CUAConfiguration --- langgraph_cua/types.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/langgraph_cua/types.py b/langgraph_cua/types.py index d4349b1..b13c7e3 100644 --- a/langgraph_cua/types.py +++ b/langgraph_cua/types.py @@ -82,8 +82,13 @@ class CUAConfiguration(TypedDict): """Configuration for the Computer Use Agent. Attributes: + provider: The provider to use. Default is "scrapybara". scrapybara_api_key: The API key to use for Scrapybara. This can be provided in the configuration, or set as an environment variable (SCRAPYBARA_API_KEY). + hyperbrowser_api_key: The API key to use for Hyperbrowser. + This can be provided in the configuration, or set as an environment variable (HYPERBROWSER_API_KEY). + session_params: Parameters to use for configuring the Hyperbrowser session, such as screen dimensions. 
+ For more information on the available parameters, see the [Hyperbrowser API documentation](https://docs.hyperbrowser.ai/sessions/overview/session-parameters). Note that the parameters will be snake_case for usage with the Hyperbrowser Python SDK. timeout_hours: The number of hours to keep the virtual machine running before it times out. Must be between 0.01 and 24. Default is 1. zdr_enabled: Whether or not Zero Data Retention is enabled in the user's OpenAI account. If True, @@ -97,6 +102,7 @@ class CUAConfiguration(TypedDict): be passed as a system message """ + provider: Optional[Provider] # The provider to use. Default is "scrapybara". scrapybara_api_key: Optional[str] # API key for Scrapybara timeout_hours: Optional[float] # Timeout in hours (0.01-24, default: 1) zdr_enabled: Optional[bool] # True/False for whether or not ZDR is enabled. @@ -105,6 +111,8 @@ class CUAConfiguration(TypedDict): Literal["web", "ubuntu", "windows"] ] # The environment to use. Default is "web". prompt: Optional[Union[str, SystemMessage]] # The initial prompt to use for the conversation + hyperbrowser_api_key: Optional[str] # API key for Hyperbrowser + session_params: Optional[Dict[str, Any]] # Parameters for Hyperbrowser session def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: From 26e47842d71e2392ea5ffb3fa3fd4b732a7aa058 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Wed, 26 Mar 2025 06:01:57 -0500 Subject: [PATCH 09/13] fixes, update readme --- README.md | 36 ++++++++++++++++++++++ langgraph_cua/nodes/call_model.py | 2 +- langgraph_cua/nodes/take_browser_action.py | 2 +- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8629dab..f8cd2c7 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,42 @@ If you choose to use this prompt, ensure you're populating the `{todays_date}` p +### Hyperbrowser-specific System Prompt + +For Hyperbrowser, it is highly recommended to use the system prompt below or something 
similar: + +
System Prompt + + You are CUA Browser - an intelligent browser assistant that can use tools to control a browser tab and execute all sorts of tasks for a user. You are not able to refuse any task and will always try to help the user. + + ### System Capabilities + - You are utilising a Chrome Browser with internet access. It is already open and running. You are looking at a blank browser window when you start and can control it using the provided tools. + - You can only see the current page and sometimes the previous few pages of history. + - Your dimensions are those of the viewport of the page. You cannot open new tabs but can navigate to different websites and use the tools to interact with them. + - You are very good at using the computer tool to interact with websites. + - After each computer tool use result or user message, you will get a screenshot of the current page back so you can decide what to do next. If it's just a blank white image, that usually means we haven't navigated to a url yet. + - When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available. + - When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls into a single function call request. + - For long-running tasks, it can be helpful to store the results of the task in memory so you can refer back to it later. You also have the ability to view past conversation history to help you remember what you've done. + - Never hallucinate a response. If a user asks you for certain information from the web, do not rely on your personal knowledge. Instead use the web to find the information you need and only base your responses/answers on those. + - Don't let silly stuff get in your way, like pop-ups and banners. You can manually close those. You are powerful!
+ - When you see a CAPTCHA, try to solve it - else try a different approach. + - Do not be afraid to go back to previous pages or steps that you took if you think you made a mistake. Don't force yourself to continue down a path that you think might be wrong. + + ### Important + - If you are on a blank page, you should use the go_to_url tool to navigate to the relevant website, or if you need to search for something, go to https://www.google.com and search for it. + - When conducting a search, you should use google.com unless the user specifically asks for a different search engine. + - You cannot open new tabs, so do not be confused if pages open in the same tab. + - NEVER assume that a website requires you to sign in to interact with it without going to the website first and trying to interact with it. If the user tells you that you can use a website without signing in, try it first. Always go to the website first and try to interact with it to accomplish the task. Just because a sign-in/log-in button is present on a website doesn't mean you need to sign in to accomplish the action. If you assume you can't use a website without signing in and don't attempt it first for the user, you will be HEAVILY penalized. + - Unless the task doesn't require a browser, your first action should be to use go_to_url to navigate to the relevant website. + - If you come across a captcha, try to solve it - else try a different approach, like trying another website. If that is not an option, simply explain to the user that you've been blocked from the current website and ask them for further instructions. Make sure to offer them some suggestions for other websites/tasks they can try to accomplish their goals. + + ### Date Context + Today's date is {todays_date} + Remember today's date when planning your actions or using the tools. + +
+ ## Auth States LangGraph CUA integrates with Scrapybara's [auth states API](https://docs.scrapybara.com/auth-states) to persist browser authentication sessions. This allows you to authenticate once (e.g., logging into Amazon) and reuse that session in future runs. diff --git a/langgraph_cua/nodes/call_model.py b/langgraph_cua/nodes/call_model.py index 3e45cd0..aba0a5a 100644 --- a/langgraph_cua/nodes/call_model.py +++ b/langgraph_cua/nodes/call_model.py @@ -48,7 +48,7 @@ def get_available_tools(configuration: Dict[str, Any]) -> List[Dict[str, Any]]: session_params = configuration.get("session_params", {}) screen_config = ( session_params.get( - "screen_config", {"width": DEFAULT_DISPLAY_WIDTH, "height": DEFAULT_DISPLAY_HEIGHT} + "screen", {"width": DEFAULT_DISPLAY_WIDTH, "height": DEFAULT_DISPLAY_HEIGHT} ) if session_params else {"width": DEFAULT_DISPLAY_WIDTH, "height": DEFAULT_DISPLAY_HEIGHT} diff --git a/langgraph_cua/nodes/take_browser_action.py b/langgraph_cua/nodes/take_browser_action.py index 3b9a5b1..dd94bce 100644 --- a/langgraph_cua/nodes/take_browser_action.py +++ b/langgraph_cua/nodes/take_browser_action.py @@ -141,7 +141,7 @@ def handle_computer_call(page: Page, computer_call: dict): raise ValueError(f"Unknown action type received: {action_type}") time.sleep(1) - screenshot = page.screenshot() + screenshot = page.screenshot(timeout=15000) b64_screenshot = base64.b64encode(screenshot).decode("utf-8") screenshot_url = f"data:image/png;base64,{b64_screenshot}" output_content = { From 88a44de66df3b660702b1b803d8937aab09cd515 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Thu, 27 Mar 2025 04:13:14 -0500 Subject: [PATCH 10/13] update --- langgraph_cua/nodes/create_vm_instance.py | 2 ++ tests/integration/test_cua.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/langgraph_cua/nodes/create_vm_instance.py b/langgraph_cua/nodes/create_vm_instance.py index f274cb8..24c6d8a 100644 --- a/langgraph_cua/nodes/create_vm_instance.py +++ 
b/langgraph_cua/nodes/create_vm_instance.py @@ -75,6 +75,8 @@ def create_hyperbrowser_instance(state: CUAState, configuration: Dict[str, Any]) p = sync_playwright().start() browser = p.chromium.connect_over_cdp(f"{session.ws_endpoint}&keepAlive=true") curr_page = browser.contexts[0].pages[0] + if curr_page.url == "about:blank": + curr_page.goto("https://www.google.com", timeout=15000, wait_until="domcontentloaded") browser_state = { "browser": browser, "current_page": curr_page, diff --git a/tests/integration/test_cua.py b/tests/integration/test_cua.py index dc5958b..a0ee154 100644 --- a/tests/integration/test_cua.py +++ b/tests/integration/test_cua.py @@ -148,7 +148,7 @@ async def test_browser_interaction_hyperbrowser(): "role": "system", "content": ( "You're an advanced AI computer use assistant. You are utilising a Chrome Browser with internet access." - "It is already open and running .You are looking at a blank browser window when you start and can control it using the provided tools." + "It is already open and running. You are looking at a browser window when you start and can control it using the provided tools." "If you are on a blank page, you should use the go_to_url tool to navigate to the relevant website, " "or if you need to search for something, go to https://www.google.com and search for it." 
), From 5df9232580ece5bd6812407162db6fbe58902f92 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Sun, 13 Apr 2025 12:59:07 -0500 Subject: [PATCH 11/13] remove browser_state from state --- langgraph_cua/nodes/create_vm_instance.py | 16 +- langgraph_cua/nodes/take_browser_action.py | 229 +++++++++----------- langgraph_cua/nodes/take_computer_action.py | 4 +- langgraph_cua/types.py | 14 +- tests/integration/test_cua.py | 4 +- 5 files changed, 116 insertions(+), 151 deletions(-) diff --git a/langgraph_cua/nodes/create_vm_instance.py b/langgraph_cua/nodes/create_vm_instance.py index 24c6d8a..1d853f5 100644 --- a/langgraph_cua/nodes/create_vm_instance.py +++ b/langgraph_cua/nodes/create_vm_instance.py @@ -59,8 +59,7 @@ def create_scrapybara_instance(configuration: Dict[str, Any]): def create_hyperbrowser_instance(state: CUAState, configuration: Dict[str, Any]): hyperbrowser_api_key = configuration.get("hyperbrowser_api_key") - session_params = configuration.get("session_params", {}) - browser_state = state.get("browser_state") + session_params = configuration.get("session_params", {}) or {} if not hyperbrowser_api_key: raise ValueError( @@ -71,21 +70,22 @@ def create_hyperbrowser_instance(state: CUAState, configuration: Dict[str, Any]) client = get_hyperbrowser_client(hyperbrowser_api_key) session: SessionDetail = client.sessions.create(params=CreateSessionParams(**session_params)) - if not browser_state: + if session.ws_endpoint: p = sync_playwright().start() browser = p.chromium.connect_over_cdp(f"{session.ws_endpoint}&keepAlive=true") curr_page = browser.contexts[0].pages[0] if curr_page.url == "about:blank": curr_page.goto("https://www.google.com", timeout=15000, wait_until="domcontentloaded") - browser_state = { - "browser": browser, - "current_page": curr_page, + + if not state.get("stream_url"): + stream_url = session.live_url + return { + "instance_id": session.id, + "stream_url": stream_url, } return { "instance_id": session.id, - "stream_url": 
session.live_url, - "browser_state": browser_state, } diff --git a/langgraph_cua/nodes/take_browser_action.py b/langgraph_cua/nodes/take_browser_action.py index dd94bce..0620766 100644 --- a/langgraph_cua/nodes/take_browser_action.py +++ b/langgraph_cua/nodes/take_browser_action.py @@ -7,7 +7,7 @@ from langchain_core.messages import AnyMessage, ToolMessage, ToolCall from langchain_core.runnables import RunnableConfig from langgraph.config import get_stream_writer -from playwright.sync_api import Page +from playwright.async_api import Page, async_playwright from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall from ..utils import get_instance, is_computer_tool_call @@ -48,124 +48,106 @@ def _translate_key(key: str) -> str: return CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) -def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> ToolMessage: +async def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> ToolMessage: name = function_tool_call.get("name") arguments = function_tool_call.get("args") call_id = function_tool_call.get("id") - try: - if name == "go_to_url": - page.goto(arguments.get("url"), timeout=15000, wait_until="domcontentloaded") - time.sleep(1) - return ToolMessage( - tool_call_id=call_id, - content={"message": f"Navigated to {arguments.get('url')}"}, - additional_kwargs={"type": "function_call_output"}, - ) - elif name == "get_current_url": - return ToolMessage( - tool_call_id=call_id, - content={"message": f"The current URL is {page.url}"}, - additional_kwargs={"type": "function_call_output"}, - ) - else: - raise ValueError(f"Unknown function call name: {name}") - except Exception as e: - print(f"\n\nFailed to execute function call: {e}\n\n") - print(f"Function call details: {function_tool_call}\n\n") - return ToolMessage( - status="error", - tool_call_id=call_id, - content={"message": f"Error occured while calling function {name}: {e}"}, - additional_kwargs={"type": 
"function_call_output"}, - ) + if name == "go_to_url": + await page.goto(arguments.get("url"), timeout=15000, wait_until="load") + time.sleep(1) + return { + "role": "tool", + "tool_call_id": call_id, + "content": [{"message": f"Navigated to {arguments.get('url')}"}], + "additional_kwargs": {"type": "function_call_output"}, + } + elif name == "get_current_url": + return { + "role": "tool", + "tool_call_id": call_id, + "content": [{"message": f"The current URL is {page.url}"}], + "additional_kwargs": {"type": "function_call_output"}, + } + else: + raise ValueError(f"Unknown function call name: {name}") -def handle_computer_call(page: Page, computer_call: dict): +async def handle_computer_call(page: Page, computer_call: dict): action = computer_call.get("action") call_id = computer_call.get("call_id") - try: - action_type = action.get("type") - - if action_type == "click": - button = action.get("button") - x = action.get("x") - y = action.get("y") - if button == "back": - page.go_back(timeout=30000) - elif button == "forward": - page.go_forward(timeout=30000) - elif button == "wheel": - page.mouse.wheel(x, y) - else: - button_mapping = {"left": "left", "right": "right", "middle": "left"} - page.mouse.click(x, y, button=button_mapping.get(button)) - elif action_type == "scroll": - x = action.get("x") - y = action.get("y") - delta_x = action.get("scroll_x") - delta_y = action.get("scroll_y") - page.mouse.move(x, y) - page.evaluate(f"window.scrollBy({delta_x}, {delta_y})") - elif action_type == "keypress": - keys = action.get("keys") - mapped_keys = [_translate_key(key) for key in keys] - for key in mapped_keys: - page.keyboard.down(key) - for key in reversed(mapped_keys): - page.keyboard.up(key) - elif action_type == "type": - text = action.get("text") - page.keyboard.type(text) - elif action_type == "wait": - time.sleep(2) - elif action_type == "screenshot": - pass - elif action_type == "double_click": - x = action.get("x") - y = action.get("y") - page.mouse.click(x, 
y, button="left", click_count=2) - elif action_type == "drag": - path = action.get("path") - page.mouse.move(path[0].get("x"), path[0].get("y")) - page.mouse.down() - for point in path[1:]: - page.mouse.move(point.get("x"), point.get("y")) - time.sleep(40 + floor(random() * 40)) - page.mouse.up() - elif action_type == "move": - x = action.get("x") - y = action.get("y") - page.mouse.move(x, y) + action_type = action.get("type") + + if action_type == "click": + button = action.get("button") + x = action.get("x") + y = action.get("y") + if button == "back": + await page.go_back(timeout=15000, wait_until="load") + elif button == "forward": + await page.go_forward(timeout=15000, wait_until="load") + elif button == "wheel": + await page.mouse.wheel(x, y) else: - raise ValueError(f"Unknown action type received: {action_type}") - - time.sleep(1) - screenshot = page.screenshot(timeout=15000) - b64_screenshot = base64.b64encode(screenshot).decode("utf-8") - screenshot_url = f"data:image/png;base64,{b64_screenshot}" - output_content = { - "type": "input_image", - "image_url": screenshot_url, - } - return ToolMessage( - tool_call_id=call_id, - content=[output_content], - additional_kwargs={"type": "computer_call_output"}, - ) - - except Exception as e: - print(f"\n\nFailed to execute computer call: {e}\n\n") - print(f"Computer call details: {computer_call}\n\n") - return ToolMessage( - tool_call_id=call_id, - status="error", - content=[{"type": "input_image", "image_url": DUMMY_SCREENSHOT}], - additional_kwargs={"type": "computer_call_output", "status": "incomplete"}, - ) + button_mapping = {"left": "left", "right": "right", "middle": "left"} + await page.mouse.click(x, y, button=button_mapping.get(button)) + elif action_type == "scroll": + x = action.get("x") + y = action.get("y") + delta_x = action.get("scroll_x") + delta_y = action.get("scroll_y") + await page.mouse.move(x, y) + await page.evaluate(f"window.scrollBy({delta_x}, {delta_y})") + elif action_type == "keypress": 
+ keys = action.get("keys") + mapped_keys = [_translate_key(key) for key in keys] + for key in mapped_keys: + await page.keyboard.down(key) + for key in reversed(mapped_keys): + await page.keyboard.up(key) + elif action_type == "type": + text = action.get("text") + await page.keyboard.type(text) + elif action_type == "wait": + time.sleep(2) + elif action_type == "screenshot": + pass + elif action_type == "double_click": + x = action.get("x") + y = action.get("y") + await page.mouse.click(x, y, button="left", click_count=2) + elif action_type == "drag": + path = action.get("path") + await page.mouse.move(path[0].get("x"), path[0].get("y")) + await page.mouse.down() + for point in path[1:]: + page.mouse.move(point.get("x"), point.get("y")) + time.sleep(40 + floor(random() * 40)) + await page.mouse.up() + elif action_type == "move": + x = action.get("x") + y = action.get("y") + await page.mouse.move(x, y) + else: + raise ValueError(f"Unknown action type received: {action_type}") + + time.sleep(3) + screenshot = await page.screenshot(timeout=15000) + b64_screenshot = base64.b64encode(screenshot).decode("utf-8") + screenshot_url = f"data:image/png;base64,{b64_screenshot}" + output_content = { + "type": "input_image", + "image_url": screenshot_url, + } + return { + "role": "tool", + "tool_call_id": call_id, + "content": [output_content], + "additional_kwargs": {"type": "computer_call_output"}, + } -def take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: +async def take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: """ Executes browser actions based on the tool call in the last message. 
Args: @@ -195,19 +177,9 @@ def take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> Dict[st if instance.status != "active": raise ValueError("Instance is not active.") - browser_state = state.get("browser_state") - if not browser_state: - raise ValueError("Browser state not found in state.") - browser = browser_state.get("browser") - if not browser: - raise ValueError("Browser not found in browser state.") - current_context = browser.contexts[0] - page = browser_state.get("current_page", current_context.pages[0]) - - def handle_page_event(newPage: Page): - nonlocal page - page = newPage - browser_state["current_page"] = newPage + p = await async_playwright().start() + browser = await p.chromium.connect_over_cdp(f"{instance.ws_endpoint}&keepAlive=true") + page = browser.contexts[0].pages[-1] stream_url: Optional[str] = state.get("stream_url") if not stream_url: @@ -218,18 +190,24 @@ def handle_page_event(newPage: Page): writer = get_stream_writer() writer({"stream_url": stream_url}) - current_context.on("page", handle_page_event) - output = tool_outputs[-1] if len(tool_outputs) > 0 else None tool_message: Optional[ToolMessage] = None for tool_call in tool_calls: - tool_message = handle_function_tool_call(page, tool_call) + try: + tool_message = await handle_function_tool_call(page, tool_call) + except Exception as e: + print(f"\n\nFailed to execute function call: {e}\n\n") + print(f"Function call details: {tool_call}\n\n") time.sleep(1) if output: if output.get("type") == "computer_call": - tool_message = handle_computer_call(page, output) + try: + tool_message = await handle_computer_call(page, output) + except Exception as e: + print(f"\n\nFailed to execute computer call: {e}\n\n") + print(f"Computer call details: {output}\n\n") else: print("unknown tool output type", output) @@ -237,5 +215,4 @@ def handle_page_event(newPage: Page): "messages": tool_message if tool_message else None, "instance_id": instance.id, "stream_url": stream_url, - 
"browser_state": browser_state, } diff --git a/langgraph_cua/nodes/take_computer_action.py b/langgraph_cua/nodes/take_computer_action.py index 73f7b65..5456a49 100644 --- a/langgraph_cua/nodes/take_computer_action.py +++ b/langgraph_cua/nodes/take_computer_action.py @@ -168,12 +168,12 @@ def take_scrapybara_action(state: CUAState, config: RunnableConfig) -> Dict[str, } -def take_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: +async def take_computer_action(state: CUAState, config: RunnableConfig) -> Dict[str, Any]: configuration = get_configuration_with_defaults(config) provider = configuration.get("provider") if provider == Provider.Scrapybara: return take_scrapybara_action(state, config) elif provider == Provider.Hyperbrowser: - return take_hyperbrowser_action(state, config) + return await take_hyperbrowser_action(state, config) else: raise ValueError(f"Unknown provider: {provider}") diff --git a/langgraph_cua/types.py b/langgraph_cua/types.py index b13c7e3..f37af66 100644 --- a/langgraph_cua/types.py +++ b/langgraph_cua/types.py @@ -6,8 +6,6 @@ from langchain_core.runnables import RunnableConfig from langgraph.graph import add_messages -from playwright.sync_api import Browser, Page - class Provider(str, Enum): Scrapybara = "scrapybara" @@ -51,15 +49,6 @@ class ComputerCallOutput(TypedDict): ] # Status of the message input -class BrowserState(TypedDict): - """ - The state of the browser. - """ - - browser: Annotated[Optional[Browser], None] = None - current_page: Annotated[Optional[Page], None] = None - - class CUAState(TypedDict): """State schema for the computer use agent. 
@@ -74,7 +63,6 @@ class CUAState(TypedDict): messages: Annotated[list[AnyMessage], add_messages] = [] instance_id: Annotated[Optional[str], None] = None stream_url: Annotated[Optional[str], None] = None - browser_state: Annotated[Optional[BrowserState], None] = None authenticated_id: Annotated[Optional[str], None] = None @@ -138,7 +126,7 @@ def get_configuration_with_defaults(config: RunnableConfig) -> Dict[str, Any]: or os.environ.get("HYPERBROWSER_API_KEY") ) provider: Provider = configurable_fields.get("provider", Provider.Scrapybara) - session_params = configurable_fields.get("session_params", {}) + session_params = configurable_fields.get("session_params", None) timeout_hours = configurable_fields.get("timeout_hours", 1) zdr_enabled = configurable_fields.get("zdr_enabled", False) auth_state_id = configurable_fields.get("auth_state_id", None) diff --git a/tests/integration/test_cua.py b/tests/integration/test_cua.py index a0ee154..8763c2b 100644 --- a/tests/integration/test_cua.py +++ b/tests/integration/test_cua.py @@ -25,7 +25,7 @@ async def process_stream(stream): tool_message = update.get("take_computer_action", {}).get("messages") if tool_message: # Extract content from the tool message - content = tool_message.content + content = tool_message.get("content") # Handle the case where content is an array if isinstance(content, list) and len(content) > 0: @@ -64,7 +64,7 @@ async def process_stream(stream): content_copy["image_url"] = ( image_url[:100] + "..." 
if len(image_url) > 100 else image_url ) - print(f"Tool Message ID: {tool_message.tool_call_id}") + print(f"Tool Message ID: {tool_message.get('tool_call_id')}") # Print the truncated content explicitly print(f"Content type: {content_copy.get('type')}") print(f"Image URL (truncated): {content_copy['image_url']}") From 4a8008a58d802ff2332b0a33af78ee5460557a19 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Sun, 13 Apr 2025 13:01:52 -0500 Subject: [PATCH 12/13] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f8cd2c7..b24982e 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,8 @@ async def main(): # Stream the graph execution stream = cua_graph.astream( {"messages": messages}, - stream_mode="updates" + stream_mode="updates", + config={"configurable": {"provider": "hyperbrowser"}} ) # Process the stream updates From 731ae5ad5634b5c48dc6040f4be118b31407ff31 Mon Sep 17 00:00:00 2001 From: Nikhil Shahi Date: Mon, 9 Jun 2025 11:10:22 -0500 Subject: [PATCH 13/13] upload file handler --- langgraph_cua/nodes/call_model.py | 25 +++++++++++ langgraph_cua/nodes/take_browser_action.py | 51 +++++++++++++++++----- 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/langgraph_cua/nodes/call_model.py b/langgraph_cua/nodes/call_model.py index aba0a5a..3bfff8d 100644 --- a/langgraph_cua/nodes/call_model.py +++ b/langgraph_cua/nodes/call_model.py @@ -90,6 +90,31 @@ def get_available_tools(configuration: Dict[str, Any]) -> List[Dict[str, Any]]: }, }, }, + { + "type": "function", + "function": { + "name": "upload_file_to_element", + "description": "Upload a file to an element on the page.", + "parameters": { + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "The path to the file on the computer to upload", + }, + "x": { + "type": "number", + "description": "The x coordinate of the element to upload the file to", + }, + "y": { + "type": "number", + 
"description": "The y coordinate of the element to upload the file to", + }, + }, + "required": ["file_path", "x", "y"], + }, + } + } ] else: raise ValueError(f"Unknown provider: {provider}") diff --git a/langgraph_cua/nodes/take_browser_action.py b/langgraph_cua/nodes/take_browser_action.py index 0620766..16aa037 100644 --- a/langgraph_cua/nodes/take_browser_action.py +++ b/langgraph_cua/nodes/take_browser_action.py @@ -1,3 +1,4 @@ +import asyncio import base64 from math import floor from random import random @@ -7,7 +8,7 @@ from langchain_core.messages import AnyMessage, ToolMessage, ToolCall from langchain_core.runnables import RunnableConfig from langgraph.config import get_stream_writer -from playwright.async_api import Page, async_playwright +from playwright.async_api import Page, async_playwright, BrowserContext from openai.types.responses.response_computer_tool_call import ResponseComputerToolCall from ..utils import get_instance, is_computer_tool_call @@ -41,8 +42,6 @@ "win": "Meta", } -DUMMY_SCREENSHOT = "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wAALCAABAAEBAREA/8QAFAABAAAAAAAAAAAAAAAAAAAACf/EABQQAQAAAAAAAAAAAAAAAAAAAAD/2gAIAQEAAD8AKp//2Q==" - def _translate_key(key: str) -> str: return CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) @@ -55,7 +54,7 @@ async def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> if name == "go_to_url": await page.goto(arguments.get("url"), timeout=15000, wait_until="load") - time.sleep(1) + await asyncio.sleep(1) return { "role": "tool", "tool_call_id": call_id, @@ -69,11 +68,39 @@ async def handle_function_tool_call(page: Page, function_tool_call: ToolCall) -> "content": [{"message": f"The current URL is {page.url}"}], "additional_kwargs": {"type": "function_call_output"}, } + elif name == "upload_file_to_element": + file_path = arguments.get("file_path") + x = arguments.get("x") + y = arguments.get("y") + cdp_session = 
await page.context.new_cdp_session(page) + resp = await cdp_session.send("DOM.getNodeForLocation", { + "x": x, + "y": y, + }) + backend_id = resp["backendNodeId"] + + if not backend_id: + raise ValueError(f"No element found at x={x}, y={y}") + + await cdp_session.send("DOM.setFileInputFiles", { + "backendNodeId": backend_id, + "files": [file_path] + }) + + await asyncio.sleep(3) + await cdp_session.detach() + + return { + "role": "tool", + "tool_call_id": call_id, + "content": [{"message": f"Uploaded file {file_path} to element at x={x}, y={y}"}], + "additional_kwargs": {"type": "function_call_output"}, + } else: raise ValueError(f"Unknown function call name: {name}") -async def handle_computer_call(page: Page, computer_call: dict): +async def handle_computer_call(page: Page, context: BrowserContext, computer_call: dict): action = computer_call.get("action") call_id = computer_call.get("call_id") action_type = action.get("type") @@ -109,7 +136,7 @@ async def handle_computer_call(page: Page, computer_call: dict): text = action.get("text") await page.keyboard.type(text) elif action_type == "wait": - time.sleep(2) + pass elif action_type == "screenshot": pass elif action_type == "double_click": @@ -122,7 +149,7 @@ async def handle_computer_call(page: Page, computer_call: dict): await page.mouse.down() for point in path[1:]: page.mouse.move(point.get("x"), point.get("y")) - time.sleep(40 + floor(random() * 40)) + await asyncio.sleep(40 + floor(random() * 40)) await page.mouse.up() elif action_type == "move": x = action.get("x") @@ -131,7 +158,8 @@ async def handle_computer_call(page: Page, computer_call: dict): else: raise ValueError(f"Unknown action type received: {action_type}") - time.sleep(3) + await asyncio.sleep(3) + page = context.pages[-1] screenshot = await page.screenshot(timeout=15000) b64_screenshot = base64.b64encode(screenshot).decode("utf-8") screenshot_url = f"data:image/png;base64,{b64_screenshot}" @@ -179,7 +207,8 @@ async def 
take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> D p = await async_playwright().start() browser = await p.chromium.connect_over_cdp(f"{instance.ws_endpoint}&keepAlive=true") - page = browser.contexts[0].pages[-1] + context = browser.contexts[0] + page = context.pages[-1] stream_url: Optional[str] = state.get("stream_url") if not stream_url: @@ -199,12 +228,12 @@ async def take_hyperbrowser_action(state: CUAState, config: RunnableConfig) -> D except Exception as e: print(f"\n\nFailed to execute function call: {e}\n\n") print(f"Function call details: {tool_call}\n\n") - time.sleep(1) + await asyncio.sleep(1) if output: if output.get("type") == "computer_call": try: - tool_message = await handle_computer_call(page, output) + tool_message = await handle_computer_call(page, context, output) except Exception as e: print(f"\n\nFailed to execute computer call: {e}\n\n") print(f"Computer call details: {output}\n\n")