From 711fed319b5ec55a45b9c425959256f6645e029b Mon Sep 17 00:00:00 2001 From: f-trycua Date: Thu, 3 Apr 2025 22:48:19 +0100 Subject: [PATCH 1/5] Add cua-computer --- README.md | 63 ++++++++-- cli.py | 33 ++++- computers/__init__.py | 1 + computers/cua.py | 207 +++++++++++++++++++++++++++++++ examples/macos_finder_example.py | 53 ++++++++ requirements.txt | 1 + 6 files changed, 348 insertions(+), 10 deletions(-) create mode 100644 computers/cua.py create mode 100644 examples/macos_finder_example.py diff --git a/README.md b/README.md index 74065a2..d765b74 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Other included sample [computer environments](#computer-environments): - [Docker](https://docker.com/) (containerized desktop) - [Browserbase](https://www.browserbase.com/) (remote browser, requires account) - [Scrapybara](https://scrapybara.com) (remote browser or computer, requires account) +- [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) (virtual machines using lume virtualization) - ...or implement your own `Computer`! ## Overview @@ -58,10 +59,17 @@ The CLI (`cli.py`) is the easiest way to get started with CUA. It accepts the fo ### Run examples (optional) -The `examples` folder contains more examples of how to use CUA. +The `examples` folder contains more examples of how to use CUA with different environments: ```shell +# General weather example using Scrapybara python -m examples.weather_example + +# Example with function calling +python -m examples.function_calling_example + +# Example for macOS Finder +python -m examples.macos_finder_example # Work with Finder on macOS ``` For reference, the file `simple_cua_loop.py` implements the basics of the CUA loop. @@ -89,13 +97,14 @@ CUA can work with any `Computer` environment that can handle the [CUA actions](h This sample app provides a set of implemented `Computer` examples, but feel free to add your own! -| Computer | Option | Type | Description | Requirements | -| ------------------- | ------------------ | --------- | --------------------------------- | ---------------------------------------------------------------- | -| `LocalPlaywright` | local-playwright | `browser` | Local browser window | [Playwright SDK](https://playwright.dev/) | -| `Docker` | docker | `linux` | Docker container environment | [Docker](https://docs.docker.com/engine/install/) running | -| `Browserbase` | browserbase | `browser` | Remote browser environment | [Browserbase](https://www.browserbase.com/) API key in `.env` | -| `ScrapybaraBrowser` | scrapybara-browser | `browser` | Remote browser environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` | -| `ScrapybaraUbuntu` | scrapybara-ubuntu | `linux` | Remote Ubuntu desktop environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` | +| Computer | Option | Type | Description | Requirements | +| ------------------- | ------------------ | -------------------- | --------------------------------- | ---------------------------------------------------------------- | +| `LocalPlaywright` | local-playwright | `browser` | Local browser window | [Playwright SDK](https://playwright.dev/) | +| `Docker` | docker | `linux` | Docker container environment | [Docker](https://docs.docker.com/engine/install/) running | +| `Browserbase` | browserbase | `browser` | Remote browser environment | [Browserbase](https://www.browserbase.com/) API key in `.env` | +| `ScrapybaraBrowser` | scrapybara-browser | `browser` | Remote browser environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` | +| `ScrapybaraUbuntu` | scrapybara-ubuntu | `linux` | Remote Ubuntu desktop environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` | +| `CuaMacOSComputer` | cua-macos | `mac` | macOS VM with lume virtualization | [cua-computer](https://github.com/trycua/cua/tree/main/libs/computer) package and [lume CLI](https://github.com/trycua/cua/tree/main/libs/lume) | Using the CLI, you can run the sample app with different computer environments using the options listed above: @@ -109,6 +118,12 @@ For example, to run the sample app with the `Docker` computer environment, you c python cli.py --show --computer docker ``` +To run with the `cua-macos` computer environment: + +```shell +python cli.py --show --computer cua-macos +``` + ### Docker Setup If you want to run the sample app with the `Docker` computer environment, you need to build and run a local Docker container. @@ -136,6 +151,36 @@ docker run --rm -it --name cua-sample-app -p 5900:5900 --dns=1.1.1.3 -e DISPLAY= > docker rm -f cua-sample-app > ``` +### Cua MacOS Setup + +To use the `cua-macos` computer environment, you need to install the `cua-computer` package and the `lume` CLI: + +1. **Install cua-computer package**: + ```shell + pip install cua-computer + ``` + +2. **Install lume CLI**: + ```shell + sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" + ``` + +3. **Start the lume daemon**: + ```shell + lume serve + ``` + +4. **Pull the macOS VM image**: + ```shell + lume pull macos-sequoia-cua:latest --no-cache + ``` + +> [!NOTE] +> - Initial download requires 80GB of free space +> - After first run, space usage reduces to ~30GB due to macOS's sparse file system +> - VMs are stored in `~/.lume` +> - Cached images are stored in `~/.lume/cache` + ### Hosted environment setup This repository contains example implementations of third-party hosted environments. @@ -151,4 +196,4 @@ However, if you pass in any `tools` that are also defined in your `Computer` met This repository provides example implementations with basic safety measures in place. -We recommend reviewing the best practices outlined in our [guide](https://platform.openai.com/docs/guides/tools-computer-use#risks-and-safety), and making sure you understand the risks involved with using this tool. +We recommend reviewing the best practices outlined in our [guide](https://platform.openai.com/docs/guides/tools-computer-use#risks-and-safety), and making sure you understand the risks involved with using this tool. \ No newline at end of file diff --git a/cli.py b/cli.py index 891f3fb..4120fa8 100644 --- a/cli.py +++ b/cli.py @@ -6,6 +6,7 @@ ScrapybaraUbuntu, LocalPlaywrightComputer, DockerComputer, + CuaMacOSComputer, ) def acknowledge_safety_check_callback(message: str) -> bool: @@ -27,6 +28,7 @@ def main(): "browserbase", "scrapybara-browser", "scrapybara-ubuntu", + "cua-macos", ], help="Choose the computer environment to use.", default="local-playwright", @@ -53,6 +55,25 @@ def main(): help="Start the browsing session with a specific URL (only for browser environments).", default="https://bing.com", ) + # Add cua-specific arguments + parser.add_argument( + "--display", + type=str, + help="Display resolution for VM (e.g., 1024x768)", + default="1024x768", + ) + parser.add_argument( + "--memory", + type=str, + help="Memory allocation for VM (e.g., 4GB)", + default="4GB", + ) + parser.add_argument( + "--cpu", + type=str, + help="CPU cores for VM", + default="2", + ) args = parser.parse_args() computer_mapping = { @@ -61,11 +82,21 @@ def main(): "browserbase": BrowserbaseBrowser, "scrapybara-browser": ScrapybaraBrowser, "scrapybara-ubuntu": ScrapybaraUbuntu, + "cua-macos": CuaMacOSComputer, } ComputerClass = computer_mapping[args.computer] - with ComputerClass() as computer: + if args.computer == "cua-macos": + computer = ComputerClass( + display=args.display, + memory=args.memory, + cpu=args.cpu, + ) + else: + computer = ComputerClass() + + with computer: agent = Agent( computer=computer, acknowledge_safety_check_callback=acknowledge_safety_check_callback, diff --git a/computers/__init__.py b/computers/__init__.py index 606332e..458bf7d 100644 --- a/computers/__init__.py +++ b/computers/__init__.py @@ -3,3 +3,4 @@ from .local_playwright import LocalPlaywrightComputer from .docker import DockerComputer from .scrapybara import ScrapybaraBrowser, ScrapybaraUbuntu +from .cua import CuaMacOSComputer diff --git a/computers/cua.py b/computers/cua.py new file mode 100644 index 0000000..e4058ea --- /dev/null +++ b/computers/cua.py @@ -0,0 +1,207 @@ +import asyncio +import base64 +import os +import time +from typing import Dict, List, Optional, Tuple, Literal + +try: + from computer import Computer as CuaComputer +except ImportError: + raise ImportError("The cua-computer package is required. Install it with 'pip install cua-computer'") + +class CuaComputerAdapter: + """Adapter class to convert between sync and async methods for cua-computer.""" + + def __init__(self, computer): + self.computer = computer + self.loop = asyncio.get_event_loop() + + def _run_async(self, coro): + """Run an async coroutine in a synchronous context.""" + return self.loop.run_until_complete(coro) + + def screenshot(self): + """Take a screenshot of the VM.""" + screenshot_bytes = self._run_async(self.computer.interface.screenshot()) + return base64.b64encode(screenshot_bytes).decode('utf-8') + + def click(self, x: int, y: int, button: str = "left"): + """Click at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + if button == "right": + self._run_async(self.computer.interface.right_click()) + else: + self._run_async(self.computer.interface.left_click()) + + def double_click(self, x: int, y: int): + """Double click at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + self._run_async(self.computer.interface.left_click()) + time.sleep(0.1) + self._run_async(self.computer.interface.left_click()) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int): + """Scroll at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + self._run_async(self.computer.interface.scroll(scroll_y)) + + def type(self, text: str): + """Type the specified text.""" + self._run_async(self.computer.interface.type_text(text)) + + def wait(self, ms: int = 1000): + """Wait for the specified number of milliseconds.""" + time.sleep(ms / 1000) + + def move(self, x: int, y: int): + """Move the cursor to the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + + def keypress(self, keys: List[str]): + """Press the specified keys.""" + for key in keys: + # Map common key names to CUA equivalents + if key.lower() == "enter": + self._run_async(self.computer.interface.press_key("return")) + elif key.lower() == "space": + self._run_async(self.computer.interface.press_key("space")) + else: + self._run_async(self.computer.interface.press_key(key)) + + def drag(self, path: List[Dict[str, int]]): + """Drag from the start point to the end point.""" + if len(path) < 2: + return + + # Move to start position + start = path[0] + self._run_async(self.computer.interface.move_cursor(start[0], start[1])) + + # Start dragging + self._run_async(self.computer.interface.mouse_down()) + + # Move through each point in the path + for point in path[1:]: + self._run_async(self.computer.interface.move_cursor(point[0], point[1])) + time.sleep(0.05) # Small delay between movements + + # Release at final position + self._run_async(self.computer.interface.mouse_up()) + + def get_current_url(self) -> str: + """Get the current URL (only applicable for browser environments).""" + # Not directly available in cua-computer, but could be implemented + # in a more sophisticated way if needed + return "" + + +class CuaBaseComputer: + """Base implementation of the Computer protocol using cua-computer and lume virtualization.""" + + def __init__( + self, + display: str = "1024x768", + memory: str = "4GB", + cpu: str = "2", + os: str = "macos", + image: str = None + ): + self.display = display + self.memory = memory + self.cpu = cpu + self.os = os + self.image = image + self.computer = None + self.adapter = None + self._width, self._height = map(int, display.split('x')) + + @property + def dimensions(self) -> Tuple[int, int]: + return (self._width, self._height) + + def __enter__(self): + # Create and run the cua-computer instance + self.computer = CuaComputer( + display=self.display, + memory=self.memory, + cpu=self.cpu, + os=self.os, + image=self.image + ) + + # Run the VM + asyncio.get_event_loop().run_until_complete(self.computer.run()) + + # Create the adapter for sync operations + self.adapter = CuaComputerAdapter(self.computer) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Stop the VM when we're done + if self.computer: + asyncio.get_event_loop().run_until_complete(self.computer.stop()) + + # Delegate all the Computer protocol methods to the adapter + def screenshot(self) -> str: + return self.adapter.screenshot() + + def click(self, x: int, y: int, button: str = "left") -> None: + self.adapter.click(x, y, button) + + def double_click(self, x: int, y: int) -> None: + self.adapter.double_click(x, y) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + self.adapter.scroll(x, y, scroll_x, scroll_y) + + def type(self, text: str) -> None: + self.adapter.type(text) + + def wait(self, ms: int = 1000) -> None: + self.adapter.wait(ms) + + def move(self, x: int, y: int) -> None: + self.adapter.move(x, y) + + def keypress(self, keys: List[str]) -> None: + self.adapter.keypress(keys) + + def drag(self, path: List[Dict[str, int]]) -> None: + self.adapter.drag(path) + + def get_current_url(self) -> str: + return self.adapter.get_current_url() + + # Additional methods that could be useful for function calling + def goto(self, url: str) -> None: + """Navigate to a specific URL (emulating browser functionality).""" + # This would require launching a browser and typing the URL + self.adapter.type(url) + self.adapter.keypress(["Enter"]) + + +class CuaMacOSComputer(CuaBaseComputer): + """Implementation of the Computer protocol using cua-computer and lume virtualization for macOS.""" + + def __init__( + self, + display: str = "1024x768", + memory: str = "4GB", + cpu: str = "2" + ): + super().__init__( + display=display, + memory=memory, + cpu=cpu, + os="macos", + image="macos-sequoia-cua:latest" + ) + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: + return "mac" + + def back(self) -> None: + """Go back (browser functionality) on macOS.""" + self.adapter.keypress(["Command", "Left"]) \ No newline at end of file diff --git a/examples/macos_finder_example.py b/examples/macos_finder_example.py new file mode 100644 index 0000000..0b6aeaa --- /dev/null +++ b/examples/macos_finder_example.py @@ -0,0 +1,53 @@ +from agent import Agent +from computers import CuaMacOSComputer + +def acknowledge_safety_check_callback(message: str) -> bool: + """Callback function to handle safety check warnings.""" + print(f"Safety Check Warning: {message}") + response = input("Do you want to acknowledge and proceed? (y/n): ").lower() + return response == "y" + +def main(): + """Example of using CuaMacOSComputer to interact with Finder and other macOS apps.""" + print("Starting macOS environment...") + print("Task: Open Finder, create a new folder, and take a screenshot") + print("This may take a minute to initialize the VM...") + + with CuaMacOSComputer() as computer: + # Create the agent with our computer and safety callback + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback + ) + + # Define the task: interact with macOS Finder + task = """ + Follow these steps on macOS: + 1. Open Finder + 2. Create a new folder on the Desktop named "CUA Demo" + 3. Open the folder + 4. Open TextEdit and save a file in that folder + 5. Take a screenshot with the keyboard shortcut Command+Shift+3 + """ + + # Create the input items with our task + input_items = [{"role": "user", "content": task}] + + # Run the agent and get the response items + print("\nExecuting macOS task...") + response_items = agent.run_full_turn( + input_items, + debug=True, + show_images=True + ) + + # Print the final response + if response_items and response_items[-1].get("role") == "assistant": + print("\nTask completed!") + print("Assistant's final response:") + print(response_items[-1]["content"][0]["text"]) + else: + print("\nNo final response from assistant.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 13769fb..a406b5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ anyio==4.8.0 browserbase==1.2.0 certifi==2025.1.31 charset-normalizer==3.4.1 +cua-computer>=0.1.0 distro==1.9.0 greenlet==3.1.1 h11==0.14.0 From a5c00ac1ca121f74ce6b54718e500b5720055eed Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 09:17:38 -0400 Subject: [PATCH 2/5] add to config --- computers/config.py | 1 + computers/contrib/__init__.py | 1 + computers/contrib/cua.py | 207 ++++++++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100644 computers/contrib/cua.py diff --git a/computers/config.py b/computers/config.py index 699f1a8..2f05739 100644 --- a/computers/config.py +++ b/computers/config.py @@ -7,4 +7,5 @@ "browserbase": BrowserbaseBrowser, "scrapybara-browser": ScrapybaraBrowser, "scrapybara-ubuntu": ScrapybaraUbuntu, + "cua-macos": CuaMacOSComputer, } diff --git a/computers/contrib/__init__.py b/computers/contrib/__init__.py index e69de29..d783750 100644 --- a/computers/contrib/__init__.py +++ b/computers/contrib/__init__.py @@ -0,0 +1 @@ +from .cua import CuaMacOSComputer \ No newline at end of file diff --git a/computers/contrib/cua.py b/computers/contrib/cua.py new file mode 100644 index 0000000..e4058ea --- /dev/null +++ b/computers/contrib/cua.py @@ -0,0 +1,207 @@ +import asyncio +import base64 +import os +import time +from typing import Dict, List, Optional, Tuple, Literal + +try: + from computer import Computer as CuaComputer +except ImportError: + raise ImportError("The cua-computer package is required. Install it with 'pip install cua-computer'") + +class CuaComputerAdapter: + """Adapter class to convert between sync and async methods for cua-computer.""" + + def __init__(self, computer): + self.computer = computer + self.loop = asyncio.get_event_loop() + + def _run_async(self, coro): + """Run an async coroutine in a synchronous context.""" + return self.loop.run_until_complete(coro) + + def screenshot(self): + """Take a screenshot of the VM.""" + screenshot_bytes = self._run_async(self.computer.interface.screenshot()) + return base64.b64encode(screenshot_bytes).decode('utf-8') + + def click(self, x: int, y: int, button: str = "left"): + """Click at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + if button == "right": + self._run_async(self.computer.interface.right_click()) + else: + self._run_async(self.computer.interface.left_click()) + + def double_click(self, x: int, y: int): + """Double click at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + self._run_async(self.computer.interface.left_click()) + time.sleep(0.1) + self._run_async(self.computer.interface.left_click()) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int): + """Scroll at the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + self._run_async(self.computer.interface.scroll(scroll_y)) + + def type(self, text: str): + """Type the specified text.""" + self._run_async(self.computer.interface.type_text(text)) + + def wait(self, ms: int = 1000): + """Wait for the specified number of milliseconds.""" + time.sleep(ms / 1000) + + def move(self, x: int, y: int): + """Move the cursor to the specified coordinates.""" + self._run_async(self.computer.interface.move_cursor(x, y)) + + def keypress(self, keys: List[str]): + """Press the specified keys.""" + for key in keys: + # Map common key names to CUA equivalents + if key.lower() == "enter": + self._run_async(self.computer.interface.press_key("return")) + elif key.lower() == "space": + self._run_async(self.computer.interface.press_key("space")) + else: + self._run_async(self.computer.interface.press_key(key)) + + def drag(self, path: List[Dict[str, int]]): + """Drag from the start point to the end point.""" + if len(path) < 2: + return + + # Move to start position + start = path[0] + self._run_async(self.computer.interface.move_cursor(start[0], start[1])) + + # Start dragging + self._run_async(self.computer.interface.mouse_down()) + + # Move through each point in the path + for point in path[1:]: + self._run_async(self.computer.interface.move_cursor(point[0], point[1])) + time.sleep(0.05) # Small delay between movements + + # Release at final position + self._run_async(self.computer.interface.mouse_up()) + + def get_current_url(self) -> str: + """Get the current URL (only applicable for browser environments).""" + # Not directly available in cua-computer, but could be implemented + # in a more sophisticated way if needed + return "" + + +class CuaBaseComputer: + """Base implementation of the Computer protocol using cua-computer and lume virtualization.""" + + def __init__( + self, + display: str = "1024x768", + memory: str = "4GB", + cpu: str = "2", + os: str = "macos", + image: str = None + ): + self.display = display + self.memory = memory + self.cpu = cpu + self.os = os + self.image = image + self.computer = None + self.adapter = None + self._width, self._height = map(int, display.split('x')) + + @property + def dimensions(self) -> Tuple[int, int]: + return (self._width, self._height) + + def __enter__(self): + # Create and run the cua-computer instance + self.computer = CuaComputer( + display=self.display, + memory=self.memory, + cpu=self.cpu, + os=self.os, + image=self.image + ) + + # Run the VM + asyncio.get_event_loop().run_until_complete(self.computer.run()) + + # Create the adapter for sync operations + self.adapter = CuaComputerAdapter(self.computer) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Stop the VM when we're done + if self.computer: + asyncio.get_event_loop().run_until_complete(self.computer.stop()) + + # Delegate all the Computer protocol methods to the adapter + def screenshot(self) -> str: + return self.adapter.screenshot() + + def click(self, x: int, y: int, button: str = "left") -> None: + self.adapter.click(x, y, button) + + def double_click(self, x: int, y: int) -> None: + self.adapter.double_click(x, y) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + self.adapter.scroll(x, y, scroll_x, scroll_y) + + def type(self, text: str) -> None: + self.adapter.type(text) + + def wait(self, ms: int = 1000) -> None: + self.adapter.wait(ms) + + def move(self, x: int, y: int) -> None: + self.adapter.move(x, y) + + def keypress(self, keys: List[str]) -> None: + self.adapter.keypress(keys) + + def drag(self, path: List[Dict[str, int]]) -> None: + self.adapter.drag(path) + + def get_current_url(self) -> str: + return self.adapter.get_current_url() + + # Additional methods that could be useful for function calling + def goto(self, url: str) -> None: + """Navigate to a specific URL (emulating browser functionality).""" + # This would require launching a browser and typing the URL + self.adapter.type(url) + self.adapter.keypress(["Enter"]) + + +class CuaMacOSComputer(CuaBaseComputer): + """Implementation of the Computer protocol using cua-computer and lume virtualization for macOS.""" + + def __init__( + self, + display: str = "1024x768", + memory: str = "4GB", + cpu: str = "2" + ): + super().__init__( + display=display, + memory=memory, + cpu=cpu, + os="macos", + image="macos-sequoia-cua:latest" + ) + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: + return "mac" + + def back(self) -> None: + """Go back (browser functionality) on macOS.""" + self.adapter.keypress(["Command", "Left"]) \ No newline at end of file From 1684f9728c461d02416b1164fefc0fb585a099ac Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 09:25:48 -0400 Subject: [PATCH 3/5] following contrib guidelines --- cli.py | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/cli.py b/cli.py index d74f82b..79e574a 100644 --- a/cli.py +++ b/cli.py @@ -44,38 +44,10 @@ def main(): help="Start the browsing session with a specific URL (only for browser environments).", default="https://bing.com", ) - # Add cua-specific arguments - parser.add_argument( - "--display", - type=str, - help="Display resolution for VM (e.g., 1024x768)", - default="1024x768", - ) - parser.add_argument( - "--memory", - type=str, - help="Memory allocation for VM (e.g., 4GB)", - default="4GB", - ) - parser.add_argument( - "--cpu", - type=str, - help="CPU cores for VM", - default="2", - ) args = parser.parse_args() ComputerClass = computers_config[args.computer] - if args.computer == "cua-macos": - computer = ComputerClass( - display=args.display, - memory=args.memory, - cpu=args.cpu, - ) - else: - computer = ComputerClass() - - with computer: + with ComputerClass() as computer: agent = Agent( computer=computer, acknowledge_safety_check_callback=acknowledge_safety_check_callback, @@ -107,4 +79,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file From 3e7afbe8cced41643951513dd2e2e06fc7f5901e Mon Sep 17 00:00:00 2001 From: Dillon DuPont Date: Mon, 28 Apr 2025 09:26:13 -0400 Subject: [PATCH 4/5] removed extra files --- computers/cua.py | 207 ----------------------------------------------- 1 file changed, 207 deletions(-) delete mode 100644 computers/cua.py diff --git a/computers/cua.py b/computers/cua.py deleted file mode 100644 index e4058ea..0000000 --- a/computers/cua.py +++ /dev/null @@ -1,207 +0,0 @@ -import asyncio -import base64 -import os -import time -from typing import Dict, List, Optional, Tuple, Literal - -try: - from computer import Computer as CuaComputer -except ImportError: - raise ImportError("The cua-computer package is required. Install it with 'pip install cua-computer'") - -class CuaComputerAdapter: - """Adapter class to convert between sync and async methods for cua-computer.""" - - def __init__(self, computer): - self.computer = computer - self.loop = asyncio.get_event_loop() - - def _run_async(self, coro): - """Run an async coroutine in a synchronous context.""" - return self.loop.run_until_complete(coro) - - def screenshot(self): - """Take a screenshot of the VM.""" - screenshot_bytes = self._run_async(self.computer.interface.screenshot()) - return base64.b64encode(screenshot_bytes).decode('utf-8') - - def click(self, x: int, y: int, button: str = "left"): - """Click at the specified coordinates.""" - self._run_async(self.computer.interface.move_cursor(x, y)) - if button == "right": - self._run_async(self.computer.interface.right_click()) - else: - self._run_async(self.computer.interface.left_click()) - - def double_click(self, x: int, y: int): - """Double click at the specified coordinates.""" - self._run_async(self.computer.interface.move_cursor(x, y)) - self._run_async(self.computer.interface.left_click()) - time.sleep(0.1) - self._run_async(self.computer.interface.left_click()) - - def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int): - """Scroll at the specified coordinates.""" - self._run_async(self.computer.interface.move_cursor(x, y)) - self._run_async(self.computer.interface.scroll(scroll_y)) - - def type(self, text: str): - """Type the specified text.""" - self._run_async(self.computer.interface.type_text(text)) - - def wait(self, ms: int = 1000): - """Wait for the specified number of milliseconds.""" - time.sleep(ms / 1000) - - def move(self, x: int, y: int): - """Move the cursor to the specified coordinates.""" - self._run_async(self.computer.interface.move_cursor(x, y)) - - def keypress(self, keys: List[str]): - """Press the specified keys.""" - for key in keys: - # Map common key names to CUA equivalents - if key.lower() == "enter": - self._run_async(self.computer.interface.press_key("return")) - elif key.lower() == "space": - self._run_async(self.computer.interface.press_key("space")) - else: - self._run_async(self.computer.interface.press_key(key)) - - def drag(self, path: List[Dict[str, int]]): - """Drag from the start point to the end point.""" - if len(path) < 2: - return - - # Move to start position - start = path[0] - self._run_async(self.computer.interface.move_cursor(start[0], start[1])) - - # Start dragging - self._run_async(self.computer.interface.mouse_down()) - - # Move through each point in the path - for point in path[1:]: - self._run_async(self.computer.interface.move_cursor(point[0], point[1])) - time.sleep(0.05) # Small delay between movements - - # Release at final position - self._run_async(self.computer.interface.mouse_up()) - - def get_current_url(self) -> str: - """Get the current URL (only applicable for browser environments).""" - # Not directly available in cua-computer, but could be implemented - # in a more sophisticated way if needed - return "" - - -class CuaBaseComputer: - """Base implementation of the Computer protocol using cua-computer and lume virtualization.""" - - def __init__( - self, - display: str = "1024x768", - memory: str = "4GB", - cpu: str = "2", - os: str = "macos", - image: str = None - ): - self.display = display - self.memory = memory - self.cpu = cpu - self.os = os - self.image = image - self.computer = None - self.adapter = None - self._width, self._height = map(int, display.split('x')) - - @property - def dimensions(self) -> Tuple[int, int]: - return (self._width, self._height) - - def __enter__(self): - # Create and run the cua-computer instance - self.computer = CuaComputer( - display=self.display, - memory=self.memory, - cpu=self.cpu, - os=self.os, - image=self.image - ) - - # Run the VM - asyncio.get_event_loop().run_until_complete(self.computer.run()) - - # Create the adapter for sync operations - self.adapter = CuaComputerAdapter(self.computer) - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # Stop the VM when we're done - if self.computer: - asyncio.get_event_loop().run_until_complete(self.computer.stop()) - - # Delegate all the Computer protocol methods to the adapter - def screenshot(self) -> str: - return self.adapter.screenshot() - - def click(self, x: int, y: int, button: str = "left") -> None: - self.adapter.click(x, y, button) - - def double_click(self, x: int, y: int) -> None: - self.adapter.double_click(x, y) - - def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: - self.adapter.scroll(x, y, scroll_x, scroll_y) - - def type(self, text: str) -> None: - self.adapter.type(text) - - def wait(self, ms: int = 1000) -> None: - self.adapter.wait(ms) - - def move(self, x: int, y: int) -> None: - self.adapter.move(x, y) - - def keypress(self, keys: List[str]) -> None: - self.adapter.keypress(keys) - - def drag(self, path: List[Dict[str, int]]) -> None: - self.adapter.drag(path) - - def get_current_url(self) -> str: - return self.adapter.get_current_url() - - # Additional methods that could be useful for function calling - def goto(self, url: str) -> None: - """Navigate to a specific URL (emulating browser functionality).""" - # This would require launching a browser and typing the URL - self.adapter.type(url) - self.adapter.keypress(["Enter"]) - - -class CuaMacOSComputer(CuaBaseComputer): - """Implementation of the Computer protocol using cua-computer and lume virtualization for macOS.""" - - def __init__( - self, - display: str = "1024x768", - memory: str = "4GB", - cpu: str = "2" - ): - super().__init__( - display=display, - memory=memory, - cpu=cpu, - os="macos", - image="macos-sequoia-cua:latest" - ) - - @property - def environment(self) -> Literal["windows", "mac", "linux", "browser"]: - return "mac" - - def back(self) -> None: - """Go back (browser functionality) on macOS.""" - self.adapter.keypress(["Command", "Left"]) \ No newline at end of file From e311f915b02a0f752501408599b5eb6b31db9dec Mon Sep 17 00:00:00 2001 From: ddupont <3820588+ddupont808@users.noreply.github.com> Date: Tue, 29 Apr 2025 16:30:36 -0400 Subject: [PATCH 5/5] decreased scroll sensitivity --- computers/contrib/cua.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/computers/contrib/cua.py b/computers/contrib/cua.py index e4058ea..042d17e 100644 --- a/computers/contrib/cua.py +++ b/computers/contrib/cua.py @@ -43,7 +43,7 @@ def double_click(self, x: int, y: int): def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int): """Scroll at the specified coordinates.""" self._run_async(self.computer.interface.move_cursor(x, y)) - self._run_async(self.computer.interface.scroll(scroll_y)) + self._run_async(self.computer.interface.scroll(scroll_y // 50)) def type(self, text: str): """Type the specified text.""" @@ -59,14 +59,17 @@ def move(self, x: int, y: int): def keypress(self, keys: List[str]): """Press the specified keys.""" - for key in keys: - # Map common key names to CUA equivalents - if key.lower() == "enter": - self._run_async(self.computer.interface.press_key("return")) - elif key.lower() == "space": - self._run_async(self.computer.interface.press_key("space")) - else: - self._run_async(self.computer.interface.press_key(key)) + if len(keys) > 1: + self._run_async(self.computer.interface.hotkey(*keys)) + else: + for key in keys: + # Map common key names to CUA equivalents + if key.lower() == "enter": + self._run_async(self.computer.interface.press_key("return")) + elif key.lower() == "space": + self._run_async(self.computer.interface.press_key("space")) + else: + self._run_async(self.computer.interface.press_key(key)) def drag(self, path: List[Dict[str, int]]): """Drag from the start point to the end point.""" @@ -204,4 +207,4 @@ def environment(self) -> Literal["windows", "mac", "linux", "browser"]: def back(self) -> None: """Go back (browser functionality) on macOS.""" - self.adapter.keypress(["Command", "Left"]) \ No newline at end of file + self.adapter.keypress(["Command", "Left"])