diff --git a/README.md b/README.md index 74065a2..1d6c30f 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ The `examples` folder contains more examples of how to use CUA. python -m examples.weather_example ``` +You can also try the PyAutoGUI desktop control example: + +```shell +python -m examples.pyautogui_desktop_example +``` + For reference, the file `simple_cua_loop.py` implements the basics of the CUA loop. You can run it with: @@ -92,6 +98,7 @@ This sample app provides a set of implemented `Computer` examples, but feel free | Computer | Option | Type | Description | Requirements | | ------------------- | ------------------ | --------- | --------------------------------- | ---------------------------------------------------------------- | | `LocalPlaywright` | local-playwright | `browser` | Local browser window | [Playwright SDK](https://playwright.dev/) | +| `PyAutoGUI` | local-desktop | `desktop` | Local desktop control | [PyAutoGUI](https://pyautogui.readthedocs.io/) | | `Docker` | docker | `linux` | Docker container environment | [Docker](https://docs.docker.com/engine/install/) running | | `Browserbase` | browserbase | `browser` | Remote browser environment | [Browserbase](https://www.browserbase.com/) API key in `.env` | | `ScrapybaraBrowser` | scrapybara-browser | `browser` | Remote browser environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` | diff --git a/agent/agent.py b/agent/agent.py index 2514c24..d4eb17b 100644 --- a/agent/agent.py +++ b/agent/agent.py @@ -80,7 +80,11 @@ def handle_item(self, item): screenshot_base64 = self.computer.screenshot() if self.show_images: - show_image(screenshot_base64) + # Use non-intrusive display for PyAutoGUI to avoid changing screen state + use_external_viewer = self.computer.environment != "windows" and \ + self.computer.environment != "mac" and \ + self.computer.environment != "linux" + show_image(screenshot_base64, use_external_viewer=use_external_viewer) # if user doesn't ack 
all safety checks exit with error pending_checks = item.get("pending_safety_checks", []) diff --git a/cli.py b/cli.py index 891f3fb..5fc0b21 100644 --- a/cli.py +++ b/cli.py @@ -6,6 +6,7 @@ ScrapybaraUbuntu, LocalPlaywrightComputer, DockerComputer, + PyAutoGUIComputer, ) def acknowledge_safety_check_callback(message: str) -> bool: @@ -27,6 +28,7 @@ def main(): "browserbase", "scrapybara-browser", "scrapybara-ubuntu", + "local-desktop", ], help="Choose the computer environment to use.", default="local-playwright", @@ -61,6 +63,7 @@ def main(): "browserbase": BrowserbaseBrowser, "scrapybara-browser": ScrapybaraBrowser, "scrapybara-ubuntu": ScrapybaraUbuntu, + "local-desktop": PyAutoGUIComputer, } ComputerClass = computer_mapping[args.computer] @@ -77,6 +80,11 @@ def main(): if not args.start_url.startswith("http"): args.start_url = "https://" + args.start_url agent.computer.goto(args.start_url) + + # Display a welcome message for local-desktop mode + if args.computer == "local-desktop": + print("Local desktop control initialized. 
import base64
import io
import platform
import time
from typing import Dict, List, Literal

# Key mapping for CUA style keys to PyAutoGUI keys.
# Lookups are done on the lower-cased key name (see ``keypress``).
CUA_KEY_TO_PYAUTOGUI_KEY = {
    "/": "/",
    "\\": "\\",
    "alt": "alt",
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    "arrowup": "up",
    "backspace": "backspace",
    "capslock": "capslock",
    "cmd": "command",
    "ctrl": "ctrl",
    "delete": "delete",
    "end": "end",
    "enter": "enter",
    "esc": "escape",
    "home": "home",
    "insert": "insert",
    "option": "option",
    "pagedown": "pagedown",
    "pageup": "pageup",
    "shift": "shift",
    "space": "space",
    "super": "win",
    "tab": "tab",
    "win": "win",
}


class PyAutoGUIComputer:
    """Computer implementation that drives the local desktop through PyAutoGUI.

    Follows the Computer protocol (screenshot, click, scroll, type, ...) so the
    agent can use it interchangeably with the other computer backends.
    """

    def __init__(self):
        # Imported here rather than at module level: the ``computers`` package
        # imports this module unconditionally, so a top-level import would make
        # every other backend unusable whenever pyautogui is not installed or
        # cannot be imported on the current machine.
        import pyautogui

        pyautogui.PAUSE = 0.1  # Small pause between PyAutoGUI commands
        pyautogui.FAILSAFE = True  # Move mouse to upper-left corner to abort

        self._gui = pyautogui

        # Screen size is read once and cached for ``dimensions``.
        self._screen_width, self._screen_height = pyautogui.size()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets exceptions propagate.
        return None

    @property
    def environment(self) -> Literal["windows", "mac", "linux"]:
        """Return the host operating system as a Computer environment name.

        Anything that is neither macOS ("darwin") nor Windows is reported as
        "linux".
        """
        system = platform.system().lower()
        if system == "darwin":
            return "mac"
        if system == "windows":
            return "windows"
        return "linux"

    @property
    def dimensions(self) -> tuple[int, int]:
        """Return the ``(width, height)`` of the screen captured at init time."""
        return (self._screen_width, self._screen_height)

    def screenshot(self) -> str:
        """Take a screenshot and return it as a base64-encoded PNG string."""
        shot = self._gui.screenshot()

        buffered = io.BytesIO()
        shot.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at ``(x, y)``.

        Args:
            x: Horizontal screen coordinate.
            y: Vertical screen coordinate.
            button: "left", "right" or "middle"; any other value falls back to
                a left click.
        """
        supported = ("left", "right", "middle")
        button_type = button if button in supported else "left"
        self._gui.click(x=x, y=y, button=button_type)

    def double_click(self, x: int, y: int) -> None:
        """Double-click at ``(x, y)``."""
        self._gui.doubleClick(x=x, y=y)

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll with the pointer positioned at ``(x, y)``.

        Args:
            x: Horizontal screen coordinate to move the pointer to first.
            y: Vertical screen coordinate to move the pointer to first.
            scroll_x: Horizontal amount; positive scrolls right.
            scroll_y: Vertical amount; positive scrolls down.
        """
        self._gui.moveTo(x, y)

        if scroll_y:
            # The interface treats positive values as "down" while
            # pyautogui.scroll treats positive values as "up", hence the sign
            # flip.
            self._gui.scroll(clicks=-scroll_y)

        if scroll_x and self.environment != "windows":
            # NOTE(review): PyAutoGUI documents hscroll() for macOS and Linux
            # only, so horizontal scrolling is skipped on Windows rather than
            # risk moving the wrong axis -- confirm against the installed
            # PyAutoGUI version.
            self._gui.hscroll(clicks=scroll_x)

    def type(self, text: str) -> None:
        """Type ``text`` on the keyboard."""
        self._gui.write(text)

    def wait(self, ms: int = 1000) -> None:
        """Block for ``ms`` milliseconds."""
        time.sleep(ms / 1000)

    def move(self, x: int, y: int) -> None:
        """Move the mouse pointer to ``(x, y)``."""
        self._gui.moveTo(x, y)

    def keypress(self, keys: List[str]) -> None:
        """Press ``keys`` together as a single hotkey combination.

        Key names are matched case-insensitively against
        ``CUA_KEY_TO_PYAUTOGUI_KEY``; names without a mapping are passed
        through lower-cased. An empty list is a no-op.
        """
        if not keys:
            return

        mapped_keys = []
        for key in keys:
            lowered = key.lower()
            # Fall back to the lower-cased name, not the raw one: PyAutoGUI key
            # names are lower case, and an upper-case letter such as "C" would
            # presumably be sent as a shifted character (Ctrl+Shift+C instead
            # of Ctrl+C).
            mapped_keys.append(CUA_KEY_TO_PYAUTOGUI_KEY.get(lowered, lowered))

        self._gui.hotkey(*mapped_keys)

    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag the mouse along ``path``.

        Args:
            path: Points as ``{"x": ..., "y": ...}`` dicts. The button is
                pressed at the first point and released after the last one.
                An empty path is a no-op.
        """
        if not path:
            return

        start, *rest = path
        self._gui.moveTo(start["x"], start["y"])

        self._gui.mouseDown()
        try:
            # NOTE(review): on macOS a plain moveTo with the button held may
            # not register as a drag; pyautogui.dragTo might be needed there
            # -- confirm on a Mac.
            for point in rest:
                self._gui.moveTo(point["x"], point["y"])
        finally:
            # Always release the button, even if a move raises (for example
            # the PyAutoGUI failsafe), so the desktop is not left mid-drag.
            self._gui.mouseUp()

    def get_current_url(self) -> str:
        """Return a placeholder URL.

        Required by the Computer protocol but meaningless for desktop control.
        """
        return "desktop://"
def show_image(base_64_image, use_external_viewer=False):
    """Display a base64-encoded image.

    Args:
        base_64_image: The image file contents, base64-encoded.
        use_external_viewer: When True, open the image with ``Image.show()``
            (the system viewer, which creates popup windows). When False, draw
            it in a matplotlib window instead; if matplotlib cannot be
            imported, save the image to ``~/temp_screenshot.png`` and print
            that path.
    """
    image_data = base64.b64decode(base_64_image)
    image = Image.open(BytesIO(image_data))

    if use_external_viewer:
        # Original behavior - creates popup windows which can interfere with automation
        image.show()
        return

    try:
        import matplotlib.pyplot as plt

        # Reuse one named figure and clear it on each call. Opening a fresh
        # figure per screenshot would leave every earlier one open for the
        # lifetime of the process.
        figure = plt.figure(num="Screenshot", figsize=(10, 10))
        figure.clf()
        plt.imshow(image)
        plt.axis('off')
        plt.show(block=False)  # Non-blocking display
        plt.pause(0.5)  # Short pause to render
    except ImportError:
        # Imported locally so the fallback does not depend on a module-level
        # `os` import, which is not visible from this function.
        import os

        # Fall back to writing the image into the user's home directory.
        temp_path = os.path.join(os.path.expanduser("~"), "temp_screenshot.png")
        image.save(temp_path)
        print(f"Screenshot saved to {temp_path}")