openai · jam3scampbell · Mar 22, 2025
diff --git a/README.md b/README.md
@@ -64,6 +64,12 @@ The `examples` folder contains more examples of how to use CUA.
 python -m examples.weather_example
 ```
 
+You can also try the PyAutoGUI desktop control example:
+
+```shell
+python -m examples.pyautogui_desktop_example
+```
+
 For reference, the file `simple_cua_loop.py` implements the basics of the CUA loop.
 
 You can run it with:
@@ -92,6 +98,7 @@ This sample app provides a set of implemented `Computer` examples, but feel free
 | Computer            | Option             | Type      | Description                       | Requirements                                                     |
 | ------------------- | ------------------ | --------- | --------------------------------- | ---------------------------------------------------------------- |
 | `LocalPlaywright`   | local-playwright   | `browser` | Local browser window              | [Playwright SDK](https://playwright.dev/)                        |
+| `PyAutoGUI`         | local-desktop      | `desktop` | Local desktop control             | [PyAutoGUI](https://pyautogui.readthedocs.io/)                  |
 | `Docker`            | docker             | `linux`   | Docker container environment      | [Docker](https://docs.docker.com/engine/install/) running        |
 | `Browserbase`       | browserbase        | `browser` | Remote browser environment        | [Browserbase](https://www.browserbase.com/) API key in `.env`    |
 | `ScrapybaraBrowser` | scrapybara-browser | `browser` | Remote browser environment        | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` |

diff --git a/agent/agent.py b/agent/agent.py
@@ -80,7 +80,11 @@ def handle_item(self, item):
 
             screenshot_base64 = self.computer.screenshot()
             if self.show_images:
-                show_image(screenshot_base64)
+                # Use non-intrusive display for PyAutoGUI to avoid changing screen state
+                use_external_viewer = self.computer.environment != "windows" and \
+                                    self.computer.environment != "mac" and \
+                                    self.computer.environment != "linux"
+                show_image(screenshot_base64, use_external_viewer=use_external_viewer)
 
             # if user doesn't ack all safety checks exit with error
             pending_checks = item.get("pending_safety_checks", [])

diff --git a/cli.py b/cli.py
@@ -6,6 +6,7 @@
     ScrapybaraUbuntu,
     LocalPlaywrightComputer,
     DockerComputer,
+    PyAutoGUIComputer,
 )
 
 def acknowledge_safety_check_callback(message: str) -> bool:
@@ -27,6 +28,7 @@ def main():
             "browserbase",
             "scrapybara-browser",
             "scrapybara-ubuntu",
+            "local-desktop",
         ],
         help="Choose the computer environment to use.",
         default="local-playwright",
@@ -61,6 +63,7 @@ def main():
         "browserbase": BrowserbaseBrowser,
         "scrapybara-browser": ScrapybaraBrowser,
         "scrapybara-ubuntu": ScrapybaraUbuntu,
+        "local-desktop": PyAutoGUIComputer,
     }
 
     ComputerClass = computer_mapping[args.computer]
@@ -77,6 +80,11 @@ def main():
             if not args.start_url.startswith("http"):
                 args.start_url = "https://" + args.start_url
             agent.computer.goto(args.start_url)
+
+        # Display a welcome message for local-desktop mode
+        if args.computer == "local-desktop":
+            print("Local desktop control initialized. The agent will now control your desktop.")
+            print("Move mouse to upper-left corner (0,0) to abort if needed (PyAutoGUI failsafe).")
 
         while True:
             try:

diff --git a/computers/__init__.py b/computers/__init__.py
@@ -3,3 +3,4 @@
 from .local_playwright import LocalPlaywrightComputer
 from .docker import DockerComputer
 from .scrapybara import ScrapybaraBrowser, ScrapybaraUbuntu
+from .pyautogui_computer import PyAutoGUIComputer
diff --git a/computers/pyautogui_computer.py b/computers/pyautogui_computer.py
@@ -0,0 +1,152 @@
+import time
+import base64
+import platform
+import io
+from typing import List, Dict, Literal
+import pyautogui
+from PIL import Image
+
+# Lower-cased CUA key name -> PyAutoGUI key name; keypress() passes unmapped names through as-is
+CUA_KEY_TO_PYAUTOGUI_KEY = {
+    "/": "/",
+    "\\": "\\",
+    "alt": "alt",
+    "arrowdown": "down",
+    "arrowleft": "left",
+    "arrowright": "right",
+    "arrowup": "up",
+    "backspace": "backspace",
+    "capslock": "capslock",
+    "cmd": "command",
+    "ctrl": "ctrl",
+    "delete": "delete",
+    "end": "end",
+    "enter": "enter",
+    "esc": "escape",
+    "home": "home",
+    "insert": "insert",
+    "option": "option",
+    "pagedown": "pagedown",
+    "pageup": "pageup",
+    "shift": "shift",
+    "space": "space",
+    "super": "win",
+    "tab": "tab",
+    "win": "win",
+}
+
+
+class PyAutoGUIComputer:
+    """
+    Computer implementation using PyAutoGUI to control the local desktop environment.
+    Follows the Computer protocol to provide consistent interface for the agent.
+    """
+
+    def __init__(self):
+        # Set the default behavior of PyAutoGUI
+        pyautogui.PAUSE = 0.1  # Add a small pause between PyAutoGUI commands
+        pyautogui.FAILSAFE = True  # Move mouse to upper-left corner to abort
+
+        # The screen size is read once here and reused by `dimensions`
+        self._screen_width, self._screen_height = pyautogui.size()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Nothing to release; exceptions propagate because None is returned
+        pass
+
+    @property
+    def environment(self) -> Literal["windows", "mac", "linux"]:
+        """Return the OS name; anything other than Darwin/Windows is reported as "linux"."""
+        system = platform.system().lower()
+        if system == "darwin":
+            return "mac"
+        elif system == "windows":
+            return "windows"
+        else:
+            return "linux"
+
+    @property
+    def dimensions(self) -> tuple[int, int]:
+        """Return the (width, height) of the screen as measured at construction time."""
+        return (self._screen_width, self._screen_height)
+
+    def screenshot(self) -> str:
+        """Take a screenshot and return it as a base64-encoded PNG string."""
+        screenshot = pyautogui.screenshot()
+
+        # Serialize the captured image to PNG in memory, then base64-encode the bytes
+        buffered = io.BytesIO()
+        screenshot.save(buffered, format="PNG")
+        return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+    def click(self, x: int, y: int, button: str = "left") -> None:
+        """Click at (x, y); buttons other than left/right/middle fall back to a left click."""
+        if button not in ("left", "right", "middle"):
+            # NOTE(review): any other button name is clicked as "left", as before
+            button = "left"
+
+        pyautogui.click(x=x, y=y, button=button)
+
+    def double_click(self, x: int, y: int) -> None:
+        """Double-click at the specified coordinates."""
+        pyautogui.doubleClick(x=x, y=y)
+
+    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
+        """Scroll by (scroll_x, scroll_y) with the pointer at (x, y); positive scroll_y is down."""
+        pyautogui.moveTo(x, y)  # position the pointer before scrolling
+
+        # PyAutoGUI treats positive clicks as "up", so the vertical amount is negated.
+        # NOTE(review): amounts are passed straight through as wheel clicks - confirm units.
+        if scroll_y:
+            pyautogui.scroll(clicks=-scroll_y)
+        # hscroll() is only documented for macOS/Linux, so it is skipped on Windows.
+        if scroll_x and self.environment != "windows":
+            pyautogui.hscroll(clicks=scroll_x)
+
+    def type(self, text: str) -> None:
+        """Type the specified text."""
+        pyautogui.write(text)
+
+    def wait(self, ms: int = 1000) -> None:
+        """Wait for the specified number of milliseconds."""
+        time.sleep(ms / 1000)
+
+    def move(self, x: int, y: int) -> None:
+        """Move the mouse to the specified coordinates."""
+        pyautogui.moveTo(x, y)
+
+    def keypress(self, keys: List[str]) -> None:
+        """Press the given keys together as one hotkey combination."""
+        # Translate CUA key names; names without a mapping are passed through unchanged
+        mapped_keys = [CUA_KEY_TO_PYAUTOGUI_KEY.get(key.lower(), key) for key in keys]
+
+        # hotkey() presses the keys in order and releases them in reverse order
+        pyautogui.hotkey(*mapped_keys)
+
+    def drag(self, path: List[Dict[str, int]]) -> None:
+        """Press the mouse button at path[0], move through the remaining points, release.
+
+        An empty path is a no-op. The release is attempted even if a move raises,
+        so a failed move does not by itself leave the button held down.
+        """
+        if not path:
+            return
+
+        start, *waypoints = path
+        pyautogui.moveTo(start["x"], start["y"])
+        pyautogui.mouseDown()
+        try:
+            for point in waypoints:
+                pyautogui.moveTo(point["x"], point["y"])
+        finally:
+            pyautogui.mouseUp()
+
+    def get_current_url(self) -> str:
+        """
+        This method is required by the Computer protocol but doesn't make 
+        sense for desktop control. Return a placeholder value.
+        """
+        return "desktop://"
diff --git a/examples/pyautogui_desktop_example.py b/examples/pyautogui_desktop_example.py
@@ -0,0 +1,58 @@
+"""
+Example demonstrating the PyAutoGUIComputer for controlling the local desktop.
+"""
+
+from agent.agent import Agent
+from computers import PyAutoGUIComputer
+
+def acknowledge_safety_check_callback(message: str) -> bool:
+    """Ask on stdin whether to acknowledge a safety check; return True only for "y"."""
+    prompt = f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
+    answer = input(prompt)
+    # Comparison ignores letter case and surrounding whitespace
+    return answer.strip().lower() == "y"
+
+def main():
+    """Run an interactive prompt loop in which the agent controls the local desktop."""
+    for banner_line in (
+        "Initializing PyAutoGUI Desktop Control",
+        "=====================================",
+        "This example allows an agent to control your desktop using PyAutoGUI.",
+        "Move mouse to upper-left corner (0,0) to abort if needed.",
+        "",
+    ):
+        print(banner_line)
+
+    with PyAutoGUIComputer() as computer:
+        agent = Agent(
+            computer=computer,
+            acknowledge_safety_check_callback=acknowledge_safety_check_callback,
+        )
+        # Conversation history carried across turns
+        items = []
+
+        print("Desktop agent ready. Type 'exit' to quit.")
+        print("Example commands:")
+        print(" - 'Open a calculator'")
+        print(" - 'Create a new text file on the desktop'")
+        print(" - 'Take a screenshot and tell me what you see'")
+
+        # One agent turn per line of input, until 'exit' or end-of-input
+        while True:
+            try:
+                user_input = input("> ")
+            except EOFError as e:
+                print(f"An error occurred: {e}")
+                break
+            if user_input.lower() == 'exit':
+                break
+
+            items.append({"role": "user", "content": user_input})
+            # show_images=True asks the agent to display each screenshot it takes
+            reply_items = agent.run_full_turn(
+                items, print_steps=True, show_images=True, debug=False
+            )
+            items.extend(reply_items)
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
@@ -10,8 +10,10 @@ httpcore==1.0.7
 httpx==0.28.1
 idna==3.10
 jiter==0.8.2
+matplotlib==3.8.0
 pillow==11.1.0
 playwright==1.50.0
+pyautogui==0.9.54
 pydantic==2.10.6
 pydantic_core==2.27.2
 pyee==12.1.1

diff --git a/utils.py b/utils.py
@@ -24,10 +24,33 @@ def pp(obj):
     print(json.dumps(obj, indent=4))
 
 
-def show_image(base_64_image):
+def show_image(base_64_image, use_external_viewer=False):
+    """
+    Display an image from base64 string.
+
+    If use_external_viewer is True, use the default system viewer (which creates popups),
+    otherwise draw it with matplotlib, or save it as a PNG when matplotlib is not installed.
+    """
     image_data = base64.b64decode(base_64_image)
     image = Image.open(BytesIO(image_data))
-    image.show()
+
+    if use_external_viewer:
+        # Original behavior - creates popup windows which can interfere with automation
+        image.show()
+    else:
+        try:
+            import matplotlib.pyplot as plt
+            plt.close("CUA screenshot")  # drop the previous figure so they do not pile up
+            plt.figure(num="CUA screenshot", figsize=(10, 10))
+            plt.imshow(image)
+            plt.axis('off')
+            plt.show(block=False)  # Non-blocking display
+            plt.pause(0.5)  # Short pause to render
+        except ImportError:
+            # matplotlib is not installed: save the image under the home directory instead
+            temp_path = os.path.join(os.path.expanduser("~"), "temp_screenshot.png")
+            image.save(temp_path)
+            print(f"Screenshot saved to {temp_path}")
 
 
 def calculate_image_dimensions(base_64_image):