Skip to content

PyAutoGUI desktop computer implementation #31

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ The `examples` folder contains more examples of how to use CUA.
python -m examples.weather_example
```

You can also try the PyAutoGUI desktop control example:

```shell
python -m examples.pyautogui_desktop_example
```

For reference, the file `simple_cua_loop.py` implements the basics of the CUA loop.

You can run it with:
Expand Down Expand Up @@ -92,6 +98,7 @@ This sample app provides a set of implemented `Computer` examples, but feel free
| Computer | Option | Type | Description | Requirements |
| ------------------- | ------------------ | --------- | --------------------------------- | ---------------------------------------------------------------- |
| `LocalPlaywright` | local-playwright | `browser` | Local browser window | [Playwright SDK](https://playwright.dev/) |
| `PyAutoGUI`         | local-desktop      | `windows` / `mac` / `linux` (host OS) | Local desktop control | [PyAutoGUI](https://pyautogui.readthedocs.io/) |
| `Docker` | docker | `linux` | Docker container environment | [Docker](https://docs.docker.com/engine/install/) running |
| `Browserbase` | browserbase | `browser` | Remote browser environment | [Browserbase](https://www.browserbase.com/) API key in `.env` |
| `ScrapybaraBrowser` | scrapybara-browser | `browser` | Remote browser environment | [Scrapybara](https://scrapybara.com/dashboard) API key in `.env` |
Expand Down
6 changes: 5 additions & 1 deletion agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,11 @@ def handle_item(self, item):

screenshot_base64 = self.computer.screenshot()
if self.show_images:
show_image(screenshot_base64)
# Use non-intrusive display for PyAutoGUI to avoid changing screen state
use_external_viewer = self.computer.environment != "windows" and \
self.computer.environment != "mac" and \
self.computer.environment != "linux"
show_image(screenshot_base64, use_external_viewer=use_external_viewer)

# if user doesn't ack all safety checks exit with error
pending_checks = item.get("pending_safety_checks", [])
Expand Down
8 changes: 8 additions & 0 deletions cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ScrapybaraUbuntu,
LocalPlaywrightComputer,
DockerComputer,
PyAutoGUIComputer,
)

def acknowledge_safety_check_callback(message: str) -> bool:
Expand All @@ -27,6 +28,7 @@ def main():
"browserbase",
"scrapybara-browser",
"scrapybara-ubuntu",
"local-desktop",
],
help="Choose the computer environment to use.",
default="local-playwright",
Expand Down Expand Up @@ -61,6 +63,7 @@ def main():
"browserbase": BrowserbaseBrowser,
"scrapybara-browser": ScrapybaraBrowser,
"scrapybara-ubuntu": ScrapybaraUbuntu,
"local-desktop": PyAutoGUIComputer,
}

ComputerClass = computer_mapping[args.computer]
Expand All @@ -77,6 +80,11 @@ def main():
if not args.start_url.startswith("http"):
args.start_url = "https://" + args.start_url
agent.computer.goto(args.start_url)

# Display a welcome message for local-desktop mode
if args.computer == "local-desktop":
print("Local desktop control initialized. The agent will now control your desktop.")
print("Move mouse to upper-left corner (0,0) to abort if needed (PyAutoGUI failsafe).")

while True:
try:
Expand Down
1 change: 1 addition & 0 deletions computers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .local_playwright import LocalPlaywrightComputer
from .docker import DockerComputer
from .scrapybara import ScrapybaraBrowser, ScrapybaraUbuntu
from .pyautogui_computer import PyAutoGUIComputer
152 changes: 152 additions & 0 deletions computers/pyautogui_computer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import time
import base64
import platform
import io
from typing import List, Dict, Literal
import pyautogui
from PIL import Image

# Translation table from CUA key names (lower-cased) to the key names
# PyAutoGUI understands. Keys absent from this table are expected to be
# handled by the caller's fallback.
CUA_KEY_TO_PYAUTOGUI_KEY = {
    # Modifier keys
    "alt": "alt",
    "cmd": "command",
    "ctrl": "ctrl",
    "option": "option",
    "shift": "shift",
    "super": "win",
    "win": "win",
    # Arrow keys and cursor navigation
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    "arrowup": "up",
    "end": "end",
    "home": "home",
    "pagedown": "pagedown",
    "pageup": "pageup",
    # Editing and control keys
    "backspace": "backspace",
    "capslock": "capslock",
    "delete": "delete",
    "enter": "enter",
    "esc": "escape",
    "insert": "insert",
    "space": "space",
    "tab": "tab",
    # Punctuation that maps to itself
    "/": "/",
    "\\": "\\",
}


class PyAutoGUIComputer:
    """
    Computer implementation using PyAutoGUI to control the local desktop environment.
    Follows the Computer protocol to provide consistent interface for the agent.

    All actions operate on the real local desktop: the mouse and keyboard of
    the machine running this process are driven directly. PyAutoGUI's
    fail-safe is enabled in ``__init__``, so moving the pointer to the
    upper-left corner of the screen aborts the current action.
    """

    # Button names accepted by ``click`` mapped to PyAutoGUI button names.
    # "wheel" is included because the CUA click action names the middle
    # button that way; anything unknown falls back to "left".
    _BUTTON_MAPPING = {
        "left": "left",
        "right": "right",
        "middle": "middle",
        "wheel": "middle",
    }

    def __init__(self):
        # Set the default behavior of PyAutoGUI (module-wide settings).
        pyautogui.PAUSE = 0.1  # Add a small pause between PyAutoGUI commands
        pyautogui.FAILSAFE = True  # Move mouse to upper-left corner to abort

        # Store the screen size once; ``dimensions`` reports this snapshot.
        self._screen_width, self._screen_height = pyautogui.size()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release: PyAutoGUI holds no per-instance resources here.
        pass

    @property
    def environment(self) -> Literal["windows", "mac", "linux"]:
        """Return the operating system environment.

        Any platform other than macOS ("darwin") or Windows is reported as
        "linux".
        """
        system = platform.system().lower()
        if system == "darwin":
            return "mac"
        if system == "windows":
            return "windows"
        return "linux"

    @property
    def dimensions(self) -> tuple[int, int]:
        """Return the screen dimensions as ``(width, height)``."""
        return (self._screen_width, self._screen_height)

    def screenshot(self) -> str:
        """Take a screenshot and return it as a base64-encoded PNG string."""
        image = pyautogui.screenshot()

        # Serialize the PIL image to PNG in memory, then base64-encode it.
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

    @staticmethod
    def _normalize_button(button: str) -> str:
        """Translate a requested button name to a PyAutoGUI button name."""
        return PyAutoGUIComputer._BUTTON_MAPPING.get(button, "left")

    def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at the specified coordinates with the specified button.

        Unknown button names fall back to a left click.
        """
        pyautogui.click(x=x, y=y, button=self._normalize_button(button))

    def double_click(self, x: int, y: int) -> None:
        """Double-click at the specified coordinates."""
        pyautogui.doubleClick(x=x, y=y)

    def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at the specified coordinates.

        Positive ``scroll_y`` scrolls down and positive ``scroll_x`` scrolls
        right. Horizontal scrolling is skipped on Windows.
        """
        # Move to position first so the scroll targets the right window.
        pyautogui.moveTo(x, y)

        # PyAutoGUI treats positive clicks as "up", so invert scroll_y.
        # NOTE(review): the value is forwarded as a click count; if callers
        # pass pixel deltas this will over-scroll — confirm the expected unit.
        if scroll_y:
            pyautogui.scroll(clicks=-scroll_y)

        # PyAutoGUI documents hscroll() for macOS and Linux only, with
        # positive clicks scrolling right.
        if scroll_x and self.environment != "windows":
            pyautogui.hscroll(clicks=scroll_x)

    def type(self, text: str) -> None:
        """Type the specified text."""
        pyautogui.write(text)

    def wait(self, ms: int = 1000) -> None:
        """Wait for the specified number of milliseconds."""
        time.sleep(ms / 1000)

    def move(self, x: int, y: int) -> None:
        """Move the mouse to the specified coordinates."""
        pyautogui.moveTo(x, y)

    def keypress(self, keys: List[str]) -> None:
        """Press the specified keys together as a hotkey combination.

        Key names are matched case-insensitively. Names without an entry in
        ``CUA_KEY_TO_PYAUTOGUI_KEY`` are passed through lower-cased, so that
        e.g. ``["CTRL", "C"]`` sends Ctrl+C rather than Ctrl+Shift+C (an
        upper-case single character makes PyAutoGUI add Shift).
        """
        mapped_keys = [
            CUA_KEY_TO_PYAUTOGUI_KEY.get(key.lower(), key.lower()) for key in keys
        ]
        if not mapped_keys:
            return

        # hotkey() holds the keys down in order and releases them in reverse.
        pyautogui.hotkey(*mapped_keys)

    def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along the specified path.

        ``path`` is a list of ``{"x": ..., "y": ...}`` points. An empty path
        is a no-op; a single point results in a press and release in place.
        """
        if not path:
            return

        # Move to starting point
        start, *rest = path
        pyautogui.moveTo(start["x"], start["y"])

        # Hold the button for the whole path, and always release it — even
        # if a move raises (e.g. the fail-safe triggers) — so the desktop is
        # not left with a stuck mouse button.
        pyautogui.mouseDown()
        try:
            for point in rest:
                pyautogui.moveTo(point["x"], point["y"])
        finally:
            pyautogui.mouseUp()

    def get_current_url(self) -> str:
        """
        This method is required by the Computer protocol but doesn't make
        sense for desktop control. Return a placeholder value.
        """
        return "desktop://"
58 changes: 58 additions & 0 deletions examples/pyautogui_desktop_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Example demonstrating the PyAutoGUIComputer for controlling the local desktop.
"""

from agent.agent import Agent
from computers import PyAutoGUIComputer

def acknowledge_safety_check_callback(message: str) -> bool:
    """Ask the user on the console whether to acknowledge a safety check.

    Returns True only when the reply, ignoring case and surrounding
    whitespace, is exactly "y".
    """
    prompt = f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): "
    answer = input(prompt)
    normalized = answer.lower().strip()
    return normalized == "y"

def main():
    """Run an interactive console loop that lets the agent drive the desktop.

    Each line read from the console is appended to the conversation and
    handed to the agent for a full turn; typing 'exit' (any case) or
    reaching end-of-input leaves the loop.
    """
    intro_lines = (
        "Initializing PyAutoGUI Desktop Control",
        "=====================================",
        "This example allows an agent to control your desktop using PyAutoGUI.",
        "Move mouse to upper-left corner (0,0) to abort if needed.",
        "",
    )
    for line in intro_lines:
        print(line)

    with PyAutoGUIComputer() as computer:
        agent = Agent(
            computer=computer,
            acknowledge_safety_check_callback=acknowledge_safety_check_callback,
        )

        # Full conversation history, shared across turns.
        items = []

        usage_lines = (
            "Desktop agent ready. Type 'exit' to quit.",
            "Example commands:",
            "  - 'Open a calculator'",
            "  - 'Create a new text file on the desktop'",
            "  - 'Take a screenshot and tell me what you see'",
        )
        for line in usage_lines:
            print(line)

        while True:
            try:
                user_input = input("> ")
            except EOFError as e:
                # Input stream closed (e.g. Ctrl-D): report and stop.
                print(f"An error occurred: {e}")
                break

            if user_input.lower() == 'exit':
                break

            items.append({"role": "user", "content": user_input})
            # show_images=True asks the agent to display each screenshot it
            # takes while handling the turn.
            output_items = agent.run_full_turn(
                items,
                print_steps=True,
                show_images=True,
                debug=False,
            )
            items.extend(output_items)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@ httpcore==1.0.7
httpx==0.28.1
idna==3.10
jiter==0.8.2
matplotlib==3.8.0
pillow==11.1.0
playwright==1.50.0
pyautogui==0.9.54
pydantic==2.10.6
pydantic_core==2.27.2
pyee==12.1.1
Expand Down
27 changes: 25 additions & 2 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,33 @@ def pp(obj):
print(json.dumps(obj, indent=4))


def show_image(base_64_image, use_external_viewer=False):
    """
    Display an image from base64 string.

    Args:
        base_64_image: Base64-encoded image data.
        use_external_viewer: If True, open the image with the default system
            viewer via ``Image.show()`` (which creates popups). Otherwise
            draw it in a non-blocking matplotlib figure.

    If matplotlib cannot be imported, the image is instead saved to
    ``temp_screenshot.png`` in the user's home directory and the path is
    printed.
    """
    image_data = base64.b64decode(base_64_image)
    image = Image.open(BytesIO(image_data))

    if use_external_viewer:
        # Original behavior - creates popup windows which can interfere with automation
        image.show()
        return

    figure_label = "screenshot"
    try:
        # Imported lazily so matplotlib stays an optional dependency.
        import matplotlib.pyplot as plt

        # Close the figure left over from the previous call. Without this a
        # new figure (and window) is created per screenshot and never
        # released, so they pile up over a long session. Closing a label
        # that does not exist yet is a no-op.
        plt.close(figure_label)
        plt.figure(figure_label, figsize=(10, 10))
        plt.imshow(image)
        plt.axis('off')
        plt.show(block=False)  # Non-blocking display
        plt.pause(0.5)  # Short pause to render
    except ImportError:
        # Fall back to writing to a file in the home directory - doesn't create popups
        temp_path = os.path.join(os.path.expanduser("~"), "temp_screenshot.png")
        image.save(temp_path)
        print(f"Screenshot saved to {temp_path}")


def calculate_image_dimensions(base_64_image):
Expand Down