-
Notifications
You must be signed in to change notification settings - Fork 373
Feat/windows command #108
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Feat/windows command #108
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,11 +66,13 @@ def __init__( | |
| browser_computer: Computer, | ||
| query: str, | ||
| model_name: str, | ||
| environment: str = "playwright", | ||
| verbose: bool = True, | ||
| ): | ||
| self._browser_computer = browser_computer | ||
| self._query = query | ||
| self._model_name = model_name | ||
| self._environment = environment | ||
| self._verbose = verbose | ||
| self.final_reasoning = None | ||
| self._client = genai.Client( | ||
|
|
@@ -90,6 +92,45 @@ def __init__( | |
|
|
||
| # Exclude any predefined functions here. | ||
| excluded_predefined_functions = [] | ||
| if self._environment in ("desktop", "desktop_win"): | ||
| excluded_predefined_functions = [ | ||
| "open_web_browser", | ||
| "search", | ||
| "navigate", | ||
| ] | ||
| system_instruction = None | ||
| if self._environment == "desktop": | ||
| system_instruction = ( | ||
| "You are controlling a local desktop GUI (macOS). " | ||
| "Do not type into the current active app unless you have explicitly focused the correct input field. " | ||
| "To open apps, first open Spotlight with Command+Space using key_combination, " | ||
| "then type the app name and press Enter without clicking in other windows. " | ||
| "Prefer keyboard shortcuts to switch apps instead of typing into arbitrary windows." | ||
| ) | ||
| search_intent = any( | ||
| token in self._query.lower() | ||
| for token in ("find", "search", "look for", "locate") | ||
| ) | ||
| if search_intent: | ||
| system_instruction += ( | ||
| " If the user asks to find or search for something, open Spotlight first." | ||
| ) | ||
| elif self._environment == "desktop_win": | ||
| system_instruction = ( | ||
| "You are controlling a local desktop GUI (Windows). " | ||
| "Do not type into the current active app unless you have explicitly focused the correct input field. " | ||
| "To open apps, first open Start/Search with Win or Win+S using key_combination, " | ||
| "then type the app name and press Enter without clicking in other windows. " | ||
| "Prefer keyboard shortcuts to switch apps instead of typing into arbitrary windows." | ||
| ) | ||
| search_intent = any( | ||
| token in self._query.lower() | ||
| for token in ("find", "search", "look for", "locate") | ||
| ) | ||
| if search_intent: | ||
| system_instruction += ( | ||
| " If the user asks to find or search for something, open Start/Search first." | ||
| ) | ||
|
Comment on lines
+101
to
+133
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The introduction of desktop automation capabilities (macOS and Windows) via […] To mitigate the RCE risk, consider: […]
Additionally, refactoring the duplicated logic for […] |
||
|
|
||
| # Add your own custom functions here. | ||
| custom_functions = [ | ||
|
|
@@ -104,6 +145,7 @@ def __init__( | |
| top_p=0.95, | ||
| top_k=40, | ||
| max_output_tokens=8192, | ||
| system_instruction=system_instruction, | ||
| tools=[ | ||
| types.Tool( | ||
| computer_use=types.ComputerUse( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,199 @@ | ||
| import io | ||
| import sys | ||
| import time | ||
| import webbrowser | ||
| from typing import Literal | ||
|
|
||
| import pyautogui | ||
|
|
||
| from ..computer import Computer, EnvState | ||
|
|
||
|
|
||
| class DesktopComputer(Computer): | ||
| """Controls the local desktop using OS-level input automation.""" | ||
|
|
||
def __init__(
    self,
    screen_size: tuple[int, int],
    initial_url: str = "https://www.google.com",
    search_engine_url: str = "https://www.google.com",
):
    """Store the URL defaults and record the display resolution.

    Args:
        screen_size: Accepted but never read in this constructor; the size
            reported by ``pyautogui.size()`` is stored instead.
        initial_url: URL opened by ``open_web_browser``.
        search_engine_url: URL passed to ``navigate`` by ``search``.
    """
    self._initial_url = initial_url
    self._search_engine_url = search_engine_url
    display = pyautogui.size()
    self._screen_size = (display.width, display.height)
    # Nothing has been opened yet, so there is no URL to report.
    self._current_url = ""
    # Raised by key_combination after Command+Space on macOS and consumed
    # by type_text_at.
    self._spotlight_pending = False
|
|
||
def __enter__(self):
    """Enter the context manager; no setup is done and the instance is returned."""
    return self
|
|
||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave the context manager without cleanup.

    Returns ``None``, so an exception raised inside the ``with`` body is
    not suppressed.
    """
    return None
|
|
||
def screen_size(self) -> tuple[int, int]:
    """Return the stored ``(width, height)`` pair held in ``_screen_size``."""
    return self._screen_size
|
|
||
def open_web_browser(self) -> EnvState:
    """Open the initial URL with ``webbrowser.open`` and capture the screen.

    Returns:
        The state captured after a one-second pause.
    """
    target = self._initial_url
    webbrowser.open(target)
    self._current_url = target
    # Pause briefly before the screenshot is taken.
    time.sleep(1)
    return self.current_state()
|
|
||
def click_at(self, x: int, y: int) -> EnvState:
    """Click at ``(x, y)`` via ``pyautogui.click`` and return the new state."""
    pyautogui.click(x, y)
    return self.current_state()
|
|
||
def hover_at(self, x: int, y: int) -> EnvState:
    """Move the pointer to ``(x, y)`` without clicking and return the new state."""
    pyautogui.moveTo(x, y)
    return self.current_state()
|
|
||
def type_text_at(
    self,
    x: int,
    y: int,
    text: str,
    press_enter: bool = False,
    clear_before_typing: bool = True,
) -> EnvState:
    """Type ``text`` at ``(x, y)``, optionally clearing first and pressing Enter.

    Args:
        x: Horizontal coordinate to click before typing.
        y: Vertical coordinate to click before typing.
        text: Characters sent through ``pyautogui.write``.
        press_enter: Press Enter after the text has been typed.
        clear_before_typing: Select all and press Backspace before typing.

    Returns:
        The state captured after the keystrokes were sent.
    """
    spotlight_was_pending = self._spotlight_pending
    if spotlight_was_pending:
        # Spotlight was just opened: consume the flag and skip the click.
        self._spotlight_pending = False
    if not spotlight_was_pending:
        pyautogui.click(x, y)

    # NOTE(review): the extracted source lost its indentation; the clearing
    # step is assumed to run whether or not Spotlight was pending — confirm
    # against the original file.
    if clear_before_typing:
        select_all_modifier = "command" if sys.platform == "darwin" else "ctrl"
        pyautogui.hotkey(select_all_modifier, "a")
        pyautogui.press("backspace")

    pyautogui.write(text)
    if press_enter:
        pyautogui.press("enter")
    return self.current_state()
|
|
||
def scroll_document(
    self, direction: Literal["up", "down", "left", "right"]
) -> EnvState:
    """Scroll by half the stored screen height in the given direction.

    The same amount (half of ``_screen_size[1]``) is used for horizontal
    scrolling as for vertical scrolling.

    Args:
        direction: One of ``"up"``, ``"down"``, ``"left"`` or ``"right"``.

    Returns:
        The state captured after scrolling.

    Raises:
        ValueError: If ``direction`` is not one of the four supported values.
    """
    scroll_amount = self._screen_size[1] // 2
    if direction == "up":
        pyautogui.scroll(scroll_amount)
    elif direction == "down":
        pyautogui.scroll(-scroll_amount)
    elif direction == "left":
        pyautogui.hscroll(-scroll_amount)
    elif direction == "right":
        pyautogui.hscroll(scroll_amount)
    else:
        # Build one message string: passing the label and the value as two
        # positional arguments made str(exc) render as a tuple.
        raise ValueError(f"Unsupported direction: {direction}")
    return self.current_state()
|
|
||
def scroll_at(
    self,
    x: int,
    y: int,
    direction: Literal["up", "down", "left", "right"],
    magnitude: int = 800,
) -> EnvState:
    """Move the pointer to ``(x, y)`` and scroll there by ``magnitude``.

    Args:
        x: Horizontal coordinate to move the pointer to.
        y: Vertical coordinate to move the pointer to.
        direction: One of ``"up"``, ``"down"``, ``"left"`` or ``"right"``.
        magnitude: Amount handed to ``pyautogui.scroll`` / ``hscroll``;
            negated for ``"down"`` and ``"left"``.

    Returns:
        The state captured after scrolling.

    Raises:
        ValueError: If ``direction`` is not one of the four supported values.
    """
    # Reject a bad direction before touching the pointer so a failed call
    # has no side effects. The message is one formatted string; two
    # positional arguments made str(exc) render as a tuple.
    if direction not in ("up", "down", "left", "right"):
        raise ValueError(f"Unsupported direction: {direction}")

    pyautogui.moveTo(x, y)
    if direction == "up":
        pyautogui.scroll(magnitude)
    elif direction == "down":
        pyautogui.scroll(-magnitude)
    elif direction == "left":
        pyautogui.hscroll(-magnitude)
    else:
        pyautogui.hscroll(magnitude)
    return self.current_state()
|
|
||
def wait_5_seconds(self) -> EnvState:
    """Sleep for five seconds, then return a fresh state capture."""
    pause_seconds = 5
    time.sleep(pause_seconds)
    return self.current_state()
|
|
||
def go_back(self) -> EnvState:
    """Send the back shortcut: Command+[ on macOS, Alt+Left elsewhere."""
    shortcut = ("command", "[") if sys.platform == "darwin" else ("alt", "left")
    pyautogui.hotkey(*shortcut)
    return self.current_state()
|
|
||
def go_forward(self) -> EnvState:
    """Send the forward shortcut: Command+] on macOS, Alt+Right elsewhere."""
    shortcut = ("command", "]") if sys.platform == "darwin" else ("alt", "right")
    pyautogui.hotkey(*shortcut)
    return self.current_state()
|
|
||
def search(self) -> EnvState:
    """Navigate to the configured search engine URL and return the result."""
    search_url = self._search_engine_url
    return self.navigate(search_url)
|
|
||
def navigate(self, url: str) -> EnvState:
    """Type ``url`` after a Command+L / Ctrl+L shortcut and press Enter.

    ``https://`` is prepended when ``url`` does not already start with
    ``http://`` or ``https://``. The shortcut presumably focuses the
    browser's address bar — this depends on which application is in front.

    Args:
        url: Destination, with or without a scheme.

    Returns:
        The state captured one second after Enter was pressed.
    """
    has_scheme = url.startswith(("http://", "https://"))
    normalized_url = url if has_scheme else "https://" + url

    focus_modifier = "command" if sys.platform == "darwin" else "ctrl"
    pyautogui.hotkey(focus_modifier, "l")
    pyautogui.write(normalized_url)
    pyautogui.press("enter")

    self._current_url = normalized_url
    # Pause briefly before the screenshot is taken.
    time.sleep(1)
    return self.current_state()
|
|
||
def key_combination(self, keys: list[str]) -> EnvState:
    """Press one key, or several keys together as a hotkey.

    Each entry is first passed through ``_normalize_key``. On macOS,
    Command+Space additionally sets ``_spotlight_pending`` so that the next
    ``type_text_at`` call skips its click.

    Args:
        keys: Key names to press; one entry is sent with ``pyautogui.press``,
            any other count with ``pyautogui.hotkey``.

    Returns:
        The state captured after the keys were sent.
    """
    normalized_keys = list(map(self._normalize_key, keys))
    if len(normalized_keys) != 1:
        pyautogui.hotkey(*normalized_keys)
    else:
        pyautogui.press(normalized_keys[0])

    opened_spotlight = (
        sys.platform == "darwin" and normalized_keys == ["command", "space"]
    )
    if opened_spotlight:
        self._spotlight_pending = True
        # NOTE(review): the extracted source lost its indentation; this
        # short pause is assumed to belong to the Spotlight branch only —
        # confirm against the original file.
        time.sleep(0.2)
    return self.current_state()
|
|
||
def drag_and_drop(
    self, x: int, y: int, destination_x: int, destination_y: int
) -> EnvState:
    """Drag with the left button from ``(x, y)`` to the destination point.

    The pointer is first moved to the start position, then
    ``pyautogui.dragTo`` performs the drag.

    Returns:
        The state captured after the drag.
    """
    start = (x, y)
    pyautogui.moveTo(*start)
    pyautogui.dragTo(destination_x, destination_y, button="left")
    return self.current_state()
|
Comment on lines
+51
to
+161
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The […] Consider restricting the tool's capabilities to the minimum necessary for the intended task and ensuring that all actions are performed in a secure, isolated environment. |
||
|
|
||
def current_state(self) -> EnvState:
    """Capture the screen as PNG bytes and pair it with the tracked URL.

    Returns:
        An ``EnvState`` whose ``screenshot`` is the PNG-encoded capture and
        whose ``url`` is the current value of ``_current_url``.
    """
    capture = pyautogui.screenshot()
    with io.BytesIO() as png_stream:
        capture.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()
    return EnvState(screenshot=png_bytes, url=self._current_url)
|
|
||
def _normalize_key(self, key: str) -> str:
    """Map a key name onto the spelling passed to ``pyautogui``.

    The name is stripped and lower-cased first. ``controlormeta``, ``meta``
    and ``command`` become ``command`` on macOS and ``ctrl`` on every other
    platform. Names without a known alias are returned in their stripped,
    lower-cased form.

    Args:
        key: Key name as received from the caller.

    Returns:
        The normalized key name.
    """
    name = key.strip().lower()

    # The only platform-dependent mapping.
    if name in ("controlormeta", "meta", "command"):
        return "command" if sys.platform == "darwin" else "ctrl"

    # Only spellings that differ from their target are listed; canonical
    # names such as "ctrl" or "enter" fall through unchanged.
    aliases = {
        "control": "ctrl",
        "option": "alt",
        "return": "enter",
        "escape": "esc",
        "spacebar": "space",
        "page_up": "pageup",
        "page_down": "pagedown",
        "arrowleft": "left",
        "arrowright": "right",
        "arrowup": "up",
        "arrowdown": "down",
    }
    return aliases.get(name, name)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The documentation for the
`--env` argument is missing its default value. The table shows "N/A", but the code in `main.py` specifies `default="playwright"`. To improve clarity for users, the default value should be documented here. References