google-gemini · SiswoHandoko · Feb 10, 2026 · Feb 10, 2026 · gemini-code-assist · Feb 10, 2026
diff --git a/README.md b/README.md
@@ -32,6 +32,7 @@ playwright install chrome
 ```
 
 ### 2. Configuration
+
 You can get started using either the Gemini Developer API or Vertex AI.
 
 #### A. If using the Gemini Developer API:
@@ -90,10 +91,12 @@ python main.py --query "Go to Google and type 'Hello World' into the search bar"
 
 **Available Environments:**
 
-You can specify a particular environment with the ```--env <environment>``` flag.  Available options:
+You can specify a particular environment with the `--env <environment>` flag. Available options:
 
 - `playwright`: Runs the browser locally using Playwright.
 - `browserbase`: Connects to a Browserbase instance.
+- `desktop`: Controls the local desktop (primarily macOS) using OS-level mouse/keyboard automation (experimental).
+- `desktop_win`: Controls the local desktop on Windows using OS-level mouse/keyboard automation (experimental).
 
 **Local Playwright**
 
@@ -117,9 +120,25 @@ Runs the agent using Browserbase as the browser backend. Ensure the proper Brows
 python main.py --query="Go to Google and type 'Hello World' into the search bar" --env="browserbase"
 ```
 
+**Local Desktop (experimental)**
+
+Runs the agent against your local desktop using OS-level mouse/keyboard automation. On macOS, you must grant Accessibility and Screen Recording permissions to your terminal app.
+
+```bash
+python main.py --query="Search for Markdown files on my desktop" --env="desktop"
+```
+
+**Local Desktop Windows (experimental)**
+
+Runs the agent against your local Windows desktop using OS-level mouse/keyboard automation.
+
+```bash
+python main.py --query="Open Notepad and type hello" --env="desktop_win"
+```
+
 **Available Models:**
 
-You can choose the model to use by specifying the ```--model <model name>``` flag. Available options on Gemini Developer API and Vertex AI Client:
+You can choose the model to use by specifying the `--model <model name>` flag. Available options on Gemini Developer API and Vertex AI Client:
 
 - `gemini-2.5-computer-use-preview-10-2025`: This is the default model.
 
@@ -134,21 +153,21 @@ The `main.py` script is the command-line interface (CLI) for running the browser
 
 ### Command-Line Arguments
 
-| Argument | Description | Required | Default | Supported Environment(s) |
-|-|-|-|-|-|
-| `--query` | The natural language query for the browser agent to execute. | Yes | N/A | All |
-| `--env` | The computer use environment to use. Must be one of the following: `playwright`, or `browserbase` | No | N/A | All |
-| `--initial_url` | The initial URL to load when the browser starts. | No | https://www.google.com | All |
-| `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No | False (not highlighted) | `playwright` |
-| `--model` | The model to use. See the "Available Models" section for more information. | No | `gemini-2.5-computer-use-preview-10-2025` | All |
+| Argument            | Description                                                                                                                            | Required | Default                                   | Supported Environment(s) |
+| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | -------- | ----------------------------------------- | ------------------------ |
+| `--query`           | The natural language query for the browser agent to execute.                                                                           | Yes      | N/A                                       | All                      |
+| `--env`             | The computer use environment to use. Must be one of the following: `playwright`, `browserbase`, `desktop`, or `desktop_win`             | No       | `playwright`                              | All                      |
+| `--initial_url`     | The initial URL to load when the browser starts.                                                                                       | No       | https://www.google.com                    | All                      |
+| `--highlight_mouse` | If specified, the agent will attempt to highlight the mouse cursor's position in the screenshots. This is useful for visual debugging. | No       | False (not highlighted)                   | `playwright`             |
+| `--model`           | The model to use. See the "Available Models" section for more information.                                                             | No       | `gemini-2.5-computer-use-preview-10-2025` | All                      |
 
 ### Environment Variables
 
-| Variable | Description | Required |
-|-|-|-|
-| GEMINI_API_KEY | Your API key for the Gemini model. | Yes |
-| BROWSERBASE_API_KEY | Your API key for Browserbase. | Yes (when using the browserbase environment) |
-| BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase. | Yes (when using the browserbase environment) |
+| Variable               | Description                        | Required                                     |
+| ---------------------- | ---------------------------------- | -------------------------------------------- |
+| GEMINI_API_KEY         | Your API key for the Gemini model. | Yes                                          |
+| BROWSERBASE_API_KEY    | Your API key for Browserbase.      | Yes (when using the browserbase environment) |
+| BROWSERBASE_PROJECT_ID | Your Project ID for Browserbase.   | Yes (when using the browserbase environment) |
 
 ## Known Issues
 

diff --git a/agent.py b/agent.py
@@ -66,11 +66,13 @@ def __init__(
         browser_computer: Computer,
         query: str,
         model_name: str,
+        environment: str = "playwright",
         verbose: bool = True,
     ):
         self._browser_computer = browser_computer
         self._query = query
         self._model_name = model_name
+        self._environment = environment
         self._verbose = verbose
         self.final_reasoning = None
         self._client = genai.Client(
@@ -90,6 +92,45 @@ def __init__(
 
         # Exclude any predefined functions here.
         excluded_predefined_functions = []
+        if self._environment in ("desktop", "desktop_win"):
+            excluded_predefined_functions = [
+                "open_web_browser",
+                "search",
+                "navigate",
+            ]
+        system_instruction = None
+        if self._environment == "desktop":
+            system_instruction = (
+                "You are controlling a local desktop GUI (macOS). "
+                "Do not type into the current active app unless you have explicitly focused the correct input field. "
+                "To open apps, first open Spotlight with Command+Space using key_combination, "
+                "then type the app name and press Enter without clicking in other windows. "
+                "Prefer keyboard shortcuts to switch apps instead of typing into arbitrary windows."
+            )
+            search_intent = any(
+                token in self._query.lower()
+                for token in ("find", "search", "look for", "locate")
+            )
+            if search_intent:
+                system_instruction += (
+                    " If the user asks to find or search for something, open Spotlight first."
+                )
+        elif self._environment == "desktop_win":
+            system_instruction = (
+                "You are controlling a local desktop GUI (Windows). "
+                "Do not type into the current active app unless you have explicitly focused the correct input field. "
+                "To open apps, first open Start/Search with Win or Win+S using key_combination, "
+                "then type the app name and press Enter without clicking in other windows. "
+                "Prefer keyboard shortcuts to switch apps instead of typing into arbitrary windows."
+            )
+            search_intent = any(
+                token in self._query.lower()
+                for token in ("find", "search", "look for", "locate")
+            )
+            if search_intent:
+                system_instruction += (
+                    " If the user asks to find or search for something, open Start/Search first."
+                )
 
         # Add your own custom functions here.
         custom_functions = [
@@ -104,6 +145,7 @@ def __init__(
             top_p=0.95,
             top_k=40,
             max_output_tokens=8192,
+            system_instruction=system_instruction,
             tools=[
                 types.Tool(
                     computer_use=types.ComputerUse(

diff --git a/computers/__init__.py b/computers/__init__.py
@@ -13,11 +13,15 @@
 # limitations under the License.
 from .computer import Computer, EnvState
 from .browserbase.browserbase import BrowserbaseComputer
+from .desktop.desktop import DesktopComputer
+from .desktop.desktop_win import DesktopComputer as DesktopWindowsComputer
 from .playwright.playwright import PlaywrightComputer
 
 __all__ = [
     "Computer",
     "EnvState",
     "BrowserbaseComputer",
+    "DesktopComputer",
+    "DesktopWindowsComputer",
     "PlaywrightComputer",
 ]
diff --git a/computers/desktop/desktop.py b/computers/desktop/desktop.py
@@ -0,0 +1,254 @@
+import io
+import sys
+import time
+import webbrowser
+from typing import Literal
+
+import pyautogui
+
+from ..computer import Computer, EnvState
+
+
+class DesktopComputer(Computer):
+    """Controls the local desktop using OS-level input automation.
+
+    Each action drives the real mouse and keyboard through ``pyautogui``
+    and returns an ``EnvState`` with a fresh screenshot. The reported URL
+    is only the last one this object opened itself, not one read back
+    from a browser.
+    """
+
+    # Non-identity key aliases (lower-cased) mapped to pyautogui key names.
+    _KEY_ALIASES = {
+        "control": "ctrl",
+        "option": "alt",
+        "return": "enter",
+        "escape": "esc",
+        "spacebar": "space",
+        "page_up": "pageup",
+        "page_down": "pagedown",
+        "arrowleft": "left",
+        "arrowright": "right",
+        "arrowup": "up",
+        "arrowdown": "down",
+    }
+    # Spellings that resolve to the platform's primary shortcut modifier.
+    _PRIMARY_MODIFIER_ALIASES = ("controlormeta", "meta", "command")
+
+    def __init__(
+        self,
+        screen_size: tuple[int, int],
+        initial_url: str = "https://www.google.com",
+        search_engine_url: str = "https://www.google.com",
+    ):
+        """Record the configured URLs and measure the real screen.
+
+        Args:
+            screen_size: Ignored; the size reported by ``pyautogui.size()``
+                is stored instead.
+            initial_url: URL opened by ``open_web_browser``.
+            search_engine_url: URL loaded by ``search``.
+        """
+        self._initial_url = initial_url
+        self._search_engine_url = search_engine_url
+        size = pyautogui.size()
+        self._screen_size = (size.width, size.height)
+        self._current_url = ""
+        # Set after Command+Space on macOS; consumed by type_text_at.
+        self._spotlight_pending = False
+
+    def __enter__(self):
+        """Return ``self``; there is nothing to acquire."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Release nothing and let any exception propagate."""
+        return None
+
+    def screen_size(self) -> tuple[int, int]:
+        """Return the (width, height) measured in ``__init__``."""
+        return self._screen_size
+
+    def open_web_browser(self) -> EnvState:
+        """Open ``initial_url`` via the ``webbrowser`` module."""
+        webbrowser.open(self._initial_url)
+        self._current_url = self._initial_url
+        time.sleep(1)  # Brief pause before the screenshot is taken.
+        return self.current_state()
+
+    def click_at(self, x: int, y: int) -> EnvState:
+        """Click at screen position (x, y)."""
+        pyautogui.click(x, y)
+        return self.current_state()
+
+    def hover_at(self, x: int, y: int) -> EnvState:
+        """Move the pointer to (x, y) without clicking."""
+        pyautogui.moveTo(x, y)
+        return self.current_state()
+
+    def type_text_at(
+        self,
+        x: int,
+        y: int,
+        text: str,
+        press_enter: bool = False,
+        clear_before_typing: bool = True,
+    ) -> EnvState:
+        """Click (x, y) to focus it, then type ``text``.
+
+        The click is skipped once after Command+Space on macOS, presumably
+        so that it does not dismiss Spotlight.
+
+        Args:
+            x: Horizontal screen position to click.
+            y: Vertical screen position to click.
+            text: Characters to type.
+            press_enter: Press Enter after typing.
+            clear_before_typing: Select all and delete before typing.
+        """
+        if self._spotlight_pending:
+            self._spotlight_pending = False
+        else:
+            pyautogui.click(x, y)
+        if clear_before_typing:
+            pyautogui.hotkey(self._primary_modifier(), "a")
+            pyautogui.press("backspace")
+        # NOTE(review): pyautogui.write may skip characters it cannot map
+        # to a key (e.g. non-ASCII text) -- confirm before relying on it.
+        pyautogui.write(text)
+        if press_enter:
+            pyautogui.press("enter")
+        return self.current_state()
+
+    def scroll_document(
+        self, direction: Literal["up", "down", "left", "right"]
+    ) -> EnvState:
+        """Scroll by half the screen height without moving the pointer.
+
+        Raises:
+            ValueError: If ``direction`` is not a supported value.
+        """
+        self._scroll(direction, self._screen_size[1] // 2)
+        return self.current_state()
+
+    def scroll_at(
+        self,
+        x: int,
+        y: int,
+        direction: Literal["up", "down", "left", "right"],
+        magnitude: int = 800,
+    ) -> EnvState:
+        """Move the pointer to (x, y), then scroll by ``magnitude``.
+
+        Raises:
+            ValueError: If ``direction`` is not a supported value.
+        """
+        pyautogui.moveTo(x, y)
+        self._scroll(direction, magnitude)
+        return self.current_state()
+
+    def wait_5_seconds(self) -> EnvState:
+        """Sleep for five seconds, then capture the screen."""
+        time.sleep(5)
+        return self.current_state()
+
+    def go_back(self) -> EnvState:
+        """Send Command+[ on macOS or Alt+Left elsewhere."""
+        if sys.platform == "darwin":
+            pyautogui.hotkey("command", "[")
+        else:
+            pyautogui.hotkey("alt", "left")
+        return self.current_state()
+
+    def go_forward(self) -> EnvState:
+        """Send Command+] on macOS or Alt+Right elsewhere."""
+        if sys.platform == "darwin":
+            pyautogui.hotkey("command", "]")
+        else:
+            pyautogui.hotkey("alt", "right")
+        return self.current_state()
+
+    def search(self) -> EnvState:
+        """Navigate to the configured search engine URL."""
+        return self.navigate(self._search_engine_url)
+
+    def navigate(self, url: str) -> EnvState:
+        """Type ``url`` after sending the Command/Ctrl+L shortcut.
+
+        ``https://`` is prepended when ``url`` has no http(s) scheme. The
+        keystrokes go to whichever window has focus; presumably a browser
+        is expected to be in front -- verify against callers.
+        """
+        normalized_url = url
+        if not normalized_url.startswith(("http://", "https://")):
+            normalized_url = "https://" + normalized_url
+        pyautogui.hotkey(self._primary_modifier(), "l")
+        pyautogui.write(normalized_url)
+        pyautogui.press("enter")
+        self._current_url = normalized_url
+        time.sleep(1)  # Brief pause before the screenshot is taken.
+        return self.current_state()
+
+    def key_combination(self, keys: list[str]) -> EnvState:
+        """Press one key, or several keys together as a hotkey."""
+        normalized_keys = [self._normalize_key(key) for key in keys]
+        if len(normalized_keys) == 1:
+            pyautogui.press(normalized_keys[0])
+        else:
+            pyautogui.hotkey(*normalized_keys)
+        if sys.platform == "darwin" and normalized_keys == ["command", "space"]:
+            # Tell the next type_text_at call to skip its focusing click.
+            self._spotlight_pending = True
+            time.sleep(0.2)
+        return self.current_state()
+
+    def drag_and_drop(
+        self, x: int, y: int, destination_x: int, destination_y: int
+    ) -> EnvState:
+        """Drag with the left button from (x, y) to the destination."""
+        pyautogui.moveTo(x, y)
+        pyautogui.dragTo(destination_x, destination_y, button="left")
+        return self.current_state()
+
+    def current_state(self) -> EnvState:
+        """Return a PNG screenshot of the screen plus the tracked URL."""
+        screenshot = pyautogui.screenshot()
+        buffer = io.BytesIO()
+        screenshot.save(buffer, format="PNG")
+        return EnvState(screenshot=buffer.getvalue(), url=self._current_url)
+
+    @staticmethod
+    def _primary_modifier() -> str:
+        """Return ``"command"`` on macOS and ``"ctrl"`` elsewhere."""
+        return "command" if sys.platform == "darwin" else "ctrl"
+
+    @staticmethod
+    def _scroll(direction: str, amount: int) -> None:
+        """Send one scroll event of size ``amount`` in ``direction``.
+
+        Raises:
+            ValueError: If ``direction`` is not up, down, left or right.
+        """
+        if direction == "up":
+            pyautogui.scroll(amount)
+        elif direction == "down":
+            pyautogui.scroll(-amount)
+        elif direction == "left":
+            pyautogui.hscroll(-amount)
+        elif direction == "right":
+            pyautogui.hscroll(amount)
+        else:
+            # Interpolate the value into one message string; a second
+            # positional argument would make str(exc) render as a tuple.
+            raise ValueError(f"Unsupported direction: {direction}")
+
+    def _normalize_key(self, key: str) -> str:
+        """Map a key name to the spelling passed to pyautogui.
+
+        Matching ignores case and surrounding whitespace. Names with no
+        known alias are returned stripped and lower-cased.
+        """
+        k = key.strip().lower()
+        if k in self._PRIMARY_MODIFIER_ALIASES:
+            return self._primary_modifier()
+        return self._KEY_ALIASES.get(k, k)