diff --git a/docs/agent_implementation.md b/docs/agent_implementation.md new file mode 100644 index 0000000..5ade8bf --- /dev/null +++ b/docs/agent_implementation.md @@ -0,0 +1,290 @@ +# Agent Implementation + +## Overview + +The `Agent` class, defined in `agent/agent.py`, serves as the primary orchestrator for the interaction between: +- The user +- The OpenAI model +- The computer environment + +It manages the conversation flow, handles model responses, and routes actions to the appropriate computer implementation. + +## Class Definition + +```python +class Agent: + """ + A sample agent class that can be used to interact with a computer. + + (See simple_cua_loop.py for a simple example without an agent.) + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + ): + self.model = model + self.computer = computer + self.tools = tools + self.print_steps = True + self.debug = False + self.show_images = False + self.acknowledge_safety_check_callback = acknowledge_safety_check_callback + + if computer: + self.tools += [ + { + "type": "computer-preview", + "display_width": computer.dimensions[0], + "display_height": computer.dimensions[1], + "environment": computer.environment, + }, + ] +``` + +## Key Methods + +### `run_full_turn()` + +The `run_full_turn()` method is the main entry point for running a complete interaction turn. It: + +1. Takes the current conversation context as input +2. Calls the model to generate a response +3. Processes any actions in the response +4. 
Continues calling the model until a final response is reached + +```python +def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False +): + self.print_steps = print_steps + self.debug = debug + self.show_images = show_images + new_items = [] + + # keep looping until we get a final response + while new_items[-1].get("role") != "assistant" if new_items else True: + self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) + + response = create_response( + model=self.model, + input=input_items + new_items, + tools=self.tools, + truncation="auto", + ) + self.debug_print(response) + + if "output" not in response and self.debug: + print(response) + raise ValueError("No output from model") + else: + new_items += response["output"] + for item in response["output"]: + new_items += self.handle_item(item) + + return new_items +``` + +### `handle_item()` + +The `handle_item()` method processes individual items from the model's response: + +- For `message` items, it displays the message to the user +- For `function_call` items, it executes functions +- For `computer_call` items, it: + - Executes the specified computer action + - Takes a screenshot of the result + - Handles safety checks + - Prepares the output to send back to the model + +```python +def handle_item(self, item): + """Handle each item; may cause a computer action + screenshot.""" + if item["type"] == "message": + if self.print_steps: + print(item["content"][0]["text"]) + + if item["type"] == "function_call": + name, args = item["name"], json.loads(item["arguments"]) + if self.print_steps: + print(f"{name}({args})") + + if hasattr(self.computer, name): # if function exists on computer, call it + method = getattr(self.computer, name) + method(**args) + return [ + { + "type": "function_call_output", + "call_id": item["call_id"], + "output": "success", # hard-coded output for demo + } + ] + + if item["type"] == "computer_call": + action = item["action"] + action_type 
= action["type"] + action_args = {k: v for k, v in action.items() if k != "type"} + if self.print_steps: + print(f"{action_type}({action_args})") + + method = getattr(self.computer, action_type) + method(**action_args) + + screenshot_base64 = self.computer.screenshot() + if self.show_images: + show_image(screenshot_base64) + + # if user doesn't ack all safety checks exit with error + pending_checks = item.get("pending_safety_checks", []) + for check in pending_checks: + message = check["message"] + if not self.acknowledge_safety_check_callback(message): + raise ValueError( + f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks." + ) + + call_output = { + "type": "computer_call_output", + "call_id": item["call_id"], + "acknowledged_safety_checks": pending_checks, + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + }, + } + + # additional URL safety checks for browser environments + if self.computer.environment == "browser": + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url + + return [call_output] + return [] +``` + +## Initialization Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `model` | The OpenAI model to use | `"computer-use-preview-2025-02-04"` | +| `computer` | The Computer implementation to use | `None` | +| `tools` | A list of additional tools to provide to the model | `[]` | +| `acknowledge_safety_check_callback` | A callback function for handling safety checks | `lambda: False` | + +## Agent Workflow Diagram + +``` +┌─────────────────┐ +│ │ +│ User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ run_full_turn │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ OpenAI Model │ +│ Response │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ handle_item │ +│ │ +└────────┬────────┘ + │ + │ 
┌─────────┐ + ├────┤ message │ + │ └─────────┘ + │ + │ ┌─────────────┐ + ├────┤function_call│ + │ └─────────────┘ + │ + │ ┌──────────────┐ + └────┤computer_call │ + └───────┬──────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Computer │ + │ Action │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Screenshot │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Safety Checks │ + │ │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ │ + │ Return Output │ + │ │ + └─────────────────┘ +``` + +## Using the Agent + +The most common way to use the Agent is through the CLI, which handles the initialization and interaction loop: + +```python +with ComputerClass() as computer: + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback, + ) + items = [] + + while True: + user_input = args.input or input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn( + items, + print_steps=True, + show_images=args.show, + debug=args.debug, + ) + items += output_items + args.input = None +``` + +## Function Calling + +The Agent supports function calling through the `tools` parameter. If the model calls a function that exists on the Computer implementation, the Agent will route the call to the appropriate method. + +This is useful for extending the capabilities of the Computer implementation with custom functions that can't be expressed through standard computer actions like click or type. + +## Safety Considerations + +The Agent includes several safety measures: + +- URL blocklisting for browser-based environments +- Safety check acknowledgment for potentially risky actions +- Exception handling for failures + +The `acknowledge_safety_check_callback` parameter allows you to customize the behavior when a safety check is triggered. 
\ No newline at end of file diff --git a/docs/api_reference.md b/docs/api_reference.md new file mode 100644 index 0000000..43a5cbd --- /dev/null +++ b/docs/api_reference.md @@ -0,0 +1,366 @@ +# API Reference + +This document provides a reference for the key API components in the Computer Using Agent (CUA) Sample App. + +## Agent API + +### Agent Class + +```python +class Agent: + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + ): + """ + Initialize an Agent instance. + + Args: + model (str): The OpenAI model to use + computer (Computer): The Computer implementation to use + tools (list[dict]): Additional tools to provide to the model + acknowledge_safety_check_callback (Callable): Function to call for safety checks + """ + pass + + def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False + ): + """ + Run a full interaction turn with the model. + + Args: + input_items (list): The current conversation context + print_steps (bool): Whether to print steps during execution + debug (bool): Whether to print debug information + show_images (bool): Whether to show images during execution + + Returns: + list: The new items added to the conversation context + """ + pass + + def handle_item(self, item): + """ + Handle an item from the model's response. + + Args: + item (dict): The item to handle + + Returns: + list: Any new items to add to the conversation context + """ + pass +``` + +## Computer API + +### Computer Protocol + +```python +class Computer(Protocol): + """Defines the 'shape' (methods/properties) our loop expects.""" + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... + @property + def dimensions(self) -> tuple[int, int]: ... + + def screenshot(self) -> str: ... + + def click(self, x: int, y: int, button: str = "left") -> None: ... 
+ + def double_click(self, x: int, y: int) -> None: ... + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... + + def type(self, text: str) -> None: ... + + def wait(self, ms: int = 1000) -> None: ... + + def move(self, x: int, y: int) -> None: ... + + def keypress(self, keys: List[str]) -> None: ... + + def drag(self, path: List[Dict[str, int]]) -> None: ... + + def get_current_url() -> str: ... +``` + +### BasePlaywrightComputer + +```python +class BasePlaywrightComputer: + """ + Abstract base for Playwright-based computers. + + Attributes: + environment (Literal["browser"]): The environment type + dimensions (tuple[int, int]): The dimensions of the screen + """ + + def __enter__(self): + """ + Set up the Playwright environment. + + Returns: + self: The computer instance + """ + pass + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Clean up the Playwright environment. + """ + pass + + def get_current_url(self) -> str: + """ + Get the current URL of the page. + + Returns: + str: The current URL + """ + pass + + def screenshot(self) -> str: + """ + Capture a screenshot of the current page. + + Returns: + str: The base64-encoded screenshot + """ + pass + + def click(self, x: int, y: int, button: str = "left") -> None: + """ + Perform a mouse click at the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + button (str): The mouse button to use + """ + pass + + def double_click(self, x: int, y: int) -> None: + """ + Perform a double-click at the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + """ + pass + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + """ + Scroll the page at the specified coordinates. 
+ + Args: + x (int): The x-coordinate + y (int): The y-coordinate + scroll_x (int): The amount to scroll horizontally + scroll_y (int): The amount to scroll vertically + """ + pass + + def type(self, text: str) -> None: + """ + Type the specified text. + + Args: + text (str): The text to type + """ + pass + + def wait(self, ms: int = 1000) -> None: + """ + Wait for the specified number of milliseconds. + + Args: + ms (int): The number of milliseconds to wait + """ + pass + + def move(self, x: int, y: int) -> None: + """ + Move the mouse to the specified coordinates. + + Args: + x (int): The x-coordinate + y (int): The y-coordinate + """ + pass + + def keypress(self, keys: List[str]) -> None: + """ + Press the specified keys. + + Args: + keys (List[str]): The keys to press + """ + pass + + def drag(self, path: List[Dict[str, int]]) -> None: + """ + Perform a drag operation along the specified path. + + Args: + path (List[Dict[str, int]]): The path to drag along + """ + pass + + def goto(self, url: str) -> None: + """ + Navigate to the specified URL. + + Args: + url (str): The URL to navigate to + """ + pass + + def back(self) -> None: + """ + Navigate back in the browser history. + """ + pass + + def forward(self) -> None: + """ + Navigate forward in the browser history. + """ + pass + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + """ + Get a browser instance and page. + + Returns: + tuple[Browser, Page]: The browser and page instances + """ + raise NotImplementedError +``` + +## Utility Functions + +### create_response() + +```python +def create_response(**kwargs): + """ + Create a response from the OpenAI API. + + Args: + **kwargs: Arguments to pass to the API + + Returns: + dict: The API response + """ + pass +``` + +### show_image() + +```python +def show_image(base_64_image): + """ + Display an image from a base64-encoded string. 
+ + Args: + base_64_image (str): The base64-encoded image + """ + pass +``` + +### check_blocklisted_url() + +```python +def check_blocklisted_url(url: str) -> None: + """ + Check if a URL is in the blocklist. + + Args: + url (str): The URL to check + + Raises: + ValueError: If the URL is in the blocklist + """ + pass +``` + +### sanitize_message() + +```python +def sanitize_message(msg: dict) -> dict: + """ + Sanitize a message by omitting image_url for computer_call_output messages. + + Args: + msg (dict): The message to sanitize + + Returns: + dict: The sanitized message + """ + pass +``` + +## CLI Functions + +### acknowledge_safety_check_callback() + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + """ + Prompt the user to acknowledge a safety check. + + Args: + message (str): The safety check message + + Returns: + bool: Whether the user acknowledged the check + """ + pass +``` + +### main() + +```python +def main(): + """ + Run the CLI. + """ + pass +``` + +## Simple CUA Loop Functions + +### handle_item() + +```python +def handle_item(item, computer: Computer): + """ + Handle an item from the model's response. + + Args: + item (dict): The item to handle + computer (Computer): The Computer implementation to use + + Returns: + list: Any new items to add to the conversation context + """ + pass +``` + +### main() + +```python +def main(): + """ + Run the simple CUA loop. + """ + pass +``` \ No newline at end of file diff --git a/docs/assets/class_diagram.md b/docs/assets/class_diagram.md new file mode 100644 index 0000000..1d967e0 --- /dev/null +++ b/docs/assets/class_diagram.md @@ -0,0 +1,82 @@ +# Class Diagram + +The following diagram illustrates the class structure and relationships in Octotools. 
+ +```mermaid +classDiagram + class NotebookAnalyzer { + +analyze(notebook_path: str) Dict + -_load_notebook(notebook_path: str) Dict + -_analyze_cells(cells: List) Dict + -_generate_metrics(cell_data: Dict) Dict + } + + class CellAnalyzer { + +analyze_cell(cell: Dict) Dict + -_extract_imports(code: str) List + -_measure_complexity(code: str) int + -_detect_patterns(code: str) List + } + + class NotebookVisualizer { + +create_visualization(notebook_path: str, output_path: str) None + -_generate_graph(metrics: Dict) Figure + -_save_visualization(figure: Figure, output_path: str) None + } + + class CLI { + +run() None + -_parse_args() Namespace + -_process_command(command: str, args: Namespace) None + } + + class API { + +analyze_notebook(notebook_path: str) Dict + +visualize_notebook(notebook_path: str, output_path: str) None + +batch_analyze(notebook_paths: List[str]) Dict + } + + class Utils { + +load_config() Dict + +setup_logging() Logger + +format_output(results: Dict) str + } + + NotebookAnalyzer --> CellAnalyzer : uses + NotebookVisualizer --> NotebookAnalyzer : uses + API --> NotebookAnalyzer : uses + API --> NotebookVisualizer : uses + CLI --> API : uses + NotebookAnalyzer --> Utils : uses + NotebookVisualizer --> Utils : uses +``` + +## Component Relationships + +The diagram above illustrates the key classes and their relationships: + +1. **NotebookAnalyzer** - Core class responsible for analyzing Jupyter notebooks +2. **CellAnalyzer** - Analyzes individual cells within a notebook +3. **NotebookVisualizer** - Creates visualizations based on notebook analysis +4. **CLI** - Command-line interface for the tool +5. **API** - Programmatic interface for the tool +6. 
**Utils** - Utility functions used by multiple components + +## Key Relationships + +- **NotebookAnalyzer** uses **CellAnalyzer** to analyze individual cells +- **NotebookVisualizer** uses **NotebookAnalyzer** to get data for visualizations +- **API** uses both **NotebookAnalyzer** and **NotebookVisualizer** +- **CLI** uses the **API** to provide command-line functionality +- **Utils** provides common functionality to several components + +## Design Patterns + +The codebase leverages several design patterns: + +1. **Facade Pattern** - The API class provides a simplified interface to the complex subsystem +2. **Strategy Pattern** - Different analysis strategies can be used by the NotebookAnalyzer +3. **Command Pattern** - The CLI uses commands to trigger different functionalities +4. **Singleton Pattern** - The Utils class provides global access to configuration +5. **Composite Pattern** - Notebooks and cells form a composite structure + \ No newline at end of file diff --git a/docs/assets/process_flow.md b/docs/assets/process_flow.md new file mode 100644 index 0000000..fa4c0b8 --- /dev/null +++ b/docs/assets/process_flow.md @@ -0,0 +1,101 @@ +# Process Flow Diagrams + +This document contains flow diagrams illustrating the key processes in Octotools. + +## Notebook Analysis Flow + +The following diagram shows the process flow for analyzing a notebook. 
+ +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + participant CellAnalyzer + participant Utils + + User->>CLI: Run analyze command + CLI->>API: Call analyze_notebook() + API->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer->>Utils: load_config() + Utils-->>NotebookAnalyzer: Return config + NotebookAnalyzer->>NotebookAnalyzer: _load_notebook() + loop For each cell + NotebookAnalyzer->>CellAnalyzer: analyze_cell(cell) + CellAnalyzer-->>NotebookAnalyzer: Return cell metrics + end + NotebookAnalyzer->>NotebookAnalyzer: _generate_metrics() + NotebookAnalyzer-->>API: Return analysis results + API->>Utils: format_output(results) + Utils-->>API: Return formatted results + API-->>CLI: Return formatted results + CLI-->>User: Display results +``` + +## Visualization Flow + +The following diagram shows the process flow for creating visualizations. + +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + participant NotebookVisualizer + + User->>CLI: Run visualize command + CLI->>API: Call visualize_notebook() + API->>NotebookVisualizer: create_visualization(notebook_path, output_path) + NotebookVisualizer->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer-->>NotebookVisualizer: Return analysis results + NotebookVisualizer->>NotebookVisualizer: _generate_graph() + NotebookVisualizer->>NotebookVisualizer: _save_visualization() + NotebookVisualizer-->>API: Return success + API-->>CLI: Return success + CLI-->>User: Display success message +``` + +## Batch Analysis Flow + +The following diagram shows the process flow for batch analysis of multiple notebooks. 
+ +```mermaid +sequenceDiagram + participant User + participant CLI + participant API + participant NotebookAnalyzer + + User->>CLI: Run batch command + CLI->>API: Call batch_analyze(notebook_paths) + + loop For each notebook + API->>NotebookAnalyzer: analyze(notebook_path) + NotebookAnalyzer-->>API: Return analysis results + API->>API: Aggregate results + end + + API-->>CLI: Return aggregated results + CLI-->>User: Display aggregated results +``` + +## Overall System Architecture + +The following diagram shows the overall system architecture. + +```mermaid +graph TD + User[User] -->|Uses| CLI[Command Line Interface] + User -->|Uses| PythonAPI[Python API] + CLI -->|Calls| Core[Core Library] + PythonAPI -->|Calls| Core + Core -->|Contains| Analyzer[Notebook Analyzer] + Core -->|Contains| Visualizer[Notebook Visualizer] + Core -->|Contains| Utils[Utilities] + Analyzer -->|Analyzes| Notebooks[Jupyter Notebooks] + Visualizer -->|Creates| Reports[Reports & Visualizations] + Utils -->|Supports| Analyzer + Utils -->|Supports| Visualizer +``` \ No newline at end of file diff --git a/docs/cli_usage.md b/docs/cli_usage.md new file mode 100644 index 0000000..f5147f3 --- /dev/null +++ b/docs/cli_usage.md @@ -0,0 +1,227 @@ +# CLI Usage Guide + +The Command Line Interface (CLI) provides an easy way to interact with the Computer Using Agent (CUA) system. It allows you to select different computer environments, configure execution parameters, and start an interactive session with the agent. + +## Basic Usage + +The basic command to run the CLI is: + +```bash +python cli.py +``` + +This will start an interactive session with the default settings (local Playwright browser environment). 
+ +## Command Line Arguments + +The CLI supports several command-line arguments to customize its behavior: + +| Argument | Description | Default | +|----------|-------------|---------| +| `--computer` | The computer environment to use | `local-playwright` | +| `--input` | Initial input to the agent (optional) | None | +| `--debug` | Enable debug mode | False | +| `--show` | Show images (screenshots) during execution | False | +| `--start-url` | Starting URL for browser environments | `https://bing.com` | + +### Example Usage + +Using a different computer environment: + +```bash +python cli.py --computer docker +``` + +Providing an initial input: + +```bash +python cli.py --input "Search for information about climate change" +``` + +Enabling debug mode: + +```bash +python cli.py --debug +``` + +Showing images during execution: + +```bash +python cli.py --show +``` + +Specifying a start URL: + +```bash +python cli.py --start-url "https://www.google.com" +``` + +Combining multiple arguments: + +```bash +python cli.py --computer local-playwright --show --debug --start-url "https://www.wikipedia.org" +``` + +## Available Computer Environments + +The CLI supports several computer environments, each with its own requirements and characteristics. + +| Environment Option | Description | Type | Requirements | +|--------------------|-------------|------|-------------| +| `local-playwright` | Local browser window | Browser | Playwright SDK | +| `docker` | Docker container environment | Linux | Docker running | +| `browserbase` | Remote browser environment | Browser | Browserbase API key in `.env` | +| `scrapybara-browser` | Remote browser environment | Browser | Scrapybara API key in `.env` | +| `scrapybara-ubuntu` | Remote Ubuntu desktop | Linux | Scrapybara API key in `.env` | + +## Implementation Details + +The CLI is implemented in `cli.py`. 
Here's an overview of the key components: + +### Safety Check Callback + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + response = input( + f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): " + ).lower() + return response.lower().strip() == "y" +``` + +This function is called when the agent encounters a safety check. It displays the safety warning message and asks the user if they want to proceed. + +### Main Function + +```python +def main(): + parser = argparse.ArgumentParser( + description="Select a computer environment from the available options." + ) + parser.add_argument( + "--computer", + choices=[ + "local-playwright", + "docker", + "browserbase", + "scrapybara-browser", + "scrapybara-ubuntu", + ], + help="Choose the computer environment to use.", + default="local-playwright", + ) + # ...other arguments... + args = parser.parse_args() + + computer_mapping = { + "local-playwright": LocalPlaywrightComputer, + "docker": DockerComputer, + "browserbase": BrowserbaseBrowser, + "scrapybara-browser": ScrapybaraBrowser, + "scrapybara-ubuntu": ScrapybaraUbuntu, + } + + ComputerClass = computer_mapping[args.computer] + + with ComputerClass() as computer: + agent = Agent( + computer=computer, + acknowledge_safety_check_callback=acknowledge_safety_check_callback, + ) + items = [] + + while True: + user_input = args.input or input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn( + items, + print_steps=True, + show_images=args.show, + debug=args.debug, + ) + items += output_items + args.input = None +``` + +The main function: +1. Parses command-line arguments +2. Maps the selected computer environment to the appropriate class +3. Creates an instance of the selected Computer class +4. Creates an Agent with the computer instance +5. 
Enters the main interaction loop, where it: + - Gets user input (or uses the provided initial input) + - Adds the input to the conversation context + - Runs a full turn of the agent + - Adds the agent's output to the conversation context + - Resets the initial input (so it's only used once) + +## Interaction Flow + +``` +┌─────────────────┐ +│ │ +│ Parse Command │ +│ Line Arguments │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Computer │ +│ Environment │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Create Agent │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│ Get User Input │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│Run Agent Full │ +│ Turn │ +│ │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ │ +│Update Convo │ +│Context │ +│ │ +└────────┬────────┘ + │ + └─────────────┐ + │ + ▼ + ┌───────┐ + │ Loop │ + └───────┘ +``` + +## Error Handling + +The CLI includes basic error handling: +- If the model returns an error, it's displayed to the user; in debug mode (`--debug`), a response with no `output` is printed and `run_full_turn` raises `ValueError("No output from model")` +- If a safety check fails, the program raises a ValueError with the safety message +- The context manager pattern (`with ComputerClass() as computer:`) ensures proper cleanup of computer environment resources, even in case of errors + +## Extending the CLI + +To add a new computer environment to the CLI: + +1. Implement the Computer protocol in a new class +2. Add your class to the `computers/__init__.py` file +3. Add your environment option to the `--computer` argument choices +4. 
Add your class to the `computer_mapping` dictionary \ No newline at end of file diff --git a/docs/computer_implementations.md b/docs/computer_implementations.md new file mode 100644 index 0000000..bb4b59f --- /dev/null +++ b/docs/computer_implementations.md @@ -0,0 +1,177 @@ +# Computer Protocol and Implementations + +## Computer Protocol + +The `Computer` protocol, defined in `computers/computer.py`, specifies the interface that all computer environment implementations must adhere to. It defines a set of methods for interacting with a computer environment, which could be a local browser, a remote browser, or a desktop environment. + +### Core Interface + +```python +class Computer(Protocol): + """Defines the 'shape' (methods/properties) our loop expects.""" + + @property + def environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... + @property + def dimensions(self) -> tuple[int, int]: ... + + def screenshot(self) -> str: ... + + def click(self, x: int, y: int, button: str = "left") -> None: ... + + def double_click(self, x: int, y: int) -> None: ... + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... + + def type(self, text: str) -> None: ... + + def wait(self, ms: int = 1000) -> None: ... + + def move(self, x: int, y: int) -> None: ... + + def keypress(self, keys: List[str]) -> None: ... + + def drag(self, path: List[Dict[str, int]]) -> None: ... + + def get_current_url() -> str: ... 
+``` + +### Required Methods + +| Method | Description | Parameters | +|--------|-------------|------------| +| `screenshot()` | Captures and returns a base64-encoded image of the current screen | None | +| `click()` | Performs a mouse click at the specified coordinates | `x`, `y`, `button` | +| `double_click()` | Performs a double-click at the specified coordinates | `x`, `y` | +| `scroll()` | Scrolls the screen at the specified coordinates | `x`, `y`, `scroll_x`, `scroll_y` | +| `type()` | Types the specified text | `text` | +| `wait()` | Waits for the specified number of milliseconds | `ms` | +| `move()` | Moves the mouse to the specified coordinates | `x`, `y` | +| `keypress()` | Presses the specified keys | `keys` | +| `drag()` | Performs a drag operation along the specified path | `path` | +| `get_current_url()` | Returns the current URL (for browser environments) | None | + +### Required Properties + +| Property | Description | Type | +|----------|-------------|------| +| `environment` | Specifies the type of environment ("windows", "mac", "linux", "browser") | `Literal["windows", "mac", "linux", "browser"]` | +| `dimensions` | The dimensions of the screen (width, height) | `tuple[int, int]` | + +## Computer Implementations + +The repository includes several computer implementations, each designed to work with a different environment. + +### BasePlaywrightComputer + +The `BasePlaywrightComputer` class, defined in `computers/base_playwright.py`, serves as an abstract base class for Playwright-based computer implementations. It implements all the required methods of the `Computer` protocol, but leaves the actual browser/page connection to be implemented by subclasses. 
+ +Key features: + +- Context management with `__enter__` and `__exit__` methods +- Network interception for security (blocking requests to suspicious domains) +- Implementation of all standard Computer actions +- Extra browser-specific actions like `goto()`, `back()`, and `forward()` + +### LocalPlaywrightComputer + +The `LocalPlaywrightComputer` class, defined in `computers/local_playwright.py`, extends `BasePlaywrightComputer` to use a local Chromium instance via Playwright. + +```python +class LocalPlaywrightComputer(BasePlaywrightComputer): + """Launches a local Chromium instance using Playwright.""" + + def __init__(self, headless: bool = False): + super().__init__() + self.headless = headless + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + width, height = self.dimensions + launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-file-system" + ] + browser = self._playwright.chromium.launch( + chromium_sandbox=True, + headless=self.headless, + args=launch_args, + env={} + ) + page = browser.new_page() + page.set_viewport_size({"width": width, "height": height}) + page.goto("https://bing.com") + return browser, page +``` + +### DockerComputer + +The `DockerComputer` class connects to a Docker container running a VNC server, providing a Linux desktop environment. + +Key features: +- Connects to a VNC server running in Docker +- Uses PyVNC for VNC interaction +- Implements all standard Computer actions in a Linux desktop context + +### BrowserbaseBrowser + +The `BrowserbaseBrowser` class connects to the Browserbase API, a service that provides remote browser environments. + +Key features: +- Creates and connects to a remote browser session +- Uses the Browserbase API for interaction +- Requires a Browserbase API key + +### ScrapybaraBrowser + +The `ScrapybaraBrowser` class connects to the Scrapybara API, which provides remote browser environments. 
+ +Key features: +- Creates and connects to a remote browser session +- Uses the Scrapybara API for interaction +- Requires a Scrapybara API key + +### ScrapybaraUbuntu + +The `ScrapybaraUbuntu` class connects to the Scrapybara API, but uses a remote Ubuntu desktop environment instead of a browser. + +Key features: +- Creates and connects to a remote Ubuntu desktop session +- Uses the Scrapybara API for interaction +- Requires a Scrapybara API key + +## Extending with Custom Computer Implementations + +You can create your own Computer implementation by: + +1. Implementing the `Computer` protocol +2. Adding your implementation to the `computers/__init__.py` file +3. Registering it in the `computer_mapping` dictionary in `cli.py` + +Example skeleton for a custom implementation: + +```python +class MyCustomComputer: + """My custom computer implementation.""" + + environment = "browser" # or "windows", "mac", "linux" + dimensions = (1024, 768) # default dimensions + + def __init__(self): + # Initialize your environment connection + pass + + def __enter__(self): + # Set up your environment + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + # Clean up your environment + pass + + def screenshot(self) -> str: + # Capture and return a base64-encoded screenshot + pass + + # Implement all other required methods... +``` \ No newline at end of file diff --git a/docs/developer_guide.md b/docs/developer_guide.md new file mode 100644 index 0000000..b558d24 --- /dev/null +++ b/docs/developer_guide.md @@ -0,0 +1,139 @@ +# Developer Guide + +This document provides information for developers who want to modify or extend Octotools. + +## Development Environment Setup + +1. **Clone the repository**: + ```bash + git clone https://github.com/octotools/octotools.git + cd octotools + ``` + +2. **Set up a virtual environment**: + ```bash + python -m venv env + source env/bin/activate # On Windows, use: env\Scripts\activate + ``` + +3. 
**Install in development mode**: + ```bash + pip install -e . + ``` + +4. **Install additional development dependencies**: + ```bash + pip install pytest black isort mypy + ``` + +## Project Structure + +The project is organized into several key components: + +``` +. +├── octotools/ # Main package +│ ├── __init__.py # Package initialization +│ ├── api/ # API implementation +│ ├── cli/ # Command-line interface +│ ├── core/ # Core functionality +│ ├── utils/ # Utility functions +│ └── visualization/ # Visualization tools +├── tasks/ # Task definitions +├── assets/ # Static assets +├── docs/ # Documentation +├── tests/ # Test suite +├── setup.py # Package configuration +├── requirements.txt # Python dependencies +└── README.md # Project overview +``` + +## Adding New Features + +To add a new feature to Octotools: + +1. **Create a feature branch**: + ```bash + git checkout -b feature/your-feature-name + ``` + +2. **Implement your feature**: + - Place your code in the appropriate module + - Follow the existing code style + - Add tests for your feature + - Update documentation + +3. **Submit a Pull Request**: + - Push your branch to your fork + - Create a pull request with a clear description of the changes + +## Coding Standards + +Octotools follows these coding standards: + +1. **PEP 8**: Follow PEP 8 style guidelines +2. **Type Hints**: Use type hints for function parameters and return values +3. **Docstrings**: Use Google-style docstrings +4. **Testing**: Write tests for new functionality +5. **Code Formatting**: Use Black and isort for code formatting + +Example of a well-documented function: + +```python +def analyze_notebook(notebook_path: str, metrics: List[str] = None) -> Dict[str, Any]: + """Analyze a Jupyter notebook and return metrics. + + Args: + notebook_path: Path to the notebook file + metrics: List of metrics to calculate. If None, calculate all metrics. 
+ + Returns: + Dictionary of metrics and their values + + Raises: + FileNotFoundError: If the notebook file doesn't exist + ValueError: If an invalid metric is specified + """ + # Implementation + pass +``` + +## Testing + +To run the tests: + +```bash +pytest +``` + +For more information about testing, see the [Testing Guide](testing.md). + +## Documentation + +When adding or modifying features, update the documentation: + +1. **API Documentation**: Update docstrings in the code +2. **Usage Examples**: Add examples to show how to use the feature +3. **README**: Update the README if necessary + +## Release Process + +1. **Update Version**: Update the version in `setup.py` +2. **Create Changelog**: Update the changelog +3. **Create Tag**: Create a git tag for the version +4. **Build Distribution**: Build the distribution package + ```bash + python setup.py sdist bdist_wheel + ``` +5. **Upload to PyPI**: Upload the package to PyPI + ```bash + twine upload dist/* + ``` + +## Getting Help + +If you need help with the development process: + +1. Check the [Troubleshooting](troubleshooting.md) guide +2. Open an issue on GitHub +3. Reach out to the maintainers \ No newline at end of file diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..45c1d8d --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,162 @@ +# Examples + +The repository includes several example applications that demonstrate different aspects of the Computer Using Agent (CUA) functionality. This document provides an overview of these examples. + +## Weather Example + +The `weather_example.py` script demonstrates a simple, single-turn interaction with the CUA to check the weather. 
+ +```python +from agent import Agent +from computers import ScrapybaraBrowser + +with ScrapybaraBrowser() as computer: + agent = Agent(computer=computer) + input_items = [{"role": "user", "content": "what is the weather in sf"}] + response_items = agent.run_full_turn(input_items, debug=True, show_images=True) + print(response_items[-1]["content"][0]["text"]) +``` + +### Key aspects: +- Uses the ScrapybaraBrowser computer environment +- Sends a single query about the weather in San Francisco +- Uses the debug mode to show detailed information during execution +- Shows images (screenshots) during execution +- Prints only the final text response + +## Function Calling Example + +The `function_calling_example.py` script demonstrates how to integrate function calling with the CUA. + +```python +from agent import Agent +from computers import ScrapybaraBrowser + +tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Determine weather in my location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["c", "f"]}, + }, + "additionalProperties": False, + "required": ["location", "unit"], + }, + } +] + + +def main(): + with ScrapybaraBrowser() as computer: + agent = Agent(tools=tools, computer=computer) + items = [] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items) + items += output_items + + +if __name__ == "__main__": + main() +``` + +### Key aspects: +- Defines a `get_weather` function tool with parameters for location and temperature unit +- Uses the ScrapybaraBrowser computer environment +- Creates an interactive session that continually takes user input +- Adds the function tool to the Agent's available tools + +This example shows how to: +1. Define a function schema using the OpenAI function calling format +2. 
Pass the function to the Agent via the `tools` parameter +3. Handle function calls in the Agent's conversation loop + +## Playwright with Custom Functions + +The `playwright_with_custom_functions.py` script demonstrates how to extend the CUA with custom browser navigation functions. + +```python +from agent.agent import Agent +from computers import LocalPlaywrightComputer + +tools = [ + { + "type": "function", + "name": "back", + "description": "Go back to the previous page.", + "parameters": {}, + }, + { + "type": "function", + "name": "goto", + "description": "Go to a specific URL.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Fully qualified URL to navigate to.", + }, + }, + "additionalProperties": False, + "required": ["url"], + }, + }, +] + + +def main(): + with LocalPlaywrightComputer() as computer: + agent = Agent(computer=computer, tools=tools) + items = [ + { + "role": "developer", + "content": "Use the additional back() and goto() functions to naviate the browser. If you see nothing, trying going to Google.", + } + ] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, show_images=False) + items += output_items + + +if __name__ == "__main__": + main() +``` + +### Key aspects: +- Uses the LocalPlaywrightComputer environment (local browser) +- Defines two custom function tools: `back()` and `goto(url)` +- Provides an initial developer message suggesting how to use these functions +- Creates an interactive session that continually takes user input +- Runs without showing images (screenshots) for faster execution + +This example demonstrates how: +1. Custom functions can be defined and passed to the Agent +2. These functions can be implemented in the Computer class +3. 
The Agent will route function calls to the appropriate methods in the Computer implementation + +## Running the Examples + +To run any of the examples, use the following command: + +```bash +python -m examples.<example_name> +``` + +For instance, to run the weather example: + +```bash +python -m examples.weather_example +``` + +Note that some examples may require specific API keys or environment setup, particularly those using ScrapybaraBrowser or other remote browsers. \ No newline at end of file diff --git a/docs/octotools_integration_guide.md b/docs/octotools_integration_guide.md new file mode 100644 index 0000000..a2982cd --- /dev/null +++ b/docs/octotools_integration_guide.md @@ -0,0 +1,819 @@ +# Integrating Octotools with CUA-SAMPLE-APP + +This guide provides comprehensive instructions for integrating the Octotools framework with the CUA-SAMPLE-APP to enhance its reasoning and problem-solving capabilities. + +## Table of Contents + +1. [Introduction](#introduction) +2. [Benefits of Integration](#benefits-of-integration) +3. [Architecture Overview](#architecture-overview) +4. [Prerequisites](#prerequisites) +5. [Installation](#installation) +6. [Integration Steps](#integration-steps) +7. [Creating an Octotools-Enhanced Agent](#creating-an-octotools-enhanced-agent) +8. [Custom Tool Development](#custom-tool-development) +9. [Browser Automation with Octotools](#browser-automation-with-octotools) +10. [Examples](#examples) +11. [Troubleshooting](#troubleshooting) +12. [Further Resources](#further-resources) + +## Introduction + +Octotools is an open-source agentic framework designed for complex reasoning tasks across diverse domains. It provides standardized tools that can be easily integrated with large language models (LLMs). By integrating Octotools with the CUA-SAMPLE-APP, we can enhance the application's ability to perform multi-step reasoning, leverage specialized tools, and handle complex user queries. + +## Benefits of Integration + +1. 
**Enhanced Reasoning Capabilities**: Octotools provides a sophisticated planning and execution framework that enables multi-step reasoning. +2. **Extensible Tool Ecosystem**: Access to a wide range of pre-built tools for tasks like web search, image processing, code generation, and more. +3. **Standardized Tool Interface**: Consistent interface for creating and using tools, making it easy to extend functionality. +4. **Browser Automation Enhancement**: Augment CUA's browser automation with additional tools for understanding and interacting with web content. +5. **Performance Improvements**: Octotools has shown substantial average accuracy gains over raw LLM responses on complex reasoning tasks. + +## Architecture Overview + +The integrated system will combine CUA-SAMPLE-APP's Computer-Utilizing Agent capabilities with Octotools' reasoning framework: + +``` +┌───────────────────────────────────────────────────────────────┐ +│ CUA-SAMPLE-APP + Octotools │ +│ │ +│ ┌─────────────────┐ ┌────────────────────────────────┐ │ +│ │ User Input │ │ Enhanced Agent │ │ +│ └────────┬────────┘ │ ┌─────────────┐ ┌─────────┐ │ │ +│ │ │ │ CUA │ │Octotools│ │ │ +│ ▼ │ │ Agent │──► Solver │ │ │ +│ ┌────────────────┐ │ └─────────────┘ └─────────┘ │ │ +│ │ Agent Router │─────►│ │ │ │ │ +│ └────────────────┘ │ │ │ │ │ +│ ▲ │ ▼ ▼ │ │ +│ │ │ ┌─────────────────────┐ │ │ +│ ┌────────────────┐ │ │ Computer Interface │ │ │ +│ │ Response │◄─────│ └─────────────────────┘ │ │ +│ │ Generation │ │ │ │ │ +│ └────────────────┘ └──────────────┼─────────────────┘ │ +│ │ │ +│ ┌─────────────▼────────────┐ │ +│ │ Browser/Computer │ │ +│ └──────────────────────────┘ │ +└───────────────────────────────────────────────────────────────┘ +``` + +## Prerequisites + +Before integrating Octotools with CUA-SAMPLE-APP, ensure you have: + +1. Python 3.10 or higher +2. CUA-SAMPLE-APP installed and working +3. Git for version control +4. 
API keys for required services: + - OpenAI API key + - Google API key and CX (for search functionality) + - Any other API keys required by specific tools + +## Installation + +### Step 1: Add Octotools as a Dependency + +Add Octotools to your project's requirements.txt: + +```bash +echo "octotools @ git+https://github.com/OctoTools/OctoTools.git" >> requirements.txt +``` + +### Step 2: Install Dependencies + +Install the updated dependencies: + +```bash +pip install -r requirements.txt +``` + +### Step 3: Set Up Environment Variables + +Add the necessary environment variables for Octotools in your `.env` file: + +``` +# Existing CUA-SAMPLE-APP variables +# ... + +# Octotools API Keys +OPENAI_API_KEY= +GOOGLE_API_KEY= +GOOGLE_CX= +``` + +## Integration Steps + +### Step 1: Create an Octotools Wrapper + +Create a new file `octotools_wrapper.py` in the project root: + +```python +from octotools.models.solver import Solver +from typing import List, Dict, Any, Optional +import os +import base64 + + +class OctotoolsWrapper: + """ + Wrapper for Octotools integration with CUA-SAMPLE-APP. + """ + + def __init__( + self, + llm_engine: str = "gpt-4o", + enabled_tools: Optional[List[str]] = None, + max_steps: int = 5, + ): + """ + Initialize the Octotools wrapper. 
+ + Args: + llm_engine: The LLM engine to use (default: "gpt-4o") + enabled_tools: List of tools to enable (default: None, which enables default tools) + max_steps: Maximum number of steps for solving a task (default: 5) + """ + self.llm_engine = llm_engine + self.max_steps = max_steps + + # Default tools useful for browser automation context + if enabled_tools is None: + self.enabled_tools = [ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Image_Captioner_Tool", + "Object_Detector_Tool", + "Google_Search_Tool", + "URL_Text_Extractor_Tool", + "Generalist_Solution_Generator_Tool" + ] + else: + self.enabled_tools = enabled_tools + + # Initialize the solver + self.solver = Solver( + model_string=self.llm_engine, + enabled_tools=self.enabled_tools, + max_steps=self.max_steps, + ) + + def solve( + self, + query: str, + image_data: Optional[str] = None, + context: Optional[str] = None + ) -> Dict[str, Any]: + """ + Solve a task using Octotools. + + Args: + query: The user query to solve + image_data: Optional base64-encoded image data + context: Optional additional context for the solver + + Returns: + Dictionary containing the solver output + """ + # Process the image if provided + image_path = None + if image_data: + # Save the image temporarily + import tempfile + with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp: + # Remove the data:image/png;base64, prefix if present + if 'base64,' in image_data: + image_data = image_data.split('base64,')[1] + + temp.write(base64.b64decode(image_data)) + image_path = temp.name + + # Build full context with query and additional context + full_query = query + if context: + full_query = f"{query}\n\nContext: {context}" + + # Solve the task + result = self.solver.solve( + query=full_query, + image_path=image_path, + verbose=True + ) + + # Clean up temporary file if created + if image_path and os.path.exists(image_path): + os.remove(image_path) + + return result +``` + +### Step 2: Enhance the Agent Class 
+ +Modify `agent/agent.py` to integrate Octotools: + +```python +# Import the OctotoolsWrapper +from octotools_wrapper import OctotoolsWrapper + +class Agent: + """ + A sample agent class that can be used to interact with a computer. + Enhanced with Octotools for complex reasoning. + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + use_octotools: bool = False, + octotools_engine: str = "gpt-4o", + octotools_tools: List[str] = None, + ): + # Existing initialization + self.model = model + self.computer = computer + self.tools = tools + self.print_steps = True + self.debug = False + self.show_images = False + self.acknowledge_safety_check_callback = acknowledge_safety_check_callback + + if computer: + self.tools += [ + { + "type": "computer-preview", + "display_width": computer.dimensions[0], + "display_height": computer.dimensions[1], + "environment": computer.environment, + }, + ] + + # Octotools integration + self.use_octotools = use_octotools + if use_octotools: + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + else: + self.octotools = None + + # ... existing methods ... + + def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False + ): + """Enhanced run_full_turn with Octotools integration for complex reasoning.""" + self.print_steps = print_steps + self.debug = debug + self.show_images = show_images + + # Check if we should use Octotools for complex reasoning + complex_reasoning_trigger = self._needs_complex_reasoning(input_items) + + if self.use_octotools and complex_reasoning_trigger: + return self._handle_with_octotools(input_items) + else: + # Original CUA logic + new_items = [] + # ... existing code ... 
+ return new_items + + def _needs_complex_reasoning(self, input_items): + """ + Determine if the query needs complex reasoning that would benefit from Octotools. + This is a basic heuristic and can be enhanced based on specific requirements. + """ + # Extract the latest user message + latest_user_message = None + for item in reversed(input_items): + if item.get("role") == "user": + latest_user_message = item.get("content", "") + break + + if not latest_user_message: + return False + + # Simple heuristic: check for keywords that might suggest complex reasoning + complex_keywords = [ + "analyze", "compare", "calculate", "extract data", "search for", + "find information", "summarize", "visual analysis", + "collect data", "research", "solve" + ] + + return any(keyword in latest_user_message.lower() for keyword in complex_keywords) + + def _handle_with_octotools(self, input_items): + """ + Handle a query using Octotools for complex reasoning. + """ + # Extract the latest user message and any screenshots + latest_user_message = None + latest_screenshot = None + + for item in reversed(input_items): + if item.get("role") == "user" and not latest_user_message: + latest_user_message = item.get("content", "") + + # Look for the most recent screenshot + if not latest_screenshot and item.get("type") == "computer_call_output": + output = item.get("output", {}) + if output.get("type") == "input_image": + image_url = output.get("image_url", "") + if image_url.startswith("data:image/png;base64,"): + latest_screenshot = image_url + + if not latest_user_message: + return [] + + # Get the current URL for context if in browser environment + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + result = self.octotools.solve( + query=latest_user_message, + 
image_data=latest_screenshot.split("base64,")[1] if latest_screenshot else None, + context=context + ) + + # Format the result for CUA + answer = result.get("answer", "I couldn't find a solution using the available tools.") + steps = result.get("steps", []) + + # Build a detailed response that includes steps taken + detailed_response = answer + "\n\n" + if steps: + detailed_response += "I took the following steps to solve this:\n" + for i, step in enumerate(steps, 1): + tool_used = step.get("tool_used", "Unknown tool") + reasoning = step.get("reasoning", "No reasoning provided") + detailed_response += f"\n{i}. Used {tool_used}: {reasoning}" + + # Return as a message from the assistant + return [{"role": "assistant", "content": detailed_response}] +``` + +### Step 3: Update Main Application + +Update `main.py` to allow enabling Octotools: + +```python +from agent.agent import Agent +from computers import LocalPlaywrightComputer +import argparse + + +def main(use_octotools=False): + with LocalPlaywrightComputer() as computer: + agent = Agent( + computer=computer, + use_octotools=use_octotools, + octotools_engine="gpt-4o", + ) + items = [] + while True: + user_input = input("> ") + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run CUA with optional Octotools integration") + parser.add_argument('--use-octotools', action='store_true', help='Enable Octotools integration') + args = parser.parse_args() + + main(use_octotools=args.use_octotools) +``` + +## Creating an Octotools-Enhanced Agent + +For more advanced use cases, you can create a dedicated Octotools-enhanced agent: + +```python +# octotools_agent.py +from agent.agent import Agent +from computers import Computer +from octotools_wrapper import OctotoolsWrapper +from typing import List, Dict, Any, Callable, Optional + + +class 
OctotoolsAgent(Agent): + """ + An agent that combines CUA capabilities with Octotools reasoning. + """ + + def __init__( + self, + model="computer-use-preview-2025-02-04", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda: False, + octotools_engine: str = "gpt-4o", + octotools_tools: Optional[List[str]] = None, + reasoning_threshold: float = 0.7, + ): + super().__init__( + model=model, + computer=computer, + tools=tools, + acknowledge_safety_check_callback=acknowledge_safety_check_callback + ) + + # Initialize Octotools + self.octotools = OctotoolsWrapper( + llm_engine=octotools_engine, + enabled_tools=octotools_tools + ) + + # Reasoning threshold determines when to use Octotools vs standard CUA + self.reasoning_threshold = reasoning_threshold + + # Add an Octotools tool to the CUA tools list + self.tools.append({ + "type": "function", + "function": { + "name": "use_octotools_reasoning", + "description": "Use Octotools framework for complex reasoning tasks", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The query to solve using Octotools" + } + }, + "required": ["query"] + } + } + }) + + def use_octotools_reasoning(self, query: str) -> str: + """ + Use Octotools to solve a complex reasoning task. + This can be called by the CUA as a tool. 
+ """ + # Capture the current screenshot + screenshot_base64 = None + if self.computer: + screenshot_base64 = self.computer.screenshot() + + # Get current URL for context + current_url = None + if self.computer and self.computer.environment == "browser": + try: + current_url = self.computer.get_current_url() + except: + pass + + # Build context + context = f"Current URL: {current_url}" if current_url else "" + + # Solve using Octotools + result = self.octotools.solve( + query=query, + image_data=screenshot_base64, + context=context + ) + + # Return the answer + answer = result.get("answer", "I couldn't find a solution using the available tools.") + return answer + + def handle_item(self, item): + """Override to handle the Octotools function call.""" + if item["type"] == "function_call" and item["name"] == "use_octotools_reasoning": + args = json.loads(item["arguments"]) + result = self.use_octotools_reasoning(args["query"]) + return [{ + "type": "function_call_output", + "call_id": item["call_id"], + "output": result + }] + else: + # Use the original handle_item for other cases + return super().handle_item(item) +``` + +## Custom Tool Development + +You can extend Octotools with custom tools tailored for the CUA application: + +### Example: Creating a Webpage Analysis Tool + +```python +# webpage_analyzer_tool.py +from octotools.tools.base import BaseTool +from bs4 import BeautifulSoup + + +class Webpage_Analyzer_Tool(BaseTool): + """ + A tool that analyzes the structure and content of a webpage. 
+ """ + + def __init__(self): + super().__init__( + tool_name="Webpage_Analyzer_Tool", + tool_description="Analyzes the structure and content of a webpage", + tool_version="1.0.0", + input_types={ + "html": "str - HTML content of the webpage to analyze", + "analysis_type": "str - Type of analysis to perform (structure, content, links, or forms)" + }, + output_type="dict - Analysis results containing requested information", + demo_commands=[ + { + "command": 'execution = tool.execute(html="

<html><body><h1>Title</h1></body></html>

", analysis_type="structure")', + "description": "Analyze the structure of an HTML document" + } + ], + user_metadata={ + "limitations": [ + "Cannot analyze JavaScript-rendered content", + "Does not execute JavaScript code", + "Limited to static HTML analysis" + ], + "best_practices": [ + "Provide complete HTML for accurate analysis", + "Specify the analysis type to get focused results" + ] + } + ) + + def execute(self, html, analysis_type="structure"): + """ + Execute the webpage analysis tool. + + Args: + html (str): HTML content of the webpage to analyze + analysis_type (str): Type of analysis to perform (structure, content, links, or forms) + + Returns: + dict: Analysis results containing requested information + """ + # Parse the HTML + soup = BeautifulSoup(html, 'html.parser') + + # Perform the requested analysis + if analysis_type == "structure": + return self._analyze_structure(soup) + elif analysis_type == "content": + return self._analyze_content(soup) + elif analysis_type == "links": + return self._analyze_links(soup) + elif analysis_type == "forms": + return self._analyze_forms(soup) + else: + return {"error": f"Unknown analysis type: {analysis_type}"} + + def _analyze_structure(self, soup): + """Analyze the structure of the HTML document.""" + headings = {} + for i in range(1, 7): + headings[f'h{i}'] = len(soup.find_all(f'h{i}')) + + return { + "title": soup.title.string if soup.title else None, + "headings": headings, + "paragraphs": len(soup.find_all('p')), + "divs": len(soup.find_all('div')), + "lists": { + "ul": len(soup.find_all('ul')), + "ol": len(soup.find_all('ol')), + }, + "tables": len(soup.find_all('table')) + } + + def _analyze_content(self, soup): + """Extract the main content of the HTML document.""" + return { + "title": soup.title.string if soup.title else None, + "meta_description": soup.find('meta', attrs={'name': 'description'}).get('content') if soup.find('meta', attrs={'name': 'description'}) else None, + "main_text": 
soup.get_text(strip=True)[:1000] + "..." if len(soup.get_text(strip=True)) > 1000 else soup.get_text(strip=True), + "word_count": len(soup.get_text(strip=True).split()) + } + + def _analyze_links(self, soup): + """Extract and analyze links in the HTML document.""" + links = soup.find_all('a') + internal_links = [] + external_links = [] + + for link in links: + href = link.get('href') + if href: + if href.startswith('http') or href.startswith('//'): + external_links.append(href) + else: + internal_links.append(href) + + return { + "total_links": len(links), + "internal_links": internal_links[:20], # Limit to avoid overwhelming output + "internal_link_count": len(internal_links), + "external_links": external_links[:20], # Limit to avoid overwhelming output + "external_link_count": len(external_links) + } + + def _analyze_forms(self, soup): + """Extract and analyze forms in the HTML document.""" + forms = soup.find_all('form') + form_analysis = [] + + for i, form in enumerate(forms): + inputs = form.find_all('input') + input_details = [] + + for input_field in inputs: + input_type = input_field.get('type', 'text') + input_name = input_field.get('name', '') + input_details.append({ + "type": input_type, + "name": input_name, + "id": input_field.get('id', ''), + "required": input_field.has_attr('required') + }) + + form_analysis.append({ + "form_id": form.get('id', f'form_{i}'), + "method": form.get('method', 'get').upper(), + "action": form.get('action', ''), + "input_fields": input_details, + "submit_button": bool(form.find('button', attrs={'type': 'submit'}) or form.find('input', attrs={'type': 'submit'})) + }) + + return { + "form_count": len(forms), + "forms": form_analysis + } +``` + +## Browser Automation with Octotools + +One powerful use case is to enhance CUA's browser automation capabilities with Octotools. 
Here's an example of how to combine them: + +```python +# enhanced_browser_agent.py +from octotools_agent import OctotoolsAgent +from computers import LocalPlaywrightComputer + + +def run_enhanced_browser_agent(): + """ + Run an enhanced browser agent that combines CUA with Octotools. + """ + with LocalPlaywrightComputer() as computer: + # Define browser-specific tools + browser_octotools = [ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Image_Captioner_Tool", + "Object_Detector_Tool", + "Webpage_Analyzer_Tool", # Custom tool created above + "URL_Text_Extractor_Tool" + ] + + # Create the agent + agent = OctotoolsAgent( + computer=computer, + octotools_tools=browser_octotools + ) + + items = [] + print("Enhanced Browser Agent with Octotools") + print("Type 'exit' to quit") + + while True: + user_input = input("> ") + if user_input.lower() == 'exit': + break + + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items + + +if __name__ == "__main__": + run_enhanced_browser_agent() +``` + +## Examples + +Here are some examples of how to use the integrated system: + +### Example 1: Basic Integration + +```bash +# Run with basic Octotools integration +python main.py --use-octotools +``` + +### Example 2: Advanced Integration + +```bash +# Using the enhanced browser agent +python enhanced_browser_agent.py +``` + +### Example 3: Custom Integration Script + +```python +# custom_integration.py +from agent.agent import Agent +from computers import LocalPlaywrightComputer +from octotools_wrapper import OctotoolsWrapper + +# Initialize components +computer = LocalPlaywrightComputer() +computer.start() + +# Initialize Octotools wrapper with specific tools +octotools = OctotoolsWrapper( + llm_engine="gpt-4o", + enabled_tools=[ + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "Google_Search_Tool" + ] +) + +# Initialize agent without Octotools integration +agent 
= Agent(computer=computer) + +# Process loop with manual Octotools integration +items = [] +try: + while True: + user_input = input("> ") + + # Determine if we should use Octotools + if any(keyword in user_input.lower() for keyword in ["search", "find", "calculate", "analyze"]): + # Get current screenshot + screenshot = computer.screenshot() + + # Use Octotools to solve + result = octotools.solve( + query=user_input, + image_data=screenshot + ) + + # Add the result as an assistant message + items.append({"role": "user", "content": user_input}) + items.append({"role": "assistant", "content": result["answer"]}) + print(result["answer"]) + else: + # Use standard CUA processing + items.append({"role": "user", "content": user_input}) + output_items = agent.run_full_turn(items, debug=True, show_images=True) + items += output_items +finally: + computer.stop() +``` + +## Troubleshooting + +### Common Issues + +#### Octotools Import Errors + +If you encounter import errors for Octotools: + +``` +Make sure Octotools is properly installed: +pip install git+https://github.com/OctoTools/OctoTools.git +``` + +#### API Key Issues + +If tools that require API keys don't work: + +``` +Check that all required API keys are correctly set in your .env file +``` + +#### Integration Conflicts + +If you encounter conflicts between CUA and Octotools: + +``` +Ensure that you're not trying to use both frameworks for the same task simultaneously. +The OctotoolsAgent class should handle the proper coordination between them. 
+``` + +## Further Resources + +- [Octotools Documentation](https://github.com/OctoTools/OctoTools/tree/main/docs) +- [CUA-SAMPLE-APP Documentation](https://github.com/openai/openai-cua-sample-app/tree/main/docs) +- [Custom Tool Development Guide](https://github.com/OctoTools/OctoTools/tree/main/docs/custom_tools.md) +- [OpenAI API Documentation](https://platform.openai.com/docs/api-reference) \ No newline at end of file diff --git a/docs/performance.md b/docs/performance.md new file mode 100644 index 0000000..2c2f7da --- /dev/null +++ b/docs/performance.md @@ -0,0 +1,229 @@ +# Performance Considerations + +This document provides guidance on optimizing the performance of your Computer Using Agent (CUA) application. + +## Understanding Performance Factors + +The performance of a CUA application depends on several factors: + +1. **Network Latency**: The time it takes for requests to travel between your application and the OpenAI API +2. **API Processing Time**: The time it takes for the OpenAI API to process your request and generate a response +3. **Computer Environment Performance**: The speed of the computer environment (local browser, remote browser, Docker container) +4. **Screenshot Size and Quality**: Larger screenshots take longer to process +5. **Conversation History Size**: Larger conversation histories increase token usage and processing time +6. **Action Complexity**: Complex sequences of actions take longer to execute than simple ones + +## Optimizing API Interactions + +### Reduce API Calls + +Each call to the OpenAI API adds latency to your application. To reduce the number of API calls: + +1. **Batch Actions**: When possible, design your prompts to encourage the model to perform multiple actions in a single turn +2. **Provide Clear Instructions**: Clear prompts help the model accomplish tasks with fewer turns +3. 
**Use Custom Functions**: For complex operations, use custom functions instead of relying on the model to perform a sequence of basic actions + +### Optimize Context Size + +Larger contexts take longer to process and consume more tokens: + +1. **Limit Conversation History**: If you're building a long-running application, consider pruning old conversation history +2. **Compress Screenshots**: Use lower resolution or compressed screenshots when possible +3. **Use Truncation**: The Agent uses `truncation="auto"` by default, which helps manage large contexts + +## Optimizing Computer Environments + +### Local Playwright + +The LocalPlaywrightComputer is usually the fastest option because it runs locally: + +1. **Use Headless Mode**: For automated tasks, use headless mode to reduce overhead: + ```python + LocalPlaywrightComputer(headless=True) + ``` +2. **Optimize Browser Settings**: Customize browser launch arguments: + ```python + launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-gpu", # Disable GPU acceleration for faster headless operation + "--no-sandbox", # Use with caution - reduces security + ] + ``` + +### Docker Environment + +The Docker environment's performance depends on your Docker setup: + +1. **Allocate Sufficient Resources**: Make sure Docker has enough CPU and memory +2. **Use a Fast VNC Connection**: VNC performance greatly affects DockerComputer performance +3. **Optimize Display Resolution**: Use a lower resolution to reduce VNC traffic + +### Remote Browser Environments + +Remote browser environments (Browserbase, Scrapybara) have additional network latency: + +1. **Choose Geographically Closer Servers**: If available, use servers that are closer to your location +2. **Reduce Screenshot Frequency**: Minimize the number of actions that require screenshots +3. 
**Use Batch Operations**: Perform multiple actions in sequence before requesting a new screenshot + +## Code-Level Optimizations + +### Optimize Screenshot Handling + +Screenshots are the largest data elements in most CUA applications: + +1. **Compress Screenshots**: Consider compressing screenshots before encoding them: + ```python + def screenshot(self) -> str: + # Capture screenshot + png_bytes = self._page.screenshot(full_page=False) + + # Optionally compress the image + from PIL import Image + import io + image = Image.open(io.BytesIO(png_bytes)) + image = image.resize((image.width // 2, image.height // 2)) # Downsample + + output = io.BytesIO() + image.save(output, format='JPEG', quality=70) # Convert to JPEG with compression + compressed_bytes = output.getvalue() + + return base64.b64encode(compressed_bytes).decode("utf-8") + ``` + +2. **Crop Screenshots**: Consider cropping screenshots to the relevant area: + ```python + def screenshot(self) -> str: + # Capture full screenshot + png_bytes = self._page.screenshot(full_page=False) + + # Crop to relevant area + from PIL import Image + import io + image = Image.open(io.BytesIO(png_bytes)) + + # Example: crop to the top half of the screen + width, height = image.size + image = image.crop((0, 0, width, height // 2)) + + output = io.BytesIO() + image.save(output, format='PNG') + cropped_bytes = output.getvalue() + + return base64.b64encode(cropped_bytes).decode("utf-8") + ``` + +### Optimize Action Execution + +1. **Parallelize Actions**: For independent actions that do not share a Playwright object, consider parallelizing them (Playwright's sync API is not thread-safe, so never drive the same `Page` or browser from more than one thread): + ```python + import threading + + def perform_parallel_actions(actions): + threads = [] + for action in actions: + thread = threading.Thread(target=action) + threads.append(thread) + thread.start() + + for thread in threads: + thread.join() + ``` + +2. 
**Batch Similar Actions**: Group similar actions together: + ```python + def type_paragraphs(self, paragraphs): + for paragraph in paragraphs: + self._page.keyboard.type(paragraph) + self._page.keyboard.press("Enter") + self._page.keyboard.press("Enter") + ``` + +### Caching + +1. **Cache Screenshots**: If the screen hasn't changed, reuse the previous screenshot: + ```python + def screenshot(self) -> str: + # Check if the screen has changed since the last screenshot + current_hash = self._get_screen_hash() + if current_hash == self._last_screen_hash: + return self._last_screenshot + + # If screen has changed, take a new screenshot + png_bytes = self._page.screenshot(full_page=False) + screenshot = base64.b64encode(png_bytes).decode("utf-8") + + # Update cache + self._last_screen_hash = current_hash + self._last_screenshot = screenshot + + return screenshot + ``` + +2. **Cache Function Results**: For expensive function calls with hashable arguments, consider caching results. Note that when `functools.lru_cache` decorates a method, as in the example below, `self` becomes part of the cache key and each cached instance is kept alive until its entries are evicted; prefer a module-level function or a cache stored on the instance when that matters: + ```python + import functools + + @functools.lru_cache(maxsize=128) + def expensive_function(self, arg1, arg2): + # Expensive operation + return result + ``` + +## Measuring Performance + +To identify performance bottlenecks, add timing measurements: + +```python +import time + +def measure_time(func): + def wrapper(*args, **kwargs): + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + print(f"{func.__name__} took {end_time - start_time:.2f} seconds") + return result + return wrapper + +class TimedComputer(LocalPlaywrightComputer): + @measure_time + def screenshot(self) -> str: + return super().screenshot() + + @measure_time + def click(self, x: int, y: int, button: str = "left") -> None: + return super().click(x, y, button) + + # etc. +``` + +Use this data to identify which operations are taking the most time and focus your optimization efforts there. + +## Trade-offs + +When optimizing for performance, consider these trade-offs: + +1. **Quality vs. 
Speed**: Lower quality screenshots are faster but may lead to less accurate model responses +2. **Safety vs. Speed**: Some safety checks add overhead but are important for security +3. **Flexibility vs. Speed**: Custom functions are faster but less flexible than general-purpose computer actions +4. **Memory vs. Speed**: Caching improves speed but increases memory usage + +Choose optimizations that make sense for your specific use case and requirements. + +## Environment-Specific Recommendations + +### Local Development + +For local development, prioritize: +- Fast iteration time with LocalPlaywrightComputer +- Debug mode for detailed information +- Showing images for visual feedback + +### Production + +For production deployments, prioritize: +- Robustness with error handling and reconnection logic +- Performance optimizations like headless mode and caching +- Memory management for long-running applications \ No newline at end of file diff --git a/docs/project_overview.md b/docs/project_overview.md new file mode 100644 index 0000000..bd06f54 --- /dev/null +++ b/docs/project_overview.md @@ -0,0 +1,64 @@ +# Project Overview + +## Introduction + +The Computer Using Agent (CUA) Sample App is a reference implementation demonstrating how to build an agent that can use a computer through browser and terminal interfaces. This project shows how to implement OpenAI's Computer Protocol to enable an AI assistant to interact with a user's computer in a safe and controlled manner. 
+ +## Architecture + +The CUA Sample App follows a modular architecture that separates the agent logic from the computer implementation: + +``` +┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ +│ │ │ │ │ │ +│ User Interface │────▶│ Agent │────▶│ OpenAI API │ +│ (CLI/App) │ │ │ │ │ +│ │ │ │ │ │ +└─────────────────┘ └────────┬────────┘ └────────┬────────┘ + │ │ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ │ │ │ + │ Computer │◀────│ Model Output │ + │ Implementation │ │ (computer_call) │ + │ │ │ │ + └─────────────────┘ └─────────────────┘ +``` + +### Key Components + +1. **Agent**: The agent class handles communication with the OpenAI API and processes computer calls. +2. **Computer Protocol**: Defines the interface for how the agent interacts with the computer. +3. **Computer Implementations**: Various implementations of the Computer Protocol for different environments: + - Browser (using Playwright) + - Terminal + - Docker containers + - Remote browser services +4. **CLI Application**: Command-line interface for user interaction with the agent. + +## Core Workflow + +1. **User Input**: The user provides input through the CLI or application interface. +2. **Agent Processing**: The agent sends the user input to the OpenAI API along with conversation history. +3. **API Response**: The API returns responses, which may include computer calls. +4. **Computer Interaction**: Computer calls are executed by the appropriate Computer implementation. +5. **Response Display**: Results are displayed to the user, and the conversation continues. + +## Key Features + +- **Modular Architecture**: Clear separation of concerns, allowing different computer environments to be used interchangeably. +- **Multiple Computer Environments**: Support for various computer environments, including local browsers, Docker containers, and remote browser services. +- **Safety Measures**: URL blocklisting and safety check acknowledgments to ensure safe operation. 
+- **Function Calling**: Support for custom functions to be defined and used in the conversation. +- **Extensible Design**: Easily extended with new Computer implementations or custom functions. + +## Getting Started + +To get started with the CUA Sample App: + +1. Clone the repository +2. Install dependencies with `pip install -r requirements.txt` +3. Run the application with `python main.py` + +For more detailed information, see the [CLI Usage Guide](cli_usage.md) and [Developer Guide](developer_guide.md). \ No newline at end of file diff --git a/docs/safety_considerations.md b/docs/safety_considerations.md new file mode 100644 index 0000000..001c6ea --- /dev/null +++ b/docs/safety_considerations.md @@ -0,0 +1,160 @@ +# Safety Considerations + +## Overview + +The Computer Using Agent (CUA) has significant capabilities that come with potential risks. This document outlines the safety measures implemented in the codebase to mitigate these risks. + +## URL Blocklisting + +In browser-based environments, the system includes URL blocklisting to prevent access to potentially malicious or inappropriate websites. + +### Implementation in utils.py + +```python +BLOCKED_DOMAINS = [ + "maliciousbook.com", + "evilvideos.com", + "darkwebforum.com", + "shadytok.com", + "suspiciouspins.com", + "ilanbigio.com", +] + +def check_blocklisted_url(url: str) -> None: + """Raise ValueError if the given URL (including subdomains) is in the blocklist.""" + hostname = urlparse(url).hostname or "" + if any( + hostname == blocked or hostname.endswith(f".{blocked}") + for blocked in BLOCKED_DOMAINS + ): + raise ValueError(f"Blocked URL: {url}") +``` + +This function checks if a URL's hostname (or any of its subdomains) matches any entry in the `BLOCKED_DOMAINS` list. If a match is found, it raises a `ValueError` with the blocked URL. 
+ +### Integration in Agent Implementation + +The Agent class integrates URL blocklisting in its handling of computer calls for browser environments: + +```python +# additional URL safety checks for browser environments +if self.computer.environment == "browser": + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url +``` + +### Network Interception + +For Playwright-based browser environments, the system also includes network interception to block requests to suspicious domains: + +```python +# Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS +def handle_route(route, request): + url = request.url + if check_blocklisted_url(url): + print(f"Flagging blocked domain: {url}") + route.abort() + else: + route.continue_() + +self._page.route("**/*", handle_route) +``` + +This intercepts all network requests and is intended to abort those that target a blocked domain. Note that `check_blocklisted_url`, as defined above, returns `None` for allowed URLs and raises `ValueError` for blocked ones, so as written the `if` branch is never taken: allowed requests fall through to `route.continue_()`, and a blocked request surfaces as a `ValueError` raised inside the handler rather than through `route.abort()`. + +## Safety Check Acknowledgment + +The CUA model may sometimes generate safety checks for potentially risky actions. The system implements a callback mechanism to handle these checks: + +### Default Implementation + +```python +def acknowledge_safety_check_callback(message: str) -> bool: + response = input( + f"Safety Check Warning: {message}\nDo you want to acknowledge and proceed? (y/n): " + ).lower() + return response.strip() == "y" +``` + +This function displays the safety check message to the user and asks for explicit acknowledgment before proceeding. + +### Integration in Agent Implementation + +```python +# if user doesn't ack all safety checks exit with error +pending_checks = item.get("pending_safety_checks", []) +for check in pending_checks: + message = check["message"] + if not self.acknowledge_safety_check_callback(message): + raise ValueError( + f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks." 
+ ) +``` + +If any safety check is not acknowledged, the Agent raises a `ValueError` and halts execution. + +## DNS Safety in Docker Environment + +For the Docker environment, the system uses a restricted DNS server to limit access to websites: + +```bash +docker run --rm -it --name cua-sample-app -p 5900:5900 --dns=1.1.1.3 -e DISPLAY=:99 cua-sample-app +``` + +The `--dns=1.1.1.3` flag points the container at Cloudflare's "1.1.1.1 for Families" resolver, which refuses to resolve known malware and adult-content domains. This is DNS-level filtering only; it does not otherwise restrict the container's network access. + +## Container Isolation + +The Docker environment runs in a container, providing isolation from the host system: + +- Limited network access +- No access to host file system +- Controlled execution environment + +## Browser Safeguards + +Playwright-based browsers are launched with safeguards: + +```python +launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-file-system" +] +browser = self._playwright.chromium.launch( + chromium_sandbox=True, + headless=self.headless, + args=launch_args, + env={} +) +``` + +- `--disable-extensions`: Disables browser extensions +- `--disable-file-system`: Restricts file system access +- `chromium_sandbox=True`: Enables the Chromium sandbox for additional isolation + +## Best Practices for Implementation + +When extending or modifying the CUA implementation, consider these safety best practices: + +1. **Expand Blocklists**: Add more domains to the `BLOCKED_DOMAINS` list as needed. +2. **Custom Safety Callbacks**: Implement more sophisticated safety check callbacks for specific use cases. +3. **Request Filtering**: Add additional filtering for network requests in browser environments. +4. **Environment Isolation**: Ensure proper isolation for computer environments, especially for production use. +5. **Limited Access Scopes**: Restrict the scope of what the CUA can access and control. +6. **Monitoring and Logging**: Implement comprehensive logging to track the CUA's actions. +7. 
**User Intervention**: Always provide mechanisms for user intervention and oversight. + +## Limitations and Disclaimers + +Even with these safety measures, the CUA is still in preview and may have vulnerabilities: + +- Safety checks might not catch all potentially harmful actions +- Blocklists may be incomplete or bypassed +- Browser or system vulnerabilities could be exploited + +As noted in the official documentation: + +> [!CAUTION] +> Computer use is in preview. Because the model is still in preview and may be susceptible to exploits and inadvertent mistakes, we discourage trusting it in authenticated environments or for high-stakes tasks. \ No newline at end of file diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 0000000..0a26356 --- /dev/null +++ b/docs/testing.md @@ -0,0 +1,228 @@ +# Testing Guide + +This document provides guidance on testing Octotools to ensure it works correctly and reliably. + +## Testing Framework + +Octotools uses pytest as its testing framework. Tests are located in the `tests/` directory and follow the pytest conventions. + +## Test Types + +The Octotools test suite includes several types of tests: + +1. **Unit Tests** - Testing individual components in isolation +2. **Integration Tests** - Testing how components work together +3. **Functional Tests** - Testing end-to-end functionality +4. 
**Regression Tests** - Ensuring new changes don't break existing functionality + +## Running Tests + +### Running All Tests + +To run all tests: + +```bash +pytest +``` + +### Running Specific Tests + +To run tests from a specific file: + +```bash +pytest tests/test_specific_module.py +``` + +To run a specific test: + +```bash +pytest tests/test_specific_module.py::test_specific_function +``` + +### Test Coverage + +To run tests with coverage: + +```bash +pytest --cov=octotools +``` + +To generate a coverage report: + +```bash +pytest --cov=octotools --cov-report=html +``` + +This will create an HTML report in the `htmlcov/` directory. + +## Writing Tests + +### Unit Tests + +Unit tests should test individual functions or classes in isolation. Here's an example of a unit test: + +```python +from typing import Any, Dict, List, TYPE_CHECKING +import pytest +from octotools.core import analyze_cell + +if TYPE_CHECKING: + from _pytest.capture import CaptureFixture + from _pytest.fixtures import FixtureRequest + from _pytest.logging import LogCaptureFixture + from _pytest.monkeypatch import MonkeyPatch + from pytest_mock.plugin import MockerFixture + +def test_analyze_cell_empty() -> None: + """Test that analyze_cell handles an empty cell correctly.""" + result = analyze_cell("") + assert result["code_length"] == 0 + assert result["has_output"] is False + +def test_analyze_cell_with_code() -> None: + """Test that analyze_cell correctly analyzes a cell with code.""" + result = analyze_cell("print('Hello, world!')") + assert result["code_length"] == 21 + assert result["has_output"] is False +``` + +### Integration Tests + +Integration tests check that different components work together correctly: + +```python +import pytest +from octotools.core import load_notebook, analyze_notebook + +def test_notebook_analysis_pipeline() -> None: + """Test the full notebook analysis pipeline.""" + notebook = load_notebook("tests/fixtures/example_notebook.ipynb") + results = 
analyze_notebook(notebook) + + assert "cell_count" in results + assert "code_cells" in results + assert "markdown_cells" in results +``` + +### Fixtures + +Use pytest fixtures to set up and tear down test environments: + +```python +import pytest +import tempfile +import os +from typing import Generator + +@pytest.fixture +def temp_notebook() -> Generator[str, None, None]: + """Create a temporary notebook file for testing.""" + with tempfile.NamedTemporaryFile(suffix=".ipynb", delete=False) as f: + f.write(b'''{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "source": [ + "print(\\"Hello, world!\\")" + ], + "outputs": [] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 4 +}''') + temp_path = f.name + + yield temp_path + + # Cleanup + os.unlink(temp_path) +``` + +### Mocking + +Use mocking to isolate the code being tested: + +```python +def test_with_mocking(mocker: "MockerFixture") -> None: + """Test using mocks to isolate the code being tested.""" + # Mock a function + mock_load = mocker.patch("octotools.core.load_notebook") + mock_load.return_value = {"cells": []} + + # Test the function that uses load_notebook + from octotools.core import count_cells + result = count_cells("dummy.ipynb") + + # Verify the result and that the mock was called + assert result == 0 + mock_load.assert_called_once_with("dummy.ipynb") +``` + +## Test Organization + +Organize tests according to the module they test: + +``` +tests/ +├── __init__.py +├── core/ +│ ├── __init__.py +│ ├── test_notebook.py +│ └── test_cell.py +├── api/ +│ ├── __init__.py +│ └── test_api.py +├── cli/ +│ ├── __init__.py +│ └── test_cli.py +└── utils/ + ├── __init__.py + └── test_utils.py +``` + +## Continuous Integration + +Octotools uses GitHub Actions for continuous integration testing. The CI configuration is located in `.github/workflows/`. + +## Test Guidelines + +1. **Test Coverage**: Aim for high test coverage, especially for critical components +2. 
**Test Edge Cases**: Test boundary conditions and error handling +3. **Test Readability**: Write clear, readable tests with meaningful names +4. **Test Independence**: Tests should not depend on other tests +5. **Test Performance**: Tests should run quickly + +## Test Documentation + +Each test should have a clear docstring explaining what it tests and why. + +## Debugging Tests + +If a test fails, you can use the following techniques to debug it: + +1. **Verbose Mode**: Run pytest in verbose mode: + ```bash + pytest -v + ``` + +2. **Print Debugging**: Use `print` statements or the pytest `capfd` fixture: + ```python + def test_with_debug(capfd: "CaptureFixture") -> None: + print("Debug information") + # Test code + captured = capfd.readouterr() + print(captured.out) # Shows all output + ``` + +3. **PDB Debugger**: Use the PDB debugger: + ```bash + pytest --pdb + ``` + +## Test Maintenance + +Regularly review and update tests to ensure they remain relevant and effective. \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 0000000..532da8c --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,249 @@ +# Troubleshooting Guide + +This document provides solutions to common issues you might encounter when working with Octotools. + +## Installation Issues + +### Package Installation Failures + +**Problem**: Errors when installing Octotools or its dependencies. + +**Solution**: +1. Make sure you're using a compatible Python version (3.7+) +2. Try installing in development mode: + ```bash + pip install -e . + ``` +3. If specific dependencies are failing, try installing them manually: + ```bash + pip install numpy pandas matplotlib jupyter + ``` + +### Import Errors + +**Problem**: Import errors when trying to use Octotools. + +**Solution**: +1. Make sure Octotools is installed in your current environment +2. Check your Python path: + ```python + import sys + print(sys.path) + ``` +3. 
If using virtual environments, make sure you've activated the correct one + +## Usage Issues + +### Notebook Loading Failures + +**Problem**: Errors when loading a Jupyter notebook. + +**Solution**: +1. Check if the notebook file exists and is accessible +2. Verify the notebook file is valid JSON: + ```bash + # Check if the notebook is valid JSON + python -c "import json; json.load(open('path/to/notebook.ipynb'))" + ``` +3. Try opening and resaving the notebook in Jupyter + +### Analysis Errors + +**Problem**: Errors during notebook analysis. + +**Solution**: +1. Check for malformed cells in the notebook +2. Try running with debug logging enabled: + ```bash + octotools analyze --debug path/to/notebook.ipynb + ``` +3. If analyzing specific cells is failing, try excluding problematic cells: + ```bash + octotools analyze --exclude-cell 5 path/to/notebook.ipynb + ``` + +### Visualization Errors + +**Problem**: Errors when generating visualizations. + +**Solution**: +1. Make sure matplotlib is installed correctly +2. Check if the output directory exists and is writable +3. Try specifying a different output format: + ```bash + octotools visualize --format png path/to/notebook.ipynb + ``` + +## Performance Issues + +### Slow Analysis + +**Problem**: Notebook analysis is taking too long. + +**Solution**: +1. For large notebooks, use the `--sample` option to analyze only a subset of cells: + ```bash + octotools analyze --sample 10 path/to/notebook.ipynb + ``` +2. Disable complex metrics that take longer to compute: + ```bash + octotools analyze --disable-metrics complexity,patterns path/to/notebook.ipynb + ``` +3. Use the batch processor with multiple processes for analyzing many notebooks: + ```bash + octotools batch --processes 4 path/to/notebooks/ + ``` + +### Memory Issues + +**Problem**: Out of memory errors when analyzing large notebooks. + +**Solution**: +1. 
Process cells individually instead of loading the whole notebook: + ```bash + octotools analyze --cell-by-cell path/to/notebook.ipynb + ``` +2. Increase the memory limit in your Python environment (if applicable) +3. Use the `--lite` mode for a lighter-weight analysis: + ```bash + octotools analyze --lite path/to/notebook.ipynb + ``` + +## Configuration Issues + +### Config File Not Found + +**Problem**: Octotools can't find the configuration file. + +**Solution**: +1. Create a default configuration file: + ```bash + octotools init-config + ``` +2. Specify the config file path explicitly: + ```bash + octotools --config /path/to/config.yaml analyze notebook.ipynb + ``` +3. Check the default config locations: + - `~/.octotools/config.yaml` + - `./octotools_config.yaml` + +### Configuration Syntax Errors + +**Problem**: Errors related to the configuration file syntax. + +**Solution**: +1. Validate your configuration file: + ```bash + octotools validate-config /path/to/config.yaml + ``` +2. Reset to the default configuration: + ```bash + octotools reset-config + ``` +3. Check the configuration documentation for correct syntax + +## CLI Issues + +### Command Not Found + +**Problem**: The `octotools` command is not found. + +**Solution**: +1. Make sure the package is installed: + ```bash + pip install -e . + ``` +2. Check if the installation path is in your PATH environment variable +3. Try running the module directly: + ```bash + python -m octotools.cli + ``` + +### Incorrect Command Usage + +**Problem**: Command line arguments are not being recognized. + +**Solution**: +1. Check the help documentation for the correct usage: + ```bash + octotools --help + octotools analyze --help + ``` +2. Make sure you're using the correct syntax for your shell +3. 
If using special characters in arguments, use quotes: + ```bash + octotools analyze --output-path "/path/with spaces/output.json" + ``` + +## Output Issues + +### JSON Output Formatting + +**Problem**: JSON output is not formatted correctly. + +**Solution**: +1. Use an explicit formatting option: + ```bash + octotools analyze --format json-pretty path/to/notebook.ipynb + ``` +2. Use a specific output file: + ```bash + octotools analyze --output results.json path/to/notebook.ipynb + ``` +3. Pipe through a JSON formatter: + ```bash + octotools analyze --format json | python -m json.tool + ``` + +### Visualization Quality + +**Problem**: Generated visualizations have poor quality or readability. + +**Solution**: +1. Adjust the figure size: + ```bash + octotools visualize --figure-size 12 8 path/to/notebook.ipynb + ``` +2. Change the DPI setting: + ```bash + octotools visualize --dpi 300 path/to/notebook.ipynb + ``` +3. Use a different theme or color scheme: + ```bash + octotools visualize --theme dark path/to/notebook.ipynb + ``` + +## Advanced Troubleshooting + +### Debug Mode + +For detailed debugging information, run Octotools in debug mode: + +```bash +octotools --debug analyze path/to/notebook.ipynb +``` + +### Logging + +You can configure the logging level for more detailed output: + +```bash +octotools --log-level DEBUG analyze path/to/notebook.ipynb +``` + +### Generate a Diagnostic Report + +Generate a diagnostic report for support purposes: + +```bash +octotools diagnostics > diagnostic_report.txt +``` + +### Check for Updates + +Make sure you're using the latest version: + +```bash +pip install --upgrade octotools +``` \ No newline at end of file diff --git a/main.py b/main.py index 41729fa..a8ef01f 100644 --- a/main.py +++ b/main.py @@ -1,17 +1,82 @@ -from agent.agent import Agent -from computers import LocalPlaywrightComputer +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Main entry point for the CUA-SAMPLE-APP with Octotools integration. 
+ +This module initializes and runs the agent with optional Octotools +integration for enhanced reasoning capabilities. +""" +import os +import signal +import sys +import argparse +from dotenv import load_dotenv +from typing import NoReturn + +from computers import LocalPlaywrightComputer +from agent.agent import Agent +from octotools_agent import OctotoolsAgent -def main(user_input=None): - with LocalPlaywrightComputer() as computer: - agent = Agent(computer=computer) - items = [] - while True: - user_input = input("> ") - items.append({"role": "user", "content": user_input}) - output_items = agent.run_full_turn(items, debug=True, show_images=True) - items += output_items +def signal_handler(signum: int, frame: object) -> NoReturn: + """Handle shutdown signals. + + Args: + signum: Signal number + frame: Current stack frame + """ + print("\nShutting down gracefully...") + sys.exit(0) +def main(debug: bool = False, use_octotools: bool = False) -> None: + """ + Run the main application loop. 
+ + Args: + debug: Whether to enable debug output + use_octotools: Whether to use Octotools integration + """ + print("Initializing agent with Octotools integration...") + agent = OctotoolsAgent( + model_string="gpt-4o", + enabled_tools=[ + "Generalist_Solution_Generator_Tool", + "Python_Code_Generator_Tool", + "Text_Detector_Tool", + "URL_Text_Extractor_Tool", + "Nature_News_Fetcher_Tool" + ], + debug=debug + ) + + # Set up signal handler for graceful shutdown + signal.signal(signal.SIGINT, signal_handler) + + # Main interaction loop + while True: + try: + user_input = input("\nEnter your query (or 'exit' to quit): ") + if user_input.lower() == "exit": + break + + agent.process_input(user_input) + + except Exception as e: + print(f"Error: {str(e)}") + if debug: + import traceback + traceback.print_exc() if __name__ == "__main__": - main() + # Load environment variables + load_dotenv() + + # Parse command line arguments + parser = argparse.ArgumentParser(description="Run CUA-SAMPLE-APP with optional Octotools integration") + parser.add_argument("--debug", action="store_true", help="Enable debug output") + parser.add_argument("--use-octotools", action="store_true", help="Use Octotools integration") + args = parser.parse_args() + + # Run the main application + main(debug=args.debug, use_octotools=args.use_octotools)