Skip to content

Commit 9e94a05

Browse files
committed
Implement OmniMCP for Claude computer control
This commit adds OmniMCP, a system that enables Claude to control the computer using the Model Context Protocol (MCP). Key components: an OmniParser adapter for UI element detection; an MCP server implementation; a CLI interface for commands and debugging; and comprehensive documentation. OmniMCP combines OmniParser's visual understanding with Claude's natural language capabilities to automate UI interactions.
1 parent 1899393 commit 9e94a05

File tree

5 files changed

+1649
-0
lines changed

5 files changed

+1649
-0
lines changed

openadapt/adapters/omniparser.py

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""Adapter for interacting with the OmniParser server.
2+
3+
This module provides a client for the OmniParser API deployed on AWS.
4+
"""
5+
6+
import base64
7+
import io
8+
from typing import Dict, List, Any, Optional
9+
10+
import requests
11+
from PIL import Image
12+
13+
from openadapt.custom_logger import logger
14+
15+
16+
class OmniParserClient:
    """Client for the OmniParser API.

    Wraps the two HTTP endpoints this module uses: ``GET /probe/`` for a
    liveness check and ``POST /parse/`` for submitting a base64-encoded
    screenshot. Failures are reported through the return value (``False`` or
    a dict carrying an ``"error"`` key) rather than raised.
    """

    def __init__(self, server_url: str):
        """Initialize the OmniParser client.

        Args:
            server_url: URL of the OmniParser server
        """
        # Endpoint URLs are built as f"{server_url}/probe/", so a trailing
        # slash on the base URL would produce a double slash.
        self.server_url = server_url.rstrip("/")

    def check_server_available(self) -> bool:
        """Check if the OmniParser server is available.

        Issues a GET to ``/probe/`` with a 5 second timeout and treats any
        ``requests`` error, including a non-2xx status, as "not available".

        Returns:
            bool: True if server is available, False otherwise
        """
        try:
            probe_url = f"{self.server_url}/probe/"
            response = requests.get(probe_url, timeout=5)
            response.raise_for_status()
            logger.info("OmniParser server is available")
            return True
        except requests.exceptions.RequestException as e:
            logger.error(f"OmniParser server not available: {e}")
            return False

    def image_to_base64(self, image: Image.Image) -> str:
        """Convert a PIL Image to base64 string.

        The image is serialized as PNG in memory before being encoded.

        Args:
            image: PIL Image to convert

        Returns:
            str: Base64 encoded string of the image
        """
        img_byte_arr = io.BytesIO()
        image.save(img_byte_arr, format='PNG')
        return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")

    def parse_image(self, image: Image.Image) -> Dict[str, Any]:
        """Parse an image using the OmniParser service.

        The server is probed first; if it does not answer, no parse request
        is sent. Every failure path (server unavailable, image cannot be
        encoded, HTTP error, unreadable or malformed response) yields a dict
        with an ``"error"`` message and an empty ``"parsed_content_list"``,
        so callers can always index the result the same way.

        Args:
            image: PIL Image to parse

        Returns:
            Dict[str, Any]: Parsed results including UI elements
        """
        if not self.check_server_available():
            return {"error": "Server not available", "parsed_content_list": []}

        url = f"{self.server_url}/parse/"

        try:
            # Encoding is inside the try so that an image PNG cannot store
            # is reported as an error dict like every other failure here.
            payload = {"base64_image": self.image_to_base64(image)}

            response = requests.post(url, json=payload, timeout=30)
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error making request to OmniParser API: {e}")
            return {"error": str(e), "parsed_content_list": []}
        except Exception as e:
            logger.error(f"Error parsing image with OmniParser: {e}")
            return {"error": str(e), "parsed_content_list": []}

        # The declared return type is a dict; reject any other JSON value
        # with a clear message instead of failing on the .get() below.
        if not isinstance(result, dict):
            message = (
                "Unexpected response type from OmniParser: "
                f"{type(result).__name__}"
            )
            logger.error(f"Error parsing image with OmniParser: {message}")
            return {"error": message, "parsed_content_list": []}

        # Latency is informational only. A null or non-numeric value must
        # not turn a successful parse into an error, so the float format is
        # applied only when the value is actually a number.
        latency = result.get("latency", 0)
        if isinstance(latency, (int, float)):
            logger.info(f"OmniParser latency: {latency:.2f} seconds")
        else:
            logger.info(f"OmniParser latency: {latency} seconds")
        return result
90+
91+
92+
class OmniParserProvider:
    """Provider for OmniParser services.

    Owns an ``OmniParserClient`` bound to one server URL and layers status
    reporting, an optional deployment step, and raw-bytes screenshot parsing
    on top of it.
    """

    def __init__(self, server_url: Optional[str] = None):
        """Initialize OmniParser provider.

        Args:
            server_url: URL of the OmniParser server (optional). Any falsy
                value (``None`` or an empty string) selects
                ``http://localhost:8000``.
        """
        if not server_url:
            server_url = "http://localhost:8000"
        self.server_url = server_url
        self.client = OmniParserClient(server_url)

    def is_available(self) -> bool:
        """Check if the OmniParser service is available.

        Delegates to the client's probe of the server.

        Returns:
            bool: True if service is available, False otherwise
        """
        reachable = self.client.check_server_available()
        return reachable

    def status(self) -> Dict[str, Any]:
        """Check the status of the OmniParser service.

        Returns:
            Dict[str, Any]: Status information, containing a one-element
                ``"services"`` list (name, ``"running"``/``"stopped"``
                status, URL) and a top-level ``"is_available"`` flag.
        """
        available = self.is_available()

        if available:
            state = "running"
        else:
            state = "stopped"

        service_entry = {
            "name": "omniparser",
            "status": state,
            "url": self.server_url,
        }
        return {"services": [service_entry], "is_available": available}

    def deploy(self) -> bool:
        """Deploy the OmniParser service if not already running.

        Returns:
            bool: True if successfully deployed or already running, False otherwise
        """
        # Nothing to do when the service already answers.
        current = self.status()
        if current["is_available"]:
            logger.info("OmniParser service is already running")
            return True

        try:
            # Imported lazily, so a missing deployment package only affects
            # callers that actually request a deployment; the failure is
            # caught below.
            from deploy.deploy.models.omniparser.deploy import Deploy

            logger.info("Deploying OmniParser service...")
            Deploy.start()

            # Success is defined by the service answering after start().
            after_start = self.status()
            return after_start["is_available"]
        except Exception as e:
            logger.error(f"Failed to deploy OmniParser service: {e}")
            return False

    def parse_screenshot(self, image_data: bytes) -> Dict[str, Any]:
        """Parse a screenshot using OmniParser.

        The bytes are decoded into a PIL image and passed to the client.
        Any exception raised along the way is logged and converted into an
        error dict with an empty ``"parsed_content_list"``.

        Args:
            image_data: Raw image data in bytes

        Returns:
            Dict[str, Any]: Parsed content with UI elements
        """
        try:
            buffer = io.BytesIO(image_data)
            screenshot = Image.open(buffer)
            return self.client.parse_image(screenshot)
        except Exception as e:
            logger.error(f"Error processing image data: {e}")
            return {"error": str(e), "parsed_content_list": []}

openadapt/mcp/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Model Context Protocol (MCP) implementation for OpenAdapt."""

0 commit comments

Comments
 (0)