Skip to content

Commit 0e082b2

Browse files
committed
feat(pixel-interaction): phase 1 - add mode enum and config support
Add BrowserInteractionMode enum and interaction_mode configuration to support pixel-based browser interaction for vision models. This is Phase 1 (Foundation) of the pixel interaction mode implementation; Phase 2 will add the ToolGenerator and integration.

Changes:
- Add BrowserInteractionMode enum (DEFAULT, FULL_VISION, PIXEL_INTERACTION)
- Update BrowserConfig to include an interaction_mode field defaulting to BrowserInteractionMode.DEFAULT
- Add interactionMode parameter handling in ConfigLoader.from_kwargs()
- Support both string and enum values for the interactionMode parameter
- Validate string values and raise a ValueError listing the valid modes

Related issue: #3680
1 parent 1aac8be commit 0e082b2

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

camel/toolkits/hybrid_browser_toolkit/config_loader.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
from dataclasses import dataclass
1515
from typing import Any, Dict, Optional
1616

17+
from camel.types.enums import BrowserInteractionMode
18+
1719

1820
@dataclass
1921
class BrowserConfig:
@@ -48,6 +50,9 @@ class BrowserConfig:
4850
# Full visual mode configuration
4951
full_visual_mode: bool = False
5052

53+
# Interaction mode configuration
54+
interaction_mode: BrowserInteractionMode = BrowserInteractionMode.DEFAULT
55+
5156

5257
@dataclass
5358
class ToolkitConfig:
@@ -125,6 +130,18 @@ def from_kwargs(cls, **kwargs) -> 'ConfigLoader':
125130
toolkit_kwargs["enabled_tools"] = value
126131
elif key == "fullVisualMode":
127132
browser_kwargs["full_visual_mode"] = value
133+
elif key == "interactionMode":
134+
# Convert string to BrowserInteractionMode enum
135+
if isinstance(value, str):
136+
try:
137+
browser_kwargs["interaction_mode"] = BrowserInteractionMode(value)
138+
except ValueError:
139+
raise ValueError(
140+
f"Invalid interaction_mode: '{value}'. "
141+
f"Must be one of: {[m.value for m in BrowserInteractionMode]}"
142+
)
143+
else:
144+
browser_kwargs["interaction_mode"] = value
128145

129146
browser_config = BrowserConfig(**browser_kwargs)
130147
toolkit_config = ToolkitConfig(**toolkit_kwargs)

camel/types/enums.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2070,3 +2070,24 @@ class HuggingFaceRepoType(str, Enum):
20702070
DATASET = "dataset"
20712071
MODEL = "model"
20722072
SPACE = "space"
2073+
2074+
2075+
class BrowserInteractionMode(Enum):
2076+
r"""Browser interaction mode for computer-use models.
2077+
2078+
This enum defines different modes for browser automation, particularly
2079+
for vision-based models that may prefer pixel-based interactions over
2080+
DOM element references.
2081+
2082+
Attributes:
2083+
DEFAULT: Use DOM element references, return snapshots after
2084+
operations (SOM-labeled screenshots).
2085+
FULL_VISION: Use DOM element references, but don't return snapshots
2086+
after operations (only explicit screenshot requests).
2087+
PIXEL_INTERACTION: Use pixel coordinates only (no DOM refs), don't
2088+
return snapshots after operations. Designed for vision models
2089+
that can visually locate UI elements.
2090+
"""
2091+
DEFAULT = "default"
2092+
FULL_VISION = "full_vision"
2093+
PIXEL_INTERACTION = "pixel_interaction"

0 commit comments

Comments
 (0)