Skip to content

Commit 2739f5d

Browse files
committed
Merge branch 'release-v0.1.0'
2 parents 06a0cc3 + 77fa14e commit 2739f5d

File tree

7 files changed

+74
-29
lines changed

7 files changed

+74
-29
lines changed

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,17 @@ Crab is a framework for building LLM agent benchmark environments in a Python-ce
66

77
#### Key Features
88

9-
* 🌐 Cross-platform
10-
* Build agent environments in memory, hosted through a docker environment, installed a virtual machine, or as a distributed physical machine, as long as they can be accessed by Python functions.
11-
* Let the agent access all the environments in the same time through a unified interface.
12-
* ⚙ ️Easy-to-use Configuration
13-
* Add a new action by simply adding a `@action` decorator on a Python function.
14-
* Deine the environment by integrating several actions together.
15-
* 📐 Novel Benchmarking Suite
16-
* Define tasks and the corresponding evlauators in an intuitive Python-native way.
17-
* Introduce a novel graph evaluator method proving fine-grained metrics.
9+
🌐 Cross-platform
10+
* Build agent environments that support various deployment options, including in-memory, Docker-hosted, virtual machines, or distributed physical machines, provided they are accessible via Python functions.
11+
* Let the agent access all the environments at the same time through a unified interface.
12+
13+
⚙️ Easy-to-use Configuration
14+
* Add a new action by simply adding a `@action` decorator on a Python function.
15+
* Define the environment by integrating several actions together.
16+
17+
📐 Novel Benchmarking Suite
18+
* Define tasks and the corresponding evaluators in an intuitive Python-native way.
19+
* Introduce a novel graph evaluator method providing fine-grained metrics.
1820

1921
## Installation
2022

crab/actions/desktop_actions.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,14 @@
2525
DURATION = 0.8
2626
DELAY = 0.5
2727

28+
@action
29+
def set_screen_size(env) -> None:
30+
"""Set the screen size to env attribute `width` and `height`."""
31+
env.width, env.height = pyautogui.size()
2832

2933
@action
3034
def click_position(x: int, y: int) -> None:
31-
"""
32-
click on the current desktop screen.
35+
"""Click on the current desktop screen.
3336
3437
Args:
3538
x: The X coordinate, as a floating-point number in the range [0.0, 1.0].
@@ -41,8 +44,7 @@ def click_position(x: int, y: int) -> None:
4144

4245
@action(local=True)
4346
def click(element: int, env) -> None:
44-
"""
45-
Click an UI element shown on the desktop screen. A simple use case can be
47+
"""Click a UI element shown on the desktop screen. A simple use case can be
4648
click(5), which clicks the UI element labeled with the number 5.
4749
4850
Args:
@@ -55,8 +57,7 @@ def click(element: int, env) -> None:
5557

5658
@action
5759
def mouse_scroll(click: int = 1) -> None:
58-
"""
59-
Performs a scroll of the mouse scroll wheel.
60+
"""Perform a scroll of the mouse scroll wheel.
6061
6162
Args:
6263
click(int): The amount of scrolling. Default to 1.
@@ -165,8 +166,7 @@ class KeyEnum(str, Enum):
165166

166167
@action
167168
def key_press(key: KeyEnum) -> None:
168-
"""
169-
Performs a keyboard key press down, followed by a release.
169+
"""Press and release a single keyboard key.
170170
171171
Args:
172172
key (str): The key to be pressed.
@@ -177,13 +177,27 @@ def key_press(key: KeyEnum) -> None:
177177
pyautogui.press(key)
178178
time.sleep(DELAY)
179179

180+
@action
181+
def hotkey_press(keys: list[KeyEnum]) -> None:
182+
"""Press and release multiple keyboard keys at the same time.
183+
184+
For example, if you want to use the Ctrl-C hotkey to copy the selected text, you
185+
can call hotkey_press(keys=["ctrl", "c"]).
186+
187+
Args:
188+
keys: The key list to be pressed together.
189+
"""
190+
if isinstance(keys[0], KeyEnum):
191+
keys = [key.value for key in keys]
192+
pyautogui.hotkey(*keys)
193+
time.sleep(DELAY)
180194

181195
@action
182196
def write_text(text: str) -> None:
183-
"""
184-
Typing the specified text. Note: This function does not move the mouse cursor.
185-
Ensure the cursor focuses in the correct text input field before calling this
186-
function.
197+
"""Type the specified text.
198+
199+
Note: This function does not move the mouse cursor. Ensure the cursor
200+
focuses in the correct text input field before calling this function.
187201
188202
Args:
189203
text (str): The text to be typed.
@@ -194,10 +208,11 @@ def write_text(text: str) -> None:
194208

195209
@action
196210
def search_application(name: str) -> None:
197-
"""
198-
Search an application name. For exmaple, if you want to open an application named
199-
"slack", you can call search_application(name="slack"). You MUST use this action to
200-
search for applications.
211+
"""Search an application name.
212+
213+
For example, if you want to open an application named "slack", you can call
214+
search_application(name="slack"). You MUST use this action to search for
215+
applications.
201216
202217
Args:
203218
name: the application name.

crab/actions/visual_prompt_actions.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,15 @@ def groundingdino_easyocr(
274274
filtered_boxes = filter_boxes_by_overlap(filtered_boxes)
275275
result_boxes = [box[0] for box in filtered_boxes]
276276
draw_boxes(image, result_boxes, font_size)
277-
env.element_position_map = result_boxes
277+
env.element_position_map = [
278+
(
279+
box[0] / image.width,
280+
box[1] / image.height,
281+
box[2] / image.width,
282+
box[3] / image.height,
283+
)
284+
for box in result_boxes
285+
]
278286
env.ocr_results = "".join([box[1] for box in ocr_boxes])
279287
return image_to_base64(image), filtered_boxes
280288

@@ -298,4 +306,4 @@ def get_element_position(element_id, env):
298306
box = env.element_position_map[element_id]
299307
x = (box[0] + box[2]) / 2
300308
y = (box[1] + box[3]) / 2
301-
return round(x), round(y)
309+
return round(x * env.width), round(y * env.height)

crab/environments/linux.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
key_press,
1717
screenshot,
1818
search_application,
19+
set_screen_size,
1920
write_text,
2021
)
2122
from crab.core import EnvironmentConfig
@@ -25,4 +26,5 @@
2526
action_space=[click, key_press, write_text, search_application],
2627
observation_space=[screenshot],
2728
description="A Ubuntu 22.04 desktop environment with a single display.",
29+
reset=set_screen_size,
2830
)

examples/desktop_env.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,14 @@
99
create_benchmark,
1010
evaluator,
1111
)
12-
from crab.actions.desktop_actions import click, key_press, screenshot, write_text
12+
from crab.actions.desktop_actions import (
13+
click,
14+
hotkey_press,
15+
key_press,
16+
screenshot,
17+
set_screen_size,
18+
write_text,
19+
)
1320
from crab.actions.visual_prompt_actions import (
1421
get_elements_prompt,
1522
groundingdino_easyocr,
@@ -63,9 +70,10 @@ def start_benchmark(benchmark: Benchmark, agent: OpenAIAgent):
6370

6471
ENV_CONFIG = EnvironmentConfig(
6572
name="desktop",
66-
action_space=[click, key_press, write_text],
73+
action_space=[click, key_press, write_text, hotkey_press],
6774
observation_space=[screenshot],
6875
description="A desktop environment with a single display.",
76+
reset=set_screen_size,
6977
)
7078

7179
BENCHMARK_CONFIG = BenchmarkConfig(

examples/multi_env.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,5 +59,10 @@ def start_benchmark(benchmark, agent):
5959
multienv=True,
6060
model="gpt-4-turbo-preview",
6161
)
62+
print(
63+
"\033[92m"
64+
f"Start performing task: \"{task.description}\""
65+
"\033[0m"
66+
)
6267
start_benchmark(benchmark, agent)
6368
benchmark.reset()

examples/single_env.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,5 +55,10 @@ def start_benchmark(benchmark, agent):
5555
action_space,
5656
model="gpt-4-turbo-preview",
5757
)
58+
print(
59+
"\033[92m"
60+
f"Start performing task: \"{task.description}\""
61+
"\033[0m"
62+
)
5863
start_benchmark(benchmark, agent)
5964
benchmark.reset()

0 commit comments

Comments
 (0)