Merge branch 'release-v0.1.0'

dandansamax · dandansamax · commit 2739f5da21dd · 2024-05-12T22:24:59.000+03:00
diff --git a/README.md b/README.md
@@ -6,15 +6,17 @@ Crab is a framework for building LLM agent benchmark environments in a Python-ce
 
 #### Key Features
 
-* 🌐 Cross-platform
-  * Build agent environments in memory, hosted through a docker environment, installed a virtual machine, or as a distributed physical machine, as long as they can be accessed by Python functions.
-  * Let the agent access all the environments in the same time through a unified interface.
-* ⚙ ️Easy-to-use Configuration
-  * Add a new action by simply adding a `@action` decorator on a Python function.
-  * Deine the environment by integrating several actions together.
-* 📐 Novel Benchmarking Suite
-  * Define tasks and the corresponding evlauators in an intuitive Python-native way.
-  * Introduce a novel graph evaluator method proving fine-grained metrics.
+🌐 Cross-platform
+* Build agent environments that support various deployment options, including in-memory, Docker-hosted, virtual machines, or distributed physical machines, provided they are accessible via Python functions.
+* Let the agent access all the environments at the same time through a unified interface.
+
+⚙️ Easy-to-use Configuration
+* Add a new action by simply adding a `@action` decorator on a Python function.
+* Define the environment by integrating several actions together.
+
+📐 Novel Benchmarking Suite
+* Define tasks and the corresponding evaluators in an intuitive Python-native way.
+* Introduce a novel graph evaluator method providing fine-grained metrics.
 
 ## Installation
 
diff --git a/crab/actions/desktop_actions.py b/crab/actions/desktop_actions.py
@@ -25,11 +25,14 @@
 DURATION = 0.8
 DELAY = 0.5
 
+@action
+def set_screen_size(env) -> None:
+    """Set the screen size to env attribute `width` and `height`."""
+    env.width, env.height = pyautogui.size()
 
 @action
 def click_position(x: int, y: int) -> None:
-    """
-    click on the current desktop screen.
+    """Click on the current desktop screen.
 
     Args:
         x: The X coordinate, as a floating-point number in the range [0.0, 1.0].
@@ -41,8 +44,7 @@ def click_position(x: int, y: int) -> None:
 
 @action(local=True)
 def click(element: int, env) -> None:
-    """
-    Click an UI element shown on the desktop screen. A simple use case can be
+    """Click an UI element shown on the desktop screen. A simple use case can be
     click(5), which clicks the UI element labeled with the number 5.
 
     Args:
@@ -55,8 +57,7 @@ def click(element: int, env) -> None:
 
 @action
 def mouse_scroll(click: int = 1) -> None:
-    """
-    Performs a scroll of the mouse scroll wheel.
+    """Perform a scroll of the mouse scroll wheel.
 
     Args:
         click(int): The amount of scrolling. Default to 1.
@@ -165,8 +166,7 @@ class KeyEnum(str, Enum):
 
 @action
 def key_press(key: KeyEnum) -> None:
-    """
-    Performs a keyboard key press down, followed by a release.
+    """Press and release a single keyboard key.
 
     Args:
         key (str): The key to be pressed.
@@ -177,13 +177,27 @@ def key_press(key: KeyEnum) -> None:
         pyautogui.press(key)
     time.sleep(DELAY)
 
+@action
+def hotkey_press(keys: list[KeyEnum]) -> None:
+    """Press and release multiple keyboard keys at the same time.
+    
+    For example, if you want to use Ctrl-C hotkey to copy the selected text, you
+    can call hotkey_press(keys=["ctrl", "c"]).
+
+    Args:
+        keys: The key list to be pressed together.
+    """
+    if isinstance(keys[0], KeyEnum):
+        keys = [key.value for key in keys]
+    pyautogui.hotkey(*keys)
+    time.sleep(DELAY)
 
 @action
 def write_text(text: str) -> None:
-    """
-    Typing the specified text. Note: This function does not move the mouse cursor.
-    Ensure the cursor focuses in the correct text input field before calling this
-    function.
+    """Type the specified text.
+
+    Note: This function does not move the mouse cursor. Ensure the cursor
+    focuses in the correct text input field before calling this function.
 
     Args:
         text (str): The text to be typed.
@@ -194,10 +208,11 @@ def write_text(text: str) -> None:
 
 @action
 def search_application(name: str) -> None:
-    """
-    Search an application name. For exmaple, if you want to open an application named
-    "slack", you can call search_application(name="slack"). You MUST use this action to
-    search for applications.
+    """Search an application name.
+
+    For example, if you want to open an application named "slack", you can call
+    search_application(name="slack"). You MUST use this action to search for
+    applications.
 
     Args:
         name: the application name.
diff --git a/crab/actions/visual_prompt_actions.py b/crab/actions/visual_prompt_actions.py
@@ -274,7 +274,15 @@ def groundingdino_easyocr(
     filtered_boxes = filter_boxes_by_overlap(filtered_boxes)
     result_boxes = [box[0] for box in filtered_boxes]
     draw_boxes(image, result_boxes, font_size)
-    env.element_position_map = result_boxes
+    env.element_position_map = [
+        (
+            box[0] / image.width,
+            box[1] / image.height,
+            box[2] / image.width,
+            box[3] / image.height,
+        )
+        for box in result_boxes
+    ]
     env.ocr_results = "".join([box[1] for box in ocr_boxes])
     return image_to_base64(image), filtered_boxes
 
@@ -298,4 +306,4 @@ def get_element_position(element_id, env):
     box = env.element_position_map[element_id]
     x = (box[0] + box[2]) / 2
     y = (box[1] + box[3]) / 2
-    return round(x), round(y)
+    return round(x * env.width), round(y * env.height)
diff --git a/crab/environments/linux.py b/crab/environments/linux.py
@@ -16,6 +16,7 @@
     key_press,
     screenshot,
     search_application,
+    set_screen_size,
     write_text,
 )
 from crab.core import EnvironmentConfig
@@ -25,4 +26,5 @@
     action_space=[click, key_press, write_text, search_application],
     observation_space=[screenshot],
     description="A Ubuntu 22.04 desktop environment with a single display.",
+    reset=set_screen_size,
 )
diff --git a/examples/desktop_env.py b/examples/desktop_env.py
@@ -9,7 +9,14 @@
     create_benchmark,
     evaluator,
 )
-from crab.actions.desktop_actions import click, key_press, screenshot, write_text
+from crab.actions.desktop_actions import (
+    click,
+    hotkey_press,
+    key_press,
+    screenshot,
+    set_screen_size,
+    write_text,
+)
 from crab.actions.visual_prompt_actions import (
     get_elements_prompt,
     groundingdino_easyocr,
@@ -63,9 +70,10 @@ def start_benchmark(benchmark: Benchmark, agent: OpenAIAgent):
 
 ENV_CONFIG = EnvironmentConfig(
     name="desktop",
-    action_space=[click, key_press, write_text],
+    action_space=[click, key_press, write_text, hotkey_press],
     observation_space=[screenshot],
     description="A desktop environment with a single display.",
+    reset=set_screen_size,
 )
 
 BENCHMARK_CONFIG = BenchmarkConfig(
diff --git a/examples/multi_env.py b/examples/multi_env.py
@@ -59,5 +59,10 @@ def start_benchmark(benchmark, agent):
         multienv=True,
         model="gpt-4-turbo-preview",
     )
+    print(
+        "\033[92m"
+        f"Start performing task: \"{task.description}\""
+        "\033[0m"
+    )
     start_benchmark(benchmark, agent)
     benchmark.reset()
diff --git a/examples/single_env.py b/examples/single_env.py
@@ -55,5 +55,10 @@ def start_benchmark(benchmark, agent):
         action_space,
         model="gpt-4-turbo-preview",
     )
+    print(
+        "\033[92m"
+        f"Start performing task: \"{task.description}\""
+        "\033[0m"
+    )
     start_benchmark(benchmark, agent)
     benchmark.reset()

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@`
`16`	`16`	`key_press,`
`17`	`17`	`screenshot,`
`18`	`18`	`search_application,`
	`19`	`+ set_screen_size,`
`19`	`20`	`write_text,`
`20`	`21`	`)`
`21`	`22`	`from crab.core import EnvironmentConfig`
`@@ -25,4 +26,5 @@`
`25`	`26`	`action_space=[click, key_press, write_text, search_application],`
`26`	`27`	`observation_space=[screenshot],`
`27`	`28`	`description="A Ubuntu 22.04 desktop environment with a single display.",`
	`29`	`+ reset=set_screen_size,`
`28`	`30`	`)`