@@ -16,11 +16,13 @@
 import warnings
 from pathlib import Path
 from typing import Literal
+from uuid import uuid4
 
 from crab import (
     BenchmarkConfig,
     Experiment,
     MessageType,
+    Task,
     TaskGenerator,
     create_benchmark,
 )
@@ -37,6 +39,8 @@
 )
 from crab.core.agent_policy import AgentPolicy
 from crab.core.benchmark import Benchmark
+from crab.core.decorators import evaluator
+from crab.environments.macos import mac_env
 
 from .android_env import ANDROID_ENV
 from .dataset.android_subtasks import android_subtasks
@@ -79,6 +83,11 @@ def get_prompt(self):
         return result_prompt
 
 
+@evaluator(env_name="macos")
+def empty_evaluator() -> bool:
+    return False
+
+
 def get_benchmark(env: str, ubuntu_url: str):
     ubuntu_env = UBUNTU_ENV.model_copy()
     ubuntu_env.remote_url = ubuntu_url
@@ -88,6 +97,9 @@ def get_benchmark(env: str, ubuntu_url: str):
     android_tool = {
         "screenshot": groundingdino_easyocr(font_size=40) >> get_elements_prompt
     }
+    mac_tool = {
+        "screenshot": groundingdino_easyocr(font_size=24) >> get_elements_prompt
+    }
 
     if env == "ubuntu":
         prompting_tools = {"ubuntu": ubuntu_tool}
@@ -122,6 +134,22 @@ def get_benchmark(env: str, ubuntu_url: str):
             root_action_space=[complete],
             multienv=True,
         )
+    elif env == "mac":
+        task = Task(
+            description="Open firefox in both macos and android.",
+            id="0",
+            evaluator=empty_evaluator,
+        )
+        prompting_tools = {"macos": mac_tool, "android": android_tool}
+        mac_env.remote_url = "http://10.85.170.240:8000"
+        benchmark_config = BenchmarkConfig(
+            name="mac_benchmark",
+            tasks=[task],
+            environments=[mac_env, ANDROID_ENV],
+            prompting_tools=prompting_tools,
+            root_action_space=[complete],
+            multienv=True,
+        )
     else:
         raise ValueError("Env not support")
 
@@ -169,7 +197,13 @@ def get_benchmark(env: str, ubuntu_url: str):
         help="ubuntu, android or cross",
         default="cross",
     )
-    parser.add_argument("--task-id", type=str, help="task id")
+    parser.add_argument("--task-id", type=str, help="task id", default=None)
+    parser.add_argument(
+        "--task-description",
+        type=str,
+        help="task description. If provided, will overwrite the task id.",
+        default=None,
+    )
     parser.add_argument(
         "--loglevel",
         type=str,
@@ -180,20 +214,39 @@ def get_benchmark(env: str, ubuntu_url: str):
     loglevel = args.loglevel
     numeric_level = getattr(logging, loglevel.upper(), None)
     if not isinstance(numeric_level, int):
-        raise ValueError('Invalid log level: %s' % loglevel)
+        raise ValueError("Invalid log level: %s" % loglevel)
     logging.basicConfig(level=numeric_level)
 
-
     benchmark = get_benchmark(args.env, args.remote_url)
 
+    if args.task_description is not None:
+        task_id = str(uuid4())
+        benchmark.tasks = [
+            Task(
+                id=task_id,
+                description=args.task_description,
+                evaluator=empty_evaluator,
+            )
+        ]
+    else:
+        task_id = args.task_id
+
+    history_messages_len = 2
+
     if args.model == "gpt4o":
-        model = OpenAIModel(model="gpt-4o")
+        model = OpenAIModel(model="gpt-4o", history_messages_len=history_messages_len)
     elif args.policy == "gpt4turbo":
-        model = OpenAIModel(model="gpt-4-turbo")
+        model = OpenAIModel(
+            model="gpt-4-turbo", history_messages_len=history_messages_len
+        )
     elif args.policy == "gemini":
-        model = GeminiModel(model="gemini-1.5-pro-latest")
+        model = GeminiModel(
+            model="gemini-1.5-pro-latest", history_messages_len=history_messages_len
+        )
     elif args.policy == "claude":
-        model = ClaudeModel(model="claude-3-opus-20240229")
+        model = ClaudeModel(
+            model="claude-3-opus-20240229", history_messages_len=history_messages_len
+        )
     else:
         print("Unsupported model: ", args.model)
         exit()
@@ -215,7 +268,7 @@ def get_benchmark(env: str, ubuntu_url: str):
     log_dir = (Path(__file__).parent / "logs").resolve()
     expeirment = CrabBenchmarkV0(
         benchmark=benchmark,
-        task_id=args.task_id,
+        task_id=task_id,
         agent_policy=agent_policy,
         log_dir=log_dir,
     )
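
Taken together, the new --task-description flag lets the runner execute an ad-hoc task: a fresh UUID becomes the task id, and the placeholder empty_evaluator (which always returns False) is attached, so completion is never reported automatically and has to be judged from the logs. Below is a minimal usage sketch; the entry-point name and the spelling of the remote-URL flag are assumptions, not taken from this diff, while --env, --model, and --task-description come from the argument parser shown above.

# Hypothetical invocation (script name and --remote-url flag are assumed):
#   python main.py --env mac --remote-url http://localhost:8000 \
#       --model gpt4o --task-description "Open firefox in both macos and android."
#
# Roughly what the new branch builds for such an invocation:
from uuid import uuid4

from crab import Task

ad_hoc_task = Task(
    id=str(uuid4()),             # random id; any --task-id value is ignored
    description="Open firefox in both macos and android.",
    evaluator=empty_evaluator,   # placeholder defined in the diff; always False
)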