Skip to content

Commit aac92b3

Browse files
committed
do test
1 parent 8feb7df commit aac92b3

File tree

6 files changed

+275
-11
lines changed

6 files changed

+275
-11
lines changed

crab-benchmark-v0/macos_local.py

+139
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,139 @@
1+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
2+
# Licensed under the Apache License, Version 2.0 (the “License”);
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an “AS IS” BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
14+
import argparse
15+
import warnings
16+
from pathlib import Path
17+
from typing import Literal
18+
from uuid import uuid4
19+
20+
from crab import (
21+
BenchmarkConfig,
22+
Experiment,
23+
MessageType,
24+
Task,
25+
create_benchmark,
26+
)
27+
from crab.actions.crab_actions import complete
28+
from crab.actions.visual_prompt_actions import (
29+
get_elements_prompt,
30+
groundingdino_easyocr,
31+
)
32+
from crab.agents.backend_models import OpenAIModel
33+
from crab.agents.policies import (
34+
SingleAgentPolicy,
35+
)
36+
from crab.core.agent_policy import AgentPolicy
37+
from crab.core.benchmark import Benchmark
38+
from crab.core.decorators import evaluator
39+
from crab.environments.macos import mac_env
40+
41+
warnings.filterwarnings("ignore")
42+
43+
44+
class CrabBenchmarkV0(Experiment):
    """Experiment whose prompt pairs each raw screenshot with its labelled copy."""

    def __init__(
        self,
        benchmark: Benchmark,
        task_id: str,
        agent_policy: AgentPolicy | Literal["human"],
        log_dir: Path | None = None,
    ) -> None:
        """Pass every argument through, unchanged, to ``Experiment.__init__``.

        Args:
            benchmark: The benchmark the experiment runs against.
            task_id: Identifier of the task to run.
            agent_policy: The agent policy, or the literal ``"human"``.
            log_dir: Optional directory for logs.
        """
        super().__init__(benchmark, task_id, agent_policy, log_dir)

    def get_prompt(self):
        """Assemble the per-environment prompt from the current observation.

        Returns:
            A dict keyed by environment name (the ``"root"`` entry is left
            out). Each value is a list of ``(content, MessageType)`` tuples:
            a caption, the plain screenshot, a second caption, and the
            screenshot carrying element labels.
        """
        observations, prompt_outputs = self.benchmark.observe_with_prompt()

        prompts = {}
        for env_name in prompt_outputs:
            # "root" gets no screenshot messages.
            if env_name == "root":
                continue
            raw_shot = observations[env_name]["screenshot"]
            # The prompting tool output unpacks into two items; only the
            # first (the labelled image) is used here.
            labelled_shot, _ = prompt_outputs[env_name]["screenshot"]
            messages = [
                (f"Here is the current screenshot of {env_name}:", MessageType.TEXT),
                (raw_shot, MessageType.IMAGE_JPG_BASE64),
                (
                    f"Here is the screenshot with element labels of {env_name}:",
                    MessageType.TEXT,
                ),
                (labelled_shot, MessageType.IMAGE_JPG_BASE64),
            ]
            prompts[env_name] = messages
        return prompts
74+
75+
76+
@evaluator(env_name="macos")
def empty_evaluator() -> bool:
    # Stand-in evaluator for tasks created from a free-form description:
    # it takes no input and unconditionally returns False.
    return False
79+
80+
81+
def get_mac_benchmark_local(
    remote_url: str = "http://localhost:8000",
    step_limit: int = 15,
):
    """Create a macOS-only benchmark with no predefined tasks.

    Args:
        remote_url: URL assigned to the macOS environment's ``remote_url``.
            Defaults to the local server address used previously.
        step_limit: Value assigned to the benchmark config's ``step_limit``.
            Defaults to the previous hard-coded value of 15.

    Returns:
        Whatever ``create_benchmark`` returns for the assembled config.
    """
    # Configure a copy rather than the shared module-level ``mac_env`` so
    # other importers of that object do not see this function's URL.
    local_mac_env = mac_env.model_copy()
    local_mac_env.remote_url = remote_url

    mac_tool = {
        "screenshot": groundingdino_easyocr(font_size=24) >> get_elements_prompt
    }
    benchmark_config = BenchmarkConfig(
        name="mac_benchmark",
        tasks=[],  # tasks are attached by the caller after creation
        environments=[local_mac_env],
        prompting_tools={"macos": mac_tool},
        root_action_space=[complete],
        multienv=True,
    )

    benchmark_config.step_limit = step_limit
    return create_benchmark(benchmark_config)
98+
99+
100+
if __name__ == "__main__":
    # --- command line -----------------------------------------------------
    cli = argparse.ArgumentParser(
        description="Script for running benchmark with an agent."
    )
    cli.add_argument(
        "--task-description",
        required=True,
        type=str,
        help="task description. If provided, will overwrite the task id.",
    )
    cli_args = cli.parse_args()

    # --- benchmark with a single ad-hoc task ------------------------------
    mac_benchmark = get_mac_benchmark_local()

    # A fresh random id is generated on each run for the ad-hoc task.
    adhoc_task_id = str(uuid4())
    adhoc_task = Task(
        id=adhoc_task_id,
        description=cli_args.task_description,
        evaluator=empty_evaluator,
    )
    mac_benchmark.tasks = [adhoc_task]

    # --- agent ------------------------------------------------------------
    history_len = 2
    backend = OpenAIModel(model="gpt-4o", history_messages_len=history_len)
    policy = SingleAgentPolicy(model_backend=backend)

    # --- run; logs are written to a "logs" directory beside this file -----
    logs_path = (Path(__file__).parent / "logs").resolve()
    experiment = CrabBenchmarkV0(
        benchmark=mac_benchmark,
        task_id=adhoc_task_id,
        agent_policy=policy,
        log_dir=logs_path,
    )
    experiment.start_benchmark()


"""
python -m crab.server.main --HOST 0.0.0.0
python -m crab-benchmark-v0.macos_local
"""

crab-benchmark-v0/main.py

+61-8
Original file line number | Diff line number | Diff line change
@@ -16,11 +16,13 @@
1616
import warnings
1717
from pathlib import Path
1818
from typing import Literal
19+
from uuid import uuid4
1920

2021
from crab import (
2122
BenchmarkConfig,
2223
Experiment,
2324
MessageType,
25+
Task,
2426
TaskGenerator,
2527
create_benchmark,
2628
)
@@ -37,6 +39,8 @@
3739
)
3840
from crab.core.agent_policy import AgentPolicy
3941
from crab.core.benchmark import Benchmark
42+
from crab.core.decorators import evaluator
43+
from crab.environments.macos import mac_env
4044

4145
from .android_env import ANDROID_ENV
4246
from .dataset.android_subtasks import android_subtasks
@@ -79,6 +83,11 @@ def get_prompt(self):
7983
return result_prompt
8084

8185

86+
@evaluator(env_name="macos")
def empty_evaluator() -> bool:
    # Stand-in evaluator for tasks that have no real success check:
    # it takes no input and unconditionally returns False.
    return False
89+
90+
8291
def get_benchmark(env: str, ubuntu_url: str):
8392
ubuntu_env = UBUNTU_ENV.model_copy()
8493
ubuntu_env.remote_url = ubuntu_url
@@ -88,6 +97,9 @@ def get_benchmark(env: str, ubuntu_url: str):
8897
android_tool = {
8998
"screenshot": groundingdino_easyocr(font_size=40) >> get_elements_prompt
9099
}
100+
mac_tool = {
101+
"screenshot": groundingdino_easyocr(font_size=24) >> get_elements_prompt
102+
}
91103

92104
if env == "ubuntu":
93105
prompting_tools = {"ubuntu": ubuntu_tool}
@@ -122,6 +134,22 @@ def get_benchmark(env: str, ubuntu_url: str):
122134
root_action_space=[complete],
123135
multienv=True,
124136
)
137+
elif env == "mac":
138+
task = Task(
139+
description="Open firefox in both macos and android.",
140+
id="0",
141+
evaluator=empty_evaluator,
142+
)
143+
prompting_tools = {"macos": mac_tool, "android": android_tool}
144+
mac_env.remote_url = "http://10.85.170.240:8000"
145+
benchmark_config = BenchmarkConfig(
146+
name="mac_benchmark",
147+
tasks=[task],
148+
environments=[mac_env, ANDROID_ENV],
149+
prompting_tools=prompting_tools,
150+
root_action_space=[complete],
151+
multienv=True,
152+
)
125153
else:
126154
raise ValueError("Env not support")
127155

@@ -169,7 +197,13 @@ def get_benchmark(env: str, ubuntu_url: str):
169197
help="ubuntu, android or cross",
170198
default="cross",
171199
)
172-
parser.add_argument("--task-id", type=str, help="task id")
200+
parser.add_argument("--task-id", type=str, help="task id", default=None)
201+
parser.add_argument(
202+
"--task-description",
203+
type=str,
204+
help="task description. If provided, will overwrite the task id.",
205+
default=None,
206+
)
173207
parser.add_argument(
174208
"--loglevel",
175209
type=str,
@@ -180,20 +214,39 @@ def get_benchmark(env: str, ubuntu_url: str):
180214
loglevel = args.loglevel
181215
numeric_level = getattr(logging, loglevel.upper(), None)
182216
if not isinstance(numeric_level, int):
183-
raise ValueError('Invalid log level: %s' % loglevel)
217+
raise ValueError("Invalid log level: %s" % loglevel)
184218
logging.basicConfig(level=numeric_level)
185219

186-
187220
benchmark = get_benchmark(args.env, args.remote_url)
188221

222+
if args.task_description is not None:
223+
task_id = str(uuid4())
224+
benchmark.tasks = [
225+
Task(
226+
id=task_id,
227+
description=args.task_description,
228+
evaluator=empty_evaluator,
229+
)
230+
]
231+
else:
232+
task_id = args.task_id
233+
234+
history_messages_len = 2
235+
189236
if args.model == "gpt4o":
190-
model = OpenAIModel(model="gpt-4o")
237+
model = OpenAIModel(model="gpt-4o", history_messages_len=history_messages_len)
191238
elif args.policy == "gpt4turbo":
192-
model = OpenAIModel(model="gpt-4-turbo")
239+
model = OpenAIModel(
240+
model="gpt-4-turbo", history_messages_len=history_messages_len
241+
)
193242
elif args.policy == "gemini":
194-
model = GeminiModel(model="gemini-1.5-pro-latest")
243+
model = GeminiModel(
244+
model="gemini-1.5-pro-latest", history_messages_len=history_messages_len
245+
)
195246
elif args.policy == "claude":
196-
model = ClaudeModel(model="claude-3-opus-20240229")
247+
model = ClaudeModel(
248+
model="claude-3-opus-20240229", history_messages_len=history_messages_len
249+
)
197250
else:
198251
print("Unsupported model: ", args.model)
199252
exit()
@@ -215,7 +268,7 @@ def get_benchmark(env: str, ubuntu_url: str):
215268
log_dir = (Path(__file__).parent / "logs").resolve()
216269
expeirment = CrabBenchmarkV0(
217270
benchmark=benchmark,
218-
task_id=args.task_id,
271+
task_id=task_id,
219272
agent_policy=agent_policy,
220273
log_dir=log_dir,
221274
)

crab/actions/desktop_actions.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -93,10 +93,10 @@ def mouse_scroll(click: int = 1) -> None:
9393

9494

9595
class KeyEnum(str, Enum):
96-
KEY_TAB = "\t"
96+
KEY_TAB = "tab"
9797
KEY_LB = "\n"
9898
KEY_RR = "\r"
99-
KEY_SPACE = " "
99+
KEY_SPACE = "space"
100100
KEY_EXCLAMATION = "!"
101101
KEY_DQUOTE = '"'
102102
KEY_SHARP = "#"
@@ -188,6 +188,7 @@ class KeyEnum(str, Enum):
188188
KEY_UP = "up"
189189
KEY_RIGHT = "right"
190190
KEY_DOWN = "down"
191+
KEY_CMD = "command"
191192

192193

193194
@action
@@ -217,7 +218,7 @@ def press_hotkey(keys: list[KeyEnum]) -> None:
217218
"""
218219
if isinstance(keys[0], KeyEnum):
219220
keys = [key.value for key in keys]
220-
pyautogui.hotkey(*keys)
221+
pyautogui.hotkey(*keys, interval=0.25)
221222
time.sleep(DELAY)
222223

223224

crab/environments/macos.py

+45
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,45 @@
1+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
2+
# Licensed under the Apache License, Version 2.0 (the “License”);
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an “AS IS” BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
14+
from crab import action
15+
from crab.actions.crab_actions import complete, get_element_position
16+
from crab.actions.desktop_actions import (
17+
click_position,
18+
key_press,
19+
press_hotkey,
20+
right_click,
21+
screenshot,
22+
write_text,
23+
)
24+
from crab.core import EnvironmentConfig
25+
26+
27+
@action(local=True)
28+
def click(element: int, env) -> None:
29+
"""
30+
Click an UI element shown on the desktop screen. A simple use case can be
31+
click(5), which clicks the UI element labeled with the number 5.
32+
33+
Args:
34+
element: A numeric tag assigned to an UI element shown on the screenshot.
35+
"""
36+
x, y = get_element_position(element, env)
37+
env._action_endpoint(click_position, {"x": round(x / 2), "y": round(y / 2)})
38+
39+
40+
# Environment configuration named "macos": the agent may use the listed
# actions and observes the environment through screenshots only.
mac_env = EnvironmentConfig(
    name="macos",
    action_space=[
        click,
        key_press,
        write_text,
        press_hotkey,
        right_click,
        complete,
    ],
    observation_space=[screenshot],
    description="A Macbook laptop environment with a single display.",
)

crab/utils/measure.py

+13
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,16 @@
1+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
2+
# Licensed under the Apache License, Version 2.0 (the “License”);
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an “AS IS” BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
114
import logging
215
import time
316
from functools import wraps

test/actions/test_visual_prompt_actions.py

+13
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,16 @@
1+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
2+
# Licensed under the Apache License, Version 2.0 (the “License”);
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an “AS IS” BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
114
from pathlib import Path
215

316
import pytest

0 commit comments

Comments
 (0)