From 3bc649b7e60d6d0c38a2c200c549707f5c8a7595 Mon Sep 17 00:00:00 2001 From: Isaac Jin Date: Sun, 27 Oct 2024 18:37:58 -0500 Subject: [PATCH] draft --- gui/__init__.py | 13 +++++ gui/envs/__init__.py | 19 ++++++++ gui/envs/android.py | 40 +++++++++++++++ gui/envs/mac.py | 48 ++++++++++++++++++ gui/envs/ubuntu.py | 48 ++++++++++++++++++ gui/envs/windows.py | 47 ++++++++++++++++++ gui/host_os.py | 21 ++++++++ gui/main.py | 113 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 349 insertions(+) create mode 100644 gui/__init__.py create mode 100644 gui/envs/__init__.py create mode 100644 gui/envs/android.py create mode 100644 gui/envs/mac.py create mode 100644 gui/envs/ubuntu.py create mode 100644 gui/envs/windows.py create mode 100644 gui/host_os.py create mode 100644 gui/main.py diff --git a/gui/__init__.py b/gui/__init__.py new file mode 100644 index 0000000..66e0731 --- /dev/null +++ b/gui/__init__.py @@ -0,0 +1,13 @@ +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== diff --git a/gui/envs/__init__.py b/gui/envs/__init__.py new file mode 100644 index 0000000..5617da0 --- /dev/null +++ b/gui/envs/__init__.py @@ -0,0 +1,19 @@ +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
=========== +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +from .android import ANDROID_ENV +from .mac import MAC_ENV +from .ubuntu import UBUNTU_ENV +from .windows import WINDOWS_ENV + +__all__ = ["MAC_ENV", "UBUNTU_ENV", "WINDOWS_ENV", "ANDROID_ENV"] diff --git a/gui/envs/android.py b/gui/envs/android.py new file mode 100644 index 0000000..f7c08fc --- /dev/null +++ b/gui/envs/android.py @@ -0,0 +1,40 @@ +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
===========
+from crab import EnvironmentConfig
+from crab.actions.android_actions import (
+    key_press,
+    open_app_drawer,
+    screenshot,
+    setup,
+    swipe,
+    tap,
+    write_text,
+)
+
+ANDROID_ENV = EnvironmentConfig(  # crab environment config for the Android device
+    name="android",
+    action_space=[tap, key_press, write_text, swipe, open_app_drawer],
+    observation_space=[screenshot],
+    description=(
+        "An Android operating system running on Google Pixel smartphone. The interface "
+        "displays a current screenshot at each step and primarily supports interaction "
+        "through tapping and typing. This device offers a suite of standard "
+        "applications including Phone, Photos, Camera, Chrome, and Calendar, among "
+        "others. Access the app drawer to view all installed applications on the "
+        "device. The Google account is logged in, synchronized with the same account "
+        "used in other environments."
+    ),
+    extra_attributes={"device": None},  # NOTE(review): device is unset here; confirm
+    reset=setup,  # android_actions.setup is supplied as ``reset``
+)
diff --git a/gui/envs/mac.py b/gui/envs/mac.py
new file mode 100644
index 0000000..535cc41
--- /dev/null
+++ b/gui/envs/mac.py
@@ -0,0 +1,48 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
===========
+
+from crab.actions.desktop_actions import (
+    click,
+    key_press,
+    press_hotkey,
+    right_click,
+    screenshot,
+    search_application,
+    write_text,
+)
+from crab.core import EnvironmentConfig
+
+MAC_ENV = EnvironmentConfig(  # crab environment config for the macOS desktop
+    name="mac",
+    action_space=[
+        click,
+        key_press,
+        write_text,
+        press_hotkey,
+        search_application,
+        right_click,
+    ],
+    observation_space=[screenshot],
+    description=(
+        "A MacOS desktop operating system. The interface displays a current screenshot "
+        "at each step and primarily supports interaction via mouse and keyboard. You "
+        "are encouraged to use keyboard shortcuts and searching functionality to open"
+        " applications in the system. This device includes system-related applications "
+        "like Terminal, Finder, TextEdit, Settings, etc. For communication, Slack is "
+        "available. It also features Safari as the web browser. The Google account is "
+        "already logged in on Safari, synchronized with the same account used in other "
+        "environments."
+    ),
+    remote_url="http://127.0.0.1:8080",  # hard-coded loopback address, port 8080
+)
diff --git a/gui/envs/ubuntu.py b/gui/envs/ubuntu.py
new file mode 100644
index 0000000..c4a3d07
--- /dev/null
+++ b/gui/envs/ubuntu.py
@@ -0,0 +1,48 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
===========
+
+from crab.actions.desktop_actions import (
+    click,
+    key_press,
+    press_hotkey,
+    right_click,
+    screenshot,
+    search_application,
+    write_text,
+)
+from crab.core import EnvironmentConfig
+
+UBUNTU_ENV = EnvironmentConfig(  # crab environment config for the Ubuntu desktop
+    name="ubuntu",
+    action_space=[
+        click,
+        key_press,
+        write_text,
+        press_hotkey,
+        search_application,
+        right_click,
+    ],
+    observation_space=[screenshot],
+    description=(
+        "An Ubuntu desktop OS. The interface displays a current screenshot at each step"
+        " and primarily supports interaction via mouse and keyboard. You are encouraged"
+        " to use searching functionalities to open applications in the system. This "
+        "device includes system-related applications like Terminal, Files, Text Editor,"
+        " Vim, Settings, etc. It also features Firefox as the web browser, and the "
+        "LibreOffice suite—Writer, Calc, and Impress. For communication, Slack is "
+        "available. The Google account is logged in on Firefox, synchronized with the "
+        "same account used in other environments."
+    ),
+    remote_url="http://127.0.0.1:8080",  # hard-coded loopback address, port 8080
+)
diff --git a/gui/envs/windows.py b/gui/envs/windows.py
new file mode 100644
index 0000000..c775111
--- /dev/null
+++ b/gui/envs/windows.py
@@ -0,0 +1,47 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
===========
+
+from crab.actions.desktop_actions import (
+    click,
+    key_press,
+    press_hotkey,
+    right_click,
+    screenshot,
+    search_application,
+    write_text,
+)
+from crab.core import EnvironmentConfig
+
+WINDOWS_ENV = EnvironmentConfig(  # crab environment config for the Windows desktop
+    name="windows",
+    action_space=[
+        click,
+        key_press,
+        write_text,
+        press_hotkey,
+        search_application,
+        right_click,
+    ],
+    observation_space=[screenshot],
+    description=(
+        "A Windows desktop OS. The interface displays a current screenshot at each step"
+        " and primarily supports interaction via mouse and keyboard. You are encouraged"
+        " to use searching functionalities to open applications in the system. This "
+        "device includes system-related applications like Powershell, Explorer, "
+        "Notepad, etc. It also features Edge as the web browser. For communication, "
+        "Slack is available. The Google account is logged in on Edge, synchronized with"
+        " the same account used in other environments."
+    ),
+    remote_url="http://127.0.0.1:8080",  # hard-coded loopback address, port 8080
+)
diff --git a/gui/host_os.py b/gui/host_os.py
new file mode 100644
index 0000000..e384e70
--- /dev/null
+++ b/gui/host_os.py
@@ -0,0 +1,21 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. 
===========
+from enum import Enum
+
+
+class HostOS(str, Enum):  # host operating systems; values must stay distinct
+    WINDOWS = "windows"
+    LINUX = "ubuntu"
+    MAC = "macos"
+    UNKNOWN = "unknown"
diff --git a/gui/main.py b/gui/main.py
new file mode 100644
index 0000000..af724c1
--- /dev/null
+++ b/gui/main.py
@@ -0,0 +1,113 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+import platform
+import warnings
+
+from crab import (
+    BenchmarkConfig,
+    create_benchmark,
+)
+from crab.actions.crab_actions import complete
+from crab.actions.visual_prompt_actions import (
+    get_elements_prompt,
+    groundingdino_easyocr,
+)
+from crab.environments.macos import mac_env
+from gui.envs import ANDROID_ENV, UBUNTU_ENV, WINDOWS_ENV
+from gui.host_os import HostOS
+
+warnings.filterwarnings("ignore")
+
+# Maps ``platform.system()`` results onto the matching ``HostOS`` member.
+_HOST_OS_BY_PLATFORM = {
+    "Windows": HostOS.WINDOWS,
+    "Linux": HostOS.LINUX,
+    "Darwin": HostOS.MAC,
+}
+
+
+def check_host_os() -> HostOS:
+    """Detect the operating system this process is running on.
+
+    Returns:
+        The ``HostOS`` member for the current ``platform.system()`` value,
+        or ``HostOS.UNKNOWN`` when the platform is not recognized.
+    """
+    return _HOST_OS_BY_PLATFORM.get(platform.system(), HostOS.UNKNOWN)
+
+
+def get_benchmark(env: str, ubuntu_url: str):
+    """Create the crab benchmark for the requested environment setup.
+
+    Args:
+        env: Setup to build: "ubuntu", "android", "cross", "mac" or "windows".
+        ubuntu_url: Presumably the URL of the remote Ubuntu environment.
+            NOTE(review): currently unused; ``UBUNTU_ENV`` is used as imported.
+
+    Returns:
+        The benchmark from ``create_benchmark`` (no tasks, step limit 15).
+
+    Raises:
+        ValueError: If ``env`` is not one of the supported setups.
+    """
+    ubuntu_tool = {
+        "screenshot": groundingdino_easyocr(font_size=16) >> get_elements_prompt
+    }
+    android_tool = {
+        "screenshot": groundingdino_easyocr(font_size=40) >> get_elements_prompt
+    }
+    mac_tool = {
+        "screenshot": groundingdino_easyocr(font_size=24) >> get_elements_prompt
+    }
+
+    # env -> (benchmark name, environments, prompting tools).
+    # The "windows" setup reuses the Ubuntu screenshot tool (font size 16).
+    # NOTE(review): "mac" lists ANDROID_ENV but has no "android" prompting tool,
+    # unlike "cross" -- confirm this is intended.
+    setups = {
+        "ubuntu": ("ubuntu_benchmark", [UBUNTU_ENV], {"ubuntu": ubuntu_tool}),
+        "android": ("android_benchmark", [ANDROID_ENV], {"android": android_tool}),
+        "cross": (
+            "ubuntu_android_benchmark",
+            [UBUNTU_ENV, ANDROID_ENV],
+            {"android": android_tool, "ubuntu": ubuntu_tool},
+        ),
+        "mac": ("mac_benchmark", [mac_env, ANDROID_ENV], {"macos": mac_tool}),
+        "windows": ("windows_benchmark", [WINDOWS_ENV], {"windows": ubuntu_tool}),
+    }
+    if env not in setups:
+        raise ValueError(f"Unsupported env {env!r}; expected one of {list(setups)}")
+
+    name, environments, prompting_tools = setups[env]
+    benchmark_config = BenchmarkConfig(
+        name=name,
+        tasks=[],
+        environments=environments,
+        prompting_tools=prompting_tools,
+        root_action_space=[complete],
+        multienv=True,
+    )
+    benchmark_config.step_limit = 15
+    return create_benchmark(benchmark_config)
+
+
+def main():
+    """Print the detected host operating system."""
+    host_os = check_host_os()
+    # Use ``.value``: formatting a str-mixin Enum member differs across versions.
+    print(f"Host OS: {host_os.value}")
+
+
+if __name__ == "__main__":
+    main()