diff --git a/crab-benchmark-v0/README.md b/crab-benchmark-v0/README.md index 29c8db8..6965338 100644 --- a/crab-benchmark-v0/README.md +++ b/crab-benchmark-v0/README.md @@ -29,3 +29,7 @@ After setting up the environment, you can start the experiment. A brief overview 2. Start the CRAB server in the Ubuntu environment and get its IP address and port. Let's say they are `192.168.122.72` and `8000`. 3. Choose a task. As an example, we take the task with ID `a3476778-e512-40ca-b1c0-d7aab0c7f18b` from [handmade_tasks](./dataset/handmade_tasks.py). The task is: "Open the 'Tasks' app on Android, check the first incomplete task, then perform the task according to its description." 4. Run [main.py](./main.py) with the command `poetry run python -m crab-benchmark-v0.main --model gpt4o --policy single --remote-url http://192.168.122.72:8000 --task-id a3476778-e512-40ca-b1c0-d7aab0c7f18b`. In this command, `--model gpt4o` and `--policy single` determine the agent system, `--remote-url` specifies the Ubuntu environment interface, and `--task-id` indicates the task to be performed. + +#### Model + +For open-source models, we use [vLLM](https://github.com/vllm-project/vllm) to host the Pixtral model (see [here](https://docs.vllm.ai/en/latest/models/vlm.html#online-inference) for the setup commands) and [SGLang](https://github.com/sgl-project/sglang) to host the LLaVA-OneVision model (see [here](https://github.com/sgl-project/sglang?tab=readme-ov-file#supported-models) for the setup commands).
\ No newline at end of file diff --git a/crab-benchmark-v0/android_env.py b/crab-benchmark-v0/android_env.py index ec43644..d4d3c91 100644 --- a/crab-benchmark-v0/android_env.py +++ b/crab-benchmark-v0/android_env.py @@ -14,6 +14,7 @@ from crab import EnvironmentConfig from crab.actions.android_actions import ( key_press, + long_tap, open_app_drawer, screenshot, setup, @@ -24,7 +25,7 @@ ANDROID_ENV = EnvironmentConfig( name="android", - action_space=[tap, key_press, write_text, swipe, open_app_drawer], + action_space=[tap, key_press, long_tap, write_text, swipe, open_app_drawer], observation_space=[screenshot], description="""A Google Pixel smartphone runs on the Android operating system. \ The interface displays a current screenshot at each step and primarily \ diff --git a/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json b/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json new file mode 100644 index 0000000..b1a808e --- /dev/null +++ b/crab-benchmark-v0/dataset/android/1005c437-50d1-465a-b3fc-833098b22bfc.json @@ -0,0 +1,23 @@ +{ + "description": "In the Android operating system, use the \"Google Map\" app to find the city name corresponding to the postal code \"63002\" in South Korea, then use the \"Calendar\" app to add a new all-day event for 1 January 2025 with the text of the found city name.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "63002", + "country": "South Korea" + }, + "output": "Jeju" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "Jeju", + "date": "1 January 2025" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "1005c437-50d1-465a-b3fc-833098b22bfc" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json b/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json new file mode 100644 index 0000000..69aebbc 
--- /dev/null +++ b/crab-benchmark-v0/dataset/android/12333aa0-e76d-4a5c-8657-9f897f62f62d.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name for the postal code \"2770885\" in Japan, and then, using the \"Keep Notes\" app, create a new note without a title to record the city name you found.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "2770885", + "country": "Japan" + }, + "output": "Chiba" + }, + { + "task": "eb92a1e6-4c86-4d56-baac-95fc8397732e", + "attribute": { + "content": "Chiba" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "12333aa0-e76d-4a5c-8657-9f897f62f62d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json b/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json new file mode 100644 index 0000000..2275b01 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/2ade6a13-c7a6-4df7-8c62-77382687369e.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, using the \"Contacts\" app, find the email of the contact named John Lauphin, then using the \"Gmail\" app, send an email to that contact with the subject \"Hello John.\"", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "John Lauphin" + }, + "output": "crabbb@gmail.com" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Hello John", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "2ade6a13-c7a6-4df7-8c62-77382687369e" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json b/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json new file mode 100644 index 0000000..5ff2b91 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d.json @@ -0,0 +1,15 @@ +{ + 
"description": "In Android, Using Google Map app, Find the city name of corresponding post code \"1010021\" in the country \"Japan\".", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "country": "Japan", + "number": "101-0021" + }, + "output": "Tokyo" + } + ], + "adjlist": "0", + "id": "4190c90c-b28c-4bb3-ab5c-af3c4fde0a3d" +} diff --git a/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json b/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json new file mode 100644 index 0000000..95962b7 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/483fbf9c-dc78-4ac2-9264-53c4f617f6cc.json @@ -0,0 +1,21 @@ +{ + "description": "Open the calendar app in the Android system and find the title of an event on the date \"17 August 2024,\" then using the \"Google Drive\" app on the same Android device, create a new folder with the founded name", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "17 August 2024" + }, + "output": "Travel to Paris" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "Travel to Paris" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "483fbf9c-dc78-4ac2-9264-53c4f617f6cc" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json b/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json new file mode 100644 index 0000000..5df87bb --- /dev/null +++ b/crab-benchmark-v0/dataset/android/4893a9b0-6477-495d-a73c-32503326e24a.json @@ -0,0 +1,14 @@ +{ + "description": "In the Android system, use the calendar app to find the title of an event on the date \"16 July 2024,\".", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "16 July 2024" + }, + "output": "Japan" + } + ], + "adjlist": "0", + "id": "4893a9b0-6477-495d-a73c-32503326e24a" +} diff --git 
a/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json b/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json new file mode 100644 index 0000000..05c7451 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/53010c40-dce4-4d72-a856-842c21059e2b.json @@ -0,0 +1,22 @@ +{ + "description": "In the Android system, use the calendar app to find the title of an event on the date \"16 July 2024,\" then, using the Google Map app, find the city name of the corresponding post code \"113-8654\" in the country with same name as title.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "16 July 2024" + }, + "output": "Japan" + }, + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "113-8654", + "country": "Japan" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "53010c40-dce4-4d72-a856-842c21059e2b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json b/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json new file mode 100644 index 0000000..f13b507 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/71ef7fd2-0ae3-49c8-8238-06b7aa985d25.json @@ -0,0 +1,23 @@ +{ + "description": "Using the \"Google Map\" app on Android, find the distance of the shortest route from \"National University of Singapore\" to \"Nanyang Technology University,\" then using the \"Calendar\" app, add a new event with the text representing the found distance on the date 21 June 2024 as an all-day event.", + "tasks": [ + { + "task": "1a1b72d7-78c9-4027-8278-86083ae01045", + "attribute": { + "place_name_1": "National University of Singapore", + "place_name_2": "Nanyang Technology University" + }, + "output": "13km" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "13km", + "date": "21 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", 
+ "id": "71ef7fd2-0ae3-49c8-8238-06b7aa985d25" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json b/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json new file mode 100644 index 0000000..41c48a4 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/7891ceab-7965-4ddb-a0fc-15740c9a4e44.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, Using \"Google Map\" app, find the city name of corresponding post code \"560049\" in the country \"India\". Creat a folder with the city name in \"Google Drive \" app", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "country": "India", + "number": "560049" + }, + "output": "Bengaluru" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "Bengaluru" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "7891ceab-7965-4ddb-a0fc-15740c9a4e44" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json b/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json new file mode 100644 index 0000000..c8f6ed8 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/8bd51440-f959-4edc-baa5-cd03d32a5b0f.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the address of the University of Sydney, then using the \"Gmail\" app, send a message to crabbb@gmail.com with the found address.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "The University of Sydney" + }, + "output": "Camperdown NSW 2050 Australia" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Camperdown NSW 2050 Australia", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "8bd51440-f959-4edc-baa5-cd03d32a5b0f" +} \ No newline at end of file diff --git 
a/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json b/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json new file mode 100644 index 0000000..137e20f --- /dev/null +++ b/crab-benchmark-v0/dataset/android/94b1836b-3111-40ad-8d07-b8a57efe7438.json @@ -0,0 +1,22 @@ +{ + "description": "In an Android system, use the calendar app to find the title of an event on the date \"9 August 2024\", and then, using the Gmail app, send an email to crabbb@gmail.com with the event title as message.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "9 August 2024" + }, + "output": "National Day of Singapore would be a public holiday" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "National Day of Singapore would be a public holiday", + "mail": "crabbb@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "94b1836b-3111-40ad-8d07-b8a57efe7438" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json b/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json new file mode 100644 index 0000000..e7ee4b8 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/a225f7f8-6d03-4619-b57d-7a08610030d8.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, Using \"Google Map\" app, Find the address of \"University of Oxford\" and send \"98801234\" the address using \"message\" App. 
", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "University of Oxford" + }, + "output": "Wellington Square, Oxford OX1 2JD, United Kingdom" + }, + { + "task": "caa29623-1811-402d-963a-19f7eecc63d8", + "attribute": { + "content": "Wellington Square, Oxford OX1 2JD, United Kingdom", + "number": "98801234" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "a225f7f8-6d03-4619-b57d-7a08610030d8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json b/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json new file mode 100644 index 0000000..aabd243 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/b3965b07-4683-4445-9de1-a1dedf6c73ad.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, Using \"Google Map\" app, Find the address of \"University of Oxford\" and send \"abcdcly@qq.com\" the address using \"Gmail\" App. ", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548aw", + "attribute": { + "content": "University of Oxford" + }, + "output": "Wellington Square, Oxford OX1 2JD, United Kingdom" + }, + { + "task": "0090f116-e02b-4562-a20d-b5df38be963a", + "attribute": { + "content": "Wellington Square, Oxford OX1 2JD, United Kingdom", + "mail": "abcdcly@qq.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "b3965b07-4683-4445-9de1-a1dedf6c73ad" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json b/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json new file mode 100644 index 0000000..2d2c72f --- /dev/null +++ b/crab-benchmark-v0/dataset/android/cf4c496b-fbbd-4701-91ea-4590fe6a66e1.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postcode \"110151\" in Colombia, then use the \"Clock\" app to set the time of that city in the clock 
and check the time gap between that city and your current city.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "110151", + "country": "Columbia" + }, + "output": "Bogota" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ah", + "attribute": { + "place_name": "Bogota" + }, + "output": "-5h" + } + ], + "adjlist": "0 1\n1", + "id": "cf4c496b-fbbd-4701-91ea-4590fe6a66e1" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json b/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json new file mode 100644 index 0000000..8372ca5 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d0811e47-d75f-40ce-b34b-e1ee3c8bed3f.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, first use the \"Files\" app to find the creation date of the file /Movies/movie_list.txt, then use the \"Calendar\" app to add a new event titled \"Public Talking\" scheduled for all day on the founded day.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ak", + "attribute": { + "file_path": "/Movies/movie_list.txt" + }, + "output": "4 June 2024" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ac", + "attribute": { + "content": "Public Talking", + "date": "4 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d0811e47-d75f-40ce-b34b-e1ee3c8bed3f" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json b/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json new file mode 100644 index 0000000..9050070 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d7489d00-0046-4fb1-af5b-1fde7d87312c.json @@ -0,0 +1,21 @@ +{ + "description": "In Android, open the \"Contacts\" app to find the email address of the contact named Karoon Wei, then use the \"Tasks\" app to add a new task with the email address.", + "tasks": [ + { + "task": 
"a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "Karoon Wei" + }, + "output": "karroonw@gmail.com" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548af", + "attribute": { + "content": "karroonw@gmail.com" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d7489d00-0046-4fb1-af5b-1fde7d87312c" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json b/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json new file mode 100644 index 0000000..9e4fe8b --- /dev/null +++ b/crab-benchmark-v0/dataset/android/d92f6c33-e0a7-4101-957d-e7dd218d2565.json @@ -0,0 +1,21 @@ +{ + "description": "Using the \"Files\" app on an Android device, locate the file /Movies/movie_list.txt and determine its creation date, then use the Task app in the same Android system to find the title of an event scheduled for the days.", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ak", + "attribute": { + "file_path": "/Movies/movie_list.txt" + }, + "output": "4 June 2024" + }, + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "4 June 2024" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d92f6c33-e0a7-4101-957d-e7dd218d2565" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json b/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json new file mode 100644 index 0000000..450ac2c --- /dev/null +++ b/crab-benchmark-v0/dataset/android/e55d7a39-7b6b-4852-8711-844cebc88cb8.json @@ -0,0 +1,15 @@ +{ + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postcode \"110151\" in Colombia.", + "tasks": [ + { + "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", + "attribute": { + "number": "110151", + "country": "Columbia" + }, + "output": "Bogota" + } + ], + "adjlist": "0", + "id": 
"e55d7a39-7b6b-4852-8711-844cebc88cb8" +} diff --git a/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json b/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json new file mode 100644 index 0000000..334ef3d --- /dev/null +++ b/crab-benchmark-v0/dataset/android/e9268070-91b7-4e8c-9976-1cf8126ba13b.json @@ -0,0 +1,21 @@ +{ + "description": "In the Android system, use the task app to find the title of an event on the date \"15 June 2024\", then using the \"Google Drive\" app, create a new folder named as the title we found.", + "tasks": [ + { + "task": "2394b768-2ca7-45e9-b41e-2aa4e9573192", + "attribute": { + "date": "15 June 2024" + }, + "output": "EMNLP24 DDL" + }, + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ar", + "attribute": { + "content": "EMNLP24 DDL" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "e9268070-91b7-4e8c-9976-1cf8126ba13b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json b/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json new file mode 100644 index 0000000..2acd5b4 --- /dev/null +++ b/crab-benchmark-v0/dataset/android/fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61.json @@ -0,0 +1,22 @@ +{ + "description": "In Android, open the \"Contacts\" app to find the email address of a contact named Luis Martin, then use the \"Messages\" app to send the found email address to the phone number \"04055891132\".", + "tasks": [ + { + "task": "a3d11574-2acf-4b26-a569-a5dbc9d548ap", + "attribute": { + "name": "Luis Martin" + }, + "output": "lmartin0431@gmail.com" + }, + { + "task": "caa29623-1811-402d-963a-19f7eecc63d8", + "attribute": { + "content": "lmartin0431@gmail.com", + "number": "04055891132" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "fbe6a1b1-63bb-4d4e-8a53-ff4f7839ef61" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/android_subtasks.py 
b/crab-benchmark-v0/dataset/android_subtasks.py index 02c7960..80be4be 100644 --- a/crab-benchmark-v0/dataset/android_subtasks.py +++ b/crab-benchmark-v0/dataset/android_subtasks.py @@ -361,6 +361,8 @@ def check_event(date: str, env) -> bool: event_nodes = root.xpath('//node[@class="android.support.v7.widget.RecyclerView"]') if event_nodes is None: return False + if not event_nodes: + return False for node in event_nodes[0]: text = node.get("content-desc") if date in text: diff --git a/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json b/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json index 2e6c9f0..5837689 100644 --- a/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json +++ b/crab-benchmark-v0/dataset/cross/05a7633d-b966-471c-8848-e18e69ad265f.json @@ -1,5 +1,5 @@ { - "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postal code \"1010021\" in Japan, then paste the name into LibreOffice Writer on an Ubuntu system and save it as an ODT file at \"/home/crab/Desktop\".", + "description": "In Android, use the \"Google Map\" app to find the city name corresponding to the postal code \"1010021\" in Japan, then paste the name into LibreOffice Writer on an Ubuntu system and save it as an ODT file at \"/home/crab/Desktop/target.opt\".", "tasks": [ { "task": "51b2463c-9904-4a32-81ba-507bfb89d61f", diff --git a/crab-benchmark-v0/dataset/handmade_tasks.py b/crab-benchmark-v0/dataset/handmade_tasks.py index ef20a70..f3a15f4 100644 --- a/crab-benchmark-v0/dataset/handmade_tasks.py +++ b/crab-benchmark-v0/dataset/handmade_tasks.py @@ -24,8 +24,10 @@ from .android_subtasks import ( check_current_package_name, + check_google_tasks_name, check_message_text_box_contain, check_message_text_box_empty, + check_note_content, get_xml_etree, ) from .ubuntu_subtasks import * # noqa: F403 @@ -148,6 +150,44 @@ def check_keep_notes_content(text: str, env) -> bool: return 
text_nodes[0].get("text") == text +@evaluator(env_name="android") +def check_keep_notes_contain_fd(env) -> bool: + global RESULT_fd0576be + text = RESULT_fd0576be + root = get_xml_etree(env) + if root is None or text is None: + return False + edit_node = root.xpath( + '//node[@resource-id="com.google.android.keep:id/editor_bottom_bar"]' + ) + if len(edit_node) != 1: + return False + content_node = root.xpath( + '//node[@resource-id="com.google.android.keep:id/browse_note_interior_content"]' + ) + for node in content_node: + text_nodes = node.getchildren() + if len(text_nodes) != 1: + continue + if text in text_nodes[0].get("text"): + return True + return False + + +@evaluator(env_name="android") +def check_alarm_contains(time: str, env) -> bool: + root = get_xml_etree(env) + if root is None or time is None: + return False + clock_node = root.xpath( + '//node[@resource-id="com.google.android.deskclock:id/digital_clock"]' + ) + for node in clock_node: + if time == node.get("text"): + return True + return False + + @evaluator(env_name="android", local=True) def check_tap_text(text: str, env) -> bool: if env.trajectory: @@ -361,6 +401,112 @@ def evaluator_ca79febf(): return result +def evaluator_dfabf84c(): + result = nx.DiGraph() + keyword = "kaust" + a = check_text_in_current_window_name("Mozilla Firefox") + b = check_contain_input_text(keyword) + c = is_img_url_in_clipboard() + d = download_from_clipboard_and_verify_file("/home/crab/Desktop/download.jpg") + e = check_current_package_name("com.google.android.keep") + f = check_contain_input_text(keyword) + g = check_note_content(keyword) + result.add_edges_from([(a, b), (b, c), (c, d), (d, g)]) + result.add_edges_from([(b, e), (e, f), (f, g)]) + return result + + +def evaluator_aab5555e(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text("uname -a") + d = check_current_package_name("com.google.android.apps.messaging") + e = 
check_message_text_box_contain("ubuntu") + f = check_message_text_box_contain("x86") + g = check_message_text_box_contain("linux") + h = check_message_text_box_contain("crab") + sink = check_message_text_box_empty() + result.add_edges_from( + [ + (a, b), + (b, sink), + (d, e), + (d, f), + (d, g), + (d, h), + (e, sink), + (f, sink), + (g, sink), + (h, sink), + ] + ) + return result + + +RESULT_fd0576be = None + + +@action(env_name="ubuntu") +def get_root_usage() -> str: + try: + output = subprocess.check_output(["df", "/"], text=True) + return output.split("\n")[1].split()[4][:-1] + except Exception: + return None + + +@evaluator(env_name="ubuntu", local=True) +def check_contain_input_text_and_get_df_result(text: str, env) -> bool: + global RESULT_fd0576be + RESULT_fd0576be = env._action_endpoint(get_root_usage, parameters={}) + if env.trajectory: + inputs = [ + params["text"].lower() + for action_name, params, _ in env.trajectory + if action_name == "write_text" + ] + return any(text.lower() in input_text for input_text in inputs) + + return False + + +def evaluator_fd0576be(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text_and_get_df_result("df") + c = check_current_package_name("com.google.android.keep") + d = check_keep_notes_contain_fd() + result.add_edges_from([(a, b), (b, d), (c, d)]) + return result + + +def evaluator_7e08f7d4(): + result = nx.DiGraph() + a = check_text_in_current_window_name("Mozilla Firefox") + b = check_contain_input_text( + "https://farm9.staticflickr.com/8293/7591378270_76059bc1cf_z.jpg" + ) + c = check_current_package_name("com.android.deskclock.DeskClock") + d = check_alarm_contains("7:00\u200aAM") + result.add_edges_from([(a, b), (b, d), (c, d)]) + return result + + +def evaluator_4957e964(): + result = nx.DiGraph() + a = check_current_window_process("gnome-terminal-server") + b = check_contain_input_text("wget") + c = check_contain_input_text( + 
"https://farm8.staticflickr.com/7451/10001676353_fd762e02f0_z.jpg" + ) + d = check_file_exist("/home/crab/Desktop/download.jpg") + e = check_text_in_current_window_name("Image Viewer") + f = check_current_package_name("com.google.android.apps.tasks") + g = check_google_tasks_name("tennis") + result.add_edges_from([(a, b), (b, c), (c, d), (d, e), (e, g), (f, g)]) + return result + + # Hand-made environment setup guide: # Ubuntu # * Make sure the Ubuntu slack login, and the default channel has at least two messages @@ -370,7 +516,40 @@ def evaluator_ca79febf(): # * Make sure the init page of "Calendar" app is "Day" view. There should be at least one element today. -handmade_tasks = [ +ubuntu_handmade_tasks = [ + Task( + id="82efbd82-c941-4be9-9ac0-a495dc629e02", + description='Download an image file from a given URL "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" to "/home/crab/Downloads/raw.jpg", then use GIMP (GNU Image Manipulation Program) to adjust the brightness of the image from "/home/crab/Downloads/raw.jpg" to be brighter and save the edited file to "/home/crab/Pictures/edited.jpg", and set the adjusted image "/home/crab/Pictures/edited.jpg" as the screen background of the system.', + evaluator=evaluator_82efbd82(), + ), + Task( + id="515a5467-b7ce-4cad-874d-da894361c1a3", + description='Download two image files from given URLs "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" and "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png" to "/home/crab/Downloads/img_1.jpg" and "/home/crab/Downloads/img_2.jpg", combine the first image ("/home/crab/Downloads/img_1.jpg") with the second image ("/home/crab/Downloads/img_2.jpg") using GIMP (GNU Image Manipulation Program) by placing the first image on the right side of the second image, and save the resulting 
combined image to "/home/crab/Downloads/combined_editing.jpg". Then, create a new directory "/home/crab/jpg" and copy all files with the specified "jpg" extension from "/home/crab/Downloads" to the newly created directory "/home/crab/jpg".', + evaluator=evaluator_515a5467(), + ), + Task( + id="5a1eba49-ed2d-4955-a684-32472090a45b", + description='Use Firefox to search for an image using the keyword "GPU", copy the URL of the found image to the clipboard, download the image file from the URL stored in the clipboard to "/home/crab/Pictures/GPU.png", and create a new directory "/home/crab/Pictures/png_files" to copy all files with the specified "png" extension from "/home/crab/Pictures" to the newly created directory "/home/crab/Pictures/png_files".', + evaluator=evaluator_5a1eba49(), + ), + Task( + id="c347f78a-4643-43c8-b41e-e437b70a2c5e", + description='Open a file at "/home/crab/assets/content.txt" using vim in a terminal, write the specified "An air quality health advisory is in effect Tuesday for New York City and the lower Hudson Valley, as well as western Connecticut and northern New Jersey, meaning it may not be safe for people with some conditions to be outside long." to it, then save and exit vim. Print the content of the file by printing it to the command line interface through a terminal, and finally, submit the printed content.', + evaluator=evaluator_c347f78a(), + ), + Task( + id="bf83c176-fa15-4057-996f-f75be4338c05", + description='Use Firefox to search for an image using the keyword "Waymo" first, copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/waymo.jpg". Then, search for another image using the keyword "Tesla", copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/tesla.png". 
Finally, combine the two images using LibreOffice Impress, placing Image 1 from "/home/crab/Desktop/waymo.jpg" on the left side of Image 2 "/home/crab/Desktop/tesla.png", and save the resulting file in PDF format to "/home/crab/Documents/self_driving.pdf".', + evaluator=evaluator_bf83c176(), + ), + Task( + id="74bb11dd-89ca-43d0-8edf-fe7b5201ecf7", + description='Use Firefox to search for information about the country "France" on Wikipedia. Extract the capital city and population, and save this information in an ODS file at "/home/crab/Documents/FR.ods" using LibreOffice Calc. Then, search for information about the country "Mexico" on Wikipedia, extract the capital city and population, and save this information in a separate ODS file at "/home/crab/Documents/MX.ods" using LibreOffice Calc. The format of the file are, first column for the country name, the second for the capital city name, and the third for the population without any header. Finally, create a new directory "/home/crab/Desktop/country_info" and copy all files with the specified "ods" extension from "/home/crab/Documents" to the newly created directory "/home/crab/Desktop/country_info".', + evaluator=evaluator_74bb11dd(), + ), +] + +corss_environment_tasks = [ Task( id="79832e15-5fd3-43b8-b3e3-66249edfe1db", description='Open slack in Ubuntu desktop, summarize the last two messages in current channel, then use "Messages" app in android phone to send the summary to the first contact in the list.', @@ -401,38 +580,35 @@ def evaluator_ca79febf(): evaluator=evaluator_97e6f333(), ), Task( - id="82efbd82-c941-4be9-9ac0-a495dc629e02", - description='Download an image file from a given URL "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" to "/home/crab/Downloads/raw.jpg", then use GIMP (GNU Image Manipulation Program) to adjust the brightness of the image from "/home/crab/Downloads/raw.jpg" to be brighter and save the edited file to 
"/home/crab/Pictures/edited.jpg", and set the adjusted image "/home/crab/Pictures/edited.jpg" as the screen background of the system.', - evaluator=evaluator_82efbd82(), + id="ca79febf-cae7-4669-8812-d3ec85ee2868", + description="Open the first note in the Keep Notes app on Android, copy its contents, and paste them into a new document in Google docs. Then, open the newly created document in Firefox on Ubuntu.", + evaluator=evaluator_ca79febf(), ), Task( - id="515a5467-b7ce-4cad-874d-da894361c1a3", - description='Download two image files from given URLs "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg" and "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png" to "/home/crab/Downloads/img_1.jpg" and "/home/crab/Downloads/img_2.jpg", combine the first image ("/home/crab/Downloads/img_1.jpg") with the second image ("/home/crab/Downloads/img_2.jpg") using GIMP (GNU Image Manipulation Program) by placing the first image on the right side of the second image, and save the resulting combined image to "/home/crab/Downloads/combined_editing.jpg". Then, create a new directory "/home/crab/jpg" and copy all files with the specified "jpg" extension from "/home/crab/Downloads" to the newly created directory "/home/crab/jpg".', - evaluator=evaluator_515a5467(), + id="dfabf84c-d05f-4e25-9f21-ba0f08107bd5", + description='Use Firefox to search for an image using the keyword "kaust" and copy the URL of the image to the clipboard. Download a file from the URL stored in the clipboard to "/home/crab/Desktop/download.jpg". 
Then describe this image and save it in the Android Keep Notes app.', + evaluator=evaluator_dfabf84c(), ), Task( - id="5a1eba49-ed2d-4955-a684-32472090a45b", - description='Use Firefox to search for an image using the keyword "GPU", copy the URL of the found image to the clipboard, download the image file from the URL stored in the clipboard to "/home/crab/Pictures/GPU.png", and create a new directory "/home/crab/Pictures/png_files" to copy all files with the specified "png" extension from "/home/crab/Pictures" to the newly created directory "/home/crab/Pictures/png_files".', - evaluator=evaluator_5a1eba49(), + id="aab5555e-4b72-4ebf-816a-59c1da2cec86", + description="Check the all uname information of the system in Ubuntu, then explain the information to the first contact in the list of the Messages app in Android.", + evaluator=evaluator_aab5555e(), ), Task( - id="c347f78a-4643-43c8-b41e-e437b70a2c5e", - description='Open a file at "/home/crab/assets/content.txt" using vim in a terminal, write the specified "An air quality health advisory is in effect Tuesday for New York City and the lower Hudson Valley, as well as western Connecticut and northern New Jersey, meaning it may not be safe for people with some conditions to be outside long." to it, then save and exit vim. Print the content of the file by printing it to the command line interface through a terminal, and finally, submit the printed content.', - evaluator=evaluator_c347f78a(), + id="fd0576be-8b2c-45ce-b4a2-78659740879b", + description="Check the current disk usage through command line in Ubuntu, check the root directory usage in percentage and save the information to a note in Keep Notes app in Android.", + evaluator=evaluator_fd0576be(), ), Task( - id="bf83c176-fa15-4057-996f-f75be4338c05", - description='Use Firefox to search for an image using the keyword "Waymo" first, copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/waymo.jpg". 
Then, search for another image using the keyword "Tesla", copy the URL of the image to the clipboard, and download the image to "/home/crab/Desktop/tesla.png". Finally, combine the two images using LibreOffice Impress, placing Image 1 from "/home/crab/Desktop/waymo.jpg" on the left side of Image 2 "/home/crab/Desktop/tesla.png", and save the resulting file in PDF format to "/home/crab/Documents/self_driving.pdf".', - evaluator=evaluator_bf83c176(), + id="7e08f7d4-9b11-4aec-9b42-6cbde083fb4c", + description='Use firefox on Ubuntu to openup the image "https://farm9.staticflickr.com/8293/7591378270_76059bc1cf_z.jpg", check the time of the clock in the image, then open the clock app in Android and set an alarm to the same as the image.', + evaluator=evaluator_7e08f7d4(), ), Task( - id="74bb11dd-89ca-43d0-8edf-fe7b5201ecf7", - description='Use Firefox to search for information about the country "France" on Wikipedia. Extract the capital city and population, and save this information in an ODS file at "/home/crab/Documents/FR.ods" using LibreOffice Calc. Then, search for information about the country "Mexico" on Wikipedia, extract the capital city and population, and save this information in a separate ODS file at "/home/crab/Documents/MX.ods" using LibreOffice Calc. The format of the file are, first column for the country name, the second for the capital city name, and the third for the population without any header. Finally, create a new directory "/home/crab/Desktop/country_info" and copy all files with the specified "ods" extension from "/home/crab/Documents" to the newly created directory "/home/crab/Desktop/country_info".', - evaluator=evaluator_74bb11dd(), - ), - Task( - id="ca79febf-cae7-4669-8812-d3ec85ee2868", - description="Open the first note in the Keep Notes app on Android, copy its contents, and paste them into a new document in Google docs. 
Then, open the newly created document in Firefox on Ubuntu.", - evaluator=evaluator_ca79febf(), + id="4957e964-5dd5-42f6-9d5d-f6a53a9a5d94", + description='Use wget to download the image "https://farm8.staticflickr.com/7451/10001676353_fd762e02f0_z.jpg" to /home/crab/Desktop/download.jpg, what does the people in the image do? Create a task in the Tasks app in Android to remind you to do the same thing.', + evaluator=evaluator_4957e964(), ), ] + +handmade_tasks = ubuntu_handmade_tasks + corss_environment_tasks diff --git a/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json b/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json new file mode 100644 index 0000000..391e321 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/0deafe05-8db5-445f-9031-f6e884569d03.json @@ -0,0 +1,25 @@ +{ + "description": "Create a new directory \"/home/crab/jpg_folder\", copy all files with the \"jpg\" extension from \"/home/crab/Pictures\" to this newly created directory, then open LibreOffice Impress to combine the two images located at \"/home/crab/jpg_folder/dog.jpg\" (Image 1) and \"/home/crab/jpg_folder/Interstellar.jpg\" (Image 2), placing Image 1 on the right side of Image 2, and save the combined image in PDF format to \"/home/crab/Documents/combination.pdf\".", + "tasks": [ + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "jpg", + "source_dir": "/home/crab/Pictures", + "target_dir": "/home/crab/jpg_folder" + }, + "output": "/home/crab/jpg_folder" + }, + { + "task": "467f17a6-c42f-4eda-996f-a53385eb3efd", + "attribute": { + "image_path_1": "/home/crab/jpg_folder/dog.jpg", + "image_path_2": "/home/crab/jpg_folder/Interstellar.jpg", + "output_path": "/home/crab/Documents/combination.pdf" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "0deafe05-8db5-445f-9031-f6e884569d03" +} \ No newline at end of file diff --git 
a/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json b/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json new file mode 100644 index 0000000..8f88774 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/15a150a8-899c-4753-8dc5-05248ccc3640.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg\" to the location \"/home/crab/Downloads/fiji.png\", and then set \"/home/crab/Downloads/fiji.png\" as the desktop background on the system.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://media.cntraveller.com/photos/642aa1ad770beda2d4f5cc22/4:3/w_2664,h_1998,c_limit/Fiji-march2023issue-JackJohns15.jpg", + "file_path": "/home/crab/Downloads/fiji.png" + }, + "output": "/home/crab/Downloads/fiji.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Downloads/fiji.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "15a150a8-899c-4753-8dc5-05248ccc3640" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json b/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json new file mode 100644 index 0000000..aff7c6e --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/299db8f2-81eb-455f-9302-5c8cb30be691.json @@ -0,0 +1,23 @@ +{ + "description": "Combine two images, Image 1 \"/home/crab/Pictures/Interstellar.jpg\" and Image 2 \"/home/crab/Pictures/cat.png\", using GIMP (GNU Image Manipulation Program) with Image 1 placed on the left side of Image 2, and save the resulting image to \"/home/crab/Pictures/edited_background.png\". 
Then, set \"/home/crab/Pictures/edited_background.png\" as the desktop background on the system.", + "tasks": [ + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/Pictures/Interstellar.jpg", + "image_path_2": "/home/crab/Pictures/cat.png", + "output_path": "/home/crab/Pictures/edited_background.png" + }, + "output": "/home/crab/Pictures/edited_background.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/edited_background.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "299db8f2-81eb-455f-9302-5c8cb30be691" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json b/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json new file mode 100644 index 0000000..0b9ee8d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/29f099b2-b3a5-463f-b10a-15363bf7e845.json @@ -0,0 +1,22 @@ +{ + "description": "Use Firefox to search for a \"garden\" around \"ETH Zurich\" on Google Maps, copy the sharing URL of that \"garden\" to the clipboard, then paste the content into Visual Studio Code (VS Code) and save the file at \"/home/crab/eth_garden.txt\".", + "tasks": [ + { + "task": "2b189dc2-c77f-4fa3-8432-ba4355cc294c", + "attribute": { + "place_type": "garden", + "place_name": "ETH Zurich" + }, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/eth_garden.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "29f099b2-b3a5-463f-b10a-15363bf7e845" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json b/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json new file mode 100644 index 0000000..6c7ce88 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/51c91051-3efb-4e92-a967-739b18520714.json @@ -0,0 +1,19 
@@ +{ + "description": "Open Firefox and search for the torch.matmul example provided by the official PyTorch version 1.13 documentation, copy all the lines of code from the example, open Visual Studio Code (VS Code), paste the clipboard content into a new file, and save it as \"/home/crab/example.py\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/example.py" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "51c91051-3efb-4e92-a967-739b18520714" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json b/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json new file mode 100644 index 0000000..8c52c37 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8.json @@ -0,0 +1,25 @@ +{ + "description": "Create a new directory \"/home/crab/assets_for_edit\" and copy all files with the \"png\" extension from \"/home/crab/assets\" to this new directory. 
Then, combining Image 1 \"/home/crab/assets_for_edit/background.png\" and Image 2 \"/home/crab/assets_for_edit/campus.png\" with LibreOffice Writer, place Image 1 above Image 2, and save the file in the ODT format to \"/home/crab/assets_for_edit/back_n_campus.odt\".", + "tasks": [ + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "png", + "source_dir": "/home/crab/assets", + "target_dir": "/home/crab/assets_for_edit" + }, + "output": "/home/crab/assets_for_edit" + }, + { + "task": "0111384f-38ca-41a2-9504-cb1c55002b3c", + "attribute": { + "image_path_1": "/home/crab/assets_for_edit/background.png", + "image_path_2": "/home/crab/assets_for_edit/campus.png", + "output_path": "/home/crab/assets_for_edit/back_n_campus.odt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "57b7e8a7-8c17-4cc4-9bb5-4385afde3ad8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json b/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json new file mode 100644 index 0000000..0a16cf3 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/5ba74c6a-4513-448b-8b68-ff145ece0652.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://raw.githubusercontent.com/camel-ai/camel/master/README.md\" to \"/home/crab/Documents/README.md\", and then print the content of \"/home/crab/Documents/README.md\" to the command line interface through a terminal.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://raw.githubusercontent.com/camel-ai/camel/master/README.md", + "file_path": "/home/crab/Documents/README.md" + }, + "output": "/home/crab/Documents/README.md" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/Documents/README.md" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "5ba74c6a-4513-448b-8b68-ff145ece0652" +} \ No newline at 
end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json b/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json new file mode 100644 index 0000000..04f5684 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6428f803-62de-40d2-a345-64e6cf955c9d.json @@ -0,0 +1,24 @@ +{ + "description": "First, use LibreOffice Impress to adjust the brightness of the image located at \"/home/crab/Pictures/cat.png\" to make it darker, and save the edited image as \"/home/crab/Pictures/cat_edited.png\". Then, using GIMP (GNU Image Manipulation Program), combine the image \"/home/crab/Pictures/dog.png\" with \"/home/crab/Pictures/cat_edited.png\" by placing the dog image on the left side of the cat image, and save the merged image to \"/home/crab/Pictures/dog_cat.png\".", + "tasks": [ + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/cat.png", + "image_path_after_edit": "/home/crab/Pictures/cat_edited.png" + }, + "output": "/home/crab/Pictures/cat_edited.png" + }, + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/Pictures/dog.png", + "image_path_2": "/home/crab/Pictures/cat_edited.png", + "output_path": "/home/crab/Pictures/dog_cat.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6428f803-62de-40d2-a345-64e6cf955c9d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json b/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json new file mode 100644 index 0000000..77d2049 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/696ca9bb-89ea-4cd5-b693-f2d749d964b1.json @@ -0,0 +1,22 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/assets/campus.png\" using GIMP (GNU Image Manipulation Program) to make it brighter, save the adjusted image to 
\"/home/crab/Pictures/campus_brighter.png\", and then set this enhanced image as the desktop background on an Ubuntu system.", + "tasks": [ + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + "image_path_before_edit": "/home/crab/assets/campus.png", + "image_path_after_edit": "/home/crab/Pictures/campus_brighter.png" + }, + "output": "/home/crab/Pictures/campus_brighter.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/campus_brighter.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "696ca9bb-89ea-4cd5-b693-f2d749d964b1" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json b/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json new file mode 100644 index 0000000..6eea98c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6c3105a2-328c-4190-823d-03d759be0b57.json @@ -0,0 +1,21 @@ +{ + "description": "Use Firefox to search for an image with the keyword \"reinforcement learning\", copy the URL of the chosen image to the clipboard, and download the image from the URL in the clipboard to \"/home/crab/Downloads/RL.png\" on an Ubuntu system.", + "tasks": [ + { + "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a", + "attribute": { + "keyword": "reinforcement learning" + }, + "output": null + }, + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd", + "attribute": { + "file_path": "/home/crab/Downloads/RL.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6c3105a2-328c-4190-823d-03d759be0b57" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json b/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json new file mode 100644 index 0000000..9b52848 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/6c560516-ca14-4f97-b51d-16ad81fc29e4.json @@ -0,0 +1,22 @@ +{ + "description": "Open
\"/home/crab/assets/a.txt\" using vim in a terminal, write \"The most recent COMPUTEX was held from 30 May to 2 June 2023 with sessions about such topics as high-performance computing, artificial intelligence, next-gen connectivity and sustainability.\", then save and exit vim, and print the content of \"/home/crab/assets/a.txt\" to the command line interface.", + "tasks": [ + { + "task": "0f589bf9-9b26-4581-8b78-2961b115ab49", + "attribute": { + "file_path": "/home/crab/assets/a.txt", + "content": "The most recent COMPUTEX was held from 30 May to 2 June 2023 with sessions about such topics as high-performance computing, artificial intelligence, next-gen connectivity and sustainability." + }, + "output": "/home/crab/assets/a.txt" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/assets/a.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "6c560516-ca14-4f97-b51d-16ad81fc29e4" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json b/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json new file mode 100644 index 0000000..de96602 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/730172f5-894a-4d46-9102-ac7d985a479d.json @@ -0,0 +1,23 @@ +{ + "description": "Download the image of Jupiter from \"https://upload.wikimedia.org/wikipedia/commons/thumb/2/2b/Jupiter_and_its_shrunken_Great_Red_Spot.jpg/640px-Jupiter_and_its_shrunken_Great_Red_Spot.jpg\" to \"/home/crab/Pictures/jupiter.jpg\", then use LibreOffice Impress to adjust the brightness of this image to make it darker and save the edited version as \"/home/crab/Pictures/jupiter_edited.jpg\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/2/2b/Jupiter_and_its_shrunken_Great_Red_Spot.jpg/640px-Jupiter_and_its_shrunken_Great_Red_Spot.jpg", + "file_path": 
"/home/crab/Pictures/jupiter.jpg" + }, + "output": "/home/crab/Pictures/jupiter.jpg" + }, + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/jupiter.jpg", + "image_path_after_edit": "/home/crab/Pictures/jupiter_edited.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "730172f5-894a-4d46-9102-ac7d985a479d" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json b/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json new file mode 100644 index 0000000..4478eda --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/73038efb-ca0f-4d90-a947-fcfd097dd91b.json @@ -0,0 +1,19 @@ +{ + "description": "Open Firefox and navigate to the official PyTorch version 1.13 documentation to find an example of `torch.matmul`. Copy all the lines of code in the example to the clipboard. Then, paste the clipboard content into Visual Studio Code (VS Code) and save it as a file at \"/home/crab/example_code.txt\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/example_code.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "73038efb-ca0f-4d90-a947-fcfd097dd91b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json b/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json new file mode 100644 index 0000000..00d368c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/73da97c9-f084-4cab-8697-1151737387ff.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://images.top1market.com/images/cms/uploads/20230928/4950e1db0038feb506fdcfa0c936fd8e.png\" to \"/home/crab/Desktop/meta.png\", then set this image, \"/home/crab/Desktop/meta.png\", as the desktop 
background on the system.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://images.top1market.com/images/cms/uploads/20230928/4950e1db0038feb506fdcfa0c936fd8e.png", + "file_path": "/home/crab/Desktop/meta.png" + }, + "output": "/home/crab/Desktop/meta.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Desktop/meta.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "73da97c9-f084-4cab-8697-1151737387ff" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json b/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json new file mode 100644 index 0000000..37acf3d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/78502f1c-879b-4932-a5fd-d85f7f6b0f81.json @@ -0,0 +1,22 @@ +{ + "description": "Download the file from \"https://cemse.kaust.edu.sa/sites/default/files/styles/large/public/2023-04/Web%20banner.jpg?itok=d1TvGUKY\" to \"/home/crab/Pictures/KAUST_AI.png\" and then set this image as the desktop background on the system.", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://cemse.kaust.edu.sa/sites/default/files/styles/large/public/2023-04/Web%20banner.jpg?itok=d1TvGUKY", + "file_path": "/home/crab/Pictures/KAUST_AI.png" + }, + "output": "/home/crab/Pictures/KAUST_AI.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Pictures/KAUST_AI.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "78502f1c-879b-4932-a5fd-d85f7f6b0f81" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json b/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json new file mode 100644 index 0000000..eb2ee8f --- /dev/null +++ 
b/crab-benchmark-v0/dataset/ubuntu/7dda7e46-78be-4663-b882-6132dbbff335.json @@ -0,0 +1,22 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/Pictures/Interstellar.jpg\" to a higher value using GIMP (GNU Image Manipulation Program), save the edited image as \"/home/crab/edited_background.png\", and then set this edited image as the desktop background on the system.", + "tasks": [ + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + "image_path_before_edit": "/home/crab/Pictures/Interstellar.jpg", + "image_path_after_edit": "/home/crab/edited_background.png" + }, + "output": "/home/crab/edited_background.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/edited_background.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "7dda7e46-78be-4663-b882-6132dbbff335" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json b/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json new file mode 100644 index 0000000..ab94179 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/82c49e12-3b2f-432e-9069-4b67bafebbf7.json @@ -0,0 +1,22 @@ +{ + "description": "Open Firefox to find a coffee shop around the hungarian parliament on Google Maps, copy the sharing URL of the coffee shop to the clipboard, then paste the clipboard content into Visual Studio Code (VS Code), and save the content as a file at \"/home/crab/Downloads/coffee\".", + "tasks": [ + { + "task": "2b189dc2-c77f-4fa3-8432-ba4355cc294c", + "attribute": { + "place_type": "coffee shop", + "place_name": "hungarian parliament" + }, + "output": null + }, + { + "task": "8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/Downloads/coffee" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "82c49e12-3b2f-432e-9069-4b67bafebbf7" +} \ No newline at end of file diff --git 
a/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json b/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json new file mode 100644 index 0000000..d83c6b7 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/87910f23-ab23-4ccc-b115-d71cff6f0162.json @@ -0,0 +1,21 @@ +{ + "description": "Use Firefox to search for an image with the keyword \"patagonia\", copy the URL of the chosen image to the clipboard, and download the file from that URL to \"/home/crab/Desktop/brand.jpg\".", + "tasks": [ + { + "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a", + "attribute": { + "keyword": "patagonia" + }, + "output": null + }, + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd", + "attribute": { + "file_path": "/home/crab/Desktop/brand.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "87910f23-ab23-4ccc-b115-d71cff6f0162" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json b/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json new file mode 100644 index 0000000..b7be94c --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/8cb5ab6d-a56e-43b9-aa83-00a46331e20f.json @@ -0,0 +1,23 @@ +{ + "description": "Download the image from \"https://res.cloudinary.com/simpleview/image/upload/v1648755098/clients/austin/Austin_Skyline_Credit_Christopher_Sherman_lifetime__4f60343d-9f69-450c-8ad3-fa636761786d.jpg\" to \"/home/crab/Downloads/Austin.jpg\", then use GIMP (GNU Image Manipulation Program) to adjust its brightness to a higher value and save the modified image as \"/home/crab/Downloads/brighter_austin.jpg\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://res.cloudinary.com/simpleview/image/upload/v1648755098/clients/austin/Austin_Skyline_Credit_Christopher_Sherman_lifetime__4f60343d-9f69-450c-8ad3-fa636761786d.jpg", + "file_path": "/home/crab/Downloads/Austin.jpg" + }, + "output":
"/home/crab/Downloads/Austin.jpg" + }, + { + "task": "cc1adae7-bef9-4c8a-865d-00d44486dd69", + "attribute": { + "image_path_before_edit": "/home/crab/Downloads/Austin.jpg", + "image_path_after_edit": "/home/crab/Downloads/brighter_austin.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "8cb5ab6d-a56e-43b9-aa83-00a46331e20f" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json b/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json new file mode 100644 index 0000000..a2e4ba1 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/a70ab903-835f-48b7-8356-2321b8b869d8.json @@ -0,0 +1,19 @@ +{ + "description": "Using Firefox, find the example of torch.matmul provided by the official PyTorch version 1.13 documentation and copy all the lines of code in the example to the clipboard, then paste the clipboard content into LibreOffice Writer and save it as an ODT file at \"/home/crab/Desktop/doc_torch.odt\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "76de4bdb-c980-4b3a-9bd3-c87db467dffe", + "attribute": { + "file_path": "/home/crab/Desktop/doc_torch.odt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "a70ab903-835f-48b7-8356-2321b8b869d8" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json b/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json new file mode 100644 index 0000000..190ddb9 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/abb16512-27ae-49c0-b12b-7fbf0e95056b.json @@ -0,0 +1,21 @@ +{ + "description": "Paste the clipboard content into Visual Studio Code (VS Code) and save the file as \"/home/crab/Desktop/content.txt\", then open a terminal and print the content of \"/home/crab/Desktop/content.txt\" to the command line interface.", + "tasks": [ + { + "task": 
"8491e674-596b-452b-9e0e-58a44d90f947", + "attribute": { + "file_path": "/home/crab/Desktop/content.txt" + }, + "output": "/home/crab/Desktop/content.txt" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/Desktop/content.txt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "abb16512-27ae-49c0-b12b-7fbf0e95056b" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json b/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json new file mode 100644 index 0000000..969ddff --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/b2ca21dc-dde9-49f5-bec7-321fbf769315.json @@ -0,0 +1,24 @@ +{ + "description": "Adjust the brightness of the image located at \"/home/crab/assets/desert.jpg\" to a darker value using LibreOffice Impress and save it as \"/home/crab/assets/darker_desert.jpg\", then use GIMP (GNU Image Manipulation Program) to combine this adjusted image with the original image at \"/home/crab/assets/desert.jpg\", placing the darker image on the left side and the original on the right, finally save the resulting comparison image to \"/home/crab/assets/desert_comparison.jpg\".", + "tasks": [ + { + "task": "434402f3-647a-4a9a-9d8f-10f5bb6c7cf0", + "attribute": { + "image_path_before_edit": "/home/crab/assets/desert.jpg", + "image_path_after_edit": "/home/crab/assets/darker_desert.jpg" + }, + "output": "/home/crab/assets/darker_desert.jpg" + }, + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/assets/darker_desert.jpg", + "image_path_2": "/home/crab/assets/desert.jpg", + "output_path": "/home/crab/assets/desert_comparison.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "b2ca21dc-dde9-49f5-bec7-321fbf769315" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json 
b/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json new file mode 100644 index 0000000..6d4a06d --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/ccf31785-ec13-4981-93c5-ca6c242ac0c3.json @@ -0,0 +1,24 @@ +{ + "description": "Download the flag of Ethiopia image from \"https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png\" to \"/home/crab/Pictures/flag.png\", create a new directory named \"/home/crab/Pictures/png_\", and copy all PNG files from \"/home/crab/Pictures\" to the newly created directory \"/home/crab/Pictures/png_\".", + "tasks": [ + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19eac1", + "attribute": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Flag_of_Ethiopia.svg/250px-Flag_of_Ethiopia.svg.png", + "file_path": "/home/crab/Pictures/flag.png" + }, + "output": "/home/crab/Pictures/flag.png" + }, + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "png", + "source_dir": "/home/crab/Pictures", + "target_dir": "/home/crab/Pictures/png_" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "ccf31785-ec13-4981-93c5-ca6c242ac0c3" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json b/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json new file mode 100644 index 0000000..b4745c2 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/d3478489-70f2-4a82-b7d2-0a47b75986eb.json @@ -0,0 +1,24 @@ +{ + "description": "Use Firefox to search for the country \"Ethiopia\" on Wikipedia, extract the capital city and population, save this information in an ODS file at \"/home/crab/Documents/africa.ods\" with LibreOffice Calc with the first column for the country name, the second for the capital city name, and the third for the population without any header, then create a new directory \"/home/crab/sheet\" and copy all ODS files from 
\"/home/crab/Documents\" to \"/home/crab/sheet\".", + "tasks": [ + { + "task": "1cd6519a-9ee0-442b-ba5a-9238aeb00ff6", + "attribute": { + "country": "Ethiopia", + "file_path": "/home/crab/Documents/africa.ods" + }, + "output": "/home/crab/Documents/africa.ods" + }, + { + "task": "217ababc-ccc7-4b9f-af07-c239d92848fe", + "attribute": { + "file_extension": "ods", + "source_dir": "/home/crab/Documents", + "target_dir": "/home/crab/sheet" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d3478489-70f2-4a82-b7d2-0a47b75986eb" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json b/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json new file mode 100644 index 0000000..485f6e6 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/d3c917ff-406f-447a-87f5-b8d835cba750.json @@ -0,0 +1,23 @@ +{ + "description": "Combine Image 1 \"/home/crab/Pictures/cat.png\" and Image 2 \"/home/crab/assets/campus.png\" using GIMP (GNU Image Manipulation Program), placing Image 1 on the left side of Image 2, and save the combined image to \"/home/crab/Desktop/background.png\". 
Then, set this combined image as the screen background of the system.", + "tasks": [ + { + "task": "4cf246ea-0a7f-43da-84b6-61d74a2699af", + "attribute": { + "image_path_1": "/home/crab/Pictures/cat.png", + "image_path_2": "/home/crab/assets/campus.png", + "output_path": "/home/crab/Desktop/background.png" + }, + "output": "/home/crab/Desktop/background.png" + }, + { + "task": "a207ef38-b3b2-4c6c-a1e3-75c38162f5ba", + "attribute": { + "photo_path": "/home/crab/Desktop/background.png" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d3c917ff-406f-447a-87f5-b8d835cba750" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json b/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json new file mode 100644 index 0000000..c0332e8 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/d6e460e4-c295-40ad-883c-11300d7832f0.json @@ -0,0 +1,19 @@ +{ + "description": "Using Firefox, locate the example provided of torch.matmul by the official PyTorch version 1.13 documentation and copy all the lines of code to the clipboard, then open LibreOffice Writer, paste the content from the clipboard, and save the document as an ODT file at \"/home/crab/Documents/torch_matmul.odt\".", + "tasks": [ + { + "task": "49b614c5-c4bb-4c20-aab8-ab9dcc7de1b5", + "attribute": {}, + "output": null + }, + { + "task": "76de4bdb-c980-4b3a-9bd3-c87db467dffe", + "attribute": { + "file_path": "/home/crab/Documents/torch_matmul.odt" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "d6e460e4-c295-40ad-883c-11300d7832f0" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json b/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json new file mode 100644 index 0000000..815ed5a --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/e31d4e3b-b753-4deb-b9ad-a0add5d4790e.json @@ -0,0 +1,21 @@ +{ + "description": "Use Firefox to 
search for an image with the keyword \"Mission: Impossible\", copy the image's URL to the clipboard, and then download the file from the clipboard's URL to \"/home/crab/Pictures/movie.jpg\".", + "tasks": [ + { + "task": "017102b6-d2c3-466b-96f7-37c8bcddc41a", + "attribute": { + "keyword": "Mission: Impossible" + }, + "output": "" + }, + { + "task": "a313ea4d-e501-4971-b4fe-db2aad19acsd", + "attribute": { + "file_path": "/home/crab/Pictures/movie.jpg" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "e31d4e3b-b753-4deb-b9ad-a0add5d4790e" +} \ No newline at end of file diff --git a/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json b/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json new file mode 100644 index 0000000..b24bc25 --- /dev/null +++ b/crab-benchmark-v0/dataset/ubuntu/f67a26e4-58dd-4dc6-8859-affbf1d62f94.json @@ -0,0 +1,22 @@ +{ + "description": "Open \"/home/crab/poem\" using vim in a terminal, write \"Two roads diverged in a yellow wood, and sorry I could not travel both and be one traveler, long I stood and looked down one as far as I could to where it bent in the undergrowth.\", save and exit vim, and then print the content of \"/home/crab/poem\" to the command line interface through the terminal.", + "tasks": [ + { + "task": "0f589bf9-9b26-4581-8b78-2961b115ab49", + "attribute": { + "file_path": "/home/crab/poem", + "content": "Two roads diverged in a yellow wood, and sorry I could not travel both and be one traveler, long I stood and looked down one as far as I could to where it bent in the undergrowth." 
+ }, + "output": "/home/crab/poem" + }, + { + "task": "5b527839-0e58-426d-bab6-7160200b0d24", + "attribute": { + "file_path": "/home/crab/poem" + }, + "output": null + } + ], + "adjlist": "0 1\n1", + "id": "f67a26e4-58dd-4dc6-8859-affbf1d62f94" +} \ No newline at end of file diff --git a/crab-benchmark-v0/main.py b/crab-benchmark-v0/main.py index 07c4ba0..79e4afa 100644 --- a/crab-benchmark-v0/main.py +++ b/crab-benchmark-v0/main.py @@ -24,7 +24,7 @@ TaskGenerator, create_benchmark, ) -from crab.actions.crab_actions import complete +from crab.actions.crab_actions import complete, wait from crab.actions.visual_prompt_actions import ( get_elements_prompt, groundingdino_easyocr, @@ -96,7 +96,7 @@ def get_benchmark(env: str, ubuntu_url: str): tasks=[], environments=[ubuntu_env], prompting_tools=prompting_tools, - root_action_space=[complete], + root_action_space=[complete, wait], multienv=True, ) elif env == "android": @@ -106,7 +106,7 @@ def get_benchmark(env: str, ubuntu_url: str): tasks=[], environments=[ANDROID_ENV], prompting_tools=prompting_tools, - root_action_space=[complete], + root_action_space=[complete, wait], multienv=True, ) elif env == "cross": @@ -119,7 +119,7 @@ def get_benchmark(env: str, ubuntu_url: str): tasks=[], environments=[ubuntu_env, ANDROID_ENV], prompting_tools=prompting_tools, - root_action_space=[complete], + root_action_space=[complete, wait], multienv=True, ) else: @@ -137,7 +137,7 @@ def get_benchmark(env: str, ubuntu_url: str): # Load from handmade tasks benchmark_config.tasks.extend(handmade_tasks) - benchmark_config.step_limit = 15 + benchmark_config.step_limit = 20 return create_benchmark(benchmark_config) @@ -188,6 +188,12 @@ def get_benchmark(env: str, ubuntu_url: str): help="logger level, debug, info, warning, or error", default="warning", ) + parser.add_argument( + "--history-messages-len", + type=int, + help="The number of rounds of chat history to provide to the model", + default=2, + ) args = parser.parse_args() loglevel = 
args.loglevel numeric_level = getattr(logging, loglevel.upper(), None) @@ -197,43 +203,58 @@ def get_benchmark(env: str, ubuntu_url: str): benchmark = get_benchmark(args.env, args.ubuntu_url) + if args.model == "human": + expeirment = CrabBenchmarkV0( + benchmark=benchmark, + task_id=args.task_id, + agent_policy="human", + ) + expeirment.start_benchmark() + exit() + if args.model == "gpt4o": model = BackendModelConfig( model_class="openai", model_name="gpt-4o", - history_messages_len=2, + history_messages_len=args.history_messages_len, ) elif args.model == "gpt4turbo": model = BackendModelConfig( model_class="openai", model_name="gpt-4-turbo", - history_messages_len=2, + history_messages_len=args.history_messages_len, ) elif args.model == "gemini": model = BackendModelConfig( model_class="gemini", model_name="gemini-1.5-pro-latest", - history_messages_len=2, + history_messages_len=args.history_messages_len, ) elif args.model == "claude": model = BackendModelConfig( model_class="claude", model_name="claude-3-opus-20240229", - history_messages_len=2, + history_messages_len=args.history_messages_len, ) - elif args.model == "llava-1.6": + elif args.model == "pixtral": model = BackendModelConfig( - model_class="vllm", - model_name="llava-hf/llava-v1.6-34b-hf", - history_messages_len=2, + model_class="openai-json", + model_name="mistralai/Pixtral-12B-2409", + history_messages_len=args.history_messages_len, base_url=args.model_base_url, api_key=args.model_api_key, ) - elif args.model == "pixtral": + elif args.model == "gpt4o-wofc": model = BackendModelConfig( - model_class="vllm", - model_name="mistralai/Pixtral-12B-2409", - history_messages_len=1, + model_class="openai-json", + model_name="gpt-4o", + history_messages_len=args.history_messages_len, + ) + elif args.model == "llava-ov72b": + model = BackendModelConfig( + model_class="sglang-openai-json", + model_name="lmms-lab/llava-onevision-qwen2-72b-ov-chat", + history_messages_len=args.history_messages_len, 
base_url=args.model_base_url, api_key=args.model_api_key, ) @@ -255,7 +276,7 @@ def get_benchmark(env: str, ubuntu_url: str): print("Unsupported policy: ", args.policy) exit() - log_dir = (Path(__file__).parent / "logs").resolve() + log_dir = (Path(__file__).parent / "tianqi_logs").resolve() expeirment = CrabBenchmarkV0( benchmark=benchmark, task_id=args.task_id, diff --git a/crab-benchmark-v0/ubuntu_env.py b/crab-benchmark-v0/ubuntu_env.py index 2ecec7e..2fd5be2 100644 --- a/crab-benchmark-v0/ubuntu_env.py +++ b/crab-benchmark-v0/ubuntu_env.py @@ -13,6 +13,7 @@ # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== from crab.actions.desktop_actions import ( click, + double_click, key_press, press_hotkey, right_click, @@ -31,6 +32,7 @@ press_hotkey, search_application, right_click, + double_click, ], observation_space=[screenshot], description="""An Ubuntu 22.04 Linux desktop operating system. The interface \ diff --git a/crab/actions/crab_actions.py b/crab/actions/crab_actions.py index d757cf2..8c41d5a 100644 --- a/crab/actions/crab_actions.py +++ b/crab/actions/crab_actions.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== +from time import sleep + from crab import action, evaluator @@ -42,6 +44,14 @@ def complete() -> bool: pass +@action(env_name="root") +def wait() -> bool: + """If the environment is still processing your action and you have nothing to do in + this step, you can use wait(). 
+ """ + sleep(5) + + def get_element_position(element_id, env): """Get element position provided by function `zs_object_detection`""" box = env.element_position_map[element_id] diff --git a/crab/actions/desktop_actions.py b/crab/actions/desktop_actions.py index cf47ddc..861ecdd 100644 --- a/crab/actions/desktop_actions.py +++ b/crab/actions/desktop_actions.py @@ -69,7 +69,7 @@ def right_click(element: int, env) -> None: """ Right-click an UI element shown on the desktop screen using the mouse, which is usually used for opening the menu of the element. A simple use case can be - rght_click(5), which right-clicks the UI element labeled with the number 5 to open + right_click(5), which right-clicks the UI element labeled with the number 5 to open up menu on it. Args: @@ -80,6 +80,34 @@ def right_click(element: int, env) -> None: time.sleep(DELAY) +@action +def double_click_position(x: int, y: int) -> None: + """ + Double-click on the current desktop screen. + + Args: + x: The X coordinate, as a floating-point number in the range [0.0, 1.0]. + y: The Y coordinate, as a floating-point number in the range [0.0, 1.0]. + """ + pyautogui.click(x, y, duration=DURATION, clicks=2, interval=0.2) + + +@action(local=True) +def double_click(element: int, env) -> None: + """ + Double-click an UI element shown on the desktop screen using the mouse, which is + usually used for opening a folder or a file. A simple use case can be + double_click(5), which double-clicks the UI element labeled with the number 5 to + open it. + + Args: + element: A numeric tag assigned to an UI element shown on the screenshot. 
+ """ + x, y = get_element_position(element, env) + env._action_endpoint(double_click_position, {"x": x, "y": y}) + time.sleep(DELAY) + + @action def mouse_scroll(click: int = 1) -> None: """ diff --git a/crab/agents/backend_models/__init__.py b/crab/agents/backend_models/__init__.py index 172b6a1..6c6bdab 100644 --- a/crab/agents/backend_models/__init__.py +++ b/crab/agents/backend_models/__init__.py @@ -21,16 +21,15 @@ from .camel_model import CamelModel from .claude_model import ClaudeModel from .gemini_model import GeminiModel -from .openai_model import OpenAIModel -from .vllm_model import VLLMModel +from .openai_model import OpenAIModel, OpenAIModelJSON, SGlangOpenAIModelJSON class BackendModelConfig(BaseModel): - model_class: Literal["openai", "claude", "gemini", "camel", "vllm"] + model_class: Literal["openai", "claude", "gemini", "camel", "vllm", "sglang"] model_name: str history_messages_len: int = 0 parameters: dict[str, Any] = {} - tool_call_required: bool = False + tool_call_required: bool = True base_url: str | None = None # Only used in OpenAIModel and VLLMModel currently api_key: str | None = None # Only used in OpenAIModel and VLLMModel currently @@ -46,6 +45,7 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, + tool_call_required=model_config.tool_call_required, ) case "gemini": if model_config.base_url is not None or model_config.api_key is not None: @@ -56,6 +56,7 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, + tool_call_required=model_config.tool_call_required, ) case "openai": return OpenAIModel( @@ -64,9 +65,18 @@ def create_backend_model(model_config: BackendModelConfig) -> BackendModel: 
history_messages_len=model_config.history_messages_len, base_url=model_config.base_url, api_key=model_config.api_key, + tool_call_required=model_config.tool_call_required, ) - case "vllm": - return VLLMModel( + case "openai-json": + return OpenAIModelJSON( + model=model_config.model_name, + parameters=model_config.parameters, + history_messages_len=model_config.history_messages_len, + base_url=model_config.base_url, + api_key=model_config.api_key, + ) + case "sglang-openai-json": + return SGlangOpenAIModelJSON( model=model_config.model_name, parameters=model_config.parameters, history_messages_len=model_config.history_messages_len, diff --git a/crab/agents/backend_models/claude_model.py b/crab/agents/backend_models/claude_model.py index ed37f47..92641aa 100644 --- a/crab/agents/backend_models/claude_model.py +++ b/crab/agents/backend_models/claude_model.py @@ -33,7 +33,7 @@ def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, ) -> None: if anthropic_model_enable is False: raise ImportError("Please install anthropic to use ClaudeModel") @@ -152,6 +152,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: system=self.system_message, # <-- system prompt messages=request_messages, # type: ignore model=self.model, + max_tokens=4096, tools=self.action_schema, tool_choice={"type": "any" if self.tool_call_required else "auto"}, **self.parameters, @@ -161,6 +162,7 @@ def call_api(self, request_messages: list[dict]) -> anthropic.types.Message: system=self.system_message, # <-- system prompt messages=request_messages, # type: ignore model=self.model, + max_tokens=4096, **self.parameters, ) diff --git a/crab/agents/backend_models/gemini_model.py b/crab/agents/backend_models/gemini_model.py index 3032d94..efa5dbe 100644 --- a/crab/agents/backend_models/gemini_model.py +++ b/crab/agents/backend_models/gemini_model.py @@ -42,7 +42,7 @@ 
def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, ) -> None: if gemini_model_enable is False: raise ImportError("Please install google.generativeai to use GeminiModel") @@ -191,6 +191,11 @@ def _action_to_func_dec(action: Action) -> FunctionDeclaration: if "$defs" in p_schema: p_schema = json_expand_refs(p_schema) _clear_schema(p_schema) + if not p_schema["properties"]: + return FunctionDeclaration( + name=action.name, + description=action.description, + ) return FunctionDeclaration( name=action.name, description=action.description, diff --git a/crab/agents/backend_models/openai_model.py b/crab/agents/backend_models/openai_model.py index e8a11eb..714b3f1 100644 --- a/crab/agents/backend_models/openai_model.py +++ b/crab/agents/backend_models/openai_model.py @@ -15,6 +15,7 @@ from typing import Any from crab import Action, ActionOutput, BackendModel, BackendOutput, Message, MessageType +from crab.agents.utils import extract_text_and_code_prompts try: import openai @@ -31,7 +32,7 @@ def __init__( model: str, parameters: dict[str, Any] | None = None, history_messages_len: int = 0, - tool_call_required: bool = False, + tool_call_required: bool = True, base_url: str | None = None, api_key: str | None = None, ) -> None: @@ -179,3 +180,91 @@ def _convert_action_to_schema( new_action = action.to_openai_json_schema() actions.append({"type": "function", "function": new_action}) return actions + + +class OpenAIModelJSON(OpenAIModel): + def __init__( + self, + model: str, + parameters: dict[str, Any] = dict(), + history_messages_len: int = 0, + base_url: str | None = None, + api_key: str | None = None, + ) -> None: + super().__init__( + model, + parameters, + history_messages_len, + False, + base_url, + api_key, + ) + self.support_tool_call = False + + def reset(self, system_message: str, action_space: list[Action] | None) -> None: + 
super().reset(system_message, action_space) + self.action_schema = None + + def record_message( + self, new_message: dict, response_message: ChatCompletionMessage + ) -> None: + self.chat_history.append([new_message]) + self.chat_history[-1].append( + {"role": "assistant", "content": response_message.content} + ) + + def generate_backend_output( + self, response_message: ChatCompletionMessage + ) -> BackendOutput: + content = response_message.content + text_list, code_list = extract_text_and_code_prompts(content) + + action_list = [] + try: + for code_block in code_list: + action_object = json.loads(code_block) + action_list.append( + ActionOutput( + name=action_object["name"], arguments=action_object["arguments"] + ) + ) + except json.JSONDecodeError as e: + raise RuntimeError(f"Failed to parse code block: {code_block}") from e + except KeyError as e: + raise RuntimeError(f"Received invalid action format: {code_block}") from e + + return BackendOutput( + message="".join(text_list), + action_list=action_list, + ) + + +class SGlangOpenAIModelJSON(OpenAIModelJSON): + def construct_new_message(self, message: list[Message]) -> dict[str, Any]: + new_message_content: list[dict[str, Any]] = [] + image_count = 0 + for _, msg_type in message: + if msg_type == MessageType.IMAGE_JPG_BASE64: + image_count += 1 + for content, msg_type in message: + match msg_type: + case MessageType.TEXT: + new_message_content.append( + { + "type": "text", + "text": content, + } + ) + case MessageType.IMAGE_JPG_BASE64: + image_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{content}", + "detail": "high", + }, + } + if image_count > 1: + image_content["modalities"] = "multi-images" + new_message_content.append(image_content) + + return {"role": "user", "content": new_message_content} diff --git a/crab/agents/backend_models/vllm_model.py b/crab/agents/backend_models/vllm_model.py deleted file mode 100644 index 18ed12c..0000000 --- 
a/crab/agents/backend_models/vllm_model.py +++ /dev/null @@ -1,80 +0,0 @@ -# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. =========== -import json -from typing import Any - -from openai.types.chat import ChatCompletionMessage - -from crab import Action, ActionOutput, BackendOutput -from crab.agents.backend_models.openai_model import OpenAIModel -from crab.agents.utils import extract_text_and_code_prompts - - -class VLLMModel(OpenAIModel): - def __init__( - self, - model: str, - parameters: dict[str, Any] = dict(), - history_messages_len: int = 0, - base_url: str | None = None, - api_key: str | None = None, - ) -> None: - if base_url is None: - raise ValueError("base_url is required for VLLMModel") - super().__init__( - model, - parameters, - history_messages_len, - False, - base_url, - api_key, - ) - self.support_tool_call = False - - def reset(self, system_message: str, action_space: list[Action] | None) -> None: - super().reset(system_message, action_space) - self.action_schema = None - - def record_message( - self, new_message: dict, response_message: ChatCompletionMessage - ) -> None: - self.chat_history.append([new_message]) - self.chat_history[-1].append( - {"role": "assistant", "content": response_message.content} - ) - - def generate_backend_output( - self, response_message: ChatCompletionMessage - ) -> BackendOutput: - 
content = response_message.content - text_list, code_list = extract_text_and_code_prompts(content) - - action_list = [] - try: - for code_block in code_list: - action_object = json.loads(code_block) - action_list.append( - ActionOutput( - name=action_object["name"], arguments=action_object["arguments"] - ) - ) - except json.JSONDecodeError as e: - raise RuntimeError(f"Failed to parse code block: {code_block}") from e - except KeyError as e: - raise RuntimeError(f"Received invalid action format: {code_block}") from e - - return BackendOutput( - message="".join(text_list), - action_list=action_list, - ) diff --git a/crab/agents/policies/multi_agent_by_env.py b/crab/agents/policies/multi_agent_by_env.py index b72a535..57afc76 100644 --- a/crab/agents/policies/multi_agent_by_env.py +++ b/crab/agents/policies/multi_agent_by_env.py @@ -106,9 +106,8 @@ def get_token_usage(self): def get_backend_model_name(self): return ( self.main_agent_model_backend.__class__.__name__ - + "(sub: " - + self.env_agent_model_backend.__class__.__name__ - + ")" + + "_" + + self.main_agent_model_backend.model ) def chat( diff --git a/crab/agents/policies/multi_agent_by_func.py b/crab/agents/policies/multi_agent_by_func.py index eec0159..8d4df64 100644 --- a/crab/agents/policies/multi_agent_by_func.py +++ b/crab/agents/policies/multi_agent_by_func.py @@ -74,9 +74,8 @@ def get_token_usage(self): def get_backend_model_name(self): return ( self.main_agent_model_backend.__class__.__name__ - + "(sub: " - + self.tool_agent_model_backend.__class__.__name__ - + ")" + + "_" + + self.main_agent_model_backend.model ) def chat( diff --git a/crab/agents/policies/single_agent.py b/crab/agents/policies/single_agent.py index 74a6cd6..fa4b846 100644 --- a/crab/agents/policies/single_agent.py +++ b/crab/agents/policies/single_agent.py @@ -11,6 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # =========== Copyright 2024 @ CAMEL-AI.org. 
All Rights Reserved. =========== +import logging + from crab import Action, ActionOutput from crab.agents.backend_models import BackendModelConfig, create_backend_model from crab.agents.utils import ( @@ -24,6 +26,8 @@ ) from crab.utils.measure import timed +logger = logging.getLogger(__name__) + class SingleAgentPolicy(AgentPolicy): _system_prompt_with_function_call = """\ @@ -69,6 +73,8 @@ class SingleAgentPolicy(AgentPolicy): ```json {{"name": "action_name", "arguments": {{}}}} ``` + You MUST use exactly the same "action_name" as I gave to you in the action space. + You SHOULDN'T add any comments in the code blocks. In each step, You MUST explain what do you see from the current observation and the plan of the next action, then use a provided action in each step to achieve the @@ -80,9 +86,19 @@ class SingleAgentPolicy(AgentPolicy): def __init__( self, model_backend: BackendModelConfig, + function_call: bool = True, ): self.model_backend = create_backend_model(model_backend) - if self.model_backend.support_tool_call: + self.function_call = function_call + if not self.model_backend.support_tool_call and self.function_call: + logger.warning( + "The backend model does not support tool call: {}".format( + model_backend.model_name + ) + + "\nFallback to no function call mode." 
+ ) + self.function_call = False + if self.function_call: self.system_prompt = self._system_prompt_with_function_call else: self.system_prompt = self._system_prompt_no_function_call @@ -100,17 +116,20 @@ def reset( task_description=task_description, action_descriptions=generate_action_prompt( self.action_space, - expand=not self.model_backend.support_tool_call, + expand=not self.function_call, ), env_description=str(env_descriptions), ) - self.model_backend.reset(system_message, self.action_space) + if self.function_call: + self.model_backend.reset(system_message, self.action_space) + else: + self.model_backend.reset(system_message, None) def get_token_usage(self): return self.model_backend.get_token_usage() def get_backend_model_name(self): - return self.model_backend.__class__.__name__ + return self.model_backend.__class__.__name__ + "_" + self.model_backend.model @timed def chat( @@ -127,4 +146,6 @@ def chat( ) ) output = self.model_backend.chat(prompt) + # print("Agent Message: " + output.message, flush=True) + # print("Agent Action: " + str(output.action_list), flush=True) return decode_combined_action(output.action_list) diff --git a/crab/agents/utils.py b/crab/agents/utils.py index b174b92..e284406 100644 --- a/crab/agents/utils.py +++ b/crab/agents/utils.py @@ -92,8 +92,10 @@ def extract_text_and_code_prompts(content: str) -> tuple[list[str], list[str]]: # code_type = lines[idx].strip()[3:].strip() idx += 1 start_idx = idx - while not lines[idx].lstrip().startswith("```"): + while not lines[idx].lstrip().startswith("```") and idx < len(lines): idx += 1 + if idx >= len(lines): + break code = "\n".join(lines[start_idx:idx]).strip() code_prompts.append(code) diff --git a/crab/core/benchmark.py b/crab/core/benchmark.py index 87a0611..1c23b60 100644 --- a/crab/core/benchmark.py +++ b/crab/core/benchmark.py @@ -239,7 +239,22 @@ def step( info=info, ) - environment = self._get_env(env_name=env_name, action_name=action) + try: + environment = 
self._get_env(env_name=env_name, action_name=action) + except Exception: + print(traceback.format_exc()) + terminated = True + info["terminate_reason"] = "action_format_error" + info["exception_detail"] = traceback.format_exc() + environment.reset() + self.close_task() + return StepResult( + truncated=False, + terminated=True, + action_returns=None, + evaluation_results=self.current_evaluator.stat(), + info=info, + ) try: action_returns = environment.step(action, parameters) except Exception: diff --git a/crab/core/environment.py b/crab/core/environment.py index e045353..938c1da 100644 --- a/crab/core/environment.py +++ b/crab/core/environment.py @@ -89,7 +89,7 @@ def __init__( self._client: Client | None = None if remote_url is not None: - self._client = Client(base_url=remote_url) + self._client = Client(base_url=remote_url, timeout=60) for key, value in extra_attributes.items(): setattr(self, key, value) diff --git a/crab/core/experiment.py b/crab/core/experiment.py index 9e14c9e..59721d2 100644 --- a/crab/core/experiment.py +++ b/crab/core/experiment.py @@ -138,6 +138,10 @@ def execute_action(self, response: list[ActionOutput]) -> bool: print("\033[92m" f"Task finished, result: {self.metrics}" "\033[0m") self.write_current_log_row(action) self.write_main_csv_row(benchmark_result.info["terminate_reason"]) + if "exception_detail" in benchmark_result.info: + self.write_exception_detail( + benchmark_result.info["exception_detail"] + ) return True print( "\033[92m" @@ -171,6 +175,7 @@ def step(self, it) -> bool: except Exception: print(traceback.format_exc()) self.write_main_csv_row("agent_exception") + self.write_exception_detail(traceback.format_exc()) return True # content = response["content"] # self.write_message(str(content), it) @@ -214,6 +219,12 @@ def start_benchmark(self): sleep(2) # input("Press enter to do next step:") + def write_exception_detail(self, exception_info: str): + if self.log_dir is None: + return + with open(self.current_experiment_dir / 
"exception_detail.txt", "w") as file: + file.write(exception_info) + def write_current_log_row(self, action): if self.log_dir is None: return diff --git a/crab/core/task_generator.py b/crab/core/task_generator.py index 2f373eb..682f875 100644 --- a/crab/core/task_generator.py +++ b/crab/core/task_generator.py @@ -16,6 +16,7 @@ import importlib import itertools import json +import os import random from pathlib import Path @@ -121,6 +122,8 @@ def __init__( self.attribute_pool = attribute_pool self.graph_generation(subtasks) self.task_mapping = {task.id: task for task in subtasks} + if not os.getenv("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = "EMPTY" self.client = OpenAI() @classmethod