diff --git a/agent/agent.py b/agent/agent.py
index 240ce0b..8668540 100644
--- a/agent/agent.py
+++ b/agent/agent.py
@@ -116,40 +116,44 @@ def set_action_set_tag(self, tag: str) -> None:
     def next_action(
         self, trajectory: Trajectory, intent: str, meta_data: dict[str, Any]
     ) -> Action:
-        prompt = self.prompt_constructor.construct(
-            trajectory, intent, meta_data
-        )
         lm_config = self.lm_config
-        if lm_config.provider == "openai":
-            if lm_config.mode == "chat":
-                response = generate_from_openai_chat_completion(
-                    messages=prompt,
-                    model=lm_config.model,
-                    temperature=lm_config.gen_config["temperature"],
-                    top_p=lm_config.gen_config["top_p"],
-                    context_length=lm_config.gen_config["context_length"],
-                    max_tokens=lm_config.gen_config["max_tokens"],
-                    stop_token=None,
-                )
-            elif lm_config.mode == "completion":
-                response = generate_from_openai_completion(
-                    prompt=prompt,
-                    engine=lm_config.model,
-                    temperature=lm_config.gen_config["temperature"],
-                    max_tokens=lm_config.gen_config["max_tokens"],
-                    top_p=lm_config.gen_config["top_p"],
-                    stop_token=lm_config.gen_config["stop_token"],
-                )
+        def llm(prompt):
+            """Query the configured LM with ``prompt`` and return its response."""
+            if lm_config.provider == "openai":
+                if lm_config.mode == "chat":
+                    response = generate_from_openai_chat_completion(
+                        messages=prompt,
+                        model=lm_config.model,
+                        temperature=lm_config.gen_config["temperature"],
+                        top_p=lm_config.gen_config["top_p"],
+                        context_length=lm_config.gen_config["context_length"],
+                        max_tokens=lm_config.gen_config["max_tokens"],
+                        stop_token=None,
+                    )
+                elif lm_config.mode == "completion":
+                    response = generate_from_openai_completion(
+                        prompt=prompt,
+                        engine=lm_config.model,
+                        temperature=lm_config.gen_config["temperature"],
+                        max_tokens=lm_config.gen_config["max_tokens"],
+                        top_p=lm_config.gen_config["top_p"],
+                        stop_token=lm_config.gen_config["stop_token"],
+                    )
+                else:
+                    raise ValueError(
+                        f"OpenAI models do not support mode {lm_config.mode}"
+                    )
             else:
-                raise ValueError(
-                    f"OpenAI models do not support mode {lm_config.mode}"
+                raise NotImplementedError(
+                    f"Provider {lm_config.provider} not implemented"
                 )
-        else:
-            raise NotImplementedError(
-                f"Provider {lm_config.provider} not implemented"
-            )
-
+
+            return response
+
         try:
+            response = self.prompt_constructor.construct(
+                trajectory, intent, meta_data, llm
+            )
             parsed_response = self.prompt_constructor.extract_action(response)
             if self.action_set_tag == "id_accessibility_tree":
                 action = create_id_based_action(parsed_response)
diff --git a/agent/prompts/prompt_constructor.py b/agent/prompts/prompt_constructor.py
index 6e2d3cb..f184621 100644
--- a/agent/prompts/prompt_constructor.py
+++ b/agent/prompts/prompt_constructor.py
@@ -1,7 +1,7 @@
 import json
 import re
 from pathlib import Path
-from typing import Any, TypedDict
+from typing import Any, TypedDict, Callable, Optional
 
 import tiktoken
 
@@ -37,6 +37,23 @@ def __init__(
         self.instruction: Instruction = instruction
         self.tokenizer = tokenizer
 
+    @beartype
+    def _get_llm_output(
+        self,
+        intro: str,
+        examples: list[tuple[str, str]],
+        template: str,
+        llm: Callable,
+        **kwargs
+    ) -> str:
+        """Fill ``template`` with ``kwargs`` and return ``llm``'s reply to that prompt."""
+        prompt = template.format(**kwargs)
+        prompt = self.get_lm_api_input(intro, examples, prompt)
+        response = llm(prompt)
+
+        return response
+
+    @beartype
     def get_lm_api_input(
         self, intro: str, examples: list[tuple[str, str]], current: str
     ) -> APIInput:
@@ -129,7 +146,8 @@ def construct(
         trajectory: Trajectory,
         intent: str,
         meta_data: dict[str, Any] = {},
-    ) -> APIInput:
-        """Construct prompt given the trajectory"""
+        llm: Optional[Callable] = None
+    ) -> str:
+        """Construct the prompt from the trajectory and return the LM response"""
         intro = self.instruction["intro"]
         examples = self.instruction["examples"]
@@ -146,18 +164,18 @@ def construct(
         url = page.url
         previous_action_str = meta_data["action_history"][-1]
 
-        # input x
-        current = template.format(
+        response = self._get_llm_output(
+            intro,
+            examples,
+            template,
+            llm,
             objective=intent,
             url=self.map_url_to_real(url),
             observation=obs,
             previous_action=previous_action_str,
         )
 
-        # make sure all keywords are replaced
-        assert all([f"{{k}}" not in current for k in keywords])
-        prompt = self.get_lm_api_input(intro, examples, current)
-        return prompt
+        return response
 
     def _extract_action(self, response: str) -> str:
         action_splitter = self.instruction["meta_data"]["action_splitter"]
@@ -188,7 +206,8 @@ def construct(
         trajectory: Trajectory,
         intent: str,
         meta_data: dict[str, Any] = {},
-    ) -> APIInput:
+        llm: Optional[Callable] = None
+    ) -> str:
         intro = self.instruction["intro"]
         examples = self.instruction["examples"]
         template = self.instruction["template"]
@@ -203,17 +222,154 @@ def construct(
         page = state_info["info"]["page"]
         url = page.url
         previous_action_str = meta_data["action_history"][-1]
-        current = template.format(
+
+        response = self._get_llm_output(
+            intro,
+            examples,
+            template,
+            llm,
             objective=intent,
             url=self.map_url_to_real(url),
             observation=obs,
             previous_action=previous_action_str,
         )
 
-        assert all([f"{{k}}" not in current for k in keywords])
+        return response
 
-        prompt = self.get_lm_api_input(intro, examples, current)
-        return prompt
+    @beartype
+    def _extract_action(self, response: str) -> str:
+        # find the first occurrence of action
+        action_splitter = self.instruction["meta_data"]["action_splitter"]
+        pattern = rf"{action_splitter}(.*?){action_splitter}"
+        match = re.search(pattern, response)
+        if match:
+            return match.group(1)
+        else:
+            raise ActionParsingError(
+                f'Cannot find the answer phrase "{self.answer_phrase}" in "{response}"'
+            )
+
+
+class RCIPromptConstructor(PromptConstructor):
+    """Drafts, critiques and improves a plan, then grounds the next action via LM calls."""
+    def __init__(
+        self,
+        instruction_path: str | Path,
+        lm_config: lm_config.LMConfig,
+        tokenizer: tiktoken.core.Encoding,
+    ):
+        super().__init__(instruction_path, lm_config, tokenizer)
+        self.answer_phrase = self.instruction["meta_data"]["answer_phrase"]
+        self.plan = None
+        self.plan_intent = None
+
+    @beartype
+    def construct(
+        self,
+        trajectory: Trajectory,
+        intent: str,
+        meta_data: dict[str, Any] = {},
+        llm: Optional[Callable] = None
+    ) -> str:
+        """Run the planning and grounding LM calls and return the final LM reply"""
+        intro = self.instruction["intro"]
+
+        state_info: StateInfo = trajectory[-1]  # type: ignore[assignment]
+
+        page = state_info["info"]["page"]
+        url = self.map_url_to_real(page.url)
+        previous_action_str = meta_data["action_history"][-1]
+
+        obs = state_info["observation"][self.obs_modality]
+        max_obs_length = self.lm_config.gen_config["max_obs_length"]
+        if max_obs_length:
+            obs = self.tokenizer.decode(self.tokenizer.encode(obs)[:max_obs_length])  # type: ignore[arg-type]
+
+        # Get plan (cached per objective; rebuilt when the intent changes)
+        if self.plan is None or self.plan_intent != intent:
+            plan = self._get_llm_output(
+                intro,
+                [],
+                self.instruction["template_plan"],
+                llm,
+                observation=obs,
+                url=url,
+                objective=intent,
+            )
+
+            # Get critique
+            critique = self._get_llm_output(
+                intro,
+                [],
+                self.instruction["template_critique"],
+                llm,
+                observation=obs,
+                url=url,
+                objective=intent,
+                plan=plan,
+            )
+
+            # Get improved plan
+            plan = self._get_llm_output(
+                intro,
+                [],
+                self.instruction["template_improve"],
+                llm,
+                observation=obs,
+                url=url,
+                objective=intent,
+                plan=plan,
+                critique=critique,
+            )
+
+            self.plan = plan
+            self.plan_intent = intent
+
+        # Get next step
+        meta_next_action = self._get_llm_output(
+            intro,
+            [],
+            self.instruction["template_next_step"],
+            llm,
+            observation=obs,
+            url=url,
+            objective=intent,
+            previous_action=previous_action_str,
+            plan=self.plan,
+        )
+
+        # Get state grounding
+        draft_next_action = self._get_llm_output(
+            intro,
+            [],
+            self.instruction["template_state_grounding"],
+            llm,
+            observation=obs,
+            url=url,
+            previous_action=previous_action_str,
+            meta_next_action=meta_next_action,
+        )
+
+        # Get agent grounding
+        response = self._get_llm_output(
+            intro,
+            [],
+            self.instruction["template_agent_grounding"],
+            llm,
+            observation=obs,
+            url=url,
+            previous_action=previous_action_str,
+            meta_next_action=meta_next_action,
+            draft_next_action=draft_next_action
+        )
+
+        # XXX: hacky fix
+        # fix = input(f'fix response="{response}"?').strip()
+        # if fix != '':
+        #     response = fix
+        #     print(f'fixed response="{response}"')
+
+        return response
 
     def _extract_action(self, response: str) -> str:
         # find the first occurence of action
diff --git a/agent/prompts/raw/p_rci_id_actree_2s.py b/agent/prompts/raw/p_rci_id_actree_2s.py
new file mode 100644
index 0000000..09079ac
--- /dev/null
+++ b/agent/prompts/raw/p_rci_id_actree_2s.py
@@ -0,0 +1,98 @@
+prompt = {
+    "intro": """You are an autonomous intelligent agent tasked with navigating a web browser. You will be given web-based tasks. These tasks will be accomplished through the use of specific actions you can issue.
+
+Here's the information you'll have:
+The user's objective: This is the task you're trying to complete.
+The current web page's accessibility tree: This is a simplified representation of the webpage, providing key information.
+The current web page's URL: This is the page you're currently navigating.
+The open tabs: These are the tabs you have open.
+The previous action: This is the action you just performed. It may be helpful to track your progress.
+
+The actions you can perform fall into several categories:
+
+Page Operation Actions:
+`click [id]`: This action clicks on an element with a specific id on the webpage.
+`type [id] [content] [press_enter_after=0|1]`: Use this to type the content into the field with id. By default, the "Enter" key is pressed after typing unless press_enter_after is set to 0.
+`hover [id]`: Hover over an element with id.
+`press [key_comb]`:  Simulates the pressing of a key combination on the keyboard (e.g., Ctrl+v).
+`scroll [direction=down|up]`: Scroll the page up or down.
+
+Tab Management Actions:
+`new_tab`: Open a new, empty browser tab.
+`tab_focus [tab_index]`: Switch the browser's focus to a specific tab using its index.
+`close_tab`: Close the currently active tab.
+
+URL Navigation Actions:
+`goto [url]`: Navigate to a specific URL.
+`go_back`: Navigate to the previously viewed page.
+`go_forward`: Navigate to the next page (if a previous 'go_back' action was performed).
+
+Completion Action:
+`stop [answer]`: Issue this action when you believe the task is complete. If the objective is to find a text-based answer, provide the answer in the bracket. If you believe the task is impossible to complete, provide the answer as "N/A" in the bracket.
+
+Homepage:
+If you want to visit other websites, check out the homepage at http://homepage.com. It has a list of websites you can visit.
+http://homepage.com/password.html lists all the account name and password for the websites. You can use them to log in to the websites.
+""",
+    "examples": [],
+    "template_plan": """OBSERVATION:
+{observation}
+URL: {url}
+OBJECTIVE: {objective}
+
+After considering the current observation and objective, here is a plan to solve the task using the instructions provided in the introduction (Only show the plan):""",
+    "template_critique": """OBSERVATION:
+{observation}
+URL: {url}
+OBJECTIVE: {objective}
+BAD PLAN: {plan}
+
+With the objective, find problems with this plan.""",
+    "template_improve": """OBSERVATION:
+{observation}
+URL: {url}
+OBJECTIVE: {objective}
+BAD PLAN: {plan}
+CRITIQUE: {critique}
+
+Based on the critique and objective, the good plan for the agent to complete the task are as follows (Only show the plan).""",
+    "template_next_step": """OBSERVATION:
+{observation}
+URL: {url}
+PREVIOUS ACTION: {previous_action}
+OBJECTIVE: {objective}
+PLAN: {plan}
+
+According to the current plan and the history actions I have executed previously, find the next meta action I should perform and provide a reason. Remember not to repeat the same action twice in a row.""",
+    "template_state_grounding": """OBSERVATION:
+{observation}
+URL: {url}
+PREVIOUS ACTION: {previous_action}
+META ACTION: {meta_next_action}
+
+Considering the observation and the meta next action, generate a specific action that I can execute on this observation.""",
+    "template_agent_grounding": """OBSERVATION:
+{observation}
+URL: {url}
+PREVIOUS ACTION: {previous_action}
+META ACTION: {meta_next_action}
+DRAFT ACTION: {draft_next_action}
+
+To be successful, it is very important to follow the following rules:
+1. You should only issue an action that is valid given the current observation
+2. You should only issue one action at a time.
+3. You should follow the examples to reason step by step and then issue the next action.
+4. Issue stop action when you think you have achieved the objective. Don't generate anything after stop.
+
+Remember, if you think the answer is empty or the task is impossible to complete, provide answer "N/A" in the bracket e.g. ```stop [N/A]```.
+
+Now, with the meta and draft next action, we can generate the final single action that fits the format specified in the introduction. Ensure that the action is wrapped inside a pair of ``` and enclose arguments within [] as follows: ```[action] [arg] ...```. For example, ```type [123] [abc def] [0]``` or ```click [135]``` or ```scroll [down]```".""",
+    "meta_data": {
+        "observation": "accessibility_tree",
+        "action_type": "id_accessibility_tree",
+        "keywords": ["url", "objective", "observation", "previous_action"],
+        "prompt_constructor": "RCIPromptConstructor",
+        "answer_phrase": "In summary, the next action I will perform is",
+        "action_splitter": "```"
+    },
+}