"""Prompt templates and action catalogue for the computer-use planner.

The module defines the system / user prompt templates, the chain-of-thought
section descriptors, and ``ComputerUseAgentInterface``, which registers the
available actions, validates planned actions against them, and renders the
system prompt.
"""

import json
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

from mm_agents.uipath.types_utils import PlanAction, key_maps
from mm_agents.uipath.utils import ValidationException

# NOTE(review): the prompt wording below is reproduced exactly as found,
# including single-space separators where line breaks might be expected --
# confirm against the intended prompt layout.

# System prompt; filled in by ``ComputerUseAgentInterface.get_system_prompt``
# via ``str.format`` (hence the doubled braces around the JSON skeleton).
system_template = (
    "You are a computer use agent that perform computer-related tasks. "
    "You will be given a task, a current screenshot, and a list of previous actions. "
    "You need to predict the next action. "
    "## Available Actions: {available_actions} "
    "In addition there are some special actions that are not part of the main UI actions: "
    "{special_actions} "
    "Each action has a description and parameters. "
    "The action description is a single sentence which mentions the action "
    "and the control element to interact with. "
    "This description will be used by the executor agent to locate the action's "
    "target element coordinates in the screen, "
    "so describe the element targeted by the action as detailed as possible. "
    "Particularly for icons, you can describe their position, text on it, color, "
    "nearby elements etc... "
    "Example of some action descriptions with more detailed information to help "
    "the executor agent locate the element: "
    "- \"Click on the Calendar icon with the text 'Thu 28'\" "
    "- \"Click the 'Search' button on the top right corner next to the login button.\" "
    "- \"Click the 'First Name' input box from the UserInfo section to focus it "
    "before typing.\" "
    "Your action response must be a valid JSON with the following format: "
    '{{ "type": str # one of the valid action types '
    '"description": # action description '
    '"parameters": # optional, action parameters dictionary }} '
    "## Action examples: example of valid actions: {examples} "
    "## Important Notes: "
    "- Close any cookies, ads, login or registration etc pop-ups if not needed. "
    "- Before typing, ensure the input box is focused by clicking on it. "
)

# Per-step user prompt; placeholders: task, execution_info_message,
# json_output_format.
user_command_template = (
    "Recall Task Again: {task} "
    "Check if the task is finished. If not provide the next action to perform. \n"
    "Remember: "
    "- Perform the task on provided application(s) or website(s). "
    'You are not allowed to use the browser "address bar". '
    "- Close any cookies, ads, login or registration etc pop-ups if not needed. "
    '- Only one action at a time (never "click and type", "click and drag", '
    '"type and press", "press shift and click", etc..). '
    "Think of how to combine them in two consecutive actions obtaining the "
    "intended result or use an available action that can obtain it. "
    "- For any opening input combobox, dropdown menu options, you must select an "
    "option or press Enter key to select default one. "
    "- Click on input box to ensure is focused before typing. "
    "Otherwise, the input box will not accept the text. "
    "- Once focusing on an input box, if it has a default pre-typed value "
    "(not placeholder which is usually grayed-out), "
    'remove the existing value first by clicking on "X" icon or using "Ctrl A" + '
    '"Backspace" or "Backspace" if the value is already selected. '
    "- For search input, if no search button or suggestions popup after typing, "
    "press 'Enter' to trigger search. "
    "- Retry the drag action on slider control if needed to refine the slider "
    "values closer to expected values. "
    "- Scroll / Pageup / Pagedown to explore or extract more content/data if needed "
    "(prefer 'key_press' action with key 'Pageup', 'Pagedown' for faster scrolling). "
    "Particularly when extraction data from table with hidden rows or columns. "
    "- Scroll action must have a 'direction' parameter. "
    "Finish action must have a 'status' parameter. "
    "- If you modify some settings remember to save/apply them. "
    "If button is not visible try to scroll for it. "
    "Most importantly, never type or click on element not visible on screenshot. "
    "Use scroll or pageup/pagedown to make the element visible first. \n"
    "{execution_info_message} Answer in json format: {json_output_format} "
)

# Ordered chain-of-thought sections: each entry maps a section key to its
# display name and the instruction describing what the section should contain.
PlanerCoTSections = OrderedDict(
    {
        "review": {
            "display": "previous_action_result",
            "description": "Briefly describe the previous action result and UI change on the screenshot to see if is correctly performed.",
        },
        "thought": {
            "display": "thought",
            "description": "Reason briefly about the next action to perform if the task is not finished.",
        },
        "action_description": {
            "display": "action_description",
            "description": "Describe the action to perform in a single sentence. The description must be precise and not rely on specific information in the current screen.",
        },
    }
)

### for chat conversation
user_task_info_template = (
    "## Task Information: "
    "The current date is (YYYY-MM-DD): {current_date} "
    "Task: {task} "
)


@dataclass
class ActionDefinition:
    """Declarative description of one action the planner may emit."""

    # Action identifier; also the key under which the action is registered.
    type: str
    # Human-readable explanation rendered into the system prompt.
    description: str
    # Parameter name -> textual description; None when the action takes none.
    parameters: Optional[Dict[str, str]] = None
    # Example action payloads rendered into the system prompt.
    examples: List[Dict[str, Any]] = field(default_factory=list)


class PlannerOutput(object):
    """Planned action together with the planner's reasoning sections."""

    def __init__(self, plan_action: PlanAction, additional_sections: dict[str, str]):
        """Store the action and split the sections into known and extra ones.

        Args:
            plan_action: The action chosen by the planner.
            additional_sections: Section name -> text. Must contain the
                ``"thought"`` and ``"review"`` keys.

        Raises:
            KeyError: If ``"thought"`` or ``"review"`` is missing.
        """
        self.plan_action = plan_action
        self.thought = additional_sections["thought"]
        self.review = additional_sections["review"]
        # Everything that is not exposed as a dedicated attribute above.
        self.additional_sections = {
            name: text
            for name, text in additional_sections.items()
            if name not in ("review", "thought")
        }


class ComputerUseAgentInterface:
    """Registry of UI and special actions plus system-prompt rendering."""

    def __init__(self):
        """Create empty registries and populate them with the default actions."""
        self.ui_actions: Dict[str, ActionDefinition] = {}
        self.special_actions: Dict[str, ActionDefinition] = {}
        self._setup_default_actions()

    def _setup_default_actions(self):
        """Register the built-in UI actions and special actions."""
        self.add_action(
            ActionDefinition(
                type="click",
                description="Click on a UI element",
                examples=[
                    {"type": "click", "description": "Click the 'Next' button."},
                    {
                        "type": "click",
                        "description": "Click the 'X' icon in the input box",
                    },
                    {
                        "type": "click",
                        "description": "Click the first name input box to focus on it.",
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="right_click",
                description="Right click on a UI element",
                examples=[
                    {
                        "type": "right_click",
                        "description": "Right click on the first row from the patient table to open the context menu.",
                    }
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="double_click",
                description="Double click on a UI element",
                examples=[
                    {
                        "type": "double_click",
                        "description": "Double click word app icon to open the application.",
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="type",
                description="Type text into a focused input field. Ensure the input box is focused before typing. To focus the input box, you may need to click on it first.",
                parameters={"text": "str - the text to be typed"},
                examples=[
                    {
                        "type": "type",
                        "description": "Type 'John' in the first name input box.",
                        "parameters": {"text": "John"},
                    },
                    {
                        "type": "type",
                        "description": "Type 'Doe' in the last name input box.",
                        "parameters": {"text": "Doe"},
                    },
                    {
                        "type": "type",
                        "description": "Type 'Hello, world!' in the text area.",
                        "parameters": {"text": "Hello, world!"},
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="scroll",
                description="Scroll an UI element in a specified direction",
                parameters={
                    "direction": "str - 'up', 'down', 'left', or 'right'",
                    "distance": "int - the number of scroll steps (wheel “clicks”) to send.",
                },
                examples=[
                    {
                        "type": "scroll",
                        "description": "Scroll down to see more content.",
                        "parameters": {"direction": "down"},
                    },
                    {
                        "type": "scroll",
                        "description": "Scroll up to the top of the page.",
                        "parameters": {"direction": "up"},
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="drag",
                description="Drag an element or the mouse (with left click on) from one location to another. You must specify both start_description and end_description.",
                parameters={
                    "start_description": "description of the location to start dragging",
                    "end_description": "description of the location to drag to",
                },
                examples=[
                    {
                        "type": "drag",
                        "description": "Drag the response.txt file to the responses folder",
                        # Nested under "parameters" so the prompt renderer
                        # emits them and the example matches what
                        # validate_action requires for a drag action.
                        "parameters": {
                            "start_description": "Click the response.txt file",
                            "end_description": "Click the responses folder",
                        },
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="mouse_move",
                description="Move the mouse to a specific element",
                examples=[
                    {
                        "type": "mouse_move",
                        "description": "Move the mouse to the 'Submit' button.",
                    },
                    {
                        "type": "mouse_move",
                        "description": "Hover over the 'Settings' icon.",
                    },
                ],
            )
        )

        self.add_action(
            ActionDefinition(
                type="key_press",
                description="Press a specific key on the keyboard",
                parameters={
                    # list(...) so the prompt shows the key names themselves
                    # rather than the "dict_keys([...])" view repr.
                    "key": (
                        "str # the key or key combination (separated by space) to be pressed. "
                        'Example of key combination "Ctrl A", "Shift Tab", "Ctrl C" etc. '
                        '" + Click" is not a valid combination, use two separate actions. '
                        "Beside normal keys like letters, numerics, punctuations etc.. "
                        f"here are special key list: {list(key_maps.keys())}."
                    )
                },
                examples=[
                    {
                        "type": "key_press",
                        "description": "Press 'Ctrl A' to select all text.",
                        "parameters": {"key": "Ctrl A"},
                    },
                    {
                        "type": "key_press",
                        "description": "Press Pagedown key.",
                        "parameters": {"key": "Pagedown"},
                    },
                ],
            )
        )

        self.add_special_action(
            ActionDefinition(
                type="extract_data",
                description="Use to extract some data from the screen for the task. This data will be stored in memory and used in the next actions or returned in the final result.",
                parameters={
                    "description": "str - short description of the data to be extracted",
                    "data": "str|json - the data to be extracted",
                },
                examples=[
                    {
                        "type": "extract_data",
                        "description": "Extract the product name and price from the screen.",
                        "parameters": {
                            "description": "Available product name and price",
                            "data": "Product Name: iPhone 14, Price: $999",
                        },
                    },
                ],
            )
        )

        self.add_special_action(
            ActionDefinition(
                type="finish",
                description=" Use it to finish the task with success or failure status. When you think the task was finished return success, while when you think can not be done, return failure, don't easily say failure, try your best to do the task.",
                parameters={"status": "str - 'success' or 'failure'"},
                examples=[
                    {
                        "type": "finish",
                        "description": "Task completed successfully.",
                        "parameters": {"status": "success"},
                    },
                ],
            )
        )

    def add_action(self, action: ActionDefinition):
        """Register (or replace) a UI action under its ``type``."""
        self.ui_actions[action.type] = action

    def add_special_action(self, action: ActionDefinition):
        """Register (or replace) a special action under its ``type``."""
        self.special_actions[action.type] = action

    def get_action_definition(self, action_type: str) -> Optional[ActionDefinition]:
        """Return the definition for ``action_type``, UI actions first.

        Returns:
            The matching ``ActionDefinition``, or ``None`` if the type is not
            registered in either registry.
        """
        return self.ui_actions.get(action_type) or self.special_actions.get(action_type)

    def validate_action(self, action: PlanAction):
        """Check that ``action`` is a known type with all declared parameters.

        Raises:
            ValidationException: If the action type is not registered, or a
                parameter declared by its definition is absent.
        """
        action_definition = self.get_action_definition(action.action_type)
        if action_definition is None:
            raise ValidationException(f"Invalid action type: {action.action_type}")

        if not action_definition.parameters:
            return

        # Treat a missing parameters mapping as empty so the caller gets a
        # ValidationException rather than a TypeError from the `in` test.
        provided = action.parameters or {}
        # NOTE(review): every declared parameter is treated as required,
        # including scroll's 'distance', which the scroll examples omit --
        # confirm that is intended.
        for parameter in action_definition.parameters:
            if parameter not in provided:
                raise ValidationException(
                    f"Missing parameter '{parameter}' in action: {action}"
                )

    def get_system_prompt(self) -> str:
        """Render ``system_template`` with the registered actions and examples.

        Returns:
            The formatted system prompt string.
        """
        # NOTE(review): reproduced as found (a single space); the original
        # indent width may have been wider -- confirm.
        indentation = " "
        nested_separator = ",\n" + 2 * indentation

        def quote(value: Any) -> str:
            """Return ``value`` as a double-quoted, JSON-escaped string."""
            return json.dumps(str(value), ensure_ascii=False)

        def render_definition(action: ActionDefinition) -> str:
            """Render one action as a bullet with an optional parameter list."""
            rendered = f"- {action.type}: {action.description}"
            if action.parameters:
                params = nested_separator.join(
                    f"{name}: {text}" for name, text in action.parameters.items()
                )
                rendered += (
                    "\n" + f"{indentation}parameters:\n{indentation}{indentation}{params}"
                )
            return rendered

        def render_examples(actions: List[ActionDefinition]) -> list[str]:
            """Render every example of every action as a JSON-like snippet."""
            rendered_examples = []
            for action in actions:
                for example in action.examples:
                    parts = [
                        f'"type": {quote(example["type"])}',
                        f'"description": {quote(example["description"])}',
                    ]
                    if "parameters" in example:
                        params = nested_separator.join(
                            f"{quote(name)}: {quote(value)}"
                            for name, value in example["parameters"].items()
                        )
                        parts.append(
                            '"parameters"'
                            + ": {\n"
                            + 2 * indentation
                            + params
                            + "\n"
                            + indentation
                            + "}"
                        )
                    rendered_examples.append(
                        "{\n"
                        + indentation
                        + (",\n" + indentation).join(parts)
                        + "\n}"
                    )
            return rendered_examples

        ui_definitions = list(self.ui_actions.values())
        special_definitions = list(self.special_actions.values())

        return system_template.format(
            available_actions="\n\n".join(
                render_definition(action) for action in ui_definitions
            ),
            special_actions="\n\n".join(
                render_definition(action) for action in special_definitions
            ),
            examples="\n\n".join(
                render_examples(ui_definitions + special_definitions)
            ),
        )


if __name__ == "__main__":
    agent = ComputerUseAgentInterface()
    print(agent.get_system_prompt())