diff --git a/requirements.txt b/requirements.txt
index f6b197b..eabdb7d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-browser-use==0.1.17
-langchain-google-genai
+browser-use>=0.1.18
+langchain-google-genai>=2.0.8
 pyperclip
 gradio
 langchain-ollama
\ No newline at end of file
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py
index 955389b..3bf5496 100644
--- a/src/agent/custom_agent.py
+++ b/src/agent/custom_agent.py
@@ -6,6 +6,8 @@
 import json
 import logging
-from typing import Optional, Type
+import pdb
+import traceback
+from typing import Any, Optional, Type
 
 from browser_use.agent.prompts import SystemPrompt
 
@@ -37,51 +39,53 @@ logger = logging.getLogger(__name__)
 
 class CustomAgent(Agent):
     def __init__(
-        self,
-        task: str,
-        llm: BaseChatModel,
-        add_infos: str = "",
-        browser: Browser | None = None,
-        browser_context: BrowserContext | None = None,
-        controller: Controller = Controller(),
-        use_vision: bool = True,
-        save_conversation_path: Optional[str] = None,
-        max_failures: int = 5,
-        retry_delay: int = 10,
-        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
-        max_input_tokens: int = 128000,
-        validate_output: bool = False,
-        include_attributes: list[str] = [
-            "title",
-            "type",
-            "name",
-            "role",
-            "tabindex",
-            "aria-label",
-            "placeholder",
-            "value",
-            "alt",
-            "aria-expanded",
-        ],
-        max_error_length: int = 400,
-        max_actions_per_step: int = 10,
+            self,
+            task: str,
+            llm: BaseChatModel,
+            add_infos: str = "",
+            browser: Browser | None = None,
+            browser_context: BrowserContext | None = None,
+            controller: Controller = Controller(),
+            use_vision: bool = True,
+            save_conversation_path: Optional[str] = None,
+            max_failures: int = 5,
+            retry_delay: int = 10,
+            system_prompt_class: Type[SystemPrompt] = SystemPrompt,
+            max_input_tokens: int = 128000,
+            validate_output: bool = False,
+            include_attributes: list[str] = [
+                "title",
+                "type",
+                "name",
+                "role",
+                "tabindex",
+                "aria-label",
+                "placeholder",
+                "value",
+                "alt",
+                "aria-expanded",
+            ],
+            max_error_length: int = 400,
+            max_actions_per_step: int = 10,
+            tool_call_in_content: bool = True,
     ):
         super().__init__(
-            task,
-            llm,
-            browser,
-            browser_context,
-            controller,
-            use_vision,
-            save_conversation_path,
-            max_failures,
-            retry_delay,
-            system_prompt_class,
-            max_input_tokens,
-            validate_output,
-            include_attributes,
-            max_error_length,
-            max_actions_per_step,
+            task=task,
+            llm=llm,
+            browser=browser,
+            browser_context=browser_context,
+            controller=controller,
+            use_vision=use_vision,
+            save_conversation_path=save_conversation_path,
+            max_failures=max_failures,
+            retry_delay=retry_delay,
+            system_prompt_class=system_prompt_class,
+            max_input_tokens=max_input_tokens,
+            validate_output=validate_output,
+            include_attributes=include_attributes,
+            max_error_length=max_error_length,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )
         self.add_infos = add_infos
         self.message_manager = CustomMassageManager(
@@ -93,6 +97,7 @@ class CustomAgent(Agent):
             include_attributes=self.include_attributes,
             max_error_length=self.max_error_length,
             max_actions_per_step=self.max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )
 
     def _setup_action_models(self) -> None:
@@ -122,7 +127,7 @@ class CustomAgent(Agent):
         )
 
     def update_step_info(
-        self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
+            self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
     ):
         """
         update step info
@@ -133,9 +138,9 @@ class CustomAgent(Agent):
         step_info.step_number += 1
         important_contents = model_output.current_state.important_contents
         if (
-            important_contents
-            and "None" not in important_contents
-            and important_contents not in step_info.memory
+                important_contents
+                and "None" not in important_contents
+                and important_contents not in step_info.memory
         ):
             step_info.memory += important_contents + "\n"
 
@@ -146,16 +151,35 @@ class CustomAgent(Agent):
     @time_execution_async("--get_next_action")
     async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
         """Get next action from LLM based on current state"""
+        try:
+            structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
+            response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
 
-        ret = self.llm.invoke(input_messages)
-        parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
-        parsed: AgentOutput = self.AgentOutput(**parsed_json)
-        # cut the number of actions to max_actions_per_step
-        parsed.action = parsed.action[: self.max_actions_per_step]
-        self._log_response(parsed)
-        self.n_steps += 1
+            parsed: AgentOutput = response['parsed']
+            # cut the number of actions to max_actions_per_step
+            parsed.action = parsed.action[: self.max_actions_per_step]
+            self._log_response(parsed)
+            self.n_steps += 1
 
-        return parsed
+            return parsed
+        except Exception as e:
+            # If something goes wrong, invoke the LLM again without structured output
+            # and parse the response manually. Temporary solution for DeepSeek.
+            ret = self.llm.invoke(input_messages)
+            if isinstance(ret.content, list):
+                parsed_json = json.loads(ret.content[0].replace("```json", "").replace("```", ""))
+            else:
+                parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
+            parsed: AgentOutput = self.AgentOutput(**parsed_json)
+            if parsed is None:
+                raise ValueError('Could not parse response.')
+
+            # cut the number of actions to max_actions_per_step
+            parsed.action = parsed.action[: self.max_actions_per_step]
+            self._log_response(parsed)
+            self.n_steps += 1
+
+            return parsed
 
     @time_execution_async("--step")
     async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
@@ -233,7 +257,7 @@ class CustomAgent(Agent):
 
             if self.history.is_done():
                 if (
-                    self.validate_output and step < max_steps - 1
+                        self.validate_output and step < max_steps - 1
                 ):  # if last step, we dont need to validate
                     if not await self._validate_output():
                         continue
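Note on the get_next_action change above: it replaces hand-stripping of ```json fences with LangChain's structured-output API, keeping the manual path only as a fallback for models whose structured output is unreliable (e.g. DeepSeek). A minimal standalone sketch of the same pattern, where the ExampleOutput schema and the prompt are illustrative assumptions rather than part of this patch:

    import json

    from langchain_google_genai import ChatGoogleGenerativeAI
    from pydantic import BaseModel


    class ExampleOutput(BaseModel):  # stand-in for the real AgentOutput model
        answer: str


    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp")
    messages = [("human", "Reply with JSON containing an 'answer' field: what is 2 + 2?")]

    try:
        # Preferred path: let LangChain coerce the reply into the schema.
        structured_llm = llm.with_structured_output(ExampleOutput, include_raw=True)
        response = structured_llm.invoke(messages)
        parsed = response["parsed"]  # ExampleOutput instance (None if parsing failed)
    except Exception:
        # Fallback path: fetch raw text and strip ```json fences by hand.
        ret = llm.invoke(messages)
        content = ret.content[0] if isinstance(ret.content, list) else ret.content
        parsed = ExampleOutput(**json.loads(content.replace("```json", "").replace("```", "")))
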
diff --git a/src/agent/custom_massage_manager.py b/src/agent/custom_massage_manager.py
index 480ad39..8de2b06 100644
--- a/src/agent/custom_massage_manager.py
+++ b/src/agent/custom_massage_manager.py
@@ -17,6 +17,7 @@ from browser_use.browser.views import BrowserState
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import (
     HumanMessage,
+    AIMessage
 )
 
 from .custom_prompts import CustomAgentMessagePrompt
@@ -26,40 +27,70 @@ logger = logging.getLogger(__name__)
 
 class CustomMassageManager(MessageManager):
     def __init__(
-        self,
-        llm: BaseChatModel,
-        task: str,
-        action_descriptions: str,
-        system_prompt_class: Type[SystemPrompt],
-        max_input_tokens: int = 128000,
-        estimated_tokens_per_character: int = 3,
-        image_tokens: int = 800,
-        include_attributes: list[str] = [],
-        max_error_length: int = 400,
-        max_actions_per_step: int = 10,
+            self,
+            llm: BaseChatModel,
+            task: str,
+            action_descriptions: str,
+            system_prompt_class: Type[SystemPrompt],
+            max_input_tokens: int = 128000,
+            estimated_tokens_per_character: int = 3,
+            image_tokens: int = 800,
+            include_attributes: list[str] = [],
+            max_error_length: int = 400,
+            max_actions_per_step: int = 10,
+            tool_call_in_content: bool = False,
     ):
         super().__init__(
-            llm,
-            task,
-            action_descriptions,
-            system_prompt_class,
-            max_input_tokens,
-            estimated_tokens_per_character,
-            image_tokens,
-            include_attributes,
-            max_error_length,
-            max_actions_per_step,
+            llm=llm,
+            task=task,
+            action_descriptions=action_descriptions,
+            system_prompt_class=system_prompt_class,
+            max_input_tokens=max_input_tokens,
+            estimated_tokens_per_character=estimated_tokens_per_character,
+            image_tokens=image_tokens,
+            include_attributes=include_attributes,
+            max_error_length=max_error_length,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )
-        # Move Task info to state_message
+        # Custom: Move Task info to state_message
         self.history = MessageHistory()
         self._add_message_with_tokens(self.system_prompt)
+        tool_calls = [
+            {
+                'name': 'AgentOutput',
+                'args': {
+                    'current_state': {
+                        'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
+                        'memory': '',
+                        'next_goal': 'Obtain task from user',
+                    },
+                    'action': [],
+                },
+                'id': '',
+                'type': 'tool_call',
+            }
+        ]
+        if self.tool_call_in_content:
+            # openai throws error if tool_calls are not responded -> move to content
+            example_tool_call = AIMessage(
+                content=f'{tool_calls}',
+                tool_calls=[],
+            )
+        else:
+            example_tool_call = AIMessage(
+                content='',
+                tool_calls=tool_calls,
+            )
+
+        self._add_message_with_tokens(example_tool_call)
 
     def add_state_message(
-        self,
-        state: BrowserState,
-        result: Optional[List[ActionResult]] = None,
-        step_info: Optional[AgentStepInfo] = None,
+            self,
+            state: BrowserState,
+            result: Optional[List[ActionResult]] = None,
+            step_info: Optional[AgentStepInfo] = None,
     ) -> None:
         """Add browser state as human message"""
@@ -72,7 +103,7 @@ class CustomMassageManager(MessageManager):
                 self._add_message_with_tokens(msg)
                 if r.error:
                     msg = HumanMessage(
-                        content=str(r.error)[-self.max_error_length :]
+                        content=str(r.error)[-self.max_error_length:]
                     )
                     self._add_message_with_tokens(msg)
                     result = None  # if result in history, we dont want to add it again
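The tool_call_in_content flag introduced above decides how the seeded example AgentOutput is presented to the model: as a structured tool call on the AIMessage, or stringified into the message content for providers that reject assistant tool calls that never receive a tool response. A standalone illustration of the two message shapes, with the payload copied from the patch:

    from langchain_core.messages import AIMessage

    tool_calls = [
        {
            'name': 'AgentOutput',
            'args': {
                'current_state': {
                    'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
                    'memory': '',
                    'next_goal': 'Obtain task from user',
                },
                'action': [],
            },
            'id': '',
            'type': 'tool_call',
        }
    ]

    # tool_call_in_content=True: stringify the payload into content, since some
    # APIs (e.g. OpenAI) error on assistant tool_calls that are never answered.
    as_content = AIMessage(content=str(tool_calls), tool_calls=[])

    # tool_call_in_content=False: attach the payload as a real tool call.
    as_tool_call = AIMessage(content='', tool_calls=tool_calls)
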
diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py
index f913cbb..56aeb64 100644
--- a/src/agent/custom_prompts.py
+++ b/src/agent/custom_prompts.py
@@ -24,7 +24,7 @@ class CustomSystemPrompt(SystemPrompt):
    {
      "current_state": {
        "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
-       "important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
+       "important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
        "completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
        "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
        "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
@@ -148,12 +148,12 @@ class CustomSystemPrompt(SystemPrompt):
 
 class CustomAgentMessagePrompt:
     def __init__(
-        self,
-        state: BrowserState,
-        result: Optional[List[ActionResult]] = None,
-        include_attributes: list[str] = [],
-        max_error_length: int = 400,
-        step_info: Optional[CustomAgentStepInfo] = None,
+            self,
+            state: BrowserState,
+            result: Optional[List[ActionResult]] = None,
+            include_attributes: list[str] = [],
+            max_error_length: int = 400,
+            step_info: Optional[CustomAgentStepInfo] = None,
     ):
         self.state = state
         self.result = result
@@ -183,7 +183,7 @@ class CustomAgentMessagePrompt:
                     state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
                 if result.error:
                     # only use last 300 characters of error
-                    error = result.error[-self.max_error_length :]
+                    error = result.error[-self.max_error_length:]
                     state_description += (
                         f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
                     )
diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py
index ebae54e..03ac869 100644
--- a/src/browser/custom_context.py
+++ b/src/browser/custom_context.py
@@ -23,11 +23,12 @@ class CustomBrowserContext(BrowserContext):
         config: BrowserContextConfig = BrowserContextConfig(),
         context: BrowserContext = None,
     ):
-        super(CustomBrowserContext, self).__init__(browser, config)
+        super(CustomBrowserContext, self).__init__(browser=browser, config=config)
         self.context = context
 
     async def _create_context(self, browser: PlaywrightBrowser):
         """Creates a new browser context with anti-detection measures and loads cookies if available."""
+        # If we already have a context, return it directly
        if self.context:
            return self.context
        if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
@@ -46,7 +47,7 @@ class CustomBrowserContext(BrowserContext):
             bypass_csp=self.config.disable_security,
             ignore_https_errors=self.config.disable_security,
             record_video_dir=self.config.save_recording_path,
-            record_video_size=self.config.browser_window_size,  # set record video size
+            record_video_size=self.config.browser_window_size,  # set record video size, same as window size
         )
 
         if self.config.trace_path:
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 8a900cd..f0c5fcb 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -86,6 +86,7 @@ def get_llm_model(provider: str, **kwargs):
         return ChatOllama(
             model=kwargs.get("model_name", "qwen2.5:7b"),
             temperature=kwargs.get("temperature", 0.0),
+            num_ctx=128000,
         )
     elif provider == "azure_openai":
         if not kwargs.get("base_url", ""):
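The num_ctx=128000 added to the Ollama branch above widens the model's context window; without it, many Ollama models default to roughly 2048 tokens and silently truncate the agent's long prompts. Whether 128000 is actually usable depends on the model and on available RAM/VRAM, so treat the value as an assumption to tune. Equivalent standalone call:

    from langchain_ollama import ChatOllama

    # num_ctx raises Ollama's context window (the default is small, ~2048 for
    # many models); 128000 assumes the model and your memory can support it.
    llm = ChatOllama(model="qwen2.5:7b", temperature=0.0, num_ctx=128000)
    print(llm.invoke("Say hi in one word.").content)
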
diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py
index 91b4f54..4ced1db 100644
--- a/tests/test_browser_use.py
+++ b/tests/test_browser_use.py
@@ -80,11 +80,12 @@ async def test_browser_use_org():
 
 async def test_browser_use_custom():
     from browser_use.browser.context import BrowserContextWindowSize
+    from browser_use.browser.browser import BrowserConfig
     from playwright.async_api import async_playwright
 
     from src.agent.custom_agent import CustomAgent
     from src.agent.custom_prompts import CustomSystemPrompt
-    from src.browser.custom_browser import BrowserConfig, CustomBrowser
+    from src.browser.custom_browser import CustomBrowser
     from src.browser.custom_context import BrowserContextConfig
     from src.controller.custom_controller import CustomController
 
@@ -95,15 +96,15 @@ async def test_browser_use_custom():
     #     model_name="gpt-4o",
     #     temperature=0.8,
     #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
-    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
+    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
     # )
 
-    # llm = utils.get_llm_model(
-    #     provider="gemini",
-    #     model_name="gemini-2.0-flash-exp",
-    #     temperature=1.0,
-    #     api_key=os.getenv("GOOGLE_API_KEY", "")
-    # )
+    llm = utils.get_llm_model(
+        provider="gemini",
+        model_name="gemini-2.0-flash-exp",
+        temperature=1.0,
+        api_key=os.getenv("GOOGLE_API_KEY", "")
+    )
 
     # llm = utils.get_llm_model(
     #     provider="deepseek",
@@ -111,14 +112,16 @@ async def test_browser_use_custom():
     #     temperature=0.8
     # )
 
-    llm = utils.get_llm_model(
-        provider="ollama", model_name="qwen2.5:7b", temperature=0.8
-    )
+    # llm = utils.get_llm_model(
+    #     provider="ollama", model_name="qwen2.5:7b", temperature=0.8
+    # )
 
     controller = CustomController()
     use_own_browser = False
     disable_security = True
-    use_vision = False
+    use_vision = True  # Set to False when using DeepSeek
+    tool_call_in_content = True  # Set to True when using Ollama
+    max_actions_per_step = 1
     playwright = None
     browser_context_ = None
     try:
@@ -171,6 +174,8 @@ async def test_browser_use_custom():
             controller=controller,
             system_prompt_class=CustomSystemPrompt,
             use_vision=use_vision,
+            tool_call_in_content=tool_call_in_content,
+            max_actions_per_step=max_actions_per_step
         )
 
         history: AgentHistoryList = await agent.run(max_steps=10)
diff --git a/webui.py b/webui.py
index aaee0ff..f0ec79b 100644
--- a/webui.py
+++ b/webui.py
@@ -29,22 +29,24 @@ from src.utils import utils
 
 
 async def run_browser_agent(
-    agent_type,
-    llm_provider,
-    llm_model_name,
-    llm_temperature,
-    llm_base_url,
-    llm_api_key,
-    use_own_browser,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    add_infos,
-    max_steps,
-    use_vision,
+        agent_type,
+        llm_provider,
+        llm_model_name,
+        llm_temperature,
+        llm_base_url,
+        llm_api_key,
+        use_own_browser,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        add_infos,
+        max_steps,
+        use_vision,
+        max_actions_per_step,
+        tool_call_in_content
 ):
     # Ensure the recording directory exists
     os.makedirs(save_recording_path, exist_ok=True)
@@ -74,6 +76,8 @@ async def run_browser_agent(
             task=task,
             max_steps=max_steps,
             use_vision=use_vision,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
         )
     elif agent_type == "custom":
         final_result, errors, model_actions, model_thoughts = await run_custom_agent(
@@ -88,6 +92,8 @@ async def run_browser_agent(
             add_infos=add_infos,
             max_steps=max_steps,
             use_vision=use_vision,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
         )
     else:
         raise ValueError(f"Invalid agent type: {agent_type}")
@@ -107,15 +113,18 @@ async def run_browser_agent(
 
 
 async def run_org_agent(
-    llm,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    max_steps,
-    use_vision,
+        llm,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        max_steps,
+        use_vision,
+        max_actions_per_step,
+        tool_call_in_content
+
 ):
     browser = Browser(
         config=BrowserConfig(
@@ -125,20 +134,22 @@ async def run_org_agent(
         )
     )
     async with await browser.new_context(
-        config=BrowserContextConfig(
-            trace_path="./tmp/traces",
-            save_recording_path=save_recording_path if save_recording_path else None,
-            no_viewport=False,
-            browser_window_size=BrowserContextWindowSize(
-                width=window_w, height=window_h
-            ),
-        )
+            config=BrowserContextConfig(
+                trace_path="./tmp/traces",
+                save_recording_path=save_recording_path if save_recording_path else None,
+                no_viewport=False,
+                browser_window_size=BrowserContextWindowSize(
+                    width=window_w, height=window_h
+                ),
+            )
     ) as browser_context:
         agent = Agent(
             task=task,
             llm=llm,
             use_vision=use_vision,
             browser_context=browser_context,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
         )
         history = await agent.run(max_steps=max_steps)
@@ -151,17 +162,19 @@ async def run_org_agent(
 
 
 async def run_custom_agent(
-    llm,
-    use_own_browser,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    add_infos,
-    max_steps,
-    use_vision,
+        llm,
+        use_own_browser,
+        headless,
+        disable_security,
+        window_w,
+        window_h,
+        save_recording_path,
+        task,
+        add_infos,
+        max_steps,
+        use_vision,
+        max_actions_per_step,
+        tool_call_in_content
 ):
     controller = CustomController()
     playwright = None
@@ -197,17 +210,17 @@ async def run_custom_agent(
             )
         )
         async with await browser.new_context(
-            config=BrowserContextConfig(
-                trace_path="./tmp/result_processing",
-                save_recording_path=save_recording_path
-                if save_recording_path
-                else None,
-                no_viewport=False,
-                browser_window_size=BrowserContextWindowSize(
-                    width=window_w, height=window_h
+                config=BrowserContextConfig(
+                    trace_path="./tmp/result_processing",
+                    save_recording_path=save_recording_path
+                    if save_recording_path
+                    else None,
+                    no_viewport=False,
+                    browser_window_size=BrowserContextWindowSize(
+                        width=window_w, height=window_h
+                    ),
                 ),
-            ),
-            context=browser_context_,
+                context=browser_context_,
         ) as browser_context:
             agent = CustomAgent(
                 task=task,
@@ -217,6 +230,8 @@ async def run_custom_agent(
                 browser_context=browser_context,
                 controller=controller,
                 system_prompt_class=CustomSystemPrompt,
+                max_actions_per_step=max_actions_per_step,
+                tool_call_in_content=tool_call_in_content
             )
             history = await agent.run(max_steps=max_steps)
@@ -290,7 +305,7 @@ def create_ui(theme_name="Ocean"):
     """
 
     with gr.Blocks(
-        title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js
+            title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js
     ) as demo:
         with gr.Row():
             gr.Markdown(
@@ -318,11 +333,24 @@ def create_ui(theme_name="Ocean"):
                         label="Max Run Steps",
                         info="Maximum number of steps the agent will take",
                     )
+                    max_actions_per_step = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=10,
+                        step=1,
+                        label="Max Actions per Step",
+                        info="Maximum number of actions the agent will take per step",
+                    )
                     use_vision = gr.Checkbox(
                         label="Use Vision",
                         value=True,
                         info="Enable visual processing capabilities",
                     )
+                    tool_call_in_content = gr.Checkbox(
+                        label="Use Tool Calls in Content",
+                        value=True,
+                        info="Enable tool calls in message content",
+                    )
 
             with gr.TabItem("🔧 LLM Configuration", id=2):
                 with gr.Group():
@@ -461,6 +489,8 @@ def create_ui(theme_name="Ocean"):
                 add_infos,
                 max_steps,
                 use_vision,
+                max_actions_per_step,
+                tool_call_in_content
             ],
             outputs=[
                 final_result_output,
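For orientation, the two new UI controls only reach the agents because they are appended to the click handler's inputs list above. A stripped-down sketch of that wiring, where the run stub is hypothetical (the real handler is run_browser_agent):

    import gradio as gr

    def run(max_steps, use_vision, max_actions_per_step, tool_call_in_content):
        # stand-in for run_browser_agent(...)
        return f"steps={max_steps}, actions/step={max_actions_per_step}"

    with gr.Blocks() as demo:
        max_steps = gr.Slider(1, 200, value=100, step=1, label="Max Run Steps")
        max_actions_per_step = gr.Slider(1, 20, value=10, step=1, label="Max Actions per Step")
        use_vision = gr.Checkbox(value=True, label="Use Vision")
        tool_call_in_content = gr.Checkbox(value=True, label="Use Tool Calls in Content")
        out = gr.Textbox(label="Result")
        gr.Button("Run").click(
            fn=run,
            inputs=[max_steps, use_vision, max_actions_per_step, tool_call_in_content],
            outputs=[out],
        )

    # demo.launch()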