feat: adapt to new version of browser-use
requirements.txt
@@ -1,5 +1,5 @@
-browser-use==0.1.17
-langchain-google-genai
+browser-use>=0.1.18
+langchain-google-genai>=2.0.8
 pyperclip
 gradio
 langchain-ollama
src/agent/custom_agent.py
@@ -6,6 +6,8 @@
 import json
 import logging
 import pdb
 import traceback
 from typing import Optional, Type

 from browser_use.agent.prompts import SystemPrompt
@@ -37,51 +39,53 @@ logger = logging.getLogger(__name__)

 class CustomAgent(Agent):
     def __init__(
-        self,
-        task: str,
-        llm: BaseChatModel,
-        add_infos: str = "",
-        browser: Browser | None = None,
-        browser_context: BrowserContext | None = None,
-        controller: Controller = Controller(),
-        use_vision: bool = True,
-        save_conversation_path: Optional[str] = None,
-        max_failures: int = 5,
-        retry_delay: int = 10,
-        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
-        max_input_tokens: int = 128000,
-        validate_output: bool = False,
-        include_attributes: list[str] = [
-            "title",
-            "type",
-            "name",
-            "role",
-            "tabindex",
-            "aria-label",
-            "placeholder",
-            "value",
-            "alt",
-            "aria-expanded",
-        ],
-        max_error_length: int = 400,
-        max_actions_per_step: int = 10,
+        self,
+        task: str,
+        llm: BaseChatModel,
+        add_infos: str = "",
+        browser: Browser | None = None,
+        browser_context: BrowserContext | None = None,
+        controller: Controller = Controller(),
+        use_vision: bool = True,
+        save_conversation_path: Optional[str] = None,
+        max_failures: int = 5,
+        retry_delay: int = 10,
+        system_prompt_class: Type[SystemPrompt] = SystemPrompt,
+        max_input_tokens: int = 128000,
+        validate_output: bool = False,
+        include_attributes: list[str] = [
+            "title",
+            "type",
+            "name",
+            "role",
+            "tabindex",
+            "aria-label",
+            "placeholder",
+            "value",
+            "alt",
+            "aria-expanded",
+        ],
+        max_error_length: int = 400,
+        max_actions_per_step: int = 10,
+        tool_call_in_content: bool = True,
     ):
         super().__init__(
-            task,
-            llm,
-            browser,
-            browser_context,
-            controller,
-            use_vision,
-            save_conversation_path,
-            max_failures,
-            retry_delay,
-            system_prompt_class,
-            max_input_tokens,
-            validate_output,
-            include_attributes,
-            max_error_length,
-            max_actions_per_step,
+            task=task,
+            llm=llm,
+            browser=browser,
+            browser_context=browser_context,
+            controller=controller,
+            use_vision=use_vision,
+            save_conversation_path=save_conversation_path,
+            max_failures=max_failures,
+            retry_delay=retry_delay,
+            system_prompt_class=system_prompt_class,
+            max_input_tokens=max_input_tokens,
+            validate_output=validate_output,
+            include_attributes=include_attributes,
+            max_error_length=max_error_length,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )
         self.add_infos = add_infos
         self.message_manager = CustomMassageManager(
@@ -93,6 +97,7 @@ class CustomAgent(Agent):
             include_attributes=self.include_attributes,
             max_error_length=self.max_error_length,
             max_actions_per_step=self.max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )

     def _setup_action_models(self) -> None:
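Note: the super().__init__ call switches from positional to keyword arguments. A minimal sketch of why this matters, using a hypothetical base class (not browser-use code): if upstream inserts or reorders a parameter, positional forwarding silently binds values to the wrong names, while keyword forwarding keeps working or fails loudly.

# Hypothetical upstream change: a new parameter lands between task and llm.
class AgentV1:
    def __init__(self, task, llm):
        self.task, self.llm = task, llm

class AgentV2:
    def __init__(self, task, injected_extra=None, llm=None):
        self.task, self.llm = task, llm

llm = object()  # stand-in for a chat model
a = AgentV2("buy milk", llm)           # positional: llm silently lands in injected_extra
b = AgentV2(task="buy milk", llm=llm)  # keyword: still binds to the right parameter
assert a.llm is None and b.llm is llm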
@@ -122,7 +127,7 @@ class CustomAgent(Agent):
         )

     def update_step_info(
-        self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
+        self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
     ):
         """
         update step info
@@ -133,9 +138,9 @@ class CustomAgent(Agent):
         step_info.step_number += 1
         important_contents = model_output.current_state.important_contents
         if (
-            important_contents
-            and "None" not in important_contents
-            and important_contents not in step_info.memory
+            important_contents
+            and "None" not in important_contents
+            and important_contents not in step_info.memory
         ):
             step_info.memory += important_contents + "\n"
@@ -146,16 +151,35 @@ class CustomAgent(Agent):
     @time_execution_async("--get_next_action")
     async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
         """Get next action from LLM based on current state"""
+        try:
+            structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
+            response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore

-        ret = self.llm.invoke(input_messages)
-        parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
-        parsed: AgentOutput = self.AgentOutput(**parsed_json)
-        # cut the number of actions to max_actions_per_step
-        parsed.action = parsed.action[: self.max_actions_per_step]
-        self._log_response(parsed)
-        self.n_steps += 1
+            parsed: AgentOutput = response['parsed']
+            # cut the number of actions to max_actions_per_step
+            parsed.action = parsed.action[: self.max_actions_per_step]
+            self._log_response(parsed)
+            self.n_steps += 1

-        return parsed
+            return parsed
+        except Exception as e:
+            # If something goes wrong, try to invoke the LLM again without structured output,
+            # and manually parse the response. Temporary solution for DeepSeek.
+            ret = self.llm.invoke(input_messages)
+            if isinstance(ret.content, list):
+                parsed_json = json.loads(ret.content[0].replace("```json", "").replace("```", ""))
+            else:
+                parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
+            parsed: AgentOutput = self.AgentOutput(**parsed_json)
+            if parsed is None:
+                raise ValueError(f'Could not parse response.')
+
+            # cut the number of actions to max_actions_per_step
+            parsed.action = parsed.action[: self.max_actions_per_step]
+            self._log_response(parsed)
+            self.n_steps += 1
+
+            return parsed

     @time_execution_async("--step")
     async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
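Note: get_next_action now prefers LangChain's structured-output path and keeps the manual JSON parse only as a fallback. A minimal sketch of the pattern, assuming a LangChain chat model and a pydantic AgentOutput schema (names reused from the diff for illustration):

import json

async def next_action(llm, AgentOutput, messages):
    try:
        # include_raw=True makes LangChain return {"raw": ..., "parsed": ..., "parsing_error": ...}
        structured_llm = llm.with_structured_output(AgentOutput, include_raw=True)
        response = await structured_llm.ainvoke(messages)
        return response["parsed"]
    except Exception:
        # Fallback for providers with weak tool-calling support (DeepSeek in this commit):
        # strip markdown code fences and parse the JSON body by hand.
        content = llm.invoke(messages).content
        if isinstance(content, list):
            content = content[0]
        return AgentOutput(**json.loads(content.replace("```json", "").replace("```", "")))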
@@ -233,7 +257,7 @@ class CustomAgent(Agent):

             if self.history.is_done():
                 if (
-                    self.validate_output and step < max_steps - 1
+                    self.validate_output and step < max_steps - 1
                 ):  # if last step, we don't need to validate
                     if not await self._validate_output():
                         continue
src/agent/custom_massage_manager.py
@@ -17,6 +17,7 @@ from browser_use.browser.views import BrowserState
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import (
     HumanMessage,
+    AIMessage
 )

 from .custom_prompts import CustomAgentMessagePrompt
@@ -26,40 +27,70 @@ logger = logging.getLogger(__name__)

 class CustomMassageManager(MessageManager):
     def __init__(
-        self,
-        llm: BaseChatModel,
-        task: str,
-        action_descriptions: str,
-        system_prompt_class: Type[SystemPrompt],
-        max_input_tokens: int = 128000,
-        estimated_tokens_per_character: int = 3,
-        image_tokens: int = 800,
-        include_attributes: list[str] = [],
-        max_error_length: int = 400,
-        max_actions_per_step: int = 10,
+        self,
+        llm: BaseChatModel,
+        task: str,
+        action_descriptions: str,
+        system_prompt_class: Type[SystemPrompt],
+        max_input_tokens: int = 128000,
+        estimated_tokens_per_character: int = 3,
+        image_tokens: int = 800,
+        include_attributes: list[str] = [],
+        max_error_length: int = 400,
+        max_actions_per_step: int = 10,
+        tool_call_in_content: bool = False,
     ):
         super().__init__(
-            llm,
-            task,
-            action_descriptions,
-            system_prompt_class,
-            max_input_tokens,
-            estimated_tokens_per_character,
-            image_tokens,
-            include_attributes,
-            max_error_length,
-            max_actions_per_step,
+            llm=llm,
+            task=task,
+            action_descriptions=action_descriptions,
+            system_prompt_class=system_prompt_class,
+            max_input_tokens=max_input_tokens,
+            estimated_tokens_per_character=estimated_tokens_per_character,
+            image_tokens=image_tokens,
+            include_attributes=include_attributes,
+            max_error_length=max_error_length,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content,
         )

-        # Move Task info to state_message
+        # Custom: Move Task info to state_message
         self.history = MessageHistory()
         self._add_message_with_tokens(self.system_prompt)
+        tool_calls = [
+            {
+                'name': 'AgentOutput',
+                'args': {
+                    'current_state': {
+                        'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
+                        'memory': '',
+                        'next_goal': 'Obtain task from user',
+                    },
+                    'action': [],
+                },
+                'id': '',
+                'type': 'tool_call',
+            }
+        ]
+        if self.tool_call_in_content:
+            # openai throws error if tool_calls are not responded -> move to content
+            example_tool_call = AIMessage(
+                content=f'{tool_calls}',
+                tool_calls=[],
+            )
+        else:
+            example_tool_call = AIMessage(
+                content=f'',
+                tool_calls=tool_calls,
+            )
+
+        self._add_message_with_tokens(example_tool_call)

     def add_state_message(
-        self,
-        state: BrowserState,
-        result: Optional[List[ActionResult]] = None,
-        step_info: Optional[AgentStepInfo] = None,
+        self,
+        state: BrowserState,
+        result: Optional[List[ActionResult]] = None,
+        step_info: Optional[AgentStepInfo] = None,
     ) -> None:
         """Add browser state as human message"""
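Note: the seeded example message above exists because backends differ in how they accept tool calls: per the commit's own comment, OpenAI-style APIs reject a history containing an unanswered tool call, and some local backends cannot consume tool-call messages at all, so tool_call_in_content chooses between embedding the call as plain text or as a native tool call. A minimal sketch of the two shapes, using langchain-core's AIMessage:

from langchain_core.messages import AIMessage

tool_calls = [{"name": "AgentOutput", "args": {"action": []}, "id": "", "type": "tool_call"}]

# Plain-text encoding: safe for providers that reject dangling tool calls.
as_content = AIMessage(content=f"{tool_calls}", tool_calls=[])
# Native encoding: for providers that understand tool-call messages.
as_tool_call = AIMessage(content="", tool_calls=tool_calls)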
@@ -72,7 +103,7 @@ class CustomMassageManager(MessageManager):
                     self._add_message_with_tokens(msg)
                 if r.error:
                     msg = HumanMessage(
-                        content=str(r.error)[-self.max_error_length :]
+                        content=str(r.error)[-self.max_error_length:]
                     )
                     self._add_message_with_tokens(msg)
         result = None  # if result in history, we don't want to add it again
src/agent/custom_prompts.py
@@ -24,7 +24,7 @@ class CustomSystemPrompt(SystemPrompt):
    {
        "current_state": {
            "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
-            "important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
+            "important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
            "completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
            "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
            "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
@@ -148,12 +148,12 @@ class CustomSystemPrompt(SystemPrompt):

 class CustomAgentMessagePrompt:
     def __init__(
-        self,
-        state: BrowserState,
-        result: Optional[List[ActionResult]] = None,
-        include_attributes: list[str] = [],
-        max_error_length: int = 400,
-        step_info: Optional[CustomAgentStepInfo] = None,
+        self,
+        state: BrowserState,
+        result: Optional[List[ActionResult]] = None,
+        include_attributes: list[str] = [],
+        max_error_length: int = 400,
+        step_info: Optional[CustomAgentStepInfo] = None,
     ):
         self.state = state
         self.result = result
@@ -183,7 +183,7 @@ class CustomAgentMessagePrompt:
                state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
                if result.error:
                    # only use last 300 characters of error
-                    error = result.error[-self.max_error_length :]
+                    error = result.error[-self.max_error_length:]
                    state_description += (
                        f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
                    )
src/browser/custom_context.py
@@ -23,11 +23,12 @@ class CustomBrowserContext(BrowserContext):
        config: BrowserContextConfig = BrowserContextConfig(),
        context: BrowserContext = None,
    ):
-        super(CustomBrowserContext, self).__init__(browser, config)
+        super(CustomBrowserContext, self).__init__(browser=browser, config=config)
        self.context = context

    async def _create_context(self, browser: PlaywrightBrowser):
        """Creates a new browser context with anti-detection measures and loads cookies if available."""
+        # If we have a context, return it directly
+        if self.context:
+            return self.context
        if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
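Note: the new early return is what lets callers inject an already-created Playwright context (browser_context_ in webui.py below) and have the agent reuse it instead of spawning a fresh profile. Sketch of the control flow, simplified from the method above and not the full browser-use API:

async def _create_context(self, browser):
    if self.context:
        # An externally created context was injected; reuse it as-is.
        return self.context
    # Otherwise fall through to creating a new context with the configured
    # recording, security, and window-size options.
    ...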
@@ -46,7 +47,7 @@ class CustomBrowserContext(BrowserContext):
            bypass_csp=self.config.disable_security,
            ignore_https_errors=self.config.disable_security,
            record_video_dir=self.config.save_recording_path,
-            record_video_size=self.config.browser_window_size,  # set record video size
+            record_video_size=self.config.browser_window_size,  # set record video size, same as window size
        )

        if self.config.trace_path:
src/utils/utils.py
@@ -86,6 +86,7 @@ def get_llm_model(provider: str, **kwargs):
        return ChatOllama(
            model=kwargs.get("model_name", "qwen2.5:7b"),
            temperature=kwargs.get("temperature", 0.0),
+            num_ctx=128000,
        )
    elif provider == "azure_openai":
        if not kwargs.get("base_url", ""):
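Note: num_ctx is the Ollama-specific context-window size; without it Ollama serves most models with a small default window (historically 2048 tokens), which would truncate the agent's long prompts. Setting 128000 assumes the model and local memory can handle it; a more conservative sketch:

from langchain_ollama import ChatOllama

# Assumes a local Ollama server with qwen2.5:7b pulled.
llm = ChatOllama(model="qwen2.5:7b", temperature=0.0, num_ctx=32768)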
tests/test_browser_use.py
@@ -80,11 +80,12 @@ async def test_browser_use_org():

async def test_browser_use_custom():
    from browser_use.browser.context import BrowserContextWindowSize
+    from browser_use.browser.browser import BrowserConfig
    from playwright.async_api import async_playwright

    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt
-    from src.browser.custom_browser import BrowserConfig, CustomBrowser
+    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import BrowserContextConfig
    from src.controller.custom_controller import CustomController
@@ -95,15 +96,15 @@ async def test_browser_use_custom():
    #     model_name="gpt-4o",
    #     temperature=0.8,
    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
-    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
+    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    # )

-    # llm = utils.get_llm_model(
-    #     provider="gemini",
-    #     model_name="gemini-2.0-flash-exp",
-    #     temperature=1.0,
-    #     api_key=os.getenv("GOOGLE_API_KEY", "")
-    # )
+    llm = utils.get_llm_model(
+        provider="gemini",
+        model_name="gemini-2.0-flash-exp",
+        temperature=1.0,
+        api_key=os.getenv("GOOGLE_API_KEY", "")
+    )

    # llm = utils.get_llm_model(
    #     provider="deepseek",
@@ -111,14 +112,16 @@ async def test_browser_use_custom():
    #     temperature=0.8
    # )

-    llm = utils.get_llm_model(
-        provider="ollama", model_name="qwen2.5:7b", temperature=0.8
-    )
+    # llm = utils.get_llm_model(
+    #     provider="ollama", model_name="qwen2.5:7b", temperature=0.8
+    # )

    controller = CustomController()
    use_own_browser = False
    disable_security = True
-    use_vision = False
+    use_vision = True  # Set to False when using DeepSeek
+    tool_call_in_content = True  # Set to True when using Ollama
+    max_actions_per_step = 1
    playwright = None
    browser_context_ = None
    try:
@@ -171,6 +174,8 @@ async def test_browser_use_custom():
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            use_vision=use_vision,
+            tool_call_in_content=tool_call_in_content,
+            max_actions_per_step=max_actions_per_step
        )
        history: AgentHistoryList = await agent.run(max_steps=10)
webui.py
@@ -29,22 +29,24 @@ from src.utils import utils


async def run_browser_agent(
-    agent_type,
-    llm_provider,
-    llm_model_name,
-    llm_temperature,
-    llm_base_url,
-    llm_api_key,
-    use_own_browser,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    add_infos,
-    max_steps,
-    use_vision,
+    agent_type,
+    llm_provider,
+    llm_model_name,
+    llm_temperature,
+    llm_base_url,
+    llm_api_key,
+    use_own_browser,
+    headless,
+    disable_security,
+    window_w,
+    window_h,
+    save_recording_path,
+    task,
+    add_infos,
+    max_steps,
+    use_vision,
+    max_actions_per_step,
+    tool_call_in_content
):
    # Ensure the recording directory exists
    os.makedirs(save_recording_path, exist_ok=True)
@@ -74,6 +76,8 @@ async def run_browser_agent(
            task=task,
            max_steps=max_steps,
            use_vision=use_vision,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
        )
    elif agent_type == "custom":
        final_result, errors, model_actions, model_thoughts = await run_custom_agent(
@@ -88,6 +92,8 @@ async def run_browser_agent(
            add_infos=add_infos,
            max_steps=max_steps,
            use_vision=use_vision,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
        )
    else:
        raise ValueError(f"Invalid agent type: {agent_type}")
@@ -107,15 +113,18 @@ async def run_browser_agent(


async def run_org_agent(
-    llm,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    max_steps,
-    use_vision,
+    llm,
+    headless,
+    disable_security,
+    window_w,
+    window_h,
+    save_recording_path,
+    task,
+    max_steps,
+    use_vision,
+    max_actions_per_step,
+    tool_call_in_content
+
):
    browser = Browser(
        config=BrowserConfig(
@@ -125,20 +134,22 @@ async def run_org_agent(
        )
    )
    async with await browser.new_context(
-        config=BrowserContextConfig(
-            trace_path="./tmp/traces",
-            save_recording_path=save_recording_path if save_recording_path else None,
-            no_viewport=False,
-            browser_window_size=BrowserContextWindowSize(
-                width=window_w, height=window_h
-            ),
-        )
+        config=BrowserContextConfig(
+            trace_path="./tmp/traces",
+            save_recording_path=save_recording_path if save_recording_path else None,
+            no_viewport=False,
+            browser_window_size=BrowserContextWindowSize(
+                width=window_w, height=window_h
+            ),
+        )
    ) as browser_context:
        agent = Agent(
            task=task,
            llm=llm,
            use_vision=use_vision,
            browser_context=browser_context,
+            max_actions_per_step=max_actions_per_step,
+            tool_call_in_content=tool_call_in_content
        )
        history = await agent.run(max_steps=max_steps)
@@ -151,17 +162,19 @@ async def run_org_agent(


async def run_custom_agent(
-    llm,
-    use_own_browser,
-    headless,
-    disable_security,
-    window_w,
-    window_h,
-    save_recording_path,
-    task,
-    add_infos,
-    max_steps,
-    use_vision,
+    llm,
+    use_own_browser,
+    headless,
+    disable_security,
+    window_w,
+    window_h,
+    save_recording_path,
+    task,
+    add_infos,
+    max_steps,
+    use_vision,
+    max_actions_per_step,
+    tool_call_in_content
):
    controller = CustomController()
    playwright = None
@@ -197,17 +210,17 @@ async def run_custom_agent(
            )
        )
        async with await browser.new_context(
-            config=BrowserContextConfig(
-                trace_path="./tmp/result_processing",
-                save_recording_path=save_recording_path
-                if save_recording_path
-                else None,
-                no_viewport=False,
-                browser_window_size=BrowserContextWindowSize(
-                    width=window_w, height=window_h
-                ),
-            ),
-            context=browser_context_,
+            config=BrowserContextConfig(
+                trace_path="./tmp/result_processing",
+                save_recording_path=save_recording_path
+                if save_recording_path
+                else None,
+                no_viewport=False,
+                browser_window_size=BrowserContextWindowSize(
+                    width=window_w, height=window_h
+                ),
+            ),
+            context=browser_context_,
        ) as browser_context:
            agent = CustomAgent(
                task=task,
@@ -217,6 +230,8 @@ async def run_custom_agent(
                browser_context=browser_context,
                controller=controller,
                system_prompt_class=CustomSystemPrompt,
+                max_actions_per_step=max_actions_per_step,
+                tool_call_in_content=tool_call_in_content
            )
            history = await agent.run(max_steps=max_steps)
@@ -290,7 +305,7 @@ def create_ui(theme_name="Ocean"):
    """

    with gr.Blocks(
-        title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js
+        title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js
    ) as demo:
        with gr.Row():
            gr.Markdown(
@@ -318,11 +333,24 @@ def create_ui(theme_name="Ocean"):
                        label="Max Run Steps",
                        info="Maximum number of steps the agent will take",
                    )
+                    max_actions_per_step = gr.Slider(
+                        minimum=1,
+                        maximum=20,
+                        value=10,
+                        step=1,
+                        label="Max Actions per Step",
+                        info="Maximum number of actions the agent will take per step",
+                    )
                    use_vision = gr.Checkbox(
                        label="Use Vision",
                        value=True,
                        info="Enable visual processing capabilities",
                    )
+                    tool_call_in_content = gr.Checkbox(
+                        label="Use Tool Calls in Content",
+                        value=True,
+                        info="Enable Tool Calls in content",
+                    )

            with gr.TabItem("🔧 LLM Configuration", id=2):
                with gr.Group():
@@ -461,6 +489,8 @@ def create_ui(theme_name="Ocean"):
                add_infos,
                max_steps,
                use_vision,
+                max_actions_per_step,
+                tool_call_in_content
            ],
            outputs=[
                final_result_output,