Merge pull request #304 from vvincent1234/main

update to browser-use==0.1.37
2025-02-16 13:22:05 +08:00
parent 1a3905e8b1 2538a75e98
commit 3fd3ab26d5
9 changed files with 315 additions and 357 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-browser-use==0.1.29
+browser-use==0.1.37
 pyperclip==1.9.0
 gradio==5.10.0
 json-repair
--- a/src/agent/custom_agent.py
+++ b/src/agent/custom_agent.py
@@ -22,15 +22,19 @@ from browser_use.browser.context import BrowserContext
 from browser_use.browser.views import BrowserStateHistory
 from browser_use.controller.service import Controller
 from browser_use.telemetry.views import (
-	AgentEndTelemetryEvent,
-	AgentRunTelemetryEvent,
-	AgentStepTelemetryEvent,
+    AgentEndTelemetryEvent,
+    AgentRunTelemetryEvent,
+    AgentStepTelemetryEvent,
 )
 from browser_use.utils import time_execution_async
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import (
    BaseMessage,
+    HumanMessage,
+    AIMessage
 )
+from browser_use.agent.prompts import PlannerPrompt
+
 from json_repair import repair_json
 from src.utils.agent_state import AgentState

@@ -50,34 +54,42 @@ class CustomAgent(Agent):
            browser_context: BrowserContext | None = None,
            controller: Controller = Controller(),
            use_vision: bool = True,
+            use_vision_for_planner: bool = False,
            save_conversation_path: Optional[str] = None,
-            max_failures: int = 5,
+            save_conversation_path_encoding: Optional[str] = 'utf-8',
+            max_failures: int = 3,
            retry_delay: int = 10,
            system_prompt_class: Type[SystemPrompt] = SystemPrompt,
            agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
            max_input_tokens: int = 128000,
            validate_output: bool = False,
+            message_context: Optional[str] = None,
+            generate_gif: bool | str = True,
+            sensitive_data: Optional[Dict[str, str]] = None,
+            available_file_paths: Optional[list[str]] = None,
            include_attributes: list[str] = [
-                "title",
-                "type",
-                "name",
-                "role",
-                "tabindex",
-                "aria-label",
-                "placeholder",
-                "value",
-                "alt",
-                "aria-expanded",
+                'title',
+                'type',
+                'name',
+                'role',
+                'tabindex',
+                'aria-label',
+                'placeholder',
+                'value',
+                'alt',
+                'aria-expanded',
            ],
            max_error_length: int = 400,
            max_actions_per_step: int = 10,
            tool_call_in_content: bool = True,
-            agent_state: AgentState = None,
            initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
            # Cloud Callbacks
            register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], None] | None = None,
            register_done_callback: Callable[['AgentHistoryList'], None] | None = None,
            tool_calling_method: Optional[str] = 'auto',
+            page_extraction_llm: Optional[BaseChatModel] = None,
+            planner_llm: Optional[BaseChatModel] = None,
+            planner_interval: int = 1,  # Run planner every N steps
    ):
        super().__init__(
            task=task,
@@ -86,12 +98,18 @@ class CustomAgent(Agent):
            browser_context=browser_context,
            controller=controller,
            use_vision=use_vision,
+            use_vision_for_planner=use_vision_for_planner,
            save_conversation_path=save_conversation_path,
+            save_conversation_path_encoding=save_conversation_path_encoding,
            max_failures=max_failures,
            retry_delay=retry_delay,
            system_prompt_class=system_prompt_class,
            max_input_tokens=max_input_tokens,
            validate_output=validate_output,
+            message_context=message_context,
+            generate_gif=generate_gif,
+            sensitive_data=sensitive_data,
+            available_file_paths=available_file_paths,
            include_attributes=include_attributes,
            max_error_length=max_error_length,
            max_actions_per_step=max_actions_per_step,
@@ -99,7 +117,9 @@ class CustomAgent(Agent):
            initial_actions=initial_actions,
            register_new_step_callback=register_new_step_callback,
            register_done_callback=register_done_callback,
-            tool_calling_method=tool_calling_method
+            tool_calling_method=tool_calling_method,
+            planner_llm=planner_llm,
+            planner_interval=planner_interval
        )
        if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name:
            # deepseek-reasoner does not support function calling
@@ -108,15 +128,14 @@ class CustomAgent(Agent):
            self.max_input_tokens = 64000
        else:
            self.use_deepseek_r1 = False
-        
+
        # record last actions
        self._last_actions = None
        # record extract content
        self.extracted_content = ""
        # custom new info
        self.add_infos = add_infos
-        # agent_state for Stop
-        self.agent_state = agent_state
+
        self.agent_prompt_class = agent_prompt_class
        self.message_manager = CustomMessageManager(
            llm=self.llm,
@@ -127,7 +146,9 @@ class CustomAgent(Agent):
            max_input_tokens=self.max_input_tokens,
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
-            max_actions_per_step=self.max_actions_per_step
+            max_actions_per_step=self.max_actions_per_step,
+            message_context=self.message_context,
+            sensitive_data=self.sensitive_data
        )

    def _setup_action_models(self) -> None:
@@ -183,19 +204,16 @@ class CustomAgent(Agent):
        if future_plans and "None" not in future_plans:
            step_info.future_plans = future_plans

+        logger.info(f"🧠 All Memory: \n{step_info.memory}")
+
    @time_execution_async("--get_next_action")
    async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
        """Get next action from LLM based on current state"""
-        messages_to_process = (
-            self.message_manager.merge_successive_human_messages(input_messages)
-            if self.use_deepseek_r1
-            else input_messages
-        )

-        ai_message = self.llm.invoke(messages_to_process)
+        ai_message = self.llm.invoke(input_messages)
        self.message_manager._add_message_with_tokens(ai_message)

-        if self.use_deepseek_r1:
+        if hasattr(ai_message, "reasoning_content"):
            logger.info("🤯 Start Deep Thinking: ")
            logger.info(ai_message.reasoning_content)
            logger.info("🤯 End Deep Thinking")
@@ -209,7 +227,7 @@ class CustomAgent(Agent):
        ai_content = repair_json(ai_content)
        parsed_json = json.loads(ai_content)
        parsed: AgentOutput = self.AgentOutput(**parsed_json)
-        
+
        if parsed is None:
            logger.debug(ai_message.content)
            raise ValueError('Could not parse response.')
@@ -218,9 +236,63 @@ class CustomAgent(Agent):
        parsed.action = parsed.action[: self.max_actions_per_step]
        self._log_response(parsed)
        self.n_steps += 1
-        
+
        return parsed

+    async def _run_planner(self) -> Optional[str]:
+        """Ask the planner LLM for a plan, append it to the latest state message, and return it."""
+        # Skip planning if no planner_llm is set
+        if not self.planner_llm:
+            return None
+
+        # Planner system prompt followed by the message history except its first message
+        planner_messages = [
+            PlannerPrompt(self.action_descriptions).get_system_message(),
+            *self.message_manager.get_messages()[1:],
+        ]
+
+        if not self.use_vision_for_planner and self.use_vision:
+            # The planner gets a text-only copy of the last state message (non-text parts dropped);
+            # the message object kept in the history is left untouched.
+            last_state_message = planner_messages[-1]
+            new_msg = ''
+            if isinstance(last_state_message.content, list):
+                for msg in last_state_message.content:
+                    if msg['type'] == 'text':
+                        new_msg += msg['text']
+            else:
+                new_msg = last_state_message.content
+
+            planner_messages[-1] = HumanMessage(content=new_msg)
+
+        # Get planner output
+        response = await self.planner_llm.ainvoke(planner_messages)
+        plan = response.content
+        # Append the plan to the message held by the message manager, not to planner_messages[-1]:
+        # the latter may be the text-only copy built above, whose edits never reach the agent LLM.
+        last_state_message = self.message_manager.get_messages()[-1]
+        if isinstance(last_state_message.content, list):
+            for msg in last_state_message.content:
+                if msg['type'] == 'text':
+                    msg['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
+        else:
+            last_state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "
+
+        try:
+            plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
+            logger.info(f'📋 Plans:\n{json.dumps(plan_json, indent=4)}')
+
+            if hasattr(response, "reasoning_content"):
+                logger.info("🤯 Start Planning Deep Thinking: ")
+                logger.info(response.reasoning_content)
+                logger.info("🤯 End Planning Deep Thinking")
+        except json.JSONDecodeError:
+            logger.info(f'📋 Plans:\n{plan}')
+        except Exception as e:
+            logger.debug(f'Error parsing planning analysis: {e}')
+            logger.info(f'📋 Plans: {plan}')
+        return plan
+
    @time_execution_async("--step")
    async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
        """Execute one step of the task"""
@@ -228,21 +300,30 @@ class CustomAgent(Agent):
        state = None
        model_output = None
        result: list[ActionResult] = []
+        actions: list[ActionModel] = []

        try:
-            state = await self.browser_context.get_state(use_vision=self.use_vision)
-            self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info)
+            state = await self.browser_context.get_state()
+            self._check_if_stopped_or_paused()
+
+            self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info,
+                                                   self.use_vision)
+
+            # Run planner at specified intervals if planner is configured
+            if self.planner_llm and self.n_steps % self.planning_interval == 0:
+                await self._run_planner()
            input_messages = self.message_manager.get_messages()
+            self._check_if_stopped_or_paused()
            try:
                model_output = await self.get_next_action(input_messages)
                if self.register_new_step_callback:
                    self.register_new_step_callback(state, model_output, self.n_steps)
                self.update_step_info(model_output, step_info)
-                logger.info(f"🧠 All Memory: \n{step_info.memory}")
                self._save_conversation(input_messages, model_output)
                if self.model_name != "deepseek-reasoner":
                    # remove prev message
                    self.message_manager._remove_state_message_by_index(-1)
+                self._check_if_stopped_or_paused()
            except Exception as e:
                # model call failed, remove last state message from history
                self.message_manager._remove_state_message_by_index(-1)
@@ -250,21 +331,23 @@ class CustomAgent(Agent):

            actions: list[ActionModel] = model_output.action
            result: list[ActionResult] = await self.controller.multi_act(
-                actions, self.browser_context
+                actions,
+                self.browser_context,
+                page_extraction_llm=self.page_extraction_llm,
+                sensitive_data=self.sensitive_data,
+                check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
+                available_file_paths=self.available_file_paths,
            )
            if len(result) != len(actions):
                # I think something changes, such information should let LLM know
                for ri in range(len(result), len(actions)):
                    result.append(ActionResult(extracted_content=None,
-                                                include_in_memory=True,
-                                                error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
+                                               include_in_memory=True,
+                                               error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
                                                    Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
-                                                is_done=False))
-            if len(actions) == 0:
-                # TODO: fix no action case
-                result = [ActionResult(is_done=True, extracted_content=step_info.memory, include_in_memory=True)]
+                                               is_done=False))
            for ret_ in result:
-                if "Extracted page" in ret_.extracted_content:
+                if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
                    # record every extracted page
                    self.extracted_content += ret_.extracted_content
            self._last_result = result
@@ -305,7 +388,14 @@ class CustomAgent(Agent):

            # Execute initial actions if provided
            if self.initial_actions:
-                result = await self.controller.multi_act(self.initial_actions, self.browser_context, check_for_new_elements=False)
+                result = await self.controller.multi_act(
+                    self.initial_actions,
+                    self.browser_context,
+                    check_for_new_elements=False,
+                    page_extraction_llm=self.page_extraction_llm,
+                    check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
+                    available_file_paths=self.available_file_paths,
+                )
                self._last_result = result

            step_info = CustomAgentStepInfo(
@@ -319,17 +409,6 @@ class CustomAgent(Agent):
            )

            for step in range(max_steps):
-                # 1) Check if stop requested
-                if self.agent_state and self.agent_state.is_stop_requested():
-                    logger.info("🛑 Stop requested by user")
-                    self._create_stop_history_item()
-                    break
-
-                # 2) Store last valid state before step
-                if self.browser_context and self.agent_state:
-                    state = await self.browser_context.get_state(use_vision=self.use_vision)
-                    self.agent_state.set_last_valid_state(state)
-
                if self._too_many_failures():
                    break

@@ -378,76 +457,18 @@ class CustomAgent(Agent):

                self.create_history_gif(output_path=output_path)

-    def _create_stop_history_item(self):
-        """Create a history item for when the agent is stopped."""
-        try:
-            # Attempt to retrieve the last valid state from agent_state
-            state = None
-            if self.agent_state:
-                last_state = self.agent_state.get_last_valid_state()
-                if last_state:
-                    # Convert to BrowserStateHistory
-                    state = BrowserStateHistory(
-                        url=getattr(last_state, 'url', ""),
-                        title=getattr(last_state, 'title', ""),
-                        tabs=getattr(last_state, 'tabs', []),
-                        interacted_element=[None],
-                        screenshot=getattr(last_state, 'screenshot', None)
-                    )
-                else:
-                    state = self._create_empty_state()
-            else:
-                state = self._create_empty_state()
-
-            # Create a final item in the agent history indicating done
-            stop_history = AgentHistory(
-                model_output=None,
-                state=state,
-                result=[ActionResult(extracted_content=None, error=None, is_done=True)]
-            )
-            self.history.history.append(stop_history)
-
-        except Exception as e:
-            logger.error(f"Error creating stop history item: {e}")
-            # Create empty state as fallback
-            state = self._create_empty_state()
-            stop_history = AgentHistory(
-                model_output=None,
-                state=state,
-                result=[ActionResult(extracted_content=None, error=None, is_done=True)]
-            )
-            self.history.history.append(stop_history)
-
-    def _convert_to_browser_state_history(self, browser_state):
-        return BrowserStateHistory(
-            url=getattr(browser_state, 'url', ""),
-            title=getattr(browser_state, 'title', ""),
-            tabs=getattr(browser_state, 'tabs', []),
-            interacted_element=[None],
-            screenshot=getattr(browser_state, 'screenshot', None)
-        )
-
-    def _create_empty_state(self):
-        return BrowserStateHistory(
-            url="",
-            title="",
-            tabs=[],
-            interacted_element=[None],
-            screenshot=None
-        )
-
    def create_history_gif(
-        self,
-        output_path: str = 'agent_history.gif',
-        duration: int = 3000,
-        show_goals: bool = True,
-        show_task: bool = True,
-        show_logo: bool = False,
-        font_size: int = 40,
-        title_font_size: int = 56,
-        goal_font_size: int = 44,
-        margin: int = 40,
-        line_spacing: float = 1.5,
+            self,
+            output_path: str = 'agent_history.gif',
+            duration: int = 3000,
+            show_goals: bool = True,
+            show_task: bool = True,
+            show_logo: bool = False,
+            font_size: int = 40,
+            title_font_size: int = 56,
+            goal_font_size: int = 44,
+            margin: int = 40,
+            line_spacing: float = 1.5,
    ) -> None:
        """Create a GIF from the agent's history with overlaid task and goal text."""
        if not self.history.history:
@@ -547,4 +568,4 @@ class CustomAgent(Agent):
            )
            logger.info(f'Created GIF at {output_path}')
        else:
-            logger.warning('No images found in history to create GIF')
+            logger.warning('No images found in history to create GIF')
--- a/src/agent/custom_message_manager.py
+++ b/src/agent/custom_message_manager.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import logging
-from typing import List, Optional, Type
+from typing import List, Optional, Type, Dict

 from browser_use.agent.message_manager.service import MessageManager
 from browser_use.agent.message_manager.views import MessageHistory
@@ -38,7 +38,8 @@ class CustomMessageManager(MessageManager):
            include_attributes: list[str] = [],
            max_error_length: int = 400,
            max_actions_per_step: int = 10,
-            message_context: Optional[str] = None
+            message_context: Optional[str] = None,
+            sensitive_data: Optional[Dict[str, str]] = None,
    ):
        super().__init__(
            llm=llm,
@@ -51,7 +52,8 @@ class CustomMessageManager(MessageManager):
            include_attributes=include_attributes,
            max_error_length=max_error_length,
            max_actions_per_step=max_actions_per_step,
-            message_context=message_context
+            message_context=message_context,
+            sensitive_data=sensitive_data
        )
        self.agent_prompt_class = agent_prompt_class
        # Custom: Move Task info to state_message
@@ -68,7 +70,7 @@ class CustomMessageManager(MessageManager):
        min_message_len = 2 if self.message_context is not None else 1
        
        while diff > 0 and len(self.history.messages) > min_message_len:
-            self.history.remove_message(min_message_len) # alway remove the oldest message
+            self.history.remove_message(min_message_len)  # always remove the oldest message
            diff = self.history.total_tokens - self.max_input_tokens
        
    def add_state_message(
@@ -77,6 +79,7 @@ class CustomMessageManager(MessageManager):
            actions: Optional[List[ActionModel]] = None,
            result: Optional[List[ActionResult]] = None,
            step_info: Optional[AgentStepInfo] = None,
+            use_vision=True,
    ) -> None:
        """Add browser state as human message"""
        # otherwise add state message and result to next message (which will not stay in memory)
@@ -87,7 +90,7 @@ class CustomMessageManager(MessageManager):
            include_attributes=self.include_attributes,
            max_error_length=self.max_error_length,
            step_info=step_info,
-        ).get_user_message()
+        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)
    
    def _count_text_tokens(self, text: str) -> int:
@@ -114,4 +117,4 @@ class CustomMessageManager(MessageManager):
            if remove_cnt == abs(remove_ind):
                self.history.remove_message(i)
                break
-            i -= 1
+            i -= 1
--- a/src/agent/custom_prompts.py
+++ b/src/agent/custom_prompts.py
@@ -16,122 +16,104 @@ class CustomSystemPrompt(SystemPrompt):
        Returns the important rules for the agent.
        """
        text = r"""
-    1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
-       {
-         "current_state": {
-           "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
-           "important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
-           "task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the content at current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button. Please return string type not a list.",
-           "future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type not a list.",
-           "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
-           "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
-         },
-         "action": [
-           * actions in sequences, please refer to **Common action sequences**. Each output action MUST be formated as: \{action_name\: action_params\}* 
-         ]
-       }
+1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
+   {
+     "current_state": {
+       "prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
+       "important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
+       "task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the content at current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button. Please return string type not a list.",
+       "future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type not a list.",
+       "thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
+       "summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
+     },
+     "action": [
+       * actions in sequences, please refer to **Common action sequences**. Each output action MUST be formated as: \{action_name\: action_params\}* 
+     ]
+   }

-    2. ACTIONS: You can specify multiple actions to be executed in sequence. 
+2. ACTIONS: You can specify multiple actions to be executed in sequence. 

-       Common action sequences:
-       - Form filling: [
-           {"input_text": {"index": 1, "text": "username"}},
-           {"input_text": {"index": 2, "text": "password"}},
-           {"click_element": {"index": 3}}
-         ]
-       - Navigation and extraction: [
-           {"go_to_url": {"url": "https://example.com"}},
-           {"extract_page_content": {}}
-         ]
+   Common action sequences:
+   - Form filling: [
+       {"input_text": {"index": 1, "text": "username"}},
+       {"input_text": {"index": 2, "text": "password"}},
+       {"click_element": {"index": 3}}
+     ]
+   - Navigation and extraction: [
+       {"go_to_url": {"url": "https://example.com"}},
+       {"extract_page_content": {}}
+     ]


-    3. ELEMENT INTERACTION:
-       - Only use indexes that exist in the provided element list
-       - Each element has a unique index number (e.g., "33[:]<button>")
-       - Elements marked with "_[:]" are non-interactive (for context only)
+3. ELEMENT INTERACTION:
+   - Only use indexes that exist in the provided element list
+   - Each element has a unique index number (e.g., "33[:]<button>")
+   - Elements marked with "_[:]" are non-interactive (for context only)

-    4. NAVIGATION & ERROR HANDLING:
-       - If no suitable elements exist, use other functions to complete the task
-       - If stuck, try alternative approaches
-       - Handle popups/cookies by accepting or closing them
-       - Use scroll to find elements you are looking for
+4. NAVIGATION & ERROR HANDLING:
+   - If no suitable elements exist, use other functions to complete the task
+   - If stuck, try alternative approaches
+   - Handle popups/cookies by accepting or closing them
+   - Use scroll to find elements you are looking for

-    5. TASK COMPLETION:
-       - If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
-       - Don't hallucinate actions.
-       - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
-       - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
-       - Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether the action is executed successfully. Pay particular attention when errors occur during action execution.
+5. TASK COMPLETION:
+   - If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
+   - Don't hallucinate actions.
+   - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
+   - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
+   - Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether the action is executed successfully. Pay particular attention when errors occur during action execution.

-    6. VISUAL CONTEXT:
-       - When an image is provided, use it to understand the page layout
-       - Bounding boxes with labels correspond to element indexes
-       - Each bounding box and its label have the same color
-       - Most often the label is inside the bounding box, on the top right
-       - Visual context helps verify element locations and relationships
-       - sometimes labels overlap, so use the context to verify the correct element
+6. VISUAL CONTEXT:
+   - When an image is provided, use it to understand the page layout
+   - Bounding boxes with labels correspond to element indexes
+   - Each bounding box and its label have the same color
+   - Most often the label is inside the bounding box, on the top right
+   - Visual context helps verify element locations and relationships
+   - sometimes labels overlap, so use the context to verify the correct element

-    7. Form filling:
-       - If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
+7. Form filling:
+   - If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.

-    8. ACTION SEQUENCING:
-       - Actions are executed in the order they appear in the list 
-       - Each action should logically follow from the previous one
-       - If the page changes after an action, the sequence is interrupted and you get the new state.
-       - If content only disappears the sequence continues.
-       - Only provide the action sequence until you think the page will change.
-       - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
-       - only use multiple actions if it makes sense. 
-    """
+8. ACTION SEQUENCING:
+   - Actions are executed in the order they appear in the list 
+   - Each action should logically follow from the previous one
+   - If the page changes after an action, the sequence is interrupted and you get the new state.
+   - If content only disappears the sequence continues.
+   - Only provide the action sequence until you think the page will change.
+   - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
+   - only use multiple actions if it makes sense. 
+
+9. Extraction:
+    - If your task is to find information or do research - call extract_content on the specific pages to get and store the information.
+
+"""
        text += f"   - use maximum {self.max_actions_per_step} actions per sequence"
        return text

    def input_format(self) -> str:
        return """
-    INPUT STRUCTURE:
-    1. Task: The user\'s instructions you need to complete.
-    2. Hints(Optional): Some hints to help you complete the user\'s instructions.
-    3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
-    4. Current URL: The webpage you're currently on
-    5. Available Tabs: List of open browser tabs
-    6. Interactive Elements: List in the format:
-       index[:]<element_type>element_text</element_type>
-       - index: Numeric identifier for interaction
-       - element_type: HTML element type (button, input, etc.)
-       - element_text: Visible text or element description
+INPUT STRUCTURE:
+1. Task: The user\'s instructions you need to complete.
+2. Hints(Optional): Some hints to help you complete the user\'s instructions.
+3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
+4. Current URL: The webpage you're currently on
+5. Available Tabs: List of open browser tabs
+6. Interactive Elements: List in the format:
+   [index]<element_type>element_text</element_type>
+   - index: Numeric identifier for interaction
+   - element_type: HTML element type (button, input, etc.)
+   - element_text: Visible text or element description

-    Example:
-    33[:]<button>Submit Form</button>
-    _[:] Non-interactive text
+Example:
+[33]<button>Submit Form</button>
+[] Non-interactive text


-    Notes:
-    - Only elements with numeric indexes are interactive
-    - _[:] elements provide context but cannot be interacted with
+Notes:
+- Only elements with numeric indexes inside [] are interactive
+- [] elements provide context but cannot be interacted with
    """

-    def get_system_message(self) -> SystemMessage:
-        """
-        Get the system prompt for the agent.
-
-        Returns:
-            str: Formatted system prompt
-        """
-        AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
-    1. Analyze the provided webpage elements and structure
-    2. Plan a sequence of actions to accomplish the given task
-    3. Your final result MUST be a valid JSON as the **RESPONSE FORMAT** described, containing your action sequence and state assessment, No need extra content to expalin. 
-
-    {self.input_format()}
-
-    {self.important_rules()}
-
-    Functions:
-    {self.default_action_description}
-
-    Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid."""
-        return SystemMessage(content=AGENT_PROMPT)
-

 class CustomAgentMessagePrompt(AgentMessagePrompt):
    def __init__(
@@ -143,20 +125,20 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
            max_error_length: int = 400,
            step_info: Optional[CustomAgentStepInfo] = None,
    ):
-        super(CustomAgentMessagePrompt, self).__init__(state=state, 
-                                                       result=result, 
-                                                       include_attributes=include_attributes, 
-                                                       max_error_length=max_error_length, 
+        super(CustomAgentMessagePrompt, self).__init__(state=state,
+                                                       result=result,
+                                                       include_attributes=include_attributes,
+                                                       max_error_length=max_error_length,
                                                       step_info=step_info
                                                       )
        self.actions = actions

-    def get_user_message(self) -> HumanMessage:
+    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        if self.step_info:
            step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
        else:
            step_info_description = ''
-            
+
        time_str = datetime.now().strftime("%Y-%m-%d %H:%M")
        step_info_description += f"Current date and time: {time_str}"

@@ -180,7 +162,7 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'
-   
+
        state_description = f"""
 {step_info_description}
 1. Task: {self.step_info.task}. 
@@ -211,18 +193,16 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
                            f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
                        )

-        if self.state.screenshot:
+        if self.state.screenshot and use_vision == True:
            # Format message for vision model
            return HumanMessage(
                content=[
-                    {"type": "text", "text": state_description},
+                    {'type': 'text', 'text': state_description},
                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/png;base64,{self.state.screenshot}"
-                        },
+                        'type': 'image_url',
+                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
                    },
                ]
            )

-        return HumanMessage(content=state_description)
+        return HumanMessage(content=state_description)
--- a/src/browser/custom_browser.py
+++ b/src/browser/custom_browser.py
@@ -25,57 +25,3 @@ class CustomBrowser(Browser):
        config: BrowserContextConfig = BrowserContextConfig()
    ) -> CustomBrowserContext:
        return CustomBrowserContext(config=config, browser=self)
-    
-    async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
-        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
-        if not self.config.chrome_instance_path:
-            raise ValueError('Chrome instance path is required')
-        import subprocess
-
-        import requests
-
-        try:
-            # Check if browser is already running
-            response = requests.get('http://localhost:9222/json/version', timeout=2)
-            if response.status_code == 200:
-                logger.info('Reusing existing Chrome instance')
-                browser = await playwright.chromium.connect_over_cdp(
-                    endpoint_url='http://localhost:9222',
-                    timeout=20000,  # 20 second timeout for connection
-                )
-                return browser
-        except requests.ConnectionError:
-            logger.debug('No existing Chrome instance found, starting a new one')
-
-        # Start a new Chrome instance
-        subprocess.Popen(
-            [
-                self.config.chrome_instance_path,
-                '--remote-debugging-port=9222',
-            ] + self.config.extra_chromium_args,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-  
-        # try to connect first in case the browser have not started
-        for _ in range(10):
-            try:
-                response = requests.get('http://localhost:9222/json/version', timeout=2)
-                if response.status_code == 200:
-                    break
-            except requests.ConnectionError:
-                pass
-            await asyncio.sleep(1)
-
-        # Attempt to connect again after starting a new instance
-        try:
-            browser = await playwright.chromium.connect_over_cdp(
-                endpoint_url='http://localhost:9222',
-                timeout=20000,  # 20 second timeout for connection
-            )
-            return browser
-        except Exception as e:
-            logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
-            raise RuntimeError(
-                ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
-            )
--- a/src/controller/custom_controller.py
+++ b/src/controller/custom_controller.py
@@ -39,7 +39,7 @@ class CustomController(Controller):
            pyperclip.copy(text)
            return ActionResult(extracted_content=text)

-        @self.registry.action("Paste text from clipboard", requires_browser=True)
+        @self.registry.action("Paste text from clipboard")
        async def paste_from_clipboard(browser: BrowserContext):
            text = pyperclip.paste()
            # send text to browser
@@ -47,25 +47,3 @@ class CustomController(Controller):
            await page.keyboard.type(text)

            return ActionResult(extracted_content=text)
-
-        @self.registry.action(
-            'Extract page content to get the pure text or markdown with links if include_links is set to true',
-            param_model=ExtractPageContentAction,
-            requires_browser=True,
-        )
-        async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
-            page = await browser.get_current_page()
-            # use jina reader
-            url = page.url
-            jina_url = f"https://r.jina.ai/{url}"
-            await page.goto(jina_url)
-            output_format = 'markdown' if params.include_links else 'text'
-            content = MainContentExtractor.extract(  # type: ignore
-                html=await page.content(),
-                output_format=output_format,
-            )
-            # go back to org url
-            await page.go_back()
-            msg = f'Extracted page content:\n {content}\n'
-            logger.info(msg)
-            return ActionResult(extracted_content=msg)
--- a/src/utils/deep_research.py
+++ b/src/utils/deep_research.py
@@ -15,12 +15,16 @@ import json
 import re
 from browser_use.agent.service import Agent
 from browser_use.browser.browser import BrowserConfig, Browser
+from browser_use.agent.views import ActionResult
+from browser_use.browser.context import BrowserContext
+from browser_use.controller.service import Controller, DoneAction
+from main_content_extractor import MainContentExtractor
 from langchain.schema import SystemMessage, HumanMessage
 from json_repair import repair_json
 from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
 from src.controller.custom_controller import CustomController
 from src.browser.custom_browser import CustomBrowser
-from src.browser.custom_context import BrowserContextConfig
+from src.browser.custom_context import BrowserContextConfig, BrowserContext
 from browser_use.browser.context import (
    BrowserContextConfig,
    BrowserContextWindowSize,
@@ -65,6 +69,27 @@ async def deep_research(task, llm, agent_state=None, **kwargs):

    controller = CustomController()

+    @controller.registry.action(
+        'Extract page content to get the pure markdown.',
+    )
+    async def extract_content(browser: BrowserContext):
+        page = await browser.get_current_page()
+        # use jina reader
+        url = page.url
+
+        jina_url = f"https://r.jina.ai/{url}"
+        await page.goto(jina_url)
+        output_format = 'markdown'
+        content = MainContentExtractor.extract(  # type: ignore
+            html=await page.content(),
+            output_format=output_format,
+        )
+        # go back to the original url
+        await page.go_back()
+        msg = f'Extracted page content:\n {content}\n'
+        logger.info(msg)
+        return ActionResult(extracted_content=msg)
+
    search_system_prompt = f"""
    You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.

@@ -200,8 +225,7 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
                    system_prompt_class=CustomSystemPrompt,
                    agent_prompt_class=CustomAgentMessagePrompt,
                    max_actions_per_step=5,
-                    controller=controller,
-                    agent_state=agent_state
+                    controller=controller
                )
                agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10))
                query_results = [agent_result]
@@ -224,7 +248,6 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
                    agent_prompt_class=CustomAgentMessagePrompt,
                    max_actions_per_step=5,
                    controller=controller,
-                    agent_state=agent_state
                ) for task in query_tasks]
                query_results = await asyncio.gather(
                    *[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents])
@@ -265,6 +288,9 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
                    record_content = repair_json(record_content)
                    new_record_infos = json.loads(record_content)
                    history_infos.extend(new_record_infos)
+            if agent_state and agent_state.is_stop_requested():
+                # Stop was requested: leave the loop early
+                break

        logger.info("\nFinish Searching, Start Generating Report...")

--- a/tests/test_browser_use.py
+++ b/tests/test_browser_use.py
@@ -128,7 +128,7 @@ async def test_browser_use_custom():

    # llm = utils.get_llm_model(
    #     provider="google",
-    #     model_name="gemini-2.0-flash-exp",
+    #     model_name="gemini-2.0-flash",
    #     temperature=1.0,
    #     api_key=os.getenv("GOOGLE_API_KEY", "")
    # )
@@ -193,7 +193,7 @@ async def test_browser_use_custom():
            )
        )
        agent = CustomAgent(
-            task="Search 'Nvidia' and give me the first url",
+            task="Give me stock price of Tesla",
            add_infos="",  # some hints for llm to complete the task
            llm=llm,
            browser=browser,
--- a/webui.py
+++ b/webui.py
@@ -39,17 +39,18 @@ from src.utils.utils import update_model_dropdown, get_latest_files, capture_scr
 # Global variables for persistence
 _global_browser = None
 _global_browser_context = None
+_global_agent = None

 # Create the global agent state instance
 _global_agent_state = AgentState()

 async def stop_agent():
    """Request the agent to stop and update UI with enhanced feedback"""
-    global _global_agent_state, _global_browser_context, _global_browser
+    global _global_agent_state, _global_browser_context, _global_browser, _global_agent

    try:
        # Request stop
-        _global_agent_state.request_stop()
+        _global_agent.stop()

        # Update UI immediately
        message = "Stop requested - the agent will halt at the next safe point"
@@ -247,7 +248,7 @@ async def run_org_agent(
        tool_calling_method
 ):
    try:
-        global _global_browser, _global_browser_context, _global_agent_state
+        global _global_browser, _global_browser_context, _global_agent_state, _global_agent
        
        # Clear any previous stop request
        _global_agent_state.clear_stop()
@@ -284,20 +285,21 @@ async def run_org_agent(
                    ),
                )
            )
-            
-        agent = Agent(
-            task=task,
-            llm=llm,
-            use_vision=use_vision,
-            browser=_global_browser,
-            browser_context=_global_browser_context,
-            max_actions_per_step=max_actions_per_step,
-            tool_calling_method=tool_calling_method
-        )
-        history = await agent.run(max_steps=max_steps)

-        history_file = os.path.join(save_agent_history_path, f"{agent.agent_id}.json")
-        agent.save_history(history_file)
+        if _global_agent is None:
+            _global_agent = Agent(
+                task=task,
+                llm=llm,
+                use_vision=use_vision,
+                browser=_global_browser,
+                browser_context=_global_browser_context,
+                max_actions_per_step=max_actions_per_step,
+                tool_calling_method=tool_calling_method
+            )
+        history = await _global_agent.run(max_steps=max_steps)
+
+        history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
+        _global_agent.save_history(history_file)

        final_result = history.final_result()
        errors = history.errors()
@@ -313,6 +315,7 @@ async def run_org_agent(
        errors = str(e) + "\n" + traceback.format_exc()
        return '', errors, '', '', None, None
    finally:
+        _global_agent = None
        # Handle cleanup based on persistence configuration
        if not keep_browser_open:
            if _global_browser_context:
@@ -342,7 +345,7 @@ async def run_custom_agent(
        tool_calling_method
 ):
    try:
-        global _global_browser, _global_browser_context, _global_agent_state
+        global _global_browser, _global_browser_context, _global_agent_state, _global_agent

        # Clear any previous stop request
        _global_agent_state.clear_stop()
@@ -384,24 +387,24 @@ async def run_custom_agent(
            )
            
        # Create and run agent
-        agent = CustomAgent(
-            task=task,
-            add_infos=add_infos,
-            use_vision=use_vision,
-            llm=llm,
-            browser=_global_browser,
-            browser_context=_global_browser_context,
-            controller=controller,
-            system_prompt_class=CustomSystemPrompt,
-            agent_prompt_class=CustomAgentMessagePrompt,
-            max_actions_per_step=max_actions_per_step,
-            agent_state=_global_agent_state,
-            tool_calling_method=tool_calling_method
-        )
-        history = await agent.run(max_steps=max_steps)
+        if _global_agent is None:
+            _global_agent = CustomAgent(
+                task=task,
+                add_infos=add_infos,
+                use_vision=use_vision,
+                llm=llm,
+                browser=_global_browser,
+                browser_context=_global_browser_context,
+                controller=controller,
+                system_prompt_class=CustomSystemPrompt,
+                agent_prompt_class=CustomAgentMessagePrompt,
+                max_actions_per_step=max_actions_per_step,
+                tool_calling_method=tool_calling_method
+            )
+        history = await _global_agent.run(max_steps=max_steps)

-        history_file = os.path.join(save_agent_history_path, f"{agent.agent_id}.json")
-        agent.save_history(history_file)
+        history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
+        _global_agent.save_history(history_file)

        final_result = history.final_result()
        errors = history.errors()
@@ -417,6 +420,7 @@ async def run_custom_agent(
        errors = str(e) + "\n" + traceback.format_exc()
        return '', errors, '', '', None, None
    finally:
+        _global_agent = None
        # Handle cleanup based on persistence configuration
        if not keep_browser_open:
            if _global_browser_context: