Merge pull request #304 from vvincent1234/main

update to browser-use==0.1.37
This commit is contained in:
warmshao
2025-02-16 13:22:05 +08:00
committed by GitHub
9 changed files with 315 additions and 357 deletions

View File

@@ -1,4 +1,4 @@
browser-use==0.1.29
browser-use==0.1.37
pyperclip==1.9.0
gradio==5.10.0
json-repair

View File

@@ -22,15 +22,19 @@ from browser_use.browser.context import BrowserContext
from browser_use.browser.views import BrowserStateHistory
from browser_use.controller.service import Controller
from browser_use.telemetry.views import (
AgentEndTelemetryEvent,
AgentRunTelemetryEvent,
AgentStepTelemetryEvent,
AgentEndTelemetryEvent,
AgentRunTelemetryEvent,
AgentStepTelemetryEvent,
)
from browser_use.utils import time_execution_async
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import (
BaseMessage,
HumanMessage,
AIMessage
)
from browser_use.agent.prompts import PlannerPrompt
from json_repair import repair_json
from src.utils.agent_state import AgentState
@@ -50,34 +54,42 @@ class CustomAgent(Agent):
browser_context: BrowserContext | None = None,
controller: Controller = Controller(),
use_vision: bool = True,
use_vision_for_planner: bool = False,
save_conversation_path: Optional[str] = None,
max_failures: int = 5,
save_conversation_path_encoding: Optional[str] = 'utf-8',
max_failures: int = 3,
retry_delay: int = 10,
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
max_input_tokens: int = 128000,
validate_output: bool = False,
message_context: Optional[str] = None,
generate_gif: bool | str = True,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
include_attributes: list[str] = [
"title",
"type",
"name",
"role",
"tabindex",
"aria-label",
"placeholder",
"value",
"alt",
"aria-expanded",
'title',
'type',
'name',
'role',
'tabindex',
'aria-label',
'placeholder',
'value',
'alt',
'aria-expanded',
],
max_error_length: int = 400,
max_actions_per_step: int = 10,
tool_call_in_content: bool = True,
agent_state: AgentState = None,
initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
# Cloud Callbacks
register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], None] | None = None,
register_done_callback: Callable[['AgentHistoryList'], None] | None = None,
tool_calling_method: Optional[str] = 'auto',
page_extraction_llm: Optional[BaseChatModel] = None,
planner_llm: Optional[BaseChatModel] = None,
planner_interval: int = 1, # Run planner every N steps
):
super().__init__(
task=task,
@@ -86,12 +98,18 @@ class CustomAgent(Agent):
browser_context=browser_context,
controller=controller,
use_vision=use_vision,
use_vision_for_planner=use_vision_for_planner,
save_conversation_path=save_conversation_path,
save_conversation_path_encoding=save_conversation_path_encoding,
max_failures=max_failures,
retry_delay=retry_delay,
system_prompt_class=system_prompt_class,
max_input_tokens=max_input_tokens,
validate_output=validate_output,
message_context=message_context,
generate_gif=generate_gif,
sensitive_data=sensitive_data,
available_file_paths=available_file_paths,
include_attributes=include_attributes,
max_error_length=max_error_length,
max_actions_per_step=max_actions_per_step,
@@ -99,7 +117,9 @@ class CustomAgent(Agent):
initial_actions=initial_actions,
register_new_step_callback=register_new_step_callback,
register_done_callback=register_done_callback,
tool_calling_method=tool_calling_method
tool_calling_method=tool_calling_method,
planner_llm=planner_llm,
planner_interval=planner_interval
)
if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name:
# deepseek-reasoner does not support function calling
@@ -108,15 +128,14 @@ class CustomAgent(Agent):
self.max_input_tokens = 64000
else:
self.use_deepseek_r1 = False
# record last actions
self._last_actions = None
# record extract content
self.extracted_content = ""
# custom new info
self.add_infos = add_infos
# agent_state for Stop
self.agent_state = agent_state
self.agent_prompt_class = agent_prompt_class
self.message_manager = CustomMessageManager(
llm=self.llm,
@@ -127,7 +146,9 @@ class CustomAgent(Agent):
max_input_tokens=self.max_input_tokens,
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
max_actions_per_step=self.max_actions_per_step
max_actions_per_step=self.max_actions_per_step,
message_context=self.message_context,
sensitive_data=self.sensitive_data
)
def _setup_action_models(self) -> None:
@@ -183,19 +204,16 @@ class CustomAgent(Agent):
if future_plans and "None" not in future_plans:
step_info.future_plans = future_plans
logger.info(f"🧠 All Memory: \n{step_info.memory}")
@time_execution_async("--get_next_action")
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
messages_to_process = (
self.message_manager.merge_successive_human_messages(input_messages)
if self.use_deepseek_r1
else input_messages
)
ai_message = self.llm.invoke(messages_to_process)
ai_message = self.llm.invoke(input_messages)
self.message_manager._add_message_with_tokens(ai_message)
if self.use_deepseek_r1:
if hasattr(ai_message, "reasoning_content"):
logger.info("🤯 Start Deep Thinking: ")
logger.info(ai_message.reasoning_content)
logger.info("🤯 End Deep Thinking")
@@ -209,7 +227,7 @@ class CustomAgent(Agent):
ai_content = repair_json(ai_content)
parsed_json = json.loads(ai_content)
parsed: AgentOutput = self.AgentOutput(**parsed_json)
if parsed is None:
logger.debug(ai_message.content)
raise ValueError('Could not parse response.')
@@ -218,9 +236,63 @@ class CustomAgent(Agent):
parsed.action = parsed.action[: self.max_actions_per_step]
self._log_response(parsed)
self.n_steps += 1
return parsed
async def _run_planner(self) -> Optional[str]:
    """Ask the planner LLM for a plan and attach it to the latest state message.

    The planner receives the planner system prompt followed by the message
    manager's messages without the first one. When the agent uses vision but
    the planner does not, the planner is sent a text-only copy of the last
    message. The plan is appended to the last message obtained from the
    message manager (never to the text-only copy) and is logged.

    Returns:
        The planner response content, or None when no planner LLM is set.
    """
    # Skip planning if no planner_llm is set
    if not self.planner_llm:
        return None

    # Planner history: planner system prompt + every message except the first.
    planner_messages = [
        PlannerPrompt(self.action_descriptions).get_system_message(),
        *self.message_manager.get_messages()[1:],
    ]

    # Keep a handle on the message that came from the message manager. The
    # plan must be appended to this object; appending it to the text-only
    # copy built below would discard the plan as soon as this method returns.
    # NOTE(review): this assumes get_messages() hands back the stored message
    # objects rather than copies - confirm against the message manager.
    state_message = planner_messages[-1]

    if not self.use_vision_for_planner and self.use_vision:
        # Send the planner a copy of the last message without its image parts.
        if isinstance(state_message.content, list):
            text_only = ''.join(
                part['text'] for part in state_message.content if part['type'] == 'text'
            )
        else:
            text_only = state_message.content
        planner_messages[-1] = HumanMessage(content=text_only)

    # Get planner output
    response = await self.planner_llm.ainvoke(planner_messages)
    plan = response.content

    # Append the plan to the text of the original state message.
    if isinstance(state_message.content, list):
        for part in state_message.content:
            if part['type'] == 'text':
                part['text'] += f"\nPlanning Agent outputs plans:\n {plan}\n"
    else:
        state_message.content += f"\nPlanning Agent outputs plans:\n {plan}\n "

    try:
        # Pretty-print the plan when it is JSON, optionally wrapped in a
        # ```json fence; otherwise fall back to logging the raw text.
        plan_json = json.loads(plan.replace("```json", "").replace("```", ""))
        logger.info(f'📋 Plans:\n{json.dumps(plan_json, indent=4)}')

        if hasattr(response, "reasoning_content"):
            logger.info("🤯 Start Planning Deep Thinking: ")
            logger.info(response.reasoning_content)
            logger.info("🤯 End Planning Deep Thinking")

    except json.JSONDecodeError:
        logger.info(f'📋 Plans:\n{plan}')
    except Exception as e:
        logger.debug(f'Error parsing planning analysis: {e}')
        logger.info(f'📋 Plans: {plan}')

    return plan
@time_execution_async("--step")
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
"""Execute one step of the task"""
@@ -228,21 +300,30 @@ class CustomAgent(Agent):
state = None
model_output = None
result: list[ActionResult] = []
actions: list[ActionModel] = []
try:
state = await self.browser_context.get_state(use_vision=self.use_vision)
self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info)
state = await self.browser_context.get_state()
self._check_if_stopped_or_paused()
self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info,
self.use_vision)
# Run planner at specified intervals if planner is configured
if self.planner_llm and self.n_steps % self.planning_interval == 0:
await self._run_planner()
input_messages = self.message_manager.get_messages()
self._check_if_stopped_or_paused()
try:
model_output = await self.get_next_action(input_messages)
if self.register_new_step_callback:
self.register_new_step_callback(state, model_output, self.n_steps)
self.update_step_info(model_output, step_info)
logger.info(f"🧠 All Memory: \n{step_info.memory}")
self._save_conversation(input_messages, model_output)
if self.model_name != "deepseek-reasoner":
# remove prev message
self.message_manager._remove_state_message_by_index(-1)
self._check_if_stopped_or_paused()
except Exception as e:
# model call failed, remove last state message from history
self.message_manager._remove_state_message_by_index(-1)
@@ -250,21 +331,23 @@ class CustomAgent(Agent):
actions: list[ActionModel] = model_output.action
result: list[ActionResult] = await self.controller.multi_act(
actions, self.browser_context
actions,
self.browser_context,
page_extraction_llm=self.page_extraction_llm,
sensitive_data=self.sensitive_data,
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)
if len(result) != len(actions):
# I think something changes, such information should let LLM know
for ri in range(len(result), len(actions)):
result.append(ActionResult(extracted_content=None,
include_in_memory=True,
error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
include_in_memory=True,
error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
is_done=False))
if len(actions) == 0:
# TODO: fix no action case
result = [ActionResult(is_done=True, extracted_content=step_info.memory, include_in_memory=True)]
is_done=False))
for ret_ in result:
if "Extracted page" in ret_.extracted_content:
if ret_.extracted_content and "Extracted page" in ret_.extracted_content:
# record every extracted page
self.extracted_content += ret_.extracted_content
self._last_result = result
@@ -305,7 +388,14 @@ class CustomAgent(Agent):
# Execute initial actions if provided
if self.initial_actions:
result = await self.controller.multi_act(self.initial_actions, self.browser_context, check_for_new_elements=False)
result = await self.controller.multi_act(
self.initial_actions,
self.browser_context,
check_for_new_elements=False,
page_extraction_llm=self.page_extraction_llm,
check_break_if_paused=lambda: self._check_if_stopped_or_paused(),
available_file_paths=self.available_file_paths,
)
self._last_result = result
step_info = CustomAgentStepInfo(
@@ -319,17 +409,6 @@ class CustomAgent(Agent):
)
for step in range(max_steps):
# 1) Check if stop requested
if self.agent_state and self.agent_state.is_stop_requested():
logger.info("🛑 Stop requested by user")
self._create_stop_history_item()
break
# 2) Store last valid state before step
if self.browser_context and self.agent_state:
state = await self.browser_context.get_state(use_vision=self.use_vision)
self.agent_state.set_last_valid_state(state)
if self._too_many_failures():
break
@@ -378,76 +457,18 @@ class CustomAgent(Agent):
self.create_history_gif(output_path=output_path)
def _create_stop_history_item(self):
    """Append a final "done" entry to the agent history when the agent is stopped.

    The entry carries the last valid browser state recorded on
    ``self.agent_state`` when one is available, and an empty state otherwise.
    If building or appending the entry raises, the error is logged and an
    entry with an empty state is appended instead.
    """

    def _append_done_item(browser_state):
        # A history item with no model output and a single result flagged done.
        done_item = AgentHistory(
            model_output=None,
            state=browser_state,
            result=[ActionResult(extracted_content=None, error=None, is_done=True)]
        )
        self.history.history.append(done_item)

    try:
        # Look up the last valid state, if an agent_state was supplied at all.
        last_state = None
        if self.agent_state:
            last_state = self.agent_state.get_last_valid_state()

        if last_state:
            # Convert to BrowserStateHistory, tolerating missing attributes.
            snapshot = BrowserStateHistory(
                url=getattr(last_state, 'url', ""),
                title=getattr(last_state, 'title', ""),
                tabs=getattr(last_state, 'tabs', []),
                interacted_element=[None],
                screenshot=getattr(last_state, 'screenshot', None)
            )
        else:
            snapshot = self._create_empty_state()

        _append_done_item(snapshot)

    except Exception as e:
        logger.error(f"Error creating stop history item: {e}")
        # Fall back to an empty state so the history still gets a final item.
        _append_done_item(self._create_empty_state())
def _convert_to_browser_state_history(self, browser_state):
    """Build a BrowserStateHistory from ``browser_state``.

    Each field is read with ``getattr`` so a missing attribute falls back to
    an empty value; ``interacted_element`` is always ``[None]``.
    """
    page_url = getattr(browser_state, 'url', "")
    page_title = getattr(browser_state, 'title', "")
    open_tabs = getattr(browser_state, 'tabs', [])
    page_screenshot = getattr(browser_state, 'screenshot', None)
    return BrowserStateHistory(
        url=page_url,
        title=page_title,
        tabs=open_tabs,
        interacted_element=[None],
        screenshot=page_screenshot,
    )
def _create_empty_state(self):
    """Return a BrowserStateHistory with empty url/title/tabs and no screenshot."""
    blank_fields = {
        'url': "",
        'title': "",
        'tabs': [],
        'interacted_element': [None],
        'screenshot': None,
    }
    return BrowserStateHistory(**blank_fields)
def create_history_gif(
self,
output_path: str = 'agent_history.gif',
duration: int = 3000,
show_goals: bool = True,
show_task: bool = True,
show_logo: bool = False,
font_size: int = 40,
title_font_size: int = 56,
goal_font_size: int = 44,
margin: int = 40,
line_spacing: float = 1.5,
self,
output_path: str = 'agent_history.gif',
duration: int = 3000,
show_goals: bool = True,
show_task: bool = True,
show_logo: bool = False,
font_size: int = 40,
title_font_size: int = 56,
goal_font_size: int = 44,
margin: int = 40,
line_spacing: float = 1.5,
) -> None:
"""Create a GIF from the agent's history with overlaid task and goal text."""
if not self.history.history:
@@ -547,4 +568,4 @@ class CustomAgent(Agent):
)
logger.info(f'Created GIF at {output_path}')
else:
logger.warning('No images found in history to create GIF')
logger.warning('No images found in history to create GIF')

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import logging
from typing import List, Optional, Type
from typing import List, Optional, Type, Dict
from browser_use.agent.message_manager.service import MessageManager
from browser_use.agent.message_manager.views import MessageHistory
@@ -38,7 +38,8 @@ class CustomMessageManager(MessageManager):
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
message_context: Optional[str] = None
message_context: Optional[str] = None,
sensitive_data: Optional[Dict[str, str]] = None,
):
super().__init__(
llm=llm,
@@ -51,7 +52,8 @@ class CustomMessageManager(MessageManager):
include_attributes=include_attributes,
max_error_length=max_error_length,
max_actions_per_step=max_actions_per_step,
message_context=message_context
message_context=message_context,
sensitive_data=sensitive_data
)
self.agent_prompt_class = agent_prompt_class
# Custom: Move Task info to state_message
@@ -68,7 +70,7 @@ class CustomMessageManager(MessageManager):
min_message_len = 2 if self.message_context is not None else 1
while diff > 0 and len(self.history.messages) > min_message_len:
self.history.remove_message(min_message_len) # alway remove the oldest message
self.history.remove_message(min_message_len) # always remove the oldest message
diff = self.history.total_tokens - self.max_input_tokens
def add_state_message(
@@ -77,6 +79,7 @@ class CustomMessageManager(MessageManager):
actions: Optional[List[ActionModel]] = None,
result: Optional[List[ActionResult]] = None,
step_info: Optional[AgentStepInfo] = None,
use_vision=True,
) -> None:
"""Add browser state as human message"""
# otherwise add state message and result to next message (which will not stay in memory)
@@ -87,7 +90,7 @@ class CustomMessageManager(MessageManager):
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
step_info=step_info,
).get_user_message()
).get_user_message(use_vision)
self._add_message_with_tokens(state_message)
def _count_text_tokens(self, text: str) -> int:
@@ -114,4 +117,4 @@ class CustomMessageManager(MessageManager):
if remove_cnt == abs(remove_ind):
self.history.remove_message(i)
break
i -= 1
i -= 1

View File

@@ -16,122 +16,104 @@ class CustomSystemPrompt(SystemPrompt):
Returns the important rules for the agent.
"""
text = r"""
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
"task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the content at current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button. Please return string type not a list.",
"future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type not a list.",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
},
"action": [
* actions in sequences, please refer to **Common action sequences**. Each output action MUST be formated as: \{action_name\: action_params\}*
]
}
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
"task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the content at current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button. Please return string type not a list.",
"future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type not a list.",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
},
"action": [
* actions in sequences, please refer to **Common action sequences**. Each output action MUST be formated as: \{action_name\: action_params\}*
]
}
2. ACTIONS: You can specify multiple actions to be executed in sequence.
2. ACTIONS: You can specify multiple actions to be executed in sequence.
Common action sequences:
- Form filling: [
{"input_text": {"index": 1, "text": "username"}},
{"input_text": {"index": 2, "text": "password"}},
{"click_element": {"index": 3}}
]
- Navigation and extraction: [
{"go_to_url": {"url": "https://example.com"}},
{"extract_page_content": {}}
]
Common action sequences:
- Form filling: [
{"input_text": {"index": 1, "text": "username"}},
{"input_text": {"index": 2, "text": "password"}},
{"click_element": {"index": 3}}
]
- Navigation and extraction: [
{"go_to_url": {"url": "https://example.com"}},
{"extract_page_content": {}}
]
3. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list
- Each element has a unique index number (e.g., "33[:]<button>")
- Elements marked with "_[:]" are non-interactive (for context only)
3. ELEMENT INTERACTION:
- Only use indexes that exist in the provided element list
- Each element has a unique index number (e.g., "33[:]<button>")
- Elements marked with "_[:]" are non-interactive (for context only)
4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for
4. NAVIGATION & ERROR HANDLING:
- If no suitable elements exist, use other functions to complete the task
- If stuck, try alternative approaches
- Handle popups/cookies by accepting or closing them
- Use scroll to find elements you are looking for
5. TASK COMPLETION:
- If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
- Don't hallucinate actions.
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
- Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether the action is executed successfully. Pay particular attention when errors occur during action execution.
5. TASK COMPLETION:
- If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
- Don't hallucinate actions.
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
- Note that you must verify if you've truly fulfilled the user's request by examining the actual page content, not just by looking at the actions you output but also whether the action is executed successfully. Pay particular attention when errors occur during action execution.
6. VISUAL CONTEXT:
- When an image is provided, use it to understand the page layout
- Bounding boxes with labels correspond to element indexes
- Each bounding box and its label have the same color
- Most often the label is inside the bounding box, on the top right
- Visual context helps verify element locations and relationships
- sometimes labels overlap, so use the context to verify the correct element
6. VISUAL CONTEXT:
- When an image is provided, use it to understand the page layout
- Bounding boxes with labels correspond to element indexes
- Each bounding box and its label have the same color
- Most often the label is inside the bounding box, on the top right
- Visual context helps verify element locations and relationships
- sometimes labels overlap, so use the context to verify the correct element
7. Form filling:
- If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
7. Form filling:
- If you fill an input field and your action sequence is interrupted, most often a list with suggestions poped up under the field and you need to first select the right element from the suggestion list.
8. ACTION SEQUENCING:
- Actions are executed in the order they appear in the list
- Each action should logically follow from the previous one
- If the page changes after an action, the sequence is interrupted and you get the new state.
- If content only disappears the sequence continues.
- Only provide the action sequence until you think the page will change.
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
- only use multiple actions if it makes sense.
"""
8. ACTION SEQUENCING:
- Actions are executed in the order they appear in the list
- Each action should logically follow from the previous one
- If the page changes after an action, the sequence is interrupted and you get the new state.
- If content only disappears the sequence continues.
- Only provide the action sequence until you think the page will change.
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
- only use multiple actions if it makes sense.
9. Extraction:
- If your task is to find information or do research - call extract_content on the specific pages to get and store the information.
"""
text += f" - use maximum {self.max_actions_per_step} actions per sequence"
return text
def input_format(self) -> str:
return """
INPUT STRUCTURE:
1. Task: The user\'s instructions you need to complete.
2. Hints(Optional): Some hints to help you complete the user\'s instructions.
3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
4. Current URL: The webpage you're currently on
5. Available Tabs: List of open browser tabs
6. Interactive Elements: List in the format:
index[:]<element_type>element_text</element_type>
- index: Numeric identifier for interaction
- element_type: HTML element type (button, input, etc.)
- element_text: Visible text or element description
INPUT STRUCTURE:
1. Task: The user\'s instructions you need to complete.
2. Hints(Optional): Some hints to help you complete the user\'s instructions.
3. Memory: Important contents are recorded during historical operations for use in subsequent operations.
4. Current URL: The webpage you're currently on
5. Available Tabs: List of open browser tabs
6. Interactive Elements: List in the format:
[index]<element_type>element_text</element_type>
- index: Numeric identifier for interaction
- element_type: HTML element type (button, input, etc.)
- element_text: Visible text or element description
Example:
33[:]<button>Submit Form</button>
_[:] Non-interactive text
Example:
[33]<button>Submit Form</button>
[] Non-interactive text
Notes:
- Only elements with numeric indexes are interactive
- _[:] elements provide context but cannot be interacted with
Notes:
- Only elements with numeric indexes inside [] are interactive
- [] elements provide context but cannot be interacted with
"""
def get_system_message(self) -> SystemMessage:
"""
Build the system message for the agent.

The prompt text is a fixed role description followed by the output of
``input_format()``, the output of ``important_rules()`` and the value of
``default_action_description``.

Returns:
SystemMessage: Message whose content is the assembled prompt text
"""
AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
1. Analyze the provided webpage elements and structure
2. Plan a sequence of actions to accomplish the given task
3. Your final result MUST be a valid JSON as the **RESPONSE FORMAT** described, containing your action sequence and state assessment, No need extra content to expalin.
{self.input_format()}
{self.important_rules()}
Functions:
{self.default_action_description}
Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid."""
return SystemMessage(content=AGENT_PROMPT)
class CustomAgentMessagePrompt(AgentMessagePrompt):
def __init__(
@@ -143,20 +125,20 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
max_error_length: int = 400,
step_info: Optional[CustomAgentStepInfo] = None,
):
super(CustomAgentMessagePrompt, self).__init__(state=state,
result=result,
include_attributes=include_attributes,
max_error_length=max_error_length,
super(CustomAgentMessagePrompt, self).__init__(state=state,
result=result,
include_attributes=include_attributes,
max_error_length=max_error_length,
step_info=step_info
)
self.actions = actions
def get_user_message(self) -> HumanMessage:
def get_user_message(self, use_vision: bool = True) -> HumanMessage:
if self.step_info:
step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
else:
step_info_description = ''
time_str = datetime.now().strftime("%Y-%m-%d %H:%M")
step_info_description += f"Current date and time: {time_str}"
@@ -180,7 +162,7 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
elements_text = f'{elements_text}\n[End of page]'
else:
elements_text = 'empty page'
state_description = f"""
{step_info_description}
1. Task: {self.step_info.task}.
@@ -211,18 +193,16 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
)
if self.state.screenshot:
if self.state.screenshot and use_vision == True:
# Format message for vision model
return HumanMessage(
content=[
{"type": "text", "text": state_description},
{'type': 'text', 'text': state_description},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{self.state.screenshot}"
},
'type': 'image_url',
'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},
},
]
)
return HumanMessage(content=state_description)
return HumanMessage(content=state_description)

View File

@@ -25,57 +25,3 @@ class CustomBrowser(Browser):
config: BrowserContextConfig = BrowserContextConfig()
) -> CustomBrowserContext:
return CustomBrowserContext(config=config, browser=self)
async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
    """Connect Playwright to a Chrome instance over CDP on localhost:9222.

    If the debugging endpoint already answers, that instance is reused.
    Otherwise Chrome is launched from ``self.config.chrome_instance_path``
    with ``--remote-debugging-port=9222`` plus ``self.config.extra_chromium_args``,
    the endpoint is polled up to 10 times (one second apart), and a
    connection is attempted.

    Args:
        playwright: Playwright object whose ``chromium.connect_over_cdp`` is used.

    Returns:
        The browser returned by ``connect_over_cdp``.

    Raises:
        ValueError: If no Chrome instance path is configured.
        RuntimeError: If connecting after launching Chrome fails; the
            underlying error is chained as the cause.
    """
    if not self.config.chrome_instance_path:
        raise ValueError('Chrome instance path is required')
    import subprocess

    import requests

    cdp_url = 'http://localhost:9222'
    version_url = f'{cdp_url}/json/version'

    try:
        # Check if browser is already running
        response = requests.get(version_url, timeout=2)
        if response.status_code == 200:
            logger.info('Reusing existing Chrome instance')
            browser = await playwright.chromium.connect_over_cdp(
                endpoint_url=cdp_url,
                timeout=20000,  # 20 second timeout for connection
            )
            return browser
    except requests.RequestException:
        # RequestException also covers timeouts: with timeout=2 a slow or
        # hung port raises requests.Timeout, which is not a ConnectionError
        # and would otherwise abort setup instead of starting a new browser.
        logger.debug('No existing Chrome instance found, starting a new one')

    # Start a new Chrome instance
    subprocess.Popen(
        [
            self.config.chrome_instance_path,
            '--remote-debugging-port=9222',
        ] + self.config.extra_chromium_args,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Poll the endpoint first in case the browser has not finished starting.
    for _ in range(10):
        try:
            response = requests.get(version_url, timeout=2)
            if response.status_code == 200:
                break
        except requests.RequestException:
            # Not reachable (or too slow) yet; retry after a short pause.
            pass
        await asyncio.sleep(1)

    # Attempt to connect again after starting a new instance
    try:
        browser = await playwright.chromium.connect_over_cdp(
            endpoint_url=cdp_url,
            timeout=20000,  # 20 second timeout for connection
        )
        return browser
    except Exception as e:
        logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
        # Chain the cause so the original connection error is not lost.
        raise RuntimeError(
            ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
        ) from e

View File

@@ -39,7 +39,7 @@ class CustomController(Controller):
pyperclip.copy(text)
return ActionResult(extracted_content=text)
@self.registry.action("Paste text from clipboard", requires_browser=True)
@self.registry.action("Paste text from clipboard")
async def paste_from_clipboard(browser: BrowserContext):
text = pyperclip.paste()
# send text to browser
@@ -47,25 +47,3 @@ class CustomController(Controller):
await page.keyboard.type(text)
return ActionResult(extracted_content=text)
@self.registry.action(
    'Extract page content to get the pure text or markdown with links if include_links is set to true',
    param_model=ExtractPageContentAction,
    requires_browser=True,
)
async def extract_content(params: ExtractPageContentAction, browser: BrowserContext):
    """Extract the current page's main content via the Jina reader endpoint.

    Navigates the current page to ``https://r.jina.ai/<current url>``, runs
    ``MainContentExtractor`` on the resulting HTML, then navigates back.

    Args:
        params: Action parameters; ``include_links`` selects markdown output
            instead of plain text.
        browser: Browser context whose current page is read.

    Returns:
        An ``ActionResult`` whose ``extracted_content`` holds the extracted text.
    """
    page = await browser.get_current_page()
    # use jina reader
    jina_url = f"https://r.jina.ai/{page.url}"
    await page.goto(jina_url)
    output_format = 'markdown' if params.include_links else 'text'
    try:
        content = MainContentExtractor.extract(  # type: ignore
            html=await page.content(),
            output_format=output_format,
        )
    finally:
        # Go back to the original URL even when extraction fails, so later
        # actions do not run against the reader page.
        await page.go_back()
    msg = f'Extracted page content:\n {content}\n'
    logger.info(msg)
    return ActionResult(extracted_content=msg)

View File

@@ -15,12 +15,16 @@ import json
import re
from browser_use.agent.service import Agent
from browser_use.browser.browser import BrowserConfig, Browser
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller, DoneAction
from main_content_extractor import MainContentExtractor
from langchain.schema import SystemMessage, HumanMessage
from json_repair import repair_json
from src.agent.custom_prompts import CustomSystemPrompt, CustomAgentMessagePrompt
from src.controller.custom_controller import CustomController
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.browser.custom_context import BrowserContextConfig, BrowserContext
from browser_use.browser.context import (
BrowserContextConfig,
BrowserContextWindowSize,
@@ -65,6 +69,27 @@ async def deep_research(task, llm, agent_state=None, **kwargs):
controller = CustomController()
@controller.registry.action(
    'Extract page content to get the pure markdown.',
)
async def extract_content(browser: BrowserContext):
    """Extract the current page's main content as markdown via the Jina reader.

    Navigates the current page to ``https://r.jina.ai/<current url>``, runs
    ``MainContentExtractor`` on the resulting HTML, then navigates back.

    Args:
        browser: Browser context whose current page is read.

    Returns:
        An ``ActionResult`` whose ``extracted_content`` holds the markdown text.
    """
    page = await browser.get_current_page()
    # use jina reader
    jina_url = f"https://r.jina.ai/{page.url}"
    await page.goto(jina_url)
    try:
        content = MainContentExtractor.extract(  # type: ignore
            html=await page.content(),
            output_format='markdown',
        )
    finally:
        # Go back to the original URL even when extraction fails, so later
        # actions do not run against the reader page.
        await page.go_back()
    msg = f'Extracted page content:\n {content}\n'
    logger.info(msg)
    return ActionResult(extracted_content=msg)
search_system_prompt = f"""
You are a **Deep Researcher**, an AI agent specializing in in-depth information gathering and research using a web browser with **automated execution capabilities**. Your expertise lies in formulating comprehensive research plans and executing them meticulously to fulfill complex user requests. You will analyze user instructions, devise a detailed research plan, and determine the necessary search queries to gather the required information.
@@ -200,8 +225,7 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=5,
controller=controller,
agent_state=agent_state
controller=controller
)
agent_result = await agent.run(max_steps=kwargs.get("max_steps", 10))
query_results = [agent_result]
@@ -224,7 +248,6 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=5,
controller=controller,
agent_state=agent_state
) for task in query_tasks]
query_results = await asyncio.gather(
*[agent.run(max_steps=kwargs.get("max_steps", 10)) for agent in agents])
@@ -265,6 +288,9 @@ Provide your output as a JSON formatted list. Each item in the list must adhere
record_content = repair_json(record_content)
new_record_infos = json.loads(record_content)
history_infos.extend(new_record_infos)
if agent_state and agent_state.is_stop_requested():
# Stop
break
logger.info("\nFinish Searching, Start Generating Report...")

View File

@@ -128,7 +128,7 @@ async def test_browser_use_custom():
# llm = utils.get_llm_model(
# provider="google",
# model_name="gemini-2.0-flash-exp",
# model_name="gemini-2.0-flash",
# temperature=1.0,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
@@ -193,7 +193,7 @@ async def test_browser_use_custom():
)
)
agent = CustomAgent(
task="Search 'Nvidia' and give me the first url",
task="Give me stock price of Tesla",
add_infos="", # some hints for llm to complete the task
llm=llm,
browser=browser,

View File

@@ -39,17 +39,18 @@ from src.utils.utils import update_model_dropdown, get_latest_files, capture_scr
# Global variables for persistence
_global_browser = None
_global_browser_context = None
_global_agent = None
# Create the global agent state instance
_global_agent_state = AgentState()
async def stop_agent():
"""Request the agent to stop and update UI with enhanced feedback"""
global _global_agent_state, _global_browser_context, _global_browser
global _global_agent_state, _global_browser_context, _global_browser, _global_agent
try:
# Request stop
_global_agent_state.request_stop()
_global_agent.stop()
# Update UI immediately
message = "Stop requested - the agent will halt at the next safe point"
@@ -247,7 +248,7 @@ async def run_org_agent(
tool_calling_method
):
try:
global _global_browser, _global_browser_context, _global_agent_state
global _global_browser, _global_browser_context, _global_agent_state, _global_agent
# Clear any previous stop request
_global_agent_state.clear_stop()
@@ -284,20 +285,21 @@ async def run_org_agent(
),
)
)
agent = Agent(
task=task,
llm=llm,
use_vision=use_vision,
browser=_global_browser,
browser_context=_global_browser_context,
max_actions_per_step=max_actions_per_step,
tool_calling_method=tool_calling_method
)
history = await agent.run(max_steps=max_steps)
history_file = os.path.join(save_agent_history_path, f"{agent.agent_id}.json")
agent.save_history(history_file)
if _global_agent is None:
_global_agent = Agent(
task=task,
llm=llm,
use_vision=use_vision,
browser=_global_browser,
browser_context=_global_browser_context,
max_actions_per_step=max_actions_per_step,
tool_calling_method=tool_calling_method
)
history = await _global_agent.run(max_steps=max_steps)
history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
_global_agent.save_history(history_file)
final_result = history.final_result()
errors = history.errors()
@@ -313,6 +315,7 @@ async def run_org_agent(
errors = str(e) + "\n" + traceback.format_exc()
return '', errors, '', '', None, None
finally:
_global_agent = None
# Handle cleanup based on persistence configuration
if not keep_browser_open:
if _global_browser_context:
@@ -342,7 +345,7 @@ async def run_custom_agent(
tool_calling_method
):
try:
global _global_browser, _global_browser_context, _global_agent_state
global _global_browser, _global_browser_context, _global_agent_state, _global_agent
# Clear any previous stop request
_global_agent_state.clear_stop()
@@ -384,24 +387,24 @@ async def run_custom_agent(
)
# Create and run agent
agent = CustomAgent(
task=task,
add_infos=add_infos,
use_vision=use_vision,
llm=llm,
browser=_global_browser,
browser_context=_global_browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=max_actions_per_step,
agent_state=_global_agent_state,
tool_calling_method=tool_calling_method
)
history = await agent.run(max_steps=max_steps)
if _global_agent is None:
_global_agent = CustomAgent(
task=task,
add_infos=add_infos,
use_vision=use_vision,
llm=llm,
browser=_global_browser,
browser_context=_global_browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
agent_prompt_class=CustomAgentMessagePrompt,
max_actions_per_step=max_actions_per_step,
tool_calling_method=tool_calling_method
)
history = await _global_agent.run(max_steps=max_steps)
history_file = os.path.join(save_agent_history_path, f"{agent.agent_id}.json")
agent.save_history(history_file)
history_file = os.path.join(save_agent_history_path, f"{_global_agent.agent_id}.json")
_global_agent.save_history(history_file)
final_result = history.final_result()
errors = history.errors()
@@ -417,6 +420,7 @@ async def run_custom_agent(
errors = str(e) + "\n" + traceback.format_exc()
return '', errors, '', '', None, None
finally:
_global_agent = None
# Handle cleanup based on persistence configuration
if not keep_browser_open:
if _global_browser_context: