feat: adapt to new version of browser-use

This commit is contained in:
vincent
2025-01-08 19:23:23 +08:00
parent dcb39145ec
commit 041dc55a36
8 changed files with 254 additions and 162 deletions

View File

@@ -1,5 +1,5 @@
browser-use==0.1.17
langchain-google-genai
browser-use>=0.1.18
langchain-google-genai>=2.0.8
pyperclip
gradio
langchain-ollama

View File

@@ -6,6 +6,8 @@
import json
import logging
import pdb
import traceback
from typing import Optional, Type
from browser_use.agent.prompts import SystemPrompt
@@ -65,23 +67,25 @@ class CustomAgent(Agent):
],
max_error_length: int = 400,
max_actions_per_step: int = 10,
tool_call_in_content: bool = True,
):
super().__init__(
task,
llm,
browser,
browser_context,
controller,
use_vision,
save_conversation_path,
max_failures,
retry_delay,
system_prompt_class,
max_input_tokens,
validate_output,
include_attributes,
max_error_length,
max_actions_per_step,
task=task,
llm=llm,
browser=browser,
browser_context=browser_context,
controller=controller,
use_vision=use_vision,
save_conversation_path=save_conversation_path,
max_failures=max_failures,
retry_delay=retry_delay,
system_prompt_class=system_prompt_class,
max_input_tokens=max_input_tokens,
validate_output=validate_output,
include_attributes=include_attributes,
max_error_length=max_error_length,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content,
)
self.add_infos = add_infos
self.message_manager = CustomMassageManager(
@@ -93,6 +97,7 @@ class CustomAgent(Agent):
include_attributes=self.include_attributes,
max_error_length=self.max_error_length,
max_actions_per_step=self.max_actions_per_step,
tool_call_in_content=tool_call_in_content,
)
def _setup_action_models(self) -> None:
@@ -146,10 +151,29 @@ class CustomAgent(Agent):
@time_execution_async("--get_next_action")
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
try:
structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore
parsed: AgentOutput = response['parsed']
# cut the number of actions to max_actions_per_step
parsed.action = parsed.action[: self.max_actions_per_step]
self._log_response(parsed)
self.n_steps += 1
return parsed
except Exception as e:
# If something goes wrong, try to invoke the LLM again without structured output,
# and Manually parse the response. Temporarily solution for DeepSeek
ret = self.llm.invoke(input_messages)
if isinstance(ret.content, list):
parsed_json = json.loads(ret.content[0].replace("```json", "").replace("```", ""))
else:
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
parsed: AgentOutput = self.AgentOutput(**parsed_json)
if parsed is None:
raise ValueError(f'Could not parse response.')
# cut the number of actions to max_actions_per_step
parsed.action = parsed.action[: self.max_actions_per_step]
self._log_response(parsed)

View File

@@ -17,6 +17,7 @@ from browser_use.browser.views import BrowserState
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import (
HumanMessage,
AIMessage
)
from .custom_prompts import CustomAgentMessagePrompt
@@ -37,23 +38,53 @@ class CustomMassageManager(MessageManager):
include_attributes: list[str] = [],
max_error_length: int = 400,
max_actions_per_step: int = 10,
tool_call_in_content: bool = False,
):
super().__init__(
llm,
task,
action_descriptions,
system_prompt_class,
max_input_tokens,
estimated_tokens_per_character,
image_tokens,
include_attributes,
max_error_length,
max_actions_per_step,
llm=llm,
task=task,
action_descriptions=action_descriptions,
system_prompt_class=system_prompt_class,
max_input_tokens=max_input_tokens,
estimated_tokens_per_character=estimated_tokens_per_character,
image_tokens=image_tokens,
include_attributes=include_attributes,
max_error_length=max_error_length,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content,
)
# Move Task info to state_message
# Custom: Move Task info to state_message
self.history = MessageHistory()
self._add_message_with_tokens(self.system_prompt)
tool_calls = [
{
'name': 'AgentOutput',
'args': {
'current_state': {
'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
'memory': '',
'next_goal': 'Obtain task from user',
},
'action': [],
},
'id': '',
'type': 'tool_call',
}
]
if self.tool_call_in_content:
# openai throws error if tool_calls are not responded -> move to content
example_tool_call = AIMessage(
content=f'{tool_calls}',
tool_calls=[],
)
else:
example_tool_call = AIMessage(
content=f'',
tool_calls=tool_calls,
)
self._add_message_with_tokens(example_tool_call)
def add_state_message(
self,
@@ -72,7 +103,7 @@ class CustomMassageManager(MessageManager):
self._add_message_with_tokens(msg)
if r.error:
msg = HumanMessage(
content=str(r.error)[-self.max_error_length :]
content=str(r.error)[-self.max_error_length:]
)
self._add_message_with_tokens(msg)
result = None # if result in history, we dont want to add it again

View File

@@ -24,7 +24,7 @@ class CustomSystemPrompt(SystemPrompt):
{
"current_state": {
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
@@ -183,7 +183,7 @@ class CustomAgentMessagePrompt:
state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
if result.error:
# only use last 300 characters of error
error = result.error[-self.max_error_length :]
error = result.error[-self.max_error_length:]
state_description += (
f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
)

View File

@@ -23,11 +23,12 @@ class CustomBrowserContext(BrowserContext):
config: BrowserContextConfig = BrowserContextConfig(),
context: BrowserContext = None,
):
super(CustomBrowserContext, self).__init__(browser, config)
super(CustomBrowserContext, self).__init__(browser=browser, config=config)
self.context = context
async def _create_context(self, browser: PlaywrightBrowser):
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
# If we have a context, return it directly
if self.context:
return self.context
if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
@@ -46,7 +47,7 @@ class CustomBrowserContext(BrowserContext):
bypass_csp=self.config.disable_security,
ignore_https_errors=self.config.disable_security,
record_video_dir=self.config.save_recording_path,
record_video_size=self.config.browser_window_size, # set record video size
record_video_size=self.config.browser_window_size, # set record video size, same as windows size
)
if self.config.trace_path:

View File

@@ -86,6 +86,7 @@ def get_llm_model(provider: str, **kwargs):
return ChatOllama(
model=kwargs.get("model_name", "qwen2.5:7b"),
temperature=kwargs.get("temperature", 0.0),
num_ctx=128000,
)
elif provider == "azure_openai":
if not kwargs.get("base_url", ""):

View File

@@ -80,11 +80,12 @@ async def test_browser_use_org():
async def test_browser_use_custom():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt
from src.browser.custom_browser import BrowserConfig, CustomBrowser
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
@@ -95,15 +96,15 @@ async def test_browser_use_custom():
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="gemini",
# model_name="gemini-2.0-flash-exp",
# temperature=1.0,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
llm = utils.get_llm_model(
provider="gemini",
model_name="gemini-2.0-flash-exp",
temperature=1.0,
api_key=os.getenv("GOOGLE_API_KEY", "")
)
# llm = utils.get_llm_model(
# provider="deepseek",
@@ -111,14 +112,16 @@ async def test_browser_use_custom():
# temperature=0.8
# )
llm = utils.get_llm_model(
provider="ollama", model_name="qwen2.5:7b", temperature=0.8
)
# llm = utils.get_llm_model(
# provider="ollama", model_name="qwen2.5:7b", temperature=0.8
# )
controller = CustomController()
use_own_browser = False
disable_security = True
use_vision = False
use_vision = True # Set to False when using DeepSeek
tool_call_in_content = True # Set to True when using Ollama
max_actions_per_step = 1
playwright = None
browser_context_ = None
try:
@@ -171,6 +174,8 @@ async def test_browser_use_custom():
controller=controller,
system_prompt_class=CustomSystemPrompt,
use_vision=use_vision,
tool_call_in_content=tool_call_in_content,
max_actions_per_step=max_actions_per_step
)
history: AgentHistoryList = await agent.run(max_steps=10)

View File

@@ -45,6 +45,8 @@ async def run_browser_agent(
add_infos,
max_steps,
use_vision,
max_actions_per_step,
tool_call_in_content
):
# Ensure the recording directory exists
os.makedirs(save_recording_path, exist_ok=True)
@@ -74,6 +76,8 @@ async def run_browser_agent(
task=task,
max_steps=max_steps,
use_vision=use_vision,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content
)
elif agent_type == "custom":
final_result, errors, model_actions, model_thoughts = await run_custom_agent(
@@ -88,6 +92,8 @@ async def run_browser_agent(
add_infos=add_infos,
max_steps=max_steps,
use_vision=use_vision,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content
)
else:
raise ValueError(f"Invalid agent type: {agent_type}")
@@ -116,6 +122,9 @@ async def run_org_agent(
task,
max_steps,
use_vision,
max_actions_per_step,
tool_call_in_content
):
browser = Browser(
config=BrowserConfig(
@@ -139,6 +148,8 @@ async def run_org_agent(
llm=llm,
use_vision=use_vision,
browser_context=browser_context,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content
)
history = await agent.run(max_steps=max_steps)
@@ -162,6 +173,8 @@ async def run_custom_agent(
add_infos,
max_steps,
use_vision,
max_actions_per_step,
tool_call_in_content
):
controller = CustomController()
playwright = None
@@ -217,6 +230,8 @@ async def run_custom_agent(
browser_context=browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content
)
history = await agent.run(max_steps=max_steps)
@@ -318,11 +333,24 @@ def create_ui(theme_name="Ocean"):
label="Max Run Steps",
info="Maximum number of steps the agent will take",
)
max_actions_per_step = gr.Slider(
minimum=1,
maximum=20,
value=10,
step=1,
label="Max Actions per Step",
info="Maximum number of actions the agent will take per step",
)
use_vision = gr.Checkbox(
label="Use Vision",
value=True,
info="Enable visual processing capabilities",
)
tool_call_in_content = gr.Checkbox(
label="Use Tool Calls in Content",
value=True,
info="Enable Tool Calls in content",
)
with gr.TabItem("🔧 LLM Configuration", id=2):
with gr.Group():
@@ -461,6 +489,8 @@ def create_ui(theme_name="Ocean"):
add_infos,
max_steps,
use_vision,
max_actions_per_step,
tool_call_in_content
],
outputs=[
final_result_output,