Merge remote-tracking branch 'upstream/main' into stream_function

This commit is contained in:
katiue
2025-01-12 20:05:26 +07:00
4 changed files with 349 additions and 33 deletions

View File

@@ -9,6 +9,10 @@ import logging
import pdb
import traceback
from typing import Optional, Type
from PIL import Image, ImageDraw, ImageFont
import os
import base64
import io
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
@@ -227,6 +231,119 @@ class CustomAgent(Agent):
)
if state:
self._make_history_item(model_output, state, result)
def create_history_gif(
self,
output_path: str = 'agent_history.gif',
duration: int = 3000,
show_goals: bool = True,
show_task: bool = True,
show_logo: bool = False,
font_size: int = 40,
title_font_size: int = 56,
goal_font_size: int = 44,
margin: int = 40,
line_spacing: float = 1.5,
) -> None:
"""Create a GIF from the agent's history with overlaid task and goal text."""
if not self.history.history:
logger.warning('No history to create GIF from')
return
images = []
# if history is empty or first screenshot is None, we can't create a gif
if not self.history.history or not self.history.history[0].state.screenshot:
logger.warning('No history or first screenshot to create GIF from')
return
# Try to load nicer fonts
try:
# Try different font options in order of preference
font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana']
font_loaded = False
for font_name in font_options:
try:
import platform
if platform.system() == "Windows":
# Need to specify the abs font path on Windows
font_name = os.path.join(os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), font_name + ".ttf")
regular_font = ImageFont.truetype(font_name, font_size)
title_font = ImageFont.truetype(font_name, title_font_size)
goal_font = ImageFont.truetype(font_name, goal_font_size)
font_loaded = True
break
except OSError:
continue
if not font_loaded:
raise OSError('No preferred fonts found')
except OSError:
regular_font = ImageFont.load_default()
title_font = ImageFont.load_default()
goal_font = regular_font
# Load logo if requested
logo = None
if show_logo:
try:
logo = Image.open('./static/browser-use.png')
# Resize logo to be small (e.g., 40px height)
logo_height = 150
aspect_ratio = logo.width / logo.height
logo_width = int(logo_height * aspect_ratio)
logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
except Exception as e:
logger.warning(f'Could not load logo: {e}')
# Create task frame if requested
if show_task and self.task:
task_frame = self._create_task_frame(
self.task,
self.history.history[0].state.screenshot,
title_font,
regular_font,
logo,
line_spacing,
)
images.append(task_frame)
# Process each history item
for i, item in enumerate(self.history.history, 1):
if not item.state.screenshot:
continue
# Convert base64 screenshot to PIL Image
img_data = base64.b64decode(item.state.screenshot)
image = Image.open(io.BytesIO(img_data))
if show_goals and item.model_output:
image = self._add_overlay_to_image(
image=image,
step_number=i,
goal_text=item.model_output.current_state.thought,
regular_font=regular_font,
title_font=title_font,
margin=margin,
logo=logo,
)
images.append(image)
if images:
# Save the GIF
images[0].save(
output_path,
save_all=True,
append_images=images[1:],
duration=duration,
loop=0,
optimize=False,
)
logger.info(f'Created GIF at {output_path}')
else:
logger.warning('No images found in history to create GIF')
async def run(self, max_steps: int = 100) -> AgentHistoryList:
"""Execute the task with maximum number of steps"""
@@ -283,3 +400,6 @@ class CustomAgent(Agent):
if not self.injected_browser and self.browser:
await self.browser.close()
if self.generate_gif:
self.create_history_gif()

View File

@@ -123,4 +123,4 @@ class CustomBrowser(Browser):
return browser
except Exception as e:
logger.error(f'Failed to initialize Playwright browser: {str(e)}')
raise
raise

View File

@@ -3,6 +3,7 @@
# @Author : wenshao
# @ProjectName: browser-use-webui
# @FileName: test_browser_use.py
import pdb
from dotenv import load_dotenv
@@ -28,20 +29,29 @@ async def test_browser_use_org():
BrowserContextWindowSize,
)
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
llm = utils.get_llm_model(
provider="azure_openai",
model_name="gpt-4o",
temperature=0.8,
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
provider="deepseek",
model_name="deepseek-chat",
temperature=0.8
)
window_w, window_h = 1920, 1080
use_vision = False
chrome_path = os.getenv("CHROME_PATH", None)
browser = Browser(
config=BrowserConfig(
headless=False,
disable_security=True,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
@@ -59,6 +69,7 @@ async def test_browser_use_org():
task="go to google.com and type 'OpenAI' click search and give me the first url",
llm=llm,
browser_context=browser_context,
use_vision=use_vision
)
history: AgentHistoryList = await agent.run(max_steps=10)
@@ -208,6 +219,122 @@ async def test_browser_use_custom():
await browser.close()
async def test_browser_use_custom_v2():
from browser_use.browser.context import BrowserContextWindowSize
from browser_use.browser.browser import BrowserConfig
from playwright.async_api import async_playwright
from src.agent.custom_agent import CustomAgent
from src.agent.custom_prompts import CustomSystemPrompt
from src.browser.custom_browser import CustomBrowser
from src.browser.custom_context import BrowserContextConfig
from src.controller.custom_controller import CustomController
window_w, window_h = 1920, 1080
# llm = utils.get_llm_model(
# provider="azure_openai",
# model_name="gpt-4o",
# temperature=0.8,
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
# )
# llm = utils.get_llm_model(
# provider="gemini",
# model_name="gemini-2.0-flash-exp",
# temperature=1.0,
# api_key=os.getenv("GOOGLE_API_KEY", "")
# )
llm = utils.get_llm_model(
provider="deepseek",
model_name="deepseek-chat",
temperature=0.8
)
# llm = utils.get_llm_model(
# provider="ollama", model_name="qwen2.5:7b", temperature=0.8
# )
controller = CustomController()
use_own_browser = True
disable_security = True
use_vision = False # Set to False when using DeepSeek
tool_call_in_content = True # Set to True when using Ollama
max_actions_per_step = 1
playwright = None
browser = None
browser_context = None
try:
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
else:
chrome_path = None
browser = CustomBrowser(
config=BrowserConfig(
headless=False,
disable_security=disable_security,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
browser_context = await browser.new_context(
config=BrowserContextConfig(
trace_path="./tmp/traces",
save_recording_path="./tmp/record_videos",
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
)
)
agent = CustomAgent(
task="go to google.com and type 'OpenAI' click search and give me the first url",
add_infos="", # some hints for llm to complete the task
llm=llm,
browser=browser,
browser_context=browser_context,
controller=controller,
system_prompt_class=CustomSystemPrompt,
use_vision=use_vision,
tool_call_in_content=tool_call_in_content,
max_actions_per_step=max_actions_per_step
)
history: AgentHistoryList = await agent.run(max_steps=10)
print("Final Result:")
pprint(history.final_result(), indent=4)
print("\nErrors:")
pprint(history.errors(), indent=4)
# e.g. xPaths the model clicked on
print("\nModel Outputs:")
pprint(history.model_actions(), indent=4)
print("\nThoughts:")
pprint(history.model_thoughts(), indent=4)
# close browser
except Exception:
import traceback
traceback.print_exc()
finally:
# 显式关闭持久化上下文
if browser_context:
await browser_context.close()
# 关闭 Playwright 对象
if playwright:
await playwright.stop()
if browser:
await browser.close()
if __name__ == "__main__":
# asyncio.run(test_browser_use_org())
asyncio.run(test_browser_use_custom())
# asyncio.run(test_browser_use_custom())
asyncio.run(test_browser_use_custom_v2())

121
webui.py
View File

@@ -35,7 +35,6 @@ load_dotenv()
# Global variables for persistence
_global_browser = None
_global_browser_context = None
_global_playwright = None
async def run_browser_agent(
agent_type,
@@ -45,6 +44,7 @@ async def run_browser_agent(
llm_base_url,
llm_api_key,
use_own_browser,
keep_browser_open,
headless,
disable_security,
window_w,
@@ -89,6 +89,8 @@ async def run_browser_agent(
if agent_type == "org":
final_result, errors, model_actions, model_thoughts, recorded_files, trace_file = await run_org_agent(
llm=llm,
use_own_browser=use_own_browser,
keep_browser_open=keep_browser_open,
headless=headless,
disable_security=disable_security,
window_w=window_w,
@@ -108,6 +110,7 @@ async def run_browser_agent(
final_result, errors, model_actions, model_thoughts, recorded_files, trace_file = await run_custom_agent(
llm=llm,
use_own_browser=use_own_browser,
keep_browser_open=keep_browser_open,
headless=headless,
disable_security=disable_security,
window_w=window_w,
@@ -141,6 +144,8 @@ async def run_browser_agent(
async def run_org_agent(
llm,
use_own_browser,
keep_browser_open,
headless,
disable_security,
window_w,
@@ -156,28 +161,43 @@ async def run_org_agent(
browser_context,
playwright
):
browser = Browser(
config=BrowserConfig(
headless=headless,
disable_security=disable_security,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
async with await browser.new_context(
config=BrowserContextConfig(
trace_path=save_trace_path if save_trace_path else None,
save_recording_path=save_recording_path if save_recording_path else None,
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
try:
global _global_browser, _global_browser_context
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
else:
chrome_path = None
if _global_browser is None:
_global_browser = Browser(
config=BrowserConfig(
headless=headless,
disable_security=disable_security,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
) as browser_context:
if _global_browser_context is None:
_global_browser_context = await _global_browser.new_context(
config=BrowserContextConfig(
trace_path=save_trace_path if save_trace_path else None,
save_recording_path=save_recording_path if save_recording_path else None,
no_viewport=False,
browser_window_size=BrowserContextWindowSize(
width=window_w, height=window_h
),
)
)
agent = Agent(
task=task,
llm=llm,
use_vision=use_vision,
browser_context=browser_context,
browser=_global_browser,
browser_context=_global_browser_context,
max_actions_per_step=max_actions_per_step,
tool_call_in_content=tool_call_in_content
)
@@ -191,12 +211,28 @@ async def run_org_agent(
recorded_files = get_latest_files(save_recording_path)
trace_file = get_latest_files(save_trace_path)
await browser.close()
return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), trace_file.get('.zip')
return final_result, errors, model_actions, model_thoughts, recorded_files.get('.webm'), trace_file.get('.zip')
except Exception as e:
import traceback
traceback.print_exc()
errors = str(e) + "\n" + traceback.format_exc()
return '', errors, '', ''
finally:
# Handle cleanup based on persistence configuration
if not keep_browser_open:
if _global_browser_context:
await _global_browser_context.close()
_global_browser_context = None
if _global_browser:
await _global_browser.close()
_global_browser = None
async def run_custom_agent(
llm,
use_own_browser,
keep_browser_open,
headless,
disable_security,
window_w,
@@ -217,12 +253,24 @@ async def run_custom_agent(
persistence_config = BrowserPersistenceConfig.from_env()
try:
global _global_browser, _global_browser_context
if use_own_browser:
chrome_path = os.getenv("CHROME_PATH", None)
if chrome_path == "":
chrome_path = None
else:
chrome_path = None
controller = CustomController()
# Initialize global browser if needed
if browser is None:
browser = CustomBrowser(
config=BrowserConfig(
headless=headless,
disable_security=disable_security,
chrome_instance_path=chrome_path,
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
)
)
@@ -271,6 +319,7 @@ async def run_custom_agent(
),
),
)
)
# Create and run agent
agent = CustomAgent(
@@ -462,6 +511,17 @@ theme_map = {
"Base": Base()
}
async def close_global_browser():
global _global_browser, _global_browser_context
if _global_browser_context:
await _global_browser_context.close()
_global_browser_context = None
if _global_browser:
await _global_browser.close()
_global_browser = None
def create_ui(theme_name="Ocean"):
css = """
.gradio-container {
@@ -541,14 +601,15 @@ def create_ui(theme_name="Ocean"):
with gr.TabItem("🔧 LLM Configuration", id=2):
with gr.Group():
llm_provider = gr.Dropdown(
["anthropic", "openai", "deepseek", "gemini", "ollama", "azure_openai"],
choices=[provider for provider,model in utils.model_names.items()],
label="LLM Provider",
value="",
value="openai",
info="Select your preferred language model provider"
)
llm_model_name = gr.Dropdown(
label="Model Name",
value="",
choices=utils.model_names['openai'],
value="gpt-4o",
interactive=True,
allow_custom_value=True, # Allow users to input custom model names
info="Select a model from the dropdown or type a custom model name"
@@ -564,13 +625,13 @@ def create_ui(theme_name="Ocean"):
with gr.Row():
llm_base_url = gr.Textbox(
label="Base URL",
value=os.getenv(f"{llm_provider.value.upper()}_BASE_URL ", ""), # Default to .env value
value='',
info="API endpoint URL (if required)"
)
llm_api_key = gr.Textbox(
label="API Key",
type="password",
value=os.getenv(f"{llm_provider.value.upper()}_API_KEY", ""), # Default to .env value
value='',
info="Your API key (leave blank to use .env)"
)
@@ -582,6 +643,11 @@ def create_ui(theme_name="Ocean"):
value=False,
info="Use your existing browser instance",
)
keep_browser_open = gr.Checkbox(
label="Keep Browser Open",
value=os.getenv("CHROME_PERSISTENT_SESSION", "False").lower() == "true",
info="Keep Browser Open between Tasks",
)
headless = gr.Checkbox(
label="Headless Mode",
value=False,
@@ -725,12 +791,15 @@ def create_ui(theme_name="Ocean"):
outputs=save_recording_path
)
use_own_browser.change(fn=close_global_browser)
keep_browser_open.change(fn=close_global_browser)
# Run button click handler
run_button.click(
fn=run_with_stream,
inputs=[
agent_type, llm_provider, llm_model_name, llm_temperature, llm_base_url, llm_api_key,
use_own_browser, headless, disable_security, window_w, window_h, save_recording_path, save_trace_path,
use_own_browser, keep_browser_open, headless, disable_security, window_w, window_h, save_recording_path, save_trace_path,
enable_recording, task, add_infos, max_steps, use_vision, max_actions_per_step, tool_call_in_content
],
outputs=[