diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..8b09300 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "python.analysis.typeCheckingMode": "basic", + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.formatOnSave": true, + "editor.codeActionsOnSave": { + "source.fixAll.ruff": "explicit", + "source.organizeImports.ruff": "explicit" + } + } +} diff --git a/README.md b/README.md index 5d6363e..9d9eb6c 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,53 @@ -# Browser-Use WebUI +Browser Use Web UI -## Background +
-This project builds upon the foundation of the [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents. We have enhanced the original capabilities by providing: +[![GitHub stars](https://img.shields.io/github/stars/browser-use/web-ui?style=social)](https://github.com/browser-use/web-ui/stargazers) +[![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord) +[![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com) +[![WarmShao](https://img.shields.io/twitter/follow/warmshao?style=social)](https://x.com/warmshao) -1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent. +This project builds upon the foundation of the [browser-use](https://github.com/browser-use/browser-use), which is designed to make websites accessible for AI agents. -2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek, Ollama etc. And we plan to add support for even more models in the future. +We would like to officially thank [WarmShao](https://github.com/warmshao) for his contribution to this project. -3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording. +**WebUI:** is built on Gradio and supports most of the `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent. -4. **Customized Agent:** We've implemented a custom agent that enhances `browser-use` with Optimized prompts. 
+**Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek, Ollama, etc. We plan to add support for even more models in the future. - +**Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording. -**Changelog** -- [x] **2025/01/06:** Thanks to @richard-devbot, a New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113). + +## Installation Guide -## Environment Installation +Read the [quickstart guide](https://docs.browser-use.com/quickstart#prepare-the-environment) or follow the steps below to get started. -1. **Python Version:** Ensure you have Python 3.11 or higher installed. -2. **Install `browser-use`:** - ```bash - pip install browser-use - ``` -3. **Install Playwright:** - ```bash - playwright install - ``` -4. **Install Dependencies:** - ```bash - pip install -r requirements.txt - ``` -5. **Configure Environment Variables:** - - Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. - - **If using your own browser:** - - Set `CHROME_PATH` to the executable path of your browser (e.g., `C:\Program Files\Google\Chrome\Application\chrome.exe` on Windows). - - Set `CHROME_USER_DATA` to the user data directory of your browser (e.g.,`C:\Users\\AppData\Local\Google\Chrome\User Data`). +> Python 3.11 or higher is required. + +First, we recommend using [uv](https://docs.astral.sh/uv/) to set up the Python environment. 
+ +```bash +uv venv --python 3.11 +``` + +and activate it with: + +```bash +source .venv/bin/activate +``` + +Install the dependencies: + +```bash +uv pip install -r requirements.txt +``` + +Then install playwright: + +```bash +playwright install +``` ## Usage @@ -50,3 +60,35 @@ This project builds upon the foundation of the [browser-use](https://github.com/ - Close all chrome windows - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent. - Check the "Use Own Browser" option within the Browser Settings. + +## (Optional) Configure Environment Variables + +Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. To copy the file, run: + +```bash +cp .env.example .env +``` + +**If using your own browser:** Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. + +You can copy the examples below into your `.env` file. + +### Windows + +```env +CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe" +CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data" +``` + +> Note: Replace `YourUsername` with your actual Windows username. + +### Mac + +```env +CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" +CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1" +``` + +## Changelog + +- [x] **2025/01/06:** Thanks to @richard-devbot, a New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113). 
diff --git a/assets/web-ui.png b/assets/web-ui.png new file mode 100644 index 0000000..383fffc Binary files /dev/null and b/assets/web-ui.png differ diff --git a/requirements.txt b/requirements.txt index cdda0d1..1471909 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,4 @@ browser-use langchain-google-genai pyperclip gradio -langchain-ollama - +langchain-ollama \ No newline at end of file diff --git a/webui.py b/webui.py index eef1e3c..a1e81b4 100644 --- a/webui.py +++ b/webui.py @@ -4,62 +4,56 @@ # @Email : wenshaoguo1026@gmail.com # @Project : browser-use-webui # @FileName: webui.py -import pdb from dotenv import load_dotenv load_dotenv() import argparse - -import asyncio +import os import gradio as gr -import asyncio -import os -from pprint import pprint -from typing import List, Dict, Any - -from playwright.async_api import async_playwright +from browser_use.agent.service import Agent from browser_use.browser.browser import Browser, BrowserConfig from browser_use.browser.context import ( - BrowserContext, BrowserContextConfig, BrowserContextWindowSize, ) -from browser_use.agent.service import Agent +from playwright.async_api import async_playwright -from src.browser.custom_browser import CustomBrowser, BrowserConfig -from src.browser.custom_context import BrowserContext, BrowserContextConfig -from src.controller.custom_controller import CustomController from src.agent.custom_agent import CustomAgent from src.agent.custom_prompts import CustomSystemPrompt - +from src.browser.custom_browser import BrowserConfig, CustomBrowser +from src.browser.custom_context import BrowserContextConfig +from src.controller.custom_controller import CustomController from src.utils import utils + async def run_browser_agent( - agent_type, - llm_provider, - llm_model_name, - llm_temperature, - llm_base_url, - llm_api_key, - use_own_browser, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - task, - add_infos, - max_steps, - use_vision + 
agent_type, + llm_provider, + llm_model_name, + llm_temperature, + llm_base_url, + llm_api_key, + use_own_browser, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + task, + add_infos, + max_steps, + use_vision, ): # Ensure the recording directory exists os.makedirs(save_recording_path, exist_ok=True) # Get the list of existing videos before the agent runs - existing_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) + - glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]'))) + existing_videos = set( + glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + ) # Run the agent llm = utils.get_llm_model( @@ -67,7 +61,7 @@ async def run_browser_agent( model_name=llm_model_name, temperature=llm_temperature, base_url=llm_base_url, - api_key=llm_api_key + api_key=llm_api_key, ) if agent_type == "org": final_result, errors, model_actions, model_thoughts = await run_org_agent( @@ -79,7 +73,7 @@ async def run_browser_agent( save_recording_path=save_recording_path, task=task, max_steps=max_steps, - use_vision=use_vision + use_vision=use_vision, ) elif agent_type == "custom": final_result, errors, model_actions, model_thoughts = await run_custom_agent( @@ -93,14 +87,16 @@ async def run_browser_agent( task=task, add_infos=add_infos, max_steps=max_steps, - use_vision=use_vision + use_vision=use_vision, ) else: raise ValueError(f"Invalid agent type: {agent_type}") # Get the list of videos after the agent runs - new_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) + - glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]'))) + new_videos = set( + glob.glob(os.path.join(save_recording_path, "*.[mM][pP]4")) + + glob.glob(os.path.join(save_recording_path, "*.[wW][eE][bB][mM]")) + ) # Find the newly created video latest_video = None @@ -109,31 +105,34 @@ async def run_browser_agent( return final_result, errors, 
model_actions, model_thoughts, latest_video + async def run_org_agent( - llm, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - task, - max_steps, - use_vision + llm, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + task, + max_steps, + use_vision, ): browser = Browser( config=BrowserConfig( headless=headless, disable_security=disable_security, - extra_chromium_args=[f'--window-size={window_w},{window_h}'], + extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) async with await browser.new_context( - config=BrowserContextConfig( - trace_path='./tmp/traces', - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), - ) + config=BrowserContextConfig( + trace_path="./tmp/traces", + save_recording_path=save_recording_path if save_recording_path else None, + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h + ), + ) ) as browser_context: agent = Agent( task=task, @@ -150,18 +149,19 @@ async def run_org_agent( await browser.close() return final_result, errors, model_actions, model_thoughts + async def run_custom_agent( - llm, - use_own_browser, - headless, - disable_security, - window_w, - window_h, - save_recording_path, - task, - add_infos, - max_steps, - use_vision + llm, + use_own_browser, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + task, + add_infos, + max_steps, + use_vision, ): controller = CustomController() playwright = None @@ -177,14 +177,14 @@ async def run_custom_agent( no_viewport=False, headless=headless, # 保持浏览器窗口可见 user_agent=( - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' - '(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36' + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36" ), 
java_script_enabled=True, bypass_csp=disable_security, ignore_https_errors=disable_security, record_video_dir=save_recording_path if save_recording_path else None, - record_video_size={'width': window_w, 'height': window_h} + record_video_size={"width": window_w, "height": window_h}, ) else: browser_context_ = None @@ -193,17 +193,21 @@ async def run_custom_agent( config=BrowserConfig( headless=headless, disable_security=disable_security, - extra_chromium_args=[f'--window-size={window_w},{window_h}'], + extra_chromium_args=[f"--window-size={window_w},{window_h}"], ) ) async with await browser.new_context( - config=BrowserContextConfig( - trace_path='./tmp/result_processing', - save_recording_path=save_recording_path if save_recording_path else None, - no_viewport=False, - browser_window_size=BrowserContextWindowSize(width=window_w, height=window_h), + config=BrowserContextConfig( + trace_path="./tmp/result_processing", + save_recording_path=save_recording_path + if save_recording_path + else None, + no_viewport=False, + browser_window_size=BrowserContextWindowSize( + width=window_w, height=window_h ), - context=browser_context_ + ), + context=browser_context_, ) as browser_context: agent = CustomAgent( task=task, @@ -212,7 +216,7 @@ async def run_custom_agent( llm=llm, browser_context=browser_context, controller=controller, - system_prompt_class=CustomSystemPrompt + system_prompt_class=CustomSystemPrompt, ) history = await agent.run(max_steps=max_steps) @@ -223,6 +227,7 @@ async def run_custom_agent( except Exception as e: import traceback + traceback.print_exc() final_result = "" errors = str(e) + "\n" + traceback.format_exc() @@ -240,10 +245,9 @@ async def run_custom_agent( return final_result, errors, model_actions, model_thoughts -import argparse -import gradio as gr -from gradio.themes import Base, Default, Soft, Monochrome, Glass, Origin, Citrus, Ocean -import os, glob +import glob + +from gradio.themes import Citrus, Default, Glass, Monochrome, Ocean, 
Origin, Soft # Define the theme map globally theme_map = { @@ -253,9 +257,10 @@ theme_map = { "Glass": Glass(), "Origin": Origin(), "Citrus": Citrus(), - "Ocean": Ocean() + "Ocean": Ocean(), } + def create_ui(theme_name="Ocean"): css = """ .gradio-container { @@ -283,25 +288,27 @@ def create_ui(theme_name="Ocean"): } } """ - - with gr.Blocks(title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js) as demo: + + with gr.Blocks( + title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js + ) as demo: with gr.Row(): gr.Markdown( """ # 🌐 Browser Use WebUI ### Control your browser with AI assistance """, - elem_classes=["header-text"] + elem_classes=["header-text"], ) - + with gr.Tabs() as tabs: - with gr.TabItem("🤖 Agent Settings", id=1): + with gr.TabItem("⚙️ Agent Settings", id=1): with gr.Group(): agent_type = gr.Radio( ["org", "custom"], label="Agent Type", value="custom", - info="Select the type of agent to use" + info="Select the type of agent to use", ) max_steps = gr.Slider( minimum=1, @@ -309,26 +316,33 @@ def create_ui(theme_name="Ocean"): value=100, step=1, label="Max Run Steps", - info="Maximum number of steps the agent will take" + info="Maximum number of steps the agent will take", ) use_vision = gr.Checkbox( label="Use Vision", value=True, - info="Enable visual processing capabilities" + info="Enable visual processing capabilities", ) with gr.TabItem("🔧 LLM Configuration", id=2): with gr.Group(): llm_provider = gr.Dropdown( - ["anthropic", "openai", "gemini", "azure_openai", "deepseek", "ollama"], + [ + "anthropic", + "openai", + "gemini", + "azure_openai", + "deepseek", + "ollama", + ], label="LLM Provider", - value="gemini", - info="Select your preferred language model provider" + value="openai", + info="Select your preferred language model provider", ) llm_model_name = gr.Textbox( label="Model Name", - value="gemini-2.0-flash-exp", - info="Specify the model to use" + value="gpt-4o", + info="Specify the model to use", ) 
llm_temperature = gr.Slider( minimum=0.0, @@ -336,17 +350,14 @@ def create_ui(theme_name="Ocean"): value=1.0, step=0.1, label="Temperature", - info="Controls randomness in model outputs" + info="Controls randomness in model outputs", ) with gr.Row(): llm_base_url = gr.Textbox( - label="Base URL", - info="API endpoint URL (if required)" + label="Base URL", info="API endpoint URL (if required)" ) llm_api_key = gr.Textbox( - label="API Key", - type="password", - info="Your API key" + label="API Key", type="password", info="Your API key" ) with gr.TabItem("🌐 Browser Settings", id=3): @@ -355,51 +366,51 @@ def create_ui(theme_name="Ocean"): use_own_browser = gr.Checkbox( label="Use Own Browser", value=False, - info="Use your existing browser instance" + info="Use your existing browser instance", ) headless = gr.Checkbox( label="Headless Mode", value=False, - info="Run browser without GUI" + info="Run browser without GUI", ) disable_security = gr.Checkbox( label="Disable Security", value=True, - info="Disable browser security features" + info="Disable browser security features", ) - + with gr.Row(): window_w = gr.Number( label="Window Width", - value=1920, - info="Browser window width" + value=1280, + info="Browser window width", ) window_h = gr.Number( label="Window Height", - value=1080, - info="Browser window height" + value=1100, + info="Browser window height", ) - + save_recording_path = gr.Textbox( label="Recording Path", placeholder="e.g. 
./tmp/record_videos", value="./tmp/record_videos", - info="Path to save browser recordings" + info="Path to save browser recordings", ) - with gr.TabItem("📝 Task Settings", id=4): + with gr.TabItem("🤖 Run Agent", id=4): task = gr.Textbox( label="Task Description", lines=4, placeholder="Enter your task here...", value="go to google.com and type 'OpenAI' click search and give me the first url", - info="Describe what you want the agent to do" + info="Describe what you want the agent to do", ) add_infos = gr.Textbox( label="Additional Information", lines=3, placeholder="Add any helpful context or instructions...", - info="Optional hints to help the LLM complete the task" + info="Optional hints to help the LLM complete the task", ) with gr.Row(): @@ -414,54 +425,74 @@ def create_ui(theme_name="Ocean"): with gr.Row(): with gr.Column(): final_result_output = gr.Textbox( - label="Final Result", - lines=3, - show_label=True + label="Final Result", lines=3, show_label=True ) with gr.Column(): errors_output = gr.Textbox( - label="Errors", - lines=3, - show_label=True + label="Errors", lines=3, show_label=True ) with gr.Row(): with gr.Column(): model_actions_output = gr.Textbox( - label="Model Actions", - lines=3, - show_label=True + label="Model Actions", lines=3, show_label=True ) with gr.Column(): model_thoughts_output = gr.Textbox( - label="Model Thoughts", - lines=3, - show_label=True + label="Model Thoughts", lines=3, show_label=True ) # Run button click handler run_button.click( fn=run_browser_agent, inputs=[ - agent_type, llm_provider, llm_model_name, llm_temperature, - llm_base_url, llm_api_key, use_own_browser, headless, - disable_security, window_w, window_h, save_recording_path, - task, add_infos, max_steps, use_vision + agent_type, + llm_provider, + llm_model_name, + llm_temperature, + llm_base_url, + llm_api_key, + use_own_browser, + headless, + disable_security, + window_w, + window_h, + save_recording_path, + task, + add_infos, + max_steps, + use_vision, + ], + 
outputs=[ + final_result_output, + errors_output, + model_actions_output, + model_thoughts_output, + recording_display, ], - outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output, recording_display] ) return demo + def main(): parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent") - parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to") + parser.add_argument( + "--ip", type=str, default="127.0.0.1", help="IP address to bind to" + ) parser.add_argument("--port", type=int, default=7788, help="Port to listen on") - parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI") + parser.add_argument( + "--theme", + type=str, + default="Ocean", + choices=theme_map.keys(), + help="Theme to use for the UI", + ) parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode") args = parser.parse_args() demo = create_ui(theme_name=args.theme) demo.launch(server_name=args.ip, server_port=args.port) -if __name__ == '__main__': + +if __name__ == "__main__": main()