mirror of
https://github.com/microsoft/OmniParser.git
synced 2025-02-18 03:18:33 +03:00
qwen2.5vl
This commit is contained in:
@@ -2,7 +2,7 @@ from groq import Groq
|
||||
import os
|
||||
from .utils import is_image_path
|
||||
|
||||
def run_groq_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0.6):
|
||||
def run_groq_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0.6):
|
||||
"""
|
||||
Run a chat completion through Groq's API, ignoring any images in the messages.
|
||||
"""
|
||||
|
||||
@@ -4,11 +4,7 @@ import base64
|
||||
import requests
|
||||
from .utils import is_image_path, encode_image
|
||||
|
||||
def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
|
||||
api_key = api_key or os.environ.get("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY is not set")
|
||||
|
||||
def run_oai_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens=256, temperature=0, provider_base_url: str = "https://api.openai.com/v1"):
|
||||
headers = {"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {api_key}"}
|
||||
|
||||
@@ -43,20 +39,21 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
|
||||
final_messages = [{"role": "user", "content": messages}]
|
||||
|
||||
payload = {
|
||||
"model": llm,
|
||||
"model": model_name,
|
||||
"messages": final_messages,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature
|
||||
}
|
||||
|
||||
response = requests.post(
|
||||
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
|
||||
f"{provider_base_url}/chat/completions", headers=headers, json=payload
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
text = response.json()['choices'][0]['message']['content']
|
||||
token_usage = int(response.json()['usage']['total_tokens'])
|
||||
return text, token_usage
|
||||
except Exception as e:
|
||||
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
|
||||
print(f"Error in interleaved openAI: {e}. This may due to your invalid API key. Please check the response: {response.json()} ")
|
||||
return response.json()
|
||||
@@ -42,6 +42,8 @@ class VLMAgent:
|
||||
self.model = "gpt-4o-2024-11-20"
|
||||
elif model == "omniparser + R1":
|
||||
self.model = "deepseek-r1-distill-llama-70b"
|
||||
elif model == "omniparser + qwen2.5vl":
|
||||
self.model = "qwen2.5-vl-72b-instruct"
|
||||
else:
|
||||
raise ValueError(f"Model {model} not supported")
|
||||
|
||||
@@ -93,9 +95,10 @@ class VLMAgent:
|
||||
vlm_response, token_usage = run_oai_interleaved(
|
||||
messages=planner_messages,
|
||||
system=system,
|
||||
llm=self.model,
|
||||
model_name=self.model,
|
||||
api_key=self.api_key,
|
||||
max_tokens=self.max_tokens,
|
||||
provider_base_url="https://api.openai.com/v1",
|
||||
temperature=0,
|
||||
)
|
||||
print(f"oai token usage: {token_usage}")
|
||||
@@ -106,13 +109,26 @@ class VLMAgent:
|
||||
vlm_response, token_usage = run_groq_interleaved(
|
||||
messages=planner_messages,
|
||||
system=system,
|
||||
llm=self.model,
|
||||
model_name=self.model,
|
||||
api_key=self.api_key,
|
||||
max_tokens=self.max_tokens,
|
||||
)
|
||||
print(f"groq token usage: {token_usage}")
|
||||
self.total_token_usage += token_usage
|
||||
self.total_cost += (token_usage * 0.99 / 1000000)
|
||||
elif "qwen" in self.model:
|
||||
vlm_response, token_usage = run_oai_interleaved(
|
||||
messages=planner_messages,
|
||||
system=system,
|
||||
model_name=self.model,
|
||||
api_key=self.api_key,
|
||||
max_tokens=min(2048, self.max_tokens),
|
||||
provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
temperature=0,
|
||||
)
|
||||
print(f"qwen token usage: {token_usage}")
|
||||
self.total_token_usage += token_usage
|
||||
self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a
|
||||
else:
|
||||
raise ValueError(f"Model {self.model} not supported")
|
||||
latency_vlm = time.time() - start
|
||||
|
||||
@@ -28,7 +28,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key"
|
||||
INTRO_TEXT = '''
|
||||
🚀🤖✨ It's Play Time!
|
||||
|
||||
Welcome to the OmniParser+X Computer Use Demo! X = [GPT-4o, R1, Claude]. Let OmniParser turn your general purpose vision-langauge model to an AI agent.
|
||||
Welcome to the OmniParser+X Computer Use Demo! X = [GPT-4o, R1, Qwen2.5VL, Claude]. Let OmniParser turn your general purpose vision-langauge model to an AI agent.
|
||||
|
||||
Type a message and press submit to start OmniParser+X. Press the trash icon in the chat to clear the message history.
|
||||
'''
|
||||
@@ -189,7 +189,7 @@ def valid_params(user_input, state):
|
||||
"""Validate all requirements and return a list of error messages."""
|
||||
errors = []
|
||||
|
||||
for server_name, url in [('Windows Host', args.windows_host_url), ('OmniParser Server', args.omniparser_server_url)]:
|
||||
for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]:
|
||||
try:
|
||||
url = f'http://{url}/probe'
|
||||
response = requests.get(url, timeout=3)
|
||||
@@ -270,7 +270,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
||||
with gr.Column():
|
||||
model = gr.Dropdown(
|
||||
label="Model",
|
||||
choices=["omniparser + gpt-4o", "omniparser + R1", "claude-3-5-sonnet-20241022"],
|
||||
choices=["omniparser + gpt-4o", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022"],
|
||||
value="omniparser + gpt-4o",
|
||||
interactive=True,
|
||||
)
|
||||
@@ -326,6 +326,8 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
|
||||
provider_choices = ["openai"]
|
||||
elif model_selection == "omniparser + R1":
|
||||
provider_choices = ["groq"]
|
||||
elif model_selection == "omniparser + qwen2.5vl":
|
||||
provider_choices = ["dashscope"]
|
||||
else:
|
||||
provider_choices = [option.value for option in APIProvider]
|
||||
default_provider_value = provider_choices[0]
|
||||
|
||||
@@ -64,7 +64,7 @@ def sampling_loop_sync(
|
||||
max_tokens=max_tokens,
|
||||
only_n_most_recent_images=only_n_most_recent_images
|
||||
)
|
||||
elif model == "omniparser + gpt-4o" or model == "omniparser + R1":
|
||||
elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl":
|
||||
actor = VLMAgent(
|
||||
model=model,
|
||||
provider=provider,
|
||||
@@ -100,7 +100,7 @@ def sampling_loop_sync(
|
||||
|
||||
messages.append({"content": tool_result_content, "role": "user"})
|
||||
|
||||
elif model == "omniparser + gpt-4o" or model == "omniparser + R1":
|
||||
elif model == "omniparser + gpt-4o" or model == "omniparser + R1" or model == "omniparser + qwen2.5vl":
|
||||
while True:
|
||||
parsed_screen = omniparser_client()
|
||||
tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen)
|
||||
|
||||
Reference in New Issue
Block a user