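"""Qwen-VL client helper for the OmniParser computer-use demo.

Wraps DashScope's `MultiModalConversation` API: builds multimodal messages
from text and image-path contents and returns the model reply together with
its token usage.
"""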
import os
import logging
import base64

import requests
import dashscope

# from computer_use_demo.gui_agent.llm_utils import is_image_path, encode_image


def is_image_path(text):
    # Local stand-in for computer_use_demo.gui_agent.llm_utils.is_image_path:
    # treat strings ending in a common image extension as image paths.
    return isinstance(text, str) and text.lower().endswith(
        (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp")
    )


def encode_image(image_path):
    # Local stand-in: read the file and return it as a base64-encoded string.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def run_qwen(messages: list, system: str, llm: str, api_key: str, max_tokens=256, temperature=0):
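    """Send `messages` to Qwen-VL via DashScope.

    `messages` may be a list of plain strings or of {"role", "content"} dicts;
    string contents that look like image paths are sent as images. Returns
    (reply_text, token_usage), or the raw response object if the call fails.
    Note: `llm`, `max_tokens` and `temperature` are currently unused; the
    model is hardcoded to qwen-vl-max-0809.
    """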
    api_key = api_key or os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    # DashScope expects each message's content as a list of {"text": ...} / {"image": ...} dicts.
    final_messages = [{"role": "system", "content": [{"text": system}]}]
    # Callers may pass plain strings or {"role", "content"} dicts; normalise both.
    if isinstance(messages, list):
        for item in messages:
            contents = []
            if isinstance(item, dict):
                for cnt in item["content"]:
                    if isinstance(cnt, str):
                        if is_image_path(cnt):
                            # DashScope accepts a local image path directly.
                            content = {"image": cnt}
                        else:
                            content = {"text": cnt}
                        contents.append(content)

                message = {"role": item["role"], "content": contents}
            else:  # plain string
                contents.append({"text": item})
                message = {"role": "user", "content": contents}

            final_messages.append(message)

    print("[qwen-vl] sending messages:", final_messages)
    response = dashscope.MultiModalConversation.call(
        model='qwen-vl-max-0809',
        messages=final_messages,
    )

    try:
        text = response.output.choices[0].message.content[0]['text']
        usage = response.usage

        if "total_tokens" not in usage:
            token_usage = int(usage["input_tokens"] + usage["output_tokens"])
        else:
            token_usage = int(usage["total_tokens"])

        return text, token_usage
    except Exception as e:
        # Return the raw response so the caller can inspect what went wrong.
        print(f"Error in Qwen-VL call: {e}. This may be due to an invalid QWEN_API_KEY. Please check the response: {response}")
        return response


if __name__ == "__main__":
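    # Manual smoke test: send a local screenshot to Qwen-VL and print the reply.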
    api_key = os.environ.get("QWEN_API_KEY")
    if not api_key:
        raise ValueError("QWEN_API_KEY is not set")

    dashscope.api_key = api_key

    final_messages = [
        {
            "role": "user",
            "content": [
                {"text": "What is in the screenshot?"},
                {"image": "./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"},
            ],
        }
    ]
    response = dashscope.MultiModalConversation.call(model='qwen-vl-max-0809', messages=final_messages)

    print(response)

    text = response.output.choices[0].message.content[0]['text']
    usage = response.usage

    if "total_tokens" not in usage:
        if "image_tokens" in usage:
            token_usage = usage["input_tokens"] + usage["output_tokens"] + usage["image_tokens"]
        else:
            token_usage = usage["input_tokens"] + usage["output_tokens"]
    else:
        token_usage = usage["total_tokens"]

    print(text, token_usage)
    # Example output: The screenshot is from a video game... 1387