code cleanup

Thomas Dhome-Casanova
2025-01-29 23:01:14 -08:00
parent f6029344c5
commit b2d6bc5c3e
6 changed files with 13 additions and 92 deletions

View File

@@ -159,24 +159,4 @@ def _maybe_filter_to_n_most_recent_images(
images_to_remove -= 1
continue
new_content.append(content)
tool_result["content"] = new_content
if __name__ == "__main__":
pass
# client = Anthropic(api_key="")
# response = client.beta.messages.with_raw_response.create(
# max_tokens=4096,
# model="claude-3-5-sonnet-20241022",
# system=SYSTEM_PROMPT,
# # tools=ToolCollection(
# # ComputerTool(),
# # ).to_params(),
# betas=["computer-use-2024-10-22"],
# messages=[
# {"role": "user", "content": "click on (199, 199)."}
# ],
# )
# print(f"AnthropicActor response: {response.parse().usage.input_tokens+response.parse().usage.output_tokens}")
tool_result["content"] = new_content

View File

@@ -1,11 +1,9 @@
import os
import logging
import base64
import requests
def is_image_path(text):
# Checking if the input text ends with typical image file extensions
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
if text.endswith(image_extensions):
return True
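As the comment notes, is_image_path only inspects the filename suffix. A quick illustrative check (paths are made up, and it is assumed the non-image branch, cut off by this hunk, returns False):

    print(is_image_path("./tmp/outputs/screenshot_0.png"))   # True - ends with .png
    print(is_image_path("What is in the screenshot?"))       # False - no image extension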
@@ -28,7 +26,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
final_messages = [{"role": "system", "content": system}]
# image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
if type(messages) == list:
for item in messages:
contents = []
@@ -56,7 +53,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
elif isinstance(messages, str):
final_messages = [{"role": "user", "content": messages}]
# import pdb; pdb.set_trace()
print("[oai] sending messages:", {"role": "user", "content": messages})
@@ -64,12 +60,9 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
"model": llm,
"messages": final_messages,
"max_tokens": max_tokens,
"temperature": temperature,
# "stop": stop,
"temperature": temperature
}
# from IPython.core.debugger import Pdb; Pdb().set_trace()
response = requests.post(
"https://api.openai.com/v1/chat/completions", headers=headers, json=payload
)
@@ -78,30 +71,6 @@ def run_oai_interleaved(messages: list, system: str, llm: str, api_key: str, max
text = response.json()['choices'][0]['message']['content']
token_usage = int(response.json()['usage']['total_tokens'])
return text, token_usage
# return error message if the response is not successful
except Exception as e:
print(f"Error in interleaved openAI: {e}. This may due to your invalid OPENAI_API_KEY. Please check the response: {response.json()} ")
return response.json()
if __name__ == "__main__":
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY is not set")
text, token_usage = run_oai_interleaved(
messages= [{"content": [
"What is in the screenshot?",
"./tmp/outputs/screenshot_0b04acbb783d4706bc93873d17ba8c05.png"],
"role": "user"
}],
llm="gpt-4o-mini",
system="You are a helpful assistant",
api_key=api_key,
max_tokens=256,
temperature=0)
print(text, token_usage)
# There is an introduction describing the Calyx... 36986
return response.json()
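The hunk above also drops the module's inline self-test. An equivalent call to run_oai_interleaved, mirroring the deleted example (sketch: the screenshot path is illustrative and OPENAI_API_KEY must be set in the environment):

    import os

    text, token_usage = run_oai_interleaved(
        messages=[{"role": "user",
                   "content": ["What is in the screenshot?",
                               "./tmp/outputs/example_screenshot.png"]}],  # illustrative path
        llm="gpt-4o-mini",
        system="You are a helpful assistant",
        api_key=os.environ["OPENAI_API_KEY"],
        max_tokens=256,
        temperature=0,
    )
    print(text, token_usage)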

View File

@@ -1,6 +1,5 @@
"""
Entrypoint for Gradio, see https://gradio.app/
python app.py --windows_host_url xxxx:8006/ --omniparser_host_url localhost:8000
python app.py --windows_host_url localhost:8006/ --omniparser_server_url localhost:8000
"""
import os
@@ -35,13 +34,9 @@ Type a message and press submit to start OmniParser+X. Press the trash icon in t
def parse_arguments():
parser = argparse.ArgumentParser(description="Gradio App")
parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
parser.add_argument("--omniparser_host_url", type=str, default="localhost:8000")
parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000")
return parser.parse_args()
args = parse_arguments()
windows_host_url = args.windows_host_url
omniparser_host_url = args.omniparser_host_url
print(f"Windows host URL: {windows_host_url}")
print(f"OmniParser host URL: {omniparser_host_url}")
class Sender(StrEnum):
@@ -140,7 +135,6 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b
is_tool_result = not isinstance(message, str) and (
isinstance(message, ToolResult)
or message.__class__.__name__ == "ToolResult"
or message.__class__.__name__ == "CLIResult"
)
if not message or (
is_tool_result
@@ -214,7 +208,7 @@ def process_input(user_input, state):
api_response_callback=partial(_api_response_callback, response_state=state["responses"]),
api_key=state["api_key"],
only_n_most_recent_images=state["only_n_most_recent_images"],
omniparser_url=omniparser_host_url
omniparser_url=args.omniparser_server_url
):
if loop_msg is None:
yield state['chatbot_messages']
@@ -289,20 +283,11 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
with gr.Column(scale=1):
chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580)
with gr.Column(scale=3):
if not windows_host_url:
iframe = gr.HTML(
f'<iframe src="http://localhost:8006/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
container=False,
elem_classes="no-padding"
)
else:
# machine_fqdn = socket.getfqdn()
# print('machine_fqdn:', machine_fqdn)
iframe = gr.HTML(
f'<iframe src="http://{windows_host_url}/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
container=False,
elem_classes="no-padding"
)
iframe = gr.HTML(
f'<iframe src="http://{args.windows_host_url}/vnc.html?view_only=1&autoconnect=1&resize=scale" width="100%" height="580" allow="fullscreen"></iframe>',
container=False,
elem_classes="no-padding"
)
def update_model(model_selection, state):
state["model"] = model_selection

View File

@@ -96,9 +96,7 @@ def sampling_loop_sync(
if model == "claude-3-5-sonnet-20241022": # Anthropic loop
while True:
parsed_screen = omniparser_client() # parsed_screen: {"som_image_base64": dino_labled_img, "parsed_content_list": parsed_content_list, "screen_info"}
import pdb; pdb.set_trace()
screen_info_block = TextBlock(text='Below is the structured accessibility information of the current UI screen, which includes text and icons you can operate on, take these information into account when you are making the prediction for the next action. Note you will still need to take screenshot to get the image: \n' + parsed_screen['screen_info'], type='text')
# # messages[-1]['content'].append(screen_info_block)
screen_info_dict = {"role": "user", "content": [screen_info_block]}
messages.append(screen_info_dict)
tools_use_needed = actor(messages=messages)
@@ -120,10 +118,4 @@ def sampling_loop_sync(
yield message
if not tool_result_content:
return messages
# import pdb; pdb.set_trace()
# messages.append({"role": "user",
# "content": ["History plan:\n" + str(vlm_response_json['Reasoning'])]})
# messages.append({"content": tool_result_content, "role": "user"})
return messages

View File

@@ -1,10 +1,9 @@
from .base import CLIResult, ToolResult
from .base import ToolResult
from .collection import ToolCollection
from .computer import ComputerTool
from .screen_capture import get_screenshot
__ALL__ = [
CLIResult,
ComputerTool,
ToolCollection,
ToolResult,

View File

@@ -54,10 +54,6 @@ class ToolResult:
return replace(self, **kwargs)
class CLIResult(ToolResult):
"""A ToolResult that can be rendered as a CLI output."""
class ToolFailure(ToolResult):
"""A ToolResult that represents a failure."""