Mirror of https://github.com/All-Hands-AI/OpenHands.git (synced 2024-08-29 01:18:33 +03:00)
[Agent, Eval] Fixes LLM config issue for delegation & Add eval to measure the delegation accuracy (#2948)
* fix json import
* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose
* add inference script for browser delegation
* add readme
* Update agenthub/codeact_agent/action_parser.py
Co-authored-by: Graham Neubig <neubig@gmail.com>
* revert action parser changes.
* Rework --llm-config CLI arg
* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"
This reverts commit 81034c486e.
* remove view summary
* update readme
* update comment
* update readme
---------
Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
evaluation/browsing_delegation/README.md (new file, 51 lines)
@@ -0,0 +1,51 @@
# Browsing Delegation Evaluation

Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.

This evaluation tests whether CodeActAgent can correctly delegate instructions from the WebArena and MiniWoB benchmarks to BrowsingAgent.
If it can, the browsing performance upper bound of CodeActAgent is the performance of BrowsingAgent.
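
Concretely, the inference script below instructs CodeActAgent to hand a query to the browser agent by emitting an `<execute_browse>` block, e.g.:

```
<execute_browse> Who is the president of the United States? </execute_browse>
```
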
## Setup Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.

Add the following configurations:

```toml
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```

## Run Inference

```bash
./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
```

where `model_config` is mandatory, while `agent` and `eval_limit` are optional.

`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your LLM settings, as defined in your `config.toml`.

`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would like to evaluate. It could also be a release tag like `0.6.2`.

`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting to `CodeActAgent`.

`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
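
Each run writes one JSON line per instance to `output.jsonl` under the evaluation output directory. The `test_result.result` field records the `edit_distance` and `is_exact_match` between the original query and the instruction the agent actually delegated (it stays empty if the agent never delegated). A minimal sketch for summarizing delegation accuracy, assuming you point it at the `output.jsonl` produced by the script:

```python
import json

num_total = 0
num_exact = 0
# Path is an assumption: use the output.jsonl location printed by run_infer.sh
with open('output.jsonl') as f:
    for line in f:
        record = json.loads(line)
        result = record['test_result']['result']
        num_total += 1
        # `result` is an empty dict when no delegate action was emitted
        num_exact += int(bool(result.get('is_exact_match')))

print(f'Exact-match delegation accuracy: {num_exact}/{num_total}')
```
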
evaluation/browsing_delegation/run_infer.py (new file, 164 lines)
@@ -0,0 +1,164 @@
import asyncio
import logging
import os
import re

import nltk
import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM

# Only CodeActAgent can delegate to BrowsingAgent
SUPPORTED_AGENT_CLS = {'CodeActAgent'}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id
    # Set up the logger properly, so you can run multiprocessing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # Add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all handlers again (including the console handler) so logs go only to the file
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
    # History is now available as a stream of events, rather than a list of pairs of (Action, Observation);
    # for compatibility with the existing output format, we remake the pairs here.
    # Remove this when it becomes unnecessary.
    histories = state.history.compatibility_for_eval_history_pairs()

    # Find the last delegate action
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action
            instruction_for_delegate = action['args']['inputs']['task']
            # parse `browse_actions` from `instruction_for_delegate`:
            # task = f'{thought}. I should start with: {browse_actions}'
            instruction_for_delegate = re.search(
                r'I should start with: (.*)', instruction_for_delegate
            ).group(1)
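            # NOTE: this assumes the delegated task matches the template in the
            # comment above; otherwise re.search returns None and the .group(1)
            # call raises an AttributeError for that instance.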

            # calculate the edit distance between the instance.instruction and the instruction_for_delegate
            edit_distance = nltk.edit_distance(
                instance.instruction, instruction_for_delegate
            )
            is_exact_match = (
                instance.instruction.strip() == instruction_for_delegate.strip()
            )
            result['edit_distance'] = edit_distance
            result['is_exact_match'] = is_exact_match

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }

    return output


if __name__ == '__main__':
    args = parse_arguments()

    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
    id_column = 'instance_id'
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )

evaluation/browsing_delegation/scripts/run_infer.sh (new executable file, 45 lines)
@@ -0,0 +1,45 @@
#!/bin/bash
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, using default: $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, using default: CodeActAgent"
  AGENT="CodeActAgent"
fi

get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

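# NOTE: max-iterations is set to 1 since this eval only needs the agent's first
# action, i.e. whether it correctly delegates the query to the browsing agent.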
COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 1 \
|
||||
--max-chars 10000000 \
|
||||
--eval-num-workers $NUM_WORKERS \
|
||||
--eval-note $EVAL_NOTE"
|
||||
|
||||
if [ -n "$EVAL_LIMIT" ]; then
|
||||
echo "EVAL_LIMIT: $EVAL_LIMIT"
|
||||
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
|
||||
fi
|
||||
|
||||
# Run the command
|
||||
eval $COMMAND
|
||||
Finally, per the "fix json import" item in the commit message, the stdlib `import json` is replaced with OpenDevin's `json` utility:

@@ -1,11 +1,11 @@
 import asyncio
-import json
 import threading
 from datetime import datetime
 from enum import Enum
 from typing import Callable, Iterable

 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.utils import json
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.storage import FileStore, get_file_store