[Agent, Eval] Fixes LLM config issue for delegation & Add eval to measure the delegation accuracy (#2948)

* fix json import

* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose

* add inference script for browser delegation

* add readme

* Update agenthub/codeact_agent/action_parser.py

Co-authored-by: Graham Neubig <neubig@gmail.com>

* revert action parser changes.

* Rework --llm-config CLI arg

* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"

This reverts commit 81034c486e.

* remove view summary

* update readme

* update comment

* update readme

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Xingyao Wang authored 2024-07-16 23:51:29 +08:00, committed by GitHub
parent f5a4fb80a3
commit f45a2ff04e
4 changed files with 261 additions and 1 deletion


@@ -0,0 +1,51 @@
# Browsing Delegation Evaluation

Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
This evaluation tests whether CodeActAgent correctly delegates instructions from the WebArena and MiniWob benchmarks to BrowsingAgent.
If delegation works correctly, the upper bound of CodeActAgent's browsing performance is the performance of BrowsingAgent itself.
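The evaluation prompt in `run_infer.py` (added below) asks CodeActAgent to copy the benchmark query verbatim into an `<execute_browse>` block, so a correct delegation for the script's own example query would look roughly like:

```
<execute_browse> Who is the president of the United States? </execute_browse>
```

Each instance is then scored by exact match and edit distance between the original instruction and the task CodeActAgent actually handed to BrowsingAgent.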
## Setup Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:

```toml
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Run Inference

```bash
./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
```

where `model_config` is mandatory, while `agent` and `eval_limit` are optional.

`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
LLM settings, as defined in your `config.toml`.

`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.

`agent`, e.g. `CodeActAgent`, is the name of the agent to run the benchmark with, defaulting
to `CodeActAgent`.

`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
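Each instance's result is appended to `output.jsonl` under the evaluation output directory created by `run_infer.py`. Below is a minimal sketch for summarizing delegation accuracy from that file, assuming the `test_result` layout written by the script; the file path is just an example, so point it at your actual output directory.

```python
import json

# Summarize delegation accuracy from the evaluation output (path is an example).
with open('output.jsonl') as f:
    rows = [json.loads(line) for line in f if line.strip()]

results = [r['test_result']['result'] for r in rows]
exact_matches = [r.get('is_exact_match', False) for r in results]
distances = [r['edit_distance'] for r in results if 'edit_distance' in r]

print(f'instances: {len(rows)}')
print(f'exact-match rate: {sum(exact_matches) / max(len(rows), 1):.2%}')
if distances:
    print(f'mean edit distance: {sum(distances) / len(distances):.1f}')
```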


@@ -0,0 +1,164 @@
import asyncio
import logging
import os
import re

import nltk
import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM

# Only CodeActAgent can delegate to BrowsingAgent
SUPPORTED_AGENT_CLS = {'CodeActAgent'}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======
    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # find the last delegate action
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action
            instruction_for_delegate = action['args']['inputs']['task']
            # parse `browse_actions` from `instruction_for_delegate`
            # task = f'{thought}. I should start with: {browse_actions}'
            instruction_for_delegate = re.search(
                r'I should start with: (.*)', instruction_for_delegate
            ).group(1)
            # calculate the edit distance between the instance.instruction and the instruction_for_delegate
            edit_distance = nltk.edit_distance(
                instance.instruction, instruction_for_delegate
            )
            is_exact_match = (
                instance.instruction.strip() == instruction_for_delegate.strip()
            )
            result['edit_distance'] = edit_distance
            result['is_exact_match'] = is_exact_match

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }
    return output


if __name__ == '__main__':
    args = parse_arguments()

    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
    id_column = 'instance_id'

    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )


@@ -0,0 +1,45 @@
#!/bin/bash
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND


@@ -1,11 +1,11 @@
import asyncio
import json
import threading
from datetime import datetime
from enum import Enum
from typing import Callable, Iterable
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.utils import json
from opendevin.events.serialization.event import event_from_dict, event_to_dict
from opendevin.storage import FileStore, get_file_store