Mirror of https://github.com/All-Hands-AI/OpenHands.git (synced 2024-08-29 01:18:33 +03:00)
(eval) Aider_bench: add eval_ids arg to run specific instance IDs (#3592)
* add `eval_ids` arg to run specific instance IDs; fix/extend README
* fix description in parser for `--eval-ids`
* fix `test_arg_parser.py` to account for the added arg
* fix typo in README to say "summarize" instead of "summarise" for the script
@@ -16,42 +16,49 @@ development environment and LLM.
## Start the evaluation

```bash
-./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
```

-- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
  your LLM settings, as defined in your `config.toml`.
-- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
-  you would like to evaluate. It could also be a release tag like `0.6.2`.
-- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+  you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
  defaulting to `CodeActAgent`.
-- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
  instances. By default, the script evaluates the entire Exercism test set
  (133 issues). Note: in order to use `eval_limit`, you must also set `agent`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
+  given IDs (comma separated).

Following is the basic command to start the evaluation.

You can update the arguments in the script
-`evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`,
-`--eval-num-workers` and so on.
+`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`,
+`--eval-num-workers` and so on:

-- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
-- `--llm-config`: the LLM configuration to use. For example,
-  `eval_gpt4_1106_preview`.
-- `--max-iterations`: the number of iterations to run the evaluation. For
-  example, `30`.
-- `--eval-num-workers`: the number of workers to use for evaluation. For
-  example, `5`.
-- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
+- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
+- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`.
+- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.

```bash
-./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
+./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
```

## Summarize Results

```bash
-poetry run python ./evaluation/agent_bench/scripts/summarise_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
```

Full example:

```bash
poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
```

This will list the instances that passed and the instances that failed. For each
@@ -245,7 +245,16 @@ if __name__ == '__main__':
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(aider_bench_tests, output_file, args.eval_n_limit)
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'Using specific dataset IDs: {eval_ids}')
+
+    instances = prepare_dataset(
+        aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids
+    )

    asyncio.run(
        run_evaluation(
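For orientation, here is a minimal, self-contained sketch of the path the new flag takes: `--eval-ids` arrives as one quoted string and is split into a list of string IDs before being handed to `prepare_dataset`. The parser below is a throwaway stand-in for the project's `get_parser()`, and the hard-coded argv is illustrative only.

```python
# Sketch (not the project's code): how a quoted --eval-ids value becomes a
# list of string IDs. The parser here is a stand-in for get_parser().
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--eval-ids',
    default=None,
    type=str,
    help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
)

args = parser.parse_args(['--eval-ids', '1,3,10'])  # simulated command line

eval_ids = None
if args.eval_ids:
    eval_ids = str(args.eval_ids).split(',')

print(eval_ids)  # ['1', '3', '10'] -- still strings at this point
```

The IDs stay strings here; converting them to the dataset's `instance_id` dtype happens inside `prepare_dataset` (see the `shared.py` hunk below).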
@@ -8,6 +8,7 @@ COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5
+EVAL_IDS=$6

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
@@ -39,5 +40,10 @@ if [ -n "$EVAL_LIMIT" ]; then
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND="$COMMAND --eval-ids $EVAL_IDS"
+fi
+
# Run the command
eval $COMMAND
@@ -164,7 +164,12 @@ def make_metadata(
    return metadata


-def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int):
+def prepare_dataset(
+    dataset: pd.DataFrame,
+    output_file: str,
+    eval_n_limit: int,
+    eval_ids: list[str] | None = None,
+):
    assert (
        'instance_id' in dataset.columns
    ), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
@@ -180,7 +185,11 @@ def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int):
            f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
        )

-    if eval_n_limit:
+    if eval_ids:
+        eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids]
+        dataset = dataset[dataset[id_column].isin(eval_ids_converted)]
+        logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.')
+    elif eval_n_limit:
        dataset = dataset.head(eval_n_limit)
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
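The dtype conversion is the subtle part of the change above: IDs come off the command line as strings, while the `instance_id` column is often numeric, so each ID is cast to the column's dtype before the `isin` filter. A small self-contained sketch of that behavior, using a toy DataFrame that merely stands in for the benchmark dataset:

```python
# Sketch of the dtype-aware ID filtering used in prepare_dataset.
# The DataFrame below is a toy stand-in for the AiderBench dataset.
import pandas as pd

dataset = pd.DataFrame(
    {
        'instance_id': [1, 2, 3, 10],  # numeric IDs, as in many datasets
        'instance_name': ['a', 'b', 'c', 'd'],
    }
)
id_column = 'instance_id'
eval_ids = ['1', '3', '10']  # strings, straight from --eval-ids "1,3,10"

# Filtering with the raw strings silently matches nothing, because '1' != 1:
assert dataset[dataset[id_column].isin(eval_ids)].empty

# Casting each ID to the column's dtype first selects the intended rows:
eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids]
subset = dataset[dataset[id_column].isin(eval_ids_converted)]
print(subset[id_column].tolist())  # [1, 3, 10]
```

Going through `dataset[id_column].dtype.type` rather than a hard-coded `int()` keeps the filter working whether the ID column is numeric or string.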
@@ -740,6 +740,12 @@ def get_parser() -> argparse.ArgumentParser:
        type=str,
        help='Name for the session',
    )
+    parser.add_argument(
+        '--eval-ids',
+        default=None,
+        type=str,
+        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+    )
    return parser
poetry.lock (generated)
@@ -9457,4 +9457,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
-content-hash = "ea650e78171ccd3088112c232ca9b09b180db502bc45a22feaf313acfeaf83b6"
+content-hash = "f6abf770480dfd3a739d3d0b4499b601df44f130b27684f34b5f6791950e99d8"
@@ -52,6 +52,7 @@ PyPDF2 = "*"
python-pptx = "*"
pylatexenc = "*"
tornado = "*"
python-dotenv = "*"

[tool.poetry.group.llama-index.dependencies]
llama-index = "*"
@@ -82,7 +83,6 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]

[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -113,7 +113,6 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"
@@ -104,7 +104,7 @@ def test_help_message(capsys):
        parser.parse_args(['--help'])
    captured = capsys.readouterr()
    help_output = captured.out

    print(help_output)
    expected_elements = [
        'usage:',
        'Run an agent with a specific task',
@@ -120,6 +120,7 @@ def test_help_message(capsys):
        '--eval-n-limit EVAL_N_LIMIT',
        '--eval-num-workers EVAL_NUM_WORKERS',
        '--eval-note EVAL_NOTE',
+        '--eval-ids EVAL_IDS',
        '-l LLM_CONFIG, --llm-config LLM_CONFIG',
        '-n NAME, --name NAME',
    ]
@@ -128,4 +129,4 @@ def test_help_message(capsys):
        assert element in help_output, f"Expected '{element}' to be in the help message"

    option_count = help_output.count(' -')
-    assert option_count == 13, f'Expected 13 options, found {option_count}'
+    assert option_count == 14, f'Expected 14 options, found {option_count}'
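The expectation moves from 13 to 14 because the test counts occurrences of ' -' in the help text as a proxy for the number of registered options, and the new `--eval-ids` flag adds exactly one. A throwaway sketch of that heuristic, using a toy parser rather than the project's `get_parser()`:

```python
# Sketch of the option-count heuristic from test_help_message, on a toy parser.
import argparse


def count_options(parser: argparse.ArgumentParser) -> int:
    # Same trick as the real test: count ' -' occurrences in the help text.
    return parser.format_help().count(' -')


parser = argparse.ArgumentParser(prog='demo')
parser.add_argument('--eval-n-limit')
parser.add_argument('--eval-num-workers')
before = count_options(parser)

parser.add_argument('--eval-ids')  # the flag this commit adds
after = count_options(parser)

assert after == before + 1, f'expected one more option, got {before} -> {after}'
```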