[Agent, Eval] Fixes LLM config issue for delegation & Add eval to measure the delegation accuracy (#2948)

* fix json import

* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose

* add inference script for browser delegation

* add readme

* Update agenthub/codeact_agent/action_parser.py

Co-authored-by: Graham Neubig <neubig@gmail.com>

* revert action parser changes.

* Rework --llm-config CLI arg

* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"

This reverts commit 81034c486e.

* remove view summary

* update readme

* update comment

* update readme

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Xingyao Wang authored 2024-07-16 23:51:29 +08:00, committed by GitHub
parent f5a4fb80a3
commit f45a2ff04e
4 changed files with 261 additions and 1 deletion


@@ -0,0 +1,51 @@
# Browsing Delegation Evaluation

Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
This evaluation tests whether CodeActAgent correctly delegates instructions from the WebArena and MiniWob benchmarks to BrowsingAgent.
If delegation works correctly, the upper bound of CodeActAgent's browsing performance is the performance of BrowsingAgent itself.
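The evaluation prompt in `run_infer.py` (added below) asks CodeActAgent to copy the benchmark query verbatim into an `<execute_browse>` block, so a correct delegation for the script's own example query would look roughly like:

```
<execute_browse> Who is the president of the United States? </execute_browse>
```

Each instance is then scored by exact match and edit distance between the original instruction and the task CodeActAgent actually handed to BrowsingAgent.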
## Setup Environment

Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.

## Configure OpenDevin and your LLM

Create a `config.toml` file if it does not exist at the root of the workspace.
Add the following configurations:

```toml
# TODO: Change these to the model you want to evaluate
[llm.eval_gpt4_1106_preview_llm]
model = "gpt-4-1106-preview"
api_key = "XXX"
temperature = 0.0

[llm.eval_some_openai_compatible_model_llm]
model = "openai/MODEL_NAME"
base_url = "https://OPENAI_COMPATIBLE_URL/v1"
api_key = "XXX"
temperature = 0.0
```
## Run Inference

```bash
./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
```

where `model_config` is mandatory, while `agent` and `eval_limit` are optional.

`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
LLM settings, as defined in your `config.toml`.

`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
like to evaluate. It could also be a release tag like `0.6.2`.

`agent`, e.g. `CodeActAgent`, is the name of the agent to run the benchmark with, defaulting
to `CodeActAgent`.

`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
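Each instance's result is appended to `output.jsonl` under the evaluation output directory created by `run_infer.py`. Below is a minimal sketch for summarizing delegation accuracy from that file, assuming the `test_result` layout written by the script; the file path is just an example, so point it at your actual output directory.

```python
import json

# Summarize delegation accuracy from the evaluation output (path is an example).
with open('output.jsonl') as f:
    rows = [json.loads(line) for line in f if line.strip()]

results = [r['test_result']['result'] for r in rows]
exact_matches = [r.get('is_exact_match', False) for r in results]
distances = [r['edit_distance'] for r in results if 'edit_distance' in r]

print(f'instances: {len(rows)}')
print(f'exact-match rate: {sum(exact_matches) / max(len(rows), 1):.2%}')
if distances:
    print(f'mean edit distance: {sum(distances) / len(distances):.1f}')
```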


@@ -0,0 +1,164 @@
import asyncio
import logging
import os
import re

import nltk
import pandas as pd
from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM

# Only CodeActAgent can delegate to BrowsingAgent
SUPPORTED_AGENT_CLS = {'CodeActAgent'}


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======
    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # find the last delegate action
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action
            instruction_for_delegate = action['args']['inputs']['task']
            # parse `browse_actions` from `instruction_for_delegate`
            # task = f'{thought}. I should start with: {browse_actions}'
            instruction_for_delegate = re.search(
                r'I should start with: (.*)', instruction_for_delegate
            ).group(1)
            # calculate the edit distance between the instance.instruction and the instruction_for_delegate
            edit_distance = nltk.edit_distance(
                instance.instruction, instruction_for_delegate
            )
            is_exact_match = (
                instance.instruction.strip() == instruction_for_delegate.strip()
            )
            result['edit_distance'] = edit_distance
            result['is_exact_match'] = is_exact_match

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }
    return output


if __name__ == '__main__':
    args = parse_arguments()

    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
    id_column = 'instance_id'

    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )


@@ -0,0 +1,45 @@
#!/bin/bash
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

EVAL_NOTE="$AGENT_VERSION"

COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND


@@ -1,11 +1,11 @@
import asyncio
import json
import threading
from datetime import datetime
from enum import Enum
from typing import Callable, Iterable
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.utils import json
from opendevin.events.serialization.event import event_from_dict, event_to_dict
from opendevin.storage import FileStore, get_file_store