Enabling of unittests in aider benchmark should be optional. (#3620)

This commit is contained in:
Raj Maheshwari
2024-08-27 22:55:55 +05:30
committed by GitHub
parent 292148826e
commit 0cdeb83b17
4 changed files with 28 additions and 9 deletions

View File

@@ -32,6 +32,11 @@ development environment and LLM.
- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
given IDs (comma separated).
There are also the following optional environment variables you can set:
```
export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unit tests. Defaults to false.
```
Following is the basic command to start the evaluation.
You can update the arguments in the script

View File

@@ -6,7 +6,6 @@ INSTRUCTIONS_ADDENDUM = """
Use the above instructions to modify the supplied files: {signature_file}
Don't change the names of existing functions or classes, as they may be referenced from other code like unit tests, etc.
Use the test_file: {test_file}, to verify the correctness of your solution. DO NOT EDIT the test file.
Only use standard python libraries, don't suggest installing any packages.
"""

View File

@@ -32,6 +32,9 @@ from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.runtime import Runtime
# Configure whether unit tests are made visible to the Agent.
# Driven by the USE_UNIT_TESTS environment variable: the flag is True only
# when the variable is set to "true" (case-insensitive); unset or any other
# value disables it.
USE_UNIT_TESTS = os.getenv('USE_UNIT_TESTS', 'false').lower() == 'true'
def get_config(
metadata: EvalMetadata,
@@ -85,13 +88,14 @@ async def initialize_runtime(
file_path,
'/workspace',
)
file_path = os.path.join(tmpdir, f'{instance.instance_name}_test.py')
with open(file_path, 'w') as f:
f.write(instance.test)
await runtime.copy_to(
file_path,
'/workspace',
)
if USE_UNIT_TESTS:
file_path = os.path.join(tmpdir, f'{instance.instance_name}_test.py')
with open(file_path, 'w') as f:
f.write(instance.test)
await runtime.copy_to(
file_path,
'/workspace',
)
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
@@ -163,8 +167,13 @@ async def process_instance(
instruction = instance.instruction
instruction += INSTRUCTIONS_ADDENDUM.format(
signature_file=f'{instance.instance_name}.py',
test_file=f'{instance.instance_name}_test.py',
)
if USE_UNIT_TESTS:
instruction += (
f'Use the test_file: {instance.instance_name}_test.py, to verify '
'the correctness of your solution. DO NOT EDIT the test file.\n\n'
)
instruction += (
'IMPORTANT: You should ONLY interact with the environment provided '
'to you AND NEVER ASK FOR HUMAN HELP.\n'

View File

@@ -35,6 +35,12 @@ COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run pyt
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
# Default to NOT using unit tests.
# ":-" also guarantees the variable is exported even when it was set as a
# non-exported shell variable earlier, so the Python process always sees it.
export USE_UNIT_TESTS="${USE_UNIT_TESTS:-false}"
echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"