Merge branch 'main' into dependabot/pip/boto3-1.35.7

tofarr, 2024-08-28 20:07:45 +01:00 (committed by GitHub)
27 changed files with 388 additions and 111 deletions

View File

@@ -37,7 +37,7 @@
<a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenHands-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
<a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
<br/>
<a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
<a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
</div>
<hr>

View File

@@ -27,6 +27,7 @@ from openhands.runtime.plugins import (
JupyterRequirement,
PluginRequirement,
)
from openhands.utils.microagent import MicroAgent
from openhands.utils.prompt import PromptManager
@@ -73,10 +74,21 @@ class CodeActAgent(Agent):
"""
super().__init__(llm, config)
self.reset()
self.micro_agent = (
MicroAgent(
os.path.join(
os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
)
)
if config.micro_agent_name
else None
)
self.prompt_manager = PromptManager(
prompt_dir=os.path.join(os.path.dirname(__file__)),
agent_skills_docs=AgentSkillsRequirement.documentation,
micro_agent_name=None, # TODO: implement micro-agent
micro_agent=self.micro_agent,
)
def action_to_str(self, action: Action) -> str:

View File

@@ -0,0 +1,59 @@
---
name: github
agent: CodeActAgent
require_env_var:
SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_GITHUB_TOKEN in your environment variables."
---
# How to Interact with Github
## Environment Variable Available
1. `GITHUB_TOKEN`: A read-only token for Github.
## Using GitHub's RESTful API
Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API. Here are some common operations:
1. View an issue:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}
```
2. List repository issues:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/issues
```
3. Get repository details:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}
```
4. List pull requests:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/pulls
```
5. Get user information:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/user
```
Replace `{owner}`, `{repo}`, and `{issue_number}` with appropriate values.
## Important Notes
1. Always use the GitHub API for operations instead of a web browser.
2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
3. Git config (username and email) is pre-set. Do not modify.
4. Edit and test code locally. Never push directly to remote.
5. Verify correct branch before committing.
6. Commit changes frequently.
7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
8. You should avoid using command line tools like `sed` for file editing.
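For illustration only (this is not part of the committed micro-agent file), the same read-only calls translate directly to Python's standard library; `octocat/Hello-World` below is just a placeholder repository:

```python
# Illustrative Python equivalent of the curl examples above (not part of the
# committed micro-agent file). Assumes GITHUB_TOKEN is set as described in the
# frontmatter; 'octocat/Hello-World' is a placeholder owner/repo.
import json
import os
import urllib.request

token = os.environ['GITHUB_TOKEN']
req = urllib.request.Request(
    'https://api.github.com/repos/octocat/Hello-World/issues?state=open',
    headers={'Authorization': f'token {token}'},
)
with urllib.request.urlopen(req) as resp:
    issues = json.load(resp)
print([issue['title'] for issue in issues])  # titles of open issues
```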

View File

@@ -64,6 +64,15 @@ workspace_base = "./workspace"
# Name of the default agent
#default_agent = "CodeActAgent"
# JWT secret for authentication
#jwt_secret = ""
# Restrict file types for file uploads
#file_uploads_restrict_file_types = false
# List of allowed file extensions for uploads
#file_uploads_allowed_extensions = [".*"]
#################################### LLM #####################################
# Configuration for LLM models (group name starts with 'llm')
# use 'llm' for the default LLM config
@@ -126,6 +135,15 @@ model = "gpt-4o"
# Retry minimum wait time
#retry_min_wait = 3
# Retry multiplier for exponential backoff
#retry_multiplier = 2.0
# Drop any unmapped (unsupported) params without causing an exception
#drop_params = false
# Base URL for the OLLAMA API
#ollama_base_url = ""
# Temperature for the API
#temperature = 0.0
@@ -149,6 +167,9 @@ model = "gpt-3.5"
# agent.CodeActAgent
##############################################################################
[agent]
# Name of the micro agent to use for this agent
#micro_agent_name = ""
# Memory enabled
#memory_enabled = false
@@ -182,6 +203,18 @@ llm_config = 'gpt3'
# Enable auto linting after editing
#enable_auto_lint = false
# Whether to initialize plugins
#initialize_plugins = true
# Extra dependencies to install in the runtime image
#runtime_extra_deps = ""
# Environment variables to set at the launch of the runtime
#runtime_startup_env_vars = {}
# BrowserGym environment to use for evaluation
#browsergym_eval_env = ""
#################################### Security ###################################
# Configuration for security features
##############################################################################

View File

@@ -31,7 +31,7 @@ export function HomepageHeader() {
<a href="https://arxiv.org/abs/2407.16741">
<img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
</a>
<a href="https://huggingface.co/spaces/OpenHands/evaluation">
<a href="https://huggingface.co/spaces/OpenDevin/evaluation">
<img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
</a>
</div>

View File

@@ -3,12 +3,14 @@
This folder contains code and resources to run experiments and evaluations.
## Logistics
To better organize the evaluation folder, we should follow the rules below:
- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
all the preprocessing/evaluation/analysis scripts.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
## Supported Benchmarks
@@ -23,6 +25,7 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- APIBench: [`evaluation/gorilla`](./gorilla/)
- ToolQA: [`evaluation/toolqa`](./toolqa/)
- AiderBench: [`evaluation/aider_bench`](./aider_bench/)
### Web Browsing
@@ -38,7 +41,6 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
## Before everything begins: Setup Environment and LLM Configuration
Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
@@ -65,12 +67,10 @@ api_key = "XXX"
temperature = 0.0
```
### Result Visualization
Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.
Check [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization of existing experimental results.
### Upload your results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

View File

@@ -33,8 +33,10 @@ development environment and LLM.
given IDs (comma separated).
There are also following optional environment variables you can set:
```
```bash
export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unittests. Default to false.
export SKIP_NUM=12 # skip the first 12 instances from the dataset
```
Following is the basic command to start the evaluation.
@@ -58,6 +60,8 @@ You can update the arguments in the script
```bash
poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
# with optional SKIP_NUM
SKIP_NUM=12 poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
```
Full example:

View File

@@ -34,6 +34,10 @@ from openhands.runtime.runtime import Runtime
# Configure visibility of unit tests to the Agent.
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
SKIP_NUM = os.environ.get('SKIP_NUM')
SKIP_NUM = (
int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None
)
def get_config(
@@ -66,7 +70,7 @@ async def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
obs: CmdOutputObservation
# Set instance id
@@ -96,7 +100,7 @@ async def initialize_runtime(
file_path,
'/workspace',
)
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")
async def complete_runtime(
@@ -109,7 +113,7 @@ async def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
obs: CmdOutputObservation
# Rewriting the test file to ignore any changes Agent may have made.
@@ -136,7 +140,9 @@ async def complete_runtime(
if isinstance(obs, CmdOutputObservation):
exit_code = obs.exit_code
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")
await runtime.close()
return {
'test_output': obs.content,
@@ -156,7 +162,9 @@ async def process_instance(
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
else:
logger.info(f'Starting evaluation for instance {str(instance.instance_id)}.')
logger.info(
f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
)
# =============================================
# build instruction
@@ -268,10 +276,14 @@ if __name__ == '__main__':
eval_ids = None
if args.eval_ids:
eval_ids = str(args.eval_ids).split(',')
logger.info(f'Using specific dataset IDs: {eval_ids}')
logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
instances = prepare_dataset(
aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids
aider_bench_tests,
output_file,
args.eval_n_limit,
eval_ids=eval_ids,
skip_num=SKIP_NUM,
)
asyncio.run(

View File

@@ -22,7 +22,7 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
if __name__ == '__main__':
if len(sys.argv) != 2:
print(
'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file>'
)
sys.exit(1)
json_file_path = sys.argv[1]

View File

@@ -26,7 +26,7 @@ poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_o
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result

View File

@@ -95,7 +95,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
> If you want to evaluate existing results, you should first run this to clone existing outputs
>```bash
>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
>git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
>```
NOTE, you should have already pulled the instance-level OR env-level docker images following [this section](#openhands-swe-bench-instance-level-docker-support).
@@ -129,10 +129,10 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
## Visualize Results
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
First you need to clone `https://huggingface.co/spaces/OpenDevin/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
```bash
git clone https://huggingface.co/spaces/OpenHands/evaluation
git clone https://huggingface.co/spaces/OpenDevin/evaluation
```
**(optional) setup streamlit environment with conda**:
@@ -156,4 +156,4 @@ Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

View File

@@ -181,34 +181,44 @@ def prepare_dataset(
output_file: str,
eval_n_limit: int,
eval_ids: list[str] | None = None,
skip_num: int | None = None,
):
assert (
'instance_id' in dataset.columns
), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
id_column = 'instance_id'
logger.info(f'Writing evaluation output to {output_file}')
finished_ids = set()
finished_ids: set[str] = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_ids.add(data[id_column])
finished_ids.add(str(data[id_column]))
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
f'\nOutput file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
)
if eval_ids:
eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids]
dataset = dataset[dataset[id_column].isin(eval_ids_converted)]
logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.')
elif eval_n_limit:
elif skip_num and skip_num >= 0:
skip_num = min(skip_num, len(dataset))
dataset = dataset.iloc[skip_num:]
logger.info(
f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).'
)
if eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to {eval_n_limit} instances.')
elif eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
new_dataset = [
instance
for _, instance in dataset.iterrows()
if instance[id_column] not in finished_ids
if str(instance[id_column]) not in finished_ids
]
logger.info(
f'Finished instances: {len(finished_ids)}, Remaining instances: {len(new_dataset)}'
@@ -228,8 +238,8 @@ async def run_evaluation(
):
use_multiprocessing = num_workers > 1
logger.info(
f'Evaluation started with Agent {metadata.agent_class}, '
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
f'Evaluation started with Agent {metadata.agent_class}:\n'
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
)
pbar = tqdm(total=len(dataset))
output_fp = open(output_file, 'a')
@@ -241,7 +251,7 @@ async def run_evaluation(
pbar.set_description(f'Instance {output.instance_id}')
pbar.set_postfix_str(f'Test Result: {output.test_result}')
logger.info(
f'Finished evaluation for instance {output.instance_id}: {output.test_result}'
f'Finished evaluation for instance {output.instance_id}: {output.test_result}\n'
)
output_fp.write(json.dumps(output.model_dump()) + '\n')
output_fp.flush()
@@ -270,11 +280,11 @@ async def run_evaluation(
await update_progress(output)
except KeyboardInterrupt:
print('KeyboardInterrupt received. Cleaning up...')
print('\nKeyboardInterrupt received. Cleaning up...\n')
cleanup()
output_fp.close()
logger.info('Evaluation finished.')
logger.info('\nEvaluation finished.\n')
def reset_logger_for_multiprocessing(
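A toy pandas sketch (not the repo's own dataset) shows how the new `skip_num` argument composes with `eval_n_limit` in `prepare_dataset`: rows are skipped first, then the limit applies to what remains.

```python
# Toy illustration of the skip/limit logic added to prepare_dataset above.
import pandas as pd

dataset = pd.DataFrame({'instance_id': [str(i) for i in range(10)]})
skip_num, eval_n_limit = 3, 4

subset = dataset.iloc[min(skip_num, len(dataset)):]  # drop the first skip_num rows
subset = subset.head(eval_n_limit)                   # then keep at most eval_n_limit rows
print(list(subset['instance_id']))                   # -> ['3', '4', '5', '6']
```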

View File

@@ -7,6 +7,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
## Setup WebArena Environment
WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL (`$WEBARENA_BASE_URL`) of the machine where the environment is installed.
@@ -36,8 +37,7 @@ poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result

View File

@@ -12,7 +12,6 @@ import toml
from dotenv import load_dotenv
from openhands.core import logger
from openhands.core.utils import Singleton
load_dotenv()
@@ -123,11 +122,13 @@ class AgentConfig:
"""Configuration for the agent.
Attributes:
micro_agent_name: The name of the micro agent to use for this agent.
memory_enabled: Whether long-term memory (embeddings) is enabled.
memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
llm_config: The name of the llm config to use. If specified, this will override global llm config.
"""
micro_agent_name: str | None = None
memory_enabled: bool = False
memory_max_threads: int = 2
llm_config: str | None = None
@@ -141,7 +142,7 @@ class AgentConfig:
@dataclass
class SecurityConfig(metaclass=Singleton):
class SecurityConfig:
"""Configuration for security related functionalities.
Attributes:
@@ -174,7 +175,7 @@ class SecurityConfig(metaclass=Singleton):
@dataclass
class SandboxConfig(metaclass=Singleton):
class SandboxConfig:
"""Configuration for the sandbox.
Attributes:
@@ -241,7 +242,7 @@ class UndefinedString(str, Enum):
@dataclass
class AppConfig(metaclass=Singleton):
class AppConfig:
"""Configuration for the app.
Attributes:
@@ -565,7 +566,12 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
sandbox_config = SandboxConfig(**toml_config['sandbox'])
# update the config object with the new values
AppConfig(sandbox=sandbox_config, **core_config)
cfg.sandbox = sandbox_config
for key, value in core_config.items():
if hasattr(cfg, key):
setattr(cfg, key, value)
else:
logger.openhands_logger.warning(f'Unknown core config key: {key}')
except (TypeError, KeyError) as e:
logger.openhands_logger.warning(
f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
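A toy sketch of the update pattern that replaces the removed Singleton behaviour, using an invented dataclass rather than the real `AppConfig`:

```python
# Toy sketch of the setattr-based update shown above: copy known keys onto an
# existing config object and warn about unknown ones. ToyConfig is invented.
from dataclasses import dataclass


@dataclass
class ToyConfig:
    max_iterations: int = 100
    default_agent: str = 'CodeActAgent'


cfg = ToyConfig()
core_config = {'max_iterations': 42, 'unknown_key': True}
for key, value in core_config.items():
    if hasattr(cfg, key):
        setattr(cfg, key, value)
    else:
        print(f'Unknown core config key: {key}')
print(cfg)  # ToyConfig(max_iterations=42, default_agent='CodeActAgent')
```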

View File

@@ -72,3 +72,8 @@ class LLMResponseError(Exception):
class UserCancelledError(Exception):
def __init__(self, message='User cancelled the request'):
super().__init__(message)
class MicroAgentValidationError(Exception):
def __init__(self, message='Micro agent validation failed'):
super().__init__(message)

View File

@@ -1,3 +0,0 @@
from openhands.core.utils.singleton import Singleton
__all__ = ['Singleton']

View File

@@ -1,37 +0,0 @@
import dataclasses
from openhands.core import logger
class Singleton(type):
_instances: dict = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
else:
# allow updates, just update existing instance
# perhaps not the most orthodox way to do it, though it simplifies client code
# useful for pre-defined groups of settings
instance = cls._instances[cls]
for key, value in kwargs.items():
if hasattr(instance, key):
setattr(instance, key, value)
else:
logger.openhands_logger.warning(
f'Unknown key for {cls.__name__}: "{key}"'
)
return cls._instances[cls]
@classmethod
def reset(cls):
# used by pytest to reset the state of the singleton instances
for instance_type, instance in cls._instances.items():
print('resetting... ', instance_type)
for field_info in dataclasses.fields(instance_type):
if dataclasses.is_dataclass(field_info.type):
setattr(instance, field_info.name, field_info.type())
elif field_info.default_factory is not dataclasses.MISSING:
setattr(instance, field_info.name, field_info.default_factory())
else:
setattr(instance, field_info.name, field_info.default)

View File

@@ -12,3 +12,33 @@ def find_available_tcp_port() -> int:
return -1
finally:
sock.close()
def display_number_matrix(number: int) -> str | None:
if not 0 <= number <= 999:
return None
# Define the matrix representation for each digit
digits = {
'0': ['###', '# #', '# #', '# #', '###'],
'1': [' #', ' #', ' #', ' #', ' #'],
'2': ['###', ' #', '###', '# ', '###'],
'3': ['###', ' #', '###', ' #', '###'],
'4': ['# #', '# #', '###', ' #', ' #'],
'5': ['###', '# ', '###', ' #', '###'],
'6': ['###', '# ', '###', '# #', '###'],
'7': ['###', ' #', ' #', ' #', ' #'],
'8': ['###', '# #', '###', '# #', '###'],
'9': ['###', '# #', '###', ' #', '###'],
}
# alternatively, with leading zeros: num_str = f"{number:03d}"
num_str = str(number) # Convert to string without padding
result = []
for row in range(5):
line = ' '.join(digits[digit][row] for digit in num_str)
result.append(line)
matrix_display = '\n'.join(result)
return f'\n{matrix_display}\n'

View File

@@ -0,0 +1,44 @@
import os
import frontmatter
import pydantic
from openhands.controller.agent import Agent
from openhands.core.exceptions import MicroAgentValidationError
from openhands.core.logger import openhands_logger as logger
class MicroAgentMetadata(pydantic.BaseModel):
name: str
agent: str
require_env_var: dict[str, str]
class MicroAgent:
def __init__(self, path: str):
self.path = path
if not os.path.exists(path):
raise FileNotFoundError(f'Micro agent file {path} is not found')
with open(path, 'r') as file:
self._loaded = frontmatter.load(file)
self._content = self._loaded.content
self._metadata = MicroAgentMetadata(**self._loaded.metadata)
self._validate_micro_agent()
@property
def content(self) -> str:
return self._content
def _validate_micro_agent(self):
logger.info(
f'Loading and validating micro agent [{self._metadata.name}] based on [{self._metadata.agent}]'
)
# Make sure the agent is registered
agent_cls = Agent.get_cls(self._metadata.agent)
assert agent_cls is not None
# Make sure the environment variables are set
for env_var, instruction in self._metadata.require_env_var.items():
if env_var not in os.environ:
raise MicroAgentValidationError(
f'Environment variable [{env_var}] is required by micro agent [{self._metadata.name}] but not set. {instruction}'
)
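The frontmatter handling that `MicroAgent` relies on comes from the `python-frontmatter` package added as a dependency in this change; a minimal standalone sketch of that parsing step, with a made-up document:

```python
# Standalone sketch of the frontmatter parsing used by MicroAgent above.
# The document text is made up for illustration.
import frontmatter

TEXT = """\
---
name: github
agent: CodeActAgent
require_env_var:
  SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token and set it."
---
# How to Interact with Github
"""

post = frontmatter.loads(TEXT)
print(post.metadata['name'])             # -> github
print(post.metadata['require_env_var'])  # -> {'SANDBOX_ENV_GITHUB_TOKEN': '...'}
print(post.content)                      # -> '# How to Interact with Github'
```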

View File

@@ -2,6 +2,8 @@ import os
from jinja2 import Template
from openhands.utils.microagent import MicroAgent
class PromptManager:
"""
@@ -14,23 +16,21 @@ class PromptManager:
Attributes:
prompt_dir (str): Directory containing prompt templates.
agent_skills_docs (str): Documentation of agent skills.
micro_agent (str | None): Content of the micro-agent definition file, if specified.
micro_agent (MicroAgent | None): Micro-agent, if specified.
"""
def __init__(
self,
prompt_dir: str,
agent_skills_docs: str,
micro_agent_name: str | None = None,
micro_agent: MicroAgent | None = None,
):
self.prompt_dir: str = prompt_dir
self.agent_skills_docs: str = agent_skills_docs
self.system_template: Template = self._load_template('system_prompt')
self.user_template: Template = self._load_template('user_prompt')
self.micro_agent: str | None = (
self._load_micro_agent(micro_agent_name) if micro_agent_name else None
)
self.micro_agent: MicroAgent | None = micro_agent
def _load_template(self, template_name: str) -> Template:
template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
@@ -39,15 +39,6 @@ class PromptManager:
with open(template_path, 'r') as file:
return Template(file.read())
def _load_micro_agent(self, micro_agent_name: str) -> str:
micro_agent_path = os.path.join(self.prompt_dir, f'micro/{micro_agent_name}.md')
if not os.path.exists(micro_agent_path):
raise FileNotFoundError(
f'Micro agent file {micro_agent_path} for {micro_agent_name} is not found'
)
with open(micro_agent_path, 'r') as file:
return file.read()
@property
def system_message(self) -> str:
rendered = self.system_template.render(
@@ -66,5 +57,7 @@ class PromptManager:
These additional context will convert the current generic agent
into a more specialized agent that is tailored to the user's task.
"""
rendered = self.user_template.render(micro_agent=self.micro_agent)
rendered = self.user_template.render(
micro_agent=self.micro_agent.content if self.micro_agent else None
)
return rendered.strip()
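A rough Jinja2 sketch of the new rendering path; the real `user_prompt.j2` is not shown in this diff, so the template text below is invented:

```python
# Minimal Jinja2 sketch of the change above: the template now receives the
# MicroAgent's content string (or None) rather than a raw file body.
from jinja2 import Template

user_template = Template(
    '{% if micro_agent %}Extra context:\n{{ micro_agent }}{% endif %}'
)

print(user_template.render(micro_agent='# How to Interact with Github').strip())
# -> Extra context:
#    # How to Interact with Github
print(repr(user_template.render(micro_agent=None).strip()))
# -> ''
```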

poetry.lock (generated)
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aenum"
@@ -6585,6 +6585,24 @@ files = [
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "python-frontmatter"
version = "1.1.0"
description = "Parse and manage posts with YAML (or other) frontmatter"
optional = false
python-versions = "*"
files = [
{file = "python-frontmatter-1.1.0.tar.gz", hash = "sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d"},
{file = "python_frontmatter-1.1.0-py3-none-any.whl", hash = "sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1"},
]
[package.dependencies]
PyYAML = "*"
[package.extras]
docs = ["sphinx"]
test = ["mypy", "pyaml", "pytest", "toml", "types-PyYAML", "types-toml"]
[[package]]
name = "python-json-logger"
version = "2.0.7"
@@ -9459,4 +9477,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "0e95b8afa4826171ad0b57a46690c8dc4317e1d5a642388e9be9352eac7b3cdc"
content-hash = "ca8ef3dbc1eed207bca42c98c3cbf1fc085548977994ebc28283bc5ddbfa0101"

View File

@@ -47,6 +47,7 @@ tree-sitter = "0.21.3"
bashlex = "^0.18"
pyjwt = "^2.9.0"
dirhash = "*"
python-frontmatter = "^1.1.0"
python-docx = "*"
PyPDF2 = "*"
python-pptx = "*"
@@ -83,6 +84,7 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]
[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -113,6 +115,7 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"

View File

@@ -49,6 +49,7 @@ async def test_set_agent_state(mock_agent, mock_event_stream):
await controller.set_agent_state_to(AgentState.PAUSED)
assert controller.get_agent_state() == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -65,6 +66,7 @@ async def test_on_event_message_action(mock_agent, mock_event_stream):
message_action = MessageAction(content='Test message')
await controller.on_event(message_action)
assert controller.get_agent_state() == AgentState.RUNNING
await controller.close()
@pytest.mark.asyncio
@@ -81,6 +83,7 @@ async def test_on_event_change_agent_state_action(mock_agent, mock_event_stream)
change_state_action = ChangeAgentStateAction(agent_state=AgentState.PAUSED)
await controller.on_event(change_state_action)
assert controller.get_agent_state() == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -97,6 +100,7 @@ async def test_report_error(mock_agent, mock_event_stream):
await controller.report_error(error_message)
assert controller.state.last_error == error_message
controller.event_stream.add_event.assert_called_once()
await controller.close()
@pytest.mark.asyncio
@@ -116,6 +120,7 @@ async def test_step_with_exception(mock_agent, mock_event_stream):
# Verify that report_error was called with the correct error message
controller.report_error.assert_called_once_with('Malformed action')
await controller.close()
@pytest.mark.asyncio
@@ -134,6 +139,7 @@ async def test_step_max_iterations(mock_agent, mock_event_stream):
await controller._step()
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
assert controller.state.agent_state == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -153,6 +159,7 @@ async def test_step_max_iterations_headless(mock_agent, mock_event_stream):
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
# In headless mode, throttling results in an error
assert controller.state.agent_state == AgentState.ERROR
await controller.close()
@pytest.mark.asyncio
@@ -172,6 +179,7 @@ async def test_step_max_budget(mock_agent, mock_event_stream):
await controller._step()
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
assert controller.state.agent_state == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -192,3 +200,4 @@ async def test_step_max_budget_headless(mock_agent, mock_event_stream):
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
# In headless mode, throttling results in an error
assert controller.state.agent_state == AgentState.ERROR
await controller.close()

View File

@@ -40,7 +40,6 @@ def temp_toml_file(tmp_path):
@pytest.fixture
def default_config(monkeypatch):
# Fixture to provide a default AppConfig instance
AppConfig.reset()
yield AppConfig()
@@ -501,8 +500,8 @@ def test_api_keys_repr_str():
def test_max_iterations_and_max_budget_per_task_from_toml(temp_toml_file):
temp_toml = """
[core]
max_iterations = 100
max_budget_per_task = 4.0
max_iterations = 42
max_budget_per_task = 4.7
"""
config = AppConfig()
@@ -511,8 +510,8 @@ max_budget_per_task = 4.0
load_from_toml(config, temp_toml_file)
assert config.max_iterations == 100
assert config.max_budget_per_task == 4.0
assert config.max_iterations == 42
assert config.max_budget_per_task == 4.7
def test_get_llm_config_arg(temp_toml_file):

View File

@@ -87,9 +87,6 @@ def test_app_config_attributes_masking(test_handler):
assert 'e2b-xyz789' not in log_output
assert 'ghp_abcdefghijklmnopqrstuvwxyz' not in log_output
# reset the AppConfig
AppConfig.reset()
def test_sensitive_env_vars_masking(test_handler):
logger, stream = test_handler

View File

@@ -0,0 +1,73 @@
import os
import pytest
from pytest import MonkeyPatch
import agenthub # noqa: F401
from openhands.core.exceptions import (
AgentNotRegisteredError,
MicroAgentValidationError,
)
from openhands.utils.microagent import MicroAgent
CONTENT = (
'# dummy header\n' 'dummy content\n' '## dummy subheader\n' 'dummy subcontent\n'
)
def test_micro_agent_load(tmp_path, monkeypatch: MonkeyPatch):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: CodeActAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
# Patch the required environment variable
monkeypatch.setenv('SANDBOX_OPENHANDS_TEST_ENV_VAR', 'dummy_value')
micro_agent = MicroAgent(os.path.join(tmp_path, 'dummy.md'))
assert micro_agent is not None
assert micro_agent.content == CONTENT.strip()
def test_not_existing_agent(tmp_path, monkeypatch: MonkeyPatch):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: NotExistingAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
monkeypatch.setenv('SANDBOX_OPENHANDS_TEST_ENV_VAR', 'dummy_value')
with pytest.raises(AgentNotRegisteredError):
MicroAgent(os.path.join(tmp_path, 'dummy.md'))
def test_not_existing_env_var(tmp_path):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: CodeActAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
with pytest.raises(MicroAgentValidationError) as excinfo:
MicroAgent(os.path.join(tmp_path, 'dummy.md'))
assert 'Set this environment variable for testing purposes' in str(excinfo.value)

View File

@@ -1,8 +1,10 @@
import os
import shutil
from unittest.mock import Mock
import pytest
from openhands.utils.microagent import MicroAgent
from openhands.utils.prompt import PromptManager
@@ -56,11 +58,19 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
with open(os.path.join(prompt_dir, 'micro', f'{micro_agent_name}.md'), 'w') as f:
f.write(micro_agent_content)
manager = PromptManager(prompt_dir, agent_skills_docs, micro_agent_name)
# Mock MicroAgent
mock_micro_agent = Mock(spec=MicroAgent)
mock_micro_agent.content = micro_agent_content
manager = PromptManager(
prompt_dir=prompt_dir,
agent_skills_docs=agent_skills_docs,
micro_agent=mock_micro_agent,
)
assert manager.prompt_dir == prompt_dir
assert manager.agent_skills_docs == agent_skills_docs
assert manager.micro_agent == micro_agent_content
assert manager.micro_agent == mock_micro_agent
assert isinstance(manager.system_message, str)
assert (
@@ -86,7 +96,7 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
def test_prompt_manager_file_not_found(prompt_dir, agent_skills_docs):
with pytest.raises(FileNotFoundError):
PromptManager(prompt_dir, agent_skills_docs, 'non_existent_micro_agent')
MicroAgent(os.path.join(prompt_dir, 'micro', 'non_existent_micro_agent.md'))
def test_prompt_manager_template_rendering(prompt_dir, agent_skills_docs):