Merge branch 'main' into dependabot/pip/boto3-1.35.7

tofarr, 2024-08-28 20:07:45 +01:00 (committed by GitHub)
27 changed files with 388 additions and 111 deletions

View File

@@ -37,7 +37,7 @@
<a href="https://docs.all-hands.dev/modules/usage/intro"><img src="https://img.shields.io/badge/Documentation-OpenHands-blue?logo=googledocs&logoColor=white&style=for-the-badge" alt="Check out the documentation"></a>
<a href="https://arxiv.org/abs/2407.16741"><img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv"></a>
<br/>
<a href="https://huggingface.co/spaces/OpenHands/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
<a href="https://huggingface.co/spaces/OpenDevin/evaluation"><img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark"></a>
</div>
<hr>

View File

@@ -27,6 +27,7 @@ from openhands.runtime.plugins import (
JupyterRequirement,
PluginRequirement,
)
from openhands.utils.microagent import MicroAgent
from openhands.utils.prompt import PromptManager
@@ -73,10 +74,21 @@ class CodeActAgent(Agent):
"""
super().__init__(llm, config)
self.reset()
self.micro_agent = (
MicroAgent(
os.path.join(
os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
)
)
if config.micro_agent_name
else None
)
self.prompt_manager = PromptManager(
prompt_dir=os.path.join(os.path.dirname(__file__)),
agent_skills_docs=AgentSkillsRequirement.documentation,
micro_agent_name=None, # TODO: implement micro-agent
micro_agent=self.micro_agent,
)
def action_to_str(self, action: Action) -> str:

View File

@@ -0,0 +1,59 @@
---
name: github
agent: CodeActAgent
require_env_var:
SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) and set it as SANDBOX_GITHUB_TOKEN in your environment variables."
---
# How to Interact with Github
## Environment Variable Available
1. `GITHUB_TOKEN`: A read-only token for Github.
## Using GitHub's RESTful API
Use `curl` with the `GITHUB_TOKEN` to interact with GitHub's API. Here are some common operations:
1. View an issue:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}
```
2. List repository issues:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/issues
```
3. Get repository details:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}
```
4. List pull requests:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/repos/{owner}/{repo}/pulls
```
5. Get user information:
```
curl -H "Authorization: token $GITHUB_TOKEN" \
https://api.github.com/user
```
Replace `{owner}`, `{repo}`, and `{issue_number}` with appropriate values.
## Important Notes
1. Always use the GitHub API for operations instead of a web browser.
2. The `GITHUB_TOKEN` is read-only. Avoid operations that require write access.
3. Git config (username and email) is pre-set. Do not modify.
4. Edit and test code locally. Never push directly to remote.
5. Verify correct branch before committing.
6. Commit changes frequently.
7. If the issue or task is ambiguous or lacks sufficient detail, always request clarification from the user before proceeding.
8. You should avoid using command line tools like `sed` for file editing.
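For illustration only (this is not part of the committed micro-agent file), the same read-only calls translate directly to Python's standard library; `octocat/Hello-World` below is just a placeholder repository:

```python
# Illustrative Python equivalent of the curl examples above (not part of the
# committed micro-agent file). Assumes GITHUB_TOKEN is set as described in the
# frontmatter; 'octocat/Hello-World' is a placeholder owner/repo.
import json
import os
import urllib.request

token = os.environ['GITHUB_TOKEN']
req = urllib.request.Request(
    'https://api.github.com/repos/octocat/Hello-World/issues?state=open',
    headers={'Authorization': f'token {token}'},
)
with urllib.request.urlopen(req) as resp:
    issues = json.load(resp)
print([issue['title'] for issue in issues])  # titles of open issues
```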

View File

@@ -64,6 +64,15 @@ workspace_base = "./workspace"
# Name of the default agent
#default_agent = "CodeActAgent"
# JWT secret for authentication
#jwt_secret = ""
# Restrict file types for file uploads
#file_uploads_restrict_file_types = false
# List of allowed file extensions for uploads
#file_uploads_allowed_extensions = [".*"]
#################################### LLM #####################################
# Configuration for LLM models (group name starts with 'llm')
# use 'llm' for the default LLM config
@@ -126,6 +135,15 @@ model = "gpt-4o"
# Retry minimum wait time
#retry_min_wait = 3
# Retry multiplier for exponential backoff
#retry_multiplier = 2.0
# Drop any unmapped (unsupported) params without causing an exception
#drop_params = false
# Base URL for the OLLAMA API
#ollama_base_url = ""
# Temperature for the API
#temperature = 0.0
@@ -149,6 +167,9 @@ model = "gpt-3.5"
# agent.CodeActAgent
##############################################################################
[agent]
# Name of the micro agent to use for this agent
#micro_agent_name = ""
# Memory enabled
#memory_enabled = false
@@ -182,6 +203,18 @@ llm_config = 'gpt3'
# Enable auto linting after editing
#enable_auto_lint = false
# Whether to initialize plugins
#initialize_plugins = true
# Extra dependencies to install in the runtime image
#runtime_extra_deps = ""
# Environment variables to set at the launch of the runtime
#runtime_startup_env_vars = {}
# BrowserGym environment to use for evaluation
#browsergym_eval_env = ""
#################################### Security ###################################
# Configuration for security features
##############################################################################

View File

@@ -31,7 +31,7 @@ export function HomepageHeader() {
<a href="https://arxiv.org/abs/2407.16741">
<img src="https://img.shields.io/badge/Paper-%20on%20Arxiv-red?logo=arxiv&style=for-the-badge" alt="Paper on Arxiv" />
</a>
<a href="https://huggingface.co/spaces/OpenHands/evaluation">
<a href="https://huggingface.co/spaces/OpenDevin/evaluation">
<img src="https://img.shields.io/badge/Evaluation-Benchmark%20on%20HF%20Space-green?logo=huggingface&style=for-the-badge" alt="Evaluation Benchmark" />
</a>
</div>

View File

@@ -3,12 +3,14 @@
This folder contains code and resources to run experiments and evaluations.
## Logistics
To better organize the evaluation folder, we should follow the rules below:
- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
- Each subfolder contains a specific benchmark or experiment. For example, `evaluation/swe_bench` should contain
all the preprocessing/evaluation/analysis scripts.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
- Raw data and experimental records should not be stored within this repo.
- For model outputs, they should be stored at [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization.
- Important data files of manageable size and analysis scripts (e.g., jupyter notebooks) can be directly uploaded to this repo.
## Supported Benchmarks
@@ -23,6 +25,7 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
- ML-Bench: [`evaluation/ml_bench`](./ml_bench)
- APIBench: [`evaluation/gorilla`](./gorilla/)
- ToolQA: [`evaluation/toolqa`](./toolqa/)
- AiderBench: [`evaluation/aider_bench`](./aider_bench/)
### Web Browsing
@@ -38,7 +41,6 @@ To learn more about how to integrate your benchmark into OpenHands, check out [t
- Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
- ProofWriter: [`evaluation/logic_reasoning`](./logic_reasoning)
## Before everything begins: Setup Environment and LLM Configuration
Please follow instruction [here](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to setup your local development environment and LLM.
@@ -65,12 +67,10 @@ api_key = "XXX"
temperature = 0.0
```
### Result Visualization
Check [this huggingface space](https://huggingface.co/spaces/OpenHands/evaluation) for visualization of existing experimental results.
Check [this huggingface space](https://huggingface.co/spaces/OpenDevin/evaluation) for visualization of existing experimental results.
### Upload your results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results to our hosted huggingface repo via PR following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

View File

@@ -33,8 +33,10 @@ development environment and LLM.
given IDs (comma separated).
There are also following optional environment variables you can set:
```
```bash
export USE_UNIT_TESTS=true # if you want to allow the Agent to verify correctness using unittests. Default to false.
export SKIP_NUM=12 # skip the first 12 instances from the dataset
```
Following is the basic command to start the evaluation.
@@ -58,6 +60,8 @@ You can update the arguments in the script
```bash
poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
# with optional SKIP_NUM
SKIP_NUM=12 poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
```
Full example:

View File

@@ -34,6 +34,10 @@ from openhands.runtime.runtime import Runtime
# Configure visibility of unit tests to the Agent.
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
SKIP_NUM = os.environ.get('SKIP_NUM')
SKIP_NUM = (
int(SKIP_NUM) if SKIP_NUM and SKIP_NUM.isdigit() and int(SKIP_NUM) >= 0 else None
)
def get_config(
@@ -66,7 +70,7 @@ async def initialize_runtime(
This function is called before the runtime is used to run the agent.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}\n")
obs: CmdOutputObservation
# Set instance id
@@ -96,7 +100,7 @@ async def initialize_runtime(
file_path,
'/workspace',
)
logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} END Runtime Initialization Fn {'-' * 50}\n")
async def complete_runtime(
@@ -109,7 +113,7 @@ async def complete_runtime(
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}\n")
obs: CmdOutputObservation
# Rewriting the test file to ignore any changes Agent may have made.
@@ -136,7 +140,9 @@ async def complete_runtime(
if isinstance(obs, CmdOutputObservation):
exit_code = obs.exit_code
logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
logger.info(f"\n{'-' * 50} END Runtime Completion Fn {'-' * 50}\n")
await runtime.close()
return {
'test_output': obs.content,
@@ -156,7 +162,9 @@ async def process_instance(
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, str(instance.instance_id), log_dir)
else:
logger.info(f'Starting evaluation for instance {str(instance.instance_id)}.')
logger.info(
f'\nStarting evaluation for instance {str(instance.instance_id)}.\n'
)
# =============================================
# build instruction
@@ -268,10 +276,14 @@ if __name__ == '__main__':
eval_ids = None
if args.eval_ids:
eval_ids = str(args.eval_ids).split(',')
logger.info(f'Using specific dataset IDs: {eval_ids}')
logger.info(f'\nUsing specific dataset IDs: {eval_ids}\n')
instances = prepare_dataset(
aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids
aider_bench_tests,
output_file,
args.eval_n_limit,
eval_ids=eval_ids,
skip_num=SKIP_NUM,
)
asyncio.run(

View File

@@ -22,7 +22,7 @@ def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
if __name__ == '__main__':
if len(sys.argv) != 2:
print(
'Usage: poetry run python summarise_results.py <path_to_output_jsonl_file>'
'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file>'
)
sys.exit(1)
json_file_path = sys.argv[1]

View File

@@ -26,7 +26,7 @@ poetry run python evaluation/miniwob/get_success_rate.py evaluation/evaluation_o
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result

View File

@@ -95,7 +95,7 @@ With `output.jsonl` file, you can run `eval_infer.sh` to evaluate generated patc
> If you want to evaluate existing results, you should first run this to clone existing outputs
>```bash
>git clone https://huggingface.co/spaces/OpenHands/evaluation evaluation/evaluation_outputs
>git clone https://huggingface.co/spaces/OpenDevin/evaluation evaluation/evaluation_outputs
>```
NOTE, you should have already pulled the instance-level OR env-level docker images following [this section](#openhands-swe-bench-instance-level-docker-support).
@@ -129,10 +129,10 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
## Visualize Results
First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
First you need to clone `https://huggingface.co/spaces/OpenDevin/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
```bash
git clone https://huggingface.co/spaces/OpenHands/evaluation
git clone https://huggingface.co/spaces/OpenDevin/evaluation
```
**(optional) setup streamlit environment with conda**:
@@ -156,4 +156,4 @@ Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

View File

@@ -181,34 +181,44 @@ def prepare_dataset(
output_file: str,
eval_n_limit: int,
eval_ids: list[str] | None = None,
skip_num: int | None = None,
):
assert (
'instance_id' in dataset.columns
), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
id_column = 'instance_id'
logger.info(f'Writing evaluation output to {output_file}')
finished_ids = set()
finished_ids: set[str] = set()
if os.path.exists(output_file):
with open(output_file, 'r') as f:
for line in f:
data = json.loads(line)
finished_ids.add(data[id_column])
finished_ids.add(str(data[id_column]))
logger.warning(
f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
f'\nOutput file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
)
if eval_ids:
eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids]
dataset = dataset[dataset[id_column].isin(eval_ids_converted)]
logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.')
elif eval_n_limit:
elif skip_num and skip_num >= 0:
skip_num = min(skip_num, len(dataset))
dataset = dataset.iloc[skip_num:]
logger.info(
f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).'
)
if eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to {eval_n_limit} instances.')
elif eval_n_limit and eval_n_limit > 0:
dataset = dataset.head(eval_n_limit)
logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
new_dataset = [
instance
for _, instance in dataset.iterrows()
if instance[id_column] not in finished_ids
if str(instance[id_column]) not in finished_ids
]
logger.info(
f'Finished instances: {len(finished_ids)}, Remaining instances: {len(new_dataset)}'
@@ -228,8 +238,8 @@ async def run_evaluation(
):
use_multiprocessing = num_workers > 1
logger.info(
f'Evaluation started with Agent {metadata.agent_class}, '
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
f'Evaluation started with Agent {metadata.agent_class}:\n'
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
)
pbar = tqdm(total=len(dataset))
output_fp = open(output_file, 'a')
@@ -241,7 +251,7 @@ async def run_evaluation(
pbar.set_description(f'Instance {output.instance_id}')
pbar.set_postfix_str(f'Test Result: {output.test_result}')
logger.info(
f'Finished evaluation for instance {output.instance_id}: {output.test_result}'
f'Finished evaluation for instance {output.instance_id}: {output.test_result}\n'
)
output_fp.write(json.dumps(output.model_dump()) + '\n')
output_fp.flush()
@@ -270,11 +280,11 @@ async def run_evaluation(
await update_progress(output)
except KeyboardInterrupt:
print('KeyboardInterrupt received. Cleaning up...')
print('\nKeyboardInterrupt received. Cleaning up...\n')
cleanup()
output_fp.close()
logger.info('Evaluation finished.')
logger.info('\nEvaluation finished.\n')
def reset_logger_for_multiprocessing(
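A toy pandas sketch (not the repo's own dataset) shows how the new `skip_num` argument composes with `eval_n_limit` in `prepare_dataset`: rows are skipped first, then the limit applies to what remains.

```python
# Toy illustration of the skip/limit logic added to prepare_dataset above.
import pandas as pd

dataset = pd.DataFrame({'instance_id': [str(i) for i in range(10)]})
skip_num, eval_n_limit = 3, 4

subset = dataset.iloc[min(skip_num, len(dataset)):]  # drop the first skip_num rows
subset = subset.head(eval_n_limit)                   # then keep at most eval_n_limit rows
print(list(subset['instance_id']))                   # -> ['3', '4', '5', '6']
```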

View File

@@ -7,6 +7,7 @@ This folder contains evaluation for [WebArena](https://github.com/web-arena-x/we
Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM.
## Setup WebArena Environment
WebArena requires you to set up websites containing pre-populated content that is accessible via URL to the machine running the OpenHands agents.
Follow [this document](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) to set up your own WebArena environment through local servers or AWS EC2 instances.
Take note of the base URL (`$WEBARENA_BASE_URL`) of the machine where the environment is installed.
@@ -36,8 +37,7 @@ poetry run python evaluation/webarena/get_success_rate.py evaluation/evaluation_
## Submit your evaluation results
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenHands/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
## BrowsingAgent V1.0 result

View File

@@ -12,7 +12,6 @@ import toml
from dotenv import load_dotenv
from openhands.core import logger
from openhands.core.utils import Singleton
load_dotenv()
@@ -123,11 +122,13 @@ class AgentConfig:
"""Configuration for the agent.
Attributes:
micro_agent_name: The name of the micro agent to use for this agent.
memory_enabled: Whether long-term memory (embeddings) is enabled.
memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
llm_config: The name of the llm config to use. If specified, this will override global llm config.
"""
micro_agent_name: str | None = None
memory_enabled: bool = False
memory_max_threads: int = 2
llm_config: str | None = None
@@ -141,7 +142,7 @@ class AgentConfig:
@dataclass
class SecurityConfig(metaclass=Singleton):
class SecurityConfig:
"""Configuration for security related functionalities.
Attributes:
@@ -174,7 +175,7 @@ class SecurityConfig(metaclass=Singleton):
@dataclass
class SandboxConfig(metaclass=Singleton):
class SandboxConfig:
"""Configuration for the sandbox.
Attributes:
@@ -241,7 +242,7 @@ class UndefinedString(str, Enum):
@dataclass
class AppConfig(metaclass=Singleton):
class AppConfig:
"""Configuration for the app.
Attributes:
@@ -565,7 +566,12 @@ def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
sandbox_config = SandboxConfig(**toml_config['sandbox'])
# update the config object with the new values
AppConfig(sandbox=sandbox_config, **core_config)
cfg.sandbox = sandbox_config
for key, value in core_config.items():
if hasattr(cfg, key):
setattr(cfg, key, value)
else:
logger.openhands_logger.warning(f'Unknown core config key: {key}')
except (TypeError, KeyError) as e:
logger.openhands_logger.warning(
f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
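A toy sketch of the update pattern that replaces the removed Singleton behaviour, using an invented dataclass rather than the real `AppConfig`:

```python
# Toy sketch of the setattr-based update shown above: copy known keys onto an
# existing config object and warn about unknown ones. ToyConfig is invented.
from dataclasses import dataclass


@dataclass
class ToyConfig:
    max_iterations: int = 100
    default_agent: str = 'CodeActAgent'


cfg = ToyConfig()
core_config = {'max_iterations': 42, 'unknown_key': True}
for key, value in core_config.items():
    if hasattr(cfg, key):
        setattr(cfg, key, value)
    else:
        print(f'Unknown core config key: {key}')
print(cfg)  # ToyConfig(max_iterations=42, default_agent='CodeActAgent')
```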

View File

@@ -72,3 +72,8 @@ class LLMResponseError(Exception):
class UserCancelledError(Exception):
def __init__(self, message='User cancelled the request'):
super().__init__(message)
class MicroAgentValidationError(Exception):
def __init__(self, message='Micro agent validation failed'):
super().__init__(message)

View File

@@ -1,3 +0,0 @@
from openhands.core.utils.singleton import Singleton
__all__ = ['Singleton']

View File

@@ -1,37 +0,0 @@
import dataclasses
from openhands.core import logger
class Singleton(type):
_instances: dict = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
else:
# allow updates, just update existing instance
# perhaps not the most orthodox way to do it, though it simplifies client code
# useful for pre-defined groups of settings
instance = cls._instances[cls]
for key, value in kwargs.items():
if hasattr(instance, key):
setattr(instance, key, value)
else:
logger.openhands_logger.warning(
f'Unknown key for {cls.__name__}: "{key}"'
)
return cls._instances[cls]
@classmethod
def reset(cls):
# used by pytest to reset the state of the singleton instances
for instance_type, instance in cls._instances.items():
print('resetting... ', instance_type)
for field_info in dataclasses.fields(instance_type):
if dataclasses.is_dataclass(field_info.type):
setattr(instance, field_info.name, field_info.type())
elif field_info.default_factory is not dataclasses.MISSING:
setattr(instance, field_info.name, field_info.default_factory())
else:
setattr(instance, field_info.name, field_info.default)

View File

@@ -12,3 +12,33 @@ def find_available_tcp_port() -> int:
return -1
finally:
sock.close()
def display_number_matrix(number: int) -> str | None:
if not 0 <= number <= 999:
return None
# Define the matrix representation for each digit
digits = {
'0': ['###', '# #', '# #', '# #', '###'],
'1': [' #', ' #', ' #', ' #', ' #'],
'2': ['###', ' #', '###', '# ', '###'],
'3': ['###', ' #', '###', ' #', '###'],
'4': ['# #', '# #', '###', ' #', ' #'],
'5': ['###', '# ', '###', ' #', '###'],
'6': ['###', '# ', '###', '# #', '###'],
'7': ['###', ' #', ' #', ' #', ' #'],
'8': ['###', '# #', '###', '# #', '###'],
'9': ['###', '# #', '###', ' #', '###'],
}
# alternatively, with leading zeros: num_str = f"{number:03d}"
num_str = str(number) # Convert to string without padding
result = []
for row in range(5):
line = ' '.join(digits[digit][row] for digit in num_str)
result.append(line)
matrix_display = '\n'.join(result)
return f'\n{matrix_display}\n'

View File

@@ -0,0 +1,44 @@
import os
import frontmatter
import pydantic
from openhands.controller.agent import Agent
from openhands.core.exceptions import MicroAgentValidationError
from openhands.core.logger import openhands_logger as logger
class MicroAgentMetadata(pydantic.BaseModel):
name: str
agent: str
require_env_var: dict[str, str]
class MicroAgent:
def __init__(self, path: str):
self.path = path
if not os.path.exists(path):
raise FileNotFoundError(f'Micro agent file {path} is not found')
with open(path, 'r') as file:
self._loaded = frontmatter.load(file)
self._content = self._loaded.content
self._metadata = MicroAgentMetadata(**self._loaded.metadata)
self._validate_micro_agent()
@property
def content(self) -> str:
return self._content
def _validate_micro_agent(self):
logger.info(
f'Loading and validating micro agent [{self._metadata.name}] based on [{self._metadata.agent}]'
)
# Make sure the agent is registered
agent_cls = Agent.get_cls(self._metadata.agent)
assert agent_cls is not None
# Make sure the environment variables are set
for env_var, instruction in self._metadata.require_env_var.items():
if env_var not in os.environ:
raise MicroAgentValidationError(
f'Environment variable [{env_var}] is required by micro agent [{self._metadata.name}] but not set. {instruction}'
)
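The frontmatter handling that `MicroAgent` relies on comes from the `python-frontmatter` package added as a dependency in this change; a minimal standalone sketch of that parsing step, with a made-up document:

```python
# Standalone sketch of the frontmatter parsing used by MicroAgent above.
# The document text is made up for illustration.
import frontmatter

TEXT = """\
---
name: github
agent: CodeActAgent
require_env_var:
  SANDBOX_ENV_GITHUB_TOKEN: "Create a GitHub Personal Access Token and set it."
---
# How to Interact with Github
"""

post = frontmatter.loads(TEXT)
print(post.metadata['name'])             # -> github
print(post.metadata['require_env_var'])  # -> {'SANDBOX_ENV_GITHUB_TOKEN': '...'}
print(post.content)                      # -> '# How to Interact with Github'
```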

View File

@@ -2,6 +2,8 @@ import os
from jinja2 import Template
from openhands.utils.microagent import MicroAgent
class PromptManager:
"""
@@ -14,23 +16,21 @@ class PromptManager:
Attributes:
prompt_dir (str): Directory containing prompt templates.
agent_skills_docs (str): Documentation of agent skills.
micro_agent (str | None): Content of the micro-agent definition file, if specified.
micro_agent (MicroAgent | None): Micro-agent, if specified.
"""
def __init__(
self,
prompt_dir: str,
agent_skills_docs: str,
micro_agent_name: str | None = None,
micro_agent: MicroAgent | None = None,
):
self.prompt_dir: str = prompt_dir
self.agent_skills_docs: str = agent_skills_docs
self.system_template: Template = self._load_template('system_prompt')
self.user_template: Template = self._load_template('user_prompt')
self.micro_agent: str | None = (
self._load_micro_agent(micro_agent_name) if micro_agent_name else None
)
self.micro_agent: MicroAgent | None = micro_agent
def _load_template(self, template_name: str) -> Template:
template_path = os.path.join(self.prompt_dir, f'{template_name}.j2')
@@ -39,15 +39,6 @@ class PromptManager:
with open(template_path, 'r') as file:
return Template(file.read())
def _load_micro_agent(self, micro_agent_name: str) -> str:
micro_agent_path = os.path.join(self.prompt_dir, f'micro/{micro_agent_name}.md')
if not os.path.exists(micro_agent_path):
raise FileNotFoundError(
f'Micro agent file {micro_agent_path} for {micro_agent_name} is not found'
)
with open(micro_agent_path, 'r') as file:
return file.read()
@property
def system_message(self) -> str:
rendered = self.system_template.render(
@@ -66,5 +57,7 @@ class PromptManager:
These additional context will convert the current generic agent
into a more specialized agent that is tailored to the user's task.
"""
rendered = self.user_template.render(micro_agent=self.micro_agent)
rendered = self.user_template.render(
micro_agent=self.micro_agent.content if self.micro_agent else None
)
return rendered.strip()
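A rough Jinja2 sketch of the new rendering path; the real `user_prompt.j2` is not shown in this diff, so the template text below is invented:

```python
# Minimal Jinja2 sketch of the change above: the template now receives the
# MicroAgent's content string (or None) rather than a raw file body.
from jinja2 import Template

user_template = Template(
    '{% if micro_agent %}Extra context:\n{{ micro_agent }}{% endif %}'
)

print(user_template.render(micro_agent='# How to Interact with Github').strip())
# -> Extra context:
#    # How to Interact with Github
print(repr(user_template.render(micro_agent=None).strip()))
# -> ''
```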

poetry.lock (generated)
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aenum"
@@ -6585,6 +6585,24 @@ files = [
[package.extras]
cli = ["click (>=5.0)"]
[[package]]
name = "python-frontmatter"
version = "1.1.0"
description = "Parse and manage posts with YAML (or other) frontmatter"
optional = false
python-versions = "*"
files = [
{file = "python-frontmatter-1.1.0.tar.gz", hash = "sha256:7118d2bd56af9149625745c58c9b51fb67e8d1294a0c76796dafdc72c36e5f6d"},
{file = "python_frontmatter-1.1.0-py3-none-any.whl", hash = "sha256:335465556358d9d0e6c98bbeb69b1c969f2a4a21360587b9873bfc3b213407c1"},
]
[package.dependencies]
PyYAML = "*"
[package.extras]
docs = ["sphinx"]
test = ["mypy", "pyaml", "pytest", "toml", "types-PyYAML", "types-toml"]
[[package]]
name = "python-json-logger"
version = "2.0.7"
@@ -9459,4 +9477,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "0e95b8afa4826171ad0b57a46690c8dc4317e1d5a642388e9be9352eac7b3cdc"
content-hash = "ca8ef3dbc1eed207bca42c98c3cbf1fc085548977994ebc28283bc5ddbfa0101"

View File

@@ -47,6 +47,7 @@ tree-sitter = "0.21.3"
bashlex = "^0.18"
pyjwt = "^2.9.0"
dirhash = "*"
python-frontmatter = "^1.1.0"
python-docx = "*"
PyPDF2 = "*"
python-pptx = "*"
@@ -83,6 +84,7 @@ reportlab = "*"
[tool.coverage.run]
concurrency = ["gevent"]
[tool.poetry.group.runtime.dependencies]
jupyterlab = "*"
notebook = "*"
@@ -113,6 +115,7 @@ ignore = ["D1"]
[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.poetry.group.evaluation.dependencies]
streamlit = "*"
whatthepatch = "*"

View File

@@ -49,6 +49,7 @@ async def test_set_agent_state(mock_agent, mock_event_stream):
await controller.set_agent_state_to(AgentState.PAUSED)
assert controller.get_agent_state() == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -65,6 +66,7 @@ async def test_on_event_message_action(mock_agent, mock_event_stream):
message_action = MessageAction(content='Test message')
await controller.on_event(message_action)
assert controller.get_agent_state() == AgentState.RUNNING
await controller.close()
@pytest.mark.asyncio
@@ -81,6 +83,7 @@ async def test_on_event_change_agent_state_action(mock_agent, mock_event_stream)
change_state_action = ChangeAgentStateAction(agent_state=AgentState.PAUSED)
await controller.on_event(change_state_action)
assert controller.get_agent_state() == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -97,6 +100,7 @@ async def test_report_error(mock_agent, mock_event_stream):
await controller.report_error(error_message)
assert controller.state.last_error == error_message
controller.event_stream.add_event.assert_called_once()
await controller.close()
@pytest.mark.asyncio
@@ -116,6 +120,7 @@ async def test_step_with_exception(mock_agent, mock_event_stream):
# Verify that report_error was called with the correct error message
controller.report_error.assert_called_once_with('Malformed action')
await controller.close()
@pytest.mark.asyncio
@@ -134,6 +139,7 @@ async def test_step_max_iterations(mock_agent, mock_event_stream):
await controller._step()
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
assert controller.state.agent_state == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -153,6 +159,7 @@ async def test_step_max_iterations_headless(mock_agent, mock_event_stream):
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
# In headless mode, throttling results in an error
assert controller.state.agent_state == AgentState.ERROR
await controller.close()
@pytest.mark.asyncio
@@ -172,6 +179,7 @@ async def test_step_max_budget(mock_agent, mock_event_stream):
await controller._step()
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
assert controller.state.agent_state == AgentState.PAUSED
await controller.close()
@pytest.mark.asyncio
@@ -192,3 +200,4 @@ async def test_step_max_budget_headless(mock_agent, mock_event_stream):
assert controller.state.traffic_control_state == TrafficControlState.THROTTLING
# In headless mode, throttling results in an error
assert controller.state.agent_state == AgentState.ERROR
await controller.close()

View File

@@ -40,7 +40,6 @@ def temp_toml_file(tmp_path):
@pytest.fixture
def default_config(monkeypatch):
# Fixture to provide a default AppConfig instance
AppConfig.reset()
yield AppConfig()
@@ -501,8 +500,8 @@ def test_api_keys_repr_str():
def test_max_iterations_and_max_budget_per_task_from_toml(temp_toml_file):
temp_toml = """
[core]
max_iterations = 100
max_budget_per_task = 4.0
max_iterations = 42
max_budget_per_task = 4.7
"""
config = AppConfig()
@@ -511,8 +510,8 @@ max_budget_per_task = 4.0
load_from_toml(config, temp_toml_file)
assert config.max_iterations == 100
assert config.max_budget_per_task == 4.0
assert config.max_iterations == 42
assert config.max_budget_per_task == 4.7
def test_get_llm_config_arg(temp_toml_file):

View File

@@ -87,9 +87,6 @@ def test_app_config_attributes_masking(test_handler):
assert 'e2b-xyz789' not in log_output
assert 'ghp_abcdefghijklmnopqrstuvwxyz' not in log_output
# reset the AppConfig
AppConfig.reset()
def test_sensitive_env_vars_masking(test_handler):
logger, stream = test_handler

View File

@@ -0,0 +1,73 @@
import os
import pytest
from pytest import MonkeyPatch
import agenthub # noqa: F401
from openhands.core.exceptions import (
AgentNotRegisteredError,
MicroAgentValidationError,
)
from openhands.utils.microagent import MicroAgent
CONTENT = (
'# dummy header\n' 'dummy content\n' '## dummy subheader\n' 'dummy subcontent\n'
)
def test_micro_agent_load(tmp_path, monkeypatch: MonkeyPatch):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: CodeActAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
# Patch the required environment variable
monkeypatch.setenv('SANDBOX_OPENHANDS_TEST_ENV_VAR', 'dummy_value')
micro_agent = MicroAgent(os.path.join(tmp_path, 'dummy.md'))
assert micro_agent is not None
assert micro_agent.content == CONTENT.strip()
def test_not_existing_agent(tmp_path, monkeypatch: MonkeyPatch):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: NotExistingAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
monkeypatch.setenv('SANDBOX_OPENHANDS_TEST_ENV_VAR', 'dummy_value')
with pytest.raises(AgentNotRegisteredError):
MicroAgent(os.path.join(tmp_path, 'dummy.md'))
def test_not_existing_env_var(tmp_path):
with open(os.path.join(tmp_path, 'dummy.md'), 'w') as f:
f.write(
(
'---\n'
'name: dummy\n'
'agent: CodeActAgent\n'
'require_env_var:\n'
' SANDBOX_OPENHANDS_TEST_ENV_VAR: "Set this environment variable for testing purposes"\n'
'---\n' + CONTENT
)
)
with pytest.raises(MicroAgentValidationError) as excinfo:
MicroAgent(os.path.join(tmp_path, 'dummy.md'))
assert 'Set this environment variable for testing purposes' in str(excinfo.value)

View File

@@ -1,8 +1,10 @@
import os
import shutil
from unittest.mock import Mock
import pytest
from openhands.utils.microagent import MicroAgent
from openhands.utils.prompt import PromptManager
@@ -56,11 +58,19 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
with open(os.path.join(prompt_dir, 'micro', f'{micro_agent_name}.md'), 'w') as f:
f.write(micro_agent_content)
manager = PromptManager(prompt_dir, agent_skills_docs, micro_agent_name)
# Mock MicroAgent
mock_micro_agent = Mock(spec=MicroAgent)
mock_micro_agent.content = micro_agent_content
manager = PromptManager(
prompt_dir=prompt_dir,
agent_skills_docs=agent_skills_docs,
micro_agent=mock_micro_agent,
)
assert manager.prompt_dir == prompt_dir
assert manager.agent_skills_docs == agent_skills_docs
assert manager.micro_agent == micro_agent_content
assert manager.micro_agent == mock_micro_agent
assert isinstance(manager.system_message, str)
assert (
@@ -86,7 +96,7 @@ def test_prompt_manager_with_micro_agent(prompt_dir, agent_skills_docs):
def test_prompt_manager_file_not_found(prompt_dir, agent_skills_docs):
with pytest.raises(FileNotFoundError):
PromptManager(prompt_dir, agent_skills_docs, 'non_existent_micro_agent')
MicroAgent(os.path.join(prompt_dir, 'micro', 'non_existent_micro_agent.md'))
def test_prompt_manager_template_rendering(prompt_dir, agent_skills_docs):