diff --git a/.gitignore b/.gitignore
index a4bc03c4eeb..0cc7d149d78 100644
--- a/.gitignore
+++ b/.gitignore
@@ -174,6 +174,7 @@ evaluation/bird/data
 evaluation/gaia/data
 evaluation/gorilla/data
 evaluation/toolqa/data
+evaluation/scienceagentbench/benchmark


 # frontend
diff --git a/evaluation/scienceagentbench/Dockerfile b/evaluation/scienceagentbench/Dockerfile
new file mode 100644
index 00000000000..70ed92cc4dc
--- /dev/null
+++ b/evaluation/scienceagentbench/Dockerfile
@@ -0,0 +1,17 @@
+FROM python:3.11-bookworm
+
+
+# For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`.
+# **Please DO NOT redistribute the unzipped data files online.**
+# This downloads a benchmark.zip file to the current directory.
+# Unzip it and put the resulting benchmark folder under evaluation/scienceagentbench/ before building this image.
+
+RUN mkdir -p /benchmark
+COPY benchmark /benchmark
+
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+# pushd evaluation/scienceagentbench
+# docker build -t xingyaoww/openhands-eval-scienceagentbench .
+# popd
diff --git a/evaluation/scienceagentbench/Dockerfile.evaluator b/evaluation/scienceagentbench/Dockerfile.evaluator
new file mode 100644
index 00000000000..f8263e1bb0a
--- /dev/null
+++ b/evaluation/scienceagentbench/Dockerfile.evaluator
@@ -0,0 +1,25 @@
+FROM mambaorg/micromamba:debian12
+
+USER root
+# For https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#code-generation-with-agents
+
+RUN micromamba create -n sci-agent-eval python=3.10 pip setuptools wheel
+RUN micromamba run -n sci-agent-eval pip install pip-tools
+
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+RUN apt-get update && apt-get install -y git
+
+RUN git clone https://github.com/OSU-NLP-Group/ScienceAgentBench.git /workspace/
+RUN git checkout 4eddc7db6449a5ade3e37285747c8b208cd54ce7
+
+RUN micromamba create -n sci-agent python=3.10 pip setuptools wheel
+RUN micromamba run -n sci-agent pip install -r requirements.txt
+
+# Replace all occurrences of conda with micromamba under /workspace
+RUN find ./ -type f -exec sed -i 's/conda/micromamba/g' {} \;
+
+# pushd evaluation/scienceagentbench
+# docker build -t xingyaoww/openhands-eval-scienceagentbench-evaluator -f Dockerfile.evaluator .
+# popd
diff --git a/evaluation/scienceagentbench/README.md b/evaluation/scienceagentbench/README.md
new file mode 100644
index 00000000000..3182c2e117b
--- /dev/null
+++ b/evaluation/scienceagentbench/README.md
@@ -0,0 +1,54 @@
+# ScienceAgentBench Evaluation with OpenHands
+
+This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
+
+## Setup ScienceAgentBench
+
+To prevent benchmark data contamination, we only provide the annotation sheet on [Huggingface](https://huggingface.co/datasets/osunlp/ScienceAgentBench), which includes all necessary *inputs* to run an agent. To let the agent explore the task datasets, download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG), unzip it with password `scienceagentbench`, and place the resulting `benchmark` folder under `evaluation/scienceagentbench/`. **Please DO NOT redistribute the unzipped data files online.**
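+
+To take a quick look at the annotation sheet before running the harness, here is a minimal sketch (mirroring how `run_infer.py` loads it):
+
+```python
+from datasets import load_dataset
+
+# Load the ScienceAgentBench annotation sheet (validation split), as run_infer.py does.
+sab = load_dataset('osunlp/ScienceAgentBench', split='validation')
+
+example = sab[0]
+print(example['instance_id'])
+print(example['task_inst'])            # natural-language task instruction
+print(example['dataset_folder_tree'])  # directory tree of the task's dataset files
+```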
+
+## Run Inference on ScienceAgentBench
+
+```bash
+./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [num_workers]
+
+# Example
+./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3
+```
+
+where `model_config` is mandatory, and the rest are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would like to evaluate. It could also be a release tag like `0.6.2`.
+- `use_knowledge`, e.g. `true`, specifies whether the agent is allowed to use expert-provided knowledge as additional input. By default, it is set to `false`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent to use for the benchmark, defaulting to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, the script evaluates the entire ScienceAgentBench validation set. Note: since the arguments are positional, in order to use `eval_limit` you must also set `use_knowledge` and `agent`.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By default, it is set to 1.
+
+The maximum number of iterations is fixed at 30 by `run_infer.sh`.
+
+## Evaluate Generated Programs
+
+### Extract Necessary Information from OpenHands Log
+
+After inference is completed, you can use the following command to extract the information needed for evaluation from the output log:
+
+```bash
+python evaluation/scienceagentbench/post_proc.py [log_fname]
+```
+
+- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.
+
+The output will be written to e.g. `evaluation/.../output.converted.jsonl`; see the snippet at the end of this README for the record format.
+
+### Run Evaluation
+
+Please follow the steps [here](https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#evaluation-of-generated-code) to evaluate the generated programs.
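+
+For reference, each line of the converted file produced by `post_proc.py` is a JSON object with the fields `instance_id`, `instruction`, `test_result` (which holds the generated program under the `program` key), and `cost`. A minimal sketch for inspecting it (the path is illustrative):
+
+```python
+import json
+
+# Illustrative path; point this at the file produced by post_proc.py.
+converted_path = 'evaluation/.../output.converted.jsonl'
+
+with open(converted_path, encoding='utf-8') as f:
+    for line in f:
+        record = json.loads(line)
+        print(record['instance_id'], record['cost'])
+        # The agent-generated program extracted from the trajectory.
+        print(record['test_result']['program'][:200])
+```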
diff --git a/evaluation/scienceagentbench/post_proc.py b/evaluation/scienceagentbench/post_proc.py new file mode 100644 index 00000000000..46cfbe2b2a7 --- /dev/null +++ b/evaluation/scienceagentbench/post_proc.py @@ -0,0 +1,30 @@ +import json +from argparse import ArgumentParser + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument( + 'log_fname', + type=str, + ) + args = parser.parse_args() + + fname = args.log_fname + out_fname = args.log_fname.replace('.jsonl', '.converted.jsonl') + + log = [json.loads(line) for line in open(fname)] + + simple_log = [ + json.dumps( + { + 'instance_id': ex['instance_id'], + 'instruction': ex['instruction'], + 'test_result': ex['test_result'], + 'cost': ex['metrics']['accumulated_cost'], + } + ) + for ex in log + ] + + with open(out_fname, 'w+', encoding='utf-8') as f: + f.write('\n'.join(simple_log)) diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/scienceagentbench/run_infer.py new file mode 100644 index 00000000000..52e42f29f12 --- /dev/null +++ b/evaluation/scienceagentbench/run_infer.py @@ -0,0 +1,292 @@ +import asyncio +import os +from typing import Any + +import pandas as pd +from datasets import load_dataset +from tqdm import tqdm + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + codeact_user_response, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + get_parser, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, +} + +LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'benchmark') + + +def format_task_dict(example, use_knowledge): + task = { + 'instance_id': example['instance_id'], + 'task_inst': example['task_inst'], + 'dataset_path': '/benchmark/datasets/' + + example['dataset_folder_tree'].split('\n')[0][4:], + 'dataset_folder_tree': example['dataset_folder_tree'], + 'dataset_preview': example['dataset_preview'], + 'pred_program_name': 'pred_' + example['gold_program_name'], + } + + if use_knowledge: + task['task_inst'] += '\n' + str(example['domain_knowledge']) + + return task + + +def get_config( + metadata: EvalMetadata, + instance_id: str, +) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'eventstream'), + max_budget_per_task=4, + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench', + enable_auto_lint=True, + use_host_network=False, + timeout=300, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_remote_runtime_alive=False, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + config.set_llm_config(metadata.llm_config) + if metadata.llm_config.log_completions: + metadata.llm_config.log_completions_folder = os.path.join( + metadata.eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + 
f'Logging LLM completions for instance {instance_id} to ' + f'{metadata.llm_config.log_completions_folder}' + ) + return config + + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, # this argument is not required +): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}") + obs: CmdOutputObservation + + # Set up workspace directories + action = CmdRunAction(command='mkdir -p /workspace/pred_programs') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='mkdir -p /workspace/pred_results') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + dataset_name = instance['dataset_folder_tree'].split('\n')[0][4:].rstrip('/') + + # Copy the dataset to the workspace + dataset_dir = os.path.join( + LOCAL_DATASET_PATH, + 'datasets', + dataset_name, + ) + runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True) + + # Check the dataset exists + action = CmdRunAction( + command='cd /workspace/benchmark/datasets && ls', + keep_prompt=False, + ) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + assert dataset_name in obs.content + + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called before the runtime is used to run the agent. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}") + obs: CmdOutputObservation + + test_result = {} + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + assert obs.exit_code == 0 + + action = CmdRunAction( + command=f'cat pred_programs/{instance.pred_program_name}', + keep_prompt=False, + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + if obs.exit_code == 0: + test_result = {'program': obs.content} + else: + test_result = {'program': 'ERROR'} + + logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}") + return test_result + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + instance_id = instance.instance_id.replace('/', '__') + config = get_config(metadata, instance_id) + + # Set up the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance_id}.') + + instruction = f"""You are an expert Python programming assistant that helps scientist users to write high-quality code to solve their tasks. +Given a user request, you are expected to write a complete program that accomplishes the requested task and save any outputs to `/workspace/pred_results/` in the correct format. + +Here's the user request you need to work on: +{instance.task_inst} + +You can access the dataset at `{instance.dataset_path}`. 
Here is the directory structure of the dataset: +``` +{instance.dataset_folder_tree} +``` +Here are some helpful previews for the dataset file(s): +{instance.dataset_preview} + +Please save your program as `/workspace/pred_programs/{instance.pred_program_name}`. +Then, please run the program to check and fix any errors. +Please do NOT run the program in the background. +If the program uses some packages that are incompatible, please figure out alternative implementations and do NOT restart the environment. + +""" + + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) + ) + + # ======= Attempt to evaluate the agent's edits ======= + test_result = complete_runtime(runtime, instance) + + # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) + # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. + if state is None: + raise ValueError('State should not be None.') + metrics = state.metrics.get() if state.metrics else None + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = state.history.compatibility_for_eval_history_pairs() + + # Save the output + output = EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +if __name__ == '__main__': + parser = get_parser() + parser.add_argument( + '--use_knowledge', + type=str, + default='false', + choices=['true', 'false'], + help='use expert-provided knowledge or not', + ) + args, _ = parser.parse_known_args() + + sab_dataset = load_dataset('osunlp/ScienceAgentBench', split='validation') + + dataset_processed = [] + for example in tqdm(sab_dataset): + dataset_processed.append( + format_task_dict(example, args.use_knowledge == 'true') + ) + + dataset = pd.DataFrame(dataset_processed) + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'ScienceAgentBench', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + dataset['instance_id'] = dataset['instance_id'].apply(str) + instances = prepare_dataset(dataset, output_file, args.eval_n_limit) + + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance + ) diff --git a/evaluation/scienceagentbench/scripts/run_infer.sh b/evaluation/scienceagentbench/scripts/run_infer.sh new file mode 100755 index 00000000000..7667e572378 --- /dev/null +++ b/evaluation/scienceagentbench/scripts/run_infer.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + 
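+# The script reads six positional arguments, documented in README.md.
+# Hypothetical example invocation (values shown for illustration only):
+#   ./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o HEAD false CodeActAgent 10 1
+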
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+USE_KNOWLEDGE=$3
+AGENT=$4
+EVAL_LIMIT=$5
+NUM_WORKERS=$6
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, using default: $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, using default: CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$USE_KNOWLEDGE" ]; then
+  echo "Use knowledge not specified, using default: false"
+  USE_KNOWLEDGE=false
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --use_knowledge $USE_KNOWLEDGE \
+  --max-iterations 30 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND
diff --git a/openhands/runtime/impl/e2b/sandbox.py b/openhands/runtime/impl/e2b/sandbox.py
index 666dc43f701..d145dac3511 100644
--- a/openhands/runtime/impl/e2b/sandbox.py
+++ b/openhands/runtime/impl/e2b/sandbox.py
@@ -4,9 +4,7 @@
 from glob import glob

 from e2b import Sandbox as E2BSandbox
-from e2b.sandbox.exception import (
-    TimeoutException,
-)
+from e2b.sandbox.exception import TimeoutException

 from openhands.core.config import SandboxConfig
 from openhands.core.logger import openhands_logger as logger