Skip to content

Commit

Permalink
Merge branch 'i18_translations' of https://github.com/Vaishakh-SM/OpenHands into i18_translations
Browse files Browse the repository at this point in the history
  • Loading branch information
Vaishakh-SM committed Oct 27, 2024
2 parents cdfe99d + 1b675ac commit 43e21aa
Show file tree
Hide file tree
Showing 93 changed files with 2,437 additions and 1,529 deletions.
6 changes: 4 additions & 2 deletions Development.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ Otherwise, you can clone the OpenHands project directly.

## Start the server for development
### 1. Requirements
* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [ Ubuntu <= 22.04]
* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install) [Ubuntu <= 22.04]
* [Docker](https://docs.docker.com/engine/install/) (For those on MacOS, make sure to allow the default Docker socket to be used from advanced settings!)
* [Python](https://www.python.org/downloads/) = 3.12
* [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
* [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
* netcat => sudo apt-get install netcat
* OS-specific dependencies:
  - Ubuntu: build-essential => `sudo apt-get install build-essential`
  - WSL: netcat => `sudo apt-get install netcat`

Make sure you have all these dependencies installed before moving on to `make build`.

Expand Down
2 changes: 2 additions & 0 deletions docs/modules/usage/how-to/evaluation-harness.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:

4. Create a function to process each instance:
```python
from openhands.utils.async_utils import call_async_from_sync
def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
config = get_config(instance, metadata)
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

instruction = get_instruction(instance, metadata)
Expand Down
2 changes: 2 additions & 0 deletions evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

game = None

Expand Down Expand Up @@ -119,6 +120,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def get_config(
Expand Down Expand Up @@ -210,6 +211,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

Expand Down
2 changes: 2 additions & 0 deletions evaluation/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

# Configure visibility of unit tests to the Agent.
USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
Expand Down Expand Up @@ -207,6 +208,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

initialize_runtime(runtime, instance=instance)

Expand Down
3 changes: 2 additions & 1 deletion evaluation/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': functools.partial(
Expand Down Expand Up @@ -275,7 +276,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response(state: State) -> str:
Expand Down Expand Up @@ -403,6 +404,7 @@ def execute_sql(db_path, sql):
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')

Expand Down Expand Up @@ -142,6 +143,7 @@ def process_instance(
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -81,6 +82,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
3 changes: 2 additions & 1 deletion evaluation/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
MessageAction,
)
from openhands.events.observation import Observation
from openhands.utils.async_utils import call_async_from_sync

ACTION_FORMAT = """
<<FINAL_ANSWER||
Expand Down Expand Up @@ -215,7 +216,7 @@ def process_instance(
"""

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
state: State | None = asyncio.run(
run_controller(
config=config,
Expand Down
2 changes: 2 additions & 0 deletions evaluation/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

IMPORT_HELPER = {
'python': [
Expand Down Expand Up @@ -233,6 +234,7 @@ def process_instance(

# Here's how you can run the agent (similar to the `main` function) and get the final task state
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)
state: State | None = asyncio.run(
run_controller(
Expand Down
13 changes: 12 additions & 1 deletion evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import MessageAction
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

FAKE_RESPONSES = {
'CodeActAgent': codeact_user_response,
Expand All @@ -33,6 +34,7 @@

def get_config(
metadata: EvalMetadata,
instance_id: str,
) -> AppConfig:
config = AppConfig(
default_agent=metadata.agent_class,
Expand All @@ -49,6 +51,14 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
if metadata.llm_config.log_completions:
metadata.llm_config.log_completions_folder = os.path.join(
metadata.eval_output_dir, 'llm_completions', instance_id
)
logger.info(
f'Logging LLM completions for instance {instance_id} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
return config

Expand All @@ -58,7 +68,7 @@ def process_instance(
metadata: EvalMetadata,
reset_logger: bool = True,
) -> EvalOutput:
config = get_config(metadata)
config = get_config(metadata, instance.instance_id)

# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
Expand Down Expand Up @@ -92,6 +102,7 @@ def process_instance(
# =============================================

runtime: Runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

test_class.initialize_runtime(runtime)

Expand Down
2 changes: 2 additions & 0 deletions evaluation/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -202,6 +203,7 @@ def process_instance(
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
2 changes: 2 additions & 0 deletions evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
BROWSER_EVAL_GET_GOAL_ACTION,
BROWSER_EVAL_GET_REWARDS_ACTION,
)
from openhands.utils.async_utils import call_async_from_sync

SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

Expand Down Expand Up @@ -127,6 +128,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {env_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
task_str = initialize_runtime(runtime)
state: State | None = asyncio.run(
run_controller(
Expand Down
2 changes: 2 additions & 0 deletions evaluation/mint/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync


def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
Expand Down Expand Up @@ -176,6 +177,7 @@ def process_instance(
)

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

state: State | None = asyncio.run(
Expand Down
12 changes: 5 additions & 7 deletions evaluation/mint/tasks/reasoning.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,9 @@ def extract_options(self, prompt: str) -> dict:


def compare_two_numbers(p, gt):
if isinstance(p, int) or isinstance(p, float):
if isinstance(p, (int, float)):
pass
elif isinstance(p, list) or isinstance(p, bool) or isinstance(p, str):
return False
elif isinstance(p, tuple) or isinstance(p, complex) or isinstance(p, dict):
elif isinstance(p, (bool, complex, dict, list, str, tuple)):
return False
else:
raise ValueError(p)
Expand Down Expand Up @@ -227,8 +225,8 @@ def extract_answer(self, solution: str) -> Any:
prediction = prediction.replace('°', '')

# Detect the boolean keyword in the generation
if prediction in ['true', 'yes', 'false', 'no']:
if prediction == 'true' or prediction == 'yes':
if prediction in ('true', 'yes', 'false', 'no'):
if prediction in ('true', 'yes'):
prediction = 'True'
else:
prediction = 'False'
Expand Down Expand Up @@ -342,7 +340,7 @@ def success(self, solution: str) -> bool:
answer_type = self._answer_type
gt = self.extract_answer(self.reference)

if isinstance(prediction, (str, int, float)) or isinstance(prediction, list):
if isinstance(prediction, (str, int, float, list)):
# Comparing prediction against the reference
if answer_type in ['bool', 'option', 'Option']:
cur_correct = int(prediction == f'({gt})') or int(prediction == gt)
Expand Down
2 changes: 2 additions & 0 deletions evaluation/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

config = load_app_config()

Expand Down Expand Up @@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime, instance)

# Run the agent
Expand Down
3 changes: 2 additions & 1 deletion evaluation/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from openhands.core.main import create_runtime
from openhands.events.action import CmdRunAction
from openhands.events.observation import CmdOutputObservation
from openhands.utils.async_utils import call_async_from_sync

# TODO: migrate all swe-bench docker to ghcr.io/openhands
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
Expand Down Expand Up @@ -128,7 +129,7 @@ def process_instance(
)

runtime = create_runtime(config)

call_async_from_sync(runtime.connect)
# Get patch and save it to /tmp/patch.diff
with tempfile.TemporaryDirectory() as temp_dir:
# Patch file
Expand Down
11 changes: 10 additions & 1 deletion evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from openhands.events.serialization.event import event_to_dict
from openhands.runtime.base import Runtime
from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
from openhands.utils.async_utils import call_async_from_sync

USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
Expand Down Expand Up @@ -143,6 +144,14 @@ def get_config(
workspace_base=None,
workspace_mount_path=None,
)
if metadata.llm_config.log_completions:
metadata.llm_config.log_completions_folder = os.path.join(
metadata.eval_output_dir, 'llm_completions', instance['instance_id']
)
logger.info(
f'Logging LLM completions for instance {instance["instance_id"]} to '
f'{metadata.llm_config.log_completions_folder}'
)
config.set_llm_config(metadata.llm_config)
return config

Expand Down Expand Up @@ -372,6 +381,7 @@ def process_instance(
logger.info(f'Starting evaluation for instance {instance.instance_id}.')

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

try:
initialize_runtime(runtime, instance)
Expand Down Expand Up @@ -432,7 +442,6 @@ def process_instance(
metadata=metadata,
history=histories,
metrics=metrics,
llm_completions=state.extra_data.get('llm_completions', []),
error=state.last_error if state and state.last_error else None,
)
return output
Expand Down
2 changes: 2 additions & 0 deletions evaluation/toolqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from openhands.events.action import CmdRunAction, MessageAction
from openhands.events.observation import CmdOutputObservation
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
Expand Down Expand Up @@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})

runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
initialize_runtime(runtime)

# Here's how you can run the agent (similar to the `main` function) and get the final task state
Expand Down
Loading

0 comments on commit 43e21aa

Please sign in to comment.