From 009b7f888b4476a588c0873dac6888a10b7f5934 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 30 Oct 2024 12:50:31 +0000 Subject: [PATCH 1/7] Fix issue #4629: '[Bug]: Replace claude-3-5-sonnet-20240620 with claude-3-5-sonnet-20241022' --- .../usage/how-to/evaluation-harness.md | 4 +- .../current/usage/how-to/headless-mode.md | 42 --- .../current/usage/how-to/cli-mode.md | 92 ------ .../usage/how-to/evaluation-harness.md | 266 ------------------ .../current/usage/how-to/headless-mode.md | 4 +- docs/modules/usage/how-to/cli-mode.md | 4 +- .../usage/how-to/evaluation-harness.md | 254 +---------------- docs/modules/usage/how-to/headless-mode.md | 4 +- .../utils/extractModelAndProvider.test.ts | 5 +- .../utils/organizeModelsAndProviders.test.ts | 7 +- frontend/src/utils/verified-models.ts | 3 +- tests/unit/test_prompt_caching.py | 3 +- 12 files changed, 27 insertions(+), 661 deletions(-) diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index ec1fdfc70ef..d027a0ead92 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -14,7 +14,8 @@ Voici un exemple de fichier de configuration que vous pouvez utiliser pour défi ```toml [llm] # IMPORTANT : ajoutez votre clé API ici et définissez le modèle que vous souhaitez évaluer -model = "claude-3-5-sonnet-20240620" +model = "claude-3-5-sonnet-20241022" + api_key = "sk-XXX" [llm.eval_gpt4_1106_preview_llm] @@ -278,3 +279,4 @@ Cette fonction fait ce qui suit : 3. Si l'agent a fait plusieurs tentatives, il lui donne la possibilité d'abandonner En utilisant cette fonction, vous pouvez garantir un comportement cohérent sur plusieurs exécutions d'évaluation et empêcher l'agent de rester bloqué en attendant une entrée humaine. + diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md index 10ba7ec1865..47430074713 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md @@ -13,46 +13,4 @@ Pour exécuter OpenHands en mode sans interface avec Python, [suivez les instructions de configuration de développement](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md), puis exécutez : -```bash -poetry run python -m openhands.core.main -t "write a bash script that prints hi" -``` -Vous devrez vous assurer de définir votre modèle, votre clé API et d'autres paramètres via des variables d'environnement -[ou le fichier `config.toml`](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml). - -## Avec Docker - -1. Définissez `WORKSPACE_BASE` sur le répertoire que vous voulez qu'OpenHands modifie : - -```bash -WORKSPACE_BASE=$(pwd)/workspace -``` - -2. Définissez `LLM_MODEL` sur le modèle que vous voulez utiliser : - -```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" -``` - -3. Définissez `LLM_API_KEY` sur votre clé API : - -```bash -LLM_API_KEY="sk_test_12345" -``` - -4. 
Exécutez la commande Docker suivante : - -```bash -docker run -it \ - --pull=always \ - -e SANDBOX_USER_ID=$(id -u) \ - -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ - -e LLM_API_KEY=$LLM_API_KEY \ - -e LLM_MODEL=$LLM_MODEL \ - -v $WORKSPACE_BASE:/opt/workspace_base \ - -v /var/run/docker.sock:/var/run/docker.sock \ - --add-host host.docker.internal:host-gateway \ - --name openhands-app-$(date +%Y%m%d%H%M%S) \ - ghcr.io/all-hands-ai/openhands:0.11 \ - python -m openhands.core.main -t "write a bash script that prints hi" -``` diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md index ec9134f5d3b..4609b3aeebc 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md @@ -14,96 +14,4 @@ OpenHands 可以在交互式命令行模式下运行,允许用户通过命令行 2. 运行以下命令: -```bash -poetry run python -m openhands.core.cli -``` -该命令将启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。 - -你需要确保通过环境变量[或 `config.toml` 文件](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml)设置你的模型、API 密钥和其他设置。 - - -## 使用 Docker - -要在 Docker 中以命令行模式运行 OpenHands,请按照以下步骤操作: - -1. 将 `WORKSPACE_BASE` 设置为你希望 OpenHands 编辑的目录: - -```bash -WORKSPACE_BASE=$(pwd)/workspace -``` - -2. 将 `LLM_MODEL` 设置为你要使用的模型: - -```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" -``` - -3. 将 `LLM_API_KEY` 设置为你的 API 密钥: - -```bash -LLM_API_KEY="sk_test_12345" -``` - -4. 运行以下 Docker 命令: - -```bash -docker run -it \ - --pull=always \ - -e SANDBOX_USER_ID=$(id -u) \ - -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ - -e LLM_API_KEY=$LLM_API_KEY \ - -e LLM_MODEL=$LLM_MODEL \ - -v $WORKSPACE_BASE:/opt/workspace_base \ - -v /var/run/docker.sock:/var/run/docker.sock \ - --add-host host.docker.internal:host-gateway \ - --name openhands-app-$(date +%Y%m%d%H%M%S) \ - ghcr.io/all-hands-ai/openhands:0.11 \ - python -m openhands.core.cli -``` - -该命令将在 Docker 中启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。 - -## 命令行命令和预期输出示例 - -以下是一些命令行命令及其预期输出的示例: - -### 示例 1: 简单任务 - -```bash -How can I help? >> Write a Python script that prints "Hello, World!" -``` - -预期输出: - -```bash -🤖 Sure! Here is a Python script that prints "Hello, World!": - -❯ print("Hello, World!") -``` - -### 示例 2: Bash 命令 - -```bash -How can I help? >> Create a directory named "test_dir" -``` - -预期输出: - -```bash -🤖 Creating a directory named "test_dir": - -❯ mkdir test_dir -``` - -### 示例 3: 错误处理 - -```bash -How can I help? >> Delete a non-existent file -``` - -预期输出: - -```bash -🤖 An error occurred. Please try again. 
-``` diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index f46a42c8eab..2ce588da11e 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -9,270 +9,4 @@ 以下是一个示例配置文件,您可以使用它来定义和使用多个 LLM: -```toml -[llm] -# 重要:在此处添加您的 API 密钥,并将模型设置为您要评估的模型 -model = "claude-3-5-sonnet-20240620" -api_key = "sk-XXX" -[llm.eval_gpt4_1106_preview_llm] -model = "gpt-4-1106-preview" -api_key = "XXX" -temperature = 0.0 - -[llm.eval_some_openai_compatible_model_llm] -model = "openai/MODEL_NAME" -base_url = "https://OPENAI_COMPATIBLE_URL/v1" -api_key = "XXX" -temperature = 0.0 -``` - - -## 如何在命令行中使用 OpenHands - -可以使用以下格式从命令行运行 OpenHands: - -```bash -poetry run python ./openhands/core/main.py \ - -i \ - -t "" \ - -c \ - -l -``` - -例如: - -```bash -poetry run python ./openhands/core/main.py \ - -i 10 \ - -t "Write me a bash script that prints hello world." \ - -c CodeActAgent \ - -l llm -``` - -此命令使用以下参数运行 OpenHands: -- 最大迭代次数为 10 -- 指定的任务描述 -- 使用 CodeActAgent -- 使用 `config.toml` 文件的 `llm` 部分中定义的 LLM 配置 - -## OpenHands 如何工作 - -OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工作原理的简化流程: - -1. 解析命令行参数并加载配置 -2. 使用 `create_runtime()` 创建运行时环境 -3. 初始化指定的代理 -4. 使用 `run_controller()` 运行控制器,它: - - 将运行时附加到代理 - - 执行代理的任务 - - 完成后返回最终状态 - -`run_controller()` 函数是 OpenHands 执行的核心。它管理代理、运行时和任务之间的交互,处理用户输入模拟和事件处理等事项。 - - -## 入门最简单的方法:探索现有基准 - -我们鼓励您查看我们仓库的 [`evaluation/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation)中提供的各种评估基准。 - -要集成您自己的基准,我们建议从最接近您需求的基准开始。这种方法可以显著简化您的集成过程,允许您在现有结构的基础上进行构建并使其适应您的特定要求。 - -## 如何创建评估工作流 - - -要为您的基准创建评估工作流,请按照以下步骤操作: - -1. 导入相关的 OpenHands 实用程序: - ```python - import openhands.agenthub - from evaluation.utils.shared import ( - EvalMetadata, - EvalOutput, - make_metadata, - prepare_dataset, - reset_logger_for_multiprocessing, - run_evaluation, - ) - from openhands.controller.state.state import State - from openhands.core.config import ( - AppConfig, - SandboxConfig, - get_llm_config_arg, - parse_arguments, - ) - from openhands.core.logger import openhands_logger as logger - from openhands.core.main import create_runtime, run_controller - from openhands.events.action import CmdRunAction - from openhands.events.observation import CmdOutputObservation, ErrorObservation - from openhands.runtime.runtime import Runtime - ``` - -2. 创建配置: - ```python - def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: - config = AppConfig( - default_agent=metadata.agent_class, - runtime='eventstream', - max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='your_container_image', - enable_auto_lint=True, - timeout=300, - ), - ) - config.set_llm_config(metadata.llm_config) - return config - ``` - -3. 初始化运行时并设置评估环境: - ```python - def initialize_runtime(runtime: Runtime, instance: pd.Series): - # 在此处设置您的评估环境 - # 例如,设置环境变量、准备文件等 - pass - ``` - -4. 
创建一个函数来处理每个实例: - ```python - from openhands.utils.async_utils import call_async_from_sync - def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: - config = get_config(instance, metadata) - runtime = create_runtime(config) - call_async_from_sync(runtime.connect) - initialize_runtime(runtime, instance) - - instruction = get_instruction(instance, metadata) - - state = run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=your_user_response_function, - ) - - # 评估代理的操作 - evaluation_result = await evaluate_agent_actions(runtime, instance) - - return EvalOutput( - instance_id=instance.instance_id, - instruction=instruction, - test_result=evaluation_result, - metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), - metrics=state.metrics.get() if state.metrics else None, - error=state.last_error if state and state.last_error else None, - ) - ``` - -5. 运行评估: - ```python - metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir) - output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') - instances = prepare_dataset(your_dataset, output_file, eval_n_limit) - - await run_evaluation( - instances, - metadata, - output_file, - num_workers, - process_instance - ) - ``` - -此工作流设置配置,初始化运行时环境,通过运行代理并评估其操作来处理每个实例,然后将结果收集到 `EvalOutput` 对象中。`run_evaluation` 函数处理并行化和进度跟踪。 - -请记住根据您特定的基准要求自定义 `get_instruction`、`your_user_response_function` 和 `evaluate_agent_actions` 函数。 - -通过遵循此结构,您可以在 OpenHands 框架内为您的基准创建强大的评估工作流。 - - -## 理解 `user_response_fn` - -`user_response_fn` 是 OpenHands 评估工作流中的关键组件。它模拟用户与代理的交互,允许在评估过程中自动响应。当您想要为代理的查询或操作提供一致的、预定义的响应时,此函数特别有用。 - - -### 工作流和交互 - -处理操作和 `user_response_fn` 的正确工作流如下: - -1. 代理接收任务并开始处理 -2. 代理发出操作 -3. 如果操作可执行(例如 CmdRunAction、IPythonRunCellAction): - - 运行时处理操作 - - 运行时返回观察结果 -4. 如果操作不可执行(通常是 MessageAction): - - 调用 `user_response_fn` - - 它返回模拟的用户响应 -5. 代理接收观察结果或模拟响应 -6. 重复步骤 2-5,直到任务完成或达到最大迭代次数 - -以下是更准确的可视化表示: - -``` - [代理] - | - v - [发出操作] - | - v - [操作是否可执行?] - / \ - 是 否 - | | - v v - [运行时] [user_response_fn] - | | - v v - [返回观察结果] [模拟响应] - \ / - \ / - v v - [代理接收反馈] - | - v - [继续或完成任务] -``` - -在此工作流中: - -- 可执行的操作(如运行命令或执行代码)由运行时直接处理 -- 不可执行的操作(通常是当代理想要通信或寻求澄清时)由 `user_response_fn` 处理 -- 然后,代理处理反馈,无论是来自运行时的观察结果还是来自 `user_response_fn` 的模拟响应 - -这种方法允许自动处理具体操作和模拟用户交互,使其适用于您想要测试代理在最少人工干预的情况下完成任务的能力的评估场景。 - -### 示例实现 - -以下是 SWE-Bench 评估中使用的 `user_response_fn` 示例: - -```python -def codeact_user_response(state: State | None) -> str: - msg = ( - 'Please continue working on the task on whatever approach you think is suitable.\n' - 'If you think you have solved the task, please first send your answer to user through message and then exit .\n' - 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n' - ) - - if state and state.history: - # 检查代理是否已尝试与用户对话 3 次,如果是,让代理知道它可以放弃 - user_msgs = [ - event - for event in state.history.get_events() - if isinstance(event, MessageAction) and event.source == 'user' - ] - if len(user_msgs) >= 2: - # 让代理知道它在尝试 3 次后可以放弃 - return ( - msg - + 'If you want to give up, run: exit .\n' - ) - return msg -``` - -此函数执行以下操作: - -1. 提供一条标准消息,鼓励代理继续工作 -2. 检查代理尝试与用户通信的次数 -3. 
如果代理已多次尝试,它会提供放弃的选项 - -通过使用此函数,您可以确保在多次评估运行中保持一致的行为,并防止代理在等待人工输入时陷入困境。 diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md index 791a0e05e8e..8beacdd208b 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md @@ -32,7 +32,8 @@ WORKSPACE_BASE=$(pwd)/workspace 2. 将 `LLM_MODEL` 设置为你要使用的模型: ```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" + ``` 3. 将 `LLM_API_KEY` 设置为你的 API 密钥: @@ -57,3 +58,4 @@ docker run -it \ ghcr.io/all-hands-ai/openhands:0.11 \ python -m openhands.core.main -t "write a bash script that prints hi" ``` + diff --git a/docs/modules/usage/how-to/cli-mode.md b/docs/modules/usage/how-to/cli-mode.md index 572f40e2193..e5d4a504679 100644 --- a/docs/modules/usage/how-to/cli-mode.md +++ b/docs/modules/usage/how-to/cli-mode.md @@ -35,7 +35,8 @@ WORKSPACE_BASE=$(pwd)/workspace 2. Set `LLM_MODEL` to the model you want to use: ```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" + ``` 3. Set `LLM_API_KEY` to your API key: @@ -106,3 +107,4 @@ Expected Output: ```bash 🤖 An error occurred. Please try again. ``` + diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index daf144d11e8..04ce1a45d71 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -9,10 +9,13 @@ OpenHands in development mode uses `config.toml` to keep track of most configura Here's an example configuration file you can use to define and use multiple LLMs: + + + ```toml [llm] # IMPORTANT: add your API key here, and set the model to the one you want to evaluate -model = "claude-3-5-sonnet-20240620" +model = "claude-3-5-sonnet-20241022" api_key = "sk-XXX" [llm.eval_gpt4_1106_preview_llm] @@ -27,252 +30,3 @@ api_key = "XXX" temperature = 0.0 ``` - -## How to use OpenHands in the command line - -OpenHands can be run from the command line using the following format: - -```bash -poetry run python ./openhands/core/main.py \ - -i \ - -t "" \ - -c \ - -l -``` - -For example: - -```bash -poetry run python ./openhands/core/main.py \ - -i 10 \ - -t "Write me a bash script that prints hello world." \ - -c CodeActAgent \ - -l llm -``` - -This command runs OpenHands with: -- A maximum of 10 iterations -- The specified task description -- Using the CodeActAgent -- With the LLM configuration defined in the `llm` section of your `config.toml` file - -## How does OpenHands work - -The main entry point for OpenHands is in `openhands/core/main.py`. Here's a simplified flow of how it works: - -1. Parse command-line arguments and load the configuration -2. Create a runtime environment using `create_runtime()` -3. Initialize the specified agent -4. Run the controller using `run_controller()`, which: - - Attaches the runtime to the agent - - Executes the agent's task - - Returns a final state when complete - -The `run_controller()` function is the core of OpenHands's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing. 
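
To make that flow concrete, here is a rough sketch of driving OpenHands programmatically, pieced together from the snippets that appear elsewhere in these docs (`AppConfig`, `SandboxConfig`, `create_runtime`, `run_controller`, `call_async_from_sync`). It is not the project's canonical entry point: the container image, the `get_llm_config_arg('llm')` lookup, and the exact `run_controller` signature are assumptions based on those snippets, and in some versions `run_controller` is a coroutine that must be awaited.

```python
# Rough sketch only -- mirrors the call pattern shown in the evaluation examples
# in these docs; exact signatures may differ between OpenHands versions.
from openhands.core.config import AppConfig, SandboxConfig, get_llm_config_arg
from openhands.core.main import create_runtime, run_controller
from openhands.utils.async_utils import call_async_from_sync

# Assumption: 'llm' names the [llm] section of config.toml, as with the -l flag.
llm_config = get_llm_config_arg('llm')

config = AppConfig(
    default_agent='CodeActAgent',
    runtime='eventstream',
    max_iterations=10,
    sandbox=SandboxConfig(base_container_image='your_container_image', timeout=300),
)
config.set_llm_config(llm_config)

# Create the runtime and connect to it before handing it to the controller.
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)

# In some versions run_controller is async and must be awaited / run via asyncio.
state = run_controller(
    config=config,
    task_str='Write me a bash script that prints hello world.',
    runtime=runtime,
)
print('finished with state:', state)
```

Inside the Docker images shown earlier, the equivalent entry point is `python -m openhands.core.main -t "write a bash script that prints hi"`.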
- - -## Easiest way to get started: Exploring Existing Benchmarks - -We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) of our repository. - -To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements. - -## How to create an evaluation workflow - - -To create an evaluation workflow for your benchmark, follow these steps: - -1. Import relevant OpenHands utilities: - ```python - import openhands.agenthub - from evaluation.utils.shared import ( - EvalMetadata, - EvalOutput, - make_metadata, - prepare_dataset, - reset_logger_for_multiprocessing, - run_evaluation, - ) - from openhands.controller.state.state import State - from openhands.core.config import ( - AppConfig, - SandboxConfig, - get_llm_config_arg, - parse_arguments, - ) - from openhands.core.logger import openhands_logger as logger - from openhands.core.main import create_runtime, run_controller - from openhands.events.action import CmdRunAction - from openhands.events.observation import CmdOutputObservation, ErrorObservation - from openhands.runtime.runtime import Runtime - ``` - -2. Create a configuration: - ```python - def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: - config = AppConfig( - default_agent=metadata.agent_class, - runtime='eventstream', - max_iterations=metadata.max_iterations, - sandbox=SandboxConfig( - base_container_image='your_container_image', - enable_auto_lint=True, - timeout=300, - ), - ) - config.set_llm_config(metadata.llm_config) - return config - ``` - -3. Initialize the runtime and set up the evaluation environment: - ```python - def initialize_runtime(runtime: Runtime, instance: pd.Series): - # Set up your evaluation environment here - # For example, setting environment variables, preparing files, etc. - pass - ``` - -4. Create a function to process each instance: - ```python - from openhands.utils.async_utils import call_async_from_sync - def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: - config = get_config(instance, metadata) - runtime = create_runtime(config) - call_async_from_sync(runtime.connect) - initialize_runtime(runtime, instance) - - instruction = get_instruction(instance, metadata) - - state = run_controller( - config=config, - task_str=instruction, - runtime=runtime, - fake_user_response_fn=your_user_response_function, - ) - - # Evaluate the agent's actions - evaluation_result = await evaluate_agent_actions(runtime, instance) - - return EvalOutput( - instance_id=instance.instance_id, - instruction=instruction, - test_result=evaluation_result, - metadata=metadata, - history=state.history.compatibility_for_eval_history_pairs(), - metrics=state.metrics.get() if state.metrics else None, - error=state.last_error if state and state.last_error else None, - ) - ``` - -5. 
Run the evaluation: - ```python - metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir) - output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') - instances = prepare_dataset(your_dataset, output_file, eval_n_limit) - - await run_evaluation( - instances, - metadata, - output_file, - num_workers, - process_instance - ) - ``` - -This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking. - -Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements. - -By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenHands framework. - - -## Understanding the `user_response_fn` - -The `user_response_fn` is a crucial component in OpenHands's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions. - - -### Workflow and Interaction - -The correct workflow for handling actions and the `user_response_fn` is as follows: - -1. Agent receives a task and starts processing -2. Agent emits an Action -3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction): - - The Runtime processes the Action - - Runtime returns an Observation -4. If the Action is not executable (typically a MessageAction): - - The `user_response_fn` is called - - It returns a simulated user response -5. The agent receives either the Observation or the simulated response -6. Steps 2-5 repeat until the task is completed or max iterations are reached - -Here's a more accurate visual representation: - -``` - [Agent] - | - v - [Emit Action] - | - v - [Is Action Executable?] - / \ - Yes No - | | - v v - [Runtime] [user_response_fn] - | | - v v - [Return Observation] [Simulated Response] - \ / - \ / - v v - [Agent receives feedback] - | - v - [Continue or Complete Task] -``` - -In this workflow: - -- Executable actions (like running commands or executing code) are handled directly by the Runtime -- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn` -- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn` - -This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention. 
- -### Example Implementation - -Here's an example of a `user_response_fn` used in the SWE-Bench evaluation: - -```python -def codeact_user_response(state: State | None) -> str: - msg = ( - 'Please continue working on the task on whatever approach you think is suitable.\n' - 'If you think you have solved the task, please first send your answer to user through message and then exit .\n' - 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n' - ) - - if state and state.history: - # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up - user_msgs = [ - event - for event in state.history.get_events() - if isinstance(event, MessageAction) and event.source == 'user' - ] - if len(user_msgs) >= 2: - # let the agent know that it can give up when it has tried 3 times - return ( - msg - + 'If you want to give up, run: exit .\n' - ) - return msg -``` - -This function does the following: - -1. Provides a standard message encouraging the agent to continue working -2. Checks how many times the agent has attempted to communicate with the user -3. If the agent has made multiple attempts, it provides an option to give up - -By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input. diff --git a/docs/modules/usage/how-to/headless-mode.md b/docs/modules/usage/how-to/headless-mode.md index 9843b2622e8..ed585302b07 100644 --- a/docs/modules/usage/how-to/headless-mode.md +++ b/docs/modules/usage/how-to/headless-mode.md @@ -29,7 +29,8 @@ WORKSPACE_BASE=$(pwd)/workspace 2. Set `LLM_MODEL` to the model you want to use: ```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" + ``` 3. Set `LLM_API_KEY` to your API key: @@ -54,3 +55,4 @@ docker run -it \ ghcr.io/all-hands-ai/openhands:0.11 \ python -m openhands.core.main -t "write a bash script that prints hi" ``` + diff --git a/frontend/__tests__/utils/extractModelAndProvider.test.ts b/frontend/__tests__/utils/extractModelAndProvider.test.ts index 2ec00e50abd..6ea84db241d 100644 --- a/frontend/__tests__/utils/extractModelAndProvider.test.ts +++ b/frontend/__tests__/utils/extractModelAndProvider.test.ts @@ -59,9 +59,9 @@ describe("extractModelAndProvider", () => { separator: "/", }); - expect(extractModelAndProvider("claude-3-5-sonnet-20240620")).toEqual({ + expect(extractModelAndProvider("claude-3-5-sonnet-20241022")).toEqual({ provider: "anthropic", - model: "claude-3-5-sonnet-20240620", + model: "claude-3-5-sonnet-20241022", separator: "/", }); @@ -78,3 +78,4 @@ describe("extractModelAndProvider", () => { }); }); }); + diff --git a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts index 40a93a88d78..a38af64f663 100644 --- a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts +++ b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts @@ -4,7 +4,7 @@ import { organizeModelsAndProviders } from "../../src/utils/organizeModelsAndPro test("organizeModelsAndProviders", () => { const models = [ "azure/ada", - "azure/gpt-35-turbo", + "azure/gpt-35-turbo", "azure/gpt-3-turbo", "azure/standard/1024-x-1024/dall-e-2", "vertex_ai_beta/chat-bison", @@ -15,7 +15,7 @@ test("organizeModelsAndProviders", () => { "gpt-4o", "together-ai-21.1b-41b", "gpt-4o-mini", - "claude-3-5-sonnet-20240620", + "claude-3-5-sonnet-20241022", "claude-3-haiku-20240307", "claude-2", "claude-2.1", @@ -51,7 +51,7 @@ 
test("organizeModelsAndProviders", () => { anthropic: { separator: "/", models: [ - "claude-3-5-sonnet-20240620", + "claude-3-5-sonnet-20241022", "claude-3-haiku-20240307", "claude-2", "claude-2.1", @@ -63,3 +63,4 @@ test("organizeModelsAndProviders", () => { }, }); }); + diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts index 432a5439121..f0493f0d133 100644 --- a/frontend/src/utils/verified-models.ts +++ b/frontend/src/utils/verified-models.ts @@ -1,6 +1,6 @@ // Here are the list of verified models and providers that we know work well with OpenHands. export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic"]; -export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20240620"]; +export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20241022"]; // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`) @@ -27,3 +27,4 @@ export const VERIFIED_ANTHROPIC_MODELS = [ "claude-instant-1", "claude-instant-1.2", ]; + diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py index d038f18d9c6..10063fc52d8 100644 --- a/tests/unit/test_prompt_caching.py +++ b/tests/unit/test_prompt_caching.py @@ -14,7 +14,7 @@ @pytest.fixture def mock_llm(): llm = Mock(spec=LLM) - llm.config = LLMConfig(model='claude-3-5-sonnet-20240620', caching_prompt=True) + llm.config = LLMConfig(model='claude-3-5-sonnet-20241022', caching_prompt=True) llm.is_caching_prompt_active.return_value = True return llm @@ -259,3 +259,4 @@ def check_headers(**kwargs): # Assert assert isinstance(result, MessageAction) assert result.content == 'Hello! How can I assist you today?' + From 3d936fdbbbdedc6fc130fc03100a9f14ead498d2 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Wed, 30 Oct 2024 11:53:44 -0400 Subject: [PATCH 2/7] Fix deleted content --- .../current/usage/how-to/headless-mode.md | 42 +++ .../current/usage/how-to/cli-mode.md | 92 ++++++ .../usage/how-to/evaluation-harness.md | 266 ++++++++++++++++++ .../usage/how-to/evaluation-harness.md | 252 ++++++++++++++++- 4 files changed, 649 insertions(+), 3 deletions(-) diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md index 47430074713..a6cfc512057 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/headless-mode.md @@ -13,4 +13,46 @@ Pour exécuter OpenHands en mode sans interface avec Python, [suivez les instructions de configuration de développement](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md), puis exécutez : +```bash +poetry run python -m openhands.core.main -t "write a bash script that prints hi" +``` +Vous devrez vous assurer de définir votre modèle, votre clé API et d'autres paramètres via des variables d'environnement +[ou le fichier `config.toml`](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml). + +## Avec Docker + +1. Définissez `WORKSPACE_BASE` sur le répertoire que vous voulez qu'OpenHands modifie : + +```bash +WORKSPACE_BASE=$(pwd)/workspace +``` + +2. Définissez `LLM_MODEL` sur le modèle que vous voulez utiliser : + +```bash +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" +``` + +3. Définissez `LLM_API_KEY` sur votre clé API : + +```bash +LLM_API_KEY="sk_test_12345" +``` + +4. 
Exécutez la commande Docker suivante : + +```bash +docker run -it \ + --pull=always \ + -e SANDBOX_USER_ID=$(id -u) \ + -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ + -e LLM_API_KEY=$LLM_API_KEY \ + -e LLM_MODEL=$LLM_MODEL \ + -v $WORKSPACE_BASE:/opt/workspace_base \ + -v /var/run/docker.sock:/var/run/docker.sock \ + --add-host host.docker.internal:host-gateway \ + --name openhands-app-$(date +%Y%m%d%H%M%S) \ + ghcr.io/all-hands-ai/openhands:0.11 \ + python -m openhands.core.main -t "write a bash script that prints hi" +``` diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md index 4609b3aeebc..feef1b34bd0 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md @@ -14,4 +14,96 @@ OpenHands 可以在交互式命令行模式下运行,允许用户通过命令行 2. 运行以下命令: +```bash +poetry run python -m openhands.core.cli +``` +该命令将启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。 + +你需要确保通过环境变量[或 `config.toml` 文件](https://github.com/All-Hands-AI/OpenHands/blob/main/config.template.toml)设置你的模型、API 密钥和其他设置。 + + +## 使用 Docker + +要在 Docker 中以命令行模式运行 OpenHands,请按照以下步骤操作: + +1. 将 `WORKSPACE_BASE` 设置为你希望 OpenHands 编辑的目录: + +```bash +WORKSPACE_BASE=$(pwd)/workspace +``` + +2. 将 `LLM_MODEL` 设置为你要使用的模型: + +```bash +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" +``` + +3. 将 `LLM_API_KEY` 设置为你的 API 密钥: + +```bash +LLM_API_KEY="sk_test_12345" +``` + +4. 运行以下 Docker 命令: + +```bash +docker run -it \ + --pull=always \ + -e SANDBOX_USER_ID=$(id -u) \ + -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \ + -e LLM_API_KEY=$LLM_API_KEY \ + -e LLM_MODEL=$LLM_MODEL \ + -v $WORKSPACE_BASE:/opt/workspace_base \ + -v /var/run/docker.sock:/var/run/docker.sock \ + --add-host host.docker.internal:host-gateway \ + --name openhands-app-$(date +%Y%m%d%H%M%S) \ + ghcr.io/all-hands-ai/openhands:0.11 \ + python -m openhands.core.cli +``` + +该命令将在 Docker 中启动一个交互式会话,你可以在其中输入任务并接收来自 OpenHands 的响应。 + +## 命令行命令和预期输出示例 + +以下是一些命令行命令及其预期输出的示例: + +### 示例 1: 简单任务 + +```bash +How can I help? >> Write a Python script that prints "Hello, World!" +``` + +预期输出: + +```bash +🤖 Sure! Here is a Python script that prints "Hello, World!": + +❯ print("Hello, World!") +``` + +### 示例 2: Bash 命令 + +```bash +How can I help? >> Create a directory named "test_dir" +``` + +预期输出: + +```bash +🤖 Creating a directory named "test_dir": + +❯ mkdir test_dir +``` + +### 示例 3: 错误处理 + +```bash +How can I help? >> Delete a non-existent file +``` + +预期输出: + +```bash +🤖 An error occurred. Please try again. 
+``` diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index 2ce588da11e..f46a42c8eab 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -9,4 +9,270 @@ 以下是一个示例配置文件,您可以使用它来定义和使用多个 LLM: +```toml +[llm] +# 重要:在此处添加您的 API 密钥,并将模型设置为您要评估的模型 +model = "claude-3-5-sonnet-20240620" +api_key = "sk-XXX" +[llm.eval_gpt4_1106_preview_llm] +model = "gpt-4-1106-preview" +api_key = "XXX" +temperature = 0.0 + +[llm.eval_some_openai_compatible_model_llm] +model = "openai/MODEL_NAME" +base_url = "https://OPENAI_COMPATIBLE_URL/v1" +api_key = "XXX" +temperature = 0.0 +``` + + +## 如何在命令行中使用 OpenHands + +可以使用以下格式从命令行运行 OpenHands: + +```bash +poetry run python ./openhands/core/main.py \ + -i \ + -t "" \ + -c \ + -l +``` + +例如: + +```bash +poetry run python ./openhands/core/main.py \ + -i 10 \ + -t "Write me a bash script that prints hello world." \ + -c CodeActAgent \ + -l llm +``` + +此命令使用以下参数运行 OpenHands: +- 最大迭代次数为 10 +- 指定的任务描述 +- 使用 CodeActAgent +- 使用 `config.toml` 文件的 `llm` 部分中定义的 LLM 配置 + +## OpenHands 如何工作 + +OpenHands 的主要入口点在 `openhands/core/main.py` 中。以下是它工作原理的简化流程: + +1. 解析命令行参数并加载配置 +2. 使用 `create_runtime()` 创建运行时环境 +3. 初始化指定的代理 +4. 使用 `run_controller()` 运行控制器,它: + - 将运行时附加到代理 + - 执行代理的任务 + - 完成后返回最终状态 + +`run_controller()` 函数是 OpenHands 执行的核心。它管理代理、运行时和任务之间的交互,处理用户输入模拟和事件处理等事项。 + + +## 入门最简单的方法:探索现有基准 + +我们鼓励您查看我们仓库的 [`evaluation/` 目录](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation)中提供的各种评估基准。 + +要集成您自己的基准,我们建议从最接近您需求的基准开始。这种方法可以显著简化您的集成过程,允许您在现有结构的基础上进行构建并使其适应您的特定要求。 + +## 如何创建评估工作流 + + +要为您的基准创建评估工作流,请按照以下步骤操作: + +1. 导入相关的 OpenHands 实用程序: + ```python + import openhands.agenthub + from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + ) + from openhands.controller.state.state import State + from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + parse_arguments, + ) + from openhands.core.logger import openhands_logger as logger + from openhands.core.main import create_runtime, run_controller + from openhands.events.action import CmdRunAction + from openhands.events.observation import CmdOutputObservation, ErrorObservation + from openhands.runtime.runtime import Runtime + ``` + +2. 创建配置: + ```python + def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + runtime='eventstream', + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='your_container_image', + enable_auto_lint=True, + timeout=300, + ), + ) + config.set_llm_config(metadata.llm_config) + return config + ``` + +3. 初始化运行时并设置评估环境: + ```python + def initialize_runtime(runtime: Runtime, instance: pd.Series): + # 在此处设置您的评估环境 + # 例如,设置环境变量、准备文件等 + pass + ``` + +4. 
创建一个函数来处理每个实例: + ```python + from openhands.utils.async_utils import call_async_from_sync + def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: + config = get_config(instance, metadata) + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance) + + instruction = get_instruction(instance, metadata) + + state = run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=your_user_response_function, + ) + + # 评估代理的操作 + evaluation_result = await evaluate_agent_actions(runtime, instance) + + return EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + test_result=evaluation_result, + metadata=metadata, + history=state.history.compatibility_for_eval_history_pairs(), + metrics=state.metrics.get() if state.metrics else None, + error=state.last_error if state and state.last_error else None, + ) + ``` + +5. 运行评估: + ```python + metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + instances = prepare_dataset(your_dataset, output_file, eval_n_limit) + + await run_evaluation( + instances, + metadata, + output_file, + num_workers, + process_instance + ) + ``` + +此工作流设置配置,初始化运行时环境,通过运行代理并评估其操作来处理每个实例,然后将结果收集到 `EvalOutput` 对象中。`run_evaluation` 函数处理并行化和进度跟踪。 + +请记住根据您特定的基准要求自定义 `get_instruction`、`your_user_response_function` 和 `evaluate_agent_actions` 函数。 + +通过遵循此结构,您可以在 OpenHands 框架内为您的基准创建强大的评估工作流。 + + +## 理解 `user_response_fn` + +`user_response_fn` 是 OpenHands 评估工作流中的关键组件。它模拟用户与代理的交互,允许在评估过程中自动响应。当您想要为代理的查询或操作提供一致的、预定义的响应时,此函数特别有用。 + + +### 工作流和交互 + +处理操作和 `user_response_fn` 的正确工作流如下: + +1. 代理接收任务并开始处理 +2. 代理发出操作 +3. 如果操作可执行(例如 CmdRunAction、IPythonRunCellAction): + - 运行时处理操作 + - 运行时返回观察结果 +4. 如果操作不可执行(通常是 MessageAction): + - 调用 `user_response_fn` + - 它返回模拟的用户响应 +5. 代理接收观察结果或模拟响应 +6. 重复步骤 2-5,直到任务完成或达到最大迭代次数 + +以下是更准确的可视化表示: + +``` + [代理] + | + v + [发出操作] + | + v + [操作是否可执行?] + / \ + 是 否 + | | + v v + [运行时] [user_response_fn] + | | + v v + [返回观察结果] [模拟响应] + \ / + \ / + v v + [代理接收反馈] + | + v + [继续或完成任务] +``` + +在此工作流中: + +- 可执行的操作(如运行命令或执行代码)由运行时直接处理 +- 不可执行的操作(通常是当代理想要通信或寻求澄清时)由 `user_response_fn` 处理 +- 然后,代理处理反馈,无论是来自运行时的观察结果还是来自 `user_response_fn` 的模拟响应 + +这种方法允许自动处理具体操作和模拟用户交互,使其适用于您想要测试代理在最少人工干预的情况下完成任务的能力的评估场景。 + +### 示例实现 + +以下是 SWE-Bench 评估中使用的 `user_response_fn` 示例: + +```python +def codeact_user_response(state: State | None) -> str: + msg = ( + 'Please continue working on the task on whatever approach you think is suitable.\n' + 'If you think you have solved the task, please first send your answer to user through message and then exit .\n' + 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n' + ) + + if state and state.history: + # 检查代理是否已尝试与用户对话 3 次,如果是,让代理知道它可以放弃 + user_msgs = [ + event + for event in state.history.get_events() + if isinstance(event, MessageAction) and event.source == 'user' + ] + if len(user_msgs) >= 2: + # 让代理知道它在尝试 3 次后可以放弃 + return ( + msg + + 'If you want to give up, run: exit .\n' + ) + return msg +``` + +此函数执行以下操作: + +1. 提供一条标准消息,鼓励代理继续工作 +2. 检查代理尝试与用户通信的次数 +3. 
如果代理已多次尝试,它会提供放弃的选项 + +通过使用此函数,您可以确保在多次评估运行中保持一致的行为,并防止代理在等待人工输入时陷入困境。 diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md index 04ce1a45d71..622f7e5607b 100644 --- a/docs/modules/usage/how-to/evaluation-harness.md +++ b/docs/modules/usage/how-to/evaluation-harness.md @@ -9,9 +9,6 @@ OpenHands in development mode uses `config.toml` to keep track of most configura Here's an example configuration file you can use to define and use multiple LLMs: - - - ```toml [llm] # IMPORTANT: add your API key here, and set the model to the one you want to evaluate @@ -30,3 +27,252 @@ api_key = "XXX" temperature = 0.0 ``` + +## How to use OpenHands in the command line + +OpenHands can be run from the command line using the following format: + +```bash +poetry run python ./openhands/core/main.py \ + -i \ + -t "" \ + -c \ + -l +``` + +For example: + +```bash +poetry run python ./openhands/core/main.py \ + -i 10 \ + -t "Write me a bash script that prints hello world." \ + -c CodeActAgent \ + -l llm +``` + +This command runs OpenHands with: +- A maximum of 10 iterations +- The specified task description +- Using the CodeActAgent +- With the LLM configuration defined in the `llm` section of your `config.toml` file + +## How does OpenHands work + +The main entry point for OpenHands is in `openhands/core/main.py`. Here's a simplified flow of how it works: + +1. Parse command-line arguments and load the configuration +2. Create a runtime environment using `create_runtime()` +3. Initialize the specified agent +4. Run the controller using `run_controller()`, which: + - Attaches the runtime to the agent + - Executes the agent's task + - Returns a final state when complete + +The `run_controller()` function is the core of OpenHands's execution. It manages the interaction between the agent, the runtime, and the task, handling things like user input simulation and event processing. + + +## Easiest way to get started: Exploring Existing Benchmarks + +We encourage you to review the various evaluation benchmarks available in the [`evaluation/` directory](https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation) of our repository. + +To integrate your own benchmark, we suggest starting with the one that most closely resembles your needs. This approach can significantly streamline your integration process, allowing you to build upon existing structures and adapt them to your specific requirements. + +## How to create an evaluation workflow + + +To create an evaluation workflow for your benchmark, follow these steps: + +1. Import relevant OpenHands utilities: + ```python + import openhands.agenthub + from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, + ) + from openhands.controller.state.state import State + from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + parse_arguments, + ) + from openhands.core.logger import openhands_logger as logger + from openhands.core.main import create_runtime, run_controller + from openhands.events.action import CmdRunAction + from openhands.events.observation import CmdOutputObservation, ErrorObservation + from openhands.runtime.runtime import Runtime + ``` + +2. 
Create a configuration: + ```python + def get_config(instance: pd.Series, metadata: EvalMetadata) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + runtime='eventstream', + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='your_container_image', + enable_auto_lint=True, + timeout=300, + ), + ) + config.set_llm_config(metadata.llm_config) + return config + ``` + +3. Initialize the runtime and set up the evaluation environment: + ```python + def initialize_runtime(runtime: Runtime, instance: pd.Series): + # Set up your evaluation environment here + # For example, setting environment variables, preparing files, etc. + pass + ``` + +4. Create a function to process each instance: + ```python + from openhands.utils.async_utils import call_async_from_sync + def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput: + config = get_config(instance, metadata) + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance) + + instruction = get_instruction(instance, metadata) + + state = run_controller( + config=config, + task_str=instruction, + runtime=runtime, + fake_user_response_fn=your_user_response_function, + ) + + # Evaluate the agent's actions + evaluation_result = await evaluate_agent_actions(runtime, instance) + + return EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + test_result=evaluation_result, + metadata=metadata, + history=state.history.compatibility_for_eval_history_pairs(), + metrics=state.metrics.get() if state.metrics else None, + error=state.last_error if state and state.last_error else None, + ) + ``` + +5. Run the evaluation: + ```python + metadata = make_metadata(llm_config, dataset_name, agent_class, max_iterations, eval_note, eval_output_dir) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + instances = prepare_dataset(your_dataset, output_file, eval_n_limit) + + await run_evaluation( + instances, + metadata, + output_file, + num_workers, + process_instance + ) + ``` + +This workflow sets up the configuration, initializes the runtime environment, processes each instance by running the agent and evaluating its actions, and then collects the results into an `EvalOutput` object. The `run_evaluation` function handles parallelization and progress tracking. + +Remember to customize the `get_instruction`, `your_user_response_function`, and `evaluate_agent_actions` functions according to your specific benchmark requirements. + +By following this structure, you can create a robust evaluation workflow for your benchmark within the OpenHands framework. + + +## Understanding the `user_response_fn` + +The `user_response_fn` is a crucial component in OpenHands's evaluation workflow. It simulates user interaction with the agent, allowing for automated responses during the evaluation process. This function is particularly useful when you want to provide consistent, predefined responses to the agent's queries or actions. + + +### Workflow and Interaction + +The correct workflow for handling actions and the `user_response_fn` is as follows: + +1. Agent receives a task and starts processing +2. Agent emits an Action +3. If the Action is executable (e.g., CmdRunAction, IPythonRunCellAction): + - The Runtime processes the Action + - Runtime returns an Observation +4. If the Action is not executable (typically a MessageAction): + - The `user_response_fn` is called + - It returns a simulated user response +5. 
The agent receives either the Observation or the simulated response +6. Steps 2-5 repeat until the task is completed or max iterations are reached + +Here's a more accurate visual representation: + +``` + [Agent] + | + v + [Emit Action] + | + v + [Is Action Executable?] + / \ + Yes No + | | + v v + [Runtime] [user_response_fn] + | | + v v + [Return Observation] [Simulated Response] + \ / + \ / + v v + [Agent receives feedback] + | + v + [Continue or Complete Task] +``` + +In this workflow: + +- Executable actions (like running commands or executing code) are handled directly by the Runtime +- Non-executable actions (typically when the agent wants to communicate or ask for clarification) are handled by the `user_response_fn` +- The agent then processes the feedback, whether it's an Observation from the Runtime or a simulated response from the `user_response_fn` + +This approach allows for automated handling of both concrete actions and simulated user interactions, making it suitable for evaluation scenarios where you want to test the agent's ability to complete tasks with minimal human intervention. + +### Example Implementation + +Here's an example of a `user_response_fn` used in the SWE-Bench evaluation: + +```python +def codeact_user_response(state: State | None) -> str: + msg = ( + 'Please continue working on the task on whatever approach you think is suitable.\n' + 'If you think you have solved the task, please first send your answer to user through message and then exit .\n' + 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n' + ) + + if state and state.history: + # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up + user_msgs = [ + event + for event in state.history.get_events() + if isinstance(event, MessageAction) and event.source == 'user' + ] + if len(user_msgs) >= 2: + # let the agent know that it can give up when it has tried 3 times + return ( + msg + + 'If you want to give up, run: exit .\n' + ) + return msg +``` + +This function does the following: + +1. Provides a standard message encouraging the agent to continue working +2. Checks how many times the agent has attempted to communicate with the user +3. If the agent has made multiple attempts, it provides an option to give up + +By using this function, you can ensure consistent behavior across multiple evaluation runs and prevent the agent from getting stuck waiting for human input. From 330703d29c6844fa75d0262511b980d99672a5ea Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Wed, 30 Oct 2024 11:54:50 -0400 Subject: [PATCH 3/7] Remaining cleanup --- .../current/usage/how-to/cli-mode.md | 2 +- .../current/usage/how-to/evaluation-harness.md | 2 +- frontend/src/utils/verified-models.ts | 3 +-- openhands/llm/llm.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md index 6c315957e2b..9f1b28c66c5 100644 --- a/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md +++ b/docs/i18n/fr/docusaurus-plugin-content-docs/current/usage/how-to/cli-mode.md @@ -37,7 +37,7 @@ WORKSPACE_BASE=$(pwd)/workspace 2. Définissez `LLM_MODEL` sur le modèle que vous souhaitez utiliser : ```bash -LLM_MODEL="anthropic/claude-3-5-sonnet-20240620" +LLM_MODEL="anthropic/claude-3-5-sonnet-20241022" ``` 3. 
Définissez `LLM_API_KEY` sur votre clé API : diff --git a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md index f46a42c8eab..a50bb18502e 100644 --- a/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md +++ b/docs/i18n/zh-Hans/docusaurus-plugin-content-docs/current/usage/how-to/evaluation-harness.md @@ -12,7 +12,7 @@ ```toml [llm] # 重要:在此处添加您的 API 密钥,并将模型设置为您要评估的模型 -model = "claude-3-5-sonnet-20240620" +model = "claude-3-5-sonnet-20241022" api_key = "sk-XXX" [llm.eval_gpt4_1106_preview_llm] diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts index f0493f0d133..614e78c8f4c 100644 --- a/frontend/src/utils/verified-models.ts +++ b/frontend/src/utils/verified-models.ts @@ -15,7 +15,7 @@ export const VERIFIED_OPENAI_MODELS = [ ]; // LiteLLM does not return the compatible Anthropic models with the provider, so we list them here to set them ourselves -// (e.g., they return `claude-3-5-sonnet-20240620` instead of `anthropic/claude-3-5-sonnet-20240620`) +// (e.g., they return `claude-3-5-sonnet-20241022` instead of `anthropic/claude-3-5-sonnet-20241022`) export const VERIFIED_ANTHROPIC_MODELS = [ "claude-2", "claude-2.1", @@ -27,4 +27,3 @@ export const VERIFIED_ANTHROPIC_MODELS = [ "claude-instant-1", "claude-instant-1.2", ]; - diff --git a/openhands/llm/llm.py b/openhands/llm/llm.py index 66d34037484..26e910d7e0d 100644 --- a/openhands/llm/llm.py +++ b/openhands/llm/llm.py @@ -47,8 +47,8 @@ # cache prompt supporting models # remove this when we gemini and deepseek are supported CACHE_PROMPT_SUPPORTED_MODELS = [ - 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20241022', + 'claude-3-5-sonnet-20240620', 'claude-3-haiku-20240307', 'claude-3-opus-20240229', ] From 83dee71d61bc99930711859c14e34a35bf6fe7bf Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Wed, 30 Oct 2024 11:57:53 -0400 Subject: [PATCH 4/7] Fix linting --- tests/unit/test_prompt_caching.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_prompt_caching.py b/tests/unit/test_prompt_caching.py index 10063fc52d8..7adbd9119dd 100644 --- a/tests/unit/test_prompt_caching.py +++ b/tests/unit/test_prompt_caching.py @@ -259,4 +259,3 @@ def check_headers(**kwargs): # Assert assert isinstance(result, MessageAction) assert result.content == 'Hello! How can I assist you today?' 
- From 717a57b81db585dccdf3db434749491775959c9c Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Wed, 30 Oct 2024 12:00:07 -0400 Subject: [PATCH 5/7] Update frontend/__tests__/utils/organizeModelsAndProviders.test.ts Co-authored-by: Xingyao Wang --- frontend/__tests__/utils/organizeModelsAndProviders.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts index a38af64f663..1062309dbf6 100644 --- a/frontend/__tests__/utils/organizeModelsAndProviders.test.ts +++ b/frontend/__tests__/utils/organizeModelsAndProviders.test.ts @@ -4,7 +4,7 @@ import { organizeModelsAndProviders } from "../../src/utils/organizeModelsAndPro test("organizeModelsAndProviders", () => { const models = [ "azure/ada", - "azure/gpt-35-turbo", + "azure/gpt-35-turbo", "azure/gpt-3-turbo", "azure/standard/1024-x-1024/dall-e-2", "vertex_ai_beta/chat-bison", From 72b8de32b505a7acd7d1a261617b64a4647b835e Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 30 Oct 2024 17:18:38 +0100 Subject: [PATCH 6/7] Update frontend/src/utils/verified-models.ts Co-authored-by: Xingyao Wang --- frontend/src/utils/verified-models.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts index 614e78c8f4c..03272800cf0 100644 --- a/frontend/src/utils/verified-models.ts +++ b/frontend/src/utils/verified-models.ts @@ -1,6 +1,6 @@ // Here are the list of verified models and providers that we know work well with OpenHands. export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic"]; -export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20241022"]; +export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022"]; // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`) From 05b27daaf38e290c0747233ac48235ac3e6392fd Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Wed, 30 Oct 2024 17:55:03 +0100 Subject: [PATCH 7/7] fix lint --- frontend/src/utils/verified-models.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/frontend/src/utils/verified-models.ts b/frontend/src/utils/verified-models.ts index 03272800cf0..4733fa9bcdf 100644 --- a/frontend/src/utils/verified-models.ts +++ b/frontend/src/utils/verified-models.ts @@ -1,6 +1,10 @@ // Here are the list of verified models and providers that we know work well with OpenHands. export const VERIFIED_PROVIDERS = ["openai", "azure", "anthropic"]; -export const VERIFIED_MODELS = ["gpt-4o", "claude-3-5-sonnet-20240620", "claude-3-5-sonnet-20241022"]; +export const VERIFIED_MODELS = [ + "gpt-4o", + "claude-3-5-sonnet-20240620", + "claude-3-5-sonnet-20241022", +]; // LiteLLM does not return OpenAI models with the provider, so we list them here to set them ourselves for consistency // (e.g., they return `gpt-4o` instead of `openai/gpt-4o`)