diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml index 6906538a6a8..25d05b9a0ca 100644 --- a/.github/workflows/ghcr-build.yml +++ b/.github/workflows/ghcr-build.yml @@ -401,7 +401,7 @@ jobs: exit 1 update_pr_description: name: Update PR Description - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork needs: [ghcr_build_runtime] runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index a4bc03c4eeb..0cc7d149d78 100644 --- a/.gitignore +++ b/.gitignore @@ -174,6 +174,7 @@ evaluation/bird/data evaluation/gaia/data evaluation/gorilla/data evaluation/toolqa/data +evaluation/scienceagentbench/benchmark # frontend diff --git a/evaluation/scienceagentbench/Dockerfile b/evaluation/scienceagentbench/Dockerfile new file mode 100644 index 00000000000..70ed92cc4dc --- /dev/null +++ b/evaluation/scienceagentbench/Dockerfile @@ -0,0 +1,17 @@ +FROM python:3.11-bookworm + + +# For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`. +# **Please DO NOT redistribute the unzipped data files online.** +# It will download a benchmark.zip file to the current directory. +# unzip it and put the benchmark folder under evaluation/scienceagentbench/ + +RUN mkdir -p /benchmark +COPY benchmark /benchmark + +RUN mkdir -p /workspace +WORKDIR /workspace + +# pushd evaluation/scienceagentbench +# docker build -t xingyaoww/openhands-eval-scienceagentbench . +# popd diff --git a/evaluation/scienceagentbench/Dockerfile.evaluator b/evaluation/scienceagentbench/Dockerfile.evaluator new file mode 100644 index 00000000000..f8263e1bb0a --- /dev/null +++ b/evaluation/scienceagentbench/Dockerfile.evaluator @@ -0,0 +1,25 @@ +FROM mambaorg/micromamba:debian12 + +USER root +# For https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#code-generation-with-agents + +RUN micromamba create -n sci-agent-eval python=3.10 pip setuptools wheel +RUN micromamba run -n sci-agent-eval pip install pip-tools + +RUN mkdir -p /workspace +WORKDIR /workspace + +RUN apt-get update && apt-get install -y git + +RUN git clone https://github.com/OSU-NLP-Group/ScienceAgentBench.git /workspace/ +RUN git checkout 4eddc7db6449a5ade3e37285747c8b208cd54ce7 + +RUN micromamba create -n sci-agent python=3.10 pip setuptools wheel +RUN micromamba run -n sci-agent pip install -r requirements.txt + +# Replace all occurence of conda with micromamba under the /workspace +RUN find ./ -type f -exec sed -i 's/conda/micromamba/g' {} \; + +# pushd evaluation/scienceagentbench +# docker build -t xingyaoww/openhands-eval-scienceagentbench-evaluator -f Dockerfile.evaluator . +# popd diff --git a/evaluation/scienceagentbench/README.md b/evaluation/scienceagentbench/README.md new file mode 100644 index 00000000000..3182c2e117b --- /dev/null +++ b/evaluation/scienceagentbench/README.md @@ -0,0 +1,54 @@ +# ScienceAgentBench Evaluation with OpenHands + +This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080). + +## Setup Environment and LLM Configuration + +Please follow instruction [here](../README.md#setup) to setup your local development environment and LLM. 
+
+## Setup ScienceAgentBench
+
+To prevent benchmark data contamination, we only provide the annotation sheet on [Huggingface](https://huggingface.co/datasets/osunlp/ScienceAgentBench), which includes all necessary *inputs* for running an agent.
+
+## Run Inference on ScienceAgentBench
+
+```bash
+./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [num_workers]
+
+# Example
+./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3
+```
+
+where `model_config` is mandatory, and the rest are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `use_knowledge`, e.g. `true`, specifies whether the agent is allowed to use expert-provided knowledge as additional input. By default, it is set to `false`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent to evaluate, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates the entire ScienceAgentBench validation set. Note:
+in order to use `eval_limit`, you must also set `agent`.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+The maximum number of agent iterations is fixed at 30 by the wrapper script (`--max-iterations 30` in `scripts/run_infer.sh`); edit the script if you need a different value.
+
+## Evaluate Generated Programs
+
+### Extract Necessary Information from the OpenHands Log
+
+After inference is completed, use the following command to extract the information needed for evaluation from the output log:
+
+```bash
+python evaluation/scienceagentbench/post_proc.py [log_fname]
+```
+
+- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.
+
+The converted output will be written to, e.g., `evaluation/.../output.converted.jsonl`.
+
+### Run Evaluation
+
+Please follow the steps [here](https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#evaluation-of-generated-code) to evaluate the generated programs.
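+
+Before running the official evaluator, you may want to sanity-check the converted log. The snippet below is an illustrative sketch only (the path is an example placeholder; the field names follow what `post_proc.py` writes):
+
+```python
+import json
+
+# Example path -- substitute the actual output directory of your run.
+converted_path = 'evaluation/.../output.converted.jsonl'
+
+with open(converted_path, encoding='utf-8') as f:
+    records = [json.loads(line) for line in f]
+
+# `cost` is the accumulated LLM cost and `test_result['program']` is the extracted
+# program text ('ERROR' if no program was saved), as written by post_proc.py.
+total_cost = sum(r['cost'] for r in records)
+missing = sum(1 for r in records if r['test_result'].get('program') == 'ERROR')
+print(f'{len(records)} instances, total cost ${total_cost:.2f}, {missing} missing programs')
+```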
diff --git a/evaluation/scienceagentbench/post_proc.py b/evaluation/scienceagentbench/post_proc.py new file mode 100644 index 00000000000..46cfbe2b2a7 --- /dev/null +++ b/evaluation/scienceagentbench/post_proc.py @@ -0,0 +1,30 @@ +import json +from argparse import ArgumentParser + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument( + 'log_fname', + type=str, + ) + args = parser.parse_args() + + fname = args.log_fname + out_fname = args.log_fname.replace('.jsonl', '.converted.jsonl') + + log = [json.loads(line) for line in open(fname)] + + simple_log = [ + json.dumps( + { + 'instance_id': ex['instance_id'], + 'instruction': ex['instruction'], + 'test_result': ex['test_result'], + 'cost': ex['metrics']['accumulated_cost'], + } + ) + for ex in log + ] + + with open(out_fname, 'w+', encoding='utf-8') as f: + f.write('\n'.join(simple_log)) diff --git a/evaluation/scienceagentbench/run_infer.py b/evaluation/scienceagentbench/run_infer.py new file mode 100644 index 00000000000..52e42f29f12 --- /dev/null +++ b/evaluation/scienceagentbench/run_infer.py @@ -0,0 +1,292 @@ +import asyncio +import os +from typing import Any + +import pandas as pd +from datasets import load_dataset +from tqdm import tqdm + +from evaluation.utils.shared import ( + EvalMetadata, + EvalOutput, + codeact_user_response, + make_metadata, + prepare_dataset, + reset_logger_for_multiprocessing, + run_evaluation, +) +from openhands.controller.state.state import State +from openhands.core.config import ( + AppConfig, + SandboxConfig, + get_llm_config_arg, + get_parser, +) +from openhands.core.logger import openhands_logger as logger +from openhands.core.main import create_runtime, run_controller +from openhands.events.action import CmdRunAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.runtime.base import Runtime +from openhands.utils.async_utils import call_async_from_sync + +AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = { + 'CodeActAgent': codeact_user_response, +} + +LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'benchmark') + + +def format_task_dict(example, use_knowledge): + task = { + 'instance_id': example['instance_id'], + 'task_inst': example['task_inst'], + 'dataset_path': '/benchmark/datasets/' + + example['dataset_folder_tree'].split('\n')[0][4:], + 'dataset_folder_tree': example['dataset_folder_tree'], + 'dataset_preview': example['dataset_preview'], + 'pred_program_name': 'pred_' + example['gold_program_name'], + } + + if use_knowledge: + task['task_inst'] += '\n' + str(example['domain_knowledge']) + + return task + + +def get_config( + metadata: EvalMetadata, + instance_id: str, +) -> AppConfig: + config = AppConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + runtime=os.environ.get('RUNTIME', 'eventstream'), + max_budget_per_task=4, + max_iterations=metadata.max_iterations, + sandbox=SandboxConfig( + base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench', + enable_auto_lint=True, + use_host_network=False, + timeout=300, + api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), + keep_remote_runtime_alive=False, + ), + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + config.set_llm_config(metadata.llm_config) + if metadata.llm_config.log_completions: + metadata.llm_config.log_completions_folder = os.path.join( + metadata.eval_output_dir, 'llm_completions', instance_id + ) + logger.info( + 
f'Logging LLM completions for instance {instance_id} to ' + f'{metadata.llm_config.log_completions_folder}' + ) + return config + + +def initialize_runtime( + runtime: Runtime, + instance: pd.Series, # this argument is not required +): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}") + obs: CmdOutputObservation + + # Set up workspace directories + action = CmdRunAction(command='mkdir -p /workspace/pred_programs') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + action = CmdRunAction(command='mkdir -p /workspace/pred_results') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + assert obs.exit_code == 0 + + dataset_name = instance['dataset_folder_tree'].split('\n')[0][4:].rstrip('/') + + # Copy the dataset to the workspace + dataset_dir = os.path.join( + LOCAL_DATASET_PATH, + 'datasets', + dataset_name, + ) + runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True) + + # Check the dataset exists + action = CmdRunAction( + command='cd /workspace/benchmark/datasets && ls', + keep_prompt=False, + ) + obs = runtime.run_action(action) + logger.info(obs, extra={'msg_type': 'OBSERVATION'}) + assert obs.exit_code == 0 + assert dataset_name in obs.content + + logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}") + + +def complete_runtime( + runtime: Runtime, + instance: pd.Series, +) -> dict[str, Any]: + """Complete the runtime for the agent. + + This function is called before the runtime is used to run the agent. + If you need to do something in the sandbox to get the correctness metric after + the agent has run, modify this function. + """ + logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}") + obs: CmdOutputObservation + + test_result = {} + + action = CmdRunAction(command='cd /workspace') + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + assert obs.exit_code == 0 + + action = CmdRunAction( + command=f'cat pred_programs/{instance.pred_program_name}', + keep_prompt=False, + ) + logger.info(action, extra={'msg_type': 'ACTION'}) + obs = runtime.run_action(action) + + if obs.exit_code == 0: + test_result = {'program': obs.content} + else: + test_result = {'program': 'ERROR'} + + logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}") + return test_result + + +def process_instance( + instance: pd.Series, + metadata: EvalMetadata, + reset_logger: bool = True, +) -> EvalOutput: + instance_id = instance.instance_id.replace('/', '__') + config = get_config(metadata, instance_id) + + # Set up the logger properly, so you can run multi-processing to parallelize the evaluation + if reset_logger: + log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs') + reset_logger_for_multiprocessing(logger, instance_id, log_dir) + else: + logger.info(f'Starting evaluation for instance {instance_id}.') + + instruction = f"""You are an expert Python programming assistant that helps scientist users to write high-quality code to solve their tasks. +Given a user request, you are expected to write a complete program that accomplishes the requested task and save any outputs to `/workspace/pred_results/` in the correct format. + +Here's the user request you need to work on: +{instance.task_inst} + +You can access the dataset at `{instance.dataset_path}`. 
Here is the directory structure of the dataset: +``` +{instance.dataset_folder_tree} +``` +Here are some helpful previews for the dataset file(s): +{instance.dataset_preview} + +Please save your program as `/workspace/pred_programs/{instance.pred_program_name}`. +Then, please run the program to check and fix any errors. +Please do NOT run the program in the background. +If the program uses some packages that are incompatible, please figure out alternative implementations and do NOT restart the environment. + +""" + + runtime = create_runtime(config) + call_async_from_sync(runtime.connect) + initialize_runtime(runtime, instance) + + # Here's how you can run the agent (similar to the `main` function) and get the final task state + state: State | None = asyncio.run( + run_controller( + config=config, + initial_user_action=MessageAction(content=instruction), + runtime=runtime, + fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get( + metadata.agent_class + ), + ) + ) + + # ======= Attempt to evaluate the agent's edits ======= + test_result = complete_runtime(runtime, instance) + + # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) + # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. + if state is None: + raise ValueError('State should not be None.') + metrics = state.metrics.get() if state.metrics else None + + # history is now available as a stream of events, rather than list of pairs of (Action, Observation) + # for compatibility with the existing output format, we can remake the pairs here + # remove when it becomes unnecessary + histories = state.history.compatibility_for_eval_history_pairs() + + # Save the output + output = EvalOutput( + instance_id=instance.instance_id, + instruction=instruction, + metadata=metadata, + history=histories, + metrics=metrics, + error=state.last_error if state and state.last_error else None, + test_result=test_result, + ) + return output + + +if __name__ == '__main__': + parser = get_parser() + parser.add_argument( + '--use_knowledge', + type=str, + default='false', + choices=['true', 'false'], + help='use expert-provided knowledge or not', + ) + args, _ = parser.parse_known_args() + + sab_dataset = load_dataset('osunlp/ScienceAgentBench', split='validation') + + dataset_processed = [] + for example in tqdm(sab_dataset): + dataset_processed.append( + format_task_dict(example, args.use_knowledge == 'true') + ) + + dataset = pd.DataFrame(dataset_processed) + + llm_config = None + if args.llm_config: + llm_config = get_llm_config_arg(args.llm_config) + if llm_config is None: + raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}') + + metadata = make_metadata( + llm_config, + 'ScienceAgentBench', + args.agent_cls, + args.max_iterations, + args.eval_note, + args.eval_output_dir, + ) + output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl') + dataset['instance_id'] = dataset['instance_id'].apply(str) + instances = prepare_dataset(dataset, output_file, args.eval_n_limit) + + run_evaluation( + instances, metadata, output_file, args.eval_num_workers, process_instance + ) diff --git a/evaluation/scienceagentbench/scripts/run_infer.sh b/evaluation/scienceagentbench/scripts/run_infer.sh new file mode 100755 index 00000000000..7667e572378 --- /dev/null +++ b/evaluation/scienceagentbench/scripts/run_infer.sh @@ -0,0 +1,49 @@ +#!/bin/bash +set -eo pipefail + +source "evaluation/utils/version_control.sh" + 
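+# Positional arguments (only model_config is required); see evaluation/scienceagentbench/README.md:
+#   $1 model_config, $2 git-version, $3 use_knowledge, $4 agent, $5 eval_limit, $6 num_workers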
+MODEL_CONFIG=$1 +COMMIT_HASH=$2 +USE_KNOWLEDGE=$3 +AGENT=$4 +EVAL_LIMIT=$5 +NUM_WORKERS=$6 + +if [ -z "$NUM_WORKERS" ]; then + NUM_WORKERS=1 + echo "Number of workers not specified, use default $NUM_WORKERS" +fi +checkout_eval_branch + +if [ -z "$AGENT" ]; then + echo "Agent not specified, use default CodeActAgent" + AGENT="CodeActAgent" +fi + +if [ -z "$USE_KNOWLEDGE" ]; then + echo "Use knowledge not specified, use default False" + USE_KNOWLEDGE=false +fi + +get_agent_version + +echo "AGENT: $AGENT" +echo "AGENT_VERSION: $AGENT_VERSION" +echo "MODEL_CONFIG: $MODEL_CONFIG" + +COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \ + --agent-cls $AGENT \ + --llm-config $MODEL_CONFIG \ + --use_knowledge $USE_KNOWLEDGE \ + --max-iterations 30 \ + --eval-num-workers $NUM_WORKERS \ + --eval-note $AGENT_VERSION" \ + +if [ -n "$EVAL_LIMIT" ]; then + echo "EVAL_LIMIT: $EVAL_LIMIT" + COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT" +fi + +# Run the command +eval $COMMAND diff --git a/frontend/__tests__/components/feedback-form.test.tsx b/frontend/__tests__/components/feedback-form.test.tsx index b41fa7e5c77..28684401e2c 100644 --- a/frontend/__tests__/components/feedback-form.test.tsx +++ b/frontend/__tests__/components/feedback-form.test.tsx @@ -5,7 +5,6 @@ import { FeedbackForm } from "#/components/feedback-form"; describe("FeedbackForm", () => { const user = userEvent.setup(); - const onSubmitMock = vi.fn(); const onCloseMock = vi.fn(); afterEach(() => { @@ -13,7 +12,7 @@ describe("FeedbackForm", () => { }); it("should render correctly", () => { - render(); + render(); screen.getByLabelText("Email"); screen.getByLabelText("Private"); @@ -24,7 +23,7 @@ describe("FeedbackForm", () => { }); it("should switch between private and public permissions", async () => { - render(); + render(); const privateRadio = screen.getByLabelText("Private"); const publicRadio = screen.getByLabelText("Public"); @@ -40,69 +39,11 @@ describe("FeedbackForm", () => { expect(publicRadio).not.toBeChecked(); }); - it("should call onSubmit when the form is submitted", async () => { - render(); - const email = screen.getByLabelText("Email"); - - await user.type(email, "test@test.test"); - await user.click(screen.getByRole("button", { name: "Submit" })); - - expect(onSubmitMock).toHaveBeenCalledWith("private", "test@test.test"); // private is the default value - }); - - it("should not call onSubmit when the email is invalid", async () => { - render(); - const email = screen.getByLabelText("Email"); - const submitButton = screen.getByRole("button", { name: "Submit" }); - - await user.click(submitButton); - - expect(onSubmitMock).not.toHaveBeenCalled(); - - await user.type(email, "test"); - await user.click(submitButton); - - expect(onSubmitMock).not.toHaveBeenCalled(); - }); - - it("should submit public permissions when the public radio is checked", async () => { - render(); - const email = screen.getByLabelText("Email"); - const publicRadio = screen.getByLabelText("Public"); - - await user.type(email, "test@test.test"); - await user.click(publicRadio); - await user.click(screen.getByRole("button", { name: "Submit" })); - - expect(onSubmitMock).toHaveBeenCalledWith("public", "test@test.test"); - }); - it("should call onClose when the close button is clicked", async () => { - render(); + render(); await user.click(screen.getByRole("button", { name: "Cancel" })); - expect(onSubmitMock).not.toHaveBeenCalled(); expect(onCloseMock).toHaveBeenCalled(); }); - it("should disable the buttons if isSubmitting is true", () => 
{ - const { rerender } = render( - , - ); - const submitButton = screen.getByRole("button", { name: "Submit" }); - const cancelButton = screen.getByRole("button", { name: "Cancel" }); - - expect(submitButton).not.toBeDisabled(); - expect(cancelButton).not.toBeDisabled(); - - rerender( - , - ); - expect(submitButton).toBeDisabled(); - expect(cancelButton).toBeDisabled(); - }); }); diff --git a/frontend/src/api/open-hands.types.ts b/frontend/src/api/open-hands.types.ts index ba0e8642bc7..9da1a339b4d 100644 --- a/frontend/src/api/open-hands.types.ts +++ b/frontend/src/api/open-hands.types.ts @@ -31,7 +31,7 @@ export interface Feedback { version: string; email: string; token: string; - feedback: "positive" | "negative"; + polarity: "positive" | "negative"; permissions: "public" | "private"; trajectory: unknown[]; } diff --git a/frontend/src/components/chat-interface.tsx b/frontend/src/components/chat-interface.tsx index 10786c13923..25a57073698 100644 --- a/frontend/src/components/chat-interface.tsx +++ b/frontend/src/components/chat-interface.tsx @@ -1,6 +1,5 @@ import { useDispatch, useSelector } from "react-redux"; import React from "react"; -import { useFetcher } from "@remix-run/react"; import { useSocket } from "#/context/socket"; import { convertImageToBase64 } from "#/utils/convert-image-to-base-64"; import { ChatMessage } from "./chat-message"; @@ -13,10 +12,6 @@ import { RootState } from "#/store"; import AgentState from "#/types/AgentState"; import { generateAgentStateChangeEvent } from "#/services/agentStateService"; import { FeedbackModal } from "./feedback-modal"; -import { Feedback } from "#/api/open-hands.types"; -import { getToken } from "#/services/auth"; -import { removeApiKey, removeUnwantedKeys } from "#/utils/utils"; -import { clientAction } from "#/routes/submit-feedback"; import { useScrollToBottom } from "#/hooks/useScrollToBottom"; import TypingIndicator from "./chat/TypingIndicator"; import ConfirmationButtons from "./chat/ConfirmationButtons"; @@ -24,16 +19,13 @@ import { ErrorMessage } from "./error-message"; import { ContinueButton } from "./continue-button"; import { ScrollToBottomButton } from "./scroll-to-bottom-button"; -const FEEDBACK_VERSION = "1.0"; - const isErrorMessage = ( message: Message | ErrorMessage, ): message is ErrorMessage => "error" in message; export function ChatInterface() { - const { send, events } = useSocket(); + const { send } = useSocket(); const dispatch = useDispatch(); - const fetcher = useFetcher({ key: "feedback" }); const scrollRef = React.useRef(null); const { scrollDomToBottom, onChatBodyScroll, hitBottom } = useScrollToBottom(scrollRef); @@ -44,7 +36,6 @@ export function ChatInterface() { const [feedbackPolarity, setFeedbackPolarity] = React.useState< "positive" | "negative" >("positive"); - const [feedbackShared, setFeedbackShared] = React.useState(0); const [feedbackModalIsOpen, setFeedbackModalIsOpen] = React.useState(false); const handleSendMessage = async (content: string, files: File[]) => { @@ -71,30 +62,6 @@ export function ChatInterface() { setFeedbackPolarity(polarity); }; - const handleSubmitFeedback = ( - permissions: "private" | "public", - email: string, - ) => { - const feedback: Feedback = { - version: FEEDBACK_VERSION, - feedback: feedbackPolarity, - email, - permissions, - token: getToken(), - trajectory: removeApiKey(removeUnwantedKeys(events)), - }; - - const formData = new FormData(); - formData.append("feedback", JSON.stringify(feedback)); - - fetcher.submit(formData, { - action: "/submit-feedback", - 
method: "POST", - }); - - setFeedbackShared(messages.length); - }; - return (
- {feedbackShared !== messages.length && messages.length > 3 && ( - - onClickShareFeedbackActionButton("positive") - } - onNegativeFeedback={() => - onClickShareFeedbackActionButton("negative") - } - /> - )} + + onClickShareFeedbackActionButton("positive") + } + onNegativeFeedback={() => + onClickShareFeedbackActionButton("negative") + } + />
{messages.length > 2 && curAgentState === AgentState.AWAITING_USER_INPUT && ( @@ -163,9 +128,8 @@ export function ChatInterface() { setFeedbackModalIsOpen(false)} - onSubmit={handleSubmitFeedback} + polarity={feedbackPolarity} />
); diff --git a/frontend/src/components/feedback-form.tsx b/frontend/src/components/feedback-form.tsx index 7ff03a44307..4e1ddde6354 100644 --- a/frontend/src/components/feedback-form.tsx +++ b/frontend/src/components/feedback-form.tsx @@ -1,27 +1,87 @@ +import React from "react"; +import hotToast from "react-hot-toast"; import ModalButton from "./buttons/ModalButton"; +import { request } from "#/services/api"; +import { Feedback } from "#/api/open-hands.types"; + +const FEEDBACK_VERSION = "1.0"; +const VIEWER_PAGE = "https://www.all-hands.dev/share"; interface FeedbackFormProps { - onSubmit: (permissions: "private" | "public", email: string) => void; onClose: () => void; - isSubmitting?: boolean; + polarity: "positive" | "negative"; } -export function FeedbackForm({ - onSubmit, - onClose, - isSubmitting, -}: FeedbackFormProps) { - const handleSubmit = (event: React.FormEvent) => { +export function FeedbackForm({ onClose, polarity }: FeedbackFormProps) { + const [isSubmitting, setIsSubmitting] = React.useState(false); + + const copiedToClipboardToast = () => { + hotToast("Password copied to clipboard", { + icon: "📋", + position: "bottom-right", + }); + }; + + const onPressToast = (password: string) => { + navigator.clipboard.writeText(password); + copiedToClipboardToast(); + }; + + const shareFeedbackToast = ( + message: string, + link: string, + password: string, + ) => { + hotToast( +
+ {message} + onPressToast(password)} + href={link} + target="_blank" + rel="noreferrer" + > + Go to shared feedback + + onPressToast(password)} className="cursor-pointer"> + Password: {password} (copy) + +
, + { duration: 10000 }, + ); + }; + + const handleSubmit = async (event: React.FormEvent) => { event?.preventDefault(); const formData = new FormData(event.currentTarget); + setIsSubmitting(true); + + const email = formData.get("email")?.toString() || ""; + const permissions = (formData.get("permissions")?.toString() || + "private") as "private" | "public"; - const email = formData.get("email")?.toString(); - const permissions = formData.get("permissions")?.toString() as - | "private" - | "public" - | undefined; + const feedback: Feedback = { + version: FEEDBACK_VERSION, + email, + polarity, + permissions, + trajectory: [], + token: "", + }; - if (email) onSubmit(permissions || "private", email); + const response = await request("/api/submit-feedback", { + method: "POST", + body: JSON.stringify(feedback), + headers: { + "Content-Type": "application/json", + }, + }); + const { message, feedback_id, password } = response.body; // eslint-disable-line + const link = `${VIEWER_PAGE}?share_id=${feedback_id}`; + shareFeedbackToast(message, link, password); + setIsSubmitting(false); }; return ( diff --git a/frontend/src/components/feedback-modal.tsx b/frontend/src/components/feedback-modal.tsx index f9cf05f0789..96663135e89 100644 --- a/frontend/src/components/feedback-modal.tsx +++ b/frontend/src/components/feedback-modal.tsx @@ -1,6 +1,4 @@ import React from "react"; -import hotToast, { toast } from "react-hot-toast"; -import { useFetcher } from "@remix-run/react"; import { FeedbackForm } from "./feedback-form"; import { BaseModalTitle, @@ -8,82 +6,18 @@ import { } from "./modals/confirmation-modals/BaseModal"; import { ModalBackdrop } from "./modals/modal-backdrop"; import ModalBody from "./modals/ModalBody"; -import { clientAction } from "#/routes/submit-feedback"; interface FeedbackModalProps { - onSubmit: (permissions: "private" | "public", email: string) => void; onClose: () => void; isOpen: boolean; - isSubmitting?: boolean; + polarity: "positive" | "negative"; } export function FeedbackModal({ - onSubmit, onClose, isOpen, - isSubmitting, + polarity, }: FeedbackModalProps) { - const fetcher = useFetcher({ key: "feedback" }); - const isInitialRender = React.useRef(true); - - const copiedToClipboardToast = () => { - hotToast("Password copied to clipboard", { - icon: "📋", - position: "bottom-right", - }); - }; - - const onPressToast = (password: string) => { - navigator.clipboard.writeText(password); - copiedToClipboardToast(); - }; - - const shareFeedbackToast = ( - message: string, - link: string, - password: string, - ) => { - hotToast( -
- {message} - onPressToast(password)} - href={link} - target="_blank" - rel="noreferrer" - > - Go to shared feedback - - onPressToast(password)} className="cursor-pointer"> - Password: {password} (copy) - -
, - { duration: 10000 }, - ); - }; - - React.useEffect(() => { - if (isInitialRender.current) { - isInitialRender.current = false; - return; - } - - // Handle feedback submission - if (fetcher.state === "idle" && fetcher.data) { - if (!fetcher.data.success) { - toast.error("Error submitting feedback"); - } else if (fetcher.data.data) { - const { data } = fetcher.data; - const { message, link, password } = data; - shareFeedbackToast(message, link, password); - } - - onClose(); - } - }, [fetcher.state, fetcher.data?.success]); - if (!isOpen) return null; return ( @@ -91,11 +25,7 @@ export function FeedbackModal({ - + ); diff --git a/frontend/src/routes/submit-feedback.ts b/frontend/src/routes/submit-feedback.ts deleted file mode 100644 index 19281eea7ad..00000000000 --- a/frontend/src/routes/submit-feedback.ts +++ /dev/null @@ -1,47 +0,0 @@ -import { ClientActionFunctionArgs, json } from "@remix-run/react"; -import { Feedback } from "#/api/open-hands.types"; -import OpenHands from "#/api/open-hands"; - -const VIEWER_PAGE = "https://www.all-hands.dev/share"; - -const isFeedback = (feedback: unknown): feedback is Feedback => { - if (typeof feedback !== "object" || feedback === null) { - return false; - } - - return ( - "version" in feedback && - "email" in feedback && - "token" in feedback && - "feedback" in feedback && - "permissions" in feedback && - "trajectory" in feedback - ); -}; - -export const clientAction = async ({ request }: ClientActionFunctionArgs) => { - const formData = await request.formData(); - const feedback = formData.get("feedback")?.toString(); - const token = localStorage.getItem("token"); - - if (token && feedback) { - const parsed = JSON.parse(feedback); - if (isFeedback(parsed)) { - try { - const response = await OpenHands.sendFeedback(token, parsed); - if (response.statusCode === 200) { - const { message, feedback_id: feedbackId, password } = response.body; - const link = `${VIEWER_PAGE}?share_id=${feedbackId}`; - return json({ - success: true, - data: { message, link, password }, - }); - } - } catch (error) { - return json({ success: false, data: null }); - } - } - } - - return json({ success: false, data: null }); -}; diff --git a/openhands/events/stream.py b/openhands/events/stream.py index 1e4c3b9d539..aafbcc2fc87 100644 --- a/openhands/events/stream.py +++ b/openhands/events/stream.py @@ -71,7 +71,15 @@ def get_events( end_id=None, reverse=False, filter_out_type: tuple[type[Event], ...] 
| None = None, + filter_hidden=False, ) -> Iterable[Event]: + def should_filter(event: Event): + if filter_hidden and hasattr(event, 'hidden') and event.hidden: + return True + if filter_out_type is not None and isinstance(event, filter_out_type): + return True + return False + if reverse: if end_id is None: end_id = self._cur_id - 1 @@ -79,9 +87,7 @@ def get_events( while event_id >= start_id: try: event = self.get_event(event_id) - if filter_out_type is None or not isinstance( - event, filter_out_type - ): + if not should_filter(event): yield event except FileNotFoundError: logger.debug(f'No event found for ID {event_id}') @@ -93,9 +99,7 @@ def get_events( break try: event = self.get_event(event_id) - if filter_out_type is None or not isinstance( - event, filter_out_type - ): + if not should_filter(event): yield event except FileNotFoundError: break diff --git a/openhands/runtime/impl/e2b/sandbox.py b/openhands/runtime/impl/e2b/sandbox.py index 666dc43f701..d145dac3511 100644 --- a/openhands/runtime/impl/e2b/sandbox.py +++ b/openhands/runtime/impl/e2b/sandbox.py @@ -4,9 +4,7 @@ from glob import glob from e2b import Sandbox as E2BSandbox -from e2b.sandbox.exception import ( - TimeoutException, -) +from e2b.sandbox.exception import TimeoutException from openhands.core.config import SandboxConfig from openhands.core.logger import openhands_logger as logger diff --git a/openhands/server/data_models/feedback.py b/openhands/server/data_models/feedback.py index cbdd8744807..59f32008b52 100644 --- a/openhands/server/data_models/feedback.py +++ b/openhands/server/data_models/feedback.py @@ -1,5 +1,5 @@ import json -from typing import Any, Literal +from typing import Any, Literal, Optional import requests from pydantic import BaseModel @@ -10,10 +10,12 @@ class FeedbackDataModel(BaseModel): version: str email: str - token: str - feedback: Literal['positive', 'negative'] + polarity: Literal['positive', 'negative'] + feedback: Literal[ + 'positive', 'negative' + ] # TODO: remove this, its here for backward compatibility permissions: Literal['public', 'private'] - trajectory: list[dict[str, Any]] + trajectory: Optional[list[dict[str, Any]]] FEEDBACK_URL = 'https://share-od-trajectory-3u9bw9tx.uc.gateway.dev/share_od_trajectory' @@ -21,6 +23,7 @@ class FeedbackDataModel(BaseModel): def store_feedback(feedback: FeedbackDataModel) -> dict[str, str]: # Start logging + feedback.feedback = feedback.polarity display_feedback = feedback.model_dump() if 'trajectory' in display_feedback: display_feedback['trajectory'] = ( diff --git a/openhands/server/listen.py b/openhands/server/listen.py index fc740e80293..cc74d5ba736 100644 --- a/openhands/server/listen.py +++ b/openhands/server/listen.py @@ -634,14 +634,14 @@ async def upload_file(request: Request, files: list[UploadFile]): @app.post('/api/submit-feedback') -async def submit_feedback(request: Request, feedback: FeedbackDataModel): +async def submit_feedback(request: Request): """Submit user feedback. This function stores the provided feedback data. To submit feedback: ```sh - curl -X POST -F "email=test@example.com" -F "token=abc" -F "feedback=positive" -F "permissions=private" -F "trajectory={}" http://localhost:3000/api/submit-feedback + curl -X POST -d '{"email": "test@example.com"}' -H "Authorization:" ``` Args: @@ -656,6 +656,19 @@ async def submit_feedback(request: Request, feedback: FeedbackDataModel): """ # Assuming the storage service is already configured in the backend # and there is a function to handle the storage. 
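+    # The browser now sends only a small JSON payload (email, version, permissions, polarity).
+    # The trajectory is no longer supplied by the client: it is rebuilt here from the
+    # conversation's event stream, with hidden events excluded via filter_hidden=True.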
+ body = await request.json() + events = request.state.conversation.event_stream.get_events(filter_hidden=True) + trajectory = [] + for event in events: + trajectory.append(event_to_dict(event)) + feedback = FeedbackDataModel( + email=body.get('email', ''), + version=body.get('version', ''), + permissions=body.get('permissions', 'private'), + polarity=body.get('polarity', ''), + feedback=body.get('polarity', ''), + trajectory=trajectory, + ) try: feedback_data = store_feedback(feedback) return JSONResponse(status_code=200, content=feedback_data)