diff --git a/sotopia/cli/benchmark/benchmark.py b/sotopia/cli/benchmark/benchmark.py
index 5ee317739..06f6010a7 100644
--- a/sotopia/cli/benchmark/benchmark.py
+++ b/sotopia/cli/benchmark/benchmark.py
@@ -360,7 +360,7 @@ def benchmark_all(
     batch_size: int = typer.Option(10, help="The batch size you want to use."),
     task: str = typer.Option("hard", help="The task id you want to benchmark."),
     print_logs: bool = typer.Option(False, help="Print logs."),
-):
+) -> None:
     for model in model_list:
         benchmark(
             model=model,
@@ -388,7 +388,7 @@ def benchmark_display(
         "gpt-4o", help="The evaluator model you want to use."
     ),
     task: str = typer.Option("hard", help="The task id you want to benchmark."),
-):
+) -> None:
     """
     Usage: sotopia benchmark-display --model-list gpt-4o --model-list together_ai/meta-llama-Llama-3-70b-chat-hf
     Aggregate all the results for the benchmark, as described in https://github.com/sotopia-lab/sotopia-space/blob/main/data_dir/models_vs_gpt35.jsonl
@@ -402,7 +402,7 @@ def benchmark_display(
         if len(episodes) == 0:
             print(f"No episodes found for {model}")
             continue
-        avg_rewards = get_avg_reward(episodes, model)
+        avg_rewards = get_avg_reward(episodes, model)  # type: ignore
         model_rewards_dict[model] = avg_rewards
         print(f"Model: {model}, episodes: {len(episodes)}, Avg Rewards: {avg_rewards}")
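
A minimal sketch of the pattern this diff applies, assuming the motivation is a mypy strictness check such as disallow_untyped_defs (the diff itself does not state this): Typer commands that return nothing get an explicit -> None annotation, and a call mypy cannot verify is silenced with # type: ignore. The command name below is hypothetical and not part of this PR.

    import typer

    app = typer.Typer()

    @app.command()
    def example_command(  # hypothetical command, for illustration only
        task: str = typer.Option("hard", help="The task id you want to benchmark."),
    ) -> None:  # explicit None return keeps the function "typed" under strict mypy
        print(f"Running task: {task}")

    if __name__ == "__main__":
        app()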