Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: enabled saving and evaluation for moderator (#271) #272

Merged
merged 30 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
693f792
feat: FastAPI Implementation of Sotopia Part Two (w websocket) (#252)
XuhuiZhou Dec 5, 2024
5a9f4b7
Add customizable evaluation dimensions (#256)
bugsz Dec 8, 2024
dea25d3
Feat/addtional fast apis for non-streaming simulation and managing re…
XuhuiZhou Dec 11, 2024
cadf06d
fix ci error
XuhuiZhou Dec 12, 2024
187a21b
solving pytests
XuhuiZhou Dec 12, 2024
ec5c394
improve the tests
XuhuiZhou Dec 12, 2024
1a1244e
add custom eval fast api (#268)
XuhuiZhou Dec 13, 2024
ae4014e
fix mypy error
XuhuiZhou Dec 13, 2024
ab6903a
aact moderator (#257)
XuhuiZhou Dec 14, 2024
1f4fb0a
Deploy the api to modal (#267)
bugsz Dec 28, 2024
cb6b2d1
Feature/sotopia demo UI (#261)
XuhuiZhou Dec 31, 2024
70293aa
remove dev tag
XuhuiZhou Dec 31, 2024
2526be1
add custom eval
XuhuiZhou Dec 31, 2024
b0b53d8
base dimension
XuhuiZhou Dec 31, 2024
22a1ecf
fix ui mypy
XuhuiZhou Jan 2, 2025
302835a
fix mypy
XuhuiZhou Jan 2, 2025
0e44603
add delete dimension
XuhuiZhou Jan 5, 2025
520a1dd
update streamlit ui
XuhuiZhou Jan 5, 2025
5ffdee3
ignores the ui directory
XuhuiZhou Jan 6, 2025
f9e2ea3
Committing changes before push
openhands-agent Jan 6, 2025
a45e440
pytest for eval dimension
XuhuiZhou Jan 6, 2025
24ca6a3
fix mypy
XuhuiZhou Jan 6, 2025
6b2db2a
clean up comments
bugsz Jan 7, 2025
66da649
feat: enabled saving and evaluation for moderator (#271)
JXZhou0224 Jan 7, 2025
bbf6061
back compatible with evaluators[draft]
XuhuiZhou Jan 7, 2025
7558927
add evaluation node
XuhuiZhou Jan 8, 2025
8fc24b5
implemented the evaluator for multiagent
Jan 15, 2025
a67166e
Merge branch 'main' into feature/multiparty
XuhuiZhou Jan 16, 2025
02ea368
fix mypy errors
Jan 17, 2025
cb441c0
fix mypy errors in sotopia/database
Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/experiment_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
Expand Down Expand Up @@ -164,7 +164,7 @@ def _iterate_env_agent_combo_not_in_db(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[evaluation_dimensions], # type: ignore
# TODO check how to do type annotation
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import logging
import sys
import json
from rich.logging import RichHandler

from aact import NodeFactory

from sotopia.experimental.agents.base_agent import BaseAgent
from sotopia.experimental.agents.datamodels import Observation, AgentAction
from sotopia.database.persistent_profile import AgentProfile
from typing import Any

from sotopia.generation_utils import agenerate
from sotopia.generation_utils.generate import StrOutputParser
Expand Down Expand Up @@ -33,11 +36,13 @@ def __init__(
input_channels: list[str],
output_channel: str,
query_interval: int,
agent_name: str,
node_name: str,
goal: str,
model_name: str,
redis_url: str,
goal: str,
agent_name: str = "",
background: dict[str, Any] | None = None,
agent_pk: str | None = None,
redis_url: str = "redis://localhost:6379/0",
):
super().__init__(
[(input_channel, Observation) for input_channel in input_channels],
Expand All @@ -47,23 +52,59 @@ def __init__(
)
self.output_channel = output_channel
self.query_interval = query_interval
self.count_ticks = 0
self.count_ticks: int = 0
self.message_history: list[Observation] = []
self.name = agent_name
self.model_name = model_name
self.goal = goal
self.goal: str = goal
self.model_name: str = model_name
self.agent_profile_pk: str | None = agent_pk
self.name: str = agent_name
self.background: dict[str, Any] | None = background
self.awake: bool = False

def set_profile(self, use_pk_value: bool) -> None:
    """Resolve this agent's ``AgentProfile`` and sync local state from it.

    Args:
        use_pk_value: When true, the profile is loaded with
            ``AgentProfile.get(pk=self.agent_profile_pk)``. When false, a new
            profile is built from ``self.name`` and ``self.background`` and
            saved.

    Raises:
        AssertionError: If ``use_pk_value`` is false and ``self.background``
            or ``self.name`` is ``None``.

    After the call, ``self.agent_profile_pk``, ``self.name`` and
    ``self.background`` all reflect the resolved profile.
    """
    if use_pk_value:
        profile = AgentProfile.get(pk=self.agent_profile_pk)
    else:
        # Raised explicitly instead of via an ``assert`` statement so the
        # check still runs under ``python -O``; the exception type and
        # message are unchanged for existing callers.
        if self.background is None or self.name is None:
            raise AssertionError("Background and name must be provided")
        # Everything before the first space is the first name, the remainder
        # is the last name; a name without a space has an empty last name.
        first_name, _, last_name = self.name.partition(" ")
        # NOTE(review): a background dict that itself carries "first_name" or
        # "last_name" keys would collide with the explicit keywords below.
        profile = AgentProfile(
            first_name=first_name, last_name=last_name, **self.background
        )
        profile.save()

    self.agent_profile_pk = profile.pk
    self.name = " ".join([profile.first_name, profile.last_name]).strip()
    self.background = profile.model_dump()

def _format_message_history(self, message_history: list[Observation]) -> str:
## TODO: akhatua Fix the mapping of action to be gramatically correct
return "\n".join(message.to_natural_language() for message in message_history)

async def aact(self, obs: Observation) -> AgentAction:
if obs.turn_number == -1:
if self.awake:
return AgentAction(
agent_name=self.name,
output_channel=self.output_channel,
action_type="none",
argument="",
)
args = json.loads(obs.last_turn)
self.set_profile(args["use_pk_value"])
self.awake = True
return AgentAction(
agent_name=self.name,
output_channel=self.output_channel,
action_type="none",
argument=self.model_name,
argument=json.dumps(
{"pk": self.agent_profile_pk, "model_name": self.model_name}
),
)

self.message_history.append(obs)
Expand Down
21 changes: 18 additions & 3 deletions examples/experimental/sotopia_original_replica/origin.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
redis_url = "redis://localhost:6379/0"
extra_modules = ["examples.experimental.sotopia_original_replica.llm_agent_sotopia", "examples.experimental.nodes.chat_print_node", "sotopia.experimental.agents.moderator"]
extra_modules = ["examples.experimental.sotopia_original_replica.llm_agent_sotopia", "examples.experimental.nodes.chat_print_node", "sotopia.experimental.agents.moderator","sotopia.experimental.agents.evaluators"]


[[nodes]]
Expand All @@ -9,11 +9,13 @@ node_class = "moderator"
[nodes.node_args]
output_channels = ["moderator:Jane", "moderator:Jack"]
input_channels = ["Jane:moderator", "Jack:moderator"]
agent_backgrounds = {"Jane" = "", "Jack" = ""}
evaluator_channels = [["evaluator:moderator","moderator:evaluator"]]
agent_mapping = {"moderator:Jane" = "Jane", "moderator:Jack" = "Jack"}
scenario = "Two friends are sitting in a cafe and catching up with each other's lives."
max_turns = 2
max_turns = 3
push_to_db = false
evaluate_episode = true
use_pk_value = false

[[nodes]]
node_name = "Jack"
Expand All @@ -26,6 +28,8 @@ output_channel = "Jack:moderator"
goal = "Your goal is to borrow 5000 dollars from Jane."
model_name = "gpt-4o-mini"
agent_name = "Jack"
background = {"occupation" = "construction worker"}
agent_pk = ""


[[nodes]]
Expand All @@ -39,6 +43,8 @@ input_channels = ["moderator:Jane"]
goal = "Your goal is to help Jack however, you are in a finicial crisis yourself and can only afford to give him 500 dollars."
model_name = "gpt-4o-mini"
agent_name = "Jane"
background = {"occupation" = "gardener"}
agent_pk = ""

[[nodes]]
node_name = "chat_print"
Expand All @@ -50,3 +56,12 @@ node_class = "chat_print"

[nodes.node_args]
env_agents = ["Jack", "Jane"]

[[nodes]]
node_name = "evaluator"
node_class = "evaluator"

[nodes.node_args]
input_channels = ["moderator:evaluator"]
output_channels = ["evaluator:moderator"]
model_name = "gpt-4o-mini"
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
)
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
Expand Down Expand Up @@ -229,7 +229,7 @@ def yield_env_agent_combo(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[SotopiaDimensions],
),
Expand Down
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes_with_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
)
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
Expand Down Expand Up @@ -327,7 +327,7 @@ def yield_env_agent_combo(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[SotopiaDimensions],
),
Expand Down
4 changes: 2 additions & 2 deletions examples/use_custom_dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Type, Union
from redis_om import Migrator
from sotopia.envs.evaluators import (
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
EvaluationForTwoAgents,
RuleBasedTerminatedEvaluator,
)
Expand Down Expand Up @@ -152,7 +152,7 @@ def run_simple_sample_with_custom_samples(
custom_dimensions, list_name="custom"
)
evaluator = RuleBasedTerminatedEvaluator(max_turn_number=10, max_stale_turn=2)
terminal_evaluator = ReachGoalLLMEvaluator(
terminal_evaluator = EpisodeLLMEvaluator(
model_name="gpt-4o-mini",
response_format_class=EvaluationForTwoAgents[custom_dimensions_type], # type: ignore
)
Expand Down
6 changes: 3 additions & 3 deletions sotopia-chat/chat_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
EnvironmentProfile,
)
from sotopia.envs.evaluators import (
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
)
from sotopia.envs.parallel import ParallelSotopiaEnv
Expand Down Expand Up @@ -64,7 +64,7 @@ async def _start_server_with_two_session_ids_and_agent_env_combo(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator("gpt-4", EvaluationForTwoAgents[SotopiaDimensions]),
EpisodeLLMEvaluator("gpt-4", EvaluationForTwoAgents[SotopiaDimensions]),
],
)
random.shuffle(session_ids)
Expand Down Expand Up @@ -97,7 +97,7 @@ async def _start_server_with_one_session_id_and_agent_env_combo(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator("gpt-4", EvaluationForTwoAgents[SotopiaDimensions]),
EpisodeLLMEvaluator("gpt-4", EvaluationForTwoAgents[SotopiaDimensions]),
],
)

Expand Down
4 changes: 2 additions & 2 deletions sotopia/api/fastapi_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from sotopia.envs.parallel import ParallelSotopiaEnv
from sotopia.envs.evaluators import (
RuleBasedTerminatedEvaluator,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
EvaluationForTwoAgents,
SotopiaDimensions,
)
Expand Down Expand Up @@ -267,7 +267,7 @@ async def nonstreaming_simulation(
),
],
"terminal_evaluators": [
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
simulation_request.models[0],
EvaluationForTwoAgents[SotopiaDimensions],
),
Expand Down
4 changes: 2 additions & 2 deletions sotopia/api/websocket_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
)
from sotopia.agents import Agents, LLMAgent
Expand Down Expand Up @@ -98,7 +98,7 @@ def get_env_agents(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
evaluator_model,
EvaluationForTwoAgents[evaluation_dimensions], # type: ignore
),
Expand Down
4 changes: 2 additions & 2 deletions sotopia/cli/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from sotopia.database.serialization import get_rewards_from_episode
from sotopia.envs.evaluators import (
EvaluationForTwoAgents,
ReachGoalLLMEvaluator,
EpisodeLLMEvaluator,
RuleBasedTerminatedEvaluator,
SotopiaDimensions,
)
Expand Down Expand Up @@ -363,7 +363,7 @@ def _list_all_env_agent_combo_not_in_db(
RuleBasedTerminatedEvaluator(max_turn_number=20, max_stale_turn=2),
],
terminal_evaluators=[
ReachGoalLLMEvaluator(
EpisodeLLMEvaluator(
model_names["env"],
EvaluationForTwoAgents[SotopiaDimensions],
),
Expand Down
2 changes: 1 addition & 1 deletion sotopia/database/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class BaseEpisodeLog(BaseModel):
tag: str | None = Field(index=True, default="")
models: list[str] | None = Field(index=True, default=[])
messages: list[list[tuple[str, str, str]]] # Messages arranged by turn
reasoning: str
reasoning: str = Field(default="")
rewards: list[tuple[float, dict[str, float]] | float] # Rewards arranged by turn
rewards_prompt: str

Expand Down
2 changes: 1 addition & 1 deletion sotopia/envs/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ async def __acall__(
return self(turn_number, messages)


class ReachGoalLLMEvaluator(Evaluator, Generic[T_eval_dim]):
class EpisodeLLMEvaluator(Evaluator, Generic[T_eval_dim]):
@beartype
def __init__(
self,
Expand Down
Loading
Loading