-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathiddpg.py
253 lines (219 loc) · 8.44 KB
/
iddpg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
from dataclasses import dataclass, MISSING
from typing import Dict, Iterable, Tuple, Type
from tensordict import TensorDictBase
from tensordict.nn import TensorDictModule, TensorDictSequential
from torchrl.data import Composite, Unbounded
from torchrl.modules import (
AdditiveGaussianWrapper,
Delta,
ProbabilisticActor,
TanhDelta,
)
from torchrl.objectives import DDPGLoss, LossModule, ValueEstimators
from benchmarl.algorithms.common import Algorithm, AlgorithmConfig
from benchmarl.models.common import ModelConfig
class Iddpg(Algorithm):
    """Independent DDPG: like :class:`~benchmarl.algorithms.Maddpg` (from `https://arxiv.org/abs/1706.02275 <https://arxiv.org/abs/1706.02275>`__) but with decentralized critics.

    Args:
        share_param_critic (bool): Whether to share the parameters of the critics within agent groups
        loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1".
        delay_value (bool): whether to separate the target value networks from the value networks used for
            data collection.
        use_tanh_mapping (bool): if ``True``, squash actions (output by the policy) into the action range, otherwise
            clip them.

    """

    def __init__(
        self,
        share_param_critic: bool,
        loss_function: str,
        delay_value: bool,
        use_tanh_mapping: bool,
        **kwargs
    ):
        # Everything not specific to Iddpg is forwarded untouched to the base class.
        super().__init__(**kwargs)

        self.share_param_critic = share_param_critic
        self.loss_function = loss_function
        self.delay_value = delay_value
        self.use_tanh_mapping = use_tanh_mapping

    #############################
    # Overridden abstract methods
    #############################

    def _get_loss(
        self, group: str, policy_for_loss: TensorDictModule, continuous: bool
    ) -> Tuple[LossModule, bool]:
        """Build the DDPG loss for ``group``.

        Args:
            group (str): name of the agent group the loss is built for.
            policy_for_loss (TensorDictModule): actor network handed to the loss.
            continuous (bool): whether the group's actions are continuous.

        Returns:
            The configured loss module paired with ``True`` (the meaning of the
            flag is defined by the base class contract, not in this file).

        Raises:
            NotImplementedError: if ``continuous`` is ``False``.
        """
        if not continuous:
            raise NotImplementedError(
                "Iddpg is not compatible with discrete actions yet"
            )

        critic = self.get_value_module(group)
        ddpg_loss = DDPGLoss(
            actor_network=policy_for_loss,
            value_network=critic,
            delay_value=self.delay_value,
            loss_function=self.loss_function,
        )

        # Point the loss at the entries nested under this group's name.
        key_overrides = {
            "state_action_value": (group, "state_action_value"),
            "reward": (group, "reward"),
            "priority": (group, "td_error"),
            "done": (group, "done"),
            "terminated": (group, "terminated"),
        }
        ddpg_loss.set_keys(**key_overrides)

        ddpg_loss.make_value_estimator(
            ValueEstimators.TD0, gamma=self.experiment_config.gamma
        )
        return ddpg_loss, True

    def _get_parameters(self, group: str, loss: LossModule) -> Dict[str, Iterable]:
        """Return the actor and value parameters of ``loss`` as two flat lists.

        Args:
            group (str): name of the agent group (not used by this implementation).
            loss (LossModule): loss exposing ``actor_network_params`` and
                ``value_network_params``.

        Returns:
            A dict with the keys ``"loss_actor"`` and ``"loss_value"``.
        """
        actor_params = list(loss.actor_network_params.flatten_keys().values())
        value_params = list(loss.value_network_params.flatten_keys().values())
        return {"loss_actor": actor_params, "loss_value": value_params}

    def _get_policy_for_loss(
        self, group: str, model_config: ModelConfig, continuous: bool
    ) -> TensorDictModule:
        """Build the deterministic actor used inside the loss for ``group``.

        Args:
            group (str): name of the agent group.
            model_config (ModelConfig): configuration used to instantiate the actor model.
            continuous (bool): whether the group's actions are continuous.

        Returns:
            A :class:`ProbabilisticActor` reading ``(group, "param")`` and
            writing ``(group, "action")``.

        Raises:
            NotImplementedError: if ``continuous`` is ``False``.
        """
        if not continuous:
            raise NotImplementedError(
                "Iddpg is not compatible with discrete actions yet"
            )

        n_agents = len(self.group_map[group])
        logits_shape = list(self.action_spec[group, "action"].shape)

        observation_for_group = self.observation_spec[group].clone().to(self.device)
        actor_input_spec = Composite({group: observation_for_group})

        param_spec = Composite(
            {"param": Unbounded(shape=logits_shape)},
            shape=(n_agents,),
        )
        actor_output_spec = Composite({group: param_spec})

        actor_module = model_config.get_model(
            input_spec=actor_input_spec,
            output_spec=actor_output_spec,
            agent_group=group,
            input_has_agent_dim=True,
            n_agents=n_agents,
            centralised=False,
            share_params=self.experiment_config.share_policy_params,
            device=self.device,
            action_spec=self.action_spec,
        )

        # With tanh mapping the distribution receives the action bounds;
        # without it, no extra kwargs are passed and ``safe`` is enabled below.
        if self.use_tanh_mapping:
            action_space = self.action_spec[(group, "action")].space
            distribution_class = TanhDelta
            distribution_kwargs = {
                "low": action_space.low,
                "high": action_space.high,
            }
        else:
            distribution_class = Delta
            distribution_kwargs = {}

        return ProbabilisticActor(
            module=actor_module,
            spec=self.action_spec[group, "action"],
            in_keys=[(group, "param")],
            out_keys=[(group, "action")],
            distribution_class=distribution_class,
            distribution_kwargs=distribution_kwargs,
            return_log_prob=False,
            safe=not self.use_tanh_mapping,
        )

    def _get_policy_for_collection(
        self, policy_for_loss: TensorDictModule, group: str, continuous: bool
    ) -> TensorDictModule:
        """Wrap ``policy_for_loss`` with additive Gaussian noise for data collection.

        Args:
            policy_for_loss (TensorDictModule): the policy to wrap.
            group (str): name of the agent group; the noise targets ``(group, "action")``.
            continuous (bool): not used by this implementation.

        Returns:
            An :class:`AdditiveGaussianWrapper` configured from the experiment's
            exploration settings.
        """
        anneal_frames = self.experiment_config.get_exploration_anneal_frames(
            self.on_policy
        )
        return AdditiveGaussianWrapper(
            policy_for_loss,
            annealing_num_steps=anneal_frames,
            action_key=(group, "action"),
            sigma_init=self.experiment_config.exploration_eps_init,
            sigma_end=self.experiment_config.exploration_eps_end,
        )

    def process_batch(self, group: str, batch: TensorDictBase) -> TensorDictBase:
        """Ensure per-group ``done``, ``terminated`` and ``reward`` exist under ``"next"``.

        For each of the three entries, if ``("next", group, <name>)`` is not
        already in the batch, it is created from ``("next", <name>)`` by adding
        a trailing dimension and expanding to ``(*group_shape, 1)``.

        Args:
            group (str): name of the agent group.
            batch (TensorDictBase): batch to complete; it is modified in place.

        Returns:
            The same ``batch`` object.
        """
        # Snapshot taken once, before any entry is added.
        present_keys = list(batch.keys(True, True))
        group_shape = batch.get(group).shape
        target_shape = (*group_shape, 1)

        for entry in ("done", "terminated", "reward"):
            nested_key = ("next", group, entry)
            if nested_key in present_keys:
                continue
            shared_value = batch.get(("next", entry))
            batch.set(nested_key, shared_value.unsqueeze(-1).expand(target_shape))

        return batch

    #####################
    # Custom new methods
    #####################

    def get_value_module(self, group: str) -> TensorDictModule:
        """Build the decentralised critic for ``group``.

        The critic input is the group's observation spec updated with the
        group's action spec; the output is a ``"state_action_value"`` entry of
        shape ``(n_agents, 1)`` nested under the group name.

        Args:
            group (str): name of the agent group.

        Returns:
            A :class:`TensorDictSequential` containing the critic model.
        """
        n_agents = len(self.group_map[group])

        # ``update`` is chained on the clone so the stored observation spec is
        # not the object being updated.
        observation_and_action = (
            self.observation_spec[group].clone().update(self.action_spec[group])
        )
        critic_input_spec = Composite({group: observation_and_action})

        value_spec = Composite(
            {"state_action_value": Unbounded(shape=(n_agents, 1))},
            shape=(n_agents,),
        )
        critic_output_spec = Composite({group: value_spec})

        critic = self.critic_model_config.get_model(
            input_spec=critic_input_spec,
            output_spec=critic_output_spec,
            n_agents=n_agents,
            centralised=False,
            input_has_agent_dim=True,
            agent_group=group,
            share_params=self.share_param_critic,
            device=self.device,
            action_spec=self.action_spec,
        )
        return TensorDictSequential(critic)


@dataclass
class IddpgConfig(AlgorithmConfig):
    """Configuration dataclass for :class:`~benchmarl.algorithms.Iddpg`."""

    # No in-code defaults: every field is set to the ``dataclasses.MISSING``
    # sentinel. Field order is part of the generated ``__init__`` signature.
    share_param_critic: bool = MISSING
    loss_function: str = MISSING
    delay_value: bool = MISSING
    use_tanh_mapping: bool = MISSING

    @staticmethod
    def associated_class() -> Type[Algorithm]:
        """Return the algorithm class this configuration belongs to."""
        return Iddpg

    @staticmethod
    def on_policy() -> bool:
        """Return ``False``: Iddpg is declared as not on-policy."""
        return False

    @staticmethod
    def has_independent_critic() -> bool:
        """Return ``True``: Iddpg is declared as having an independent critic."""
        return True

    @staticmethod
    def supports_continuous_actions() -> bool:
        """Return ``True``: continuous actions are supported."""
        return True

    @staticmethod
    def supports_discrete_actions() -> bool:
        """Return ``False``: discrete actions are not supported."""
        return False