diff --git a/LICENSE b/LICENSE
index 57f53c30e8..209b99f034 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 The MIT License
 
 Copyright (c) 2017 OpenAI (http://openai.com)
+Copyright (c) 2018-2019 Stable-Baselines Team
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/docs/conf.py b/docs/conf.py
index e9937fac83..18ae8d8aab 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -42,7 +42,7 @@ def __getattr__(cls, name):
 # -- Project information -----------------------------------------------------
 
 project = 'Stable Baselines'
-copyright = '2018, Stable Baselines'
+copyright = '2018-2019, Stable Baselines'
 author = 'Stable Baselines Contributors'
 
 # The short X.Y version
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 0135b884fe..ed9d2d5c3a 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -5,9 +5,11 @@ Changelog
 
 For download links, please look at `Github release page `_.
 
-Pre-Release 2.4.1a (WIP)
+Release 2.4.1 (2019-02-11)
 --------------------------
 
+**Bug fixes and improvements**
+
 - fixed computation of training metrics in TRPO and PPO1
 - added ``reset_num_timesteps`` keyword when calling train() to continue tensorboard learning curves
 - reduced the size taken by tensorboard logs (added a ``full_tensorboard_log`` to enable full logging, which was the previous behavior)
@@ -17,6 +19,7 @@ Pre-Release 2.4.1a (WIP)
 - fixed custom policy examples in the doc for DQN and DDPG
 - remove gym spaces patch for equality functions
 - fixed tensorflow dependency: cpu version was installed overwritting tensorflow-gpu when present.
+- fixed a bug in ``traj_segment_generator`` (used in ppo1 and trpo) where ``new`` was not updated. (spotted by @junhyeokahn)
 
 
 Release 2.4.0 (2019-01-17)
@@ -233,4 +236,4 @@ Contributors (since v2.0.0):
 In random order...
 
 Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
-@EliasHasle @mrakgr @Bleyddyn @antoine-galataud
+@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn
diff --git a/setup.py b/setup.py
index d276f6d039..b0b32fd50d 100644
--- a/setup.py
+++ b/setup.py
@@ -136,7 +136,7 @@
       license="MIT",
       long_description=long_description,
       long_description_content_type='text/markdown',
-      version="2.4.1a0",
+      version="2.4.1",
       )
 
 # python setup.py sdist
diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py
index 5cbf3d7a3e..14a7029bf5 100644
--- a/stable_baselines/__init__.py
+++ b/stable_baselines/__init__.py
@@ -9,4 +9,4 @@
 from stable_baselines.trpo_mpi import TRPO
 from stable_baselines.sac import SAC
 
-__version__ = "2.4.1a0"
+__version__ = "2.4.1"
diff --git a/stable_baselines/trpo_mpi/utils.py b/stable_baselines/trpo_mpi/utils.py
index 8a119a0916..a6504a544d 100644
--- a/stable_baselines/trpo_mpi/utils.py
+++ b/stable_baselines/trpo_mpi/utils.py
@@ -18,7 +18,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
         - ob: (np.ndarray) observations
         - rew: (numpy float) rewards (if gail is used it is the predicted reward)
         - vpred: (numpy float) action logits
-        - new: (numpy bool) dones (is end of episode)
+        - dones: (numpy bool) dones (is end of episode -> True if first timestep of an episode)
         - ac: (np.ndarray) actions
         - prevac: (np.ndarray) previous actions
         - nextvpred: (numpy float) next action logits
@@ -32,7 +32,6 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
     # Initialize state variables
     step = 0
     action = env.action_space.sample()  # not used, just so we have the datatype
-    new = True
     observation = env.reset()
 
     cur_ep_ret = 0  # return in current episode
@@ -51,7 +50,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
     actions = np.array([action for _ in range(horizon)])
     prev_actions = actions.copy()
     states = policy.initial_state
-    done = None
+    done = True  # marks if we're on first timestep of an episode
 
     while True:
         prevac = action
@@ -66,9 +65,20 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
             else:
                 current_it_timesteps = sum(ep_lens) + current_it_len
 
-            yield {"ob": observations, "rew": rews, "dones": dones, "true_rew": true_rews, "vpred": vpreds,
-                   "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets,
-                   "ep_lens": ep_lens, "ep_true_rets": ep_true_rets, "total_timestep": current_it_timesteps}
+            yield {
+                    "ob": observations,
+                    "rew": rews,
+                    "dones": dones,
+                    "true_rew": true_rews,
+                    "vpred": vpreds,
+                    "ac": actions,
+                    "prevac": prev_actions,
+                    "nextvpred": vpred[0] * (1 - done),
+                    "ep_rets": ep_rets,
+                    "ep_lens": ep_lens,
+                    "ep_true_rets": ep_true_rets,
+                    "total_timestep": current_it_timesteps
+            }
             _, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape))
             # Be careful!!! if you change the downstream algorithm to aggregate
             # several of these batches, then be sure to do a deepcopy
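
Note on the last hunk: downstream, PPO1 and TRPO use ``nextvpred`` as the bootstrap value for the state that follows a fixed-length segment, so the ``(1 - done)`` mask decides whether the final transition of a segment bootstraps from the critic. Before this fix the stale ``new`` flag stayed True, so the bootstrap value was effectively always zero. The sketch below is a simplified, self-contained illustration of that effect; it is not the library's implementation, and the helper name ``segment_returns`` and the toy numbers are made up for this example.

import numpy as np

GAMMA = 0.99


def segment_returns(rewards, bootstrap_value, gamma=GAMMA):
    """Discounted returns for one fixed-length segment.

    bootstrap_value plays the role of nextvpred: the critic's estimate for
    the observation that follows the segment, already multiplied by
    (1 - done) so it is zero when that observation starts a new episode.
    """
    returns = np.zeros(len(rewards))
    running = bootstrap_value
    for t in reversed(range(len(rewards))):
        # Standard backward recursion: R_t = r_t + gamma * R_{t+1}
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


rewards = np.ones(4)   # toy 4-step segment; the episode is still running at the end
vpred_next = 5.0       # critic's value estimate for the observation after the segment
done = 0.0             # the segment ended mid-episode, so the next observation is not a reset

# With the fix: done is kept up to date, so a segment that ends mid-episode
# bootstraps from the critic's estimate of the next state.
print(segment_returns(rewards, vpred_next * (1 - done)))
# With the stale flag stuck at True, the mask was always zero, so returns near
# the segment boundary were systematically underestimated.
print(segment_returns(rewards, vpred_next * (1 - 1.0)))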