
Release 2.4.1 (#194)
* Prepare 2.4.1 release

* Fix variable not updated in traj_segment_generator
araffin authored and hill-a committed Feb 11, 2019
1 parent 5acf88f commit 655dd0e
Showing 6 changed files with 25 additions and 11 deletions.
1 change: 1 addition & 0 deletions LICENSE
@@ -1,6 +1,7 @@
The MIT License

Copyright (c) 2017 OpenAI (http://openai.com)
+ Copyright (c) 2018-2019 Stable-Baselines Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -42,7 +42,7 @@ def __getattr__(cls, name):
# -- Project information -----------------------------------------------------

project = 'Stable Baselines'
- copyright = '2018, Stable Baselines'
+ copyright = '2018-2019, Stable Baselines'
author = 'Stable Baselines Contributors'

# The short X.Y version
7 changes: 5 additions & 2 deletions docs/misc/changelog.rst
@@ -5,9 +5,11 @@ Changelog

For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.

- Pre-Release 2.4.1a (WIP)
+ Release 2.4.1 (2019-02-11)
--------------------------

+ **Bug fixes and improvements**

- fixed computation of training metrics in TRPO and PPO1
- added ``reset_num_timesteps`` keyword when calling train() to continue tensorboard learning curves
- reduced the size taken by tensorboard logs (added a ``full_tensorboard_log`` to enable full logging, which was the previous behavior); see the usage sketch after this file's diff for both options
@@ -17,6 +19,7 @@ Pre-Release 2.4.1a (WIP)
- fixed custom policy examples in the doc for DQN and DDPG
- removed gym spaces patch for equality functions
- fixed tensorflow dependency: cpu version was installed, overwriting tensorflow-gpu when present.
+ - fixed a bug in ``traj_segment_generator`` (used in ppo1 and trpo) where ``new`` was not updated. (spotted by @junhyeokahn)


Release 2.4.0 (2019-01-17)
@@ -233,4 +236,4 @@ Contributors (since v2.0.0):
In random order...

Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
- @EliasHasle @mrakgr @Bleyddyn @antoine-galataud
+ @EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn
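The two entries above on ``reset_num_timesteps`` and ``full_tensorboard_log`` can be exercised roughly as follows. This is an illustrative sketch, not part of the commit: the environment id and log directory are arbitrary choices, PPO2 stands in for any tensorboard-enabled algorithm, and ``learn()`` is used here as the public entry point for the ``train()`` call the changelog mentions.

import gym

from stable_baselines import PPO2

# Illustrative setup; the env and paths below are arbitrary.
env = gym.make("CartPole-v1")

# full_tensorboard_log=True restores the previous, verbose logging behavior;
# leaving it at False keeps the tensorboard event files small.
model = PPO2("MlpPolicy", env, verbose=1,
             tensorboard_log="./ppo2_tensorboard/",
             full_tensorboard_log=False)

# First run: the tensorboard learning curve starts at timestep 0.
model.learn(total_timesteps=10000)

# Continue training without resetting the internal timestep counter,
# so the tensorboard curve continues instead of starting over.
model.learn(total_timesteps=10000, reset_num_timesteps=False)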
2 changes: 1 addition & 1 deletion setup.py
@@ -136,7 +136,7 @@
license="MIT",
long_description=long_description,
long_description_content_type='text/markdown',
version="2.4.1a0",
version="2.4.1",
)

# python setup.py sdist
2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -9,4 +9,4 @@
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.sac import SAC

__version__ = "2.4.1a0"
__version__ = "2.4.1"
22 changes: 16 additions & 6 deletions stable_baselines/trpo_mpi/utils.py
@@ -18,7 +18,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
- ob: (np.ndarray) observations
- rew: (numpy float) rewards (if gail is used it is the predicted reward)
- vpred: (numpy float) action logits
- - new: (numpy bool) dones (is end of episode)
+ - dones: (numpy bool) dones (is end of episode -> True if first timestep of an episode)
- ac: (np.ndarray) actions
- prevac: (np.ndarray) previous actions
- nextvpred: (numpy float) next action logits
@@ -32,7 +32,6 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
# Initialize state variables
step = 0
action = env.action_space.sample() # not used, just so we have the datatype
- new = True
observation = env.reset()

cur_ep_ret = 0 # return in current episode
@@ -51,7 +50,7 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
actions = np.array([action for _ in range(horizon)])
prev_actions = actions.copy()
states = policy.initial_state
- done = None
+ done = True # marks if we're on first timestep of an episode

while True:
prevac = action
@@ -66,9 +65,20 @@ def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False):
else:
current_it_timesteps = sum(ep_lens) + current_it_len

yield {"ob": observations, "rew": rews, "dones": dones, "true_rew": true_rews, "vpred": vpreds,
"ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets,
"ep_lens": ep_lens, "ep_true_rets": ep_true_rets, "total_timestep": current_it_timesteps}
yield {
"ob": observations,
"rew": rews,
"dones": dones,
"true_rew": true_rews,
"vpred": vpreds,
"ac": actions,
"prevac": prev_actions,
"nextvpred": vpred[0] * (1 - done),
"ep_rets": ep_rets,
"ep_lens": ep_lens,
"ep_true_rets": ep_true_rets,
"total_timestep": current_it_timesteps
}
_, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape))
# Be careful!!! if you change the downstream algorithm to aggregate
# several of these batches, then be sure to do a deepcopy
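For context on the ``traj_segment_generator`` fix (an illustrative sketch, not part of the diff): ``vpred`` is the critic's value estimate for the current observation, and ``nextvpred`` is the bootstrap term the advantage estimator uses when a segment is cut mid-episode. Before this commit, ``new`` was initialized to ``True`` and, per the changelog, never updated, so the bootstrap term was always zeroed out, as if every segment ended exactly on an episode boundary. The numbers below are made up to show the difference.

import numpy as np

# Made-up values: the critic's estimate at the segment cut, and whether the
# last step started a new episode.
vpred = np.array([0.7], dtype=np.float32)  # shape (1,), as returned by policy.step
done = False                               # segment was cut mid-episode

# Fixed behavior (this commit): bootstrap from the critic when not done.
nextvpred = vpred[0] * (1 - done)             # -> 0.7

# Buggy behavior (before 2.4.1): the stale flag stayed True forever,
# so the bootstrap term was always 0.0.
stale_new = True
buggy_nextvpred = vpred[0] * (1 - stale_new)  # -> 0.0

print(nextvpred, buggy_nextvpred)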
