Merge pull request DLR-RM#57 from Antonin-Raffin/misc/improvements

Misc improvements
Shunian-Chen · Mar 12, 2020 · bfbe96c · bfbe96c
2 parents 809a3d3 + 70e601c
commit bfbe96c
Show file tree

Hide file tree

Showing 33 changed files with 633 additions and 422 deletions.
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
@@ -9,18 +9,26 @@ Pre-Release 0.3.0a0 (WIP)
 
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
+- Removed default seed
+- Bump dependencies (PyTorch and Gym)
 
 New Features:
 ^^^^^^^^^^^^^
 
 Bug Fixes:
 ^^^^^^^^^^
+- Synced callbacks with Stable-Baselines
 
 Deprecations:
 ^^^^^^^^^^^^^
 
 Others:
 ^^^^^^^
+- SAC with SDE now samples only one matrix
+- Added ``clip_mean`` parameter to SAC policy
+- Buffers now return ``NamedTuple``
+- More typing
+- Add test for ``expln``
 
 Documentation:
 ^^^^^^^^^^^^^^
@@ -34,25 +42,25 @@ Pre-Release 0.2.0 (2020-02-14)
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
 - Python 2 support was dropped, Torchy Baselines now requires Python 3.6 or above
-- Return type of `evaluation.evaluate_policy()` has been changed
+- Return type of ``evaluation.evaluate_policy()`` has been changed
 - Refactored the replay buffer to avoid transformation between PyTorch and NumPy
 - Created `OffPolicyRLModel` base class
 - Remove deprecated JSON format for `Monitor`
 
 New Features:
 ^^^^^^^^^^^^^
-- Add `seed()` method to `VecEnv` class
+- Add ``seed()`` method to ``VecEnv`` class
 - Add support for Callback (cf https://github.com/hill-a/stable-baselines/pull/644)
 - Add methods for saving and loading replay buffer
-- Add `extend()` method to the buffers
-- Add `get_vec_normalize_env()` to `BaseRLModel` to retrieve `VecNormalize` wrapper when it exists
-- Add `results_plotter` from Stable Baselines
-- Improve `predict()` method to handle different type of observations (single, vectorized, ...)
+- Add ``extend()`` method to the buffers
+- Add ``get_vec_normalize_env()`` to ``BaseRLModel`` to retrieve ``VecNormalize`` wrapper when it exists
+- Add ``results_plotter`` from Stable Baselines
+- Improve ``predict()`` method to handle different types of observations (single, vectorized, ...)
 
 Bug Fixes:
 ^^^^^^^^^^
 - Fix loading models on CPU that were trained on GPU
-- Fix `reset_num_timesteps` that was not used
+- Fix ``reset_num_timesteps`` that was not used
 - Fix entropy computation for squashed Gaussian (approximate it now)
 - Fix seeding when using multiple environments (different seed per env)
 
@@ -63,8 +71,8 @@ Others:
 ^^^^^^^
 - Add type check
 - Converted all format strings to f-strings
-- Add test for `OrnsteinUhlenbeckActionNoise`
-- Add type aliases in `common.type_aliases`
+- Add test for ``OrnsteinUhlenbeckActionNoise``
+- Add type aliases in ``common.type_aliases``
 
 Documentation:
 ^^^^^^^^^^^^^^
@@ -80,7 +88,7 @@ Breaking Changes:
 
 New Features:
 ^^^^^^^^^^^^^
-- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with `Box` input space
+- Initial release of A2C, CEM-RL, PPO, SAC and TD3, working only with ``Box`` input space
 - State-Dependent Exploration (SDE) for A2C, PPO, SAC and TD3
 
 Bug Fixes:
@@ -110,4 +118,12 @@ Contributors:
 -------------
 In random order...
 
-Thanks to @hill-a @enerijunior @AdamGleave @Miffyli
+Thanks to the maintainers of V2: @hill-a @enerijunior @AdamGleave @Miffyli
+
+And all the contributors:
+@bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
+@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol
+@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs
+@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket
+@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching
+@flodorner @KuKuXia @NeoExtended @solliet @mmcenta @richardwu
diff --git a/setup.py b/setup.py
@@ -7,9 +7,9 @@
       packages=[package for package in find_packages()
                 if package.startswith('torchy_baselines')],
       install_requires=[
-          'gym[classic_control]>=0.10.9',
+          'gym[classic_control]>=0.11',
           'numpy',
-          'torch>=1.2.0',
+          'torch>=1.4.0',
           'cloudpickle',
           # For reading logs
           'pandas',
@@ -47,7 +47,7 @@
       license="MIT",
       long_description="",
       long_description_content_type='text/markdown',
-      version="0.2.0",
+      version="0.2.3",
       )
 
 # python setup.py sdist

diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py
@@ -6,7 +6,7 @@
 
 from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3
 from torchy_baselines.common.callbacks import (CallbackList, CheckpointCallback, EvalCallback,
-    EveryNTimesteps, StopTrainingOnRewardThreshold)
+                                               EveryNTimesteps, StopTrainingOnRewardThreshold)
 
 
 @pytest.mark.parametrize("model_class", [A2C, CEMRL, PPO, SAC, TD3])
@@ -44,6 +44,6 @@ def test_callbacks(model_class):
     # Transform callback into a callback list automatically
     model.learn(500, callback=[checkpoint_callback, eval_callback])
     # Automatic wrapping, old way of doing callbacks
-    model.learn(500, callback=lambda _locals, _globals : True)
+    model.learn(500, callback=lambda _locals, _globals: True)
     if os.path.exists(log_folder):
         shutil.rmtree(log_folder)
diff --git a/tests/test_distributions.py b/tests/test_distributions.py
@@ -22,6 +22,7 @@ def test_bijector():
     # Check the inverse method
     assert th.isclose(TanhBijector.inverse(squashed_actions), actions).all()
 
+
 @pytest.mark.parametrize("model_class", [A2C, PPO])
 def test_squashed_gaussian(model_class):
     """

diff --git a/tests/test_logger.py b/tests/test_logger.py
@@ -5,7 +5,8 @@
 import numpy as np
 
 from torchy_baselines.common.logger import (make_output_format, read_csv, read_json, DEBUG, ScopedConfigure,
-    info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn, error, reset)
+                                            info, debug, set_level, configure, logkv, logkvs, dumpkvs, logkv_mean, warn,
+                                            error, reset)
 
 KEY_VALUES = {
     "test": 1,

diff --git a/tests/test_predict.py b/tests/test_predict.py
@@ -12,6 +12,7 @@
     SAC,
 ]
 
+
 @pytest.mark.parametrize("model_class", MODEL_LIST)
 def test_auto_wrap(model_class):
     # test auto wrapping of env into a VecEnv

diff --git a/tests/test_run.py b/tests/test_run.py
@@ -1,7 +1,5 @@
-import os
-
-import pytest
 import numpy as np
+import pytest
 
 from torchy_baselines import A2C, CEMRL, PPO, SAC, TD3
 from torchy_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
@@ -25,7 +23,7 @@ def test_cemrl():
 @pytest.mark.parametrize("model_class", [A2C, PPO])
 @pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
 def test_onpolicy(model_class, env_id):
-    model = model_class('MlpPolicy', env_id, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
+    model = model_class('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
     model.learn(total_timesteps=1000, eval_freq=500)
 
 

diff --git a/tests/test_save_load.py b/tests/test_save_load.py
@@ -16,6 +16,7 @@
     SAC,
 ]
 
+
 @pytest.mark.parametrize("model_class", MODEL_LIST)
 def test_save_load(model_class):
     """

diff --git a/tests/test_sde.py b/tests/test_sde.py
@@ -2,7 +2,7 @@
 import torch as th
 from torch.distributions import Normal
 
-from torchy_baselines import A2C, TD3, SAC
+from torchy_baselines import A2C, TD3, SAC, PPO
 
 
 def test_state_dependent_exploration_grad():
@@ -55,12 +55,13 @@ def test_state_dependent_exploration_grad():
     assert sigma_hat.grad.allclose(grad)
 
 
-@pytest.mark.parametrize("model_class", [TD3, SAC, A2C])
+@pytest.mark.parametrize("model_class", [TD3, SAC, A2C, PPO])
 @pytest.mark.parametrize("sde_net_arch", [None, [32, 16], []])
-def test_state_dependent_offpolicy_noise(model_class, sde_net_arch):
+@pytest.mark.parametrize("use_expln", [False, True])
+def test_state_dependent_offpolicy_noise(model_class, sde_net_arch, use_expln):
     model = model_class('MlpPolicy', 'Pendulum-v0', use_sde=True, seed=None, create_eval_env=True,
-                        verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch))
-    model.learn(total_timesteps=int(1000), eval_freq=500)
+                        verbose=1, policy_kwargs=dict(log_std_init=-2, sde_net_arch=sde_net_arch, use_expln=use_expln))
+    model.learn(total_timesteps=int(500), eval_freq=250)
 
 
 def test_scheduler():

diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py
@@ -3,14 +3,16 @@
 import numpy as np
 
 from torchy_baselines.common.running_mean_std import RunningMeanStd
-from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization
+from torchy_baselines.common.vec_env import DummyVecEnv, VecNormalize, VecFrameStack, sync_envs_normalization, unwrap_vec_normalize
 from torchy_baselines import CEMRL, SAC, TD3
 
 ENV_ID = 'Pendulum-v0'
 
+
 def make_env():
     return gym.make(ENV_ID)
 
+
 def check_rms_equal(rmsa, rmsb):
     assert np.all(rmsa.mean == rmsb.mean)
     assert np.all(rmsa.var == rmsb.var)
@@ -34,6 +36,7 @@ def check_vec_norm_equal(norma, normb):
     assert norma.epsilon == normb.epsilon
     assert norma.training == normb.training
 
+
 def _make_warmstart_cartpole():
     """Warm-start VecNormalize by stepping through CartPole"""
     venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
@@ -50,8 +53,8 @@ def _make_warmstart_cartpole():
 def test_runningmeanstd():
     """Test RunningMeanStd object"""
     for (x_1, x_2, x_3) in [
-         (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
-         (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]:
+        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
+        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]:
         rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])
 
         x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
@@ -129,9 +132,17 @@ def test_offpolicy_normalization(model_class):
 
 def test_sync_vec_normalize():
     env = DummyVecEnv([make_env])
+
+    assert unwrap_vec_normalize(env) is None
+
     env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
+
+    assert isinstance(unwrap_vec_normalize(env), VecNormalize)
+
     env = VecFrameStack(env, 1)
 
+    assert isinstance(unwrap_vec_normalize(env), VecNormalize)
+
     eval_env = DummyVecEnv([make_env])
     eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
     eval_env = VecFrameStack(eval_env, 1)
@@ -143,10 +154,12 @@ def test_sync_vec_normalize():
 
     obs = env.reset()
     original_obs = env.get_original_obs()
+    dummy_rewards = np.random.rand(10)
     # Normalization must be different
     assert not np.allclose(obs, eval_env.normalize_obs(original_obs))
 
     sync_envs_normalization(env, eval_env)
 
     # Now they must be synced
     assert np.allclose(obs, eval_env.normalize_obs(original_obs))
+    assert np.allclose(env.normalize_reward(dummy_rewards), eval_env.normalize_reward(dummy_rewards))
diff --git a/torchy_baselines/__init__.py b/torchy_baselines/__init__.py
@@ -4,4 +4,4 @@
 from torchy_baselines.sac import SAC
 from torchy_baselines.td3 import TD3
 
-__version__ = "0.2.0"
+__version__ = "0.2.3"