diff --git a/README.md b/README.md index 7274cffbdb..d3ee04bea8 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ All the following examples can be executed online using Google colab notebooks: | ------------------- | ---------------------------- | ------------------ | ------------------ | ------------------ | ------------------- | ------------------ | --------------------------------- | | A2C | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | ACER | :heavy_check_mark: | :heavy_check_mark: | :x: (5) | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | -| ACKTR | :heavy_check_mark: | :x: | :x: (5) | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | +| ACKTR | :heavy_check_mark: | :heavy_check_mark: | :x: (5) | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | | DDPG | :heavy_check_mark: | :x: | :heavy_check_mark: | :x: | :x: | :x: | :x: | | DQN | :heavy_check_mark: | :x: | :x: | :heavy_check_mark: | :x: | :x: | :x: | | GAIL (2) | :heavy_check_mark: | :x: | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: (4) | diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index a0ddc324d6..0135b884fe 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -9,8 +9,14 @@ Pre-Release 2.4.1a (WIP) -------------------------- - fixed computation of training metrics in TRPO and PPO1 +- added ``reset_num_timesteps`` keyword when calling ``learn()`` to continue tensorboard learning curves +- reduced the size taken by tensorboard logs (added a ``full_tensorboard_log`` parameter to enable full logging, which was the previous behavior) +- fixed image detection for tensorboard logging +- fixed ACKTR for recurrent policies +- fixed breaking changes introduced by newer gym versions (e.g. removal of ``gym.spaces.prng``) - fixed custom policy examples in the doc for DQN and DDPG - remove gym spaces patch for equality functions +- fixed tensorflow dependency: the CPU version was installed, overwriting tensorflow-gpu when present Release 2.4.0 (2019-01-17) diff --git a/setup.py b/setup.py index 8937299b56..d276f6d039 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,35 @@ -from setuptools import setup, find_packages import sys +import subprocess +from setuptools import setup, find_packages +from distutils.version import LooseVersion if sys.version_info.major != 3: print('This Python is only compatible with Python 3, but you are running ' 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) +# Check tensorflow installation to avoid +# breaking pre-installed tf gpu +install_tf, tf_gpu = False, False +try: + import tensorflow as tf + if tf.__version__ < LooseVersion('1.5.0'): + install_tf = True + # check if a gpu version is needed + tf_gpu = tf.test.is_gpu_available() +except ImportError: + install_tf = True + # Check if a nvidia gpu is present + for command in ['nvidia-smi', '/usr/bin/nvidia-smi', 'nvidia-smi.exe']: + if subprocess.call([command]) == 0: + tf_gpu = True + break + +tf_dependency = [] +if install_tf: + tf_dependency = ['tensorflow-gpu>=1.5.0'] if tf_gpu else ['tensorflow>=1.5.0'] + if tf_gpu: + print("A GPU was detected, tensorflow-gpu will be installed") + long_description = """ [![Build Status](https://travis-ci.com/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.com/hill-a/stable-baselines) [![Documentation Status](https://readthedocs.org/projects/stable-baselines/badge/?version=master)](https://stable-baselines.readthedocs.io/en/master/?badge=master) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) @@ -83,7 +108,6 @@ 'progressbar2', 'mpi4py', 'cloudpickle>=0.5.5', - 'tensorflow>=1.5.0', 'click', 'opencv-python', 'numpy', @@ -91,7 +115,7 @@ 'matplotlib', 'seaborn', 'glob2' - ], + ] + tf_dependency, extras_require={ 'tests': [ 'pytest==3.5.1', @@ -112,7 +136,7 @@ license="MIT", long_description=long_description, long_description_content_type='text/markdown', - version="2.4.1a", + version="2.4.1a0", ) # python setup.py sdist diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py index 00d6700556..5cbf3d7a3e 100644 --- a/stable_baselines/__init__.py +++ b/stable_baselines/__init__.py @@ -9,4 +9,4 @@ from stable_baselines.trpo_mpi import TRPO from stable_baselines.sac import SAC -__version__ = "2.4.1a" +__version__ = "2.4.1a0" diff --git a/stable_baselines/a2c/a2c.py b/stable_baselines/a2c/a2c.py index e54a157c00..fe41af4040 100644 --- a/stable_baselines/a2c/a2c.py +++ b/stable_baselines/a2c/a2c.py @@ -35,11 +35,13 @@ class A2C(ActorCriticRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance (used only for loading) :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5, learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='linear', verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None): + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -54,6 +56,7 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0. 
self.lr_schedule = lr_schedule self.learning_rate = learning_rate self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -132,15 +135,16 @@ def setup_model(self): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) - tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - tf.summary.histogram('advantage', self.advs_ph) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) + if self.full_tensorboard_log: + tf.summary.histogram('discounted_rewards', self.rewards_ph) + tf.summary.histogram('learning_rate', self.learning_rate) + tf.summary.histogram('advantage', self.advs_ph) + if tf_util.is_image(self.observation_space): + tf.summary.image('observation', train_model.obs_ph) + else: + tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha, epsilon=self.epsilon) @@ -184,7 +188,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) - if (1 + update) % 10 == 0: + if self.full_tensorboard_log and (1 + update) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( @@ -202,8 +206,13 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ return policy_loss, value_loss, policy_entropy - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, @@ -216,8 +225,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ for update in range(1, total_timesteps // self.n_batch + 1): # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward = runner.run() - _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, update, - writer) + _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, + self.num_timesteps // (self.n_batch + 1), writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) @@ -225,18 +234,20 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), - writer, update * (self.n_batch + 1)) + writer, 
self.num_timesteps) + + self.num_timesteps += self.n_batch + 1 if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update * self.n_batch) + logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) diff --git a/stable_baselines/acer/acer_simple.py b/stable_baselines/acer/acer_simple.py index 04e4c1d964..0f469d7c40 100644 --- a/stable_baselines/acer/acer_simple.py +++ b/stable_baselines/acer/acer_simple.py @@ -91,12 +91,15 @@ class ACER(ActorCriticRLModel): :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000, - replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1, - verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None): + replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, + alpha=0.99, delta=1, verbose=0, tensorboard_log=None, + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -119,6 +122,7 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, self.lr_schedule = lr_schedule self.num_procs = num_procs self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -361,17 +365,19 @@ def custom_getter(getter, name, *args, **kwargs): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph)) - tf.summary.histogram('rewards', self.reward_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) - tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.scalar('advantage', tf.reduce_mean(adv)) - tf.summary.histogram('advantage', adv) tf.summary.scalar('action_probabilty', tf.reduce_mean(self.mu_ph)) - tf.summary.histogram('action_probabilty', self.mu_ph) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) + + if self.full_tensorboard_log: + tf.summary.histogram('rewards', self.reward_ph) + tf.summary.histogram('learning_rate', self.learning_rate) + tf.summary.histogram('advantage', adv) + tf.summary.histogram('action_probabilty', self.mu_ph) + if tf_util.is_image(self.observation_space): + 
tf.summary.image('observation', train_model.obs_ph) + else: + tf.summary.histogram('observation', train_model.obs_ph) trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.rprop_alpha, epsilon=self.rprop_epsilon) @@ -429,7 +435,7 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) - if (1 + (steps / self.n_batch)) % 10 == 0: + if self.full_tensorboard_log and (1 + (steps / self.n_batch)) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() step_return = self.sess.run([self.summary] + self.run_ops, td_map, options=run_options, @@ -444,8 +450,13 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w return self.names_ops, step_return[1:] # strip off _train - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, @@ -474,7 +485,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ self.episode_reward = total_episode_reward_logger(self.episode_reward, rewards.reshape((self.n_envs, self.n_steps)), dones.reshape((self.n_envs, self.n_steps)), - writer, steps) + writer, self.num_timesteps) # reshape stuff correctly obs = obs.reshape(runner.batch_ob_shape) @@ -485,16 +496,16 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ masks = masks.reshape([runner.batch_ob_shape[0]]) names_ops, values_ops = self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, - steps, writer) + self.num_timesteps, writer) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break if self.verbose >= 1 and (int(steps / runner.n_batch) % log_interval == 0): - logger.record_tabular("total_timesteps", steps) + logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", int(steps / (time.time() - t_start))) # IMP: In EpisodicLife env, during training, we get done=True at each loss of life, # not just at the terminal state. Thus, this is mean until end of life, not end of episode. 
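The ``reset_num_timesteps`` keyword added to the ``learn()`` signatures above lets a second training call continue the same tensorboard learning curve (and the same run folder) instead of starting a new run at step 0. A minimal usage sketch, where the environment name, log directory and timestep counts are illustrative only:

    from stable_baselines import A2C

    # First call: a new tensorboard run (e.g. A2C_1) is created and num_timesteps starts at 0
    model = A2C('MlpPolicy', 'CartPole-v1', verbose=1, tensorboard_log='./a2c_tensorboard/')
    model.learn(total_timesteps=10000)

    # Second call: keep the previous timestep count and reuse the latest run folder,
    # so the learning curve continues instead of restarting from step 0
    model.learn(total_timesteps=10000, reset_num_timesteps=False)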
@@ -519,7 +530,10 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ dones = dones.reshape([runner.n_batch]) masks = masks.reshape([runner.batch_ob_shape[0]]) - self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, steps) + self._train_step(obs, actions, rewards, dones, mus, self.initial_state, masks, + self.num_timesteps) + + self.num_timesteps += self.n_batch return self diff --git a/stable_baselines/acktr/acktr_disc.py b/stable_baselines/acktr/acktr_disc.py index de9134f135..e289491fa4 100644 --- a/stable_baselines/acktr/acktr_disc.py +++ b/stable_baselines/acktr/acktr_disc.py @@ -39,11 +39,14 @@ class ACKTR(ActorCriticRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param async_eigen_decomp: (bool) Use async eigen decomposition :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0, - tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, policy_kwargs=None): + tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, + policy_kwargs=None, full_tensorboard_log=False): super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -60,6 +63,7 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, self.nprocs = nprocs self.tensorboard_log = tensorboard_log self.async_eigen_decomp = async_eigen_decomp + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -160,15 +164,17 @@ def setup_model(self): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.pg_lr_ph)) - tf.summary.histogram('learning_rate', self.pg_lr_ph) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - tf.summary.histogram('advantage', self.advs_ph) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) + + if self.full_tensorboard_log: + tf.summary.histogram('discounted_rewards', self.rewards_ph) + tf.summary.histogram('learning_rate', self.pg_lr_ph) + tf.summary.histogram('advantage', self.advs_ph) + if tf_util.is_image(self.observation_space): + tf.summary.image('observation', train_model.obs_ph) + else: + tf.summary.histogram('observation', train_model.obs_ph) with tf.variable_scope("kfac", reuse=False, custom_getter=tf_util.outer_scope_getter("kfac")): with tf.device('/gpu:0'): @@ -219,7 +225,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) 
- if (1 + update) % 10 == 0: + if self.full_tensorboard_log and (1 + update) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run( @@ -236,8 +242,13 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ return policy_loss, value_loss, policy_entropy - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) self.n_batch = self.n_envs * self.n_steps @@ -282,7 +293,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ # true_reward is the reward without discount obs, states, rewards, masks, actions, values, true_reward = runner.run() policy_loss, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, - update, writer) + self.num_timesteps // (self.n_batch + 1), + writer) n_seconds = time.time() - t_start fps = int((update * self.n_batch) / n_seconds) @@ -290,18 +302,18 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), - writer, update * (self.n_batch + 1)) + writer, self.num_timesteps) if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
- if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update * self.n_batch) + logger.record_tabular("total_timesteps", self.num_timesteps) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) @@ -309,6 +321,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() + self.num_timesteps += self.n_batch + 1 + coord.request_stop() coord.join(enqueue_threads) diff --git a/stable_baselines/acktr/kfac.py b/stable_baselines/acktr/kfac.py index 8f70143285..607423e6a3 100644 --- a/stable_baselines/acktr/kfac.py +++ b/stable_baselines/acktr/kfac.py @@ -158,7 +158,8 @@ def _search_factors(gradient, graph): if len(b_tensor.get_shape()) > 0 and b_tensor.get_shape()[0].value is None: b_tensor.set_shape(b_tensor_shape) b_tensors.append(b_tensor) - fprop_op_name = op_types.append('UNK-' + fprop_op.op_def.name) + fprop_op_name = 'UNK-' + fprop_op.op_def.name + op_types.append(fprop_op_name) return {'opName': fprop_op_name, 'op': fprop_op, 'fpropFactors': f_tensors, 'bpropFactors': b_tensors} diff --git a/stable_baselines/common/base_class.py b/stable_baselines/common/base_class.py index ed01fcae8d..a54a00825d 100644 --- a/stable_baselines/common/base_class.py +++ b/stable_baselines/common/base_class.py @@ -39,6 +39,7 @@ def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base, pol self.action_space = None self.n_envs = None self._vectorize_action = False + self.num_timesteps = 0 if env is not None: if isinstance(env, str): @@ -115,6 +116,21 @@ def set_env(self, env): self.env = env + def _init_num_timesteps(self, reset_num_timesteps=True): + """ + Initialize and reset num_timesteps (total timesteps since beginning of training) + if needed. Mainly used for logging and plotting (tensorboard). + + :param reset_num_timesteps: (bool) Set it to False when continuing training + to avoid creating new plotting curves in tensorboard. + :return: (bool) Whether a new tensorboard log needs to be created + """ + if reset_num_timesteps: + self.num_timesteps = 0 + + new_tb_log = self.num_timesteps == 0 + return new_tb_log + @abstractmethod def setup_model(self): """ @@ -135,7 +151,8 @@ def _setup_learn(self, seed): set_global_seeds(seed) @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="run"): + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="run", + reset_num_timesteps=True): """ Return a trained model. @@ -145,6 +162,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ It takes the local and global variables. If it returns False, training is aborted. :param log_interval: (int) The number of timesteps before logging. 
:param tb_log_name: (str) the name of the run for tensorboard log + :param reset_num_timesteps: (bool) whether or not to reset the current timestep number (used in logging) :return: (BaseRLModel) the trained model """ pass @@ -333,7 +351,8 @@ def setup_model(self): pass @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="run"): + def learn(self, total_timesteps, callback=None, seed=None, + log_interval=100, tb_log_name="run", reset_num_timesteps=True): pass def predict(self, observation, state=None, mask=None, deterministic=False): @@ -468,7 +487,8 @@ def setup_model(self): pass @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="run"): + def learn(self, total_timesteps, callback=None, seed=None, + log_interval=100, tb_log_name="run", reset_num_timesteps=True): pass @abstractmethod @@ -544,23 +564,27 @@ def __exit__(self, exc_type, exc_val, exc_tb): class TensorboardWriter: - def __init__(self, graph, tensorboard_log_path, tb_log_name): + def __init__(self, graph, tensorboard_log_path, tb_log_name, new_tb_log=True): """ Create a Tensorboard writer for a code segment, and saves it to the log directory as its own run :param graph: (Tensorflow Graph) the model graph :param tensorboard_log_path: (str) the save path for the log (can be None for no logging) :param tb_log_name: (str) the name of the run for tensorboard log + :param new_tb_log: (bool) whether or not to create a new logging folder for tensorboard """ self.graph = graph self.tensorboard_log_path = tensorboard_log_path self.tb_log_name = tb_log_name self.writer = None + self.new_tb_log = new_tb_log def __enter__(self): if self.tensorboard_log_path is not None: - save_path = os.path.join(self.tensorboard_log_path, - "{}_{}".format(self.tb_log_name, self._get_latest_run_id() + 1)) + latest_run_id = self._get_latest_run_id() + if self.new_tb_log: + latest_run_id = latest_run_id + 1 + save_path = os.path.join(self.tensorboard_log_path, "{}_{}".format(self.tb_log_name, latest_run_id)) self.writer = tf.summary.FileWriter(save_path, graph=self.graph) return self.writer diff --git a/stable_baselines/common/distributions.py b/stable_baselines/common/distributions.py index 9b14591ba9..354102cd3d 100644 --- a/stable_baselines/common/distributions.py +++ b/stable_baselines/common/distributions.py @@ -186,7 +186,10 @@ def __init__(self, n_vec): :param n_vec: ([int]) the vectors """ - self.n_vec = n_vec + # Cast the variable because tf does not allow uint32 + self.n_vec = n_vec.astype(np.int32) + # Check that the cast was valid + assert (self.n_vec > 0).all(), "Casting uint32 to int32 was invalid" def probability_distribution_class(self): return MultiCategoricalProbabilityDistribution diff --git a/stable_baselines/common/misc_util.py b/stable_baselines/common/misc_util.py index 5532805168..e3ef88c4f8 100644 --- a/stable_baselines/common/misc_util.py +++ b/stable_baselines/common/misc_util.py @@ -89,7 +89,9 @@ def set_global_seeds(seed): tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) - gym.spaces.prng.seed(seed) + # prng was removed in the latest gym version + if hasattr(gym.spaces, 'prng'): + gym.spaces.prng.seed(seed) def pretty_eta(seconds_left): diff --git a/stable_baselines/common/tf_util.py b/stable_baselines/common/tf_util.py index d7a3d95f06..fd64a5147d 100644 --- a/stable_baselines/common/tf_util.py +++ b/stable_baselines/common/tf_util.py @@ -11,6 +11,19 @@ from stable_baselines import logger +def is_image(tensor): 
+ """ + Check if a tensor has the shape of + a valid image for tensorboard logging. + Valid image: RGB, RGBD, GrayScale + + :param tensor: (np.ndarray or tf.placeholder) + :return: (bool) + """ + + return len(tensor.shape) == 3 and tensor.shape[-1] in [1, 3, 4] + + def switch(condition, then_expression, else_expression): """ Switches between two operations depending on a scalar value (int or bool). @@ -210,7 +223,7 @@ def function(inputs, outputs, updates=None, givens=None): Take a bunch of tensorflow placeholders and expressions computed based on those placeholders and produces f(inputs) -> outputs. Function f takes values to be fed to the input's placeholders and produces the values of the expressions - in outputs. Just like a Theano function. + in outputs. Just like a Theano function. Input values can be passed in the same order as inputs or can be provided as kwargs based on placeholder name (passed to constructor or accessible via placeholder.op.name). @@ -225,13 +238,13 @@ def function(inputs, outputs, updates=None, givens=None): >>> assert lin(2) == 6 >>> assert lin(x=3) == 9 >>> assert lin(2, 2) == 10 - + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned value will also have the same shape. :param updates: ([tf.Operation] or tf.Operation) list of update functions or single update function that will be run whenever - the function is called. The return is ignored. + the function is called. The return is ignored. :param givens: (dict) the values known for the output """ if isinstance(outputs, list): @@ -254,7 +267,7 @@ def __init__(self, inputs, outputs, updates, givens): value will also have the same shape. :param updates: ([tf.Operation] or tf.Operation) list of update functions or single update function that will be run whenever - the function is called. The return is ignored. + the function is called. The return is ignored. 
:param givens: (dict) the values known for the output """ for inpt in inputs: diff --git a/stable_baselines/ddpg/ddpg.py b/stable_baselines/ddpg/ddpg.py index ef64d4404e..f69dce5f40 100644 --- a/stable_baselines/ddpg/ddpg.py +++ b/stable_baselines/ddpg/ddpg.py @@ -168,6 +168,8 @@ class DDPG(OffPolicyRLModel): :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, nb_train_steps=50, @@ -176,7 +178,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n normalize_returns=False, enable_popart=False, observation_range=(-5., 5.), critic_l2_reg=0., return_range=(-np.inf, np.inf), actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., render=False, render_eval=False, memory_limit=100, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None): + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): # TODO: replay_buffer refactoring super(DDPG, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DDPGPolicy, @@ -208,6 +210,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n self.nb_rollout_steps = nb_rollout_steps self.memory_limit = memory_limit self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log # init self.graph = None @@ -260,6 +263,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n self.summary = None self.episode_reward = None self.tb_seen_steps = None + self.target_params = None if _init_setup_model: @@ -361,7 +365,8 @@ def setup_model(self): self.target_q = self.rewards + (1. - self.terminals1) * self.gamma * q_obs1 tf.summary.scalar('critic_target', tf.reduce_mean(self.critic_target)) - tf.summary.histogram('critic_target', self.critic_target) + if self.full_tensorboard_log: + tf.summary.histogram('critic_target', self.critic_target) # Set up parts. 
if self.normalize_returns and self.enable_popart: @@ -371,13 +376,15 @@ def setup_model(self): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(self.rewards)) - tf.summary.histogram('rewards', self.rewards) tf.summary.scalar('param_noise_stddev', tf.reduce_mean(self.param_noise_stddev)) - tf.summary.histogram('param_noise_stddev', self.param_noise_stddev) - if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]: - tf.summary.image('observation', self.obs_train) - else: - tf.summary.histogram('observation', self.obs_train) + + if self.full_tensorboard_log: + tf.summary.histogram('rewards', self.rewards) + tf.summary.histogram('param_noise_stddev', self.param_noise_stddev) + if len(self.observation_space.shape) == 3 and self.observation_space.shape[0] in [1, 3, 4]: + tf.summary.image('observation', self.obs_train) + else: + tf.summary.histogram('observation', self.obs_train) with tf.variable_scope("Adam_mpi", reuse=False): self._setup_actor_optimizer() @@ -631,7 +638,7 @@ def _train_step(self, step, writer, log=False): if writer is not None: # run loss backprop with summary if the step_id was not already logged (can happen with the right # parameters as the step value is only an estimate) - if log and step not in self.tb_seen_steps: + if self.full_tensorboard_log and log and step not in self.tb_seen_steps: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, actor_grads, actor_loss, critic_grads, critic_loss = \ @@ -737,8 +744,13 @@ def _reset(self): self.param_noise_stddev: self.param_noise.current_stddev, }) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) # a list for tensorboard logging, to prevent logging with the same step number, if it already occured @@ -800,9 +812,10 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ ep_rew = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, - writer, total_steps) + writer, self.num_timesteps) step += 1 total_steps += 1 + self.num_timesteps += 1 if rank == 0 and self.render: self.env.render() episode_reward += reward @@ -814,9 +827,9 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ self._store_transition(obs, action, reward, new_obs, done) obs = new_obs if callback is not None: - # Only stop training if return value is False, not when it is None. This is for backwards - # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + # Only stop training if return value is False, not when it is None. + # This is for backwards compatibility with callbacks that have no return statement. 
+ if callback(locals(), globals()) is False: return self if done: @@ -847,7 +860,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ # weird equation to deal with the fact the nb_train_steps will be different # to nb_rollout_steps step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) + - total_steps - self.nb_rollout_steps) + self.num_timesteps - self.nb_rollout_steps) critic_loss, actor_loss = self._train_step(step, writer, log=t_train == 0) epoch_critic_losses.append(critic_loss) diff --git a/stable_baselines/deepq/build_graph.py b/stable_baselines/deepq/build_graph.py index 02542b568f..b6a9d39589 100644 --- a/stable_baselines/deepq/build_graph.py +++ b/stable_baselines/deepq/build_graph.py @@ -319,8 +319,9 @@ def act(obs, reset=None, update_param_noise_threshold=None, update_param_noise_s return act, obs_phs -def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, gamma=1.0, double_q=True, - scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): +def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, + gamma=1.0, double_q=True, scope="deepq", reuse=None, + param_noise=False, param_noise_filter_func=None, full_tensorboard_log=False): """ Creates the train function: @@ -340,6 +341,8 @@ def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping= :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly :return: (tuple) @@ -410,9 +413,11 @@ def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping= weighted_error = tf.reduce_mean(importance_weights_ph * errors) tf.summary.scalar("td_error", tf.reduce_mean(td_error)) - tf.summary.histogram("td_error", td_error) tf.summary.scalar("loss", weighted_error) + if full_tensorboard_log: + tf.summary.histogram("td_error", td_error) + # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), @@ -429,15 +434,15 @@ def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping= with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph)) - tf.summary.histogram('rewards', rew_t_ph) tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph)) - tf.summary.histogram('importance_weights', importance_weights_ph) - # Valid image: RGB, RGBD, GrayScale - is_image = len(obs_phs[0].shape) == 3 and obs_phs[0].shape[-1] in [1, 3, 4] - if is_image: - tf.summary.image('observation', obs_phs[0]) - elif len(obs_phs[0].shape) == 1: - tf.summary.histogram('observation', obs_phs[0]) + + if full_tensorboard_log: + tf.summary.histogram('rewards', rew_t_ph) + tf.summary.histogram('importance_weights', importance_weights_ph) + if tf_util.is_image(obs_phs[0]): + tf.summary.image('observation', obs_phs[0]) + elif len(obs_phs[0].shape) == 1: + tf.summary.histogram('observation', obs_phs[0]) optimize_expr = optimizer.apply_gradients(gradients) diff --git a/stable_baselines/deepq/dqn.py b/stable_baselines/deepq/dqn.py index edee7f5416..6247fd288b 100644 --- a/stable_baselines/deepq/dqn.py +++ 
b/stable_baselines/deepq/dqn.py @@ -45,6 +45,8 @@ class DQN(OffPolicyRLModel): :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, @@ -52,7 +54,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=5000 learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None): + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, @@ -76,6 +78,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=5000 self.learning_rate = learning_rate self.gamma = gamma self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -95,6 +98,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=5000 self.setup_model() def setup_model(self): + with SetVerbosity(self.verbose): assert not isinstance(self.action_space, gym.spaces.Box), \ "Error: DQN cannot output a gym.spaces.Box action space." @@ -122,7 +126,8 @@ def setup_model(self): gamma=self.gamma, grad_norm_clipping=10, param_noise=self.param_noise, - sess=self.sess + sess=self.sess, + full_tensorboard_log=self.full_tensorboard_log ) self.proba_step = self.step_model.proba_step self.params = find_trainable_variables("deepq") @@ -133,8 +138,13 @@ def setup_model(self): self.summary = tf.summary.merge_all() - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) # Create the replay buffer @@ -160,16 +170,16 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ reset = True self.episode_reward = np.zeros((1,)) - for step in range(total_timesteps): + for _ in range(total_timesteps): if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break # Take action and update exploration to the newest value kwargs = {} if not self.param_noise: - update_eps = self.exploration.value(step) + update_eps = self.exploration.value(self.num_timesteps) update_param_noise_threshold = 0. else: update_eps = 0. 
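The ``full_tensorboard_log`` flag threaded through ``build_train()`` and ``DQN`` above defaults to False, so the histogram and image summaries that previously inflated the log size are skipped unless explicitly re-enabled; the scalar summaries are always written. A minimal usage sketch, where the environment name, log path and timestep count are illustrative only:

    from stable_baselines import DQN

    # Default behavior after this patch: only scalar summaries are written,
    # which keeps the tensorboard log small
    model = DQN('MlpPolicy', 'CartPole-v1', tensorboard_log='./dqn_tensorboard/')
    model.learn(total_timesteps=25000)

    # Opt back into the previous, verbose behavior (td_error/reward histograms,
    # observation images) at the cost of much larger log files
    model = DQN('MlpPolicy', 'CartPole-v1', tensorboard_log='./dqn_tensorboard/',
                full_tensorboard_log=True)
    model.learn(total_timesteps=25000)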
@@ -178,8 +188,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = \ - -np.log(1. - self.exploration.value(step) + - self.exploration.value(step) / float(self.env.action_space.n)) + -np.log(1. - self.exploration.value(self.num_timesteps) + + self.exploration.value(self.num_timesteps) / float(self.env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True @@ -196,7 +206,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ ep_rew = np.array([rew]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer, - step) + self.num_timesteps) episode_rewards[-1] += rew if done: @@ -205,10 +215,11 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ episode_rewards.append(0.0) reset = True - if step > self.learning_starts and step % self.train_freq == 0: + if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if self.prioritized_replay: - experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step)) + experience = self.replay_buffer.sample(self.batch_size, + beta=self.beta_schedule.value(self.num_timesteps)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size) @@ -217,17 +228,17 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ if writer is not None: # run loss backprop with summary, but once every 100 steps save the metadata # (memory, compute time, ...) - if (1 + step) % 100 == 0: + if (1 + self.num_timesteps) % 100 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess, options=run_options, run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % step) + writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps) else: summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) - writer.add_summary(summary, step) + writer.add_summary(summary, self.num_timesteps) else: _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights, sess=self.sess) @@ -236,7 +247,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ new_priorities = np.abs(td_errors) + self.prioritized_replay_eps self.replay_buffer.update_priorities(batch_idxes, new_priorities) - if step > self.learning_starts and step % self.target_network_update_freq == 0: + if self.num_timesteps > self.learning_starts and \ + self.num_timesteps % self.target_network_update_freq == 0: # Update target network periodically. 
self.update_target(sess=self.sess) @@ -247,12 +259,15 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ num_episodes = len(episode_rewards) if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: - logger.record_tabular("steps", step) + logger.record_tabular("steps", self.num_timesteps) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) - logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(step))) + logger.record_tabular("% time spent exploring", + int(100 * self.exploration.value(self.num_timesteps))) logger.dump_tabular() + self.num_timesteps += 1 + return self def predict(self, observation, state=None, mask=None, deterministic=True): diff --git a/stable_baselines/deepq/experiments/train_cartpole.py b/stable_baselines/deepq/experiments/train_cartpole.py index 9cef8df718..9c1aee0e9d 100644 --- a/stable_baselines/deepq/experiments/train_cartpole.py +++ b/stable_baselines/deepq/experiments/train_cartpole.py @@ -19,7 +19,7 @@ def callback(lcl, _glb): mean_100ep_reward = -np.inf else: mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) - is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 + is_solved = lcl['self'].num_timesteps > 100 and mean_100ep_reward >= 199 return not is_solved diff --git a/stable_baselines/gail/model.py b/stable_baselines/gail/model.py index 4af84d7ac1..720802b524 100644 --- a/stable_baselines/gail/model.py +++ b/stable_baselines/gail/model.py @@ -31,6 +31,8 @@ class GAIL(ActorCriticRLModel): :param d_stepsize: (float) the reward giver stepsize :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3, @@ -66,15 +68,16 @@ def setup_model(self): self.trpo.setup_model() - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL"): - self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name) + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL", + reset_num_timesteps=True): + self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name, reset_num_timesteps) return self def predict(self, observation, state=None, mask=None, deterministic=False): return self.trpo.predict(observation, state, mask, deterministic=deterministic) - def action_probability(self, observation, state=None, mask=None): - return self.trpo.action_probability(observation, state, mask) + def action_probability(self, observation, state=None, mask=None, actions=None): + return self.trpo.action_probability(observation, state, mask, actions) def save(self, save_path): self.trpo.save(save_path) diff --git a/stable_baselines/her/her.py b/stable_baselines/her/her.py index 87d5b3e1b6..b1b2589b8e 100644 --- a/stable_baselines/her/her.py +++ b/stable_baselines/her/her.py @@ -93,7 +93,8 @@ def setup_model(self): with self.graph.as_default(): pass - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="HER"): + def learn(self, total_timesteps, callback=None, seed=None, 
log_interval=100, tb_log_name="HER", + reset_num_timesteps=True): with SetVerbosity(self.verbose): self._setup_learn(seed) @@ -102,7 +103,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ def predict(self, observation, state=None, mask=None, deterministic=False): pass - def action_probability(self, observation, state=None, mask=None): + def action_probability(self, observation, state=None, mask=None, actions=None): pass def save(self, save_path): diff --git a/stable_baselines/ppo1/pposgd_simple.py b/stable_baselines/ppo1/pposgd_simple.py index 1da9cf3f37..36c5547cbc 100644 --- a/stable_baselines/ppo1/pposgd_simple.py +++ b/stable_baselines/ppo1/pposgd_simple.py @@ -38,11 +38,14 @@ class PPO1(ActorCriticRLModel): :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, - schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None): + schedule='linear', verbose=0, tensorboard_log=None, + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -58,6 +61,7 @@ def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_p self.adam_epsilon = adam_epsilon self.schedule = schedule self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -148,17 +152,19 @@ def setup_model(self): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) - tf.summary.histogram('discounted_rewards', ret) tf.summary.scalar('learning_rate', tf.reduce_mean(self.optim_stepsize)) - tf.summary.histogram('learning_rate', self.optim_stepsize) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) - tf.summary.histogram('advantage', atarg) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_param)) - tf.summary.histogram('clip_range', self.clip_param) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', obs_ph) - else: - tf.summary.histogram('observation', obs_ph) + + if self.full_tensorboard_log: + tf.summary.histogram('discounted_rewards', ret) + tf.summary.histogram('learning_rate', self.optim_stepsize) + tf.summary.histogram('advantage', atarg) + tf.summary.histogram('clip_range', self.clip_param) + if tf_util.is_image(self.observation_space): + tf.summary.image('observation', obs_ph) + else: + tf.summary.histogram('observation', obs_ph) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step @@ -173,8 +179,13 @@ def setup_model(self): self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def 
learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ @@ -202,7 +213,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break if total_timesteps and timesteps_so_far >= total_timesteps: break @@ -227,7 +238,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ self.episode_reward = total_episode_reward_logger(self.episode_reward, seg["true_rew"].reshape((self.n_envs, -1)), seg["dones"].reshape((self.n_envs, -1)), - writer, timesteps_so_far) + writer, self.num_timesteps) # predicted value function before udpate vpredbefore = seg["vpred"] @@ -248,13 +259,13 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ # list of tuples, each of which gives the loss for a minibatch losses = [] for i, batch in enumerate(dataset.iterate_once(optim_batchsize)): - steps = (timesteps_so_far + + steps = (self.num_timesteps + k * optim_batchsize + int(i * (optim_batchsize / len(dataset.data_map)))) if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata # (memory, compute time, ...) - if (1 + k) % 10 == 0: + if self.full_tensorboard_log and (1 + k) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"], @@ -302,10 +313,12 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) - timesteps_so_far += MPI.COMM_WORLD.allreduce(seg["total_timestep"]) + current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"]) + timesteps_so_far += current_it_timesteps + self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) + logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() diff --git a/stable_baselines/ppo2/ppo2.py b/stable_baselines/ppo2/ppo2.py index bdf264e914..432fac2c33 100644 --- a/stable_baselines/ppo2/ppo2.py +++ b/stable_baselines/ppo2/ppo2.py @@ -37,11 +37,14 @@ class PPO2(ActorCriticRLModel): :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly """ def __init__(self, policy, env, gamma=0.99, n_steps=128, 
ent_coef=0.01, learning_rate=2.5e-4, vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, verbose=0, - tensorboard_log=None, _init_setup_model=True, policy_kwargs=None): + tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, + full_tensorboard_log=False): super(PPO2, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -57,6 +60,7 @@ def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning self.nminibatches = nminibatches self.noptepochs = noptepochs self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log self.graph = None self.sess = None @@ -156,8 +160,9 @@ def setup_model(self): with tf.variable_scope('model'): self.params = tf.trainable_variables() - for var in self.params: - tf.summary.histogram(var.name, var) + if self.full_tensorboard_log: + for var in self.params: + tf.summary.histogram(var.name, var) grads = tf.gradients(loss, self.params) if self.max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm) @@ -169,21 +174,23 @@ def setup_model(self): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.rewards_ph)) - tf.summary.histogram('discounted_rewards', self.rewards_ph) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) - tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.scalar('advantage', tf.reduce_mean(self.advs_ph)) - tf.summary.histogram('advantage', self.advs_ph) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_range_ph)) - tf.summary.histogram('clip_range', self.clip_range_ph) tf.summary.scalar('old_neglog_action_probabilty', tf.reduce_mean(self.old_neglog_pac_ph)) - tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) - tf.summary.histogram('old_value_pred', self.old_vpred_ph) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', train_model.obs_ph) - else: - tf.summary.histogram('observation', train_model.obs_ph) + + if self.full_tensorboard_log: + tf.summary.histogram('discounted_rewards', self.rewards_ph) + tf.summary.histogram('learning_rate', self.learning_rate_ph) + tf.summary.histogram('advantage', self.advs_ph) + tf.summary.histogram('clip_range', self.clip_range_ph) + tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) + tf.summary.histogram('old_value_pred', self.old_vpred_ph) + if tf_util.is_image(self.observation_space): + tf.summary.image('observation', train_model.obs_ph) + else: + tf.summary.histogram('observation', train_model.obs_ph) self.train_model = train_model self.act_model = act_model @@ -230,7 +237,7 @@ def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, va if writer is not None: # run loss backprop with summary, but once every 10 runs save the metadata (memory, compute time, ...) 
- if (1 + update) % 10 == 0: + if self.full_tensorboard_log and (1 + update) % 10 == 0: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run( @@ -248,12 +255,16 @@ def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, va return policy_loss, value_loss, policy_entropy, approxkl, clipfrac - def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2"): + def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", + reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) self.cliprange = get_schedule_fn(self.cliprange) - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) @@ -275,18 +286,21 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo ep_info_buf.extend(ep_infos) mb_loss_vals = [] if states is None: # nonrecurrent version + update_fac = self.n_batch // self.nminibatches // self.noptepochs + 1 inds = np.arange(self.n_batch) for epoch_num in range(self.noptepochs): np.random.shuffle(inds) for start in range(0, self.n_batch, batch_size): - timestep = ((update * self.noptepochs * self.n_batch + epoch_num * self.n_batch + start) // - batch_size) + timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_batch + epoch_num * + self.n_batch + start) // batch_size) end = start + batch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, writer=writer, update=timestep)) + self.num_timesteps += (self.n_batch * self.noptepochs) // batch_size * update_fac else: # recurrent version + update_fac = self.n_batch // self.nminibatches // self.noptepochs // self.n_steps + 1 assert self.n_envs % self.nminibatches == 0 env_indices = np.arange(self.n_envs) flat_indices = np.arange(self.n_envs * self.n_steps).reshape(self.n_envs, self.n_steps) @@ -294,8 +308,8 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo for epoch_num in range(self.noptepochs): np.random.shuffle(env_indices) for start in range(0, self.n_envs, envs_per_batch): - timestep = ((update * self.noptepochs * self.n_envs + epoch_num * self.n_envs + start) // - envs_per_batch) + timestep = self.num_timesteps // update_fac + ((self.noptepochs * self.n_envs + epoch_num * + self.n_envs + start) // envs_per_batch) end = start + envs_per_batch mb_env_inds = env_indices[start:end] mb_flat_inds = flat_indices[mb_env_inds].ravel() @@ -303,6 +317,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo mb_states = states[mb_env_inds] mb_loss_vals.append(self._train_step(lr_now, cliprangenow, *slices, update=timestep, writer=writer, states=mb_states)) + self.num_timesteps += (self.n_envs * self.noptepochs) // envs_per_batch * update_fac loss_vals = np.mean(mb_loss_vals, axis=0) t_now = time.time() @@ -312,13 +327,13 @@ def learn(self, total_timesteps, callback=None, seed=None, 
log_interval=1, tb_lo self.episode_reward = total_episode_reward_logger(self.episode_reward, true_reward.reshape((self.n_envs, self.n_steps)), masks.reshape((self.n_envs, self.n_steps)), - writer, update * (self.n_batch + 1)) + writer, self.num_timesteps) if self.verbose >= 1 and (update % log_interval == 0 or update == 1): explained_var = explained_variance(values, returns) logger.logkv("serial_timesteps", update * self.n_steps) logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update * self.n_batch) + logger.logkv("total_timesteps", self.num_timesteps) logger.logkv("fps", fps) logger.logkv("explained_variance", float(explained_var)) logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) @@ -331,7 +346,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. - if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break return self diff --git a/stable_baselines/sac/sac.py b/stable_baselines/sac/sac.py index 10f6858030..10b1f30199 100644 --- a/stable_baselines/sac/sac.py +++ b/stable_baselines/sac/sac.py @@ -57,13 +57,16 @@ class SAC(OffPolicyRLModel): :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + Note: this has no effect on SAC logging for now """ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=1, batch_size=64, tau=0.005, ent_coef='auto', target_update_interval=1, - gradient_steps=1, target_entropy='auto', - verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None): + gradient_steps=1, target_entropy='auto', verbose=0, tensorboard_log=None, + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs) @@ -95,6 +98,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000 self.summary = None self.policy_tf = None self.target_entropy = target_entropy + self.full_tensorboard_log = full_tensorboard_log self.obs_target = None self.target_policy = None @@ -341,8 +345,14 @@ def _train_step(self, step, writer, learning_rate): return policy_loss, qf1_loss, qf2_loss, value_loss, entropy - def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_log_name="SAC"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, + log_interval=4, tb_log_name="SAC", reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: + self._setup_learn(seed) # Transform to callable if needed @@ -368,7 +378,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_lo # Before training starts, randomly sample actions # from a 
uniform distribution for better exploration. # Afterwards, use the learned policy. - if step < self.learning_starts: + if self.num_timesteps < self.learning_starts: action = self.env.action_space.sample() # No need to rescale when sampling random action rescaled_action = action @@ -395,13 +405,13 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_lo ep_reward = np.array([reward]).reshape((1, -1)) ep_done = np.array([done]).reshape((1, -1)) self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_reward, - ep_done, writer, step) + ep_done, writer, self.num_timesteps) if step % self.train_freq == 0: mb_infos_vals = [] # Update policy, critics and target networks for grad_step in range(self.gradient_steps): - if step < self.batch_size or step < self.learning_starts: + if self.num_timesteps < self.batch_size or self.num_timesteps < self.learning_starts: break n_updates += 1 # Compute current learning_rate @@ -429,6 +439,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_lo mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) + self.num_timesteps += 1 # Display training infos if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0: fps = int(step / (time.time() - start_time)) @@ -443,7 +454,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=4, tb_lo if len(infos_values) > 0: for (name, val) in zip(self.infos_names, infos_values): logger.logkv(name, val) - logger.logkv("total timesteps", step) + logger.logkv("total timesteps", self.num_timesteps) logger.dumpkvs() # Reset infos: infos_values = [] diff --git a/stable_baselines/trpo_mpi/trpo_mpi.py b/stable_baselines/trpo_mpi/trpo_mpi.py index d1ed648b7a..55176baa0b 100644 --- a/stable_baselines/trpo_mpi/trpo_mpi.py +++ b/stable_baselines/trpo_mpi/trpo_mpi.py @@ -15,32 +15,36 @@ from stable_baselines.common.policies import ActorCriticPolicy from stable_baselines.a2c.utils import find_trainable_variables, total_episode_reward_logger from stable_baselines.trpo_mpi.utils import traj_segment_generator, add_vtarg_and_adv, flatten_lists + + # from stable_baselines.gail.statistics import Stats class TRPO(ActorCriticRLModel): + """ + Trust Region Policy Optimization (https://arxiv.org/abs/1502.05477) + + :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) 
+ :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) + :param gamma: (float) the discount value + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param max_kl: (float) the Kullback-Leibler loss threshold + :param cg_iters: (int) the number of iterations for the conjugate gradient calculation + :param lam: (float) GAE factor + :param entcoeff: (float) the weight for the entropy loss + :param cg_damping: (float) the conjugate gradient damping factor + :param vf_stepsize: (float) the value function stepsize + :param vf_iters: (int) the value function's number of iterations for learning + :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug + :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) + :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance + :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param full_tensorboard_log: (bool) enable additional logging when using tensorboard + WARNING: this logging can take a lot of space quickly + """ def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None): - """ - learns a TRPO policy using the given environment - - :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, CnnLstmPolicy, ...) - :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) - :param gamma: (float) the discount value - :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) - :param max_kl: (float) the kullback leiber loss threshold - :param cg_iters: (int) the number of iterations for the conjugate gradient calculation - :param lam: (float) GAE factor - :param entcoeff: (float) the weight for the entropy loss - :param cg_damping: (float) the compute gradient dampening factor - :param vf_stepsize: (float) the value function stepsize - :param vf_iters: (int) the value function's number iterations for learning - :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug - :param tensorboard_log: (str) the log location for tensorboard (if None, no logging) - :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance - :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation - """ + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): super(TRPO, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) @@ -55,6 +59,7 @@ def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.0 self.vf_stepsize = vf_stepsize self.entcoeff = entcoeff self.tensorboard_log = tensorboard_log + self.full_tensorboard_log = full_tensorboard_log # GAIL Params self.pretrained_weight = None @@ -181,7 +186,7 @@ def setup_model(self): self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), - tf_util.get_globals_vars("model"))]) + tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action,
atarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg], fvp) @@ -220,17 +225,19 @@ def allmean(arr): with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) - tf.summary.histogram('discounted_rewards', ret) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) - tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) - tf.summary.histogram('advantage', atarg) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) - tf.summary.histogram('kl_clip_range', self.max_kl) - if len(self.observation_space.shape) == 3: - tf.summary.image('observation', observation) - else: - tf.summary.histogram('observation', observation) + + if self.full_tensorboard_log: + tf.summary.histogram('discounted_rewards', ret) + tf.summary.histogram('learning_rate', self.vf_stepsize) + tf.summary.histogram('advantage', atarg) + tf.summary.histogram('kl_clip_range', self.max_kl) + if tf_util.is_image(self.observation_space): + tf.summary.image('observation', observation) + else: + tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean @@ -249,8 +256,13 @@ def allmean(arr): tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO"): - with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer: + def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO", + reset_num_timesteps=True): + + new_tb_log = self._init_num_timesteps(reset_num_timesteps) + + with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ + as writer: self._setup_learn(seed) with self.sess.as_default(): @@ -282,7 +294,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ if callback is not None: # Only stop training if return value is False, not when it is None. This is for backwards # compatibility with callbacks that have no return statement. 
- if callback(locals(), globals()) == False: + if callback(locals(), globals()) is False: break if total_timesteps and timesteps_so_far >= total_timesteps: break @@ -291,6 +303,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ def fisher_vector_product(vec): return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec + # ------------------ Update G ------------------ logger.log("Optimizing Policy...") # g_step = 1 when not using GAIL @@ -315,7 +328,7 @@ def fisher_vector_product(vec): seg["true_rew"].reshape( (self.n_envs, -1)), seg["dones"].reshape((self.n_envs, -1)), - writer, timesteps_so_far) + writer, self.num_timesteps) args = seg["ob"], seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] @@ -323,15 +336,16 @@ def fisher_vector_product(vec): self.assign_old_eq_new(sess=self.sess) with self.timed("computegrad"): - steps = timesteps_so_far + (k + 1) * (seg["total_timestep"] / self.g_step) + steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step) run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - run_metadata = tf.RunMetadata() + run_metadata = tf.RunMetadata() if self.full_tensorboard_log else None # run loss backprop with summary, and save the metadata (memory, compute time, ...) if writer is not None: summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, options=run_options, run_metadata=run_metadata) - writer.add_run_metadata(run_metadata, 'step%d' % steps) + if self.full_tensorboard_log: + writer.add_run_metadata(run_metadata, 'step%d' % steps) writer.add_summary(summary, steps) else: _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess, @@ -432,11 +446,13 @@ def fisher_vector_product(vec): logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) - timesteps_so_far += seg["total_timestep"] + current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"]) + timesteps_so_far += current_it_timesteps + self.num_timesteps += current_it_timesteps iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) + logger.record_tabular("TimestepsSoFar", self.num_timesteps) logger.record_tabular("TimeElapsed", time.time() - t_start) if self.verbose >= 1 and self.rank == 0: diff --git a/tests/test_distri.py b/tests/test_distri.py index d33b14ecf1..d3be362617 100644 --- a/tests/test_distri.py +++ b/tests/test_distri.py @@ -22,7 +22,7 @@ def test_probtypes(): categorical = CategoricalProbabilityDistributionType(pdparam_categorical.size) validate_probtype(categorical, pdparam_categorical) - nvec = [1, 2, 3] + nvec = np.array([1, 2, 3]) pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) multicategorical = MultiCategoricalProbabilityDistributionType(nvec) validate_probtype(multicategorical, pdparam_multicategorical) diff --git a/tests/test_lstm_policy.py b/tests/test_lstm_policy.py index 79e0ae6875..688cf4e1a8 100644 --- a/tests/test_lstm_policy.py +++ b/tests/test_lstm_policy.py @@ -2,7 +2,7 @@ import pytest -from stable_baselines import A2C, ACER, PPO2 +from stable_baselines import A2C, ACER, ACKTR, PPO2 from stable_baselines.common.policies import MlpLstmPolicy, LstmPolicy @@ -33,7 +33,7 @@ def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, N_TRIALS = 100 -MODELS = [A2C, ACER, PPO2] +MODELS = [A2C, ACER, ACKTR, PPO2] 
LSTM_POLICIES = [MlpLstmPolicy, CustomLSTMPolicy1, CustomLSTMPolicy2, CustomLSTMPolicy3, CustomLSTMPolicy4] diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py new file mode 100644 index 0000000000..debd40cc80 --- /dev/null +++ b/tests/test_tensorboard.py @@ -0,0 +1,33 @@ +import os +import shutil + +import pytest + +from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO + +TENSORBOARD_DIR = '/tmp/tb_dir/' + +if os.path.isdir(TENSORBOARD_DIR): + shutil.rmtree(TENSORBOARD_DIR) + +MODEL_DICT = { + 'a2c': (A2C, 'CartPole-v1'), + 'acer': (ACER, 'CartPole-v1'), + 'acktr': (ACKTR, 'CartPole-v1'), + 'dqn': (DQN, 'CartPole-v1'), + 'ddpg': (DDPG, 'Pendulum-v0'), + 'ppo1': (PPO1, 'CartPole-v1'), + 'ppo2': (PPO2, 'CartPole-v1'), + 'sac': (SAC, 'Pendulum-v0'), + 'trpo': (TRPO, 'CartPole-v1'), +} + +N_STEPS = 1000 + + +@pytest.mark.parametrize("model_name", MODEL_DICT.keys()) +def test_tensorboard(model_name): + algo, env_id = MODEL_DICT[model_name] + model = algo('MlpPolicy', env_id, verbose=1, tensorboard_log=TENSORBOARD_DIR) + model.learn(N_STEPS) + model.learn(N_STEPS, reset_num_timesteps=False) diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py index ef7f872139..d71374da03 100644 --- a/tests/test_tf_util.py +++ b/tests/test_tf_util.py @@ -1,7 +1,8 @@ # tests for tf_util +import numpy as np import tensorflow as tf -from stable_baselines.common.tf_util import function, initialize, single_threaded_session +from stable_baselines.common.tf_util import function, initialize, single_threaded_session, is_image def test_function(): @@ -38,6 +39,23 @@ def test_multikwargs(): assert linear_fn(2, 2) == 10 -if __name__ == '__main__': - test_function() - test_multikwargs() +def test_image_detection(): + rgb = (32, 64, 3) + gray = (43, 23, 1) + rgbd = (12, 32, 4) + invalid_1 = (32, 12) + invalid_2 = (12, 32, 6) + + # TF checks + for shape in (rgb, gray, rgbd): + assert is_image(tf.placeholder(tf.uint8, shape=shape)) + + for shape in (invalid_1, invalid_2): + assert not is_image(tf.placeholder(tf.uint8, shape=shape)) + + # Numpy checks + for shape in (rgb, gray, rgbd): + assert is_image(np.ones(shape)) + + for shape in (invalid_1, invalid_2): + assert not is_image(np.ones(shape))
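
Usage sketch (illustrative only, not part of the patch): how the reset_num_timesteps and full_tensorboard_log options introduced in this diff are meant to be used together; the environment id and log directory below are placeholders.

from stable_baselines import PPO2

# full_tensorboard_log defaults to False because the extra histograms and
# run metadata can make the tensorboard log directory grow quickly.
model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1,
             tensorboard_log='/tmp/ppo2_tensorboard/',  # placeholder path
             full_tensorboard_log=False)

# First training run: num_timesteps starts at 0 and a new tensorboard run is created.
model.learn(total_timesteps=10000)

# Continue training on the same learning curve: reset_num_timesteps=False keeps
# model.num_timesteps running instead of restarting the x-axis from step 0.
model.learn(total_timesteps=10000, reset_num_timesteps=False)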