diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000..a0bbe87e15 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,16 @@ +[run] +branch = False +omit = + baselines/common/tests/* + # Mujoco requires a licence + baselines/*/run_mujoco.py + baselines/ppo1/run_humanoid.py + baselines/ppo1/run_robotics.py + # HER requires mpi and Mujoco + baselines/her/experiment/ + +[report] +exclude_lines = + pragma: no cover + raise NotImplementedError() + if KFAC_DEBUG: diff --git a/.gitignore b/.gitignore index 722e942b29..ac2dba664e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,13 @@ *.pyc *.pkl *.py~ +*.bak .pytest_cache .DS_Store .idea +.coverage +.coverage.* +__pycache__/ # Setuptools distribution and build folders. /dist/ @@ -34,5 +38,3 @@ src .cache MUJOCO_LOG.TXT - - diff --git a/.travis.yml b/.travis.yml index 5ba3eadd97..4d5abfbdaf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,9 @@ language: python python: - "3.6" +notifications: + email: false + services: - docker @@ -11,4 +14,4 @@ install: script: - flake8 --select=F baselines/common - - docker run baselines-test pytest + - docker run --env CODACY_PROJECT_TOKEN=$CODACY_PROJECT_TOKEN baselines-test sh -c 'pytest --cov-config .coveragerc --cov-report term --cov-report xml --cov=. && python-codacy-coverage -r coverage.xml --token=$CODACY_PROJECT_TOKEN' diff --git a/Dockerfile b/Dockerfile index eeac22ad2f..3b1d0d4ad4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,43 @@ FROM ubuntu:16.04 -RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake +RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake libglib2.0-0 libsm6 libxext6 libfontconfig1 libxrender1 ENV CODE_DIR /root/code ENV VENV /root/venv -COPY . $CODE_DIR/baselines RUN \ pip install virtualenv && \ virtualenv $VENV --python=python3 && \ . $VENV/bin/activate && \ + mkdir $CODE_DIR && \ cd $CODE_DIR && \ pip install --upgrade pip && \ - pip install -e baselines && \ - pip install pytest + pip install pytest && \ + pip install pytest-cov && \ + pip install codacy-coverage && \ + pip install scipy && \ + pip install tqdm && \ + pip install joblib && \ + pip install zmq && \ + pip install dill && \ + pip install progressbar2 && \ + pip install mpi4py && \ + pip install cloudpickle && \ + pip install tensorflow>=1.4.0 && \ + pip install click && \ + pip install opencv-python && \ + pip install numpy && \ + pip install pandas && \ + pip install pytest && \ + pip install matplotlib && \ + pip install seaborn && \ + pip install glob2 && \ + pip install gym[mujoco,atari,classic_control,robotics] + +COPY . $CODE_DIR/baselines +RUN \ + . 
$VENV/bin/activate && \ + cd $CODE_DIR && \ + pip install -e baselines ENV PATH=$VENV/bin:$PATH WORKDIR $CODE_DIR/baselines diff --git a/README.md b/README.md index 197f01af97..a48c78c3dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines) + [![Build Status](https://travis-ci.org/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.org/hill-a/stable-baselines) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) # Baselines diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index f1de88a37e..653a1b4a76 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -1,59 +1,75 @@ -import os.path as osp +import os import time import joblib + import numpy as np import tensorflow as tf -from baselines import logger -from baselines.common import set_global_seeds, explained_variance +from baselines import logger +from baselines.common import set_global_seeds, explained_variance, tf_util from baselines.common.runners import AbstractEnvRunner -from baselines.common import tf_util +from baselines.a2c.utils import discount_with_dones, Scheduler, make_path, find_trainable_variables, calc_entropy, mse -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse class Model(object): - - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, - ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, - alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): + def __init__(self, policy, ob_space, ac_space, n_envs, n_steps, + ent_coef=0.01, vf_coef=0.25, max_grad_norm=0.5, learning_rate=7e-4, + alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lr_schedule='linear'): + """ + The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783 + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Space) Observation space + :param ac_space: (Gym Space) Action space + :param n_envs: (int) The number of environments + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) Entropy coefficient for the loss caculation + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param learning_rate: (float) The learning rate + :param alpha: (float) RMS prop optimizer decay + :param epsilon: (float) RMS prop optimizer epsilon + :param total_timesteps: (int) The total number of samples + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ sess = tf_util.make_session() - nbatch = nenvs*nsteps + n_batch = n_envs * n_steps - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - LR = tf.placeholder(tf.float32, []) + actions_ph = tf.placeholder(tf.int32, [n_batch]) + advs_ph = tf.placeholder(tf.float32, [n_batch]) + rewards_ph = tf.placeholder(tf.float32, [n_batch]) + learning_rate_ph = tf.placeholder(tf.float32, []) - step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) + train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - pg_loss = tf.reduce_mean(ADV * neglogpac) - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) - loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=actions_ph) + pg_loss = tf.reduce_mean(advs_ph * neglogpac) + vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) + entropy = tf.reduce_mean(calc_entropy(train_model.policy)) + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: - grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) + trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) - lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values - for step in range(len(obs)): - cur_lr = lr.value() - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} + for _ in range(len(obs)): + cur_lr = learning_rate.value() + td_map = {train_model.obs_ph: obs, actions_ph: actions, advs_ph: advs, + rewards_ph: rewards, learning_rate_ph: cur_lr} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, 
_train], td_map @@ -61,15 +77,15 @@ def train(obs, states, rewards, masks, actions, values): return policy_loss, value_loss, policy_entropy def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) + parameters = sess.run(params) + make_path(os.path.dirname(save_path)) + joblib.dump(parameters, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train @@ -82,16 +98,30 @@ def load(load_path): self.load = load tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps=5, gamma=0.99): - super().__init__(env=env, model=model, nsteps=nsteps) +class Runner(AbstractEnvRunner): + def __init__(self, env, model, n_steps=5, gamma=0.99): + """ + A runner to learn the policy of an environment for a model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) Discount factor + """ + super(Runner, self).__init__(env=env, model=model, n_steps=n_steps) self.gamma = gamma def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] + """ + Run a learning step of the model + + :return: ([float], [float], [float], [bool], [float], [float]) + observations, states, rewards, masks, actions, values + """ + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] mb_states = self.states - for n in range(self.nsteps): + for _ in range(self.n_steps): actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) @@ -102,11 +132,11 @@ def run(self): self.dones = dones for n, done in enumerate(dones): if done: - self.obs[n] = self.obs[n]*0 + self.obs[n] = self.obs[n] * 0 self.obs = obs mb_rewards.append(rewards) mb_dones.append(self.dones) - #batch of steps to batch of rollouts + # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) @@ -115,12 +145,12 @@ def run(self): mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] last_values = self.model.value(self.obs, self.states, self.dones).tolist() - #discount/bootstrap off value fn + # discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: - rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] + rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards @@ -130,31 +160,56 @@ def run(self): mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values -def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): + +def learn(policy, env, seed, n_steps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, 
max_grad_norm=0.5, + learning_rate=7e-4, lr_schedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): + """ + Return a trained A2C model. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param env: (Gym environment) The environment to learn from + :param seed: (int) The initial seed for training + :param n_steps: (int) The number of steps to run for each environment + :param total_timesteps: (int) The total number of samples + :param vf_coef: (float) Value function coefficient for the loss calculation + :param ent_coef: (float) Entropy coefficient for the loss caculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param learning_rate: (float) The learning rate + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param epsilon: (float) RMS prop optimizer epsilon + :param alpha: (float) RMS prop optimizer decay + :param gamma: (float) Discount factor + :param log_interval: (int) The number of timesteps before logging. + :return: (Model) A2C model + """ set_global_seeds(seed) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - - nbatch = nenvs*nsteps - tstart = time.time() - for update in range(1, total_timesteps//nbatch+1): + model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs, + n_steps=n_steps, ent_coef=ent_coef, + vf_coef=vf_coef, max_grad_norm=max_grad_norm, learning_rate=learning_rate, + alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, + lr_schedule=lr_schedule) + runner = Runner(env, model, n_steps=n_steps, gamma=gamma) + + n_batch = n_envs * n_steps + t_start = time.time() + for update in range(1, total_timesteps // n_batch + 1): obs, states, rewards, masks, actions, values = runner.run() - policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) + _, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) + n_seconds = time.time() - t_start + fps = int((update * n_batch) / n_seconds) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, rewards) + explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) + logger.record_tabular("total_timesteps", update * n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) + logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() env.close() return model diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py index 6fbbb14ac8..61ebe71780 100644 --- a/baselines/a2c/policies.py +++ b/baselines/a2c/policies.py @@ -1,146 +1,141 @@ import numpy as np import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from 
baselines.common.distributions import make_pdtype + +from baselines.a2c.utils import conv, linear, conv_to_fc, batch_to_seq, seq_to_batch, lstm +from baselines.common.distributions import make_proba_dist_type from baselines.common.input import observation_input -def nature_cnn(unscaled_images, **conv_kwargs): + +def nature_cnn(unscaled_images, **kwargs): """ CNN from Nature paper. + + :param unscaled_images: (TensorFlow Tensor) Image input placeholder + :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN + :return: (TensorFlow Tensor) The CNN output layer """ scaled_images = tf.cast(unscaled_images, tf.float32) / 255. activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) + layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=8, stride=4, init_scale=np.sqrt(2), **kwargs)) + layer_2 = 
activ(conv(layer_1, 'c2', n_filters=64, filter_size=4, stride=2, init_scale=np.sqrt(2), **kwargs)) + layer_3 = activ(conv(layer_2, 'c3', n_filters=64, filter_size=3, stride=1, init_scale=np.sqrt(2), **kwargs)) + layer_3 = conv_to_fc(layer_3) + return activ(linear(layer_3, 'fc1', n_hidden=512, init_scale=np.sqrt(2))) + + +class A2CPolicy(object): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False): + """ + Policy object for A2C + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_steps: (int) The number of steps to run for each environment + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + """ + self.n_env = n_batch // n_steps + self.obs_ph, self.processed_x = observation_input(ob_space, n_batch) + self.masks_ph = tf.placeholder(tf.float32, [n_batch]) # mask (done t-1) + self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2]) # states + self.pdtype = make_proba_dist_type(ac_space) + self.sess = sess + self.reuse = reuse + + def step(self, obs, state=None, mask=None): + """ + Returns the policy for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float], [float], [float], [float]) actions, values, states, neglogp + """ + raise NotImplementedError + + def value(self, obs, state=None, mask=None): + """ + Returns the value for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float]) The associated value of the action + """ + raise NotImplementedError + + +class LstmPolicy(A2CPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, layer_norm=False, **kwargs): + super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) + extracted_features = nature_cnn(self.obs_ph, **kwargs) + input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps) + masks = batch_to_seq(self.masks_ph, self.n_env, n_steps) + rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm, + layer_norm=layer_norm) + rnn_output = seq_to_batch(rnn_output) + value_fn = linear(rnn_output, 'v', 1) + self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(rnn_output) - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None + self._value = value_fn[:, 0] + self.action = self.proba_distribution.sample() + self.neglogp = self.proba_distribution.neglogp(self.action) + self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32) + self.value_fn = value_fn + + def step(self, obs, state=None, mask=None): + return self.sess.run([self.action, self._value, self.snew, self.neglogp], + {self.obs_ph: obs, self.states_ph: state, 
self.masks_ph: mask}) + + def value(self, obs, state=None, mask=None): + return self.sess.run(self._value, {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) +class LnLstmPolicy(LstmPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, **_): + super(LnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, layer_norm=True) - self.X = X - self.vf = vf - self.step = step - self.value = value -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) +class FeedForwardPolicy(A2CPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, _type="cnn", **kwargs): + super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse) with tf.variable_scope("model", reuse=reuse): - X, processed_x = observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] + if _type == "cnn": + extracted_features = nature_cnn(self.processed_x, **kwargs) + value_fn = linear(extracted_features, 'v', 1)[:, 0] + else: + activ = tf.tanh + processed_x = tf.layers.flatten(self.processed_x) + pi_h1 = activ(linear(processed_x, 'pi_fc1', n_hidden=64, init_scale=np.sqrt(2))) + pi_h2 = activ(linear(pi_h1, 'pi_fc2', n_hidden=64, init_scale=np.sqrt(2))) + vf_h1 = activ(linear(processed_x, 'vf_fc1', n_hidden=64, init_scale=np.sqrt(2))) + vf_h2 = activ(linear(vf_h1, 'vf_fc2', n_hidden=64, init_scale=np.sqrt(2))) + value_fn = linear(vf_h2, 'vf', 1)[:, 0] + extracted_features = pi_h2 + self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(extracted_features, + init_scale=0.01) + + self.action = self.proba_distribution.sample() + self.neglogp = self.proba_distribution.neglogp(self.action) + self.initial_state = None + self.value_fn = value_fn - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) + def step(self, obs, state=None, mask=None): + action, value, neglogp = self.sess.run([self.action, self.value_fn, self.neglogp], {self.obs_ph: obs}) + return action, value, self.initial_state, neglogp + def value(self, obs, state=None, mask=None): + return self.sess.run(self.value_fn, {self.obs_ph: obs}) - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp +class CnnPolicy(FeedForwardPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, **_kwargs): + super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, _type="cnn") - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - self.X = X - self.vf = vf - self.step = step - self.value = value +class MlpPolicy(FeedForwardPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, 
n_lstm=256, reuse=False, **_kwargs): + super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, _type="mlp") diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py index b09d9bbffe..2b91609841 100644 --- a/baselines/a2c/run_atari.py +++ b/baselines/a2c/run_atari.py @@ -4,27 +4,49 @@ from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.a2c.a2c import learn -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy +from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy -def train(env_id, num_timesteps, seed, policy, lrschedule, num_env): + +def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): + """ + Train A2C model for atari environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param num_env: (int) The number of environments + """ + policy_fn = None if policy == 'cnn': policy_fn = CnnPolicy elif policy == 'lstm': policy_fn = LstmPolicy elif policy == 'lnlstm': policy_fn = LnLstmPolicy + if policy_fn is None: + raise ValueError("Error: policy {} not implemented".format(policy)) + env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule) env.close() + def main(): + """ + Runs the test + """ parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') + parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') + parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', + help='Learning rate schedule') args = parser.parse_args() logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_env=16) + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, + num_env=16) + if __name__ == '__main__': main() diff --git a/baselines/a2c/utils.py b/baselines/a2c/utils.py index a7610ebcdc..09ec86738c 100644 --- a/baselines/a2c/utils.py +++ b/baselines/a2c/utils.py @@ -1,254 +1,482 @@ import os -import gym +from collections import deque + import numpy as np import tensorflow as tf -from gym import spaces -from collections import deque + def sample(logits): + """ + Creates a sampling Tensor for non deterministic policies + + :param logits: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The sampled action + """ noise = tf.random_uniform(tf.shape(logits)) return tf.argmax(logits - tf.log(-tf.log(noise)), 1) -def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * 
(tf.log(z0) - a0), 1) -def cat_entropy_softmax(p0): - return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) +def calc_entropy(logits): + """ + Calculates the entropy of the output values of the network + + :param logits: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The Entropy of the output values of the network + """ + # Compute softmax + a_0 = logits - tf.reduce_max(logits, 1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + z_0 = tf.reduce_sum(exp_a_0, 1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), 1) + + +def calc_entropy_softmax(action_proba): + """ + Calculates the softmax entropy of the output values of the network + + :param action_proba: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The softmax entropy of the output values of the network + """ + return - tf.reduce_sum(action_proba * tf.log(action_proba + 1e-6), axis=1) + def mse(pred, target): - return tf.square(pred-target)/2. + """ + Returns the Mean squared error between prediction and target + + :param pred: (TensorFlow Tensor) The predicted value + :param target: (TensorFlow Tensor) The target value + :return: (TensorFlow Tensor) The Mean squared error between prediction and target + """ + return tf.reduce_mean(tf.square(pred - target)) + def ortho_init(scale=1.0): - def _ortho_init(shape, dtype, partition_info=None): - #lasagne ortho init for tf + """ + Orthogonal initialization for the policy weights + + :param scale: (float) Scaling factor for the weights. + :return: (function) an initialization function for the weights + """ + + # _ortho_init(shape, dtype, partition_info=None) + def _ortho_init(shape, *_, **_kwargs): + """Intialize weights as Orthogonal matrix. + + Orthogonal matrix initialization [1]_. For n-dimensional shapes where + n > 2, the n-1 trailing axes are flattened. For convolutional layers, this + corresponds to the fan-in, so this makes the initialization usable for + both dense and convolutional layers. + + References + ---------- + .. [1] Saxe, Andrew M., James L. McClelland, and Surya Ganguli. 
+ "Exact solutions to the nonlinear dynamics of learning in deep + linear + """ + # lasagne ortho init for tf shape = tuple(shape) if len(shape) == 2: flat_shape = shape - elif len(shape) == 4: # assumes NHWC + elif len(shape) == 4: # assumes NHWC flat_shape = (np.prod(shape[:-1]), shape[-1]) else: raise NotImplementedError - a = np.random.normal(0.0, 1.0, flat_shape) - u, _, v = np.linalg.svd(a, full_matrices=False) - q = u if u.shape == flat_shape else v # pick the one with the correct shape - q = q.reshape(shape) - return (scale * q[:shape[0], :shape[1]]).astype(np.float32) + gaussian_noise = np.random.normal(0.0, 1.0, flat_shape) + u, _, v = np.linalg.svd(gaussian_noise, full_matrices=False) + weights = u if u.shape == flat_shape else v # pick the one with the correct shape + weights = weights.reshape(shape) + return (scale * weights[:shape[0], :shape[1]]).astype(np.float32) + return _ortho_init -def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False): + +def conv(input_tensor, scope, *, n_filters, filter_size, stride, + pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False): + """ + Creates a 2d convolutional layer for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution + :param scope: (str) The TensorFlow variable scope + :param n_filters: (int) The number of filters + :param filter_size: (int) The filter size + :param stride: (int) The stride of the convolution + :param pad: (str) The padding type ('VALID' or 'SAME') + :param init_scale: (int) The initialization scale + :param data_format: (str) The data format for the convolution weights + :param one_dim_bias: (bool) If the bias should be one dimentional or not + :return: (TensorFlow Tensor) 2d convolutional layer + """ if data_format == 'NHWC': channel_ax = 3 strides = [1, stride, stride, 1] - bshape = [1, 1, 1, nf] + bshape = [1, 1, 1, n_filters] elif data_format == 'NCHW': channel_ax = 1 strides = [1, 1, stride, stride] - bshape = [1, nf, 1, 1] + bshape = [1, n_filters, 1, 1] else: raise NotImplementedError - bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1] - nin = x.get_shape()[channel_ax].value - wshape = [rf, rf, nin, nf] + bias_var_shape = [n_filters] if one_dim_bias else [1, n_filters, 1, 1] + n_input = input_tensor.get_shape()[channel_ax].value + wshape = [filter_size, filter_size, n_input, n_filters] with tf.variable_scope(scope): - w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale)) - b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) + weight = tf.get_variable("w", wshape, initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) if not one_dim_bias and data_format == 'NHWC': - b = tf.reshape(b, bshape) - return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + bias = tf.reshape(bias, bshape) + return bias + tf.nn.conv2d(input_tensor, weight, strides=strides, padding=pad, data_format=data_format) -def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): - with tf.variable_scope(scope): - nin = x.get_shape()[1].value - w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias)) - return tf.matmul(x, w)+b -def batch_to_seq(h, nbatch, nsteps, flat=False): +def linear(input_tensor, scope, n_hidden, *, init_scale=1.0, init_bias=0.0): + """ + Creates a fully connected layer for 
TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the fully connected layer + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :param init_bias: (int) The initialization offset bias + :return: (TensorFlow Tensor) fully connected layer + """ + with tf.variable_scope(scope): + n_input = input_tensor.get_shape()[1].value + weight = tf.get_variable("w", [n_input, n_hidden], initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", [n_hidden], initializer=tf.constant_initializer(init_bias)) + return tf.matmul(input_tensor, weight) + bias + + +def batch_to_seq(tensor_batch, n_batch, n_steps, flat=False): + """ + Transform a batch of Tensors, into a sequence of Tensors for reccurent policies + + :param tensor_batch: (TensorFlow Tensor) The input tensor to unroll + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_steps: (int) The number of steps to run for each environment + :param flat: (bool) If the input Tensor is flat + :return: (TensorFlow Tensor) sequence of Tensors for reccurent policies + """ if flat: - h = tf.reshape(h, [nbatch, nsteps]) + tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps]) else: - h = tf.reshape(h, [nbatch, nsteps, -1]) - return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)] + tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps, -1]) + return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=n_steps, value=tensor_batch)] + -def seq_to_batch(h, flat = False): - shape = h[0].get_shape().as_list() +def seq_to_batch(tensor_sequence, flat=False): + """ + Transform a sequence of Tensors, into a batch of Tensors for reccurent policies + + :param tensor_sequence: (TensorFlow Tensor) The input tensor to batch + :param flat: (bool) If the input Tensor is flat + :return: (TensorFlow Tensor) batch of Tensors for reccurent policies + """ + shape = tensor_sequence[0].get_shape().as_list() if not flat: - assert(len(shape) > 1) - nh = h[0].get_shape()[-1].value - return tf.reshape(tf.concat(axis=1, values=h), [-1, nh]) + assert len(shape) > 1 + n_hidden = tensor_sequence[0].get_shape()[-1].value + return tf.reshape(tf.concat(axis=1, values=tensor_sequence), [-1, n_hidden]) else: - return tf.reshape(tf.stack(values=h, axis=1), [-1]) - -def lstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) - with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = tf.matmul(x, wx) + tf.matmul(h, wh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(c) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def _ln(x, g, b, e=1e-5, axes=[1]): - u, s = tf.nn.moments(x, axes=axes, keep_dims=True) - x = (x-u)/tf.sqrt(s+e) - x = x*g+b - return x - -def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) + return tf.reshape(tf.stack(values=tensor_sequence, 
axis=1), [-1]) + + +def lstm(input_tensor, mask_tensor, cell_state_hidden, scope, n_hidden, init_scale=1.0, layer_norm=False): + """ + Creates an Long Short Term Memory (LSTM) cell for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the LSTM cell + :param mask_tensor: (TensorFlow Tensor) The mask tensor for the LSTM cell + :param cell_state_hidden: (TensorFlow Tensor) The state tensor for the LSTM cell + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :param layer_norm: (bool) Whether to apply Layer Normalization or not + :return: (TensorFlow Tensor) LSTM cell + """ + _, n_input = [v.value for v in input_tensor[0].get_shape()] with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) - bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0)) - - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0)) - bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0)) - - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0)) - bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(_ln(c, gc, bc)) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def conv_to_fc(x): - nh = np.prod([v.value for v in x.get_shape()[1:]]) - x = tf.reshape(x, [-1, nh]) - return x + weight_x = tf.get_variable("wx", [n_input, n_hidden * 4], initializer=ortho_init(init_scale)) + weight_h = tf.get_variable("wh", [n_hidden, n_hidden * 4], initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + if layer_norm: + # Gain and bias of layer norm + gain_x = tf.get_variable("gx", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) + bias_x = tf.get_variable("bx", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + gain_h = tf.get_variable("gh", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) + bias_h = tf.get_variable("bh", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + gain_c = tf.get_variable("gc", [n_hidden], initializer=tf.constant_initializer(1.0)) + bias_c = tf.get_variable("bc", [n_hidden], initializer=tf.constant_initializer(0.0)) + + cell_state, hidden = tf.split(axis=1, num_or_size_splits=2, value=cell_state_hidden) + for idx, (_input, mask) in enumerate(zip(input_tensor, mask_tensor)): + cell_state = cell_state * (1 - mask) + hidden = hidden * (1 - mask) + if layer_norm: + gates = _ln(tf.matmul(_input, weight_x), gain_x, bias_x) \ + + _ln(tf.matmul(hidden, weight_h), gain_h, bias_h) + bias + else: + gates = tf.matmul(_input, weight_x) + tf.matmul(hidden, weight_h) + bias + in_gate, forget_gate, out_gate, cell_candidate = tf.split(axis=1, num_or_size_splits=4, value=gates) + in_gate = 
tf.nn.sigmoid(in_gate) + forget_gate = tf.nn.sigmoid(forget_gate) + out_gate = tf.nn.sigmoid(out_gate) + cell_candidate = tf.tanh(cell_candidate) + cell_state = forget_gate * cell_state + in_gate * cell_candidate + if layer_norm: + hidden = out_gate * tf.tanh(_ln(cell_state, gain_c, bias_c)) + else: + hidden = out_gate * tf.tanh(cell_state) + input_tensor[idx] = hidden + cell_state_hidden = tf.concat(axis=1, values=[cell_state, hidden]) + return input_tensor, cell_state_hidden + + +def _ln(input_tensor, gain, bias, epsilon=1e-5, axes=None): + """ + Apply layer normalisation. + + :param input_tensor: (TensorFlow Tensor) The input tensor for the Layer normalization + :param gain: (TensorFlow Tensor) The scale tensor for the Layer normalization + :param bias: (TensorFlow Tensor) The bias tensor for the Layer normalization + :param epsilon: (float) The epsilon value for floating point calculations + :param axes: (tuple, list or int) The axes to apply the mean and variance calculation + :return: (TensorFlow Tensor) a normalizing layer + """ + if axes is None: + axes = [1] + mean, variance = tf.nn.moments(input_tensor, axes=axes, keep_dims=True) + input_tensor = (input_tensor - mean) / tf.sqrt(variance + epsilon) + input_tensor = input_tensor * gain + bias + return input_tensor + + +def lnlstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale=1.0): + """ + Creates a LSTM with Layer Normalization (lnlstm) cell for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the LSTM cell + :param mask_tensor: (TensorFlow Tensor) The mask tensor for the LSTM cell + :param cell_state: (TensorFlow Tensor) The state tensor for the LSTM cell + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :return: (TensorFlow Tensor) lnlstm cell + """ + return lstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale, layer_norm=True) + + +def conv_to_fc(input_tensor): + """ + Reshapes a Tensor from a convolutional network to a Tensor for a fully connected network + + :param input_tensor: (TensorFlow Tensor) The convolutional input tensor + :return: (TensorFlow Tensor) The fully connected output tensor + """ + n_hidden = np.prod([v.value for v in input_tensor.get_shape()[1:]]) + input_tensor = tf.reshape(input_tensor, [-1, n_hidden]) + return input_tensor + def discount_with_dones(rewards, dones, gamma): + """ + Apply the discount value to the reward, where the environment is not done + + :param rewards: ([float]) The rewards + :param dones: ([bool]) Whether an environment is done or not + :param gamma: (float) The discount value + :return: ([float]) The discounted rewards + """ discounted = [] - r = 0 + ret = 0 # Return: discounted reward for reward, done in zip(rewards[::-1], dones[::-1]): - r = reward + gamma*r*(1.-done) # fixed off by one bug - discounted.append(r) + ret = reward + gamma * ret * (1. 
- done) # fixed off by one bug + discounted.append(ret) return discounted[::-1] + def find_trainable_variables(key): + """ + Returns the trainable variables within a given scope + + :param key: (str) The variable scope + :return: ([TensorFlow Tensor]) the trainable variables + """ with tf.variable_scope(key): return tf.trainable_variables() -def make_path(f): - return os.makedirs(f, exist_ok=True) -def constant(p): - return 1 +def make_path(path): + """ + For a given path, create the folders if they do not exist + + :param path: (str) The path + :return: (bool) Whether or not it finished correctly + """ + return os.makedirs(path, exist_ok=True) -def linear(p): - return 1-p -def middle_drop(p): +def constant(_): + """ + Returns a constant value for the Scheduler + + :param _: ignored + :return: (float) 1 + """ + return 1. + + +def linear_schedule(progress): + """ + Returns a linear value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress + """ + return 1 - progress + + +def middle_drop(progress): + """ + Returns a linear value with a drop near the middle to a constant value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress if (1 - progress) >= 0.75 else 0.075 + """ eps = 0.75 - if 1-p= 0.125 else 0.125 + """ + progress *= 2 eps = 0.125 - if 1-p 0: - buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size) + buffer = Buffer(env=env, n_steps=n_steps, n_stack=n_stack, size=buffer_size) else: buffer = None - nbatch = nenvs*nsteps + n_batch = n_envs * n_steps acer = Acer(runner, model, buffer, log_interval) - acer.tstart = time.time() - for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls + acer.t_start = time.time() + for acer.steps in range(0, total_timesteps, + n_batch): # n_batch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): - n = np.random.poisson(replay_ratio) - for _ in range(n): + samples_number = np.random.poisson(replay_ratio) + for _ in range(samples_number): acer.call(on_policy=False) # no simulation steps in this env.close() diff --git a/baselines/acer/buffer.py b/baselines/acer/buffer.py index 2dcfa1098a..26ba89a71e 100644 --- a/baselines/acer/buffer.py +++ b/baselines/acer/buffer.py @@ -1,14 +1,23 @@ import numpy as np + class Buffer(object): - # gets obs, actions, rewards, mu's, (states, masks), dones - def __init__(self, env, nsteps, nstack, size=50000): - self.nenv = env.num_envs - self.nsteps = nsteps - self.nh, self.nw, self.nc = env.observation_space.shape - self.nstack = nstack - self.nbatch = self.nenv * self.nsteps - self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames + def __init__(self, env, n_steps, n_stack, size=50000): + """ + A buffer for observations, actions, rewards, mu's, states, masks and dones values + + :param env: (Gym environment) The environment to learn from + :param n_steps: (int) The number of steps to run for each environment + :param n_stack: (int) The number of stacked frames + :param size: (int) The buffer size in number of steps + """ + self.n_env = env.num_envs + self.n_steps = n_steps + self.height, self.width, self.n_channels = env.observation_space.shape + self.n_stack = n_stack + self.n_batch = self.n_env * self.n_steps + # Each loc contains n_env * n_steps frames, thus total buffer 
is n_env * size frames + self.size = size // self.n_steps # Memory self.enc_obs = None @@ -23,37 +32,66 @@ def __init__(self, env, nsteps, nstack, size=50000): self.num_in_buffer = 0 def has_atleast(self, frames): - # Frames per env, so total (nenv * frames) Frames needed - # Each buffer loc has nenv * nsteps frames - return self.num_in_buffer >= (frames // self.nsteps) + """ + Check to see if the buffer has at least the asked number of frames + + :param frames: (int) The number of frames checked + :return: (bool) number of frames in buffer >= number asked + """ + # Frames per env, so total (n_env * frames) Frames needed + # Each buffer loc has n_env * n_steps frames + return self.num_in_buffer >= (frames // self.n_steps) def can_sample(self): + """ + Check if the buffer has at least one frame + + :return: (bool) if the buffer has at least one frame + """ return self.num_in_buffer > 0 - # Generate stacked frames def decode(self, enc_obs, dones): - # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] - # dones has shape [nenvs, nsteps, nh, nw, nc] - # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] - nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc - y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) - obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) - x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, - 0) # [nsteps + nstack, nenv, nh, nw, nc] - y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep - y[:3] = 1.0 - # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) - for i in range(nstack): - obs[-(i + 1), i:] = x + """ + Get the stacked frames of an observation + + :param enc_obs: ([float]) the encoded observation + :param dones: ([bool]) + :return: ([float]) the decoded observation + """ + # enc_obs has shape [n_envs, n_steps + n_stack, nh, nw, nc] + # dones has shape [n_envs, n_steps, nh, nw, nc] + # returns stacked obs of shape [n_env, (n_steps + 1), nh, nw, n_stack*nc] + n_stack, n_env, n_steps = self.n_stack, self.n_env, self.n_steps + height, width, n_channels = self.height, self.width, self.n_channels + y_var = np.empty([n_steps + n_stack - 1, n_env, 1, 1, 1], dtype=np.float32) + obs = np.zeros([n_stack, n_steps + n_stack, n_env, height, width, n_channels], dtype=np.uint8) + # [n_steps + n_stack, n_env, nh, nw, nc] + x_var = np.reshape(enc_obs, [n_env, n_steps + n_stack, height, width, n_channels]).swapaxes(1, 0) + y_var[3:] = np.reshape(1.0 - dones, [n_env, n_steps, 1, 1, 1]).swapaxes(1, 0) # keep + y_var[:3] = 1.0 + # y = np.reshape(1 - dones, [n_envs, n_steps, 1, 1, 1]) + for i in range(n_stack): + obs[-(i + 1), i:] = x_var # obs[:,i:,:,:,-(i+1),:] = x - x = x[:-1] * y - y = y[1:] - return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) + x_var = x_var[:-1] * y_var + y_var = y_var[1:] + return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), + [n_env, (n_steps + 1), height, width, n_stack * n_channels]) def put(self, enc_obs, actions, rewards, mus, dones, masks): - # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] + """ + Adds a frame to the buffer + + :param enc_obs: ([float]) the encoded observation + :param actions: ([float]) the actions + :param rewards: ([float]) the rewards + :param mus: ([float]) the policy probability for the actions + :param dones: ([bool]) + :param masks: ([bool]) + 
""" + # enc_obs [n_env, (n_steps + n_stack), nh, nw, nc] + # actions, rewards, dones [n_env, n_steps] + # mus [n_env, n_steps, n_act] if self.enc_obs is None: self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) @@ -73,31 +111,44 @@ def put(self, enc_obs, actions, rewards, mus, dones, masks): self.next_idx = (self.next_idx + 1) % self.size self.num_in_buffer = min(self.size, self.num_in_buffer + 1) - def take(self, x, idx, envx): - nenv = self.nenv - out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype) - for i in range(nenv): - out[i] = x[idx[i], envx[i]] + def take(self, arr, idx, envx): + """ + Reads a frame from a list and index for the asked environment ids + + :param arr: (numpy array) the array that is read + :param idx: ([int]) the idx that are read + :param envx: ([int]) the idx for the environments + :return: ([float]) the askes frames from the list + """ + n_env = self.n_env + out = np.empty([n_env] + list(arr.shape[2:]), dtype=arr.dtype) + for i in range(n_env): + out[i] = arr[idx[i], envx[i]] return out def get(self): + """ + randomly read a frame from the buffer + + :return: ([float], [float], [float], [float], [bool], [float]) + observations, actions, rewards, mus, dones, maskes + """ # returns - # obs [nenv, (nsteps + 1), nh, nw, nstack*nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] - nenv = self.nenv + # obs [n_env, (n_steps + 1), nh, nw, n_stack*nc] + # actions, rewards, dones [n_env, n_steps] + # mus [n_env, n_steps, n_act] + n_env = self.n_env assert self.can_sample() # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env. - idx = np.random.randint(0, self.num_in_buffer, nenv) - envx = np.arange(nenv) + idx = np.random.randint(0, self.num_in_buffer, n_env) + envx = np.arange(n_env) - take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0) - dones = take(self.dones) - enc_obs = take(self.enc_obs) + dones = self.take(self.dones, idx, envx) + enc_obs = self.take(self.enc_obs, idx, envx) obs = self.decode(enc_obs, dones) - actions = take(self.actions) - rewards = take(self.rewards) - mus = take(self.mus) - masks = take(self.masks) + actions = self.take(self.actions, idx, envx) + rewards = self.take(self.rewards, idx, envx) + mus = self.take(self.mus, idx, envx) + masks = self.take(self.masks, idx, envx) return obs, actions, rewards, mus, dones, masks diff --git a/baselines/acer/policies.py b/baselines/acer/policies.py index 627c40016c..2fb6cd9e0c 100644 --- a/baselines/acer/policies.py +++ b/baselines/acer/policies.py @@ -1,79 +1,131 @@ import numpy as np import tensorflow as tf -from baselines.ppo2.policies import nature_cnn -from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample +from baselines.a2c.policies import nature_cnn +from baselines.a2c.utils import linear, batch_to_seq, seq_to_batch, lstm, sample -class AcerCnnPolicy(object): +class AcerPolicy(object): + """ + Policy object for Acer + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments + :param n_steps: (int) The number of steps to run + :param n_stack: (int) The number of frames stacked + :param reuse: (bool) If the policy is reusable or not + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + """ - def __init__(self, sess, ob_space, ac_space, 
nenv, nsteps, nstack, reuse=False): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False, n_lstm=256): + self.n_batch = n_env * n_steps + height, width, n_channels = ob_space.shape + self.ob_shape = (self.n_batch, height, width, n_channels * n_stack) + self.n_act = ac_space.n + self.obs_ph = tf.placeholder(tf.uint8, self.ob_shape) # obs + self.masks_ph = tf.placeholder(tf.float32, [self.n_batch]) # mask (done t-1) + self.states_ph = tf.placeholder(tf.float32, [n_env, n_lstm * 2]) # states + self.sess = sess + self.reuse = reuse + + def step(self, obs, state, mask, *args, **kwargs): + """ + Returns the policy for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float], [float], [float], [float]) action, mu, states + """ + raise NotImplementedError + + def out(self, obs, state, mask, *args, **kwargs): + """ + Returns the pi and q values for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float], [float]) pi, q + """ + raise NotImplementedError + + def act(self, obs, state, mask, *args, **kwargs): + """ + Returns the action for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float]) The action + """ + raise NotImplementedError + + +class AcerCnnPolicy(AcerPolicy): + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False): + super(AcerCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - pi_logits = fc(h, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h, 'q', nact) + extracted_features = nature_cnn(self.obs_ph) + pi_logits = linear(extracted_features, 'pi', self.n_act, init_scale=0.01) + policy = tf.nn.softmax(pi_logits) + q_value = linear(extracted_features, 'q', self.n_act) - a = sample(pi_logits) # could change this to use self.pi instead + self.action = sample(pi_logits) # could change this to use self.pi instead self.initial_state = [] # not stateful - self.X = X - self.pi = pi # actual policy params now - self.q = q - - def step(ob, *args, **kwargs): - # returns actions, mus, states - a0, pi0 = sess.run([a, pi], {X: ob}) - return a0, pi0, [] # dummy state - - def out(ob, *args, **kwargs): - pi0, q0 = sess.run([pi, q], {X: ob}) - return pi0, q0 - - def act(ob, *args, **kwargs): - return sess.run(a, {X: ob}) - - self.step = step - self.out = out - self.act = act - -class AcerLstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs - M = 
tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states + self.policy = policy # actual policy params now + self.q_value = q_value + + def step(self, obs, state, mask, *args, **kwargs): + # returns actions, mus, states + action, policy = self.sess.run([self.action, self.policy], {self.obs_ph: obs}) + return action, policy, [] # dummy state + + def out(self, obs, state, mask, *args, **kwargs): + policy, q_value = self.sess.run([self.policy, self.q_value], {self.obs_ph: obs}) + return policy, q_value + + def act(self, obs, state, mask, *args, **kwargs): + return self.sess.run(self.action, {self.obs_ph: obs}) + + +class AcerLstmPolicy(AcerPolicy): + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False, n_lstm=256): + super(AcerLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse, n_lstm) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) + extracted_features = nature_cnn(self.obs_ph) # lstm - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - - pi_logits = fc(h5, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h5, 'q', nact) - - a = sample(pi_logits) # could change this to use self.pi instead - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - self.X = X - self.M = M - self.S = S - self.pi = pi # actual policy params now - self.q = q - - def step(ob, state, mask, *args, **kwargs): - # returns actions, mus, states - a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) - return a0, pi0, s - - self.step = step + input_seq = batch_to_seq(extracted_features, n_env, n_steps) + masks = batch_to_seq(self.masks_ph, n_env, n_steps) + rnn_output, self.snew = lstm(input_seq, masks, self.states_ph, 'lstm1', n_hidden=n_lstm) + rnn_output = seq_to_batch(rnn_output) + + pi_logits = linear(rnn_output, 'pi', self.n_act, init_scale=0.01) + policy = tf.nn.softmax(pi_logits) + q_value = linear(rnn_output, 'q', self.n_act) + + self.action = sample(pi_logits) # could change this to use self.pi instead + self.initial_state = np.zeros((n_env, n_lstm * 2), dtype=np.float32) + self.policy = policy # actual policy params now + self.q_value = q_value + + def step(self, obs, state, mask, *args, **kwargs): + # returns actions, mus, states + action, policy, states = self.sess.run([self.action, self.policy, self.snew], + {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) + return action, policy, states + + def out(self, obs, state, mask, *args, **kwargs): + policy, q_value = self.sess.run([self.policy, self.q_value], {self.obs_ph: obs}) + return policy, q_value + + def act(self, obs, state, mask, *args, **kwargs): + return self.sess.run(self.action, {self.obs_ph: obs}) diff --git a/baselines/acer/run_atari.py b/baselines/acer/run_atari.py index cce979eddd..2c1354e183 100644 --- a/baselines/acer/run_atari.py +++ b/baselines/acer/run_atari.py @@ -4,7 +4,19 @@ from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy from baselines.common.cmd_util import make_atari_env, atari_arg_parser -def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): + +def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): + """ + train an ACER model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param policy: 
(A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param num_cpu: (int) The number of cpu to train on + """ env = make_atari_env(env_id, num_cpu, seed) if policy == 'cnn': policy_fn = AcerCnnPolicy @@ -13,18 +25,24 @@ def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): else: print("Policy {} not implemented".format(policy)) return - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule, buffer_size=5000) env.close() + def main(): + """ + Runs the test + """ parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - parser.add_argument('--logdir', help ='Directory for logging') + parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') + parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', + help='Learning rate schedule') + parser.add_argument('--logdir', help='Directory for logging') args = parser.parse_args() logger.configure(args.logdir) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) + policy=args.policy, lr_schedule=args.lr_schedule, num_cpu=16) + if __name__ == '__main__': main() diff --git a/baselines/acktr/acktr_cont.py b/baselines/acktr/acktr_cont.py index 45f2fa29fa..9b75398579 100644 --- a/baselines/acktr/acktr_cont.py +++ b/baselines/acktr/acktr_cont.py @@ -1,93 +1,119 @@ +""" +Continuous acktr +""" + import numpy as np import tensorflow as tf + from baselines import logger import baselines.common as common -from baselines.common import tf_util as U +from baselines.common import tf_util from baselines.acktr import kfac from baselines.common.filters import ZFilter -def pathlength(path): - return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient def rollout(env, policy, max_pathlength, animate=False, obfilter=None): """ Simulate the env and policy for max_pathlength steps + + :param env: (Gym environment) The environment to learn from + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param max_pathlength: (int) The maximum length for an episode + :param animate: (bool) if render env + :param obfilter: (Filter) the observation filter + :return: (dict) observation, terminated, reward, action, action_dist, logp """ - ob = env.reset() - prev_ob = np.float32(np.zeros(ob.shape)) - if obfilter: ob = obfilter(ob) + observation = env.reset() + prev_ob = np.float32(np.zeros(observation.shape)) + if obfilter: + observation = obfilter(observation) terminated = False - obs = [] - acs = [] - ac_dists = [] + observations = [] + actions = [] + action_dists = [] logps = [] rewards = [] for _ in range(max_pathlength): if animate: env.render() - state = np.concatenate([ob, prev_ob], -1) - obs.append(state) - ac, ac_dist, logp = policy.act(state) - acs.append(ac) - ac_dists.append(ac_dist) + state = np.concatenate([observation, prev_ob], -1) + observations.append(state) + action, ac_dist, logp = policy.act(state) + actions.append(action) + action_dists.append(ac_dist) logps.append(logp) - prev_ob = np.copy(ob) - scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low) + prev_ob = np.copy(observation) + scaled_ac = env.action_space.low + (action + 1.) * 0.5 * (env.action_space.high - env.action_space.low) scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high) - ob, rew, done, _ = env.step(scaled_ac) - if obfilter: ob = obfilter(ob) + observation, rew, done, _ = env.step(scaled_ac) + if obfilter: + observation = obfilter(observation) rewards.append(rew) if done: terminated = True break - return {"observation" : np.array(obs), "terminated" : terminated, - "reward" : np.array(rewards), "action" : np.array(acs), - "action_dist": np.array(ac_dists), "logp" : np.array(logps)} + return {"observation": np.array(observations), "terminated": terminated, + "reward": np.array(rewards), "action": np.array(actions), + "action_dist": np.array(action_dists), "logp": np.array(logps)} -def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, - animate=False, callback=None, desired_kl=0.002): +def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps, + animate=False, callback=None, desired_kl=0.002): + """ + Traines an ACKTR model. + + :param env: (Gym environment) The environment to learn from + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) + :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...) 
+ :param gamma: (float) The discount value + :param lam: (float) the tradeoff between exploration and exploitation + :param timesteps_per_batch: (int) the number of timesteps for each batch + :param num_timesteps: (int) the total number of timesteps to run + :param animate: (bool) if render env + :param callback: (function) called every step, used for logging and saving + :param desired_kl: (float) the Kullback leibler weight for the loss + """ obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info - optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ - epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, - weight_decay_dict=policy.wd_dict, max_grad_norm=None) + optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9, kfac_update=2, + epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, + weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) - do_update = U.function(inputs, update_op) - U.initialize() + do_update = tf_util.function(inputs, update_op) + tf_util.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() - for qr in [q_runner, vf.q_runner]: - assert (qr != None) - enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) + for queue_runner in [q_runner, value_fn.q_runner]: + assert queue_runner is not None + enqueue_threads.extend(queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break - logger.log("********** Iteration %i ************"%i) + logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: - path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) + path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), + obfilter=obfilter) paths.append(path) - n = pathlength(path) - timesteps_this_batch += n - timesteps_so_far += n + timesteps_this_batch += path["reward"].shape[0] + timesteps_so_far += path["reward"].shape[0] if timesteps_this_batch > timesteps_per_batch: break @@ -98,13 +124,13 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) - vpred_t = vf.predict(path) + vpred_t = value_fn.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) - delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] + delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function - vf.fit(paths, vtargs) + value_fn.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) @@ -119,20 +145,20 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize - kl = policy.compute_kl(ob_no, oldac_dist) - if kl > desired_kl * 2: + kl_loss = 
policy.compute_kl(ob_no, oldac_dist) + if kl_loss > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() - elif kl < desired_kl / 2: + elif kl_loss < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) - logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) - logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) - logger.record_tabular("KL", kl) + logger.record_tabular("EpRewSEM", np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths])) + logger.record_tabular("EpLenMean", np.mean([path["reward"].shape[0] for path in paths])) + logger.record_tabular("KL", kl_loss) if callback: callback() logger.dump_tabular() diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py index a8b77b6fd5..9bcd7b34d0 100644 --- a/baselines/acktr/acktr_disc.py +++ b/baselines/acktr/acktr_disc.py @@ -1,80 +1,99 @@ -import os.path as osp +""" +Discrete acktr +""" + +import os import time import joblib -import numpy as np + import tensorflow as tf -from baselines import logger +from baselines import logger from baselines.common import set_global_seeds, explained_variance - from baselines.a2c.a2c import Runner -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse +from baselines.a2c.utils import Scheduler, find_trainable_variables, calc_entropy, mse from baselines.acktr import kfac class Model(object): + def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20, + ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, + kfac_clip=0.001, lr_schedule='linear'): + """ + The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 + + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Space) The observation space + :param ac_space: (Gym Space) The action space + :param n_envs: (int) The number of environments + :param total_timesteps: (int) The total number of timesteps for training the model + :param nprocs: (int) The number of threads for TensorFlow operations + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) The weight for the entropic loss + :param vf_coef: (float) The weight for the loss on the value function + :param vf_fisher_coef: (float) The weight for the fisher loss on the value function + :param learning_rate: (float) The initial learning rate for the RMS prop optimizer + :param max_grad_norm: (float) The clipping value for the maximum gradient + :param kfac_clip: (float) gradient clipping for Kullback leiber + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ - def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) - nact = ac_space.n - nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) - - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) - - logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - self.logits = logits = train_model.pi - - ##training loss - pg_loss = tf.reduce_mean(ADV*logpac) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + n_batch = n_envs * n_steps + action_ph = tf.placeholder(tf.int32, [n_batch]) + advs_ph = tf.placeholder(tf.float32, [n_batch]) + rewards_ph = tf.placeholder(tf.float32, [n_batch]) + pg_lr_ph = tf.placeholder(tf.float32, []) + + self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) + + logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph) + self.logits = train_model.policy + + # training loss + pg_loss = tf.reduce_mean(advs_ph * logpac) + entropy = tf.reduce_mean(calc_entropy(train_model.policy)) pg_loss = pg_loss - ent_coef * entropy - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) + vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + vf_coef * vf_loss - - ##Fisher loss construction + # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) - sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) - self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) - self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss + sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn)) + self.vf_fisher = vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean( + 
tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) + self.joint_fisher = pg_fisher_loss + vf_fisher_loss - self.params=params = find_trainable_variables("model") + self.params = params = find_trainable_variables("model") - self.grads_check = grads = tf.gradients(train_loss,params) + self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): - self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ - momentum=0.9, kfac_update=1, epsilon=0.01,\ - stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) + self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=kfac_clip, + momentum=0.9, kfac_update=1, epsilon=0.01, + stats_decay=0.99, async=1, cold_iter=10, + max_grad_norm=max_grad_norm) - update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) - train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) + optim.compute_and_apply_stats(self.joint_fisher, var_list=params) + train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values - for step in range(len(obs)): - cur_lr = self.lr.value() + for _ in range(len(obs)): + cur_lr = self.learning_rate.value() - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} + td_map = {train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, rewards_ph: rewards, pg_lr_ph: cur_lr} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], @@ -83,18 +102,16 @@ def train(obs, states, rewards, masks, actions, values): return policy_loss, value_loss, policy_entropy def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) + session_params = sess.run(params) + joblib.dump(session_params, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) - - self.train = train self.save = save self.load = load @@ -105,49 +122,70 @@ def load(load_path): self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, save_interval=None, lrschedule='linear'): - tf.reset_default_graph() + +def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, n_steps=20, + ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, + kfac_clip=0.001, save_interval=None, lr_schedule='linear'): + """ + Traines an ACKTR model. + + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param env: (Gym environment) The environment to learn from + :param seed: (int) The initial seed for training + :param total_timesteps: (int) The total number of samples + :param gamma: (float) Discount factor + :param log_interval: (int) The number of timesteps before logging. + :param nprocs: (int) The number of threads for TensorFlow operations + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) The weight for the entropic loss + :param vf_coef: (float) The weight for the loss on the value function + :param vf_fisher_coef: (float) The weight for the fisher loss on the value function + :param learning_rate: (float) The learning rate + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param kfac_clip: (float) gradient clipping for Kullback leiber + :param save_interval: (int) The number of timesteps before saving. + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ set_global_seeds(seed) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps - =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= - vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, - lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=nprocs, n_steps=n_steps, + ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, + learning_rate=learning_rate, + max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lr_schedule=lr_schedule) if save_interval and logger.get_dir(): import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) + with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as file_handler: + file_handler.write(cloudpickle.dumps(make_model)) model = make_model() - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - nbatch = nenvs*nsteps - tstart = time.time() + runner = Runner(env, model, n_steps=n_steps, gamma=gamma) + n_batch = n_envs * n_steps + t_start = time.time() coord = tf.train.Coordinator() enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) - for update in range(1, total_timesteps//nbatch+1): + for update in range(1, total_timesteps // n_batch + 1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) model.old_obs = obs - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) + n_seconds = time.time() - t_start + fps = int((update * n_batch) / n_seconds) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, rewards) + explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) + logger.record_tabular("total_timesteps", update * n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) + logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() if save_interval and 
(update % save_interval == 0 or update == 1) and logger.get_dir(): - savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) + savepath = os.path.join(logger.get_dir(), 'checkpoint%.5i' % update) print('Saving to', savepath) model.save(savepath) coord.request_stop() diff --git a/baselines/acktr/kfac.py b/baselines/acktr/kfac.py index b4208199dc..3fe2a317bd 100644 --- a/baselines/acktr/kfac.py +++ b/baselines/acktr/kfac.py @@ -1,16 +1,44 @@ -import tensorflow as tf -import numpy as np import re -from baselines.acktr.kfac_utils import * from functools import reduce +import tensorflow as tf +import numpy as np + +from baselines.acktr.kfac_utils import detect_min_val, factor_reshape, gmatmul + KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd'] KFAC_DEBUG = False -class KfacOptimizer(): - - def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5): +class KfacOptimizer: + def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, + full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, + stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approx_t2=False, + use_float64=False, weight_decay_dict=None, max_grad_norm=0.5): + """ + Kfac Optimizer for ACKTR models + link: https://arxiv.org/pdf/1708.05144.pdf + + :param learning_rate: (float) The learning rate + :param momentum: (float) The momentum value for the TensorFlow momentum optimizer + :param clip_kl: (float) gradient clipping for Kullback leiber + :param kfac_update: (int) update kfac after kfac_update steps + :param stats_accum_iter: (int) how may steps to accumulate stats + :param full_stats_init: (bool) whether or not to fully initalize stats + :param cold_iter: (int) Cold start learning rate for how many steps + :param cold_lr: (float) Cold start learning rate + :param async: (bool) Use async eigen decomposition + :param async_stats: (bool) Asynchronous stats update + :param epsilon: (float) epsilon value for small numbers + :param stats_decay: (float) the stats decay rate + :param blockdiag_bias: (bool) + :param channel_fac: (bool) factorization along the channels + :param factored_damping: (bool) use factored damping + :param approx_t2: (bool) approximate T2 act and grad fisher + :param use_float64: (bool) use 64-bit float + :param weight_decay_dict: (dict) custom weight decay coeff for a given gradient + :param max_grad_norm: (float) The maximum value for the gradient clipping + """ self.max_grad_norm = max_grad_norm self._lr = learning_rate self._momentum = momentum @@ -22,16 +50,18 @@ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2 self._epsilon = epsilon self._stats_decay = stats_decay self._blockdiag_bias = blockdiag_bias - self._approxT2 = approxT2 + self._approx_t2 = approx_t2 self._use_float64 = use_float64 self._factored_damping = factored_damping self._cold_iter = cold_iter - if cold_lr == None: + if cold_lr is None: # good heuristics - self._cold_lr = self._lr# * 3. + self._cold_lr = self._lr # * 3. 
else: self._cold_lr = cold_lr self._stats_accum_iter = stats_accum_iter + if weight_decay_dict is None: + weight_decay_dict = {} self._weight_decay_dict = weight_decay_dict self._diag_init_coeff = 0. self._full_stats_init = full_stats_init @@ -46,241 +76,252 @@ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2 0, name='KFAC/factor_step', trainable=False) self.stats_step = tf.Variable( 0, name='KFAC/stats_step', trainable=False) - self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False) + self.v_f_v = tf.Variable(0., name='KFAC/vFv', trainable=False) self.factors = {} self.param_vars = [] self.stats = {} self.stats_eigen = {} - def getFactors(self, g, varlist): - graph = tf.get_default_graph() - factorTensors = {} - fpropTensors = [] - bpropTensors = [] - opTypes = [] - fops = [] + def get_factors(self, gradients, varlist): + """ + get factors to update + + :param gradients: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The factors to update + """ + default_graph = tf.get_default_graph() + factor_tensors = {} + fprop_tensors = [] + bprop_tensors = [] + op_types = [] - def searchFactors(gradient, graph): + def _search_factors(gradient, graph): # hard coded search stratergy - bpropOp = gradient.op - bpropOp_name = bpropOp.name + bprop_op = gradient.op + bprop_op_name = bprop_op.name - bTensors = [] - fTensors = [] + b_tensors = [] + f_tensors = [] # combining additive gradient, assume they are the same op type and # indepedent - if 'AddN' in bpropOp_name: + if 'AddN' in bprop_op_name: factors = [] - for g in gradient.op.inputs: - factors.append(searchFactors(g, graph)) - op_names = [item['opName'] for item in factors] - # TO-DO: need to check all the attribute of the ops as well - print (gradient.name) - print (op_names) - print (len(np.unique(op_names))) - assert len(np.unique(op_names)) == 1, gradient.name + \ - ' is shared among different computation OPs' - - bTensors = reduce(lambda x, y: x + y, - [item['bpropFactors'] for item in factors]) + for grad in gradient.op.inputs: + factors.append(_search_factors(grad, graph)) + op_names = [_item['opName'] for _item in factors] + # TODO: need to check all the attribute of the ops as well + print(gradient.name) + print(op_names) + print(len(np.unique(op_names))) + assert len(np.unique(op_names)) == 1, \ + 'Error: {} is shared among different computation OPs'.format(gradient.name) + + b_tensors = reduce(lambda x, y: x + y, + [_item['bpropFactors'] for _item in factors]) if len(factors[0]['fpropFactors']) > 0: - fTensors = reduce( - lambda x, y: x + y, [item['fpropFactors'] for item in factors]) - fpropOp_name = op_names[0] - fpropOp = factors[0]['op'] + f_tensors = reduce( + lambda x, y: x + y, [_item['fpropFactors'] for _item in factors]) + fprop_op_name = op_names[0] + fprop_op = factors[0]['op'] else: - fpropOp_name = re.search( - 'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2) - fpropOp = graph.get_operation_by_name(fpropOp_name) - if fpropOp.op_def.name in KFAC_OPS: + fprop_op_name = re.search('gradientsSampled(_[0-9]+|)/(.+?)_grad', bprop_op_name).group(2) + fprop_op = graph.get_operation_by_name(fprop_op_name) + if fprop_op.op_def.name in KFAC_OPS: # Known OPs - ### - bTensor = [ - i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1] - bTensorShape = fpropOp.outputs[0].get_shape() - if bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - ### - if 
fpropOp.op_def.name == 'BiasAdd': - fTensors = [] + b_tensor = [_i for _i in bprop_op.inputs if 'gradientsSampled' in _i.name][-1] + b_tensor_shape = fprop_op.outputs[0].get_shape() + if b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + + if fprop_op.op_def.name == 'BiasAdd': + f_tensors = [] else: - fTensors.append( - [i for i in fpropOp.inputs if param.op.name not in i.name][0]) - fpropOp_name = fpropOp.op_def.name + f_tensors.append([_i for _i in fprop_op.inputs if param.op.name not in _i.name][0]) + fprop_op_name = fprop_op.op_def.name else: # unknown OPs, block approximation used - bInputsList = [i for i in bpropOp.inputs[ - 0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name] - if len(bInputsList) > 0: - bTensor = bInputsList[0] - bTensorShape = fpropOp.outputs[0].get_shape() - if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name) - - return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors} - - for t, param in zip(g, varlist): + b_inputs_list = [_i for _i in bprop_op.inputs[0].op.inputs + if 'gradientsSampled' in _i.name if 'Shape' not in _i.name] + if len(b_inputs_list) > 0: + b_tensor = b_inputs_list[0] + # only if tensor shape is defined, usually this will prevent tensor like Sum:0 to be used. + if b_tensor.get_shape(): + b_tensor_shape = fprop_op.outputs[0].get_shape() + if len(b_tensor.get_shape()) > 0 and b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + fprop_op_name = op_types.append('UNK-' + fprop_op.op_def.name) + + return {'opName': fprop_op_name, 'op': fprop_op, 'fpropFactors': f_tensors, 'bpropFactors': b_tensors} + + for _grad, param in zip(gradients, varlist): if KFAC_DEBUG: - print(('get factor for '+param.name)) - factors = searchFactors(t, graph) - factorTensors[param] = factors + print(('get factor for ' + param.name)) + found_factors = _search_factors(_grad, default_graph) + factor_tensors[param] = found_factors - ######## # check associated weights and bias for homogeneous coordinate representation # and check redundent factors - # TO-DO: there may be a bug to detect associate bias and weights for - # forking layer, e.g. in inception models. + # TODO: there may be a bug to detect associate bias and weights for forking layer, e.g. in inception models. 
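
# Reviewer sketch, not part of the patch: what the fprop/bprop factors collected
# above feed into. For a dense layer, K-FAC keeps a covariance of the forward
# activations (the "fprop" factor) and a covariance of the backpropagated
# pre-activation gradients (the "bprop" factor), and approximates that layer's
# Fisher block by their Kronecker product. All names and shapes below are
# illustrative only.
import numpy as np

batch_size, n_in, n_out = 32, 8, 4
acts = np.random.randn(batch_size, n_in)             # layer inputs
pre_act_grads = np.random.randn(batch_size, n_out)   # dL/d(pre-activation)

# Homogeneous coordinate: append a constant 1 so the bias shares the same
# factor as the weights (this is what assnBias/assnWeights track above).
acts_h = np.concatenate([acts, np.ones((batch_size, 1))], axis=1)

factor_a = acts_h.T @ acts_h / batch_size                 # A ~ E[a a^T]
factor_g = pre_act_grads.T @ pre_act_grads / batch_size   # G ~ E[g g^T]
# The Fisher block is approximated as kron(A, G); it is never formed
# explicitly and is inverted factor by factor instead.
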
for param in varlist: - factorTensors[param]['assnWeights'] = None - factorTensors[param]['assnBias'] = None + factor_tensors[param]['assnWeights'] = None + factor_tensors[param]['assnBias'] = None for param in varlist: - if factorTensors[param]['opName'] == 'BiasAdd': - factorTensors[param]['assnWeights'] = None + if factor_tensors[param]['opName'] == 'BiasAdd': + factor_tensors[param]['assnWeights'] = None for item in varlist: - if len(factorTensors[item]['bpropFactors']) > 0: - if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0): - factorTensors[param]['assnWeights'] = item - factorTensors[item]['assnBias'] = param - factorTensors[param]['bpropFactors'] = factorTensors[ + if len(factor_tensors[item]['bpropFactors']) > 0: + if (set(factor_tensors[item]['bpropFactors']) == set(factor_tensors[param]['bpropFactors'])) \ + and (len(factor_tensors[item]['fpropFactors']) > 0): + factor_tensors[param]['assnWeights'] = item + factor_tensors[item]['assnBias'] = param + factor_tensors[param]['bpropFactors'] = factor_tensors[ item]['bpropFactors'] - ######## - - ######## - # concatenate the additive gradients along the batch dimension, i.e. - # assuming independence structure + # concatenate the additive gradients along the batch dimension, i.e. assuming independence structure for key in ['fpropFactors', 'bpropFactors']: for i, param in enumerate(varlist): - if len(factorTensors[param][key]) > 0: - if (key + '_concat') not in factorTensors[param]: - name_scope = factorTensors[param][key][0].name.split(':')[ + if len(factor_tensors[param][key]) > 0: + if (key + '_concat') not in factor_tensors[param]: + name_scope = factor_tensors[param][key][0].name.split(':')[ 0] with tf.name_scope(name_scope): - factorTensors[param][ - key + '_concat'] = tf.concat(factorTensors[param][key], 0) + factor_tensors[param][ + key + '_concat'] = tf.concat(factor_tensors[param][key], 0) else: - factorTensors[param][key + '_concat'] = None - for j, param2 in enumerate(varlist[(i + 1):]): - if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])): - factorTensors[param2][key] = factorTensors[param][key] - factorTensors[param2][ - key + '_concat'] = factorTensors[param][key + '_concat'] - ######## + factor_tensors[param][key + '_concat'] = None + for _, param2 in enumerate(varlist[(i + 1):]): + if (len(factor_tensors[param][key]) > 0) and ( + set(factor_tensors[param2][key]) == set(factor_tensors[param][key])): + factor_tensors[param2][key] = factor_tensors[param][key] + factor_tensors[param2][ + key + '_concat'] = factor_tensors[param][key + '_concat'] if KFAC_DEBUG: - for items in zip(varlist, fpropTensors, bpropTensors, opTypes): - print((items[0].name, factorTensors[item])) - self.factors = factorTensors - return factorTensors + for items in zip(varlist, fprop_tensors, bprop_tensors, op_types): + print((items[0].name, factor_tensors[item])) + self.factors = factor_tensors + return factor_tensors + + def get_stats(self, factors, varlist): + """ + return the stats values from the factors to update and the parameters - def getStats(self, factors, varlist): + :param factors: ([TensorFlow Tensor]) The factors to update + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The stats values + """ if len(self.stats) == 0: # initialize stats variables on CPU because eigen decomp is # computed on CPU with tf.device('/cpu'): - tmpStatsCache = {} + 
tmp_stats_cache = {} # search for tensor factors and # use block diag approx for the bias units for var in varlist: - fpropFactor = factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + if op_type == 'Conv2D': + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: # factorization along the channels do not support # homogeneous coordinate - var_assnBias = factors[var]['assnBias'] - if var_assnBias: + var_assn_bias = factors[var]['assnBias'] + if var_assn_bias: factors[var]['assnBias'] = None - factors[var_assnBias]['assnWeights'] = None - ## + factors[var_assn_bias]['assnWeights'] = None for var in varlist: - fpropFactor = factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - self.stats[var] = {'opName': opType, + fprop_factor = factors[var]['fpropFactors_concat'] + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + self.stats[var] = {'opName': op_type, 'fprop_concat_stats': [], 'bprop_concat_stats': [], 'assnWeights': factors[var]['assnWeights'], 'assnBias': factors[var]['assnBias'], } - if fpropFactor is not None: - if fpropFactor not in tmpStatsCache: - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: + if fprop_factor is not None: + if fprop_factor not in tmp_stats_cache: + if op_type == 'Conv2D': + kernel_height = var.get_shape()[0] + kernel_width = var.get_shape()[1] + n_channels = fprop_factor.get_shape()[-1] + + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: # factorization along the channels # assume independence between input channels and spatial # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix # factorization along the channels do not # support homogeneous coordinate, assnBias # is always None - fpropFactor2_size = Kh * Kw - slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones( - [fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) + fprop_factor2_size = kernel_height * kernel_width + slot_fprop_factor_stats2 = tf.Variable(tf.diag(tf.ones( + [fprop_factor2_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, + trainable=False) self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats2) + slot_fprop_factor_stats2) - fpropFactor_size = C + fprop_factor_size = n_channels else: # 2K-1 x 2K-1 x C x C covariance matrix # assume BHWC - fpropFactor_size = Kh * Kw * C + fprop_factor_size = kernel_height * kernel_width * n_channels else: # D x D covariance matrix - fpropFactor_size = fpropFactor.get_shape()[-1] + fprop_factor_size = fprop_factor.get_shape()[-1] # use homogeneous coordinate if not self._blockdiag_bias and self.stats[var]['assnBias']: - fpropFactor_size += 1 + fprop_factor_size += 1 - 
slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones( - [fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) + slot_fprop_factor_stats = tf.Variable( + tf.diag(tf.ones([fprop_factor_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, trainable=False) self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats) - if opType != 'Conv2D': - tmpStatsCache[fpropFactor] = self.stats[ + slot_fprop_factor_stats) + if op_type != 'Conv2D': + tmp_stats_cache[fprop_factor] = self.stats[ var]['fprop_concat_stats'] else: self.stats[var][ - 'fprop_concat_stats'] = tmpStatsCache[fpropFactor] + 'fprop_concat_stats'] = tmp_stats_cache[fprop_factor] - if bpropFactor is not None: + if bprop_factor is not None: # no need to collect backward stats for bias vectors if # using homogeneous coordinates - if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']): - if bpropFactor not in tmpStatsCache: - slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape( - )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False) + if not ((not self._blockdiag_bias) and self.stats[var]['assnWeights']): + if bprop_factor not in tmp_stats_cache: + slot_bprop_factor_stats = tf.Variable(tf.diag(tf.ones([bprop_factor.get_shape( + )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bprop_factor.op.name, + trainable=False) self.stats[var]['bprop_concat_stats'].append( - slot_bpropFactor_stats) - tmpStatsCache[bpropFactor] = self.stats[ + slot_bprop_factor_stats) + tmp_stats_cache[bprop_factor] = self.stats[ var]['bprop_concat_stats'] else: self.stats[var][ - 'bprop_concat_stats'] = tmpStatsCache[bpropFactor] + 'bprop_concat_stats'] = tmp_stats_cache[bprop_factor] return self.stats def compute_and_apply_stats(self, loss_sampled, var_list=None): + """ + compute and apply stats + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: (function) apply stats + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() @@ -289,206 +330,211 @@ def compute_and_apply_stats(self, loss_sampled, var_list=None): return self.apply_stats(stats) def compute_stats(self, loss_sampled, var_list=None): + """ + compute the stats values + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) stats updates + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() - gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled') - self.gs = gs - factors = self.getFactors(gs, varlist) - stats = self.getStats(factors, varlist) + gradient_sampled = tf.gradients(loss_sampled, varlist, name='gradientsSampled') + self.gradient_sampled = gradient_sampled + factors = self.get_factors(gradient_sampled, varlist) + stats = self.get_stats(factors, varlist) - updateOps = [] - statsUpdates = {} - statsUpdates_cache = {} + update_ops = [] + stats_updates = {} + stats_updates_cache = {} for var in varlist: - opType = factors[var]['opName'] + op_type = factors[var]['opName'] fops = factors[var]['op'] - fpropFactor = factors[var]['fpropFactors_concat'] - fpropStats_vars = stats[var]['fprop_concat_stats'] - bpropFactor = factors[var]['bpropFactors_concat'] - bpropStats_vars = stats[var]['bprop_concat_stats'] - SVD_factors = {} - for stats_var in fpropStats_vars: + fprop_factor = 
factors[var]['fpropFactors_concat'] + fprop_stats_vars = stats[var]['fprop_concat_stats'] + bprop_factor = factors[var]['bpropFactors_concat'] + bprop_stats_vars = stats[var]['bprop_concat_stats'] + svd_factors = {} + for stats_var in fprop_stats_vars: stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_fpropFactor = fpropFactor - B = (tf.shape(fpropFactor)[0]) # batch size - if opType == 'Conv2D': + if stats_var not in stats_updates_cache: + batch_size = (tf.shape(fprop_factor)[0]) # batch size + if op_type == 'Conv2D': strides = fops.get_attr("strides") padding = fops.get_attr("padding") convkernel_size = var.get_shape()[0:3] - KH = int(convkernel_size[0]) - KW = int(convkernel_size[1]) - C = int(convkernel_size[2]) - flatten_size = int(KH * KW * C) + kernel_height = int(convkernel_size[0]) + kernel_width = int(convkernel_size[1]) + chan = int(convkernel_size[2]) + flatten_size = int(kernel_height * kernel_width * chan) - Oh = int(bpropFactor.get_shape()[1]) - Ow = int(bpropFactor.get_shape()[2]) + operator_height = int(bprop_factor.get_shape()[1]) + operator_width = int(bprop_factor.get_shape()[2]) - if Oh == 1 and Ow == 1 and self._channel_fac: - # factorization along the channels - # assume independence among input channels - # factor = B x 1 x 1 x (KH xKW x C) - # patches = B x Oh x Ow x (KH xKW x C) - if len(SVD_factors) == 0: + if operator_height == 1 and operator_width == 1 and self._channel_fac: + # factorization along the channels + # assume independence among input channels + # factor = B x 1 x 1 x (KH xKW x C) + # patches = B x Oh x Ow x (KH xKW x C) + if len(svd_factors) == 0: if KFAC_DEBUG: - print(('approx %s act factor with rank-1 SVD factors' % (var.name))) + print(('approx %s act factor with rank-1 SVD factors' % var.name)) # find closest rank-1 approx to the feature map S, U, V = tf.batch_svd(tf.reshape( - fpropFactor, [-1, KH * KW, C])) + fprop_factor, [-1, kernel_height * kernel_width, chan])) # get rank-1 approx slides - sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) - patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW - full_factor_shape = fpropFactor.get_shape() + sqrt_s1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) + patches_k = U[:, :, 0] * sqrt_s1 # B x KH*KW + full_factor_shape = fprop_factor.get_shape() patches_k.set_shape( - [full_factor_shape[0], KH * KW]) - patches_c = V[:, :, 0] * sqrtS1 # B x C - patches_c.set_shape([full_factor_shape[0], C]) - SVD_factors[C] = patches_c - SVD_factors[KH * KW] = patches_k - fpropFactor = SVD_factors[stats_var_dim] + [full_factor_shape[0], kernel_height * kernel_width]) + patches_c = V[:, :, 0] * sqrt_s1 # B x C + patches_c.set_shape([full_factor_shape[0], chan]) + svd_factors[chan] = patches_c + svd_factors[kernel_height * kernel_width] = patches_k + fprop_factor = svd_factors[stats_var_dim] else: # poor mem usage implementation - patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[ - 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) + patches = tf.extract_image_patches(fprop_factor, ksizes=[1, convkernel_size[ + 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) - if self._approxT2: + if self._approx_t2: if KFAC_DEBUG: - print(('approxT2 act fisher for %s' % (var.name))) + print(('approxT2 act fisher for %s' % var.name)) # T^2 terms * 1/T^2, size: B x C - fpropFactor = tf.reduce_mean(patches, [1, 2]) + fprop_factor = tf.reduce_mean(patches, [1, 2]) else: # size: (B x Oh x Ow) x C - fpropFactor = 
tf.reshape( - patches, [-1, flatten_size]) / Oh / Ow - fpropFactor_size = int(fpropFactor.get_shape()[-1]) - if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias: - if opType == 'Conv2D' and not self._approxT2: + fprop_factor = tf.reshape( + patches, [-1, flatten_size]) / operator_height / operator_width + fprop_factor_size = int(fprop_factor.get_shape()[-1]) + if stats_var_dim == (fprop_factor_size + 1) and not self._blockdiag_bias: + if op_type == 'Conv2D' and not self._approx_t2: # correct padding for numerical stability (we # divided out OhxOw from activations for T1 approx) - fpropFactor = tf.concat([fpropFactor, tf.ones( - [tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1) + fprop_factor = tf.concat([fprop_factor, tf.ones( + [tf.shape(fprop_factor)[0], 1]) / operator_height / operator_width], 1) else: # use homogeneous coordinates - fpropFactor = tf.concat( - [fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1) + fprop_factor = tf.concat( + [fprop_factor, tf.ones([tf.shape(fprop_factor)[0], 1])], 1) # average over the number of data points in a batch # divided by B - cov = tf.matmul(fpropFactor, fpropFactor, - transpose_a=True) / tf.cast(B, tf.float32) - updateOps.append(cov) - statsUpdates[stats_var] = cov - if opType != 'Conv2D': + cov = tf.matmul(fprop_factor, fprop_factor, + transpose_a=True) / tf.cast(batch_size, tf.float32) + update_ops.append(cov) + stats_updates[stats_var] = cov + if op_type != 'Conv2D': # HACK: for convolution we recompute fprop stats for # every layer including forking layers - statsUpdates_cache[stats_var] = cov - - for stats_var in bpropStats_vars: - stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_bpropFactor = bpropFactor - bpropFactor_shape = bpropFactor.get_shape() - B = tf.shape(bpropFactor)[0] # batch size - C = int(bpropFactor_shape[-1]) # num channels - if opType == 'Conv2D' or len(bpropFactor_shape) == 4: - if fpropFactor is not None: - if self._approxT2: + stats_updates_cache[stats_var] = cov + + for stats_var in bprop_stats_vars: + if stats_var not in stats_updates_cache: + bprop_factor_shape = bprop_factor.get_shape() + batch_size = tf.shape(bprop_factor)[0] # batch size + chan = int(bprop_factor_shape[-1]) # num channels + if op_type == 'Conv2D' or len(bprop_factor_shape) == 4: + if fprop_factor is not None: + if self._approx_t2: if KFAC_DEBUG: - print(('approxT2 grad fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum( - bpropFactor, [1, 2]) # T^2 terms * 1/T^2 + print(('approxT2 grad fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum( + bprop_factor, [1, 2]) # T^2 terms * 1/T^2 else: - bpropFactor = tf.reshape( - bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms + bprop_factor = tf.reshape( + bprop_factor, [-1, chan]) * operator_height * operator_width # T * 1/T terms else: # just doing block diag approx. spatial independent # structure does not apply here. summing over # spatial locations if KFAC_DEBUG: - print(('block diag approx fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum(bpropFactor, [1, 2]) + print(('block diag approx fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum(bprop_factor, [1, 2]) - # assume sampled loss is averaged. TO-DO:figure out better + # assume sampled loss is averaged. 
TODO:figure out better # way to handle this - bpropFactor *= tf.to_float(B) + bprop_factor *= tf.to_float(batch_size) ## cov_b = tf.matmul( - bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0]) + bprop_factor, bprop_factor, transpose_a=True) / tf.to_float(tf.shape(bprop_factor)[0]) - updateOps.append(cov_b) - statsUpdates[stats_var] = cov_b - statsUpdates_cache[stats_var] = cov_b + update_ops.append(cov_b) + stats_updates[stats_var] = cov_b + stats_updates_cache[stats_var] = cov_b if KFAC_DEBUG: - aKey = list(statsUpdates.keys())[0] - statsUpdates[aKey] = tf.Print(statsUpdates[aKey], - [tf.convert_to_tensor('step:'), - self.global_step, - tf.convert_to_tensor( - 'computing stats'), - ]) - self.statsUpdates = statsUpdates - return statsUpdates - - def apply_stats(self, statsUpdates): - """ compute stats and update/apply the new stats to the running average + a_key = list(stats_updates.keys())[0] + stats_updates[a_key] = tf.Print(stats_updates[a_key], [tf.convert_to_tensor('step:'), self.global_step, + tf.convert_to_tensor('computing stats')]) + self.stats_updates = stats_updates + return stats_updates + + def apply_stats(self, stats_updates): + """ + compute stats and update/apply the new stats to the running average + + :param stats_updates: ([TensorFlow Tensor]) The stats updates + :return: (function) update stats operation """ - def updateAccumStats(): + def _update_accum_stats(): if self._full_stats_init: - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op) + return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. / self._stats_accum_iter)), + tf.no_op) else: - return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)) + return tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. 
/ self._stats_accum_iter)) - def updateRunningAvgStats(statsUpdates, fac_iter=1): - # return tf.cond(tf.greater_equal(self.factor_step, - # tf.convert_to_tensor(fac_iter)), lambda: - # tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op) - return tf.group(*self._apply_stats(statsUpdates)) + def _update_running_avg_stats(stats_updates): + return tf.group(*self._apply_stats(stats_updates)) if self._async_stats: # asynchronous stats update - update_stats = self._apply_stats(statsUpdates) + update_stats = self._apply_stats(stats_updates) queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[ - item.get_shape() for item in update_stats]) + item.get_shape() for item in update_stats]) enqueue_op = queue.enqueue(update_stats) def dequeue_stats_op(): return queue.dequeue() + self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op]) update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor( 0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ])) else: # synchronous stats update - update_stats_op = tf.cond(tf.greater_equal( - self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats) + update_stats_op = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: _update_running_avg_stats(stats_updates), _update_accum_stats) self._update_stats_op = update_stats_op return update_stats_op - def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.): - updateOps = [] + def _apply_stats(self, stats_updates, accumulate=False, accumulate_coeff=0.): + update_ops = [] # obtain the stats var list - for stats_var in statsUpdates: - stats_new = statsUpdates[stats_var] + for stats_var in stats_updates: + stats_new = stats_updates[stats_var] if accumulate: # simple superbatch averaging update_op = tf.assign_add( - stats_var, accumulateCoeff * stats_new, use_locking=True) + stats_var, accumulate_coeff * stats_new, use_locking=True) else: # exponential running averaging update_op = tf.assign( stats_var, stats_var * self._stats_decay, use_locking=True) update_op = tf.assign_add( update_op, (1. 
- self._stats_decay) * stats_new, use_locking=True) - updateOps.append(update_op) + update_ops.append(update_op) - with tf.control_dependencies(updateOps): + with tf.control_dependencies(update_ops): stats_step_op = tf.assign_add(self.stats_step, 1) if KFAC_DEBUG: @@ -502,120 +548,115 @@ def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.): tf.convert_to_tensor('Accum:'), tf.convert_to_tensor(accumulate), tf.convert_to_tensor('Accum coeff:'), - tf.convert_to_tensor(accumulateCoeff), + tf.convert_to_tensor(accumulate_coeff), tf.convert_to_tensor('stat step:'), - self.stats_step, updateOps[0], updateOps[1]])) + self.stats_step, update_ops[0], update_ops[1]])) return [stats_step_op, ] - def getStatsEigen(self, stats=None): + def get_stats_eigen(self, stats=None): + """ + Return the eigen values from the stats + + :param stats: ([TensorFlow Tensor]) The stats + :return: ([TensorFlow Tensor]) The stats eigen values + """ if len(self.stats_eigen) == 0: stats_eigen = {} if stats is None: stats = self.stats - tmpEigenCache = {} + tmp_eigen_cache = {} with tf.device('/cpu:0'): for var in stats: for key in ['fprop_concat_stats', 'bprop_concat_stats']: for stats_var in stats[var][key]: - if stats_var not in tmpEigenCache: + if stats_var not in tmp_eigen_cache: stats_dim = stats_var.get_shape()[1].value - e = tf.Variable(tf.ones( - [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False) - Q = tf.Variable(tf.diag(tf.ones( - [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False) - stats_eigen[stats_var] = {'e': e, 'Q': Q} - tmpEigenCache[ + eigen_values = tf.Variable(tf.ones( + [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', + trainable=False) + eigen_vectors = tf.Variable(tf.diag(tf.ones( + [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', + trainable=False) + stats_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + tmp_eigen_cache[ stats_var] = stats_eigen[stats_var] else: - stats_eigen[stats_var] = tmpEigenCache[ + stats_eigen[stats_var] = tmp_eigen_cache[ stats_var] self.stats_eigen = stats_eigen return self.stats_eigen - def computeStatsEigen(self): - """ compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """ - # TO-DO: figure out why this op has delays (possibly moving - # eigenvectors around?) - with tf.device('/cpu:0'): - def removeNone(tensor_list): - local_list = [] - for item in tensor_list: - if item is not None: - local_list.append(item) - return local_list - - def copyStats(var_list): - print("copying stats to buffer tensors before eigen decomp") - redundant_stats = {} - copied_list = [] - for item in var_list: - if item is not None: - if item not in redundant_stats: - if self._use_float64: - redundant_stats[item] = tf.cast( - tf.identity(item), tf.float64) - else: - redundant_stats[item] = tf.identity(item) - copied_list.append(redundant_stats[item]) - else: - copied_list.append(None) - return copied_list - #stats = [copyStats(self.fStats), copyStats(self.bStats)] - #stats = [self.fStats, self.bStats] + def compute_stats_eigen(self): + """ + compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue + :return: ([TensorFlow Tensor]) update operations + """ + # TODO: figure out why this op has delays (possibly moving eigenvectors around?) 
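
# Reviewer sketch, not part of the patch: each stats factor is a symmetric
# covariance matrix, so the eigendecomposition computed by this method
# (factor = Q diag(e) Q^T, via tf.self_adjoint_eig) lets the later update
# invert it with damping elementwise in the eigenbasis. Rough NumPy
# equivalent, with illustrative shapes and a plain additive damping:
import numpy as np

rng = np.random.default_rng(0)
samples = rng.standard_normal((256, 16))
factor = samples.T @ samples / samples.shape[0]        # symmetric PSD stand-in

eigvals, eigvecs = np.linalg.eigh(factor)              # e, Q
damping = 1e-2
inv_factor = eigvecs @ np.diag(1.0 / (eigvals + damping)) @ eigvecs.T
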
+ with tf.device('/cpu:0'): stats_eigen = self.stats_eigen - computedEigen = {} + computed_eigen = {} eigen_reverse_lookup = {} - updateOps = [] + update_ops = [] # sync copied stats - # with tf.control_dependencies(removeNone(stats[0]) + - # removeNone(stats[1])): with tf.control_dependencies([]): for stats_var in stats_eigen: - if stats_var not in computedEigen: - eigens = tf.self_adjoint_eig(stats_var) - e = eigens[0] - Q = eigens[1] + if stats_var not in computed_eigen: + eigen_decomposition = tf.self_adjoint_eig(stats_var) + eigen_values = eigen_decomposition[0] + eigen_vectors = eigen_decomposition[1] if self._use_float64: - e = tf.cast(e, tf.float32) - Q = tf.cast(Q, tf.float32) - updateOps.append(e) - updateOps.append(Q) - computedEigen[stats_var] = {'e': e, 'Q': Q} - eigen_reverse_lookup[e] = stats_eigen[stats_var]['e'] - eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q'] + eigen_values = tf.cast(eigen_values, tf.float64) + eigen_vectors = tf.cast(eigen_vectors, tf.float64) + update_ops.append(eigen_values) + update_ops.append(eigen_vectors) + computed_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + eigen_reverse_lookup[eigen_values] = stats_eigen[stats_var]['e'] + eigen_reverse_lookup[eigen_vectors] = stats_eigen[stats_var]['Q'] self.eigen_reverse_lookup = eigen_reverse_lookup - self.eigen_update_list = updateOps + self.eigen_update_list = update_ops if KFAC_DEBUG: - self.eigen_update_list = [item for item in updateOps] - with tf.control_dependencies(updateOps): - updateOps.append(tf.Print(tf.constant( + self.eigen_update_list = [item for item in update_ops] + with tf.control_dependencies(update_ops): + update_ops.append(tf.Print(tf.constant( 0.), [tf.convert_to_tensor('computed factor eigen')])) - return updateOps + return update_ops + + def apply_stats_eigen(self, eigen_list): + """ + apply the update using the eigen values of the stats - def applyStatsEigen(self, eigen_list): - updateOps = [] + :param eigen_list: ([TensorFlow Tensor]) The list of eigen values of the stats + :return: ([TensorFlow Tensor]) update operations + """ + update_ops = [] print(('updating %d eigenvalue/vectors' % len(eigen_list))) - for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): + for _, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): stats_eigen_var = self.eigen_reverse_lookup[mark] - updateOps.append( + update_ops.append( tf.assign(stats_eigen_var, tensor, use_locking=True)) - with tf.control_dependencies(updateOps): + with tf.control_dependencies(update_ops): factor_step_op = tf.assign_add(self.factor_step, 1) - updateOps.append(factor_step_op) + update_ops.append(factor_step_op) if KFAC_DEBUG: - updateOps.append(tf.Print(tf.constant( + update_ops.append(tf.Print(tf.constant( 0.), [tf.convert_to_tensor('updated kfac factors')])) - return updateOps + return update_ops + + def get_kfac_precond_updates(self, gradlist, varlist): + """ + return the KFAC updates - def getKfacPrecondUpdates(self, gradlist, varlist): - updatelist = [] - vg = 0. + :param gradlist: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the update list + """ + v_g = 0. 
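Because each accumulated factor is symmetric, tf.self_adjoint_eig used in compute_stats_eigen returns the same decomposition as numpy.linalg.eigh; the eigen step is roughly the following sketch (illustrative names, and the optional float64 cast stands in for the _use_float64 switch):

import numpy as np

def compute_factor_eigen(factor, use_float64=False):
    # factor: a symmetric, approximately PSD statistics matrix
    if use_float64:
        factor = factor.astype(np.float64)
    eigen_values, eigen_vectors = np.linalg.eigh(factor)
    return eigen_values, eigen_vectors

factor = np.random.randn(4, 4)
factor = factor @ factor.T / 4.0
e, q = compute_factor_eigen(factor)
# Q diag(e) Q^T reconstructs the factor; apply_stats_eigen stores e and Q
# back into the variables created by get_stats_eigen
assert np.allclose(q @ np.diag(e) @ q.T, factor)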
assert len(self.stats) > 0 assert len(self.stats_eigen) > 0 @@ -625,220 +666,223 @@ def getKfacPrecondUpdates(self, gradlist, varlist): grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} for grad, var in zip(gradlist, varlist): - GRAD_RESHAPE = False - GRAD_TRANSPOSE = False + grad_reshape = False - fpropFactoredFishers = self.stats[var]['fprop_concat_stats'] - bpropFactoredFishers = self.stats[var]['bprop_concat_stats'] + fprop_factored_fishers = self.stats[var]['fprop_concat_stats'] + bprop_factored_fishers = self.stats[var]['bprop_concat_stats'] - if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0: + if (len(fprop_factored_fishers) + len(bprop_factored_fishers)) > 0: counter += 1 - GRAD_SHAPE = grad.get_shape() + grad_shape = grad.get_shape() if len(grad.get_shape()) > 2: # reshape conv kernel parameters - KW = int(grad.get_shape()[0]) - KH = int(grad.get_shape()[1]) - C = int(grad.get_shape()[2]) - D = int(grad.get_shape()[3]) + kernel_width = int(grad.get_shape()[0]) + kernel_height = int(grad.get_shape()[1]) + n_channels = int(grad.get_shape()[2]) + depth = int(grad.get_shape()[3]) - if len(fpropFactoredFishers) > 1 and self._channel_fac: + if len(fprop_factored_fishers) > 1 and self._channel_fac: # reshape conv kernel parameters into tensor - grad = tf.reshape(grad, [KW * KH, C, D]) + grad = tf.reshape(grad, [kernel_width * kernel_height, n_channels, depth]) else: # reshape conv kernel parameters into 2D grad - grad = tf.reshape(grad, [-1, D]) - GRAD_RESHAPE = True + grad = tf.reshape(grad, [-1, depth]) + grad_reshape = True elif len(grad.get_shape()) == 1: # reshape bias or 1D parameters - D = int(grad.get_shape()[0]) grad = tf.expand_dims(grad, 0) - GRAD_RESHAPE = True - else: - # 2D parameters - C = int(grad.get_shape()[0]) - D = int(grad.get_shape()[1]) + grad_reshape = True if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. 
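Before any whitening happens, gradients are forced into matrix form: 4-D convolution kernel gradients are flattened to [kw * kh * c, d] (or kept as [kw * kh, c, d] when channel factorization applies; the real code also checks how many forward factors exist), and 1-D bias gradients become row vectors. A NumPy sketch of that reshaping, with assumed illustrative names:

import numpy as np

def reshape_grad_for_kfac(grad, channel_fac=False):
    # conv kernel gradients become 2-D (or 3-D) matrices, biases become
    # row vectors, plain 2-D weight gradients pass through unchanged
    if grad.ndim > 2:
        kernel_width, kernel_height, n_channels, depth = grad.shape
        if channel_fac:
            return grad.reshape(kernel_width * kernel_height, n_channels, depth)
        return grad.reshape(-1, depth)
    if grad.ndim == 1:
        return grad[None, :]
    return grad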
- # TO-DO: figure out how to factorize bias grad + # TODO: figure out how to factorize bias grad # stack bias grad - var_assnBias = self.stats[var]['assnBias'] + var_assn_bias = self.stats[var]['assnBias'] grad = tf.concat( - [grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0) + [grad, tf.expand_dims(grad_dict[var_assn_bias], 0)], 0) # project gradient to eigen space and reshape the eigenvalues # for broadcasting - eigVals = [] + eig_vals = [] for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='act', debug=KFAC_DEBUG) + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='act', debug=KFAC_DEBUG) - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act') - eigVals.append(e) - grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx) + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='act') + eig_vals.append(eigen_values) + grad = gmatmul(eigen_vectors, grad, transpose_a=True, reduce_dim=idx) for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='grad', debug=KFAC_DEBUG) + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='grad', debug=KFAC_DEBUG) - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad') - eigVals.append(e) - grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx) - ## + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='grad') + eig_vals.append(eigen_values) + grad = gmatmul(grad, eigen_vectors, transpose_b=False, reduce_dim=idx) - ##### # whiten using eigenvalues - weightDecayCoeff = 0. + weight_decay_coeff = 0. if var in self._weight_decay_dict: - weightDecayCoeff = self._weight_decay_dict[var] + weight_decay_coeff = self._weight_decay_dict[var] if KFAC_DEBUG: - print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff))) + print(('weight decay coeff for %s is %f' % (var.name, weight_decay_coeff))) if self._factored_damping: if KFAC_DEBUG: - print(('use factored damping for %s' % (var.name))) + print(('use factored damping for %s' % var.name)) coeffs = 1. - num_factors = len(eigVals) + num_factors = len(eig_vals) # compute the ratio of two trace norm of the left and right # KFac matrices, and their generalization - if len(eigVals) == 1: - damping = self._epsilon + weightDecayCoeff + if len(eig_vals) == 1: + damping = self._epsilon + weight_decay_coeff else: damping = tf.pow( - self._epsilon + weightDecayCoeff, 1. / num_factors) - eigVals_tnorm_avg = [tf.reduce_mean( - tf.abs(e)) for e in eigVals] - for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg): - eig_tnorm_negList = [ - item for item in eigVals_tnorm_avg if item != e_tnorm] - if len(eigVals) == 1: + self._epsilon + weight_decay_coeff, 1. / num_factors) + eig_vals_tnorm_avg = [tf.reduce_mean( + tf.abs(e)) for e in eig_vals] + for eigen_val, e_tnorm in zip(eig_vals, eig_vals_tnorm_avg): + eig_tnorm_neg_list = [ + item for item in eig_vals_tnorm_avg if item != e_tnorm] + if len(eig_vals) == 1: adjustment = 1. 
- elif len(eigVals) == 2: + elif len(eig_vals) == 2: adjustment = tf.sqrt( - e_tnorm / eig_tnorm_negList[0]) + e_tnorm / eig_tnorm_neg_list[0]) else: - eig_tnorm_negList_prod = reduce( - lambda x, y: x * y, eig_tnorm_negList) + eig_tnorm_neg_list_prod = reduce( + lambda x, y: x * y, eig_tnorm_neg_list) adjustment = tf.pow( - tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors) - coeffs *= (e + adjustment * damping) + tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_neg_list_prod, 1. / num_factors) + coeffs *= (eigen_val + adjustment * damping) else: coeffs = 1. - damping = (self._epsilon + weightDecayCoeff) - for e in eigVals: - coeffs *= e + damping = (self._epsilon + weight_decay_coeff) + for eigen_val in eig_vals: + coeffs *= eigen_val coeffs += damping - #grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()]) - grad /= coeffs - #grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()]) - ##### # project gradient back to euclidean space for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx) + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(eigen_vectors, grad, transpose_a=False, reduce_dim=idx) for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx) - ## + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(grad, eigen_vectors, transpose_b=True, reduce_dim=idx) - #grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()]) if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. 
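Putting the pieces of this hunk together, the non-factored-damping path for a plain 2-D weight gradient is: rotate the gradient into the eigenbasis of the forward (activation) and backward (gradient) factors, divide elementwise by the eigenvalue products plus a damping term, and rotate back. A rough NumPy sketch under those assumptions (no channel factorization, no bias stacking, no factored Tikhonov damping; names and the damping value are illustrative):

import numpy as np

def kfac_precondition(grad, act_factor, grad_factor, damping=1e-2):
    # grad: 2-D weight gradient, shape [n_in, n_out]
    # act_factor: fprop statistics, shape [n_in, n_in]
    # grad_factor: bprop statistics, shape [n_out, n_out]
    e_act, q_act = np.linalg.eigh(act_factor)
    e_grad, q_grad = np.linalg.eigh(grad_factor)
    rotated = q_act.T @ grad @ q_grad             # project into the Kronecker eigenbasis
    rotated /= np.outer(e_act, e_grad) + damping  # whiten with damped eigenvalue products
    return q_act @ rotated @ q_grad.T             # project back to parameter space

After preconditioning, the whole update list is rescaled by min(1, sqrt(clip_kl / vFv)), where vFv accumulates lr^2 * sum(precond_grad * grad) over all variables; that trust-region style clipping is handled at the end of get_kfac_precond_updates.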
- # TO-DO: figure out how to factorize bias grad + # TODO: figure out how to factorize bias grad # un-stack bias grad - var_assnBias = self.stats[var]['assnBias'] - C_plus_one = int(grad.get_shape()[0]) - grad_assnBias = tf.reshape(tf.slice(grad, - begin=[ - C_plus_one - 1, 0], - size=[1, -1]), var_assnBias.get_shape()) - grad_assnWeights = tf.slice(grad, - begin=[0, 0], - size=[C_plus_one - 1, -1]) - grad_dict[var_assnBias] = grad_assnBias - grad = grad_assnWeights - - #grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()]) - if GRAD_RESHAPE: - grad = tf.reshape(grad, GRAD_SHAPE) + var_assn_bias = self.stats[var]['assnBias'] + c_plus_one = int(grad.get_shape()[0]) + grad_assn_bias = tf.reshape(tf.slice(grad, + begin=[ + c_plus_one - 1, 0], + size=[1, -1]), var_assn_bias.get_shape()) + grad_assn_weights = tf.slice(grad, + begin=[0, 0], + size=[c_plus_one - 1, -1]) + grad_dict[var_assn_bias] = grad_assn_bias + grad = grad_assn_weights + + if grad_reshape: + grad = tf.reshape(grad, grad_shape) grad_dict[var] = grad print(('projecting %d gradient matrices' % counter)) - for g, var in zip(gradlist, varlist): + for grad_1, var in zip(gradlist, varlist): grad = grad_dict[var] - ### clipping ### + # clipping if KFAC_DEBUG: - print(('apply clipping to %s' % (var.name))) + print(('apply clipping to %s' % var.name)) tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") - local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr)) - vg += local_vg + local_vg = tf.reduce_sum(grad * grad_1 * (self._lr * self._lr)) + v_g += local_vg - # recale everything + # rescale everything if KFAC_DEBUG: print('apply vFv clipping') - scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg)) + scaling = tf.minimum(1., tf.sqrt(self._clip_kl / v_g)) if KFAC_DEBUG: scaling = tf.Print(scaling, [tf.convert_to_tensor( - 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg]) - with tf.control_dependencies([tf.assign(self.vFv, vg)]): + 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), v_g]) + with tf.control_dependencies([tf.assign(self.v_f_v, v_g)]): updatelist = [grad_dict[var] for var in varlist] for i, item in enumerate(updatelist): updatelist[i] = scaling * item return updatelist - def compute_gradients(self, loss, var_list=None): + @classmethod + def compute_gradients(cls, loss, var_list=None): + """ + compute the gradients from the loss and the parameters + + :param loss: ([TensorFlow Tensor]) The loss + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the gradient + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() - g = tf.gradients(loss, varlist) + gradients = tf.gradients(loss, varlist) - return [(a, b) for a, b in zip(g, varlist)] + return [(a, b) for a, b in zip(gradients, varlist)] def apply_gradients_kfac(self, grads): - g, varlist = list(zip(*grads)) + """ + apply the kfac gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: ([function], QueueRunner) Update functions, queue operation runner + """ + grad, varlist = list(zip(*grads)) if len(self.stats_eigen) == 0: - self.getStatsEigen() + self.get_stats_eigen() - qr = None + queue_runner = None # launch eigen-decomp on a queue thread if self._async: print('Use async eigen decomp') # get a list of factor loading tensors - factorOps_dummy = self.computeStatsEigen() + factor_ops_dummy = self.compute_stats_eigen() # define a queue for the list of factor loading tensors - queue = tf.FIFOQueue(1, 
[item.dtype for item in factorOps_dummy], shapes=[ - item.get_shape() for item in factorOps_dummy]) - enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( - 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op) + queue = tf.FIFOQueue(1, [item.dtype for item in factor_ops_dummy], + shapes=[item.get_shape() for item in factor_ops_dummy]) + enqueue_op = tf.cond( + tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( + 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), + lambda: queue.enqueue(self.compute_stats_eigen()), tf.no_op) def dequeue_op(): return queue.dequeue() - qr = tf.train.QueueRunner(queue, [enqueue_op]) + queue_runner = tf.train.QueueRunner(queue, [enqueue_op]) - updateOps = [] + update_ops = [] global_step_op = tf.assign_add(self.global_step, 1) - updateOps.append(global_step_op) + update_ops.append(global_step_op) with tf.control_dependencies([global_step_op]): # compute updates - assert self._update_stats_op != None - updateOps.append(self._update_stats_op) + assert self._update_stats_op is not None + update_ops.append(self._update_stats_op) dependency_list = [] if not self._async: dependency_list.append(self._update_stats_op) @@ -849,78 +893,99 @@ def no_op_wrapper(): if not self._async: # synchronous eigen-decomp updates - updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), - tf.convert_to_tensor(0)), - tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper) + update_factor_ops = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), + tf.convert_to_tensor(0)), + tf.greater_equal(self.stats_step, + self._stats_accum_iter)), + lambda: tf.group(*self.apply_stats_eigen(self.compute_stats_eigen())), + no_op_wrapper) else: # asynchronous eigen-decomp updates using queue - updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), - lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), - tf.no_op, + update_factor_ops = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), + tf.no_op, + + lambda: tf.group( + *self.apply_stats_eigen(dequeue_op())), + ), + no_op_wrapper) - lambda: tf.group( - *self.applyStatsEigen(dequeue_op())), - ), - no_op_wrapper) + update_ops.append(update_factor_ops) - updateOps.append(updateFactorOps) + with tf.control_dependencies([update_factor_ops]): + def grad_op(): + return list(grad) - with tf.control_dependencies([updateFactorOps]): - def gradOp(): - return list(g) + def get_kfac_grad_op(): + return self.get_kfac_precond_updates(grad, varlist) - def getKfacGradOp(): - return self.getKfacPrecondUpdates(g, varlist) u = tf.cond(tf.greater(self.factor_step, - tf.convert_to_tensor(0)), getKfacGradOp, gradOp) + tf.convert_to_tensor(0)), get_kfac_grad_op, grad_op) optim = tf.train.MomentumOptimizer( self._lr * (1. 
- self._momentum), self._momentum) - #optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) - def optimOp(): - def updateOptimOp(): + # optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) + + def optim_op(): + def update_optim_op(): if self._full_stats_init: - return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) + return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), + lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) else: return optim.apply_gradients(list(zip(u, varlist))) + if self._full_stats_init: - return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op) + return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), update_optim_op, + tf.no_op) else: - return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op) - updateOps.append(optimOp()) + return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), update_optim_op, tf.no_op) - return tf.group(*updateOps), qr + update_ops.append(optim_op()) + + return tf.group(*update_ops), queue_runner def apply_gradients(self, grads): - coldOptim = tf.train.MomentumOptimizer( - self._cold_lr, self._momentum) + """ + apply the gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: (function, QueueRunner) train operation, queue operation runner + """ + cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum) - def coldSGDstart(): + def _cold_sgd_start(): sgd_grads, sgd_var = zip(*grads) - if self.max_grad_norm != None: - sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm) + if self.max_grad_norm is not None: + sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm) - sgd_grads = list(zip(sgd_grads,sgd_var)) + sgd_grads = list(zip(sgd_grads, sgd_var)) sgd_step_op = tf.assign_add(self.sgd_step, 1) - coldOptim_op = coldOptim.apply_gradients(sgd_grads) + cold_optim_op = cold_optim.apply_gradients(sgd_grads) if KFAC_DEBUG: - with tf.control_dependencies([sgd_step_op, coldOptim_op]): + with tf.control_dependencies([sgd_step_op, cold_optim_op]): sgd_step_op = tf.Print( sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')]) - return tf.group(*[sgd_step_op, coldOptim_op]) + return tf.group(*[sgd_step_op, cold_optim_op]) - kfacOptim_op, qr = self.apply_gradients_kfac(grads) + kfac_optim_op, queue_runner = self.apply_gradients_kfac(grads) - def warmKFACstart(): - return kfacOptim_op + def _warm_kfac_start(): + return kfac_optim_op - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr + return tf.cond(tf.greater(self.sgd_step, self._cold_iter), _warm_kfac_start, _cold_sgd_start), queue_runner def minimize(self, loss, loss_sampled, var_list=None): + """ + minimize the gradient loss + + :param loss: ([TensorFlow Tensor]) The loss + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: (function, q_runner) train operation, queue operation runner + """ grads = self.compute_gradients(loss, var_list=var_list) - update_stats_op = self.compute_and_apply_stats( - loss_sampled, var_list=var_list) + self.compute_and_apply_stats(loss_sampled, var_list=var_list) return self.apply_gradients(grads) diff --git a/baselines/acktr/kfac_utils.py b/baselines/acktr/kfac_utils.py index edc623d737..512e21a239 100644 --- a/baselines/acktr/kfac_utils.py +++ 
b/baselines/acktr/kfac_utils.py @@ -1,20 +1,31 @@ import tensorflow as tf -def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): + +def gmatmul(tensor_a, tensor_b, transpose_a=False, transpose_b=False, reduce_dim=None): + """ + Do a matrix multiplication with tensor 'a' and 'b', even when their shape do not match + + :param tensor_a: (TensorFlow Tensor) + :param tensor_b: (TensorFlow Tensor) + :param transpose_a: (bool) If 'a' needs transposing + :param transpose_b: (bool) If 'b' needs transposing + :param reduce_dim: (int) the multiplication over the dim + :return: (TensorFlow Tensor) a * b + """ assert reduce_dim is not None # weird batch matmul - if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: + if len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) > 2: # reshape reduce_dim to the left most dim in b - b_shape = b.get_shape() + b_shape = tensor_b.get_shape() if reduce_dim != 0: b_dims = list(range(len(b_shape))) b_dims.remove(reduce_dim) b_dims.insert(0, reduce_dim) - b = tf.transpose(b, b_dims) - b_t_shape = b.get_shape() - b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) - result = tf.matmul(a, b, transpose_a=transpose_a, + tensor_b = tf.transpose(tensor_b, b_dims) + b_t_shape = tensor_b.get_shape() + tensor_b = tf.reshape(tensor_b, [int(b_shape[reduce_dim]), -1]) + result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, b_t_shape) if reduce_dim != 0: @@ -24,19 +35,19 @@ def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): result = tf.transpose(result, b_dims) return result - elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: + elif len(tensor_a.get_shape()) > 2 and len(tensor_b.get_shape()) == 2: # reshape reduce_dim to the right most dim in a - a_shape = a.get_shape() + a_shape = tensor_a.get_shape() outter_dim = len(a_shape) - 1 reduce_dim = len(a_shape) - reduce_dim - 1 if reduce_dim != outter_dim: a_dims = list(range(len(a_shape))) a_dims.remove(reduce_dim) a_dims.insert(outter_dim, reduce_dim) - a = tf.transpose(a, a_dims) - a_t_shape = a.get_shape() - a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) - result = tf.matmul(a, b, transpose_a=transpose_a, + tensor_a = tf.transpose(tensor_a, a_dims) + a_t_shape = tensor_a.get_shape() + tensor_a = tf.reshape(tensor_a, [-1, int(a_shape[reduce_dim])]) + result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, a_t_shape) if reduce_dim != outter_dim: @@ -46,41 +57,72 @@ def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): result = tf.transpose(result, a_dims) return result - elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: - return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) + elif len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) == 2: + return tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) assert False, 'something went wrong' -def clipoutNeg(vec, threshold=1e-6): +def clipout_neg(vec, threshold=1e-6): + """ + clip to 0 if input lower than threshold value + + :param vec: (TensorFlow Tensor) + :param threshold: (float) the cutoff threshold + :return: (TensorFlow Tensor) clipped input + """ mask = tf.cast(vec > threshold, tf.float32) return mask * vec -def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): +def detect_min_val(input_mat, var, threshold=1e-6, name='', debug=False): + """ + If debug is not set, will run clipout_neg. 
Else, will clip and print out odd eigen values + + :param input_mat: (TensorFlow Tensor) + :param var: (TensorFlow Tensor) variable + :param threshold: (float) the cutoff threshold + :param name: (str) the name of the variable + :param debug: (bool) debug function + :return: (TensorFlow Tensor) clipped tensor + """ eigen_min = tf.reduce_min(input_mat) eigen_max = tf.reduce_max(input_mat) eigen_ratio = eigen_max / eigen_min - input_mat_clipped = clipoutNeg(input_mat, threshold) + input_mat_clipped = clipout_neg(input_mat, threshold) if debug: - input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( - input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) + input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), + lambda: input_mat_clipped, lambda: tf.Print( + input_mat_clipped, + [tf.convert_to_tensor('odd ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), + eigen_min, eigen_max, eigen_ratio])) return input_mat_clipped -def factorReshape(Q, e, grad, facIndx=0, ftype='act'): +def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'): + """ + factor and reshape input eigen values + + :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors + :param eigen_values: ([TensorFlow Tensor]) eigen values + :param grad: ([TensorFlow Tensor]) gradient + :param fac_idx: (int) index that should be factored + :param f_type: (str) function type to factor and reshape + :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors + and eigen values + """ grad_shape = grad.get_shape() - if ftype == 'act': - assert e.get_shape()[0] == grad_shape[facIndx] + if f_type == 'act': + assert eigen_values.get_shape()[0] == grad_shape[fac_idx] expanded_shape = [1, ] * len(grad_shape) - expanded_shape[facIndx] = -1 - e = tf.reshape(e, expanded_shape) - if ftype == 'grad': - assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] + expanded_shape[fac_idx] = -1 + eigen_values = tf.reshape(eigen_values, expanded_shape) + if f_type == 'grad': + assert eigen_values.get_shape()[0] == grad_shape[len(grad_shape) - fac_idx - 1] expanded_shape = [1, ] * len(grad_shape) - expanded_shape[len(grad_shape) - facIndx - 1] = -1 - e = tf.reshape(e, expanded_shape) + expanded_shape[len(grad_shape) - fac_idx - 1] = -1 + eigen_values = tf.reshape(eigen_values, expanded_shape) - return Q, e + return eigen_vectors, eigen_values diff --git a/baselines/acktr/policies.py b/baselines/acktr/policies.py index 39bb6cbe6d..56896e30ad 100644 --- a/baselines/acktr/policies.py +++ b/baselines/acktr/policies.py @@ -1,42 +1,75 @@ import numpy as np import tensorflow as tf + from baselines.acktr.utils import dense, kl_div -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util + class GaussianMlpPolicy(object): def __init__(self, ob_dim, ac_dim): + """ + Create a gaussian MLP policy + + :param ob_dim: (int) Observation dimention + :param ac_dim: (int) action dimention + """ # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them - ob_no = tf.placeholder(tf.float32, shape=[None, 
ob_dim*2], name="ob") # batch of observations - oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions - oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions - adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate + ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations + oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions + # batch of actions previous action distributions + oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist") + adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate wd_dict = {} - h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output + layer_1 = tf.nn.tanh(dense(ob_no, 64, "h1", + weight_init=tf_util.normc_initializer(1.0), + bias_init=0.0, weight_loss_dict=wd_dict)) + layer_2 = tf.nn.tanh(dense(layer_1, 64, "h2", + weight_init=tf_util.normc_initializer(1.0), + bias_init=0.0, weight_loss_dict=wd_dict)) + mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1), + bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict - self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs + # Variance on outputs + self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) - sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. 
- logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action - logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) - kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) - #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n - surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient - surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy - self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob - #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy - self.compute_kl = U.function([ob_no, oldac_dist], kl) - self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss - U.initialize() # Initialize uninitialized TF variables + # This is the sampled action we'll perform. + sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim] + logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( + 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( + tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), + axis=1) # Logprob of sampled action + logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( + 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( + tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), + axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) + kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) + # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) + # Approximation of KL divergence between old policy used to generate actions, + # and new policy used to compute logprob_n + surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient + surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy + # Generate a new action and its logprob + self._act = tf_util.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) + # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) + # Compute (approximate) KL divergence between old policy and new policy + self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss) + # Input and output variables needed for computing loss + self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) + tf_util.initialize() # Initialize uninitialized TF variables + + def act(self, obs): + """ + get the action from an observation - def act(self, ob): - ac, ac_dist, logp = self._act(ob[None]) - return ac[0], ac_dist[0], logp[0] + :param obs: ([float]) observation + :return: ([float], [float], [float]) action, action_proba, logp + """ + action, ac_dist, logp = self._act(obs[None]) + return action[0], ac_dist[0], logp[0] diff --git 
a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py index 6e398ce25d..f9df54991a 100644 --- a/baselines/acktr/run_atari.py +++ b/baselines/acktr/run_atari.py @@ -1,23 +1,36 @@ #!/usr/bin/env python3 - from functools import partial from baselines import logger from baselines.acktr.acktr_disc import learn from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2.policies import CnnPolicy +from baselines.a2c.policies import CnnPolicy + def train(env_id, num_timesteps, seed, num_cpu): + """ + train an ACKTR model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param num_cpu: (int) The number of cpu to train on + """ env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) policy_fn = partial(CnnPolicy, one_dim_bias=True) learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) + if __name__ == '__main__': main() diff --git a/baselines/acktr/run_mujoco.py b/baselines/acktr/run_mujoco.py index 9065d58807..2ca413177b 100644 --- a/baselines/acktr/run_mujoco.py +++ b/baselines/acktr/run_mujoco.py @@ -1,34 +1,46 @@ #!/usr/bin/env python3 import tensorflow as tf + from baselines import logger from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser from baselines.acktr.acktr_cont import learn from baselines.acktr.policies import GaussianMlpPolicy from baselines.acktr.value_functions import NeuralNetValueFunction + def train(env_id, num_timesteps, seed): + """ + train an ACKTR model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ env = make_mujoco_env(env_id, seed) with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): - vf = NeuralNetValueFunction(ob_dim, ac_dim) + value_fn = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) - learn(env, policy=policy, vf=vf, - gamma=0.99, lam=0.97, timesteps_per_batch=2500, - desired_kl=0.002, - num_timesteps=num_timesteps, animate=False) + learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, + num_timesteps=num_timesteps, animate=False) env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == "__main__": main() diff --git a/baselines/acktr/utils.py b/baselines/acktr/utils.py index 227350fe5e..5b67b2c804 100644 --- a/baselines/acktr/utils.py +++ b/baselines/acktr/utils.py @@ -1,28 +1,49 @@ import tensorflow as tf -def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): + +def dense(input_tensor, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): + """ + A dense Layer + + :param input_tensor: ([TensorFlow Tensor]) input + :param size: (int) number of hidden neurons + :param name: (str) layer name + :param weight_init: (function or int or float) initialize the weight + :param bias_init: 
(function or int or float) initialize the weight + :param weight_loss_dict: (dict) store the weight loss if not None + :param reuse: (bool) if can be reused + :return: ([TensorFlow Tensor]) the output of the dense Layer + """ with tf.variable_scope(name, reuse=reuse): - assert (len(tf.get_variable_scope().name.split('/')) == 2) + assert len(tf.get_variable_scope().name.split('/')) == 2 - w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) - b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) + weight = tf.get_variable("w", [input_tensor.get_shape()[1], size], initializer=weight_init) + bias = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) weight_decay_fc = 3e-4 if weight_loss_dict is not None: - weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') - if weight_loss_dict is not None: - weight_loss_dict[w] = weight_decay_fc - weight_loss_dict[b] = 0.0 + weight_decay = tf.multiply(tf.nn.l2_loss(weight), weight_decay_fc, name='weight_decay_loss') + weight_loss_dict[weight] = weight_decay_fc + weight_loss_dict[bias] = 0.0 tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) - return tf.nn.bias_add(tf.matmul(x, w), b) + return tf.nn.bias_add(tf.matmul(input_tensor, weight), bias) + def kl_div(action_dist1, action_dist2, action_size): + """ + Kullback leiber divergence + + :param action_dist1: ([TensorFlow Tensor]) action distribution 1 + :param action_dist2: ([TensorFlow Tensor]) action distribution 2 + :param action_size: (int) the shape of an action + :return: (float) Kullback leiber divergence + """ mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) denominator = 2 * tf.square(std2) + 1e-8 return tf.reduce_sum( - numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) + numerator / denominator + tf.log(std2) - tf.log(std1), reduction_indices=-1) diff --git a/baselines/acktr/value_functions.py b/baselines/acktr/value_functions.py index d1e9e1a361..c97e0c8949 100644 --- a/baselines/acktr/value_functions.py +++ b/baselines/acktr/value_functions.py @@ -1,50 +1,86 @@ -from baselines import logger import numpy as np -import baselines.common as common -from baselines.common import tf_util as U import tensorflow as tf + +from baselines import logger +import baselines.common as common +from baselines.common import tf_util from baselines.acktr import kfac from baselines.acktr.utils import dense + class NeuralNetValueFunction(object): - def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 - X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations + def __init__(self, ob_dim, ac_dim): + """ + Create an MLP policy for a value function + + :param ob_dim: (int) Observation dimention + :param ac_dim: (int) action dimention + """ + obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') wd_dict = {} - h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, 
weight_loss_dict=wd_dict)[:,0] + layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) + layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) + vpred_n = dense(layer_2, 1, "hfinal", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, + weight_loss_dict=wd_dict)[:, 0] sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) wd_loss = tf.get_collection("vf_losses", None) loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) - self._predict = U.function([X], vpred_n) - optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ - clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ - async=1, kfac_update=2, cold_iter=50, \ - weight_decay_dict=wd_dict, max_grad_norm=None) + + self._predict = tf_util.function([obs_ph], vpred_n) + + optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9, + clip_kl=0.3, epsilon=0.1, stats_decay=0.95, + async=1, kfac_update=2, cold_iter=50, + weight_decay_dict=wd_dict, max_grad_norm=None) vf_var_list = [] for var in tf.trainable_variables(): if "vf" in var.name: vf_var_list.append(var) update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) - self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 - U.initialize() # Initialize uninitialized TF variables - def _preproc(self, path): - l = pathlength(path) - al = np.arange(l).reshape(-1,1)/10.0 + self.do_update = tf_util.function([obs_ph, vtarg_n], update_op) # pylint: disable=E1101 + tf_util.initialize() # Initialize uninitialized TF variables + + @classmethod + def _preproc(cls, path): + """ + preprocess path + + :param path: ({TensorFlow Tensor}) the history of the network + :return: ([TensorFlow Tensor]) processed input + """ + length = path["reward"].shape[0] + # used to be named 'al', unfortunalty we cant seem to know why it was called 'al' or what it means. + # Feel free to fix it if you know what is meant here. + # Could mean 'array_length', but even then we are not sure how this array is useful for the network. 
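Whatever 'al' originally abbreviated, the preprocessing in this method simply concatenates the observation features, the action-distribution parameters, a time index scaled by 1/10, and a constant bias column, which is why the placeholder above is sized ob_dim * 2 + ac_dim * 2 + 2. A small NumPy sketch of the resulting design matrix (illustrative names):

import numpy as np

def build_value_input(observations, action_dist):
    # observations: [T, ob_dim * 2], action_dist: [T, ac_dim * 2]
    horizon = observations.shape[0]
    time_feature = np.arange(horizon).reshape(-1, 1) / 10.0  # the column once named 'al'
    bias_column = np.ones((horizon, 1))
    return np.concatenate([observations, action_dist.astype('float32'),
                           time_feature, bias_column], axis=1)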
+ al_capone = np.arange(length).reshape(-1, 1) / 10.0 act = path["action_dist"].astype('float32') - X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1) - return X + return np.concatenate([path['observation'], act, al_capone, np.ones((length, 1))], axis=1) + def predict(self, path): + """ + predict value from history + + :param path: ({TensorFlow Tensor}) the history of the network + :return: ([TensorFlow Tensor]) value function output + """ return self._predict(self._preproc(path)) + def fit(self, paths, targvals): - X = np.concatenate([self._preproc(p) for p in paths]) - y = np.concatenate(targvals) - logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y)) - for _ in range(25): self.do_update(X, y) - logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y)) - -def pathlength(path): - return path["reward"].shape[0] + """ + fit paths to target values + + :param paths: ({TensorFlow Tensor}) the history of the network + :param targvals: ([TensorFlow Tensor]) the expected value + """ + _input = np.concatenate([self._preproc(p) for p in paths]) + targets = np.concatenate(targvals) + logger.record_tabular("EVBefore", common.explained_variance(self._predict(_input), targets)) + for _ in range(25): + self.do_update(_input, targets) + logger.record_tabular("EVAfter", common.explained_variance(self._predict(_input), targets)) diff --git a/baselines/bench/__init__.py b/baselines/bench/__init__.py index 4fd3874b39..2ef5aaa071 100644 --- a/baselines/bench/__init__.py +++ b/baselines/bench/__init__.py @@ -1,2 +1 @@ -from baselines.bench.benchmarks import * -from baselines.bench.monitor import * \ No newline at end of file +from baselines.bench.monitor import Monitor, load_results diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index a5a35f831a..298edd23b6 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -1,10 +1,10 @@ import re -import os.path as osp import os + SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] -_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] +_ATARI7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] +_ATARIEXPL7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] _BENCHMARKS = [] @@ -12,39 +12,67 @@ def register_benchmark(benchmark): - for b in _BENCHMARKS: - if b['name'] == benchmark['name']: - raise ValueError('Benchmark with name %s already registered!' % b['name']) + """ + Register an OpenAI gym environment + + :param benchmark: (dict) Containes the name, description and tasks of the environment you wish to register + """ + for bench in _BENCHMARKS: + if bench['name'] == benchmark['name']: + raise ValueError('Benchmark with name %s already registered!' 
% bench['name']) # automatically add a description if it is not present if 'tasks' in benchmark: - for t in benchmark['tasks']: - if 'desc' not in t: - t['desc'] = remove_version_re.sub('', t['env_id']) + for task in benchmark['tasks']: + if 'desc' not in task: + task['desc'] = remove_version_re.sub('', task['env_id']) _BENCHMARKS.append(benchmark) def list_benchmarks(): + """ + Retuns a list of all the benchmark dictionaries registed by this module + + :return: ([dict]) the benchmarks + """ return [b['name'] for b in _BENCHMARKS] def get_benchmark(benchmark_name): - for b in _BENCHMARKS: - if b['name'] == benchmark_name: - return b + """ + Returns the registered benchmark of the same name, will raise a ValueError if the name is not present + + :param benchmark_name: (str) the name of the benchmark you wish to lookup + :return: (dict) the benchmark dictionarie + """ + for bench in _BENCHMARKS: + if bench['name'] == benchmark_name: + return bench raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) def get_task(benchmark, env_id): - """Get a task by env_id. Return None if the benchmark doesn't have the env""" + """ + Get a task by env_id. Return None if the benchmark doesn't have the env. + + :param benchmark: (dict) the benchmark you wish to look in + :param env_id: (str) the environment id you want to find + :return: (dict) the task + """ return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) -def find_task_for_env_id_in_any_benchmark(env_id): - for bm in _BENCHMARKS: - for task in bm["tasks"]: +def find_task_in_benchmarks(env_id): + """ + Get the first task and benchmark, that has the corresponding environment id + + :param env_id: (str) the environment id you want to find + :return: (dict, dict) the benchmark and task dictionaries + """ + for bench in _BENCHMARKS: + for task in bench["tasks"]: if task["env_id"] == env_id: - return bm, task + return bench, task return None, None @@ -53,38 +81,42 @@ def find_task_for_env_id_in_any_benchmark(env_id): register_benchmark({ 'name': 'Atari50M', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} + for _game in _ATARI7] }) register_benchmark({ 'name': 'Atari10M', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARI7] }) register_benchmark({ 'name': 'Atari1Hr', 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} + for _game in _ATARI7] }) register_benchmark({ 'name': 'AtariExploration10M', 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARIEXPL7] }) # MuJoCo -_mujocosmall = [ +_MUJOCO_SMALL = [ 'InvertedDoublePendulum-v2', 'InvertedPendulum-v2', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2', 'Reacher-v2', 'Swimmer-v2'] register_benchmark({ 'name': 'Mujoco1M', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] + 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _MUJOCO_SMALL] }) register_benchmark({ 'name': 'MujocoWalkers', @@ -121,7 +153,7 @@ def find_task_for_env_id_in_any_benchmark(env_id): # Other -_atari50 = [ # actually 47 +_ATARI50 = [ # actually 47 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', @@ -137,7 +169,8 @@ def find_task_for_env_id_in_any_benchmark(env_id): register_benchmark({ 'name': 'Atari50_10M', 'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARI50] }) # HER DDPG @@ -147,4 +180,3 @@ def find_task_for_env_id_in_any_benchmark(env_id): 'description': 'Smoke-test only benchmark of HER', 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}] }) - diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index 0da1b4f878..91b6ebecdd 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -1,35 +1,48 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results'] -import gym -from gym.core import Wrapper +import os import time -from glob import glob import csv -import os.path as osp import json -import numpy as np +import uuid +from glob import glob + +import gym +from gym.core import Wrapper +import pandas + class Monitor(Wrapper): EXT = "monitor.csv" - f = None + file_handler = None def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): + """ + A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. 
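In practice the wrapper described here is used by pointing it at a log location and stepping the wrapped environment as usual; load_results (defined further down in this file) can then read the episode statistics back as a pandas DataFrame. A minimal usage sketch, with an assumed log directory and environment id, in the spirit of test_monitor below:

import os
import uuid
import gym
from baselines.bench import Monitor, load_results

log_dir = "/tmp/monitor_demo_%s" % uuid.uuid4()
os.makedirs(log_dir)
env = Monitor(gym.make("CartPole-v1"), os.path.join(log_dir, "run"), allow_early_resets=True)
obs = env.reset()
for _ in range(1000):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()          # info["episode"] held the episode summary here
env.close()

results = load_results(log_dir)    # pandas DataFrame with 'r', 'l', 't' columns
print(results[["r", "l", "t"]].tail())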
+ + :param env: (Gym environment) The environment + :param filename: (str) the location to save a log file, can be None for no log + :param allow_early_resets: (bool) allows the reset of the environment before it is done + :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset + :param info_keywords: (tuple) extra information to log, from the information return of environment.step + """ Wrapper.__init__(self, env=env) - self.tstart = time.time() + self.t_start = time.time() if filename is None: - self.f = None + self.file_handler = None self.logger = None else: if not filename.endswith(Monitor.EXT): - if osp.isdir(filename): - filename = osp.join(filename, Monitor.EXT) + if os.path.isdir(filename): + filename = os.path.join(filename, Monitor.EXT) else: filename = filename + "." + Monitor.EXT - self.f = open(filename, "wt") - self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id})) - self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords) + self.file_handler = open(filename, "wt") + self.file_handler.write('#%s\n' % json.dumps({"t_start": self.t_start, 'env_id': env.spec and env.spec.id})) + self.logger = csv.DictWriter(self.file_handler, + fieldnames=('r', 'l', 't') + reset_keywords + info_keywords) self.logger.writeheader() - self.f.flush() + self.file_handler.flush() self.reset_keywords = reset_keywords self.info_keywords = info_keywords @@ -40,103 +53,159 @@ def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), i self.episode_lengths = [] self.episode_times = [] self.total_steps = 0 - self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() + self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() def reset(self, **kwargs): + """ + Calls the Gym environment reset. Can only be called if the environment is over, or if allow_early_resets is True + + :param kwargs: Extra keywords saved for the next episode. only if defined by reset_keywords + :return: ([int] or [float]) the first observation of the environment + """ if not self.allow_early_resets and not self.needs_reset: - raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") + raise RuntimeError("Tried to reset an environment before done. 
If you want to allow early resets, " + "wrap your env with Monitor(env, path, allow_early_resets=True)") self.rewards = [] self.needs_reset = False - for k in self.reset_keywords: - v = kwargs.get(k) - if v is None: - raise ValueError('Expected you to pass kwarg %s into reset'%k) - self.current_reset_info[k] = v + for key in self.reset_keywords: + value = kwargs.get(key) + if value is None: + raise ValueError('Expected you to pass kwarg %s into reset' % key) + self.current_reset_info[key] = value return self.env.reset(**kwargs) def step(self, action): + """ + Step the environment with the given action + + :param action: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ if self.needs_reset: raise RuntimeError("Tried to step environment that needs reset") - ob, rew, done, info = self.env.step(action) - self.rewards.append(rew) + observation, reward, done, info = self.env.step(action) + self.rewards.append(reward) if done: self.needs_reset = True - eprew = sum(self.rewards) + ep_rew = sum(self.rewards) eplen = len(self.rewards) - epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} - for k in self.info_keywords: - epinfo[k] = info[k] - self.episode_rewards.append(eprew) + ep_info = {"r": round(ep_rew, 6), "l": eplen, "t": round(time.time() - self.t_start, 6)} + for key in self.info_keywords: + ep_info[key] = info[key] + self.episode_rewards.append(ep_rew) self.episode_lengths.append(eplen) - self.episode_times.append(time.time() - self.tstart) - epinfo.update(self.current_reset_info) + self.episode_times.append(time.time() - self.t_start) + ep_info.update(self.current_reset_info) if self.logger: - self.logger.writerow(epinfo) - self.f.flush() - info['episode'] = epinfo + self.logger.writerow(ep_info) + self.file_handler.flush() + info['episode'] = ep_info self.total_steps += 1 - return (ob, rew, done, info) + return observation, reward, done, info def close(self): - if self.f is not None: - self.f.close() + """ + Closes the environment + """ + if self.file_handler is not None: + self.file_handler.close() def get_total_steps(self): + """ + Returns the total number of timesteps + + :return: (int) + """ return self.total_steps def get_episode_rewards(self): + """ + Returns the rewards of all the episodes + + :return: ([float]) + """ return self.episode_rewards def get_episode_lengths(self): + """ + Returns the number of timesteps of all the episodes + + :return: ([int]) + """ return self.episode_lengths def get_episode_times(self): + """ + Returns the runtime in seconds of all the episodes + + :return: ([float]) + """ return self.episode_times + class LoadMonitorResultsError(Exception): + """ + Raised when loading the monitor log fails. 
+ """ pass -def get_monitor_files(dir): - return glob(osp.join(dir, "*" + Monitor.EXT)) -def load_results(dir): - import pandas - monitor_files = ( - glob(osp.join(dir, "*monitor.json")) + - glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files +def get_monitor_files(path): + """ + get all the monitor files in the given path + + :param path: (str) the logging folder + :return: ([str]) the log files + """ + return glob(os.path.join(path, "*" + Monitor.EXT)) + + +def load_results(path): + """ + Load results from a given file + + :param path: (str) the path to the log file + :return: (Pandas DataFrame) the logged data + """ + # get both csv and (old) json files + monitor_files = (glob(os.path.join(path, "*monitor.json")) + glob(os.path.join(path, "*monitor.csv"))) if not monitor_files: - raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) - dfs = [] + raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) + data_frames = [] headers = [] - for fname in monitor_files: - with open(fname, 'rt') as fh: - if fname.endswith('csv'): - firstline = fh.readline() - assert firstline[0] == '#' - header = json.loads(firstline[1:]) - df = pandas.read_csv(fh, index_col=None) + for file_name in monitor_files: + with open(file_name, 'rt') as file_handler: + if file_name.endswith('csv'): + first_line = file_handler.readline() + assert first_line[0] == '#' + header = json.loads(first_line[1:]) + data_frame = pandas.read_csv(file_handler, index_col=None) headers.append(header) - elif fname.endswith('json'): # Deprecated json format + elif file_name.endswith('json'): # Deprecated json format episodes = [] - lines = fh.readlines() + lines = file_handler.readlines() header = json.loads(lines[0]) headers.append(header) for line in lines[1:]: episode = json.loads(line) episodes.append(episode) - df = pandas.DataFrame(episodes) + data_frame = pandas.DataFrame(episodes) else: assert 0, 'unreachable' - df['t'] += header['t_start'] - dfs.append(df) - df = pandas.concat(dfs) - df.sort_values('t', inplace=True) - df.reset_index(inplace=True) - df['t'] -= min(header['t_start'] for header in headers) - df.headers = headers # HACK to preserve backwards compatibility - return df + data_frame['t'] += header['t_start'] + data_frames.append(data_frame) + data_frame = pandas.concat(data_frames) + data_frame.sort_values('t', inplace=True) + data_frame.reset_index(inplace=True) + data_frame['t'] -= min(header['t_start'] for header in headers) + data_frame.headers = headers # HACK to preserve backwards compatibility + return data_frame + def test_monitor(): + """ + test the monitor wrapper + """ env = gym.make("CartPole-v1") env.seed(0) mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() @@ -147,15 +216,15 @@ def test_monitor(): if done: menv.reset() - f = open(mon_file, 'rt') + file_handler = open(mon_file, 'rt') - firstline = f.readline() + firstline = file_handler.readline() assert firstline.startswith('#') metadata = json.loads(firstline[1:]) assert metadata['env_id'] == "CartPole-v1" - assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" + assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" - last_logline = pandas.read_csv(f, index_col=None) + last_logline = pandas.read_csv(file_handler, index_col=None) assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - f.close() - 
os.remove(mon_file) \ No newline at end of file + file_handler.close() + os.remove(mon_file) diff --git a/baselines/common/__init__.py b/baselines/common/__init__.py index 0834b36492..508c62e135 100644 --- a/baselines/common/__init__.py +++ b/baselines/common/__init__.py @@ -1,5 +1,7 @@ # flake8: noqa F403 -from baselines.common.console_util import * +from baselines.common.console_util import fmt_row, fmt_item, colorize from baselines.common.dataset import Dataset -from baselines.common.math_util import * -from baselines.common.misc_util import * +from baselines.common.math_util import discount, discount_with_boundaries, explained_variance, explained_variance_2d,\ + flatten_arrays, unflatten_vector +from baselines.common.misc_util import zipsame, unpack, EzPickle, set_global_seeds, pretty_eta, RunningAvg,\ + boolean_flag, get_wrapper_by_name, relatively_safe_pickle_dump, pickle_load diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 2aefad78cf..666f08a7ad 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -1,14 +1,20 @@ -import numpy as np from collections import deque + +import numpy as np import gym from gym import spaces import cv2 cv2.ocl.setUseOpenCL(False) + class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. + """ + Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. + + :param env: (Gym Environment) the environment to wrap + :param noop_max: (int) the maximum value of no-ops to run """ gym.Wrapper.__init__(self, env) self.noop_max = noop_max @@ -17,12 +23,11 @@ def __init__(self, env, noop_max=30): assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 obs = None for _ in range(noops): @@ -31,12 +36,17 @@ def reset(self, **kwargs): obs = self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class FireResetEnv(gym.Wrapper): def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" + """ + Take action on reset for environments that are fixed until firing. + + :param env: (Gym Environment) the environment to wrap + """ gym.Wrapper.__init__(self, env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 @@ -51,17 +61,21 @@ def reset(self, **kwargs): self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class EpisodicLifeEnv(gym.Wrapper): def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. + """ + Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. 
+ + :param env: (Gym Environment) the environment to wrap """ gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): obs, reward, done, info = self.env.step(action) @@ -69,7 +83,7 @@ def step(self, action): # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condtion for a few frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. @@ -78,9 +92,13 @@ def step(self, action): return obs, reward, done, info def reset(self, **kwargs): - """Reset only when lives are exhausted. + """ + Calls the Gym environment reset, only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. + + :param kwargs: Extra keywords passed to env.reset() call + :return: ([int] or [float]) the first observation of the environment """ if self.was_real_done: obs = self.env.reset(**kwargs) @@ -90,22 +108,36 @@ def reset(self, **kwargs): self.lives = self.env.unwrapped.ale.lives() return obs + class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" + """ + Return only every `skip`-th frame (frameskipping) + + :param env: (Gym Environment) the environment + :param skip: (int) number of `skip`-th frame + """ gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + self._skip = skip def step(self, action): - """Repeat action, sum reward, and max over last observations.""" + """ + Step the environment with the given action + Repeat action, sum reward, and max over last observations. + + :param action: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ total_reward = 0.0 done = None for i in range(self._skip): obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs total_reward += reward if done: break @@ -118,59 +150,84 @@ def step(self, action): def reset(self, **kwargs): return self.env.reset(**kwargs) + class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env): + """ + clips the reward to {+1, 0, -1} by its sign. + + :param env: (Gym Environment) the environment + """ gym.RewardWrapper.__init__(self, env) def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" + """ + Bin reward to {+1, 0, -1} by its sign. + + :param reward: (float) + """ return np.sign(reward) + class WarpFrame(gym.ObservationWrapper): def __init__(self, env): - """Warp frames to 84x84 as done in the Nature paper and later work.""" + """ + Warp frames to 84x84 as done in the Nature paper and later work. 
+ + :param env: (Gym Environment) the environment + """ gym.ObservationWrapper.__init__(self, env) self.width = 84 self.height = 84 - self.observation_space = spaces.Box(low=0, high=255, - shape=(self.height, self.width, 1), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1), dtype=np.uint8) def observation(self, frame): + """ + returns the current observation from a frame + + :param frame: ([int] or [float]) environment frame + :return: ([int] or [float]) the observation + """ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) return frame[:, :, None] + class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames. + def __init__(self, env, n_frames): + """Stack n_frames last frames. Returns lazy array, which is much more memory efficient. See Also -------- baselines.common.atari_wrappers.LazyFrames + + :param env: (Gym Environment) the environment + :param n_frames: (int) the number of frames to stack """ gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) + self.n_frames = n_frames + self.frames = deque([], maxlen=n_frames) shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * n_frames), dtype=np.uint8) def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) + obs = self.env.reset() + for _ in range(self.n_frames): + self.frames.append(obs) return self._get_ob() def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) + obs, reward, done, info = self.env.step(action) + self.frames.append(obs) return self._get_ob(), reward, done, info def _get_ob(self): - assert len(self.frames) == self.k + assert len(self.frames) == self.n_frames return LazyFrames(list(self.frames)) + class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): gym.ObservationWrapper.__init__(self, env) @@ -180,15 +237,18 @@ def observation(self, observation): # with smaller replay buffers only. return np.array(observation).astype(np.float32) / 255.0 + class LazyFrames(object): def __init__(self, frames): - """This object ensures that common frames between the observations are only stored once. + """ + This object ensures that common frames between the observations are only stored once. It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay buffers. This object should only be converted to numpy array before being passed to the model. - You'd not believe how complex the previous solution was.""" + :param frames: ([int] or [float]) environment frames + """ self._frames = frames self._out = None @@ -210,15 +270,31 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] + def make_atari(env_id): + """ + Create a wrapped atari envrionment + + :param env_id: (str) the environment ID + :return: (Gym Environment) the wrapped atari environment + """ env = gym.make(env_id) assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) return env + def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): - """Configure environment for DeepMind-style Atari. + """ + Configure environment for DeepMind-style Atari. 
+ + :param env: (Gym Environment) the atari environment + :param episode_life: (bool) wrap the episode life wrapper + :param clip_rewards: (bool) wrap the reward clipping wrapper + :param frame_stack: (bool) wrap the frame stacking wrapper + :param scale: (bool) wrap the scaling observation wrapper + :return: (Gym Environment) the wrapped atari environment """ if episode_life: env = EpisodicLifeEnv(env) @@ -232,4 +308,3 @@ def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, if frame_stack: env = FrameStack(env, 4) return env - diff --git a/baselines/common/cg.py b/baselines/common/cg.py index a913186666..15c0f9524d 100644 --- a/baselines/common/cg.py +++ b/baselines/common/cg.py @@ -1,34 +1,49 @@ import numpy as np -def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): + + +def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): """ - Demmel p 312 + conjugate gradient calculation (Ax = b), bases on + https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312 + + :param f_ax: (function) The function describing the Matrix A dot the vector x + (x being the input parameter of the function) + :param b_vec: (numpy float) vector b, where Ax = b + :param cg_iters: (int) the maximum number of iterations for converging + :param callback: (function) callback the values of x while converging + :param verbose: (bool) print extra information + :param residual_tol: (float) the break point if the residual is below this value + :return: (numpy float) vector x, where Ax = b """ - p = b.copy() - r = b.copy() - x = np.zeros_like(b) - rdotr = r.dot(r) + first_basis_vect = b_vec.copy() # the first basis vector + residual = b_vec.copy() # the residual + x_var = np.zeros_like(b_vec) # vector x, where Ax = b + residual_dot_residual = residual.dot(residual) # L2 norm of the residual - fmtstr = "%10i %10.3g %10.3g" - titlestr = "%10s %10s %10s" - if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) + fmt_str = "%10i %10.3g %10.3g" + title_str = "%10s %10s %10s" + if verbose: + print(title_str % ("iter", "residual norm", "soln norm")) for i in range(cg_iters): if callback is not None: - callback(x) - if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) - z = f_Ax(p) - v = rdotr / p.dot(z) - x += v*p - r -= v*z - newrdotr = r.dot(r) - mu = newrdotr/rdotr - p = r + mu*p + callback(x_var) + if verbose: + print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var))) + z_var = f_ax(first_basis_vect) + v_var = residual_dot_residual / first_basis_vect.dot(z_var) + x_var += v_var * first_basis_vect + residual -= v_var * z_var + new_residual_dot_residual = residual.dot(residual) + mu_val = new_residual_dot_residual / residual_dot_residual + first_basis_vect = residual + mu_val * first_basis_vect - rdotr = newrdotr - if rdotr < residual_tol: + residual_dot_residual = new_residual_dot_residual + if residual_dot_residual < residual_tol: break if callback is not None: - callback(x) - if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 - return x \ No newline at end of file + callback(x_var) + if verbose: + print(fmt_str % (i + 1, residual_dot_residual, np.linalg.norm(x_var))) + return x_var diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 5707695487..58b4d1a0b1 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -3,21 +3,33 @@ """ import os + from mpi4py import MPI import gym from gym.wrappers import 
FlattenDictWrapper + from baselines import logger from baselines.bench import Monitor from baselines.common import set_global_seeds from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv + def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): """ Create a wrapped, monitored SubprocVecEnv for Atari. + + :param env_id: (str) the environment ID + :param num_env: (int) the number of environment you wish to have in subprocesses + :param seed: (int) the inital seed for RNG + :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function + :param start_index: (int) start rank index + :return: (Gym Environment) The atari environment """ - if wrapper_kwargs is None: wrapper_kwargs = {} - def make_env(rank): # pylint: disable=C0111 + if wrapper_kwargs is None: + wrapper_kwargs = {} + + def make_env(rank): def _thunk(): env = make_atari(env_id) env.seed(seed + rank) @@ -27,9 +39,14 @@ def _thunk(): set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) + def make_mujoco_env(env_id, seed): """ Create a wrapped, monitored gym.Env for MuJoCo. + + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :return: (Gym Environment) The mujoco environment """ rank = MPI.COMM_WORLD.Get_rank() set_global_seeds(seed + 10000 * rank) @@ -38,9 +55,15 @@ def make_mujoco_env(env_id, seed): env.seed(seed) return env + def make_robotics_env(env_id, seed, rank=0): """ Create a wrapped, monitored gym.Env for MuJoCo. + + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :param rank: (int) the rank of the environment (for logging) + :return: (Gym Environment) The robotic environment """ set_global_seeds(seed) env = gym.make(env_id) @@ -51,26 +74,35 @@ def make_robotics_env(env_id, seed, rank=0): env.seed(seed) return env + def arg_parser(): """ Create an empty argparse.ArgumentParser. + + :return: (ArgumentParser) """ import argparse return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + def atari_arg_parser(): """ Create an argparse.ArgumentParser for run_atari.py. + + :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + parser.add_argument('--num-timesteps', type=int, default=int(1e7)) return parser + def mujoco_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. + + :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') @@ -79,9 +111,12 @@ def mujoco_arg_parser(): parser.add_argument('--play', default=False, action='store_true') return parser + def robotics_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. 
+ + :return: (ArgumentParser) parser {'--env': 'FetchReach-v0', '--seed': 0, '--num-timesteps': int(1e6)} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') diff --git a/baselines/common/console_util.py b/baselines/common/console_util.py index 8adc3f83ad..c8b4c94cb2 100644 --- a/baselines/common/console_util.py +++ b/baselines/common/console_util.py @@ -1,31 +1,51 @@ from __future__ import print_function -from contextlib import contextmanager + import numpy as np -import time + # ================================================================ # Misc # ================================================================ + def fmt_row(width, row, header=False): + """ + fits a list of items to at least a certain length + + :param width: (int) the minimum width of the string + :param row: ([Any]) a list of object you wish to get the string representation + :param header: (bool) whether or not to return the string as a header + :return: (str) the string representation of all the elements in 'row', of length >= 'width' + """ out = " | ".join(fmt_item(x, width) for x in row) - if header: out = out + "\n" + "-"*len(out) + if header: + out = out + "\n" + "-" * len(out) return out -def fmt_item(x, l): - if isinstance(x, np.ndarray): - assert x.ndim==0 - x = x.item() - if isinstance(x, (float, np.float32, np.float64)): - v = abs(x) - if (v < 1e-4 or v > 1e+4) and v > 0: - rep = "%7.2e" % x + +def fmt_item(item, min_width): + """ + fits items to a given string length + + :param item: (Any) the item you wish to get the string representation + :param min_width: (int) the minimum width of the string + :return: (str) the string representation of 'x' of length >= 'l' + """ + if isinstance(item, np.ndarray): + assert item.ndim == 0 + item = item.item() + if isinstance(item, (float, np.float32, np.float64)): + value = abs(item) + if (value < 1e-4 or value > 1e+4) and value > 0: + rep = "%7.2e" % item else: - rep = "%7.5f" % x - else: rep = str(x) - return " "*(l - len(rep)) + rep + rep = "%7.5f" % item + else: + rep = str(item) + return " " * (min_width - len(rep)) + rep -color2num = dict( + +COLOR_TO_NUM = dict( gray=30, red=31, green=32, @@ -37,23 +57,22 @@ def fmt_item(x, l): crimson=38 ) + def colorize(string, color, bold=False, highlight=False): + """ + Colorize, bold and/or highlight a string for terminal print + + :param string: (str) input string + :param color: (str) the color, the lookup table is the dict at console_util.color2num + :param bold: (bool) if the string should be bold or not + :param highlight: (bool) if the string should be highlighted or not + :return: (str) the stylized output string + """ attr = [] - num = color2num[color] - if highlight: num += 10 + num = COLOR_TO_NUM[color] + if highlight: + num += 10 attr.append(str(num)) - if bold: attr.append('1') + if bold: + attr.append('1') return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) - - -MESSAGE_DEPTH = 0 - -@contextmanager -def timed(msg): - global MESSAGE_DEPTH #pylint: disable=W0603 - print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) - tstart = time.time() - MESSAGE_DEPTH += 1 - yield - MESSAGE_DEPTH -= 1 - print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) diff --git a/baselines/common/dataset.py b/baselines/common/dataset.py index 41a38c8af6..1f951b3b02 100644 --- a/baselines/common/dataset.py +++ b/baselines/common/dataset.py @@ -1,31 +1,49 @@ import numpy as np + class Dataset(object): def 
__init__(self, data_map, deterministic=False, shuffle=True): + """ + Data loader that handles batches and shuffling. + WARNING: this will alter the given data_map ordering, as dicts are mutable + + :param data_map: (dict) the input data, where every column is a key + :param deterministic: (bool) disables the shuffle function + :param shuffle: (bool) enable auto shuffle + """ self.data_map = data_map self.deterministic = deterministic self.enable_shuffle = shuffle - self.n = next(iter(data_map.values())).shape[0] + self.n_samples = next(iter(data_map.values())).shape[0] self._next_id = 0 self.shuffle() def shuffle(self): + """ + shuffles the data_map + """ if self.deterministic: return - perm = np.arange(self.n) + perm = np.arange(self.n_samples) np.random.shuffle(perm) for key in self.data_map: self.data_map[key] = self.data_map[key][perm] - self._next_id = 0 - def next_batch(self, batch_size): - if self._next_id >= self.n and self.enable_shuffle: - self.shuffle() + """ + returns a batch of data of a given size + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self._next_id >= self.n_samples: + self._next_id = 0 + if self.enable_shuffle: + self.shuffle() cur_id = self._next_id - cur_batch_size = min(batch_size, self.n - self._next_id) + cur_batch_size = min(batch_size, self.n_samples - self._next_id) self._next_id += cur_batch_size data_map = dict() @@ -34,13 +52,27 @@ def next_batch(self, batch_size): return data_map def iterate_once(self, batch_size): - if self.enable_shuffle: self.shuffle() + """ + generator that iterates over the dataset + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self.enable_shuffle: + self.shuffle() - while self._next_id <= self.n - batch_size: + while self._next_id <= self.n_samples - batch_size: yield self.next_batch(batch_size) self._next_id = 0 def subset(self, num_elements, deterministic=True): + """ + Return a subset of the current dataset + + :param num_elements: (int) the number of element you wish to have in the subset + :param deterministic: (bool) disables the shuffle function + :return: (Dataset) a new subset of the current Dataset object + """ data_map = dict() for key in self.data_map: data_map[key] = self.data_map[key][:num_elements] @@ -48,13 +80,24 @@ def subset(self, num_elements, deterministic=True): def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): + """ + Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. 
+ + :param arrays: (tuple) a tuple of arrays + :param num_batches: (int) the number of batches, must be None is batch_size is defined + :param batch_size: (int) the size of the batch, must be None is num_batches is defined + :param shuffle: (bool) enable auto shuffle + :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size + :return: (tuples) a tuple of a batch of the arrays + """ assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' arrays = tuple(map(np.asarray, arrays)) - n = arrays[0].shape[0] - assert all(a.shape[0] == n for a in arrays[1:]) - inds = np.arange(n) - if shuffle: np.random.shuffle(inds) - sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches + n_samples = arrays[0].shape[0] + assert all(a.shape[0] == n_samples for a in arrays[1:]) + inds = np.arange(n_samples) + if shuffle: + np.random.shuffle(inds) + sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches for batch_inds in np.array_split(inds, sections): if include_final_partial_batch or len(batch_inds) == batch_size: yield tuple(a[batch_inds] for a in arrays) diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 8a57c37605..18202232f9 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -1,309 +1,493 @@ import tensorflow as tf -import numpy as np -import baselines.common.tf_util as U -from baselines.a2c.utils import fc from tensorflow.python.ops import math_ops +import numpy as np +from gym import spaces + +from baselines.a2c.utils import linear + -class Pd(object): +class ProbabilityDistribution(object): """ A particular probability distribution """ + def flatparam(self): + """ + Return the direct probabilities + + :return: ([float]) the probabilites + """ raise NotImplementedError + def mode(self): + """ + Returns the index of the highest probability + + :return: (int) the max index of the probabilites + """ raise NotImplementedError + def neglogp(self, x): + """ + returns the of the negative log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The negative log likelihood of the distribution + """ # Usually it's easier to define the negative logprob raise NotImplementedError + def kl(self, other): + """ + Calculates the Kullback-Leiber divergence from the given probabilty distribution + + :param other: ([float]) the distibution to compare with + :return: (float) the KL divergence of the two distributions + """ raise NotImplementedError + def entropy(self): + """ + Returns shannon's entropy of the probability + + :return: (float) the entropy + """ raise NotImplementedError + def sample(self): + """ + Sample an index from the probabilty distribution + + :return: (int) the sampled index + """ raise NotImplementedError + def logp(self, x): + """ + returns the of the log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The log likelihood of the distribution + """ return - self.neglogp(x) -class PdType(object): + +class ProbabilityDistributionType(object): """ Parametrized family of probability distributions """ - def pdclass(self): + + def probability_distribution_class(self): + """ + returns the ProbabilityDistribution class of this type + + :return: (Type ProbabilityDistribution) the probability distribution class associated + """ raise NotImplementedError - def pdfromflat(self, flat): - return self.pdclass()(flat) - def 
pdfromlatent(self, latent_vector): + + def proba_distribution_from_flat(self, flat): + """ + returns the probability distribution from flat probabilities + + :param flat: ([float]) the flat probabilities + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + return self.probability_distribution_class()(flat) + + def proba_distribution_from_latent(self, latent_vector): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ raise NotImplementedError + def param_shape(self): + """ + returns the shape of the input parameters + + :return: ([int]) the shape + """ raise NotImplementedError + def sample_shape(self): + """ + returns the shape of the sampling + + :return: ([int]) the shape + """ raise NotImplementedError + def sample_dtype(self): + """ + returns the type of the sampling + + :return: (type) the type + """ raise NotImplementedError def param_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) + """ + returns the TensorFlow placeholder for the input parameters + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name) + def sample_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) + """ + returns the TensorFlow placeholder for the sampling + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name) + -class CategoricalPdType(PdType): - def __init__(self, ncat): - self.ncat = ncat - def pdclass(self): - return CategoricalPd - def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) - return self.pdfromflat(pdparam), pdparam +class CategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_cat): + """ + The probability distribution type for categorical input + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def probability_distribution_class(self): + return CategoricalProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :param init_scale: (float) the inital scale of the distribution + :param init_bias: (float) the inital bias of the distribution + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + pdparam = linear(latent_vector, 'pi', self.n_cat, init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam), pdparam def param_shape(self): - return [self.ncat] + return [self.n_cat] + def sample_shape(self): return [] + def sample_dtype(self): return tf.int32 -class MultiCategoricalPdType(PdType): - def __init__(self, nvec): - self.ncats = nvec - def pdclass(self): - return MultiCategoricalPd - def pdfromflat(self, 
flat): - return MultiCategoricalPd(self.ncats, flat) +class MultiCategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_vec): + """ + The probability distribution type for multiple categorical input + + :param n_vec: (int) the number of vectors + """ + self.n_cats = n_vec + + def probability_distribution_class(self): + return MultiCategoricalProbabilityDistribution + + def proba_distribution_from_flat(self, flat): + return MultiCategoricalProbabilityDistribution(self.n_cats, flat) + + def proba_distribution_from_latent(self, latent_vector): + raise NotImplementedError + def param_shape(self): - return [sum(self.ncats)] + return [sum(self.n_cats)] + def sample_shape(self): - return [len(self.ncats)] + return [len(self.n_cats)] + def sample_dtype(self): return tf.int32 -class DiagGaussianPdType(PdType): + +class DiagGaussianProbabilityDistributionType(ProbabilityDistributionType): def __init__(self, size): + """ + The probability distribution type for multivariate gaussian input + + :param size: (int) the number of dimentions of the multivariate gaussian + """ self.size = size - def pdclass(self): - return DiagGaussianPd - def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): - mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) + def probability_distribution_class(self): + return DiagGaussianProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :param init_scale: (float) the inital scale of the distribution + :param init_bias: (float) the inital bias of the distribution + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + mean = linear(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - return self.pdfromflat(pdparam), mean + return self.proba_distribution_from_flat(pdparam), mean def param_shape(self): - return [2*self.size] + return [2 * self.size] + def sample_shape(self): return [self.size] + def sample_dtype(self): return tf.float32 -class BernoulliPdType(PdType): + +class BernoulliProbabilityDistributionType(ProbabilityDistributionType): def __init__(self, size): + """ + The probability distribution type for bernoulli input + + :param size: (int) the number of dimentions of the bernoulli distribution + """ self.size = size - def pdclass(self): - return BernoulliPd + + def probability_distribution_class(self): + return BernoulliProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector): + raise NotImplementedError + def param_shape(self): return [self.size] + def sample_shape(self): return [self.size] + def sample_dtype(self): return tf.int32 -# WRONG SECOND DERIVATIVES -# class CategoricalPd(Pd): -# def __init__(self, logits): -# self.logits = logits -# self.ps = tf.nn.softmax(logits) -# @classmethod -# def fromflat(cls, flat): -# return cls(flat) -# def flatparam(self): -# return self.logits -# def mode(self): -# return U.argmax(self.logits, axis=-1) -# def logp(self, x): -# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) -# def kl(self, other): -# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ -# - 
tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def entropy(self): -# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def sample(self): -# u = tf.random_uniform(tf.shape(self.logits)) -# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) - -class CategoricalPd(Pd): + +class CategoricalProbabilityDistribution(ProbabilityDistribution): def __init__(self, logits): + """ + Probability distributions from categorical input + + :param logits: ([float]) the categorical logits input + """ self.logits = logits + def flatparam(self): return self.logits + def mode(self): return tf.argmax(self.logits, axis=-1) + def neglogp(self, x): - # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) + # return tf.nn. (logits=self.logits, labels=x) # Note: we can't use sparse_softmax_cross_entropy_with_logits because # the implementation does not allow second-order derivatives... one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) return tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=one_hot_actions) + def kl(self, other): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + a_1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + exp_a_1 = tf.exp(a_1) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keep_dims=True) + z_1 = tf.reduce_sum(exp_a_1, axis=-1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (a_0 - tf.log(z_0) - a_1 + tf.log(z_1)), axis=-1) + def entropy(self): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), axis=-1) + def sample(self): - u = tf.random_uniform(tf.shape(self.logits)) - return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) + uniform = tf.random_uniform(tf.shape(self.logits)) + return tf.argmax(self.logits - tf.log(-tf.log(uniform)), axis=-1) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the categorical logits input + :return: (ProbabilityDistribution) the instance from the given categorical input + """ return cls(flat) -class MultiCategoricalPd(Pd): + +class MultiCategoricalProbabilityDistribution(ProbabilityDistribution): def __init__(self, nvec, flat): + """ + Probability distributions from multicategorical input + + :param nvec: (int) the number of categorical inputs + :param flat: ([float]) the categorical logits input + """ self.flat = flat - self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) + self.categoricals = list(map(CategoricalProbabilityDistribution, tf.split(flat, nvec, axis=-1))) + def flatparam(self): return self.flat + def mode(self): return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), 
tf.int32) + def neglogp(self, x): return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) + def kl(self, other): return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) + def entropy(self): return tf.add_n([p.entropy() for p in self.categoricals]) + def sample(self): return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the multi categorical logits input + :return: (ProbabilityDistribution) the instance from the given multi categorical input + """ raise NotImplementedError -class DiagGaussianPd(Pd): + +class DiagGaussianProbabilityDistribution(ProbabilityDistribution): def __init__(self, flat): + """ + Probability distributions from multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + """ self.flat = flat - mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) + mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) self.mean = mean self.logstd = logstd self.std = tf.exp(logstd) + def flatparam(self): return self.flat + def mode(self): return self.mean + def neglogp(self, x): return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ + tf.reduce_sum(self.logstd, axis=-1) + def kl(self, other): - assert isinstance(other, DiagGaussianPd) - return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) + assert isinstance(other, DiagGaussianProbabilityDistribution) + return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / + (2.0 * tf.square(other.std)) - 0.5, axis=-1) + def entropy(self): return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) + def sample(self): return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + :return: (ProbabilityDistribution) the instance from the given multivariate gaussian input data + """ return cls(flat) -class BernoulliPd(Pd): + +class BernoulliProbabilityDistribution(ProbabilityDistribution): def __init__(self, logits): + """ + Probability distributions from bernoulli input + + :param logits: ([float]) the bernoulli input data + """ self.logits = logits - self.ps = tf.sigmoid(logits) + self.probabilities = tf.sigmoid(logits) + def flatparam(self): return self.logits + def mode(self): - return tf.round(self.ps) + return tf.round(self.probabilities) + def neglogp(self, x): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), + axis=-1) + def kl(self, other): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, + labels=self.probabilities), axis=-1) - \ + 
tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + def entropy(self): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + def sample(self): - u = tf.random_uniform(tf.shape(self.ps)) - return tf.to_float(math_ops.less(u, self.ps)) + samples_from_uniform = tf.random_uniform(tf.shape(self.probabilities)) + return tf.to_float(math_ops.less(samples_from_uniform, self.probabilities)) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new bernoulli input + + :param flat: ([float]) the bernoulli input data + :return: (ProbabilityDistribution) the instance from the given bernoulli input data + """ return cls(flat) -def make_pdtype(ac_space): - from gym import spaces + +def make_proba_dist_type(ac_space): + """ + return an instance of ProbabilityDistributionType for the correct type of action space + + :param ac_space: (Gym Space) the input action space + :return: (ProbabilityDistributionType) the approriate instance of a ProbabilityDistributionType + """ if isinstance(ac_space, spaces.Box): assert len(ac_space.shape) == 1 - return DiagGaussianPdType(ac_space.shape[0]) + return DiagGaussianProbabilityDistributionType(ac_space.shape[0]) elif isinstance(ac_space, spaces.Discrete): - return CategoricalPdType(ac_space.n) + return CategoricalProbabilityDistributionType(ac_space.n) elif isinstance(ac_space, spaces.MultiDiscrete): - return MultiCategoricalPdType(ac_space.nvec) + return MultiCategoricalProbabilityDistributionType(ac_space.nvec) elif isinstance(ac_space, spaces.MultiBinary): - return BernoulliPdType(ac_space.n) + return BernoulliProbabilityDistributionType(ac_space.n) else: raise NotImplementedError -def shape_el(v, i): - maybe = v.get_shape()[i] + +def shape_el(tensor, index): + """ + get the shape of a TensorFlow Tensor element + + :param tensor: (TensorFlow Tensor) the input tensor + :param index: (int) the element + :return: ([int]) the shape + """ + maybe = tensor.get_shape()[index] if maybe is not None: return maybe else: - return tf.shape(v)[i] - -@U.in_session -def test_probtypes(): - np.random.seed(0) - - pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) - diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 - validate_probtype(diag_gauss, pdparam_diag_gauss) - - pdparam_categorical = np.array([-.2, .3, .5]) - categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 - validate_probtype(categorical, pdparam_categorical) - - nvec = [1,2,3] - pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) - multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 - validate_probtype(multicategorical, pdparam_multicategorical) - - pdparam_bernoulli = np.array([-.2, .3, .5]) - bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 - validate_probtype(bernoulli, pdparam_bernoulli) - - -def validate_probtype(probtype, pdparam): - N = 100000 - # Check to see if mean negative log likelihood == differential entropy - Mval = np.repeat(pdparam[None, :], N, axis=0) - M = probtype.param_placeholder([N]) - X = probtype.sample_placeholder([N]) - pd = probtype.pdfromflat(M) - calcloglik = U.function([X, M], pd.logp(X)) - calcent = U.function([M], pd.entropy()) - Xval = tf.get_default_session().run(pd.sample(), 
feed_dict={M:Mval}) - logliks = calcloglik(Xval, Mval) - entval_ll = - logliks.mean() #pylint: disable=E1101 - entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - entval = calcent(Mval).mean() #pylint: disable=E1101 - assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas - - # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] - M2 = probtype.param_placeholder([N]) - pd2 = probtype.pdfromflat(M2) - q = pdparam + np.random.randn(pdparam.size) * 0.1 - Mval2 = np.repeat(q[None, :], N, axis=0) - calckl = U.function([M, M2], pd.kl(pd2)) - klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 - logliks = calcloglik(Xval, Mval2) - klval_ll = - entval - logliks.mean() #pylint: disable=E1101 - klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas - print('ok on', probtype, pdparam) - + return tf.shape(tensor)[index] diff --git a/baselines/common/filters.py b/baselines/common/filters.py index 5ce019cd22..38d602004e 100644 --- a/baselines/common/filters.py +++ b/baselines/common/filters.py @@ -1,98 +1,211 @@ -from .running_stat import RunningStat from collections import deque + import numpy as np +from .running_stat import RunningStat + + class Filter(object): - def __call__(self, x, update=True): + """ + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): raise NotImplementedError + def reset(self): + """ + resets the filter + """ pass + def output_shape(self, input_space): + """ + returns the output shape + + :param input_space: (numpy int) + :return: (numpy int) output shape + """ + raise NotImplementedError + + class IdentityFilter(Filter): - def __call__(self, x, update=True): - return x + """ + A filter that implements an identity function + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr + + def output_shape(self, input_space): + return input_space.shape + class CompositionFilter(Filter): - def __init__(self, fs): - self.fs = fs - def __call__(self, x, update=True): - for f in self.fs: - x = f(x) - return x + def __init__(self, functions): + """ + A filter that implements a composition with other functions + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param functions: ([function]) composition of these functions and the input + """ + self.functions = functions + + def __call__(self, arr, update=True): + for func in self.functions: + arr = func(arr) + return arr + def output_shape(self, input_space): out = input_space.shape - for f in self.fs: - out = f.output_shape(out) + for func in self.functions: + out = func.output_shape(out) return out -class ZFilter(Filter): - """ - y = (x-mean)/std - using running estimates of mean,std - """ +class ZFilter(Filter): def __init__(self, shape, demean=True, destd=True, clip=10.0): + """ + A filter that implements a z-filter + y = (x-mean)/std + using running estimates of mean,std + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param shape: ([int]) the shape of the input + :param demean: (bool) filter mean + :param destd: (bool) filter standard deviation + :param clip: (float) clip filter absolute value to this value + """ self.demean = demean self.destd = destd self.clip = clip - self.rs = RunningStat(shape) + self.running_stat = RunningStat(shape) - def __call__(self, x, update=True): - if update: self.rs.push(x) + def __call__(self, arr, update=True): + if update: + self.running_stat.push(arr) if self.demean: - x = x - self.rs.mean + arr = arr - self.running_stat.mean if self.destd: - x = x / (self.rs.std+1e-8) + arr = arr / (self.running_stat.std + 1e-8) if self.clip: - x = np.clip(x, -self.clip, self.clip) - return x + arr = np.clip(arr, -self.clip, self.clip) + return arr + def output_shape(self, input_space): return input_space.shape + class AddClock(Filter): def __init__(self): + """ + A filter that appends a counter to the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ self.count = 0 + def reset(self): self.count = 0 - def __call__(self, x, update=True): - return np.append(x, self.count/100.0) + + def __call__(self, arr, update=True): + return np.append(arr, self.count / 100.0) + def output_shape(self, input_space): - return (input_space.shape[0]+1,) + return input_space.shape[0] + 1, + class FlattenFilter(Filter): - def __call__(self, x, update=True): - return x.ravel() + """ + A filter that flattens the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr.ravel() + def output_shape(self, input_space): - return (int(np.prod(input_space.shape)),) + return int(np.prod(input_space.shape)), + class Ind2OneHotFilter(Filter): - def __init__(self, n): - self.n = n - def __call__(self, x, update=True): - out = np.zeros(self.n) - out[x] = 1 + def __init__(self, n_cat): + """ + A filter that turns indices to onehot encoding + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def __call__(self, arr, update=True): + out = np.zeros(self.n_cat) + out[arr] = 1 return out + def output_shape(self, input_space): - return (input_space.n,) + return input_space.n, + class DivFilter(Filter): def __init__(self, divisor): + """ + A filter that divides the input from a value + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param divisor: (float) the number you want to divide by + """ self.divisor = divisor - def __call__(self, x, update=True): - return x / self.divisor + + def __call__(self, arr, update=True): + return arr / self.divisor + def output_shape(self, input_space): return input_space.shape + class StackFilter(Filter): def __init__(self, length): + """ + A filter that runs a stacking of a 'length' inputs + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param length: (int) the number of inputs to stack + """ self.stack = deque(maxlen=length) + def reset(self): self.stack.clear() - def __call__(self, x, update=True): - self.stack.append(x) + + def __call__(self, arr, update=True): + self.stack.append(arr) while len(self.stack) < self.stack.maxlen: - self.stack.append(x) + self.stack.append(arr) return np.concatenate(self.stack, axis=-1) + def output_shape(self, input_space): return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) diff --git a/baselines/common/identity_env.py b/baselines/common/identity_env.py index f07cd5b8d4..44e1046e67 100644 --- a/baselines/common/identity_env.py +++ b/baselines/common/identity_env.py @@ -3,28 +3,32 @@ class IdentityEnv(Env): - def __init__( - self, - dim, - ep_length=100, - ): + def __init__(self, dim, ep_length=100): + """ + Identity environment for testing purposes + :param dim: (int) the size of the dimentions you want to learn + :param ep_length: (int) the length of each episodes in timesteps + """ self.action_space = Discrete(dim) + self.ep_length = ep_length self.reset() def reset(self): self._choose_next_state() self.observation_space = self.action_space - return self.state - def step(self, actions): - rew = self._get_reward(actions) + def step(self, action): + reward = self._get_reward(action) self._choose_next_state() - return self.state, rew, False, {} + return self.state, reward, False, {} def _choose_next_state(self): self.state = self.action_space.sample() - def _get_reward(self, actions): - return 1 if self.state == actions else 0 + def _get_reward(self, action): + return 1 if self.state == action else 0 + + def render(self, mode='human'): + pass diff --git a/baselines/common/input.py b/baselines/common/input.py index 7fbf9fc00b..8d2419ff14 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -1,20 +1,19 @@ import tensorflow as tf from gym.spaces import Discrete, Box + def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Build observation input with encoding depending on the - observation space type - Params: - - ob_space: observation space (should be one of gym.spaces) - batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) - name: tensorflow variable name for input placeholder + """ + Build observation input with encoding depending on the observation space type - returns: tuple (input_placeholder, processed_input_tensor) - ''' + :param ob_space: (Gym Space) The observation space + :param batch_size: (int) batch size for input + (default is None, so that resulting input placeholder can take tensors with any batch size) + :param name: (str) tensorflow variable name for input placeholder + :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor + """ if isinstance(ob_space, Discrete): - input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) + input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) return input_x, processed_x @@ -26,5 +25,3 @@ def observation_input(ob_space, batch_size=None, name='Ob'): else: raise NotImplementedError - - diff --git a/baselines/common/math_util.py b/baselines/common/math_util.py index 36b8927781..327e69fbe0 100644 --- a/baselines/common/math_util.py +++ b/baselines/common/math_util.py @@ -2,27 +2,21 @@ import scipy.signal 
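As a quick illustration of the refactored `observation_input` helper in `baselines/common/input.py` above: for a `Discrete` space it returns an integer placeholder plus its one-hot, float-cast encoding. The sketch below is illustrative only (TF 1.x graph/session API assumed; the space size and fed ids are examples, not part of this diff):

import tensorflow as tf
from gym.spaces import Discrete

from baselines.common.input import observation_input

# For a Discrete space the placeholder takes integer ids and the processed
# tensor is their one-hot, float-cast encoding (see the hunk above).
input_ph, processed_obs = observation_input(Discrete(4), batch_size=None, name='Ob')

with tf.Session() as sess:
    one_hot = sess.run(processed_obs, feed_dict={input_ph: [0, 2, 3]})
    print(one_hot.shape)  # -> (3, 4); each row is the one-hot encoding of one id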
-def discount(x, gamma): +def discount(vector, gamma): """ - computes discounted sums along 0th dimension of x. - - inputs - ------ - x: ndarray - gamma: float - - outputs - ------- - y: ndarray with same shape as x, satisfying - + computes discounted sums along 0th dimension of vector x. y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], where k = len(x) - t - 1 + :param vector: (numpy array) the input vector + :param gamma: (float) the discount value + :return: (numpy Number) the output vector """ - assert x.ndim >= 1 - return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] + assert vector.ndim >= 1 + return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1] + -def explained_variance(ypred,y): +def explained_variance(y_pred, y_true): """ Computes fraction of variance that ypred explains about y. Returns 1 - Var[y-ypred] / Var[y] @@ -32,54 +26,78 @@ def explained_variance(ypred,y): ev=1 => perfect prediction ev<0 => worse than just predicting zero + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y """ - assert y.ndim == 1 and ypred.ndim == 1 - vary = np.var(y) - return np.nan if vary==0 else 1 - np.var(y-ypred)/vary + assert y_true.ndim == 1 and y_pred.ndim == 1 + var_y = np.var(y_true) + return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y -def explained_variance_2d(ypred, y): - assert y.ndim == 2 and ypred.ndim == 2 - vary = np.var(y, axis=0) - out = 1 - np.var(y-ypred)/vary - out[vary < 1e-10] = 0 - return out -def ncc(ypred, y): - return np.corrcoef(ypred, y)[1,0] +def explained_variance_2d(y_pred, y_true): + """ + Computes fraction of variance that ypred explains about y, for 2D arrays. + Returns 1 - Var[y-ypred] / Var[y] + + interpretation: + ev=0 => might as well have predicted zero + ev=1 => perfect prediction + ev<0 => worse than just predicting zero + + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y + """ + assert y_true.ndim == 2 and y_pred.ndim == 2 + var_y = np.var(y_true, axis=0) + explained_var = 1 - np.var(y_true - y_pred) / var_y + explained_var[var_y < 1e-10] = 0 + return explained_var + def flatten_arrays(arrs): + """ + flattens a list of arrays down to 1D + + :param arrs: ([numpy Number]) arrays + :return: (numpy Number) 1D flattend array + """ return np.concatenate([arr.flat for arr in arrs]) + def unflatten_vector(vec, shapes): - i=0 + """ + reshape a flattened array + + :param vec: (numpy Number) 1D arrays + :param shapes: (tuple) + :return: ([numpy Number]) reshaped array + """ + i = 0 arrs = [] for shape in shapes: size = np.prod(shape) - arr = vec[i:i+size].reshape(shape) + arr = vec[i:i + size].reshape(shape) arrs.append(arr) i += size return arrs -def discount_with_boundaries(X, New, gamma): + +def discount_with_boundaries(rewards, episode_starts, gamma): """ - X: 2d array of floats, time x features - New: 2d array of bools, indicating when a new episode has started + computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode. + y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], + where k = len(x) - t - 1 + + :param rewards: (numpy Number) the input vector (rewards) + :param episode_starts: (numpy Number) 2d array of bools, indicating when a new episode has started + :param gamma: (float) the discount factor + :return: (numpy Number) the output vector (discounted rewards) """ - Y = np.zeros_like(X) - T = X.shape[0] - Y[T-1] = X[T-1] - for t in range(T-2, -1, -1): - Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) - return Y - -def test_discount_with_boundaries(): - gamma=0.9 - x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') - starts = [1.0, 0.0, 0.0, 1.0] - y = discount_with_boundaries(x, starts, gamma) - assert np.allclose(y, [ - 1 + gamma * 2 + gamma**2 * 3, - 2 + gamma * 3, - 3, - 4 - ]) \ No newline at end of file + discounted_rewards = np.zeros_like(rewards) + n_samples = rewards.shape[0] + discounted_rewards[n_samples - 1] = rewards[n_samples - 1] + for step in range(n_samples - 2, -1, -1): + discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) + return discounted_rewards diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 9985dea205..19216c95d8 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -1,15 +1,23 @@ -import gym -import numpy as np import os import pickle import random import tempfile import zipfile +import gym +import numpy as np +import tensorflow as tf + def zipsame(*seqs): - L = len(seqs[0]) - assert all(len(seq) == L for seq in seqs[1:]) + """ + Performes a zip function, but asserts that all zipped elements are of the same size + + :param seqs: a list of arrays that are zipped together + :return: the zipped arguments + """ + length = len(seqs[0]) + assert all(len(seq) == length for seq in seqs[1:]) return zip(*seqs) @@ -20,79 +28,80 @@ def unpack(seq, sizes): Example: unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) + + :param seq: (Iterable) the sequence to unpack + :param sizes: ([int]) the shape to unpack + :return: ([Any] or Any) the unpacked sequence """ seq = list(seq) - it = iter(seq) + iterator = iter(seq) assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) for size in sizes: if size is None: - yield it.__next__() + yield iterator.__next__() else: - li = [] + _list = [] for _ in range(size): - li.append(it.__next__()) - yield li + _list.append(iterator.__next__()) + yield _list class EzPickle(object): - """Objects that are pickled and unpickled via their constructor - arguments. + def __init__(self, *args, **kwargs): + """ + Objects that are pickled and unpickled via their constructor arguments. - Example usage: + Example usage: - class Dog(Animal, EzPickle): - def __init__(self, furcolor, tailkind="bushy"): - Animal.__init__() - EzPickle.__init__(furcolor, tailkind) - ... + class Dog(Animal, EzPickle): + def __init__(self, furcolor, tailkind="bushy"): + Animal.__init__() + EzPickle.__init__(furcolor, tailkind) + ... - When this object is unpickled, a new Dog will be constructed by passing the provided - furcolor and tailkind into the constructor. However, philosophers are still not sure - whether it is still the same dog. + When this object is unpickled, a new Dog will be constructed by passing the provided + furcolor and tailkind into the constructor. However, philosophers are still not sure + whether it is still the same dog. - This is generally needed only for environments which wrap C/C++ code, such as MuJoCo - and Atari. 
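Illustrative only, not part of this diff: a usage sketch of the renamed discount_with_boundaries, mirroring the inline test that this patch removes (the rewards, episode starts, and expected returns come from that removed test).
import numpy as np
from baselines.common.math_util import discount_with_boundaries

gamma = 0.9
rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
episode_starts = np.array([1.0, 0.0, 0.0, 1.0], 'float32')
discounted = discount_with_boundaries(rewards, episode_starts, gamma)
# The last reward starts a new episode, so it is not folded into the earlier returns
assert np.allclose(discounted, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4])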
- """ + This is generally needed only for environments which wrap C/C++ code, such as MuJoCo + and Atari. - def __init__(self, *args, **kwargs): + :param args: ezpickle args + :param kwargs: ezpickle kwargs + """ self._ezpickle_args = args self._ezpickle_kwargs = kwargs def __getstate__(self): return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} - def __setstate__(self, d): - out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) + def __setstate__(self, _dict): + out = type(self)(*_dict["_ezpickle_args"], **_dict["_ezpickle_kwargs"]) self.__dict__.update(out.__dict__) -def set_global_seeds(i): - try: - import tensorflow as tf - except ImportError: - pass - else: - tf.set_random_seed(i) - np.random.seed(i) - random.seed(i) +def set_global_seeds(seed): + """ + set the seed for python random, tensorflow, and numpy + + :param seed: (int) the seed + """ + tf.set_random_seed(seed) + np.random.seed(seed) + random.seed(seed) def pretty_eta(seconds_left): - """Print the number of seconds in human readable format. + """ + Print the number of seconds in human readable format. Examples: 2 days 2 hours and 37 minutes less than a minute - Paramters - --------- - seconds_left: int - Number of seconds to be converted to the ETA - Returns - ------- - eta: str - String representing the pretty ETA. + :param seconds_left: (int) Number of seconds to be converted to the ETA + :return: (str) String representing the pretty ETA. """ minutes_left = seconds_left // 60 seconds_left %= 60 @@ -121,27 +130,21 @@ def helper(cnt, name): class RunningAvg(object): def __init__(self, gamma, init_value=None): - """Keep a running estimate of a quantity. This is a bit like mean + """ + Keep a running estimate of a quantity. This is a bit like mean but more sensitive to recent changes. - Parameters - ---------- - gamma: float - Must be between 0 and 1, where 0 is the most sensitive to recent - changes. - init_value: float or None - Initial value of the estimate. If None, it will be set on the first update. + :param gamma: (float) Must be between 0 and 1, where 0 is the most sensitive to recent changes. + :param init_value: (float) Initial value of the estimate. If None, it will be set on the first update. """ self._value = init_value self._gamma = gamma def update(self, new_val): - """Update the estimate. + """ + Update the estimate. - Parameters - ---------- - new_val: float - new observated value of estimated quantity. + :param new_val: (float) new observated value of estimated quantity. """ if self._value is None: self._value = new_val @@ -149,43 +152,36 @@ def update(self, new_val): self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val def __float__(self): - """Get the current estimate""" + """ + Get the current estimate + + :return: (float) current value + """ return self._value -def boolean_flag(parser, name, default=False, help=None): - """Add a boolean flag to argparse parser. - - Parameters - ---------- - parser: argparse.Parser - parser to add the flag to - name: str - -- will enable the flag, while --no- will disable it - default: bool or None - default value of the flag - help: str - help string for the flag + +def boolean_flag(parser, name, default=False, help_msg=None): + """ + Add a boolean flag to argparse parser. 
+ + :param parser: (argparse.Parser) parser to add the flag to + :param name: (str) -- will enable the flag, while --no- will disable it + :param default: (bool) default value of the flag + :param help_msg: (str) help string for the flag """ dest = name.replace('-', '_') - parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) + parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help_msg) parser.add_argument("--no-" + name, action="store_false", dest=dest) def get_wrapper_by_name(env, classname): - """Given an a gym environment possibly wrapped multiple times, returns a wrapper + """ + Given a gym environment possibly wrapped multiple times, returns a wrapper of class named classname or raises ValueError if no such wrapper was applied - Parameters - ---------- - env: gym.Env of gym.Wrapper - gym environment - classname: str - name of the wrapper - - Returns - ------- - wrapper: gym.Wrapper - wrapper named classname + :param env: (Gym Environment) the environment + :param classname: (str) name of the wrapper + :return: (Gym Environment) the wrapped environment """ currentenv = env while True: @@ -198,7 +194,8 @@ def get_wrapper_by_name(env, classname): def relatively_safe_pickle_dump(obj, path, compression=False): - """This is just like regular pickle dump, except from the fact that failure cases are + """ + This is just like regular pickle dump, except that the failure cases are different: - It's never possible that we end up with a pickle in corrupted state. @@ -210,14 +207,9 @@ The indended use case is periodic checkpoints of experiment state, such that we never corrupt previous checkpoints if the current one fails. - Parameters - ---------- - obj: object - object to pickle - path: str - path to the output file - compression: bool - if true pickle will be compressed + :param obj: (Object) object to pickle + :param path: (str) path to the output file + :param compression: (bool) if true pickle will be compressed """ temp_storage = path + ".relatively_safe" if compression: @@ -228,31 +220,24 @@ with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: myzip.write(uncompressed_file.name, "data") else: - with open(temp_storage, "wb") as f: - pickle.dump(obj, f) + with open(temp_storage, "wb") as file_handler: + pickle.dump(obj, file_handler) os.rename(temp_storage, path) def pickle_load(path, compression=False): - """Unpickle a possible compressed pickle. - - Parameters - ---------- - path: str - path to the output file - compression: bool - if true assumes that pickle was compressed when created and attempts decompression. - - Returns - ------- - obj: object - the unpickled object + """ + Unpickle a possibly compressed pickle. + + :param path: (str) path to the output file + :param compression: (bool) if true assumes that pickle was compressed when created and attempts decompression.
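Illustrative only, not part of this diff: relatively_safe_pickle_dump and pickle_load above are meant to be used as a pair; a sketch of the round trip (the path and payload are made up).
from baselines.common.misc_util import pickle_load, relatively_safe_pickle_dump

# Write-then-rename keeps the previous checkpoint intact if the dump fails midway
relatively_safe_pickle_dump({'step': 42}, '/tmp/checkpoint.pkl', compression=True)
state = pickle_load('/tmp/checkpoint.pkl', compression=True)
assert state['step'] == 42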
+ :return: (Object) the unpickled object """ if compression: with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: - with myzip.open("data") as f: - return pickle.load(f) + with myzip.open("data") as file_handler: + return pickle.load(file_handler) else: - with open(path, "rb") as f: - return pickle.load(f) + with open(path, "rb") as file_handler: + return pickle.load(file_handler) diff --git a/baselines/common/mpi_adam.py b/baselines/common/mpi_adam.py index 4902caf629..cd17274cb0 100644 --- a/baselines/common/mpi_adam.py +++ b/baselines/common/mpi_adam.py @@ -1,46 +1,78 @@ from mpi4py import MPI -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_utils import tensorflow as tf import numpy as np + class MpiAdam(object): - def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): + def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None, + sess=None): + """ + A parallel MPI implementation of the Adam optimizer for TensorFlow + https://arxiv.org/abs/1412.6980 + + :param var_list: ([TensorFlow Tensor]) the variables + :param beta1: (float) Adam beta1 parameter + :param beta2: (float) Adam beta2 parameter + :param epsilon: (float) to help with preventing arithmetic issues + :param scale_grad_by_procs: (bool) if the scaling should be done by processes + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param sess: (TensorFlow Session) if None, tf.get_default_session() + """ self.var_list = var_list self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.scale_grad_by_procs = scale_grad_by_procs - size = sum(U.numel(v) for v in var_list) - self.m = np.zeros(size, 'float32') - self.v = np.zeros(size, 'float32') - self.t = 0 - self.setfromflat = U.SetFromFlat(var_list) - self.getflat = U.GetFlat(var_list) + size = sum(tf_utils.numel(v) for v in var_list) + # Exponential moving average of gradient values + # "first moment estimate" m in the paper + self.exp_avg = np.zeros(size, 'float32') + # Exponential moving average of squared gradient values + # "second raw moment estimate" v in the paper + self.exp_avg_sq = np.zeros(size, 'float32') + self.step = 0 + self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess) + self.getflat = tf_utils.GetFlat(var_list, sess=sess) + self.comm = MPI.COMM_WORLD if comm is None else comm - def update(self, localg, stepsize): - if self.t % 100 == 0: + def update(self, local_grad, learning_rate): + """ + update the values of the graph + + :param local_grad: (numpy float) the gradient + :param learning_rate: (float) the learning rate for the update + """ + if self.step % 100 == 0: self.check_synced() - localg = localg.astype('float32') - globalg = np.zeros_like(localg) - self.comm.Allreduce(localg, globalg, op=MPI.SUM) + local_grad = local_grad.astype('float32') + global_grad = np.zeros_like(local_grad) + self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM) if self.scale_grad_by_procs: - globalg /= self.comm.Get_size() + global_grad /= self.comm.Get_size() - self.t += 1 - a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) - self.m = self.beta1 * self.m + (1 - self.beta1) * globalg - self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) - step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) + self.step += 1 + # Learning rate with bias correction + step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step) + # Decay the
first and second moment running average coefficient + self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad + self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad) + step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon) self.setfromflat(self.getflat() + step) def sync(self): + """ + syncronize the MPI threads + """ theta = self.getflat() self.comm.Bcast(theta, root=0) self.setfromflat(theta) def check_synced(self): - if self.comm.Get_rank() == 0: # this is root + """ + confirm the MPI threads are synced + """ + if self.comm.Get_rank() == 0: # this is root theta = self.getflat() self.comm.Bcast(theta, root=0) else: @@ -49,31 +81,40 @@ def check_synced(self): self.comm.Bcast(thetaroot, root=0) assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) -@U.in_session -def test_MpiAdam(): + +@tf_utils.in_session +def test_mpi_adam(): + """ + tests the MpiAdam object's functionality + """ np.random.seed(0) tf.set_random_seed(0) - a = tf.Variable(np.random.randn(3).astype('float32')) - b = tf.Variable(np.random.randn(2,5).astype('float32')) - loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) + a_var = tf.Variable(np.random.randn(3).astype('float32')) + b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) + loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) - stepsize = 1e-2 - update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) - do_update = U.function([], loss, updates=[update_op]) + learning_rate = 1e-2 + update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) + do_update = tf_utils.function([], loss, updates=[update_op]) tf.get_default_session().run(tf.global_variables_initializer()) - for i in range(10): - print(i,do_update()) + for step in range(10): + print(step, do_update()) tf.set_random_seed(0) tf.get_default_session().run(tf.global_variables_initializer()) - var_list = [a,b] - lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) + var_list = [a_var, b_var] + lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) adam = MpiAdam(var_list) - for i in range(10): - l,g = lossandgrad() - adam.update(g, stepsize) - print(i,l) \ No newline at end of file + for step in range(10): + loss, grad = lossandgrad() + adam.update(grad, learning_rate) + print(step, loss) + + +if __name__ == "__main__": + # Run with mpirun -np 2 python + test_mpi_adam() diff --git a/baselines/common/mpi_fork.py b/baselines/common/mpi_fork.py index c5e609e66c..2012f5cad3 100644 --- a/baselines/common/mpi_fork.py +++ b/baselines/common/mpi_fork.py @@ -1,10 +1,18 @@ -import os, subprocess, sys +import os +import subprocess +import sys -def mpi_fork(n, bind_to_core=False): - """Re-launches the current script with workers + +def mpi_fork(rank, bind_to_core=False): + """ + Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children + + :param rank: (int) the rank + :param bind_to_core: (bool) enables binding to core + :return: (str) the correct type of thread name """ - if n<=1: + if rank <= 1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() @@ -13,7 +21,7 @@ def mpi_fork(n, bind_to_core=False): OMP_NUM_THREADS="1", IN_MPI="1" ) - args = ["mpirun", "-np", str(n)] + args = ["mpirun", "-np", str(rank)] if bind_to_core: args += ["-bind-to", "core"] args += [sys.executable] + sys.argv diff --git a/baselines/common/mpi_moments.py 
b/baselines/common/mpi_moments.py index 7fcc6cd828..1af444a4e3 100644 --- a/baselines/common/mpi_moments.py +++ b/baselines/common/mpi_moments.py @@ -1,26 +1,47 @@ from mpi4py import MPI import numpy as np + from baselines.common import zipsame -def mpi_mean(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - if comm is None: comm = MPI.COMM_WORLD - xsum = x.sum(axis=axis, keepdims=keepdims) - n = xsum.size - localsum = np.zeros(n+1, x.dtype) - localsum[:n] = xsum.ravel() - localsum[n] = x.shape[axis] +def mpi_mean(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean of an array, using MPI + + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the means over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimentions intact + :return: (numpy Number or Number) the result of the sum + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + if comm is None: + comm = MPI.COMM_WORLD + xsum = arr.sum(axis=axis, keepdims=keepdims) + size = xsum.size + localsum = np.zeros(size + 1, arr.dtype) + localsum[:size] = xsum.ravel() + localsum[size] = arr.shape[axis] globalsum = np.zeros_like(localsum) comm.Allreduce(localsum, globalsum, op=MPI.SUM) - return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] + return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size] + + +def mpi_moments(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean and std of an array, using MPI -def mpi_moments(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) - sqdiffs = np.square(x - mean) + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the moments over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimentions intact + :return: (numpy Number or Number) the result of the moments + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True) + sqdiffs = np.square(arr - mean) meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) assert count1 == count std = np.sqrt(meansqdiff) @@ -31,30 +52,20 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False): return mean, std, count -def test_runningmeanstd(): - import subprocess - subprocess.check_call(['mpirun', '-np', '3', - 'python','-c', - 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) - def _helper_runningmeanstd(): comm = MPI.COMM_WORLD np.random.seed(0) - for (triple,axis) in [ - ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), - ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), - ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), - ]: - + for (triple, axis) in [ + ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0), + ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0), + ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1)]: - x = np.concatenate(triple, axis=axis) - ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] + arr = np.concatenate(triple, axis=axis) + ms1 = [arr.mean(axis=axis), arr.std(axis=axis), arr.shape[axis]] + ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis) - ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) - - for (a1,a2) in zipsame(ms1, ms2): - print(a1, a2) - 
assert np.allclose(a1, a2) + for (res_1, res_2) in zipsame(ms1, ms2): + print(res_1, res_2) + assert np.allclose(res_1, res_2) print("ok!") - diff --git a/baselines/common/mpi_running_mean_std.py b/baselines/common/mpi_running_mean_std.py index 408f8a22b8..4b418265a7 100644 --- a/baselines/common/mpi_running_mean_std.py +++ b/baselines/common/mpi_running_mean_std.py @@ -1,10 +1,19 @@ from mpi4py import MPI -import tensorflow as tf, baselines.common.tf_util as U, numpy as np +import tensorflow as tf +import numpy as np + +import baselines.common.tf_util as tf_util + class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-2, shape=()): + """ + calulates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ self._sum = tf.get_variable( dtype=tf.float64, shape=shape, @@ -23,74 +32,62 @@ def __init__(self, epsilon=1e-2, shape=()): self.shape = shape self.mean = tf.to_float(self._sum / self._count) - self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) + self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2)) newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') - self.incfiltparams = U.function([newsum, newsumsq, newcount], [], - updates=[tf.assign_add(self._sum, newsum), - tf.assign_add(self._sumsq, newsumsq), - tf.assign_add(self._count, newcount)]) - - - def update(self, x): - x = x.astype('float64') - n = int(np.prod(self.shape)) - totalvec = np.zeros(n*2+1, 'float64') - addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) + self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [], + updates=[tf.assign_add(self._sum, newsum), + tf.assign_add(self._sumsq, newsumsq), + tf.assign_add(self._count, newcount)]) + + def update(self, data): + """ + update the running mean and std + + :param data: (numpy Number) the data + """ + data = data.astype('float64') + data_size = int(np.prod(self.shape)) + totalvec = np.zeros(data_size * 2 + 1, 'float64') + addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(), + np.array([len(data)], dtype='float64')]) MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) - self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) + self.incfiltparams(totalvec[0: data_size].reshape(self.shape), + totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size]) -@U.in_session -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - U.initialize() - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.std(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean.eval(), rms.std.eval()] - - assert np.allclose(ms1, ms2) - -@U.in_session +@tf_util.in_session def test_dist(): + """ + test the running mean std + """ np.random.seed(0) - 
p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) - q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) - - # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) - # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) + p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) + q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) comm = MPI.COMM_WORLD - assert comm.Get_size()==2 - if comm.Get_rank()==0: - x1,x2,x3 = p1,p2,p3 - elif comm.Get_rank()==1: - x1,x2,x3 = q1,q2,q3 + assert comm.Get_size() == 2 + if comm.Get_rank() == 0: + x_1, x_2, x_3 = p_1, p_2, p_3 + elif comm.Get_rank() == 1: + x_1, x_2, x_3 = q_1, q_2, q_3 else: assert False rms = RunningMeanStd(epsilon=0.0, shape=(1,)) - U.initialize() + tf_util.initialize() - rms.update(x1) - rms.update(x2) - rms.update(x3) + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) - bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) + bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3]) - def checkallclose(x,y): - print(x,y) - return np.allclose(x,y) + def checkallclose(var_1, var_2): + print(var_1, var_2) + return np.allclose(var_1, var_2) assert checkallclose( bigvec.mean(axis=0), diff --git a/baselines/common/runners.py b/baselines/common/runners.py index 0a4b2214f7..7c9df3ce7d 100644 --- a/baselines/common/runners.py +++ b/baselines/common/runners.py @@ -1,18 +1,29 @@ import numpy as np from abc import ABC, abstractmethod + class AbstractEnvRunner(ABC): - def __init__(self, *, env, model, nsteps): + def __init__(self, *, env, model, n_steps): + """ + A runner to learn the policy of an environment for a model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + """ self.env = env self.model = model - nenv = env.num_envs - self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape - self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) + n_env = env.num_envs + self.batch_ob_shape = (n_env*n_steps,) + env.observation_space.shape + self.obs = np.zeros((n_env,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) self.obs[:] = env.reset() - self.nsteps = nsteps + self.n_steps = n_steps self.states = model.initial_state - self.dones = [False for _ in range(nenv)] + self.dones = [False for _ in range(n_env)] @abstractmethod def run(self): + """ + Run a learning step of the model + """ raise NotImplementedError diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 06ba8d8f11..d6a03d6ebf 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -1,46 +1,37 @@ import numpy as np + + class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-4, shape=()): + """ + calulates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ self.mean = np.zeros(shape, 'float64') self.var = np.ones(shape, 'float64') self.count = epsilon - def update(self, x): - batch_mean = np.mean(x, axis=0) - batch_var = np.var(x, axis=0) - batch_count = x.shape[0] + def 
update(self, arr): + batch_mean = np.mean(arr, axis=0) + batch_var = np.var(arr, axis=0) + batch_count = arr.shape[0] self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): delta = batch_mean - self.mean tot_count = self.count + batch_count - new_mean = self.mean + delta * batch_count / tot_count - m_a = self.var * (self.count) - m_b = batch_var * (batch_count) - M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) - new_var = M2 / (self.count + batch_count) + new_mean = self.mean + delta * batch_count / tot_count + m_a = self.var * self.count + m_b = batch_var * batch_count + m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) + new_var = m_2 / (self.count + batch_count) new_count = batch_count + self.count self.mean = new_mean self.var = new_var - self.count = new_count - -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.var(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean, rms.var] - - assert np.allclose(ms1, ms2) + self.count = new_count diff --git a/baselines/common/running_stat.py b/baselines/common/running_stat.py index b9aa86c2ff..4c074590a3 100644 --- a/baselines/common/running_stat.py +++ b/baselines/common/running_stat.py @@ -1,46 +1,75 @@ import numpy as np -# http://www.johndcook.com/blog/standard_deviation/ + class RunningStat(object): def __init__(self, shape): - self._n = 0 - self._M = np.zeros(shape) - self._S = np.zeros(shape) - def push(self, x): - x = np.asarray(x) - assert x.shape == self._M.shape - self._n += 1 - if self._n == 1: - self._M[...] = x + """ + calulates the running mean and std of a data stream + http://www.johndcook.com/blog/standard_deviation/ + + :param shape: (tuple) the shape of the data stream's output + """ + self._step = 0 + self._mean = np.zeros(shape) + self._std = np.zeros(shape) + + def push(self, value): + """ + update the running mean and std + + :param value: (numpy Number) the data + """ + value = np.asarray(value) + assert value.shape == self._mean.shape + self._step += 1 + if self._step == 1: + self._mean[...] = value else: - oldM = self._M.copy() - self._M[...] = oldM + (x - oldM)/self._n - self._S[...] = self._S + (x - oldM)*(x - self._M) + old_m = self._mean.copy() + self._mean[...] = old_m + (value - old_m) / self._step + self._std[...] 
= self._std + (value - old_m) * (value - self._mean) + @property def n(self): - return self._n + """ + the number of data points + + :return: (int) + """ + return self._step + @property def mean(self): - return self._M + """ + the average value + + :return: (float) + """ + return self._mean + @property def var(self): - return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) + """ + the variation of the data points + + :return: (float) + """ + return self._std / (self._step - 1) if self._step > 1 else np.square(self._mean) + @property def std(self): + """ + the standard deviation of the data points + + :return: (float) + """ return np.sqrt(self.var) + @property def shape(self): - return self._M.shape - -def test_running_stat(): - for shp in ((), (3,), (3,4)): - li = [] - rs = RunningStat(shp) - for _ in range(5): - val = np.random.randn(*shp) - rs.push(val) - li.append(val) - m = np.mean(li, axis=0) - assert np.allclose(rs.mean, m) - v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) - assert np.allclose(rs.var, v) + """ + the shape of the data points + + :return: (tuple) + """ + return self._mean.shape diff --git a/baselines/common/schedules.py b/baselines/common/schedules.py index 9dfff50f95..9fc3d6f11b 100644 --- a/baselines/common/schedules.py +++ b/baselines/common/schedules.py @@ -10,47 +10,57 @@ class Schedule(object): - def value(self, t): - """Value of the schedule at time t""" - raise NotImplementedError() + def value(self, step): + """ + Value of the schedule for a given timestep + + :param step: (int) the timestep + :return: (float) the output value for the given timestep + """ + raise NotImplementedError -class ConstantSchedule(object): +class ConstantSchedule(Schedule): def __init__(self, value): - """Value remains constant over time. + """ + Value remains constant over time. - Parameters - ---------- - value: float - Constant value of the schedule + :param value: (float) Constant value of the schedule """ - self._v = value + self._value = value - def value(self, t): - """See Schedule.value""" - return self._v + def value(self, step): + return self._value -def linear_interpolation(l, r, alpha): - return l + alpha * (r - l) +def linear_interpolation(left, right, alpha): + """ + Linear interpolation between `left` and `right` + :param left: (float) left boundary + :param right: (float) right boundary + :param alpha: (float) coeff in [0, 1] + :return: (float) + """ + return left + alpha * (right - left) -class PiecewiseSchedule(object): +class PiecewiseSchedule(Schedule): def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): - """Piecewise schedule. + """ + Piecewise schedule. - endpoints: [(int, int)] + :param endpoints: ([(int, int)]) list of pairs `(time, value)` meanining that schedule should output `value` when `t==time`. All the values for time must be sorted in an increasing order. When t is between two times, e.g. `(time_a, value_a)` and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs `interpolation(value_a, value_b, alpha)` where alpha is a fraction of time passed between `time_a` and `time_b` for time `t`. - interpolation: lambda float, float, float: float + :param interpolation: (lambda (float, float, float): float) a function that takes value to the left and to the right of t according to the `endpoints`. Alpha is the fraction of distance from left endpoint to right endpoint that t has covered. See linear_interpolation for example. 
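Illustrative only, not part of this diff: a sketch of the renamed RunningStat properties, mirroring the test_running_stat that this patch deletes above; mean matches np.mean and var matches the ddof=1 sample variance.
import numpy as np
from baselines.common.running_stat import RunningStat

running_stat = RunningStat(shape=(3,))
values = [np.random.randn(3) for _ in range(5)]
for value in values:
    running_stat.push(value)
assert np.allclose(running_stat.mean, np.mean(values, axis=0))
assert np.allclose(running_stat.var, np.var(values, ddof=1, axis=0))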
- outside_value: float + :param outside_value: (float) if the value is requested outside of all the intervals sepecified in `endpoints` this value is returned. If None then AssertionError is raised when outside value is requested. @@ -61,39 +71,32 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value= self._outside_value = outside_value self._endpoints = endpoints - def value(self, t): - """See Schedule.value""" - for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): - if l_t <= t and t < r_t: - alpha = float(t - l_t) / (r_t - l_t) - return self._interpolation(l, r, alpha) + def value(self, step): + for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]): + if left_t <= step < right_t: + alpha = float(step - left_t) / (right_t - left_t) + return self._interpolation(left, right, alpha) # t does not belong to any of the pieces, so doom. assert self._outside_value is not None return self._outside_value -class LinearSchedule(object): +class LinearSchedule(Schedule): def __init__(self, schedule_timesteps, final_p, initial_p=1.0): - """Linear interpolation between initial_p and final_p over + """ + Linear interpolation between initial_p and final_p over schedule_timesteps. After this many timesteps pass final_p is returned. - Parameters - ---------- - schedule_timesteps: int - Number of timesteps for which to linearly anneal initial_p - to final_p - initial_p: float - initial output value - final_p: float - final output value + :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p + :param initial_p: (float) initial output value + :param final_p: (float) final output value """ self.schedule_timesteps = schedule_timesteps self.final_p = final_p self.initial_p = initial_p - def value(self, t): - """See Schedule.value""" - fraction = min(float(t) / self.schedule_timesteps, 1.0) + def value(self, step): + fraction = min(float(step) / self.schedule_timesteps, 1.0) return self.initial_p + fraction * (self.final_p - self.initial_p) diff --git a/baselines/common/segment_tree.py b/baselines/common/segment_tree.py index cb386ecdb5..1a22d8eed0 100644 --- a/baselines/common/segment_tree.py +++ b/baselines/common/segment_tree.py @@ -3,7 +3,8 @@ class SegmentTree(object): def __init__(self, capacity, operation, neutral_element): - """Build a Segment Tree data structure. + """ + Build a Segment Tree data structure. https://en.wikipedia.org/wiki/Segment_tree @@ -16,17 +17,10 @@ def __init__(self, capacity, operation, neutral_element): `reduce` operation which reduces `operation` over a contiguous subsequence of items in the array. - Paramters - --------- - capacity: int - Total size of the array - must be a power of two. - operation: lambda obj, obj -> obj - and operation for combining elements (eg. sum, max) - must form a mathematical group together with the set of - possible values for array elements (i.e. be associative) - neutral_element: obj - neutral element for the operation above. eg. float('-inf') - for max and 0 for sum. + :param capacity: (int) Total size of the array - must be a power of two. + :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a + mathematical group together with the set of possible values for array elements (i.e. be associative) + :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum. 
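Illustrative only, not part of this diff: a sketch of the renamed Schedule.value(step) API from the schedules module above, in the spirit of the test_schedules.py file removed further down; the endpoints and expected values are made up.
import numpy as np
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule

# Anneal from 1.0 to 0.1 over 100 steps, then stay at 0.1
linear = LinearSchedule(schedule_timesteps=100, final_p=0.1, initial_p=1.0)
assert np.isclose(linear.value(50), 0.55)
assert np.isclose(linear.value(200), 0.1)

# Piecewise-linear interpolation between (time, value) endpoints
piecewise = PiecewiseSchedule([(0, 1.0), (10, 0.0)], outside_value=0.0)
assert np.isclose(piecewise.value(5), 0.5)
assert np.isclose(piecewise.value(50), 0.0)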
""" assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." self._capacity = capacity @@ -49,22 +43,15 @@ def _reduce_helper(self, start, end, node, node_start, node_end): ) def reduce(self, start=0, end=None): - """Returns result of applying `self.operation` + """ + Returns result of applying `self.operation` to a contiguous subsequence of the array. self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) - Parameters - ---------- - start: int - beginning of the subsequence - end: int - end of the subsequences - - Returns - ------- - reduced: obj - result of reducing self.operation over the specified range of array elements. + :param start: (int) beginning of the subsequence + :param end: (int) end of the subsequences + :return: (Any) result of reducing self.operation over the specified range of array elements. """ if end is None: end = self._capacity @@ -99,26 +86,26 @@ def __init__(self, capacity): ) def sum(self, start=0, end=None): - """Returns arr[start] + ... + arr[end]""" + """ + Returns arr[start] + ... + arr[end] + + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of SumSegmentTree + """ return super(SumSegmentTree, self).reduce(start, end) def find_prefixsum_idx(self, prefixsum): - """Find the highest index `i` in the array such that + """ + Find the highest index `i` in the array such that sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum if array values are probabilities, this function allows to sample indexes according to the discrete probability efficiently. - Parameters - ---------- - perfixsum: float - upperbound on the sum of array prefix - - Returns - ------- - idx: int - highest index satisfying the prefixsum constraint + :param prefixsum: (float) upperbound on the sum of array prefix + :return: (int) highest index satisfying the prefixsum constraint """ assert 0 <= prefixsum <= self.sum() + 1e-5 idx = 1 @@ -140,6 +127,11 @@ def __init__(self, capacity): ) def min(self, start=0, end=None): - """Returns min(arr[start], ..., arr[end])""" + """ + Returns min(arr[start], ..., arr[end]) + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of MinSegmentTree + """ return super(MinSegmentTree, self).reduce(start, end) diff --git a/baselines/common/tests/test_schedules.py b/baselines/common/tests/test_schedules.py deleted file mode 100644 index 4e8d02d291..0000000000 --- a/baselines/common/tests/test_schedules.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule - - -def test_piecewise_schedule(): - ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) - - assert np.isclose(ps.value(-10), 500) - assert np.isclose(ps.value(0), 150) - assert np.isclose(ps.value(5), 200) - assert np.isclose(ps.value(9), 80) - assert np.isclose(ps.value(50), 50) - assert np.isclose(ps.value(80), 50) - assert np.isclose(ps.value(150), 0) - assert np.isclose(ps.value(175), -25) - assert np.isclose(ps.value(201), 500) - assert np.isclose(ps.value(500), 500) - - assert np.isclose(ps.value(200 - 1e-10), -50) - - -def test_constant_schedule(): - cs = ConstantSchedule(5) - for i in range(-100, 100): - assert np.isclose(cs.value(i), 
5) diff --git a/baselines/common/tests/test_tf_util.py b/baselines/common/tests/test_tf_util.py deleted file mode 100644 index daad9d0210..0000000000 --- a/baselines/common/tests/test_tf_util.py +++ /dev/null @@ -1,40 +0,0 @@ -# tests for tf_util -import tensorflow as tf -from baselines.common.tf_util import ( - function, - initialize, - single_threaded_session -) - - -def test_function(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - y = tf.placeholder(tf.int32, (), name="y") - z = 3 * x + 2 * y - lin = function([x, y], z, givens={y: 0}) - - with single_threaded_session(): - initialize() - - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -def test_multikwargs(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - with tf.variable_scope("other"): - x2 = tf.placeholder(tf.int32, (), name="x") - z = 3 * x + 2 * x2 - - lin = function([x, x2], z, givens={x2: 0}) - with single_threaded_session(): - initialize() - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -if __name__ == '__main__': - test_function() - test_multikwargs() diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index afcd593e85..b4afad2129 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -1,55 +1,85 @@ -import numpy as np -import tensorflow as tf # pylint: ignore-module import copy import os import functools import collections import multiprocessing +import numpy as np +import tensorflow as tf +from tensorflow.python.client import device_lib + +from baselines import logger + + def switch(condition, then_expression, else_expression): - """Switches between two operations depending on a scalar value (int or bool). + """ + Switches between two operations depending on a scalar value (int or bool). Note that both `then_expression` and `else_expression` should be symbolic tensors of the *same shape*. - # Arguments - condition: scalar tensor. - then_expression: TensorFlow operation. - else_expression: TensorFlow operation. + :param condition: (TensorFlow Tensor) scalar tensor. 
+ :param then_expression: (TensorFlow Operation) + :param else_expression: (TensorFlow Operation) + :return: (TensorFlow Operation) the switch output """ x_shape = copy.copy(then_expression.get_shape()) - x = tf.cond(tf.cast(condition, 'bool'), - lambda: then_expression, - lambda: else_expression) - x.set_shape(x_shape) - return x + out_tensor = tf.cond(tf.cast(condition, 'bool'), + lambda: then_expression, + lambda: else_expression) + out_tensor.set_shape(x_shape) + return out_tensor + # ================================================================ # Extras # ================================================================ -def lrelu(x, leak=0.2): - f1 = 0.5 * (1 + leak) - f2 = 0.5 * (1 - leak) - return f1 * x + f2 * abs(x) +def leaky_relu(tensor, leak=0.2): + """ + Leaky ReLU + http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf + + :param tensor: (float) the input value + :param leak: (float) the leaking coeficient when the function is saturated + :return: (float) Leaky ReLU output + """ + f_1 = 0.5 * (1 + leak) + f_2 = 0.5 * (1 - leak) + return f_1 * tensor + f_2 * abs(tensor) + # ================================================================ # Mathematical utils # ================================================================ -def huber_loss(x, delta=1.0): - """Reference: https://en.wikipedia.org/wiki/Huber_loss""" +def huber_loss(tensor, delta=1.0): + """ + Reference: https://en.wikipedia.org/wiki/Huber_loss + + :param tensor: (TensorFlow Tensor) the input value + :param delta: (float) huber loss delta value + :return: (TensorFlow Tensor) huber loss output + """ return tf.where( - tf.abs(x) < delta, - tf.square(x) * 0.5, - delta * (tf.abs(x) - 0.5 * delta) + tf.abs(tensor) < delta, + tf.square(tensor) * 0.5, + delta * (tf.abs(tensor) - 0.5 * delta) ) + # ================================================================ # Global session # ================================================================ def make_session(num_cpu=None, make_default=False, graph=None): - """Returns a session that will use CPU's only""" + """ + Returns a session that will use CPU's only + + :param num_cpu: (int) number of CPUs to use for TensorFlow + :param make_default: (bool) if this should return an InteractiveSession or a normal Session + :param graph: (TensorFlow Graph) the graph of the session + :return: (TensorFlow session) + """ if num_cpu is None: num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) tf_config = tf.ConfigProto( @@ -60,41 +90,88 @@ def make_session(num_cpu=None, make_default=False, graph=None): else: return tf.Session(config=tf_config, graph=graph) + def single_threaded_session(): - """Returns a session which will only use a single CPU""" + """ + Returns a session which will only use a single CPU + + :return: (TensorFlow session) + """ return make_session(num_cpu=1) -def in_session(f): - @functools.wraps(f) + +def in_session(func): + """ + wrappes a function so that it is in a TensorFlow Session + + :param func: (function) the function to wrap + :return: (function) + """ + + @functools.wraps(func) def newfunc(*args, **kwargs): with tf.Session(): - f(*args, **kwargs) + func(*args, **kwargs) + return newfunc + ALREADY_INITIALIZED = set() -def initialize(): - """Initialize all the uninitialized variables in the global scope.""" + +def initialize(sess=None): + """ + Initialize all the uninitialized variables in the global scope. 
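Illustrative only, not part of this diff: the renamed leaky_relu and huber_loss helpers above, evaluated on small constants; the expected values follow directly from the formulas shown in this hunk.
import tensorflow as tf
from baselines.common.tf_util import huber_loss, leaky_relu, make_session

errors = tf.constant([0.5, 2.0])
with make_session(num_cpu=1) as sess:
    # Quadratic below delta=1, linear above: 0.5 * 0.5**2 = 0.125 and 1 * (2 - 0.5) = 1.5
    print(sess.run(huber_loss(errors)))                    # [0.125 1.5]
    print(sess.run(leaky_relu(tf.constant([-1.0, 1.0]))))  # [-0.2  1. ]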
+ + :param sess: (TensorFlow Session) + """ + if sess is None: + sess = tf.get_default_session() new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - tf.get_default_session().run(tf.variables_initializer(new_variables)) + sess.run(tf.variables_initializer(new_variables)) ALREADY_INITIALIZED.update(new_variables) + # ================================================================ # Model components # ================================================================ def normc_initializer(std=1.0, axis=0): - def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 + """ + Return a parameter initializer for TensorFlow + + :param std: (float) standard deviation + :param axis: (int) the axis to normalize on + :return: (function) + """ + + def _initializer(shape, dtype=None, partition_info=None): out = np.random.randn(*shape).astype(np.float32) out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) return tf.constant(out) + return _initializer -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, - summary_tag=None): + +def conv2d(input_tensor, num_filters, name, filter_size=(3, 3), stride=(1, 1), + pad="SAME", dtype=tf.float32, collections=None, summary_tag=None): + """ + Creates a 2d convolutional layer for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution + :param num_filters: (int) The number of filters + :param name: (str) The TensorFlow variable scope + :param filter_size: (tuple) The filter size + :param stride: (tuple) The stride of the convolution + :param pad: (str) The padding type ('VALID' or 'SAME') + :param dtype: (type) The data type for the Tensors + :param collections: (list) List of graph collections keys to add the Variable to + :param summary_tag: (str) image summary name, can be None for no image summary + :return: (TensorFlow Tensor) 2d convolutional layer + """ with tf.variable_scope(name): stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] + filter_shape = [filter_size[0], filter_size[1], int(input_tensor.get_shape()[3]), num_filters] # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit @@ -106,25 +183,26 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", # initialize weights with random weights w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) - w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), - collections=collections) + weight = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), + collections=collections) + bias = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), + collections=collections) if summary_tag is not None: tf.summary.image(summary_tag, - tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), - [2, 0, 1, 3]), - max_images=10) + tf.transpose(tf.reshape(weight, [filter_size[0], filter_size[1], -1, 1]), [2, 0, 1, 3]), + max_outputs=10) + + return tf.nn.conv2d(input_tensor, weight, stride_shape, pad) + bias - return tf.nn.conv2d(x, w, stride_shape, pad) + b # ================================================================ # Theano-like Function # ================================================================ def function(inputs, outputs, updates=None, givens=None): - """Just like Theano function. Take a bunch of tensorflow placeholders and expressions + """ + Just like Theano function. Take a bunch of tensorflow placeholders and expressions computed based on those placeholders and produces f(inputs) -> outputs. Function f takes values to be fed to the input's placeholders and produces the values of the expressions in outputs. @@ -146,28 +224,35 @@ def function(inputs, outputs, updates=None, givens=None): assert lin(2, 2) == 10 assert lin(x=2, y=3) == 12 - Parameters - ---------- - inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method] - list of input arguments - outputs: [tf.Variable] or tf.Variable - list of outputs or a single output to be returned from function. Returned + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned value will also have the same shape. + :param updates: (list) update functions + :param givens: (dict) the values known for the output """ if isinstance(outputs, list): return _Function(inputs, outputs, updates, givens=givens) elif isinstance(outputs, (dict, collections.OrderedDict)): - f = _Function(inputs, outputs.values(), updates, givens=givens) - return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) + func = _Function(inputs, outputs.values(), updates, givens=givens) + return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), func(*args, **kwargs))) else: - f = _Function(inputs, [outputs], updates, givens=givens) - return lambda *args, **kwargs: f(*args, **kwargs)[0] + func = _Function(inputs, [outputs], updates, givens=givens) + return lambda *args, **kwargs: func(*args, **kwargs)[0] class _Function(object): def __init__(self, inputs, outputs, updates, givens): + """ + Theano like function + + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned + value will also have the same shape. 
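Illustrative only, not part of this diff: a sketch of the patched conv2d helper on an Atari-sized input; the shapes and the scope name "c1" are made up for the example.
import numpy as np
import tensorflow as tf
from baselines.common.tf_util import conv2d, initialize, make_session

images = tf.placeholder(tf.float32, [None, 84, 84, 4])
features = conv2d(images, num_filters=16, name="c1", filter_size=(8, 8), stride=(4, 4))
with make_session(num_cpu=1) as sess:
    initialize(sess)
    out = sess.run(features, {images: np.zeros((1, 84, 84, 4), np.float32)})
    print(out.shape)  # (1, 21, 21, 16) with SAME padding and stride 4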
+ :param updates: (list) update functions + :param givens: (dict) the values known for the output + """ for inpt in inputs: - if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0): + if not hasattr(inpt, 'make_feed_dict') and not (isinstance(inpt, tf.Tensor)and len(inpt.op.inputs) == 0): assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" self.inputs = inputs updates = updates or [] @@ -175,14 +260,17 @@ def __init__(self, inputs, outputs, updates, givens): self.outputs_update = list(outputs) + [self.update_group] self.givens = {} if givens is None else givens - def _feed_input(self, feed_dict, inpt, value): + @classmethod + def _feed_input(cls, feed_dict, inpt, value): if hasattr(inpt, 'make_feed_dict'): feed_dict.update(inpt.make_feed_dict(value)) else: feed_dict[inpt] = value - def __call__(self, *args): + def __call__(self, *args, sess=None): assert len(args) <= len(self.inputs), "Too many arguments provided" + if sess is None: + sess = tf.get_default_session() feed_dict = {} # Update the args for inpt, value in zip(self.inputs, args): @@ -190,26 +278,56 @@ def __call__(self, *args): # Update feed dict with givens. for inpt in self.givens: feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] + results = sess.run(self.outputs_update, feed_dict=feed_dict)[:-1] return results + # ================================================================ # Flat vectors # ================================================================ -def var_shape(x): - out = x.get_shape().as_list() +def var_shape(tensor): + """ + get TensorFlow Tensor shape + + :param tensor: (TensorFlow Tensor) the input tensor + :return: ([int]) the shape + """ + out = tensor.get_shape().as_list() assert all(isinstance(a, int) for a in out), \ "shape function assumes that shape is fully known" return out -def numel(x): - return intprod(var_shape(x)) -def intprod(x): - return int(np.prod(x)) +def numel(tensor): + """ + get TensorFlow Tensor's number of elements + + :param tensor: (TensorFlow Tensor) the input tensor + :return: (int) the number of elements + """ + return intprod(var_shape(tensor)) + + +def intprod(tensor): + """ + calculates the product of all the elements in a list + + :param tensor: ([Number]) the list of elements + :return: (int) the product truncated + """ + return int(np.prod(tensor)) + def flatgrad(loss, var_list, clip_norm=None): + """ + calculates the gradient and flattens it + + :param loss: (float) the loss value + :param var_list: ([TensorFlow Tensor]) the variables + :param clip_norm: (float) clip the gradients (disabled if None) + :return: ([TensorFlow Tensor]) flattend gradient + """ grads = tf.gradients(loss, var_list) if clip_norm is not None: grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] @@ -218,87 +336,130 @@ def flatgrad(loss, var_list, clip_norm=None): for (v, grad) in zip(var_list, grads) ]) + class SetFromFlat(object): - def __init__(self, var_list, dtype=tf.float32): - assigns = [] + def __init__(self, var_list, dtype=tf.float32, sess=None): + """ + Set the parameters from a flat vector + + :param var_list: ([TensorFlow Tensor]) the variables + :param dtype: (type) the type for the placeholder + :param sess: (TensorFlow Session) + """ shapes = list(map(var_shape, var_list)) total_size = np.sum([intprod(shape) for shape in shapes]) self.theta = theta = tf.placeholder(dtype, [total_size]) 
start = 0 assigns = [] - for (shape, v) in zip(shapes, var_list): + for (shape, _var) in zip(shapes, var_list): size = intprod(shape) - assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) + assigns.append(tf.assign(_var, tf.reshape(theta[start:start + size], shape))) start += size - self.op = tf.group(*assigns) + self.operation = tf.group(*assigns) + self.sess = sess def __call__(self, theta): - tf.get_default_session().run(self.op, feed_dict={self.theta: theta}) + if self.sess is None: + return tf.get_default_session().run(self.operation, feed_dict={self.theta: theta}) + else: + return self.sess.run(self.operation, feed_dict={self.theta: theta}) + class GetFlat(object): - def __init__(self, var_list): - self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) + def __init__(self, var_list, sess=None): + """ + Get the parameters as a flat vector - def __call__(self): - return tf.get_default_session().run(self.op) + :param var_list: ([TensorFlow Tensor]) the variables + :param sess: (TensorFlow Session) + """ + self.operation = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) + self.sess = sess -_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) + def __call__(self): + if self.sess is None: + return tf.get_default_session().run(self.operation) + else: + return self.sess.run(self.operation) -def get_placeholder(name, dtype, shape): - if name in _PLACEHOLDER_CACHE: - out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] - assert dtype1 == dtype and shape1 == shape - return out - else: - out = tf.placeholder(dtype=dtype, shape=shape, name=name) - _PLACEHOLDER_CACHE[name] = (out, dtype, shape) - return out -def get_placeholder_cached(name): - return _PLACEHOLDER_CACHE[name][0] +def flattenallbut0(tensor): + """ + flatten all the dimension, except from the first one -def flattenallbut0(x): - return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) + :param tensor: (TensorFlow Tensor) the input tensor + :return: (TensorFlow Tensor) the flattened tensor + """ + return tf.reshape(tensor, [-1, intprod(tensor.get_shape().as_list()[1:])]) # ================================================================ -# Diagnostics +# Diagnostics # ================================================================ -def display_var_info(vars): - from baselines import logger +def display_var_info(_vars): + """ + log variable information, for debug purposes + + :param _vars: ([TensorFlow Tensor]) the variables + """ count_params = 0 - for v in vars: - name = v.name - if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue - v_params = np.prod(v.shape.as_list()) + for _var in _vars: + name = _var.name + if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: + continue + v_params = np.prod(_var.shape.as_list()) count_params += v_params - if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print - logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) + if "/b:" in name or "/biases" in name: + continue # Wx+b, bias is not interesting to look at => count params, but not print + logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(_var.shape))) - logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) + logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6)) def get_available_gpus(): + """ + Return a list of all the available GPUs + + :return: 
([str]) the GPUs available + """ # recipe from here: # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - - from tensorflow.python.client import device_lib local_device_protos = device_lib.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] + # ================================================================ # Saving variables # ================================================================ -def load_state(fname): - saver = tf.train.Saver() - saver.restore(tf.get_default_session(), fname) +def load_state(fname, sess=None, var_list=None): + """ + Load a TensorFlow saved model + + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. + """ + if sess is None: + sess = tf.get_default_session() + saver = tf.train.Saver(var_list=var_list) + saver.restore(sess, fname) -def save_state(fname): - os.makedirs(os.path.dirname(fname), exist_ok=True) - saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) +def save_state(fname, sess=None, var_list=None): + """ + Save a TensorFlow model + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. + """ + if sess is None: + sess = tf.get_default_session() + os.makedirs(os.path.dirname(fname), exist_ok=True) + saver = tf.train.Saver(var_list=var_list) + saver.save(sess, fname) diff --git a/baselines/common/tile_images.py b/baselines/common/tile_images.py index 929da8994a..14922a990a 100644 --- a/baselines/common/tile_images.py +++ b/baselines/common/tile_images.py @@ -1,23 +1,28 @@ import numpy as np + def tile_images(img_nhwc): """ Tile N images into one big PxQ image (P,Q) are chosen to be as close as possible, and if N is square, then P=Q. - input: img_nhwc, list or array of images, ndim=4 once turned into array + :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. 
img nhwc n = batch index, h = height, w = width, c = channel - returns: - bigim_HWc, ndarray with ndim=3 + :return: (numpy float) img_HWc, ndim=3 """ img_nhwc = np.asarray(img_nhwc) - N, h, w, c = img_nhwc.shape - H = int(np.ceil(np.sqrt(N))) - W = int(np.ceil(float(N)/H)) - img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) - img_HWhwc = img_nhwc.reshape(H, W, h, w, c) - img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) - img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) - return img_Hh_Ww_c + n_images, height, width, n_channels = img_nhwc.shape + # new_height was named H before + new_height = int(np.ceil(np.sqrt(n_images))) + # new_width was named W before + new_width = int(np.ceil(float(n_images) / new_height)) + img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) + # img_HWhwc + out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) + # img_HhWwc + out_image = out_image.transpose(0, 2, 1, 3, 4) + # img_Hh_Ww_c + out_image = out_image.reshape(new_height * height, new_width * width, n_channels) + return out_image diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py index eb07310d15..b6377b59b9 100644 --- a/baselines/common/vec_env/__init__.py +++ b/baselines/common/vec_env/__init__.py @@ -1,6 +1,10 @@ from abc import ABC, abstractmethod +import pickle + +import cloudpickle from baselines import logger + class AlreadySteppingError(Exception): """ Raised when an asynchronous step is running while @@ -10,6 +14,7 @@ def __init__(self): msg = 'already running an async step' Exception.__init__(self, msg) + class NotSteppingError(Exception): """ Raised when an asynchronous step is not running but @@ -19,11 +24,16 @@ def __init__(self): msg = 'not running an async step' Exception.__init__(self, msg) + class VecEnv(ABC): - """ - An abstract asynchronous, vectorized environment. - """ def __init__(self, num_envs, observation_space, action_space): + """ + An abstract asynchronous, vectorized environment. + + :param num_envs: (int) the number of environments + :param observation_space: (Gym Space) the observation space + :param action_space: (Gym Space) the action space + """ self.num_envs = num_envs self.observation_space = observation_space self.action_space = action_space @@ -37,6 +47,8 @@ def reset(self): If step_async is still doing work, that work will be cancelled and step_wait() should not be called until step_async() is invoked again. + + :return: ([int] or [float]) observation """ pass @@ -56,29 +68,35 @@ def step_async(self, actions): def step_wait(self): """ Wait for the step taken with step_async(). - - Returns (obs, rews, dones, infos): - - obs: an array of observations, or a tuple of - arrays of observations. - - rews: an array of rewards - - dones: an array of "episode done" booleans - - infos: a sequence of info objects + + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information """ pass @abstractmethod def close(self): """ - Clean up the environments' resources. + Clean up the environment's resources. 
""" pass def step(self, actions): + """ + Step the environments with the given action + + :param actions: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ self.step_async(actions) return self.step_wait() def render(self, mode='human'): - logger.warn('Render not defined for %s'%self) + """ + Gym environment rendering + + :param mode: (str) the rendering type + """ + logger.warn('Render not defined for %s' % self) @property def unwrapped(self): @@ -87,13 +105,12 @@ def unwrapped(self): else: return self + class VecEnvWrapper(VecEnv): def __init__(self, venv, observation_space=None, action_space=None): self.venv = venv - VecEnv.__init__(self, - num_envs=venv.num_envs, - observation_space=observation_space or venv.observation_space, - action_space=action_space or venv.action_space) + VecEnv.__init__(self, num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, + action_space=action_space or venv.action_space) def step_async(self, actions): self.venv.step_async(actions) @@ -109,18 +126,21 @@ def step_wait(self): def close(self): return self.venv.close() - def render(self): + def render(self, mode='human'): self.venv.render() + class CloudpickleWrapper(object): - """ - Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) - """ - def __init__(self, x): - self.x = x + def __init__(self, var): + """ + Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) + + :param var: (Any) the variable you wish to wrap for pickling with cloudpickle + """ + self.var = var + def __getstate__(self): - import cloudpickle - return cloudpickle.dumps(self.x) - def __setstate__(self, ob): - import pickle - self.x = pickle.loads(ob) + return cloudpickle.dumps(self.var) + + def __setstate__(self, obs): + self.var = pickle.loads(obs) diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index d0ae455d4a..8c2c157e7a 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -1,10 +1,18 @@ +from collections import OrderedDict + import numpy as np from gym import spaces -from collections import OrderedDict + from . 
import VecEnv + class DummyVecEnv(VecEnv): def __init__(self, env_fns): + """ + Creates a simple vectorized wrapper for multiple environments + + :param env_fns: ([Gym Environment]) the list of environments to vectorize + """ self.envs = [fn() for fn in env_fns] env = self.envs[0] VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) @@ -23,9 +31,9 @@ def __init__(self, env_fns): dtypes[key] = box.dtype self.keys.append(key) - self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } + self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys} self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) - self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) + self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) self.buf_infos = [{} for _ in range(self.num_envs)] self.actions = None @@ -33,18 +41,19 @@ def step_async(self, actions): self.actions = actions def step_wait(self): - for e in range(self.num_envs): - obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) - if self.buf_dones[e]: - obs = self.envs[e].reset() - self._save_obs(e, obs) + for env_idx in range(self.num_envs): + obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ + self.envs[env_idx].step(self.actions[env_idx]) + if self.buf_dones[env_idx]: + obs = self.envs[env_idx].reset() + self._save_obs(env_idx, obs) return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), self.buf_infos.copy()) def reset(self): - for e in range(self.num_envs): - obs = self.envs[e].reset() - self._save_obs(e, obs) + for env_idx in range(self.num_envs): + obs = self.envs[env_idx].reset() + self._save_obs(env_idx, obs) return self._obs_from_buf() def close(self): @@ -53,15 +62,15 @@ def close(self): def render(self, mode='human'): return [e.render(mode=mode) for e in self.envs] - def _save_obs(self, e, obs): - for k in self.keys: - if k is None: - self.buf_obs[k][e] = obs + def _save_obs(self, env_idx, obs): + for key in self.keys: + if key is None: + self.buf_obs[key][env_idx] = obs else: - self.buf_obs[k][e] = obs[k] + self.buf_obs[key][env_idx] = obs[key] def _obs_from_buf(self): - if self.keys==[None]: + if self.keys == [None]: return self.buf_obs[None] else: return self.buf_obs diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index fb55df45d3..3dcff51671 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -1,47 +1,54 @@ -import numpy as np from multiprocessing import Process, Pipe + +import numpy as np + from baselines.common.vec_env import VecEnv, CloudpickleWrapper from baselines.common.tile_images import tile_images -def worker(remote, parent_remote, env_fn_wrapper): +def _worker(remote, parent_remote, env_fn_wrapper): parent_remote.close() - env = env_fn_wrapper.x() + env = env_fn_wrapper.var() while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: - ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'render': - remote.send(env.render(mode='rgb_array')) - elif cmd == 'close': - remote.close() + try: + cmd, data = remote.recv() + if cmd == 'step': + observation, reward, done, info = env.step(data) + if done: + observation = env.reset() + remote.send((observation, reward, done, info)) 
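# --- illustrative example (not part of the patch) ---------------------------
# A usage sketch for the vectorized-environment API: DummyVecEnv wraps a list of
# environment factories and steps them sequentially in the current process;
# SubprocVecEnv (below) exposes the same interface but runs each environment in
# its own subprocess. The environment id 'CartPole-v0' is just an example.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
obs = venv.reset()                                   # shape (4,) + obs_space.shape
actions = [venv.action_space.sample() for _ in range(4)]
obs, rewards, dones, infos = venv.step(actions)      # each output has leading dim 4
venv.close()
# --- end of illustrative example ---------------------------------------------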
+ elif cmd == 'reset': + observation = env.reset() + remote.send(observation) + elif cmd == 'render': + remote.send(env.render(mode='rgb_array')) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + except EOFError: break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError class SubprocVecEnv(VecEnv): - def __init__(self, env_fns, spaces=None): + def __init__(self, env_fns): """ - envs: list of gym environments to run in subprocesses + Creates a multiprocess vectorized wrapper for multiple environments + + :param env_fns: ([Gym Environment]) Environments to run in subprocesses """ self.waiting = False self.closed = False - nenvs = len(env_fns) - self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) - self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) - for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] - for p in self.ps: - p.daemon = True # if the main process crashes, we should not cause things to hang - p.start() + n_envs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(n_envs)]) + self.processes = [Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for process in self.processes: + process.daemon = True # if the main process crashes, we should not cause things to hang + process.start() for remote in self.work_remotes: remote.close() @@ -65,21 +72,16 @@ def reset(self): remote.send(('reset', None)) return np.stack([remote.recv() for remote in self.remotes]) - def reset_task(self): - for remote in self.remotes: - remote.send(('reset_task', None)) - return np.stack([remote.recv() for remote in self.remotes]) - def close(self): if self.closed: return if self.waiting: - for remote in self.remotes: + for remote in self.remotes: remote.recv() for remote in self.remotes: remote.send(('close', None)) - for p in self.ps: - p.join() + for process in self.processes: + process.join() self.closed = True def render(self, mode='human'): @@ -89,9 +91,9 @@ def render(self, mode='human'): bigimg = tile_images(imgs) if mode == 'human': import cv2 - cv2.imshow('vecenv', bigimg[:,:,::-1]) + cv2.imshow('vecenv', bigimg[:, :, ::-1]) cv2.waitKey(1) elif mode == 'rgb_array': return bigimg else: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/baselines/common/vec_env/vec_frame_stack.py b/baselines/common/vec_env/vec_frame_stack.py index 0bbcbdbb58..b14974b5d3 100644 --- a/baselines/common/vec_env/vec_frame_stack.py +++ b/baselines/common/vec_env/vec_frame_stack.py @@ -1,29 +1,34 @@ -from baselines.common.vec_env import VecEnvWrapper import numpy as np from gym import spaces +from baselines.common.vec_env import VecEnvWrapper + + class VecFrameStack(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, nstack): + def __init__(self, venv, n_stack): + """ + Vectorized environment base class + + :param venv: ([Gym Environment]) the list of environments to vectorize and normalize + :param n_stack: + """ self.venv = venv - self.nstack = nstack - wos = venv.observation_space # wrapped ob space - low = np.repeat(wos.low, self.nstack, axis=-1) - high = np.repeat(wos.high, self.nstack, axis=-1) - self.stackedobs = 
np.zeros((venv.num_envs,)+low.shape, low.dtype) + self.n_stack = n_stack + wrapped_obs_space = venv.observation_space + low = np.repeat(wrapped_obs_space.low, self.n_stack, axis=-1) + high = np.repeat(wrapped_obs_space.high, self.n_stack, axis=-1) + self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) VecEnvWrapper.__init__(self, venv, observation_space=observation_space) def step_wait(self): - obs, rews, news, infos = self.venv.step_wait() - self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) - for (i, new) in enumerate(news): - if new: + observations, rewards, dones, infos = self.venv.step_wait() + self.stackedobs = np.roll(self.stackedobs, shift=-observations.shape[-1], axis=-1) + for i, done in enumerate(dones): + if done: self.stackedobs[i] = 0 - self.stackedobs[..., -obs.shape[-1]:] = obs - return self.stackedobs, rews, news, infos + self.stackedobs[..., -observations.shape[-1]:] = observations + return self.stackedobs, rewards, dones, infos def reset(self): """ diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py index dda767da15..e52c14b8c6 100644 --- a/baselines/common/vec_env/vec_normalize.py +++ b/baselines/common/vec_env/vec_normalize.py @@ -1,47 +1,50 @@ +import numpy as np + from baselines.common.vec_env import VecEnvWrapper from baselines.common.running_mean_std import RunningMeanStd -import numpy as np + class VecNormalize(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): + def __init__(self, venv, norm_obs=True, norm_reward=True, + clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8): + """ + A rolling average, normalizing, vectorized wrapepr for environment base class + + :param venv: ([Gym Environment]) the list of environments to vectorize and normalize + :param norm_obs: (bool) normalize observation + :param norm_reward: (bool) normalize reward with discounting (r = sum(r_old) * gamma + r_new) + :param clip_obs: (float) clipping value for nomalizing observation + :param clip_reward: (float) clipping value for nomalizing reward + :param gamma: (float) discount factor + :param epsilon: (float) epsilon value to avoid arithmetic issues + """ VecEnvWrapper.__init__(self, venv) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=()) if ret else None - self.clipob = clipob - self.cliprew = cliprew + self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if norm_obs else None + self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None + self.clip_obs = clip_obs + self.clip_reward = clip_reward self.ret = np.zeros(self.num_envs) self.gamma = gamma self.epsilon = epsilon def step_wait(self): - """ - Apply sequence of actions to sequence of environments - actions -> (observations, rewards, news) - - where 'news' is a boolean vector indicating whether each element is new. 
- """ - obs, rews, news, infos = self.venv.step_wait() - self.ret = self.ret * self.gamma + rews + obs, rewards, dones, infos = self.venv.step_wait() + self.ret = self.ret * self.gamma + rewards obs = self._obfilt(obs) if self.ret_rms: self.ret_rms.update(self.ret) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - return obs, rews, news, infos + rewards = np.clip(rewards / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) + return obs, rewards, dones, infos def _obfilt(self, obs): if self.ob_rms: self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) + obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), + -self.clip_obs, self.clip_obs) return obs else: return obs def reset(self): - """ - Reset all environments - """ obs = self.venv.reset() return self._obfilt(obs) diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index e2d49501c7..9085bb03b0 100644 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -4,47 +4,97 @@ import numpy as np import tensorflow as tf import tensorflow.contrib as tc +from mpi4py import MPI from baselines import logger from baselines.common.mpi_adam import MpiAdam -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util from baselines.common.mpi_running_mean_std import RunningMeanStd -from mpi4py import MPI -def normalize(x, stats): + +def normalize(tensor, stats): + """ + normalize a tensor using a running mean and std + + :param tensor: (TensorFlow Tensor) the input tensor + :param stats: (RunningMeanStd) the running mean and std of the input to normalize + :return: (TensorFlow Tensor) the normalized tensor + """ if stats is None: - return x - return (x - stats.mean) / stats.std + return tensor + return (tensor - stats.mean) / stats.std + +def denormalize(tensor, stats): + """ + denormalize a tensor using a running mean and std -def denormalize(x, stats): + :param tensor: (TensorFlow Tensor) the normalized tensor + :param stats: (RunningMeanStd) the running mean and std of the input to normalize + :return: (TensorFlow Tensor) the restored tensor + """ if stats is None: - return x - return x * stats.std + stats.mean + return tensor + return tensor * stats.std + stats.mean -def reduce_std(x, axis=None, keepdims=False): - return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) -def reduce_var(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keep_dims=True) - devs_squared = tf.square(x - m) +def reduce_std(tensor, axis=None, keepdims=False): + """ + get the standard deviation of a Tensor + + :param tensor: (TensorFlow Tensor) the input tensor + :param axis: (int or [int]) the axis to itterate the std over + :param keepdims: (bool) keep the other dimentions the same + :return: (TensorFlow Tensor) the std of the tensor + """ + return tf.sqrt(reduce_var(tensor, axis=axis, keepdims=keepdims)) + + +def reduce_var(tensor, axis=None, keepdims=False): + """ + get the variance of a Tensor + + :param tensor: (TensorFlow Tensor) the input tensor + :param axis: (int or [int]) the axis to itterate the variance over + :param keepdims: (bool) keep the other dimentions the same + :return: (TensorFlow Tensor) the variance of the tensor + """ + tensor_mean = tf.reduce_mean(tensor, axis=axis, keep_dims=True) + devs_squared = tf.square(tensor - tensor_mean) return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) -def 
get_target_updates(vars, target_vars, tau): + +def get_target_updates(_vars, target_vars, tau): + """ + get target update operations + + :param _vars: ([TensorFlow Tensor]) the initial variables + :param target_vars: ([TensorFlow Tensor]) the target variables + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update + """ logger.info('setting up target updates ...') soft_updates = [] init_updates = [] - assert len(vars) == len(target_vars) - for var, target_var in zip(vars, target_vars): + assert len(_vars) == len(target_vars) + for var, target_var in zip(_vars, target_vars): logger.info(' {} <- {}'.format(target_var.name, var.name)) init_updates.append(tf.assign(target_var, var)) soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var)) - assert len(init_updates) == len(vars) - assert len(soft_updates) == len(vars) + assert len(init_updates) == len(_vars) + assert len(soft_updates) == len(_vars) return tf.group(*init_updates), tf.group(*soft_updates) def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): + """ + get the actor update, with noise. + + :param actor: (TensorFlow Tensor) the actor + :param perturbed_actor: (TensorFlow Tensor) the pertubed actor + :param param_noise_stddev: (float) the std of the parameter noise + :return: (TensorFlow Operation) the update function + """ assert len(actor.vars) == len(perturbed_actor.vars) assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars) @@ -52,7 +102,8 @@ def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): for var, perturbed_var in zip(actor.vars, perturbed_actor.vars): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) - updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) + updates.append(tf.assign(perturbed_var, + var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) updates.append(tf.assign(perturbed_var, var)) @@ -62,10 +113,37 @@ def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, - gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, - batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), - adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, - critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): + gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, + batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), + critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): + """ + Deep Deterministic Policy Gradien (DDPG) model + + DDPG: https://arxiv.org/pdf/1509.02971.pdf + + :param actor: (TensorFlow Tensor) the actor model + :param critic: (TensorFlow Tensor) the critic model + :param memory: (Memory) the replay buffer + :param observation_shape: (tuple) the observation space + :param action_shape: (tuple) the action space + :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) + :param action_noise: 
(ActionNoise) the action noise type (can be None) + :param gamma: (float) the discount rate + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :param normalize_returns: (bool) should the critic output be normalized + :param enable_popart: (bool) enable pop-art normalization of the critic output + (https://arxiv.org/pdf/1602.07714.pdf) + :param normalize_observations: (bool) should the observation be normalized + :param batch_size: (int) the size of the batch for learning the policy + :param observation_range: (tuple) the bounding values for the observation + :param action_range: (tuple) the bounding values for the actions + :param return_range: (tuple) the bounding values for the critic output + :param critic_l2_reg: (float) l2 regularizer coefficient + :param actor_lr: (float) the actor learning rate + :param critic_lr: (float) the critic learning rate + :param clip_norm: (float) clip the gradients (disabled if None) + :param reward_scale: (float) the value the reward should be scaled by + """ # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') @@ -96,6 +174,12 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg + self.target_init_updates = None + self.target_soft_updates = None + self.critic_loss = None + self.critic_grads = None + self.critic_optimizer = None + self.sess = None # Observation normalization. if self.normalize_observations: @@ -103,10 +187,10 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None - normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), - self.observation_range[0], self.observation_range[1]) + normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], + self.observation_range[1]) + normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], + self.observation_range[1]) # Return normalization. if self.normalize_returns: @@ -126,11 +210,14 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) - self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) + self.critic_tf = denormalize( + tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) - self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) - Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) - self.target_Q = self.rewards + (1. 
- self.terminals1) * gamma * Q_obs1 + self.critic_with_actor_tf = denormalize( + tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), + self.ret_rms) + q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) + self.target_q = self.rewards + (1. - self.terminals1) * gamma * q_obs1 # Set up parts. if self.param_noise is not None: @@ -143,12 +230,20 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.setup_target_network_updates() def setup_target_network_updates(self): + """ + set the target update operations + """ actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) - critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) + critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, + self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): + """ + set the parameter noise operations + :param normalized_obs0: (TensorFlow Tensor) the normalized observation + """ assert self.param_noise is not None # Configure perturbed actor. @@ -162,26 +257,35 @@ def setup_param_noise(self, normalized_obs0): adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) - self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) + self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, + self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): + """ + setup the optimizer for the actor + """ logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) - self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) + self.actor_grads = tf_util.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) + beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): + """ + setup the optimizer for the critic + """ logger.info('setting up critic optimizer') - normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) + normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), + self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: - critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] + critic_reg_vars = [var for var in self.critic.trainable_vars if + 'kernel' in var.name and 'output' not in 
var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) @@ -194,29 +298,37 @@ def setup_critic_optimizer(self): critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) - self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) - self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) + self.critic_grads = tf_util.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) + self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): - # See https://arxiv.org/pdf/1602.07714.pdf for details. + """ + setup pop-art normalization of the critic output + + See https://arxiv.org/pdf/1602.07714.pdf for details. + Preserving Outputs Precisely, while Adaptively Rescaling Targets”. + """ self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean - self.renormalize_Q_outputs_op = [] - for vs in [self.critic.output_vars, self.target_critic.output_vars]: - assert len(vs) == 2 - M, b = vs - assert 'kernel' in M.name - assert 'bias' in b.name - assert M.get_shape()[-1] == 1 - assert b.get_shape()[-1] == 1 - self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] - self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] + self.renormalize_q_outputs_op = [] + for out_vars in [self.critic.output_vars, self.target_critic.output_vars]: + assert len(out_vars) == 2 + # wieght and bias of the last layer + weight, bias = out_vars + assert 'kernel' in weight.name + assert 'bias' in bias.name + assert weight.get_shape()[-1] == 1 + assert bias.get_shape()[-1] == 1 + self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)] + self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): + """ + setup the running means and std of the inputs and outputs of the model + """ ops = [] names = [] @@ -252,58 +364,71 @@ def setup_stats(self): self.stats_ops = ops self.stats_names = names - def pi(self, obs, apply_noise=True, compute_Q=True): + def policy(self, obs, apply_noise=True, compute_q=True): + """ + Get the actions and critic output, from a given observation + + :param obs: ([float] or [int]) the observation + :param apply_noise: (bool) enable the noise + :param compute_q: (bool) compute the critic output + :return: ([float], float) the action and critic value + """ if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} - if compute_Q: - action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) + if compute_q: + action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) - q = None + q_value = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, 
self.action_range[0], self.action_range[1]) - return action, q + return action, q_value def store_transition(self, obs0, action, reward, obs1, terminal1): + """ + Store a transition in the replay buffer + + :param obs0: ([float] or [int]) the last observation + :param action: ([float]) the action + :param reward: (float] the reward + :param obs1: ([float] or [int]) the current observation + :param terminal1: (bool) is the episode done + """ reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): + """ + run a step of training from batch + :return: (float, float) critic loss, actor loss + """ # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: - old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ - self.obs1: batch['obs1'], - self.rewards: batch['rewards'], - self.terminals1: batch['terminals1'].astype('float32'), - }) - self.ret_rms.update(target_Q.flatten()) - self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ - self.old_std : np.array([old_std]), - self.old_mean : np.array([old_mean]), + old_mean, old_std, target_q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_q], + feed_dict={ + self.obs1: batch['obs1'], + self.rewards: batch['rewards'], + self.terminals1: batch['terminals1'].astype('float32'), + }) + self.ret_rms.update(target_q.flatten()) + self.sess.run(self.renormalize_q_outputs_op, feed_dict={ + self.old_std: np.array([old_std]), + self.old_mean: np.array([old_mean]), }) - # Run sanity check. Disabled by default since it slows down things considerably. - # print('running sanity check') - # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ - # self.obs1: batch['obs1'], - # self.rewards: batch['rewards'], - # self.terminals1: batch['terminals1'].astype('float32'), - # }) - # print(target_Q_new, target_Q, new_mean, new_std) - # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: - target_Q = self.sess.run(self.target_Q, feed_dict={ + target_q = self.sess.run(self.target_q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), @@ -314,14 +439,19 @@ def train(self): actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], - self.critic_target: target_Q, + self.critic_target: target_q, }) - self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) - self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) + self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) + self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): + """ + initialize the model parameters and optimizers + + :param sess: (TensorFlow Session) the current TensorFlow session + """ self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() @@ -329,9 +459,17 @@ def initialize(self, sess): self.sess.run(self.target_init_updates) def update_target_net(self): + """ + run target soft update operation + """ self.sess.run(self.target_soft_updates) def get_stats(self): + """ + Get the mean and standard deviation of the model's inputs and outputs + + :return: (dict) the means and stds + """ if 
self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. @@ -351,6 +489,11 @@ def get_stats(self): return stats def adapt_param_noise(self): + """ + calculate the adaptation for the parameter noise + + :return: (float) the mean distance for the parameter noise + """ if self.param_noise is None: return 0. @@ -369,7 +512,9 @@ def adapt_param_noise(self): return mean_distance def reset(self): - # Reset internal state after an episode is complete. + """ + Reset internal state after an episode is complete. + """ if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: diff --git a/baselines/ddpg/main.py b/baselines/ddpg/main.py index e877507b8e..48a3d78cec 100644 --- a/baselines/ddpg/main.py +++ b/baselines/ddpg/main.py @@ -1,22 +1,33 @@ import argparse import time import os -import logging + +import gym +import tensorflow as tf +import numpy as np +from mpi4py import MPI + from baselines import logger, bench -from baselines.common.misc_util import ( - set_global_seeds, - boolean_flag, -) +from baselines.common.misc_util import set_global_seeds, boolean_flag import baselines.ddpg.training as training from baselines.ddpg.models import Actor, Critic from baselines.ddpg.memory import Memory -from baselines.ddpg.noise import * +from baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise -import gym -import tensorflow as tf -from mpi4py import MPI def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): + """ + run the training of DDPG + + :param env_id: (str) the environment ID + :param seed: (int) the initial random seed + :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by + seperating them with commas + :param layer_norm: (bool) use layer normalization + :param evaluation: (bool) enable evaluation of DDPG training + :param kwargs: (dict) extra keywords for the training.train function + """ + # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: @@ -26,7 +37,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) - if evaluation and rank==0: + if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) @@ -46,10 +57,11 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), + sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) @@ -70,8 +82,8 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Disable logging for rank != 0 to avoid noise. 
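# --- illustrative example (not part of the patch) ---------------------------
# A sketch of what the comma-separated --noise-type specs parsed by run() above
# map to, assuming a 3-dimensional action space (nb_actions is hypothetical):
# 'adaptive-param_0.2' perturbs the actor parameters, 'normal_0.1' adds Gaussian
# action noise, 'ou_0.3' adds temporally correlated Ornstein-Uhlenbeck noise.
import numpy as np
from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                  OrnsteinUhlenbeckActionNoise)

nb_actions = 3
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
normal_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                 sigma=0.1 * np.ones(nb_actions))
ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                        sigma=0.3 * np.ones(nb_actions))
# action noise is added to the deterministic action and clipped to the action range:
noisy_action = np.clip(np.zeros(nb_actions) + ou_noise(), -1.0, 1.0)
# --- end of illustrative example ---------------------------------------------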
if rank == 0: start_time = time.time() - training.train(env=env, eval_env=eval_env, param_noise=param_noise, - action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) + training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, + critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() @@ -80,6 +92,11 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): def parse_args(): + """ + parse the arguments for DDPG training + + :return: (dict) the arguments + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') @@ -102,14 +119,15 @@ def parse_args(): parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker - parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none + # choices are adaptive-param_xx, ou_xx, normal_xx, none + parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') parser.add_argument('--num-timesteps', type=int, default=None) boolean_flag(parser, 'evaluation', default=False) args = parser.parse_args() # we don't directly specify timesteps for this script, so make sure that if we do specify them # they agree with the other parameters if args.num_timesteps is not None: - assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) + assert args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps dict_args = vars(args) del dict_args['num_timesteps'] return dict_args diff --git a/baselines/ddpg/memory.py b/baselines/ddpg/memory.py index 90f0f9a18a..474c42a82b 100644 --- a/baselines/ddpg/memory.py +++ b/baselines/ddpg/memory.py @@ -3,6 +3,13 @@ class RingBuffer(object): def __init__(self, maxlen, shape, dtype='float32'): + """ + A buffer object, when full restarts at the initial position + + :param maxlen: (int) the max number of numpy objects to store + :param shape: (tuple) the shape of the numpy objects you want to store + :param dtype: (str) the name of the type of the numpy object you want to store + """ self.maxlen = maxlen self.start = 0 self.length = 0 @@ -17,9 +24,20 @@ def __getitem__(self, idx): return self.data[(self.start + idx) % self.maxlen] def get_batch(self, idxs): + """ + get the value at the indexes + + :param idxs: (int or numpy int) the indexes + :return: (numpy Any) the stored information in the buffer at the asked positions + """ return self.data[(self.start + idxs) % self.maxlen] - def append(self, v): + def append(self, var): + """ + Append an object to the buffer + + :param var: (numpy Any) the object you wish to add + """ if self.length < self.maxlen: # We have space, simply increase the length. self.length += 1 @@ -29,18 +47,31 @@ def append(self, v): else: # This should never happen. 
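# --- illustrative example (not part of the patch) ---------------------------
# A small sketch of the RingBuffer behaviour described above: once maxlen is
# reached it starts overwriting its oldest entries, which is what bounds the
# size of the replay memory.
from baselines.ddpg.memory import RingBuffer

ring = RingBuffer(maxlen=3, shape=())
for value in (1.0, 2.0, 3.0, 4.0):
    ring.append(value)
assert ring[0] == 2.0 and ring[2] == 4.0  # 1.0 has been overwritten by 4.0
# --- end of illustrative example ---------------------------------------------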
raise RuntimeError() - self.data[(self.start + self.length - 1) % self.maxlen] = v + self.data[(self.start + self.length - 1) % self.maxlen] = var -def array_min2d(x): - x = np.array(x) - if x.ndim >= 2: - return x - return x.reshape(-1, 1) +def array_min2d(arr): + """ + cast to numpy array, and make sure it is of 2 dim + + :param arr: ([Any]) the array to clean + :return: (numpy Any) the cleaned array + """ + arr = np.array(arr) + if arr.ndim >= 2: + return arr + return arr.reshape(-1, 1) class Memory(object): def __init__(self, limit, action_shape, observation_shape): + """ + The replay buffer object + + :param limit: (int) the max number of transitions to store + :param action_shape: (tuple) the action shape + :param observation_shape: (tuple) the observation shape + """ self.limit = limit self.observations0 = RingBuffer(limit, shape=observation_shape) @@ -50,6 +81,12 @@ def __init__(self, limit, action_shape, observation_shape): self.observations1 = RingBuffer(limit, shape=observation_shape) def sample(self, batch_size): + """ + sample a random batch from the buffer + + :param batch_size: (int) the number of element to sample for the batch + :return: (dict) the sampled batch + """ # Draw such that we always have a proceeding element. batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size) @@ -69,6 +106,16 @@ def sample(self, batch_size): return result def append(self, obs0, action, reward, obs1, terminal1, training=True): + """ + Append a transition to the buffer + + :param obs0: ([float] or [int]) the last observation + :param action: ([float]) the action + :param reward: (float] the reward + :param obs1: ([float] or [int]) the current observation + :param terminal1: (bool) is the episode done + :param training: (bool) is the RL model training or not + """ if not training: return diff --git a/baselines/ddpg/models.py b/baselines/ddpg/models.py index dc5803a035..efb89360ee 100644 --- a/baselines/ddpg/models.py +++ b/baselines/ddpg/models.py @@ -3,8 +3,14 @@ class Model(object): - def __init__(self, name): + def __init__(self, name, layer_norm=True): + """ + A TensorFlow Model type + + :param name: (str) the name of the model + """ self.name = name + self.layer_norm = layer_norm @property def vars(self): @@ -18,10 +24,27 @@ def trainable_vars(self): def perturbable_vars(self): return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] + def fc_with_relu(self, input_tensor): + """ + Fully connected layer followed by ReLU + with optional batchnorm + """ + preactivation = tf.layers.dense(input_tensor, 64) + if self.layer_norm: + preactivation = tc.layers.layer_norm(preactivation, center=True, scale=True) + return tf.nn.relu(preactivation) + class Actor(Model): def __init__(self, nb_actions, name='actor', layer_norm=True): - super(Actor, self).__init__(name=name) + """ + A TensorFlow Actor model, this is used to output the actions + + :param nb_actions: (int) the size of the action space + :param name: (str) the name of the model (default: 'actor') + :param layer_norm: (bool) enable layer normalization + """ + super(Actor, self).__init__(name=name, layer_norm=layer_norm) self.nb_actions = nb_actions self.layer_norm = layer_norm @@ -30,25 +53,23 @@ def __call__(self, obs, reuse=False): if reuse: scope.reuse_variables() - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, 
scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - x = tf.nn.tanh(x) - return x + layer_1 = self.fc_with_relu(obs) + layer_2 = self.fc_with_relu(layer_1) + last_layer = tf.layers.dense(layer_2, self.nb_actions, + kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) + squashed_out = tf.nn.tanh(last_layer) + return squashed_out class Critic(Model): def __init__(self, name='critic', layer_norm=True): - super(Critic, self).__init__(name=name) + """ + A TensorFlow Critic model, this is used to output the value of a state + + :param name: (str) the name of the model (default: 'critic') + :param layer_norm: (bool) enable layer normalization + """ + super(Critic, self).__init__(name=name, layer_norm=layer_norm) self.layer_norm = layer_norm def __call__(self, obs, action, reuse=False): @@ -56,20 +77,12 @@ def __call__(self, obs, action, reuse=False): if reuse: scope.reuse_variables() - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.concat([x, action], axis=-1) - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - return x + layer_1 = self.fc_with_relu(obs) + layer_2 = tf.concat([layer_1, action], axis=-1) + layer_3 = self.fc_with_relu(layer_2) + value = tf.layers.dense(layer_3, 1, + kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) + return value @property def output_vars(self): diff --git a/baselines/ddpg/noise.py b/baselines/ddpg/noise.py index c48d0d6a22..46c1da8011 100644 --- a/baselines/ddpg/noise.py +++ b/baselines/ddpg/noise.py @@ -3,6 +3,13 @@ class AdaptiveParamNoiseSpec(object): def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): + """ + Implements adaptive parameter noise + + :param initial_stddev: (float) the initial value for the standard deviation of the noise + :param desired_action_stddev: (float) the desired value for the standard deviation of the noise + :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise + """ self.initial_stddev = initial_stddev self.desired_action_stddev = desired_action_stddev self.adoption_coefficient = adoption_coefficient @@ -10,6 +17,11 @@ def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coeff self.current_stddev = initial_stddev def adapt(self, distance): + """ + update the standard deviation for the parameter noise + + :param distance: (float) the noise distance applied to the parameters + """ if distance > self.desired_action_stddev: # Decrease stddev. 
self.current_stddev /= self.adoption_coefficient @@ -18,10 +30,12 @@ def adapt(self, distance): self.current_stddev *= self.adoption_coefficient def get_stats(self): - stats = { - 'param_noise_stddev': self.current_stddev, - } - return stats + """ + return the standard deviation for the parameter noise + + :return: (dict) the stats of the noise + """ + return {'param_noise_stddev': self.current_stddev} def __repr__(self): fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' @@ -29,39 +43,66 @@ def __repr__(self): class ActionNoise(object): + """ + The action noise base class + """ def reset(self): + """ + call end of episode reset for the noise + """ pass class NormalActionNoise(ActionNoise): - def __init__(self, mu, sigma): - self.mu = mu - self.sigma = sigma + def __init__(self, mean, sigma): + """ + A guassian action noise + + :param mean: (float) the mean value of the noise + :param sigma: (float) the scale of the noise (std here) + """ + self._mu = mean + self._sigma = sigma def __call__(self): - return np.random.normal(self.mu, self.sigma) + return np.random.normal(self._mu, self._sigma) def __repr__(self): - return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) + return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) -# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab class OrnsteinUhlenbeckActionNoise(ActionNoise): - def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): - self.theta = theta - self.mu = mu - self.sigma = sigma - self.dt = dt - self.x0 = x0 + def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None): + """ + A Ornstein Uhlenbeck action noise, this is designed to aproximate brownian motion with friction. 
+ + Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab + + :param mean: (float) the mean of the noise + :param sigma: (float) the scale of the noise + :param theta: (float) the rate of mean reversion + :param dt: (float) the timestep for the noise + :param initial_noise: ([float]) the initial value for the noise output, (if None: 0) + """ + self._theta = theta + self._mu = mean + self._sigma = sigma + self._dt = dt + self.initial_noise = initial_noise + self.noise_prev = None self.reset() def __call__(self): - x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) - self.x_prev = x - return x + noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \ + self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape) + self.noise_prev = noise + return noise def reset(self): - self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) + """ + reset the Ornstein Uhlenbeck noise, to the initial position + """ + self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu) def __repr__(self): - return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) + return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) diff --git a/baselines/ddpg/training.py b/baselines/ddpg/training.py index 74a9b8fd1c..454279dc3b 100644 --- a/baselines/ddpg/training.py +++ b/baselines/ddpg/training.py @@ -3,43 +3,72 @@ from collections import deque import pickle -from baselines.ddpg.ddpg import DDPG -import baselines.common.tf_util as U - -from baselines import logger import numpy as np import tensorflow as tf from mpi4py import MPI +from baselines.ddpg.ddpg import DDPG +import baselines.common.tf_util as tf_util +from baselines import logger + def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, - normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, - popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, - tau=0.01, eval_env=None, param_noise_adaption_interval=50): + normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, + popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, + tau=0.01, eval_env=None, param_noise_adaption_interval=50): + """ + Runs the training of the Deep Deterministic Policy Gradien (DDPG) model + + DDPG: https://arxiv.org/pdf/1509.02971.pdf + + :param env: (Gym Environment) the environment + :param nb_epochs: (int) the number of training epochs + :param nb_epoch_cycles: (int) the number cycles within each epoch + :param render_eval: (bool) enable rendering of the evalution environment + :param reward_scale: (float) the value the reward should be scaled by + :param render: (bool) enable rendering of the environment + :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) + :param actor: (TensorFlow Tensor) the actor model + :param critic: (TensorFlow Tensor) the critic model + :param normalize_returns: (bool) should the critic output be normalized + :param normalize_observations: (bool) should the observation be normalized + :param critic_l2_reg: (float) l2 regularizer coefficient + :param actor_lr: (float) the actor learning rate + :param critic_lr: (float) the critic learning rate + :param 
action_noise: (ActionNoise) the action noise type (can be None) + :param popart: (bool) enable pop-art normalization of the critic output + (https://arxiv.org/pdf/1602.07714.pdf) + :param gamma: (float) the discount rate + :param clip_norm: (float) clip the gradients (disabled if None) + :param nb_train_steps: (int) the number of training steps + :param nb_rollout_steps: (int) the number of rollout steps + :param nb_eval_steps: (int) the number of evalutation steps + :param batch_size: (int) the size of the batch for learning the policy + :param memory: (Memory) the replay buffer + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :param eval_env: (Gym Environment) the evaluation environment (can be None) + :param param_noise_adaption_interval: (int) apply param noise every N steps + """ rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) - agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, - gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, - batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, - actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, - reward_scale=reward_scale) + agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, param_noise=param_noise, + action_noise=action_noise, gamma=gamma, tau=tau, normalize_returns=normalize_returns, + enable_popart=popart, normalize_observations=normalize_observations, batch_size=batch_size, + critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, clip_norm=clip_norm, + reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: - saver = tf.train.Saver() - else: - saver = None + tf.train.Saver() - step = 0 - episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) - with U.single_threaded_session() as sess: + with tf_util.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() @@ -48,46 +77,42 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() - done = False episode_reward = 0. episode_step = 0 episodes = 0 - t = 0 + step = 0 - epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] - epoch_episode_eval_rewards = [] - epoch_episode_eval_steps = [] - epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): - for cycle in range(nb_epoch_cycles): + for _ in range(nb_epoch_cycles): # Perform rollouts. - for t_rollout in range(nb_rollout_steps): + for _ in range(nb_rollout_steps): # Predict next action. - action, q = agent.pi(obs, apply_noise=True, compute_Q=True) + action, q_value = agent.policy(obs, apply_noise=True, compute_q=True) assert action.shape == env.action_space.shape # Execute next action. 
if rank == 0 and render: env.render() assert max_action.shape == action.shape - new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) - t += 1 + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + new_obs, reward, done, _ = env.step(max_action * action) + step += 1 if rank == 0 and render: env.render() - episode_reward += r + episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) - epoch_qs.append(q) - agent.store_transition(obs, action, r, new_obs, done) + epoch_qs.append(q_value) + agent.store_transition(obs, action, reward, new_obs, done) obs = new_obs if done: @@ -113,9 +138,9 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) - cl, al = agent.train() - epoch_critic_losses.append(cl) - epoch_actor_losses.append(al) + critic_loss, actor_loss = agent.train() + epoch_critic_losses.append(critic_loss) + epoch_actor_losses.append(actor_loss) agent.update_target_net() # Evaluate. @@ -123,9 +148,10 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa eval_qs = [] if eval_env is not None: eval_episode_reward = 0. - for t_rollout in range(nb_eval_steps): - eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) - eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + for _ in range(nb_eval_steps): + eval_action, eval_q = agent.policy(eval_obs, apply_noise=False, compute_q=True) + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + eval_obs, eval_r, eval_done, _ = eval_env.step(max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r @@ -152,7 +178,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration - combined_stats['total/steps_per_second'] = float(t) / float(duration) + combined_stats['total/steps_per_second'] = float(step) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) @@ -162,20 +188,27 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) - def as_scalar(x): - if isinstance(x, np.ndarray): - assert x.size == 1 - return x[0] - elif np.isscalar(x): - return x + + def as_scalar(scalar): + """ + check and return the input if it is a scalar, otherwise raise ValueError + + :param scalar: (Any) the object to check + :return: (Number) the scalar if x is a scalar + """ + if isinstance(scalar, np.ndarray): + assert scalar.size == 1 + return scalar[0] + elif np.isscalar(scalar): + return scalar else: - raise ValueError('expected scalar, got %s'%x) + raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) - combined_stats = {k : v / mpi_size for (k,v) in 
zip(combined_stats.keys(), combined_stats_sums)} + combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 - combined_stats['total/steps'] = t + combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) @@ -184,8 +217,8 @@ def as_scalar(x): logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): - with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: - pickle.dump(env.get_state(), f) + with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: + pickle.dump(env.get_state(), file_handler) if eval_env and hasattr(eval_env, 'get_state'): - with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: - pickle.dump(eval_env.get_state(), f) + with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: + pickle.dump(eval_env.get_state(), file_handler) diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py index 4472399a51..d30bf14b5f 100644 --- a/baselines/deepq/__init__.py +++ b/baselines/deepq/__init__.py @@ -3,6 +3,13 @@ from baselines.deepq.simple import learn, load # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa + def wrap_atari_dqn(env): + """ + wrap the environment in atari wrappers for DeepQ + + :param env: (Gym Environment) the environment + :return: (Gym Environment) the wrapped environment + """ from baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file + return wrap_deepmind(env, frame_stack=True, scale=True) diff --git a/baselines/deepq/build_graph.py b/baselines/deepq/build_graph.py index e9ff1a41a3..1b4a3aae6a 100644 --- a/baselines/deepq/build_graph.py +++ b/baselines/deepq/build_graph.py @@ -6,46 +6,28 @@ Function to chose an action given an observation - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens - (default: no update) - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. + :param observation: (Any) Observation that can be feed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon a new value, if negative not update happens (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. 
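For reference, a minimal rollout sketch (illustrative only; the helper name `run_episode` and the epsilon handling are not part of this patch) of how an act function with the interface above is typically driven with a batched observation:

```python
import numpy as np

def run_episode(act, env, epsilon):
    """Roll out one episode with an act(observation, stochastic, update_eps) function."""
    obs, done, total_reward = env.reset(), False, 0.0
    while not done:
        # act expects a batch of observations and returns one int64 action per element,
        # so add a leading batch dimension and unpack the single action.
        action = act(np.array(obs)[None], update_eps=epsilon)[0]
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward
```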
======= act (in case of parameter noise) ======== Function to chose an action given an observation - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens + :param observation: (Any) Observation that can be feed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon a new value, if negative not update happens (default: no update) - reset_ph: bool - reset the perturbed policy by sampling a new perturbation - update_param_noise_threshold_ph: float - the desired threshold for the difference between non-perturbed and perturbed policy - update_param_noise_scale_ph: bool - whether or not to update the scale of the noise for the next time it is re-perturbed - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. + :param reset_ph: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold_ph: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + :param update_param_noise_scale_ph: (bool) whether or not to update the scale of the noise for the next time it is + re-perturbed + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. ======= train ======= @@ -55,30 +37,17 @@ td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) loss = huber_loss[td_error] - Parameters - ---------- - obs_t: object - a batch of observations - action: np.array - actions that were selected upon seeing obs_t. - dtype must be int32 and shape must be (batch_size,) - reward: np.array - immediate reward attained after executing those actions - dtype must be float32 and shape must be (batch_size,) - obs_tp1: object - observations that followed obs_t - done: np.array - 1 if obs_t was the last observation in the episode and 0 otherwise - obs_tp1 gets ignored, but must be of the valid shape. - dtype must be float32 and shape must be (batch_size,) - weight: np.array - imporance weights for every element of the batch (gradient is multiplied - by the importance weight) dtype must be float32 and shape must be (batch_size,) - - Returns - ------- - td_error: np.array - a list of differences between Q(s,a) and the target in Bellman's equation. + :param obs_t: (Any) a batch of observations + :param action: (numpy int) actions that were selected upon seeing obs_t. dtype must be int32 and shape must be + (batch_size,) + :param reward: (numpy float) immediate reward attained after executing those actions dtype must be float32 and + shape must be (batch_size,) + :param obs_tp1: (Any) observations that followed obs_t + :param done: (numpy bool) 1 if obs_t was the last observation in the episode and 0 otherwise obs_tp1 gets ignored, + but must be of the valid shape. dtype must be float32 and shape must be (batch_size,) + :param weight: (numpy float) imporance weights for every element of the batch (gradient is multiplied by the + importance weight) dtype must be float32 and shape must be (batch_size,) + :return: (numpy float) td_error: a list of differences between Q(s,a) and the target in Bellman's equation. 
dtype is float32 and shape is (batch_size,) ======= update_target ======== @@ -94,23 +63,17 @@ """ import tensorflow as tf -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_utils def scope_vars(scope, trainable_only=False): """ Get variables inside a scope The scope can be specified as a string - Parameters - ---------- - scope: str or VariableScope - scope in which the variables reside. - trainable_only: bool - whether or not to return only the variables that were marked as trainable. - Returns - ------- - vars: [tf.Variable] - list of variables in `scope`. + + :param scope: (str or VariableScope) scope in which the variables reside. + :param trainable_only: (bool) whether or not to return only the variables that were marked as trainable. + :return: ([TensorFlow Tensor]) vars: list of variables in `scope`. """ return tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, @@ -119,16 +82,30 @@ def scope_vars(scope, trainable_only=False): def scope_name(): - """Returns the name of current scope as a string, e.g. deepq/q_func""" + """ + Returns the name of current scope as a string, e.g. deepq/q_func + + :return: (str) the name of current scope + """ return tf.get_variable_scope().name def absolute_scope_name(relative_scope_name): - """Appends parent scope name to `relative_scope_name`""" + """ + Appends parent scope name to `relative_scope_name` + + :return: (str) the absolute name of the scope + """ return scope_name() + "/" + relative_scope_name def default_param_noise_filter(var): + """ + check whether or not a variable is perturbable or not + + :param var: (TensorFlow Tensor) the variable + :return: (bool) can be perturb + """ if var not in tf.trainable_variables(): # We never perturb non-trainable vars. return False @@ -146,11 +123,9 @@ def default_param_noise_filter(var): def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): """Creates the act function: - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: observation_in: object the output of observation placeholder @@ -160,18 +135,11 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. 
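As a point of reference, a q_func compatible with the documented (observation_in, num_actions, scope, reuse) contract can be a small sketch like the one below (an assumption for illustration; compare the custom cartpole model later in this patch):

```python
import tensorflow as tf
import tensorflow.contrib.layers as layers

def simple_q_func(observation_in, num_actions, scope, reuse=False):
    """Map a batch of observations to one Q-value per action."""
    with tf.variable_scope(scope, reuse=reuse):
        hidden = layers.fully_connected(observation_in, num_outputs=64, activation_fn=tf.nn.tanh)
        return layers.fully_connected(hidden, num_outputs=num_actions, activation_fn=None)
```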
""" with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") @@ -190,23 +158,26 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True}, - updates=[update_eps_expr]) - def act(ob, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps) + _act = tf_utils.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True}, + updates=[update_eps_expr]) + + def act(obs, stochastic=True, update_eps=-1): + return _act(obs, stochastic, update_eps) + return act -def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None): +def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, + param_noise_filter_func=None): """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): Parameters ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: observation_in: object the output of observation placeholder @@ -216,21 +187,14 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. 
""" if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter @@ -244,18 +208,28 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reset_ph = tf.placeholder(tf.bool, (), name="reset") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) - param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) + param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), + trainable=False) + param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), + trainable=False) # Unmodified Q. q_values = q_func(observations_ph.get(), num_actions, scope="q_func") # Perturbable Q used for the actual rollout. q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") - # We have to wrap this code into a function due to the way tf.cond() works. See - # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for - # a more detailed discussion. + def perturb_vars(original_scope, perturbed_scope): + """ + We have to wrap this code into a function due to the way tf.cond() works. + + See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed + discussion. + + :param original_scope: (str or VariableScope) the original scope. + :param perturbed_scope: (str or VariableScope) the perturbed scope. + :return: (TensorFlow Operation) + """ all_vars = scope_vars(absolute_scope_name(original_scope)) all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) assert len(all_vars) == len(all_perturbed_vars) @@ -263,11 +237,13 @@ def perturb_vars(original_scope, perturbed_scope): for var, perturbed_var in zip(all_vars, all_perturbed_vars): if param_noise_filter_func(perturbed_var): # Perturb this variable. - op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) + operation = tf.assign(perturbed_var, + var + tf.random_normal(shape=tf.shape(var), mean=0., + stddev=param_noise_scale)) else: # Do not perturb, just assign. - op = tf.assign(perturbed_var, var) - perturb_ops.append(op) + operation = tf.assign(perturbed_var, var) + perturb_ops.append(operation) assert len(perturb_ops) == len(all_vars) return tf.group(*perturb_ops) @@ -276,19 +252,28 @@ def perturb_vars(original_scope, perturbed_scope): # is too big, reduce scale of perturbation, otherwise increase. 
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") - kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1) - mean_kl = tf.reduce_mean(kl) + kl_loss = tf.reduce_sum( + tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), + axis=-1) + mean_kl = tf.reduce_mean(kl_loss) + def update_scale(): + """ + update the scale expression + + :return: (TensorFlow Tensor) the updated scale expression + """ with tf.control_dependencies([perturb_for_adaption]): update_scale_expr = tf.cond(mean_kl < param_noise_threshold, - lambda: param_noise_scale.assign(param_noise_scale * 1.01), - lambda: param_noise_scale.assign(param_noise_scale / 1.01), - ) + lambda: param_noise_scale.assign(param_noise_scale * 1.01), + lambda: param_noise_scale.assign(param_noise_scale / 1.01), + ) return update_scale_expr # Functionality to update the threshold for parameter space noise. - update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0, - lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) + update_param_noise_thres_expr = param_noise_threshold.assign( + tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, + lambda: param_noise_threshold)) # Put everything together. deterministic_actions = tf.argmax(q_values_perturbed, axis=1) @@ -301,77 +286,83 @@ def update_scale(): update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) updates = [ update_eps_expr, - tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])), + tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), + lambda: tf.group(*[])), tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), - update_param_noise_threshold_expr, + update_param_noise_thres_expr, ] - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, - updates=updates) - def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) + _act = tf_utils.function( + inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, + update_param_noise_scale_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, + update_param_noise_scale_ph: False}, + updates=updates) + + def act(obs, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): + """ + get the action from the current observation + + :param obs: (Any) Observation that can be feed into the output of make_obs_ph + :param reset: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + 
:param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time + it is re-perturbed + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps: (float) update epsilon a new value, if negative not update happens + (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be + performed for every element of the batch. + """ + return _act(obs, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) + return act def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, - double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): - """Creates the train function: + double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): + """ + Creates the train function: - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that takes a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions - reuse: bool - whether or not to reuse the graph variables - optimizer: tf.train.Optimizer - optimizer to use for the Q-learning objective. - grad_norm_clipping: float or None - clip gradient norms to this value. If None no clipping is performed. - gamma: float - discount rate. - double_q: bool - if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). - In general it is a good idea to keep it enabled. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise: bool - whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. - train: (object, np.array, np.array, object, np.array, np.array) -> np.array - optimize the error in Bellman's equation. -` See the top of the file for details. - update_target: () -> () - copy the parameters from optimized Q function to the target Q function. -` See the top of the file for details. - debug: {str: function} - a bunch of functions to print debug data like q_values. + - observation_in: (Any) the output of observation placeholder + - num_actions: int number of actions + - scope: (str) + - reuse: (bool) + + should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) + with values of every action. 
+ :param num_actions: (int) number of actions + :param reuse: (bool) whether or not to reuse the graph variables + :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. + :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. + :param gamma: (float) discount rate. + :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a + good idea to keep it enabled. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + + :return: (tuple) + + act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given + observation. See the top of the file for details. + train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) + optimize the error in Bellman's equation. See the top of the file for details. + update_target: (function) copy the parameters from optimized Q function to the target Q function. + See the top of the file for details. + debug: ({str: function}) a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, - param_noise_filter_func=param_noise_filter_func) + param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) @@ -390,7 +381,8 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") - target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") + target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, + scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) @@ -409,7 +401,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = U.huber_loss(td_error) + errors = tf_utils.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) @@ -430,7 +422,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= update_target_expr = tf.group(*update_target_expr) # Create callable functions - train = U.function( + train = tf_utils.function( inputs=[ obs_t_input, act_t_ph, @@ -442,8 +434,8 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= outputs=td_error, updates=[optimize_expr] ) - update_target = U.function([], [], updates=[update_target_expr]) + update_target = tf_utils.function([], [], updates=[update_target_expr]) - q_values = U.function([obs_t_input], q_t) + q_values = tf_utils.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values} diff --git a/baselines/deepq/experiments/custom_cartpole.py b/baselines/deepq/experiments/custom_cartpole.py index b5a381a37e..8fb9fc0bb9 100644 --- a/baselines/deepq/experiments/custom_cartpole.py +++ b/baselines/deepq/experiments/custom_cartpole.py @@ -1,20 +1,28 @@ -import gym import itertools +import argparse + +import gym import numpy as np import tensorflow as tf import tensorflow.contrib.layers as layers -import baselines.common.tf_util as U - -from baselines import logger -from baselines import deepq +import baselines.common.tf_util as tf_utils +from baselines import logger, deepq from baselines.deepq.replay_buffer import ReplayBuffer from baselines.deepq.utils import ObservationInput from baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): - """This model takes as input an observation and returns values of all actions.""" + """ + This model takes as input an observation and returns values of all actions. + + :param inpt: (TensorFlow Tensor) the input placeholder + :param num_actions: (int) size of the action space + :param scope: (str) the variable scope + :param reuse: (bool) is a reusable model + :return: (TensorFlow Tensor) + """ with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) @@ -23,7 +31,13 @@ def model(inpt, num_actions, scope, reuse=False): if __name__ == '__main__': - with U.make_session(8): + parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + parser.add_argument('--max-timesteps', default=50000, type=int, + help="Maximum number of timesteps when not rendering") + args = parser.parse_args() + + with tf_utils.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model @@ -40,7 +54,7 @@ def model(inpt, num_actions, scope, reuse=False): exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. 
- U.initialize() + tf_utils.initialize() update_target() episode_rewards = [0.0] @@ -58,8 +72,19 @@ def model(inpt, num_actions, scope, reuse=False): obs = env.reset() episode_rewards.append(0) - is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) + + is_solved = t > 100 and mean_100ep_reward >= 200 + + if args.no_render and t > args.max_timesteps: + break + if is_solved: + if args.no_render: + break # Show off the result env.render() else: @@ -74,6 +99,6 @@ def model(inpt, num_actions, scope, reuse=False): if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) - logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) + logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() diff --git a/baselines/deepq/experiments/enjoy_cartpole.py b/baselines/deepq/experiments/enjoy_cartpole.py index 1c6176bac3..378e29380f 100644 --- a/baselines/deepq/experiments/enjoy_cartpole.py +++ b/baselines/deepq/experiments/enjoy_cartpole.py @@ -1,9 +1,16 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + run a trained model for the cartpole problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("CartPole-v0") act = deepq.load("cartpole_model.pkl") @@ -11,11 +18,18 @@ def main(): obs, done = env.reset(), False episode_rew = 0 while not done: - env.render() + if not args.no_render: + env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Enjoy trained DQN on cartpole") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/enjoy_mountaincar.py b/baselines/deepq/experiments/enjoy_mountaincar.py index 8bced8c0f8..73396e2aa0 100644 --- a/baselines/deepq/experiments/enjoy_mountaincar.py +++ b/baselines/deepq/experiments/enjoy_mountaincar.py @@ -1,9 +1,16 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + run a trained model for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("MountainCar-v0") act = deepq.load("mountaincar_model.pkl") @@ -11,11 +18,18 @@ def main(): obs, done = env.reset(), False episode_rew = 0 while not done: - env.render() + if not args.no_render: + env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Enjoy trained DQN on MountainCar") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/enjoy_pong.py b/baselines/deepq/experiments/enjoy_pong.py index 5b16fec6b6..0c4db3e2a9 100644 --- a/baselines/deepq/experiments/enjoy_pong.py +++ 
b/baselines/deepq/experiments/enjoy_pong.py @@ -1,8 +1,12 @@ import gym + from baselines import deepq def main(): + """ + run a trained model for the pong problem + """ env = gym.make("PongNoFrameskip-v4") env = deepq.wrap_atari_dqn(env) act = deepq.load("pong_model.pkl") diff --git a/baselines/deepq/experiments/run_atari.py b/baselines/deepq/experiments/run_atari.py index b6b427ba7a..04ffb18cfb 100644 --- a/baselines/deepq/experiments/run_atari.py +++ b/baselines/deepq/experiments/run_atari.py @@ -1,12 +1,16 @@ -from baselines import deepq -from baselines.common import set_global_seeds -from baselines import bench import argparse -from baselines import logger + +import tensorflow as tf + +from baselines import deepq, bench, logger +from baselines.common import set_global_seeds from baselines.common.atari_wrappers import make_atari def main(): + """ + run the atari test + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -29,23 +33,24 @@ def main(): dueling=bool(args.dueling), ) - deepq.learn( - env, - q_func=model, - lr=1e-4, - max_timesteps=args.num_timesteps, - buffer_size=10000, - exploration_fraction=0.1, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=10000, - target_network_update_freq=1000, - gamma=0.99, - prioritized_replay=bool(args.prioritized), - prioritized_replay_alpha=args.prioritized_replay_alpha, - checkpoint_freq=args.checkpoint_freq, - checkpoint_path=args.checkpoint_path, - ) + with tf.Session(): + deepq.learn( + env, + q_func=model, + learning_rate=1e-4, + max_timesteps=args.num_timesteps, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=bool(args.prioritized), + prioritized_replay_alpha=args.prioritized_replay_alpha, + checkpoint_freq=args.checkpoint_freq, + checkpoint_path=args.checkpoint_path, + ) env.close() diff --git a/baselines/deepq/experiments/train_cartpole.py b/baselines/deepq/experiments/train_cartpole.py index a50c2428f9..bf68adeb36 100644 --- a/baselines/deepq/experiments/train_cartpole.py +++ b/baselines/deepq/experiments/train_cartpole.py @@ -1,22 +1,41 @@ +import argparse + import gym +import numpy as np from baselines import deepq def callback(lcl, _glb): + """ + the callback function for logging and saving + + :param lcl: (dict) the local variables + :param _glb: (dict) the global variables + :return: (bool) is solved + """ # stop training if reward exceeds 199 - is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 + if len(lcl['episode_rewards'][-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) + is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 return is_solved -def main(): +def main(args): + """ + train and save the DeepQ model, for the cartpole problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("CartPole-v0") model = deepq.models.mlp([64]) act = deepq.learn( env, q_func=model, - lr=1e-3, - max_timesteps=100000, + learning_rate=1e-3, + max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, @@ -28,4 +47,7 @@ def main(): if __name__ == '__main__': - main() + parser = 
argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/train_mountaincar.py b/baselines/deepq/experiments/train_mountaincar.py index 061967d760..32ba2645c0 100644 --- a/baselines/deepq/experiments/train_mountaincar.py +++ b/baselines/deepq/experiments/train_mountaincar.py @@ -1,17 +1,24 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + train and save the DeepQ model, for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! model = deepq.models.mlp([64], layer_norm=True) act = deepq.learn( env, q_func=model, - lr=1e-3, - max_timesteps=100000, + learning_rate=1e-3, + max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, @@ -23,4 +30,7 @@ def main(): if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/models.py b/baselines/deepq/models.py index 198d795a06..686d989260 100644 --- a/baselines/deepq/models.py +++ b/baselines/deepq/models.py @@ -14,19 +14,17 @@ def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False): return q_out -def mlp(hiddens=[], layer_norm=False): - """This model takes as input an observation and returns values of all actions. +def mlp(hiddens=None, layer_norm=False): + """ + This model takes as input an observation and returns values of all actions. - Parameters - ---------- - hiddens: [int] - list of sizes of hidden layers + :param hiddens: ([int]) list of sizes of hidden layers + :param layer_norm: (bool) if true, use layer normalization - Returns - ------- - q_func: function - q_function for DQN algorithm. + :return: (function) q_function for DQN algorithm. """ + if hiddens is None: + hiddens = [] return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs) @@ -70,21 +68,11 @@ def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): """This model takes as input an observation and returns values of all actions. - Parameters - ---------- - convs: [(int, int int)] - list of convolutional layers in form of - (num_outputs, kernel_size, stride) - hiddens: [int] - list of sizes of hidden layers - dueling: bool - if true double the output MLP to compute a baseline - for action scores - - Returns - ------- - q_func: function - q_function for DQN algorithm. + :param convs: ([(int, int, int)]) list of convolutional layers in form of (num_outputs, kernel_size, stride) + :param hiddens: ([int]) list of sizes of hidden layers + :param dueling: (bool) if true double the output MLP to compute a baseline for action scores + :param layer_norm: (bool) if true, use layer normalization + :return: (function) q_function for DQN algorithm. 
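For context, a typical call to this helper, matching the dueling Atari configuration used by run_atari.py earlier in this patch (the dueling flag is hard-coded here purely for illustration):

```python
from baselines import deepq

# Dueling CNN-to-MLP Q-function with the standard Atari architecture.
model = deepq.models.cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
    hiddens=[256],
    dueling=True,
)
```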
""" return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs) diff --git a/baselines/deepq/replay_buffer.py b/baselines/deepq/replay_buffer.py index 7988113b0e..dcd79fbd2b 100644 --- a/baselines/deepq/replay_buffer.py +++ b/baselines/deepq/replay_buffer.py @@ -1,18 +1,17 @@ -import numpy as np import random +import numpy as np + from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree class ReplayBuffer(object): def __init__(self, size): - """Create Replay buffer. + """ + Create Replay buffer. - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old + memories are dropped. """ self._storage = [] self._maxsize = size @@ -22,6 +21,15 @@ def __len__(self): return len(self._storage) def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ data = (obs_t, action, reward, obs_tp1, done) if self._next_idx >= len(self._storage): @@ -42,27 +50,18 @@ def _encode_sample(self, idxes): dones.append(done) return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) - def sample(self, batch_size): - """Sample a batch of experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. + def sample(self, batch_size, **_kwargs): + """ + Sample a batch of experiences. + + :param batch_size: (int) How many transitions to sample. + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. """ idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] return self._encode_sample(idxes) @@ -70,20 +69,14 @@ def sample(self, batch_size): class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): - """Create Prioritized Replay buffer. - - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. - alpha: float - how much prioritization is used - (0 - no prioritization, 1 - full prioritization) - - See Also - -------- - ReplayBuffer.__init__ + """ + Create Prioritized Replay buffer. + + See Also ReplayBuffer.__init__ + + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old memories + are dropped. 
+ :param alpha: (float) how much prioritization is used (0 - no prioritization, 1 - full prioritization) """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha >= 0 @@ -97,10 +90,18 @@ def __init__(self, size, alpha): self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 - def add(self, *args, **kwargs): - """See ReplayBuffer.store_effect""" + def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ idx = self._next_idx - super().add(*args, **kwargs) + super().add(obs_t, action, reward, obs_tp1, done) self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha @@ -113,41 +114,26 @@ def _sample_proportional(self, batch_size): res.append(idx) return res - def sample(self, batch_size, beta): - """Sample a batch of experiences. + def sample(self, batch_size, beta=0): + """ + Sample a batch of experiences. compared to ReplayBuffer.sample it also returns importance weights and idxes of sampled experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - beta: float - To what degree to use importance weights - (0 - no corrections, 1 - full correction) - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. - weights: np.array - Array of shape (batch_size,) and dtype np.float32 - denoting importance weight of each sampled transition - idxes: np.array - Array of shape (batch_size,) and dtype np.int32 - idexes in buffer of sampled experiences + :param batch_size: (int) How many transitions to sample. + :param beta: (float) To what degree to use importance weights (0 - no corrections, 1 - full correction) + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. + - weights: (numpy float) Array of shape (batch_size,) and dtype np.float32 denoting importance weight of + each sampled transition + - idxes: (numpy int) Array of shape (batch_size,) and dtype np.int32 idexes in buffer of sampled experiences """ assert beta > 0 @@ -166,19 +152,15 @@ def sample(self, batch_size, beta): return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): - """Update priorities of sampled transitions. + """ + Update priorities of sampled transitions. sets priority of transition at index idxes[i] in buffer to priorities[i]. - Parameters - ---------- - idxes: [int] - List of idxes of sampled transitions - priorities: [float] - List of updated priorities corresponding to - transitions at the sampled idxes denoted by - variable `idxes`. 
+ :param idxes: ([int]) List of idxes of sampled transitions + :param priorities: ([float]) List of updated priorities corresponding to transitions at the sampled idxes + denoted by variable `idxes`. """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): diff --git a/baselines/deepq/simple.py b/baselines/deepq/simple.py index 4bad145503..238662840d 100644 --- a/baselines/deepq/simple.py +++ b/baselines/deepq/simple.py @@ -6,182 +6,154 @@ import cloudpickle import numpy as np -import baselines.common.tf_util as U +from baselines import logger, deepq +from baselines.common import tf_util from baselines.common.tf_util import load_state, save_state -from baselines import logger from baselines.common.schedules import LinearSchedule -from baselines.common.input import observation_input - -from baselines import deepq from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from baselines.deepq.utils import ObservationInput class ActWrapper(object): - def __init__(self, act, act_params): + def __init__(self, act, act_params, sess=None): + """ + the actor wrapper for loading and saving + + :param act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) the actor function + :param act_params: (dict) {'make_obs_ph', 'q_func', 'num_actions'} + :param sess: (TensorFlow Session) the current session + """ self._act = act self._act_params = act_params + if sess is None: + self.sess = tf_util.make_session() + else: + self.sess = sess @staticmethod def load(path): - with open(path, "rb") as f: - model_data, act_params = cloudpickle.load(f) + """ + Load from a path an actor model + + :param path: (str) the save location + :return: (ActWrapper) a loaded actor model + """ + with open(path, "rb") as file_handler: + model_data, act_params = cloudpickle.load(file_handler) act = deepq.build_act(**act_params) - sess = tf.Session() - sess.__enter__() - with tempfile.TemporaryDirectory() as td: - arc_path = os.path.join(td, "packed.zip") - with open(arc_path, "wb") as f: - f.write(model_data) + sess = tf_util.make_session() + with tempfile.TemporaryDirectory() as temp_dir: + arc_path = os.path.join(temp_dir, "packed.zip") + with open(arc_path, "wb") as file_handler: + file_handler.write(model_data) - zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) - load_state(os.path.join(td, "model")) + zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(temp_dir) + load_state(os.path.join(temp_dir, "model"), sess) - return ActWrapper(act, act_params) + return ActWrapper(act, act_params, sess=sess) def __call__(self, *args, **kwargs): - return self._act(*args, **kwargs) + with self.sess.as_default(): + return self._act(*args, **kwargs) def save(self, path=None): - """Save model to a pickle located at `path`""" + """ + Save model to a pickle located at `path` + + :param path: (str) the save location + """ if path is None: path = os.path.join(logger.get_dir(), "model.pkl") - with tempfile.TemporaryDirectory() as td: - save_state(os.path.join(td, "model")) - arc_name = os.path.join(td, "packed.zip") + with tempfile.TemporaryDirectory() as temp_dir: + save_state(os.path.join(temp_dir, "model"), self.sess) + arc_name = os.path.join(temp_dir, "packed.zip") with zipfile.ZipFile(arc_name, 'w') as zipf: - for root, dirs, files in os.walk(td): + for root, _, files in os.walk(temp_dir): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: - zipf.write(file_path, os.path.relpath(file_path, td)) - with open(arc_name, 
"rb") as f: - model_data = f.read() - with open(path, "wb") as f: - cloudpickle.dump((model_data, self._act_params), f) + zipf.write(file_path, os.path.relpath(file_path, temp_dir)) + with open(arc_name, "rb") as file_handler: + model_data = file_handler.read() + with open(path, "wb") as file_handler: + cloudpickle.dump((model_data, self._act_params), file_handler) def load(path): - """Load act function that was returned by learn function. - - Parameters - ---------- - path: str - path to the act function pickle - - Returns - ------- - act: ActWrapper - function that takes a batch of observations - and returns actions. + """ + Load act function that was returned by learn function. + + :param path: (str) path to the act function pickle + + :return: (ActWrapper) function that takes a batch of observations and returns actions. """ return ActWrapper.load(path) -def learn(env, - q_func, - lr=5e-4, - max_timesteps=100000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - train_freq=1, - batch_size=32, - print_freq=100, - checkpoint_freq=10000, - checkpoint_path=None, - learning_starts=1000, - gamma=1.0, - target_network_update_freq=500, - prioritized_replay=False, - prioritized_replay_alpha=0.6, - prioritized_replay_beta0=0.4, - prioritized_replay_beta_iters=None, - prioritized_replay_eps=1e-6, - param_noise=False, - callback=None): - """Train a deepq model. - - Parameters - ------- - env: gym.Env - environment to train on - q_func: (tf.Variable, int, str, bool) -> tf.Variable +def learn(env, q_func, learning_rate=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, + exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, + checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, + prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, + prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): + """ + Train a deepq model. + + :param env: (Gym Environment) environment to train on + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope + - observation_in: (object) the output of observation placeholder + - num_actions: (int) number of actions + - scope: (str) + - reuse: (bool) should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - lr: float - learning rate for adam optimizer - max_timesteps: int - number of env steps to optimizer for - buffer_size: int - size of the replay buffer - exploration_fraction: float - fraction of entire training period over which the exploration rate is annealed - exploration_final_eps: float - final value of random action probability - train_freq: int - update the model every `train_freq` steps. - set to None to disable printing - batch_size: int - size of a batched sampled from replay buffer for training - print_freq: int - how often to print out training progress - set to None to disable printing - checkpoint_freq: int - how often to save the model. This is so that the best version is restored - at the end of the training. If you do not wish to restore the best version at - the end of the training set this variable to None. 
- learning_starts: int - how many steps of the model to collect transitions for before learning starts - gamma: float - discount factor - target_network_update_freq: int - update the target network every `target_network_update_freq` steps. - prioritized_replay: True - if True prioritized replay buffer will be used. - prioritized_replay_alpha: float - alpha parameter for prioritized replay buffer - prioritized_replay_beta0: float - initial value of beta for prioritized replay buffer - prioritized_replay_beta_iters: int - number of iterations over which beta will be annealed from initial value + :param learning_rate: (float) learning rate for adam optimizer + :param max_timesteps: (int) number of env steps to optimizer for + :param buffer_size: (int) size of the replay buffer + :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed + :param exploration_final_eps: (float) final value of random action probability + :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing + :param batch_size: (int) size of a batched sampled from replay buffer for training + :param print_freq: (int) how often to print out training progress set to None to disable printing + :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the end + of the training. If you do not wish to restore the best version at the end of the training set this variable + to None. + :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary directory. + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param gamma: (float) discount factor + :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. + :param prioritized_replay: (bool) if True prioritized replay buffer will be used. + :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer + :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer + :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. - prioritized_replay_eps: float - epsilon to add to the TD errors when updating priorities. - callback: (locals, globals) -> None - function called at every steps with state of the algorithm. - If callback returns true training stops. - - Returns - ------- - act: ActWrapper - Wrapper over act function. Adds ability to save it and load it. - See header of baselines/deepq/categorical.py for details on the act function. + :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. + :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. + :param callback: (function (dict, dict)) function called at every steps with state of the algorithm. + If callback returns true training stops. It takes the local and global variables. + :return: (ActWrapper) Wrapper over act function. Adds ability to save it and load it. See header of + baselines/deepq/categorical.py for details on the act function. 
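An illustrative call with the renamed learning_rate keyword, followed by the save/load round trip that ActWrapper now supports without manually entering a session. Environment, network size and hyperparameters below are arbitrary, and deepq.load is assumed to re-export simple.load as in upstream baselines:

    import gym
    from baselines import deepq

    env = gym.make("CartPole-v0")
    act = deepq.learn(
        env,
        q_func=deepq.models.mlp([64]),
        learning_rate=5e-4,              # formerly lr
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        prioritized_replay=True,
        print_freq=10,
    )
    act.save("cartpole_model.pkl")

    act = deepq.load("cartpole_model.pkl")   # ActWrapper with its own session
    obs, done = env.reset(), False
    while not done:
        obs, _, done, _ = env.step(act(obs[None])[0])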
""" # Create all the functions necessary to train the model - sess = tf.Session() - sess.__enter__() - # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph + observation_space_shape = env.observation_space def make_obs_ph(name): - return ObservationInput(env.observation_space, name=name) + """ + makes the observation placeholder + + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return ObservationInput(observation_space_shape, name=name) - act, train, update_target, debug = deepq.build_train( + act, train, update_target, _ = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, - optimizer=tf.train.AdamOptimizer(learning_rate=lr), + optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise @@ -212,32 +184,32 @@ def make_obs_ph(name): final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. - U.initialize() - update_target() + tf_util.initialize(act.sess) + update_target(sess=act.sess) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True - with tempfile.TemporaryDirectory() as td: - td = checkpoint_path or td + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir = checkpoint_path or temp_dir - model_file = os.path.join(td, "model") + model_file = os.path.join(temp_dir, "model") model_saved = False - if tf.train.latest_checkpoint(td) is not None: - load_state(model_file) + if tf.train.latest_checkpoint(temp_dir) is not None: + load_state(model_file, act.sess) logger.log('Loaded model from {}'.format(model_file)) model_saved = True - for t in range(max_timesteps): + for step in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: - update_eps = exploration.value(t) + update_eps = exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. @@ -245,7 +217,8 @@ def make_obs_ph(name): # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. - update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) + update_param_noise_threshold = -np.log(1. - exploration.value(step) + + exploration.value(step) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True @@ -263,44 +236,48 @@ def make_obs_ph(name): episode_rewards.append(0.0) reset = True - if t > learning_starts and t % train_freq == 0: + if step > learning_starts and step % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: - experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) + experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights, sess=act.sess) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) - if t > learning_starts and t % target_network_update_freq == 0: + if step > learning_starts and step % target_network_update_freq == 0: # Update target network periodically. - update_target() + update_target(sess=act.sess) + + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) - mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: - logger.record_tabular("steps", t) + logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) - logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) + logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular() - if (checkpoint_freq is not None and t > learning_starts and - num_episodes > 100 and t % checkpoint_freq == 0): + if (checkpoint_freq is not None and step > learning_starts and + num_episodes > 100 and step % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) - save_state(model_file) + save_state(model_file, act.sess) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) - load_state(model_file) + load_state(model_file, act.sess) return act diff --git a/baselines/deepq/test_identity.py b/baselines/deepq/test_identity.py index ef57e70b45..f0885d6e3a 100644 --- a/baselines/deepq/test_identity.py +++ b/baselines/deepq/test_identity.py @@ -1,42 +1,45 @@ -import tensorflow as tf import random +import tensorflow as tf + from baselines import deepq from baselines.common.identity_env import IdentityEnv def test_identity(): - - with tf.Graph().as_default(): - env = IdentityEnv(10) - random.seed(0) - - tf.set_random_seed(0) - - param_noise = False - model = deepq.models.mlp([32]) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=10000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - param_noise=param_noise, - ) - - tf.set_random_seed(0) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(act([obs])) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS + """ + test identity function for DeepQ + """ + env = IdentityEnv(10) + random.seed(0) + + tf.set_random_seed(0) + + param_noise = False + model = deepq.models.mlp([32]) + + act = deepq.learn( + env, + 
q_func=model, + learning_rate=1e-3, + max_timesteps=10000, + buffer_size=50000, + exploration_fraction=0.1, + exploration_final_eps=0.02, + print_freq=10, + param_noise=param_noise, + ) + + tf.set_random_seed(0) + + n_trials = 1000 + sum_rew = 0 + obs = env.reset() + for _ in range(n_trials): + obs, rew, _, _ = env.step(act([obs])) + sum_rew += rew + + assert sum_rew > 0.9 * n_trials if __name__ == '__main__': diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 90b932e74a..c00b68801f 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -1,7 +1,7 @@ -from baselines.common.input import observation_input - import tensorflow as tf +from baselines.common.input import observation_input + # ================================================================ # Placeholders # ================================================================ @@ -9,26 +9,40 @@ class TfInput(object): def __init__(self, name="(unnamed)"): - """Generalized Tensorflow placeholder. The main differences are: + """ + Generalized Tensorflow placeholder. The main differences are: - possibly uses multiple placeholders internally and returns multiple values - can apply light postprocessing to the value feed to placeholder. + + :param name: (str) the input name """ self.name = name def get(self): - """Return the tf variable(s) representing the possibly postprocessed value + """ + Return the tf variable(s) representing the possibly postprocessed value of placeholder(s). + + :return: (TensorFlow Tensor) the placeholder + """ + raise NotImplementedError + + def make_feed_dict(self, data): """ - raise NotImplemented() + Given data input it to the placeholder(s). - def make_feed_dict(data): - """Given data input it to the placeholder(s).""" - raise NotImplemented() + :return: (dict) the given data input + """ + raise NotImplementedError class PlaceholderTfInput(TfInput): def __init__(self, placeholder): - """Wrapper for regular tensorflow placeholder.""" + """ + Wrapper for regular tensorflow placeholder. + + :param placeholder: (TensorFlow Tensor) + """ super().__init__(placeholder.name) self._placeholder = placeholder @@ -41,17 +55,14 @@ def make_feed_dict(self, data): class Uint8Input(PlaceholderTfInput): def __init__(self, shape, name=None): - """Takes input in uint8 format which is cast to float32 and divided by 255 + """ + Takes input in uint8 format which is cast to float32 and divided by 255 before passing it to the model. On GPU this ensures lower data transfer times. - Parameters - ---------- - shape: [int] - shape of the tensor. - name: str - name of the underlying placeholder + :param shape: ([int]) shape of the tensor. + :param name: (str) name of the underlying placeholder """ super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) @@ -64,20 +75,15 @@ def get(self): class ObservationInput(PlaceholderTfInput): def __init__(self, observation_space, name=None): - """Creates an input placeholder tailored to a specific observation space - - Parameters - ---------- - - observation_space: - observation space of the environment. Should be one of the gym.spaces types - name: str - tensorflow name of the underlying placeholder + """ + Creates an input placeholder tailored to a specific observation space + + :param observation_space: (Gym Space) observation space of the environment. 
Should be one of the gym.spaces + types + :param name: (str) tensorflow name of the underlying placeholder """ inpt, self.processed_inpt = observation_input(observation_space, name=name) super().__init__(inpt) def get(self): return self.processed_inpt - - diff --git a/baselines/gail/adversary.py b/baselines/gail/adversary.py index 18df69ccca..c52da864aa 100644 --- a/baselines/gail/adversary.py +++ b/baselines/gail/adversary.py @@ -1,28 +1,50 @@ -''' +""" Reference: https://github.com/openai/imitation I follow the architecture from the official repository -''' +""" import tensorflow as tf import numpy as np from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common import tf_util as U +from baselines.common import tf_util as tf_util + + +def logsigmoid(input_tensor): + """ + Equivalent to tf.log(tf.sigmoid(a)) + + :param input_tensor: (TensorFlow Tensor) + :return: (TensorFlow Tensor) + """ + return -tf.nn.softplus(-input_tensor) -def logsigmoid(a): - '''Equivalent to tf.log(tf.sigmoid(a))''' - return -tf.nn.softplus(-a) -""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51""" def logit_bernoulli_entropy(logits): - ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits) + """ + Reference: + https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 + + :param logits: (TensorFlow Tensor) the logits + :return: (TensorFlow Tensor) the bernoulli entropy + """ + ent = (1. - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits) return ent + class TransitionClassifier(object): - def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): + def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"): + """ + reward regression from observations and transitions + + :param env: (Gym Environment) + :param hidden_size: ([int]) the hidden dimension for the MLP + :param entcoeff: (float) the entropy loss weight + :param scope: (str) tensorflow variable scope + """ self.scope = scope self.observation_shape = env.observation_space.shape self.actions_shape = env.action_space.shape - self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)]) + self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)]) self.num_actions = env.action_space.shape[0] self.hidden_size = hidden_size self.build_ph() @@ -35,31 +57,48 @@ def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="advers # Build regression loss # let x = logits, z = targets. 
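A quick NumPy sanity check of the two helpers documented above, namely logsigmoid(x) = -softplus(-x) = log(sigmoid(x)) and logit_bernoulli_entropy(x) matching the entropy of a Bernoulli with parameter sigmoid(x). This is illustrative only and not part of the patch:

    import numpy as np

    x = np.array([-2.0, 0.0, 3.0])                  # arbitrary logits
    sig = 1.0 / (1.0 + np.exp(-x))
    log_sig = -np.logaddexp(0.0, -x)                # logsigmoid(x) = -softplus(-x)
    assert np.allclose(log_sig, np.log(sig))
    ent = (1.0 - sig) * x - log_sig                 # logit_bernoulli_entropy(x)
    assert np.allclose(ent, -(sig * np.log(sig) + (1.0 - sig) * np.log(1.0 - sig)))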
# z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) - generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits)) + generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, + labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) - entropy_loss = -entcoeff*entropy + entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc] self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy - self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8) + self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() - self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], - self.losses + [U.flatgrad(self.total_loss, var_list)]) + self.lossandgrad = tf_util.function( + [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], + self.losses + [tf_util.flatgrad(self.total_loss, var_list)]) def build_ph(self): - self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph") - self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph") - self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph") - self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph") + """ + build placeholder + """ + self.generator_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="observations_ph") + self.generator_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="actions_ph") + self.expert_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="expert_observations_ph") + self.expert_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="expert_actions_ph") def build_graph(self, obs_ph, acs_ph, reuse=False): + """ + build the graph + + :param obs_ph: (TensorFlow Tensor) the observation placeholder + :param acs_ph: (TensorFlow Tensor) the action placeholder + :param reuse: (bool) + :return: (TensorFlow Tensor) the graph output + """ with tf.variable_scope(self.scope): if reuse: tf.get_variable_scope().reuse_variables() @@ -74,14 +113,26 @@ def build_graph(self, obs_ph, acs_ph, reuse=False): return logits def get_trainable_variables(self): + """ + get all the trainable variables from the graph + + :return: ([TensorFlow Tensor]) the variables + """ return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_reward(self, obs, acs): + def get_reward(self, obs, actions): + """ + get the reward using the observation and action + + :param obs: (TensorFlow Tensor or numpy Number) the observation + :param actions: (TensorFlow Tensor or numpy Number) the action + :return: (numpy Number) the reward + """ sess = tf.get_default_session() if len(obs.shape) == 1: obs = 
np.expand_dims(obs, 0) - if len(acs.shape) == 1: - acs = np.expand_dims(acs, 0) - feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs} + if len(actions.shape) == 1: + actions = np.expand_dims(actions, 0) + feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: actions} reward = sess.run(self.reward_op, feed_dict) return reward diff --git a/baselines/gail/behavior_clone.py b/baselines/gail/behavior_clone.py index 82f65ecf19..daef9850d1 100644 --- a/baselines/gail/behavior_clone.py +++ b/baselines/gail/behavior_clone.py @@ -1,27 +1,30 @@ -''' +""" The code is used to train BC imitator, or pretrained GAIL imitator -''' - +""" +import os import argparse import tempfile -import os.path as osp -import gym import logging -from tqdm import tqdm +from tqdm import tqdm +import gym import tensorflow as tf from baselines.gail import mlp_policy -from baselines import bench -from baselines import logger -from baselines.common import set_global_seeds, tf_util as U +from baselines import logger, bench +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag from baselines.common.mpi_adam import MpiAdam from baselines.gail.run_mujoco import runner -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines.gail.dataset.mujocodset import MujocoDset def argsparser(): + """ + make a behavior cloning argument parser + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning") parser.add_argument('--env_id', help='environment ID', default='Hopper-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -33,37 +36,50 @@ def argsparser(): # Network Configuration (Using MLP Policy) parser.add_argument('--policy_hidden_size', type=int, default=100) # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5) return parser.parse_args() -def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, - adam_epsilon=1e-5, optim_stepsize=3e-4, - ckpt_dir=None, log_dir=None, task_name=None, - verbose=False): +def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, + ckpt_dir=None, task_name=None, verbose=False): + """ + Learn a behavior clone policy, and return the save location + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy + :param dataset: (Dset or MujocoDset) the dataset manager + :param optim_batch_size: (int) the batch size + :param max_iters: (int) the maximum number of iterations + :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param optim_stepsize: (float) the optimizer stepsize + :param ckpt_dir: (str) the save directory, can be None for temporary directory + :param task_name: (str) the save name, can be None for saving directly to the directory name + :param verbose: (bool) + :return: (str) the save location for the TensorFlow model + """ val_per_iter = int(max_iters/10) ob_space = 
env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy + policy = policy_func("pi", ob_space, ac_space) # Construct network for new policy # placeholder - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - stochastic = U.get_placeholder_cached(name="stochastic") - loss = tf.reduce_mean(tf.square(ac-pi.ac)) - var_list = pi.get_trainable_variables() + obs_ph = policy.obs_ph + action_ph = policy.pdtype.sample_placeholder([None]) + stochastic_ph = policy.stochastic_ph + loss = tf.reduce_mean(tf.square(action_ph - policy.ac)) + var_list = policy.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) - lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)]) + lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph], [loss] + [tf_util.flatgrad(loss, var_list)]) - U.initialize() + tf_util.initialize() adam.sync() logger.log("Pretraining with Behavior Cloning...") for iter_so_far in tqdm(range(int(max_iters))): ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') - train_loss, g = lossandgrad(ob_expert, ac_expert, True) - adam.update(g, optim_stepsize) + train_loss, grad = lossandgrad(ob_expert, ac_expert, True) + adam.update(grad, optim_stepsize) if verbose and iter_so_far % val_per_iter == 0: ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') val_loss, _ = lossandgrad(ob_expert, ac_expert, True) @@ -72,12 +88,18 @@ def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, if ckpt_dir is None: savedir_fname = tempfile.TemporaryDirectory().name else: - savedir_fname = osp.join(ckpt_dir, task_name) - U.save_state(savedir_fname, var_list=pi.get_variables()) + savedir_fname = os.path.join(ckpt_dir, task_name) + tf_util.save_state(savedir_fname, var_list=policy.get_variables()) return savedir_fname def get_task_name(args): + """ + Get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ task_name = 'BC' task_name += '.{}'.format(args.env_id.split("-")[0]) task_name += '.traj_limitation_{}'.format(args.traj_limitation) @@ -86,37 +108,36 @@ def get_task_name(args): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - savedir_fname = learn(env, - policy_fn, - dataset, - max_iters=args.BC_max_iter, - ckpt_dir=args.checkpoint_dir, - log_dir=args.log_dir, - task_name=task_name, - verbose=True) - avg_len, avg_ret = runner(env, - policy_fn, - savedir_fname, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample, - reuse=True) + """ + start training the model + + :param args: (ArgumentParser) the training argument + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + env = gym.make(args.env_id) + + def policy_fn(name, 
ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, + reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), "monitor.json")) + env.seed(args.seed) + gym.logger.setLevel(logging.WARN) + task_name = get_task_name(args) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) + args.log_dir = os.path.join(args.log_dir, task_name) + dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) + savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, + task_name=task_name, verbose=True) + runner(env, + policy_fn, + savedir_fname, + timesteps_per_batch=1024, + number_trajs=10, + stochastic_policy=args.stochastic_policy, + save=args.save_sample, + reuse=True) if __name__ == '__main__': diff --git a/baselines/gail/dataset/mujoco_dset.py b/baselines/gail/dataset/mujocodset.py similarity index 71% rename from baselines/gail/dataset/mujoco_dset.py rename to baselines/gail/dataset/mujocodset.py index 0693262270..2c1ac60fe5 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/baselines/gail/dataset/mujocodset.py @@ -1,16 +1,25 @@ -''' +""" Data structure of the input .npz: the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' the values of each item is a list storing the expert trajectory sequentially a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] -''' +""" -from baselines import logger import numpy as np +import matplotlib.pyplot as plt + +from baselines import logger class Dset(object): def __init__(self, inputs, labels, randomize): + """ + Dataset object + + :param inputs: (numpy Number) the input values + :param labels: (numpy Number) the target values + :param randomize: (bool) if the dataset should be shuffled + """ self.inputs = inputs self.labels = labels assert len(self.inputs) == len(self.labels) @@ -19,6 +28,9 @@ def __init__(self, inputs, labels, randomize): self.init_pointer() def init_pointer(self): + """ + initialize the pointer and shuffle the dataset, if randomize the dataset + """ self.pointer = 0 if self.randomize: idx = np.arange(self.num_pairs) @@ -27,6 +39,12 @@ def init_pointer(self): self.labels = self.labels[idx, :] def get_next_batch(self, batch_size): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :return: (numpy Number, numpy Number) inputs and labels + """ # if batch_size is negative -> return all if batch_size < 0: return self.inputs, self.labels @@ -39,8 +57,16 @@ def get_next_batch(self, batch_size): return inputs, labels -class Mujoco_Dset(object): +class MujocoDset(object): def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): + """ + Dataset for mujoco + + :param expert_path: (str) the path to trajectory data + :param train_fraction: (float) the train val split (0 to 1) + :param traj_limitation: (int) the dims to load (if -1, load all) + :param randomize: (bool) if the dataset should be shuffled + """ traj_data = np.load(expert_path) if traj_limitation < 0: traj_limitation = len(traj_data['obs']) @@ -73,12 +99,22 @@ def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomiz self.log_info() def log_info(self): + """ + log the information of the dataset + """ logger.log("Total trajectorues: %d" % 
self.num_traj) logger.log("Total transitions: %d" % self.num_transition) logger.log("Average returns: %f" % self.avg_ret) logger.log("Std for returns: %f" % self.std_ret) def get_next_batch(self, batch_size, split=None): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :param split: (str) the type of data split (can be None, 'train', 'val') + :return: (numpy Number, numpy Number) inputs and labels + """ if split is None: return self.dset.get_next_batch(batch_size) elif split == 'train': @@ -89,17 +125,27 @@ def get_next_batch(self, batch_size, split=None): raise NotImplementedError def plot(self): - import matplotlib.pyplot as plt + """ + show and save (to 'histogram_rets.png') a histogram plotting of the episode returns + """ plt.hist(self.rets) plt.savefig("histogram_rets.png") plt.close() def test(expert_path, traj_limitation, plot): - dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation) + """ + test mujoco dataset object + + :param expert_path: (str) the path to trajectory data + :param traj_limitation: (int) the dims to load (if -1, load all) + :param plot: (bool) enable plotting + """ + dset = MujocoDset(expert_path, traj_limitation=traj_limitation) if plot: dset.plot() + if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() diff --git a/baselines/gail/gail-eval.py b/baselines/gail/gail_eval.py similarity index 66% rename from baselines/gail/gail-eval.py rename to baselines/gail/gail_eval.py index 1148cb309c..78157659f4 100644 --- a/baselines/gail/gail-eval.py +++ b/baselines/gail/gail_eval.py @@ -1,22 +1,21 @@ -''' +""" This code is used to evalaute the imitators trained with different number of trajectories and plot the results in the same figure for easy comparison. 
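For reference, loading expert data through the renamed class and drawing a training batch looks like the sketch below; the .npz path follows the 'deterministic.trpo.<env>.0.00.npz' convention used by gail_eval, so adjust it to wherever your expert trajectories actually live:

    from baselines.gail.dataset.mujocodset import MujocoDset

    dset = MujocoDset(expert_path='data/deterministic.trpo.Hopper.0.00.npz', traj_limitation=10)
    obs_batch, act_batch = dset.get_next_batch(128, split='train')  # 70/30 train/val split by default
    dset.plot()                                                     # writes histogram_rets.png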
-''' +""" import argparse import os import glob -import gym +import gym import matplotlib.pyplot as plt import numpy as np import tensorflow as tf -from baselines.gail import run_mujoco -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U +from baselines.gail import run_mujoco, mlp_policy +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines.gail.dataset.mujocodset import MujocoDset plt.style.use('ggplot') @@ -26,30 +25,52 @@ def load_dataset(expert_path): - dataset = Mujoco_Dset(expert_path=expert_path) + """ + load mujoco dataset + + :param expert_path: (str) the path to trajectory data + :return: (MujocoDset) the dataset manager object + """ + dataset = MujocoDset(expert_path=expert_path) return dataset def argsparser(): + """ + make a argument parser for evaluation of gail + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser('Do evaluation') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--policy_hidden_size', type=int, default=100) parser.add_argument('--env', type=str, choices=['Hopper', 'Walker2d', 'HalfCheetah', 'Humanoid', 'HumanoidStandup']) - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') return parser.parse_args() def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix): - - def get_checkpoint_dir(checkpoint_list, limit, prefix): + """ + Evaluate an environment + + :param env_name: (str) the environment name + :param seed: (int) the initial random seed + :param policy_hidden_size: (int) the number of hidden neurons in the 4 layer MLP + :param stochastic: (bool) use a stochastic policy + :param reuse: (bool) allow reuse of the graph + :param prefix: (str) the checkpoint prefix for the type ('BC' or 'gail') + :return: (dict) the logging information of the evaluation + """ + + def _get_checkpoint_dir(checkpoint_list, limit, prefix): for checkpoint in checkpoint_list: if ('limitation_'+str(limit) in checkpoint) and (prefix in checkpoint): return checkpoint return None - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, + def _policy_fn(name, ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2) data_path = os.path.join('data', 'deterministic.trpo.' 
+ env_name + '.0.00.npz') @@ -65,13 +86,13 @@ def policy_fn(name, ob_space, ac_space, reuse=False): for i, limit in enumerate(CONFIG['traj_limitation']): # Do one evaluation upper_bound = sum(dataset.rets[:limit])/limit - checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) + checkpoint_dir = _get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) env = gym.make(env_name + '-v1') env.seed(seed) print('Trajectory limitation: {}, Load checkpoint: {}, '.format(limit, checkpoint_path)) avg_len, avg_ret = run_mujoco.runner(env, - policy_fn, + _policy_fn, checkpoint_path, timesteps_per_batch=1024, number_trajs=10, @@ -90,6 +111,14 @@ def policy_fn(name, ob_space, ac_space, reuse=False): def plot(env_name, bc_log, gail_log, stochastic): + """ + plot and display all the evalutation results + + :param env_name: (str) the environment name + :param bc_log: (dict) the behavior_clone log + :param gail_log: (dict) the gail log + :param stochastic: (bool) use a stochastic policy + """ upper_bound = bc_log['upper_bound'] bc_avg_ret = bc_log['avg_ret'] gail_avg_ret = gail_log['avg_ret'] @@ -128,18 +157,23 @@ def plot(env_name, bc_log, gail_log, stochastic): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - print('Evaluating {}'.format(args.env)) - bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, False, 'BC') - print('Evaluation for {}'.format(args.env)) - print(bc_log) - gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, True, 'gail') - print('Evaluation for {}'.format(args.env)) - print(gail_log) - plot(args.env, bc_log, gail_log, args.stochastic_policy) + """ + evaluate and plot Behavior clone and gail + + :param args: (ArgumentParser) the arguments for training and evaluating + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + print('Evaluating {}'.format(args.env)) + bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, + args.stochastic_policy, False, 'BC') + print('Evaluation for {}'.format(args.env)) + print(bc_log) + gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, + args.stochastic_policy, True, 'gail') + print('Evaluation for {}'.format(args.env)) + print(gail_log) + plot(args.env, bc_log, gail_log, args.stochastic_policy) if __name__ == '__main__': diff --git a/baselines/gail/mlp_policy.py b/baselines/gail/mlp_policy.py index d8df120719..347045a9c9 100644 --- a/baselines/gail/mlp_policy.py +++ b/baselines/gail/mlp_policy.py @@ -1,21 +1,36 @@ -''' +""" from baselines/ppo1/mlp_policy.py and add simple modification (1) add reuse argument (2) cache the `stochastic` placeholder -''' -import tensorflow as tf +""" import gym +import tensorflow as tf -import baselines.common.tf_util as U -from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common.distributions import make_pdtype +import baselines.common.tf_util as tf_util from baselines.acktr.utils import dense +from baselines.common.mpi_running_mean_std import RunningMeanStd +from baselines.ppo1.mlp_policy import BasePolicy -class MlpPolicy(object): +class MlpPolicy(BasePolicy): recurrent = False - def __init__(self, name, reuse=False, *args, **kwargs): + def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): + """ + MLP policy for Gail + + :param name: (str) the variable scope name + :param ob_space: (Gym Space) The 
observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. + :param reuse: (bool) allow resue of the graph + :param placeholders: (dict) To feed existing placeholders if needed + :param gaussian_fixed_var: (bool) fix the gaussian variance + """ + super(MlpPolicy, self).__init__(placeholders=placeholders) + self.sess = sess with tf.variable_scope(name): if reuse: tf.get_variable_scope().reuse_variables() @@ -23,53 +38,39 @@ def __init__(self, name, reuse=False, *args, **kwargs): self.scope = tf.get_variable_scope().name def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) + obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) - self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] + last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), + weight_init=tf_util.normc_initializer(1.0))) + self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) + last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), + weight_init=tf_util.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) + mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01)) + logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], + initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: - pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) + pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(pdparam) + self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) self.state_in = [] self.state_out = [] # change for BC - stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self.ac = ac - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
self.scope) - - def get_initial_state(self): - return [] + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") + action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) + self.action = action + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/gail/run_mujoco.py b/baselines/gail/run_mujoco.py index 379f7f8cb8..8378565553 100644 --- a/baselines/gail/run_mujoco.py +++ b/baselines/gail/run_mujoco.py @@ -1,26 +1,30 @@ -''' +""" Disclaimer: this code is highly based on trpo_mpi at @openai/baselines and @openai/imitation -''' +""" import argparse -import os.path as osp +import os import logging + from mpi4py import MPI from tqdm import tqdm - import numpy as np import gym -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U +from baselines.gail import mlp_policy, behavior_clone, trpo_mpi +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag -from baselines import bench -from baselines import logger -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines import bench, logger +from baselines.gail.dataset.mujocodset import MujocoDset from baselines.gail.adversary import TransitionClassifier def argsparser(): + """ + get an argument parser for training mujoco on gail + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL") parser.add_argument('--env_id', help='environment ID', default='Hopper-v2') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -31,8 +35,8 @@ def argsparser(): # Task parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train') # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') # Mujoco Dataset Configuration parser.add_argument('--traj_limitation', type=int, default=-1) # Optimization Configuration @@ -50,12 +54,18 @@ def argsparser(): parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100) parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6) # Behavior Cloning - boolean_flag(parser, 'pretrained', default=False, help='Use BC to pretrain') - parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e4) + boolean_flag(parser, 'pretrained', default=False, help_msg='Use BC to pretrain') + parser.add_argument('--bc_max_iter', help='Max iteration for training BC', type=int, default=1e4) return parser.parse_args() def get_task_name(args): + """ + get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ task_name = args.algo + "_gail." if args.pretrained: task_name += "with_pretrained." 
@@ -69,68 +79,74 @@ def get_task_name(args): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - - if args.task == 'train': - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) - train(env, - args.seed, - policy_fn, - reward_giver, - dataset, - args.algo, - args.g_step, - args.d_step, - args.policy_entcoeff, - args.num_timesteps, - args.save_per_iter, - args.checkpoint_dir, - args.log_dir, - args.pretrained, - args.BC_max_iter, - task_name - ) - elif args.task == 'evaluate': - runner(env, - policy_fn, - args.load_model_path, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample - ) - else: - raise NotImplementedError - env.close() + """ + start training the model + :param args: (ArgumentParser) the training argument + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + env = gym.make(args.env_id) -def train(env, seed, policy_fn, reward_giver, dataset, algo, - g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter, - checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None): + def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess, + hid_size=args.policy_hidden_size, num_hid_layers=2, placeholders=placeholders) + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), "monitor.json")) + env.seed(args.seed) + gym.logger.setLevel(logging.WARN) + task_name = get_task_name(args) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) + args.log_dir = os.path.join(args.log_dir, task_name) + + if args.task == 'train': + dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) + reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) + train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step, + args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.pretrained, + args.bc_max_iter, task_name) + elif args.task == 'evaluate': + runner(env, + policy_fn, + args.load_model_path, + timesteps_per_batch=1024, + number_trajs=10, + stochastic_policy=args.stochastic_policy, + save=args.save_sample + ) + else: + raise NotImplementedError + env.close() + + +def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps, + save_per_iter, checkpoint_dir, pretrained, bc_max_iter, task_name=None): + """ + train gail on mujoco + + :param env: (Gym Environment) the environment + :param seed: (int) the initial random seed + :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param reward_giver: (TransitionClassifier) the 
reward predicter from obsevation and action + :param dataset: (MujocoDset) the dataset manager + :param algo: (str) the algorithm type (only 'trpo' is supported) + :param g_step: (int) number of steps to train policy in each epoch + :param d_step: (int) number of steps to train discriminator in each epoch + :param policy_entcoeff: (float) the weight of the entropy loss for the policy + :param num_timesteps: (int) the number of timesteps to run + :param save_per_iter: (int) the number of iterations before saving + :param checkpoint_dir: (str) the location for saving checkpoints + :param pretrained: (bool) use a pretrained behavior clone + :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone + :param task_name: (str) the name of the task (can be None) + """ pretrained_weight = None - if pretrained and (BC_max_iter > 0): + if pretrained and (bc_max_iter > 0): # Pretrain with behavior cloning - from baselines.gail import behavior_clone - pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, - max_iters=BC_max_iter) + pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter) if algo == 'trpo': - from baselines.gail import trpo_mpi # Set up for MPI seed rank = MPI.COMM_WORLD.Get_rank() if rank != 0: @@ -138,41 +154,47 @@ def train(env, seed, policy_fn, reward_giver, dataset, algo, workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env.seed(workerseed) - trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank, - pretrained=pretrained, pretrained_weight=pretrained_weight, - g_step=g_step, d_step=d_step, - entcoeff=policy_entcoeff, - max_timesteps=num_timesteps, - ckpt_dir=checkpoint_dir, log_dir=log_dir, - save_per_iter=save_per_iter, - timesteps_per_batch=1024, - max_kl=0.01, cg_iters=10, cg_damping=0.1, - gamma=0.995, lam=0.97, - vf_iters=5, vf_stepsize=1e-3, - task_name=task_name) + trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, gamma=0.995, lam=0.97, + entcoeff=policy_entcoeff, cg_damping=0.1, vf_stepsize=1e-3, vf_iters=5, + max_timesteps=num_timesteps, pretrained_weight=pretrained_weight, reward_giver=reward_giver, + expert_dataset=dataset, rank=rank, save_per_iter=save_per_iter, ckpt_dir=checkpoint_dir, + g_step=g_step, d_step=d_step, task_name=task_name) else: raise NotImplementedError def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, stochastic_policy, save=False, reuse=False): + """ + run the training for all the trajectories + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param load_model_path: (str) the path to the model + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param number_trajs: (int) the number of trajectories to run + :param stochastic_policy: (bool) use a stochastic policy + :param save: (bool) save the policy + :param reuse: (bool) allow reuse of the graph + :return: (float, float) average trajectory lenght, average trajectory reward + """ # Setup network # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=reuse) - U.initialize() + policy = policy_func("pi", ob_space, ac_space, reuse=reuse) + tf_util.initialize() # Prepare for rollouts # ---------------------------------------- - U.load_state(load_model_path) + tf_util.load_state(load_model_path) 
obs_list = [] acs_list = [] len_list = [] ret_list = [] for _ in tqdm(range(number_trajs)): - traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) + traj = traj_1_generator(policy, env, timesteps_per_batch, stochastic=stochastic_policy) obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret'] obs_list.append(obs) acs_list.append(acs) @@ -193,43 +215,51 @@ def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, return avg_len, avg_ret -# Sample one trajectory (until trajectory end) -def traj_1_generator(pi, env, horizon, stochastic): +def traj_1_generator(policy, env, horizon, stochastic): + """ + Sample one trajectory (until trajectory end) + + :param policy: (MLPPolicy) the policy + :param env: (Gym Environment) the environment + :param horizon: (int) the search horizon + :param stochastic: (bool) use a stochastic policy + :return: (dict) the trajectory + """ - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype + step = 0 + env.action_space.sample() # not used, just so we have the datatype new = True # marks if we're on first timestep of an episode - ob = env.reset() + observation = env.reset() cur_ep_ret = 0 # return in current episode cur_ep_len = 0 # len of current episode # Initialize history arrays - obs = [] - rews = [] + observations = [] + rewards = [] news = [] - acs = [] + actions = [] while True: - ac, vpred = pi.act(stochastic, ob) - obs.append(ob) + acttion, _ = policy.act(stochastic, observation) + observations.append(observation) news.append(new) - acs.append(ac) + actions.append(acttion) - ob, rew, new, _ = env.step(ac) - rews.append(rew) + observation, reward, new, _ = env.step(acttion) + rewards.append(reward) - cur_ep_ret += rew + cur_ep_ret += reward cur_ep_len += 1 - if new or t >= horizon: + if new or step >= horizon: break - t += 1 + step += 1 - obs = np.array(obs) - rews = np.array(rews) + observations = np.array(observations) + rewards = np.array(rewards) news = np.array(news) - acs = np.array(acs) - traj = {"ob": obs, "rew": rews, "new": news, "ac": acs, + actions = np.array(actions) + traj = {"ob": observations, "rew": rewards, "new": news, "ac": actions, "ep_ret": cur_ep_ret, "ep_len": cur_ep_len} return traj diff --git a/baselines/gail/statistics.py b/baselines/gail/statistics.py index 5f7c57e449..96c4f96263 100644 --- a/baselines/gail/statistics.py +++ b/baselines/gail/statistics.py @@ -1,16 +1,26 @@ -''' +""" This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py -''' +""" import tensorflow as tf import numpy as np -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util -class stats(): +class Stats: - def __init__(self, scalar_keys=[], histogram_keys=[]): + def __init__(self, scalar_keys=None, histogram_keys=None): + """ + initialize the placeholders from the input keys, for summary logging + + :param scalar_keys: ([str]) the name of all the scalar inputs + :param histogram_keys: ([str]) the name of all the histogram inputs + """ + if scalar_keys is None: + scalar_keys = [] + if histogram_keys is None: + histogram_keys = [] self.scalar_keys = scalar_keys self.histogram_keys = histogram_keys self.scalar_summaries = [] @@ -18,28 +28,34 @@ def __init__(self, scalar_keys=[], histogram_keys=[]): self.histogram_summaries_ph = [] self.histogram_summaries = [] with tf.variable_scope('summary'): - for k in scalar_keys: - ph = tf.placeholder('float32', None, 
name=k+'.scalar.summary') - sm = tf.summary.scalar(k+'.scalar.summary', ph) - self.scalar_summaries_ph.append(ph) - self.scalar_summaries.append(sm) - for k in histogram_keys: - ph = tf.placeholder('float32', None, name=k+'.histogram.summary') - sm = tf.summary.scalar(k+'.histogram.summary', ph) - self.histogram_summaries_ph.append(ph) - self.histogram_summaries.append(sm) - - self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) - - def add_all_summary(self, writer, values, iter): - # Note that the order of the incoming ```values``` should be the same as the that of the - # ```scalar_keys``` given in ```__init__``` - if np.sum(np.isnan(values)+0) != 0: + for key in scalar_keys: + place_holder = tf.placeholder('float32', None, name=key + '.scalar.summary') + string_summary = tf.summary.scalar(key + '.scalar.summary', place_holder) + self.scalar_summaries_ph.append(place_holder) + self.scalar_summaries.append(string_summary) + for key in histogram_keys: + place_holder = tf.placeholder('float32', None, name=key + '.histogram.summary') + string_summary = tf.summary.scalar(key + '.histogram.summary', place_holder) + self.histogram_summaries_ph.append(place_holder) + self.histogram_summaries.append(string_summary) + + self.summaries = tf.summary.merge(self.scalar_summaries + self.histogram_summaries) + + def add_all_summary(self, writer, values, _iter): + """ + Note that the order of the incoming ```values``` should be the same as the that of the + ```scalar_keys``` given in ```__init__``` + + :param writer: (TensorFlow FileWriter) the writer + :param values: (TensorFlow Tensor or numpy Number) the input for the summary run + :param _iter: (Number) the global step value + """ + if np.sum(np.isnan(values) + 0) != 0: return - sess = U.get_session() + sess = tf_util.get_session() keys = self.scalar_summaries_ph + self.histogram_summaries_ph feed_dict = {} - for k, v in zip(keys, values): - feed_dict.update({k: v}) + for key, value in zip(keys, values): + feed_dict.update({key: value}) summaries_str = sess.run(self.summaries, feed_dict) - writer.add_summary(summaries_str, iter) + writer.add_summary(summaries_str, _iter) diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py index 615a4326a7..2446db00a6 100644 --- a/baselines/gail/trpo_mpi.py +++ b/baselines/gail/trpo_mpi.py @@ -1,142 +1,202 @@ -''' -Disclaimer: The trpo part highly rely on trpo_mpi at @openai/baselines -''' - import time import os from contextlib import contextmanager -from mpi4py import MPI from collections import deque +from mpi4py import MPI import tensorflow as tf import numpy as np -import baselines.common.tf_util as U -from baselines.common import explained_variance, zipsame, dataset, fmt_row +import baselines.common.tf_util as tf_util +from baselines.common import explained_variance, zipsame, dataset, fmt_row, colorize from baselines import logger -from baselines.common import colorize from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from baselines.gail.statistics import stats - - -def traj_segment_generator(pi, env, reward_giver, horizon, stochastic): +from baselines.common.cg import conjugate_gradient + + +# from baselines.gail.statistics import Stats + + +def traj_segment_generator(policy, env, horizon, stochastic, reward_giver=None, gail=False): + """ + Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + + :param policy: (MLPPolicy) the policy + :param env: (Gym Environment) the environment + :param horizon: (int) 
the number of timesteps to run per batch
+    :param stochastic: (bool) use a stochastic policy
+    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
+    :param gail: (bool) Whether we are using this generator for standard TRPO or with GAIL
+    :return: (dict) generator that returns a dict with the following keys:
+
+        - ob: (numpy Number) observations
+        - rew: (numpy float) rewards (if gail is used it is the predicted reward)
+        - vpred: (numpy float) value function predictions
+        - new: (numpy bool) dones (is end of episode)
+        - ac: (numpy Number) actions
+        - prevac: (numpy Number) previous actions
+        - nextvpred: (numpy float) the value prediction for the step after the segment
+        - ep_rets: ([float]) the cumulated rewards of the completed episodes
+        - ep_lens: ([int]) the lengths of the completed episodes
+        - ep_true_rets: ([float]) the true (environment) rewards of the completed episodes
+    """
+    # Check when using GAIL
+    assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL"
+
     # Initialize state variables
-    t = 0
-    ac = env.action_space.sample()
-    new = True
-    rew = 0.0
-    true_rew = 0.0
-    ob = env.reset()
-
-    cur_ep_ret = 0
-    cur_ep_len = 0
+    step = 0
+    action = env.action_space.sample()  # not used, just so we have the datatype
+    done = True
+    observation = env.reset()
+
+    cur_ep_ret = 0  # return in current episode
+    cur_ep_len = 0  # len of current episode
     cur_ep_true_ret = 0
     ep_true_rets = []
-    ep_rets = []
-    ep_lens = []
+    ep_rets = []  # returns of completed episodes in this segment
+    ep_lens = []  # Episode lengths

     # Initialize history arrays
-    obs = np.array([ob for _ in range(horizon)])
-    true_rews = np.zeros(horizon, 'float32')
-    rews = np.zeros(horizon, 'float32')
+    observations = np.array([observation for _ in range(horizon)])
+    true_rewards = np.zeros(horizon, 'float32')
+    rewards = np.zeros(horizon, 'float32')
     vpreds = np.zeros(horizon, 'float32')
-    news = np.zeros(horizon, 'int32')
-    acs = np.array([ac for _ in range(horizon)])
-    prevacs = acs.copy()
+    dones = np.zeros(horizon, 'int32')
+    actions = np.array([action for _ in range(horizon)])
+    prev_actions = actions.copy()

     while True:
-        prevac = ac
-        ac, vpred = pi.act(stochastic, ob)
+        prevac = action
+        action, vpred = policy.act(stochastic, observation)
         # Slight weirdness here because we need value function at time T
         # before returning segment [0, T-1] so we get the correct
         # terminal value
-        if t > 0 and t % horizon == 0:
-            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
-                    "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
+        if step > 0 and step % horizon == 0:
+            yield {"ob": observations, "rew": rewards, "vpred": vpreds, "new": dones,
+                   "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - done),
                    "ep_rets": ep_rets, "ep_lens": ep_lens, "ep_true_rets": ep_true_rets}
-            _, vpred = pi.act(stochastic, ob)
+            _, vpred = policy.act(stochastic, observation)
            # Be careful!!!
if you change the downstream algorithm to aggregate # several of these batches, then be sure to do a deepcopy ep_rets = [] ep_true_rets = [] ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - rew = reward_giver.get_reward(ob, ac) - ob, true_rew, new, _ = env.step(ac) - rews[i] = rew - true_rews[i] = true_rew - - cur_ep_ret += rew - cur_ep_true_ret += true_rew + idx = step % horizon + observations[idx] = observation + vpreds[idx] = vpred + dones[idx] = done + actions[idx] = action + prev_actions[idx] = prevac + + if gail: + reward = reward_giver.get_reward(observation, action) + observation, true_reward, done, _ = env.step(action) + else: + observation, reward, done, _ = env.step(action) + true_reward = reward + rewards[idx] = reward + true_rewards[idx] = true_reward + + cur_ep_ret += reward + cur_ep_true_ret += true_reward cur_ep_len += 1 - if new: + if done: ep_rets.append(cur_ep_ret) ep_true_rets.append(cur_ep_true_ret) ep_lens.append(cur_ep_len) cur_ep_ret = 0 cur_ep_true_ret = 0 cur_ep_len = 0 - ob = env.reset() - t += 1 + observation = env.reset() + step += 1 def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 + """ + Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + + :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information) + :param gamma: (float) Discount factor + :param lam: (float) GAE factor + """ + # last element is only used for last vtarg, but we already zeroed it if last done = 1 + done = np.append(seg["new"], 0) vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') + time_horizon = len(seg["rew"]) + seg["adv"] = gae_lam = np.empty(time_horizon, 'float32') rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam + last_gae_lam = 0 + for step in reversed(range(time_horizon)): + non_terminal = 1 - done[step + 1] + delta = rew[step] + gamma * vpred[step + 1] * non_terminal - vpred[step] + gae_lam[step] = last_gae_lam = delta + gamma * lam * non_terminal * last_gae_lam seg["tdlamret"] = seg["adv"] + seg["vpred"] -def learn(env, policy_func, reward_giver, expert_dataset, rank, - pretrained, pretrained_weight, *, - g_step, d_step, entcoeff, save_per_iter, - ckpt_dir, log_dir, timesteps_per_batch, task_name, - gamma, lam, - max_kl, cg_iters, cg_damping=1e-2, - vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, - max_timesteps=0, max_episodes=0, max_iters=0, - callback=None - ): +def learn(env, policy_func, *, timesteps_per_batch, max_kl, cg_iters, gamma, lam, entcoeff=0.0, cg_damping=1e-2, + vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None, + # GAIL Params + pretrained_weight=None, reward_giver=None, expert_dataset=None, rank=0, save_per_iter=1, + ckpt_dir="/tmp/gail/ckpt/", g_step=1, d_step=1, task_name="task_name", d_stepsize=3e-4, using_gail=True): + """ + learns a GAIL policy using the given environment + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param max_kl: 
(float) the kullback leiber loss threashold + :param cg_iters: (int) the number of iterations for the conjugate gradient calculation + :param gamma: (float) the discount value + :param lam: (float) GAE factor + :param entcoeff: (float) the weight for the entropy loss + :param cg_damping: (float) the compute gradient dampening factor + :param vf_stepsize: (float) the value function stepsize + :param vf_iters: (int) the value function's number iterations for learning + :param max_timesteps: (int) the maximum number of timesteps before halting + :param max_episodes: (int) the maximum number of episodes before halting + :param max_iters: (int) the maximum number of training iterations before halting + :param callback: (function (dict, dict)) the call back function, takes the local and global attribute dictionary + :param pretrained_weight: (str) the save location for the pretrained weights + :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action + :param expert_dataset: (MujocoDset) the dataset manager + :param rank: (int) the rank of the mpi thread + :param save_per_iter: (int) the number of iterations before saving + :param ckpt_dir: (str) the location for saving checkpoints + :param g_step: (int) number of steps to train policy in each epoch + :param d_step: (int) number of steps to train discriminator in each epoch + :param task_name: (str) the name of the task (can be None) + :param d_stepsize: (float) the reward giver stepsize + :param using_gail: (bool) using the GAIL model + """ nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) + sess = tf_util.single_threaded_session() # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) - oldpi = policy_func("oldpi", ob_space, ac_space) + policy = policy_func("pi", ob_space, ac_space, sess=sess) + old_policy = policy_func("oldpi", ob_space, ac_space, sess=sess, + placeholders={"obs": policy.obs_ph, "stochastic": policy.stochastic_ph}) + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) + observation = policy.obs_ph + action = policy.pdtype.sample_placeholder([None]) - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() + kloldnew = old_policy.proba_distribution.kl(policy.proba_distribution) + ent = policy.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) + vferr = tf.reduce_mean(tf.square(policy.vpred - ret)) - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold + # advantage * pnew / pold + ratio = tf.exp(policy.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus @@ -145,83 +205,102 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank, dist = meankl - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] - vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] - assert len(var_list) == len(vf_var_list) + 1 - d_adam = 
MpiAdam(reward_giver.get_trainable_variables()) - vfadam = MpiAdam(vf_var_list) + all_var_list = policy.get_trainable_variables() + if using_gail: + var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] + vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] + assert len(var_list) == len(vf_var_list) + 1 + d_adam = MpiAdam(reward_giver.get_trainable_variables()) + else: + var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] + vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] + + vfadam = MpiAdam(vf_var_list, sess=sess) + get_flat = tf_util.GetFlat(var_list, sess=sess) + set_from_flat = tf_util.SetFromFlat(var_list, sess=sess) - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) + var_size = tf_util.intprod(shape) + tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) + start += var_size + gvp = tf.add_n( + [tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 + fvp = tf_util.flatgrad(gvp, var_list) + + assign_old_eq_new = tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in + zipsame(old_policy.get_variables(), policy.get_variables())]) + compute_losses = tf_util.function([observation, action, atarg], losses) + compute_lossandgrad = tf_util.function([observation, action, atarg], + losses + [tf_util.flatgrad(optimgain, var_list)]) + compute_fvp = tf_util.function([flat_tangent, observation, action, atarg], fvp) + compute_vflossandgrad = tf_util.function([observation, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) - tstart = time.time() + start_time = time.time() yield - print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) + print(colorize("done in %.3f seconds" % (time.time() - start_time), color='magenta')) else: yield - def allmean(x): - assert isinstance(x, np.ndarray) - out = np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) + def allmean(arr): + assert isinstance(arr, np.ndarray) + out = np.empty_like(arr) + MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= nworkers return out - U.initialize() + tf_util.initialize(sess=sess) + th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) - d_adam.sync() + + if using_gail: + d_adam.sync() vfadam.sync() + if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- - seg_gen = 
traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) + if using_gail: + seg_gen = traj_segment_generator(policy, env, timesteps_per_batch, stochastic=True, + reward_giver=reward_giver, gail=True) + else: + seg_gen = traj_segment_generator(policy, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 - tstart = time.time() + t_start = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards - true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 - g_loss_stats = stats(loss_names) - d_loss_stats = stats(reward_giver.loss_name) - ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) - # if provide pretrained weight - if pretrained_weight is not None: - U.load_state(pretrained_weight, var_list=pi.get_variables()) + if using_gail: + true_rewbuffer = deque(maxlen=40) + # Stats not used for now + #  g_loss_stats = Stats(loss_names) + # d_loss_stats = Stats(reward_giver.loss_name) + # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) + + # if provide pretrained weight + if pretrained_weight is not None: + tf_util.load_state(pretrained_weight, var_list=policy.get_variables()) while True: - if callback: callback(locals(), globals()) + if callback: + callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: @@ -230,60 +309,66 @@ def allmean(x): break # Save model - if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: + if using_gail and rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) + saver.save(sess, fname) logger.log("********** Iteration %i ************" % iters_so_far) - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p + def fisher_vector_product(vec): + return allmean(compute_fvp(vec, *fvpargs, sess=sess)) + cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") + # g_step = 1 when not using GAIL for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] + observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy + if hasattr(policy, "ret_rms"): + policy.ret_rms.update(tdlamret) + if hasattr(policy, "ob_rms"): + policy.ob_rms.update(observation) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] - assign_old_eq_new() # set old parameter values to new parameter values + assign_old_eq_new(sess=sess) + with timed("computegrad"): - *lossbefore, g = compute_lossandgrad(*args) + *lossbefore, grad = compute_lossandgrad(*args, sess=sess) lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if np.allclose(g, 0): + grad = allmean(grad) + if 
np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) + stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) + shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) + # abs(shs) to avoid taking square root of negative values + lagrange_multiplier = np.sqrt(abs(shs) / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) + fullstep = stepdir / lagrange_multiplier + expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) + mean_losses = surr, kl_loss, *_ = allmean(np.array(compute_losses(*args, sess=sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): + if not np.isfinite(mean_losses).all(): logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: + elif kl_loss > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") @@ -297,46 +382,55 @@ def fisher_vector_product(p): if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) + with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): - if hasattr(pi, "ob_rms"): - pi.ob_rms.update(mbob) # update running mean/std for policy - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - g_losses = meanlosses - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) + if hasattr(policy, "ob_rms"): + policy.ob_rms.update(mbob) # update running mean/std for policy + grad = allmean(compute_vflossandgrad(mbob, mbret, sess=sess)) + vfadam.update(grad, vf_stepsize) + + for (loss_name, loss_val) in zip(loss_names, mean_losses): + logger.record_tabular(loss_name, loss_val) + logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - # ------------------ Update D ------------------ - logger.log("Optimizing Discriminator...") - logger.log(fmt_row(13, reward_giver.loss_name)) - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) - batch_size = len(ob) // d_step - d_losses = [] # list of tuples, each of which gives the loss for a minibatch - for ob_batch, ac_batch in dataset.iterbatches((ob, ac), - include_final_partial_batch=False, - batch_size=batch_size): - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) - # update running mean/std for reward_giver - if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) - *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) - d_adam.update(allmean(g), d_stepsize) - d_losses.append(newlosses) - logger.log(fmt_row(13, np.mean(d_losses, axis=0))) - - lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local 
values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) - true_rewbuffer.extend(true_rets) + + if using_gail: + # ------------------ Update D ------------------ + logger.log("Optimizing Discriminator...") + logger.log(fmt_row(13, reward_giver.loss_name)) + ob_expert, ac_expert = expert_dataset.get_next_batch(len(observation)) + batch_size = len(observation) // d_step + d_losses = [] # list of tuples, each of which gives the loss for a minibatch + for ob_batch, ac_batch in dataset.iterbatches((observation, action), + include_final_partial_batch=False, + batch_size=batch_size): + ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) + # update running mean/std for reward_giver + if hasattr(reward_giver, "obs_rms"): + reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) + *newlosses, grad = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) + d_adam.update(allmean(grad), d_stepsize) + d_losses.append(newlosses) + logger.log(fmt_row(13, np.mean(d_losses, axis=0))) + + lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) + true_rewbuffer.extend(true_rets) + else: + lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) + if using_gail: + logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) @@ -344,11 +438,17 @@ def fisher_vector_product(p): logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) + logger.record_tabular("TimeElapsed", time.time() - t_start) if rank == 0: logger.dump_tabular() def flatten_lists(listoflists): + """ + Flatten a python list of list + + :param listoflists: (list(list)) + :return: (list) + """ return [el for list_ in listoflists for el in list_] diff --git a/baselines/her/actor_critic.py b/baselines/her/actor_critic.py index d5443fe0c3..9632cf6f6c 100644 --- a/baselines/her/actor_critic.py +++ b/baselines/her/actor_critic.py @@ -1,44 +1,52 @@ import tensorflow as tf -from baselines.her.util import store_args, nn + +from baselines.her.util import mlp class ActorCritic: - @store_args - def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, - **kwargs): + def __init__(self, inputs_tf, dim_obs, dim_goal, dim_action, + max_u, o_stats, g_stats, hidden, layers, **kwargs): """The actor-critic network and related training code. 
-        Args:
-            inputs_tf (dict of tensors): all necessary inputs for the network: the
-                observation (o), the goal (g), and the action (u)
-            dimo (int): the dimension of the observations
-            dimg (int): the dimension of the goals
-            dimu (int): the dimension of the actions
-            max_u (float): the maximum magnitude of actions; action outputs will be scaled
-                accordingly
-            o_stats (baselines.her.Normalizer): normalizer for observations
-            g_stats (baselines.her.Normalizer): normalizer for goals
-            hidden (int): number of hidden units that should be used in hidden layers
-            layers (int): number of hidden layers
+        :param inputs_tf: ({str: TensorFlow Tensor}) all necessary inputs for the network: the
+            observation (o), the goal (g), and the action (u)
+        :param dim_obs: (int) the dimension of the observations
+        :param dim_goal: (int) the dimension of the goals
+        :param dim_action: (int) the dimension of the actions
+        :param max_u: (float) the maximum magnitude of actions; action outputs will be scaled accordingly
+        :param o_stats: (baselines.her.Normalizer) normalizer for observations
+        :param g_stats: (baselines.her.Normalizer) normalizer for goals
+        :param hidden: (int) number of hidden units that should be used in hidden layers
+        :param layers: (int) number of hidden layers
         """
+        self.inputs_tf = inputs_tf
+        self.dim_obs = dim_obs
+        self.dim_goal = dim_goal
+        self.dim_action = dim_action
+        self.max_u = max_u
+        self.o_stats = o_stats
+        self.g_stats = g_stats
+        self.hidden = hidden
+        self.layers = layers
+
         self.o_tf = inputs_tf['o']
         self.g_tf = inputs_tf['g']
         self.u_tf = inputs_tf['u']

         # Prepare inputs for actor and critic.
-        o = self.o_stats.normalize(self.o_tf)
-        g = self.g_stats.normalize(self.g_tf)
-        input_pi = tf.concat(axis=1, values=[o, g])  # for actor
+        obs = self.o_stats.normalize(self.o_tf)
+        goals = self.g_stats.normalize(self.g_tf)
+        input_pi = tf.concat(axis=1, values=[obs, goals])  # for actor

         # Networks.
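A note on the `mlp` helper used in the network definitions that follow: it replaces the old `nn` function from `baselines.her.util`. The sketch below shows the kind of network builder it is assumed to be (a stack of dense layers, ReLU on the hidden layers, linear output); `mlp_sketch` is illustrative only, not the actual implementation in this PR.

```python
import tensorflow as tf

def mlp_sketch(inputs, layer_sizes, reuse=None, name=""):
    # Stack of fully connected layers; hidden layers use ReLU, the last stays linear
    out = inputs
    for i, size in enumerate(layer_sizes):
        activation = tf.nn.relu if i < len(layer_sizes) - 1 else None
        out = tf.layers.dense(out, units=size, activation=activation,
                              reuse=reuse, name=name + "_fc{}".format(i))
    return out

# Usage mirroring the actor head defined just below:
# pi_tf = max_u * tf.tanh(mlp_sketch(input_pi, [hidden] * layers + [dim_action]))
```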
with tf.variable_scope('pi'): - self.pi_tf = self.max_u * tf.tanh(nn( + self.pi_tf = self.max_u * tf.tanh(mlp( input_pi, [self.hidden] * self.layers + [self.dimu])) with tf.variable_scope('Q'): # for policy training - input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) - self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) + input_q = tf.concat(axis=1, values=[obs, goals, self.pi_tf / self.max_u]) + self.q_pi_tf = mlp(input_q, [self.hidden] * self.layers + [1]) # for critic training - input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) - self._input_Q = input_Q # exposed for tests - self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) + input_q = tf.concat(axis=1, values=[obs, goals, self.u_tf / self.max_u]) + self._input_q = input_q # exposed for tests + self.q_tf = mlp(input_q, [self.hidden] * self.layers + [1], reuse=True) diff --git a/baselines/her/ddpg.py b/baselines/her/ddpg.py index 92165de958..6abc45ac8d 100644 --- a/baselines/her/ddpg.py +++ b/baselines/her/ddpg.py @@ -5,8 +5,7 @@ from tensorflow.contrib.staging import StagingArea from baselines import logger -from baselines.her.util import ( - import_function, store_args, flatten_grads, transitions_in_episode_batch) +from baselines.her.util import import_function, flatten_grads, transitions_in_episode_batch from baselines.her.normalizer import Normalizer from baselines.her.replay_buffer import ReplayBuffer from baselines.common.mpi_adam import MpiAdam @@ -17,49 +16,74 @@ def dims_to_shapes(input_dims): class DDPG(object): - @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, - Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, + q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, time_horizon, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, - sample_transitions, gamma, reuse=False, **kwargs): - """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). - - Args: - input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the - actions (u) - buffer_size (int): number of transitions that are stored in the replay buffer - hidden (int): number of units in the hidden layers - layers (int): number of hidden layers - network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') - polyak (float): coefficient for Polyak-averaging of the target network - batch_size (int): batch size for training - Q_lr (float): learning rate for the Q (critic) network - pi_lr (float): learning rate for the pi (actor) network - norm_eps (float): a small value used in the normalizer to avoid numerical instabilities - norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] - max_u (float): maximum action magnitude, i.e. 
actions are in [-max_u, max_u] - action_l2 (float): coefficient for L2 penalty on the actions - clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] - scope (str): the scope used for the TensorFlow graph - T (int): the time horizon for rollouts - rollout_batch_size (int): number of parallel rollouts per DDPG agent - subtract_goals (function): function that subtracts goals from each other - relative_goals (boolean): whether or not relative goals should be fed into the network - clip_pos_returns (boolean): whether or not positive returns should be clipped - clip_return (float): clip returns to be in [-clip_return, clip_return] - sample_transitions (function) function that samples from the replay buffer - gamma (float): gamma used for Q learning updates - reuse (boolean): whether or not the networks should be reused + sample_transitions, gamma, reuse=False): """ + Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). + + :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u) + :param buffer_size: (int) number of transitions that are stored in the replay buffer + :param hidden: (int) number of units in the hidden layers + :param layers: (int) number of hidden layers + :param network_class: (str) the network class that should be used (e.g. 'baselines.her.ActorCritic') + :param polyak: (float) coefficient for Polyak-averaging of the target network + :param batch_size: (int) batch size for training + :param q_lr: (float) learning rate for the Q (critic) network + :param pi_lr: (float) learning rate for the pi (actor) network + :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities + :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip] + :param max_u: (float) maximum action magnitude, i.e. 
actions are in [-max_u, max_u] + :param action_l2: (float) coefficient for L2 penalty on the actions + :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs] + :param scope: (str) the scope used for the TensorFlow graph + :param time_horizon: (int) the time horizon for rollouts + :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent + :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals + from each other + :param relative_goals: (boolean) whether or not relative goals should be fed into the network + :param clip_pos_returns: (boolean) whether or not positive returns should be clipped + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer + :param gamma: (float) gamma used for Q learning updates + :param reuse: (boolean) whether or not the networks should be reused + """ + # Updated in experiments/config.py + self.input_dims = input_dims + self.buffer_size = buffer_size + self.hidden = hidden + self.layers = layers + self.network_class = network_class + self.polyak = polyak + self.batch_size = batch_size + self.q_lr = q_lr + self.pi_lr = pi_lr + self.norm_eps = norm_eps + self.norm_clip = norm_clip + self.max_u = max_u + self.action_l2 = action_l2 + self.clip_obs = clip_obs + self.scope = scope + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.subtract_goals = subtract_goals + self.relative_goals = relative_goals + self.clip_pos_returns = clip_pos_returns + self.clip_return = clip_return + self.sample_transitions = sample_transitions + self.gamma = gamma + self.reuse = reuse + if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) - self.dimo = self.input_dims['o'] - self.dimg = self.input_dims['g'] - self.dimu = self.input_dims['u'] + self.dim_obs = self.input_dims['o'] + self.dim_goal = self.input_dims['g'] + self.dim_action = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() @@ -84,54 +108,67 @@ def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polya self._create_network(reuse=reuse) # Configure the replay buffer. 
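Before the `buffer_shapes` assignment that follows, here is a toy illustration (hypothetical dimensions) of the episode layout HER stores: observations 'o' and achieved goals 'ag' keep `time_horizon + 1` entries per episode, while actions 'u' and goals 'g' keep `time_horizon`.

```python
time_horizon = 50                      # e.g. a 50-step robotics episode (hypothetical)
dim_obs, dim_goal, dim_action = 10, 3, 4

input_shapes = {'o': (dim_obs,), 'g': (dim_goal,), 'u': (dim_action,)}
# Same rule as the code below: one extra slot for the final observation
buffer_shapes = {key: (time_horizon if key != 'o' else time_horizon + 1, *shape)
                 for key, shape in input_shapes.items()}
buffer_shapes['g'] = (buffer_shapes['g'][0], dim_goal)
buffer_shapes['ag'] = (time_horizon + 1, dim_goal)

print(buffer_shapes)
# o: (51, 10), g: (50, 3), u: (50, 4), ag: (51, 3)
```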
- buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) + buffer_shapes = {key: (self.time_horizon if key != 'o' else self.time_horizon + 1, *input_shapes[key]) for key, val in input_shapes.items()} - buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) - buffer_shapes['ag'] = (self.T+1, self.dimg) + buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal) + buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size - self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) + self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon, self.sample_transitions) - def _random_action(self, n): - return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) + def _random_action(self, num): + return np.random.uniform(low=-self.max_u, high=self.max_u, size=(num, self.dim_action)) - def _preprocess_og(self, o, ag, g): + def _preprocess_obs_goal(self, obs, achieved_goal, goal): if self.relative_goals: - g_shape = g.shape - g = g.reshape(-1, self.dimg) - ag = ag.reshape(-1, self.dimg) - g = self.subtract_goals(g, ag) - g = g.reshape(*g_shape) - o = np.clip(o, -self.clip_obs, self.clip_obs) - g = np.clip(g, -self.clip_obs, self.clip_obs) - return o, g - - def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, - compute_Q=False): - o, g = self._preprocess_og(o, ag, g) + g_shape = goal.shape + goal = goal.reshape(-1, self.dim_goal) + achieved_goal = achieved_goal.reshape(-1, self.dim_goal) + goal = self.subtract_goals(goal, achieved_goal) + goal = goal.reshape(*g_shape) + obs = np.clip(obs, -self.clip_obs, self.clip_obs) + goal = np.clip(goal, -self.clip_obs, self.clip_obs) + return obs, goal + + def get_actions(self, obs, achieved_goal, goal, noise_eps=0., random_eps=0., use_target_net=False, compute_q=False): + """ + return the action from an observation and goal + + :param obs: (numpy Number) the observation + :param achieved_goal: (numpy Number) the achieved goal + :param goal: (numpy Number) the goal + :param noise_eps: (float) the noise epsilon + :param random_eps: (float) the random epsilon + :param use_target_net: (bool) whether or not to use the target network + :param compute_q: (bool) whether or not to compute Q value + :return: (numpy float or float) the actions + """ + obs, goal = self._preprocess_obs_goal(obs, achieved_goal, goal) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] - if compute_Q: - vals += [policy.Q_pi_tf] + if compute_q: + vals += [policy.q_pi_tf] # feed feed = { - policy.o_tf: o.reshape(-1, self.dimo), - policy.g_tf: g.reshape(-1, self.dimg), - policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) + policy.o_tf: obs.reshape(-1, self.dim_obs), + policy.g_tf: goal.reshape(-1, self.dim_goal), + policy.u_tf: np.zeros((obs.size // self.dim_obs, self.dim_action), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing - u = ret[0] - noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise - u += noise - u = np.clip(u, -self.max_u, self.max_u) - u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy - if u.shape[0] == 1: - u = u[0] - u = u.copy() - ret[0] = u + action = ret[0] + noise = noise_eps * self.max_u * np.random.randn(*action.shape) # gaussian noise + action += noise + action = 
np.clip(action, -self.max_u, self.max_u) + # eps-greedy + n_ac = action.shape[0] + action += np.random.binomial(1, random_eps, n_ac).reshape(-1, 1) * (self._random_action(n_ac) - action) + if action.shape[0] == 1: + action = action[0] + action = action.copy() + ret[0] = action if len(ret) == 1: return ret[0] @@ -140,8 +177,11 @@ def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=Fals def store_episode(self, episode_batch, update_stats=True): """ - episode_batch: array of batch_size x (T or T+1) x dim_key - 'o' is of size T+1, others are of size T + Story the episode transitions + + :param episode_batch: (numpy Number) array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, + others are of size T + :param update_stats: (bool) whether to update stats or not """ self.buffer.store_episode(episode_batch) @@ -153,8 +193,8 @@ def store_episode(self, episode_batch, update_stats=True): num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) - o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) + obs, _, goal, achieved_goal = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] + transitions['o'], transitions['g'] = self._preprocess_obs_goal(obs, achieved_goal, goal) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) @@ -164,56 +204,83 @@ def store_episode(self, episode_batch, update_stats=True): self.g_stats.recompute_stats() def get_current_buffer_size(self): + """ + returns the current buffer size + + :return: (int) buffer size + """ return self.buffer.get_current_size() def _sync_optimizers(self): - self.Q_adam.sync() + self.q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
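The action post-processing in `get_actions` above combines Gaussian exploration noise with occasional uniform resampling (epsilon-greedy). A standalone NumPy sketch of that scheme; the `noisy_actions` function is illustrative and not part of the class.

```python
import numpy as np

def noisy_actions(actions, max_u, noise_eps, random_eps):
    # Gaussian noise scaled by the action magnitude, then clip to the bounds
    actions = actions + noise_eps * max_u * np.random.randn(*actions.shape)
    actions = np.clip(actions, -max_u, max_u)
    # With probability random_eps, replace the whole action by a uniform random one
    n_ac, dim_action = actions.shape
    random_actions = np.random.uniform(-max_u, max_u, size=(n_ac, dim_action))
    flip = np.random.binomial(1, random_eps, n_ac).reshape(-1, 1)
    return actions + flip * (random_actions - actions)

# e.g. noisy_actions(np.zeros((5, 4)), max_u=1.0, noise_eps=0.2, random_eps=0.3)
```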
- critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ - self.Q_loss_tf, - self.main.Q_pi_tf, - self.Q_grad_tf, + critic_loss, actor_loss, q_grad, pi_grad = self.sess.run([ + self.q_loss_tf, + self.main.q_pi_tf, + self.q_grad_tf, self.pi_grad_tf ]) - return critic_loss, actor_loss, Q_grad, pi_grad + return critic_loss, actor_loss, q_grad, pi_grad - def _update(self, Q_grad, pi_grad): - self.Q_adam.update(Q_grad, self.Q_lr) + def _update(self, q_grad, pi_grad): + self.q_adam.update(q_grad, self.q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): + """ + sample a batch + + :return: (dict) the batch + """ transitions = self.buffer.sample(self.batch_size) - o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] - ag, ag_2 = transitions['ag'], transitions['ag_2'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) - transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) + obs, obs_2, goal = transitions['o'], transitions['o_2'], transitions['g'] + achieved_goal, achieved_goal_2 = transitions['ag'], transitions['ag_2'] + transitions['o'], transitions['g'] = self._preprocess_obs_goal(obs, achieved_goal, goal) + transitions['o_2'], transitions['g_2'] = self._preprocess_obs_goal(obs_2, achieved_goal_2, goal) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): + """ + apply a batch to staging + + :param batch: (dict) the batch to add to staging, if None: self.sample_batch() + """ if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): + """ + train DDPG + + :param stage: (bool) enable staging + :return: (float, float) critic loss, actor loss + """ if stage: self.stage_batch() - critic_loss, actor_loss, Q_grad, pi_grad = self._grads() - self._update(Q_grad, pi_grad) + critic_loss, actor_loss, q_grad, pi_grad = self._grads() + self._update(q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): + """ + update the target network + """ self.sess.run(self.update_target_net_op) def clear_buffer(self): + """ + clears the replay buffer + """ self.buffer.clear_buffer() def _vars(self, scope): @@ -226,21 +293,21 @@ def _global_vars(self, scope): return res def _create_network(self, reuse=False): - logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) + logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dim_action, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages - with tf.variable_scope('o_stats') as vs: + with tf.variable_scope('o_stats') as scope: if reuse: - vs.reuse_variables() - self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) - with tf.variable_scope('g_stats') as vs: + scope.reuse_variables() + self.o_stats = Normalizer(self.dim_obs, self.norm_eps, self.norm_clip, sess=self.sess) + with tf.variable_scope('g_stats') as scope: if reuse: - vs.reuse_variables() - self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) + scope.reuse_variables() + self.g_stats = Normalizer(self.dim_goal, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
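The target network built further down in `_create_network` is updated by Polyak averaging, `target <- polyak * target + (1 - polyak) * main`. A toy numeric illustration of how slowly the target tracks the main parameters with the default `polyak = 0.95` from `experiment/config.py`:

```python
import numpy as np

polyak = 0.95
main_param = np.array([1.0])    # pretend the main network has converged to 1.0
target_param = np.array([0.0])  # target starts elsewhere

for _ in range(60):
    target_param = polyak * target_param + (1 - polyak) * main_param

print(target_param)  # ~0.95 after 60 updates: the target lags the main network
```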
batch = self.staging_tf.get() @@ -249,40 +316,44 @@ def _create_network(self, reuse=False): batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks - with tf.variable_scope('main') as vs: + with tf.variable_scope('main') as scope: if reuse: - vs.reuse_variables() + scope.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) - vs.reuse_variables() - with tf.variable_scope('target') as vs: + scope.reuse_variables() + with tf.variable_scope('target') as scope: if reuse: - vs.reuse_variables() + scope.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) - vs.reuse_variables() + scope.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions - target_Q_pi_tf = self.target.Q_pi_tf + target_q_pi_tf = self.target.q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) - target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) - self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) - self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) + target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_q_pi_tf, *clip_range) + + self.q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.q_tf)) + self.pi_loss_tf = -tf.reduce_mean(self.main.q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) - Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) + + q_grads_tf = tf.gradients(self.q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) - assert len(self._vars('main/Q')) == len(Q_grads_tf) + + assert len(self._vars('main/Q')) == len(q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) - self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) + + self.q_grads_vars_tf = zip(q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) - self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) + self.q_grad_tf = flatten_grads(grads=q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers - self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) + self.q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging @@ -292,7 +363,8 @@ def _create_network(self, reuse=False): self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( - map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) + map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), + zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() @@ -300,6 +372,11 @@ def _create_network(self, reuse=False): self._init_target_net() def logs(self, prefix=''): + """ + create a log dictionary + :param prefix: (str) the prefix for evey index + :return: ({str: Any}) the log + """ logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] @@ -318,7 +395,7 @@ def __getstate__(self): 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] - state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} + state = {k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state @@ -330,11 +407,11 @@ def __setstate__(self, state): self.__init__(**state) # set up stats (they are overwritten in __init__) - for k, v in state.items(): - if k[-6:] == '_stats': - self.__dict__[k] = v + for key, value in state.items(): + if key[-6:] == '_stats': + self.__dict__[key] = value # load TF variables - vars = [x for x in self._global_vars('') if 'buffer' not in x.name] - assert(len(vars) == len(state["tf"])) - node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] + _vars = [x for x in self._global_vars('') if 'buffer' not in x.name] + assert len(_vars) == len(state["tf"]) + node = [tf.assign(var, val) for var, val in zip(_vars, state["tf"])] self.sess.run(node) diff --git a/baselines/her/experiment/config.py b/baselines/her/experiment/config.py index cf29ca52b8..f32d6c1670 100644 --- a/baselines/her/experiment/config.py +++ b/baselines/her/experiment/config.py @@ -20,7 +20,7 @@ 'layers': 3, # number of layers in the critic/actor networks 'hidden': 256, # number of neurons in each hidden layers 'network_class': 'baselines.her.actor_critic:ActorCritic', - 'Q_lr': 0.001, # critic learning rate + 'q_lr': 0.001, # critic learning rate 'pi_lr': 0.001, # actor learning rate 'buffer_size': int(1E6), # for experience replay 'polyak': 0.95, # polyak averaging coefficient @@ -55,6 +55,9 @@ def cached_make_env(make_env): Only creates a new environment from the provided function if one has not yet already been created. This is useful here because we need to infer certain properties of the env, e.g. its observation and action spaces, without any intend of actually using it. + + :param make_env: (function (): Gym Environment) creates the environment + :return: (Gym Environment) the created environment """ if make_env not in CACHED_ENVS: env = make_env() @@ -63,6 +66,12 @@ def cached_make_env(make_env): def prepare_params(kwargs): + """ + prepares DDPG params from kwargs + + :param kwargs: (dict) the input kwargs + :return: (dict) DDPG parameters + """ # DDPG params ddpg_params = dict() @@ -73,18 +82,18 @@ def make_env(): kwargs['make_env'] = make_env tmp_env = cached_make_env(kwargs['make_env']) assert hasattr(tmp_env, '_max_episode_steps') - kwargs['T'] = tmp_env._max_episode_steps + kwargs['time_horizon'] = tmp_env.spec.max_episode_steps # wrapped envs preserve their spec tmp_env.reset() kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u'] - kwargs['gamma'] = 1. - 1. / kwargs['T'] + kwargs['gamma'] = 1. - 1. 
/ kwargs['time_horizon'] if 'lr' in kwargs: kwargs['pi_lr'] = kwargs['lr'] - kwargs['Q_lr'] = kwargs['lr'] + kwargs['q_lr'] = kwargs['lr'] del kwargs['lr'] for name in ['buffer_size', 'hidden', 'layers', 'network_class', 'polyak', - 'batch_size', 'Q_lr', 'pi_lr', + 'batch_size', 'q_lr', 'pi_lr', 'norm_eps', 'norm_clip', 'max_u', 'action_l2', 'clip_obs', 'scope', 'relative_goals']: ddpg_params[name] = kwargs[name] @@ -95,17 +104,29 @@ def make_env(): return kwargs -def log_params(params, logger=logger): +def log_params(params, logger_input=logger): + """ + log the parameters + + :param params: (dict) parameters to log + :param logger_input: (logger) the logger + """ for key in sorted(params.keys()): - logger.info('{}: {}'.format(key, params[key])) + logger_input.info('{}: {}'.format(key, params[key])) def configure_her(params): + """ + configure hindsight experience replay + + :param params: (dict) input parameters + :return: (function (dict, int): dict) returns a HER update function for replay buffer batch + """ env = cached_make_env(params['make_env']) env.reset() - def reward_fun(ag_2, g, info): # vectorized - return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) + def reward_fun(achieved_goal, goal, info): # vectorized + return env.compute_reward(achieved_goal=achieved_goal, desired_goal=goal, info=info) # Prepare configuration for HER. her_params = { @@ -120,12 +141,29 @@ def reward_fun(ag_2, g, info): # vectorized return sample_her_transitions -def simple_goal_subtract(a, b): - assert a.shape == b.shape - return a - b +def simple_goal_subtract(vec_a, vec_b): + """ + checks if a and b have the same shape, and does a - b + + :param vec_a: (numpy array) + :param vec_b: (numpy array) + :return: (numpy array) a - b + """ + assert vec_a.shape == vec_b.shape + return vec_a - vec_b def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): + """ + configure a DDPG model from parameters + + :param dims: ({str: int}) the dimensions + :param params: (dict) the DDPG parameters + :param reuse: (bool) whether or not the networks should be reused + :param use_mpi: (bool) whether or not to use MPI + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :return: (her.DDPG) the ddpg model + """ sample_her_transitions = configure_her(params) # Extract relevant parameters. gamma = params['gamma'] @@ -138,7 +176,7 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): env = cached_make_env(params['make_env']) env.reset() ddpg_params.update({'input_dims': input_dims, # agent takes an input observations - 'T': params['T'], + 'time_horizon': params['time_horizon'], 'clip_pos_returns': True, # clip positive returns 'clip_return': (1. / (1. 
- gamma)) if clip_return else np.inf, # max abs of return 'rollout_batch_size': rollout_batch_size, @@ -154,6 +192,12 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): def configure_dims(params): + """ + configure input and output dimensions + + :param params: (dict) the parameters + :return: ({str: int}) the dimensions + """ env = cached_make_env(params['make_env']) env.reset() obs, _, _, info = env.step(env.action_space.sample()) diff --git a/baselines/her/experiment/play.py b/baselines/her/experiment/play.py index 5b2f85d2ff..e31e57a093 100644 --- a/baselines/her/experiment/play.py +++ b/baselines/her/experiment/play.py @@ -1,7 +1,8 @@ import click -import numpy as np import pickle +import numpy as np + from baselines import logger from baselines.common import set_global_seeds import baselines.her.experiment.config as config @@ -14,11 +15,19 @@ @click.option('--n_test_rollouts', type=int, default=10) @click.option('--render', type=int, default=1) def main(policy_file, seed, n_test_rollouts, render): + """ + run HER from a saved policy + + :param policy_file: (str) pickle path to a saved policy + :param seed: (int) initial seed + :param n_test_rollouts: (int) the number of test rollouts + :param render: (bool) if rendering should be done + """ set_global_seeds(seed) # Load policy. - with open(policy_file, 'rb') as f: - policy = pickle.load(f) + with open(policy_file, 'rb') as file_handler: + policy = pickle.load(file_handler) env_name = policy.info['env_name'] # Prepare params. @@ -27,21 +36,21 @@ def main(policy_file, seed, n_test_rollouts, render): params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params['env_name'] = env_name params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'compute_Q': True, + 'compute_q': True, 'rollout_batch_size': 1, 'render': bool(render), } - for name in ['T', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] - + evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) diff --git a/baselines/her/experiment/plot.py b/baselines/her/experiment/plot.py index 560903f82d..e9ee808a2e 100644 --- a/baselines/her/experiment/plot.py +++ b/baselines/her/experiment/plot.py @@ -1,26 +1,42 @@ import os +import json +import argparse + import matplotlib.pyplot as plt import numpy as np -import json -import seaborn as sns; sns.set() +import seaborn as sns import glob2 -import argparse +# Initialize seaborn +sns.set() def smooth_reward_curve(x, y): + """ + smooth the reward curve + + :param x: (numpy float) the x coord of the reward + :param y: (numpy float) the y coord of the reward + :return: (numpy float, numpy float) smoothed x, smoothed y + """ halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution k = halfwidth xsmoo = x ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), - mode='same') + mode='same') return xsmoo, ysmoo def load_results(file): + """ + load the results from a file + + :param file: (str) the saved results + :return: (dict) the result + """ if not os.path.exists(file): return None - with open(file, 'r') as f: - lines = [line for line in f] + with open(file, 'r') as file_handler: + lines 
= [line for line in file_handler] if len(lines) < 2: return None keys = [name.strip() for name in lines[0].split(',')] @@ -36,13 +52,20 @@ def load_results(file): def pad(xs, value=np.nan): + """ + + + :param xs: + :param value: + :return: + """ maxlen = np.max([len(x) for x in xs]) - + padded_xs = [] for x in xs: if x.shape[0] >= maxlen: padded_xs.append(x) - + padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value x_padded = np.concatenate([x, padding], axis=0) assert x_padded.shape[1:] == x.shape[1:] diff --git a/baselines/her/experiment/train.py b/baselines/her/experiment/train.py index aeaf1c5418..256e069091 100644 --- a/baselines/her/experiment/train.py +++ b/baselines/her/experiment/train.py @@ -1,5 +1,6 @@ import os import sys +from subprocess import CalledProcessError import click import numpy as np @@ -7,26 +8,43 @@ from mpi4py import MPI from baselines import logger -from baselines.common import set_global_seeds +from baselines.common import set_global_seeds, tf_util from baselines.common.mpi_moments import mpi_moments import baselines.her.experiment.config as config from baselines.her.rollout import RolloutWorker from baselines.her.util import mpi_fork -from subprocess import CalledProcessError - def mpi_average(value): - if value == []: + """ + calculate the average from the array, using MPI + + :param value: (numpy Number) the array + :return: (float) the average + """ + if len(value) == 0: value = [0.] if not isinstance(value, list): value = [value] return mpi_moments(np.array(value))[0] -def train(policy, rollout_worker, evaluator, - n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, - save_policies, **kwargs): +def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, + save_policies): + """ + train the given policy + + :param policy: (her.DDPG) the policy to train + :param rollout_worker: (RolloutWorker) Rollout worker generates experience for training. + :param evaluator: (RolloutWorker) Rollout worker for evalutation + :param n_epochs: (int) the number of epochs + :param n_test_rollouts: (int) the number of for the evalutation RolloutWorker + :param n_cycles: (int) the number of cycles for training per epoch + :param n_batches: (int) the batch size + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param save_policies: (bool) whether or not to save the policies + """ rank = MPI.COMM_WORLD.Get_rank() latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') @@ -66,7 +84,8 @@ def train(policy, rollout_worker, evaluator, success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_policies: best_success_rate = success_rate - logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) + logger.info('New best success rate: {}. Saving policy to {} ...' 
+ .format(best_success_rate, best_policy_path)) evaluator.save_policy(best_policy_path) evaluator.save_policy(latest_policy_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: @@ -82,10 +101,26 @@ def train(policy, rollout_worker, evaluator, assert local_uniform[0] != root_uniform[0] -def launch( - env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, - override_params={}, save_policies=True -): +def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, + override_params=None, save_policies=True): + """ + launch training with mpi + + :param env: (str) environment ID + :param logdir: (str) the log directory + :param n_epochs: (int) the number of training epochs + :param num_cpu: (int) the number of CPUs to run on + :param seed: (int) the initial random seed + :param replay_strategy: (str) the type of replay strategy ('future' or 'none') + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param clip_return: (float): clip returns to be in [-clip_return, clip_return] + :param override_params: (dict) override any parameter for training + :param save_policies: (bool) whether or not to save the policies + """ + + if override_params is None: + override_params = {} # Fork for multi-CPU MPI implementation. if num_cpu > 1: try: @@ -96,14 +131,13 @@ def launch( if whoami == 'parent': sys.exit(0) - import baselines.common.tf_util as U - U.single_threaded_session().__enter__() + tf_util.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: - logger.configure(dir=logdir) + logger.configure(folder=logdir) else: logger.configure() logdir = logger.get_dir() @@ -121,10 +155,10 @@ def launch( if env in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter - with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: - json.dump(params, f) + with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler: + json.dump(params, file_handler) params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) if num_cpu == 1: logger.warn() @@ -144,20 +178,20 @@ def launch( rollout_params = { 'exploit': False, 'use_target_net': False, - 'use_demo_states': True, - 'compute_Q': False, - 'T': params['T'], + # 'use_demo_states': True, + 'compute_q': False, + 'time_horizon': params['time_horizon'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'use_demo_states': False, - 'compute_Q': True, - 'T': params['T'], + # 'use_demo_states': False, + 'compute_q': True, + 'time_horizon': params['time_horizon'], } - for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] @@ -168,22 +202,33 @@ def launch( evaluator.seed(rank_seed) train( - logdir=logdir, policy=policy, rollout_worker=rollout_worker, + policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], 
policy_save_interval=policy_save_interval, save_policies=save_policies) @click.command() -@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on') -@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/') +@click.option('--env', type=str, default='FetchReach-v1', + help='the name of the OpenAI Gym environment that you want to train on') +@click.option('--logdir', type=str, default=None, + help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/') @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run') @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)') -@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code') -@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.') -@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') +@click.option('--seed', type=int, default=0, + help='the random seed used to seed both the environment and the training code') +@click.option('--policy_save_interval', type=int, default=5, + help='the interval with which policy pickles are saved. ' + 'If set to 0, only the best and latest policy will be pickled.') +@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', + help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') @click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped') def main(**kwargs): + """ + run launch for MPI HER DDPG training + + :param kwargs: (dict) the launch kwargs + """ launch(**kwargs) diff --git a/baselines/her/her.py b/baselines/her/her.py index 76f3c346ae..33c13b4eba 100644 --- a/baselines/her/her.py +++ b/baselines/her/her.py @@ -2,14 +2,14 @@ def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): - """Creates a sample function that can be used for HER experience replay. + """ + Creates a sample function that can be used for HER experience replay. - Args: - replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', - regular DDPG experience replay is used - replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times + :param replay_strategy: (str) the HER replay strategy; if set to 'none', regular DDPG experience replay is used + (can be 'future' or 'none'). + :param replay_k: (int) the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times as many HER replays as regular replays are used) - reward_fun (function): function to re-compute the reward with substituted goals + :param reward_fun: (function (dict, dict): float) function to re-compute the reward with substituted goals """ if replay_strategy == 'future': future_p = 1 - (1. 
/ (1 + replay_k)) @@ -19,20 +19,20 @@ def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): def _sample_her_transitions(episode_batch, batch_size_in_transitions): """episode_batch is {key: array(buffer_size x T x dim_key)} """ - T = episode_batch['u'].shape[1] + time_horizon = episode_batch['u'].shape[1] rollout_batch_size = episode_batch['u'].shape[0] batch_size = batch_size_in_transitions # Select which episodes and time steps to use. episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) - t_samples = np.random.randint(T, size=batch_size) + t_samples = np.random.randint(time_horizon, size=batch_size) transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()} # Select future time indexes proportional with probability future_p. These # will be used for HER replay by substituting in future goals. her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) - future_offset = np.random.uniform(size=batch_size) * (T - t_samples) + future_offset = np.random.uniform(size=batch_size) * (time_horizon - t_samples) future_offset = future_offset.astype(int) future_t = (t_samples + 1 + future_offset)[her_indexes] @@ -56,7 +56,7 @@ def _sample_her_transitions(episode_batch, batch_size_in_transitions): transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()} - assert(transitions['u'].shape[0] == batch_size_in_transitions) + assert transitions['u'].shape[0] == batch_size_in_transitions return transitions diff --git a/baselines/her/normalizer.py b/baselines/her/normalizer.py index d2b0588e8b..38e8aeed1b 100644 --- a/baselines/her/normalizer.py +++ b/baselines/her/normalizer.py @@ -9,15 +9,15 @@ class Normalizer: def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): - """A normalizer that ensures that observations are approximately distributed according to + """ + A normalizer that ensures that observations are approximately distributed according to a standard Normal distribution (i.e. have mean zero and variance one). 
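For illustration, a short NumPy sketch (not from the patch itself) of how `replay_k` sets the share of relabelled transitions in `_sample_her_transitions` above: with the 'future' strategy, `future_p = 1 - 1 / (1 + replay_k)`, so roughly `replay_k` relabelled (HER) transitions are drawn for every regular one.

import numpy as np

replay_k = 4                          # 4 HER replays per regular replay
future_p = 1 - (1. / (1 + replay_k))  # 0.8, as computed in the code above

batch_size = 10000
# selection rule matching her_indexes above (a uniform draw below future_p)
her_mask = np.random.uniform(size=batch_size) < future_p
print("relabelled fraction: %.2f" % her_mask.mean())                        # ~0.80
print("HER : regular ratio ~ %.1f : 1" % (her_mask.mean() / (1 - her_mask.mean())))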
- Args: - size (int): the size of the observation to be normalized - eps (float): a small constant that avoids underflows - default_clip_range (float): normalized observations are clipped to be in - [-default_clip_range, default_clip_range] - sess (object): the TensorFlow session to be used + :param size: (int) the size of the observation to be normalized + :param eps: (float) a small constant that avoids underflows + :param default_clip_range: (float) normalized observations are clipped to be in + [-default_clip_range, default_clip_range] + :param sess: (TensorFlow Session) the TensorFlow session to be used """ self.size = size self.eps = eps @@ -61,39 +61,69 @@ def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): ) self.lock = threading.Lock() - def update(self, v): - v = v.reshape(-1, self.size) + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ + arr = arr.reshape(-1, self.size) with self.lock: - self.local_sum += v.sum(axis=0) - self.local_sumsq += (np.square(v)).sum(axis=0) - self.local_count[0] += v.shape[0] + self.local_sum += arr.sum(axis=0) + self.local_sumsq += (np.square(arr)).sum(axis=0) + self.local_count[0] += arr.shape[0] - def normalize(self, v, clip_range=None): + def normalize(self, arr, clip_range=None): + """ + normalize the input + + :param arr: (numpy Number) the input + :param clip_range: (float) the range to clip to [-clip_range, clip_range] + :return: (numpy Number) normalized input + """ if clip_range is None: clip_range = self.default_clip_range - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return tf.clip_by_value((v - mean) / std, -clip_range, clip_range) - - def denormalize(self, v): - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return mean + v * std - - def _mpi_average(self, x): - buf = np.zeros_like(x) - MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) + mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return tf.clip_by_value((arr - mean) / std, -clip_range, clip_range) + + def denormalize(self, arr): + """ + denormalize the input + + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return mean + arr * std + + @classmethod + def _mpi_average(cls, arr): + buf = np.zeros_like(arr) + MPI.COMM_WORLD.Allreduce(arr, buf, op=MPI.SUM) buf /= MPI.COMM_WORLD.Get_size() return buf - def synchronize(self, local_sum, local_sumsq, local_count, root=None): + def synchronize(self, local_sum, local_sumsq, local_count): + """ + syncronize over mpi threads + + :param local_sum: (numpy Number) the sum + :param local_sumsq: (numpy Number) the square root sum + :param local_count: (numpy Number) the number of values updated + :return: (numpy Number, numpy Number, numpy Number) the updated local_sum, local_sumsq, and local_count + """ local_sum[...] = self._mpi_average(local_sum) local_sumsq[...] = self._mpi_average(local_sumsq) local_count[...] = self._mpi_average(local_count) return local_sum, local_sumsq, local_count def recompute_stats(self): + """ + recompute the stats + """ with self.lock: # Copy over results. 
local_count = self.local_count.copy() @@ -120,21 +150,50 @@ def recompute_stats(self): class IdentityNormalizer: def __init__(self, size, std=1.): + """ + Normalizer that returns the input unchanged + + :param size: (int or [int]) the shape of the input to normalize + :param std: (float) the initial standard deviation or the normalization + """ self.size = size self.mean = tf.zeros(self.size, tf.float32) self.std = std * tf.ones(self.size, tf.float32) - def update(self, x): + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ pass - def normalize(self, x, clip_range=None): - return x / self.std + def normalize(self, arr, **_kwargs): + """ + normalize the input + + :param arr: (numpy Number) the input + :return: (numpy Number) normalized input + """ + return arr / self.std + + def denormalize(self, arr): + """ + denormalize the input - def denormalize(self, x): - return self.std * x + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + return self.std * arr def synchronize(self): + """ + syncronize over mpi threads + """ pass def recompute_stats(self): + """ + recompute the stats + """ pass diff --git a/baselines/her/replay_buffer.py b/baselines/her/replay_buffer.py index b0005523fd..c46de90b0c 100644 --- a/baselines/her/replay_buffer.py +++ b/baselines/her/replay_buffer.py @@ -4,19 +4,18 @@ class ReplayBuffer: - def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions): - """Creates a replay buffer. - - Args: - buffer_shapes (dict of ints): the shape for all buffers that are used in the replay - buffer - size_in_transitions (int): the size of the buffer, measured in transitions - T (int): the time horizon for episodes - sample_transitions (function): a function that samples from the replay buffer + def __init__(self, buffer_shapes, size_in_transitions, time_horizon, sample_transitions): + """ + Creates a replay buffer. + + :param buffer_shapes: ({str: int}) the shape for all buffers that are used in the replay buffer + :param size_in_transitions: (int) the size of the buffer, measured in transitions + :param time_horizon: (int) the time horizon for episodes + :param sample_transitions: (function) a function that samples from the replay buffer """ self.buffer_shapes = buffer_shapes - self.size = size_in_transitions // T - self.T = T + self.size = size_in_transitions // time_horizon + self.time_horizon = time_horizon self.sample_transitions = sample_transitions # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} @@ -35,7 +34,11 @@ def full(self): return self.current_size == self.size def sample(self, batch_size): - """Returns a dict {key: array(batch_size x shapes[key])} + """ + sample random transitions + + :param batch_size: (int) How many transitions to sample. 
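A small worked example, with hypothetical sizes, of the episode-based capacity used by the `ReplayBuffer` constructor above: the buffer stores whole episodes, so it holds `size_in_transitions // time_horizon` of them.

size_in_transitions = 1000000   # total transitions the buffer may hold (hypothetical)
time_horizon = 50               # steps per episode (params['time_horizon'])
episode_capacity = size_in_transitions // time_horizon
assert episode_capacity == 20000
# storing a batch of 2 episodes advances n_transitions_stored by 2 * time_horizon
assert 2 * time_horizon == 100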
+ :return: (dict) {key: array(batch_size x shapes[key])} """ buffers = {} @@ -55,7 +58,10 @@ def sample(self, batch_size): return transitions def store_episode(self, episode_batch): - """episode_batch: array(batch_size x (T or T+1) x dim_key) + """ + Store an episode in the replay buffer + + :param episode_batch: (numpy Number) batch_size x (T or T+1) x dim_key """ batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] assert np.all(np.array(batch_sizes) == batch_sizes[0]) @@ -68,30 +74,48 @@ def store_episode(self, episode_batch): for key in self.buffers.keys(): self.buffers[key][idxs] = episode_batch[key] - self.n_transitions_stored += batch_size * self.T + self.n_transitions_stored += batch_size * self.time_horizon def get_current_episode_size(self): + """ + get current episode size + + :return: (int) the current size of the episode + """ with self.lock: return self.current_size def get_current_size(self): + """ + get current size of the buffer + + :return: (int) the current size of the buffer + """ with self.lock: - return self.current_size * self.T + return self.current_size * self.time_horizon def get_transitions_stored(self): + """ + get the number of stored transitions + + :return: (int) the number of transitions stored + """ with self.lock: return self.n_transitions_stored def clear_buffer(self): + """ + clear the buffer of all entries + """ with self.lock: self.current_size = 0 def _get_storage_idx(self, inc=None): - inc = inc or 1 # size increment + inc = inc or 1 # size increment assert inc <= self.size, "Batch committed to replay is too large!" # go consecutively until you hit the end, and then go randomly. - if self.current_size+inc <= self.size: - idx = np.arange(self.current_size, self.current_size+inc) + if self.current_size + inc <= self.size: + idx = np.arange(self.current_size, self.current_size + inc) elif self.current_size < self.size: overflow = inc - (self.size - self.current_size) idx_a = np.arange(self.current_size, self.size) @@ -101,7 +125,7 @@ def _get_storage_idx(self, inc=None): idx = np.random.randint(0, self.size, inc) # update replay size - self.current_size = min(self.size, self.current_size+inc) + self.current_size = min(self.size, self.current_size + inc) if inc == 1: idx = idx[0] diff --git a/baselines/her/rollout.py b/baselines/her/rollout.py index 5beba69dd7..e2368e4436 100644 --- a/baselines/her/rollout.py +++ b/baselines/her/rollout.py @@ -1,118 +1,138 @@ from collections import deque +import pickle import numpy as np -import pickle from mujoco_py import MujocoException -from baselines.her.util import convert_episode_to_batch_major, store_args +from baselines.her.util import convert_episode_to_batch_major class RolloutWorker: - - @store_args - def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1, - exploit=False, use_target_net=False, compute_Q=False, noise_eps=0, - random_eps=0, history_len=100, render=False, **kwargs): - """Rollout worker generates experience by interacting with one or many environments. - - Args: - make_env (function): a factory function that creates a new instance of the environment - when called - policy (object): the policy that is used to act - dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u) - logger (object): the logger that is used by the rollout worker - rollout_batch_size (int): the number of parallel rollouts that should be used - exploit (boolean): whether or not to exploit, i.e. 
to act optimally according to the - current policy without any exploration - use_target_net (boolean): whether or not to use the target net for rollouts - compute_Q (boolean): whether or not to compute the Q values alongside the actions - noise_eps (float): scale of the additive Gaussian noise - random_eps (float): probability of selecting a completely random action - history_len (int): length of history for statistics smoothing - render (boolean): whether or not to render the rollouts + def __init__(self, make_env, policy, dims, logger, time_horizon, rollout_batch_size=1, + exploit=False, use_target_net=False, compute_q=False, noise_eps=0, + random_eps=0, history_len=100, render=False): """ + Rollout worker generates experience by interacting with one or many environments. + + :param make_env: (function (): Gym Environment) a factory function that creates a new instance of the + environment when called + :param policy: (Object) the policy that is used to act + :param dims: ({str: int}) the dimensions for observations (o), goals (g), and actions (u) + :param logger: (Object) the logger that is used by the rollout worker + :param rollout_batch_size: (int) the number of parallel rollouts that should be used + :param exploit: (bool) whether or not to exploit, i.e. to act optimally according to the current policy without + any exploration + :param use_target_net: (bool) whether or not to use the target net for rollouts + :param compute_q: (bool) whether or not to compute the Q values alongside the actions + :param noise_eps: (float) scale of the additive Gaussian noise + :param random_eps: (float) probability of selecting a completely random action + :param history_len: (int) length of history for statistics smoothing + :param render: (boolean) whether or not to render the rollouts + """ + self.make_env = make_env + self.policy = policy + self.dims = dims + self.logger = logger + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.exploit = exploit + self.use_target_net = use_target_net + self.compute_q = compute_q + self.noise_eps = noise_eps + self.random_eps = random_eps + self.history_len = history_len + self.render = render + self.envs = [make_env() for _ in range(rollout_batch_size)] - assert self.T > 0 + assert self.time_horizon > 0 self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] self.success_history = deque(maxlen=history_len) - self.Q_history = deque(maxlen=history_len) + self.q_history = deque(maxlen=history_len) self.n_episodes = 0 - self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals - self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + self.goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals + self.initial_obs = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals self.reset_all_rollouts() self.clear_history() - def reset_rollout(self, i): - """Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` - and `g` arrays accordingly. + def reset_rollout(self, index): """ - obs = self.envs[i].reset() - self.initial_o[i] = obs['observation'] - self.initial_ag[i] = obs['achieved_goal'] - self.g[i] = obs['desired_goal'] + Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` and `g` arrays + accordingly. 
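A hedged construction sketch for the renamed worker API (`compute_q` instead of `compute_Q`, `time_horizon` instead of `T`); here `make_env`, `policy` and `dims` are placeholders for the objects that `experiment/train.py` builds via `config.configure_dims` and `config.configure_ddpg`.

from baselines import logger
from baselines.her.rollout import RolloutWorker

eval_params = {
    'exploit': True,            # act greedily, no exploration noise
    'use_target_net': True,
    'compute_q': True,          # previously compute_Q
    'time_horizon': 50,         # previously T
    'rollout_batch_size': 1,
}
# make_env, policy and dims are assumed to exist (see experiment/train.py)
evaluator = RolloutWorker(make_env, policy, dims, logger, **eval_params)
evaluator.seed(0)
episode = evaluator.generate_rollouts()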
+ + :param index: (int) the index to reset + """ + obs = self.envs[index].reset() + self.initial_obs[index] = obs['observation'] + self.initial_ag[index] = obs['achieved_goal'] + self.goals[index] = obs['desired_goal'] def reset_all_rollouts(self): - """Resets all `rollout_batch_size` rollout workers. """ - for i in range(self.rollout_batch_size): - self.reset_rollout(i) + Resets all `rollout_batch_size` rollout workers. + """ + for step in range(self.rollout_batch_size): + self.reset_rollout(step) def generate_rollouts(self): - """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current + """ + Performs `rollout_batch_size` rollouts in parallel for time horizon with the current policy acting on it accordingly. + + :return: (dict) batch """ self.reset_all_rollouts() # compute observations - o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations - ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals - o[:] = self.initial_o - ag[:] = self.initial_ag + observations = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + achieved_goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals + observations[:] = self.initial_obs + achieved_goals[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] - info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] - Qs = [] - for t in range(self.T): + info_values = [np.empty((self.time_horizon, self.rollout_batch_size, self.dims['info_' + key]), np.float32) + for key in self.info_keys] + q_values = [] + for step in range(self.time_horizon): policy_output = self.policy.get_actions( - o, ag, self.g, - compute_Q=self.compute_Q, + observations, achieved_goals, self.goals, + compute_q=self.compute_q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) - if self.compute_Q: - u, Q = policy_output - Qs.append(Q) + if self.compute_q: + action, q_value = policy_output + q_values.append(q_value) else: - u = policy_output + action = policy_output - if u.ndim == 1: + if action.ndim == 1: # The non-batched case should still have a reasonable shape. - u = u.reshape(1, -1) + action = action.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations - for i in range(self.rollout_batch_size): + for batch_idx in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. 
- curr_o_new, _, _, info = self.envs[i].step(u[i]) + curr_o_new, _, _, info = self.envs[batch_idx].step(action[batch_idx]) if 'is_success' in info: - success[i] = info['is_success'] - o_new[i] = curr_o_new['observation'] - ag_new[i] = curr_o_new['achieved_goal'] + success[batch_idx] = info['is_success'] + o_new[batch_idx] = curr_o_new['observation'] + ag_new[batch_idx] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): - info_values[idx][t, i] = info[key] + info_values[idx][step, batch_idx] = info[key] if self.render: - self.envs[i].render() - except MujocoException as e: + self.envs[batch_idx].render() + except MujocoException: return self.generate_rollouts() if np.isnan(o_new).any(): @@ -120,16 +140,16 @@ def generate_rollouts(self): self.reset_all_rollouts() return self.generate_rollouts() - obs.append(o.copy()) - achieved_goals.append(ag.copy()) + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) successes.append(success.copy()) - acts.append(u.copy()) - goals.append(self.g.copy()) - o[...] = o_new - ag[...] = ag_new - obs.append(o.copy()) - achieved_goals.append(ag.copy()) - self.initial_o[:] = o + acts.append(action.copy()) + goals.append(self.goals.copy()) + observations[...] = o_new + achieved_goals[...] = ag_new + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) + self.initial_obs[:] = observations episode = dict(o=obs, u=acts, @@ -143,37 +163,54 @@ def generate_rollouts(self): assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) - if self.compute_Q: - self.Q_history.append(np.mean(Qs)) + + if self.compute_q: + self.q_history.append(np.mean(q_values)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode) def clear_history(self): - """Clears all histories that are used for statistics + """ + Clears all histories that are used for statistics """ self.success_history.clear() - self.Q_history.clear() + self.q_history.clear() def current_success_rate(self): + """ + returns the current success rate + :return: (float) the success rate + """ return np.mean(self.success_history) - def current_mean_Q(self): - return np.mean(self.Q_history) + def current_mean_q(self): + """ + returns the current mean Q value + :return: (float) the mean Q value + """ + return np.mean(self.q_history) def save_policy(self, path): - """Pickles the current policy for later inspection. """ - with open(path, 'wb') as f: - pickle.dump(self.policy, f) + Pickles the current policy for later inspection. + + :param path: (str) the save location + """ + with open(path, 'wb') as file_handler: + pickle.dump(self.policy, file_handler) def logs(self, prefix='worker'): - """Generates a dictionary that contains all collected statistics. + """ + Generates a dictionary that contains all collected statistics. + + :param prefix: (str) the prefix for the name in logging + :return: ([(str, float)]) the logging information """ logs = [] logs += [('success_rate', np.mean(self.success_history))] - if self.compute_Q: - logs += [('mean_Q', np.mean(self.Q_history))] + if self.compute_q: + logs += [('mean_q', np.mean(self.q_history))] logs += [('episode', self.n_episodes)] if prefix is not '' and not prefix.endswith('/'): @@ -182,7 +219,10 @@ def logs(self, prefix='worker'): return logs def seed(self, seed): - """Seeds each environment with a distinct seed derived from the passed in global seed. 
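A sketch of the pickle round trip behind `save_policy` above and the loading done in `experiment/play.py`; `evaluator` is assumed to be an existing `RolloutWorker` and the path is hypothetical.

import pickle

policy_path = '/tmp/policy_best.pkl'   # hypothetical location
evaluator.save_policy(policy_path)     # pickles evaluator.policy

with open(policy_path, 'rb') as file_handler:
    restored_policy = pickle.load(file_handler)
# experiment/play.py then reads restored_policy.info['env_name'] to rebuild the env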
+ """ + Seeds each environment with a distinct seed derived from the passed in global seed. + + :param seed: (int) the random seed """ for idx, env in enumerate(self.envs): env.seed(seed + 1000 * idx) diff --git a/baselines/her/util.py b/baselines/her/util.py index d637aa69f1..784329b03c 100644 --- a/baselines/her/util.py +++ b/baselines/her/util.py @@ -2,79 +2,69 @@ import subprocess import sys import importlib -import inspect -import functools import tensorflow as tf import numpy as np +from mpi4py import MPI -from baselines.common import tf_util as U +from baselines.common import tf_util -def store_args(method): - """Stores provided method args as instance attributes. +def import_function(spec): """ - argspec = inspect.getfullargspec(method) - defaults = {} - if argspec.defaults is not None: - defaults = dict( - zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) - if argspec.kwonlydefaults is not None: - defaults.update(argspec.kwonlydefaults) - arg_names = argspec.args[1:] - - @functools.wraps(method) - def wrapper(*positional_args, **keyword_args): - self = positional_args[0] - # Get default arg values - args = defaults.copy() - # Add provided arg values - for name, value in zip(arg_names, positional_args[1:]): - args[name] = value - args.update(keyword_args) - self.__dict__.update(args) - return method(*positional_args, **keyword_args) - - return wrapper + Import a function identified by a string like "pkg.module:fn_name". - -def import_function(spec): - """Import a function identified by a string like "pkg.module:fn_name". + :param spec: (str) the function to import + :return: (function) """ mod_name, fn_name = spec.split(':') module = importlib.import_module(mod_name) - fn = getattr(module, fn_name) - return fn + func = getattr(module, fn_name) + return func def flatten_grads(var_list, grads): - """Flattens a variables and their gradients. """ - return tf.concat([tf.reshape(grad, [U.numel(v)]) + Flattens a variables and their gradients. 
+ + :param var_list: ([TensorFlow Tensor]) the variables + :param grads: ([TensorFlow Tensor]) the gradients + :return: (TensorFlow Tensor) the flattend variable and gradient + """ + return tf.concat([tf.reshape(grad, [tf_util.numel(v)]) for (v, grad) in zip(var_list, grads)], 0) -def nn(input, layers_sizes, reuse=None, flatten=False, name=""): - """Creates a simple neural network +def mlp(_input, layers_sizes, reuse=None, flatten=False, name=""): + """ + Creates a simple fully-connected neural network + + :param _input: (TensorFlow Tensor) the input + :param layers_sizes: ([int]) the hidden layers + :param reuse: (bool) Enable reuse of the network + :param flatten: (bool) flatten the network output + :param name: (str) the name of the network + :return: (TensorFlow Tensor) the network """ for i, size in enumerate(layers_sizes): activation = tf.nn.relu if i < len(layers_sizes) - 1 else None - input = tf.layers.dense(inputs=input, - units=size, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - reuse=reuse, - name=name + '_' + str(i)) + _input = tf.layers.dense(inputs=_input, + units=size, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + reuse=reuse, + name=name + '_' + str(i)) if activation: - input = activation(input) + _input = activation(_input) if flatten: assert layers_sizes[-1] == 1 - input = tf.reshape(input, [-1]) - return input + _input = tf.reshape(_input, [-1]) + return _input def install_mpi_excepthook(): - import sys - from mpi4py import MPI + """ + setup the MPI exception hooks + """ old_hook = sys.excepthook def new_hook(a, b, c): @@ -82,14 +72,23 @@ def new_hook(a, b, c): sys.stdout.flush() sys.stderr.flush() MPI.COMM_WORLD.Abort() + sys.excepthook = new_hook -def mpi_fork(n, extra_mpi_args=[]): - """Re-launches the current script with workers +def mpi_fork(rank, extra_mpi_args=None): + """ + Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children + + :param rank: (int) the thread rank + :param extra_mpi_args: (dict) extra arguments for MPI + :return: (str) the correct type of thread name """ - if n <= 1: + if extra_mpi_args is None: + extra_mpi_args = [] + + if rank <= 1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() @@ -99,9 +98,9 @@ def mpi_fork(n, extra_mpi_args=[]): IN_MPI="1" ) # "-bind-to core" is crucial for good performance - args = ["mpirun", "-np", str(n)] + \ - extra_mpi_args + \ - [sys.executable] + args = ["mpirun", "-np", str(rank)] + \ + extra_mpi_args + \ + [sys.executable] args += sys.argv subprocess.check_call(args, env=env) @@ -112,8 +111,11 @@ def mpi_fork(n, extra_mpi_args=[]): def convert_episode_to_batch_major(episode): - """Converts an episode to have the batch dimension in the major (first) - dimension. + """ + Converts an episode to have the batch dimension in the major (first) dimension. + + :param episode: (dict) the episode batch + :return: (dict) the episode batch with he batch dimension in the major (first) dimension. """ episode_batch = {} for key in episode.keys(): @@ -125,15 +127,23 @@ def convert_episode_to_batch_major(episode): def transitions_in_episode_batch(episode_batch): - """Number of transitions in a given episode batch. + """ + Number of transitions in a given episode batch. 
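A short graph-construction sketch for the renamed `mlp` helper (formerly `nn`), assuming TensorFlow 1.x as pinned in this repository; the input size is arbitrary.

import tensorflow as tf
from baselines.her.util import mlp

obs_ph = tf.placeholder(tf.float32, shape=[None, 10])            # hypothetical input size
# two ReLU hidden layers and a linear scalar output, flattened to shape [batch]
value = mlp(obs_ph, [64, 64, 1], flatten=True, name='critic')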
+ + :param episode_batch: (dict) the episode batch + :return: (int) the number of transitions in episode batch """ shape = episode_batch['u'].shape return shape[0] * shape[1] def reshape_for_broadcasting(source, target): - """Reshapes a tensor (source) to have the correct shape and dtype of the target - before broadcasting it with MPI. + """ + Reshapes a tensor (source) to have the correct shape and dtype of the target before broadcasting it with MPI. + + :param source: (TensorFlow Tensor) the input tensor + :param target: (TensorFlow Tensor) the target tensor + :return: (TensorFlow Tensor) the rehshaped tensor """ dim = len(target.get_shape()) shape = ([1] * (dim - 1)) + [-1] diff --git a/baselines/logger.py b/baselines/logger.py index 0abad0e8c5..eccd89d550 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -1,7 +1,6 @@ import os import sys import shutil -import os.path as osp import json import time import datetime @@ -15,21 +14,45 @@ DISABLED = 50 + class KVWriter(object): + """ + Key Value writer + """ def writekvs(self, kvs): + """ + write a dictionary to file + + :param kvs: (dict) + """ raise NotImplementedError + class SeqWriter(object): + """ + sequence writer + """ def writeseq(self, seq): + """ + write an array to file + + :param seq: (list) + """ raise NotImplementedError + class HumanOutputFormat(KVWriter, SeqWriter): def __init__(self, filename_or_file): + """ + log to a file, in a human readable format + + :param filename_or_file: (str or File) the file to write the log to + """ if isinstance(filename_or_file, str): self.file = open(filename_or_file, 'wt') self.own_file = True else: - assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file + assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s' % filename_or_file self.file = filename_or_file self.own_file = False @@ -67,39 +90,58 @@ def writekvs(self, kvs): # Flush the output to the file self.file.flush() - def _truncate(self, s): - return s[:20] + '...' if len(s) > 23 else s + @classmethod + def _truncate(cls, string): + return string[:20] + '...' 
if len(string) > 23 else string def writeseq(self, seq): seq = list(seq) for (i, elem) in enumerate(seq): self.file.write(elem) - if i < len(seq) - 1: # add space unless this is the last one + if i < len(seq) - 1: # add space unless this is the last one self.file.write(' ') self.file.write('\n') self.file.flush() def close(self): + """ + closes the file + """ if self.own_file: self.file.close() + class JSONOutputFormat(KVWriter): def __init__(self, filename): + """ + log to a file, in the JSON format + + :param filename: (str) the file to write the log to + """ self.file = open(filename, 'wt') def writekvs(self, kvs): - for k, v in sorted(kvs.items()): - if hasattr(v, 'dtype'): - v = v.tolist() - kvs[k] = float(v) + for key, value in sorted(kvs.items()): + if hasattr(value, 'dtype'): + value = value.tolist() + kvs[key] = float(value) self.file.write(json.dumps(kvs) + '\n') self.file.flush() def close(self): + """ + closes the file + """ self.file.close() + class CSVOutputFormat(KVWriter): def __init__(self, filename): + """ + log to a file, in a CSV format + + :param filename: (str) the file to write the log to + """ self.file = open(filename, 'w+t') self.keys = [] self.sep = ',' @@ -112,77 +154,96 @@ def writekvs(self, kvs): self.file.seek(0) lines = self.file.readlines() self.file.seek(0) - for (i, k) in enumerate(self.keys): + for (i, key) in enumerate(self.keys): if i > 0: self.file.write(',') - self.file.write(k) + self.file.write(key) self.file.write('\n') for line in lines[1:]: self.file.write(line[:-1]) self.file.write(self.sep * len(extra_keys)) self.file.write('\n') - for (i, k) in enumerate(self.keys): + for i, key in enumerate(self.keys): if i > 0: self.file.write(',') - v = kvs.get(k) - if v is not None: - self.file.write(str(v)) + value = kvs.get(key) + if value is not None: + self.file.write(str(value)) self.file.write('\n') self.file.flush() def close(self): + """ + closes the file + """ self.file.close() class TensorBoardOutputFormat(KVWriter): - """ - Dumps key/value pairs into TensorBoard's numeric format. - """ - def __init__(self, dir): - os.makedirs(dir, exist_ok=True) - self.dir = dir + def __init__(self, folder): + """ + Dumps key/value pairs into TensorBoard's numeric format. + + :param folder: (str) the folder to write the log to + """ + os.makedirs(folder, exist_ok=True) + self.dir = folder self.step = 1 prefix = 'events' - path = osp.join(osp.abspath(dir), prefix) + path = os.path.join(os.path.abspath(folder), prefix) import tensorflow as tf from tensorflow.python import pywrap_tensorflow from tensorflow.core.util import event_pb2 from tensorflow.python.util import compat - self.tf = tf + self._tf = tf self.event_pb2 = event_pb2 self.pywrap_tensorflow = pywrap_tensorflow self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) def writekvs(self, kvs): - def summary_val(k, v): - kwargs = {'tag': k, 'simple_value': float(v)} - return self.tf.Summary.Value(**kwargs) - summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + def summary_val(key, value): + kwargs = {'tag': key, 'simple_value': float(value)} + return self._tf.Summary.Value(**kwargs) + + summary = self._tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) event = self.event_pb2.Event(wall_time=time.time(), summary=summary) - event.step = self.step # is there any reason why you'd want to specify the step? + event.step = self.step # is there any reason why you'd want to specify the step? 
self.writer.WriteEvent(event) self.writer.Flush() self.step += 1 def close(self): + """ + closes the file + """ if self.writer: self.writer.Close() self.writer = None -def make_output_format(format, ev_dir, log_suffix=''): + +def make_output_format(_format, ev_dir, log_suffix=''): + """ + return a logger for the requested format + + :param _format: (str) the requested format to log to ('stdout', 'log', 'json', 'csv' or 'tensorboard') + :param ev_dir: (str) the logging directory + :param log_suffix: (str) the suffix for the log file + :return: (KVWrite) the logger + """ os.makedirs(ev_dir, exist_ok=True) - if format == 'stdout': + if _format == 'stdout': return HumanOutputFormat(sys.stdout) - elif format == 'log': - return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) - elif format == 'json': - return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) - elif format == 'csv': - return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) - elif format == 'tensorboard': - return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) + elif _format == 'log': + return HumanOutputFormat(os.path.join(ev_dir, 'log%s.txt' % log_suffix)) + elif _format == 'json': + return JSONOutputFormat(os.path.join(ev_dir, 'progress%s.json' % log_suffix)) + elif _format == 'csv': + return CSVOutputFormat(os.path.join(ev_dir, 'progress%s.csv' % log_suffix)) + elif _format == 'tensorboard': + return TensorBoardOutputFormat(os.path.join(ev_dir, 'tb%s' % log_suffix)) else: - raise ValueError('Unknown format specified: %s' % (format,)) + raise ValueError('Unknown format specified: %s' % (_format,)) + # ================================================================ # API @@ -193,94 +254,164 @@ def logkv(key, val): Log a value of some diagnostic Call this once for each diagnostic quantity, each iteration If called many times, last value will be used. + + :param key: (Any) save to log this key + :param val: (Any) save to log this value """ Logger.CURRENT.logkv(key, val) + def logkv_mean(key, val): """ The same as logkv(), but if called many times, values averaged. + + :param key: (Any) save to log this key + :param val: (Number) save to log this value """ Logger.CURRENT.logkv_mean(key, val) -def logkvs(d): + +def logkvs(key_values): """ Log a dictionary of key-value pairs + + :param key_values: (dict) the list of keys and values to save to log """ - for (k, v) in d.items(): - logkv(k, v) + for key, value in key_values.items(): + logkv(key, value) + def dumpkvs(): """ Write all of the diagnostics from the current iteration - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. """ Logger.CURRENT.dumpkvs() + def getkvs(): + """ + get the key values logs + + :return: (dict) the logged values + """ return Logger.CURRENT.name2val def log(*args, level=INFO): """ - Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + + level: int. (see logger.py docs) If the global logger level is higher than + the level argument here, don't print to stdout. 
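A minimal usage sketch of the logger API refactored above; the folder and format choice are hypothetical, and `dumpkvs` writes one row per call to each configured output.

from baselines import logger

logger.configure(folder='/tmp/her-demo', format_strs=['stdout', 'csv'])
logger.logkv('epoch', 1)
logger.logkv_mean('test/success_rate', 0.75)
logger.dumpkvs()   # prints a table and appends a row to /tmp/her-demo/progress.csv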
+ + :param args: (list) log the arguments + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) """ Logger.CURRENT.log(*args, level=level) + def debug(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the DEBUG level. + + :param args: (list) log the arguments + """ log(*args, level=DEBUG) + def info(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the INFO level. + + :param args: (list) log the arguments + """ log(*args, level=INFO) + def warn(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the WARN level. + + :param args: (list) log the arguments + """ log(*args, level=WARN) + def error(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the ERROR level. + + :param args: (list) log the arguments + """ log(*args, level=ERROR) def set_level(level): """ Set logging threshold on current logger. + + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) """ Logger.CURRENT.set_level(level) + def get_dir(): """ Get directory that log files are being written to. will be None if there is no output directory (i.e., if you didn't call start) + + :return: (str) the logging directory """ return Logger.CURRENT.get_dir() + record_tabular = logkv dump_tabular = dumpkvs + class ProfileKV: - """ - Usage: - with logger.ProfileKV("interesting_scope"): - code - """ - def __init__(self, n): - self.n = "wait_" + n + def __init__(self, name): + """ + Usage: + with logger.ProfileKV("interesting_scope"): + code + + :param name: (str) the profiling name + """ + self.name = "wait_" + name + def __enter__(self): - self.t1 = time.time() - def __exit__(self ,type, value, traceback): - Logger.CURRENT.name2val[self.n] += time.time() - self.t1 + self.start_time = time.time() + + def __exit__(self, _type, value, traceback): + Logger.CURRENT.name2val[self.name] += time.time() - self.start_time + -def profile(n): +def profile(name): """ Usage: @profile("my_func") def my_func(): code + + :param name: (str) the profiling name + :return: (function) the wrapped function """ def decorator_with_name(func): def func_wrapper(*args, **kwargs): - with ProfileKV(n): + with ProfileKV(name): return func(*args, **kwargs) + return func_wrapper + return decorator_with_name @@ -289,32 +420,57 @@ def func_wrapper(*args, **kwargs): # ================================================================ class Logger(object): - DEFAULT = None # A logger with no output files. (See right below class definition) - # So that you can still log to the terminal without setting up any output files + # A logger with no output files. 
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + DEFAULT = None CURRENT = None # Current logger being used by the free functions above - def __init__(self, dir, output_formats): + def __init__(self, folder, output_formats): + """ + the logger class + + :param folder: (str) the logging location + :param output_formats: ([str]) the list of output format + """ self.name2val = defaultdict(float) # values this iteration self.name2cnt = defaultdict(int) self.level = INFO - self.dir = dir + self.dir = folder self.output_formats = output_formats # Logging API, forwarded # ---------------------------------------- def logkv(self, key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used. + + :param key: (Any) save to log this key + :param val: (Any) save to log this value + """ self.name2val[key] = val def logkv_mean(self, key, val): + """ + The same as logkv(), but if called many times, values averaged. + + :param key: (Any) save to log this key + :param val: (Number) save to log this value + """ if val is None: self.name2val[key] = None return oldval, cnt = self.name2val[key], self.name2cnt[key] - self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) self.name2cnt[key] = cnt + 1 def dumpkvs(self): - if self.level == DISABLED: return + """ + Write all of the diagnostics from the current iteration + """ + if self.level == DISABLED: + return for fmt in self.output_formats: if isinstance(fmt, KVWriter): fmt.writekvs(self.name2val) @@ -322,38 +478,75 @@ def dumpkvs(self): self.name2cnt.clear() def log(self, *args, level=INFO): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + + level: int. (see logger.py docs) If the global logger level is higher than + the level argument here, don't print to stdout. + + :param args: (list) log the arguments + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ if self.level <= level: self._do_log(args) # Configuration # ---------------------------------------- def set_level(self, level): + """ + Set logging threshold on current logger. + + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ self.level = level def get_dir(self): + """ + Get directory that log files are being written to. 
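A worked example of the running average computed by `logkv_mean` above, starting from a key that has not been logged yet.

from baselines import logger

logger.logkv_mean('b', 100.0)   # count 0 -> 100.0
logger.logkv_mean('b', -22.0)   # (100.0 * 1 - 22.0) / 2 = 39.0
logger.logkv_mean('b', 66.0)    # (39.0 * 2 + 66.0) / 3 = 48.0
assert logger.getkvs()['b'] == 48.0
logger.dumpkvs()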
+ will be None if there is no output directory (i.e., if you didn't call start) + + :return: (str) the logging directory + """ return self.dir def close(self): + """ + closes the file + """ for fmt in self.output_formats: fmt.close() # Misc # ---------------------------------------- def _do_log(self, args): + """ + log to the requested format outputs + + :param args: (list) the arguments to log + """ for fmt in self.output_formats: if isinstance(fmt, SeqWriter): fmt.writeseq(map(str, args)) -Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) -def configure(dir=None, format_strs=None): - if dir is None: - dir = os.getenv('OPENAI_LOGDIR') - if dir is None: - dir = osp.join(tempfile.gettempdir(), - datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) - assert isinstance(dir, str) - os.makedirs(dir, exist_ok=True) +Logger.DEFAULT = Logger.CURRENT = Logger(folder=None, output_formats=[HumanOutputFormat(sys.stdout)]) + + +def configure(folder=None, format_strs=None): + """ + configure the current logger + + :param folder: (str) the save location (if None, $OPENAI_LOGDIR, if still None, tempdir/openai-[date & time]) + :param format_strs: (list) the output logging format + (if None, $OPENAI_LOG_FORMAT, if still None, ['stdout', 'log', 'csv']) + """ + if folder is None: + folder = os.getenv('OPENAI_LOGDIR') + if folder is None: + folder = os.path.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) + assert isinstance(folder, str) + os.makedirs(folder, exist_ok=True) log_suffix = '' from mpi4py import MPI @@ -367,40 +560,61 @@ def configure(dir=None, format_strs=None): else: format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') format_strs = filter(None, format_strs) - output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] + output_formats = [make_output_format(f, folder, log_suffix) for f in format_strs] + + Logger.CURRENT = Logger(folder=folder, output_formats=output_formats) + log('Logging to %s' % folder) - Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) - log('Logging to %s'%dir) def reset(): + """ + reset the current logger + """ if Logger.CURRENT is not Logger.DEFAULT: Logger.CURRENT.close() Logger.CURRENT = Logger.DEFAULT log('Reset logger') -class scoped_configure(object): - def __init__(self, dir=None, format_strs=None): - self.dir = dir + +class ScopedConfigure(object): + def __init__(self, folder=None, format_strs=None): + """ + Class for using context manager while logging + + usage: + with ScopedConfigure(folder=None, format_strs=None): + {code} + + :param folder: (str) the logging folder + :param format_strs: ([str]) the list of output logging format + """ + self.dir = folder self.format_strs = format_strs self.prevlogger = None + def __enter__(self): self.prevlogger = Logger.CURRENT - configure(dir=self.dir, format_strs=self.format_strs) + configure(folder=self.dir, format_strs=self.format_strs) + def __exit__(self, *args): Logger.CURRENT.close() Logger.CURRENT = self.prevlogger + # ================================================================ def _demo(): + """ + tests for the logger module + """ info("hi") debug("shouldn't appear") set_level(DEBUG) debug("should appear") - dir = "/tmp/testlogging" - if os.path.exists(dir): - shutil.rmtree(dir) - configure(dir=dir) + folder = "/tmp/testlogging" + if os.path.exists(folder): + shutil.rmtree(folder) + configure(folder=folder) logkv("a", 3) logkv("b", 2.5) dumpkvs() @@ -412,13 
+626,19 @@ def _demo(): logkv_mean("b", -44.4) logkv("a", 5.5) dumpkvs() - info("^^^ should see b = 33.3") + with ScopedConfigure(None, None): + info("^^^ should see b = 33.3") - logkv("b", -2.5) - dumpkvs() + with ScopedConfigure("/tmp/test-logger/", ["json"]): + logkv("b", -2.5) + dumpkvs() + reset() logkv("a", "longasslongasslongasslongasslongasslongassvalue") dumpkvs() + warn("hey") + error("oh") + logkvs({"test": 1}) # ================================================================ @@ -426,50 +646,67 @@ def _demo(): # ================================================================ def read_json(fname): + """ + read a json file using pandas + + :param fname: (str) the file path to read + :return: (pandas DataFrame) the data in the json + """ import pandas - ds = [] - with open(fname, 'rt') as fh: - for line in fh: - ds.append(json.loads(line)) - return pandas.DataFrame(ds) + data = [] + with open(fname, 'rt') as file_handler: + for line in file_handler: + data.append(json.loads(line)) + return pandas.DataFrame(data) + def read_csv(fname): + """ + read a csv file using pandas + + :param fname: (str) the file path to read + :return: (pandas DataFrame) the data in the csv + """ import pandas return pandas.read_csv(fname, index_col=None, comment='#') + def read_tb(path): """ - path : a tensorboard file OR a directory, where we will find all TB files - of the form events.* + read a tensorboard output + + :param path: (str) a tensorboard file OR a directory, where we will find all TB files of the form events. + :return: (pandas DataFrame) the tensorboad data """ import pandas import numpy as np from glob import glob - from collections import defaultdict + # from collections import defaultdict import tensorflow as tf - if osp.isdir(path): - fnames = glob(osp.join(path, "events.*")) - elif osp.basename(path).startswith("events."): + if os.path.isdir(path): + fnames = glob(os.path.join(path, "events.*")) + elif os.path.basename(path).startswith("events."): fnames = [path] else: - raise NotImplementedError("Expected tensorboard file or directory containing them. Got %s"%path) + raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s" % path) tag2pairs = defaultdict(list) maxstep = 0 for fname in fnames: for summary in tf.train.summary_iterator(fname): if summary.step > 0: - for v in summary.summary.value: - pair = (summary.step, v.simple_value) - tag2pairs[v.tag].append(pair) + for value in summary.summary.value: + pair = (summary.step, value.simple_value) + tag2pairs[value.tag].append(pair) maxstep = max(summary.step, maxstep) data = np.empty((maxstep, len(tag2pairs))) data[:] = np.nan tags = sorted(tag2pairs.keys()) - for (colidx,tag) in enumerate(tags): + for (colidx, tag) in enumerate(tags): pairs = tag2pairs[tag] for (step, value) in pairs: - data[step-1, colidx] = value + data[step - 1, colidx] = value return pandas.DataFrame(data, columns=tags) + if __name__ == "__main__": _demo() diff --git a/baselines/ppo1/cnn_policy.py b/baselines/ppo1/cnn_policy.py index 6aec8c0e97..f2498f4fdf 100644 --- a/baselines/ppo1/cnn_policy.py +++ b/baselines/ppo1/cnn_policy.py @@ -1,56 +1,71 @@ -import baselines.common.tf_util as U import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype -class CnnPolicy(object): +import baselines.common.tf_util as tf_util +from baselines.ppo1.mlp_policy import BasePolicy + + +class CnnPolicy(BasePolicy): recurrent = False - def __init__(self, name, ob_space, ac_space, kind='large'): - with tf.variable_scope(name): - self._init(ob_space, ac_space, kind) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space, kind): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - x = ob / 255.0 - if kind == 'small': # from A3C paper - x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) - elif kind == 'large': # Nature DQN - x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) - else: - raise NotImplementedError - - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] - self.state_in = [] - self.state_out = [] + def __init__(self, name, ob_space, ac_space, architecture_size='large', sess=None, reuse=False, placeholders=None): + """ + A CNN policy object for PPO1 + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param architecture_size: (str) size of the policy's architecture + (small as in A3C paper, large as in Nature DQN) + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 
+ :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(CnnPolicy, self).__init__(placeholders=placeholders) + self.reuse = reuse + self.name = name + self._init(ob_space, ac_space, architecture_size) + self.scope = tf.get_variable_scope().name + self.sess = sess + + def _init(self, ob_space, ac_space, architecture_size): + """ - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() # XXX - self._act = U.function([stochastic, ob], [ac, self.vpred]) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param architecture_size: (str) size of the policy's architecture + (small as in A3C paper, large as in Nature DQN) + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] + with tf.variable_scope(self.name, reuse=self.reuse): + normalized_obs = obs / 255.0 + if architecture_size == 'small': # from A3C paper + layer_1 = tf.nn.relu(tf_util.conv2d(normalized_obs, 16, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_util.conv2d(layer_1, 32, "l2", [4, 4], [2, 2], pad="VALID")) + flattened_layer_2 = tf_util.flattenallbut0(layer_2) + last_layer = tf.nn.relu(tf.layers.dense(flattened_layer_2, 256, + name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) + elif architecture_size == 'large': # Nature DQN + layer_1 = tf.nn.relu(tf_util.conv2d(normalized_obs, 32, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_util.conv2d(layer_1, 64, "l2", [4, 4], [2, 2], pad="VALID")) + layer_3 = tf.nn.relu(tf_util.conv2d(layer_2, 64, "l3", [3, 3], [1, 1], pad="VALID")) + flattened_layer_3 = tf_util.flattenallbut0(layer_3) + last_layer = tf.nn.relu(tf.layers.dense(flattened_layer_3, 512, + name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) + else: + raise NotImplementedError + + logits = tf.layers.dense(last_layer, pdtype.param_shape()[0], name='logits', + kernel_initializer=tf_util.normc_initializer(0.01)) + + self.proba_distribution = pdtype.proba_distribution_from_flat(logits) + self.vpred = tf.layers.dense(last_layer, 1, + name='value', kernel_initializer=tf_util.normc_initializer(1.0))[:, 0] + + self.state_in = [] + self.state_out = [] + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = self.proba_distribution.sample() + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py index 7f979b3495..34322fd608 100644 --- a/baselines/ppo1/mlp_policy.py +++ b/baselines/ppo1/mlp_policy.py @@ -1,61 +1,152 @@ -from baselines.common.mpi_running_mean_std import RunningMeanStd -import baselines.common.tf_util as U import tensorflow as tf import gym -from baselines.common.distributions import make_pdtype -class MlpPolicy(object): - recurrent = False - def __init__(self, name, *args, **kwargs): - with tf.variable_scope(name): - self._init(*args, **kwargs) - self.scope = tf.get_variable_scope().name +from baselines.common.mpi_running_mean_std import RunningMeanStd +import 
baselines.common.tf_util as tf_util +from baselines.common.distributions import make_proba_dist_type - def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): + +class BasePolicy(object): + def __init__(self, placeholders=None): + """ + A base policy object for PPO1 + + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(BasePolicy, self).__init__() + self.sess = None + self.pdtype = None + self._act = None + self.scope = None + self.obs_ph = None + self.stochastic_ph = None + + if placeholders is not None: + self.obs_ph = placeholders.get("obs", None) + self.stochastic_ph = placeholders.get("stochastic", None) + + def get_obs_and_pdtype(self, ob_space, ac_space): + """ + Initialize probability distribution and get observation placeholder. + + :param ob_space: (Gym Spaces) the observation space + :param ac_space: (Gym Spaces) the action space + """ assert isinstance(ob_space, gym.spaces.Box) - self.pdtype = pdtype = make_pdtype(ac_space) + self.pdtype = pdtype = make_proba_dist_type(ac_space) sequence_length = None - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) + if self.obs_ph is None: + self.obs_ph = tf.placeholder(dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape), name="ob") + + return self.obs_ph, pdtype + + def act(self, stochastic, obs): + """ + Get the action from the policy, using the observation + + :param stochastic: (bool) whether or not to use a stochastic or deterministic policy + :param obs: (TensorFlow Tensor or numpy Number) the observation + :return: (numpy Number, numpy Number) the action and value function + """ + ac1, vpred1 = self._act(stochastic, obs[None], sess=self.sess) + return ac1[0], vpred1[0] + + def get_variables(self): + """ + Get all the policy's variables + + :return: ([TensorFlow Tensor]) the variables of the network + """ + return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) + + def get_trainable_variables(self): + """ + Get the policy's trainable variables + + :return: ([TensorFlow Tensor]) the trainable variables of the network + """ + return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) + + @classmethod + def get_initial_state(cls): + """ + Get the initial state + + :return: ([numpy Number]) the initial state + """ + return [] + + +class MlpPolicy(BasePolicy): + recurrent = False + + def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): + """ + A MLP policy object for PPO1 + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 
+ :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + :param gaussian_fixed_var: (bool) enable gaussian sampling with fixed variance, when using continuous actions + """ + super(MlpPolicy, self).__init__(placeholders=placeholders) + self.reuse = reuse + self.name = name + self._init(*args, **kwargs) + self.scope = tf.get_variable_scope().name + self.sess = sess + + def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): + """ + + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param gaussian_fixed_var: (bool) enable gaussian sampling with fixed variance, when using continuous actions + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) - with tf.variable_scope("obfilter"): + with tf.variable_scope(self.name + "/obfilter", reuse=self.reuse): self.ob_rms = RunningMeanStd(shape=ob_space.shape) - with tf.variable_scope('vf'): - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) + with tf.variable_scope(self.name + '/vf', reuse=self.reuse): + obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), + kernel_initializer=tf_util.normc_initializer(1.0))) + self.vpred = tf.layers.dense(last_out, 1, name='final', + kernel_initializer=tf_util.normc_initializer(1.0))[:, 0] - with tf.variable_scope('pol'): + with tf.variable_scope(self.name + '/pol', reuse=self.reuse): last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), + kernel_initializer=tf_util.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) + mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', + kernel_initializer=tf_util.normc_initializer(0.01)) + logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], + initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: - pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) + pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', + kernel_initializer=tf_util.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(pdparam) + self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) self.state_in = [] self.state_out = [] - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, 
ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py index f2f13a6172..f663a66c10 100644 --- a/baselines/ppo1/pposgd_simple.py +++ b/baselines/ppo1/pposgd_simple.py @@ -1,202 +1,193 @@ +from collections import deque +import time + +import tensorflow as tf +import numpy as np +from mpi4py import MPI + from baselines.common import Dataset, explained_variance, fmt_row, zipsame from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time +import baselines.common.tf_util as tf_util from baselines.common.mpi_adam import MpiAdam from baselines.common.mpi_moments import mpi_moments -from mpi4py import MPI -from collections import deque +from baselines.gail.trpo_mpi import traj_segment_generator, add_vtarg_and_adv, flatten_lists -def traj_segment_generator(pi, env, horizon, stochastic): - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype - new = True # marks if we're on first timestep of an episode - ob = env.reset() - - cur_ep_ret = 0 # return in current episode - cur_ep_len = 0 # len of current episode - ep_rets = [] # returns of completed episodes in this segment - ep_lens = [] # lengths of ... - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - # Be careful!!! 
if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): +def learn(env, policy_fn, *, timesteps_per_actorbatch, clip_param, entcoeff, optim_epochs, optim_stepsize, + optim_batchsize, gamma, lam, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, callback=None, + adam_epsilon=1e-5, schedule='constant'): """ - Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + Train a PPO1 model with Stochastic Gradient Descent + + :param env: (Gym Environment) environment to train on + :param policy_fn: (function (str, Gym Spaces, Gym Spaces): TensorFlow Tensor) creates the policy + :param timesteps_per_actorbatch: (int) timesteps per actor per update + :param clip_param: (float) clipping parameter epsilon + :param entcoeff: (float) the entropy loss weight + :param optim_epochs: (int) the number of optimization epochs per update + :param optim_stepsize: (float) the optimizer's stepsize + :param optim_batchsize: (int) the optimizer's batch size + :param gamma: (float) discount factor + :param lam: (float) advantage estimation factor (GAE lambda) + :param max_timesteps: (int) number of env steps to optimize for + :param max_episodes: (int) the maximum number of episodes + :param max_iters: (int) the maximum number of iterations + :param max_seconds: (int) the maximal training duration in seconds + :param callback: (function (dict, dict)) function called at every step with the state of the algorithm. + It takes the local and global variables.
+ :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param schedule: (str) The type of scheduler for the learning rate update + ('linear' or 'constant'; other values raise NotImplementedError) """ - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] - -def learn(env, policy_fn, *, - timesteps_per_actorbatch, # timesteps per actor per update - clip_param, entcoeff, # clipping parameter epsilon, entropy coeff - optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers - gamma, lam, # advantage estimation - max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint - callback=None, # you can do anything in the callback, since it takes locals(), globals() - adam_epsilon=1e-5, - schedule='constant' # annealing for stepsize parameters (epsilon and adam) - ): + # Setup losses and stuff - # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy - oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return + sess = tf_util.single_threaded_session() + + # Construct network for new policy + policy = policy_fn("pi", ob_space, ac_space, sess=sess) + + # Network for old policy + oldpi = policy_fn("oldpi", ob_space, ac_space, sess=sess, + placeholders={"obs": policy.obs_ph, "stochastic": policy.stochastic_ph}) - lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule - clip_param = clip_param * lrmult # Annealed cliping parameter epislon + # Target advantage function (if applicable) + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) + # Empirical return + ret = tf.placeholder(dtype=tf.float32, shape=[None]) - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() + # learning rate multiplier, updated with schedule + lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) + + # Annealed clipping parameter epsilon + clip_param = clip_param * lrmult + + obs_ph = policy.obs_ph + action_ph = policy.pdtype.sample_placeholder([None]) + + kloldnew = oldpi.proba_distribution.kl(policy.proba_distribution) + ent = policy.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold - surr1 = ratio * atarg # surrogate from conservative policy iteration - surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # - pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) - vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) + # pnew / pold + ratio = tf.exp(policy.proba_distribution.logp(action_ph) - oldpi.proba_distribution.logp(action_ph)) + +
# surrogate from conservative policy iteration + surr1 = ratio * atarg + surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg + + # PPO's pessimistic surrogate (L^CLIP) + pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) + vf_loss = tf.reduce_mean(tf.square(policy.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] - var_list = pi.get_trainable_variables() - lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) - adam = MpiAdam(var_list, epsilon=adam_epsilon) + var_list = policy.get_trainable_variables() + lossandgrad = tf_util.function([obs_ph, action_ph, atarg, ret, lrmult], + losses + [tf_util.flatgrad(total_loss, var_list)]) + adam = MpiAdam(var_list, epsilon=adam_epsilon, sess=sess) - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) + assign_old_eq_new = tf_util.function([], [], updates=[tf.assign(oldv, newv) + for (oldv, newv) in + zipsame(oldpi.get_variables(), policy.get_variables())]) + compute_losses = tf_util.function([obs_ph, action_ph, atarg, ret, lrmult], losses) - U.initialize() + tf_util.initialize(sess=sess) adam.sync() # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) + seg_gen = traj_segment_generator(policy, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards + t_start = time.time() + + # rolling buffer for episode lengths + lenbuffer = deque(maxlen=100) + # rolling buffer for episode rewards + rewbuffer = deque(maxlen=100) - assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" + assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, + max_seconds > 0]) == 1, "Only one time constraint permitted" while True: - if callback: callback(locals(), globals()) + if callback: + callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break - elif max_seconds and time.time() - tstart >= max_seconds: + elif max_seconds and time.time() - t_start >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': - cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) + cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError - logger.log("********** Iteration %i ************"%iters_so_far) + logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) - 
optim_batchsize = optim_batchsize or ob.shape[0] + obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy + # predicted value function before udpate + vpredbefore = seg["vpred"] - assign_old_eq_new() # set old parameter values to new parameter values + # standardized advantage function estimate + atarg = (atarg - atarg.mean()) / atarg.std() + dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret), + shuffle=not policy.recurrent) + optim_batchsize = optim_batchsize or obs_ph.shape[0] + + if hasattr(policy, "ob_rms"): + # update running mean/std for policy + policy.ob_rms.update(obs_ph) + + # set old parameter values to new parameter values + assign_old_eq_new(sess=sess) logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) + # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): - losses = [] # list of tuples, each of which gives the loss for a minibatch - for batch in d.iterate_once(optim_batchsize): - *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) - adam.update(g, optim_stepsize * cur_lrmult) + # list of tuples, each of which gives the loss for a minibatch + losses = [] + for batch in dataset.iterate_once(optim_batchsize): + *newlosses, grad = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, + sess=sess) + adam.update(grad, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] - for batch in d.iterate_once(optim_batchsize): - newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) + for batch in dataset.iterate_once(optim_batchsize): + newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=sess) losses.append(newlosses) - meanlosses,_,_ = mpi_moments(losses, axis=0) - logger.log(fmt_row(13, meanlosses)) - for (lossval, name) in zipsame(meanlosses, loss_names): - logger.record_tabular("loss_"+name, lossval) + mean_losses, _, _ = mpi_moments(losses, axis=0) + logger.log(fmt_row(13, mean_losses)) + for (loss_val, name) in zipsame(mean_losses, loss_names): + logger.record_tabular("loss_" + name, loss_val) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + + # local values + lrlocal = (seg["ep_lens"], seg["ep_rets"]) + + # list of tuples + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) @@ -208,11 +199,8 @@ def learn(env, policy_fn, *, iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - if MPI.COMM_WORLD.Get_rank()==0: + logger.record_tabular("TimeElapsed", time.time() - t_start) + if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() - return pi - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] + return policy diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py index 17941c6d39..186963b9ec 100644 --- a/baselines/ppo1/run_atari.py +++ b/baselines/ppo1/run_atari.py @@ -1,19 +1,25 @@ 
#!/usr/bin/env python3 +import os from mpi4py import MPI + from baselines.common import set_global_seeds -from baselines import bench -import os.path as osp -from baselines import logger +from baselines import bench, logger from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.cmd_util import atari_arg_parser +from baselines.ppo1 import pposgd_simple, cnn_policy + def train(env_id, num_timesteps, seed): - from baselines.ppo1 import pposgd_simple, cnn_policy - import baselines.common.tf_util as U + """ + Train PPO1 model for Atari environments, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() + if rank == 0: logger.configure() else: @@ -21,28 +27,36 @@ def train(env_id, num_timesteps, seed): workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 + return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, + placeholders=placeholders) + env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), str(rank))) + os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, - max_timesteps=int(num_timesteps * 1.1), - timesteps_per_actorbatch=256, - clip_param=0.2, entcoeff=0.01, - optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, - gamma=0.99, lam=0.95, - schedule='linear' - ) + max_timesteps=int(num_timesteps * 1.1), + timesteps_per_actorbatch=256, + clip_param=0.2, entcoeff=0.01, + optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, + gamma=0.99, lam=0.95, + schedule='linear' + ) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py index d7d8f5a49b..1df67f5743 100644 --- a/baselines/ppo1/run_humanoid.py +++ b/baselines/ppo1/run_humanoid.py @@ -1,75 +1,88 @@ #!/usr/bin/env python3 import os + +import gym + +from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U +from baselines.common import tf_util from baselines import logger -import gym def train(num_timesteps, seed, model_path=None): + """ + Train PPO1 model for the Humanoid environment, for testing purposes + + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param model_path: (str) path to the model + """ env_id = 'Humanoid-v2' - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, + sess=sess, 
placeholders=placeholders) + env = make_mujoco_env(env_id, seed) # parameters below were the best found in a simple random search # these are good enough to make humanoid walk, but whether those are # an absolute best or not is not certain env = RewScale(env, 0.1) - pi = pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, - optim_stepsize=3e-4, - optim_batchsize=64, - gamma=0.99, - lam=0.95, - schedule='linear', - ) + policy = pposgd_simple.learn(env, policy_fn, + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=10, + optim_stepsize=3e-4, + optim_batchsize=64, + gamma=0.99, + lam=0.95, + schedule='linear') env.close() if model_path: - U.save_state(model_path) - - return pi + tf_util.save_state(model_path) + + return policy + class RewScale(gym.RewardWrapper): def __init__(self, env, scale): gym.RewardWrapper.__init__(self, env) self.scale = scale - def reward(self, r): - return r * self.scale + + def reward(self, _reward): + return _reward * self.scale + def main(): + """ + Runs the test + """ logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) - + args = parser.parse_args() - + if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) - else: + else: # construct the model object, load pre-trained model and render - pi = train(num_timesteps=1, seed=args.seed) - U.load_state(args.model_path) + policy = train(num_timesteps=1, seed=args.seed) + tf_util.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) - ob = env.reset() + obs = env.reset() while True: - action = pi.act(stochastic=False, ob=ob)[0] - ob, _, done, _ = env.step(action) + action = policy.act(stochastic=False, obs=obs)[0] + obs, _, done, _ = env.step(action) env.render() if done: - ob = env.reset() - - - + obs = env.reset() + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_mujoco.py b/baselines/ppo1/run_mujoco.py index 638998316b..84f0075632 100644 --- a/baselines/ppo1/run_mujoco.py +++ b/baselines/ppo1/run_mujoco.py @@ -1,29 +1,40 @@ #!/usr/bin/env python3 +from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U from baselines import logger + def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) + """ + Train PPO1 model for the Mujoco environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, + sess=sess, placeholders=placeholders) + env = make_mujoco_env(env_id, seed) pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, - gamma=0.99, lam=0.95, 
schedule='linear', - ) + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, + gamma=0.99, lam=0.95, schedule='linear') env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_robotics.py b/baselines/ppo1/run_robotics.py index 7d84185a1b..892c38e55a 100644 --- a/baselines/ppo1/run_robotics.py +++ b/baselines/ppo1/run_robotics.py @@ -1,37 +1,45 @@ #!/usr/bin/env python3 from mpi4py import MPI +import mujoco_py + from baselines.common import set_global_seeds -from baselines import logger from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser -import mujoco_py +from baselines.ppo1 import mlp_policy, pposgd_simple def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - import baselines.common.tf_util as U + """ + Train PPO1 model for Robotics environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - mujoco_py.ignore_mujoco_warnings().__enter__() - workerseed = seed + 10000 * rank - set_global_seeds(workerseed) - env = make_robotics_env(env_id, workerseed, rank=rank) - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=256, num_hid_layers=3) - - pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, - gamma=0.99, lam=0.95, schedule='linear', - ) - env.close() + with mujoco_py.ignore_mujoco_warnings(): + workerseed = seed + 10000 * rank + set_global_seeds(workerseed) + env = make_robotics_env(env_id, workerseed, rank=rank) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3, + sess=sess, placeholders=placeholders) + + pposgd_simple.learn(env, policy_fn, + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, + gamma=0.99, lam=0.95, schedule='linear') + env.close() def main(): + """ + Runs the test + """ args = robotics_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) diff --git a/baselines/ppo2/policies.py b/baselines/ppo2/policies.py deleted file mode 100644 index 6fbbb14ac8..0000000000 --- a/baselines/ppo2/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index fd34f52f36..093d0e4cdf 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -1,44 +1,73 @@ import os import time import joblib +from collections import deque +import sys +import multiprocessing + import numpy as np -import os.path as osp import tensorflow as tf + from baselines import logger -from collections import deque from baselines.common import explained_variance from baselines.common.runners import AbstractEnvRunner + class Model(object): - def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm): - sess = tf.get_default_session() - - act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) - - A = train_model.pdtype.sample_placeholder([None]) - ADV = tf.placeholder(tf.float32, [None]) - R = tf.placeholder(tf.float32, [None]) - OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) - OLDVPRED = tf.placeholder(tf.float32, [None]) - LR = tf.placeholder(tf.float32, []) - CLIPRANGE = tf.placeholder(tf.float32, []) - - neglogpac = train_model.pd.neglogp(A) - entropy = tf.reduce_mean(train_model.pd.entropy()) - - vpred = train_model.vf - vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) - vf_losses1 = tf.square(vpred - R) - vf_losses2 = tf.square(vpredclipped - R) + def __init__(self, *, policy, ob_space, ac_space, n_batch_act, n_batch_train, n_steps, ent_coef, vf_coef, + max_grad_norm): + """ + The PPO (Proximal Policy Optimization) model class https://arxiv.org/abs/1707.06347. + It shares policies with A2C. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Spaces) Observation space + :param ac_space: (Gym Spaces) Action space + :param n_batch_act: (int) Minibatch size for the actor policy, used mostly for recurrent policies + :param n_batch_train: (int) Minibatch size during training + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) Entropy coefficient for the loss calculation + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + """ + + n_cpu = multiprocessing.cpu_count() + if sys.platform == 'darwin': + n_cpu //= 2 + + config = tf.ConfigProto(allow_soft_placement=True, + intra_op_parallelism_threads=n_cpu, + inter_op_parallelism_threads=n_cpu) + config.gpu_options.allow_growth = True # pylint: disable=E1101 + + sess = tf.Session(config=config) + + act_model = policy(sess, ob_space, ac_space, n_batch_act, 1, reuse=False) + train_model = policy(sess, ob_space, ac_space, n_batch_train, n_steps, reuse=True) + + action_ph = train_model.pdtype.sample_placeholder([None]) + advs_ph = tf.placeholder(tf.float32, [None]) + rewards_ph = tf.placeholder(tf.float32, [None]) + old_neglog_pac_ph = tf.placeholder(tf.float32, [None]) + old_vpred_ph = tf.placeholder(tf.float32, [None]) + learning_rate_ph = tf.placeholder(tf.float32, []) + clip_range_ph = tf.placeholder(tf.float32, []) + + neglogpac = train_model.proba_distribution.neglogp(action_ph) + entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) + + vpred = train_model.value_fn + vpredclipped = old_vpred_ph \ + + tf.clip_by_value(train_model.value_fn - old_vpred_ph, - clip_range_ph, clip_range_ph) + vf_losses1 = tf.square(vpred - rewards_ph) + vf_losses2 = tf.square(vpredclipped - rewards_ph) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) - ratio = tf.exp(OLDNEGLOGPAC - neglogpac) - pg_losses = -ADV * ratio - pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) + ratio = tf.exp(old_neglog_pac_ph - neglogpac) + pg_losses = -advs_ph * ratio + pg_losses2 = -advs_ph * tf.clip_by_value(ratio, 1.0 - clip_range_ph, 1.0 + clip_range_ph) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) - approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) - clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) + approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - old_neglog_pac_ph)) + clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), clip_range_ph))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef with tf.variable_scope('model'): params = tf.trainable_variables() @@ -46,32 +75,56 @@ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) + trainer = tf.train.AdamOptimizer(learning_rate=learning_rate_ph, epsilon=1e-5) _train = trainer.apply_gradients(grads) - def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): + def train(learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): + """ + Training of PPO2 Algorithm + + :param learning_rate: (float) learning rate + :param cliprange: (float) Clipping factor + :param obs: (numpy array) The current observation of the environment + :param returns: (numpy array) the discounted returns + :param masks: (numpy
array) The last masks for done episodes (used in recurrent policies) + :param actions: (numpy array) the actions + :param values: (numpy array) the values + :param neglogpacs: (numpy array) negative log-likelihood of the actions + :param states: (numpy array) For recurrent policies, the internal state of the recurrent model + :return: policy gradient loss, value function loss, policy entropy, + approximation of KL divergence and fraction of clipped samples + """ advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) - td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, - CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} + td_map = {train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, rewards_ph: returns, + learning_rate_ph: learning_rate, clip_range_ph: cliprange, old_neglog_pac_ph: neglogpacs, + old_vpred_ph: values} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks - return sess.run( - [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], - td_map - )[:-1] + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks + return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] + self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) + """ + Save the policy to a file + + :param save_path: (str) the location to save the policy + """ + saved_params = sess.run(params) + joblib.dump(saved_params, save_path) def load(load_path): + """ + load a policy from the file + + :param load_path: (str) the saved location of the policy + """ loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) # If you want to load weights, also save/load observation scaling inside VecNormalize @@ -83,20 +136,43 @@ def load(load_path): self.initial_state = act_model.initial_state self.save = save self.load = load - tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101 + tf.global_variables_initializer().run(session=sess) # pylint: disable=E1101 + class Runner(AbstractEnvRunner): + def __init__(self, *, env, model, n_steps, gamma, lam): + """ + A runner that collects rollouts from the environment for a given model - def __init__(self, *, env, model, nsteps, gamma, lam): - super().__init__(env=env, model=model, nsteps=nsteps) + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) Discount factor + :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator + """ + super().__init__(env=env, model=model, n_steps=n_steps) self.lam = lam self.gamma = gamma def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] + """ + Run the model in the environment and collect a rollout of experience + + :return: + - observations: (numpy Number) the observations + - rewards: (numpy Number) the rewards + - masks: (numpy bool) whether an episode is over or not + - actions: (numpy Number) the actions + - values: (numpy Number) the value function output + - negative log probabilities: (numpy Number) + - states: (numpy Number) the internal states of the recurrent policies + -
infos: (dict) the extra information of the model + """ + # mb stands for minibatch + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], [] mb_states = self.states - epinfos = [] - for _ in range(self.nsteps): + ep_infos = [] + for _ in range(self.n_steps): actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) mb_obs.append(self.obs.copy()) mb_actions.append(actions) @@ -105,10 +181,11 @@ def run(self): mb_dones.append(self.dones) self.obs[:], rewards, self.dones, infos = self.env.step(actions) for info in infos: - maybeepinfo = info.get('episode') - if maybeepinfo: epinfos.append(maybeepinfo) + maybeep_info = info.get('episode') + if maybeep_info: + ep_infos.append(maybeep_info) mb_rewards.append(rewards) - #batch of steps to batch of rollouts + # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) mb_rewards = np.asarray(mb_rewards, dtype=np.float32) mb_actions = np.asarray(mb_actions) @@ -116,127 +193,173 @@ def run(self): mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) mb_dones = np.asarray(mb_dones, dtype=np.bool) last_values = self.model.value(self.obs, self.states, self.dones) - #discount/bootstrap off value fn - mb_returns = np.zeros_like(mb_rewards) + # discount/bootstrap off value fn mb_advs = np.zeros_like(mb_rewards) - lastgaelam = 0 - for t in reversed(range(self.nsteps)): - if t == self.nsteps - 1: + last_gae_lam = 0 + for step in reversed(range(self.n_steps)): + if step == self.n_steps - 1: nextnonterminal = 1.0 - self.dones nextvalues = last_values else: - nextnonterminal = 1.0 - mb_dones[t+1] - nextvalues = mb_values[t+1] - delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] - mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam + nextnonterminal = 1.0 - mb_dones[step + 1] + nextvalues = mb_values[step + 1] + delta = mb_rewards[step] + self.gamma * nextvalues * nextnonterminal - mb_values[step] + mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam mb_returns = mb_advs + mb_values - return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), - mb_states, epinfos) + return (*map(swap_and_flatten, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), mb_states, + ep_infos) + + # obs, returns, masks, actions, values, neglogpacs, states = runner.run() -def sf01(arr): +def swap_and_flatten(arr): """ swap and then flatten axes 0 and 1 + + :param arr: (numpy array) + :return: (numpy array) """ - s = arr.shape - return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) + shape = arr.shape + return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:]) + def constfn(val): - def f(_): + """ + Create a function that returns a constant + It is useful for learning rate schedule (to avoid code duplication) + + :param val: (float) + :return: (function) + """ + + def func(_): return val - return f -def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, - vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, - log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None): + return func - if isinstance(lr, float): lr = constfn(lr) - else: assert callable(lr) - if isinstance(cliprange, float): cliprange = constfn(cliprange) - else: assert callable(cliprange) + +def learn(*, policy, env, n_steps, total_timesteps, ent_coef, learning_rate, + vf_coef=0.5, max_grad_norm=0.5, 
gamma=0.99, lam=0.95, + log_interval=10, nminibatches=4, noptepochs=4, + cliprange=0.2, save_interval=0, load_path=None): + """ + Return a trained PPO2 model. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param env: (Gym environment) The environment to learn from + :param n_steps: (int) The number of steps to run for each environment + :param total_timesteps: (int) The total number of samples + :param ent_coef: (float) Entropy coefficient for the loss calculation + :param learning_rate: (float or callable) The learning rate; it can be a function + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param gamma: (float) Discount factor + :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator + :param nminibatches: (int) Number of training minibatches per update + :param noptepochs: (int) Number of epochs when optimizing the surrogate + :param cliprange: (float or callable) Clipping parameter; it can be a function + :param log_interval: (int) The number of updates between logging outputs. + :param save_interval: (int) The number of updates between checkpoint saves. + :param load_path: (str) Path to a trained PPO2 model; if None, training starts from scratch + :return: (Model) PPO2 model + """ + if isinstance(learning_rate, float): + learning_rate = constfn(learning_rate) + else: + assert callable(learning_rate) + if isinstance(cliprange, float): + cliprange = constfn(cliprange) + else: + assert callable(cliprange) total_timesteps = int(total_timesteps) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - nbatch = nenvs * nsteps - nbatch_train = nbatch // nminibatches + n_batch = n_envs * n_steps + n_batch_train = n_batch // nminibatches - make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, - nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_batch_act=n_envs, + n_batch_train=n_batch_train, n_steps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef, + max_grad_norm=max_grad_norm) if save_interval and logger.get_dir(): import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) + with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as file_handler: + file_handler.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) - runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) + runner = Runner(env=env, model=model, n_steps=n_steps, gamma=gamma, lam=lam) - epinfobuf = deque(maxlen=100) - tfirststart = time.time() + ep_info_buf = deque(maxlen=100) + t_first_start = time.time() - nupdates = total_timesteps//nbatch - for update in range(1, nupdates+1): - assert nbatch % nminibatches == 0 - nbatch_train = nbatch // nminibatches - tstart = time.time() + nupdates = total_timesteps // n_batch + for update in range(1, nupdates + 1): + assert n_batch % nminibatches == 0 + n_batch_train = n_batch // nminibatches + t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates - lrnow = lr(frac) + lr_now = learning_rate(frac) cliprangenow = cliprange(frac) - obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 -
epinfobuf.extend(epinfos) - mblossvals = [] - if states is None: # nonrecurrent version - inds = np.arange(nbatch) + obs, returns, masks, actions, values, neglogpacs, states, ep_infos = runner.run() # pylint: disable=E0632 + ep_info_buf.extend(ep_infos) + mb_loss_vals = [] + if states is None: # nonrecurrent version + inds = np.arange(n_batch) for _ in range(noptepochs): np.random.shuffle(inds) - for start in range(0, nbatch, nbatch_train): - end = start + nbatch_train + for start in range(0, n_batch, n_batch_train): + end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mblossvals.append(model.train(lrnow, cliprangenow, *slices)) - else: # recurrent version - assert nenvs % nminibatches == 0 - envsperbatch = nenvs // nminibatches - envinds = np.arange(nenvs) - flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) - envsperbatch = nbatch_train // nsteps + mb_loss_vals.append(model.train(lr_now, cliprangenow, *slices)) + else: # recurrent version + assert n_envs % nminibatches == 0 + envinds = np.arange(n_envs) + flatinds = np.arange(n_envs * n_steps).reshape(n_envs, n_steps) + envsperbatch = n_batch_train // n_steps for _ in range(noptepochs): np.random.shuffle(envinds) - for start in range(0, nenvs, envsperbatch): + for start in range(0, n_envs, envsperbatch): end = start + envsperbatch - mbenvinds = envinds[start:end] - mbflatinds = flatinds[mbenvinds].ravel() - slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mbstates = states[mbenvinds] - mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) - - lossvals = np.mean(mblossvals, axis=0) - tnow = time.time() - fps = int(nbatch / (tnow - tstart)) + mb_env_inds = envinds[start:end] + mb_flat_inds = flatinds[mb_env_inds].ravel() + slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) + mb_states = states[mb_env_inds] + mb_loss_vals.append(model.train(lr_now, cliprangenow, *slices, mb_states)) + + loss_vals = np.mean(mb_loss_vals, axis=0) + t_now = time.time() + fps = int(n_batch / (t_now - t_start)) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, returns) - logger.logkv("serial_timesteps", update*nsteps) + explained_var = explained_variance(values, returns) + logger.logkv("serial_timesteps", update * n_steps) logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update*nbatch) + logger.logkv("total_timesteps", update * n_batch) logger.logkv("fps", fps) - logger.logkv("explained_variance", float(ev)) - logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) - logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) - logger.logkv('time_elapsed', tnow - tfirststart) - for (lossval, lossname) in zip(lossvals, model.loss_names): - logger.logkv(lossname, lossval) + logger.logkv("explained_variance", float(explained_var)) + logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) + logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) + logger.logkv('time_elapsed', t_start - t_first_start) + for (loss_val, loss_name) in zip(loss_vals, model.loss_names): + logger.logkv(loss_name, loss_val) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): - checkdir = osp.join(logger.get_dir(), 'checkpoints') + checkdir = os.path.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, 
exist_ok=True) - savepath = osp.join(checkdir, '%.5i'%update) - print('Saving to', savepath) - model.save(savepath) + save_path = os.path.join(checkdir, '%.5i' % update) + print('Saving to', save_path) + model.save(save_path) env.close() return model -def safemean(xs): - return np.nan if len(xs) == 0 else np.mean(xs) + +def safe_mean(arr): + """ + Compute the mean of an array if there is at least one element. + For an empty array, return NaN. It is used for logging only. + + :param arr: (numpy array) + :return: (float) + """ + return np.nan if len(arr) == 0 else np.mean(arr) diff --git a/baselines/ppo2/run_atari.py b/baselines/ppo2/run_atari.py index 322837ac86..a6239d852e 100644 --- a/baselines/ppo2/run_atari.py +++ b/baselines/ppo2/run_atari.py @@ -1,40 +1,42 @@ #!/usr/bin/env python3 -import sys from baselines import logger from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.ppo2 import ppo2 -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy -import multiprocessing -import tensorflow as tf +from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy def train(env_id, num_timesteps, seed, policy): + """ + Train PPO2 model for atari environment, for testing purposes - ncpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': ncpu //= 2 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - config.gpu_options.allow_growth = True #pylint: disable=E1101 - tf.Session(config=config).__enter__() + :param env_id: (str) the environment id string + :param num_timesteps: (int) the number of timesteps to run + :param seed: (int) Used to seed the random generator. + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
+ """ env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) - policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy] - ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, - lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, - ent_coef=.01, - lr=lambda f : f * 2.5e-4, - cliprange=lambda f : f * 0.1, - total_timesteps=int(num_timesteps * 1.1)) + policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy, 'mlp': MlpPolicy}[policy] + ppo2.learn(policy=policy, env=env, n_steps=128, nminibatches=4, + lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, + ent_coef=.01, + learning_rate=lambda f: f * 2.5e-4, + cliprange=lambda f: f * 0.1, + total_timesteps=int(num_timesteps * 1.1)) + def main(): + """ + Runs the test + """ parser = atari_arg_parser() parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') args = parser.parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) + policy=args.policy) + if __name__ == '__main__': main() diff --git a/baselines/ppo2/run_mujoco.py b/baselines/ppo2/run_mujoco.py index 282aa3f134..790c24e2a3 100644 --- a/baselines/ppo2/run_mujoco.py +++ b/baselines/ppo2/run_mujoco.py @@ -1,37 +1,38 @@ #!/usr/bin/env python3 import numpy as np +import gym + from baselines.common.cmd_util import mujoco_arg_parser from baselines import bench, logger +from baselines.common import set_global_seeds +from baselines.common.vec_env.vec_normalize import VecNormalize +from baselines.ppo2 import ppo2 +from baselines.a2c.policies import MlpPolicy +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv def train(env_id, num_timesteps, seed): - from baselines.common import set_global_seeds - from baselines.common.vec_env.vec_normalize import VecNormalize - from baselines.ppo2 import ppo2 - from baselines.ppo2.policies import MlpPolicy - import gym - import tensorflow as tf - from baselines.common.vec_env.dummy_vec_env import DummyVecEnv - ncpu = 1 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - tf.Session(config=config).__enter__() + """ + Train PPO2 model for Mujoco environment, for testing purposes + :param env_id: (str) the environment id string + :param num_timesteps: (int) the number of timesteps to run + :param seed: (int) Used to seed the random generator. 
+ """ def make_env(): - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) - return env + env_out = gym.make(env_id) + env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) + return env_out env = DummyVecEnv([make_env]) env = VecNormalize(env) set_global_seeds(seed) policy = MlpPolicy - model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32, + model = ppo2.learn(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, ent_coef=0.0, - lr=3e-4, + learning_rate=3e-4, cliprange=0.2, total_timesteps=num_timesteps) @@ -39,6 +40,9 @@ def make_env(): def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) @@ -49,7 +53,7 @@ def main(): obs[:] = env.reset() while True: actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] + obs[:] = env.step(actions)[0] env.render() diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py index 051420474a..589d407da1 100644 --- a/baselines/results_plotter.py +++ b/baselines/results_plotter.py @@ -1,53 +1,90 @@ import numpy as np import matplotlib -matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode - import matplotlib.pyplot as plt -plt.rcParams['svg.fonttype'] = 'none' from baselines.bench.monitor import load_results +matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode +plt.rcParams['svg.fonttype'] = 'none' + X_TIMESTEPS = 'timesteps' X_EPISODES = 'episodes' X_WALLTIME = 'walltime_hrs' POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] EPISODES_WINDOW = 100 COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', - 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', - 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', + 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + + +def rolling_window(array, window): + """ + apply a rolling window to a numpy array + + :param array: (numpy Any) the input Array + :param window: (int) length of the rolling window + :return: (numpy Any) rolling window on the input array + """ + shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) + strides = array.strides + (array.strides[-1],) + return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) + + +def window_func(var_1, var_2, window, func): + """ + apply a function to the rolling window of 2 arrays -def rolling_window(a, window): - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + :param var_1: (numpy Any) variable 1 + :param var_2: (numpy Any) variable 2 + :param window: (int) length of the rolling window + :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) + :return: (numpy Any, numpy Any) the rolling output with applied function + """ + var_2_window = rolling_window(var_2, window) + function_on_var2 = func(var_2_window, axis=-1) + return var_1[window - 1:], function_on_var2 -def window_func(x, y, window, func): - yw = rolling_window(y, window) - yw_func = func(yw, axis=-1) - return x[window-1:], yw_func -def ts2xy(ts, xaxis): +def ts2xy(timesteps, xaxis): + """ + 
Decompose a timesteps variable to x and y values + + :param timesteps: (Pandas DataFrame) the input data + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :return: (numpy Number, numpy Number) the x and y output + """ if xaxis == X_TIMESTEPS: - x = np.cumsum(ts.l.values) - y = ts.r.values + x_var = np.cumsum(timesteps.l.values) + y_var = timesteps.r.values elif xaxis == X_EPISODES: - x = np.arange(len(ts)) - y = ts.r.values + x_var = np.arange(len(timesteps)) + y_var = timesteps.r.values elif xaxis == X_WALLTIME: - x = ts.t.values / 3600. - y = ts.r.values + x_var = timesteps.t.values / 3600. + y_var = timesteps.r.values else: raise NotImplementedError - return x, y + return x_var, y_var + def plot_curves(xy_list, xaxis, title): - plt.figure(figsize=(8,2)) + """ + plot the curves + + :param xy_list: ([(numpy Number, numpy Number)]) the x and y coordinates to plot + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param title: (str) the title of the plot + """ + + plt.figure(figsize=(8, 2)) maxx = max(xy[0][-1] for xy in xy_list) minx = 0 for (i, (x, y)) in enumerate(xy_list): color = COLORS[i] plt.scatter(x, y, s=2) - x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes + x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # So returns average of last EPISODE_WINDOW episodes plt.plot(x, y_mean, color=color) plt.xlim(minx, maxx) plt.title(title) @@ -55,33 +92,47 @@ def plot_curves(xy_list, xaxis, title): plt.ylabel("Episode Rewards") plt.tight_layout() + def plot_results(dirs, num_timesteps, xaxis, task_name): + """ + plot the results + + :param dirs: ([str]) the save location of the results to plot + :param num_timesteps: (int) only plot the points below this value + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param task_name: (str) the title of the task to plot + """ + tslist = [] - for dir in dirs: - ts = load_results(dir) - ts = ts[ts.l.cumsum() <= num_timesteps] - tslist.append(ts) - xy_list = [ts2xy(ts, xaxis) for ts in tslist] + for folder in dirs: + timesteps = load_results(folder) + timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] + tslist.append(timesteps) + xy_list = [ts2xy(timesteps_item, xaxis) for timesteps_item in tslist] plot_curves(xy_list, xaxis, task_name) -# Example usage in jupyter-notebook -# from baselines import log_viewer -# %matplotlib inline -# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") -# Here ./log is a directory containing the monitor.csv files def main(): + """ + Example usage in jupyter-notebook + from baselines import log_viewer + %matplotlib inline + log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") + Here ./log is a directory containing the monitor.csv files + """ import argparse import os parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) + parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log']) parser.add_argument('--num_timesteps', type=int, default=int(10e6)) - parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) - parser.add_argument('--task_name', help =
'Title of plot', default = 'Breakout') + parser.add_argument('--xaxis', help='Varible on X-axis', default=X_TIMESTEPS) + parser.add_argument('--task_name', help='Title of plot', default='Breakout') args = parser.parse_args() - args.dirs = [os.path.abspath(dir) for dir in args.dirs] + args.dirs = [os.path.abspath(folder) for folder in args.dirs] plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) plt.show() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py index 97b2dcd0b5..9133bba956 100644 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ b/baselines/trpo_mpi/nosharing_cnn_policy.py @@ -1,56 +1,63 @@ -import baselines.common.tf_util as U import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype -class CnnPolicy(object): +import baselines.common.tf_util as tf_utils +from baselines.ppo1.mlp_policy import BasePolicy + + +class CnnPolicy(BasePolicy): recurrent = False - def __init__(self, name, ob_space, ac_space): - with tf.variable_scope(name): - self._init(ob_space, ac_space) - self.scope = tf.get_variable_scope().name + + def __init__(self, name, ob_space, ac_space, sess=None, reuse=False, placeholders=None): + """ + A CNN policy object for TRPO + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. + :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(CnnPolicy, self).__init__(placeholders=placeholders) + self.sess = sess + self.reuse = reuse + self.name = name + self._init(ob_space, ac_space) + self.scope = tf.get_variable_scope().name def _init(self, ob_space, ac_space): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - obscaled = ob / 255.0 - - with tf.variable_scope("pol"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - with tf.variable_scope("vf"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0)) + """ + + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) + + obs_normalized = obs / 255.0 + + with tf.variable_scope(self.name + "/pol", reuse=self.reuse): + layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = 
tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID")) + layer_2 = tf_utils.flattenallbut0(layer_2) + layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin', + kernel_initializer=tf_utils.normc_initializer(1.0))) + logits = tf.layers.dense(layer_3, pdtype.param_shape()[0], name='logits', + kernel_initializer=tf_utils.normc_initializer(0.01)) + self.proba_distribution = pdtype.proba_distribution_from_flat(logits) + with tf.variable_scope(self.name + "/vf", reuse=self.reuse): + layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID")) + layer_2 = tf_utils.flattenallbut0(layer_2) + layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin', + kernel_initializer=tf_utils.normc_initializer(1.0))) + self.vpred = tf.layers.dense(layer_3, 1, name='value', + kernel_initializer=tf_utils.normc_initializer(1.0)) self.vpredz = self.vpred self.state_in = [] self.state_out = [] - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = self.proba_distribution.sample() + self._act = tf_utils.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py index f31ebfd7c5..305b333b27 100644 --- a/baselines/trpo_mpi/run_atari.py +++ b/baselines/trpo_mpi/run_atari.py @@ -1,20 +1,26 @@ - #!/usr/bin/env python3 +#!/usr/bin/env python3 +import os + from mpi4py import MPI + from baselines.common import set_global_seeds -import os.path as osp -import gym, logging -from baselines import logger -from baselines import bench +from baselines import bench, logger from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.cmd_util import atari_arg_parser +from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy +from baselines.trpo_mpi import trpo_mpi + def train(env_id, num_timesteps, seed): - from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy - from baselines.trpo_mpi import trpo_mpi - import baselines.common.tf_util as U + """ + Train TRPO model for the atari environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() + if rank == 0: logger.configure() else: @@ -23,21 +29,29 @@ def train(env_id, num_timesteps, seed): workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 + return CnnPolicy(name=name, 
ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) + + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) + max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, + entcoeff=0.00) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == "__main__": main() diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py index 220bb91aba..4bf36efa28 100644 --- a/baselines/trpo_mpi/run_mujoco.py +++ b/baselines/trpo_mpi/run_mujoco.py @@ -1,36 +1,48 @@ #!/usr/bin/env python3 # noinspection PyUnresolvedReferences from mpi4py import MPI + from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser from baselines import logger from baselines.ppo1.mlp_policy import MlpPolicy from baselines.trpo_mpi import trpo_mpi +import baselines.common.tf_util as tf_util + def train(env_id, num_timesteps, seed): - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - env = make_mujoco_env(env_id, workerseed) - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, - max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - env.close() + """ + Train TRPO model for the mujoco environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + with tf_util.single_threaded_session(): + rank = MPI.COMM_WORLD.Get_rank() + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) + logger.set_level(logger.DISABLED) + workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2, sess=sess, + placeholders=placeholders) + + env = make_mujoco_env(env_id, workerseed) + trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, + max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) + env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == '__main__': main() - diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index e23d9ac793..7edf702a26 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -1,291 +1,36 @@ -from baselines.common import explained_variance, zipsame, dataset -from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time -from baselines.common 
import colorize -from mpi4py import MPI -from collections import deque -from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from contextlib import contextmanager +from baselines.gail.trpo_mpi import learn as base_learn -def traj_segment_generator(pi, env, horizon, stochastic): - # Initialize state variables - t = 0 - ac = env.action_space.sample() - new = True - rew = 0.0 - ob = env.reset() - - cur_ep_ret = 0 - cur_ep_len = 0 - ep_rets = [] - ep_lens = [] - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - _, vpred = pi.act(stochastic, ob) - # Be careful!!! if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] def learn(env, policy_fn, *, - timesteps_per_batch, # what to train on - max_kl, cg_iters, - gamma, lam, # advantage estimation - entcoeff=0.0, - cg_damping=1e-2, - vf_stepsize=3e-4, - vf_iters =3, - max_timesteps=0, max_episodes=0, max_iters=0, # time constraint - callback=None - ): - nworkers = MPI.COMM_WORLD.Get_size() - rank = MPI.COMM_WORLD.Get_rank() - np.set_printoptions(precision=3) - # Setup losses and stuff - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) - oldpi = policy_fn("oldpi", ob_space, ac_space) - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - entbonus = entcoeff * meanent - - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) - - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold - surrgain = tf.reduce_mean(ratio * atarg) - - optimgain = surrgain + entbonus - losses = [optimgain, meankl, 
entbonus, surrgain, meanent] - loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] - - dist = meankl - - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] - vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] - vfadam = MpiAdam(vf_var_list) - - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) - klgrads = tf.gradients(dist, var_list) - flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") - shapes = [var.get_shape().as_list() for var in var_list] - start = 0 - tangents = [] - for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) - - @contextmanager - def timed(msg): - if rank == 0: - print(colorize(msg, color='magenta')) - tstart = time.time() - yield - print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) - else: - yield - - def allmean(x): - assert isinstance(x, np.ndarray) - out = np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) - out /= nworkers - return out - - U.initialize() - th_init = get_flat() - MPI.COMM_WORLD.Bcast(th_init, root=0) - set_from_flat(th_init) - vfadam.sync() - print("Init param sum", th_init.sum(), flush=True) - - # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards - - assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 - - while True: - if callback: callback(locals(), globals()) - if max_timesteps and timesteps_so_far >= max_timesteps: - break - elif max_episodes and episodes_so_far >= max_episodes: - break - elif max_iters and iters_so_far >= max_iters: - break - logger.log("********** Iteration %i ************"%iters_so_far) - - with timed("sampling"): - seg = seg_gen.__next__() - add_vtarg_and_adv(seg, gamma, lam) - - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - - if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy - - args = seg["ob"], seg["ac"], atarg - fvpargs = [arr[::5] for arr in args] - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p - - assign_old_eq_new() # set old parameter values to new parameter values - with timed("computegrad"): - *lossbefore, g = 
compute_lossandgrad(*args) - lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if np.allclose(g, 0): - logger.log("Got zero gradient. not updating") - else: - with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) - assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) - # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) - surrbefore = lossbefore[0] - stepsize = 1.0 - thbefore = get_flat() - for _ in range(10): - thnew = thbefore + fullstep * stepsize - set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) - improve = surr - surrbefore - logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): - logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: - logger.log("violated KL constraint. shrinking step.") - elif improve < 0: - logger.log("surrogate didn't improve. shrinking step.") - else: - logger.log("Stepsize OK!") - break - stepsize *= .5 - else: - logger.log("couldn't compute a good step") - set_from_flat(thbefore) - if nworkers > 1 and iters_so_far % 20 == 0: - paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples - assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) - - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) - - with timed("vf"): - - for _ in range(vf_iters): - for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), - include_final_partial_batch=False, batch_size=64): - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews = map(flatten_lists, zip(*listoflrpairs)) - lenbuffer.extend(lens) - rewbuffer.extend(rews) - - logger.record_tabular("EpLenMean", np.mean(lenbuffer)) - logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - timesteps_so_far += sum(lens) - iters_so_far += 1 - - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - - if rank==0: - logger.dump_tabular() - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] \ No newline at end of file + timesteps_per_batch, # what to train on + max_kl, cg_iters, + gamma, lam, # advantage estimation + entcoeff=0.0, + cg_damping=1e-2, + vf_stepsize=3e-4, + vf_iters=3, + max_timesteps=0, max_episodes=0, max_iters=0, # time constraint + callback=None): + """ + learns a TRPO policy using the given environment + + :param env: (Gym Environment) the environment + :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param max_kl: (float) the kullback leiber loss threashold + :param cg_iters: (int) the number of iterations for the conjugate gradient calculation + :param gamma: (float) the discount value + :param lam: (float) GAE factor + :param entcoeff: (float) the weight for the entropy loss + 
:param cg_damping: (float) the compute gradient dampening factor + :param vf_stepsize: (float) the value function stepsize + :param vf_iters: (int) the value function's number iterations for learning + :param max_timesteps: (int) the maximum number of timesteps before halting + :param max_episodes: (int) the maximum number of episodes before halting + :param max_iters: (int) the maximum number of training iterations before halting + :param callback: (function (dict, dict)) the call back function, takes the local and global attribute dictionary + """ + base_learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, max_kl=max_kl, cg_iters=cg_iters, gamma=gamma, + lam=lam, entcoeff=entcoeff, cg_damping=cg_damping, vf_stepsize=vf_stepsize, vf_iters=vf_iters, + max_timesteps=max_timesteps, max_episodes=max_episodes, max_iters=max_iters, callback=callback, + using_gail=False) diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000..21ec52f09b --- /dev/null +++ b/conftest.py @@ -0,0 +1,14 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--rungpu"): + return + skip_gpu = pytest.mark.skip(reason="need --rungpu option to run") + for item in items: + if "gpu" in item.keywords: + item.add_marker(skip_gpu) diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000000..bbb432ba68 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python -m pytest --cov-config .coveragerc --cov-report html --cov-report term --cov=. --rungpu diff --git a/setup.py b/setup.py index bf8badcf60..c0a495a043 100644 --- a/setup.py +++ b/setup.py @@ -21,10 +21,16 @@ 'cloudpickle', 'tensorflow>=1.4.0', 'click', - 'opencv-python' + 'opencv-python', + 'numpy', + 'pandas', + 'pytest', + 'matplotlib', + 'seaborn', + 'glob2' ], description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', author='OpenAI', url='https://github.com/openai/baselines', author_email='gym@openai.com', - version='0.1.5') + version='0.1.6') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_atari.py b/tests/test_atari.py new file mode 100644 index 0000000000..56b3092119 --- /dev/null +++ b/tests/test_atari.py @@ -0,0 +1,112 @@ +import pytest + +import tensorflow as tf + +from baselines import deepq, bench, logger +from baselines.common import set_global_seeds +from baselines.common.atari_wrappers import make_atari +import baselines.a2c.run_atari as a2c_atari +import baselines.acer.run_atari as acer_atari +import baselines.acktr.run_atari as acktr_atari +import baselines.ppo1.run_atari as ppo1_atari +import baselines.ppo2.run_atari as ppo2_atari +import baselines.trpo_mpi.run_atari as trpo_atari + + +ENV_ID = 'BreakoutNoFrameskip-v4' +SEED = 3 +NUM_TIMESTEPS = 2500 +NUM_CPU = 4 + + +def clear_tf_session(): + """ + clears the Tensorflow session, this is needed for sequential testing of the baselines + """ + tf.reset_default_graph() + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm']) +def test_a2c(policy): + """ + test A2C on atari + + :param policy: (str) the policy to test for A2C + """ + clear_tf_session() + a2c_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, + policy=policy, lr_schedule='constant', num_env=NUM_CPU) + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", 
['cnn', 'lstm']) +def test_acer(policy): + """ + test ACER on atari + + :param policy: (str) the policy to test for ACER + """ + clear_tf_session() + acer_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, + policy=policy, lr_schedule='constant', num_cpu=NUM_CPU) + + +@pytest.mark.slow +def test_acktr(): + """ + test ACKTR on atari + """ + clear_tf_session() + acktr_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, num_cpu=NUM_CPU) + + +@pytest.mark.slow +def test_deepq(): + """ + test DeepQ on atari + """ + clear_tf_session() + logger.configure() + set_global_seeds(SEED) + env = make_atari(ENV_ID) + env = bench.Monitor(env, logger.get_dir()) + env = deepq.wrap_atari_dqn(env) + model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) + + deepq.learn(env, q_func=model, learning_rate=1e-4, max_timesteps=NUM_TIMESTEPS, buffer_size=10000, + exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, + target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, prioritized_replay_alpha=0.6, + checkpoint_freq=10000) + + env.close() + + +@pytest.mark.slow +def test_ppo1(): + """ + test PPO1 on atari + """ + clear_tf_session() + ppo1_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm', 'mlp']) +def test_ppo2(policy): + """ + test PPO2 on atari + + :param policy: (str) the policy to test for PPO2 + """ + clear_tf_session() + ppo2_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, policy=policy) + + +@pytest.mark.slow +def test_trpo(): + """ + test TRPO on atari + """ + clear_tf_session() + trpo_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000000..7fccd367a9 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,6 @@ +def _assert_eq(left, right): + assert left == right, '{} != {}'.format(left, right) + + +def _assert_neq(left, right): + assert left != right, '{} == {}'.format(left, right) diff --git a/tests/test_continuous.py b/tests/test_continuous.py new file mode 100644 index 0000000000..4daec6bc19 --- /dev/null +++ b/tests/test_continuous.py @@ -0,0 +1,12 @@ +import subprocess + +from .test_common import _assert_eq + +ENV_ID = 'Pendulum-v0' + + +def test_ddpg(): + args = ['--env-id', ENV_ID, '--nb-epochs', 2, '--nb-epoch-cycles', 2, '--nb-rollout-steps', 100] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.ddpg.main'] + args) + _assert_eq(return_code, 0) diff --git a/tests/test_deepq.py b/tests/test_deepq.py new file mode 100644 index 0000000000..8be95a045f --- /dev/null +++ b/tests/test_deepq.py @@ -0,0 +1,28 @@ +import subprocess + +from .test_common import _assert_eq + + +def test_custom_cartpole(): + args = ['--no-render', '--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.custom_cartpole'] + args) + _assert_eq(return_code, 0) + +def test_cartpole(): + args = ['--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.train_cartpole'] + args) + _assert_eq(return_code, 0) + + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.enjoy_cartpole', '--no-render']) + _assert_eq(return_code, 0) + +def test_mountaincar(): + args = 
['--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.train_mountaincar'] + args) + _assert_eq(return_code, 0) + + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.enjoy_mountaincar', '--no-render']) + _assert_eq(return_code, 0) diff --git a/tests/test_distri.py b/tests/test_distri.py new file mode 100644 index 0000000000..735b06239e --- /dev/null +++ b/tests/test_distri.py @@ -0,0 +1,68 @@ +import numpy as np +import tensorflow as tf + +import baselines.common.tf_util as tf_util +from baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ + CategoricalProbabilityDistributionType, \ + MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType + + +@tf_util.in_session +def test_probtypes(): + """ + test probability distribution types + """ + np.random.seed(0) + + pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) + diag_gauss = DiagGaussianProbabilityDistributionType(pdparam_diag_gauss.size // 2) + validate_probtype(diag_gauss, pdparam_diag_gauss) + + pdparam_categorical = np.array([-.2, .3, .5]) + categorical = CategoricalProbabilityDistributionType(pdparam_categorical.size) + validate_probtype(categorical, pdparam_categorical) + + nvec = [1, 2, 3] + pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) + multicategorical = MultiCategoricalProbabilityDistributionType(nvec) + validate_probtype(multicategorical, pdparam_multicategorical) + + pdparam_bernoulli = np.array([-.2, .3, .5]) + bernoulli = BernoulliProbabilityDistributionType(pdparam_bernoulli.size) + validate_probtype(bernoulli, pdparam_bernoulli) + + +def validate_probtype(probtype, pdparam): + """ + validate probability distribution types + + :param probtype: (ProbabilityDistributionType) the type to validate + :param pdparam: ([float]) the flat probabilities to test + """ + number_samples = 100000 + # Check to see if mean negative log likelihood == differential entropy + mval = np.repeat(pdparam[None, :], number_samples, axis=0) + mval_ph = probtype.param_placeholder([number_samples]) + xval_ph = probtype.sample_placeholder([number_samples]) + proba_distribution = probtype.proba_distribution_from_flat(mval_ph) + calcloglik = tf_util.function([xval_ph, mval_ph], proba_distribution.logp(xval_ph)) + calcent = tf_util.function([mval_ph], proba_distribution.entropy()) + xval = tf.get_default_session().run(proba_distribution.sample(), feed_dict={mval_ph: mval}) + logliks = calcloglik(xval, mval) + entval_ll = - logliks.mean() + entval_ll_stderr = logliks.std() / np.sqrt(number_samples) + entval = calcent(mval).mean() + assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas + + # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] + mval2_ph = probtype.param_placeholder([number_samples]) + pd2 = probtype.proba_distribution_from_flat(mval2_ph) + tmp = pdparam + np.random.randn(pdparam.size) * 0.1 + mval2 = np.repeat(tmp[None, :], number_samples, axis=0) + calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2)) + klval = calckl(mval, mval2).mean() + logliks = calcloglik(xval, mval2) + klval_ll = - entval - logliks.mean() + klval_ll_stderr = logliks.std() / np.sqrt(number_samples) + assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas + print('ok on', probtype, pdparam) diff --git a/baselines/common/test_identity.py b/tests/test_identity.py similarity index 66% rename from 
baselines/common/test_identity.py rename to tests/test_identity.py index a429e0c27b..38ca72be35 100644 --- a/baselines/common/test_identity.py +++ b/tests/test_identity.py @@ -1,29 +1,32 @@ +import random + import pytest import tensorflow as tf -import random import numpy as np -from gym.spaces import np_random +from gym.spaces.prng import np_random from baselines.a2c import a2c from baselines.ppo2 import ppo2 from baselines.common.identity_env import IdentityEnv from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.ppo2.policies import MlpPolicy +from baselines.a2c.policies import MlpPolicy learn_func_list = [ lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000), - lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01) + lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, learning_rate=1e-3, n_steps=128, ent_coef=0.01) ] @pytest.mark.slow @pytest.mark.parametrize("learn_func", learn_func_list) def test_identity(learn_func): - ''' + """ Test if the algorithm (with a given policy) can learn an identity transformation (i.e. return observation as an action) - ''' + + :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator + """ np.random.seed(0) np_random.seed(0) random.seed(0) @@ -34,11 +37,11 @@ def test_identity(learn_func): tf.set_random_seed(0) model = learn_func(env) - N_TRIALS = 1000 - sum_rew = 0 + n_trials = 1000 + reward_sum = 0 obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(model.step(obs)[0]) - sum_rew += rew + for _ in range(n_trials): + obs, reward, _, _ = env.step(model.step(obs)[0]) + reward_sum += reward - assert sum_rew > 0.9 * N_TRIALS + assert reward_sum > 0.9 * n_trials diff --git a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000000..fe3e9eee0f --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,50 @@ +import subprocess + +import pytest + +from baselines.logger import make_output_format, read_tb, read_csv, read_json + +KEY_VALUES = {'test': 1, 'b': -3.14, '8': 9.9} +LOG_DIR = '/tmp/openai_baselines/' + + +def _assert_eq(left, right): + assert left == right, '{} != {}'.format(left, right) + + +def _assert_neq(left, right): + assert left != right, '{} == {}'.format(left, right) + + +def test_main(): + """ + Dry-run python -m baselines.logger + """ + return_code = subprocess.call(['python', 'baselines/logger.py']) + _assert_eq(return_code, 0) + + +@pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) +def test_make_output(_format): + """ + test make output + + :param _format: (str) output format + """ + writer = make_output_format(_format, LOG_DIR) + writer.writekvs(KEY_VALUES) + if _format == 'tensorboard': + read_tb(LOG_DIR) + elif _format == "csv": + read_csv(LOG_DIR + 'progress.csv') + elif _format == 'json': + read_json(LOG_DIR + 'progress.json') + writer.close() + + +def test_make_output_fail(): + """ + test value error on logger + """ + with pytest.raises(ValueError): + make_output_format('dummy_format', LOG_DIR) diff --git a/tests/test_math_util.py b/tests/test_math_util.py new file mode 100644 index 0000000000..b1db323b22 --- /dev/null +++ b/tests/test_math_util.py @@ -0,0 +1,15 @@ +import numpy as np + +from baselines.common.math_util import discount_with_boundaries + + +def test_discount_with_boundaries(): + """ + test the discount_with_boundaries function + """ + gamma = 0.9 + rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32') + 
episode_starts = [1.0, 0.0, 0.0, 1.0] + discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma) + assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4]) + return diff --git a/tests/test_mpi_adam.py b/tests/test_mpi_adam.py new file mode 100644 index 0000000000..bc0c1337b7 --- /dev/null +++ b/tests/test_mpi_adam.py @@ -0,0 +1,10 @@ +import subprocess + +from .test_common import _assert_eq + + +def test_mpi_adam(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'baselines.common.mpi_adam']) + _assert_eq(return_code, 0) diff --git a/tests/test_running_stat.py b/tests/test_running_stat.py new file mode 100644 index 0000000000..cda4eda7f0 --- /dev/null +++ b/tests/test_running_stat.py @@ -0,0 +1,20 @@ +import numpy as np + +from baselines.common.running_stat import RunningStat + + +def test_running_stat(): + """ + test RunningStat object + """ + for shape in ((), (3,), (3, 4)): + hist = [] + running_stat = RunningStat(shape) + for _ in range(5): + val = np.random.randn(*shape) + running_stat.push(val) + hist.append(val) + _mean = np.mean(hist, axis=0) + assert np.allclose(running_stat.mean, _mean) + _var = np.square(_mean) if (len(hist) == 1) else np.var(hist, ddof=1, axis=0) + assert np.allclose(running_stat.var, _var) diff --git a/tests/test_schedules.py b/tests/test_schedules.py new file mode 100644 index 0000000000..849b84a155 --- /dev/null +++ b/tests/test_schedules.py @@ -0,0 +1,33 @@ +import numpy as np + +from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule + + +def test_piecewise_schedule(): + """ + test PiecewiseSchedule + """ + piecewise_sched = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], + outside_value=500) + + assert np.isclose(piecewise_sched.value(-10), 500) + assert np.isclose(piecewise_sched.value(0), 150) + assert np.isclose(piecewise_sched.value(5), 200) + assert np.isclose(piecewise_sched.value(9), 80) + assert np.isclose(piecewise_sched.value(50), 50) + assert np.isclose(piecewise_sched.value(80), 50) + assert np.isclose(piecewise_sched.value(150), 0) + assert np.isclose(piecewise_sched.value(175), -25) + assert np.isclose(piecewise_sched.value(201), 500) + assert np.isclose(piecewise_sched.value(500), 500) + + assert np.isclose(piecewise_sched.value(200 - 1e-10), -50) + + +def test_constant_schedule(): + """ + test ConstantSchedule + """ + constant_sched = ConstantSchedule(5) + for i in range(-100, 100): + assert np.isclose(constant_sched.value(i), 5) diff --git a/baselines/common/tests/test_segment_tree.py b/tests/test_segment_tree.py similarity index 91% rename from baselines/common/tests/test_segment_tree.py rename to tests/test_segment_tree.py index 700e0bb456..4e8de75540 100644 --- a/baselines/common/tests/test_segment_tree.py +++ b/tests/test_segment_tree.py @@ -4,6 +4,9 @@ def test_tree_set(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -18,6 +21,9 @@ def test_tree_set(): def test_tree_set_overlap(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -31,6 +37,9 @@ def test_tree_set_overlap(): def test_prefixsum_idx(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -45,6 +54,9 @@ def test_prefixsum_idx(): def test_prefixsum_idx2(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[0] = 0.5 @@ -61,6 +73,9 @@ def 
test_prefixsum_idx2(): def test_max_interval_tree(): + """ + test Segment Tree data structure + """ tree = MinSegmentTree(4) tree[0] = 1.0 diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py new file mode 100644 index 0000000000..e810e21d90 --- /dev/null +++ b/tests/test_tf_util.py @@ -0,0 +1,43 @@ +# tests for tf_util +import tensorflow as tf + +from baselines.common.tf_util import function, initialize, single_threaded_session + + +def test_function(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + y_ph = tf.placeholder(tf.int32, (), name="y") + z_ph = 3 * x_ph + 2 * y_ph + linear_fn = function([x_ph, y_ph], z_ph, givens={y_ph: 0}) + + with single_threaded_session(): + initialize() + + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +def test_multikwargs(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + with tf.variable_scope("other"): + x2_ph = tf.placeholder(tf.int32, (), name="x") + z_ph = 3 * x_ph + 2 * x2_ph + + linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0}) + with single_threaded_session(): + initialize() + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +if __name__ == '__main__': + test_function() + test_multikwargs() diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py new file mode 100644 index 0000000000..9c4f615048 --- /dev/null +++ b/tests/test_vec_normalize.py @@ -0,0 +1,58 @@ +import subprocess + +import gym +import numpy as np + +from baselines.common.running_mean_std import RunningMeanStd +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.common.vec_env.vec_normalize import VecNormalize +from .test_common import _assert_eq + +ENV_ID = 'BreakoutNoFrameskip-v4' + + +def test_runningmeanstd(): + """Test RunningMeanStd object""" + for (x_1, x_2, x_3) in [ + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: + rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) + + x_cat = np.concatenate([x_1, x_2, x_3], axis=0) + moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)] + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) + moments_2 = [rms.mean, rms.var] + + assert np.allclose(moments_1, moments_2) + + +def test_vec_env(): + """Test VecNormalize Object""" + + def make_env(): + return gym.make(ENV_ID) + + env = DummyVecEnv([make_env]) + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + _, done = env.reset(), [False] + while not done[0]: + actions = [env.action_space.sample()] + obs, _, done, _ = env.step(actions) + assert np.max(obs) <= 10 + + +def test_mpi_runningmeanstd(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'baselines.common.mpi_running_mean_std']) + _assert_eq(return_code, 0) + + +def test_mpi_moments(): + """ + test running mean std function + """ + subprocess.check_call(['mpirun', '--allow-run-as-root', '-np', '3', 'python', '-c', + 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
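The last_gae_lam loop added to Runner.run earlier in this patch is the Generalized Advantage Estimation recurrence: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t), accumulated as adv_t = delta_t + gamma * lam * (1 - done_{t+1}) * adv_{t+1}. The sketch below reproduces that recurrence on toy NumPy arrays so the bootstrapping can be checked in isolation; it is an illustration only, and the array sizes and random values are invented for the example.

import numpy as np

# Toy rollout buffers shaped (n_steps, n_envs); all numbers here are invented.
n_steps, n_envs = 4, 2
gamma, lam = 0.99, 0.95
rewards = np.random.randn(n_steps, n_envs).astype(np.float32)
values = np.random.randn(n_steps, n_envs).astype(np.float32)
dones = np.zeros((n_steps, n_envs), dtype=np.float32)      # per-step terminal flags
last_values = np.random.randn(n_envs).astype(np.float32)   # V(s) after the final step
last_dones = np.zeros(n_envs, dtype=np.float32)            # terminal flags after the final step

mb_advs = np.zeros_like(rewards)
last_gae_lam = 0
for step in reversed(range(n_steps)):
    if step == n_steps - 1:
        nextnonterminal = 1.0 - last_dones
        nextvalues = last_values
    else:
        nextnonterminal = 1.0 - dones[step + 1]
        nextvalues = values[step + 1]
    # one-step TD residual, then the exponentially weighted (gamma * lam) accumulation
    delta = rewards[step] + gamma * nextvalues * nextnonterminal - values[step]
    mb_advs[step] = last_gae_lam = delta + gamma * lam * nextnonterminal * last_gae_lam

mb_returns = mb_advs + values   # targets for the value-function loss, as in the patch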
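swap_and_flatten (the renamed sf01 helper) turns a rollout batch laid out as (n_steps, n_envs, ...) into a flat batch of (n_steps * n_envs, ...) samples, grouped by environment first. A small shape check, using made-up dimensions, illustrates the transformation; the helper body is copied from the patch.

import numpy as np

def swap_and_flatten(arr):
    # Same logic as the helper in the patch: (n_steps, n_envs, *rest) -> (n_steps * n_envs, *rest)
    shape = arr.shape
    return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:])

# Hypothetical Atari rollout: 128 steps, 8 envs, 84x84 frames with 4 stacked channels
batch = np.zeros((128, 8, 84, 84, 4), dtype=np.uint8)
print(swap_and_flatten(batch).shape)   # (1024, 84, 84, 4): one row per (env, step) sample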
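plot_curves smooths the scattered episode rewards with window_func(x, y, EPISODES_WINDOW, np.mean), a rolling mean over the last EPISODES_WINDOW episodes built from a NumPy stride trick. The following sketch shows the mechanics on a tiny, invented series; rolling_window and window_func are copied from results_plotter.py as rewritten above.

import numpy as np

def rolling_window(array, window):
    # Stride trick from results_plotter.py: view of shape (..., n - window + 1, window)
    shape = array.shape[:-1] + (array.shape[-1] - window + 1, window)
    strides = array.strides + (array.strides[-1],)
    return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides)

def window_func(var_1, var_2, window, func):
    # Apply func over a rolling window of var_2 and trim var_1 to match
    var_2_window = rolling_window(var_2, window)
    return var_1[window - 1:], func(var_2_window, axis=-1)

episodes = np.arange(6)                        # x values, e.g. episode index
rewards = np.array([0., 1., 2., 3., 4., 5.])   # y values, e.g. episode rewards (invented)
x, y_mean = window_func(episodes, rewards, 3, np.mean)
print(x)        # [2 3 4 5]
print(y_mean)   # [1. 2. 3. 4.]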
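Finally, the vectorized-environment wrapping that run_mujoco.py and tests/test_vec_normalize.py rely on (DummyVecEnv around an env factory, then VecNormalize for observation and reward normalization) looks roughly as follows. This is a hedged usage sketch, not part of the diff: it assumes baselines and gym are importable, and it uses Pendulum-v0 (already used in tests/test_continuous.py) purely as an example environment.

import gym

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

def make_env():
    # Pendulum-v0 is only an example; any Gym env (optionally Monitor-wrapped) works here
    return gym.make('Pendulum-v0')

env = DummyVecEnv([make_env])   # single-process vectorized wrapper expected by ppo2.learn
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

obs = env.reset()
obs, rewards, dones, infos = env.step([env.action_space.sample()])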