diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000..a0bbe87e15 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,16 @@ +[run] +branch = False +omit = + baselines/common/tests/* + # Mujoco requires a licence + baselines/*/run_mujoco.py + baselines/ppo1/run_humanoid.py + baselines/ppo1/run_robotics.py + # HER requires mpi and Mujoco + baselines/her/experiment/ + +[report] +exclude_lines = + pragma: no cover + raise NotImplementedError() + if KFAC_DEBUG: diff --git a/.gitignore b/.gitignore index 722e942b29..ac2dba664e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,13 @@ *.pyc *.pkl *.py~ +*.bak .pytest_cache .DS_Store .idea +.coverage +.coverage.* +__pycache__/ # Setuptools distribution and build folders. /dist/ @@ -34,5 +38,3 @@ src .cache MUJOCO_LOG.TXT - - diff --git a/.travis.yml b/.travis.yml index 5ba3eadd97..4d5abfbdaf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,9 @@ language: python python: - "3.6" +notifications: + email: false + services: - docker @@ -11,4 +14,4 @@ install: script: - flake8 --select=F baselines/common - - docker run baselines-test pytest + - docker run --env CODACY_PROJECT_TOKEN=$CODACY_PROJECT_TOKEN baselines-test sh -c 'pytest --cov-config .coveragerc --cov-report term --cov-report xml --cov=. && python-codacy-coverage -r coverage.xml --token=$CODACY_PROJECT_TOKEN' diff --git a/Dockerfile b/Dockerfile index eeac22ad2f..3b1d0d4ad4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,43 @@ FROM ubuntu:16.04 -RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake +RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake libglib2.0-0 libsm6 libxext6 libfontconfig1 libxrender1 ENV CODE_DIR /root/code ENV VENV /root/venv -COPY . $CODE_DIR/baselines RUN \ pip install virtualenv && \ virtualenv $VENV --python=python3 && \ . $VENV/bin/activate && \ + mkdir $CODE_DIR && \ cd $CODE_DIR && \ pip install --upgrade pip && \ - pip install -e baselines && \ - pip install pytest + pip install pytest && \ + pip install pytest-cov && \ + pip install codacy-coverage && \ + pip install scipy && \ + pip install tqdm && \ + pip install joblib && \ + pip install zmq && \ + pip install dill && \ + pip install progressbar2 && \ + pip install mpi4py && \ + pip install cloudpickle && \ + pip install tensorflow>=1.4.0 && \ + pip install click && \ + pip install opencv-python && \ + pip install numpy && \ + pip install pandas && \ + pip install pytest && \ + pip install matplotlib && \ + pip install seaborn && \ + pip install glob2 && \ + pip install gym[mujoco,atari,classic_control,robotics] + +COPY . $CODE_DIR/baselines +RUN \ + . 
$VENV/bin/activate && \ + cd $CODE_DIR && \ + pip install -e baselines ENV PATH=$VENV/bin:$PATH WORKDIR $CODE_DIR/baselines diff --git a/README.md b/README.md index 197f01af97..a48c78c3dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines) + [![Build Status](https://travis-ci.org/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.org/hill-a/stable-baselines) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage) # Baselines diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index f1de88a37e..653a1b4a76 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -1,59 +1,75 @@ -import os.path as osp +import os import time import joblib + import numpy as np import tensorflow as tf -from baselines import logger -from baselines.common import set_global_seeds, explained_variance +from baselines import logger +from baselines.common import set_global_seeds, explained_variance, tf_util from baselines.common.runners import AbstractEnvRunner -from baselines.common import tf_util +from baselines.a2c.utils import discount_with_dones, Scheduler, make_path, find_trainable_variables, calc_entropy, mse -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse class Model(object): - - def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, - ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, - alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): + def __init__(self, policy, ob_space, ac_space, n_envs, n_steps, + ent_coef=0.01, vf_coef=0.25, max_grad_norm=0.5, learning_rate=7e-4, + alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lr_schedule='linear'): + """ + The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783 + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Space) Observation space + :param ac_space: (Gym Space) Action space + :param n_envs: (int) The number of environments + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) Entropy coefficient for the loss caculation + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param learning_rate: (float) The learning rate + :param alpha: (float) RMS prop optimizer decay + :param epsilon: (float) RMS prop optimizer epsilon + :param total_timesteps: (int) The total number of samples + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ sess = tf_util.make_session() - nbatch = nenvs*nsteps + n_batch = n_envs * n_steps - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - LR = tf.placeholder(tf.float32, []) + actions_ph = tf.placeholder(tf.int32, [n_batch]) + advs_ph = tf.placeholder(tf.float32, [n_batch]) + rewards_ph = tf.placeholder(tf.float32, [n_batch]) + learning_rate_ph = tf.placeholder(tf.float32, []) - step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) + step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) + train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) - neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - pg_loss = tf.reduce_mean(ADV * neglogpac) - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) - loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef + neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=actions_ph) + pg_loss = tf.reduce_mean(advs_ph * neglogpac) + vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) + entropy = tf.reduce_mean(calc_entropy(train_model.policy)) + loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: - grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) + grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) + trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) - lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values - for step in range(len(obs)): - cur_lr = lr.value() - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} + for _ in range(len(obs)): + cur_lr = learning_rate.value() + td_map = {train_model.obs_ph: obs, actions_ph: actions, advs_ph: advs, + rewards_ph: rewards, learning_rate_ph: cur_lr} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, 
_train], td_map @@ -61,15 +77,15 @@ def train(obs, states, rewards, masks, actions, values): return policy_loss, value_loss, policy_entropy def save(save_path): - ps = sess.run(params) - make_path(osp.dirname(save_path)) - joblib.dump(ps, save_path) + parameters = sess.run(params) + make_path(os.path.dirname(save_path)) + joblib.dump(parameters, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train @@ -82,16 +98,30 @@ def load(load_path): self.load = load tf.global_variables_initializer().run(session=sess) -class Runner(AbstractEnvRunner): - def __init__(self, env, model, nsteps=5, gamma=0.99): - super().__init__(env=env, model=model, nsteps=nsteps) +class Runner(AbstractEnvRunner): + def __init__(self, env, model, n_steps=5, gamma=0.99): + """ + A runner to learn the policy of an environment for a model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) Discount factor + """ + super(Runner, self).__init__(env=env, model=model, n_steps=n_steps) self.gamma = gamma def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] + """ + Run a learning step of the model + + :return: ([float], [float], [float], [bool], [float], [float]) + observations, states, rewards, masks, actions, values + """ + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], [] mb_states = self.states - for n in range(self.nsteps): + for _ in range(self.n_steps): actions, values, states, _ = self.model.step(self.obs, self.states, self.dones) mb_obs.append(np.copy(self.obs)) mb_actions.append(actions) @@ -102,11 +132,11 @@ def run(self): self.dones = dones for n, done in enumerate(dones): if done: - self.obs[n] = self.obs[n]*0 + self.obs[n] = self.obs[n] * 0 self.obs = obs mb_rewards.append(rewards) mb_dones.append(self.dones) - #batch of steps to batch of rollouts + # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) @@ -115,12 +145,12 @@ def run(self): mb_masks = mb_dones[:, :-1] mb_dones = mb_dones[:, 1:] last_values = self.model.value(self.obs, self.states, self.dones).tolist() - #discount/bootstrap off value fn + # discount/bootstrap off value fn for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): rewards = rewards.tolist() dones = dones.tolist() if dones[-1] == 0: - rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] + rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1] else: rewards = discount_with_dones(rewards, dones, self.gamma) mb_rewards[n] = rewards @@ -130,31 +160,56 @@ def run(self): mb_masks = mb_masks.flatten() return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values -def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): + +def learn(policy, env, seed, n_steps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, 
max_grad_norm=0.5, + learning_rate=7e-4, lr_schedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100): + """ + Return a trained A2C model. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param env: (Gym environment) The environment to learn from + :param seed: (int) The initial seed for training + :param n_steps: (int) The number of steps to run for each environment + :param total_timesteps: (int) The total number of samples + :param vf_coef: (float) Value function coefficient for the loss calculation + :param ent_coef: (float) Entropy coefficient for the loss caculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param learning_rate: (float) The learning rate + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param epsilon: (float) RMS prop optimizer epsilon + :param alpha: (float) RMS prop optimizer decay + :param gamma: (float) Discount factor + :param log_interval: (int) The number of timesteps before logging. + :return: (Model) A2C model + """ set_global_seeds(seed) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - - nbatch = nenvs*nsteps - tstart = time.time() - for update in range(1, total_timesteps//nbatch+1): + model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs, + n_steps=n_steps, ent_coef=ent_coef, + vf_coef=vf_coef, max_grad_norm=max_grad_norm, learning_rate=learning_rate, + alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, + lr_schedule=lr_schedule) + runner = Runner(env, model, n_steps=n_steps, gamma=gamma) + + n_batch = n_envs * n_steps + t_start = time.time() + for update in range(1, total_timesteps // n_batch + 1): obs, states, rewards, masks, actions, values = runner.run() - policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) + _, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) + n_seconds = time.time() - t_start + fps = int((update * n_batch) / n_seconds) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, rewards) + explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) + logger.record_tabular("total_timesteps", update * n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) + logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() env.close() return model diff --git a/baselines/a2c/policies.py b/baselines/a2c/policies.py index 6fbbb14ac8..61ebe71780 100644 --- a/baselines/a2c/policies.py +++ b/baselines/a2c/policies.py @@ -1,146 +1,141 @@ import numpy as np import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from 
baselines.common.distributions import make_pdtype + +from baselines.a2c.utils import conv, linear, conv_to_fc, batch_to_seq, seq_to_batch, lstm +from baselines.common.distributions import make_proba_dist_type from baselines.common.input import observation_input -def nature_cnn(unscaled_images, **conv_kwargs): + +def nature_cnn(unscaled_images, **kwargs): """ CNN from Nature paper. + + :param unscaled_images: (TensorFlow Tensor) Image input placeholder + :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN + :return: (TensorFlow Tensor) The CNN output layer """ scaled_images = tf.cast(unscaled_images, tf.float32) / 255. activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) + layer_1 = activ(conv(scaled_images, 'c1', n_filters=32, filter_size=8, stride=4, init_scale=np.sqrt(2), **kwargs)) + layer_2 = 
activ(conv(layer_1, 'c2', n_filters=64, filter_size=4, stride=2, init_scale=np.sqrt(2), **kwargs)) + layer_3 = activ(conv(layer_2, 'c3', n_filters=64, filter_size=3, stride=1, init_scale=np.sqrt(2), **kwargs)) + layer_3 = conv_to_fc(layer_3) + return activ(linear(layer_3, 'fc1', n_hidden=512, init_scale=np.sqrt(2))) + + +class A2CPolicy(object): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False): + """ + Policy object for A2C + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_steps: (int) The number of steps to run for each environment + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + :param reuse: (bool) If the policy is reusable or not + """ + self.n_env = n_batch // n_steps + self.obs_ph, self.processed_x = observation_input(ob_space, n_batch) + self.masks_ph = tf.placeholder(tf.float32, [n_batch]) # mask (done t-1) + self.states_ph = tf.placeholder(tf.float32, [self.n_env, n_lstm * 2]) # states + self.pdtype = make_proba_dist_type(ac_space) + self.sess = sess + self.reuse = reuse + + def step(self, obs, state=None, mask=None): + """ + Returns the policy for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float], [float], [float], [float]) actions, values, states, neglogp + """ + raise NotImplementedError + + def value(self, obs, state=None, mask=None): + """ + Returns the value for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :return: ([float]) The associated value of the action + """ + raise NotImplementedError + + +class LstmPolicy(A2CPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, layer_norm=False, **kwargs): + super(LstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) + extracted_features = nature_cnn(self.obs_ph, **kwargs) + input_sequence = batch_to_seq(extracted_features, self.n_env, n_steps) + masks = batch_to_seq(self.masks_ph, self.n_env, n_steps) + rnn_output, self.snew = lstm(input_sequence, masks, self.states_ph, 'lstm1', n_hidden=n_lstm, + layer_norm=layer_norm) + rnn_output = seq_to_batch(rnn_output) + value_fn = linear(rnn_output, 'v', 1) + self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(rnn_output) - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None + self._value = value_fn[:, 0] + self.action = self.proba_distribution.sample() + self.neglogp = self.proba_distribution.neglogp(self.action) + self.initial_state = np.zeros((self.n_env, n_lstm * 2), dtype=np.float32) + self.value_fn = value_fn + + def step(self, obs, state=None, mask=None): + return self.sess.run([self.action, self._value, self.snew, self.neglogp], + {self.obs_ph: obs, self.states_ph: state, 
self.masks_ph: mask}) + + def value(self, obs, state=None, mask=None): + return self.sess.run(self._value, {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) +class LnLstmPolicy(LstmPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, **_): + super(LnLstmPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, layer_norm=True) - self.X = X - self.vf = vf - self.step = step - self.value = value -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) +class FeedForwardPolicy(A2CPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, _type="cnn", **kwargs): + super(FeedForwardPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse) with tf.variable_scope("model", reuse=reuse): - X, processed_x = observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] + if _type == "cnn": + extracted_features = nature_cnn(self.processed_x, **kwargs) + value_fn = linear(extracted_features, 'v', 1)[:, 0] + else: + activ = tf.tanh + processed_x = tf.layers.flatten(self.processed_x) + pi_h1 = activ(linear(processed_x, 'pi_fc1', n_hidden=64, init_scale=np.sqrt(2))) + pi_h2 = activ(linear(pi_h1, 'pi_fc2', n_hidden=64, init_scale=np.sqrt(2))) + vf_h1 = activ(linear(processed_x, 'vf_fc1', n_hidden=64, init_scale=np.sqrt(2))) + vf_h2 = activ(linear(vf_h1, 'vf_fc2', n_hidden=64, init_scale=np.sqrt(2))) + value_fn = linear(vf_h2, 'vf', 1)[:, 0] + extracted_features = pi_h2 + self.proba_distribution, self.policy = self.pdtype.proba_distribution_from_latent(extracted_features, + init_scale=0.01) + + self.action = self.proba_distribution.sample() + self.neglogp = self.proba_distribution.neglogp(self.action) + self.initial_state = None + self.value_fn = value_fn - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) + def step(self, obs, state=None, mask=None): + action, value, neglogp = self.sess.run([self.action, self.value_fn, self.neglogp], {self.obs_ph: obs}) + return action, value, self.initial_state, neglogp + def value(self, obs, state=None, mask=None): + return self.sess.run(self.value_fn, {self.obs_ph: obs}) - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp +class CnnPolicy(FeedForwardPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, n_lstm=256, reuse=False, **_kwargs): + super(CnnPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, _type="cnn") - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - self.X = X - self.vf = vf - self.step = step - self.value = value +class MlpPolicy(FeedForwardPolicy): + def __init__(self, sess, ob_space, ac_space, n_batch, n_steps, 
n_lstm=256, reuse=False, **_kwargs): + super(MlpPolicy, self).__init__(sess, ob_space, ac_space, n_batch, n_steps, n_lstm, reuse, _type="mlp") diff --git a/baselines/a2c/run_atari.py b/baselines/a2c/run_atari.py index b09d9bbffe..2b91609841 100644 --- a/baselines/a2c/run_atari.py +++ b/baselines/a2c/run_atari.py @@ -4,27 +4,49 @@ from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.a2c.a2c import learn -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy +from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy -def train(env_id, num_timesteps, seed, policy, lrschedule, num_env): + +def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): + """ + Train A2C model for atari environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param num_env: (int) The number of environments + """ + policy_fn = None if policy == 'cnn': policy_fn = CnnPolicy elif policy == 'lstm': policy_fn = LstmPolicy elif policy == 'lnlstm': policy_fn = LnLstmPolicy + if policy_fn is None: + raise ValueError("Error: policy {} not implemented".format(policy)) + env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule) env.close() + def main(): + """ + Runs the test + """ parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') + parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') + parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', + help='Learning rate schedule') args = parser.parse_args() logger.configure() - train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_env=16) + train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, + num_env=16) + if __name__ == '__main__': main() diff --git a/baselines/a2c/utils.py b/baselines/a2c/utils.py index a7610ebcdc..09ec86738c 100644 --- a/baselines/a2c/utils.py +++ b/baselines/a2c/utils.py @@ -1,254 +1,482 @@ import os -import gym +from collections import deque + import numpy as np import tensorflow as tf -from gym import spaces -from collections import deque + def sample(logits): + """ + Creates a sampling Tensor for non deterministic policies + + :param logits: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The sampled action + """ noise = tf.random_uniform(tf.shape(logits)) return tf.argmax(logits - tf.log(-tf.log(noise)), 1) -def cat_entropy(logits): - a0 = logits - tf.reduce_max(logits, 1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, 1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * 
(tf.log(z0) - a0), 1) -def cat_entropy_softmax(p0): - return - tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis = 1) +def calc_entropy(logits): + """ + Calculates the entropy of the output values of the network + + :param logits: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The Entropy of the output values of the network + """ + # Compute softmax + a_0 = logits - tf.reduce_max(logits, 1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + z_0 = tf.reduce_sum(exp_a_0, 1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), 1) + + +def calc_entropy_softmax(action_proba): + """ + Calculates the softmax entropy of the output values of the network + + :param action_proba: (TensorFlow Tensor) The input probability for each action + :return: (TensorFlow Tensor) The softmax entropy of the output values of the network + """ + return - tf.reduce_sum(action_proba * tf.log(action_proba + 1e-6), axis=1) + def mse(pred, target): - return tf.square(pred-target)/2. + """ + Returns the Mean squared error between prediction and target + + :param pred: (TensorFlow Tensor) The predicted value + :param target: (TensorFlow Tensor) The target value + :return: (TensorFlow Tensor) The Mean squared error between prediction and target + """ + return tf.reduce_mean(tf.square(pred - target)) + def ortho_init(scale=1.0): - def _ortho_init(shape, dtype, partition_info=None): - #lasagne ortho init for tf + """ + Orthogonal initialization for the policy weights + + :param scale: (float) Scaling factor for the weights. + :return: (function) an initialization function for the weights + """ + + # _ortho_init(shape, dtype, partition_info=None) + def _ortho_init(shape, *_, **_kwargs): + """Intialize weights as Orthogonal matrix. + + Orthogonal matrix initialization [1]_. For n-dimensional shapes where + n > 2, the n-1 trailing axes are flattened. For convolutional layers, this + corresponds to the fan-in, so this makes the initialization usable for + both dense and convolutional layers. + + References + ---------- + .. [1] Saxe, Andrew M., James L. McClelland, and Surya Ganguli. 
+ "Exact solutions to the nonlinear dynamics of learning in deep + linear + """ + # lasagne ortho init for tf shape = tuple(shape) if len(shape) == 2: flat_shape = shape - elif len(shape) == 4: # assumes NHWC + elif len(shape) == 4: # assumes NHWC flat_shape = (np.prod(shape[:-1]), shape[-1]) else: raise NotImplementedError - a = np.random.normal(0.0, 1.0, flat_shape) - u, _, v = np.linalg.svd(a, full_matrices=False) - q = u if u.shape == flat_shape else v # pick the one with the correct shape - q = q.reshape(shape) - return (scale * q[:shape[0], :shape[1]]).astype(np.float32) + gaussian_noise = np.random.normal(0.0, 1.0, flat_shape) + u, _, v = np.linalg.svd(gaussian_noise, full_matrices=False) + weights = u if u.shape == flat_shape else v # pick the one with the correct shape + weights = weights.reshape(shape) + return (scale * weights[:shape[0], :shape[1]]).astype(np.float32) + return _ortho_init -def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False): + +def conv(input_tensor, scope, *, n_filters, filter_size, stride, + pad='VALID', init_scale=1.0, data_format='NHWC', one_dim_bias=False): + """ + Creates a 2d convolutional layer for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution + :param scope: (str) The TensorFlow variable scope + :param n_filters: (int) The number of filters + :param filter_size: (int) The filter size + :param stride: (int) The stride of the convolution + :param pad: (str) The padding type ('VALID' or 'SAME') + :param init_scale: (int) The initialization scale + :param data_format: (str) The data format for the convolution weights + :param one_dim_bias: (bool) If the bias should be one dimentional or not + :return: (TensorFlow Tensor) 2d convolutional layer + """ if data_format == 'NHWC': channel_ax = 3 strides = [1, stride, stride, 1] - bshape = [1, 1, 1, nf] + bshape = [1, 1, 1, n_filters] elif data_format == 'NCHW': channel_ax = 1 strides = [1, 1, stride, stride] - bshape = [1, nf, 1, 1] + bshape = [1, n_filters, 1, 1] else: raise NotImplementedError - bias_var_shape = [nf] if one_dim_bias else [1, nf, 1, 1] - nin = x.get_shape()[channel_ax].value - wshape = [rf, rf, nin, nf] + bias_var_shape = [n_filters] if one_dim_bias else [1, n_filters, 1, 1] + n_input = input_tensor.get_shape()[channel_ax].value + wshape = [filter_size, filter_size, n_input, n_filters] with tf.variable_scope(scope): - w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale)) - b = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) + weight = tf.get_variable("w", wshape, initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", bias_var_shape, initializer=tf.constant_initializer(0.0)) if not one_dim_bias and data_format == 'NHWC': - b = tf.reshape(b, bshape) - return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format) + bias = tf.reshape(bias, bshape) + return bias + tf.nn.conv2d(input_tensor, weight, strides=strides, padding=pad, data_format=data_format) -def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0): - with tf.variable_scope(scope): - nin = x.get_shape()[1].value - w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias)) - return tf.matmul(x, w)+b -def batch_to_seq(h, nbatch, nsteps, flat=False): +def linear(input_tensor, scope, n_hidden, *, init_scale=1.0, init_bias=0.0): + """ + Creates a fully connected layer for 
TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the fully connected layer + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :param init_bias: (int) The initialization offset bias + :return: (TensorFlow Tensor) fully connected layer + """ + with tf.variable_scope(scope): + n_input = input_tensor.get_shape()[1].value + weight = tf.get_variable("w", [n_input, n_hidden], initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", [n_hidden], initializer=tf.constant_initializer(init_bias)) + return tf.matmul(input_tensor, weight) + bias + + +def batch_to_seq(tensor_batch, n_batch, n_steps, flat=False): + """ + Transform a batch of Tensors, into a sequence of Tensors for reccurent policies + + :param tensor_batch: (TensorFlow Tensor) The input tensor to unroll + :param n_batch: (int) The number of batch to run (n_envs * n_steps) + :param n_steps: (int) The number of steps to run for each environment + :param flat: (bool) If the input Tensor is flat + :return: (TensorFlow Tensor) sequence of Tensors for reccurent policies + """ if flat: - h = tf.reshape(h, [nbatch, nsteps]) + tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps]) else: - h = tf.reshape(h, [nbatch, nsteps, -1]) - return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps, value=h)] + tensor_batch = tf.reshape(tensor_batch, [n_batch, n_steps, -1]) + return [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=n_steps, value=tensor_batch)] + -def seq_to_batch(h, flat = False): - shape = h[0].get_shape().as_list() +def seq_to_batch(tensor_sequence, flat=False): + """ + Transform a sequence of Tensors, into a batch of Tensors for reccurent policies + + :param tensor_sequence: (TensorFlow Tensor) The input tensor to batch + :param flat: (bool) If the input Tensor is flat + :return: (TensorFlow Tensor) batch of Tensors for reccurent policies + """ + shape = tensor_sequence[0].get_shape().as_list() if not flat: - assert(len(shape) > 1) - nh = h[0].get_shape()[-1].value - return tf.reshape(tf.concat(axis=1, values=h), [-1, nh]) + assert len(shape) > 1 + n_hidden = tensor_sequence[0].get_shape()[-1].value + return tf.reshape(tf.concat(axis=1, values=tensor_sequence), [-1, n_hidden]) else: - return tf.reshape(tf.stack(values=h, axis=1), [-1]) - -def lstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) - with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = tf.matmul(x, wx) + tf.matmul(h, wh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(c) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def _ln(x, g, b, e=1e-5, axes=[1]): - u, s = tf.nn.moments(x, axes=axes, keep_dims=True) - x = (x-u)/tf.sqrt(s+e) - x = x*g+b - return x - -def lnlstm(xs, ms, s, scope, nh, init_scale=1.0): - nbatch, nin = [v.value for v in xs[0].get_shape()] - nsteps = len(xs) + return tf.reshape(tf.stack(values=tensor_sequence, 
axis=1), [-1]) + + +def lstm(input_tensor, mask_tensor, cell_state_hidden, scope, n_hidden, init_scale=1.0, layer_norm=False): + """ + Creates an Long Short Term Memory (LSTM) cell for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the LSTM cell + :param mask_tensor: (TensorFlow Tensor) The mask tensor for the LSTM cell + :param cell_state_hidden: (TensorFlow Tensor) The state tensor for the LSTM cell + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :param layer_norm: (bool) Whether to apply Layer Normalization or not + :return: (TensorFlow Tensor) LSTM cell + """ + _, n_input = [v.value for v in input_tensor[0].get_shape()] with tf.variable_scope(scope): - wx = tf.get_variable("wx", [nin, nh*4], initializer=ortho_init(init_scale)) - gx = tf.get_variable("gx", [nh*4], initializer=tf.constant_initializer(1.0)) - bx = tf.get_variable("bx", [nh*4], initializer=tf.constant_initializer(0.0)) - - wh = tf.get_variable("wh", [nh, nh*4], initializer=ortho_init(init_scale)) - gh = tf.get_variable("gh", [nh*4], initializer=tf.constant_initializer(1.0)) - bh = tf.get_variable("bh", [nh*4], initializer=tf.constant_initializer(0.0)) - - b = tf.get_variable("b", [nh*4], initializer=tf.constant_initializer(0.0)) - - gc = tf.get_variable("gc", [nh], initializer=tf.constant_initializer(1.0)) - bc = tf.get_variable("bc", [nh], initializer=tf.constant_initializer(0.0)) - - c, h = tf.split(axis=1, num_or_size_splits=2, value=s) - for idx, (x, m) in enumerate(zip(xs, ms)): - c = c*(1-m) - h = h*(1-m) - z = _ln(tf.matmul(x, wx), gx, bx) + _ln(tf.matmul(h, wh), gh, bh) + b - i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z) - i = tf.nn.sigmoid(i) - f = tf.nn.sigmoid(f) - o = tf.nn.sigmoid(o) - u = tf.tanh(u) - c = f*c + i*u - h = o*tf.tanh(_ln(c, gc, bc)) - xs[idx] = h - s = tf.concat(axis=1, values=[c, h]) - return xs, s - -def conv_to_fc(x): - nh = np.prod([v.value for v in x.get_shape()[1:]]) - x = tf.reshape(x, [-1, nh]) - return x + weight_x = tf.get_variable("wx", [n_input, n_hidden * 4], initializer=ortho_init(init_scale)) + weight_h = tf.get_variable("wh", [n_hidden, n_hidden * 4], initializer=ortho_init(init_scale)) + bias = tf.get_variable("b", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + if layer_norm: + # Gain and bias of layer norm + gain_x = tf.get_variable("gx", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) + bias_x = tf.get_variable("bx", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + gain_h = tf.get_variable("gh", [n_hidden * 4], initializer=tf.constant_initializer(1.0)) + bias_h = tf.get_variable("bh", [n_hidden * 4], initializer=tf.constant_initializer(0.0)) + + gain_c = tf.get_variable("gc", [n_hidden], initializer=tf.constant_initializer(1.0)) + bias_c = tf.get_variable("bc", [n_hidden], initializer=tf.constant_initializer(0.0)) + + cell_state, hidden = tf.split(axis=1, num_or_size_splits=2, value=cell_state_hidden) + for idx, (_input, mask) in enumerate(zip(input_tensor, mask_tensor)): + cell_state = cell_state * (1 - mask) + hidden = hidden * (1 - mask) + if layer_norm: + gates = _ln(tf.matmul(_input, weight_x), gain_x, bias_x) \ + + _ln(tf.matmul(hidden, weight_h), gain_h, bias_h) + bias + else: + gates = tf.matmul(_input, weight_x) + tf.matmul(hidden, weight_h) + bias + in_gate, forget_gate, out_gate, cell_candidate = tf.split(axis=1, num_or_size_splits=4, value=gates) + in_gate = 
tf.nn.sigmoid(in_gate) + forget_gate = tf.nn.sigmoid(forget_gate) + out_gate = tf.nn.sigmoid(out_gate) + cell_candidate = tf.tanh(cell_candidate) + cell_state = forget_gate * cell_state + in_gate * cell_candidate + if layer_norm: + hidden = out_gate * tf.tanh(_ln(cell_state, gain_c, bias_c)) + else: + hidden = out_gate * tf.tanh(cell_state) + input_tensor[idx] = hidden + cell_state_hidden = tf.concat(axis=1, values=[cell_state, hidden]) + return input_tensor, cell_state_hidden + + +def _ln(input_tensor, gain, bias, epsilon=1e-5, axes=None): + """ + Apply layer normalisation. + + :param input_tensor: (TensorFlow Tensor) The input tensor for the Layer normalization + :param gain: (TensorFlow Tensor) The scale tensor for the Layer normalization + :param bias: (TensorFlow Tensor) The bias tensor for the Layer normalization + :param epsilon: (float) The epsilon value for floating point calculations + :param axes: (tuple, list or int) The axes to apply the mean and variance calculation + :return: (TensorFlow Tensor) a normalizing layer + """ + if axes is None: + axes = [1] + mean, variance = tf.nn.moments(input_tensor, axes=axes, keep_dims=True) + input_tensor = (input_tensor - mean) / tf.sqrt(variance + epsilon) + input_tensor = input_tensor * gain + bias + return input_tensor + + +def lnlstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale=1.0): + """ + Creates a LSTM with Layer Normalization (lnlstm) cell for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the LSTM cell + :param mask_tensor: (TensorFlow Tensor) The mask tensor for the LSTM cell + :param cell_state: (TensorFlow Tensor) The state tensor for the LSTM cell + :param scope: (str) The TensorFlow variable scope + :param n_hidden: (int) The number of hidden neurons + :param init_scale: (int) The initialization scale + :return: (TensorFlow Tensor) lnlstm cell + """ + return lstm(input_tensor, mask_tensor, cell_state, scope, n_hidden, init_scale, layer_norm=True) + + +def conv_to_fc(input_tensor): + """ + Reshapes a Tensor from a convolutional network to a Tensor for a fully connected network + + :param input_tensor: (TensorFlow Tensor) The convolutional input tensor + :return: (TensorFlow Tensor) The fully connected output tensor + """ + n_hidden = np.prod([v.value for v in input_tensor.get_shape()[1:]]) + input_tensor = tf.reshape(input_tensor, [-1, n_hidden]) + return input_tensor + def discount_with_dones(rewards, dones, gamma): + """ + Apply the discount value to the reward, where the environment is not done + + :param rewards: ([float]) The rewards + :param dones: ([bool]) Whether an environment is done or not + :param gamma: (float) The discount value + :return: ([float]) The discounted rewards + """ discounted = [] - r = 0 + ret = 0 # Return: discounted reward for reward, done in zip(rewards[::-1], dones[::-1]): - r = reward + gamma*r*(1.-done) # fixed off by one bug - discounted.append(r) + ret = reward + gamma * ret * (1. 
- done) # fixed off by one bug + discounted.append(ret) return discounted[::-1] + def find_trainable_variables(key): + """ + Returns the trainable variables within a given scope + + :param key: (str) The variable scope + :return: ([TensorFlow Tensor]) the trainable variables + """ with tf.variable_scope(key): return tf.trainable_variables() -def make_path(f): - return os.makedirs(f, exist_ok=True) -def constant(p): - return 1 +def make_path(path): + """ + For a given path, create the folders if they do not exist + + :param path: (str) The path + :return: (bool) Whether or not it finished correctly + """ + return os.makedirs(path, exist_ok=True) -def linear(p): - return 1-p -def middle_drop(p): +def constant(_): + """ + Returns a constant value for the Scheduler + + :param _: ignored + :return: (float) 1 + """ + return 1. + + +def linear_schedule(progress): + """ + Returns a linear value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress + """ + return 1 - progress + + +def middle_drop(progress): + """ + Returns a linear value with a drop near the middle to a constant value for the Scheduler + + :param progress: (float) Current progress status (in [0, 1]) + :return: (float) 1 - progress if (1 - progress) >= 0.75 else 0.075 + """ eps = 0.75 - if 1-p= 0.125 else 0.125 + """ + progress *= 2 eps = 0.125 - if 1-p 0: - buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size) + buffer = Buffer(env=env, n_steps=n_steps, n_stack=n_stack, size=buffer_size) else: buffer = None - nbatch = nenvs*nsteps + n_batch = n_envs * n_steps acer = Acer(runner, model, buffer, log_interval) - acer.tstart = time.time() - for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls + acer.t_start = time.time() + for acer.steps in range(0, total_timesteps, + n_batch): # n_batch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): - n = np.random.poisson(replay_ratio) - for _ in range(n): + samples_number = np.random.poisson(replay_ratio) + for _ in range(samples_number): acer.call(on_policy=False) # no simulation steps in this env.close() diff --git a/baselines/acer/buffer.py b/baselines/acer/buffer.py index 2dcfa1098a..26ba89a71e 100644 --- a/baselines/acer/buffer.py +++ b/baselines/acer/buffer.py @@ -1,14 +1,23 @@ import numpy as np + class Buffer(object): - # gets obs, actions, rewards, mu's, (states, masks), dones - def __init__(self, env, nsteps, nstack, size=50000): - self.nenv = env.num_envs - self.nsteps = nsteps - self.nh, self.nw, self.nc = env.observation_space.shape - self.nstack = nstack - self.nbatch = self.nenv * self.nsteps - self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames + def __init__(self, env, n_steps, n_stack, size=50000): + """ + A buffer for observations, actions, rewards, mu's, states, masks and dones values + + :param env: (Gym environment) The environment to learn from + :param n_steps: (int) The number of steps to run for each environment + :param n_stack: (int) The number of stacked frames + :param size: (int) The buffer size in number of steps + """ + self.n_env = env.num_envs + self.n_steps = n_steps + self.height, self.width, self.n_channels = env.observation_space.shape + self.n_stack = n_stack + self.n_batch = self.n_env * self.n_steps + # Each loc contains n_env * n_steps frames, thus total buffer 
is n_env * size frames + self.size = size // self.n_steps # Memory self.enc_obs = None @@ -23,37 +32,66 @@ def __init__(self, env, nsteps, nstack, size=50000): self.num_in_buffer = 0 def has_atleast(self, frames): - # Frames per env, so total (nenv * frames) Frames needed - # Each buffer loc has nenv * nsteps frames - return self.num_in_buffer >= (frames // self.nsteps) + """ + Check to see if the buffer has at least the asked number of frames + + :param frames: (int) The number of frames checked + :return: (bool) number of frames in buffer >= number asked + """ + # Frames per env, so total (n_env * frames) Frames needed + # Each buffer loc has n_env * n_steps frames + return self.num_in_buffer >= (frames // self.n_steps) def can_sample(self): + """ + Check if the buffer has at least one frame + + :return: (bool) if the buffer has at least one frame + """ return self.num_in_buffer > 0 - # Generate stacked frames def decode(self, enc_obs, dones): - # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] - # dones has shape [nenvs, nsteps, nh, nw, nc] - # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] - nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc - y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) - obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) - x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, - 0) # [nsteps + nstack, nenv, nh, nw, nc] - y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep - y[:3] = 1.0 - # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) - for i in range(nstack): - obs[-(i + 1), i:] = x + """ + Get the stacked frames of an observation + + :param enc_obs: ([float]) the encoded observation + :param dones: ([bool]) + :return: ([float]) the decoded observation + """ + # enc_obs has shape [n_envs, n_steps + n_stack, nh, nw, nc] + # dones has shape [n_envs, n_steps, nh, nw, nc] + # returns stacked obs of shape [n_env, (n_steps + 1), nh, nw, n_stack*nc] + n_stack, n_env, n_steps = self.n_stack, self.n_env, self.n_steps + height, width, n_channels = self.height, self.width, self.n_channels + y_var = np.empty([n_steps + n_stack - 1, n_env, 1, 1, 1], dtype=np.float32) + obs = np.zeros([n_stack, n_steps + n_stack, n_env, height, width, n_channels], dtype=np.uint8) + # [n_steps + n_stack, n_env, nh, nw, nc] + x_var = np.reshape(enc_obs, [n_env, n_steps + n_stack, height, width, n_channels]).swapaxes(1, 0) + y_var[3:] = np.reshape(1.0 - dones, [n_env, n_steps, 1, 1, 1]).swapaxes(1, 0) # keep + y_var[:3] = 1.0 + # y = np.reshape(1 - dones, [n_envs, n_steps, 1, 1, 1]) + for i in range(n_stack): + obs[-(i + 1), i:] = x_var # obs[:,i:,:,:,-(i+1),:] = x - x = x[:-1] * y - y = y[1:] - return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) + x_var = x_var[:-1] * y_var + y_var = y_var[1:] + return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), + [n_env, (n_steps + 1), height, width, n_stack * n_channels]) def put(self, enc_obs, actions, rewards, mus, dones, masks): - # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] + """ + Adds a frame to the buffer + + :param enc_obs: ([float]) the encoded observation + :param actions: ([float]) the actions + :param rewards: ([float]) the rewards + :param mus: ([float]) the policy probability for the actions + :param dones: ([bool]) + :param masks: ([bool]) + 
""" + # enc_obs [n_env, (n_steps + n_stack), nh, nw, nc] + # actions, rewards, dones [n_env, n_steps] + # mus [n_env, n_steps, n_act] if self.enc_obs is None: self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) @@ -73,31 +111,44 @@ def put(self, enc_obs, actions, rewards, mus, dones, masks): self.next_idx = (self.next_idx + 1) % self.size self.num_in_buffer = min(self.size, self.num_in_buffer + 1) - def take(self, x, idx, envx): - nenv = self.nenv - out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype) - for i in range(nenv): - out[i] = x[idx[i], envx[i]] + def take(self, arr, idx, envx): + """ + Reads a frame from a list and index for the asked environment ids + + :param arr: (numpy array) the array that is read + :param idx: ([int]) the idx that are read + :param envx: ([int]) the idx for the environments + :return: ([float]) the askes frames from the list + """ + n_env = self.n_env + out = np.empty([n_env] + list(arr.shape[2:]), dtype=arr.dtype) + for i in range(n_env): + out[i] = arr[idx[i], envx[i]] return out def get(self): + """ + randomly read a frame from the buffer + + :return: ([float], [float], [float], [float], [bool], [float]) + observations, actions, rewards, mus, dones, maskes + """ # returns - # obs [nenv, (nsteps + 1), nh, nw, nstack*nc] - # actions, rewards, dones [nenv, nsteps] - # mus [nenv, nsteps, nact] - nenv = self.nenv + # obs [n_env, (n_steps + 1), nh, nw, n_stack*nc] + # actions, rewards, dones [n_env, n_steps] + # mus [n_env, n_steps, n_act] + n_env = self.n_env assert self.can_sample() # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env. - idx = np.random.randint(0, self.num_in_buffer, nenv) - envx = np.arange(nenv) + idx = np.random.randint(0, self.num_in_buffer, n_env) + envx = np.arange(n_env) - take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0) - dones = take(self.dones) - enc_obs = take(self.enc_obs) + dones = self.take(self.dones, idx, envx) + enc_obs = self.take(self.enc_obs, idx, envx) obs = self.decode(enc_obs, dones) - actions = take(self.actions) - rewards = take(self.rewards) - mus = take(self.mus) - masks = take(self.masks) + actions = self.take(self.actions, idx, envx) + rewards = self.take(self.rewards, idx, envx) + mus = self.take(self.mus, idx, envx) + masks = self.take(self.masks, idx, envx) return obs, actions, rewards, mus, dones, masks diff --git a/baselines/acer/policies.py b/baselines/acer/policies.py index 627c40016c..2fb6cd9e0c 100644 --- a/baselines/acer/policies.py +++ b/baselines/acer/policies.py @@ -1,79 +1,131 @@ import numpy as np import tensorflow as tf -from baselines.ppo2.policies import nature_cnn -from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample +from baselines.a2c.policies import nature_cnn +from baselines.a2c.utils import linear, batch_to_seq, seq_to_batch, lstm, sample -class AcerCnnPolicy(object): +class AcerPolicy(object): + """ + Policy object for Acer + + :param sess: (TensorFlow session) The current TensorFlow session + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param n_env: (int) The number of environments + :param n_steps: (int) The number of steps to run + :param n_stack: (int) The number of frames stacked + :param reuse: (bool) If the policy is reusable or not + :param n_lstm: (int) The number of LSTM cells (for reccurent policies) + """ - def __init__(self, sess, ob_space, ac_space, 
nenv, nsteps, nstack, reuse=False): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False, n_lstm=256): + self.n_batch = n_env * n_steps + height, width, n_channels = ob_space.shape + self.ob_shape = (self.n_batch, height, width, n_channels * n_stack) + self.n_act = ac_space.n + self.obs_ph = tf.placeholder(tf.uint8, self.ob_shape) # obs + self.masks_ph = tf.placeholder(tf.float32, [self.n_batch]) # mask (done t-1) + self.states_ph = tf.placeholder(tf.float32, [n_env, n_lstm * 2]) # states + self.sess = sess + self.reuse = reuse + + def step(self, obs, state, mask, *args, **kwargs): + """ + Returns the policy for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float], [float], [float], [float]) action, mu, states + """ + raise NotImplementedError + + def out(self, obs, state, mask, *args, **kwargs): + """ + Returns the pi and q values for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float], [float]) pi, q + """ + raise NotImplementedError + + def act(self, obs, state, mask, *args, **kwargs): + """ + Returns the action for a single step + + :param obs: ([float] or [int]) The current observation of the environment + :param state: ([float]) The last states (used in reccurent policies) + :param mask: ([float]) The last masks (used in reccurent policies) + :param args: + :param kwargs: + :return: ([float]) The action + """ + raise NotImplementedError + + +class AcerCnnPolicy(AcerPolicy): + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False): + super(AcerCnnPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - pi_logits = fc(h, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h, 'q', nact) + extracted_features = nature_cnn(self.obs_ph) + pi_logits = linear(extracted_features, 'pi', self.n_act, init_scale=0.01) + policy = tf.nn.softmax(pi_logits) + q_value = linear(extracted_features, 'q', self.n_act) - a = sample(pi_logits) # could change this to use self.pi instead + self.action = sample(pi_logits) # could change this to use self.pi instead self.initial_state = [] # not stateful - self.X = X - self.pi = pi # actual policy params now - self.q = q - - def step(ob, *args, **kwargs): - # returns actions, mus, states - a0, pi0 = sess.run([a, pi], {X: ob}) - return a0, pi0, [] # dummy state - - def out(ob, *args, **kwargs): - pi0, q0 = sess.run([pi, q], {X: ob}) - return pi0, q0 - - def act(ob, *args, **kwargs): - return sess.run(a, {X: ob}) - - self.step = step - self.out = out - self.act = act - -class AcerLstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): - nbatch = nenv * nsteps - nh, nw, nc = ob_space.shape - ob_shape = (nbatch, nh, nw, nc * nstack) - nact = ac_space.n - X = tf.placeholder(tf.uint8, ob_shape) # obs - M = 
tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states + self.policy = policy # actual policy params now + self.q_value = q_value + + def step(self, obs, state, mask, *args, **kwargs): + # returns actions, mus, states + action, policy = self.sess.run([self.action, self.policy], {self.obs_ph: obs}) + return action, policy, [] # dummy state + + def out(self, obs, state, mask, *args, **kwargs): + policy, q_value = self.sess.run([self.policy, self.q_value], {self.obs_ph: obs}) + return policy, q_value + + def act(self, obs, state, mask, *args, **kwargs): + return self.sess.run(self.action, {self.obs_ph: obs}) + + +class AcerLstmPolicy(AcerPolicy): + def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse=False, n_lstm=256): + super(AcerLstmPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_stack, reuse, n_lstm) with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) + extracted_features = nature_cnn(self.obs_ph) # lstm - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - - pi_logits = fc(h5, 'pi', nact, init_scale=0.01) - pi = tf.nn.softmax(pi_logits) - q = fc(h5, 'q', nact) - - a = sample(pi_logits) # could change this to use self.pi instead - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - self.X = X - self.M = M - self.S = S - self.pi = pi # actual policy params now - self.q = q - - def step(ob, state, mask, *args, **kwargs): - # returns actions, mus, states - a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) - return a0, pi0, s - - self.step = step + input_seq = batch_to_seq(extracted_features, n_env, n_steps) + masks = batch_to_seq(self.masks_ph, n_env, n_steps) + rnn_output, self.snew = lstm(input_seq, masks, self.states_ph, 'lstm1', n_hidden=n_lstm) + rnn_output = seq_to_batch(rnn_output) + + pi_logits = linear(rnn_output, 'pi', self.n_act, init_scale=0.01) + policy = tf.nn.softmax(pi_logits) + q_value = linear(rnn_output, 'q', self.n_act) + + self.action = sample(pi_logits) # could change this to use self.pi instead + self.initial_state = np.zeros((n_env, n_lstm * 2), dtype=np.float32) + self.policy = policy # actual policy params now + self.q_value = q_value + + def step(self, obs, state, mask, *args, **kwargs): + # returns actions, mus, states + action, policy, states = self.sess.run([self.action, self.policy, self.snew], + {self.obs_ph: obs, self.states_ph: state, self.masks_ph: mask}) + return action, policy, states + + def out(self, obs, state, mask, *args, **kwargs): + policy, q_value = self.sess.run([self.policy, self.q_value], {self.obs_ph: obs}) + return policy, q_value + + def act(self, obs, state, mask, *args, **kwargs): + return self.sess.run(self.action, {self.obs_ph: obs}) diff --git a/baselines/acer/run_atari.py b/baselines/acer/run_atari.py index cce979eddd..2c1354e183 100644 --- a/baselines/acer/run_atari.py +++ b/baselines/acer/run_atari.py @@ -4,7 +4,19 @@ from baselines.acer.policies import AcerCnnPolicy, AcerLstmPolicy from baselines.common.cmd_util import make_atari_env, atari_arg_parser -def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): + +def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): + """ + train an ACER model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param policy: 
(A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + :param num_cpu: (int) The number of cpu to train on + """ env = make_atari_env(env_id, num_cpu, seed) if policy == 'cnn': policy_fn = AcerCnnPolicy @@ -13,18 +25,24 @@ def train(env_id, num_timesteps, seed, policy, lrschedule, num_cpu): else: print("Policy {} not implemented".format(policy)) return - learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lrschedule=lrschedule) + learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), lr_schedule=lr_schedule, buffer_size=5000) env.close() + def main(): + """ + Runs the test + """ parser = atari_arg_parser() - parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm'], default='cnn') - parser.add_argument('--lrschedule', help='Learning rate schedule', choices=['constant', 'linear'], default='constant') - parser.add_argument('--logdir', help ='Directory for logging') + parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') + parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', + help='Learning rate schedule') + parser.add_argument('--logdir', help='Directory for logging') args = parser.parse_args() logger.configure(args.logdir) train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy, lrschedule=args.lrschedule, num_cpu=16) + policy=args.policy, lr_schedule=args.lr_schedule, num_cpu=16) + if __name__ == '__main__': main() diff --git a/baselines/acktr/acktr_cont.py b/baselines/acktr/acktr_cont.py index 45f2fa29fa..9b75398579 100644 --- a/baselines/acktr/acktr_cont.py +++ b/baselines/acktr/acktr_cont.py @@ -1,93 +1,119 @@ +""" +Continuous acktr +""" + import numpy as np import tensorflow as tf + from baselines import logger import baselines.common as common -from baselines.common import tf_util as U +from baselines.common import tf_util from baselines.acktr import kfac from baselines.common.filters import ZFilter -def pathlength(path): - return path["reward"].shape[0]# Loss function that we'll differentiate to get the policy gradient def rollout(env, policy, max_pathlength, animate=False, obfilter=None): """ Simulate the env and policy for max_pathlength steps + + :param env: (Gym environment) The environment to learn from + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param max_pathlength: (int) The maximum length for an episode + :param animate: (bool) if render env + :param obfilter: (Filter) the observation filter + :return: (dict) observation, terminated, reward, action, action_dist, logp """ - ob = env.reset() - prev_ob = np.float32(np.zeros(ob.shape)) - if obfilter: ob = obfilter(ob) + observation = env.reset() + prev_ob = np.float32(np.zeros(observation.shape)) + if obfilter: + observation = obfilter(observation) terminated = False - obs = [] - acs = [] - ac_dists = [] + observations = [] + actions = [] + action_dists = [] logps = [] rewards = [] for _ in range(max_pathlength): if animate: env.render() - state = np.concatenate([ob, prev_ob], -1) - obs.append(state) - ac, ac_dist, logp = policy.act(state) - acs.append(ac) - ac_dists.append(ac_dist) + state = np.concatenate([observation, prev_ob], -1) + observations.append(state) + action, ac_dist, logp = policy.act(state) + actions.append(action) + action_dists.append(ac_dist) logps.append(logp) - prev_ob = np.copy(ob) - scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (env.action_space.high - env.action_space.low) + prev_ob = np.copy(observation) + scaled_ac = env.action_space.low + (action + 1.) * 0.5 * (env.action_space.high - env.action_space.low) scaled_ac = np.clip(scaled_ac, env.action_space.low, env.action_space.high) - ob, rew, done, _ = env.step(scaled_ac) - if obfilter: ob = obfilter(ob) + observation, rew, done, _ = env.step(scaled_ac) + if obfilter: + observation = obfilter(observation) rewards.append(rew) if done: terminated = True break - return {"observation" : np.array(obs), "terminated" : terminated, - "reward" : np.array(rewards), "action" : np.array(acs), - "action_dist": np.array(ac_dists), "logp" : np.array(logps)} + return {"observation": np.array(observations), "terminated": terminated, + "reward": np.array(rewards), "action": np.array(actions), + "action_dist": np.array(action_dists), "logp": np.array(logps)} -def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, - animate=False, callback=None, desired_kl=0.002): +def learn(env, policy, value_fn, gamma, lam, timesteps_per_batch, num_timesteps, + animate=False, callback=None, desired_kl=0.002): + """ + Traines an ACKTR model. + + :param env: (Gym environment) The environment to learn from + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) + :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...) 
+ :param gamma: (float) The discount value + :param lam: (float) the tradeoff between exploration and exploitation + :param timesteps_per_batch: (int) the number of timesteps for each batch + :param num_timesteps: (int) the total number of timesteps to run + :param animate: (bool) if render env + :param callback: (function) called every step, used for logging and saving + :param desired_kl: (float) the Kullback leibler weight for the loss + """ obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info - optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ - epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, - weight_decay_dict=policy.wd_dict, max_grad_norm=None) + optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize * (1 - 0.9), momentum=0.9, kfac_update=2, + epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, + weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) - do_update = U.function(inputs, update_op) - U.initialize() + do_update = tf_util.function(inputs, update_op) + tf_util.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() - for qr in [q_runner, vf.q_runner]: - assert (qr != None) - enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) + for queue_runner in [q_runner, value_fn.q_runner]: + assert queue_runner is not None + enqueue_threads.extend(queue_runner.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break - logger.log("********** Iteration %i ************"%i) + logger.log("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: - path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) + path = rollout(env, policy, max_pathlength, animate=(len(paths) == 0 and (i % 10 == 0) and animate), + obfilter=obfilter) paths.append(path) - n = pathlength(path) - timesteps_this_batch += n - timesteps_so_far += n + timesteps_this_batch += path["reward"].shape[0] + timesteps_so_far += path["reward"].shape[0] if timesteps_this_batch > timesteps_per_batch: break @@ -98,13 +124,13 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) - vpred_t = vf.predict(path) + vpred_t = value_fn.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) - delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] + delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function - vf.fit(paths, vtargs) + value_fn.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) @@ -119,20 +145,20 @@ def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize - kl = policy.compute_kl(ob_no, oldac_dist) - if kl > desired_kl * 2: + kl_loss = 
policy.compute_kl(ob_no, oldac_dist) + if kl_loss > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() - elif kl < desired_kl / 2: + elif kl_loss < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) - logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) - logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) - logger.record_tabular("KL", kl) + logger.record_tabular("EpRewSEM", np.std([path["reward"].sum() / np.sqrt(len(paths)) for path in paths])) + logger.record_tabular("EpLenMean", np.mean([path["reward"].shape[0] for path in paths])) + logger.record_tabular("KL", kl_loss) if callback: callback() logger.dump_tabular() diff --git a/baselines/acktr/acktr_disc.py b/baselines/acktr/acktr_disc.py index a8b77b6fd5..9bcd7b34d0 100644 --- a/baselines/acktr/acktr_disc.py +++ b/baselines/acktr/acktr_disc.py @@ -1,80 +1,99 @@ -import os.path as osp +""" +Discrete acktr +""" + +import os import time import joblib -import numpy as np + import tensorflow as tf -from baselines import logger +from baselines import logger from baselines.common import set_global_seeds, explained_variance - from baselines.a2c.a2c import Runner -from baselines.a2c.utils import discount_with_dones -from baselines.a2c.utils import Scheduler, find_trainable_variables -from baselines.a2c.utils import cat_entropy, mse +from baselines.a2c.utils import Scheduler, find_trainable_variables, calc_entropy, mse from baselines.acktr import kfac class Model(object): + def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20, + ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, + kfac_clip=0.001, lr_schedule='linear'): + """ + The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 + + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Space) The observation space + :param ac_space: (Gym Space) The action space + :param n_envs: (int) The number of environments + :param total_timesteps: (int) The total number of timesteps for training the model + :param nprocs: (int) The number of threads for TensorFlow operations + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) The weight for the entropic loss + :param vf_coef: (float) The weight for the loss on the value function + :param vf_fisher_coef: (float) The weight for the fisher loss on the value function + :param learning_rate: (float) The initial learning rate for the RMS prop optimizer + :param max_grad_norm: (float) The clipping value for the maximum gradient + :param kfac_clip: (float) gradient clipping for Kullback leiber + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ - def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) - nact = ac_space.n - nbatch = nenvs * nsteps - A = tf.placeholder(tf.int32, [nbatch]) - ADV = tf.placeholder(tf.float32, [nbatch]) - R = tf.placeholder(tf.float32, [nbatch]) - PG_LR = tf.placeholder(tf.float32, []) - VF_LR = tf.placeholder(tf.float32, []) - - self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) - self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) - - logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) - self.logits = logits = train_model.pi - - ##training loss - pg_loss = tf.reduce_mean(ADV*logpac) - entropy = tf.reduce_mean(cat_entropy(train_model.pi)) + n_batch = n_envs * n_steps + action_ph = tf.placeholder(tf.int32, [n_batch]) + advs_ph = tf.placeholder(tf.float32, [n_batch]) + rewards_ph = tf.placeholder(tf.float32, [n_batch]) + pg_lr_ph = tf.placeholder(tf.float32, []) + + self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) + self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) + + logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=action_ph) + self.logits = train_model.policy + + # training loss + pg_loss = tf.reduce_mean(advs_ph * logpac) + entropy = tf.reduce_mean(calc_entropy(train_model.policy)) pg_loss = pg_loss - ent_coef * entropy - vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) + vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + vf_coef * vf_loss - - ##Fisher loss construction + # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) - sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) - self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) - self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss + sample_net = train_model.value_fn + tf.random_normal(tf.shape(train_model.value_fn)) + self.vf_fisher = vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean( + 
tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) + self.joint_fisher = pg_fisher_loss + vf_fisher_loss - self.params=params = find_trainable_variables("model") + self.params = params = find_trainable_variables("model") - self.grads_check = grads = tf.gradients(train_loss,params) + self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): - self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ - momentum=0.9, kfac_update=1, epsilon=0.01,\ - stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) + self.optim = optim = kfac.KfacOptimizer(learning_rate=pg_lr_ph, clip_kl=kfac_clip, + momentum=0.9, kfac_update=1, epsilon=0.01, + stats_decay=0.99, async=1, cold_iter=10, + max_grad_norm=max_grad_norm) - update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) - train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) + optim.compute_and_apply_stats(self.joint_fisher, var_list=params) + train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner - self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) + self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values - for step in range(len(obs)): - cur_lr = self.lr.value() + for _ in range(len(obs)): + cur_lr = self.learning_rate.value() - td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} + td_map = {train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, rewards_ph: rewards, pg_lr_ph: cur_lr} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], @@ -83,18 +102,16 @@ def train(obs, states, rewards, masks, actions, values): return policy_loss, value_loss, policy_entropy def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) + session_params = sess.run(params) + joblib.dump(session_params, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) - - self.train = train self.save = save self.load = load @@ -105,49 +122,70 @@ def load(load_path): self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess) -def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, - ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, - kfac_clip=0.001, save_interval=None, lrschedule='linear'): - tf.reset_default_graph() + +def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, n_steps=20, + ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, + kfac_clip=0.001, save_interval=None, lr_schedule='linear'): + """ + Traines an ACKTR model. + + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param env: (Gym environment) The environment to learn from + :param seed: (int) The initial seed for training + :param total_timesteps: (int) The total number of samples + :param gamma: (float) Discount factor + :param log_interval: (int) The number of timesteps before logging. + :param nprocs: (int) The number of threads for TensorFlow operations + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) The weight for the entropic loss + :param vf_coef: (float) The weight for the loss on the value function + :param vf_fisher_coef: (float) The weight for the fisher loss on the value function + :param learning_rate: (float) The learning rate + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param kfac_clip: (float) gradient clipping for Kullback leiber + :param save_interval: (int) The number of timesteps before saving. + :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', + 'double_linear_con', 'middle_drop' or 'double_middle_drop') + """ set_global_seeds(seed) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps - =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= - vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, - lrschedule=lrschedule) + make_model = lambda: Model(policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=nprocs, n_steps=n_steps, + ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, + learning_rate=learning_rate, + max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lr_schedule=lr_schedule) if save_interval and logger.get_dir(): import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) + with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as file_handler: + file_handler.write(cloudpickle.dumps(make_model)) model = make_model() - runner = Runner(env, model, nsteps=nsteps, gamma=gamma) - nbatch = nenvs*nsteps - tstart = time.time() + runner = Runner(env, model, n_steps=n_steps, gamma=gamma) + n_batch = n_envs * n_steps + t_start = time.time() coord = tf.train.Coordinator() enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) - for update in range(1, total_timesteps//nbatch+1): + for update in range(1, total_timesteps // n_batch + 1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) model.old_obs = obs - nseconds = time.time()-tstart - fps = int((update*nbatch)/nseconds) + n_seconds = time.time() - t_start + fps = int((update * n_batch) / n_seconds) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, rewards) + explained_var = explained_variance(values, rewards) logger.record_tabular("nupdates", update) - logger.record_tabular("total_timesteps", update*nbatch) + logger.record_tabular("total_timesteps", update * n_batch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) - logger.record_tabular("explained_variance", float(ev)) + logger.record_tabular("explained_variance", float(explained_var)) logger.dump_tabular() if save_interval and 
(update % save_interval == 0 or update == 1) and logger.get_dir(): - savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) + savepath = os.path.join(logger.get_dir(), 'checkpoint%.5i' % update) print('Saving to', savepath) model.save(savepath) coord.request_stop() diff --git a/baselines/acktr/kfac.py b/baselines/acktr/kfac.py index b4208199dc..3fe2a317bd 100644 --- a/baselines/acktr/kfac.py +++ b/baselines/acktr/kfac.py @@ -1,16 +1,44 @@ -import tensorflow as tf -import numpy as np import re -from baselines.acktr.kfac_utils import * from functools import reduce +import tensorflow as tf +import numpy as np + +from baselines.acktr.kfac_utils import detect_min_val, factor_reshape, gmatmul + KFAC_OPS = ['MatMul', 'Conv2D', 'BiasAdd'] KFAC_DEBUG = False -class KfacOptimizer(): - - def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approxT2=False, use_float64=False, weight_decay_dict={},max_grad_norm=0.5): +class KfacOptimizer: + def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2, stats_accum_iter=60, + full_stats_init=False, cold_iter=100, cold_lr=None, async=False, async_stats=False, epsilon=1e-2, + stats_decay=0.95, blockdiag_bias=False, channel_fac=False, factored_damping=False, approx_t2=False, + use_float64=False, weight_decay_dict=None, max_grad_norm=0.5): + """ + Kfac Optimizer for ACKTR models + link: https://arxiv.org/pdf/1708.05144.pdf + + :param learning_rate: (float) The learning rate + :param momentum: (float) The momentum value for the TensorFlow momentum optimizer + :param clip_kl: (float) gradient clipping for Kullback leiber + :param kfac_update: (int) update kfac after kfac_update steps + :param stats_accum_iter: (int) how may steps to accumulate stats + :param full_stats_init: (bool) whether or not to fully initalize stats + :param cold_iter: (int) Cold start learning rate for how many steps + :param cold_lr: (float) Cold start learning rate + :param async: (bool) Use async eigen decomposition + :param async_stats: (bool) Asynchronous stats update + :param epsilon: (float) epsilon value for small numbers + :param stats_decay: (float) the stats decay rate + :param blockdiag_bias: (bool) + :param channel_fac: (bool) factorization along the channels + :param factored_damping: (bool) use factored damping + :param approx_t2: (bool) approximate T2 act and grad fisher + :param use_float64: (bool) use 64-bit float + :param weight_decay_dict: (dict) custom weight decay coeff for a given gradient + :param max_grad_norm: (float) The maximum value for the gradient clipping + """ self.max_grad_norm = max_grad_norm self._lr = learning_rate self._momentum = momentum @@ -22,16 +50,18 @@ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2 self._epsilon = epsilon self._stats_decay = stats_decay self._blockdiag_bias = blockdiag_bias - self._approxT2 = approxT2 + self._approx_t2 = approx_t2 self._use_float64 = use_float64 self._factored_damping = factored_damping self._cold_iter = cold_iter - if cold_lr == None: + if cold_lr is None: # good heuristics - self._cold_lr = self._lr# * 3. + self._cold_lr = self._lr # * 3. 
else: self._cold_lr = cold_lr self._stats_accum_iter = stats_accum_iter + if weight_decay_dict is None: + weight_decay_dict = {} self._weight_decay_dict = weight_decay_dict self._diag_init_coeff = 0. self._full_stats_init = full_stats_init @@ -46,241 +76,252 @@ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2 0, name='KFAC/factor_step', trainable=False) self.stats_step = tf.Variable( 0, name='KFAC/stats_step', trainable=False) - self.vFv = tf.Variable(0., name='KFAC/vFv', trainable=False) + self.v_f_v = tf.Variable(0., name='KFAC/vFv', trainable=False) self.factors = {} self.param_vars = [] self.stats = {} self.stats_eigen = {} - def getFactors(self, g, varlist): - graph = tf.get_default_graph() - factorTensors = {} - fpropTensors = [] - bpropTensors = [] - opTypes = [] - fops = [] + def get_factors(self, gradients, varlist): + """ + get factors to update + + :param gradients: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The factors to update + """ + default_graph = tf.get_default_graph() + factor_tensors = {} + fprop_tensors = [] + bprop_tensors = [] + op_types = [] - def searchFactors(gradient, graph): + def _search_factors(gradient, graph): # hard coded search stratergy - bpropOp = gradient.op - bpropOp_name = bpropOp.name + bprop_op = gradient.op + bprop_op_name = bprop_op.name - bTensors = [] - fTensors = [] + b_tensors = [] + f_tensors = [] # combining additive gradient, assume they are the same op type and # indepedent - if 'AddN' in bpropOp_name: + if 'AddN' in bprop_op_name: factors = [] - for g in gradient.op.inputs: - factors.append(searchFactors(g, graph)) - op_names = [item['opName'] for item in factors] - # TO-DO: need to check all the attribute of the ops as well - print (gradient.name) - print (op_names) - print (len(np.unique(op_names))) - assert len(np.unique(op_names)) == 1, gradient.name + \ - ' is shared among different computation OPs' - - bTensors = reduce(lambda x, y: x + y, - [item['bpropFactors'] for item in factors]) + for grad in gradient.op.inputs: + factors.append(_search_factors(grad, graph)) + op_names = [_item['opName'] for _item in factors] + # TODO: need to check all the attribute of the ops as well + print(gradient.name) + print(op_names) + print(len(np.unique(op_names))) + assert len(np.unique(op_names)) == 1, \ + 'Error: {} is shared among different computation OPs'.format(gradient.name) + + b_tensors = reduce(lambda x, y: x + y, + [_item['bpropFactors'] for _item in factors]) if len(factors[0]['fpropFactors']) > 0: - fTensors = reduce( - lambda x, y: x + y, [item['fpropFactors'] for item in factors]) - fpropOp_name = op_names[0] - fpropOp = factors[0]['op'] + f_tensors = reduce( + lambda x, y: x + y, [_item['fpropFactors'] for _item in factors]) + fprop_op_name = op_names[0] + fprop_op = factors[0]['op'] else: - fpropOp_name = re.search( - 'gradientsSampled(_[0-9]+|)/(.+?)_grad', bpropOp_name).group(2) - fpropOp = graph.get_operation_by_name(fpropOp_name) - if fpropOp.op_def.name in KFAC_OPS: + fprop_op_name = re.search('gradientsSampled(_[0-9]+|)/(.+?)_grad', bprop_op_name).group(2) + fprop_op = graph.get_operation_by_name(fprop_op_name) + if fprop_op.op_def.name in KFAC_OPS: # Known OPs - ### - bTensor = [ - i for i in bpropOp.inputs if 'gradientsSampled' in i.name][-1] - bTensorShape = fpropOp.outputs[0].get_shape() - if bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - ### - if 
fpropOp.op_def.name == 'BiasAdd': - fTensors = [] + b_tensor = [_i for _i in bprop_op.inputs if 'gradientsSampled' in _i.name][-1] + b_tensor_shape = fprop_op.outputs[0].get_shape() + if b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + + if fprop_op.op_def.name == 'BiasAdd': + f_tensors = [] else: - fTensors.append( - [i for i in fpropOp.inputs if param.op.name not in i.name][0]) - fpropOp_name = fpropOp.op_def.name + f_tensors.append([_i for _i in fprop_op.inputs if param.op.name not in _i.name][0]) + fprop_op_name = fprop_op.op_def.name else: # unknown OPs, block approximation used - bInputsList = [i for i in bpropOp.inputs[ - 0].op.inputs if 'gradientsSampled' in i.name if 'Shape' not in i.name] - if len(bInputsList) > 0: - bTensor = bInputsList[0] - bTensorShape = fpropOp.outputs[0].get_shape() - if len(bTensor.get_shape()) > 0 and bTensor.get_shape()[0].value == None: - bTensor.set_shape(bTensorShape) - bTensors.append(bTensor) - fpropOp_name = opTypes.append('UNK-' + fpropOp.op_def.name) - - return {'opName': fpropOp_name, 'op': fpropOp, 'fpropFactors': fTensors, 'bpropFactors': bTensors} - - for t, param in zip(g, varlist): + b_inputs_list = [_i for _i in bprop_op.inputs[0].op.inputs + if 'gradientsSampled' in _i.name if 'Shape' not in _i.name] + if len(b_inputs_list) > 0: + b_tensor = b_inputs_list[0] + # only if tensor shape is defined, usually this will prevent tensor like Sum:0 to be used. + if b_tensor.get_shape(): + b_tensor_shape = fprop_op.outputs[0].get_shape() + if len(b_tensor.get_shape()) > 0 and b_tensor.get_shape()[0].value is None: + b_tensor.set_shape(b_tensor_shape) + b_tensors.append(b_tensor) + fprop_op_name = op_types.append('UNK-' + fprop_op.op_def.name) + + return {'opName': fprop_op_name, 'op': fprop_op, 'fpropFactors': f_tensors, 'bpropFactors': b_tensors} + + for _grad, param in zip(gradients, varlist): if KFAC_DEBUG: - print(('get factor for '+param.name)) - factors = searchFactors(t, graph) - factorTensors[param] = factors + print(('get factor for ' + param.name)) + found_factors = _search_factors(_grad, default_graph) + factor_tensors[param] = found_factors - ######## # check associated weights and bias for homogeneous coordinate representation # and check redundent factors - # TO-DO: there may be a bug to detect associate bias and weights for - # forking layer, e.g. in inception models. + # TODO: there may be a bug to detect associate bias and weights for forking layer, e.g. in inception models. 
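
# Reviewer sketch, not part of the patch: what the fprop/bprop factors collected
# above feed into. For a dense layer, K-FAC keeps a covariance of the forward
# activations (the "fprop" factor) and a covariance of the backpropagated
# pre-activation gradients (the "bprop" factor), and approximates that layer's
# Fisher block by their Kronecker product. All names and shapes below are
# illustrative only.
import numpy as np

batch_size, n_in, n_out = 32, 8, 4
acts = np.random.randn(batch_size, n_in)             # layer inputs
pre_act_grads = np.random.randn(batch_size, n_out)   # dL/d(pre-activation)

# Homogeneous coordinate: append a constant 1 so the bias shares the same
# factor as the weights (this is what assnBias/assnWeights track above).
acts_h = np.concatenate([acts, np.ones((batch_size, 1))], axis=1)

factor_a = acts_h.T @ acts_h / batch_size                 # A ~ E[a a^T]
factor_g = pre_act_grads.T @ pre_act_grads / batch_size   # G ~ E[g g^T]
# The Fisher block is approximated as kron(A, G); it is never formed
# explicitly and is inverted factor by factor instead.
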
for param in varlist: - factorTensors[param]['assnWeights'] = None - factorTensors[param]['assnBias'] = None + factor_tensors[param]['assnWeights'] = None + factor_tensors[param]['assnBias'] = None for param in varlist: - if factorTensors[param]['opName'] == 'BiasAdd': - factorTensors[param]['assnWeights'] = None + if factor_tensors[param]['opName'] == 'BiasAdd': + factor_tensors[param]['assnWeights'] = None for item in varlist: - if len(factorTensors[item]['bpropFactors']) > 0: - if (set(factorTensors[item]['bpropFactors']) == set(factorTensors[param]['bpropFactors'])) and (len(factorTensors[item]['fpropFactors']) > 0): - factorTensors[param]['assnWeights'] = item - factorTensors[item]['assnBias'] = param - factorTensors[param]['bpropFactors'] = factorTensors[ + if len(factor_tensors[item]['bpropFactors']) > 0: + if (set(factor_tensors[item]['bpropFactors']) == set(factor_tensors[param]['bpropFactors'])) \ + and (len(factor_tensors[item]['fpropFactors']) > 0): + factor_tensors[param]['assnWeights'] = item + factor_tensors[item]['assnBias'] = param + factor_tensors[param]['bpropFactors'] = factor_tensors[ item]['bpropFactors'] - ######## - - ######## - # concatenate the additive gradients along the batch dimension, i.e. - # assuming independence structure + # concatenate the additive gradients along the batch dimension, i.e. assuming independence structure for key in ['fpropFactors', 'bpropFactors']: for i, param in enumerate(varlist): - if len(factorTensors[param][key]) > 0: - if (key + '_concat') not in factorTensors[param]: - name_scope = factorTensors[param][key][0].name.split(':')[ + if len(factor_tensors[param][key]) > 0: + if (key + '_concat') not in factor_tensors[param]: + name_scope = factor_tensors[param][key][0].name.split(':')[ 0] with tf.name_scope(name_scope): - factorTensors[param][ - key + '_concat'] = tf.concat(factorTensors[param][key], 0) + factor_tensors[param][ + key + '_concat'] = tf.concat(factor_tensors[param][key], 0) else: - factorTensors[param][key + '_concat'] = None - for j, param2 in enumerate(varlist[(i + 1):]): - if (len(factorTensors[param][key]) > 0) and (set(factorTensors[param2][key]) == set(factorTensors[param][key])): - factorTensors[param2][key] = factorTensors[param][key] - factorTensors[param2][ - key + '_concat'] = factorTensors[param][key + '_concat'] - ######## + factor_tensors[param][key + '_concat'] = None + for _, param2 in enumerate(varlist[(i + 1):]): + if (len(factor_tensors[param][key]) > 0) and ( + set(factor_tensors[param2][key]) == set(factor_tensors[param][key])): + factor_tensors[param2][key] = factor_tensors[param][key] + factor_tensors[param2][ + key + '_concat'] = factor_tensors[param][key + '_concat'] if KFAC_DEBUG: - for items in zip(varlist, fpropTensors, bpropTensors, opTypes): - print((items[0].name, factorTensors[item])) - self.factors = factorTensors - return factorTensors + for items in zip(varlist, fprop_tensors, bprop_tensors, op_types): + print((items[0].name, factor_tensors[item])) + self.factors = factor_tensors + return factor_tensors + + def get_stats(self, factors, varlist): + """ + return the stats values from the factors to update and the parameters - def getStats(self, factors, varlist): + :param factors: ([TensorFlow Tensor]) The factors to update + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) The stats values + """ if len(self.stats) == 0: # initialize stats variables on CPU because eigen decomp is # computed on CPU with tf.device('/cpu'): - tmpStatsCache = {} + 
tmp_stats_cache = {} # search for tensor factors and # use block diag approx for the bias units for var in varlist: - fpropFactor = factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + if op_type == 'Conv2D': + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: # factorization along the channels do not support # homogeneous coordinate - var_assnBias = factors[var]['assnBias'] - if var_assnBias: + var_assn_bias = factors[var]['assnBias'] + if var_assn_bias: factors[var]['assnBias'] = None - factors[var_assnBias]['assnWeights'] = None - ## + factors[var_assn_bias]['assnWeights'] = None for var in varlist: - fpropFactor = factors[var]['fpropFactors_concat'] - bpropFactor = factors[var]['bpropFactors_concat'] - opType = factors[var]['opName'] - self.stats[var] = {'opName': opType, + fprop_factor = factors[var]['fpropFactors_concat'] + bprop_factor = factors[var]['bpropFactors_concat'] + op_type = factors[var]['opName'] + self.stats[var] = {'opName': op_type, 'fprop_concat_stats': [], 'bprop_concat_stats': [], 'assnWeights': factors[var]['assnWeights'], 'assnBias': factors[var]['assnBias'], } - if fpropFactor is not None: - if fpropFactor not in tmpStatsCache: - if opType == 'Conv2D': - Kh = var.get_shape()[0] - Kw = var.get_shape()[1] - C = fpropFactor.get_shape()[-1] - - Oh = bpropFactor.get_shape()[1] - Ow = bpropFactor.get_shape()[2] - if Oh == 1 and Ow == 1 and self._channel_fac: + if fprop_factor is not None: + if fprop_factor not in tmp_stats_cache: + if op_type == 'Conv2D': + kernel_height = var.get_shape()[0] + kernel_width = var.get_shape()[1] + n_channels = fprop_factor.get_shape()[-1] + + operator_height = bprop_factor.get_shape()[1] + operator_width = bprop_factor.get_shape()[2] + if operator_height == 1 and operator_width == 1 and self._channel_fac: # factorization along the channels # assume independence between input channels and spatial # 2K-1 x 2K-1 covariance matrix and C x C covariance matrix # factorization along the channels do not # support homogeneous coordinate, assnBias # is always None - fpropFactor2_size = Kh * Kw - slot_fpropFactor_stats2 = tf.Variable(tf.diag(tf.ones( - [fpropFactor2_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) + fprop_factor2_size = kernel_height * kernel_width + slot_fprop_factor_stats2 = tf.Variable(tf.diag(tf.ones( + [fprop_factor2_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, + trainable=False) self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats2) + slot_fprop_factor_stats2) - fpropFactor_size = C + fprop_factor_size = n_channels else: # 2K-1 x 2K-1 x C x C covariance matrix # assume BHWC - fpropFactor_size = Kh * Kw * C + fprop_factor_size = kernel_height * kernel_width * n_channels else: # D x D covariance matrix - fpropFactor_size = fpropFactor.get_shape()[-1] + fprop_factor_size = fprop_factor.get_shape()[-1] # use homogeneous coordinate if not self._blockdiag_bias and self.stats[var]['assnBias']: - fpropFactor_size += 1 + fprop_factor_size += 1 - 
slot_fpropFactor_stats = tf.Variable(tf.diag(tf.ones( - [fpropFactor_size])) * self._diag_init_coeff, name='KFAC_STATS/' + fpropFactor.op.name, trainable=False) + slot_fprop_factor_stats = tf.Variable( + tf.diag(tf.ones([fprop_factor_size])) * self._diag_init_coeff, + name='KFAC_STATS/' + fprop_factor.op.name, trainable=False) self.stats[var]['fprop_concat_stats'].append( - slot_fpropFactor_stats) - if opType != 'Conv2D': - tmpStatsCache[fpropFactor] = self.stats[ + slot_fprop_factor_stats) + if op_type != 'Conv2D': + tmp_stats_cache[fprop_factor] = self.stats[ var]['fprop_concat_stats'] else: self.stats[var][ - 'fprop_concat_stats'] = tmpStatsCache[fpropFactor] + 'fprop_concat_stats'] = tmp_stats_cache[fprop_factor] - if bpropFactor is not None: + if bprop_factor is not None: # no need to collect backward stats for bias vectors if # using homogeneous coordinates - if not((not self._blockdiag_bias) and self.stats[var]['assnWeights']): - if bpropFactor not in tmpStatsCache: - slot_bpropFactor_stats = tf.Variable(tf.diag(tf.ones([bpropFactor.get_shape( - )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bpropFactor.op.name, trainable=False) + if not ((not self._blockdiag_bias) and self.stats[var]['assnWeights']): + if bprop_factor not in tmp_stats_cache: + slot_bprop_factor_stats = tf.Variable(tf.diag(tf.ones([bprop_factor.get_shape( + )[-1]])) * self._diag_init_coeff, name='KFAC_STATS/' + bprop_factor.op.name, + trainable=False) self.stats[var]['bprop_concat_stats'].append( - slot_bpropFactor_stats) - tmpStatsCache[bpropFactor] = self.stats[ + slot_bprop_factor_stats) + tmp_stats_cache[bprop_factor] = self.stats[ var]['bprop_concat_stats'] else: self.stats[var][ - 'bprop_concat_stats'] = tmpStatsCache[bpropFactor] + 'bprop_concat_stats'] = tmp_stats_cache[bprop_factor] return self.stats def compute_and_apply_stats(self, loss_sampled, var_list=None): + """ + compute and apply stats + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: (function) apply stats + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() @@ -289,206 +330,211 @@ def compute_and_apply_stats(self, loss_sampled, var_list=None): return self.apply_stats(stats) def compute_stats(self, loss_sampled, var_list=None): + """ + compute the stats values + + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) stats updates + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() - gs = tf.gradients(loss_sampled, varlist, name='gradientsSampled') - self.gs = gs - factors = self.getFactors(gs, varlist) - stats = self.getStats(factors, varlist) + gradient_sampled = tf.gradients(loss_sampled, varlist, name='gradientsSampled') + self.gradient_sampled = gradient_sampled + factors = self.get_factors(gradient_sampled, varlist) + stats = self.get_stats(factors, varlist) - updateOps = [] - statsUpdates = {} - statsUpdates_cache = {} + update_ops = [] + stats_updates = {} + stats_updates_cache = {} for var in varlist: - opType = factors[var]['opName'] + op_type = factors[var]['opName'] fops = factors[var]['op'] - fpropFactor = factors[var]['fpropFactors_concat'] - fpropStats_vars = stats[var]['fprop_concat_stats'] - bpropFactor = factors[var]['bpropFactors_concat'] - bpropStats_vars = stats[var]['bprop_concat_stats'] - SVD_factors = {} - for stats_var in fpropStats_vars: + fprop_factor = 
factors[var]['fpropFactors_concat'] + fprop_stats_vars = stats[var]['fprop_concat_stats'] + bprop_factor = factors[var]['bpropFactors_concat'] + bprop_stats_vars = stats[var]['bprop_concat_stats'] + svd_factors = {} + for stats_var in fprop_stats_vars: stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_fpropFactor = fpropFactor - B = (tf.shape(fpropFactor)[0]) # batch size - if opType == 'Conv2D': + if stats_var not in stats_updates_cache: + batch_size = (tf.shape(fprop_factor)[0]) # batch size + if op_type == 'Conv2D': strides = fops.get_attr("strides") padding = fops.get_attr("padding") convkernel_size = var.get_shape()[0:3] - KH = int(convkernel_size[0]) - KW = int(convkernel_size[1]) - C = int(convkernel_size[2]) - flatten_size = int(KH * KW * C) + kernel_height = int(convkernel_size[0]) + kernel_width = int(convkernel_size[1]) + chan = int(convkernel_size[2]) + flatten_size = int(kernel_height * kernel_width * chan) - Oh = int(bpropFactor.get_shape()[1]) - Ow = int(bpropFactor.get_shape()[2]) + operator_height = int(bprop_factor.get_shape()[1]) + operator_width = int(bprop_factor.get_shape()[2]) - if Oh == 1 and Ow == 1 and self._channel_fac: - # factorization along the channels - # assume independence among input channels - # factor = B x 1 x 1 x (KH xKW x C) - # patches = B x Oh x Ow x (KH xKW x C) - if len(SVD_factors) == 0: + if operator_height == 1 and operator_width == 1 and self._channel_fac: + # factorization along the channels + # assume independence among input channels + # factor = B x 1 x 1 x (KH xKW x C) + # patches = B x Oh x Ow x (KH xKW x C) + if len(svd_factors) == 0: if KFAC_DEBUG: - print(('approx %s act factor with rank-1 SVD factors' % (var.name))) + print(('approx %s act factor with rank-1 SVD factors' % var.name)) # find closest rank-1 approx to the feature map S, U, V = tf.batch_svd(tf.reshape( - fpropFactor, [-1, KH * KW, C])) + fprop_factor, [-1, kernel_height * kernel_width, chan])) # get rank-1 approx slides - sqrtS1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) - patches_k = U[:, :, 0] * sqrtS1 # B x KH*KW - full_factor_shape = fpropFactor.get_shape() + sqrt_s1 = tf.expand_dims(tf.sqrt(S[:, 0, 0]), 1) + patches_k = U[:, :, 0] * sqrt_s1 # B x KH*KW + full_factor_shape = fprop_factor.get_shape() patches_k.set_shape( - [full_factor_shape[0], KH * KW]) - patches_c = V[:, :, 0] * sqrtS1 # B x C - patches_c.set_shape([full_factor_shape[0], C]) - SVD_factors[C] = patches_c - SVD_factors[KH * KW] = patches_k - fpropFactor = SVD_factors[stats_var_dim] + [full_factor_shape[0], kernel_height * kernel_width]) + patches_c = V[:, :, 0] * sqrt_s1 # B x C + patches_c.set_shape([full_factor_shape[0], chan]) + svd_factors[chan] = patches_c + svd_factors[kernel_height * kernel_width] = patches_k + fprop_factor = svd_factors[stats_var_dim] else: # poor mem usage implementation - patches = tf.extract_image_patches(fpropFactor, ksizes=[1, convkernel_size[ - 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) + patches = tf.extract_image_patches(fprop_factor, ksizes=[1, convkernel_size[ + 0], convkernel_size[1], 1], strides=strides, rates=[1, 1, 1, 1], padding=padding) - if self._approxT2: + if self._approx_t2: if KFAC_DEBUG: - print(('approxT2 act fisher for %s' % (var.name))) + print(('approxT2 act fisher for %s' % var.name)) # T^2 terms * 1/T^2, size: B x C - fpropFactor = tf.reduce_mean(patches, [1, 2]) + fprop_factor = tf.reduce_mean(patches, [1, 2]) else: # size: (B x Oh x Ow) x C - fpropFactor = 
tf.reshape( - patches, [-1, flatten_size]) / Oh / Ow - fpropFactor_size = int(fpropFactor.get_shape()[-1]) - if stats_var_dim == (fpropFactor_size + 1) and not self._blockdiag_bias: - if opType == 'Conv2D' and not self._approxT2: + fprop_factor = tf.reshape( + patches, [-1, flatten_size]) / operator_height / operator_width + fprop_factor_size = int(fprop_factor.get_shape()[-1]) + if stats_var_dim == (fprop_factor_size + 1) and not self._blockdiag_bias: + if op_type == 'Conv2D' and not self._approx_t2: # correct padding for numerical stability (we # divided out OhxOw from activations for T1 approx) - fpropFactor = tf.concat([fpropFactor, tf.ones( - [tf.shape(fpropFactor)[0], 1]) / Oh / Ow], 1) + fprop_factor = tf.concat([fprop_factor, tf.ones( + [tf.shape(fprop_factor)[0], 1]) / operator_height / operator_width], 1) else: # use homogeneous coordinates - fpropFactor = tf.concat( - [fpropFactor, tf.ones([tf.shape(fpropFactor)[0], 1])], 1) + fprop_factor = tf.concat( + [fprop_factor, tf.ones([tf.shape(fprop_factor)[0], 1])], 1) # average over the number of data points in a batch # divided by B - cov = tf.matmul(fpropFactor, fpropFactor, - transpose_a=True) / tf.cast(B, tf.float32) - updateOps.append(cov) - statsUpdates[stats_var] = cov - if opType != 'Conv2D': + cov = tf.matmul(fprop_factor, fprop_factor, + transpose_a=True) / tf.cast(batch_size, tf.float32) + update_ops.append(cov) + stats_updates[stats_var] = cov + if op_type != 'Conv2D': # HACK: for convolution we recompute fprop stats for # every layer including forking layers - statsUpdates_cache[stats_var] = cov - - for stats_var in bpropStats_vars: - stats_var_dim = int(stats_var.get_shape()[0]) - if stats_var not in statsUpdates_cache: - old_bpropFactor = bpropFactor - bpropFactor_shape = bpropFactor.get_shape() - B = tf.shape(bpropFactor)[0] # batch size - C = int(bpropFactor_shape[-1]) # num channels - if opType == 'Conv2D' or len(bpropFactor_shape) == 4: - if fpropFactor is not None: - if self._approxT2: + stats_updates_cache[stats_var] = cov + + for stats_var in bprop_stats_vars: + if stats_var not in stats_updates_cache: + bprop_factor_shape = bprop_factor.get_shape() + batch_size = tf.shape(bprop_factor)[0] # batch size + chan = int(bprop_factor_shape[-1]) # num channels + if op_type == 'Conv2D' or len(bprop_factor_shape) == 4: + if fprop_factor is not None: + if self._approx_t2: if KFAC_DEBUG: - print(('approxT2 grad fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum( - bpropFactor, [1, 2]) # T^2 terms * 1/T^2 + print(('approxT2 grad fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum( + bprop_factor, [1, 2]) # T^2 terms * 1/T^2 else: - bpropFactor = tf.reshape( - bpropFactor, [-1, C]) * Oh * Ow # T * 1/T terms + bprop_factor = tf.reshape( + bprop_factor, [-1, chan]) * operator_height * operator_width # T * 1/T terms else: # just doing block diag approx. spatial independent # structure does not apply here. summing over # spatial locations if KFAC_DEBUG: - print(('block diag approx fisher for %s' % (var.name))) - bpropFactor = tf.reduce_sum(bpropFactor, [1, 2]) + print(('block diag approx fisher for %s' % var.name)) + bprop_factor = tf.reduce_sum(bprop_factor, [1, 2]) - # assume sampled loss is averaged. TO-DO:figure out better + # assume sampled loss is averaged. 
TODO:figure out better # way to handle this - bpropFactor *= tf.to_float(B) + bprop_factor *= tf.to_float(batch_size) ## cov_b = tf.matmul( - bpropFactor, bpropFactor, transpose_a=True) / tf.to_float(tf.shape(bpropFactor)[0]) + bprop_factor, bprop_factor, transpose_a=True) / tf.to_float(tf.shape(bprop_factor)[0]) - updateOps.append(cov_b) - statsUpdates[stats_var] = cov_b - statsUpdates_cache[stats_var] = cov_b + update_ops.append(cov_b) + stats_updates[stats_var] = cov_b + stats_updates_cache[stats_var] = cov_b if KFAC_DEBUG: - aKey = list(statsUpdates.keys())[0] - statsUpdates[aKey] = tf.Print(statsUpdates[aKey], - [tf.convert_to_tensor('step:'), - self.global_step, - tf.convert_to_tensor( - 'computing stats'), - ]) - self.statsUpdates = statsUpdates - return statsUpdates - - def apply_stats(self, statsUpdates): - """ compute stats and update/apply the new stats to the running average + a_key = list(stats_updates.keys())[0] + stats_updates[a_key] = tf.Print(stats_updates[a_key], [tf.convert_to_tensor('step:'), self.global_step, + tf.convert_to_tensor('computing stats')]) + self.stats_updates = stats_updates + return stats_updates + + def apply_stats(self, stats_updates): + """ + compute stats and update/apply the new stats to the running average + + :param stats_updates: ([TensorFlow Tensor]) The stats updates + :return: (function) update stats operation """ - def updateAccumStats(): + def _update_accum_stats(): if self._full_stats_init: - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)), tf.no_op) + return tf.cond(tf.greater(self.sgd_step, self._cold_iter), lambda: tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. / self._stats_accum_iter)), + tf.no_op) else: - return tf.group(*self._apply_stats(statsUpdates, accumulate=True, accumulateCoeff=1. / self._stats_accum_iter)) + return tf.group( + *self._apply_stats(stats_updates, accumulate=True, accumulate_coeff=1. 
/ self._stats_accum_iter)) - def updateRunningAvgStats(statsUpdates, fac_iter=1): - # return tf.cond(tf.greater_equal(self.factor_step, - # tf.convert_to_tensor(fac_iter)), lambda: - # tf.group(*self._apply_stats(stats_list, varlist)), tf.no_op) - return tf.group(*self._apply_stats(statsUpdates)) + def _update_running_avg_stats(stats_updates): + return tf.group(*self._apply_stats(stats_updates)) if self._async_stats: # asynchronous stats update - update_stats = self._apply_stats(statsUpdates) + update_stats = self._apply_stats(stats_updates) queue = tf.FIFOQueue(1, [item.dtype for item in update_stats], shapes=[ - item.get_shape() for item in update_stats]) + item.get_shape() for item in update_stats]) enqueue_op = queue.enqueue(update_stats) def dequeue_stats_op(): return queue.dequeue() + self.qr_stats = tf.train.QueueRunner(queue, [enqueue_op]) update_stats_op = tf.cond(tf.equal(queue.size(), tf.convert_to_tensor( 0)), tf.no_op, lambda: tf.group(*[dequeue_stats_op(), ])) else: # synchronous stats update - update_stats_op = tf.cond(tf.greater_equal( - self.stats_step, self._stats_accum_iter), lambda: updateRunningAvgStats(statsUpdates), updateAccumStats) + update_stats_op = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: _update_running_avg_stats(stats_updates), _update_accum_stats) self._update_stats_op = update_stats_op return update_stats_op - def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.): - updateOps = [] + def _apply_stats(self, stats_updates, accumulate=False, accumulate_coeff=0.): + update_ops = [] # obtain the stats var list - for stats_var in statsUpdates: - stats_new = statsUpdates[stats_var] + for stats_var in stats_updates: + stats_new = stats_updates[stats_var] if accumulate: # simple superbatch averaging update_op = tf.assign_add( - stats_var, accumulateCoeff * stats_new, use_locking=True) + stats_var, accumulate_coeff * stats_new, use_locking=True) else: # exponential running averaging update_op = tf.assign( stats_var, stats_var * self._stats_decay, use_locking=True) update_op = tf.assign_add( update_op, (1. 
- self._stats_decay) * stats_new, use_locking=True) - updateOps.append(update_op) + update_ops.append(update_op) - with tf.control_dependencies(updateOps): + with tf.control_dependencies(update_ops): stats_step_op = tf.assign_add(self.stats_step, 1) if KFAC_DEBUG: @@ -502,120 +548,115 @@ def _apply_stats(self, statsUpdates, accumulate=False, accumulateCoeff=0.): tf.convert_to_tensor('Accum:'), tf.convert_to_tensor(accumulate), tf.convert_to_tensor('Accum coeff:'), - tf.convert_to_tensor(accumulateCoeff), + tf.convert_to_tensor(accumulate_coeff), tf.convert_to_tensor('stat step:'), - self.stats_step, updateOps[0], updateOps[1]])) + self.stats_step, update_ops[0], update_ops[1]])) return [stats_step_op, ] - def getStatsEigen(self, stats=None): + def get_stats_eigen(self, stats=None): + """ + Return the eigen values from the stats + + :param stats: ([TensorFlow Tensor]) The stats + :return: ([TensorFlow Tensor]) The stats eigen values + """ if len(self.stats_eigen) == 0: stats_eigen = {} if stats is None: stats = self.stats - tmpEigenCache = {} + tmp_eigen_cache = {} with tf.device('/cpu:0'): for var in stats: for key in ['fprop_concat_stats', 'bprop_concat_stats']: for stats_var in stats[var][key]: - if stats_var not in tmpEigenCache: + if stats_var not in tmp_eigen_cache: stats_dim = stats_var.get_shape()[1].value - e = tf.Variable(tf.ones( - [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', trainable=False) - Q = tf.Variable(tf.diag(tf.ones( - [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', trainable=False) - stats_eigen[stats_var] = {'e': e, 'Q': Q} - tmpEigenCache[ + eigen_values = tf.Variable(tf.ones( + [stats_dim]), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/e', + trainable=False) + eigen_vectors = tf.Variable(tf.diag(tf.ones( + [stats_dim])), name='KFAC_FAC/' + stats_var.name.split(':')[0] + '/Q', + trainable=False) + stats_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + tmp_eigen_cache[ stats_var] = stats_eigen[stats_var] else: - stats_eigen[stats_var] = tmpEigenCache[ + stats_eigen[stats_var] = tmp_eigen_cache[ stats_var] self.stats_eigen = stats_eigen return self.stats_eigen - def computeStatsEigen(self): - """ compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue """ - # TO-DO: figure out why this op has delays (possibly moving - # eigenvectors around?) - with tf.device('/cpu:0'): - def removeNone(tensor_list): - local_list = [] - for item in tensor_list: - if item is not None: - local_list.append(item) - return local_list - - def copyStats(var_list): - print("copying stats to buffer tensors before eigen decomp") - redundant_stats = {} - copied_list = [] - for item in var_list: - if item is not None: - if item not in redundant_stats: - if self._use_float64: - redundant_stats[item] = tf.cast( - tf.identity(item), tf.float64) - else: - redundant_stats[item] = tf.identity(item) - copied_list.append(redundant_stats[item]) - else: - copied_list.append(None) - return copied_list - #stats = [copyStats(self.fStats), copyStats(self.bStats)] - #stats = [self.fStats, self.bStats] + def compute_stats_eigen(self): + """ + compute the eigen decomp using copied var stats to avoid concurrent read/write from other queue + :return: ([TensorFlow Tensor]) update operations + """ + # TODO: figure out why this op has delays (possibly moving eigenvectors around?) 
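
# Reviewer sketch, not part of the patch: each stats factor is a symmetric
# covariance matrix, so the eigendecomposition computed by this method
# (factor = Q diag(e) Q^T, via tf.self_adjoint_eig) lets the later update
# invert it with damping elementwise in the eigenbasis. Rough NumPy
# equivalent, with illustrative shapes and a plain additive damping:
import numpy as np

rng = np.random.default_rng(0)
samples = rng.standard_normal((256, 16))
factor = samples.T @ samples / samples.shape[0]        # symmetric PSD stand-in

eigvals, eigvecs = np.linalg.eigh(factor)              # e, Q
damping = 1e-2
inv_factor = eigvecs @ np.diag(1.0 / (eigvals + damping)) @ eigvecs.T
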
+ with tf.device('/cpu:0'): stats_eigen = self.stats_eigen - computedEigen = {} + computed_eigen = {} eigen_reverse_lookup = {} - updateOps = [] + update_ops = [] # sync copied stats - # with tf.control_dependencies(removeNone(stats[0]) + - # removeNone(stats[1])): with tf.control_dependencies([]): for stats_var in stats_eigen: - if stats_var not in computedEigen: - eigens = tf.self_adjoint_eig(stats_var) - e = eigens[0] - Q = eigens[1] + if stats_var not in computed_eigen: + eigen_decomposition = tf.self_adjoint_eig(stats_var) + eigen_values = eigen_decomposition[0] + eigen_vectors = eigen_decomposition[1] if self._use_float64: - e = tf.cast(e, tf.float32) - Q = tf.cast(Q, tf.float32) - updateOps.append(e) - updateOps.append(Q) - computedEigen[stats_var] = {'e': e, 'Q': Q} - eigen_reverse_lookup[e] = stats_eigen[stats_var]['e'] - eigen_reverse_lookup[Q] = stats_eigen[stats_var]['Q'] + eigen_values = tf.cast(eigen_values, tf.float64) + eigen_vectors = tf.cast(eigen_vectors, tf.float64) + update_ops.append(eigen_values) + update_ops.append(eigen_vectors) + computed_eigen[stats_var] = {'e': eigen_values, 'Q': eigen_vectors} + eigen_reverse_lookup[eigen_values] = stats_eigen[stats_var]['e'] + eigen_reverse_lookup[eigen_vectors] = stats_eigen[stats_var]['Q'] self.eigen_reverse_lookup = eigen_reverse_lookup - self.eigen_update_list = updateOps + self.eigen_update_list = update_ops if KFAC_DEBUG: - self.eigen_update_list = [item for item in updateOps] - with tf.control_dependencies(updateOps): - updateOps.append(tf.Print(tf.constant( + self.eigen_update_list = [item for item in update_ops] + with tf.control_dependencies(update_ops): + update_ops.append(tf.Print(tf.constant( 0.), [tf.convert_to_tensor('computed factor eigen')])) - return updateOps + return update_ops + + def apply_stats_eigen(self, eigen_list): + """ + apply the update using the eigen values of the stats - def applyStatsEigen(self, eigen_list): - updateOps = [] + :param eigen_list: ([TensorFlow Tensor]) The list of eigen values of the stats + :return: ([TensorFlow Tensor]) update operations + """ + update_ops = [] print(('updating %d eigenvalue/vectors' % len(eigen_list))) - for i, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): + for _, (tensor, mark) in enumerate(zip(eigen_list, self.eigen_update_list)): stats_eigen_var = self.eigen_reverse_lookup[mark] - updateOps.append( + update_ops.append( tf.assign(stats_eigen_var, tensor, use_locking=True)) - with tf.control_dependencies(updateOps): + with tf.control_dependencies(update_ops): factor_step_op = tf.assign_add(self.factor_step, 1) - updateOps.append(factor_step_op) + update_ops.append(factor_step_op) if KFAC_DEBUG: - updateOps.append(tf.Print(tf.constant( + update_ops.append(tf.Print(tf.constant( 0.), [tf.convert_to_tensor('updated kfac factors')])) - return updateOps + return update_ops + + def get_kfac_precond_updates(self, gradlist, varlist): + """ + return the KFAC updates - def getKfacPrecondUpdates(self, gradlist, varlist): - updatelist = [] - vg = 0. + :param gradlist: ([TensorFlow Tensor]) The gradients + :param varlist: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the update list + """ + v_g = 0. 
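Because each accumulated factor is symmetric, tf.self_adjoint_eig used in compute_stats_eigen returns the same decomposition as numpy.linalg.eigh; the eigen step is roughly the following sketch (illustrative names, and the optional float64 cast stands in for the _use_float64 switch):

import numpy as np

def compute_factor_eigen(factor, use_float64=False):
    # factor: a symmetric, approximately PSD statistics matrix
    if use_float64:
        factor = factor.astype(np.float64)
    eigen_values, eigen_vectors = np.linalg.eigh(factor)
    return eigen_values, eigen_vectors

factor = np.random.randn(4, 4)
factor = factor @ factor.T / 4.0
e, q = compute_factor_eigen(factor)
# Q diag(e) Q^T reconstructs the factor; apply_stats_eigen stores e and Q
# back into the variables created by get_stats_eigen
assert np.allclose(q @ np.diag(e) @ q.T, factor)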
assert len(self.stats) > 0 assert len(self.stats_eigen) > 0 @@ -625,220 +666,223 @@ def getKfacPrecondUpdates(self, gradlist, varlist): grad_dict = {var: grad for grad, var in zip(gradlist, varlist)} for grad, var in zip(gradlist, varlist): - GRAD_RESHAPE = False - GRAD_TRANSPOSE = False + grad_reshape = False - fpropFactoredFishers = self.stats[var]['fprop_concat_stats'] - bpropFactoredFishers = self.stats[var]['bprop_concat_stats'] + fprop_factored_fishers = self.stats[var]['fprop_concat_stats'] + bprop_factored_fishers = self.stats[var]['bprop_concat_stats'] - if (len(fpropFactoredFishers) + len(bpropFactoredFishers)) > 0: + if (len(fprop_factored_fishers) + len(bprop_factored_fishers)) > 0: counter += 1 - GRAD_SHAPE = grad.get_shape() + grad_shape = grad.get_shape() if len(grad.get_shape()) > 2: # reshape conv kernel parameters - KW = int(grad.get_shape()[0]) - KH = int(grad.get_shape()[1]) - C = int(grad.get_shape()[2]) - D = int(grad.get_shape()[3]) + kernel_width = int(grad.get_shape()[0]) + kernel_height = int(grad.get_shape()[1]) + n_channels = int(grad.get_shape()[2]) + depth = int(grad.get_shape()[3]) - if len(fpropFactoredFishers) > 1 and self._channel_fac: + if len(fprop_factored_fishers) > 1 and self._channel_fac: # reshape conv kernel parameters into tensor - grad = tf.reshape(grad, [KW * KH, C, D]) + grad = tf.reshape(grad, [kernel_width * kernel_height, n_channels, depth]) else: # reshape conv kernel parameters into 2D grad - grad = tf.reshape(grad, [-1, D]) - GRAD_RESHAPE = True + grad = tf.reshape(grad, [-1, depth]) + grad_reshape = True elif len(grad.get_shape()) == 1: # reshape bias or 1D parameters - D = int(grad.get_shape()[0]) grad = tf.expand_dims(grad, 0) - GRAD_RESHAPE = True - else: - # 2D parameters - C = int(grad.get_shape()[0]) - D = int(grad.get_shape()[1]) + grad_reshape = True if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. 
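Before any whitening happens, gradients are forced into matrix form: 4-D convolution kernel gradients are flattened to [kw * kh * c, d] (or kept as [kw * kh, c, d] when channel factorization applies; the real code also checks how many forward factors exist), and 1-D bias gradients become row vectors. A NumPy sketch of that reshaping, with assumed illustrative names:

import numpy as np

def reshape_grad_for_kfac(grad, channel_fac=False):
    # conv kernel gradients become 2-D (or 3-D) matrices, biases become
    # row vectors, plain 2-D weight gradients pass through unchanged
    if grad.ndim > 2:
        kernel_width, kernel_height, n_channels, depth = grad.shape
        if channel_fac:
            return grad.reshape(kernel_width * kernel_height, n_channels, depth)
        return grad.reshape(-1, depth)
    if grad.ndim == 1:
        return grad[None, :]
    return grad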
- # TO-DO: figure out how to factorize bias grad + # TODO: figure out how to factorize bias grad # stack bias grad - var_assnBias = self.stats[var]['assnBias'] + var_assn_bias = self.stats[var]['assnBias'] grad = tf.concat( - [grad, tf.expand_dims(grad_dict[var_assnBias], 0)], 0) + [grad, tf.expand_dims(grad_dict[var_assn_bias], 0)], 0) # project gradient to eigen space and reshape the eigenvalues # for broadcasting - eigVals = [] + eig_vals = [] for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='act', debug=KFAC_DEBUG) + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='act', debug=KFAC_DEBUG) - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='act') - eigVals.append(e) - grad = gmatmul(Q, grad, transpose_a=True, reduce_dim=idx) + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='act') + eig_vals.append(eigen_values) + grad = gmatmul(eigen_vectors, grad, transpose_a=True, reduce_dim=idx) for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - e = detectMinVal(self.stats_eigen[stats][ - 'e'], var, name='grad', debug=KFAC_DEBUG) + eigen_vectors = self.stats_eigen[stats]['Q'] + eigen_values = detect_min_val(self.stats_eigen[stats][ + 'e'], var, name='grad', debug=KFAC_DEBUG) - Q, e = factorReshape(Q, e, grad, facIndx=idx, ftype='grad') - eigVals.append(e) - grad = gmatmul(grad, Q, transpose_b=False, reduce_dim=idx) - ## + eigen_vectors, eigen_values = factor_reshape(eigen_vectors, eigen_values, + grad, fac_idx=idx, f_type='grad') + eig_vals.append(eigen_values) + grad = gmatmul(grad, eigen_vectors, transpose_b=False, reduce_dim=idx) - ##### # whiten using eigenvalues - weightDecayCoeff = 0. + weight_decay_coeff = 0. if var in self._weight_decay_dict: - weightDecayCoeff = self._weight_decay_dict[var] + weight_decay_coeff = self._weight_decay_dict[var] if KFAC_DEBUG: - print(('weight decay coeff for %s is %f' % (var.name, weightDecayCoeff))) + print(('weight decay coeff for %s is %f' % (var.name, weight_decay_coeff))) if self._factored_damping: if KFAC_DEBUG: - print(('use factored damping for %s' % (var.name))) + print(('use factored damping for %s' % var.name)) coeffs = 1. - num_factors = len(eigVals) + num_factors = len(eig_vals) # compute the ratio of two trace norm of the left and right # KFac matrices, and their generalization - if len(eigVals) == 1: - damping = self._epsilon + weightDecayCoeff + if len(eig_vals) == 1: + damping = self._epsilon + weight_decay_coeff else: damping = tf.pow( - self._epsilon + weightDecayCoeff, 1. / num_factors) - eigVals_tnorm_avg = [tf.reduce_mean( - tf.abs(e)) for e in eigVals] - for e, e_tnorm in zip(eigVals, eigVals_tnorm_avg): - eig_tnorm_negList = [ - item for item in eigVals_tnorm_avg if item != e_tnorm] - if len(eigVals) == 1: + self._epsilon + weight_decay_coeff, 1. / num_factors) + eig_vals_tnorm_avg = [tf.reduce_mean( + tf.abs(e)) for e in eig_vals] + for eigen_val, e_tnorm in zip(eig_vals, eig_vals_tnorm_avg): + eig_tnorm_neg_list = [ + item for item in eig_vals_tnorm_avg if item != e_tnorm] + if len(eig_vals) == 1: adjustment = 1. 
- elif len(eigVals) == 2: + elif len(eig_vals) == 2: adjustment = tf.sqrt( - e_tnorm / eig_tnorm_negList[0]) + e_tnorm / eig_tnorm_neg_list[0]) else: - eig_tnorm_negList_prod = reduce( - lambda x, y: x * y, eig_tnorm_negList) + eig_tnorm_neg_list_prod = reduce( + lambda x, y: x * y, eig_tnorm_neg_list) adjustment = tf.pow( - tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_negList_prod, 1. / num_factors) - coeffs *= (e + adjustment * damping) + tf.pow(e_tnorm, num_factors - 1.) / eig_tnorm_neg_list_prod, 1. / num_factors) + coeffs *= (eigen_val + adjustment * damping) else: coeffs = 1. - damping = (self._epsilon + weightDecayCoeff) - for e in eigVals: - coeffs *= e + damping = (self._epsilon + weight_decay_coeff) + for eigen_val in eig_vals: + coeffs *= eigen_val coeffs += damping - #grad = tf.Print(grad, [tf.convert_to_tensor('1'), tf.convert_to_tensor(var.name), grad.get_shape()]) - grad /= coeffs - #grad = tf.Print(grad, [tf.convert_to_tensor('2'), tf.convert_to_tensor(var.name), grad.get_shape()]) - ##### # project gradient back to euclidean space for idx, stats in enumerate(self.stats[var]['fprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(Q, grad, transpose_a=False, reduce_dim=idx) + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(eigen_vectors, grad, transpose_a=False, reduce_dim=idx) for idx, stats in enumerate(self.stats[var]['bprop_concat_stats']): - Q = self.stats_eigen[stats]['Q'] - grad = gmatmul(grad, Q, transpose_b=True, reduce_dim=idx) - ## + eigen_vectors = self.stats_eigen[stats]['Q'] + grad = gmatmul(grad, eigen_vectors, transpose_b=True, reduce_dim=idx) - #grad = tf.Print(grad, [tf.convert_to_tensor('3'), tf.convert_to_tensor(var.name), grad.get_shape()]) if (self.stats[var]['assnBias'] is not None) and not self._blockdiag_bias: # use homogeneous coordinates only works for 2D grad. 
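Putting the pieces of this hunk together, the non-factored-damping path for a plain 2-D weight gradient is: rotate the gradient into the eigenbasis of the forward (activation) and backward (gradient) factors, divide elementwise by the eigenvalue products plus a damping term, and rotate back. A rough NumPy sketch under those assumptions (no channel factorization, no bias stacking, no factored Tikhonov damping; names and the damping value are illustrative):

import numpy as np

def kfac_precondition(grad, act_factor, grad_factor, damping=1e-2):
    # grad: 2-D weight gradient, shape [n_in, n_out]
    # act_factor: fprop statistics, shape [n_in, n_in]
    # grad_factor: bprop statistics, shape [n_out, n_out]
    e_act, q_act = np.linalg.eigh(act_factor)
    e_grad, q_grad = np.linalg.eigh(grad_factor)
    rotated = q_act.T @ grad @ q_grad             # project into the Kronecker eigenbasis
    rotated /= np.outer(e_act, e_grad) + damping  # whiten with damped eigenvalue products
    return q_act @ rotated @ q_grad.T             # project back to parameter space

After preconditioning, the whole update list is rescaled by min(1, sqrt(clip_kl / vFv)), where vFv accumulates lr^2 * sum(precond_grad * grad) over all variables; that trust-region style clipping is handled at the end of get_kfac_precond_updates.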
- # TO-DO: figure out how to factorize bias grad + # TODO: figure out how to factorize bias grad # un-stack bias grad - var_assnBias = self.stats[var]['assnBias'] - C_plus_one = int(grad.get_shape()[0]) - grad_assnBias = tf.reshape(tf.slice(grad, - begin=[ - C_plus_one - 1, 0], - size=[1, -1]), var_assnBias.get_shape()) - grad_assnWeights = tf.slice(grad, - begin=[0, 0], - size=[C_plus_one - 1, -1]) - grad_dict[var_assnBias] = grad_assnBias - grad = grad_assnWeights - - #grad = tf.Print(grad, [tf.convert_to_tensor('4'), tf.convert_to_tensor(var.name), grad.get_shape()]) - if GRAD_RESHAPE: - grad = tf.reshape(grad, GRAD_SHAPE) + var_assn_bias = self.stats[var]['assnBias'] + c_plus_one = int(grad.get_shape()[0]) + grad_assn_bias = tf.reshape(tf.slice(grad, + begin=[ + c_plus_one - 1, 0], + size=[1, -1]), var_assn_bias.get_shape()) + grad_assn_weights = tf.slice(grad, + begin=[0, 0], + size=[c_plus_one - 1, -1]) + grad_dict[var_assn_bias] = grad_assn_bias + grad = grad_assn_weights + + if grad_reshape: + grad = tf.reshape(grad, grad_shape) grad_dict[var] = grad print(('projecting %d gradient matrices' % counter)) - for g, var in zip(gradlist, varlist): + for grad_1, var in zip(gradlist, varlist): grad = grad_dict[var] - ### clipping ### + # clipping if KFAC_DEBUG: - print(('apply clipping to %s' % (var.name))) + print(('apply clipping to %s' % var.name)) tf.Print(grad, [tf.sqrt(tf.reduce_sum(tf.pow(grad, 2)))], "Euclidean norm of new grad") - local_vg = tf.reduce_sum(grad * g * (self._lr * self._lr)) - vg += local_vg + local_vg = tf.reduce_sum(grad * grad_1 * (self._lr * self._lr)) + v_g += local_vg - # recale everything + # rescale everything if KFAC_DEBUG: print('apply vFv clipping') - scaling = tf.minimum(1., tf.sqrt(self._clip_kl / vg)) + scaling = tf.minimum(1., tf.sqrt(self._clip_kl / v_g)) if KFAC_DEBUG: scaling = tf.Print(scaling, [tf.convert_to_tensor( - 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), vg]) - with tf.control_dependencies([tf.assign(self.vFv, vg)]): + 'clip: '), scaling, tf.convert_to_tensor(' vFv: '), v_g]) + with tf.control_dependencies([tf.assign(self.v_f_v, v_g)]): updatelist = [grad_dict[var] for var in varlist] for i, item in enumerate(updatelist): updatelist[i] = scaling * item return updatelist - def compute_gradients(self, loss, var_list=None): + @classmethod + def compute_gradients(cls, loss, var_list=None): + """ + compute the gradients from the loss and the parameters + + :param loss: ([TensorFlow Tensor]) The loss + :param var_list: ([TensorFlow Tensor]) The parameters + :return: ([TensorFlow Tensor]) the gradient + """ varlist = var_list if varlist is None: varlist = tf.trainable_variables() - g = tf.gradients(loss, varlist) + gradients = tf.gradients(loss, varlist) - return [(a, b) for a, b in zip(g, varlist)] + return [(a, b) for a, b in zip(gradients, varlist)] def apply_gradients_kfac(self, grads): - g, varlist = list(zip(*grads)) + """ + apply the kfac gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: ([function], QueueRunner) Update functions, queue operation runner + """ + grad, varlist = list(zip(*grads)) if len(self.stats_eigen) == 0: - self.getStatsEigen() + self.get_stats_eigen() - qr = None + queue_runner = None # launch eigen-decomp on a queue thread if self._async: print('Use async eigen decomp') # get a list of factor loading tensors - factorOps_dummy = self.computeStatsEigen() + factor_ops_dummy = self.compute_stats_eigen() # define a queue for the list of factor loading tensors - queue = tf.FIFOQueue(1, 
[item.dtype for item in factorOps_dummy], shapes=[ - item.get_shape() for item in factorOps_dummy]) - enqueue_op = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( - 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: queue.enqueue(self.computeStatsEigen()), tf.no_op) + queue = tf.FIFOQueue(1, [item.dtype for item in factor_ops_dummy], + shapes=[item.get_shape() for item in factor_ops_dummy]) + enqueue_op = tf.cond( + tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), tf.convert_to_tensor( + 0)), tf.greater_equal(self.stats_step, self._stats_accum_iter)), + lambda: queue.enqueue(self.compute_stats_eigen()), tf.no_op) def dequeue_op(): return queue.dequeue() - qr = tf.train.QueueRunner(queue, [enqueue_op]) + queue_runner = tf.train.QueueRunner(queue, [enqueue_op]) - updateOps = [] + update_ops = [] global_step_op = tf.assign_add(self.global_step, 1) - updateOps.append(global_step_op) + update_ops.append(global_step_op) with tf.control_dependencies([global_step_op]): # compute updates - assert self._update_stats_op != None - updateOps.append(self._update_stats_op) + assert self._update_stats_op is not None + update_ops.append(self._update_stats_op) dependency_list = [] if not self._async: dependency_list.append(self._update_stats_op) @@ -849,78 +893,99 @@ def no_op_wrapper(): if not self._async: # synchronous eigen-decomp updates - updateFactorOps = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), - tf.convert_to_tensor(0)), - tf.greater_equal(self.stats_step, self._stats_accum_iter)), lambda: tf.group(*self.applyStatsEigen(self.computeStatsEigen())), no_op_wrapper) + update_factor_ops = tf.cond(tf.logical_and(tf.equal(tf.mod(self.stats_step, self._kfac_update), + tf.convert_to_tensor(0)), + tf.greater_equal(self.stats_step, + self._stats_accum_iter)), + lambda: tf.group(*self.apply_stats_eigen(self.compute_stats_eigen())), + no_op_wrapper) else: # asynchronous eigen-decomp updates using queue - updateFactorOps = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), - lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), - tf.no_op, + update_factor_ops = tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), + lambda: tf.cond(tf.equal(queue.size(), tf.convert_to_tensor(0)), + tf.no_op, + + lambda: tf.group( + *self.apply_stats_eigen(dequeue_op())), + ), + no_op_wrapper) - lambda: tf.group( - *self.applyStatsEigen(dequeue_op())), - ), - no_op_wrapper) + update_ops.append(update_factor_ops) - updateOps.append(updateFactorOps) + with tf.control_dependencies([update_factor_ops]): + def grad_op(): + return list(grad) - with tf.control_dependencies([updateFactorOps]): - def gradOp(): - return list(g) + def get_kfac_grad_op(): + return self.get_kfac_precond_updates(grad, varlist) - def getKfacGradOp(): - return self.getKfacPrecondUpdates(g, varlist) u = tf.cond(tf.greater(self.factor_step, - tf.convert_to_tensor(0)), getKfacGradOp, gradOp) + tf.convert_to_tensor(0)), get_kfac_grad_op, grad_op) optim = tf.train.MomentumOptimizer( self._lr * (1. 
- self._momentum), self._momentum) - #optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) - def optimOp(): - def updateOptimOp(): + # optim = tf.train.AdamOptimizer(self._lr, epsilon=0.01) + + def optim_op(): + def update_optim_op(): if self._full_stats_init: - return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) + return tf.cond(tf.greater(self.factor_step, tf.convert_to_tensor(0)), + lambda: optim.apply_gradients(list(zip(u, varlist))), tf.no_op) else: return optim.apply_gradients(list(zip(u, varlist))) + if self._full_stats_init: - return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), updateOptimOp, tf.no_op) + return tf.cond(tf.greater_equal(self.stats_step, self._stats_accum_iter), update_optim_op, + tf.no_op) else: - return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), updateOptimOp, tf.no_op) - updateOps.append(optimOp()) + return tf.cond(tf.greater_equal(self.sgd_step, self._cold_iter), update_optim_op, tf.no_op) - return tf.group(*updateOps), qr + update_ops.append(optim_op()) + + return tf.group(*update_ops), queue_runner def apply_gradients(self, grads): - coldOptim = tf.train.MomentumOptimizer( - self._cold_lr, self._momentum) + """ + apply the gradient + + :param grads: ([TensorFlow Tensor]) the gradient + :return: (function, QueueRunner) train operation, queue operation runner + """ + cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum) - def coldSGDstart(): + def _cold_sgd_start(): sgd_grads, sgd_var = zip(*grads) - if self.max_grad_norm != None: - sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm) + if self.max_grad_norm is not None: + sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm) - sgd_grads = list(zip(sgd_grads,sgd_var)) + sgd_grads = list(zip(sgd_grads, sgd_var)) sgd_step_op = tf.assign_add(self.sgd_step, 1) - coldOptim_op = coldOptim.apply_gradients(sgd_grads) + cold_optim_op = cold_optim.apply_gradients(sgd_grads) if KFAC_DEBUG: - with tf.control_dependencies([sgd_step_op, coldOptim_op]): + with tf.control_dependencies([sgd_step_op, cold_optim_op]): sgd_step_op = tf.Print( sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')]) - return tf.group(*[sgd_step_op, coldOptim_op]) + return tf.group(*[sgd_step_op, cold_optim_op]) - kfacOptim_op, qr = self.apply_gradients_kfac(grads) + kfac_optim_op, queue_runner = self.apply_gradients_kfac(grads) - def warmKFACstart(): - return kfacOptim_op + def _warm_kfac_start(): + return kfac_optim_op - return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr + return tf.cond(tf.greater(self.sgd_step, self._cold_iter), _warm_kfac_start, _cold_sgd_start), queue_runner def minimize(self, loss, loss_sampled, var_list=None): + """ + minimize the gradient loss + + :param loss: ([TensorFlow Tensor]) The loss + :param loss_sampled: ([TensorFlow Tensor]) the loss function output + :param var_list: ([TensorFlow Tensor]) The parameters + :return: (function, q_runner) train operation, queue operation runner + """ grads = self.compute_gradients(loss, var_list=var_list) - update_stats_op = self.compute_and_apply_stats( - loss_sampled, var_list=var_list) + self.compute_and_apply_stats(loss_sampled, var_list=var_list) return self.apply_gradients(grads) diff --git a/baselines/acktr/kfac_utils.py b/baselines/acktr/kfac_utils.py index edc623d737..512e21a239 100644 --- a/baselines/acktr/kfac_utils.py +++ 
b/baselines/acktr/kfac_utils.py @@ -1,20 +1,31 @@ import tensorflow as tf -def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): + +def gmatmul(tensor_a, tensor_b, transpose_a=False, transpose_b=False, reduce_dim=None): + """ + Do a matrix multiplication with tensor 'a' and 'b', even when their shape do not match + + :param tensor_a: (TensorFlow Tensor) + :param tensor_b: (TensorFlow Tensor) + :param transpose_a: (bool) If 'a' needs transposing + :param transpose_b: (bool) If 'b' needs transposing + :param reduce_dim: (int) the multiplication over the dim + :return: (TensorFlow Tensor) a * b + """ assert reduce_dim is not None # weird batch matmul - if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: + if len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) > 2: # reshape reduce_dim to the left most dim in b - b_shape = b.get_shape() + b_shape = tensor_b.get_shape() if reduce_dim != 0: b_dims = list(range(len(b_shape))) b_dims.remove(reduce_dim) b_dims.insert(0, reduce_dim) - b = tf.transpose(b, b_dims) - b_t_shape = b.get_shape() - b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) - result = tf.matmul(a, b, transpose_a=transpose_a, + tensor_b = tf.transpose(tensor_b, b_dims) + b_t_shape = tensor_b.get_shape() + tensor_b = tf.reshape(tensor_b, [int(b_shape[reduce_dim]), -1]) + result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, b_t_shape) if reduce_dim != 0: @@ -24,19 +35,19 @@ def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): result = tf.transpose(result, b_dims) return result - elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: + elif len(tensor_a.get_shape()) > 2 and len(tensor_b.get_shape()) == 2: # reshape reduce_dim to the right most dim in a - a_shape = a.get_shape() + a_shape = tensor_a.get_shape() outter_dim = len(a_shape) - 1 reduce_dim = len(a_shape) - reduce_dim - 1 if reduce_dim != outter_dim: a_dims = list(range(len(a_shape))) a_dims.remove(reduce_dim) a_dims.insert(outter_dim, reduce_dim) - a = tf.transpose(a, a_dims) - a_t_shape = a.get_shape() - a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) - result = tf.matmul(a, b, transpose_a=transpose_a, + tensor_a = tf.transpose(tensor_a, a_dims) + a_t_shape = tensor_a.get_shape() + tensor_a = tf.reshape(tensor_a, [-1, int(a_shape[reduce_dim])]) + result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) result = tf.reshape(result, a_t_shape) if reduce_dim != outter_dim: @@ -46,41 +57,72 @@ def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): result = tf.transpose(result, a_dims) return result - elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: - return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) + elif len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) == 2: + return tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) assert False, 'something went wrong' -def clipoutNeg(vec, threshold=1e-6): +def clipout_neg(vec, threshold=1e-6): + """ + clip to 0 if input lower than threshold value + + :param vec: (TensorFlow Tensor) + :param threshold: (float) the cutoff threshold + :return: (TensorFlow Tensor) clipped input + """ mask = tf.cast(vec > threshold, tf.float32) return mask * vec -def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): +def detect_min_val(input_mat, var, threshold=1e-6, name='', debug=False): + """ + If debug is not set, will run clipout_neg. 
Else, will clip and print out odd eigen values + + :param input_mat: (TensorFlow Tensor) + :param var: (TensorFlow Tensor) variable + :param threshold: (float) the cutoff threshold + :param name: (str) the name of the variable + :param debug: (bool) debug function + :return: (TensorFlow Tensor) clipped tensor + """ eigen_min = tf.reduce_min(input_mat) eigen_max = tf.reduce_max(input_mat) eigen_ratio = eigen_max / eigen_min - input_mat_clipped = clipoutNeg(input_mat, threshold) + input_mat_clipped = clipout_neg(input_mat, threshold) if debug: - input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( - input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) + input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), + lambda: input_mat_clipped, lambda: tf.Print( + input_mat_clipped, + [tf.convert_to_tensor('odd ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), + eigen_min, eigen_max, eigen_ratio])) return input_mat_clipped -def factorReshape(Q, e, grad, facIndx=0, ftype='act'): +def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'): + """ + factor and reshape input eigen values + + :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors + :param eigen_values: ([TensorFlow Tensor]) eigen values + :param grad: ([TensorFlow Tensor]) gradient + :param fac_idx: (int) index that should be factored + :param f_type: (str) function type to factor and reshape + :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors + and eigen values + """ grad_shape = grad.get_shape() - if ftype == 'act': - assert e.get_shape()[0] == grad_shape[facIndx] + if f_type == 'act': + assert eigen_values.get_shape()[0] == grad_shape[fac_idx] expanded_shape = [1, ] * len(grad_shape) - expanded_shape[facIndx] = -1 - e = tf.reshape(e, expanded_shape) - if ftype == 'grad': - assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] + expanded_shape[fac_idx] = -1 + eigen_values = tf.reshape(eigen_values, expanded_shape) + if f_type == 'grad': + assert eigen_values.get_shape()[0] == grad_shape[len(grad_shape) - fac_idx - 1] expanded_shape = [1, ] * len(grad_shape) - expanded_shape[len(grad_shape) - facIndx - 1] = -1 - e = tf.reshape(e, expanded_shape) + expanded_shape[len(grad_shape) - fac_idx - 1] = -1 + eigen_values = tf.reshape(eigen_values, expanded_shape) - return Q, e + return eigen_vectors, eigen_values diff --git a/baselines/acktr/policies.py b/baselines/acktr/policies.py index 39bb6cbe6d..56896e30ad 100644 --- a/baselines/acktr/policies.py +++ b/baselines/acktr/policies.py @@ -1,42 +1,75 @@ import numpy as np import tensorflow as tf + from baselines.acktr.utils import dense, kl_div -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util + class GaussianMlpPolicy(object): def __init__(self, ob_dim, ac_dim): + """ + Create a gaussian MLP policy + + :param ob_dim: (int) Observation dimention + :param ac_dim: (int) action dimention + """ # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them - ob_no = tf.placeholder(tf.float32, shape=[None, 
ob_dim*2], name="ob") # batch of observations - oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions - oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions - adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate + ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations + oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions + # batch of actions previous action distributions + oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist") + adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate wd_dict = {} - h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) - mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output + layer_1 = tf.nn.tanh(dense(ob_no, 64, "h1", + weight_init=tf_util.normc_initializer(1.0), + bias_init=0.0, weight_loss_dict=wd_dict)) + layer_2 = tf.nn.tanh(dense(layer_1, 64, "h2", + weight_init=tf_util.normc_initializer(1.0), + bias_init=0.0, weight_loss_dict=wd_dict)) + mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1), + bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict - self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs + # Variance on outputs + self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) - sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. 
- logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action - logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) - kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) - #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n - surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient - surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy - self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob - #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy - self.compute_kl = U.function([ob_no, oldac_dist], kl) - self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss - U.initialize() # Initialize uninitialized TF variables + # This is the sampled action we'll perform. + sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim] + logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( + 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( + tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), + axis=1) # Logprob of sampled action + logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( + 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( + tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), + axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) + kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) + # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) + # Approximation of KL divergence between old policy used to generate actions, + # and new policy used to compute logprob_n + surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient + surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy + # Generate a new action and its logprob + self._act = tf_util.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) + # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) + # Compute (approximate) KL divergence between old policy and new policy + self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss) + # Input and output variables needed for computing loss + self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) + tf_util.initialize() # Initialize uninitialized TF variables + + def act(self, obs): + """ + get the action from an observation - def act(self, ob): - ac, ac_dist, logp = self._act(ob[None]) - return ac[0], ac_dist[0], logp[0] + :param obs: ([float]) observation + :return: ([float], [float], [float]) action, action_proba, logp + """ + action, ac_dist, logp = self._act(obs[None]) + return action[0], ac_dist[0], logp[0] diff --git 
a/baselines/acktr/run_atari.py b/baselines/acktr/run_atari.py index 6e398ce25d..f9df54991a 100644 --- a/baselines/acktr/run_atari.py +++ b/baselines/acktr/run_atari.py @@ -1,23 +1,36 @@ #!/usr/bin/env python3 - from functools import partial from baselines import logger from baselines.acktr.acktr_disc import learn from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack -from baselines.ppo2.policies import CnnPolicy +from baselines.a2c.policies import CnnPolicy + def train(env_id, num_timesteps, seed, num_cpu): + """ + train an ACKTR model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param num_cpu: (int) The number of cpu to train on + """ env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) policy_fn = partial(CnnPolicy, one_dim_bias=True) learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) + if __name__ == '__main__': main() diff --git a/baselines/acktr/run_mujoco.py b/baselines/acktr/run_mujoco.py index 9065d58807..2ca413177b 100644 --- a/baselines/acktr/run_mujoco.py +++ b/baselines/acktr/run_mujoco.py @@ -1,34 +1,46 @@ #!/usr/bin/env python3 import tensorflow as tf + from baselines import logger from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser from baselines.acktr.acktr_cont import learn from baselines.acktr.policies import GaussianMlpPolicy from baselines.acktr.value_functions import NeuralNetValueFunction + def train(env_id, num_timesteps, seed): + """ + train an ACKTR model on atari + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ env = make_mujoco_env(env_id, seed) with tf.Session(config=tf.ConfigProto()): ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] with tf.variable_scope("vf"): - vf = NeuralNetValueFunction(ob_dim, ac_dim) + value_fn = NeuralNetValueFunction(ob_dim, ac_dim) with tf.variable_scope("pi"): policy = GaussianMlpPolicy(ob_dim, ac_dim) - learn(env, policy=policy, vf=vf, - gamma=0.99, lam=0.97, timesteps_per_batch=2500, - desired_kl=0.002, - num_timesteps=num_timesteps, animate=False) + learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, + num_timesteps=num_timesteps, animate=False) env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == "__main__": main() diff --git a/baselines/acktr/utils.py b/baselines/acktr/utils.py index 227350fe5e..5b67b2c804 100644 --- a/baselines/acktr/utils.py +++ b/baselines/acktr/utils.py @@ -1,28 +1,49 @@ import tensorflow as tf -def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): + +def dense(input_tensor, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): + """ + A dense Layer + + :param input_tensor: ([TensorFlow Tensor]) input + :param size: (int) number of hidden neurons + :param name: (str) layer name + :param weight_init: (function or int or float) initialize the weight + :param bias_init: 
(function or int or float) initialize the weight + :param weight_loss_dict: (dict) store the weight loss if not None + :param reuse: (bool) if can be reused + :return: ([TensorFlow Tensor]) the output of the dense Layer + """ with tf.variable_scope(name, reuse=reuse): - assert (len(tf.get_variable_scope().name.split('/')) == 2) + assert len(tf.get_variable_scope().name.split('/')) == 2 - w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) - b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) + weight = tf.get_variable("w", [input_tensor.get_shape()[1], size], initializer=weight_init) + bias = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) weight_decay_fc = 3e-4 if weight_loss_dict is not None: - weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') - if weight_loss_dict is not None: - weight_loss_dict[w] = weight_decay_fc - weight_loss_dict[b] = 0.0 + weight_decay = tf.multiply(tf.nn.l2_loss(weight), weight_decay_fc, name='weight_decay_loss') + weight_loss_dict[weight] = weight_decay_fc + weight_loss_dict[bias] = 0.0 tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) - return tf.nn.bias_add(tf.matmul(x, w), b) + return tf.nn.bias_add(tf.matmul(input_tensor, weight), bias) + def kl_div(action_dist1, action_dist2, action_size): + """ + Kullback leiber divergence + + :param action_dist1: ([TensorFlow Tensor]) action distribution 1 + :param action_dist2: ([TensorFlow Tensor]) action distribution 2 + :param action_size: (int) the shape of an action + :return: (float) Kullback leiber divergence + """ mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) denominator = 2 * tf.square(std2) + 1e-8 return tf.reduce_sum( - numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) + numerator / denominator + tf.log(std2) - tf.log(std1), reduction_indices=-1) diff --git a/baselines/acktr/value_functions.py b/baselines/acktr/value_functions.py index d1e9e1a361..c97e0c8949 100644 --- a/baselines/acktr/value_functions.py +++ b/baselines/acktr/value_functions.py @@ -1,50 +1,86 @@ -from baselines import logger import numpy as np -import baselines.common as common -from baselines.common import tf_util as U import tensorflow as tf + +from baselines import logger +import baselines.common as common +from baselines.common import tf_util from baselines.acktr import kfac from baselines.acktr.utils import dense + class NeuralNetValueFunction(object): - def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 - X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations + def __init__(self, ob_dim, ac_dim): + """ + Create an MLP policy for a value function + + :param ob_dim: (int) Observation dimention + :param ac_dim: (int) action dimention + """ + obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') wd_dict = {} - h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) - vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, 
weight_loss_dict=wd_dict)[:,0] + layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) + layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) + vpred_n = dense(layer_2, 1, "hfinal", + weight_init=tf_util.normc_initializer(1.0), bias_init=0, + weight_loss_dict=wd_dict)[:, 0] sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) wd_loss = tf.get_collection("vf_losses", None) loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) - self._predict = U.function([X], vpred_n) - optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ - clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ - async=1, kfac_update=2, cold_iter=50, \ - weight_decay_dict=wd_dict, max_grad_norm=None) + + self._predict = tf_util.function([obs_ph], vpred_n) + + optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9, + clip_kl=0.3, epsilon=0.1, stats_decay=0.95, + async=1, kfac_update=2, cold_iter=50, + weight_decay_dict=wd_dict, max_grad_norm=None) vf_var_list = [] for var in tf.trainable_variables(): if "vf" in var.name: vf_var_list.append(var) update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) - self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 - U.initialize() # Initialize uninitialized TF variables - def _preproc(self, path): - l = pathlength(path) - al = np.arange(l).reshape(-1,1)/10.0 + self.do_update = tf_util.function([obs_ph, vtarg_n], update_op) # pylint: disable=E1101 + tf_util.initialize() # Initialize uninitialized TF variables + + @classmethod + def _preproc(cls, path): + """ + preprocess path + + :param path: ({TensorFlow Tensor}) the history of the network + :return: ([TensorFlow Tensor]) processed input + """ + length = path["reward"].shape[0] + # used to be named 'al', unfortunalty we cant seem to know why it was called 'al' or what it means. + # Feel free to fix it if you know what is meant here. + # Could mean 'array_length', but even then we are not sure how this array is useful for the network. 
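Whatever 'al' originally abbreviated, the preprocessing in this method simply concatenates the observation features, the action-distribution parameters, a time index scaled by 1/10, and a constant bias column, which is why the placeholder above is sized ob_dim * 2 + ac_dim * 2 + 2. A small NumPy sketch of the resulting design matrix (illustrative names):

import numpy as np

def build_value_input(observations, action_dist):
    # observations: [T, ob_dim * 2], action_dist: [T, ac_dim * 2]
    horizon = observations.shape[0]
    time_feature = np.arange(horizon).reshape(-1, 1) / 10.0  # the column once named 'al'
    bias_column = np.ones((horizon, 1))
    return np.concatenate([observations, action_dist.astype('float32'),
                           time_feature, bias_column], axis=1)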
+ al_capone = np.arange(length).reshape(-1, 1) / 10.0 act = path["action_dist"].astype('float32') - X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1) - return X + return np.concatenate([path['observation'], act, al_capone, np.ones((length, 1))], axis=1) + def predict(self, path): + """ + predict value from history + + :param path: ({TensorFlow Tensor}) the history of the network + :return: ([TensorFlow Tensor]) value function output + """ return self._predict(self._preproc(path)) + def fit(self, paths, targvals): - X = np.concatenate([self._preproc(p) for p in paths]) - y = np.concatenate(targvals) - logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y)) - for _ in range(25): self.do_update(X, y) - logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y)) - -def pathlength(path): - return path["reward"].shape[0] + """ + fit paths to target values + + :param paths: ({TensorFlow Tensor}) the history of the network + :param targvals: ([TensorFlow Tensor]) the expected value + """ + _input = np.concatenate([self._preproc(p) for p in paths]) + targets = np.concatenate(targvals) + logger.record_tabular("EVBefore", common.explained_variance(self._predict(_input), targets)) + for _ in range(25): + self.do_update(_input, targets) + logger.record_tabular("EVAfter", common.explained_variance(self._predict(_input), targets)) diff --git a/baselines/bench/__init__.py b/baselines/bench/__init__.py index 4fd3874b39..2ef5aaa071 100644 --- a/baselines/bench/__init__.py +++ b/baselines/bench/__init__.py @@ -1,2 +1 @@ -from baselines.bench.benchmarks import * -from baselines.bench.monitor import * \ No newline at end of file +from baselines.bench.monitor import Monitor, load_results diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index a5a35f831a..298edd23b6 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -1,10 +1,10 @@ import re -import os.path as osp import os + SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -_atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] -_atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] +_ATARI7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] +_ATARIEXPL7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] _BENCHMARKS = [] @@ -12,39 +12,67 @@ def register_benchmark(benchmark): - for b in _BENCHMARKS: - if b['name'] == benchmark['name']: - raise ValueError('Benchmark with name %s already registered!' % b['name']) + """ + Register an OpenAI gym environment + + :param benchmark: (dict) Containes the name, description and tasks of the environment you wish to register + """ + for bench in _BENCHMARKS: + if bench['name'] == benchmark['name']: + raise ValueError('Benchmark with name %s already registered!' 
% bench['name']) # automatically add a description if it is not present if 'tasks' in benchmark: - for t in benchmark['tasks']: - if 'desc' not in t: - t['desc'] = remove_version_re.sub('', t['env_id']) + for task in benchmark['tasks']: + if 'desc' not in task: + task['desc'] = remove_version_re.sub('', task['env_id']) _BENCHMARKS.append(benchmark) def list_benchmarks(): + """ + Retuns a list of all the benchmark dictionaries registed by this module + + :return: ([dict]) the benchmarks + """ return [b['name'] for b in _BENCHMARKS] def get_benchmark(benchmark_name): - for b in _BENCHMARKS: - if b['name'] == benchmark_name: - return b + """ + Returns the registered benchmark of the same name, will raise a ValueError if the name is not present + + :param benchmark_name: (str) the name of the benchmark you wish to lookup + :return: (dict) the benchmark dictionarie + """ + for bench in _BENCHMARKS: + if bench['name'] == benchmark_name: + return bench raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) def get_task(benchmark, env_id): - """Get a task by env_id. Return None if the benchmark doesn't have the env""" + """ + Get a task by env_id. Return None if the benchmark doesn't have the env. + + :param benchmark: (dict) the benchmark you wish to look in + :param env_id: (str) the environment id you want to find + :return: (dict) the task + """ return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) -def find_task_for_env_id_in_any_benchmark(env_id): - for bm in _BENCHMARKS: - for task in bm["tasks"]: +def find_task_in_benchmarks(env_id): + """ + Get the first task and benchmark, that has the corresponding environment id + + :param env_id: (str) the environment id you want to find + :return: (dict, dict) the benchmark and task dictionaries + """ + for bench in _BENCHMARKS: + for task in bench["tasks"]: if task["env_id"] == env_id: - return bm, task + return bench, task return None, None @@ -53,38 +81,42 @@ def find_task_for_env_id_in_any_benchmark(env_id): register_benchmark({ 'name': 'Atari50M', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} + for _game in _ATARI7] }) register_benchmark({ 'name': 'Atari10M', 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARI7] }) register_benchmark({ 'name': 'Atari1Hr', 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} + for _game in _ATARI7] }) register_benchmark({ 'name': 'AtariExploration10M', 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARIEXPL7] }) # MuJoCo -_mujocosmall = [ +_MUJOCO_SMALL = [ 'InvertedDoublePendulum-v2', 'InvertedPendulum-v2', 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2', 'Reacher-v2', 'Swimmer-v2'] register_benchmark({ 'name': 'Mujoco1M', 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', - 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] + 'tasks': [{'env_id': _envid, 'trials': 3, 'num_timesteps': int(1e6)} for _envid in _MUJOCO_SMALL] }) register_benchmark({ 'name': 'MujocoWalkers', @@ -121,7 +153,7 @@ def find_task_for_env_id_in_any_benchmark(env_id): # Other -_atari50 = [ # actually 47 +_ATARI50 = [ # actually 47 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', @@ -137,7 +169,8 @@ def find_task_for_env_id_in_any_benchmark(env_id): register_benchmark({ 'name': 'Atari50_10M', 'description': '47 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', - 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] + 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} + for _game in _ATARI50] }) # HER DDPG @@ -147,4 +180,3 @@ def find_task_for_env_id_in_any_benchmark(env_id): 'description': 'Smoke-test only benchmark of HER', 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}] }) - diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index 0da1b4f878..91b6ebecdd 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -1,35 +1,48 @@ __all__ = ['Monitor', 'get_monitor_files', 'load_results'] -import gym -from gym.core import Wrapper +import os import time -from glob import glob import csv -import os.path as osp import json -import numpy as np +import uuid +from glob import glob + +import gym +from gym.core import Wrapper +import pandas + class Monitor(Wrapper): EXT = "monitor.csv" - f = None + file_handler = None def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): + """ + A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data. 
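In practice the wrapper described here is used by pointing it at a log location and stepping the wrapped environment as usual; load_results (defined further down in this file) can then read the episode statistics back as a pandas DataFrame. A minimal usage sketch, with an assumed log directory and environment id, in the spirit of test_monitor below:

import os
import uuid
import gym
from baselines.bench import Monitor, load_results

log_dir = "/tmp/monitor_demo_%s" % uuid.uuid4()
os.makedirs(log_dir)
env = Monitor(gym.make("CartPole-v1"), os.path.join(log_dir, "run"), allow_early_resets=True)
obs = env.reset()
for _ in range(1000):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()          # info["episode"] held the episode summary here
env.close()

results = load_results(log_dir)    # pandas DataFrame with 'r', 'l', 't' columns
print(results[["r", "l", "t"]].tail())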
+ + :param env: (Gym environment) The environment + :param filename: (str) the location to save a log file, can be None for no log + :param allow_early_resets: (bool) allows the reset of the environment before it is done + :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset + :param info_keywords: (tuple) extra information to log, from the information return of environment.step + """ Wrapper.__init__(self, env=env) - self.tstart = time.time() + self.t_start = time.time() if filename is None: - self.f = None + self.file_handler = None self.logger = None else: if not filename.endswith(Monitor.EXT): - if osp.isdir(filename): - filename = osp.join(filename, Monitor.EXT) + if os.path.isdir(filename): + filename = os.path.join(filename, Monitor.EXT) else: filename = filename + "." + Monitor.EXT - self.f = open(filename, "wt") - self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id})) - self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords) + self.file_handler = open(filename, "wt") + self.file_handler.write('#%s\n' % json.dumps({"t_start": self.t_start, 'env_id': env.spec and env.spec.id})) + self.logger = csv.DictWriter(self.file_handler, + fieldnames=('r', 'l', 't') + reset_keywords + info_keywords) self.logger.writeheader() - self.f.flush() + self.file_handler.flush() self.reset_keywords = reset_keywords self.info_keywords = info_keywords @@ -40,103 +53,159 @@ def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), i self.episode_lengths = [] self.episode_times = [] self.total_steps = 0 - self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() + self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() def reset(self, **kwargs): + """ + Calls the Gym environment reset. Can only be called if the environment is over, or if allow_early_resets is True + + :param kwargs: Extra keywords saved for the next episode. only if defined by reset_keywords + :return: ([int] or [float]) the first observation of the environment + """ if not self.allow_early_resets and not self.needs_reset: - raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") + raise RuntimeError("Tried to reset an environment before done. 
If you want to allow early resets, " + "wrap your env with Monitor(env, path, allow_early_resets=True)") self.rewards = [] self.needs_reset = False - for k in self.reset_keywords: - v = kwargs.get(k) - if v is None: - raise ValueError('Expected you to pass kwarg %s into reset'%k) - self.current_reset_info[k] = v + for key in self.reset_keywords: + value = kwargs.get(key) + if value is None: + raise ValueError('Expected you to pass kwarg %s into reset' % key) + self.current_reset_info[key] = value return self.env.reset(**kwargs) def step(self, action): + """ + Step the environment with the given action + + :param action: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ if self.needs_reset: raise RuntimeError("Tried to step environment that needs reset") - ob, rew, done, info = self.env.step(action) - self.rewards.append(rew) + observation, reward, done, info = self.env.step(action) + self.rewards.append(reward) if done: self.needs_reset = True - eprew = sum(self.rewards) + ep_rew = sum(self.rewards) eplen = len(self.rewards) - epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} - for k in self.info_keywords: - epinfo[k] = info[k] - self.episode_rewards.append(eprew) + ep_info = {"r": round(ep_rew, 6), "l": eplen, "t": round(time.time() - self.t_start, 6)} + for key in self.info_keywords: + ep_info[key] = info[key] + self.episode_rewards.append(ep_rew) self.episode_lengths.append(eplen) - self.episode_times.append(time.time() - self.tstart) - epinfo.update(self.current_reset_info) + self.episode_times.append(time.time() - self.t_start) + ep_info.update(self.current_reset_info) if self.logger: - self.logger.writerow(epinfo) - self.f.flush() - info['episode'] = epinfo + self.logger.writerow(ep_info) + self.file_handler.flush() + info['episode'] = ep_info self.total_steps += 1 - return (ob, rew, done, info) + return observation, reward, done, info def close(self): - if self.f is not None: - self.f.close() + """ + Closes the environment + """ + if self.file_handler is not None: + self.file_handler.close() def get_total_steps(self): + """ + Returns the total number of timesteps + + :return: (int) + """ return self.total_steps def get_episode_rewards(self): + """ + Returns the rewards of all the episodes + + :return: ([float]) + """ return self.episode_rewards def get_episode_lengths(self): + """ + Returns the number of timesteps of all the episodes + + :return: ([int]) + """ return self.episode_lengths def get_episode_times(self): + """ + Returns the runtime in seconds of all the episodes + + :return: ([float]) + """ return self.episode_times + class LoadMonitorResultsError(Exception): + """ + Raised when loading the monitor log fails. 
+ """ pass -def get_monitor_files(dir): - return glob(osp.join(dir, "*" + Monitor.EXT)) -def load_results(dir): - import pandas - monitor_files = ( - glob(osp.join(dir, "*monitor.json")) + - glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files +def get_monitor_files(path): + """ + get all the monitor files in the given path + + :param path: (str) the logging folder + :return: ([str]) the log files + """ + return glob(os.path.join(path, "*" + Monitor.EXT)) + + +def load_results(path): + """ + Load results from a given file + + :param path: (str) the path to the log file + :return: (Pandas DataFrame) the logged data + """ + # get both csv and (old) json files + monitor_files = (glob(os.path.join(path, "*monitor.json")) + glob(os.path.join(path, "*monitor.csv"))) if not monitor_files: - raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) - dfs = [] + raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) + data_frames = [] headers = [] - for fname in monitor_files: - with open(fname, 'rt') as fh: - if fname.endswith('csv'): - firstline = fh.readline() - assert firstline[0] == '#' - header = json.loads(firstline[1:]) - df = pandas.read_csv(fh, index_col=None) + for file_name in monitor_files: + with open(file_name, 'rt') as file_handler: + if file_name.endswith('csv'): + first_line = file_handler.readline() + assert first_line[0] == '#' + header = json.loads(first_line[1:]) + data_frame = pandas.read_csv(file_handler, index_col=None) headers.append(header) - elif fname.endswith('json'): # Deprecated json format + elif file_name.endswith('json'): # Deprecated json format episodes = [] - lines = fh.readlines() + lines = file_handler.readlines() header = json.loads(lines[0]) headers.append(header) for line in lines[1:]: episode = json.loads(line) episodes.append(episode) - df = pandas.DataFrame(episodes) + data_frame = pandas.DataFrame(episodes) else: assert 0, 'unreachable' - df['t'] += header['t_start'] - dfs.append(df) - df = pandas.concat(dfs) - df.sort_values('t', inplace=True) - df.reset_index(inplace=True) - df['t'] -= min(header['t_start'] for header in headers) - df.headers = headers # HACK to preserve backwards compatibility - return df + data_frame['t'] += header['t_start'] + data_frames.append(data_frame) + data_frame = pandas.concat(data_frames) + data_frame.sort_values('t', inplace=True) + data_frame.reset_index(inplace=True) + data_frame['t'] -= min(header['t_start'] for header in headers) + data_frame.headers = headers # HACK to preserve backwards compatibility + return data_frame + def test_monitor(): + """ + test the monitor wrapper + """ env = gym.make("CartPole-v1") env.seed(0) mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() @@ -147,15 +216,15 @@ def test_monitor(): if done: menv.reset() - f = open(mon_file, 'rt') + file_handler = open(mon_file, 'rt') - firstline = f.readline() + firstline = file_handler.readline() assert firstline.startswith('#') metadata = json.loads(firstline[1:]) assert metadata['env_id'] == "CartPole-v1" - assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" + assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" - last_logline = pandas.read_csv(f, index_col=None) + last_logline = pandas.read_csv(file_handler, index_col=None) assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" - f.close() - 
os.remove(mon_file) \ No newline at end of file + file_handler.close() + os.remove(mon_file) diff --git a/baselines/common/__init__.py b/baselines/common/__init__.py index 0834b36492..508c62e135 100644 --- a/baselines/common/__init__.py +++ b/baselines/common/__init__.py @@ -1,5 +1,7 @@ # flake8: noqa F403 -from baselines.common.console_util import * +from baselines.common.console_util import fmt_row, fmt_item, colorize from baselines.common.dataset import Dataset -from baselines.common.math_util import * -from baselines.common.misc_util import * +from baselines.common.math_util import discount, discount_with_boundaries, explained_variance, explained_variance_2d,\ + flatten_arrays, unflatten_vector +from baselines.common.misc_util import zipsame, unpack, EzPickle, set_global_seeds, pretty_eta, RunningAvg,\ + boolean_flag, get_wrapper_by_name, relatively_safe_pickle_dump, pickle_load diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 2aefad78cf..666f08a7ad 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -1,14 +1,20 @@ -import numpy as np from collections import deque + +import numpy as np import gym from gym import spaces import cv2 cv2.ocl.setUseOpenCL(False) + class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. + """ + Sample initial states by taking random number of no-ops on reset. No-op is assumed to be action 0. + + :param env: (Gym Environment) the environment to wrap + :param noop_max: (int) the maximum value of no-ops to run """ gym.Wrapper.__init__(self, env) self.noop_max = noop_max @@ -17,12 +23,11 @@ def __init__(self, env, noop_max=30): assert env.unwrapped.get_action_meanings()[0] == 'NOOP' def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset(**kwargs) if self.override_num_noops is not None: noops = self.override_num_noops else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) assert noops > 0 obs = None for _ in range(noops): @@ -31,12 +36,17 @@ def reset(self, **kwargs): obs = self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class FireResetEnv(gym.Wrapper): def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" + """ + Take action on reset for environments that are fixed until firing. + + :param env: (Gym Environment) the environment to wrap + """ gym.Wrapper.__init__(self, env) assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 @@ -51,17 +61,21 @@ def reset(self, **kwargs): self.env.reset(**kwargs) return obs - def step(self, ac): - return self.env.step(ac) + def step(self, action): + return self.env.step(action) + class EpisodicLifeEnv(gym.Wrapper): def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. + """ + Make end-of-life == end-of-episode, but only reset on true game over. Done by DeepMind for the DQN and co. since it helps value estimation. 
+ + :param env: (Gym Environment) the environment to wrap """ gym.Wrapper.__init__(self, env) self.lives = 0 - self.was_real_done = True + self.was_real_done = True def step(self, action): obs, reward, done, info = self.env.step(action) @@ -69,7 +83,7 @@ def step(self, action): # check current lives, make loss of life terminal, # then update lives to handle bonus lives lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: + if 0 < lives < self.lives: # for Qbert sometimes we stay in lives == 0 condtion for a few frames # so its important to keep lives > 0, so that we only reset once # the environment advertises done. @@ -78,9 +92,13 @@ def step(self, action): return obs, reward, done, info def reset(self, **kwargs): - """Reset only when lives are exhausted. + """ + Calls the Gym environment reset, only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. + + :param kwargs: Extra keywords passed to env.reset() call + :return: ([int] or [float]) the first observation of the environment """ if self.was_real_done: obs = self.env.reset(**kwargs) @@ -90,22 +108,36 @@ def reset(self, **kwargs): self.lives = self.env.unwrapped.ale.lives() return obs + class MaxAndSkipEnv(gym.Wrapper): def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" + """ + Return only every `skip`-th frame (frameskipping) + + :param env: (Gym Environment) the environment + :param skip: (int) number of `skip`-th frame + """ gym.Wrapper.__init__(self, env) # most recent raw observations (for max pooling across time steps) self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) - self._skip = skip + self._skip = skip def step(self, action): - """Repeat action, sum reward, and max over last observations.""" + """ + Step the environment with the given action + Repeat action, sum reward, and max over last observations. + + :param action: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ total_reward = 0.0 done = None for i in range(self._skip): obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: self._obs_buffer[0] = obs - if i == self._skip - 1: self._obs_buffer[1] = obs + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs total_reward += reward if done: break @@ -118,59 +150,84 @@ def step(self, action): def reset(self, **kwargs): return self.env.reset(**kwargs) + class ClipRewardEnv(gym.RewardWrapper): def __init__(self, env): + """ + clips the reward to {+1, 0, -1} by its sign. + + :param env: (Gym Environment) the environment + """ gym.RewardWrapper.__init__(self, env) def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" + """ + Bin reward to {+1, 0, -1} by its sign. + + :param reward: (float) + """ return np.sign(reward) + class WarpFrame(gym.ObservationWrapper): def __init__(self, env): - """Warp frames to 84x84 as done in the Nature paper and later work.""" + """ + Warp frames to 84x84 as done in the Nature paper and later work. 
+ + :param env: (Gym Environment) the environment + """ gym.ObservationWrapper.__init__(self, env) self.width = 84 self.height = 84 - self.observation_space = spaces.Box(low=0, high=255, - shape=(self.height, self.width, 1), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(self.height, self.width, 1), dtype=np.uint8) def observation(self, frame): + """ + returns the current observation from a frame + + :param frame: ([int] or [float]) environment frame + :return: ([int] or [float]) the observation + """ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) return frame[:, :, None] + class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames. + def __init__(self, env, n_frames): + """Stack n_frames last frames. Returns lazy array, which is much more memory efficient. See Also -------- baselines.common.atari_wrappers.LazyFrames + + :param env: (Gym Environment) the environment + :param n_frames: (int) the number of frames to stack """ gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) + self.n_frames = n_frames + self.frames = deque([], maxlen=n_frames) shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * n_frames), dtype=np.uint8) def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) + obs = self.env.reset() + for _ in range(self.n_frames): + self.frames.append(obs) return self._get_ob() def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) + obs, reward, done, info = self.env.step(action) + self.frames.append(obs) return self._get_ob(), reward, done, info def _get_ob(self): - assert len(self.frames) == self.k + assert len(self.frames) == self.n_frames return LazyFrames(list(self.frames)) + class ScaledFloatFrame(gym.ObservationWrapper): def __init__(self, env): gym.ObservationWrapper.__init__(self, env) @@ -180,15 +237,18 @@ def observation(self, observation): # with smaller replay buffers only. return np.array(observation).astype(np.float32) / 255.0 + class LazyFrames(object): def __init__(self, frames): - """This object ensures that common frames between the observations are only stored once. + """ + This object ensures that common frames between the observations are only stored once. It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay buffers. This object should only be converted to numpy array before being passed to the model. - You'd not believe how complex the previous solution was.""" + :param frames: ([int] or [float]) environment frames + """ self._frames = frames self._out = None @@ -210,15 +270,31 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] + def make_atari(env_id): + """ + Create a wrapped atari envrionment + + :param env_id: (str) the environment ID + :return: (Gym Environment) the wrapped atari environment + """ env = gym.make(env_id) assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) return env + def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): - """Configure environment for DeepMind-style Atari. + """ + Configure environment for DeepMind-style Atari. 
+ + :param env: (Gym Environment) the atari environment + :param episode_life: (bool) wrap the episode life wrapper + :param clip_rewards: (bool) wrap the reward clipping wrapper + :param frame_stack: (bool) wrap the frame stacking wrapper + :param scale: (bool) wrap the scaling observation wrapper + :return: (Gym Environment) the wrapped atari environment """ if episode_life: env = EpisodicLifeEnv(env) @@ -232,4 +308,3 @@ def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, if frame_stack: env = FrameStack(env, 4) return env - diff --git a/baselines/common/cg.py b/baselines/common/cg.py index a913186666..15c0f9524d 100644 --- a/baselines/common/cg.py +++ b/baselines/common/cg.py @@ -1,34 +1,49 @@ import numpy as np -def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): + + +def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): """ - Demmel p 312 + conjugate gradient calculation (Ax = b), bases on + https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312 + + :param f_ax: (function) The function describing the Matrix A dot the vector x + (x being the input parameter of the function) + :param b_vec: (numpy float) vector b, where Ax = b + :param cg_iters: (int) the maximum number of iterations for converging + :param callback: (function) callback the values of x while converging + :param verbose: (bool) print extra information + :param residual_tol: (float) the break point if the residual is below this value + :return: (numpy float) vector x, where Ax = b """ - p = b.copy() - r = b.copy() - x = np.zeros_like(b) - rdotr = r.dot(r) + first_basis_vect = b_vec.copy() # the first basis vector + residual = b_vec.copy() # the residual + x_var = np.zeros_like(b_vec) # vector x, where Ax = b + residual_dot_residual = residual.dot(residual) # L2 norm of the residual - fmtstr = "%10i %10.3g %10.3g" - titlestr = "%10s %10s %10s" - if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) + fmt_str = "%10i %10.3g %10.3g" + title_str = "%10s %10s %10s" + if verbose: + print(title_str % ("iter", "residual norm", "soln norm")) for i in range(cg_iters): if callback is not None: - callback(x) - if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) - z = f_Ax(p) - v = rdotr / p.dot(z) - x += v*p - r -= v*z - newrdotr = r.dot(r) - mu = newrdotr/rdotr - p = r + mu*p + callback(x_var) + if verbose: + print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var))) + z_var = f_ax(first_basis_vect) + v_var = residual_dot_residual / first_basis_vect.dot(z_var) + x_var += v_var * first_basis_vect + residual -= v_var * z_var + new_residual_dot_residual = residual.dot(residual) + mu_val = new_residual_dot_residual / residual_dot_residual + first_basis_vect = residual + mu_val * first_basis_vect - rdotr = newrdotr - if rdotr < residual_tol: + residual_dot_residual = new_residual_dot_residual + if residual_dot_residual < residual_tol: break if callback is not None: - callback(x) - if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 - return x \ No newline at end of file + callback(x_var) + if verbose: + print(fmt_str % (i + 1, residual_dot_residual, np.linalg.norm(x_var))) + return x_var diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 5707695487..58b4d1a0b1 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -3,21 +3,33 @@ """ import os + from mpi4py import MPI import gym from gym.wrappers import 
FlattenDictWrapper + from baselines import logger from baselines.bench import Monitor from baselines.common import set_global_seeds from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv + def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): """ Create a wrapped, monitored SubprocVecEnv for Atari. + + :param env_id: (str) the environment ID + :param num_env: (int) the number of environment you wish to have in subprocesses + :param seed: (int) the inital seed for RNG + :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function + :param start_index: (int) start rank index + :return: (Gym Environment) The atari environment """ - if wrapper_kwargs is None: wrapper_kwargs = {} - def make_env(rank): # pylint: disable=C0111 + if wrapper_kwargs is None: + wrapper_kwargs = {} + + def make_env(rank): def _thunk(): env = make_atari(env_id) env.seed(seed + rank) @@ -27,9 +39,14 @@ def _thunk(): set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) + def make_mujoco_env(env_id, seed): """ Create a wrapped, monitored gym.Env for MuJoCo. + + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :return: (Gym Environment) The mujoco environment """ rank = MPI.COMM_WORLD.Get_rank() set_global_seeds(seed + 10000 * rank) @@ -38,9 +55,15 @@ def make_mujoco_env(env_id, seed): env.seed(seed) return env + def make_robotics_env(env_id, seed, rank=0): """ Create a wrapped, monitored gym.Env for MuJoCo. + + :param env_id: (str) the environment ID + :param seed: (int) the inital seed for RNG + :param rank: (int) the rank of the environment (for logging) + :return: (Gym Environment) The robotic environment """ set_global_seeds(seed) env = gym.make(env_id) @@ -51,26 +74,35 @@ def make_robotics_env(env_id, seed, rank=0): env.seed(seed) return env + def arg_parser(): """ Create an empty argparse.ArgumentParser. + + :return: (ArgumentParser) """ import argparse return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + def atari_arg_parser(): """ Create an argparse.ArgumentParser for run_atari.py. + + :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) - parser.add_argument('--num-timesteps', type=int, default=int(10e6)) + parser.add_argument('--num-timesteps', type=int, default=int(1e7)) return parser + def mujoco_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. + + :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') @@ -79,9 +111,12 @@ def mujoco_arg_parser(): parser.add_argument('--play', default=False, action='store_true') return parser + def robotics_arg_parser(): """ Create an argparse.ArgumentParser for run_mujoco.py. 
+ + :return: (ArgumentParser) parser {'--env': 'FetchReach-v0', '--seed': 0, '--num-timesteps': int(1e6)} """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') diff --git a/baselines/common/console_util.py b/baselines/common/console_util.py index 8adc3f83ad..c8b4c94cb2 100644 --- a/baselines/common/console_util.py +++ b/baselines/common/console_util.py @@ -1,31 +1,51 @@ from __future__ import print_function -from contextlib import contextmanager + import numpy as np -import time + # ================================================================ # Misc # ================================================================ + def fmt_row(width, row, header=False): + """ + fits a list of items to at least a certain length + + :param width: (int) the minimum width of the string + :param row: ([Any]) a list of object you wish to get the string representation + :param header: (bool) whether or not to return the string as a header + :return: (str) the string representation of all the elements in 'row', of length >= 'width' + """ out = " | ".join(fmt_item(x, width) for x in row) - if header: out = out + "\n" + "-"*len(out) + if header: + out = out + "\n" + "-" * len(out) return out -def fmt_item(x, l): - if isinstance(x, np.ndarray): - assert x.ndim==0 - x = x.item() - if isinstance(x, (float, np.float32, np.float64)): - v = abs(x) - if (v < 1e-4 or v > 1e+4) and v > 0: - rep = "%7.2e" % x + +def fmt_item(item, min_width): + """ + fits items to a given string length + + :param item: (Any) the item you wish to get the string representation + :param min_width: (int) the minimum width of the string + :return: (str) the string representation of 'x' of length >= 'l' + """ + if isinstance(item, np.ndarray): + assert item.ndim == 0 + item = item.item() + if isinstance(item, (float, np.float32, np.float64)): + value = abs(item) + if (value < 1e-4 or value > 1e+4) and value > 0: + rep = "%7.2e" % item else: - rep = "%7.5f" % x - else: rep = str(x) - return " "*(l - len(rep)) + rep + rep = "%7.5f" % item + else: + rep = str(item) + return " " * (min_width - len(rep)) + rep -color2num = dict( + +COLOR_TO_NUM = dict( gray=30, red=31, green=32, @@ -37,23 +57,22 @@ def fmt_item(x, l): crimson=38 ) + def colorize(string, color, bold=False, highlight=False): + """ + Colorize, bold and/or highlight a string for terminal print + + :param string: (str) input string + :param color: (str) the color, the lookup table is the dict at console_util.color2num + :param bold: (bool) if the string should be bold or not + :param highlight: (bool) if the string should be highlighted or not + :return: (str) the stylized output string + """ attr = [] - num = color2num[color] - if highlight: num += 10 + num = COLOR_TO_NUM[color] + if highlight: + num += 10 attr.append(str(num)) - if bold: attr.append('1') + if bold: + attr.append('1') return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) - - -MESSAGE_DEPTH = 0 - -@contextmanager -def timed(msg): - global MESSAGE_DEPTH #pylint: disable=W0603 - print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) - tstart = time.time() - MESSAGE_DEPTH += 1 - yield - MESSAGE_DEPTH -= 1 - print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) diff --git a/baselines/common/dataset.py b/baselines/common/dataset.py index 41a38c8af6..1f951b3b02 100644 --- a/baselines/common/dataset.py +++ b/baselines/common/dataset.py @@ -1,31 +1,49 @@ import numpy as np + class Dataset(object): def 
__init__(self, data_map, deterministic=False, shuffle=True): + """ + Data loader that handles batches and shuffling. + WARNING: this will alter the given data_map ordering, as dicts are mutable + + :param data_map: (dict) the input data, where every column is a key + :param deterministic: (bool) disables the shuffle function + :param shuffle: (bool) enable auto shuffle + """ self.data_map = data_map self.deterministic = deterministic self.enable_shuffle = shuffle - self.n = next(iter(data_map.values())).shape[0] + self.n_samples = next(iter(data_map.values())).shape[0] self._next_id = 0 self.shuffle() def shuffle(self): + """ + shuffles the data_map + """ if self.deterministic: return - perm = np.arange(self.n) + perm = np.arange(self.n_samples) np.random.shuffle(perm) for key in self.data_map: self.data_map[key] = self.data_map[key][perm] - self._next_id = 0 - def next_batch(self, batch_size): - if self._next_id >= self.n and self.enable_shuffle: - self.shuffle() + """ + returns a batch of data of a given size + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self._next_id >= self.n_samples: + self._next_id = 0 + if self.enable_shuffle: + self.shuffle() cur_id = self._next_id - cur_batch_size = min(batch_size, self.n - self._next_id) + cur_batch_size = min(batch_size, self.n_samples - self._next_id) self._next_id += cur_batch_size data_map = dict() @@ -34,13 +52,27 @@ def next_batch(self, batch_size): return data_map def iterate_once(self, batch_size): - if self.enable_shuffle: self.shuffle() + """ + generator that iterates over the dataset + + :param batch_size: (int) the size of the batch + :return: (dict) a batch of the input data of size 'batch_size' + """ + if self.enable_shuffle: + self.shuffle() - while self._next_id <= self.n - batch_size: + while self._next_id <= self.n_samples - batch_size: yield self.next_batch(batch_size) self._next_id = 0 def subset(self, num_elements, deterministic=True): + """ + Return a subset of the current dataset + + :param num_elements: (int) the number of element you wish to have in the subset + :param deterministic: (bool) disables the shuffle function + :return: (Dataset) a new subset of the current Dataset object + """ data_map = dict() for key in self.data_map: data_map[key] = self.data_map[key][:num_elements] @@ -48,13 +80,24 @@ def subset(self, num_elements, deterministic=True): def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): + """ + Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. 
+ + :param arrays: (tuple) a tuple of arrays + :param num_batches: (int) the number of batches, must be None is batch_size is defined + :param batch_size: (int) the size of the batch, must be None is num_batches is defined + :param shuffle: (bool) enable auto shuffle + :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size + :return: (tuples) a tuple of a batch of the arrays + """ assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' arrays = tuple(map(np.asarray, arrays)) - n = arrays[0].shape[0] - assert all(a.shape[0] == n for a in arrays[1:]) - inds = np.arange(n) - if shuffle: np.random.shuffle(inds) - sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches + n_samples = arrays[0].shape[0] + assert all(a.shape[0] == n_samples for a in arrays[1:]) + inds = np.arange(n_samples) + if shuffle: + np.random.shuffle(inds) + sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches for batch_inds in np.array_split(inds, sections): if include_final_partial_batch or len(batch_inds) == batch_size: yield tuple(a[batch_inds] for a in arrays) diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 8a57c37605..18202232f9 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -1,309 +1,493 @@ import tensorflow as tf -import numpy as np -import baselines.common.tf_util as U -from baselines.a2c.utils import fc from tensorflow.python.ops import math_ops +import numpy as np +from gym import spaces + +from baselines.a2c.utils import linear + -class Pd(object): +class ProbabilityDistribution(object): """ A particular probability distribution """ + def flatparam(self): + """ + Return the direct probabilities + + :return: ([float]) the probabilites + """ raise NotImplementedError + def mode(self): + """ + Returns the index of the highest probability + + :return: (int) the max index of the probabilites + """ raise NotImplementedError + def neglogp(self, x): + """ + returns the of the negative log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The negative log likelihood of the distribution + """ # Usually it's easier to define the negative logprob raise NotImplementedError + def kl(self, other): + """ + Calculates the Kullback-Leiber divergence from the given probabilty distribution + + :param other: ([float]) the distibution to compare with + :return: (float) the KL divergence of the two distributions + """ raise NotImplementedError + def entropy(self): + """ + Returns shannon's entropy of the probability + + :return: (float) the entropy + """ raise NotImplementedError + def sample(self): + """ + Sample an index from the probabilty distribution + + :return: (int) the sampled index + """ raise NotImplementedError + def logp(self, x): + """ + returns the of the log likelihood + + :param x: (str) the labels of each index + :return: ([float]) The log likelihood of the distribution + """ return - self.neglogp(x) -class PdType(object): + +class ProbabilityDistributionType(object): """ Parametrized family of probability distributions """ - def pdclass(self): + + def probability_distribution_class(self): + """ + returns the ProbabilityDistribution class of this type + + :return: (Type ProbabilityDistribution) the probability distribution class associated + """ raise NotImplementedError - def pdfromflat(self, flat): - return self.pdclass()(flat) - def 
pdfromlatent(self, latent_vector): + + def proba_distribution_from_flat(self, flat): + """ + returns the probability distribution from flat probabilities + + :param flat: ([float]) the flat probabilities + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + return self.probability_distribution_class()(flat) + + def proba_distribution_from_latent(self, latent_vector): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ raise NotImplementedError + def param_shape(self): + """ + returns the shape of the input parameters + + :return: ([int]) the shape + """ raise NotImplementedError + def sample_shape(self): + """ + returns the shape of the sampling + + :return: ([int]) the shape + """ raise NotImplementedError + def sample_dtype(self): + """ + returns the type of the sampling + + :return: (type) the type + """ raise NotImplementedError def param_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) + """ + returns the TensorFlow placeholder for the input parameters + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name) + def sample_placeholder(self, prepend_shape, name=None): - return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) + """ + returns the TensorFlow placeholder for the sampling + + :param prepend_shape: ([int]) the prepend shape + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name) + -class CategoricalPdType(PdType): - def __init__(self, ncat): - self.ncat = ncat - def pdclass(self): - return CategoricalPd - def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): - pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias) - return self.pdfromflat(pdparam), pdparam +class CategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_cat): + """ + The probability distribution type for categorical input + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def probability_distribution_class(self): + return CategoricalProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :param init_scale: (float) the inital scale of the distribution + :param init_bias: (float) the inital bias of the distribution + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + pdparam = linear(latent_vector, 'pi', self.n_cat, init_scale=init_scale, init_bias=init_bias) + return self.proba_distribution_from_flat(pdparam), pdparam def param_shape(self): - return [self.ncat] + return [self.n_cat] + def sample_shape(self): return [] + def sample_dtype(self): return tf.int32 -class MultiCategoricalPdType(PdType): - def __init__(self, nvec): - self.ncats = nvec - def pdclass(self): - return MultiCategoricalPd - def pdfromflat(self, 
flat): - return MultiCategoricalPd(self.ncats, flat) +class MultiCategoricalProbabilityDistributionType(ProbabilityDistributionType): + def __init__(self, n_vec): + """ + The probability distribution type for multiple categorical input + + :param n_vec: (int) the number of vectors + """ + self.n_cats = n_vec + + def probability_distribution_class(self): + return MultiCategoricalProbabilityDistribution + + def proba_distribution_from_flat(self, flat): + return MultiCategoricalProbabilityDistribution(self.n_cats, flat) + + def proba_distribution_from_latent(self, latent_vector): + raise NotImplementedError + def param_shape(self): - return [sum(self.ncats)] + return [sum(self.n_cats)] + def sample_shape(self): - return [len(self.ncats)] + return [len(self.n_cats)] + def sample_dtype(self): return tf.int32 -class DiagGaussianPdType(PdType): + +class DiagGaussianProbabilityDistributionType(ProbabilityDistributionType): def __init__(self, size): + """ + The probability distribution type for multivariate gaussian input + + :param size: (int) the number of dimentions of the multivariate gaussian + """ self.size = size - def pdclass(self): - return DiagGaussianPd - def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): - mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) + def probability_distribution_class(self): + return DiagGaussianProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector, init_scale=1.0, init_bias=0.0): + """ + returns the probability distribution from latent values + + :param latent_vector: ([float]) the latent values + :param init_scale: (float) the inital scale of the distribution + :param init_bias: (float) the inital bias of the distribution + :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated + """ + mean = linear(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) - return self.pdfromflat(pdparam), mean + return self.proba_distribution_from_flat(pdparam), mean def param_shape(self): - return [2*self.size] + return [2 * self.size] + def sample_shape(self): return [self.size] + def sample_dtype(self): return tf.float32 -class BernoulliPdType(PdType): + +class BernoulliProbabilityDistributionType(ProbabilityDistributionType): def __init__(self, size): + """ + The probability distribution type for bernoulli input + + :param size: (int) the number of dimentions of the bernoulli distribution + """ self.size = size - def pdclass(self): - return BernoulliPd + + def probability_distribution_class(self): + return BernoulliProbabilityDistribution + + def proba_distribution_from_latent(self, latent_vector): + raise NotImplementedError + def param_shape(self): return [self.size] + def sample_shape(self): return [self.size] + def sample_dtype(self): return tf.int32 -# WRONG SECOND DERIVATIVES -# class CategoricalPd(Pd): -# def __init__(self, logits): -# self.logits = logits -# self.ps = tf.nn.softmax(logits) -# @classmethod -# def fromflat(cls, flat): -# return cls(flat) -# def flatparam(self): -# return self.logits -# def mode(self): -# return U.argmax(self.logits, axis=-1) -# def logp(self, x): -# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) -# def kl(self, other): -# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ -# - 
tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def entropy(self): -# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) -# def sample(self): -# u = tf.random_uniform(tf.shape(self.logits)) -# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) - -class CategoricalPd(Pd): + +class CategoricalProbabilityDistribution(ProbabilityDistribution): def __init__(self, logits): + """ + Probability distributions from categorical input + + :param logits: ([float]) the categorical logits input + """ self.logits = logits + def flatparam(self): return self.logits + def mode(self): return tf.argmax(self.logits, axis=-1) + def neglogp(self, x): - # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) + # return tf.nn. (logits=self.logits, labels=x) # Note: we can't use sparse_softmax_cross_entropy_with_logits because # the implementation does not allow second-order derivatives... one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) return tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=one_hot_actions) + def kl(self, other): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - a1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - z1 = tf.reduce_sum(ea1, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + a_1 = other.logits - tf.reduce_max(other.logits, axis=-1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + exp_a_1 = tf.exp(a_1) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keep_dims=True) + z_1 = tf.reduce_sum(exp_a_1, axis=-1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (a_0 - tf.log(z_0) - a_1 + tf.log(z_1)), axis=-1) + def entropy(self): - a0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) - ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, axis=-1, keep_dims=True) - p0 = ea0 / z0 - return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1) + a_0 = self.logits - tf.reduce_max(self.logits, axis=-1, keep_dims=True) + exp_a_0 = tf.exp(a_0) + z_0 = tf.reduce_sum(exp_a_0, axis=-1, keep_dims=True) + p_0 = exp_a_0 / z_0 + return tf.reduce_sum(p_0 * (tf.log(z_0) - a_0), axis=-1) + def sample(self): - u = tf.random_uniform(tf.shape(self.logits)) - return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) + uniform = tf.random_uniform(tf.shape(self.logits)) + return tf.argmax(self.logits - tf.log(-tf.log(uniform)), axis=-1) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the categorical logits input + :return: (ProbabilityDistribution) the instance from the given categorical input + """ return cls(flat) -class MultiCategoricalPd(Pd): + +class MultiCategoricalProbabilityDistribution(ProbabilityDistribution): def __init__(self, nvec, flat): + """ + Probability distributions from multicategorical input + + :param nvec: (int) the number of categorical inputs + :param flat: ([float]) the categorical logits input + """ self.flat = flat - self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) + self.categoricals = list(map(CategoricalProbabilityDistribution, tf.split(flat, nvec, axis=-1))) + def flatparam(self): return self.flat + def mode(self): return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), 
tf.int32) + def neglogp(self, x): return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) + def kl(self, other): return tf.add_n([p.kl(q) for p, q in zip(self.categoricals, other.categoricals)]) + def entropy(self): return tf.add_n([p.entropy() for p in self.categoricals]) + def sample(self): return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new logits values + + :param flat: ([float]) the multi categorical logits input + :return: (ProbabilityDistribution) the instance from the given multi categorical input + """ raise NotImplementedError -class DiagGaussianPd(Pd): + +class DiagGaussianProbabilityDistribution(ProbabilityDistribution): def __init__(self, flat): + """ + Probability distributions from multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + """ self.flat = flat - mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) + mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) self.mean = mean self.logstd = logstd self.std = tf.exp(logstd) + def flatparam(self): return self.flat + def mode(self): return self.mean + def neglogp(self, x): return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ + tf.reduce_sum(self.logstd, axis=-1) + def kl(self, other): - assert isinstance(other, DiagGaussianPd) - return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) + assert isinstance(other, DiagGaussianProbabilityDistribution) + return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / + (2.0 * tf.square(other.std)) - 0.5, axis=-1) + def entropy(self): return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) + def sample(self): return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new multivariate gaussian input + + :param flat: ([float]) the multivariate gaussian input data + :return: (ProbabilityDistribution) the instance from the given multivariate gaussian input data + """ return cls(flat) -class BernoulliPd(Pd): + +class BernoulliProbabilityDistribution(ProbabilityDistribution): def __init__(self, logits): + """ + Probability distributions from bernoulli input + + :param logits: ([float]) the bernoulli input data + """ self.logits = logits - self.ps = tf.sigmoid(logits) + self.probabilities = tf.sigmoid(logits) + def flatparam(self): return self.logits + def mode(self): - return tf.round(self.ps) + return tf.round(self.probabilities) + def neglogp(self, x): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), + axis=-1) + def kl(self, other): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, + labels=self.probabilities), axis=-1) - \ + 
tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + def entropy(self): - return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) + return tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, + labels=self.probabilities), axis=-1) + def sample(self): - u = tf.random_uniform(tf.shape(self.ps)) - return tf.to_float(math_ops.less(u, self.ps)) + samples_from_uniform = tf.random_uniform(tf.shape(self.probabilities)) + return tf.to_float(math_ops.less(samples_from_uniform, self.probabilities)) + @classmethod def fromflat(cls, flat): + """ + Create an instance of this from new bernoulli input + + :param flat: ([float]) the bernoulli input data + :return: (ProbabilityDistribution) the instance from the given bernoulli input data + """ return cls(flat) -def make_pdtype(ac_space): - from gym import spaces + +def make_proba_dist_type(ac_space): + """ + return an instance of ProbabilityDistributionType for the correct type of action space + + :param ac_space: (Gym Space) the input action space + :return: (ProbabilityDistributionType) the approriate instance of a ProbabilityDistributionType + """ if isinstance(ac_space, spaces.Box): assert len(ac_space.shape) == 1 - return DiagGaussianPdType(ac_space.shape[0]) + return DiagGaussianProbabilityDistributionType(ac_space.shape[0]) elif isinstance(ac_space, spaces.Discrete): - return CategoricalPdType(ac_space.n) + return CategoricalProbabilityDistributionType(ac_space.n) elif isinstance(ac_space, spaces.MultiDiscrete): - return MultiCategoricalPdType(ac_space.nvec) + return MultiCategoricalProbabilityDistributionType(ac_space.nvec) elif isinstance(ac_space, spaces.MultiBinary): - return BernoulliPdType(ac_space.n) + return BernoulliProbabilityDistributionType(ac_space.n) else: raise NotImplementedError -def shape_el(v, i): - maybe = v.get_shape()[i] + +def shape_el(tensor, index): + """ + get the shape of a TensorFlow Tensor element + + :param tensor: (TensorFlow Tensor) the input tensor + :param index: (int) the element + :return: ([int]) the shape + """ + maybe = tensor.get_shape()[index] if maybe is not None: return maybe else: - return tf.shape(v)[i] - -@U.in_session -def test_probtypes(): - np.random.seed(0) - - pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) - diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 - validate_probtype(diag_gauss, pdparam_diag_gauss) - - pdparam_categorical = np.array([-.2, .3, .5]) - categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 - validate_probtype(categorical, pdparam_categorical) - - nvec = [1,2,3] - pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) - multicategorical = MultiCategoricalPdType(nvec) #pylint: disable=E1101 - validate_probtype(multicategorical, pdparam_multicategorical) - - pdparam_bernoulli = np.array([-.2, .3, .5]) - bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 - validate_probtype(bernoulli, pdparam_bernoulli) - - -def validate_probtype(probtype, pdparam): - N = 100000 - # Check to see if mean negative log likelihood == differential entropy - Mval = np.repeat(pdparam[None, :], N, axis=0) - M = probtype.param_placeholder([N]) - X = probtype.sample_placeholder([N]) - pd = probtype.pdfromflat(M) - calcloglik = U.function([X, M], pd.logp(X)) - calcent = U.function([M], pd.entropy()) - Xval = tf.get_default_session().run(pd.sample(), 
feed_dict={M:Mval}) - logliks = calcloglik(Xval, Mval) - entval_ll = - logliks.mean() #pylint: disable=E1101 - entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - entval = calcent(Mval).mean() #pylint: disable=E1101 - assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas - - # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] - M2 = probtype.param_placeholder([N]) - pd2 = probtype.pdfromflat(M2) - q = pdparam + np.random.randn(pdparam.size) * 0.1 - Mval2 = np.repeat(q[None, :], N, axis=0) - calckl = U.function([M, M2], pd.kl(pd2)) - klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 - logliks = calcloglik(Xval, Mval2) - klval_ll = - entval - logliks.mean() #pylint: disable=E1101 - klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 - assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas - print('ok on', probtype, pdparam) - + return tf.shape(tensor)[index] diff --git a/baselines/common/filters.py b/baselines/common/filters.py index 5ce019cd22..38d602004e 100644 --- a/baselines/common/filters.py +++ b/baselines/common/filters.py @@ -1,98 +1,211 @@ -from .running_stat import RunningStat from collections import deque + import numpy as np +from .running_stat import RunningStat + + class Filter(object): - def __call__(self, x, update=True): + """ + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): raise NotImplementedError + def reset(self): + """ + resets the filter + """ pass + def output_shape(self, input_space): + """ + returns the output shape + + :param input_space: (numpy int) + :return: (numpy int) output shape + """ + raise NotImplementedError + + class IdentityFilter(Filter): - def __call__(self, x, update=True): - return x + """ + A filter that implements an identity function + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr + + def output_shape(self, input_space): + return input_space.shape + class CompositionFilter(Filter): - def __init__(self, fs): - self.fs = fs - def __call__(self, x, update=True): - for f in self.fs: - x = f(x) - return x + def __init__(self, functions): + """ + A filter that implements a composition with other functions + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param functions: ([function]) composition of these functions and the input + """ + self.functions = functions + + def __call__(self, arr, update=True): + for func in self.functions: + arr = func(arr) + return arr + def output_shape(self, input_space): out = input_space.shape - for f in self.fs: - out = f.output_shape(out) + for func in self.functions: + out = func.output_shape(out) return out -class ZFilter(Filter): - """ - y = (x-mean)/std - using running estimates of mean,std - """ +class ZFilter(Filter): def __init__(self, shape, demean=True, destd=True, clip=10.0): + """ + A filter that implements a z-filter + y = (x-mean)/std + using running estimates of mean,std + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param shape: ([int]) the shape of the input + :param demean: (bool) filter mean + :param destd: (bool) filter standard deviation + :param clip: (float) clip filter absolute value to this value + """ self.demean = demean self.destd = destd self.clip = clip - self.rs = RunningStat(shape) + self.running_stat = RunningStat(shape) - def __call__(self, x, update=True): - if update: self.rs.push(x) + def __call__(self, arr, update=True): + if update: + self.running_stat.push(arr) if self.demean: - x = x - self.rs.mean + arr = arr - self.running_stat.mean if self.destd: - x = x / (self.rs.std+1e-8) + arr = arr / (self.running_stat.std + 1e-8) if self.clip: - x = np.clip(x, -self.clip, self.clip) - return x + arr = np.clip(arr, -self.clip, self.clip) + return arr + def output_shape(self, input_space): return input_space.shape + class AddClock(Filter): def __init__(self): + """ + A filter that appends a counter to the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ self.count = 0 + def reset(self): self.count = 0 - def __call__(self, x, update=True): - return np.append(x, self.count/100.0) + + def __call__(self, arr, update=True): + return np.append(arr, self.count / 100.0) + def output_shape(self, input_space): - return (input_space.shape[0]+1,) + return input_space.shape[0] + 1, + class FlattenFilter(Filter): - def __call__(self, x, update=True): - return x.ravel() + """ + A filter that flattens the input + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + """ + def __call__(self, arr, update=True): + return arr.ravel() + def output_shape(self, input_space): - return (int(np.prod(input_space.shape)),) + return int(np.prod(input_space.shape)), + class Ind2OneHotFilter(Filter): - def __init__(self, n): - self.n = n - def __call__(self, x, update=True): - out = np.zeros(self.n) - out[x] = 1 + def __init__(self, n_cat): + """ + A filter that turns indices to onehot encoding + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param n_cat: (int) the number of categories + """ + self.n_cat = n_cat + + def __call__(self, arr, update=True): + out = np.zeros(self.n_cat) + out[arr] = 1 return out + def output_shape(self, input_space): - return (input_space.n,) + return input_space.n, + class DivFilter(Filter): def __init__(self, divisor): + """ + A filter that divides the input from a value + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. + + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param divisor: (float) the number you want to divide by + """ self.divisor = divisor - def __call__(self, x, update=True): - return x / self.divisor + + def __call__(self, arr, update=True): + return arr / self.divisor + def output_shape(self, input_space): return input_space.shape + class StackFilter(Filter): def __init__(self, length): + """ + A filter that runs a stacking of a 'length' inputs + + takes a value 'x' (numpy Number), applies the filter, and returns the new value. 
+ + Can pass kwarg: 'update' (bool) if the filter can update from the value + + :param length: (int) the number of inputs to stack + """ self.stack = deque(maxlen=length) + def reset(self): self.stack.clear() - def __call__(self, x, update=True): - self.stack.append(x) + + def __call__(self, arr, update=True): + self.stack.append(arr) while len(self.stack) < self.stack.maxlen: - self.stack.append(x) + self.stack.append(arr) return np.concatenate(self.stack, axis=-1) + def output_shape(self, input_space): return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) diff --git a/baselines/common/identity_env.py b/baselines/common/identity_env.py index f07cd5b8d4..44e1046e67 100644 --- a/baselines/common/identity_env.py +++ b/baselines/common/identity_env.py @@ -3,28 +3,32 @@ class IdentityEnv(Env): - def __init__( - self, - dim, - ep_length=100, - ): + def __init__(self, dim, ep_length=100): + """ + Identity environment for testing purposes + :param dim: (int) the size of the dimentions you want to learn + :param ep_length: (int) the length of each episodes in timesteps + """ self.action_space = Discrete(dim) + self.ep_length = ep_length self.reset() def reset(self): self._choose_next_state() self.observation_space = self.action_space - return self.state - def step(self, actions): - rew = self._get_reward(actions) + def step(self, action): + reward = self._get_reward(action) self._choose_next_state() - return self.state, rew, False, {} + return self.state, reward, False, {} def _choose_next_state(self): self.state = self.action_space.sample() - def _get_reward(self, actions): - return 1 if self.state == actions else 0 + def _get_reward(self, action): + return 1 if self.state == action else 0 + + def render(self, mode='human'): + pass diff --git a/baselines/common/input.py b/baselines/common/input.py index 7fbf9fc00b..8d2419ff14 100644 --- a/baselines/common/input.py +++ b/baselines/common/input.py @@ -1,20 +1,19 @@ import tensorflow as tf from gym.spaces import Discrete, Box + def observation_input(ob_space, batch_size=None, name='Ob'): - ''' - Build observation input with encoding depending on the - observation space type - Params: - - ob_space: observation space (should be one of gym.spaces) - batch_size: batch size for input (default is None, so that resulting input placeholder can take tensors with any batch size) - name: tensorflow variable name for input placeholder + """ + Build observation input with encoding depending on the observation space type - returns: tuple (input_placeholder, processed_input_tensor) - ''' + :param ob_space: (Gym Space) The observation space + :param batch_size: (int) batch size for input + (default is None, so that resulting input placeholder can take tensors with any batch size) + :param name: (str) tensorflow variable name for input placeholder + :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor + """ if isinstance(ob_space, Discrete): - input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) + input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) return input_x, processed_x @@ -26,5 +25,3 @@ def observation_input(ob_space, batch_size=None, name='Ob'): else: raise NotImplementedError - - diff --git a/baselines/common/math_util.py b/baselines/common/math_util.py index 36b8927781..327e69fbe0 100644 --- a/baselines/common/math_util.py +++ b/baselines/common/math_util.py @@ -2,27 +2,21 @@ import scipy.signal 
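As a quick illustration of the refactored `observation_input` helper in `baselines/common/input.py` above: for a `Discrete` space it returns an integer placeholder plus its one-hot, float-cast encoding. The sketch below is illustrative only (TF 1.x graph/session API assumed; the space size and fed ids are examples, not part of this diff):

import tensorflow as tf
from gym.spaces import Discrete

from baselines.common.input import observation_input

# For a Discrete space the placeholder takes integer ids and the processed
# tensor is their one-hot, float-cast encoding (see the hunk above).
input_ph, processed_obs = observation_input(Discrete(4), batch_size=None, name='Ob')

with tf.Session() as sess:
    one_hot = sess.run(processed_obs, feed_dict={input_ph: [0, 2, 3]})
    print(one_hot.shape)  # -> (3, 4); each row is the one-hot encoding of one id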
-def discount(x, gamma): +def discount(vector, gamma): """ - computes discounted sums along 0th dimension of x. - - inputs - ------ - x: ndarray - gamma: float - - outputs - ------- - y: ndarray with same shape as x, satisfying - + computes discounted sums along 0th dimension of vector x. y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], where k = len(x) - t - 1 + :param vector: (numpy array) the input vector + :param gamma: (float) the discount value + :return: (numpy Number) the output vector """ - assert x.ndim >= 1 - return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] + assert vector.ndim >= 1 + return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1] + -def explained_variance(ypred,y): +def explained_variance(y_pred, y_true): """ Computes fraction of variance that ypred explains about y. Returns 1 - Var[y-ypred] / Var[y] @@ -32,54 +26,78 @@ def explained_variance(ypred,y): ev=1 => perfect prediction ev<0 => worse than just predicting zero + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y """ - assert y.ndim == 1 and ypred.ndim == 1 - vary = np.var(y) - return np.nan if vary==0 else 1 - np.var(y-ypred)/vary + assert y_true.ndim == 1 and y_pred.ndim == 1 + var_y = np.var(y_true) + return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y -def explained_variance_2d(ypred, y): - assert y.ndim == 2 and ypred.ndim == 2 - vary = np.var(y, axis=0) - out = 1 - np.var(y-ypred)/vary - out[vary < 1e-10] = 0 - return out -def ncc(ypred, y): - return np.corrcoef(ypred, y)[1,0] +def explained_variance_2d(y_pred, y_true): + """ + Computes fraction of variance that ypred explains about y, for 2D arrays. + Returns 1 - Var[y-ypred] / Var[y] + + interpretation: + ev=0 => might as well have predicted zero + ev=1 => perfect prediction + ev<0 => worse than just predicting zero + + :param y_pred: (numpy Number) the prediction + :param y_true: (numpy Number) the expected value + :return: (float) explained variance of ypred and y + """ + assert y_true.ndim == 2 and y_pred.ndim == 2 + var_y = np.var(y_true, axis=0) + explained_var = 1 - np.var(y_true - y_pred) / var_y + explained_var[var_y < 1e-10] = 0 + return explained_var + def flatten_arrays(arrs): + """ + flattens a list of arrays down to 1D + + :param arrs: ([numpy Number]) arrays + :return: (numpy Number) 1D flattend array + """ return np.concatenate([arr.flat for arr in arrs]) + def unflatten_vector(vec, shapes): - i=0 + """ + reshape a flattened array + + :param vec: (numpy Number) 1D arrays + :param shapes: (tuple) + :return: ([numpy Number]) reshaped array + """ + i = 0 arrs = [] for shape in shapes: size = np.prod(shape) - arr = vec[i:i+size].reshape(shape) + arr = vec[i:i + size].reshape(shape) arrs.append(arr) i += size return arrs -def discount_with_boundaries(X, New, gamma): + +def discount_with_boundaries(rewards, episode_starts, gamma): """ - X: 2d array of floats, time x features - New: 2d array of bools, indicating when a new episode has started + computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode. + y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], + where k = len(x) - t - 1 + + :param rewards: (numpy Number) the input vector (rewards) + :param episode_starts: (numpy Number) 2d array of bools, indicating when a new episode has started + :param gamma: (float) the discount factor + :return: (numpy Number) the output vector (discounted rewards) """ - Y = np.zeros_like(X) - T = X.shape[0] - Y[T-1] = X[T-1] - for t in range(T-2, -1, -1): - Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) - return Y - -def test_discount_with_boundaries(): - gamma=0.9 - x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') - starts = [1.0, 0.0, 0.0, 1.0] - y = discount_with_boundaries(x, starts, gamma) - assert np.allclose(y, [ - 1 + gamma * 2 + gamma**2 * 3, - 2 + gamma * 3, - 3, - 4 - ]) \ No newline at end of file + discounted_rewards = np.zeros_like(rewards) + n_samples = rewards.shape[0] + discounted_rewards[n_samples - 1] = rewards[n_samples - 1] + for step in range(n_samples - 2, -1, -1): + discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) + return discounted_rewards diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 9985dea205..19216c95d8 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -1,15 +1,23 @@ -import gym -import numpy as np import os import pickle import random import tempfile import zipfile +import gym +import numpy as np +import tensorflow as tf + def zipsame(*seqs): - L = len(seqs[0]) - assert all(len(seq) == L for seq in seqs[1:]) + """ + Performes a zip function, but asserts that all zipped elements are of the same size + + :param seqs: a list of arrays that are zipped together + :return: the zipped arguments + """ + length = len(seqs[0]) + assert all(len(seq) == length for seq in seqs[1:]) return zip(*seqs) @@ -20,79 +28,80 @@ def unpack(seq, sizes): Example: unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) + + :param seq: (Iterable) the sequence to unpack + :param sizes: ([int]) the shape to unpack + :return: ([Any] or Any) the unpacked sequence """ seq = list(seq) - it = iter(seq) + iterator = iter(seq) assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) for size in sizes: if size is None: - yield it.__next__() + yield iterator.__next__() else: - li = [] + _list = [] for _ in range(size): - li.append(it.__next__()) - yield li + _list.append(iterator.__next__()) + yield _list class EzPickle(object): - """Objects that are pickled and unpickled via their constructor - arguments. + def __init__(self, *args, **kwargs): + """ + Objects that are pickled and unpickled via their constructor arguments. - Example usage: + Example usage: - class Dog(Animal, EzPickle): - def __init__(self, furcolor, tailkind="bushy"): - Animal.__init__() - EzPickle.__init__(furcolor, tailkind) - ... + class Dog(Animal, EzPickle): + def __init__(self, furcolor, tailkind="bushy"): + Animal.__init__() + EzPickle.__init__(furcolor, tailkind) + ... - When this object is unpickled, a new Dog will be constructed by passing the provided - furcolor and tailkind into the constructor. However, philosophers are still not sure - whether it is still the same dog. + When this object is unpickled, a new Dog will be constructed by passing the provided + furcolor and tailkind into the constructor. However, philosophers are still not sure + whether it is still the same dog. - This is generally needed only for environments which wrap C/C++ code, such as MuJoCo - and Atari. 
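Illustrative only, not part of this diff: a usage sketch of the renamed discount_with_boundaries, mirroring the inline test that this patch removes (the rewards, episode starts, and expected returns come from that removed test).
import numpy as np
from baselines.common.math_util import discount_with_boundaries

gamma = 0.9
rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
episode_starts = np.array([1.0, 0.0, 0.0, 1.0], 'float32')
discounted = discount_with_boundaries(rewards, episode_starts, gamma)
# The last reward starts a new episode, so it is not folded into the earlier returns
assert np.allclose(discounted, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4])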
- """ + This is generally needed only for environments which wrap C/C++ code, such as MuJoCo + and Atari. - def __init__(self, *args, **kwargs): + :param args: ezpickle args + :param kwargs: ezpickle kwargs + """ self._ezpickle_args = args self._ezpickle_kwargs = kwargs def __getstate__(self): return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} - def __setstate__(self, d): - out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) + def __setstate__(self, _dict): + out = type(self)(*_dict["_ezpickle_args"], **_dict["_ezpickle_kwargs"]) self.__dict__.update(out.__dict__) -def set_global_seeds(i): - try: - import tensorflow as tf - except ImportError: - pass - else: - tf.set_random_seed(i) - np.random.seed(i) - random.seed(i) +def set_global_seeds(seed): + """ + set the seed for python random, tensorflow, and numpy + + :param seed: (int) the seed + """ + tf.set_random_seed(seed) + np.random.seed(seed) + random.seed(seed) def pretty_eta(seconds_left): - """Print the number of seconds in human readable format. + """ + Print the number of seconds in human readable format. Examples: 2 days 2 hours and 37 minutes less than a minute - Paramters - --------- - seconds_left: int - Number of seconds to be converted to the ETA - Returns - ------- - eta: str - String representing the pretty ETA. + :param seconds_left: (int) Number of seconds to be converted to the ETA + :return: (str) String representing the pretty ETA. """ minutes_left = seconds_left // 60 seconds_left %= 60 @@ -121,27 +130,21 @@ def helper(cnt, name): class RunningAvg(object): def __init__(self, gamma, init_value=None): - """Keep a running estimate of a quantity. This is a bit like mean + """ + Keep a running estimate of a quantity. This is a bit like mean but more sensitive to recent changes. - Parameters - ---------- - gamma: float - Must be between 0 and 1, where 0 is the most sensitive to recent - changes. - init_value: float or None - Initial value of the estimate. If None, it will be set on the first update. + :param gamma: (float) Must be between 0 and 1, where 0 is the most sensitive to recent changes. + :param init_value: (float) Initial value of the estimate. If None, it will be set on the first update. """ self._value = init_value self._gamma = gamma def update(self, new_val): - """Update the estimate. + """ + Update the estimate. - Parameters - ---------- - new_val: float - new observated value of estimated quantity. + :param new_val: (float) new observated value of estimated quantity. """ if self._value is None: self._value = new_val @@ -149,43 +152,36 @@ def update(self, new_val): self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val def __float__(self): - """Get the current estimate""" + """ + Get the current estimate + + :return: (float) current value + """ return self._value -def boolean_flag(parser, name, default=False, help=None): - """Add a boolean flag to argparse parser. - - Parameters - ---------- - parser: argparse.Parser - parser to add the flag to - name: str - -- will enable the flag, while --no- will disable it - default: bool or None - default value of the flag - help: str - help string for the flag + +def boolean_flag(parser, name, default=False, help_msg=None): + """ + Add a boolean flag to argparse parser. 
+ + :param parser: (argparse.Parser) parser to add the flag to + :param name: (str) -- will enable the flag, while --no- will disable it + :param default: (bool) default value of the flag + :param help_msg: (str) help string for the flag """ dest = name.replace('-', '_') - parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) + parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help_msg) parser.add_argument("--no-" + name, action="store_false", dest=dest) def get_wrapper_by_name(env, classname): - """Given an a gym environment possibly wrapped multiple times, returns a wrapper + """ + Given a gym environment possibly wrapped multiple times, returns a wrapper of class named classname or raises ValueError if no such wrapper was applied - Parameters - ---------- - env: gym.Env of gym.Wrapper - gym environment - classname: str - name of the wrapper - - Returns - ------- - wrapper: gym.Wrapper - wrapper named classname + :param env: (Gym Environment) the environment + :param classname: (str) name of the wrapper + :return: (Gym Environment) the wrapped environment """ currentenv = env while True: @@ -198,7 +194,8 @@ def get_wrapper_by_name(env, classname): def relatively_safe_pickle_dump(obj, path, compression=False): - """This is just like regular pickle dump, except from the fact that failure cases are + """ + This is just like regular pickle dump, except that the failure cases are different: - It's never possible that we end up with a pickle in corrupted state. @@ -210,14 +207,9 @@ The indended use case is periodic checkpoints of experiment state, such that we never corrupt previous checkpoints if the current one fails. - Parameters - ---------- - obj: object - object to pickle - path: str - path to the output file - compression: bool - if true pickle will be compressed + :param obj: (Object) object to pickle + :param path: (str) path to the output file + :param compression: (bool) if true pickle will be compressed """ temp_storage = path + ".relatively_safe" if compression: @@ -228,31 +220,24 @@ with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: myzip.write(uncompressed_file.name, "data") else: - with open(temp_storage, "wb") as f: - pickle.dump(obj, f) + with open(temp_storage, "wb") as file_handler: + pickle.dump(obj, file_handler) os.rename(temp_storage, path) def pickle_load(path, compression=False): - """Unpickle a possible compressed pickle. - - Parameters - ---------- - path: str - path to the output file - compression: bool - if true assumes that pickle was compressed when created and attempts decompression. - - Returns - ------- - obj: object - the unpickled object + """ + Unpickle a possibly compressed pickle. + + :param path: (str) path to the output file + :param compression: (bool) if true assumes that pickle was compressed when created and attempts decompression.
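Illustrative only, not part of this diff: relatively_safe_pickle_dump and pickle_load above are meant to be used as a pair; a sketch of the round trip (the path and payload are made up).
from baselines.common.misc_util import pickle_load, relatively_safe_pickle_dump

# Write-then-rename keeps the previous checkpoint intact if the dump fails midway
relatively_safe_pickle_dump({'step': 42}, '/tmp/checkpoint.pkl', compression=True)
state = pickle_load('/tmp/checkpoint.pkl', compression=True)
assert state['step'] == 42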
+ :return: (Object) the unpickled object """ if compression: with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: - with myzip.open("data") as f: - return pickle.load(f) + with myzip.open("data") as file_handler: + return pickle.load(file_handler) else: - with open(path, "rb") as f: - return pickle.load(f) + with open(path, "rb") as file_handler: + return pickle.load(file_handler) diff --git a/baselines/common/mpi_adam.py b/baselines/common/mpi_adam.py index 4902caf629..cd17274cb0 100644 --- a/baselines/common/mpi_adam.py +++ b/baselines/common/mpi_adam.py @@ -1,46 +1,78 @@ from mpi4py import MPI -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_utils import tensorflow as tf import numpy as np + class MpiAdam(object): - def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): + def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None, + sess=None): + """ + A parallel MPI implementation of the Adam optimizer for TensorFlow + https://arxiv.org/abs/1412.6980 + + :param var_list: ([TensorFlow Tensor]) the variables + :param beta1: (float) Adam beta1 parameter + :param beta2: (float) Adam beta2 parameter + :param epsilon: (float) to help with preventing arithmetic issues + :param scale_grad_by_procs: (bool) if the scaling should be done by processes + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param sess: (TensorFlow Session) if None, tf.get_default_session() + """ self.var_list = var_list self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.scale_grad_by_procs = scale_grad_by_procs - size = sum(U.numel(v) for v in var_list) - self.m = np.zeros(size, 'float32') - self.v = np.zeros(size, 'float32') - self.t = 0 - self.setfromflat = U.SetFromFlat(var_list) - self.getflat = U.GetFlat(var_list) + size = sum(tf_utils.numel(v) for v in var_list) + # Exponential moving average of gradient values + # "first moment estimate" m in the paper + self.exp_avg = np.zeros(size, 'float32') + # Exponential moving average of squared gradient values + # "second raw moment estimate" v in the paper + self.exp_avg_sq = np.zeros(size, 'float32') + self.step = 0 + self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess) + self.getflat = tf_utils.GetFlat(var_list, sess=sess) + self.comm = MPI.COMM_WORLD if comm is None else comm - def update(self, localg, stepsize): - if self.t % 100 == 0: + def update(self, local_grad, learning_rate): + """ + update the values of the graph + + :param local_grad: (numpy float) the gradient + :param learning_rate: (float) the learning rate for the update + """ + if self.step % 100 == 0: self.check_synced() - localg = localg.astype('float32') - globalg = np.zeros_like(localg) - self.comm.Allreduce(localg, globalg, op=MPI.SUM) + local_grad = local_grad.astype('float32') + global_grad = np.zeros_like(local_grad) + self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM) if self.scale_grad_by_procs: - globalg /= self.comm.Get_size() + global_grad /= self.comm.Get_size() - self.t += 1 - a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) - self.m = self.beta1 * self.m + (1 - self.beta1) * globalg - self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) - step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) + self.step += 1 + # Learning rate with bias correction + step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step) + # Decay the
first and second moment running average coefficient + self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad + self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad) + step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon) self.setfromflat(self.getflat() + step) def sync(self): + """ + syncronize the MPI threads + """ theta = self.getflat() self.comm.Bcast(theta, root=0) self.setfromflat(theta) def check_synced(self): - if self.comm.Get_rank() == 0: # this is root + """ + confirm the MPI threads are synced + """ + if self.comm.Get_rank() == 0: # this is root theta = self.getflat() self.comm.Bcast(theta, root=0) else: @@ -49,31 +81,40 @@ def check_synced(self): self.comm.Bcast(thetaroot, root=0) assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) -@U.in_session -def test_MpiAdam(): + +@tf_utils.in_session +def test_mpi_adam(): + """ + tests the MpiAdam object's functionality + """ np.random.seed(0) tf.set_random_seed(0) - a = tf.Variable(np.random.randn(3).astype('float32')) - b = tf.Variable(np.random.randn(2,5).astype('float32')) - loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) + a_var = tf.Variable(np.random.randn(3).astype('float32')) + b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) + loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) - stepsize = 1e-2 - update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) - do_update = U.function([], loss, updates=[update_op]) + learning_rate = 1e-2 + update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) + do_update = tf_utils.function([], loss, updates=[update_op]) tf.get_default_session().run(tf.global_variables_initializer()) - for i in range(10): - print(i,do_update()) + for step in range(10): + print(step, do_update()) tf.set_random_seed(0) tf.get_default_session().run(tf.global_variables_initializer()) - var_list = [a,b] - lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) + var_list = [a_var, b_var] + lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) adam = MpiAdam(var_list) - for i in range(10): - l,g = lossandgrad() - adam.update(g, stepsize) - print(i,l) \ No newline at end of file + for step in range(10): + loss, grad = lossandgrad() + adam.update(grad, learning_rate) + print(step, loss) + + +if __name__ == "__main__": + # Run with mpirun -np 2 python + test_mpi_adam() diff --git a/baselines/common/mpi_fork.py b/baselines/common/mpi_fork.py index c5e609e66c..2012f5cad3 100644 --- a/baselines/common/mpi_fork.py +++ b/baselines/common/mpi_fork.py @@ -1,10 +1,18 @@ -import os, subprocess, sys +import os +import subprocess +import sys -def mpi_fork(n, bind_to_core=False): - """Re-launches the current script with workers + +def mpi_fork(rank, bind_to_core=False): + """ + Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children + + :param rank: (int) the rank + :param bind_to_core: (bool) enables binding to core + :return: (str) the correct type of thread name """ - if n<=1: + if rank <= 1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() @@ -13,7 +21,7 @@ def mpi_fork(n, bind_to_core=False): OMP_NUM_THREADS="1", IN_MPI="1" ) - args = ["mpirun", "-np", str(n)] + args = ["mpirun", "-np", str(rank)] if bind_to_core: args += ["-bind-to", "core"] args += [sys.executable] + sys.argv diff --git a/baselines/common/mpi_moments.py 
b/baselines/common/mpi_moments.py index 7fcc6cd828..1af444a4e3 100644 --- a/baselines/common/mpi_moments.py +++ b/baselines/common/mpi_moments.py @@ -1,26 +1,47 @@ from mpi4py import MPI import numpy as np + from baselines.common import zipsame -def mpi_mean(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - if comm is None: comm = MPI.COMM_WORLD - xsum = x.sum(axis=axis, keepdims=keepdims) - n = xsum.size - localsum = np.zeros(n+1, x.dtype) - localsum[:n] = xsum.ravel() - localsum[n] = x.shape[axis] +def mpi_mean(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean of an array, using MPI + + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the means over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimentions intact + :return: (numpy Number or Number) the result of the sum + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + if comm is None: + comm = MPI.COMM_WORLD + xsum = arr.sum(axis=axis, keepdims=keepdims) + size = xsum.size + localsum = np.zeros(size + 1, arr.dtype) + localsum[:size] = xsum.ravel() + localsum[size] = arr.shape[axis] globalsum = np.zeros_like(localsum) comm.Allreduce(localsum, globalsum, op=MPI.SUM) - return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] + return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size] + + +def mpi_moments(arr, axis=0, comm=None, keepdims=False): + """ + calculates the mean and std of an array, using MPI -def mpi_moments(x, axis=0, comm=None, keepdims=False): - x = np.asarray(x) - assert x.ndim > 0 - mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) - sqdiffs = np.square(x - mean) + :param arr: (numpy Number) + :param axis: (int or tuple or list) the axis to run the moments over + :param comm: (MPI Communicators) if None, MPI.COMM_WORLD + :param keepdims: (bool) keep the other dimentions intact + :return: (numpy Number or Number) the result of the moments + """ + arr = np.asarray(arr) + assert arr.ndim > 0 + mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True) + sqdiffs = np.square(arr - mean) meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) assert count1 == count std = np.sqrt(meansqdiff) @@ -31,30 +52,20 @@ def mpi_moments(x, axis=0, comm=None, keepdims=False): return mean, std, count -def test_runningmeanstd(): - import subprocess - subprocess.check_call(['mpirun', '-np', '3', - 'python','-c', - 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) - def _helper_runningmeanstd(): comm = MPI.COMM_WORLD np.random.seed(0) - for (triple,axis) in [ - ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), - ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), - ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), - ]: - + for (triple, axis) in [ + ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0), + ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0), + ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1)]: - x = np.concatenate(triple, axis=axis) - ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] + arr = np.concatenate(triple, axis=axis) + ms1 = [arr.mean(axis=axis), arr.std(axis=axis), arr.shape[axis]] + ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis) - ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) - - for (a1,a2) in zipsame(ms1, ms2): - print(a1, a2) - 
assert np.allclose(a1, a2) + for (res_1, res_2) in zipsame(ms1, ms2): + print(res_1, res_2) + assert np.allclose(res_1, res_2) print("ok!") - diff --git a/baselines/common/mpi_running_mean_std.py b/baselines/common/mpi_running_mean_std.py index 408f8a22b8..4b418265a7 100644 --- a/baselines/common/mpi_running_mean_std.py +++ b/baselines/common/mpi_running_mean_std.py @@ -1,10 +1,19 @@ from mpi4py import MPI -import tensorflow as tf, baselines.common.tf_util as U, numpy as np +import tensorflow as tf +import numpy as np + +import baselines.common.tf_util as tf_util + class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-2, shape=()): + """ + calulates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ self._sum = tf.get_variable( dtype=tf.float64, shape=shape, @@ -23,74 +32,62 @@ def __init__(self, epsilon=1e-2, shape=()): self.shape = shape self.mean = tf.to_float(self._sum / self._count) - self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) + self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2)) newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') - self.incfiltparams = U.function([newsum, newsumsq, newcount], [], - updates=[tf.assign_add(self._sum, newsum), - tf.assign_add(self._sumsq, newsumsq), - tf.assign_add(self._count, newcount)]) - - - def update(self, x): - x = x.astype('float64') - n = int(np.prod(self.shape)) - totalvec = np.zeros(n*2+1, 'float64') - addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) + self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [], + updates=[tf.assign_add(self._sum, newsum), + tf.assign_add(self._sumsq, newsumsq), + tf.assign_add(self._count, newcount)]) + + def update(self, data): + """ + update the running mean and std + + :param data: (numpy Number) the data + """ + data = data.astype('float64') + data_size = int(np.prod(self.shape)) + totalvec = np.zeros(data_size * 2 + 1, 'float64') + addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(), + np.array([len(data)], dtype='float64')]) MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) - self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) + self.incfiltparams(totalvec[0: data_size].reshape(self.shape), + totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size]) -@U.in_session -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - U.initialize() - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.std(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean.eval(), rms.std.eval()] - - assert np.allclose(ms1, ms2) - -@U.in_session +@tf_util.in_session def test_dist(): + """ + test the running mean std + """ np.random.seed(0) - 
p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) - q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) - - # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) - # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) + p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) + q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) comm = MPI.COMM_WORLD - assert comm.Get_size()==2 - if comm.Get_rank()==0: - x1,x2,x3 = p1,p2,p3 - elif comm.Get_rank()==1: - x1,x2,x3 = q1,q2,q3 + assert comm.Get_size() == 2 + if comm.Get_rank() == 0: + x_1, x_2, x_3 = p_1, p_2, p_3 + elif comm.Get_rank() == 1: + x_1, x_2, x_3 = q_1, q_2, q_3 else: assert False rms = RunningMeanStd(epsilon=0.0, shape=(1,)) - U.initialize() + tf_util.initialize() - rms.update(x1) - rms.update(x2) - rms.update(x3) + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) - bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) + bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3]) - def checkallclose(x,y): - print(x,y) - return np.allclose(x,y) + def checkallclose(var_1, var_2): + print(var_1, var_2) + return np.allclose(var_1, var_2) assert checkallclose( bigvec.mean(axis=0), diff --git a/baselines/common/runners.py b/baselines/common/runners.py index 0a4b2214f7..7c9df3ce7d 100644 --- a/baselines/common/runners.py +++ b/baselines/common/runners.py @@ -1,18 +1,29 @@ import numpy as np from abc import ABC, abstractmethod + class AbstractEnvRunner(ABC): - def __init__(self, *, env, model, nsteps): + def __init__(self, *, env, model, n_steps): + """ + A runner to learn the policy of an environment for a model + + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + """ self.env = env self.model = model - nenv = env.num_envs - self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape - self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) + n_env = env.num_envs + self.batch_ob_shape = (n_env*n_steps,) + env.observation_space.shape + self.obs = np.zeros((n_env,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) self.obs[:] = env.reset() - self.nsteps = nsteps + self.n_steps = n_steps self.states = model.initial_state - self.dones = [False for _ in range(nenv)] + self.dones = [False for _ in range(n_env)] @abstractmethod def run(self): + """ + Run a learning step of the model + """ raise NotImplementedError diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 06ba8d8f11..d6a03d6ebf 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -1,46 +1,37 @@ import numpy as np + + class RunningMeanStd(object): - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm def __init__(self, epsilon=1e-4, shape=()): + """ + calulates the running mean and std of a data stream + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + + :param epsilon: (float) helps with arithmetic issues + :param shape: (tuple) the shape of the data stream's output + """ self.mean = np.zeros(shape, 'float64') self.var = np.ones(shape, 'float64') self.count = epsilon - def update(self, x): - batch_mean = np.mean(x, axis=0) - batch_var = np.var(x, axis=0) - batch_count = x.shape[0] + def 
update(self, arr): + batch_mean = np.mean(arr, axis=0) + batch_var = np.var(arr, axis=0) + batch_count = arr.shape[0] self.update_from_moments(batch_mean, batch_var, batch_count) def update_from_moments(self, batch_mean, batch_var, batch_count): delta = batch_mean - self.mean tot_count = self.count + batch_count - new_mean = self.mean + delta * batch_count / tot_count - m_a = self.var * (self.count) - m_b = batch_var * (batch_count) - M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) - new_var = M2 / (self.count + batch_count) + new_mean = self.mean + delta * batch_count / tot_count + m_a = self.var * self.count + m_b = batch_var * batch_count + m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) + new_var = m_2 / (self.count + batch_count) new_count = batch_count + self.count self.mean = new_mean self.var = new_var - self.count = new_count - -def test_runningmeanstd(): - for (x1, x2, x3) in [ - (np.random.randn(3), np.random.randn(4), np.random.randn(5)), - (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), - ]: - - rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) - - x = np.concatenate([x1, x2, x3], axis=0) - ms1 = [x.mean(axis=0), x.var(axis=0)] - rms.update(x1) - rms.update(x2) - rms.update(x3) - ms2 = [rms.mean, rms.var] - - assert np.allclose(ms1, ms2) + self.count = new_count diff --git a/baselines/common/running_stat.py b/baselines/common/running_stat.py index b9aa86c2ff..4c074590a3 100644 --- a/baselines/common/running_stat.py +++ b/baselines/common/running_stat.py @@ -1,46 +1,75 @@ import numpy as np -# http://www.johndcook.com/blog/standard_deviation/ + class RunningStat(object): def __init__(self, shape): - self._n = 0 - self._M = np.zeros(shape) - self._S = np.zeros(shape) - def push(self, x): - x = np.asarray(x) - assert x.shape == self._M.shape - self._n += 1 - if self._n == 1: - self._M[...] = x + """ + calulates the running mean and std of a data stream + http://www.johndcook.com/blog/standard_deviation/ + + :param shape: (tuple) the shape of the data stream's output + """ + self._step = 0 + self._mean = np.zeros(shape) + self._std = np.zeros(shape) + + def push(self, value): + """ + update the running mean and std + + :param value: (numpy Number) the data + """ + value = np.asarray(value) + assert value.shape == self._mean.shape + self._step += 1 + if self._step == 1: + self._mean[...] = value else: - oldM = self._M.copy() - self._M[...] = oldM + (x - oldM)/self._n - self._S[...] = self._S + (x - oldM)*(x - self._M) + old_m = self._mean.copy() + self._mean[...] = old_m + (value - old_m) / self._step + self._std[...] 
= self._std + (value - old_m) * (value - self._mean) + @property def n(self): - return self._n + """ + the number of data points + + :return: (int) + """ + return self._step + @property def mean(self): - return self._M + """ + the average value + + :return: (float) + """ + return self._mean + @property def var(self): - return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) + """ + the variation of the data points + + :return: (float) + """ + return self._std / (self._step - 1) if self._step > 1 else np.square(self._mean) + @property def std(self): + """ + the standard deviation of the data points + + :return: (float) + """ return np.sqrt(self.var) + @property def shape(self): - return self._M.shape - -def test_running_stat(): - for shp in ((), (3,), (3,4)): - li = [] - rs = RunningStat(shp) - for _ in range(5): - val = np.random.randn(*shp) - rs.push(val) - li.append(val) - m = np.mean(li, axis=0) - assert np.allclose(rs.mean, m) - v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) - assert np.allclose(rs.var, v) + """ + the shape of the data points + + :return: (tuple) + """ + return self._mean.shape diff --git a/baselines/common/schedules.py b/baselines/common/schedules.py index 9dfff50f95..9fc3d6f11b 100644 --- a/baselines/common/schedules.py +++ b/baselines/common/schedules.py @@ -10,47 +10,57 @@ class Schedule(object): - def value(self, t): - """Value of the schedule at time t""" - raise NotImplementedError() + def value(self, step): + """ + Value of the schedule for a given timestep + + :param step: (int) the timestep + :return: (float) the output value for the given timestep + """ + raise NotImplementedError -class ConstantSchedule(object): +class ConstantSchedule(Schedule): def __init__(self, value): - """Value remains constant over time. + """ + Value remains constant over time. - Parameters - ---------- - value: float - Constant value of the schedule + :param value: (float) Constant value of the schedule """ - self._v = value + self._value = value - def value(self, t): - """See Schedule.value""" - return self._v + def value(self, step): + return self._value -def linear_interpolation(l, r, alpha): - return l + alpha * (r - l) +def linear_interpolation(left, right, alpha): + """ + Linear interpolation between `left` and `right` + :param left: (float) left boundary + :param right: (float) right boundary + :param alpha: (float) coeff in [0, 1] + :return: (float) + """ + return left + alpha * (right - left) -class PiecewiseSchedule(object): +class PiecewiseSchedule(Schedule): def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): - """Piecewise schedule. + """ + Piecewise schedule. - endpoints: [(int, int)] + :param endpoints: ([(int, int)]) list of pairs `(time, value)` meanining that schedule should output `value` when `t==time`. All the values for time must be sorted in an increasing order. When t is between two times, e.g. `(time_a, value_a)` and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs `interpolation(value_a, value_b, alpha)` where alpha is a fraction of time passed between `time_a` and `time_b` for time `t`. - interpolation: lambda float, float, float: float + :param interpolation: (lambda (float, float, float): float) a function that takes value to the left and to the right of t according to the `endpoints`. Alpha is the fraction of distance from left endpoint to right endpoint that t has covered. See linear_interpolation for example. 
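Illustrative only, not part of this diff: a sketch of the renamed RunningStat properties, mirroring the test_running_stat that this patch deletes above; mean matches np.mean and var matches the ddof=1 sample variance.
import numpy as np
from baselines.common.running_stat import RunningStat

running_stat = RunningStat(shape=(3,))
values = [np.random.randn(3) for _ in range(5)]
for value in values:
    running_stat.push(value)
assert np.allclose(running_stat.mean, np.mean(values, axis=0))
assert np.allclose(running_stat.var, np.var(values, ddof=1, axis=0))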
- outside_value: float + :param outside_value: (float) if the value is requested outside of all the intervals sepecified in `endpoints` this value is returned. If None then AssertionError is raised when outside value is requested. @@ -61,39 +71,32 @@ def __init__(self, endpoints, interpolation=linear_interpolation, outside_value= self._outside_value = outside_value self._endpoints = endpoints - def value(self, t): - """See Schedule.value""" - for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): - if l_t <= t and t < r_t: - alpha = float(t - l_t) / (r_t - l_t) - return self._interpolation(l, r, alpha) + def value(self, step): + for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]): + if left_t <= step < right_t: + alpha = float(step - left_t) / (right_t - left_t) + return self._interpolation(left, right, alpha) # t does not belong to any of the pieces, so doom. assert self._outside_value is not None return self._outside_value -class LinearSchedule(object): +class LinearSchedule(Schedule): def __init__(self, schedule_timesteps, final_p, initial_p=1.0): - """Linear interpolation between initial_p and final_p over + """ + Linear interpolation between initial_p and final_p over schedule_timesteps. After this many timesteps pass final_p is returned. - Parameters - ---------- - schedule_timesteps: int - Number of timesteps for which to linearly anneal initial_p - to final_p - initial_p: float - initial output value - final_p: float - final output value + :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p + :param initial_p: (float) initial output value + :param final_p: (float) final output value """ self.schedule_timesteps = schedule_timesteps self.final_p = final_p self.initial_p = initial_p - def value(self, t): - """See Schedule.value""" - fraction = min(float(t) / self.schedule_timesteps, 1.0) + def value(self, step): + fraction = min(float(step) / self.schedule_timesteps, 1.0) return self.initial_p + fraction * (self.final_p - self.initial_p) diff --git a/baselines/common/segment_tree.py b/baselines/common/segment_tree.py index cb386ecdb5..1a22d8eed0 100644 --- a/baselines/common/segment_tree.py +++ b/baselines/common/segment_tree.py @@ -3,7 +3,8 @@ class SegmentTree(object): def __init__(self, capacity, operation, neutral_element): - """Build a Segment Tree data structure. + """ + Build a Segment Tree data structure. https://en.wikipedia.org/wiki/Segment_tree @@ -16,17 +17,10 @@ def __init__(self, capacity, operation, neutral_element): `reduce` operation which reduces `operation` over a contiguous subsequence of items in the array. - Paramters - --------- - capacity: int - Total size of the array - must be a power of two. - operation: lambda obj, obj -> obj - and operation for combining elements (eg. sum, max) - must form a mathematical group together with the set of - possible values for array elements (i.e. be associative) - neutral_element: obj - neutral element for the operation above. eg. float('-inf') - for max and 0 for sum. + :param capacity: (int) Total size of the array - must be a power of two. + :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a + mathematical group together with the set of possible values for array elements (i.e. be associative) + :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum. 
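Illustrative only, not part of this diff: a sketch of the renamed Schedule.value(step) API from the schedules module above, in the spirit of the test_schedules.py file removed further down; the endpoints and expected values are made up.
import numpy as np
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule

# Anneal from 1.0 to 0.1 over 100 steps, then stay at 0.1
linear = LinearSchedule(schedule_timesteps=100, final_p=0.1, initial_p=1.0)
assert np.isclose(linear.value(50), 0.55)
assert np.isclose(linear.value(200), 0.1)

# Piecewise-linear interpolation between (time, value) endpoints
piecewise = PiecewiseSchedule([(0, 1.0), (10, 0.0)], outside_value=0.0)
assert np.isclose(piecewise.value(5), 0.5)
assert np.isclose(piecewise.value(50), 0.0)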
""" assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." self._capacity = capacity @@ -49,22 +43,15 @@ def _reduce_helper(self, start, end, node, node_start, node_end): ) def reduce(self, start=0, end=None): - """Returns result of applying `self.operation` + """ + Returns result of applying `self.operation` to a contiguous subsequence of the array. self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) - Parameters - ---------- - start: int - beginning of the subsequence - end: int - end of the subsequences - - Returns - ------- - reduced: obj - result of reducing self.operation over the specified range of array elements. + :param start: (int) beginning of the subsequence + :param end: (int) end of the subsequences + :return: (Any) result of reducing self.operation over the specified range of array elements. """ if end is None: end = self._capacity @@ -99,26 +86,26 @@ def __init__(self, capacity): ) def sum(self, start=0, end=None): - """Returns arr[start] + ... + arr[end]""" + """ + Returns arr[start] + ... + arr[end] + + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of SumSegmentTree + """ return super(SumSegmentTree, self).reduce(start, end) def find_prefixsum_idx(self, prefixsum): - """Find the highest index `i` in the array such that + """ + Find the highest index `i` in the array such that sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum if array values are probabilities, this function allows to sample indexes according to the discrete probability efficiently. - Parameters - ---------- - perfixsum: float - upperbound on the sum of array prefix - - Returns - ------- - idx: int - highest index satisfying the prefixsum constraint + :param prefixsum: (float) upperbound on the sum of array prefix + :return: (int) highest index satisfying the prefixsum constraint """ assert 0 <= prefixsum <= self.sum() + 1e-5 idx = 1 @@ -140,6 +127,11 @@ def __init__(self, capacity): ) def min(self, start=0, end=None): - """Returns min(arr[start], ..., arr[end])""" + """ + Returns min(arr[start], ..., arr[end]) + :param start: (int) start position of the reduction (must be >= 0) + :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) + :return: (Any) reduction of MinSegmentTree + """ return super(MinSegmentTree, self).reduce(start, end) diff --git a/baselines/common/tests/test_schedules.py b/baselines/common/tests/test_schedules.py deleted file mode 100644 index 4e8d02d291..0000000000 --- a/baselines/common/tests/test_schedules.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule - - -def test_piecewise_schedule(): - ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) - - assert np.isclose(ps.value(-10), 500) - assert np.isclose(ps.value(0), 150) - assert np.isclose(ps.value(5), 200) - assert np.isclose(ps.value(9), 80) - assert np.isclose(ps.value(50), 50) - assert np.isclose(ps.value(80), 50) - assert np.isclose(ps.value(150), 0) - assert np.isclose(ps.value(175), -25) - assert np.isclose(ps.value(201), 500) - assert np.isclose(ps.value(500), 500) - - assert np.isclose(ps.value(200 - 1e-10), -50) - - -def test_constant_schedule(): - cs = ConstantSchedule(5) - for i in range(-100, 100): - assert np.isclose(cs.value(i), 
5) diff --git a/baselines/common/tests/test_tf_util.py b/baselines/common/tests/test_tf_util.py deleted file mode 100644 index daad9d0210..0000000000 --- a/baselines/common/tests/test_tf_util.py +++ /dev/null @@ -1,40 +0,0 @@ -# tests for tf_util -import tensorflow as tf -from baselines.common.tf_util import ( - function, - initialize, - single_threaded_session -) - - -def test_function(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - y = tf.placeholder(tf.int32, (), name="y") - z = 3 * x + 2 * y - lin = function([x, y], z, givens={y: 0}) - - with single_threaded_session(): - initialize() - - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -def test_multikwargs(): - with tf.Graph().as_default(): - x = tf.placeholder(tf.int32, (), name="x") - with tf.variable_scope("other"): - x2 = tf.placeholder(tf.int32, (), name="x") - z = 3 * x + 2 * x2 - - lin = function([x, x2], z, givens={x2: 0}) - with single_threaded_session(): - initialize() - assert lin(2) == 6 - assert lin(2, 2) == 10 - - -if __name__ == '__main__': - test_function() - test_multikwargs() diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index afcd593e85..b4afad2129 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -1,55 +1,85 @@ -import numpy as np -import tensorflow as tf # pylint: ignore-module import copy import os import functools import collections import multiprocessing +import numpy as np +import tensorflow as tf +from tensorflow.python.client import device_lib + +from baselines import logger + + def switch(condition, then_expression, else_expression): - """Switches between two operations depending on a scalar value (int or bool). + """ + Switches between two operations depending on a scalar value (int or bool). Note that both `then_expression` and `else_expression` should be symbolic tensors of the *same shape*. - # Arguments - condition: scalar tensor. - then_expression: TensorFlow operation. - else_expression: TensorFlow operation. + :param condition: (TensorFlow Tensor) scalar tensor. 
+ :param then_expression: (TensorFlow Operation) + :param else_expression: (TensorFlow Operation) + :return: (TensorFlow Operation) the switch output """ x_shape = copy.copy(then_expression.get_shape()) - x = tf.cond(tf.cast(condition, 'bool'), - lambda: then_expression, - lambda: else_expression) - x.set_shape(x_shape) - return x + out_tensor = tf.cond(tf.cast(condition, 'bool'), + lambda: then_expression, + lambda: else_expression) + out_tensor.set_shape(x_shape) + return out_tensor + # ================================================================ # Extras # ================================================================ -def lrelu(x, leak=0.2): - f1 = 0.5 * (1 + leak) - f2 = 0.5 * (1 - leak) - return f1 * x + f2 * abs(x) +def leaky_relu(tensor, leak=0.2): + """ + Leaky ReLU + http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf + + :param tensor: (float) the input value + :param leak: (float) the leaking coeficient when the function is saturated + :return: (float) Leaky ReLU output + """ + f_1 = 0.5 * (1 + leak) + f_2 = 0.5 * (1 - leak) + return f_1 * tensor + f_2 * abs(tensor) + # ================================================================ # Mathematical utils # ================================================================ -def huber_loss(x, delta=1.0): - """Reference: https://en.wikipedia.org/wiki/Huber_loss""" +def huber_loss(tensor, delta=1.0): + """ + Reference: https://en.wikipedia.org/wiki/Huber_loss + + :param tensor: (TensorFlow Tensor) the input value + :param delta: (float) huber loss delta value + :return: (TensorFlow Tensor) huber loss output + """ return tf.where( - tf.abs(x) < delta, - tf.square(x) * 0.5, - delta * (tf.abs(x) - 0.5 * delta) + tf.abs(tensor) < delta, + tf.square(tensor) * 0.5, + delta * (tf.abs(tensor) - 0.5 * delta) ) + # ================================================================ # Global session # ================================================================ def make_session(num_cpu=None, make_default=False, graph=None): - """Returns a session that will use CPU's only""" + """ + Returns a session that will use CPU's only + + :param num_cpu: (int) number of CPUs to use for TensorFlow + :param make_default: (bool) if this should return an InteractiveSession or a normal Session + :param graph: (TensorFlow Graph) the graph of the session + :return: (TensorFlow session) + """ if num_cpu is None: num_cpu = int(os.getenv('RCALL_NUM_CPU', multiprocessing.cpu_count())) tf_config = tf.ConfigProto( @@ -60,41 +90,88 @@ def make_session(num_cpu=None, make_default=False, graph=None): else: return tf.Session(config=tf_config, graph=graph) + def single_threaded_session(): - """Returns a session which will only use a single CPU""" + """ + Returns a session which will only use a single CPU + + :return: (TensorFlow session) + """ return make_session(num_cpu=1) -def in_session(f): - @functools.wraps(f) + +def in_session(func): + """ + wrappes a function so that it is in a TensorFlow Session + + :param func: (function) the function to wrap + :return: (function) + """ + + @functools.wraps(func) def newfunc(*args, **kwargs): with tf.Session(): - f(*args, **kwargs) + func(*args, **kwargs) + return newfunc + ALREADY_INITIALIZED = set() -def initialize(): - """Initialize all the uninitialized variables in the global scope.""" + +def initialize(sess=None): + """ + Initialize all the uninitialized variables in the global scope. 
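Illustrative only, not part of this diff: the renamed leaky_relu and huber_loss helpers above, evaluated on small constants; the expected values follow directly from the formulas shown in this hunk.
import tensorflow as tf
from baselines.common.tf_util import huber_loss, leaky_relu, make_session

errors = tf.constant([0.5, 2.0])
with make_session(num_cpu=1) as sess:
    # Quadratic below delta=1, linear above: 0.5 * 0.5**2 = 0.125 and 1 * (2 - 0.5) = 1.5
    print(sess.run(huber_loss(errors)))                    # [0.125 1.5]
    print(sess.run(leaky_relu(tf.constant([-1.0, 1.0]))))  # [-0.2  1. ]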
+ + :param sess: (TensorFlow Session) + """ + if sess is None: + sess = tf.get_default_session() new_variables = set(tf.global_variables()) - ALREADY_INITIALIZED - tf.get_default_session().run(tf.variables_initializer(new_variables)) + sess.run(tf.variables_initializer(new_variables)) ALREADY_INITIALIZED.update(new_variables) + # ================================================================ # Model components # ================================================================ def normc_initializer(std=1.0, axis=0): - def _initializer(shape, dtype=None, partition_info=None): # pylint: disable=W0613 + """ + Return a parameter initializer for TensorFlow + + :param std: (float) standard deviation + :param axis: (int) the axis to normalize on + :return: (function) + """ + + def _initializer(shape, dtype=None, partition_info=None): out = np.random.randn(*shape).astype(np.float32) out *= std / np.sqrt(np.square(out).sum(axis=axis, keepdims=True)) return tf.constant(out) + return _initializer -def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, - summary_tag=None): + +def conv2d(input_tensor, num_filters, name, filter_size=(3, 3), stride=(1, 1), + pad="SAME", dtype=tf.float32, collections=None, summary_tag=None): + """ + Creates a 2d convolutional layer for TensorFlow + + :param input_tensor: (TensorFlow Tensor) The input tensor for the convolution + :param num_filters: (int) The number of filters + :param name: (str) The TensorFlow variable scope + :param filter_size: (tuple) The filter size + :param stride: (tuple) The stride of the convolution + :param pad: (str) The padding type ('VALID' or 'SAME') + :param dtype: (type) The data type for the Tensors + :param collections: (list) List of graph collections keys to add the Variable to + :param summary_tag: (str) image summary name, can be None for no image summary + :return: (TensorFlow Tensor) 2d convolutional layer + """ with tf.variable_scope(name): stride_shape = [1, stride[0], stride[1], 1] - filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] + filter_shape = [filter_size[0], filter_size[1], int(input_tensor.get_shape()[3]), num_filters] # there are "num input feature maps * filter height * filter width" # inputs to each hidden unit @@ -106,25 +183,26 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", # initialize weights with random weights w_bound = np.sqrt(6. 
/ (fan_in + fan_out)) - w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), - collections=collections) - b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), - collections=collections) + weight = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), + collections=collections) + bias = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer(), + collections=collections) if summary_tag is not None: tf.summary.image(summary_tag, - tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), - [2, 0, 1, 3]), - max_images=10) + tf.transpose(tf.reshape(weight, [filter_size[0], filter_size[1], -1, 1]), [2, 0, 1, 3]), + max_outputs=10) + + return tf.nn.conv2d(input_tensor, weight, stride_shape, pad) + bias - return tf.nn.conv2d(x, w, stride_shape, pad) + b # ================================================================ # Theano-like Function # ================================================================ def function(inputs, outputs, updates=None, givens=None): - """Just like Theano function. Take a bunch of tensorflow placeholders and expressions + """ + Just like Theano function. Take a bunch of tensorflow placeholders and expressions computed based on those placeholders and produces f(inputs) -> outputs. Function f takes values to be fed to the input's placeholders and produces the values of the expressions in outputs. @@ -146,28 +224,35 @@ def function(inputs, outputs, updates=None, givens=None): assert lin(2, 2) == 10 assert lin(x=2, y=3) == 12 - Parameters - ---------- - inputs: [tf.placeholder, tf.constant, or object with make_feed_dict method] - list of input arguments - outputs: [tf.Variable] or tf.Variable - list of outputs or a single output to be returned from function. Returned + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned value will also have the same shape. + :param updates: (list) update functions + :param givens: (dict) the values known for the output """ if isinstance(outputs, list): return _Function(inputs, outputs, updates, givens=givens) elif isinstance(outputs, (dict, collections.OrderedDict)): - f = _Function(inputs, outputs.values(), updates, givens=givens) - return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), f(*args, **kwargs))) + func = _Function(inputs, outputs.values(), updates, givens=givens) + return lambda *args, **kwargs: type(outputs)(zip(outputs.keys(), func(*args, **kwargs))) else: - f = _Function(inputs, [outputs], updates, givens=givens) - return lambda *args, **kwargs: f(*args, **kwargs)[0] + func = _Function(inputs, [outputs], updates, givens=givens) + return lambda *args, **kwargs: func(*args, **kwargs)[0] class _Function(object): def __init__(self, inputs, outputs, updates, givens): + """ + Theano like function + + :param inputs: (TensorFlow Tensor or Object with make_feed_dict) list of input arguments + :param outputs: (TensorFlow Tensor) list of outputs or a single output to be returned from function. Returned + value will also have the same shape. 
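Illustrative only, not part of this diff: a sketch of the patched conv2d helper on an Atari-sized input; the shapes and the scope name "c1" are made up for the example.
import numpy as np
import tensorflow as tf
from baselines.common.tf_util import conv2d, initialize, make_session

images = tf.placeholder(tf.float32, [None, 84, 84, 4])
features = conv2d(images, num_filters=16, name="c1", filter_size=(8, 8), stride=(4, 4))
with make_session(num_cpu=1) as sess:
    initialize(sess)
    out = sess.run(features, {images: np.zeros((1, 84, 84, 4), np.float32)})
    print(out.shape)  # (1, 21, 21, 16) with SAME padding and stride 4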
+ :param updates: (list) update functions + :param givens: (dict) the values known for the output + """ for inpt in inputs: - if not hasattr(inpt, 'make_feed_dict') and not (type(inpt) is tf.Tensor and len(inpt.op.inputs) == 0): + if not hasattr(inpt, 'make_feed_dict') and not (isinstance(inpt, tf.Tensor)and len(inpt.op.inputs) == 0): assert False, "inputs should all be placeholders, constants, or have a make_feed_dict method" self.inputs = inputs updates = updates or [] @@ -175,14 +260,17 @@ def __init__(self, inputs, outputs, updates, givens): self.outputs_update = list(outputs) + [self.update_group] self.givens = {} if givens is None else givens - def _feed_input(self, feed_dict, inpt, value): + @classmethod + def _feed_input(cls, feed_dict, inpt, value): if hasattr(inpt, 'make_feed_dict'): feed_dict.update(inpt.make_feed_dict(value)) else: feed_dict[inpt] = value - def __call__(self, *args): + def __call__(self, *args, sess=None): assert len(args) <= len(self.inputs), "Too many arguments provided" + if sess is None: + sess = tf.get_default_session() feed_dict = {} # Update the args for inpt, value in zip(self.inputs, args): @@ -190,26 +278,56 @@ def __call__(self, *args): # Update feed dict with givens. for inpt in self.givens: feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt]) - results = tf.get_default_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] + results = sess.run(self.outputs_update, feed_dict=feed_dict)[:-1] return results + # ================================================================ # Flat vectors # ================================================================ -def var_shape(x): - out = x.get_shape().as_list() +def var_shape(tensor): + """ + get TensorFlow Tensor shape + + :param tensor: (TensorFlow Tensor) the input tensor + :return: ([int]) the shape + """ + out = tensor.get_shape().as_list() assert all(isinstance(a, int) for a in out), \ "shape function assumes that shape is fully known" return out -def numel(x): - return intprod(var_shape(x)) -def intprod(x): - return int(np.prod(x)) +def numel(tensor): + """ + get TensorFlow Tensor's number of elements + + :param tensor: (TensorFlow Tensor) the input tensor + :return: (int) the number of elements + """ + return intprod(var_shape(tensor)) + + +def intprod(tensor): + """ + calculates the product of all the elements in a list + + :param tensor: ([Number]) the list of elements + :return: (int) the product truncated + """ + return int(np.prod(tensor)) + def flatgrad(loss, var_list, clip_norm=None): + """ + calculates the gradient and flattens it + + :param loss: (float) the loss value + :param var_list: ([TensorFlow Tensor]) the variables + :param clip_norm: (float) clip the gradients (disabled if None) + :return: ([TensorFlow Tensor]) flattend gradient + """ grads = tf.gradients(loss, var_list) if clip_norm is not None: grads = [tf.clip_by_norm(grad, clip_norm=clip_norm) for grad in grads] @@ -218,87 +336,130 @@ def flatgrad(loss, var_list, clip_norm=None): for (v, grad) in zip(var_list, grads) ]) + class SetFromFlat(object): - def __init__(self, var_list, dtype=tf.float32): - assigns = [] + def __init__(self, var_list, dtype=tf.float32, sess=None): + """ + Set the parameters from a flat vector + + :param var_list: ([TensorFlow Tensor]) the variables + :param dtype: (type) the type for the placeholder + :param sess: (TensorFlow Session) + """ shapes = list(map(var_shape, var_list)) total_size = np.sum([intprod(shape) for shape in shapes]) self.theta = theta = tf.placeholder(dtype, [total_size]) 
start = 0 assigns = [] - for (shape, v) in zip(shapes, var_list): + for (shape, _var) in zip(shapes, var_list): size = intprod(shape) - assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape))) + assigns.append(tf.assign(_var, tf.reshape(theta[start:start + size], shape))) start += size - self.op = tf.group(*assigns) + self.operation = tf.group(*assigns) + self.sess = sess def __call__(self, theta): - tf.get_default_session().run(self.op, feed_dict={self.theta: theta}) + if self.sess is None: + return tf.get_default_session().run(self.operation, feed_dict={self.theta: theta}) + else: + return self.sess.run(self.operation, feed_dict={self.theta: theta}) + class GetFlat(object): - def __init__(self, var_list): - self.op = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) + def __init__(self, var_list, sess=None): + """ + Get the parameters as a flat vector - def __call__(self): - return tf.get_default_session().run(self.op) + :param var_list: ([TensorFlow Tensor]) the variables + :param sess: (TensorFlow Session) + """ + self.operation = tf.concat(axis=0, values=[tf.reshape(v, [numel(v)]) for v in var_list]) + self.sess = sess -_PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) + def __call__(self): + if self.sess is None: + return tf.get_default_session().run(self.operation) + else: + return self.sess.run(self.operation) -def get_placeholder(name, dtype, shape): - if name in _PLACEHOLDER_CACHE: - out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] - assert dtype1 == dtype and shape1 == shape - return out - else: - out = tf.placeholder(dtype=dtype, shape=shape, name=name) - _PLACEHOLDER_CACHE[name] = (out, dtype, shape) - return out -def get_placeholder_cached(name): - return _PLACEHOLDER_CACHE[name][0] +def flattenallbut0(tensor): + """ + flatten all the dimension, except from the first one -def flattenallbut0(x): - return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) + :param tensor: (TensorFlow Tensor) the input tensor + :return: (TensorFlow Tensor) the flattened tensor + """ + return tf.reshape(tensor, [-1, intprod(tensor.get_shape().as_list()[1:])]) # ================================================================ -# Diagnostics +# Diagnostics # ================================================================ -def display_var_info(vars): - from baselines import logger +def display_var_info(_vars): + """ + log variable information, for debug purposes + + :param _vars: ([TensorFlow Tensor]) the variables + """ count_params = 0 - for v in vars: - name = v.name - if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue - v_params = np.prod(v.shape.as_list()) + for _var in _vars: + name = _var.name + if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: + continue + v_params = np.prod(_var.shape.as_list()) count_params += v_params - if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print - logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) + if "/b:" in name or "/biases" in name: + continue # Wx+b, bias is not interesting to look at => count params, but not print + logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(_var.shape))) - logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) + logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6)) def get_available_gpus(): + """ + Return a list of all the available GPUs + + :return: 
([str]) the GPUs available + """ # recipe from here: # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa - - from tensorflow.python.client import device_lib local_device_protos = device_lib.list_local_devices() return [x.name for x in local_device_protos if x.device_type == 'GPU'] + # ================================================================ # Saving variables # ================================================================ -def load_state(fname): - saver = tf.train.Saver() - saver.restore(tf.get_default_session(), fname) +def load_state(fname, sess=None, var_list=None): + """ + Load a TensorFlow saved model + + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. + """ + if sess is None: + sess = tf.get_default_session() + saver = tf.train.Saver(var_list=var_list) + saver.restore(sess, fname) -def save_state(fname): - os.makedirs(os.path.dirname(fname), exist_ok=True) - saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) +def save_state(fname, sess=None, var_list=None): + """ + Save a TensorFlow model + :param fname: (str) the graph name + :param sess: (TensorFlow Session) the session, if None: get_default_session() + :param var_list: ([TensorFlow Tensor] or {str: TensorFlow Tensor}) A list of Variable/SaveableObject, + or a dictionary mapping names to SaveableObject`s. If `None, defaults to the list of all saveable objects. + """ + if sess is None: + sess = tf.get_default_session() + os.makedirs(os.path.dirname(fname), exist_ok=True) + saver = tf.train.Saver(var_list=var_list) + saver.save(sess, fname) diff --git a/baselines/common/tile_images.py b/baselines/common/tile_images.py index 929da8994a..14922a990a 100644 --- a/baselines/common/tile_images.py +++ b/baselines/common/tile_images.py @@ -1,23 +1,28 @@ import numpy as np + def tile_images(img_nhwc): """ Tile N images into one big PxQ image (P,Q) are chosen to be as close as possible, and if N is square, then P=Q. - input: img_nhwc, list or array of images, ndim=4 once turned into array + :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. 
img nhwc n = batch index, h = height, w = width, c = channel - returns: - bigim_HWc, ndarray with ndim=3 + :return: (numpy float) img_HWc, ndim=3 """ img_nhwc = np.asarray(img_nhwc) - N, h, w, c = img_nhwc.shape - H = int(np.ceil(np.sqrt(N))) - W = int(np.ceil(float(N)/H)) - img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) - img_HWhwc = img_nhwc.reshape(H, W, h, w, c) - img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) - img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) - return img_Hh_Ww_c + n_images, height, width, n_channels = img_nhwc.shape + # new_height was named H before + new_height = int(np.ceil(np.sqrt(n_images))) + # new_width was named W before + new_width = int(np.ceil(float(n_images) / new_height)) + img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) + # img_HWhwc + out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) + # img_HhWwc + out_image = out_image.transpose(0, 2, 1, 3, 4) + # img_Hh_Ww_c + out_image = out_image.reshape(new_height * height, new_width * width, n_channels) + return out_image diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py index eb07310d15..b6377b59b9 100644 --- a/baselines/common/vec_env/__init__.py +++ b/baselines/common/vec_env/__init__.py @@ -1,6 +1,10 @@ from abc import ABC, abstractmethod +import pickle + +import cloudpickle from baselines import logger + class AlreadySteppingError(Exception): """ Raised when an asynchronous step is running while @@ -10,6 +14,7 @@ def __init__(self): msg = 'already running an async step' Exception.__init__(self, msg) + class NotSteppingError(Exception): """ Raised when an asynchronous step is not running but @@ -19,11 +24,16 @@ def __init__(self): msg = 'not running an async step' Exception.__init__(self, msg) + class VecEnv(ABC): - """ - An abstract asynchronous, vectorized environment. - """ def __init__(self, num_envs, observation_space, action_space): + """ + An abstract asynchronous, vectorized environment. + + :param num_envs: (int) the number of environments + :param observation_space: (Gym Space) the observation space + :param action_space: (Gym Space) the action space + """ self.num_envs = num_envs self.observation_space = observation_space self.action_space = action_space @@ -37,6 +47,8 @@ def reset(self): If step_async is still doing work, that work will be cancelled and step_wait() should not be called until step_async() is invoked again. + + :return: ([int] or [float]) observation """ pass @@ -56,29 +68,35 @@ def step_async(self, actions): def step_wait(self): """ Wait for the step taken with step_async(). - - Returns (obs, rews, dones, infos): - - obs: an array of observations, or a tuple of - arrays of observations. - - rews: an array of rewards - - dones: an array of "episode done" booleans - - infos: a sequence of info objects + + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information """ pass @abstractmethod def close(self): """ - Clean up the environments' resources. + Clean up the environment's resources. 
""" pass def step(self, actions): + """ + Step the environments with the given action + + :param actions: ([int] or [float]) the action + :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information + """ self.step_async(actions) return self.step_wait() def render(self, mode='human'): - logger.warn('Render not defined for %s'%self) + """ + Gym environment rendering + + :param mode: (str) the rendering type + """ + logger.warn('Render not defined for %s' % self) @property def unwrapped(self): @@ -87,13 +105,12 @@ def unwrapped(self): else: return self + class VecEnvWrapper(VecEnv): def __init__(self, venv, observation_space=None, action_space=None): self.venv = venv - VecEnv.__init__(self, - num_envs=venv.num_envs, - observation_space=observation_space or venv.observation_space, - action_space=action_space or venv.action_space) + VecEnv.__init__(self, num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, + action_space=action_space or venv.action_space) def step_async(self, actions): self.venv.step_async(actions) @@ -109,18 +126,21 @@ def step_wait(self): def close(self): return self.venv.close() - def render(self): + def render(self, mode='human'): self.venv.render() + class CloudpickleWrapper(object): - """ - Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) - """ - def __init__(self, x): - self.x = x + def __init__(self, var): + """ + Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) + + :param var: (Any) the variable you wish to wrap for pickling with cloudpickle + """ + self.var = var + def __getstate__(self): - import cloudpickle - return cloudpickle.dumps(self.x) - def __setstate__(self, ob): - import pickle - self.x = pickle.loads(ob) + return cloudpickle.dumps(self.var) + + def __setstate__(self, obs): + self.var = pickle.loads(obs) diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index d0ae455d4a..8c2c157e7a 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -1,10 +1,18 @@ +from collections import OrderedDict + import numpy as np from gym import spaces -from collections import OrderedDict + from . 
import VecEnv + class DummyVecEnv(VecEnv): def __init__(self, env_fns): + """ + Creates a simple vectorized wrapper for multiple environments + + :param env_fns: ([Gym Environment]) the list of environments to vectorize + """ self.envs = [fn() for fn in env_fns] env = self.envs[0] VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) @@ -23,9 +31,9 @@ def __init__(self, env_fns): dtypes[key] = box.dtype self.keys.append(key) - self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } + self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys} self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) - self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) + self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) self.buf_infos = [{} for _ in range(self.num_envs)] self.actions = None @@ -33,18 +41,19 @@ def step_async(self, actions): self.actions = actions def step_wait(self): - for e in range(self.num_envs): - obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(self.actions[e]) - if self.buf_dones[e]: - obs = self.envs[e].reset() - self._save_obs(e, obs) + for env_idx in range(self.num_envs): + obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ + self.envs[env_idx].step(self.actions[env_idx]) + if self.buf_dones[env_idx]: + obs = self.envs[env_idx].reset() + self._save_obs(env_idx, obs) return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), self.buf_infos.copy()) def reset(self): - for e in range(self.num_envs): - obs = self.envs[e].reset() - self._save_obs(e, obs) + for env_idx in range(self.num_envs): + obs = self.envs[env_idx].reset() + self._save_obs(env_idx, obs) return self._obs_from_buf() def close(self): @@ -53,15 +62,15 @@ def close(self): def render(self, mode='human'): return [e.render(mode=mode) for e in self.envs] - def _save_obs(self, e, obs): - for k in self.keys: - if k is None: - self.buf_obs[k][e] = obs + def _save_obs(self, env_idx, obs): + for key in self.keys: + if key is None: + self.buf_obs[key][env_idx] = obs else: - self.buf_obs[k][e] = obs[k] + self.buf_obs[key][env_idx] = obs[key] def _obs_from_buf(self): - if self.keys==[None]: + if self.keys == [None]: return self.buf_obs[None] else: return self.buf_obs diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index fb55df45d3..3dcff51671 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -1,47 +1,54 @@ -import numpy as np from multiprocessing import Process, Pipe + +import numpy as np + from baselines.common.vec_env import VecEnv, CloudpickleWrapper from baselines.common.tile_images import tile_images -def worker(remote, parent_remote, env_fn_wrapper): +def _worker(remote, parent_remote, env_fn_wrapper): parent_remote.close() - env = env_fn_wrapper.x() + env = env_fn_wrapper.var() while True: - cmd, data = remote.recv() - if cmd == 'step': - ob, reward, done, info = env.step(data) - if done: - ob = env.reset() - remote.send((ob, reward, done, info)) - elif cmd == 'reset': - ob = env.reset() - remote.send(ob) - elif cmd == 'render': - remote.send(env.render(mode='rgb_array')) - elif cmd == 'close': - remote.close() + try: + cmd, data = remote.recv() + if cmd == 'step': + observation, reward, done, info = env.step(data) + if done: + observation = env.reset() + remote.send((observation, reward, done, info)) 
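# --- illustrative example (not part of the patch) ---------------------------
# A usage sketch for the vectorized-environment API: DummyVecEnv wraps a list of
# environment factories and steps them sequentially in the current process;
# SubprocVecEnv (below) exposes the same interface but runs each environment in
# its own subprocess. The environment id 'CartPole-v0' is just an example.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
obs = venv.reset()                                   # shape (4,) + obs_space.shape
actions = [venv.action_space.sample() for _ in range(4)]
obs, rewards, dones, infos = venv.step(actions)      # each output has leading dim 4
venv.close()
# --- end of illustrative example ---------------------------------------------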
+ elif cmd == 'reset': + observation = env.reset() + remote.send(observation) + elif cmd == 'render': + remote.send(env.render(mode='rgb_array')) + elif cmd == 'close': + remote.close() + break + elif cmd == 'get_spaces': + remote.send((env.observation_space, env.action_space)) + else: + raise NotImplementedError + except EOFError: break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) - else: - raise NotImplementedError class SubprocVecEnv(VecEnv): - def __init__(self, env_fns, spaces=None): + def __init__(self, env_fns): """ - envs: list of gym environments to run in subprocesses + Creates a multiprocess vectorized wrapper for multiple environments + + :param env_fns: ([Gym Environment]) Environments to run in subprocesses """ self.waiting = False self.closed = False - nenvs = len(env_fns) - self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) - self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) - for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] - for p in self.ps: - p.daemon = True # if the main process crashes, we should not cause things to hang - p.start() + n_envs = len(env_fns) + self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(n_envs)]) + self.processes = [Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + for process in self.processes: + process.daemon = True # if the main process crashes, we should not cause things to hang + process.start() for remote in self.work_remotes: remote.close() @@ -65,21 +72,16 @@ def reset(self): remote.send(('reset', None)) return np.stack([remote.recv() for remote in self.remotes]) - def reset_task(self): - for remote in self.remotes: - remote.send(('reset_task', None)) - return np.stack([remote.recv() for remote in self.remotes]) - def close(self): if self.closed: return if self.waiting: - for remote in self.remotes: + for remote in self.remotes: remote.recv() for remote in self.remotes: remote.send(('close', None)) - for p in self.ps: - p.join() + for process in self.processes: + process.join() self.closed = True def render(self, mode='human'): @@ -89,9 +91,9 @@ def render(self, mode='human'): bigimg = tile_images(imgs) if mode == 'human': import cv2 - cv2.imshow('vecenv', bigimg[:,:,::-1]) + cv2.imshow('vecenv', bigimg[:, :, ::-1]) cv2.waitKey(1) elif mode == 'rgb_array': return bigimg else: - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/baselines/common/vec_env/vec_frame_stack.py b/baselines/common/vec_env/vec_frame_stack.py index 0bbcbdbb58..b14974b5d3 100644 --- a/baselines/common/vec_env/vec_frame_stack.py +++ b/baselines/common/vec_env/vec_frame_stack.py @@ -1,29 +1,34 @@ -from baselines.common.vec_env import VecEnvWrapper import numpy as np from gym import spaces +from baselines.common.vec_env import VecEnvWrapper + + class VecFrameStack(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, nstack): + def __init__(self, venv, n_stack): + """ + Vectorized environment base class + + :param venv: ([Gym Environment]) the list of environments to vectorize and normalize + :param n_stack: + """ self.venv = venv - self.nstack = nstack - wos = venv.observation_space # wrapped ob space - low = np.repeat(wos.low, self.nstack, axis=-1) - high = np.repeat(wos.high, self.nstack, axis=-1) - self.stackedobs = 
np.zeros((venv.num_envs,)+low.shape, low.dtype) + self.n_stack = n_stack + wrapped_obs_space = venv.observation_space + low = np.repeat(wrapped_obs_space.low, self.n_stack, axis=-1) + high = np.repeat(wrapped_obs_space.high, self.n_stack, axis=-1) + self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) VecEnvWrapper.__init__(self, venv, observation_space=observation_space) def step_wait(self): - obs, rews, news, infos = self.venv.step_wait() - self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) - for (i, new) in enumerate(news): - if new: + observations, rewards, dones, infos = self.venv.step_wait() + self.stackedobs = np.roll(self.stackedobs, shift=-observations.shape[-1], axis=-1) + for i, done in enumerate(dones): + if done: self.stackedobs[i] = 0 - self.stackedobs[..., -obs.shape[-1]:] = obs - return self.stackedobs, rews, news, infos + self.stackedobs[..., -observations.shape[-1]:] = observations + return self.stackedobs, rewards, dones, infos def reset(self): """ diff --git a/baselines/common/vec_env/vec_normalize.py b/baselines/common/vec_env/vec_normalize.py index dda767da15..e52c14b8c6 100644 --- a/baselines/common/vec_env/vec_normalize.py +++ b/baselines/common/vec_env/vec_normalize.py @@ -1,47 +1,50 @@ +import numpy as np + from baselines.common.vec_env import VecEnvWrapper from baselines.common.running_mean_std import RunningMeanStd -import numpy as np + class VecNormalize(VecEnvWrapper): - """ - Vectorized environment base class - """ - def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): + def __init__(self, venv, norm_obs=True, norm_reward=True, + clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8): + """ + A rolling average, normalizing, vectorized wrapepr for environment base class + + :param venv: ([Gym Environment]) the list of environments to vectorize and normalize + :param norm_obs: (bool) normalize observation + :param norm_reward: (bool) normalize reward with discounting (r = sum(r_old) * gamma + r_new) + :param clip_obs: (float) clipping value for nomalizing observation + :param clip_reward: (float) clipping value for nomalizing reward + :param gamma: (float) discount factor + :param epsilon: (float) epsilon value to avoid arithmetic issues + """ VecEnvWrapper.__init__(self, venv) - self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None - self.ret_rms = RunningMeanStd(shape=()) if ret else None - self.clipob = clipob - self.cliprew = cliprew + self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if norm_obs else None + self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None + self.clip_obs = clip_obs + self.clip_reward = clip_reward self.ret = np.zeros(self.num_envs) self.gamma = gamma self.epsilon = epsilon def step_wait(self): - """ - Apply sequence of actions to sequence of environments - actions -> (observations, rewards, news) - - where 'news' is a boolean vector indicating whether each element is new. 
- """ - obs, rews, news, infos = self.venv.step_wait() - self.ret = self.ret * self.gamma + rews + obs, rewards, dones, infos = self.venv.step_wait() + self.ret = self.ret * self.gamma + rewards obs = self._obfilt(obs) if self.ret_rms: self.ret_rms.update(self.ret) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) - return obs, rews, news, infos + rewards = np.clip(rewards / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) + return obs, rewards, dones, infos def _obfilt(self, obs): if self.ob_rms: self.ob_rms.update(obs) - obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) + obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), + -self.clip_obs, self.clip_obs) return obs else: return obs def reset(self): - """ - Reset all environments - """ obs = self.venv.reset() return self._obfilt(obs) diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index e2d49501c7..9085bb03b0 100644 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -4,47 +4,97 @@ import numpy as np import tensorflow as tf import tensorflow.contrib as tc +from mpi4py import MPI from baselines import logger from baselines.common.mpi_adam import MpiAdam -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util from baselines.common.mpi_running_mean_std import RunningMeanStd -from mpi4py import MPI -def normalize(x, stats): + +def normalize(tensor, stats): + """ + normalize a tensor using a running mean and std + + :param tensor: (TensorFlow Tensor) the input tensor + :param stats: (RunningMeanStd) the running mean and std of the input to normalize + :return: (TensorFlow Tensor) the normalized tensor + """ if stats is None: - return x - return (x - stats.mean) / stats.std + return tensor + return (tensor - stats.mean) / stats.std + +def denormalize(tensor, stats): + """ + denormalize a tensor using a running mean and std -def denormalize(x, stats): + :param tensor: (TensorFlow Tensor) the normalized tensor + :param stats: (RunningMeanStd) the running mean and std of the input to normalize + :return: (TensorFlow Tensor) the restored tensor + """ if stats is None: - return x - return x * stats.std + stats.mean + return tensor + return tensor * stats.std + stats.mean -def reduce_std(x, axis=None, keepdims=False): - return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) -def reduce_var(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keep_dims=True) - devs_squared = tf.square(x - m) +def reduce_std(tensor, axis=None, keepdims=False): + """ + get the standard deviation of a Tensor + + :param tensor: (TensorFlow Tensor) the input tensor + :param axis: (int or [int]) the axis to itterate the std over + :param keepdims: (bool) keep the other dimentions the same + :return: (TensorFlow Tensor) the std of the tensor + """ + return tf.sqrt(reduce_var(tensor, axis=axis, keepdims=keepdims)) + + +def reduce_var(tensor, axis=None, keepdims=False): + """ + get the variance of a Tensor + + :param tensor: (TensorFlow Tensor) the input tensor + :param axis: (int or [int]) the axis to itterate the variance over + :param keepdims: (bool) keep the other dimentions the same + :return: (TensorFlow Tensor) the variance of the tensor + """ + tensor_mean = tf.reduce_mean(tensor, axis=axis, keep_dims=True) + devs_squared = tf.square(tensor - tensor_mean) return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) -def 
get_target_updates(vars, target_vars, tau): + +def get_target_updates(_vars, target_vars, tau): + """ + get target update operations + + :param _vars: ([TensorFlow Tensor]) the initial variables + :param target_vars: ([TensorFlow Tensor]) the target variables + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update + """ logger.info('setting up target updates ...') soft_updates = [] init_updates = [] - assert len(vars) == len(target_vars) - for var, target_var in zip(vars, target_vars): + assert len(_vars) == len(target_vars) + for var, target_var in zip(_vars, target_vars): logger.info(' {} <- {}'.format(target_var.name, var.name)) init_updates.append(tf.assign(target_var, var)) soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var)) - assert len(init_updates) == len(vars) - assert len(soft_updates) == len(vars) + assert len(init_updates) == len(_vars) + assert len(soft_updates) == len(_vars) return tf.group(*init_updates), tf.group(*soft_updates) def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): + """ + get the actor update, with noise. + + :param actor: (TensorFlow Tensor) the actor + :param perturbed_actor: (TensorFlow Tensor) the pertubed actor + :param param_noise_stddev: (float) the std of the parameter noise + :return: (TensorFlow Operation) the update function + """ assert len(actor.vars) == len(perturbed_actor.vars) assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars) @@ -52,7 +102,8 @@ def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): for var, perturbed_var in zip(actor.vars, perturbed_actor.vars): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) - updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) + updates.append(tf.assign(perturbed_var, + var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) updates.append(tf.assign(perturbed_var, var)) @@ -62,10 +113,37 @@ def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev): class DDPG(object): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, - gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, - batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), - adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, - critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): + gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, + batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), + critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): + """ + Deep Deterministic Policy Gradien (DDPG) model + + DDPG: https://arxiv.org/pdf/1509.02971.pdf + + :param actor: (TensorFlow Tensor) the actor model + :param critic: (TensorFlow Tensor) the critic model + :param memory: (Memory) the replay buffer + :param observation_shape: (tuple) the observation space + :param action_shape: (tuple) the action space + :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) + :param action_noise: 
(ActionNoise) the action noise type (can be None) + :param gamma: (float) the discount rate + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :param normalize_returns: (bool) should the critic output be normalized + :param enable_popart: (bool) enable pop-art normalization of the critic output + (https://arxiv.org/pdf/1602.07714.pdf) + :param normalize_observations: (bool) should the observation be normalized + :param batch_size: (int) the size of the batch for learning the policy + :param observation_range: (tuple) the bounding values for the observation + :param action_range: (tuple) the bounding values for the actions + :param return_range: (tuple) the bounding values for the critic output + :param critic_l2_reg: (float) l2 regularizer coefficient + :param actor_lr: (float) the actor learning rate + :param critic_lr: (float) the critic learning rate + :param clip_norm: (float) clip the gradients (disabled if None) + :param reward_scale: (float) the value the reward should be scaled by + """ # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') @@ -96,6 +174,12 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg + self.target_init_updates = None + self.target_soft_updates = None + self.critic_loss = None + self.critic_grads = None + self.critic_optimizer = None + self.sess = None # Observation normalization. if self.normalize_observations: @@ -103,10 +187,10 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None - normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), - self.observation_range[0], self.observation_range[1]) - normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), - self.observation_range[0], self.observation_range[1]) + normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], + self.observation_range[1]) + normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], + self.observation_range[1]) # Return normalization. if self.normalize_returns: @@ -126,11 +210,14 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) - self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) + self.critic_tf = denormalize( + tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) - self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) - Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) - self.target_Q = self.rewards + (1. 
- self.terminals1) * gamma * Q_obs1 + self.critic_with_actor_tf = denormalize( + tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), + self.ret_rms) + q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) + self.target_q = self.rewards + (1. - self.terminals1) * gamma * q_obs1 # Set up parts. if self.param_noise is not None: @@ -143,12 +230,20 @@ def __init__(self, actor, critic, memory, observation_shape, action_shape, param self.setup_target_network_updates() def setup_target_network_updates(self): + """ + set the target update operations + """ actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau) - critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau) + critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, + self.tau) self.target_init_updates = [actor_init_updates, critic_init_updates] self.target_soft_updates = [actor_soft_updates, critic_soft_updates] def setup_param_noise(self, normalized_obs0): + """ + set the parameter noise operations + :param normalized_obs0: (TensorFlow Tensor) the normalized observation + """ assert self.param_noise is not None # Configure perturbed actor. @@ -162,26 +257,35 @@ def setup_param_noise(self, normalized_obs0): adaptive_param_noise_actor = copy(self.actor) adaptive_param_noise_actor.name = 'adaptive_param_noise_actor' adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0) - self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev) + self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, + self.param_noise_stddev) self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf))) def setup_actor_optimizer(self): + """ + setup the optimizer for the actor + """ logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) - self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) + self.actor_grads = tf_util.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) + beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_critic_optimizer(self): + """ + setup the optimizer for the critic + """ logger.info('setting up critic optimizer') - normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) + normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), + self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: - critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name] + critic_reg_vars = [var for var in self.critic.trainable_vars if + 'kernel' in var.name and 'output' not in 
var.name] for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) @@ -194,29 +298,37 @@ def setup_critic_optimizer(self): critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) - self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) - self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, - beta1=0.9, beta2=0.999, epsilon=1e-08) + self.critic_grads = tf_util.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm) + self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars, beta1=0.9, beta2=0.999, epsilon=1e-08) def setup_popart(self): - # See https://arxiv.org/pdf/1602.07714.pdf for details. + """ + setup pop-art normalization of the critic output + + See https://arxiv.org/pdf/1602.07714.pdf for details. + Preserving Outputs Precisely, while Adaptively Rescaling Targets”. + """ self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean - self.renormalize_Q_outputs_op = [] - for vs in [self.critic.output_vars, self.target_critic.output_vars]: - assert len(vs) == 2 - M, b = vs - assert 'kernel' in M.name - assert 'bias' in b.name - assert M.get_shape()[-1] == 1 - assert b.get_shape()[-1] == 1 - self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] - self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)] + self.renormalize_q_outputs_op = [] + for out_vars in [self.critic.output_vars, self.target_critic.output_vars]: + assert len(out_vars) == 2 + # wieght and bias of the last layer + weight, bias = out_vars + assert 'kernel' in weight.name + assert 'bias' in bias.name + assert weight.get_shape()[-1] == 1 + assert bias.get_shape()[-1] == 1 + self.renormalize_q_outputs_op += [weight.assign(weight * self.old_std / new_std)] + self.renormalize_q_outputs_op += [bias.assign((bias * self.old_std + self.old_mean - new_mean) / new_std)] def setup_stats(self): + """ + setup the running means and std of the inputs and outputs of the model + """ ops = [] names = [] @@ -252,58 +364,71 @@ def setup_stats(self): self.stats_ops = ops self.stats_names = names - def pi(self, obs, apply_noise=True, compute_Q=True): + def policy(self, obs, apply_noise=True, compute_q=True): + """ + Get the actions and critic output, from a given observation + + :param obs: ([float] or [int]) the observation + :param apply_noise: (bool) enable the noise + :param compute_q: (bool) compute the critic output + :return: ([float], float) the action and critic value + """ if self.param_noise is not None and apply_noise: actor_tf = self.perturbed_actor_tf else: actor_tf = self.actor_tf feed_dict = {self.obs0: [obs]} - if compute_Q: - action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) + if compute_q: + action, q_value = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict) else: action = self.sess.run(actor_tf, feed_dict=feed_dict) - q = None + q_value = None action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, 
self.action_range[0], self.action_range[1]) - return action, q + return action, q_value def store_transition(self, obs0, action, reward, obs1, terminal1): + """ + Store a transition in the replay buffer + + :param obs0: ([float] or [int]) the last observation + :param action: ([float]) the action + :param reward: (float] the reward + :param obs1: ([float] or [int]) the current observation + :param terminal1: (bool) is the episode done + """ reward *= self.reward_scale self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): + """ + run a step of training from batch + :return: (float, float) critic loss, actor loss + """ # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) if self.normalize_returns and self.enable_popart: - old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={ - self.obs1: batch['obs1'], - self.rewards: batch['rewards'], - self.terminals1: batch['terminals1'].astype('float32'), - }) - self.ret_rms.update(target_Q.flatten()) - self.sess.run(self.renormalize_Q_outputs_op, feed_dict={ - self.old_std : np.array([old_std]), - self.old_mean : np.array([old_mean]), + old_mean, old_std, target_q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_q], + feed_dict={ + self.obs1: batch['obs1'], + self.rewards: batch['rewards'], + self.terminals1: batch['terminals1'].astype('float32'), + }) + self.ret_rms.update(target_q.flatten()) + self.sess.run(self.renormalize_q_outputs_op, feed_dict={ + self.old_std: np.array([old_std]), + self.old_mean: np.array([old_mean]), }) - # Run sanity check. Disabled by default since it slows down things considerably. - # print('running sanity check') - # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={ - # self.obs1: batch['obs1'], - # self.rewards: batch['rewards'], - # self.terminals1: batch['terminals1'].astype('float32'), - # }) - # print(target_Q_new, target_Q, new_mean, new_std) - # assert (np.abs(target_Q - target_Q_new) < 1e-3).all() else: - target_Q = self.sess.run(self.target_Q, feed_dict={ + target_q = self.sess.run(self.target_q, feed_dict={ self.obs1: batch['obs1'], self.rewards: batch['rewards'], self.terminals1: batch['terminals1'].astype('float32'), @@ -314,14 +439,19 @@ def train(self): actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={ self.obs0: batch['obs0'], self.actions: batch['actions'], - self.critic_target: target_Q, + self.critic_target: target_q, }) - self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr) - self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr) + self.actor_optimizer.update(actor_grads, learning_rate=self.actor_lr) + self.critic_optimizer.update(critic_grads, learning_rate=self.critic_lr) return critic_loss, actor_loss def initialize(self, sess): + """ + initialize the model parameters and optimizers + + :param sess: (TensorFlow Session) the current TensorFlow session + """ self.sess = sess self.sess.run(tf.global_variables_initializer()) self.actor_optimizer.sync() @@ -329,9 +459,17 @@ def initialize(self, sess): self.sess.run(self.target_init_updates) def update_target_net(self): + """ + run target soft update operation + """ self.sess.run(self.target_soft_updates) def get_stats(self): + """ + Get the mean and standard deviation of the model's inputs and outputs + + :return: (dict) the means and stds + """ if 
self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. @@ -351,6 +489,11 @@ def get_stats(self): return stats def adapt_param_noise(self): + """ + calculate the adaptation for the parameter noise + + :return: (float) the mean distance for the parameter noise + """ if self.param_noise is None: return 0. @@ -369,7 +512,9 @@ def adapt_param_noise(self): return mean_distance def reset(self): - # Reset internal state after an episode is complete. + """ + Reset internal state after an episode is complete. + """ if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: diff --git a/baselines/ddpg/main.py b/baselines/ddpg/main.py index e877507b8e..48a3d78cec 100644 --- a/baselines/ddpg/main.py +++ b/baselines/ddpg/main.py @@ -1,22 +1,33 @@ import argparse import time import os -import logging + +import gym +import tensorflow as tf +import numpy as np +from mpi4py import MPI + from baselines import logger, bench -from baselines.common.misc_util import ( - set_global_seeds, - boolean_flag, -) +from baselines.common.misc_util import set_global_seeds, boolean_flag import baselines.ddpg.training as training from baselines.ddpg.models import Actor, Critic from baselines.ddpg.memory import Memory -from baselines.ddpg.noise import * +from baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise -import gym -import tensorflow as tf -from mpi4py import MPI def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): + """ + run the training of DDPG + + :param env_id: (str) the environment ID + :param seed: (int) the initial random seed + :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by + seperating them with commas + :param layer_norm: (bool) use layer normalization + :param evaluation: (bool) enable evaluation of DDPG training + :param kwargs: (dict) extra keywords for the training.train function + """ + # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: @@ -26,7 +37,7 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): env = gym.make(env_id) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) - if evaluation and rank==0: + if evaluation and rank == 0: eval_env = gym.make(env_id) eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = bench.Monitor(env, None) @@ -46,10 +57,11 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') - action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) + action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), + sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) @@ -70,8 +82,8 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Disable logging for rank != 0 to avoid noise. 
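# --- illustrative example (not part of the patch) ---------------------------
# A sketch of what the comma-separated --noise-type specs parsed by run() above
# map to, assuming a 3-dimensional action space (nb_actions is hypothetical):
# 'adaptive-param_0.2' perturbs the actor parameters, 'normal_0.1' adds Gaussian
# action noise, 'ou_0.3' adds temporally correlated Ornstein-Uhlenbeck noise.
import numpy as np
from baselines.ddpg.noise import (AdaptiveParamNoiseSpec, NormalActionNoise,
                                  OrnsteinUhlenbeckActionNoise)

nb_actions = 3
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
normal_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                 sigma=0.1 * np.ones(nb_actions))
ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                        sigma=0.3 * np.ones(nb_actions))
# action noise is added to the deterministic action and clipped to the action range:
noisy_action = np.clip(np.zeros(nb_actions) + ou_noise(), -1.0, 1.0)
# --- end of illustrative example ---------------------------------------------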
if rank == 0: start_time = time.time() - training.train(env=env, eval_env=eval_env, param_noise=param_noise, - action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) + training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, + critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() @@ -80,6 +92,11 @@ def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): def parse_args(): + """ + parse the arguments for DDPG training + + :return: (dict) the arguments + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') @@ -102,14 +119,15 @@ def parse_args(): parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker - parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') # choices are adaptive-param_xx, ou_xx, normal_xx, none + # choices are adaptive-param_xx, ou_xx, normal_xx, none + parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') parser.add_argument('--num-timesteps', type=int, default=None) boolean_flag(parser, 'evaluation', default=False) args = parser.parse_args() # we don't directly specify timesteps for this script, so make sure that if we do specify them # they agree with the other parameters if args.num_timesteps is not None: - assert(args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps) + assert args.num_timesteps == args.nb_epochs * args.nb_epoch_cycles * args.nb_rollout_steps dict_args = vars(args) del dict_args['num_timesteps'] return dict_args diff --git a/baselines/ddpg/memory.py b/baselines/ddpg/memory.py index 90f0f9a18a..474c42a82b 100644 --- a/baselines/ddpg/memory.py +++ b/baselines/ddpg/memory.py @@ -3,6 +3,13 @@ class RingBuffer(object): def __init__(self, maxlen, shape, dtype='float32'): + """ + A buffer object, when full restarts at the initial position + + :param maxlen: (int) the max number of numpy objects to store + :param shape: (tuple) the shape of the numpy objects you want to store + :param dtype: (str) the name of the type of the numpy object you want to store + """ self.maxlen = maxlen self.start = 0 self.length = 0 @@ -17,9 +24,20 @@ def __getitem__(self, idx): return self.data[(self.start + idx) % self.maxlen] def get_batch(self, idxs): + """ + get the value at the indexes + + :param idxs: (int or numpy int) the indexes + :return: (numpy Any) the stored information in the buffer at the asked positions + """ return self.data[(self.start + idxs) % self.maxlen] - def append(self, v): + def append(self, var): + """ + Append an object to the buffer + + :param var: (numpy Any) the object you wish to add + """ if self.length < self.maxlen: # We have space, simply increase the length. self.length += 1 @@ -29,18 +47,31 @@ def append(self, v): else: # This should never happen. 
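# --- illustrative example (not part of the patch) ---------------------------
# A small sketch of the RingBuffer behaviour described above: once maxlen is
# reached it starts overwriting its oldest entries, which is what bounds the
# size of the replay memory.
from baselines.ddpg.memory import RingBuffer

ring = RingBuffer(maxlen=3, shape=())
for value in (1.0, 2.0, 3.0, 4.0):
    ring.append(value)
assert ring[0] == 2.0 and ring[2] == 4.0  # 1.0 has been overwritten by 4.0
# --- end of illustrative example ---------------------------------------------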
raise RuntimeError() - self.data[(self.start + self.length - 1) % self.maxlen] = v + self.data[(self.start + self.length - 1) % self.maxlen] = var -def array_min2d(x): - x = np.array(x) - if x.ndim >= 2: - return x - return x.reshape(-1, 1) +def array_min2d(arr): + """ + cast to numpy array, and make sure it is of 2 dim + + :param arr: ([Any]) the array to clean + :return: (numpy Any) the cleaned array + """ + arr = np.array(arr) + if arr.ndim >= 2: + return arr + return arr.reshape(-1, 1) class Memory(object): def __init__(self, limit, action_shape, observation_shape): + """ + The replay buffer object + + :param limit: (int) the max number of transitions to store + :param action_shape: (tuple) the action shape + :param observation_shape: (tuple) the observation shape + """ self.limit = limit self.observations0 = RingBuffer(limit, shape=observation_shape) @@ -50,6 +81,12 @@ def __init__(self, limit, action_shape, observation_shape): self.observations1 = RingBuffer(limit, shape=observation_shape) def sample(self, batch_size): + """ + sample a random batch from the buffer + + :param batch_size: (int) the number of element to sample for the batch + :return: (dict) the sampled batch + """ # Draw such that we always have a proceeding element. batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size) @@ -69,6 +106,16 @@ def sample(self, batch_size): return result def append(self, obs0, action, reward, obs1, terminal1, training=True): + """ + Append a transition to the buffer + + :param obs0: ([float] or [int]) the last observation + :param action: ([float]) the action + :param reward: (float] the reward + :param obs1: ([float] or [int]) the current observation + :param terminal1: (bool) is the episode done + :param training: (bool) is the RL model training or not + """ if not training: return diff --git a/baselines/ddpg/models.py b/baselines/ddpg/models.py index dc5803a035..efb89360ee 100644 --- a/baselines/ddpg/models.py +++ b/baselines/ddpg/models.py @@ -3,8 +3,14 @@ class Model(object): - def __init__(self, name): + def __init__(self, name, layer_norm=True): + """ + A TensorFlow Model type + + :param name: (str) the name of the model + """ self.name = name + self.layer_norm = layer_norm @property def vars(self): @@ -18,10 +24,27 @@ def trainable_vars(self): def perturbable_vars(self): return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] + def fc_with_relu(self, input_tensor): + """ + Fully connected layer followed by ReLU + with optional batchnorm + """ + preactivation = tf.layers.dense(input_tensor, 64) + if self.layer_norm: + preactivation = tc.layers.layer_norm(preactivation, center=True, scale=True) + return tf.nn.relu(preactivation) + class Actor(Model): def __init__(self, nb_actions, name='actor', layer_norm=True): - super(Actor, self).__init__(name=name) + """ + A TensorFlow Actor model, this is used to output the actions + + :param nb_actions: (int) the size of the action space + :param name: (str) the name of the model (default: 'actor') + :param layer_norm: (bool) enable layer normalization + """ + super(Actor, self).__init__(name=name, layer_norm=layer_norm) self.nb_actions = nb_actions self.layer_norm = layer_norm @@ -30,25 +53,23 @@ def __call__(self, obs, reuse=False): if reuse: scope.reuse_variables() - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, 
scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - x = tf.nn.tanh(x) - return x + layer_1 = self.fc_with_relu(obs) + layer_2 = self.fc_with_relu(layer_1) + last_layer = tf.layers.dense(layer_2, self.nb_actions, + kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) + squashed_out = tf.nn.tanh(last_layer) + return squashed_out class Critic(Model): def __init__(self, name='critic', layer_norm=True): - super(Critic, self).__init__(name=name) + """ + A TensorFlow Critic model, this is used to output the value of a state + + :param name: (str) the name of the model (default: 'critic') + :param layer_norm: (bool) enable layer normalization + """ + super(Critic, self).__init__(name=name, layer_norm=layer_norm) self.layer_norm = layer_norm def __call__(self, obs, action, reuse=False): @@ -56,20 +77,12 @@ def __call__(self, obs, action, reuse=False): if reuse: scope.reuse_variables() - x = obs - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.concat([x, action], axis=-1) - x = tf.layers.dense(x, 64) - if self.layer_norm: - x = tc.layers.layer_norm(x, center=True, scale=True) - x = tf.nn.relu(x) - - x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) - return x + layer_1 = self.fc_with_relu(obs) + layer_2 = tf.concat([layer_1, action], axis=-1) + layer_3 = self.fc_with_relu(layer_2) + value = tf.layers.dense(layer_3, 1, + kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) + return value @property def output_vars(self): diff --git a/baselines/ddpg/noise.py b/baselines/ddpg/noise.py index c48d0d6a22..46c1da8011 100644 --- a/baselines/ddpg/noise.py +++ b/baselines/ddpg/noise.py @@ -3,6 +3,13 @@ class AdaptiveParamNoiseSpec(object): def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): + """ + Implements adaptive parameter noise + + :param initial_stddev: (float) the initial value for the standard deviation of the noise + :param desired_action_stddev: (float) the desired value for the standard deviation of the noise + :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise + """ self.initial_stddev = initial_stddev self.desired_action_stddev = desired_action_stddev self.adoption_coefficient = adoption_coefficient @@ -10,6 +17,11 @@ def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coeff self.current_stddev = initial_stddev def adapt(self, distance): + """ + update the standard deviation for the parameter noise + + :param distance: (float) the noise distance applied to the parameters + """ if distance > self.desired_action_stddev: # Decrease stddev. 
self.current_stddev /= self.adoption_coefficient @@ -18,10 +30,12 @@ def adapt(self, distance): self.current_stddev *= self.adoption_coefficient def get_stats(self): - stats = { - 'param_noise_stddev': self.current_stddev, - } - return stats + """ + return the standard deviation for the parameter noise + + :return: (dict) the stats of the noise + """ + return {'param_noise_stddev': self.current_stddev} def __repr__(self): fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' @@ -29,39 +43,66 @@ def __repr__(self): class ActionNoise(object): + """ + The action noise base class + """ def reset(self): + """ + call end of episode reset for the noise + """ pass class NormalActionNoise(ActionNoise): - def __init__(self, mu, sigma): - self.mu = mu - self.sigma = sigma + def __init__(self, mean, sigma): + """ + A guassian action noise + + :param mean: (float) the mean value of the noise + :param sigma: (float) the scale of the noise (std here) + """ + self._mu = mean + self._sigma = sigma def __call__(self): - return np.random.normal(self.mu, self.sigma) + return np.random.normal(self._mu, self._sigma) def __repr__(self): - return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) + return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) -# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab class OrnsteinUhlenbeckActionNoise(ActionNoise): - def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): - self.theta = theta - self.mu = mu - self.sigma = sigma - self.dt = dt - self.x0 = x0 + def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None): + """ + A Ornstein Uhlenbeck action noise, this is designed to aproximate brownian motion with friction. 
+ + Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab + + :param mean: (float) the mean of the noise + :param sigma: (float) the scale of the noise + :param theta: (float) the rate of mean reversion + :param dt: (float) the timestep for the noise + :param initial_noise: ([float]) the initial value for the noise output, (if None: 0) + """ + self._theta = theta + self._mu = mean + self._sigma = sigma + self._dt = dt + self.initial_noise = initial_noise + self.noise_prev = None self.reset() def __call__(self): - x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) - self.x_prev = x - return x + noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \ + self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape) + self.noise_prev = noise + return noise def reset(self): - self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) + """ + reset the Ornstein Uhlenbeck noise, to the initial position + """ + self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu) def __repr__(self): - return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) + return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) diff --git a/baselines/ddpg/training.py b/baselines/ddpg/training.py index 74a9b8fd1c..454279dc3b 100644 --- a/baselines/ddpg/training.py +++ b/baselines/ddpg/training.py @@ -3,43 +3,72 @@ from collections import deque import pickle -from baselines.ddpg.ddpg import DDPG -import baselines.common.tf_util as U - -from baselines import logger import numpy as np import tensorflow as tf from mpi4py import MPI +from baselines.ddpg.ddpg import DDPG +import baselines.common.tf_util as tf_util +from baselines import logger + def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic, - normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, - popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, - tau=0.01, eval_env=None, param_noise_adaption_interval=50): + normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise, + popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory, + tau=0.01, eval_env=None, param_noise_adaption_interval=50): + """ + Runs the training of the Deep Deterministic Policy Gradien (DDPG) model + + DDPG: https://arxiv.org/pdf/1509.02971.pdf + + :param env: (Gym Environment) the environment + :param nb_epochs: (int) the number of training epochs + :param nb_epoch_cycles: (int) the number cycles within each epoch + :param render_eval: (bool) enable rendering of the evalution environment + :param reward_scale: (float) the value the reward should be scaled by + :param render: (bool) enable rendering of the environment + :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) + :param actor: (TensorFlow Tensor) the actor model + :param critic: (TensorFlow Tensor) the critic model + :param normalize_returns: (bool) should the critic output be normalized + :param normalize_observations: (bool) should the observation be normalized + :param critic_l2_reg: (float) l2 regularizer coefficient + :param actor_lr: (float) the actor learning rate + :param critic_lr: (float) the critic learning rate + :param 
action_noise: (ActionNoise) the action noise type (can be None) + :param popart: (bool) enable pop-art normalization of the critic output + (https://arxiv.org/pdf/1602.07714.pdf) + :param gamma: (float) the discount rate + :param clip_norm: (float) clip the gradients (disabled if None) + :param nb_train_steps: (int) the number of training steps + :param nb_rollout_steps: (int) the number of rollout steps + :param nb_eval_steps: (int) the number of evalutation steps + :param batch_size: (int) the size of the batch for learning the policy + :param memory: (Memory) the replay buffer + :param tau: (float) the soft update coefficient (keep old values, between 0 and 1) + :param eval_env: (Gym Environment) the evaluation environment (can be None) + :param param_noise_adaption_interval: (int) apply param noise every N steps + """ rank = MPI.COMM_WORLD.Get_rank() assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) - agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, - gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, - batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, - actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, - reward_scale=reward_scale) + agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, param_noise=param_noise, + action_noise=action_noise, gamma=gamma, tau=tau, normalize_returns=normalize_returns, + enable_popart=popart, normalize_observations=normalize_observations, batch_size=batch_size, + critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, clip_norm=clip_norm, + reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) # Set up logging stuff only for a single worker. if rank == 0: - saver = tf.train.Saver() - else: - saver = None + tf.train.Saver() - step = 0 - episode = 0 eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) - with U.single_threaded_session() as sess: + with tf_util.single_threaded_session() as sess: # Prepare everything. agent.initialize(sess) sess.graph.finalize() @@ -48,46 +77,42 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() - done = False episode_reward = 0. episode_step = 0 episodes = 0 - t = 0 + step = 0 - epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] - epoch_episode_eval_rewards = [] - epoch_episode_eval_steps = [] - epoch_start_time = time.time() epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): - for cycle in range(nb_epoch_cycles): + for _ in range(nb_epoch_cycles): # Perform rollouts. - for t_rollout in range(nb_rollout_steps): + for _ in range(nb_rollout_steps): # Predict next action. - action, q = agent.pi(obs, apply_noise=True, compute_Q=True) + action, q_value = agent.policy(obs, apply_noise=True, compute_q=True) assert action.shape == env.action_space.shape # Execute next action. 
if rank == 0 and render: env.render() assert max_action.shape == action.shape - new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) - t += 1 + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + new_obs, reward, done, _ = env.step(max_action * action) + step += 1 if rank == 0 and render: env.render() - episode_reward += r + episode_reward += reward episode_step += 1 # Book-keeping. epoch_actions.append(action) - epoch_qs.append(q) - agent.store_transition(obs, action, r, new_obs, done) + epoch_qs.append(q_value) + agent.store_transition(obs, action, reward, new_obs, done) obs = new_obs if done: @@ -113,9 +138,9 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) - cl, al = agent.train() - epoch_critic_losses.append(cl) - epoch_actor_losses.append(al) + critic_loss, actor_loss = agent.train() + epoch_critic_losses.append(critic_loss) + epoch_actor_losses.append(actor_loss) agent.update_target_net() # Evaluate. @@ -123,9 +148,10 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa eval_qs = [] if eval_env is not None: eval_episode_reward = 0. - for t_rollout in range(nb_eval_steps): - eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True) - eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + for _ in range(nb_eval_steps): + eval_action, eval_q = agent.policy(eval_obs, apply_noise=False, compute_q=True) + # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + eval_obs, eval_r, eval_done, _ = eval_env.step(max_action * eval_action) if render_eval: eval_env.render() eval_episode_reward += eval_r @@ -152,7 +178,7 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration - combined_stats['total/steps_per_second'] = float(t) / float(duration) + combined_stats['total/steps_per_second'] = float(step) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) @@ -162,20 +188,27 @@ def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, pa combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) - def as_scalar(x): - if isinstance(x, np.ndarray): - assert x.size == 1 - return x[0] - elif np.isscalar(x): - return x + + def as_scalar(scalar): + """ + check and return the input if it is a scalar, otherwise raise ValueError + + :param scalar: (Any) the object to check + :return: (Number) the scalar if x is a scalar + """ + if isinstance(scalar, np.ndarray): + assert scalar.size == 1 + return scalar[0] + elif np.isscalar(scalar): + return scalar else: - raise ValueError('expected scalar, got %s'%x) + raise ValueError('expected scalar, got %s' % scalar) combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) - combined_stats = {k : v / mpi_size for (k,v) in 
zip(combined_stats.keys(), combined_stats_sums)} + combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 - combined_stats['total/steps'] = t + combined_stats['total/steps'] = step for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) @@ -184,8 +217,8 @@ def as_scalar(x): logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): - with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: - pickle.dump(env.get_state(), f) + with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler: + pickle.dump(env.get_state(), file_handler) if eval_env and hasattr(eval_env, 'get_state'): - with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: - pickle.dump(eval_env.get_state(), f) + with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler: + pickle.dump(eval_env.get_state(), file_handler) diff --git a/baselines/deepq/__init__.py b/baselines/deepq/__init__.py index 4472399a51..d30bf14b5f 100644 --- a/baselines/deepq/__init__.py +++ b/baselines/deepq/__init__.py @@ -3,6 +3,13 @@ from baselines.deepq.simple import learn, load # noqa from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa + def wrap_atari_dqn(env): + """ + wrap the environment in atari wrappers for DeepQ + + :param env: (Gym Environment) the environment + :return: (Gym Environment) the wrapped environment + """ from baselines.common.atari_wrappers import wrap_deepmind - return wrap_deepmind(env, frame_stack=True, scale=True) \ No newline at end of file + return wrap_deepmind(env, frame_stack=True, scale=True) diff --git a/baselines/deepq/build_graph.py b/baselines/deepq/build_graph.py index e9ff1a41a3..1b4a3aae6a 100644 --- a/baselines/deepq/build_graph.py +++ b/baselines/deepq/build_graph.py @@ -6,46 +6,28 @@ Function to chose an action given an observation - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens - (default: no update) - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. + :param observation: (Any) Observation that can be feed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon a new value, if negative not update happens (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. 
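For reference, a minimal rollout sketch (illustrative only; the helper name `run_episode` and the epsilon handling are not part of this patch) of how an act function with the interface above is typically driven with a batched observation:

```python
import numpy as np

def run_episode(act, env, epsilon):
    """Roll out one episode with an act(observation, stochastic, update_eps) function."""
    obs, done, total_reward = env.reset(), False, 0.0
    while not done:
        # act expects a batch of observations and returns one int64 action per element,
        # so add a leading batch dimension and unpack the single action.
        action = act(np.array(obs)[None], update_eps=epsilon)[0]
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward
```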
======= act (in case of parameter noise) ======== Function to chose an action given an observation - Parameters - ---------- - observation: object - Observation that can be feed into the output of make_obs_ph - stochastic: bool - if set to False all the actions are always deterministic (default False) - update_eps_ph: float - update epsilon a new value, if negative not update happens + :param observation: (Any) Observation that can be feed into the output of make_obs_ph + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps_ph: (float) update epsilon a new value, if negative not update happens (default: no update) - reset_ph: bool - reset the perturbed policy by sampling a new perturbation - update_param_noise_threshold_ph: float - the desired threshold for the difference between non-perturbed and perturbed policy - update_param_noise_scale_ph: bool - whether or not to update the scale of the noise for the next time it is re-perturbed - - Returns - ------- - Tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for - every element of the batch. + :param reset_ph: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold_ph: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + :param update_param_noise_scale_ph: (bool) whether or not to update the scale of the noise for the next time it is + re-perturbed + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be performed for + every element of the batch. ======= train ======= @@ -55,30 +37,17 @@ td_error = Q(s,a) - (r + gamma * max_a' Q(s', a')) loss = huber_loss[td_error] - Parameters - ---------- - obs_t: object - a batch of observations - action: np.array - actions that were selected upon seeing obs_t. - dtype must be int32 and shape must be (batch_size,) - reward: np.array - immediate reward attained after executing those actions - dtype must be float32 and shape must be (batch_size,) - obs_tp1: object - observations that followed obs_t - done: np.array - 1 if obs_t was the last observation in the episode and 0 otherwise - obs_tp1 gets ignored, but must be of the valid shape. - dtype must be float32 and shape must be (batch_size,) - weight: np.array - imporance weights for every element of the batch (gradient is multiplied - by the importance weight) dtype must be float32 and shape must be (batch_size,) - - Returns - ------- - td_error: np.array - a list of differences between Q(s,a) and the target in Bellman's equation. + :param obs_t: (Any) a batch of observations + :param action: (numpy int) actions that were selected upon seeing obs_t. dtype must be int32 and shape must be + (batch_size,) + :param reward: (numpy float) immediate reward attained after executing those actions dtype must be float32 and + shape must be (batch_size,) + :param obs_tp1: (Any) observations that followed obs_t + :param done: (numpy bool) 1 if obs_t was the last observation in the episode and 0 otherwise obs_tp1 gets ignored, + but must be of the valid shape. dtype must be float32 and shape must be (batch_size,) + :param weight: (numpy float) imporance weights for every element of the batch (gradient is multiplied by the + importance weight) dtype must be float32 and shape must be (batch_size,) + :return: (numpy float) td_error: a list of differences between Q(s,a) and the target in Bellman's equation. 
dtype is float32 and shape is (batch_size,) ======= update_target ======== @@ -94,23 +63,17 @@ """ import tensorflow as tf -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_utils def scope_vars(scope, trainable_only=False): """ Get variables inside a scope The scope can be specified as a string - Parameters - ---------- - scope: str or VariableScope - scope in which the variables reside. - trainable_only: bool - whether or not to return only the variables that were marked as trainable. - Returns - ------- - vars: [tf.Variable] - list of variables in `scope`. + + :param scope: (str or VariableScope) scope in which the variables reside. + :param trainable_only: (bool) whether or not to return only the variables that were marked as trainable. + :return: ([TensorFlow Tensor]) vars: list of variables in `scope`. """ return tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.GLOBAL_VARIABLES, @@ -119,16 +82,30 @@ def scope_vars(scope, trainable_only=False): def scope_name(): - """Returns the name of current scope as a string, e.g. deepq/q_func""" + """ + Returns the name of current scope as a string, e.g. deepq/q_func + + :return: (str) the name of current scope + """ return tf.get_variable_scope().name def absolute_scope_name(relative_scope_name): - """Appends parent scope name to `relative_scope_name`""" + """ + Appends parent scope name to `relative_scope_name` + + :return: (str) the absolute name of the scope + """ return scope_name() + "/" + relative_scope_name def default_param_noise_filter(var): + """ + check whether or not a variable is perturbable or not + + :param var: (TensorFlow Tensor) the variable + :return: (bool) can be perturb + """ if var not in tf.trainable_variables(): # We never perturb non-trainable vars. return False @@ -146,11 +123,9 @@ def default_param_noise_filter(var): def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): """Creates the act function: - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: observation_in: object the output of observation placeholder @@ -160,18 +135,11 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. 
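As a point of reference, a q_func compatible with the documented (observation_in, num_actions, scope, reuse) contract can be a small sketch like the one below (an assumption for illustration; compare the custom cartpole model later in this patch):

```python
import tensorflow as tf
import tensorflow.contrib.layers as layers

def simple_q_func(observation_in, num_actions, scope, reuse=False):
    """Map a batch of observations to one Q-value per action."""
    with tf.variable_scope(scope, reuse=reuse):
        hidden = layers.fully_connected(observation_in, num_outputs=64, activation_fn=tf.nn.tanh)
        return layers.fully_connected(hidden, num_outputs=num_actions, activation_fn=None)
```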
""" with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") @@ -190,23 +158,26 @@ def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True}, - updates=[update_eps_expr]) - def act(ob, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps) + _act = tf_utils.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True}, + updates=[update_eps_expr]) + + def act(obs, stochastic=True, update_eps=-1): + return _act(obs, stochastic, update_eps) + return act -def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None): +def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, + param_noise_filter_func=None): """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): Parameters ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that take a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: observation_in: object the output of observation placeholder @@ -216,21 +187,14 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. + :param num_actions: (int) number of actions. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select and action given + observation. See the top of the file for details. 
""" if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter @@ -244,18 +208,28 @@ def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reset_ph = tf.placeholder(tf.bool, (), name="reset") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) - param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) - param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) + param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), + trainable=False) + param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), + trainable=False) # Unmodified Q. q_values = q_func(observations_ph.get(), num_actions, scope="q_func") # Perturbable Q used for the actual rollout. q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") - # We have to wrap this code into a function due to the way tf.cond() works. See - # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for - # a more detailed discussion. + def perturb_vars(original_scope, perturbed_scope): + """ + We have to wrap this code into a function due to the way tf.cond() works. + + See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed + discussion. + + :param original_scope: (str or VariableScope) the original scope. + :param perturbed_scope: (str or VariableScope) the perturbed scope. + :return: (TensorFlow Operation) + """ all_vars = scope_vars(absolute_scope_name(original_scope)) all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) assert len(all_vars) == len(all_perturbed_vars) @@ -263,11 +237,13 @@ def perturb_vars(original_scope, perturbed_scope): for var, perturbed_var in zip(all_vars, all_perturbed_vars): if param_noise_filter_func(perturbed_var): # Perturb this variable. - op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) + operation = tf.assign(perturbed_var, + var + tf.random_normal(shape=tf.shape(var), mean=0., + stddev=param_noise_scale)) else: # Do not perturb, just assign. - op = tf.assign(perturbed_var, var) - perturb_ops.append(op) + operation = tf.assign(perturbed_var, var) + perturb_ops.append(operation) assert len(perturb_ops) == len(all_vars) return tf.group(*perturb_ops) @@ -276,19 +252,28 @@ def perturb_vars(original_scope, perturbed_scope): # is too big, reduce scale of perturbation, otherwise increase. 
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") - kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1) - mean_kl = tf.reduce_mean(kl) + kl_loss = tf.reduce_sum( + tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), + axis=-1) + mean_kl = tf.reduce_mean(kl_loss) + def update_scale(): + """ + update the scale expression + + :return: (TensorFlow Tensor) the updated scale expression + """ with tf.control_dependencies([perturb_for_adaption]): update_scale_expr = tf.cond(mean_kl < param_noise_threshold, - lambda: param_noise_scale.assign(param_noise_scale * 1.01), - lambda: param_noise_scale.assign(param_noise_scale / 1.01), - ) + lambda: param_noise_scale.assign(param_noise_scale * 1.01), + lambda: param_noise_scale.assign(param_noise_scale / 1.01), + ) return update_scale_expr # Functionality to update the threshold for parameter space noise. - update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0, - lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) + update_param_noise_thres_expr = param_noise_threshold.assign( + tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, + lambda: param_noise_threshold)) # Put everything together. deterministic_actions = tf.argmax(q_values_perturbed, axis=1) @@ -301,77 +286,83 @@ def update_scale(): update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) updates = [ update_eps_expr, - tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])), + tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), + lambda: tf.group(*[])), tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), - update_param_noise_threshold_expr, + update_param_noise_thres_expr, ] - _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], - outputs=output_actions, - givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, - updates=updates) - def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): - return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) + _act = tf_utils.function( + inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, + update_param_noise_scale_ph], + outputs=output_actions, + givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, + update_param_noise_scale_ph: False}, + updates=updates) + + def act(obs, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): + """ + get the action from the current observation + + :param obs: (Any) Observation that can be feed into the output of make_obs_ph + :param reset: (bool) reset the perturbed policy by sampling a new perturbation + :param update_param_noise_threshold: (float) the desired threshold for the difference between + non-perturbed and perturbed policy + 
:param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time + it is re-perturbed + :param stochastic: (bool) if set to False all the actions are always deterministic (default False) + :param update_eps: (float) update epsilon a new value, if negative not update happens + (default: no update) + :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be + performed for every element of the batch. + """ + return _act(obs, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) + return act def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, - double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): - """Creates the train function: + double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): + """ + Creates the train function: - Parameters - ---------- - make_obs_ph: str -> tf.placeholder or TfInput - a function that takes a name and creates a placeholder of input with that name - q_func: (tf.Variable, int, str, bool) -> tf.Variable + :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of + input with that name + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope - and returns a tensor of shape (batch_size, num_actions) with values of every action. - num_actions: int - number of actions - reuse: bool - whether or not to reuse the graph variables - optimizer: tf.train.Optimizer - optimizer to use for the Q-learning objective. - grad_norm_clipping: float or None - clip gradient norms to this value. If None no clipping is performed. - gamma: float - discount rate. - double_q: bool - if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). - In general it is a good idea to keep it enabled. - scope: str or VariableScope - optional scope for variable_scope. - reuse: bool or None - whether or not the variables should be reused. To be able to reuse the scope must be given. - param_noise: bool - whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) - param_noise_filter_func: tf.Variable -> bool - function that decides whether or not a variable should be perturbed. Only applicable - if param_noise is True. If set to None, default_param_noise_filter is used by default. - - Returns - ------- - act: (tf.Variable, bool, float) -> tf.Variable - function to select and action given observation. -` See the top of the file for details. - train: (object, np.array, np.array, object, np.array, np.array) -> np.array - optimize the error in Bellman's equation. -` See the top of the file for details. - update_target: () -> () - copy the parameters from optimized Q function to the target Q function. -` See the top of the file for details. - debug: {str: function} - a bunch of functions to print debug data like q_values. + - observation_in: (Any) the output of observation placeholder + - num_actions: int number of actions + - scope: (str) + - reuse: (bool) + + should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) + with values of every action. 
+ :param num_actions: (int) number of actions + :param reuse: (bool) whether or not to reuse the graph variables + :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective. + :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed. + :param gamma: (float) discount rate. + :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a + good idea to keep it enabled. + :param scope: (str or VariableScope) optional scope for variable_scope. + :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given. + :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) + :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a + variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter + is used by default. + + :return: (tuple) + + act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select and action given + observation. See the top of the file for details. + train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float) + optimize the error in Bellman's equation. See the top of the file for details. + update_target: (function) copy the parameters from optimized Q function to the target Q function. + See the top of the file for details. + debug: ({str: function}) a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, - param_noise_filter_func=param_noise_filter_func) + param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) @@ -390,7 +381,8 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") - target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") + target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, + scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) @@ -409,7 +401,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - errors = U.huber_loss(td_error) + errors = tf_utils.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) @@ -430,7 +422,7 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= update_target_expr = tf.group(*update_target_expr) # Create callable functions - train = U.function( + train = tf_utils.function( inputs=[ obs_t_input, act_t_ph, @@ -442,8 +434,8 @@ def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping= outputs=td_error, updates=[optimize_expr] ) - update_target = U.function([], [], updates=[update_target_expr]) + update_target = tf_utils.function([], [], updates=[update_target_expr]) - q_values = U.function([obs_t_input], q_t) + q_values = tf_utils.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values} diff --git a/baselines/deepq/experiments/custom_cartpole.py b/baselines/deepq/experiments/custom_cartpole.py index b5a381a37e..8fb9fc0bb9 100644 --- a/baselines/deepq/experiments/custom_cartpole.py +++ b/baselines/deepq/experiments/custom_cartpole.py @@ -1,20 +1,28 @@ -import gym import itertools +import argparse + +import gym import numpy as np import tensorflow as tf import tensorflow.contrib.layers as layers -import baselines.common.tf_util as U - -from baselines import logger -from baselines import deepq +import baselines.common.tf_util as tf_utils +from baselines import logger, deepq from baselines.deepq.replay_buffer import ReplayBuffer from baselines.deepq.utils import ObservationInput from baselines.common.schedules import LinearSchedule def model(inpt, num_actions, scope, reuse=False): - """This model takes as input an observation and returns values of all actions.""" + """ + This model takes as input an observation and returns values of all actions. + + :param inpt: (TensorFlow Tensor) the input placeholder + :param num_actions: (int) size of the action space + :param scope: (str) the variable scope + :param reuse: (bool) is a reusable model + :return: (TensorFlow Tensor) + """ with tf.variable_scope(scope, reuse=reuse): out = inpt out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) @@ -23,7 +31,13 @@ def model(inpt, num_actions, scope, reuse=False): if __name__ == '__main__': - with U.make_session(8): + parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + parser.add_argument('--max-timesteps', default=50000, type=int, + help="Maximum number of timesteps when not rendering") + args = parser.parse_args() + + with tf_utils.make_session(8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model @@ -40,7 +54,7 @@ def model(inpt, num_actions, scope, reuse=False): exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. 
- U.initialize() + tf_utils.initialize() update_target() episode_rewards = [0.0] @@ -58,8 +72,19 @@ def model(inpt, num_actions, scope, reuse=False): obs = env.reset() episode_rewards.append(0) - is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) + + is_solved = t > 100 and mean_100ep_reward >= 200 + + if args.no_render and t > args.max_timesteps: + break + if is_solved: + if args.no_render: + break # Show off the result env.render() else: @@ -74,6 +99,6 @@ def model(inpt, num_actions, scope, reuse=False): if done and len(episode_rewards) % 10 == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", len(episode_rewards)) - logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) + logger.record_tabular("mean episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() diff --git a/baselines/deepq/experiments/enjoy_cartpole.py b/baselines/deepq/experiments/enjoy_cartpole.py index 1c6176bac3..378e29380f 100644 --- a/baselines/deepq/experiments/enjoy_cartpole.py +++ b/baselines/deepq/experiments/enjoy_cartpole.py @@ -1,9 +1,16 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + run a trained model for the cartpole problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("CartPole-v0") act = deepq.load("cartpole_model.pkl") @@ -11,11 +18,18 @@ def main(): obs, done = env.reset(), False episode_rew = 0 while not done: - env.render() + if not args.no_render: + env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Enjoy trained DQN on cartpole") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/enjoy_mountaincar.py b/baselines/deepq/experiments/enjoy_mountaincar.py index 8bced8c0f8..73396e2aa0 100644 --- a/baselines/deepq/experiments/enjoy_mountaincar.py +++ b/baselines/deepq/experiments/enjoy_mountaincar.py @@ -1,9 +1,16 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + run a trained model for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("MountainCar-v0") act = deepq.load("mountaincar_model.pkl") @@ -11,11 +18,18 @@ def main(): obs, done = env.reset(), False episode_rew = 0 while not done: - env.render() + if not args.no_render: + env.render() obs, rew, done, _ = env.step(act(obs[None])[0]) episode_rew += rew print("Episode reward", episode_rew) + # No render is only used for automatic testing + if args.no_render: + break if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Enjoy trained DQN on MountainCar") + parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/enjoy_pong.py b/baselines/deepq/experiments/enjoy_pong.py index 5b16fec6b6..0c4db3e2a9 100644 --- a/baselines/deepq/experiments/enjoy_pong.py +++ 
b/baselines/deepq/experiments/enjoy_pong.py @@ -1,8 +1,12 @@ import gym + from baselines import deepq def main(): + """ + run a trained model for the pong problem + """ env = gym.make("PongNoFrameskip-v4") env = deepq.wrap_atari_dqn(env) act = deepq.load("pong_model.pkl") diff --git a/baselines/deepq/experiments/run_atari.py b/baselines/deepq/experiments/run_atari.py index b6b427ba7a..04ffb18cfb 100644 --- a/baselines/deepq/experiments/run_atari.py +++ b/baselines/deepq/experiments/run_atari.py @@ -1,12 +1,16 @@ -from baselines import deepq -from baselines.common import set_global_seeds -from baselines import bench import argparse -from baselines import logger + +import tensorflow as tf + +from baselines import deepq, bench, logger +from baselines.common import set_global_seeds from baselines.common.atari_wrappers import make_atari def main(): + """ + run the atari test + """ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -29,23 +33,24 @@ def main(): dueling=bool(args.dueling), ) - deepq.learn( - env, - q_func=model, - lr=1e-4, - max_timesteps=args.num_timesteps, - buffer_size=10000, - exploration_fraction=0.1, - exploration_final_eps=0.01, - train_freq=4, - learning_starts=10000, - target_network_update_freq=1000, - gamma=0.99, - prioritized_replay=bool(args.prioritized), - prioritized_replay_alpha=args.prioritized_replay_alpha, - checkpoint_freq=args.checkpoint_freq, - checkpoint_path=args.checkpoint_path, - ) + with tf.Session(): + deepq.learn( + env, + q_func=model, + learning_rate=1e-4, + max_timesteps=args.num_timesteps, + buffer_size=10000, + exploration_fraction=0.1, + exploration_final_eps=0.01, + train_freq=4, + learning_starts=10000, + target_network_update_freq=1000, + gamma=0.99, + prioritized_replay=bool(args.prioritized), + prioritized_replay_alpha=args.prioritized_replay_alpha, + checkpoint_freq=args.checkpoint_freq, + checkpoint_path=args.checkpoint_path, + ) env.close() diff --git a/baselines/deepq/experiments/train_cartpole.py b/baselines/deepq/experiments/train_cartpole.py index a50c2428f9..bf68adeb36 100644 --- a/baselines/deepq/experiments/train_cartpole.py +++ b/baselines/deepq/experiments/train_cartpole.py @@ -1,22 +1,41 @@ +import argparse + import gym +import numpy as np from baselines import deepq def callback(lcl, _glb): + """ + the callback function for logging and saving + + :param lcl: (dict) the local variables + :param _glb: (dict) the global variables + :return: (bool) is solved + """ # stop training if reward exceeds 199 - is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 + if len(lcl['episode_rewards'][-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) + is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 return is_solved -def main(): +def main(args): + """ + train and save the DeepQ model, for the cartpole problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("CartPole-v0") model = deepq.models.mlp([64]) act = deepq.learn( env, q_func=model, - lr=1e-3, - max_timesteps=100000, + learning_rate=1e-3, + max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, @@ -28,4 +47,7 @@ def main(): if __name__ == '__main__': - main() + parser = 
argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/experiments/train_mountaincar.py b/baselines/deepq/experiments/train_mountaincar.py index 061967d760..32ba2645c0 100644 --- a/baselines/deepq/experiments/train_mountaincar.py +++ b/baselines/deepq/experiments/train_mountaincar.py @@ -1,17 +1,24 @@ +import argparse + import gym from baselines import deepq -def main(): +def main(args): + """ + train and save the DeepQ model, for the mountain car problem + + :param args: (ArgumentParser) the input arguments + """ env = gym.make("MountainCar-v0") # Enabling layer_norm here is import for parameter space noise! model = deepq.models.mlp([64], layer_norm=True) act = deepq.learn( env, q_func=model, - lr=1e-3, - max_timesteps=100000, + learning_rate=1e-3, + max_timesteps=args.max_timesteps, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.1, @@ -23,4 +30,7 @@ def main(): if __name__ == '__main__': - main() + parser = argparse.ArgumentParser(description="Train DQN on cartpole") + parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") + args = parser.parse_args() + main(args) diff --git a/baselines/deepq/models.py b/baselines/deepq/models.py index 198d795a06..686d989260 100644 --- a/baselines/deepq/models.py +++ b/baselines/deepq/models.py @@ -14,19 +14,17 @@ def _mlp(hiddens, inpt, num_actions, scope, reuse=False, layer_norm=False): return q_out -def mlp(hiddens=[], layer_norm=False): - """This model takes as input an observation and returns values of all actions. +def mlp(hiddens=None, layer_norm=False): + """ + This model takes as input an observation and returns values of all actions. - Parameters - ---------- - hiddens: [int] - list of sizes of hidden layers + :param hiddens: ([int]) list of sizes of hidden layers + :param layer_norm: (bool) if true, use layer normalization - Returns - ------- - q_func: function - q_function for DQN algorithm. + :return: (function) q_function for DQN algorithm. """ + if hiddens is None: + hiddens = [] return lambda *args, **kwargs: _mlp(hiddens, layer_norm=layer_norm, *args, **kwargs) @@ -70,21 +68,11 @@ def _cnn_to_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False, def cnn_to_mlp(convs, hiddens, dueling=False, layer_norm=False): """This model takes as input an observation and returns values of all actions. - Parameters - ---------- - convs: [(int, int int)] - list of convolutional layers in form of - (num_outputs, kernel_size, stride) - hiddens: [int] - list of sizes of hidden layers - dueling: bool - if true double the output MLP to compute a baseline - for action scores - - Returns - ------- - q_func: function - q_function for DQN algorithm. + :param convs: ([(int, int, int)]) list of convolutional layers in form of (num_outputs, kernel_size, stride) + :param hiddens: ([int]) list of sizes of hidden layers + :param dueling: (bool) if true double the output MLP to compute a baseline for action scores + :param layer_norm: (bool) if true, use layer normalization + :return: (function) q_function for DQN algorithm. 
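For context, a typical call to this helper, matching the dueling Atari configuration used by run_atari.py earlier in this patch (the dueling flag is hard-coded here purely for illustration):

```python
from baselines import deepq

# Dueling CNN-to-MLP Q-function with the standard Atari architecture.
model = deepq.models.cnn_to_mlp(
    convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # (num_outputs, kernel_size, stride)
    hiddens=[256],
    dueling=True,
)
```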
""" return lambda *args, **kwargs: _cnn_to_mlp(convs, hiddens, dueling, layer_norm=layer_norm, *args, **kwargs) diff --git a/baselines/deepq/replay_buffer.py b/baselines/deepq/replay_buffer.py index 7988113b0e..dcd79fbd2b 100644 --- a/baselines/deepq/replay_buffer.py +++ b/baselines/deepq/replay_buffer.py @@ -1,18 +1,17 @@ -import numpy as np import random +import numpy as np + from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree class ReplayBuffer(object): def __init__(self, size): - """Create Replay buffer. + """ + Create Replay buffer. - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old + memories are dropped. """ self._storage = [] self._maxsize = size @@ -22,6 +21,15 @@ def __len__(self): return len(self._storage) def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ data = (obs_t, action, reward, obs_tp1, done) if self._next_idx >= len(self._storage): @@ -42,27 +50,18 @@ def _encode_sample(self, idxes): dones.append(done) return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) - def sample(self, batch_size): - """Sample a batch of experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. + def sample(self, batch_size, **_kwargs): + """ + Sample a batch of experiences. + + :param batch_size: (int) How many transitions to sample. + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. """ idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] return self._encode_sample(idxes) @@ -70,20 +69,14 @@ def sample(self, batch_size): class PrioritizedReplayBuffer(ReplayBuffer): def __init__(self, size, alpha): - """Create Prioritized Replay buffer. - - Parameters - ---------- - size: int - Max number of transitions to store in the buffer. When the buffer - overflows the old memories are dropped. - alpha: float - how much prioritization is used - (0 - no prioritization, 1 - full prioritization) - - See Also - -------- - ReplayBuffer.__init__ + """ + Create Prioritized Replay buffer. + + See Also ReplayBuffer.__init__ + + :param size: (int) Max number of transitions to store in the buffer. When the buffer overflows the old memories + are dropped. 
+ :param alpha: (float) how much prioritization is used (0 - no prioritization, 1 - full prioritization) """ super(PrioritizedReplayBuffer, self).__init__(size) assert alpha >= 0 @@ -97,10 +90,18 @@ def __init__(self, size, alpha): self._it_min = MinSegmentTree(it_capacity) self._max_priority = 1.0 - def add(self, *args, **kwargs): - """See ReplayBuffer.store_effect""" + def add(self, obs_t, action, reward, obs_tp1, done): + """ + add a new transition to the buffer + + :param obs_t: (Any) the last observation + :param action: ([float]) the action + :param reward: (float) the reward of the transition + :param obs_tp1: (Any) the current observation + :param done: (bool) is the episode done + """ idx = self._next_idx - super().add(*args, **kwargs) + super().add(obs_t, action, reward, obs_tp1, done) self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha @@ -113,41 +114,26 @@ def _sample_proportional(self, batch_size): res.append(idx) return res - def sample(self, batch_size, beta): - """Sample a batch of experiences. + def sample(self, batch_size, beta=0): + """ + Sample a batch of experiences. compared to ReplayBuffer.sample it also returns importance weights and idxes of sampled experiences. - - Parameters - ---------- - batch_size: int - How many transitions to sample. - beta: float - To what degree to use importance weights - (0 - no corrections, 1 - full correction) - - Returns - ------- - obs_batch: np.array - batch of observations - act_batch: np.array - batch of actions executed given obs_batch - rew_batch: np.array - rewards received as results of executing act_batch - next_obs_batch: np.array - next set of observations seen after executing act_batch - done_mask: np.array - done_mask[i] = 1 if executing act_batch[i] resulted in - the end of an episode and 0 otherwise. - weights: np.array - Array of shape (batch_size,) and dtype np.float32 - denoting importance weight of each sampled transition - idxes: np.array - Array of shape (batch_size,) and dtype np.int32 - idexes in buffer of sampled experiences + :param batch_size: (int) How many transitions to sample. + :param beta: (float) To what degree to use importance weights (0 - no corrections, 1 - full correction) + :return: + - obs_batch: (numpy Any) batch of observations + - act_batch: (numpy float) batch of actions executed given obs_batch + - rew_batch: (numpy float) rewards received as results of executing act_batch + - next_obs_batch: (numpy Any) next set of observations seen after executing act_batch + - done_mask: (numpy bool) done_mask[i] = 1 if executing act_batch[i] resulted in the end of an episode + and 0 otherwise. + - weights: (numpy float) Array of shape (batch_size,) and dtype np.float32 denoting importance weight of + each sampled transition + - idxes: (numpy int) Array of shape (batch_size,) and dtype np.int32 idexes in buffer of sampled experiences """ assert beta > 0 @@ -166,19 +152,15 @@ def sample(self, batch_size, beta): return tuple(list(encoded_sample) + [weights, idxes]) def update_priorities(self, idxes, priorities): - """Update priorities of sampled transitions. + """ + Update priorities of sampled transitions. sets priority of transition at index idxes[i] in buffer to priorities[i]. - Parameters - ---------- - idxes: [int] - List of idxes of sampled transitions - priorities: [float] - List of updated priorities corresponding to - transitions at the sampled idxes denoted by - variable `idxes`. 
+ :param idxes: ([int]) List of idxes of sampled transitions + :param priorities: ([float]) List of updated priorities corresponding to transitions at the sampled idxes + denoted by variable `idxes`. """ assert len(idxes) == len(priorities) for idx, priority in zip(idxes, priorities): diff --git a/baselines/deepq/simple.py b/baselines/deepq/simple.py index 4bad145503..238662840d 100644 --- a/baselines/deepq/simple.py +++ b/baselines/deepq/simple.py @@ -6,182 +6,154 @@ import cloudpickle import numpy as np -import baselines.common.tf_util as U +from baselines import logger, deepq +from baselines.common import tf_util from baselines.common.tf_util import load_state, save_state -from baselines import logger from baselines.common.schedules import LinearSchedule -from baselines.common.input import observation_input - -from baselines import deepq from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer from baselines.deepq.utils import ObservationInput class ActWrapper(object): - def __init__(self, act, act_params): + def __init__(self, act, act_params, sess=None): + """ + the actor wrapper for loading and saving + + :param act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) the actor function + :param act_params: (dict) {'make_obs_ph', 'q_func', 'num_actions'} + :param sess: (TensorFlow Session) the current session + """ self._act = act self._act_params = act_params + if sess is None: + self.sess = tf_util.make_session() + else: + self.sess = sess @staticmethod def load(path): - with open(path, "rb") as f: - model_data, act_params = cloudpickle.load(f) + """ + Load from a path an actor model + + :param path: (str) the save location + :return: (ActWrapper) a loaded actor model + """ + with open(path, "rb") as file_handler: + model_data, act_params = cloudpickle.load(file_handler) act = deepq.build_act(**act_params) - sess = tf.Session() - sess.__enter__() - with tempfile.TemporaryDirectory() as td: - arc_path = os.path.join(td, "packed.zip") - with open(arc_path, "wb") as f: - f.write(model_data) + sess = tf_util.make_session() + with tempfile.TemporaryDirectory() as temp_dir: + arc_path = os.path.join(temp_dir, "packed.zip") + with open(arc_path, "wb") as file_handler: + file_handler.write(model_data) - zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td) - load_state(os.path.join(td, "model")) + zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(temp_dir) + load_state(os.path.join(temp_dir, "model"), sess) - return ActWrapper(act, act_params) + return ActWrapper(act, act_params, sess=sess) def __call__(self, *args, **kwargs): - return self._act(*args, **kwargs) + with self.sess.as_default(): + return self._act(*args, **kwargs) def save(self, path=None): - """Save model to a pickle located at `path`""" + """ + Save model to a pickle located at `path` + + :param path: (str) the save location + """ if path is None: path = os.path.join(logger.get_dir(), "model.pkl") - with tempfile.TemporaryDirectory() as td: - save_state(os.path.join(td, "model")) - arc_name = os.path.join(td, "packed.zip") + with tempfile.TemporaryDirectory() as temp_dir: + save_state(os.path.join(temp_dir, "model"), self.sess) + arc_name = os.path.join(temp_dir, "packed.zip") with zipfile.ZipFile(arc_name, 'w') as zipf: - for root, dirs, files in os.walk(td): + for root, _, files in os.walk(temp_dir): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: - zipf.write(file_path, os.path.relpath(file_path, td)) - with open(arc_name, 
"rb") as f: - model_data = f.read() - with open(path, "wb") as f: - cloudpickle.dump((model_data, self._act_params), f) + zipf.write(file_path, os.path.relpath(file_path, temp_dir)) + with open(arc_name, "rb") as file_handler: + model_data = file_handler.read() + with open(path, "wb") as file_handler: + cloudpickle.dump((model_data, self._act_params), file_handler) def load(path): - """Load act function that was returned by learn function. - - Parameters - ---------- - path: str - path to the act function pickle - - Returns - ------- - act: ActWrapper - function that takes a batch of observations - and returns actions. + """ + Load act function that was returned by learn function. + + :param path: (str) path to the act function pickle + + :return: (ActWrapper) function that takes a batch of observations and returns actions. """ return ActWrapper.load(path) -def learn(env, - q_func, - lr=5e-4, - max_timesteps=100000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - train_freq=1, - batch_size=32, - print_freq=100, - checkpoint_freq=10000, - checkpoint_path=None, - learning_starts=1000, - gamma=1.0, - target_network_update_freq=500, - prioritized_replay=False, - prioritized_replay_alpha=0.6, - prioritized_replay_beta0=0.4, - prioritized_replay_beta_iters=None, - prioritized_replay_eps=1e-6, - param_noise=False, - callback=None): - """Train a deepq model. - - Parameters - ------- - env: gym.Env - environment to train on - q_func: (tf.Variable, int, str, bool) -> tf.Variable +def learn(env, q_func, learning_rate=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, + exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, + checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, + prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, + prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): + """ + Train a deepq model. + + :param env: (Gym Environment) environment to train on + :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor) the model that takes the following inputs: - observation_in: object - the output of observation placeholder - num_actions: int - number of actions - scope: str - reuse: bool - should be passed to outer variable scope + - observation_in: (object) the output of observation placeholder + - num_actions: (int) number of actions + - scope: (str) + - reuse: (bool) should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. - lr: float - learning rate for adam optimizer - max_timesteps: int - number of env steps to optimizer for - buffer_size: int - size of the replay buffer - exploration_fraction: float - fraction of entire training period over which the exploration rate is annealed - exploration_final_eps: float - final value of random action probability - train_freq: int - update the model every `train_freq` steps. - set to None to disable printing - batch_size: int - size of a batched sampled from replay buffer for training - print_freq: int - how often to print out training progress - set to None to disable printing - checkpoint_freq: int - how often to save the model. This is so that the best version is restored - at the end of the training. If you do not wish to restore the best version at - the end of the training set this variable to None. 
- learning_starts: int - how many steps of the model to collect transitions for before learning starts - gamma: float - discount factor - target_network_update_freq: int - update the target network every `target_network_update_freq` steps. - prioritized_replay: True - if True prioritized replay buffer will be used. - prioritized_replay_alpha: float - alpha parameter for prioritized replay buffer - prioritized_replay_beta0: float - initial value of beta for prioritized replay buffer - prioritized_replay_beta_iters: int - number of iterations over which beta will be annealed from initial value + :param learning_rate: (float) learning rate for adam optimizer + :param max_timesteps: (int) number of env steps to optimizer for + :param buffer_size: (int) size of the replay buffer + :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is annealed + :param exploration_final_eps: (float) final value of random action probability + :param train_freq: (int) update the model every `train_freq` steps. set to None to disable printing + :param batch_size: (int) size of a batched sampled from replay buffer for training + :param print_freq: (int) how often to print out training progress set to None to disable printing + :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the end + of the training. If you do not wish to restore the best version at the end of the training set this variable + to None. + :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary directory. + :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts + :param gamma: (float) discount factor + :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps. + :param prioritized_replay: (bool) if True prioritized replay buffer will be used. + :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer + :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer + :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. - prioritized_replay_eps: float - epsilon to add to the TD errors when updating priorities. - callback: (locals, globals) -> None - function called at every steps with state of the algorithm. - If callback returns true training stops. - - Returns - ------- - act: ActWrapper - Wrapper over act function. Adds ability to save it and load it. - See header of baselines/deepq/categorical.py for details on the act function. + :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities. + :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy. + :param callback: (function (dict, dict)) function called at every steps with state of the algorithm. + If callback returns true training stops. It takes the local and global variables. + :return: (ActWrapper) Wrapper over act function. Adds ability to save it and load it. See header of + baselines/deepq/categorical.py for details on the act function. 
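An illustrative call with the renamed learning_rate keyword, followed by the save/load round trip that ActWrapper now supports without manually entering a session. Environment, network size and hyperparameters below are arbitrary, and deepq.load is assumed to re-export simple.load as in upstream baselines:

    import gym
    from baselines import deepq

    env = gym.make("CartPole-v0")
    act = deepq.learn(
        env,
        q_func=deepq.models.mlp([64]),
        learning_rate=5e-4,              # formerly lr
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        prioritized_replay=True,
        print_freq=10,
    )
    act.save("cartpole_model.pkl")

    act = deepq.load("cartpole_model.pkl")   # ActWrapper with its own session
    obs, done = env.reset(), False
    while not done:
        obs, _, done, _ = env.step(act(obs[None])[0])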
""" # Create all the functions necessary to train the model - sess = tf.Session() - sess.__enter__() - # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph + observation_space_shape = env.observation_space def make_obs_ph(name): - return ObservationInput(env.observation_space, name=name) + """ + makes the observation placeholder + + :param name: (str) the placeholder name + :return: (TensorFlow Tensor) the placeholder + """ + return ObservationInput(observation_space_shape, name=name) - act, train, update_target, debug = deepq.build_train( + act, train, update_target, _ = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, - optimizer=tf.train.AdamOptimizer(learning_rate=lr), + optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise @@ -212,32 +184,32 @@ def make_obs_ph(name): final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. - U.initialize() - update_target() + tf_util.initialize(act.sess) + update_target(sess=act.sess) episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True - with tempfile.TemporaryDirectory() as td: - td = checkpoint_path or td + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir = checkpoint_path or temp_dir - model_file = os.path.join(td, "model") + model_file = os.path.join(temp_dir, "model") model_saved = False - if tf.train.latest_checkpoint(td) is not None: - load_state(model_file) + if tf.train.latest_checkpoint(temp_dir) is not None: + load_state(model_file, act.sess) logger.log('Loaded model from {}'.format(model_file)) model_saved = True - for t in range(max_timesteps): + for step in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: - update_eps = exploration.value(t) + update_eps = exploration.value(step) update_param_noise_threshold = 0. else: update_eps = 0. @@ -245,7 +217,8 @@ def make_obs_ph(name): # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. - update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) + update_param_noise_threshold = -np.log(1. - exploration.value(step) + + exploration.value(step) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True @@ -263,44 +236,48 @@ def make_obs_ph(name): episode_rewards.append(0.0) reset = True - if t > learning_starts and t % train_freq == 0: + if step > learning_starts and step % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: - experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) + experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(step)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None - td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) + td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights, sess=act.sess) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) - if t > learning_starts and t % target_network_update_freq == 0: + if step > learning_starts and step % target_network_update_freq == 0: # Update target network periodically. - update_target() + update_target(sess=act.sess) + + if len(episode_rewards[-101:-1]) == 0: + mean_100ep_reward = -np.inf + else: + mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) - mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: - logger.record_tabular("steps", t) + logger.record_tabular("steps", step) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) - logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) + logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) logger.dump_tabular() - if (checkpoint_freq is not None and t > learning_starts and - num_episodes > 100 and t % checkpoint_freq == 0): + if (checkpoint_freq is not None and step > learning_starts and + num_episodes > 100 and step % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) - save_state(model_file) + save_state(model_file, act.sess) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) - load_state(model_file) + load_state(model_file, act.sess) return act diff --git a/baselines/deepq/test_identity.py b/baselines/deepq/test_identity.py index ef57e70b45..f0885d6e3a 100644 --- a/baselines/deepq/test_identity.py +++ b/baselines/deepq/test_identity.py @@ -1,42 +1,45 @@ -import tensorflow as tf import random +import tensorflow as tf + from baselines import deepq from baselines.common.identity_env import IdentityEnv def test_identity(): - - with tf.Graph().as_default(): - env = IdentityEnv(10) - random.seed(0) - - tf.set_random_seed(0) - - param_noise = False - model = deepq.models.mlp([32]) - act = deepq.learn( - env, - q_func=model, - lr=1e-3, - max_timesteps=10000, - buffer_size=50000, - exploration_fraction=0.1, - exploration_final_eps=0.02, - print_freq=10, - param_noise=param_noise, - ) - - tf.set_random_seed(0) - - N_TRIALS = 1000 - sum_rew = 0 - obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(act([obs])) - sum_rew += rew - - assert sum_rew > 0.9 * N_TRIALS + """ + test identity function for DeepQ + """ + env = IdentityEnv(10) + random.seed(0) + + tf.set_random_seed(0) + + param_noise = False + model = deepq.models.mlp([32]) + + act = deepq.learn( + env, + 
q_func=model, + learning_rate=1e-3, + max_timesteps=10000, + buffer_size=50000, + exploration_fraction=0.1, + exploration_final_eps=0.02, + print_freq=10, + param_noise=param_noise, + ) + + tf.set_random_seed(0) + + n_trials = 1000 + sum_rew = 0 + obs = env.reset() + for _ in range(n_trials): + obs, rew, _, _ = env.step(act([obs])) + sum_rew += rew + + assert sum_rew > 0.9 * n_trials if __name__ == '__main__': diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index 90b932e74a..c00b68801f 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -1,7 +1,7 @@ -from baselines.common.input import observation_input - import tensorflow as tf +from baselines.common.input import observation_input + # ================================================================ # Placeholders # ================================================================ @@ -9,26 +9,40 @@ class TfInput(object): def __init__(self, name="(unnamed)"): - """Generalized Tensorflow placeholder. The main differences are: + """ + Generalized Tensorflow placeholder. The main differences are: - possibly uses multiple placeholders internally and returns multiple values - can apply light postprocessing to the value feed to placeholder. + + :param name: (str) the input name """ self.name = name def get(self): - """Return the tf variable(s) representing the possibly postprocessed value + """ + Return the tf variable(s) representing the possibly postprocessed value of placeholder(s). + + :return: (TensorFlow Tensor) the placeholder + """ + raise NotImplementedError + + def make_feed_dict(self, data): """ - raise NotImplemented() + Given data input it to the placeholder(s). - def make_feed_dict(data): - """Given data input it to the placeholder(s).""" - raise NotImplemented() + :return: (dict) the given data input + """ + raise NotImplementedError class PlaceholderTfInput(TfInput): def __init__(self, placeholder): - """Wrapper for regular tensorflow placeholder.""" + """ + Wrapper for regular tensorflow placeholder. + + :param placeholder: (TensorFlow Tensor) + """ super().__init__(placeholder.name) self._placeholder = placeholder @@ -41,17 +55,14 @@ def make_feed_dict(self, data): class Uint8Input(PlaceholderTfInput): def __init__(self, shape, name=None): - """Takes input in uint8 format which is cast to float32 and divided by 255 + """ + Takes input in uint8 format which is cast to float32 and divided by 255 before passing it to the model. On GPU this ensures lower data transfer times. - Parameters - ---------- - shape: [int] - shape of the tensor. - name: str - name of the underlying placeholder + :param shape: ([int]) shape of the tensor. + :param name: (str) name of the underlying placeholder """ super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) @@ -64,20 +75,15 @@ def get(self): class ObservationInput(PlaceholderTfInput): def __init__(self, observation_space, name=None): - """Creates an input placeholder tailored to a specific observation space - - Parameters - ---------- - - observation_space: - observation space of the environment. Should be one of the gym.spaces types - name: str - tensorflow name of the underlying placeholder + """ + Creates an input placeholder tailored to a specific observation space + + :param observation_space: (Gym Space) observation space of the environment. 
Should be one of the gym.spaces + types + :param name: (str) tensorflow name of the underlying placeholder """ inpt, self.processed_inpt = observation_input(observation_space, name=name) super().__init__(inpt) def get(self): return self.processed_inpt - - diff --git a/baselines/gail/adversary.py b/baselines/gail/adversary.py index 18df69ccca..c52da864aa 100644 --- a/baselines/gail/adversary.py +++ b/baselines/gail/adversary.py @@ -1,28 +1,50 @@ -''' +""" Reference: https://github.com/openai/imitation I follow the architecture from the official repository -''' +""" import tensorflow as tf import numpy as np from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common import tf_util as U +from baselines.common import tf_util as tf_util + + +def logsigmoid(input_tensor): + """ + Equivalent to tf.log(tf.sigmoid(a)) + + :param input_tensor: (TensorFlow Tensor) + :return: (TensorFlow Tensor) + """ + return -tf.nn.softplus(-input_tensor) -def logsigmoid(a): - '''Equivalent to tf.log(tf.sigmoid(a))''' - return -tf.nn.softplus(-a) -""" Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51""" def logit_bernoulli_entropy(logits): - ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits) + """ + Reference: + https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 + + :param logits: (TensorFlow Tensor) the logits + :return: (TensorFlow Tensor) the bernoulli entropy + """ + ent = (1. - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits) return ent + class TransitionClassifier(object): - def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): + def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"): + """ + reward regression from observations and transitions + + :param env: (Gym Environment) + :param hidden_size: ([int]) the hidden dimension for the MLP + :param entcoeff: (float) the entropy loss weight + :param scope: (str) tensorflow variable scope + """ self.scope = scope self.observation_shape = env.observation_space.shape self.actions_shape = env.action_space.shape - self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)]) + self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)]) self.num_actions = env.action_space.shape[0] self.hidden_size = hidden_size self.build_ph() @@ -35,31 +57,48 @@ def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="advers # Build regression loss # let x = logits, z = targets. 
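A quick NumPy sanity check of the two helpers documented above, namely logsigmoid(x) = -softplus(-x) = log(sigmoid(x)) and logit_bernoulli_entropy(x) matching the entropy of a Bernoulli with parameter sigmoid(x). This is illustrative only and not part of the patch:

    import numpy as np

    x = np.array([-2.0, 0.0, 3.0])                  # arbitrary logits
    sig = 1.0 / (1.0 + np.exp(-x))
    log_sig = -np.logaddexp(0.0, -x)                # logsigmoid(x) = -softplus(-x)
    assert np.allclose(log_sig, np.log(sig))
    ent = (1.0 - sig) * x - log_sig                 # logit_bernoulli_entropy(x)
    assert np.allclose(ent, -(sig * np.log(sig) + (1.0 - sig) * np.log(1.0 - sig)))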
# z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) - generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits)) + generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, + labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) - entropy_loss = -entcoeff*entropy + entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc] self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy - self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8) + self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() - self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], - self.losses + [U.flatgrad(self.total_loss, var_list)]) + self.lossandgrad = tf_util.function( + [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], + self.losses + [tf_util.flatgrad(self.total_loss, var_list)]) def build_ph(self): - self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph") - self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph") - self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph") - self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph") + """ + build placeholder + """ + self.generator_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="observations_ph") + self.generator_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="actions_ph") + self.expert_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape, + name="expert_observations_ph") + self.expert_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape, + name="expert_actions_ph") def build_graph(self, obs_ph, acs_ph, reuse=False): + """ + build the graph + + :param obs_ph: (TensorFlow Tensor) the observation placeholder + :param acs_ph: (TensorFlow Tensor) the action placeholder + :param reuse: (bool) + :return: (TensorFlow Tensor) the graph output + """ with tf.variable_scope(self.scope): if reuse: tf.get_variable_scope().reuse_variables() @@ -74,14 +113,26 @@ def build_graph(self, obs_ph, acs_ph, reuse=False): return logits def get_trainable_variables(self): + """ + get all the trainable variables from the graph + + :return: ([TensorFlow Tensor]) the variables + """ return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_reward(self, obs, acs): + def get_reward(self, obs, actions): + """ + get the reward using the observation and action + + :param obs: (TensorFlow Tensor or numpy Number) the observation + :param actions: (TensorFlow Tensor or numpy Number) the action + :return: (numpy Number) the reward + """ sess = tf.get_default_session() if len(obs.shape) == 1: obs = 
np.expand_dims(obs, 0) - if len(acs.shape) == 1: - acs = np.expand_dims(acs, 0) - feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs} + if len(actions.shape) == 1: + actions = np.expand_dims(actions, 0) + feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: actions} reward = sess.run(self.reward_op, feed_dict) return reward diff --git a/baselines/gail/behavior_clone.py b/baselines/gail/behavior_clone.py index 82f65ecf19..daef9850d1 100644 --- a/baselines/gail/behavior_clone.py +++ b/baselines/gail/behavior_clone.py @@ -1,27 +1,30 @@ -''' +""" The code is used to train BC imitator, or pretrained GAIL imitator -''' - +""" +import os import argparse import tempfile -import os.path as osp -import gym import logging -from tqdm import tqdm +from tqdm import tqdm +import gym import tensorflow as tf from baselines.gail import mlp_policy -from baselines import bench -from baselines import logger -from baselines.common import set_global_seeds, tf_util as U +from baselines import logger, bench +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag from baselines.common.mpi_adam import MpiAdam from baselines.gail.run_mujoco import runner -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines.gail.dataset.mujocodset import MujocoDset def argsparser(): + """ + make a behavior cloning argument parser + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser("Tensorflow Implementation of Behavior Cloning") parser.add_argument('--env_id', help='environment ID', default='Hopper-v1') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -33,37 +36,50 @@ def argsparser(): # Network Configuration (Using MLP Policy) parser.add_argument('--policy_hidden_size', type=int, default=100) # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e5) return parser.parse_args() -def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, - adam_epsilon=1e-5, optim_stepsize=3e-4, - ckpt_dir=None, log_dir=None, task_name=None, - verbose=False): +def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, + ckpt_dir=None, task_name=None, verbose=False): + """ + Learn a behavior clone policy, and return the save location + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy + :param dataset: (Dset or MujocoDset) the dataset manager + :param optim_batch_size: (int) the batch size + :param max_iters: (int) the maximum number of iterations + :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param optim_stepsize: (float) the optimizer stepsize + :param ckpt_dir: (str) the save directory, can be None for temporary directory + :param task_name: (str) the save name, can be None for saving directly to the directory name + :param verbose: (bool) + :return: (str) the save location for the TensorFlow model + """ val_per_iter = int(max_iters/10) ob_space = 
env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy + policy = policy_func("pi", ob_space, ac_space) # Construct network for new policy # placeholder - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - stochastic = U.get_placeholder_cached(name="stochastic") - loss = tf.reduce_mean(tf.square(ac-pi.ac)) - var_list = pi.get_trainable_variables() + obs_ph = policy.obs_ph + action_ph = policy.pdtype.sample_placeholder([None]) + stochastic_ph = policy.stochastic_ph + loss = tf.reduce_mean(tf.square(action_ph - policy.ac)) + var_list = policy.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) - lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)]) + lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph], [loss] + [tf_util.flatgrad(loss, var_list)]) - U.initialize() + tf_util.initialize() adam.sync() logger.log("Pretraining with Behavior Cloning...") for iter_so_far in tqdm(range(int(max_iters))): ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') - train_loss, g = lossandgrad(ob_expert, ac_expert, True) - adam.update(g, optim_stepsize) + train_loss, grad = lossandgrad(ob_expert, ac_expert, True) + adam.update(grad, optim_stepsize) if verbose and iter_so_far % val_per_iter == 0: ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') val_loss, _ = lossandgrad(ob_expert, ac_expert, True) @@ -72,12 +88,18 @@ def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, if ckpt_dir is None: savedir_fname = tempfile.TemporaryDirectory().name else: - savedir_fname = osp.join(ckpt_dir, task_name) - U.save_state(savedir_fname, var_list=pi.get_variables()) + savedir_fname = os.path.join(ckpt_dir, task_name) + tf_util.save_state(savedir_fname, var_list=policy.get_variables()) return savedir_fname def get_task_name(args): + """ + Get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ task_name = 'BC' task_name += '.{}'.format(args.env_id.split("-")[0]) task_name += '.traj_limitation_{}'.format(args.traj_limitation) @@ -86,37 +108,36 @@ def get_task_name(args): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - savedir_fname = learn(env, - policy_fn, - dataset, - max_iters=args.BC_max_iter, - ckpt_dir=args.checkpoint_dir, - log_dir=args.log_dir, - task_name=task_name, - verbose=True) - avg_len, avg_ret = runner(env, - policy_fn, - savedir_fname, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample, - reuse=True) + """ + start training the model + + :param args: (ArgumentParser) the training argument + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + env = gym.make(args.env_id) + + def policy_fn(name, 
ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, + reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), "monitor.json")) + env.seed(args.seed) + gym.logger.setLevel(logging.WARN) + task_name = get_task_name(args) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) + args.log_dir = os.path.join(args.log_dir, task_name) + dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) + savedir_fname = learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, ckpt_dir=args.checkpoint_dir, + task_name=task_name, verbose=True) + runner(env, + policy_fn, + savedir_fname, + timesteps_per_batch=1024, + number_trajs=10, + stochastic_policy=args.stochastic_policy, + save=args.save_sample, + reuse=True) if __name__ == '__main__': diff --git a/baselines/gail/dataset/mujoco_dset.py b/baselines/gail/dataset/mujocodset.py similarity index 71% rename from baselines/gail/dataset/mujoco_dset.py rename to baselines/gail/dataset/mujocodset.py index 0693262270..2c1ac60fe5 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/baselines/gail/dataset/mujocodset.py @@ -1,16 +1,25 @@ -''' +""" Data structure of the input .npz: the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' the values of each item is a list storing the expert trajectory sequentially a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] -''' +""" -from baselines import logger import numpy as np +import matplotlib.pyplot as plt + +from baselines import logger class Dset(object): def __init__(self, inputs, labels, randomize): + """ + Dataset object + + :param inputs: (numpy Number) the input values + :param labels: (numpy Number) the target values + :param randomize: (bool) if the dataset should be shuffled + """ self.inputs = inputs self.labels = labels assert len(self.inputs) == len(self.labels) @@ -19,6 +28,9 @@ def __init__(self, inputs, labels, randomize): self.init_pointer() def init_pointer(self): + """ + initialize the pointer and shuffle the dataset, if randomize the dataset + """ self.pointer = 0 if self.randomize: idx = np.arange(self.num_pairs) @@ -27,6 +39,12 @@ def init_pointer(self): self.labels = self.labels[idx, :] def get_next_batch(self, batch_size): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :return: (numpy Number, numpy Number) inputs and labels + """ # if batch_size is negative -> return all if batch_size < 0: return self.inputs, self.labels @@ -39,8 +57,16 @@ def get_next_batch(self, batch_size): return inputs, labels -class Mujoco_Dset(object): +class MujocoDset(object): def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): + """ + Dataset for mujoco + + :param expert_path: (str) the path to trajectory data + :param train_fraction: (float) the train val split (0 to 1) + :param traj_limitation: (int) the dims to load (if -1, load all) + :param randomize: (bool) if the dataset should be shuffled + """ traj_data = np.load(expert_path) if traj_limitation < 0: traj_limitation = len(traj_data['obs']) @@ -73,12 +99,22 @@ def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomiz self.log_info() def log_info(self): + """ + log the information of the dataset + """ logger.log("Total trajectorues: %d" % 
self.num_traj) logger.log("Total transitions: %d" % self.num_transition) logger.log("Average returns: %f" % self.avg_ret) logger.log("Std for returns: %f" % self.std_ret) def get_next_batch(self, batch_size, split=None): + """ + get the batch from the dataset + + :param batch_size: (int) the size of the batch from the dataset + :param split: (str) the type of data split (can be None, 'train', 'val') + :return: (numpy Number, numpy Number) inputs and labels + """ if split is None: return self.dset.get_next_batch(batch_size) elif split == 'train': @@ -89,17 +125,27 @@ def get_next_batch(self, batch_size, split=None): raise NotImplementedError def plot(self): - import matplotlib.pyplot as plt + """ + show and save (to 'histogram_rets.png') a histogram plotting of the episode returns + """ plt.hist(self.rets) plt.savefig("histogram_rets.png") plt.close() def test(expert_path, traj_limitation, plot): - dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation) + """ + test mujoco dataset object + + :param expert_path: (str) the path to trajectory data + :param traj_limitation: (int) the dims to load (if -1, load all) + :param plot: (bool) enable plotting + """ + dset = MujocoDset(expert_path, traj_limitation=traj_limitation) if plot: dset.plot() + if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() diff --git a/baselines/gail/gail-eval.py b/baselines/gail/gail_eval.py similarity index 66% rename from baselines/gail/gail-eval.py rename to baselines/gail/gail_eval.py index 1148cb309c..78157659f4 100644 --- a/baselines/gail/gail-eval.py +++ b/baselines/gail/gail_eval.py @@ -1,22 +1,21 @@ -''' +""" This code is used to evalaute the imitators trained with different number of trajectories and plot the results in the same figure for easy comparison. 
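For reference, loading expert data through the renamed class and drawing a training batch looks like the sketch below; the .npz path follows the 'deterministic.trpo.<env>.0.00.npz' convention used by gail_eval, so adjust it to wherever your expert trajectories actually live:

    from baselines.gail.dataset.mujocodset import MujocoDset

    dset = MujocoDset(expert_path='data/deterministic.trpo.Hopper.0.00.npz', traj_limitation=10)
    obs_batch, act_batch = dset.get_next_batch(128, split='train')  # 70/30 train/val split by default
    dset.plot()                                                     # writes histogram_rets.png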
-''' +""" import argparse import os import glob -import gym +import gym import matplotlib.pyplot as plt import numpy as np import tensorflow as tf -from baselines.gail import run_mujoco -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U +from baselines.gail import run_mujoco, mlp_policy +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines.gail.dataset.mujocodset import MujocoDset plt.style.use('ggplot') @@ -26,30 +25,52 @@ def load_dataset(expert_path): - dataset = Mujoco_Dset(expert_path=expert_path) + """ + load mujoco dataset + + :param expert_path: (str) the path to trajectory data + :return: (MujocoDset) the dataset manager object + """ + dataset = MujocoDset(expert_path=expert_path) return dataset def argsparser(): + """ + make a argument parser for evaluation of gail + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser('Do evaluation') parser.add_argument('--seed', type=int, default=0) parser.add_argument('--policy_hidden_size', type=int, default=100) parser.add_argument('--env', type=str, choices=['Hopper', 'Walker2d', 'HalfCheetah', 'Humanoid', 'HumanoidStandup']) - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') return parser.parse_args() def evaluate_env(env_name, seed, policy_hidden_size, stochastic, reuse, prefix): - - def get_checkpoint_dir(checkpoint_list, limit, prefix): + """ + Evaluate an environment + + :param env_name: (str) the environment name + :param seed: (int) the initial random seed + :param policy_hidden_size: (int) the number of hidden neurons in the 4 layer MLP + :param stochastic: (bool) use a stochastic policy + :param reuse: (bool) allow reuse of the graph + :param prefix: (str) the checkpoint prefix for the type ('BC' or 'gail') + :return: (dict) the logging information of the evaluation + """ + + def _get_checkpoint_dir(checkpoint_list, limit, prefix): for checkpoint in checkpoint_list: if ('limitation_'+str(limit) in checkpoint) and (prefix in checkpoint): return checkpoint return None - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, + def _policy_fn(name, ob_space, ac_space, reuse=False, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, reuse=reuse, hid_size=policy_hidden_size, num_hid_layers=2) data_path = os.path.join('data', 'deterministic.trpo.' 
+ env_name + '.0.00.npz') @@ -65,13 +86,13 @@ def policy_fn(name, ob_space, ac_space, reuse=False): for i, limit in enumerate(CONFIG['traj_limitation']): # Do one evaluation upper_bound = sum(dataset.rets[:limit])/limit - checkpoint_dir = get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) + checkpoint_dir = _get_checkpoint_dir(checkpoint_list, limit, prefix=prefix) checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) env = gym.make(env_name + '-v1') env.seed(seed) print('Trajectory limitation: {}, Load checkpoint: {}, '.format(limit, checkpoint_path)) avg_len, avg_ret = run_mujoco.runner(env, - policy_fn, + _policy_fn, checkpoint_path, timesteps_per_batch=1024, number_trajs=10, @@ -90,6 +111,14 @@ def policy_fn(name, ob_space, ac_space, reuse=False): def plot(env_name, bc_log, gail_log, stochastic): + """ + plot and display all the evalutation results + + :param env_name: (str) the environment name + :param bc_log: (dict) the behavior_clone log + :param gail_log: (dict) the gail log + :param stochastic: (bool) use a stochastic policy + """ upper_bound = bc_log['upper_bound'] bc_avg_ret = bc_log['avg_ret'] gail_avg_ret = gail_log['avg_ret'] @@ -128,18 +157,23 @@ def plot(env_name, bc_log, gail_log, stochastic): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - print('Evaluating {}'.format(args.env)) - bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, False, 'BC') - print('Evaluation for {}'.format(args.env)) - print(bc_log) - gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, - args.stochastic_policy, True, 'gail') - print('Evaluation for {}'.format(args.env)) - print(gail_log) - plot(args.env, bc_log, gail_log, args.stochastic_policy) + """ + evaluate and plot Behavior clone and gail + + :param args: (ArgumentParser) the arguments for training and evaluating + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + print('Evaluating {}'.format(args.env)) + bc_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, + args.stochastic_policy, False, 'BC') + print('Evaluation for {}'.format(args.env)) + print(bc_log) + gail_log = evaluate_env(args.env, args.seed, args.policy_hidden_size, + args.stochastic_policy, True, 'gail') + print('Evaluation for {}'.format(args.env)) + print(gail_log) + plot(args.env, bc_log, gail_log, args.stochastic_policy) if __name__ == '__main__': diff --git a/baselines/gail/mlp_policy.py b/baselines/gail/mlp_policy.py index d8df120719..347045a9c9 100644 --- a/baselines/gail/mlp_policy.py +++ b/baselines/gail/mlp_policy.py @@ -1,21 +1,36 @@ -''' +""" from baselines/ppo1/mlp_policy.py and add simple modification (1) add reuse argument (2) cache the `stochastic` placeholder -''' -import tensorflow as tf +""" import gym +import tensorflow as tf -import baselines.common.tf_util as U -from baselines.common.mpi_running_mean_std import RunningMeanStd -from baselines.common.distributions import make_pdtype +import baselines.common.tf_util as tf_util from baselines.acktr.utils import dense +from baselines.common.mpi_running_mean_std import RunningMeanStd +from baselines.ppo1.mlp_policy import BasePolicy -class MlpPolicy(object): +class MlpPolicy(BasePolicy): recurrent = False - def __init__(self, name, reuse=False, *args, **kwargs): + def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): + """ + MLP policy for Gail + + :param name: (str) the variable scope name + :param ob_space: (Gym Space) The 
observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. + :param reuse: (bool) allow resue of the graph + :param placeholders: (dict) To feed existing placeholders if needed + :param gaussian_fixed_var: (bool) fix the gaussian variance + """ + super(MlpPolicy, self).__init__(placeholders=placeholders) + self.sess = sess with tf.variable_scope(name): if reuse: tf.get_variable_scope().reuse_variables() @@ -23,53 +38,39 @@ def __init__(self, name, reuse=False, *args, **kwargs): self.scope = tf.get_variable_scope().name def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) + obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) - self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] + last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), + weight_init=tf_util.normc_initializer(1.0))) + self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) + last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), + weight_init=tf_util.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) + mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01)) + logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], + initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: - pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) + pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(pdparam) + self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) self.state_in = [] self.state_out = [] # change for BC - stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self.ac = ac - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 
self.scope) - - def get_initial_state(self): - return [] + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") + action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) + self.action = action + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/gail/run_mujoco.py b/baselines/gail/run_mujoco.py index 379f7f8cb8..8378565553 100644 --- a/baselines/gail/run_mujoco.py +++ b/baselines/gail/run_mujoco.py @@ -1,26 +1,30 @@ -''' +""" Disclaimer: this code is highly based on trpo_mpi at @openai/baselines and @openai/imitation -''' +""" import argparse -import os.path as osp +import os import logging + from mpi4py import MPI from tqdm import tqdm - import numpy as np import gym -from baselines.gail import mlp_policy -from baselines.common import set_global_seeds, tf_util as U +from baselines.gail import mlp_policy, behavior_clone, trpo_mpi +from baselines.common import set_global_seeds, tf_util from baselines.common.misc_util import boolean_flag -from baselines import bench -from baselines import logger -from baselines.gail.dataset.mujoco_dset import Mujoco_Dset +from baselines import bench, logger +from baselines.gail.dataset.mujocodset import MujocoDset from baselines.gail.adversary import TransitionClassifier def argsparser(): + """ + get an argument parser for training mujoco on gail + + :return: (ArgumentParser) + """ parser = argparse.ArgumentParser("Tensorflow Implementation of GAIL") parser.add_argument('--env_id', help='environment ID', default='Hopper-v2') parser.add_argument('--seed', help='RNG seed', type=int, default=0) @@ -31,8 +35,8 @@ def argsparser(): # Task parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train') # for evaluatation - boolean_flag(parser, 'stochastic_policy', default=False, help='use stochastic/deterministic policy to evaluate') - boolean_flag(parser, 'save_sample', default=False, help='save the trajectories or not') + boolean_flag(parser, 'stochastic_policy', default=False, help_msg='use stochastic/deterministic policy to evaluate') + boolean_flag(parser, 'save_sample', default=False, help_msg='save the trajectories or not') # Mujoco Dataset Configuration parser.add_argument('--traj_limitation', type=int, default=-1) # Optimization Configuration @@ -50,12 +54,18 @@ def argsparser(): parser.add_argument('--save_per_iter', help='save model every xx iterations', type=int, default=100) parser.add_argument('--num_timesteps', help='number of timesteps per episode', type=int, default=5e6) # Behavior Cloning - boolean_flag(parser, 'pretrained', default=False, help='Use BC to pretrain') - parser.add_argument('--BC_max_iter', help='Max iteration for training BC', type=int, default=1e4) + boolean_flag(parser, 'pretrained', default=False, help_msg='Use BC to pretrain') + parser.add_argument('--bc_max_iter', help='Max iteration for training BC', type=int, default=1e4) return parser.parse_args() def get_task_name(args): + """ + get the task name + + :param args: (ArgumentParser) the training argument + :return: (str) the task name + """ task_name = args.algo + "_gail." if args.pretrained: task_name += "with_pretrained." 
@@ -69,68 +79,74 @@ def get_task_name(args): def main(args): - U.make_session(num_cpu=1).__enter__() - set_global_seeds(args.seed) - env = gym.make(args.env_id) - - def policy_fn(name, ob_space, ac_space, reuse=False): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2) - env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), "monitor.json")) - env.seed(args.seed) - gym.logger.setLevel(logging.WARN) - task_name = get_task_name(args) - args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name) - args.log_dir = osp.join(args.log_dir, task_name) - - if args.task == 'train': - dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) - reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) - train(env, - args.seed, - policy_fn, - reward_giver, - dataset, - args.algo, - args.g_step, - args.d_step, - args.policy_entcoeff, - args.num_timesteps, - args.save_per_iter, - args.checkpoint_dir, - args.log_dir, - args.pretrained, - args.BC_max_iter, - task_name - ) - elif args.task == 'evaluate': - runner(env, - policy_fn, - args.load_model_path, - timesteps_per_batch=1024, - number_trajs=10, - stochastic_policy=args.stochastic_policy, - save=args.save_sample - ) - else: - raise NotImplementedError - env.close() + """ + start training the model + :param args: (ArgumentParser) the training argument + """ + with tf_util.make_session(num_cpu=1): + set_global_seeds(args.seed) + env = gym.make(args.env_id) -def train(env, seed, policy_fn, reward_giver, dataset, algo, - g_step, d_step, policy_entcoeff, num_timesteps, save_per_iter, - checkpoint_dir, log_dir, pretrained, BC_max_iter, task_name=None): + def policy_fn(name, ob_space, ac_space, reuse=False, placeholders=None, sess=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, reuse=reuse, sess=sess, + hid_size=args.policy_hidden_size, num_hid_layers=2, placeholders=placeholders) + env = bench.Monitor(env, logger.get_dir() and + os.path.join(logger.get_dir(), "monitor.json")) + env.seed(args.seed) + gym.logger.setLevel(logging.WARN) + task_name = get_task_name(args) + args.checkpoint_dir = os.path.join(args.checkpoint_dir, task_name) + args.log_dir = os.path.join(args.log_dir, task_name) + + if args.task == 'train': + dataset = MujocoDset(expert_path=args.expert_path, traj_limitation=args.traj_limitation) + reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff) + train(env, args.seed, policy_fn, reward_giver, dataset, args.algo, args.g_step, args.d_step, + args.policy_entcoeff, args.num_timesteps, args.save_per_iter, args.checkpoint_dir, args.pretrained, + args.bc_max_iter, task_name) + elif args.task == 'evaluate': + runner(env, + policy_fn, + args.load_model_path, + timesteps_per_batch=1024, + number_trajs=10, + stochastic_policy=args.stochastic_policy, + save=args.save_sample + ) + else: + raise NotImplementedError + env.close() + + +def train(env, seed, policy_fn, reward_giver, dataset, algo, g_step, d_step, policy_entcoeff, num_timesteps, + save_per_iter, checkpoint_dir, pretrained, bc_max_iter, task_name=None): + """ + train gail on mujoco + + :param env: (Gym Environment) the environment + :param seed: (int) the initial random seed + :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param reward_giver: (TransitionClassifier) the 
reward predicter from obsevation and action + :param dataset: (MujocoDset) the dataset manager + :param algo: (str) the algorithm type (only 'trpo' is supported) + :param g_step: (int) number of steps to train policy in each epoch + :param d_step: (int) number of steps to train discriminator in each epoch + :param policy_entcoeff: (float) the weight of the entropy loss for the policy + :param num_timesteps: (int) the number of timesteps to run + :param save_per_iter: (int) the number of iterations before saving + :param checkpoint_dir: (str) the location for saving checkpoints + :param pretrained: (bool) use a pretrained behavior clone + :param bc_max_iter: (int) the maximum number of training iterations for the behavior clone + :param task_name: (str) the name of the task (can be None) + """ pretrained_weight = None - if pretrained and (BC_max_iter > 0): + if pretrained and (bc_max_iter > 0): # Pretrain with behavior cloning - from baselines.gail import behavior_clone - pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, - max_iters=BC_max_iter) + pretrained_weight = behavior_clone.learn(env, policy_fn, dataset, max_iters=bc_max_iter) if algo == 'trpo': - from baselines.gail import trpo_mpi # Set up for MPI seed rank = MPI.COMM_WORLD.Get_rank() if rank != 0: @@ -138,41 +154,47 @@ def train(env, seed, policy_fn, reward_giver, dataset, algo, workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env.seed(workerseed) - trpo_mpi.learn(env, policy_fn, reward_giver, dataset, rank, - pretrained=pretrained, pretrained_weight=pretrained_weight, - g_step=g_step, d_step=d_step, - entcoeff=policy_entcoeff, - max_timesteps=num_timesteps, - ckpt_dir=checkpoint_dir, log_dir=log_dir, - save_per_iter=save_per_iter, - timesteps_per_batch=1024, - max_kl=0.01, cg_iters=10, cg_damping=0.1, - gamma=0.995, lam=0.97, - vf_iters=5, vf_stepsize=1e-3, - task_name=task_name) + trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, gamma=0.995, lam=0.97, + entcoeff=policy_entcoeff, cg_damping=0.1, vf_stepsize=1e-3, vf_iters=5, + max_timesteps=num_timesteps, pretrained_weight=pretrained_weight, reward_giver=reward_giver, + expert_dataset=dataset, rank=rank, save_per_iter=save_per_iter, ckpt_dir=checkpoint_dir, + g_step=g_step, d_step=d_step, task_name=task_name) else: raise NotImplementedError def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, stochastic_policy, save=False, reuse=False): + """ + run the training for all the trajectories + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param load_model_path: (str) the path to the model + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param number_trajs: (int) the number of trajectories to run + :param stochastic_policy: (bool) use a stochastic policy + :param save: (bool) save the policy + :param reuse: (bool) allow reuse of the graph + :return: (float, float) average trajectory lenght, average trajectory reward + """ # Setup network # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=reuse) - U.initialize() + policy = policy_func("pi", ob_space, ac_space, reuse=reuse) + tf_util.initialize() # Prepare for rollouts # ---------------------------------------- - U.load_state(load_model_path) + tf_util.load_state(load_model_path) 
obs_list = [] acs_list = [] len_list = [] ret_list = [] for _ in tqdm(range(number_trajs)): - traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) + traj = traj_1_generator(policy, env, timesteps_per_batch, stochastic=stochastic_policy) obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret'] obs_list.append(obs) acs_list.append(acs) @@ -193,43 +215,51 @@ def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, return avg_len, avg_ret -# Sample one trajectory (until trajectory end) -def traj_1_generator(pi, env, horizon, stochastic): +def traj_1_generator(policy, env, horizon, stochastic): + """ + Sample one trajectory (until trajectory end) + + :param policy: (MLPPolicy) the policy + :param env: (Gym Environment) the environment + :param horizon: (int) the search horizon + :param stochastic: (bool) use a stochastic policy + :return: (dict) the trajectory + """ - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype + step = 0 + env.action_space.sample() # not used, just so we have the datatype new = True # marks if we're on first timestep of an episode - ob = env.reset() + observation = env.reset() cur_ep_ret = 0 # return in current episode cur_ep_len = 0 # len of current episode # Initialize history arrays - obs = [] - rews = [] + observations = [] + rewards = [] news = [] - acs = [] + actions = [] while True: - ac, vpred = pi.act(stochastic, ob) - obs.append(ob) + acttion, _ = policy.act(stochastic, observation) + observations.append(observation) news.append(new) - acs.append(ac) + actions.append(acttion) - ob, rew, new, _ = env.step(ac) - rews.append(rew) + observation, reward, new, _ = env.step(acttion) + rewards.append(reward) - cur_ep_ret += rew + cur_ep_ret += reward cur_ep_len += 1 - if new or t >= horizon: + if new or step >= horizon: break - t += 1 + step += 1 - obs = np.array(obs) - rews = np.array(rews) + observations = np.array(observations) + rewards = np.array(rewards) news = np.array(news) - acs = np.array(acs) - traj = {"ob": obs, "rew": rews, "new": news, "ac": acs, + actions = np.array(actions) + traj = {"ob": observations, "rew": rewards, "new": news, "ac": actions, "ep_ret": cur_ep_ret, "ep_len": cur_ep_len} return traj diff --git a/baselines/gail/statistics.py b/baselines/gail/statistics.py index 5f7c57e449..96c4f96263 100644 --- a/baselines/gail/statistics.py +++ b/baselines/gail/statistics.py @@ -1,16 +1,26 @@ -''' +""" This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py -''' +""" import tensorflow as tf import numpy as np -import baselines.common.tf_util as U +import baselines.common.tf_util as tf_util -class stats(): +class Stats: - def __init__(self, scalar_keys=[], histogram_keys=[]): + def __init__(self, scalar_keys=None, histogram_keys=None): + """ + initialize the placeholders from the input keys, for summary logging + + :param scalar_keys: ([str]) the name of all the scalar inputs + :param histogram_keys: ([str]) the name of all the histogram inputs + """ + if scalar_keys is None: + scalar_keys = [] + if histogram_keys is None: + histogram_keys = [] self.scalar_keys = scalar_keys self.histogram_keys = histogram_keys self.scalar_summaries = [] @@ -18,28 +28,34 @@ def __init__(self, scalar_keys=[], histogram_keys=[]): self.histogram_summaries_ph = [] self.histogram_summaries = [] with tf.variable_scope('summary'): - for k in scalar_keys: - ph = tf.placeholder('float32', None, 
name=k+'.scalar.summary') - sm = tf.summary.scalar(k+'.scalar.summary', ph) - self.scalar_summaries_ph.append(ph) - self.scalar_summaries.append(sm) - for k in histogram_keys: - ph = tf.placeholder('float32', None, name=k+'.histogram.summary') - sm = tf.summary.scalar(k+'.histogram.summary', ph) - self.histogram_summaries_ph.append(ph) - self.histogram_summaries.append(sm) - - self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) - - def add_all_summary(self, writer, values, iter): - # Note that the order of the incoming ```values``` should be the same as the that of the - # ```scalar_keys``` given in ```__init__``` - if np.sum(np.isnan(values)+0) != 0: + for key in scalar_keys: + place_holder = tf.placeholder('float32', None, name=key + '.scalar.summary') + string_summary = tf.summary.scalar(key + '.scalar.summary', place_holder) + self.scalar_summaries_ph.append(place_holder) + self.scalar_summaries.append(string_summary) + for key in histogram_keys: + place_holder = tf.placeholder('float32', None, name=key + '.histogram.summary') + string_summary = tf.summary.scalar(key + '.histogram.summary', place_holder) + self.histogram_summaries_ph.append(place_holder) + self.histogram_summaries.append(string_summary) + + self.summaries = tf.summary.merge(self.scalar_summaries + self.histogram_summaries) + + def add_all_summary(self, writer, values, _iter): + """ + Note that the order of the incoming ```values``` should be the same as the that of the + ```scalar_keys``` given in ```__init__``` + + :param writer: (TensorFlow FileWriter) the writer + :param values: (TensorFlow Tensor or numpy Number) the input for the summary run + :param _iter: (Number) the global step value + """ + if np.sum(np.isnan(values) + 0) != 0: return - sess = U.get_session() + sess = tf_util.get_session() keys = self.scalar_summaries_ph + self.histogram_summaries_ph feed_dict = {} - for k, v in zip(keys, values): - feed_dict.update({k: v}) + for key, value in zip(keys, values): + feed_dict.update({key: value}) summaries_str = sess.run(self.summaries, feed_dict) - writer.add_summary(summaries_str, iter) + writer.add_summary(summaries_str, _iter) diff --git a/baselines/gail/trpo_mpi.py b/baselines/gail/trpo_mpi.py index 615a4326a7..2446db00a6 100644 --- a/baselines/gail/trpo_mpi.py +++ b/baselines/gail/trpo_mpi.py @@ -1,142 +1,202 @@ -''' -Disclaimer: The trpo part highly rely on trpo_mpi at @openai/baselines -''' - import time import os from contextlib import contextmanager -from mpi4py import MPI from collections import deque +from mpi4py import MPI import tensorflow as tf import numpy as np -import baselines.common.tf_util as U -from baselines.common import explained_variance, zipsame, dataset, fmt_row +import baselines.common.tf_util as tf_util +from baselines.common import explained_variance, zipsame, dataset, fmt_row, colorize from baselines import logger -from baselines.common import colorize from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from baselines.gail.statistics import stats - - -def traj_segment_generator(pi, env, reward_giver, horizon, stochastic): +from baselines.common.cg import conjugate_gradient + + +# from baselines.gail.statistics import Stats + + +def traj_segment_generator(policy, env, horizon, stochastic, reward_giver=None, gail=False): + """ + Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + + :param policy: (MLPPolicy) the policy + :param env: (Gym Environment) the environment + :param horizon: (int) 
the number of timesteps to run per batch
+    :param stochastic: (bool) use a stochastic policy
+    :param reward_giver: (TransitionClassifier) the reward predictor from observation and action
+    :param gail: (bool) Whether we are using this generator for standard TRPO or with GAIL
+    :return: (dict) generator that returns a dict with the following keys:
+
+        - ob: (numpy Number) observations
+        - rew: (numpy float) rewards (if gail is used it is the predicted reward)
+        - vpred: (numpy float) value function predictions
+        - new: (numpy bool) dones (is end of episode)
+        - ac: (numpy Number) actions
+        - prevac: (numpy Number) previous actions
+        - nextvpred: (numpy float) the value prediction for the step after the segment
+        - ep_rets: ([float]) the cumulated rewards of the completed episodes
+        - ep_lens: ([int]) the lengths of the completed episodes
+        - ep_true_rets: ([float]) the true (environment) rewards of the completed episodes
+    """
+    # Check when using GAIL
+    assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL"
+
     # Initialize state variables
-    t = 0
-    ac = env.action_space.sample()
-    new = True
-    rew = 0.0
-    true_rew = 0.0
-    ob = env.reset()
-
-    cur_ep_ret = 0
-    cur_ep_len = 0
+    step = 0
+    action = env.action_space.sample()  # not used, just so we have the datatype
+    done = True
+    observation = env.reset()
+
+    cur_ep_ret = 0  # return in current episode
+    cur_ep_len = 0  # len of current episode
     cur_ep_true_ret = 0
     ep_true_rets = []
-    ep_rets = []
-    ep_lens = []
+    ep_rets = []  # returns of completed episodes in this segment
+    ep_lens = []  # Episode lengths

     # Initialize history arrays
-    obs = np.array([ob for _ in range(horizon)])
-    true_rews = np.zeros(horizon, 'float32')
-    rews = np.zeros(horizon, 'float32')
+    observations = np.array([observation for _ in range(horizon)])
+    true_rewards = np.zeros(horizon, 'float32')
+    rewards = np.zeros(horizon, 'float32')
     vpreds = np.zeros(horizon, 'float32')
-    news = np.zeros(horizon, 'int32')
-    acs = np.array([ac for _ in range(horizon)])
-    prevacs = acs.copy()
+    dones = np.zeros(horizon, 'int32')
+    actions = np.array([action for _ in range(horizon)])
+    prev_actions = actions.copy()

     while True:
-        prevac = ac
-        ac, vpred = pi.act(stochastic, ob)
+        prevac = action
+        action, vpred = policy.act(stochastic, observation)
         # Slight weirdness here because we need value function at time T
         # before returning segment [0, T-1] so we get the correct
         # terminal value
-        if t > 0 and t % horizon == 0:
-            yield {"ob": obs, "rew": rews, "vpred": vpreds, "new": news,
-                    "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new),
+        if step > 0 and step % horizon == 0:
+            yield {"ob": observations, "rew": rewards, "vpred": vpreds, "new": dones,
+                   "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - done),
                    "ep_rets": ep_rets, "ep_lens": ep_lens, "ep_true_rets": ep_true_rets}
-            _, vpred = pi.act(stochastic, ob)
+            _, vpred = policy.act(stochastic, observation)
            # Be careful!!!
if you change the downstream algorithm to aggregate # several of these batches, then be sure to do a deepcopy ep_rets = [] ep_true_rets = [] ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - rew = reward_giver.get_reward(ob, ac) - ob, true_rew, new, _ = env.step(ac) - rews[i] = rew - true_rews[i] = true_rew - - cur_ep_ret += rew - cur_ep_true_ret += true_rew + idx = step % horizon + observations[idx] = observation + vpreds[idx] = vpred + dones[idx] = done + actions[idx] = action + prev_actions[idx] = prevac + + if gail: + reward = reward_giver.get_reward(observation, action) + observation, true_reward, done, _ = env.step(action) + else: + observation, reward, done, _ = env.step(action) + true_reward = reward + rewards[idx] = reward + true_rewards[idx] = true_reward + + cur_ep_ret += reward + cur_ep_true_ret += true_reward cur_ep_len += 1 - if new: + if done: ep_rets.append(cur_ep_ret) ep_true_rets.append(cur_ep_true_ret) ep_lens.append(cur_ep_len) cur_ep_ret = 0 cur_ep_true_ret = 0 cur_ep_len = 0 - ob = env.reset() - t += 1 + observation = env.reset() + step += 1 def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 + """ + Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + + :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information) + :param gamma: (float) Discount factor + :param lam: (float) GAE factor + """ + # last element is only used for last vtarg, but we already zeroed it if last done = 1 + done = np.append(seg["new"], 0) vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') + time_horizon = len(seg["rew"]) + seg["adv"] = gae_lam = np.empty(time_horizon, 'float32') rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam + last_gae_lam = 0 + for step in reversed(range(time_horizon)): + non_terminal = 1 - done[step + 1] + delta = rew[step] + gamma * vpred[step + 1] * non_terminal - vpred[step] + gae_lam[step] = last_gae_lam = delta + gamma * lam * non_terminal * last_gae_lam seg["tdlamret"] = seg["adv"] + seg["vpred"] -def learn(env, policy_func, reward_giver, expert_dataset, rank, - pretrained, pretrained_weight, *, - g_step, d_step, entcoeff, save_per_iter, - ckpt_dir, log_dir, timesteps_per_batch, task_name, - gamma, lam, - max_kl, cg_iters, cg_damping=1e-2, - vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, - max_timesteps=0, max_episodes=0, max_iters=0, - callback=None - ): +def learn(env, policy_func, *, timesteps_per_batch, max_kl, cg_iters, gamma, lam, entcoeff=0.0, cg_damping=1e-2, + vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, callback=None, + # GAIL Params + pretrained_weight=None, reward_giver=None, expert_dataset=None, rank=0, save_per_iter=1, + ckpt_dir="/tmp/gail/ckpt/", g_step=1, d_step=1, task_name="task_name", d_stepsize=3e-4, using_gail=True): + """ + learns a GAIL policy using the given environment + + :param env: (Gym Environment) the environment + :param policy_func: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param max_kl: 
(float) the kullback leiber loss threashold + :param cg_iters: (int) the number of iterations for the conjugate gradient calculation + :param gamma: (float) the discount value + :param lam: (float) GAE factor + :param entcoeff: (float) the weight for the entropy loss + :param cg_damping: (float) the compute gradient dampening factor + :param vf_stepsize: (float) the value function stepsize + :param vf_iters: (int) the value function's number iterations for learning + :param max_timesteps: (int) the maximum number of timesteps before halting + :param max_episodes: (int) the maximum number of episodes before halting + :param max_iters: (int) the maximum number of training iterations before halting + :param callback: (function (dict, dict)) the call back function, takes the local and global attribute dictionary + :param pretrained_weight: (str) the save location for the pretrained weights + :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action + :param expert_dataset: (MujocoDset) the dataset manager + :param rank: (int) the rank of the mpi thread + :param save_per_iter: (int) the number of iterations before saving + :param ckpt_dir: (str) the location for saving checkpoints + :param g_step: (int) number of steps to train policy in each epoch + :param d_step: (int) number of steps to train discriminator in each epoch + :param task_name: (str) the name of the task (can be None) + :param d_stepsize: (float) the reward giver stepsize + :param using_gail: (bool) using the GAIL model + """ nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) + sess = tf_util.single_threaded_session() # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) - oldpi = policy_func("oldpi", ob_space, ac_space) + policy = policy_func("pi", ob_space, ac_space, sess=sess) + old_policy = policy_func("oldpi", ob_space, ac_space, sess=sess, + placeholders={"obs": policy.obs_ph, "stochastic": policy.stochastic_ph}) + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) + observation = policy.obs_ph + action = policy.pdtype.sample_placeholder([None]) - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() + kloldnew = old_policy.proba_distribution.kl(policy.proba_distribution) + ent = policy.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) + vferr = tf.reduce_mean(tf.square(policy.vpred - ret)) - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold + # advantage * pnew / pold + ratio = tf.exp(policy.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus @@ -145,83 +205,102 @@ def learn(env, policy_func, reward_giver, expert_dataset, rank, dist = meankl - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] - vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] - assert len(var_list) == len(vf_var_list) + 1 - d_adam = 
MpiAdam(reward_giver.get_trainable_variables()) - vfadam = MpiAdam(vf_var_list) + all_var_list = policy.get_trainable_variables() + if using_gail: + var_list = [v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")] + vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] + assert len(var_list) == len(vf_var_list) + 1 + d_adam = MpiAdam(reward_giver.get_trainable_variables()) + else: + var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] + vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] + + vfadam = MpiAdam(vf_var_list, sess=sess) + get_flat = tf_util.GetFlat(var_list, sess=sess) + set_from_flat = tf_util.SetFromFlat(var_list, sess=sess) - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) + var_size = tf_util.intprod(shape) + tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) + start += var_size + gvp = tf.add_n( + [tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 + fvp = tf_util.flatgrad(gvp, var_list) + + assign_old_eq_new = tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in + zipsame(old_policy.get_variables(), policy.get_variables())]) + compute_losses = tf_util.function([observation, action, atarg], losses) + compute_lossandgrad = tf_util.function([observation, action, atarg], + losses + [tf_util.flatgrad(optimgain, var_list)]) + compute_fvp = tf_util.function([flat_tangent, observation, action, atarg], fvp) + compute_vflossandgrad = tf_util.function([observation, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) - tstart = time.time() + start_time = time.time() yield - print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) + print(colorize("done in %.3f seconds" % (time.time() - start_time), color='magenta')) else: yield - def allmean(x): - assert isinstance(x, np.ndarray) - out = np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) + def allmean(arr): + assert isinstance(arr, np.ndarray) + out = np.empty_like(arr) + MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= nworkers return out - U.initialize() + tf_util.initialize(sess=sess) + th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) - d_adam.sync() + + if using_gail: + d_adam.sync() vfadam.sync() + if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- - seg_gen = 
traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) + if using_gail: + seg_gen = traj_segment_generator(policy, env, timesteps_per_batch, stochastic=True, + reward_giver=reward_giver, gail=True) + else: + seg_gen = traj_segment_generator(policy, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 - tstart = time.time() + t_start = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards - true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 - g_loss_stats = stats(loss_names) - d_loss_stats = stats(reward_giver.loss_name) - ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) - # if provide pretrained weight - if pretrained_weight is not None: - U.load_state(pretrained_weight, var_list=pi.get_variables()) + if using_gail: + true_rewbuffer = deque(maxlen=40) + # Stats not used for now + #  g_loss_stats = Stats(loss_names) + # d_loss_stats = Stats(reward_giver.loss_name) + # ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"]) + + # if provide pretrained weight + if pretrained_weight is not None: + tf_util.load_state(pretrained_weight, var_list=policy.get_variables()) while True: - if callback: callback(locals(), globals()) + if callback: + callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: @@ -230,60 +309,66 @@ def allmean(x): break # Save model - if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: + if using_gail and rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() - saver.save(tf.get_default_session(), fname) + saver.save(sess, fname) logger.log("********** Iteration %i ************" % iters_so_far) - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p + def fisher_vector_product(vec): + return allmean(compute_fvp(vec, *fvpargs, sess=sess)) + cg_damping * vec # ------------------ Update G ------------------ logger.log("Optimizing Policy...") + # g_step = 1 when not using GAIL for _ in range(g_step): with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] + observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy + if hasattr(policy, "ret_rms"): + policy.ret_rms.update(tdlamret) + if hasattr(policy, "ob_rms"): + policy.ob_rms.update(observation) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] - assign_old_eq_new() # set old parameter values to new parameter values + assign_old_eq_new(sess=sess) + with timed("computegrad"): - *lossbefore, g = compute_lossandgrad(*args) + *lossbefore, grad = compute_lossandgrad(*args, sess=sess) lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if np.allclose(g, 0): + grad = allmean(grad) + if 
np.allclose(grad, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) + stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) + shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) + # abs(shs) to avoid taking square root of negative values + lagrange_multiplier = np.sqrt(abs(shs) / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) + fullstep = stepdir / lagrange_multiplier + expectedimprove = grad.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) + mean_losses = surr, kl_loss, *_ = allmean(np.array(compute_losses(*args, sess=sess))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): + if not np.isfinite(mean_losses).all(): logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: + elif kl_loss > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") @@ -297,46 +382,55 @@ def fisher_vector_product(p): if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) + with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): - if hasattr(pi, "ob_rms"): - pi.ob_rms.update(mbob) # update running mean/std for policy - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - g_losses = meanlosses - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) + if hasattr(policy, "ob_rms"): + policy.ob_rms.update(mbob) # update running mean/std for policy + grad = allmean(compute_vflossandgrad(mbob, mbret, sess=sess)) + vfadam.update(grad, vf_stepsize) + + for (loss_name, loss_val) in zip(loss_names, mean_losses): + logger.record_tabular(loss_name, loss_val) + logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - # ------------------ Update D ------------------ - logger.log("Optimizing Discriminator...") - logger.log(fmt_row(13, reward_giver.loss_name)) - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) - batch_size = len(ob) // d_step - d_losses = [] # list of tuples, each of which gives the loss for a minibatch - for ob_batch, ac_batch in dataset.iterbatches((ob, ac), - include_final_partial_batch=False, - batch_size=batch_size): - ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) - # update running mean/std for reward_giver - if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) - *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) - d_adam.update(allmean(g), d_stepsize) - d_losses.append(newlosses) - logger.log(fmt_row(13, np.mean(d_losses, axis=0))) - - lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local 
values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) - true_rewbuffer.extend(true_rets) + + if using_gail: + # ------------------ Update D ------------------ + logger.log("Optimizing Discriminator...") + logger.log(fmt_row(13, reward_giver.loss_name)) + ob_expert, ac_expert = expert_dataset.get_next_batch(len(observation)) + batch_size = len(observation) // d_step + d_losses = [] # list of tuples, each of which gives the loss for a minibatch + for ob_batch, ac_batch in dataset.iterbatches((observation, action), + include_final_partial_batch=False, + batch_size=batch_size): + ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) + # update running mean/std for reward_giver + if hasattr(reward_giver, "obs_rms"): + reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) + *newlosses, grad = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) + d_adam.update(allmean(grad), d_stepsize) + d_losses.append(newlosses) + logger.log(fmt_row(13, np.mean(d_losses, axis=0))) + + lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) + true_rewbuffer.extend(true_rets) + else: + lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) + if using_gail: + logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) @@ -344,11 +438,17 @@ def fisher_vector_product(p): logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) + logger.record_tabular("TimeElapsed", time.time() - t_start) if rank == 0: logger.dump_tabular() def flatten_lists(listoflists): + """ + Flatten a python list of list + + :param listoflists: (list(list)) + :return: (list) + """ return [el for list_ in listoflists for el in list_] diff --git a/baselines/her/actor_critic.py b/baselines/her/actor_critic.py index d5443fe0c3..9632cf6f6c 100644 --- a/baselines/her/actor_critic.py +++ b/baselines/her/actor_critic.py @@ -1,44 +1,52 @@ import tensorflow as tf -from baselines.her.util import store_args, nn + +from baselines.her.util import mlp class ActorCritic: - @store_args - def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, - **kwargs): + def __init__(self, inputs_tf, dim_obs, dim_goal, dim_action, + max_u, o_stats, g_stats, hidden, layers, **kwargs): """The actor-critic network and related training code. 
-        Args:
-            inputs_tf (dict of tensors): all necessary inputs for the network: the
-                observation (o), the goal (g), and the action (u)
-            dimo (int): the dimension of the observations
-            dimg (int): the dimension of the goals
-            dimu (int): the dimension of the actions
-            max_u (float): the maximum magnitude of actions; action outputs will be scaled
-                accordingly
-            o_stats (baselines.her.Normalizer): normalizer for observations
-            g_stats (baselines.her.Normalizer): normalizer for goals
-            hidden (int): number of hidden units that should be used in hidden layers
-            layers (int): number of hidden layers
+        :param inputs_tf: ({str: TensorFlow Tensor}) all necessary inputs for the network: the
+            observation (o), the goal (g), and the action (u)
+        :param dim_obs: (int) the dimension of the observations
+        :param dim_goal: (int) the dimension of the goals
+        :param dim_action: (int) the dimension of the actions
+        :param max_u: (float) the maximum magnitude of actions; action outputs will be scaled accordingly
+        :param o_stats: (baselines.her.Normalizer) normalizer for observations
+        :param g_stats: (baselines.her.Normalizer) normalizer for goals
+        :param hidden: (int) number of hidden units that should be used in hidden layers
+        :param layers: (int) number of hidden layers
         """
+        self.inputs_tf = inputs_tf
+        self.dim_obs = dim_obs
+        self.dim_goal = dim_goal
+        self.dim_action = dim_action
+        self.max_u = max_u
+        self.o_stats = o_stats
+        self.g_stats = g_stats
+        self.hidden = hidden
+        self.layers = layers
+
         self.o_tf = inputs_tf['o']
         self.g_tf = inputs_tf['g']
         self.u_tf = inputs_tf['u']

         # Prepare inputs for actor and critic.
-        o = self.o_stats.normalize(self.o_tf)
-        g = self.g_stats.normalize(self.g_tf)
-        input_pi = tf.concat(axis=1, values=[o, g])  # for actor
+        obs = self.o_stats.normalize(self.o_tf)
+        goals = self.g_stats.normalize(self.g_tf)
+        input_pi = tf.concat(axis=1, values=[obs, goals])  # for actor

         # Networks.
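A note on the `mlp` helper used in the network definitions that follow: it replaces the old `nn` function from `baselines.her.util`. The sketch below shows the kind of network builder it is assumed to be (a stack of dense layers, ReLU on the hidden layers, linear output); `mlp_sketch` is illustrative only, not the actual implementation in this PR.

```python
import tensorflow as tf

def mlp_sketch(inputs, layer_sizes, reuse=None, name=""):
    # Stack of fully connected layers; hidden layers use ReLU, the last stays linear
    out = inputs
    for i, size in enumerate(layer_sizes):
        activation = tf.nn.relu if i < len(layer_sizes) - 1 else None
        out = tf.layers.dense(out, units=size, activation=activation,
                              reuse=reuse, name=name + "_fc{}".format(i))
    return out

# Usage mirroring the actor head defined just below:
# pi_tf = max_u * tf.tanh(mlp_sketch(input_pi, [hidden] * layers + [dim_action]))
```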
with tf.variable_scope('pi'): - self.pi_tf = self.max_u * tf.tanh(nn( + self.pi_tf = self.max_u * tf.tanh(mlp( input_pi, [self.hidden] * self.layers + [self.dimu])) with tf.variable_scope('Q'): # for policy training - input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) - self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) + input_q = tf.concat(axis=1, values=[obs, goals, self.pi_tf / self.max_u]) + self.q_pi_tf = mlp(input_q, [self.hidden] * self.layers + [1]) # for critic training - input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) - self._input_Q = input_Q # exposed for tests - self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) + input_q = tf.concat(axis=1, values=[obs, goals, self.u_tf / self.max_u]) + self._input_q = input_q # exposed for tests + self.q_tf = mlp(input_q, [self.hidden] * self.layers + [1], reuse=True) diff --git a/baselines/her/ddpg.py b/baselines/her/ddpg.py index 92165de958..6abc45ac8d 100644 --- a/baselines/her/ddpg.py +++ b/baselines/her/ddpg.py @@ -5,8 +5,7 @@ from tensorflow.contrib.staging import StagingArea from baselines import logger -from baselines.her.util import ( - import_function, store_args, flatten_grads, transitions_in_episode_batch) +from baselines.her.util import import_function, flatten_grads, transitions_in_episode_batch from baselines.her.normalizer import Normalizer from baselines.her.replay_buffer import ReplayBuffer from baselines.common.mpi_adam import MpiAdam @@ -17,49 +16,74 @@ def dims_to_shapes(input_dims): class DDPG(object): - @store_args def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, - Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, + q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, time_horizon, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, - sample_transitions, gamma, reuse=False, **kwargs): - """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). - - Args: - input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the - actions (u) - buffer_size (int): number of transitions that are stored in the replay buffer - hidden (int): number of units in the hidden layers - layers (int): number of hidden layers - network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') - polyak (float): coefficient for Polyak-averaging of the target network - batch_size (int): batch size for training - Q_lr (float): learning rate for the Q (critic) network - pi_lr (float): learning rate for the pi (actor) network - norm_eps (float): a small value used in the normalizer to avoid numerical instabilities - norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] - max_u (float): maximum action magnitude, i.e. 
actions are in [-max_u, max_u] - action_l2 (float): coefficient for L2 penalty on the actions - clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] - scope (str): the scope used for the TensorFlow graph - T (int): the time horizon for rollouts - rollout_batch_size (int): number of parallel rollouts per DDPG agent - subtract_goals (function): function that subtracts goals from each other - relative_goals (boolean): whether or not relative goals should be fed into the network - clip_pos_returns (boolean): whether or not positive returns should be clipped - clip_return (float): clip returns to be in [-clip_return, clip_return] - sample_transitions (function) function that samples from the replay buffer - gamma (float): gamma used for Q learning updates - reuse (boolean): whether or not the networks should be reused + sample_transitions, gamma, reuse=False): """ + Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). + + :param input_dims: ({str: int}) dimensions for the observation (o), the goal (g), and the actions (u) + :param buffer_size: (int) number of transitions that are stored in the replay buffer + :param hidden: (int) number of units in the hidden layers + :param layers: (int) number of hidden layers + :param network_class: (str) the network class that should be used (e.g. 'baselines.her.ActorCritic') + :param polyak: (float) coefficient for Polyak-averaging of the target network + :param batch_size: (int) batch size for training + :param q_lr: (float) learning rate for the Q (critic) network + :param pi_lr: (float) learning rate for the pi (actor) network + :param norm_eps: (float) a small value used in the normalizer to avoid numerical instabilities + :param norm_clip: (float) normalized inputs are clipped to be in [-norm_clip, norm_clip] + :param max_u: (float) maximum action magnitude, i.e. 
actions are in [-max_u, max_u] + :param action_l2: (float) coefficient for L2 penalty on the actions + :param clip_obs: (float) clip observations before normalization to be in [-clip_obs, clip_obs] + :param scope: (str) the scope used for the TensorFlow graph + :param time_horizon: (int) the time horizon for rollouts + :param rollout_batch_size: (int) number of parallel rollouts per DDPG agent + :param subtract_goals: (function (numpy Number, numpy Number): numpy Number) function that subtracts goals + from each other + :param relative_goals: (boolean) whether or not relative goals should be fed into the network + :param clip_pos_returns: (boolean) whether or not positive returns should be clipped + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :param sample_transitions: (function (dict, int): dict) function that samples from the replay buffer + :param gamma: (float) gamma used for Q learning updates + :param reuse: (boolean) whether or not the networks should be reused + """ + # Updated in experiments/config.py + self.input_dims = input_dims + self.buffer_size = buffer_size + self.hidden = hidden + self.layers = layers + self.network_class = network_class + self.polyak = polyak + self.batch_size = batch_size + self.q_lr = q_lr + self.pi_lr = pi_lr + self.norm_eps = norm_eps + self.norm_clip = norm_clip + self.max_u = max_u + self.action_l2 = action_l2 + self.clip_obs = clip_obs + self.scope = scope + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.subtract_goals = subtract_goals + self.relative_goals = relative_goals + self.clip_pos_returns = clip_pos_returns + self.clip_return = clip_return + self.sample_transitions = sample_transitions + self.gamma = gamma + self.reuse = reuse + if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) - self.dimo = self.input_dims['o'] - self.dimg = self.input_dims['g'] - self.dimu = self.input_dims['u'] + self.dim_obs = self.input_dims['o'] + self.dim_goal = self.input_dims['g'] + self.dim_action = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() @@ -84,54 +108,67 @@ def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polya self._create_network(reuse=reuse) # Configure the replay buffer. 
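Before the `buffer_shapes` assignment that follows, here is a toy illustration (hypothetical dimensions) of the episode layout HER stores: observations 'o' and achieved goals 'ag' keep `time_horizon + 1` entries per episode, while actions 'u' and goals 'g' keep `time_horizon`.

```python
time_horizon = 50                      # e.g. a 50-step robotics episode (hypothetical)
dim_obs, dim_goal, dim_action = 10, 3, 4

input_shapes = {'o': (dim_obs,), 'g': (dim_goal,), 'u': (dim_action,)}
# Same rule as the code below: one extra slot for the final observation
buffer_shapes = {key: (time_horizon if key != 'o' else time_horizon + 1, *shape)
                 for key, shape in input_shapes.items()}
buffer_shapes['g'] = (buffer_shapes['g'][0], dim_goal)
buffer_shapes['ag'] = (time_horizon + 1, dim_goal)

print(buffer_shapes)
# o: (51, 10), g: (50, 3), u: (50, 4), ag: (51, 3)
```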
- buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) + buffer_shapes = {key: (self.time_horizon if key != 'o' else self.time_horizon + 1, *input_shapes[key]) for key, val in input_shapes.items()} - buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) - buffer_shapes['ag'] = (self.T+1, self.dimg) + buffer_shapes['g'] = (buffer_shapes['g'][0], self.dim_goal) + buffer_shapes['ag'] = (self.time_horizon + 1, self.dim_goal) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size - self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) + self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.time_horizon, self.sample_transitions) - def _random_action(self, n): - return np.random.uniform(low=-self.max_u, high=self.max_u, size=(n, self.dimu)) + def _random_action(self, num): + return np.random.uniform(low=-self.max_u, high=self.max_u, size=(num, self.dim_action)) - def _preprocess_og(self, o, ag, g): + def _preprocess_obs_goal(self, obs, achieved_goal, goal): if self.relative_goals: - g_shape = g.shape - g = g.reshape(-1, self.dimg) - ag = ag.reshape(-1, self.dimg) - g = self.subtract_goals(g, ag) - g = g.reshape(*g_shape) - o = np.clip(o, -self.clip_obs, self.clip_obs) - g = np.clip(g, -self.clip_obs, self.clip_obs) - return o, g - - def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, - compute_Q=False): - o, g = self._preprocess_og(o, ag, g) + g_shape = goal.shape + goal = goal.reshape(-1, self.dim_goal) + achieved_goal = achieved_goal.reshape(-1, self.dim_goal) + goal = self.subtract_goals(goal, achieved_goal) + goal = goal.reshape(*g_shape) + obs = np.clip(obs, -self.clip_obs, self.clip_obs) + goal = np.clip(goal, -self.clip_obs, self.clip_obs) + return obs, goal + + def get_actions(self, obs, achieved_goal, goal, noise_eps=0., random_eps=0., use_target_net=False, compute_q=False): + """ + return the action from an observation and goal + + :param obs: (numpy Number) the observation + :param achieved_goal: (numpy Number) the achieved goal + :param goal: (numpy Number) the goal + :param noise_eps: (float) the noise epsilon + :param random_eps: (float) the random epsilon + :param use_target_net: (bool) whether or not to use the target network + :param compute_q: (bool) whether or not to compute Q value + :return: (numpy float or float) the actions + """ + obs, goal = self._preprocess_obs_goal(obs, achieved_goal, goal) policy = self.target if use_target_net else self.main # values to compute vals = [policy.pi_tf] - if compute_Q: - vals += [policy.Q_pi_tf] + if compute_q: + vals += [policy.q_pi_tf] # feed feed = { - policy.o_tf: o.reshape(-1, self.dimo), - policy.g_tf: g.reshape(-1, self.dimg), - policy.u_tf: np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32) + policy.o_tf: obs.reshape(-1, self.dim_obs), + policy.g_tf: goal.reshape(-1, self.dim_goal), + policy.u_tf: np.zeros((obs.size // self.dim_obs, self.dim_action), dtype=np.float32) } ret = self.sess.run(vals, feed_dict=feed) # action postprocessing - u = ret[0] - noise = noise_eps * self.max_u * np.random.randn(*u.shape) # gaussian noise - u += noise - u = np.clip(u, -self.max_u, self.max_u) - u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (self._random_action(u.shape[0]) - u) # eps-greedy - if u.shape[0] == 1: - u = u[0] - u = u.copy() - ret[0] = u + action = ret[0] + noise = noise_eps * self.max_u * np.random.randn(*action.shape) # gaussian noise + action += noise + action = 
np.clip(action, -self.max_u, self.max_u) + # eps-greedy + n_ac = action.shape[0] + action += np.random.binomial(1, random_eps, n_ac).reshape(-1, 1) * (self._random_action(n_ac) - action) + if action.shape[0] == 1: + action = action[0] + action = action.copy() + ret[0] = action if len(ret) == 1: return ret[0] @@ -140,8 +177,11 @@ def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=Fals def store_episode(self, episode_batch, update_stats=True): """ - episode_batch: array of batch_size x (T or T+1) x dim_key - 'o' is of size T+1, others are of size T + Story the episode transitions + + :param episode_batch: (numpy Number) array of batch_size x (T or T+1) x dim_key 'o' is of size T+1, + others are of size T + :param update_stats: (bool) whether to update stats or not """ self.buffer.store_episode(episode_batch) @@ -153,8 +193,8 @@ def store_episode(self, episode_batch, update_stats=True): num_normalizing_transitions = transitions_in_episode_batch(episode_batch) transitions = self.sample_transitions(episode_batch, num_normalizing_transitions) - o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) + obs, _, goal, achieved_goal = transitions['o'], transitions['o_2'], transitions['g'], transitions['ag'] + transitions['o'], transitions['g'] = self._preprocess_obs_goal(obs, achieved_goal, goal) # No need to preprocess the o_2 and g_2 since this is only used for stats self.o_stats.update(transitions['o']) @@ -164,56 +204,83 @@ def store_episode(self, episode_batch, update_stats=True): self.g_stats.recompute_stats() def get_current_buffer_size(self): + """ + returns the current buffer size + + :return: (int) buffer size + """ return self.buffer.get_current_size() def _sync_optimizers(self): - self.Q_adam.sync() + self.q_adam.sync() self.pi_adam.sync() def _grads(self): # Avoid feed_dict here for performance! 
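The action post-processing in `get_actions` above combines Gaussian exploration noise with occasional uniform resampling (epsilon-greedy). A standalone NumPy sketch of that scheme; the `noisy_actions` function is illustrative and not part of the class.

```python
import numpy as np

def noisy_actions(actions, max_u, noise_eps, random_eps):
    # Gaussian noise scaled by the action magnitude, then clip to the bounds
    actions = actions + noise_eps * max_u * np.random.randn(*actions.shape)
    actions = np.clip(actions, -max_u, max_u)
    # With probability random_eps, replace the whole action by a uniform random one
    n_ac, dim_action = actions.shape
    random_actions = np.random.uniform(-max_u, max_u, size=(n_ac, dim_action))
    flip = np.random.binomial(1, random_eps, n_ac).reshape(-1, 1)
    return actions + flip * (random_actions - actions)

# e.g. noisy_actions(np.zeros((5, 4)), max_u=1.0, noise_eps=0.2, random_eps=0.3)
```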
- critic_loss, actor_loss, Q_grad, pi_grad = self.sess.run([ - self.Q_loss_tf, - self.main.Q_pi_tf, - self.Q_grad_tf, + critic_loss, actor_loss, q_grad, pi_grad = self.sess.run([ + self.q_loss_tf, + self.main.q_pi_tf, + self.q_grad_tf, self.pi_grad_tf ]) - return critic_loss, actor_loss, Q_grad, pi_grad + return critic_loss, actor_loss, q_grad, pi_grad - def _update(self, Q_grad, pi_grad): - self.Q_adam.update(Q_grad, self.Q_lr) + def _update(self, q_grad, pi_grad): + self.q_adam.update(q_grad, self.q_lr) self.pi_adam.update(pi_grad, self.pi_lr) def sample_batch(self): + """ + sample a batch + + :return: (dict) the batch + """ transitions = self.buffer.sample(self.batch_size) - o, o_2, g = transitions['o'], transitions['o_2'], transitions['g'] - ag, ag_2 = transitions['ag'], transitions['ag_2'] - transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g) - transitions['o_2'], transitions['g_2'] = self._preprocess_og(o_2, ag_2, g) + obs, obs_2, goal = transitions['o'], transitions['o_2'], transitions['g'] + achieved_goal, achieved_goal_2 = transitions['ag'], transitions['ag_2'] + transitions['o'], transitions['g'] = self._preprocess_obs_goal(obs, achieved_goal, goal) + transitions['o_2'], transitions['g_2'] = self._preprocess_obs_goal(obs_2, achieved_goal_2, goal) transitions_batch = [transitions[key] for key in self.stage_shapes.keys()] return transitions_batch def stage_batch(self, batch=None): + """ + apply a batch to staging + + :param batch: (dict) the batch to add to staging, if None: self.sample_batch() + """ if batch is None: batch = self.sample_batch() assert len(self.buffer_ph_tf) == len(batch) self.sess.run(self.stage_op, feed_dict=dict(zip(self.buffer_ph_tf, batch))) def train(self, stage=True): + """ + train DDPG + + :param stage: (bool) enable staging + :return: (float, float) critic loss, actor loss + """ if stage: self.stage_batch() - critic_loss, actor_loss, Q_grad, pi_grad = self._grads() - self._update(Q_grad, pi_grad) + critic_loss, actor_loss, q_grad, pi_grad = self._grads() + self._update(q_grad, pi_grad) return critic_loss, actor_loss def _init_target_net(self): self.sess.run(self.init_target_net_op) def update_target_net(self): + """ + update the target network + """ self.sess.run(self.update_target_net_op) def clear_buffer(self): + """ + clears the replay buffer + """ self.buffer.clear_buffer() def _vars(self, scope): @@ -226,21 +293,21 @@ def _global_vars(self, scope): return res def _create_network(self, reuse=False): - logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) + logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dim_action, self.max_u)) self.sess = tf.get_default_session() if self.sess is None: self.sess = tf.InteractiveSession() # running averages - with tf.variable_scope('o_stats') as vs: + with tf.variable_scope('o_stats') as scope: if reuse: - vs.reuse_variables() - self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) - with tf.variable_scope('g_stats') as vs: + scope.reuse_variables() + self.o_stats = Normalizer(self.dim_obs, self.norm_eps, self.norm_clip, sess=self.sess) + with tf.variable_scope('g_stats') as scope: if reuse: - vs.reuse_variables() - self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) + scope.reuse_variables() + self.g_stats = Normalizer(self.dim_goal, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. 
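The target network built further down in `_create_network` is updated by Polyak averaging, `target <- polyak * target + (1 - polyak) * main`. A toy numeric illustration of how slowly the target tracks the main parameters with the default `polyak = 0.95` from `experiment/config.py`:

```python
import numpy as np

polyak = 0.95
main_param = np.array([1.0])    # pretend the main network has converged to 1.0
target_param = np.array([0.0])  # target starts elsewhere

for _ in range(60):
    target_param = polyak * target_param + (1 - polyak) * main_param

print(target_param)  # ~0.95 after 60 updates: the target lags the main network
```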
batch = self.staging_tf.get() @@ -249,40 +316,44 @@ def _create_network(self, reuse=False): batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks - with tf.variable_scope('main') as vs: + with tf.variable_scope('main') as scope: if reuse: - vs.reuse_variables() + scope.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) - vs.reuse_variables() - with tf.variable_scope('target') as vs: + scope.reuse_variables() + with tf.variable_scope('target') as scope: if reuse: - vs.reuse_variables() + scope.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) - vs.reuse_variables() + scope.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions - target_Q_pi_tf = self.target.Q_pi_tf + target_q_pi_tf = self.target.q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) - target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) - self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) - self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) + target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_q_pi_tf, *clip_range) + + self.q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.q_tf)) + self.pi_loss_tf = -tf.reduce_mean(self.main.q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) - Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) + + q_grads_tf = tf.gradients(self.q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) - assert len(self._vars('main/Q')) == len(Q_grads_tf) + + assert len(self._vars('main/Q')) == len(q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) - self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) + + self.q_grads_vars_tf = zip(q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) - self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) + self.q_grad_tf = flatten_grads(grads=q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers - self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) + self.q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging @@ -292,7 +363,8 @@ def _create_network(self, reuse=False): self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( - map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) + map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), + zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() @@ -300,6 +372,11 @@ def _create_network(self, reuse=False): self._init_target_net() def logs(self, prefix=''): + """ + create a log dictionary + :param prefix: (str) the prefix for evey index + :return: ({str: Any}) the log + """ logs = [] logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))] logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))] @@ -318,7 +395,7 @@ def __getstate__(self): 'main', 'target', 'lock', 'env', 'sample_transitions', 'stage_shapes', 'create_actor_critic'] - state = {k: v for k, v in self.__dict__.items() if all([not subname in k for subname in excluded_subnames])} + state = {k: v for k, v in self.__dict__.items() if all([subname not in k for subname in excluded_subnames])} state['buffer_size'] = self.buffer_size state['tf'] = self.sess.run([x for x in self._global_vars('') if 'buffer' not in x.name]) return state @@ -330,11 +407,11 @@ def __setstate__(self, state): self.__init__(**state) # set up stats (they are overwritten in __init__) - for k, v in state.items(): - if k[-6:] == '_stats': - self.__dict__[k] = v + for key, value in state.items(): + if key[-6:] == '_stats': + self.__dict__[key] = value # load TF variables - vars = [x for x in self._global_vars('') if 'buffer' not in x.name] - assert(len(vars) == len(state["tf"])) - node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])] + _vars = [x for x in self._global_vars('') if 'buffer' not in x.name] + assert len(_vars) == len(state["tf"]) + node = [tf.assign(var, val) for var, val in zip(_vars, state["tf"])] self.sess.run(node) diff --git a/baselines/her/experiment/config.py b/baselines/her/experiment/config.py index cf29ca52b8..f32d6c1670 100644 --- a/baselines/her/experiment/config.py +++ b/baselines/her/experiment/config.py @@ -20,7 +20,7 @@ 'layers': 3, # number of layers in the critic/actor networks 'hidden': 256, # number of neurons in each hidden layers 'network_class': 'baselines.her.actor_critic:ActorCritic', - 'Q_lr': 0.001, # critic learning rate + 'q_lr': 0.001, # critic learning rate 'pi_lr': 0.001, # actor learning rate 'buffer_size': int(1E6), # for experience replay 'polyak': 0.95, # polyak averaging coefficient @@ -55,6 +55,9 @@ def cached_make_env(make_env): Only creates a new environment from the provided function if one has not yet already been created. This is useful here because we need to infer certain properties of the env, e.g. its observation and action spaces, without any intend of actually using it. + + :param make_env: (function (): Gym Environment) creates the environment + :return: (Gym Environment) the created environment """ if make_env not in CACHED_ENVS: env = make_env() @@ -63,6 +66,12 @@ def cached_make_env(make_env): def prepare_params(kwargs): + """ + prepares DDPG params from kwargs + + :param kwargs: (dict) the input kwargs + :return: (dict) DDPG parameters + """ # DDPG params ddpg_params = dict() @@ -73,18 +82,18 @@ def make_env(): kwargs['make_env'] = make_env tmp_env = cached_make_env(kwargs['make_env']) assert hasattr(tmp_env, '_max_episode_steps') - kwargs['T'] = tmp_env._max_episode_steps + kwargs['time_horizon'] = tmp_env.spec.max_episode_steps # wrapped envs preserve their spec tmp_env.reset() kwargs['max_u'] = np.array(kwargs['max_u']) if isinstance(kwargs['max_u'], list) else kwargs['max_u'] - kwargs['gamma'] = 1. - 1. / kwargs['T'] + kwargs['gamma'] = 1. - 1. 
/ kwargs['time_horizon'] if 'lr' in kwargs: kwargs['pi_lr'] = kwargs['lr'] - kwargs['Q_lr'] = kwargs['lr'] + kwargs['q_lr'] = kwargs['lr'] del kwargs['lr'] for name in ['buffer_size', 'hidden', 'layers', 'network_class', 'polyak', - 'batch_size', 'Q_lr', 'pi_lr', + 'batch_size', 'q_lr', 'pi_lr', 'norm_eps', 'norm_clip', 'max_u', 'action_l2', 'clip_obs', 'scope', 'relative_goals']: ddpg_params[name] = kwargs[name] @@ -95,17 +104,29 @@ def make_env(): return kwargs -def log_params(params, logger=logger): +def log_params(params, logger_input=logger): + """ + log the parameters + + :param params: (dict) parameters to log + :param logger_input: (logger) the logger + """ for key in sorted(params.keys()): - logger.info('{}: {}'.format(key, params[key])) + logger_input.info('{}: {}'.format(key, params[key])) def configure_her(params): + """ + configure hindsight experience replay + + :param params: (dict) input parameters + :return: (function (dict, int): dict) returns a HER update function for replay buffer batch + """ env = cached_make_env(params['make_env']) env.reset() - def reward_fun(ag_2, g, info): # vectorized - return env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) + def reward_fun(achieved_goal, goal, info): # vectorized + return env.compute_reward(achieved_goal=achieved_goal, desired_goal=goal, info=info) # Prepare configuration for HER. her_params = { @@ -120,12 +141,29 @@ def reward_fun(ag_2, g, info): # vectorized return sample_her_transitions -def simple_goal_subtract(a, b): - assert a.shape == b.shape - return a - b +def simple_goal_subtract(vec_a, vec_b): + """ + checks if a and b have the same shape, and does a - b + + :param vec_a: (numpy array) + :param vec_b: (numpy array) + :return: (numpy array) a - b + """ + assert vec_a.shape == vec_b.shape + return vec_a - vec_b def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): + """ + configure a DDPG model from parameters + + :param dims: ({str: int}) the dimensions + :param params: (dict) the DDPG parameters + :param reuse: (bool) whether or not the networks should be reused + :param use_mpi: (bool) whether or not to use MPI + :param clip_return: (float) clip returns to be in [-clip_return, clip_return] + :return: (her.DDPG) the ddpg model + """ sample_her_transitions = configure_her(params) # Extract relevant parameters. gamma = params['gamma'] @@ -138,7 +176,7 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): env = cached_make_env(params['make_env']) env.reset() ddpg_params.update({'input_dims': input_dims, # agent takes an input observations - 'T': params['T'], + 'time_horizon': params['time_horizon'], 'clip_pos_returns': True, # clip positive returns 'clip_return': (1. / (1. 
- gamma)) if clip_return else np.inf, # max abs of return 'rollout_batch_size': rollout_batch_size, @@ -154,6 +192,12 @@ def configure_ddpg(dims, params, reuse=False, use_mpi=True, clip_return=True): def configure_dims(params): + """ + configure input and output dimensions + + :param params: (dict) the parameters + :return: ({str: int}) the dimensions + """ env = cached_make_env(params['make_env']) env.reset() obs, _, _, info = env.step(env.action_space.sample()) diff --git a/baselines/her/experiment/play.py b/baselines/her/experiment/play.py index 5b2f85d2ff..e31e57a093 100644 --- a/baselines/her/experiment/play.py +++ b/baselines/her/experiment/play.py @@ -1,7 +1,8 @@ import click -import numpy as np import pickle +import numpy as np + from baselines import logger from baselines.common import set_global_seeds import baselines.her.experiment.config as config @@ -14,11 +15,19 @@ @click.option('--n_test_rollouts', type=int, default=10) @click.option('--render', type=int, default=1) def main(policy_file, seed, n_test_rollouts, render): + """ + run HER from a saved policy + + :param policy_file: (str) pickle path to a saved policy + :param seed: (int) initial seed + :param n_test_rollouts: (int) the number of test rollouts + :param render: (bool) if rendering should be done + """ set_global_seeds(seed) # Load policy. - with open(policy_file, 'rb') as f: - policy = pickle.load(f) + with open(policy_file, 'rb') as file_handler: + policy = pickle.load(file_handler) env_name = policy.info['env_name'] # Prepare params. @@ -27,21 +36,21 @@ def main(policy_file, seed, n_test_rollouts, render): params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in params['env_name'] = env_name params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) dims = config.configure_dims(params) eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'compute_Q': True, + 'compute_q': True, 'rollout_batch_size': 1, 'render': bool(render), } - for name in ['T', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']: eval_params[name] = params[name] - + evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) evaluator.seed(seed) diff --git a/baselines/her/experiment/plot.py b/baselines/her/experiment/plot.py index 560903f82d..e9ee808a2e 100644 --- a/baselines/her/experiment/plot.py +++ b/baselines/her/experiment/plot.py @@ -1,26 +1,42 @@ import os +import json +import argparse + import matplotlib.pyplot as plt import numpy as np -import json -import seaborn as sns; sns.set() +import seaborn as sns import glob2 -import argparse +# Initialize seaborn +sns.set() def smooth_reward_curve(x, y): + """ + smooth the reward curve + + :param x: (numpy float) the x coord of the reward + :param y: (numpy float) the y coord of the reward + :return: (numpy float, numpy float) smoothed x, smoothed y + """ halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution k = halfwidth xsmoo = x ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), - mode='same') + mode='same') return xsmoo, ysmoo def load_results(file): + """ + load the results from a file + + :param file: (str) the saved results + :return: (dict) the result + """ if not os.path.exists(file): return None - with open(file, 'r') as f: - lines = [line for line in f] + with open(file, 'r') as file_handler: + lines 
= [line for line in file_handler] if len(lines) < 2: return None keys = [name.strip() for name in lines[0].split(',')] @@ -36,13 +52,20 @@ def load_results(file): def pad(xs, value=np.nan): + """ + + + :param xs: + :param value: + :return: + """ maxlen = np.max([len(x) for x in xs]) - + padded_xs = [] for x in xs: if x.shape[0] >= maxlen: padded_xs.append(x) - + padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value x_padded = np.concatenate([x, padding], axis=0) assert x_padded.shape[1:] == x.shape[1:] diff --git a/baselines/her/experiment/train.py b/baselines/her/experiment/train.py index aeaf1c5418..256e069091 100644 --- a/baselines/her/experiment/train.py +++ b/baselines/her/experiment/train.py @@ -1,5 +1,6 @@ import os import sys +from subprocess import CalledProcessError import click import numpy as np @@ -7,26 +8,43 @@ from mpi4py import MPI from baselines import logger -from baselines.common import set_global_seeds +from baselines.common import set_global_seeds, tf_util from baselines.common.mpi_moments import mpi_moments import baselines.her.experiment.config as config from baselines.her.rollout import RolloutWorker from baselines.her.util import mpi_fork -from subprocess import CalledProcessError - def mpi_average(value): - if value == []: + """ + calculate the average from the array, using MPI + + :param value: (numpy Number) the array + :return: (float) the average + """ + if len(value) == 0: value = [0.] if not isinstance(value, list): value = [value] return mpi_moments(np.array(value))[0] -def train(policy, rollout_worker, evaluator, - n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, - save_policies, **kwargs): +def train(policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval, + save_policies): + """ + train the given policy + + :param policy: (her.DDPG) the policy to train + :param rollout_worker: (RolloutWorker) Rollout worker generates experience for training. + :param evaluator: (RolloutWorker) Rollout worker for evalutation + :param n_epochs: (int) the number of epochs + :param n_test_rollouts: (int) the number of for the evalutation RolloutWorker + :param n_cycles: (int) the number of cycles for training per epoch + :param n_batches: (int) the batch size + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param save_policies: (bool) whether or not to save the policies + """ rank = MPI.COMM_WORLD.Get_rank() latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl') @@ -66,7 +84,8 @@ def train(policy, rollout_worker, evaluator, success_rate = mpi_average(evaluator.current_success_rate()) if rank == 0 and success_rate >= best_success_rate and save_policies: best_success_rate = success_rate - logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path)) + logger.info('New best success rate: {}. Saving policy to {} ...' 
+ .format(best_success_rate, best_policy_path)) evaluator.save_policy(best_policy_path) evaluator.save_policy(latest_policy_path) if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies: @@ -82,10 +101,26 @@ def train(policy, rollout_worker, evaluator, assert local_uniform[0] != root_uniform[0] -def launch( - env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, - override_params={}, save_policies=True -): +def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, + override_params=None, save_policies=True): + """ + launch training with mpi + + :param env: (str) environment ID + :param logdir: (str) the log directory + :param n_epochs: (int) the number of training epochs + :param num_cpu: (int) the number of CPUs to run on + :param seed: (int) the initial random seed + :param replay_strategy: (str) the type of replay strategy ('future' or 'none') + :param policy_save_interval: (int) the interval with which policy pickles are saved. + If set to 0, only the best and latest policy will be pickled. + :param clip_return: (float): clip returns to be in [-clip_return, clip_return] + :param override_params: (dict) override any parameter for training + :param save_policies: (bool) whether or not to save the policies + """ + + if override_params is None: + override_params = {} # Fork for multi-CPU MPI implementation. if num_cpu > 1: try: @@ -96,14 +131,13 @@ def launch( if whoami == 'parent': sys.exit(0) - import baselines.common.tf_util as U - U.single_threaded_session().__enter__() + tf_util.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # Configure logging if rank == 0: if logdir or logger.get_dir() is None: - logger.configure(dir=logdir) + logger.configure(folder=logdir) else: logger.configure() logdir = logger.get_dir() @@ -121,10 +155,10 @@ def launch( if env in config.DEFAULT_ENV_PARAMS: params.update(config.DEFAULT_ENV_PARAMS[env]) # merge env-specific parameters in params.update(**override_params) # makes it possible to override any parameter - with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: - json.dump(params, f) + with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler: + json.dump(params, file_handler) params = config.prepare_params(params) - config.log_params(params, logger=logger) + config.log_params(params, logger_input=logger) if num_cpu == 1: logger.warn() @@ -144,20 +178,20 @@ def launch( rollout_params = { 'exploit': False, 'use_target_net': False, - 'use_demo_states': True, - 'compute_Q': False, - 'T': params['T'], + # 'use_demo_states': True, + 'compute_q': False, + 'time_horizon': params['time_horizon'], } eval_params = { 'exploit': True, 'use_target_net': params['test_with_polyak'], - 'use_demo_states': False, - 'compute_Q': True, - 'T': params['T'], + # 'use_demo_states': False, + 'compute_q': True, + 'time_horizon': params['time_horizon'], } - for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: + for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']: rollout_params[name] = params[name] eval_params[name] = params[name] @@ -168,22 +202,33 @@ def launch( evaluator.seed(rank_seed) train( - logdir=logdir, policy=policy, rollout_worker=rollout_worker, + policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'], n_batches=params['n_batches'], 
policy_save_interval=policy_save_interval, save_policies=save_policies) @click.command() -@click.option('--env', type=str, default='FetchReach-v1', help='the name of the OpenAI Gym environment that you want to train on') -@click.option('--logdir', type=str, default=None, help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/') +@click.option('--env', type=str, default='FetchReach-v1', + help='the name of the OpenAI Gym environment that you want to train on') +@click.option('--logdir', type=str, default=None, + help='the path to where logs and policy pickles should go. If not specified, creates a folder in /tmp/') @click.option('--n_epochs', type=int, default=50, help='the number of training epochs to run') @click.option('--num_cpu', type=int, default=1, help='the number of CPU cores to use (using MPI)') -@click.option('--seed', type=int, default=0, help='the random seed used to seed both the environment and the training code') -@click.option('--policy_save_interval', type=int, default=5, help='the interval with which policy pickles are saved. If set to 0, only the best and latest policy will be pickled.') -@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') +@click.option('--seed', type=int, default=0, + help='the random seed used to seed both the environment and the training code') +@click.option('--policy_save_interval', type=int, default=5, + help='the interval with which policy pickles are saved. ' + 'If set to 0, only the best and latest policy will be pickled.') +@click.option('--replay_strategy', type=click.Choice(['future', 'none']), default='future', + help='the HER replay strategy to be used. "future" uses HER, "none" disables HER.') @click.option('--clip_return', type=int, default=1, help='whether or not returns should be clipped') def main(**kwargs): + """ + run launch for MPI HER DDPG training + + :param kwargs: (dict) the launch kwargs + """ launch(**kwargs) diff --git a/baselines/her/her.py b/baselines/her/her.py index 76f3c346ae..33c13b4eba 100644 --- a/baselines/her/her.py +++ b/baselines/her/her.py @@ -2,14 +2,14 @@ def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): - """Creates a sample function that can be used for HER experience replay. + """ + Creates a sample function that can be used for HER experience replay. - Args: - replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', - regular DDPG experience replay is used - replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times + :param replay_strategy: (str) the HER replay strategy; if set to 'none', regular DDPG experience replay is used + (can be 'future' or 'none'). + :param replay_k: (int) the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times as many HER replays as regular replays are used) - reward_fun (function): function to re-compute the reward with substituted goals + :param reward_fun: (function (dict, dict): float) function to re-compute the reward with substituted goals """ if replay_strategy == 'future': future_p = 1 - (1. 
/ (1 + replay_k)) @@ -19,20 +19,20 @@ def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): def _sample_her_transitions(episode_batch, batch_size_in_transitions): """episode_batch is {key: array(buffer_size x T x dim_key)} """ - T = episode_batch['u'].shape[1] + time_horizon = episode_batch['u'].shape[1] rollout_batch_size = episode_batch['u'].shape[0] batch_size = batch_size_in_transitions # Select which episodes and time steps to use. episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) - t_samples = np.random.randint(T, size=batch_size) + t_samples = np.random.randint(time_horizon, size=batch_size) transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() for key in episode_batch.keys()} # Select future time indexes proportional with probability future_p. These # will be used for HER replay by substituting in future goals. her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) - future_offset = np.random.uniform(size=batch_size) * (T - t_samples) + future_offset = np.random.uniform(size=batch_size) * (time_horizon - t_samples) future_offset = future_offset.astype(int) future_t = (t_samples + 1 + future_offset)[her_indexes] @@ -56,7 +56,7 @@ def _sample_her_transitions(episode_batch, batch_size_in_transitions): transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) for k in transitions.keys()} - assert(transitions['u'].shape[0] == batch_size_in_transitions) + assert transitions['u'].shape[0] == batch_size_in_transitions return transitions diff --git a/baselines/her/normalizer.py b/baselines/her/normalizer.py index d2b0588e8b..38e8aeed1b 100644 --- a/baselines/her/normalizer.py +++ b/baselines/her/normalizer.py @@ -9,15 +9,15 @@ class Normalizer: def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): - """A normalizer that ensures that observations are approximately distributed according to + """ + A normalizer that ensures that observations are approximately distributed according to a standard Normal distribution (i.e. have mean zero and variance one). 
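For illustration, a short NumPy sketch (not from the patch itself) of how `replay_k` sets the share of relabelled transitions in `_sample_her_transitions` above: with the 'future' strategy, `future_p = 1 - 1 / (1 + replay_k)`, so roughly `replay_k` relabelled (HER) transitions are drawn for every regular one.

import numpy as np

replay_k = 4                          # 4 HER replays per regular replay
future_p = 1 - (1. / (1 + replay_k))  # 0.8, as computed in the code above

batch_size = 10000
# selection rule matching her_indexes above (a uniform draw below future_p)
her_mask = np.random.uniform(size=batch_size) < future_p
print("relabelled fraction: %.2f" % her_mask.mean())                        # ~0.80
print("HER : regular ratio ~ %.1f : 1" % (her_mask.mean() / (1 - her_mask.mean())))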
- Args: - size (int): the size of the observation to be normalized - eps (float): a small constant that avoids underflows - default_clip_range (float): normalized observations are clipped to be in - [-default_clip_range, default_clip_range] - sess (object): the TensorFlow session to be used + :param size: (int) the size of the observation to be normalized + :param eps: (float) a small constant that avoids underflows + :param default_clip_range: (float) normalized observations are clipped to be in + [-default_clip_range, default_clip_range] + :param sess: (TensorFlow Session) the TensorFlow session to be used """ self.size = size self.eps = eps @@ -61,39 +61,69 @@ def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): ) self.lock = threading.Lock() - def update(self, v): - v = v.reshape(-1, self.size) + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ + arr = arr.reshape(-1, self.size) with self.lock: - self.local_sum += v.sum(axis=0) - self.local_sumsq += (np.square(v)).sum(axis=0) - self.local_count[0] += v.shape[0] + self.local_sum += arr.sum(axis=0) + self.local_sumsq += (np.square(arr)).sum(axis=0) + self.local_count[0] += arr.shape[0] - def normalize(self, v, clip_range=None): + def normalize(self, arr, clip_range=None): + """ + normalize the input + + :param arr: (numpy Number) the input + :param clip_range: (float) the range to clip to [-clip_range, clip_range] + :return: (numpy Number) normalized input + """ if clip_range is None: clip_range = self.default_clip_range - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return tf.clip_by_value((v - mean) / std, -clip_range, clip_range) - - def denormalize(self, v): - mean = reshape_for_broadcasting(self.mean, v) - std = reshape_for_broadcasting(self.std, v) - return mean + v * std - - def _mpi_average(self, x): - buf = np.zeros_like(x) - MPI.COMM_WORLD.Allreduce(x, buf, op=MPI.SUM) + mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return tf.clip_by_value((arr - mean) / std, -clip_range, clip_range) + + def denormalize(self, arr): + """ + denormalize the input + + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + mean = reshape_for_broadcasting(self.mean, arr) + std = reshape_for_broadcasting(self.std, arr) + return mean + arr * std + + @classmethod + def _mpi_average(cls, arr): + buf = np.zeros_like(arr) + MPI.COMM_WORLD.Allreduce(arr, buf, op=MPI.SUM) buf /= MPI.COMM_WORLD.Get_size() return buf - def synchronize(self, local_sum, local_sumsq, local_count, root=None): + def synchronize(self, local_sum, local_sumsq, local_count): + """ + syncronize over mpi threads + + :param local_sum: (numpy Number) the sum + :param local_sumsq: (numpy Number) the square root sum + :param local_count: (numpy Number) the number of values updated + :return: (numpy Number, numpy Number, numpy Number) the updated local_sum, local_sumsq, and local_count + """ local_sum[...] = self._mpi_average(local_sum) local_sumsq[...] = self._mpi_average(local_sumsq) local_count[...] = self._mpi_average(local_count) return local_sum, local_sumsq, local_count def recompute_stats(self): + """ + recompute the stats + """ with self.lock: # Copy over results. 
local_count = self.local_count.copy() @@ -120,21 +150,50 @@ def recompute_stats(self): class IdentityNormalizer: def __init__(self, size, std=1.): + """ + Normalizer that returns the input unchanged + + :param size: (int or [int]) the shape of the input to normalize + :param std: (float) the initial standard deviation or the normalization + """ self.size = size self.mean = tf.zeros(self.size, tf.float32) self.std = std * tf.ones(self.size, tf.float32) - def update(self, x): + def update(self, arr): + """ + update the parameters from the input + + :param arr: (numpy Number) the input + """ pass - def normalize(self, x, clip_range=None): - return x / self.std + def normalize(self, arr, **_kwargs): + """ + normalize the input + + :param arr: (numpy Number) the input + :return: (numpy Number) normalized input + """ + return arr / self.std + + def denormalize(self, arr): + """ + denormalize the input - def denormalize(self, x): - return self.std * x + :param arr: (numpy Number) the normalized input + :return: (numpy Number) original input + """ + return self.std * arr def synchronize(self): + """ + syncronize over mpi threads + """ pass def recompute_stats(self): + """ + recompute the stats + """ pass diff --git a/baselines/her/replay_buffer.py b/baselines/her/replay_buffer.py index b0005523fd..c46de90b0c 100644 --- a/baselines/her/replay_buffer.py +++ b/baselines/her/replay_buffer.py @@ -4,19 +4,18 @@ class ReplayBuffer: - def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions): - """Creates a replay buffer. - - Args: - buffer_shapes (dict of ints): the shape for all buffers that are used in the replay - buffer - size_in_transitions (int): the size of the buffer, measured in transitions - T (int): the time horizon for episodes - sample_transitions (function): a function that samples from the replay buffer + def __init__(self, buffer_shapes, size_in_transitions, time_horizon, sample_transitions): + """ + Creates a replay buffer. + + :param buffer_shapes: ({str: int}) the shape for all buffers that are used in the replay buffer + :param size_in_transitions: (int) the size of the buffer, measured in transitions + :param time_horizon: (int) the time horizon for episodes + :param sample_transitions: (function) a function that samples from the replay buffer """ self.buffer_shapes = buffer_shapes - self.size = size_in_transitions // T - self.T = T + self.size = size_in_transitions // time_horizon + self.time_horizon = time_horizon self.sample_transitions = sample_transitions # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} @@ -35,7 +34,11 @@ def full(self): return self.current_size == self.size def sample(self, batch_size): - """Returns a dict {key: array(batch_size x shapes[key])} + """ + sample random transitions + + :param batch_size: (int) How many transitions to sample. 
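A small worked example, with hypothetical sizes, of the episode-based capacity used by the `ReplayBuffer` constructor above: the buffer stores whole episodes, so it holds `size_in_transitions // time_horizon` of them.

size_in_transitions = 1000000   # total transitions the buffer may hold (hypothetical)
time_horizon = 50               # steps per episode (params['time_horizon'])
episode_capacity = size_in_transitions // time_horizon
assert episode_capacity == 20000
# storing a batch of 2 episodes advances n_transitions_stored by 2 * time_horizon
assert 2 * time_horizon == 100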
+ :return: (dict) {key: array(batch_size x shapes[key])} """ buffers = {} @@ -55,7 +58,10 @@ def sample(self, batch_size): return transitions def store_episode(self, episode_batch): - """episode_batch: array(batch_size x (T or T+1) x dim_key) + """ + Store an episode in the replay buffer + + :param episode_batch: (numpy Number) batch_size x (T or T+1) x dim_key """ batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] assert np.all(np.array(batch_sizes) == batch_sizes[0]) @@ -68,30 +74,48 @@ def store_episode(self, episode_batch): for key in self.buffers.keys(): self.buffers[key][idxs] = episode_batch[key] - self.n_transitions_stored += batch_size * self.T + self.n_transitions_stored += batch_size * self.time_horizon def get_current_episode_size(self): + """ + get current episode size + + :return: (int) the current size of the episode + """ with self.lock: return self.current_size def get_current_size(self): + """ + get current size of the buffer + + :return: (int) the current size of the buffer + """ with self.lock: - return self.current_size * self.T + return self.current_size * self.time_horizon def get_transitions_stored(self): + """ + get the number of stored transitions + + :return: (int) the number of transitions stored + """ with self.lock: return self.n_transitions_stored def clear_buffer(self): + """ + clear the buffer of all entries + """ with self.lock: self.current_size = 0 def _get_storage_idx(self, inc=None): - inc = inc or 1 # size increment + inc = inc or 1 # size increment assert inc <= self.size, "Batch committed to replay is too large!" # go consecutively until you hit the end, and then go randomly. - if self.current_size+inc <= self.size: - idx = np.arange(self.current_size, self.current_size+inc) + if self.current_size + inc <= self.size: + idx = np.arange(self.current_size, self.current_size + inc) elif self.current_size < self.size: overflow = inc - (self.size - self.current_size) idx_a = np.arange(self.current_size, self.size) @@ -101,7 +125,7 @@ def _get_storage_idx(self, inc=None): idx = np.random.randint(0, self.size, inc) # update replay size - self.current_size = min(self.size, self.current_size+inc) + self.current_size = min(self.size, self.current_size + inc) if inc == 1: idx = idx[0] diff --git a/baselines/her/rollout.py b/baselines/her/rollout.py index 5beba69dd7..e2368e4436 100644 --- a/baselines/her/rollout.py +++ b/baselines/her/rollout.py @@ -1,118 +1,138 @@ from collections import deque +import pickle import numpy as np -import pickle from mujoco_py import MujocoException -from baselines.her.util import convert_episode_to_batch_major, store_args +from baselines.her.util import convert_episode_to_batch_major class RolloutWorker: - - @store_args - def __init__(self, make_env, policy, dims, logger, T, rollout_batch_size=1, - exploit=False, use_target_net=False, compute_Q=False, noise_eps=0, - random_eps=0, history_len=100, render=False, **kwargs): - """Rollout worker generates experience by interacting with one or many environments. - - Args: - make_env (function): a factory function that creates a new instance of the environment - when called - policy (object): the policy that is used to act - dims (dict of ints): the dimensions for observations (o), goals (g), and actions (u) - logger (object): the logger that is used by the rollout worker - rollout_batch_size (int): the number of parallel rollouts that should be used - exploit (boolean): whether or not to exploit, i.e. 
to act optimally according to the - current policy without any exploration - use_target_net (boolean): whether or not to use the target net for rollouts - compute_Q (boolean): whether or not to compute the Q values alongside the actions - noise_eps (float): scale of the additive Gaussian noise - random_eps (float): probability of selecting a completely random action - history_len (int): length of history for statistics smoothing - render (boolean): whether or not to render the rollouts + def __init__(self, make_env, policy, dims, logger, time_horizon, rollout_batch_size=1, + exploit=False, use_target_net=False, compute_q=False, noise_eps=0, + random_eps=0, history_len=100, render=False): """ + Rollout worker generates experience by interacting with one or many environments. + + :param make_env: (function (): Gym Environment) a factory function that creates a new instance of the + environment when called + :param policy: (Object) the policy that is used to act + :param dims: ({str: int}) the dimensions for observations (o), goals (g), and actions (u) + :param logger: (Object) the logger that is used by the rollout worker + :param rollout_batch_size: (int) the number of parallel rollouts that should be used + :param exploit: (bool) whether or not to exploit, i.e. to act optimally according to the current policy without + any exploration + :param use_target_net: (bool) whether or not to use the target net for rollouts + :param compute_q: (bool) whether or not to compute the Q values alongside the actions + :param noise_eps: (float) scale of the additive Gaussian noise + :param random_eps: (float) probability of selecting a completely random action + :param history_len: (int) length of history for statistics smoothing + :param render: (boolean) whether or not to render the rollouts + """ + self.make_env = make_env + self.policy = policy + self.dims = dims + self.logger = logger + self.time_horizon = time_horizon + self.rollout_batch_size = rollout_batch_size + self.exploit = exploit + self.use_target_net = use_target_net + self.compute_q = compute_q + self.noise_eps = noise_eps + self.random_eps = random_eps + self.history_len = history_len + self.render = render + self.envs = [make_env() for _ in range(rollout_batch_size)] - assert self.T > 0 + assert self.time_horizon > 0 self.info_keys = [key.replace('info_', '') for key in dims.keys() if key.startswith('info_')] self.success_history = deque(maxlen=history_len) - self.Q_history = deque(maxlen=history_len) + self.q_history = deque(maxlen=history_len) self.n_episodes = 0 - self.g = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals - self.initial_o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + self.goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # goals + self.initial_obs = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations self.initial_ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals self.reset_all_rollouts() self.clear_history() - def reset_rollout(self, i): - """Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` - and `g` arrays accordingly. + def reset_rollout(self, index): """ - obs = self.envs[i].reset() - self.initial_o[i] = obs['observation'] - self.initial_ag[i] = obs['achieved_goal'] - self.g[i] = obs['desired_goal'] + Resets the `i`-th rollout environment, re-samples a new goal, and updates the `initial_o` and `g` arrays + accordingly. 
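A hedged construction sketch for the renamed worker API (`compute_q` instead of `compute_Q`, `time_horizon` instead of `T`); here `make_env`, `policy` and `dims` are placeholders for the objects that `experiment/train.py` builds via `config.configure_dims` and `config.configure_ddpg`.

from baselines import logger
from baselines.her.rollout import RolloutWorker

eval_params = {
    'exploit': True,            # act greedily, no exploration noise
    'use_target_net': True,
    'compute_q': True,          # previously compute_Q
    'time_horizon': 50,         # previously T
    'rollout_batch_size': 1,
}
# make_env, policy and dims are assumed to exist (see experiment/train.py)
evaluator = RolloutWorker(make_env, policy, dims, logger, **eval_params)
evaluator.seed(0)
episode = evaluator.generate_rollouts()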
+ + :param index: (int) the index to reset + """ + obs = self.envs[index].reset() + self.initial_obs[index] = obs['observation'] + self.initial_ag[index] = obs['achieved_goal'] + self.goals[index] = obs['desired_goal'] def reset_all_rollouts(self): - """Resets all `rollout_batch_size` rollout workers. """ - for i in range(self.rollout_batch_size): - self.reset_rollout(i) + Resets all `rollout_batch_size` rollout workers. + """ + for step in range(self.rollout_batch_size): + self.reset_rollout(step) def generate_rollouts(self): - """Performs `rollout_batch_size` rollouts in parallel for time horizon `T` with the current + """ + Performs `rollout_batch_size` rollouts in parallel for time horizon with the current policy acting on it accordingly. + + :return: (dict) batch """ self.reset_all_rollouts() # compute observations - o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations - ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals - o[:] = self.initial_o - ag[:] = self.initial_ag + observations = np.empty((self.rollout_batch_size, self.dims['o']), np.float32) # observations + achieved_goals = np.empty((self.rollout_batch_size, self.dims['g']), np.float32) # achieved goals + observations[:] = self.initial_obs + achieved_goals[:] = self.initial_ag # generate episodes obs, achieved_goals, acts, goals, successes = [], [], [], [], [] - info_values = [np.empty((self.T, self.rollout_batch_size, self.dims['info_' + key]), np.float32) for key in self.info_keys] - Qs = [] - for t in range(self.T): + info_values = [np.empty((self.time_horizon, self.rollout_batch_size, self.dims['info_' + key]), np.float32) + for key in self.info_keys] + q_values = [] + for step in range(self.time_horizon): policy_output = self.policy.get_actions( - o, ag, self.g, - compute_Q=self.compute_Q, + observations, achieved_goals, self.goals, + compute_q=self.compute_q, noise_eps=self.noise_eps if not self.exploit else 0., random_eps=self.random_eps if not self.exploit else 0., use_target_net=self.use_target_net) - if self.compute_Q: - u, Q = policy_output - Qs.append(Q) + if self.compute_q: + action, q_value = policy_output + q_values.append(q_value) else: - u = policy_output + action = policy_output - if u.ndim == 1: + if action.ndim == 1: # The non-batched case should still have a reasonable shape. - u = u.reshape(1, -1) + action = action.reshape(1, -1) o_new = np.empty((self.rollout_batch_size, self.dims['o'])) ag_new = np.empty((self.rollout_batch_size, self.dims['g'])) success = np.zeros(self.rollout_batch_size) # compute new states and observations - for i in range(self.rollout_batch_size): + for batch_idx in range(self.rollout_batch_size): try: # We fully ignore the reward here because it will have to be re-computed # for HER. 
- curr_o_new, _, _, info = self.envs[i].step(u[i]) + curr_o_new, _, _, info = self.envs[batch_idx].step(action[batch_idx]) if 'is_success' in info: - success[i] = info['is_success'] - o_new[i] = curr_o_new['observation'] - ag_new[i] = curr_o_new['achieved_goal'] + success[batch_idx] = info['is_success'] + o_new[batch_idx] = curr_o_new['observation'] + ag_new[batch_idx] = curr_o_new['achieved_goal'] for idx, key in enumerate(self.info_keys): - info_values[idx][t, i] = info[key] + info_values[idx][step, batch_idx] = info[key] if self.render: - self.envs[i].render() - except MujocoException as e: + self.envs[batch_idx].render() + except MujocoException: return self.generate_rollouts() if np.isnan(o_new).any(): @@ -120,16 +140,16 @@ def generate_rollouts(self): self.reset_all_rollouts() return self.generate_rollouts() - obs.append(o.copy()) - achieved_goals.append(ag.copy()) + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) successes.append(success.copy()) - acts.append(u.copy()) - goals.append(self.g.copy()) - o[...] = o_new - ag[...] = ag_new - obs.append(o.copy()) - achieved_goals.append(ag.copy()) - self.initial_o[:] = o + acts.append(action.copy()) + goals.append(self.goals.copy()) + observations[...] = o_new + achieved_goals[...] = ag_new + obs.append(observations.copy()) + achieved_goals.append(achieved_goals.copy()) + self.initial_obs[:] = observations episode = dict(o=obs, u=acts, @@ -143,37 +163,54 @@ def generate_rollouts(self): assert successful.shape == (self.rollout_batch_size,) success_rate = np.mean(successful) self.success_history.append(success_rate) - if self.compute_Q: - self.Q_history.append(np.mean(Qs)) + + if self.compute_q: + self.q_history.append(np.mean(q_values)) self.n_episodes += self.rollout_batch_size return convert_episode_to_batch_major(episode) def clear_history(self): - """Clears all histories that are used for statistics + """ + Clears all histories that are used for statistics """ self.success_history.clear() - self.Q_history.clear() + self.q_history.clear() def current_success_rate(self): + """ + returns the current success rate + :return: (float) the success rate + """ return np.mean(self.success_history) - def current_mean_Q(self): - return np.mean(self.Q_history) + def current_mean_q(self): + """ + returns the current mean Q value + :return: (float) the mean Q value + """ + return np.mean(self.q_history) def save_policy(self, path): - """Pickles the current policy for later inspection. """ - with open(path, 'wb') as f: - pickle.dump(self.policy, f) + Pickles the current policy for later inspection. + + :param path: (str) the save location + """ + with open(path, 'wb') as file_handler: + pickle.dump(self.policy, file_handler) def logs(self, prefix='worker'): - """Generates a dictionary that contains all collected statistics. + """ + Generates a dictionary that contains all collected statistics. + + :param prefix: (str) the prefix for the name in logging + :return: ([(str, float)]) the logging information """ logs = [] logs += [('success_rate', np.mean(self.success_history))] - if self.compute_Q: - logs += [('mean_Q', np.mean(self.Q_history))] + if self.compute_q: + logs += [('mean_q', np.mean(self.q_history))] logs += [('episode', self.n_episodes)] if prefix is not '' and not prefix.endswith('/'): @@ -182,7 +219,10 @@ def logs(self, prefix='worker'): return logs def seed(self, seed): - """Seeds each environment with a distinct seed derived from the passed in global seed. 
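A sketch of the pickle round trip behind `save_policy` above and the loading done in `experiment/play.py`; `evaluator` is assumed to be an existing `RolloutWorker` and the path is hypothetical.

import pickle

policy_path = '/tmp/policy_best.pkl'   # hypothetical location
evaluator.save_policy(policy_path)     # pickles evaluator.policy

with open(policy_path, 'rb') as file_handler:
    restored_policy = pickle.load(file_handler)
# experiment/play.py then reads restored_policy.info['env_name'] to rebuild the env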
+ """ + Seeds each environment with a distinct seed derived from the passed in global seed. + + :param seed: (int) the random seed """ for idx, env in enumerate(self.envs): env.seed(seed + 1000 * idx) diff --git a/baselines/her/util.py b/baselines/her/util.py index d637aa69f1..784329b03c 100644 --- a/baselines/her/util.py +++ b/baselines/her/util.py @@ -2,79 +2,69 @@ import subprocess import sys import importlib -import inspect -import functools import tensorflow as tf import numpy as np +from mpi4py import MPI -from baselines.common import tf_util as U +from baselines.common import tf_util -def store_args(method): - """Stores provided method args as instance attributes. +def import_function(spec): """ - argspec = inspect.getfullargspec(method) - defaults = {} - if argspec.defaults is not None: - defaults = dict( - zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) - if argspec.kwonlydefaults is not None: - defaults.update(argspec.kwonlydefaults) - arg_names = argspec.args[1:] - - @functools.wraps(method) - def wrapper(*positional_args, **keyword_args): - self = positional_args[0] - # Get default arg values - args = defaults.copy() - # Add provided arg values - for name, value in zip(arg_names, positional_args[1:]): - args[name] = value - args.update(keyword_args) - self.__dict__.update(args) - return method(*positional_args, **keyword_args) - - return wrapper + Import a function identified by a string like "pkg.module:fn_name". - -def import_function(spec): - """Import a function identified by a string like "pkg.module:fn_name". + :param spec: (str) the function to import + :return: (function) """ mod_name, fn_name = spec.split(':') module = importlib.import_module(mod_name) - fn = getattr(module, fn_name) - return fn + func = getattr(module, fn_name) + return func def flatten_grads(var_list, grads): - """Flattens a variables and their gradients. """ - return tf.concat([tf.reshape(grad, [U.numel(v)]) + Flattens a variables and their gradients. 
+ + :param var_list: ([TensorFlow Tensor]) the variables + :param grads: ([TensorFlow Tensor]) the gradients + :return: (TensorFlow Tensor) the flattend variable and gradient + """ + return tf.concat([tf.reshape(grad, [tf_util.numel(v)]) for (v, grad) in zip(var_list, grads)], 0) -def nn(input, layers_sizes, reuse=None, flatten=False, name=""): - """Creates a simple neural network +def mlp(_input, layers_sizes, reuse=None, flatten=False, name=""): + """ + Creates a simple fully-connected neural network + + :param _input: (TensorFlow Tensor) the input + :param layers_sizes: ([int]) the hidden layers + :param reuse: (bool) Enable reuse of the network + :param flatten: (bool) flatten the network output + :param name: (str) the name of the network + :return: (TensorFlow Tensor) the network """ for i, size in enumerate(layers_sizes): activation = tf.nn.relu if i < len(layers_sizes) - 1 else None - input = tf.layers.dense(inputs=input, - units=size, - kernel_initializer=tf.contrib.layers.xavier_initializer(), - reuse=reuse, - name=name + '_' + str(i)) + _input = tf.layers.dense(inputs=_input, + units=size, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + reuse=reuse, + name=name + '_' + str(i)) if activation: - input = activation(input) + _input = activation(_input) if flatten: assert layers_sizes[-1] == 1 - input = tf.reshape(input, [-1]) - return input + _input = tf.reshape(_input, [-1]) + return _input def install_mpi_excepthook(): - import sys - from mpi4py import MPI + """ + setup the MPI exception hooks + """ old_hook = sys.excepthook def new_hook(a, b, c): @@ -82,14 +72,23 @@ def new_hook(a, b, c): sys.stdout.flush() sys.stderr.flush() MPI.COMM_WORLD.Abort() + sys.excepthook = new_hook -def mpi_fork(n, extra_mpi_args=[]): - """Re-launches the current script with workers +def mpi_fork(rank, extra_mpi_args=None): + """ + Re-launches the current script with workers Returns "parent" for original parent, "child" for MPI children + + :param rank: (int) the thread rank + :param extra_mpi_args: (dict) extra arguments for MPI + :return: (str) the correct type of thread name """ - if n <= 1: + if extra_mpi_args is None: + extra_mpi_args = [] + + if rank <= 1: return "child" if os.getenv("IN_MPI") is None: env = os.environ.copy() @@ -99,9 +98,9 @@ def mpi_fork(n, extra_mpi_args=[]): IN_MPI="1" ) # "-bind-to core" is crucial for good performance - args = ["mpirun", "-np", str(n)] + \ - extra_mpi_args + \ - [sys.executable] + args = ["mpirun", "-np", str(rank)] + \ + extra_mpi_args + \ + [sys.executable] args += sys.argv subprocess.check_call(args, env=env) @@ -112,8 +111,11 @@ def mpi_fork(n, extra_mpi_args=[]): def convert_episode_to_batch_major(episode): - """Converts an episode to have the batch dimension in the major (first) - dimension. + """ + Converts an episode to have the batch dimension in the major (first) dimension. + + :param episode: (dict) the episode batch + :return: (dict) the episode batch with he batch dimension in the major (first) dimension. """ episode_batch = {} for key in episode.keys(): @@ -125,15 +127,23 @@ def convert_episode_to_batch_major(episode): def transitions_in_episode_batch(episode_batch): - """Number of transitions in a given episode batch. + """ + Number of transitions in a given episode batch. 
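A short graph-construction sketch for the renamed `mlp` helper (formerly `nn`), assuming TensorFlow 1.x as pinned in this repository; the input size is arbitrary.

import tensorflow as tf
from baselines.her.util import mlp

obs_ph = tf.placeholder(tf.float32, shape=[None, 10])            # hypothetical input size
# two ReLU hidden layers and a linear scalar output, flattened to shape [batch]
value = mlp(obs_ph, [64, 64, 1], flatten=True, name='critic')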
+ + :param episode_batch: (dict) the episode batch + :return: (int) the number of transitions in episode batch """ shape = episode_batch['u'].shape return shape[0] * shape[1] def reshape_for_broadcasting(source, target): - """Reshapes a tensor (source) to have the correct shape and dtype of the target - before broadcasting it with MPI. + """ + Reshapes a tensor (source) to have the correct shape and dtype of the target before broadcasting it with MPI. + + :param source: (TensorFlow Tensor) the input tensor + :param target: (TensorFlow Tensor) the target tensor + :return: (TensorFlow Tensor) the rehshaped tensor """ dim = len(target.get_shape()) shape = ([1] * (dim - 1)) + [-1] diff --git a/baselines/logger.py b/baselines/logger.py index 0abad0e8c5..eccd89d550 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -1,7 +1,6 @@ import os import sys import shutil -import os.path as osp import json import time import datetime @@ -15,21 +14,45 @@ DISABLED = 50 + class KVWriter(object): + """ + Key Value writer + """ def writekvs(self, kvs): + """ + write a dictionary to file + + :param kvs: (dict) + """ raise NotImplementedError + class SeqWriter(object): + """ + sequence writer + """ def writeseq(self, seq): + """ + write an array to file + + :param seq: (list) + """ raise NotImplementedError + class HumanOutputFormat(KVWriter, SeqWriter): def __init__(self, filename_or_file): + """ + log to a file, in a human readable format + + :param filename_or_file: (str or File) the file to write the log to + """ if isinstance(filename_or_file, str): self.file = open(filename_or_file, 'wt') self.own_file = True else: - assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s'%filename_or_file + assert hasattr(filename_or_file, 'read'), 'expected file or str, got %s' % filename_or_file self.file = filename_or_file self.own_file = False @@ -67,39 +90,58 @@ def writekvs(self, kvs): # Flush the output to the file self.file.flush() - def _truncate(self, s): - return s[:20] + '...' if len(s) > 23 else s + @classmethod + def _truncate(cls, string): + return string[:20] + '...' 
if len(string) > 23 else string def writeseq(self, seq): seq = list(seq) for (i, elem) in enumerate(seq): self.file.write(elem) - if i < len(seq) - 1: # add space unless this is the last one + if i < len(seq) - 1: # add space unless this is the last one self.file.write(' ') self.file.write('\n') self.file.flush() def close(self): + """ + closes the file + """ if self.own_file: self.file.close() + class JSONOutputFormat(KVWriter): def __init__(self, filename): + """ + log to a file, in the JSON format + + :param filename: (str) the file to write the log to + """ self.file = open(filename, 'wt') def writekvs(self, kvs): - for k, v in sorted(kvs.items()): - if hasattr(v, 'dtype'): - v = v.tolist() - kvs[k] = float(v) + for key, value in sorted(kvs.items()): + if hasattr(value, 'dtype'): + value = value.tolist() + kvs[key] = float(value) self.file.write(json.dumps(kvs) + '\n') self.file.flush() def close(self): + """ + closes the file + """ self.file.close() + class CSVOutputFormat(KVWriter): def __init__(self, filename): + """ + log to a file, in a CSV format + + :param filename: (str) the file to write the log to + """ self.file = open(filename, 'w+t') self.keys = [] self.sep = ',' @@ -112,77 +154,96 @@ def writekvs(self, kvs): self.file.seek(0) lines = self.file.readlines() self.file.seek(0) - for (i, k) in enumerate(self.keys): + for (i, key) in enumerate(self.keys): if i > 0: self.file.write(',') - self.file.write(k) + self.file.write(key) self.file.write('\n') for line in lines[1:]: self.file.write(line[:-1]) self.file.write(self.sep * len(extra_keys)) self.file.write('\n') - for (i, k) in enumerate(self.keys): + for i, key in enumerate(self.keys): if i > 0: self.file.write(',') - v = kvs.get(k) - if v is not None: - self.file.write(str(v)) + value = kvs.get(key) + if value is not None: + self.file.write(str(value)) self.file.write('\n') self.file.flush() def close(self): + """ + closes the file + """ self.file.close() class TensorBoardOutputFormat(KVWriter): - """ - Dumps key/value pairs into TensorBoard's numeric format. - """ - def __init__(self, dir): - os.makedirs(dir, exist_ok=True) - self.dir = dir + def __init__(self, folder): + """ + Dumps key/value pairs into TensorBoard's numeric format. + + :param folder: (str) the folder to write the log to + """ + os.makedirs(folder, exist_ok=True) + self.dir = folder self.step = 1 prefix = 'events' - path = osp.join(osp.abspath(dir), prefix) + path = os.path.join(os.path.abspath(folder), prefix) import tensorflow as tf from tensorflow.python import pywrap_tensorflow from tensorflow.core.util import event_pb2 from tensorflow.python.util import compat - self.tf = tf + self._tf = tf self.event_pb2 = event_pb2 self.pywrap_tensorflow = pywrap_tensorflow self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path)) def writekvs(self, kvs): - def summary_val(k, v): - kwargs = {'tag': k, 'simple_value': float(v)} - return self.tf.Summary.Value(**kwargs) - summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) + def summary_val(key, value): + kwargs = {'tag': key, 'simple_value': float(value)} + return self._tf.Summary.Value(**kwargs) + + summary = self._tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()]) event = self.event_pb2.Event(wall_time=time.time(), summary=summary) - event.step = self.step # is there any reason why you'd want to specify the step? + event.step = self.step # is there any reason why you'd want to specify the step? 
self.writer.WriteEvent(event) self.writer.Flush() self.step += 1 def close(self): + """ + closes the file + """ if self.writer: self.writer.Close() self.writer = None -def make_output_format(format, ev_dir, log_suffix=''): + +def make_output_format(_format, ev_dir, log_suffix=''): + """ + return a logger for the requested format + + :param _format: (str) the requested format to log to ('stdout', 'log', 'json', 'csv' or 'tensorboard') + :param ev_dir: (str) the logging directory + :param log_suffix: (str) the suffix for the log file + :return: (KVWrite) the logger + """ os.makedirs(ev_dir, exist_ok=True) - if format == 'stdout': + if _format == 'stdout': return HumanOutputFormat(sys.stdout) - elif format == 'log': - return HumanOutputFormat(osp.join(ev_dir, 'log%s.txt' % log_suffix)) - elif format == 'json': - return JSONOutputFormat(osp.join(ev_dir, 'progress%s.json' % log_suffix)) - elif format == 'csv': - return CSVOutputFormat(osp.join(ev_dir, 'progress%s.csv' % log_suffix)) - elif format == 'tensorboard': - return TensorBoardOutputFormat(osp.join(ev_dir, 'tb%s' % log_suffix)) + elif _format == 'log': + return HumanOutputFormat(os.path.join(ev_dir, 'log%s.txt' % log_suffix)) + elif _format == 'json': + return JSONOutputFormat(os.path.join(ev_dir, 'progress%s.json' % log_suffix)) + elif _format == 'csv': + return CSVOutputFormat(os.path.join(ev_dir, 'progress%s.csv' % log_suffix)) + elif _format == 'tensorboard': + return TensorBoardOutputFormat(os.path.join(ev_dir, 'tb%s' % log_suffix)) else: - raise ValueError('Unknown format specified: %s' % (format,)) + raise ValueError('Unknown format specified: %s' % (_format,)) + # ================================================================ # API @@ -193,94 +254,164 @@ def logkv(key, val): Log a value of some diagnostic Call this once for each diagnostic quantity, each iteration If called many times, last value will be used. + + :param key: (Any) save to log this key + :param val: (Any) save to log this value """ Logger.CURRENT.logkv(key, val) + def logkv_mean(key, val): """ The same as logkv(), but if called many times, values averaged. + + :param key: (Any) save to log this key + :param val: (Number) save to log this value """ Logger.CURRENT.logkv_mean(key, val) -def logkvs(d): + +def logkvs(key_values): """ Log a dictionary of key-value pairs + + :param key_values: (dict) the list of keys and values to save to log """ - for (k, v) in d.items(): - logkv(k, v) + for key, value in key_values.items(): + logkv(key, value) + def dumpkvs(): """ Write all of the diagnostics from the current iteration - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. """ Logger.CURRENT.dumpkvs() + def getkvs(): + """ + get the key values logs + + :return: (dict) the logged values + """ return Logger.CURRENT.name2val def log(*args, level=INFO): """ - Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + + level: int. (see logger.py docs) If the global logger level is higher than + the level argument here, don't print to stdout. 
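A minimal usage sketch of the logger API refactored above; the folder and format choice are hypothetical, and `dumpkvs` writes one row per call to each configured output.

from baselines import logger

logger.configure(folder='/tmp/her-demo', format_strs=['stdout', 'csv'])
logger.logkv('epoch', 1)
logger.logkv_mean('test/success_rate', 0.75)
logger.dumpkvs()   # prints a table and appends a row to /tmp/her-demo/progress.csv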
+ + :param args: (list) log the arguments + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) """ Logger.CURRENT.log(*args, level=level) + def debug(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the DEBUG level. + + :param args: (list) log the arguments + """ log(*args, level=DEBUG) + def info(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the INFO level. + + :param args: (list) log the arguments + """ log(*args, level=INFO) + def warn(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the WARN level. + + :param args: (list) log the arguments + """ log(*args, level=WARN) + def error(*args): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + Using the ERROR level. + + :param args: (list) log the arguments + """ log(*args, level=ERROR) def set_level(level): """ Set logging threshold on current logger. + + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) """ Logger.CURRENT.set_level(level) + def get_dir(): """ Get directory that log files are being written to. will be None if there is no output directory (i.e., if you didn't call start) + + :return: (str) the logging directory """ return Logger.CURRENT.get_dir() + record_tabular = logkv dump_tabular = dumpkvs + class ProfileKV: - """ - Usage: - with logger.ProfileKV("interesting_scope"): - code - """ - def __init__(self, n): - self.n = "wait_" + n + def __init__(self, name): + """ + Usage: + with logger.ProfileKV("interesting_scope"): + code + + :param name: (str) the profiling name + """ + self.name = "wait_" + name + def __enter__(self): - self.t1 = time.time() - def __exit__(self ,type, value, traceback): - Logger.CURRENT.name2val[self.n] += time.time() - self.t1 + self.start_time = time.time() + + def __exit__(self, _type, value, traceback): + Logger.CURRENT.name2val[self.name] += time.time() - self.start_time + -def profile(n): +def profile(name): """ Usage: @profile("my_func") def my_func(): code + + :param name: (str) the profiling name + :return: (function) the wrapped function """ def decorator_with_name(func): def func_wrapper(*args, **kwargs): - with ProfileKV(n): + with ProfileKV(name): return func(*args, **kwargs) + return func_wrapper + return decorator_with_name @@ -289,32 +420,57 @@ def func_wrapper(*args, **kwargs): # ================================================================ class Logger(object): - DEFAULT = None # A logger with no output files. (See right below class definition) - # So that you can still log to the terminal without setting up any output files + # A logger with no output files. 
(See right below class definition) + # So that you can still log to the terminal without setting up any output files + DEFAULT = None CURRENT = None # Current logger being used by the free functions above - def __init__(self, dir, output_formats): + def __init__(self, folder, output_formats): + """ + the logger class + + :param folder: (str) the logging location + :param output_formats: ([str]) the list of output format + """ self.name2val = defaultdict(float) # values this iteration self.name2cnt = defaultdict(int) self.level = INFO - self.dir = dir + self.dir = folder self.output_formats = output_formats # Logging API, forwarded # ---------------------------------------- def logkv(self, key, val): + """ + Log a value of some diagnostic + Call this once for each diagnostic quantity, each iteration + If called many times, last value will be used. + + :param key: (Any) save to log this key + :param val: (Any) save to log this value + """ self.name2val[key] = val def logkv_mean(self, key, val): + """ + The same as logkv(), but if called many times, values averaged. + + :param key: (Any) save to log this key + :param val: (Number) save to log this value + """ if val is None: self.name2val[key] = None return oldval, cnt = self.name2val[key], self.name2cnt[key] - self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) + self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1) self.name2cnt[key] = cnt + 1 def dumpkvs(self): - if self.level == DISABLED: return + """ + Write all of the diagnostics from the current iteration + """ + if self.level == DISABLED: + return for fmt in self.output_formats: if isinstance(fmt, KVWriter): fmt.writekvs(self.name2val) @@ -322,38 +478,75 @@ def dumpkvs(self): self.name2cnt.clear() def log(self, *args, level=INFO): + """ + Write the sequence of args, with no separators, + to the console and output files (if you've configured an output file). + + level: int. (see logger.py docs) If the global logger level is higher than + the level argument here, don't print to stdout. + + :param args: (list) log the arguments + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ if self.level <= level: self._do_log(args) # Configuration # ---------------------------------------- def set_level(self, level): + """ + Set logging threshold on current logger. + + :param level: (int) the logging level (can be DEBUG=10, INFO=20, WARN=30, ERROR=40, DISABLED=50) + """ self.level = level def get_dir(self): + """ + Get directory that log files are being written to. 
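A worked example of the running average computed by `logkv_mean` above, starting from a key that has not been logged yet.

from baselines import logger

logger.logkv_mean('b', 100.0)   # count 0 -> 100.0
logger.logkv_mean('b', -22.0)   # (100.0 * 1 - 22.0) / 2 = 39.0
logger.logkv_mean('b', 66.0)    # (39.0 * 2 + 66.0) / 3 = 48.0
assert logger.getkvs()['b'] == 48.0
logger.dumpkvs()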
+ will be None if there is no output directory (i.e., if you didn't call start) + + :return: (str) the logging directory + """ return self.dir def close(self): + """ + closes the file + """ for fmt in self.output_formats: fmt.close() # Misc # ---------------------------------------- def _do_log(self, args): + """ + log to the requested format outputs + + :param args: (list) the arguments to log + """ for fmt in self.output_formats: if isinstance(fmt, SeqWriter): fmt.writeseq(map(str, args)) -Logger.DEFAULT = Logger.CURRENT = Logger(dir=None, output_formats=[HumanOutputFormat(sys.stdout)]) -def configure(dir=None, format_strs=None): - if dir is None: - dir = os.getenv('OPENAI_LOGDIR') - if dir is None: - dir = osp.join(tempfile.gettempdir(), - datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) - assert isinstance(dir, str) - os.makedirs(dir, exist_ok=True) +Logger.DEFAULT = Logger.CURRENT = Logger(folder=None, output_formats=[HumanOutputFormat(sys.stdout)]) + + +def configure(folder=None, format_strs=None): + """ + configure the current logger + + :param folder: (str) the save location (if None, $OPENAI_LOGDIR, if still None, tempdir/openai-[date & time]) + :param format_strs: (list) the output logging format + (if None, $OPENAI_LOG_FORMAT, if still None, ['stdout', 'log', 'csv']) + """ + if folder is None: + folder = os.getenv('OPENAI_LOGDIR') + if folder is None: + folder = os.path.join(tempfile.gettempdir(), datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f")) + assert isinstance(folder, str) + os.makedirs(folder, exist_ok=True) log_suffix = '' from mpi4py import MPI @@ -367,40 +560,61 @@ def configure(dir=None, format_strs=None): else: format_strs = os.getenv('OPENAI_LOG_FORMAT_MPI', 'log').split(',') format_strs = filter(None, format_strs) - output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] + output_formats = [make_output_format(f, folder, log_suffix) for f in format_strs] + + Logger.CURRENT = Logger(folder=folder, output_formats=output_formats) + log('Logging to %s' % folder) - Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) - log('Logging to %s'%dir) def reset(): + """ + reset the current logger + """ if Logger.CURRENT is not Logger.DEFAULT: Logger.CURRENT.close() Logger.CURRENT = Logger.DEFAULT log('Reset logger') -class scoped_configure(object): - def __init__(self, dir=None, format_strs=None): - self.dir = dir + +class ScopedConfigure(object): + def __init__(self, folder=None, format_strs=None): + """ + Class for using context manager while logging + + usage: + with ScopedConfigure(folder=None, format_strs=None): + {code} + + :param folder: (str) the logging folder + :param format_strs: ([str]) the list of output logging format + """ + self.dir = folder self.format_strs = format_strs self.prevlogger = None + def __enter__(self): self.prevlogger = Logger.CURRENT - configure(dir=self.dir, format_strs=self.format_strs) + configure(folder=self.dir, format_strs=self.format_strs) + def __exit__(self, *args): Logger.CURRENT.close() Logger.CURRENT = self.prevlogger + # ================================================================ def _demo(): + """ + tests for the logger module + """ info("hi") debug("shouldn't appear") set_level(DEBUG) debug("should appear") - dir = "/tmp/testlogging" - if os.path.exists(dir): - shutil.rmtree(dir) - configure(dir=dir) + folder = "/tmp/testlogging" + if os.path.exists(folder): + shutil.rmtree(folder) + configure(folder=folder) logkv("a", 3) logkv("b", 2.5) dumpkvs() @@ -412,13 
+626,19 @@ def _demo(): logkv_mean("b", -44.4) logkv("a", 5.5) dumpkvs() - info("^^^ should see b = 33.3") + with ScopedConfigure(None, None): + info("^^^ should see b = 33.3") - logkv("b", -2.5) - dumpkvs() + with ScopedConfigure("/tmp/test-logger/", ["json"]): + logkv("b", -2.5) + dumpkvs() + reset() logkv("a", "longasslongasslongasslongasslongasslongassvalue") dumpkvs() + warn("hey") + error("oh") + logkvs({"test": 1}) # ================================================================ @@ -426,50 +646,67 @@ def _demo(): # ================================================================ def read_json(fname): + """ + read a json file using pandas + + :param fname: (str) the file path to read + :return: (pandas DataFrame) the data in the json + """ import pandas - ds = [] - with open(fname, 'rt') as fh: - for line in fh: - ds.append(json.loads(line)) - return pandas.DataFrame(ds) + data = [] + with open(fname, 'rt') as file_handler: + for line in file_handler: + data.append(json.loads(line)) + return pandas.DataFrame(data) + def read_csv(fname): + """ + read a csv file using pandas + + :param fname: (str) the file path to read + :return: (pandas DataFrame) the data in the csv + """ import pandas return pandas.read_csv(fname, index_col=None, comment='#') + def read_tb(path): """ - path : a tensorboard file OR a directory, where we will find all TB files - of the form events.* + read a tensorboard output + + :param path: (str) a tensorboard file OR a directory, where we will find all TB files of the form events. + :return: (pandas DataFrame) the tensorboad data """ import pandas import numpy as np from glob import glob - from collections import defaultdict + # from collections import defaultdict import tensorflow as tf - if osp.isdir(path): - fnames = glob(osp.join(path, "events.*")) - elif osp.basename(path).startswith("events."): + if os.path.isdir(path): + fnames = glob(os.path.join(path, "events.*")) + elif os.path.basename(path).startswith("events."): fnames = [path] else: - raise NotImplementedError("Expected tensorboard file or directory containing them. Got %s"%path) + raise NotImplementedError("Expected tensorboard file or directory containing them. 
Got %s" % path) tag2pairs = defaultdict(list) maxstep = 0 for fname in fnames: for summary in tf.train.summary_iterator(fname): if summary.step > 0: - for v in summary.summary.value: - pair = (summary.step, v.simple_value) - tag2pairs[v.tag].append(pair) + for value in summary.summary.value: + pair = (summary.step, value.simple_value) + tag2pairs[value.tag].append(pair) maxstep = max(summary.step, maxstep) data = np.empty((maxstep, len(tag2pairs))) data[:] = np.nan tags = sorted(tag2pairs.keys()) - for (colidx,tag) in enumerate(tags): + for (colidx, tag) in enumerate(tags): pairs = tag2pairs[tag] for (step, value) in pairs: - data[step-1, colidx] = value + data[step - 1, colidx] = value return pandas.DataFrame(data, columns=tags) + if __name__ == "__main__": _demo() diff --git a/baselines/ppo1/cnn_policy.py b/baselines/ppo1/cnn_policy.py index 6aec8c0e97..f2498f4fdf 100644 --- a/baselines/ppo1/cnn_policy.py +++ b/baselines/ppo1/cnn_policy.py @@ -1,56 +1,71 @@ -import baselines.common.tf_util as U import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype -class CnnPolicy(object): +import baselines.common.tf_util as tf_util +from baselines.ppo1.mlp_policy import BasePolicy + + +class CnnPolicy(BasePolicy): recurrent = False - def __init__(self, name, ob_space, ac_space, kind='large'): - with tf.variable_scope(name): - self._init(ob_space, ac_space, kind) - self.scope = tf.get_variable_scope().name - - def _init(self, ob_space, ac_space, kind): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - x = ob / 255.0 - if kind == 'small': # from A3C paper - x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) - elif kind == 'large': # Nature DQN - x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) - else: - raise NotImplementedError - - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] - self.state_in = [] - self.state_out = [] + def __init__(self, name, ob_space, ac_space, architecture_size='large', sess=None, reuse=False, placeholders=None): + """ + A CNN policy object for PPO1 + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param architecture_size: (str) size of the policy's architecture + (small as in A3C paper, large as in Nature DQN) + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 
+ :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(CnnPolicy, self).__init__(placeholders=placeholders) + self.reuse = reuse + self.name = name + self._init(ob_space, ac_space, architecture_size) + self.scope = tf.get_variable_scope().name + self.sess = sess + + def _init(self, ob_space, ac_space, architecture_size): + """ - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() # XXX - self._act = U.function([stochastic, ob], [ac, self.vpred]) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param architecture_size: (str) size of the policy's architecture + (small as in A3C paper, large as in Nature DQN) + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] + with tf.variable_scope(self.name, reuse=self.reuse): + normalized_obs = obs / 255.0 + if architecture_size == 'small': # from A3C paper + layer_1 = tf.nn.relu(tf_util.conv2d(normalized_obs, 16, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_util.conv2d(layer_1, 32, "l2", [4, 4], [2, 2], pad="VALID")) + flattened_layer_2 = tf_util.flattenallbut0(layer_2) + last_layer = tf.nn.relu(tf.layers.dense(flattened_layer_2, 256, + name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) + elif architecture_size == 'large': # Nature DQN + layer_1 = tf.nn.relu(tf_util.conv2d(normalized_obs, 32, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_util.conv2d(layer_1, 64, "l2", [4, 4], [2, 2], pad="VALID")) + layer_3 = tf.nn.relu(tf_util.conv2d(layer_2, 64, "l3", [3, 3], [1, 1], pad="VALID")) + flattened_layer_3 = tf_util.flattenallbut0(layer_3) + last_layer = tf.nn.relu(tf.layers.dense(flattened_layer_3, 512, + name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) + else: + raise NotImplementedError + + logits = tf.layers.dense(last_layer, pdtype.param_shape()[0], name='logits', + kernel_initializer=tf_util.normc_initializer(0.01)) + + self.proba_distribution = pdtype.proba_distribution_from_flat(logits) + self.vpred = tf.layers.dense(last_layer, 1, + name='value', kernel_initializer=tf_util.normc_initializer(1.0))[:, 0] + + self.state_in = [] + self.state_out = [] + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = self.proba_distribution.sample() + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/ppo1/mlp_policy.py b/baselines/ppo1/mlp_policy.py index 7f979b3495..34322fd608 100644 --- a/baselines/ppo1/mlp_policy.py +++ b/baselines/ppo1/mlp_policy.py @@ -1,61 +1,152 @@ -from baselines.common.mpi_running_mean_std import RunningMeanStd -import baselines.common.tf_util as U import tensorflow as tf import gym -from baselines.common.distributions import make_pdtype -class MlpPolicy(object): - recurrent = False - def __init__(self, name, *args, **kwargs): - with tf.variable_scope(name): - self._init(*args, **kwargs) - self.scope = tf.get_variable_scope().name +from baselines.common.mpi_running_mean_std import RunningMeanStd +import 
baselines.common.tf_util as tf_util +from baselines.common.distributions import make_proba_dist_type - def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): + +class BasePolicy(object): + def __init__(self, placeholders=None): + """ + A base policy object for PPO1 + + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(BasePolicy, self).__init__() + self.sess = None + self.pdtype = None + self._act = None + self.scope = None + self.obs_ph = None + self.stochastic_ph = None + + if placeholders is not None: + self.obs_ph = placeholders.get("obs", None) + self.stochastic_ph = placeholders.get("stochastic", None) + + def get_obs_and_pdtype(self, ob_space, ac_space): + """ + Initialize probability distribution and get observation placeholder. + + :param ob_space: (Gym Spaces) the observation space + :param ac_space: (Gym Spaces) the action space + """ assert isinstance(ob_space, gym.spaces.Box) - self.pdtype = pdtype = make_pdtype(ac_space) + self.pdtype = pdtype = make_proba_dist_type(ac_space) sequence_length = None - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) + if self.obs_ph is None: + self.obs_ph = tf.placeholder(dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape), name="ob") + + return self.obs_ph, pdtype + + def act(self, stochastic, obs): + """ + Get the action from the policy, using the observation + + :param stochastic: (bool) whether or not to use a stochastic or deterministic policy + :param obs: (TensorFlow Tensor or numpy Number) the observation + :return: (numpy Number, numpy Number) the action and value function + """ + ac1, vpred1 = self._act(stochastic, obs[None], sess=self.sess) + return ac1[0], vpred1[0] + + def get_variables(self): + """ + Get all the policy's variables + + :return: ([TensorFlow Tensor]) the variables of the network + """ + return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) + + def get_trainable_variables(self): + """ + Get the policy's trainable variables + + :return: ([TensorFlow Tensor]) the trainable variables of the network + """ + return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) + + @classmethod + def get_initial_state(cls): + """ + Get the initial state + + :return: ([numpy Number]) the initial state + """ + return [] + + +class MlpPolicy(BasePolicy): + recurrent = False + + def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): + """ + A MLP policy object for PPO1 + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 
+ :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + :param gaussian_fixed_var: (bool) enable gaussian sampling with fixed variance, when using continuous actions + """ + super(MlpPolicy, self).__init__(placeholders=placeholders) + self.reuse = reuse + self.name = name + self._init(*args, **kwargs) + self.scope = tf.get_variable_scope().name + self.sess = sess + + def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): + """ + + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param hid_size: (int) the size of the hidden layers + :param num_hid_layers: (int) the number of hidden layers + :param gaussian_fixed_var: (bool) enable gaussian sampling with fixed variance, when using continuous actions + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) - with tf.variable_scope("obfilter"): + with tf.variable_scope(self.name + "/obfilter", reuse=self.reuse): self.ob_rms = RunningMeanStd(shape=ob_space.shape) - with tf.variable_scope('vf'): - obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) + with tf.variable_scope(self.name + '/vf', reuse=self.reuse): + obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), + kernel_initializer=tf_util.normc_initializer(1.0))) + self.vpred = tf.layers.dense(last_out, 1, name='final', + kernel_initializer=tf_util.normc_initializer(1.0))[:, 0] - with tf.variable_scope('pol'): + with tf.variable_scope(self.name + '/pol', reuse=self.reuse): last_out = obz for i in range(num_hid_layers): - last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) + last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), + kernel_initializer=tf_util.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): - mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) - logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) + mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', + kernel_initializer=tf_util.normc_initializer(0.01)) + logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], + initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: - pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) + pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', + kernel_initializer=tf_util.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(pdparam) + self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) self.state_in = [] self.state_out = [] - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, 
ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) + self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py index f2f13a6172..f663a66c10 100644 --- a/baselines/ppo1/pposgd_simple.py +++ b/baselines/ppo1/pposgd_simple.py @@ -1,202 +1,193 @@ +from collections import deque +import time + +import tensorflow as tf +import numpy as np +from mpi4py import MPI + from baselines.common import Dataset, explained_variance, fmt_row, zipsame from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time +import baselines.common.tf_util as tf_util from baselines.common.mpi_adam import MpiAdam from baselines.common.mpi_moments import mpi_moments -from mpi4py import MPI -from collections import deque +from baselines.gail.trpo_mpi import traj_segment_generator, add_vtarg_and_adv, flatten_lists -def traj_segment_generator(pi, env, horizon, stochastic): - t = 0 - ac = env.action_space.sample() # not used, just so we have the datatype - new = True # marks if we're on first timestep of an episode - ob = env.reset() - - cur_ep_ret = 0 # return in current episode - cur_ep_len = 0 # len of current episode - ep_rets = [] # returns of completed episodes in this segment - ep_lens = [] # lengths of ... - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - # Be careful!!! 
if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): +def learn(env, policy_fn, *, timesteps_per_actorbatch, clip_param, entcoeff, optim_epochs, optim_stepsize, + optim_batchsize, gamma, lam, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, callback=None, + adam_epsilon=1e-5, schedule='constant'): """ - Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) + Train a PPO1 model with Stochastic Gradient Descent + + :param env: (Gym Environment) environment to train on + :param policy_fn: (function (str, Gym Spaces, Gym Spaces): TensorFlow Tensor) creates the policy + :param timesteps_per_actorbatch: (int) timesteps per actor per update + :param clip_param: (float) clipping parameter epsilon + :param entcoeff: (float) the entropy loss weight + :param optim_epochs: (int) the number of optimization epochs per update + :param optim_stepsize: (float) the optimizer's stepsize + :param optim_batchsize: (int) the optimizer's batch size + :param gamma: (float) discount factor + :param lam: (float) advantage estimation factor (GAE lambda) + :param max_timesteps: (int) number of env steps to optimize for + :param max_episodes: (int) the maximum number of episodes + :param max_iters: (int) the maximum number of iterations + :param max_seconds: (int) the maximal training duration in seconds + :param callback: (function (dict, dict)) function called at every step with the state of the algorithm. + It takes the local and global variables.
+ :param adam_epsilon: (float) the epsilon value for the adam optimizer + :param schedule: (str) The type of scheduler for the learning rate update + ('linear' or 'constant'; other values raise NotImplementedError) """ - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] - -def learn(env, policy_fn, *, - timesteps_per_actorbatch, # timesteps per actor per update - clip_param, entcoeff, # clipping parameter epsilon, entropy coeff - optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers - gamma, lam, # advantage estimation - max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint - callback=None, # you can do anything in the callback, since it takes locals(), globals() - adam_epsilon=1e-5, - schedule='constant' # annealing for stepsize parameters (epsilon and adam) - ): + # Setup losses and stuff - # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy - oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return + sess = tf_util.single_threaded_session() + + # Construct network for new policy + policy = policy_fn("pi", ob_space, ac_space, sess=sess) + + # Network for old policy + oldpi = policy_fn("oldpi", ob_space, ac_space, sess=sess, + placeholders={"obs": policy.obs_ph, "stochastic": policy.stochastic_ph}) - lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule - clip_param = clip_param * lrmult # Annealed cliping parameter epislon + # Target advantage function (if applicable) + atarg = tf.placeholder(dtype=tf.float32, shape=[None]) - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) + # Empirical return + ret = tf.placeholder(dtype=tf.float32, shape=[None]) - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() + # learning rate multiplier, updated with schedule + lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) + + # Annealed clipping parameter epsilon + clip_param = clip_param * lrmult + + obs_ph = policy.obs_ph + action_ph = policy.pdtype.sample_placeholder([None]) + + kloldnew = oldpi.proba_distribution.kl(policy.proba_distribution) + ent = policy.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold - surr1 = ratio * atarg # surrogate from conservative policy iteration - surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # - pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) - vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) + # pnew / pold + ratio = tf.exp(policy.proba_distribution.logp(action_ph) - oldpi.proba_distribution.logp(action_ph)) + +
# surrogate from conservative policy iteration + surr1 = ratio * atarg + surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg + + # PPO's pessimistic surrogate (L^CLIP) + pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) + vf_loss = tf.reduce_mean(tf.square(policy.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] - var_list = pi.get_trainable_variables() - lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) - adam = MpiAdam(var_list, epsilon=adam_epsilon) + var_list = policy.get_trainable_variables() + lossandgrad = tf_util.function([obs_ph, action_ph, atarg, ret, lrmult], + losses + [tf_util.flatgrad(total_loss, var_list)]) + adam = MpiAdam(var_list, epsilon=adam_epsilon, sess=sess) - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) + assign_old_eq_new = tf_util.function([], [], updates=[tf.assign(oldv, newv) + for (oldv, newv) in + zipsame(oldpi.get_variables(), policy.get_variables())]) + compute_losses = tf_util.function([obs_ph, action_ph, atarg, ret, lrmult], losses) - U.initialize() + tf_util.initialize(sess=sess) adam.sync() # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) + seg_gen = traj_segment_generator(policy, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards + t_start = time.time() + + # rolling buffer for episode lengths + lenbuffer = deque(maxlen=100) + # rolling buffer for episode rewards + rewbuffer = deque(maxlen=100) - assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" + assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, + max_seconds > 0]) == 1, "Only one time constraint permitted" while True: - if callback: callback(locals(), globals()) + if callback: + callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break - elif max_seconds and time.time() - tstart >= max_seconds: + elif max_seconds and time.time() - t_start >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': - cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) + cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError - logger.log("********** Iteration %i ************"%iters_so_far) + logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) - 
optim_batchsize = optim_batchsize or ob.shape[0] + obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy + # predicted value function before udpate + vpredbefore = seg["vpred"] - assign_old_eq_new() # set old parameter values to new parameter values + # standardized advantage function estimate + atarg = (atarg - atarg.mean()) / atarg.std() + dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret), + shuffle=not policy.recurrent) + optim_batchsize = optim_batchsize or obs_ph.shape[0] + + if hasattr(policy, "ob_rms"): + # update running mean/std for policy + policy.ob_rms.update(obs_ph) + + # set old parameter values to new parameter values + assign_old_eq_new(sess=sess) logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) + # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): - losses = [] # list of tuples, each of which gives the loss for a minibatch - for batch in d.iterate_once(optim_batchsize): - *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) - adam.update(g, optim_stepsize * cur_lrmult) + # list of tuples, each of which gives the loss for a minibatch + losses = [] + for batch in dataset.iterate_once(optim_batchsize): + *newlosses, grad = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, + sess=sess) + adam.update(grad, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] - for batch in d.iterate_once(optim_batchsize): - newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) + for batch in dataset.iterate_once(optim_batchsize): + newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, sess=sess) losses.append(newlosses) - meanlosses,_,_ = mpi_moments(losses, axis=0) - logger.log(fmt_row(13, meanlosses)) - for (lossval, name) in zipsame(meanlosses, loss_names): - logger.record_tabular("loss_"+name, lossval) + mean_losses, _, _ = mpi_moments(losses, axis=0) + logger.log(fmt_row(13, mean_losses)) + for (loss_val, name) in zipsame(mean_losses, loss_names): + logger.record_tabular("loss_" + name, loss_val) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples + + # local values + lrlocal = (seg["ep_lens"], seg["ep_rets"]) + + # list of tuples + listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) @@ -208,11 +199,8 @@ def learn(env, policy_fn, *, iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - if MPI.COMM_WORLD.Get_rank()==0: + logger.record_tabular("TimeElapsed", time.time() - t_start) + if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() - return pi - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] + return policy diff --git a/baselines/ppo1/run_atari.py b/baselines/ppo1/run_atari.py index 17941c6d39..186963b9ec 100644 --- a/baselines/ppo1/run_atari.py +++ b/baselines/ppo1/run_atari.py @@ -1,19 +1,25 @@ 
#!/usr/bin/env python3 +import os from mpi4py import MPI + from baselines.common import set_global_seeds -from baselines import bench -import os.path as osp -from baselines import logger +from baselines import bench, logger from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.cmd_util import atari_arg_parser +from baselines.ppo1 import pposgd_simple, cnn_policy + def train(env_id, num_timesteps, seed): - from baselines.ppo1 import pposgd_simple, cnn_policy - import baselines.common.tf_util as U + """ + Train PPO1 model for Atari environments, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() + if rank == 0: logger.configure() else: @@ -21,28 +27,36 @@ def train(env_id, num_timesteps, seed): workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 + return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, + placeholders=placeholders) + env = bench.Monitor(env, logger.get_dir() and - osp.join(logger.get_dir(), str(rank))) + os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, - max_timesteps=int(num_timesteps * 1.1), - timesteps_per_actorbatch=256, - clip_param=0.2, entcoeff=0.01, - optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, - gamma=0.99, lam=0.95, - schedule='linear' - ) + max_timesteps=int(num_timesteps * 1.1), + timesteps_per_actorbatch=256, + clip_param=0.2, entcoeff=0.01, + optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, + gamma=0.99, lam=0.95, + schedule='linear' + ) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py index d7d8f5a49b..1df67f5743 100644 --- a/baselines/ppo1/run_humanoid.py +++ b/baselines/ppo1/run_humanoid.py @@ -1,75 +1,88 @@ #!/usr/bin/env python3 import os + +import gym + +from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U +from baselines.common import tf_util from baselines import logger -import gym def train(num_timesteps, seed, model_path=None): + """ + Train PPO1 model for the Humanoid environment, for testing purposes + + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + :param model_path: (str) path to the model + """ env_id = 'Humanoid-v2' - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, + sess=sess, 
placeholders=placeholders) + env = make_mujoco_env(env_id, seed) # parameters below were the best found in a simple random search # these are good enough to make humanoid walk, but whether those are # an absolute best or not is not certain env = RewScale(env, 0.1) - pi = pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, - optim_stepsize=3e-4, - optim_batchsize=64, - gamma=0.99, - lam=0.95, - schedule='linear', - ) + policy = pposgd_simple.learn(env, policy_fn, + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=10, + optim_stepsize=3e-4, + optim_batchsize=64, + gamma=0.99, + lam=0.95, + schedule='linear') env.close() if model_path: - U.save_state(model_path) - - return pi + tf_util.save_state(model_path) + + return policy + class RewScale(gym.RewardWrapper): def __init__(self, env, scale): gym.RewardWrapper.__init__(self, env) self.scale = scale - def reward(self, r): - return r * self.scale + + def reward(self, _reward): + return _reward * self.scale + def main(): + """ + Runs the test + """ logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) parser.set_defaults(num_timesteps=int(2e7)) - + args = parser.parse_args() - + if not args.play: # train the model train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) - else: + else: # construct the model object, load pre-trained model and render - pi = train(num_timesteps=1, seed=args.seed) - U.load_state(args.model_path) + policy = train(num_timesteps=1, seed=args.seed) + tf_util.load_state(args.model_path) env = make_mujoco_env('Humanoid-v2', seed=0) - ob = env.reset() + obs = env.reset() while True: - action = pi.act(stochastic=False, ob=ob)[0] - ob, _, done, _ = env.step(action) + action = policy.act(stochastic=False, obs=obs)[0] + obs, _, done, _ = env.step(action) env.render() if done: - ob = env.reset() - - - + obs = env.reset() + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_mujoco.py b/baselines/ppo1/run_mujoco.py index 638998316b..84f0075632 100644 --- a/baselines/ppo1/run_mujoco.py +++ b/baselines/ppo1/run_mujoco.py @@ -1,29 +1,40 @@ #!/usr/bin/env python3 +from baselines.ppo1 import mlp_policy, pposgd_simple from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser -from baselines.common import tf_util as U from baselines import logger + def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - U.make_session(num_cpu=1).__enter__() - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=64, num_hid_layers=2) + """ + Train PPO1 model for the Mujoco environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=2, + sess=sess, placeholders=placeholders) + env = make_mujoco_env(env_id, seed) pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, - gamma=0.99, lam=0.95, 
schedule='linear', - ) + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, + gamma=0.99, lam=0.95, schedule='linear') env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == '__main__': main() diff --git a/baselines/ppo1/run_robotics.py b/baselines/ppo1/run_robotics.py index 7d84185a1b..892c38e55a 100644 --- a/baselines/ppo1/run_robotics.py +++ b/baselines/ppo1/run_robotics.py @@ -1,37 +1,45 @@ #!/usr/bin/env python3 from mpi4py import MPI +import mujoco_py + from baselines.common import set_global_seeds -from baselines import logger from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser -import mujoco_py +from baselines.ppo1 import mlp_policy, pposgd_simple def train(env_id, num_timesteps, seed): - from baselines.ppo1 import mlp_policy, pposgd_simple - import baselines.common.tf_util as U + """ + Train PPO1 model for Robotics environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() - mujoco_py.ignore_mujoco_warnings().__enter__() - workerseed = seed + 10000 * rank - set_global_seeds(workerseed) - env = make_robotics_env(env_id, workerseed, rank=rank) - def policy_fn(name, ob_space, ac_space): - return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=256, num_hid_layers=3) - - pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, - optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, - gamma=0.99, lam=0.95, schedule='linear', - ) - env.close() + with mujoco_py.ignore_mujoco_warnings(): + workerseed = seed + 10000 * rank + set_global_seeds(workerseed) + env = make_robotics_env(env_id, workerseed, rank=rank) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3, + sess=sess, placeholders=placeholders) + + pposgd_simple.learn(env, policy_fn, + max_timesteps=num_timesteps, + timesteps_per_actorbatch=2048, + clip_param=0.2, entcoeff=0.0, + optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, + gamma=0.99, lam=0.95, schedule='linear') + env.close() def main(): + """ + Runs the test + """ args = robotics_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) diff --git a/baselines/ppo2/policies.py b/baselines/ppo2/policies.py deleted file mode 100644 index 6fbbb14ac8..0000000000 --- a/baselines/ppo2/policies.py +++ /dev/null @@ -1,146 +0,0 @@ -import numpy as np -import tensorflow as tf -from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm -from baselines.common.distributions import make_pdtype -from baselines.common.input import observation_input - -def nature_cnn(unscaled_images, **conv_kwargs): - """ - CNN from Nature paper. - """ - scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 
- activ = tf.nn.relu - h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), - **conv_kwargs)) - h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) - h3 = conv_to_fc(h3) - return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) - -class LnLstmPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - X, processed_x = observation_input(ob_space, nbatch) - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class LstmPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False): - nenv = nbatch // nsteps - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - - M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) - S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(X) - xs = batch_to_seq(h, nenv, nsteps) - ms = batch_to_seq(M, nenv, nsteps) - h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) - h5 = seq_to_batch(h5) - vf = fc(h5, 'v', 1) - self.pd, self.pi = self.pdtype.pdfromlatent(h5) - - v0 = vf[:, 0] - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) - - def step(ob, state, mask): - return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask}) - - def value(ob, state, mask): - return sess.run(v0, {X:ob, S:state, M:mask}) - - self.X = X - self.M = M - self.S = S - self.vf = vf - self.step = step - self.value = value - -class CnnPolicy(object): - - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - X, processed_x = observation_input(ob_space, nbatch) - with tf.variable_scope("model", reuse=reuse): - h = nature_cnn(processed_x, **conv_kwargs) - vf = fc(h, 'v', 1)[:,0] - self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01) - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value - -class MlpPolicy(object): - def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613 - self.pdtype = make_pdtype(ac_space) - with tf.variable_scope("model", reuse=reuse): - X, processed_x = 
observation_input(ob_space, nbatch) - activ = tf.tanh - processed_x = tf.layers.flatten(processed_x) - pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2))) - pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2))) - vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2))) - vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2))) - vf = fc(vf_h2, 'vf', 1)[:,0] - - self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01) - - - a0 = self.pd.sample() - neglogp0 = self.pd.neglogp(a0) - self.initial_state = None - - def step(ob, *_args, **_kwargs): - a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob}) - return a, v, self.initial_state, neglogp - - def value(ob, *_args, **_kwargs): - return sess.run(vf, {X:ob}) - - self.X = X - self.vf = vf - self.step = step - self.value = value diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index fd34f52f36..093d0e4cdf 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -1,44 +1,73 @@ import os import time import joblib +from collections import deque +import sys +import multiprocessing + import numpy as np -import os.path as osp import tensorflow as tf + from baselines import logger -from collections import deque from baselines.common import explained_variance from baselines.common.runners import AbstractEnvRunner + class Model(object): - def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, - nsteps, ent_coef, vf_coef, max_grad_norm): - sess = tf.get_default_session() - - act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False) - train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True) - - A = train_model.pdtype.sample_placeholder([None]) - ADV = tf.placeholder(tf.float32, [None]) - R = tf.placeholder(tf.float32, [None]) - OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) - OLDVPRED = tf.placeholder(tf.float32, [None]) - LR = tf.placeholder(tf.float32, []) - CLIPRANGE = tf.placeholder(tf.float32, []) - - neglogpac = train_model.pd.neglogp(A) - entropy = tf.reduce_mean(train_model.pd.entropy()) - - vpred = train_model.vf - vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) - vf_losses1 = tf.square(vpred - R) - vf_losses2 = tf.square(vpredclipped - R) + def __init__(self, *, policy, ob_space, ac_space, n_batch_act, n_batch_train, n_steps, ent_coef, vf_coef, + max_grad_norm): + """ + The PPO (Proximal Policy Optimization) model class https://arxiv.org/abs/1707.06347. + It shares policies with A2C. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
+ :param ob_space: (Gym Spaces) Observation space + :param ac_space: (Gym Spaces) Action space + :param n_batch_act: (int) Minibatch size for the actor policy, used mostly for recurrent policies + :param n_batch_train: (int) Minibatch size during training + :param n_steps: (int) The number of steps to run for each environment + :param ent_coef: (float) Entropy coefficient for the loss calculation + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + """ + + n_cpu = multiprocessing.cpu_count() + if sys.platform == 'darwin': + n_cpu //= 2 + + config = tf.ConfigProto(allow_soft_placement=True, + intra_op_parallelism_threads=n_cpu, + inter_op_parallelism_threads=n_cpu) + config.gpu_options.allow_growth = True # pylint: disable=E1101 + + sess = tf.Session(config=config) + + act_model = policy(sess, ob_space, ac_space, n_batch_act, 1, reuse=False) + train_model = policy(sess, ob_space, ac_space, n_batch_train, n_steps, reuse=True) + + action_ph = train_model.pdtype.sample_placeholder([None]) + advs_ph = tf.placeholder(tf.float32, [None]) + rewards_ph = tf.placeholder(tf.float32, [None]) + old_neglog_pac_ph = tf.placeholder(tf.float32, [None]) + old_vpred_ph = tf.placeholder(tf.float32, [None]) + learning_rate_ph = tf.placeholder(tf.float32, []) + clip_range_ph = tf.placeholder(tf.float32, []) + + neglogpac = train_model.proba_distribution.neglogp(action_ph) + entropy = tf.reduce_mean(train_model.proba_distribution.entropy()) + + vpred = train_model.value_fn + vpredclipped = old_vpred_ph \ + + tf.clip_by_value(train_model.value_fn - old_vpred_ph, - clip_range_ph, clip_range_ph) + vf_losses1 = tf.square(vpred - rewards_ph) + vf_losses2 = tf.square(vpredclipped - rewards_ph) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) - ratio = tf.exp(OLDNEGLOGPAC - neglogpac) - pg_losses = -ADV * ratio - pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) + ratio = tf.exp(old_neglog_pac_ph - neglogpac) + pg_losses = -advs_ph * ratio + pg_losses2 = -advs_ph * tf.clip_by_value(ratio, 1.0 - clip_range_ph, 1.0 + clip_range_ph) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) - approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) - clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) + approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - old_neglog_pac_ph)) + clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), clip_range_ph))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef with tf.variable_scope('model'): params = tf.trainable_variables() @@ -46,32 +75,56 @@ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) - trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) + trainer = tf.train.AdamOptimizer(learning_rate=learning_rate_ph, epsilon=1e-5) _train = trainer.apply_gradients(grads) - def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): + def train(learning_rate, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): + """ + Training of PPO2 Algorithm + + :param learning_rate: (float) learning rate + :param cliprange: (float) Clipping factor + :param obs: (numpy array) The current observation of the environment + :param returns: (numpy array) the discounted returns + :param masks: (numpy
array) The last masks for done episodes (used in recurrent policies) + :param actions: (numpy array) the actions + :param values: (numpy array) the values + :param neglogpacs: (numpy array) negative log-likelihood of the actions + :param states: (numpy array) For recurrent policies, the internal state of the recurrent model + :return: policy gradient loss, value function loss, policy entropy, + approximation of KL divergence and fraction of clipped samples + """ advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) - td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, - CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} + td_map = {train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, rewards_ph: returns, + learning_rate_ph: learning_rate, clip_range_ph: cliprange, old_neglog_pac_ph: neglogpacs, + old_vpred_ph: values} if states is not None: - td_map[train_model.S] = states - td_map[train_model.M] = masks - return sess.run( - [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], - td_map - )[:-1] + td_map[train_model.states_ph] = states + td_map[train_model.masks_ph] = masks + return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] + self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] def save(save_path): - ps = sess.run(params) - joblib.dump(ps, save_path) + """ + Save the policy to a file + + :param save_path: (str) the location to save the policy + """ + saved_params = sess.run(params) + joblib.dump(saved_params, save_path) def load(load_path): + """ + load a policy from the file + + :param load_path: (str) the saved location of the policy + """ loaded_params = joblib.load(load_path) restores = [] - for p, loaded_p in zip(params, loaded_params): - restores.append(p.assign(loaded_p)) + for param, loaded_p in zip(params, loaded_params): + restores.append(param.assign(loaded_p)) sess.run(restores) # If you want to load weights, also save/load observation scaling inside VecNormalize @@ -83,20 +136,43 @@ def load(load_path): self.initial_state = act_model.initial_state self.save = save self.load = load - tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101 + tf.global_variables_initializer().run(session=sess) # pylint: disable=E1101 + class Runner(AbstractEnvRunner): + def __init__(self, *, env, model, n_steps, gamma, lam): + """ + A runner that collects rollouts from the environment for a given model - def __init__(self, *, env, model, nsteps, gamma, lam): - super().__init__(env=env, model=model, nsteps=nsteps) + :param env: (Gym environment) The environment to learn from + :param model: (Model) The model to learn + :param n_steps: (int) The number of steps to run for each environment + :param gamma: (float) Discount factor + :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator + """ + super().__init__(env=env, model=model, n_steps=n_steps) self.lam = lam self.gamma = gamma def run(self): - mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] + """ + Run the model in the environment and collect a rollout of experience + + :return: + - observations: (numpy Number) the observations + - rewards: (numpy Number) the rewards + - masks: (numpy bool) whether an episode is over or not + - actions: (numpy Number) the actions + - values: (numpy Number) the value function output + - negative log probabilities: (numpy Number) + - states: (numpy Number) the internal states of the recurrent policies + -
infos: (dict) the extra information of the model + """ + # mb stands for minibatch + mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [], [], [], [], [], [] mb_states = self.states - epinfos = [] - for _ in range(self.nsteps): + ep_infos = [] + for _ in range(self.n_steps): actions, values, self.states, neglogpacs = self.model.step(self.obs, self.states, self.dones) mb_obs.append(self.obs.copy()) mb_actions.append(actions) @@ -105,10 +181,11 @@ def run(self): mb_dones.append(self.dones) self.obs[:], rewards, self.dones, infos = self.env.step(actions) for info in infos: - maybeepinfo = info.get('episode') - if maybeepinfo: epinfos.append(maybeepinfo) + maybeep_info = info.get('episode') + if maybeep_info: + ep_infos.append(maybeep_info) mb_rewards.append(rewards) - #batch of steps to batch of rollouts + # batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) mb_rewards = np.asarray(mb_rewards, dtype=np.float32) mb_actions = np.asarray(mb_actions) @@ -116,127 +193,173 @@ def run(self): mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) mb_dones = np.asarray(mb_dones, dtype=np.bool) last_values = self.model.value(self.obs, self.states, self.dones) - #discount/bootstrap off value fn - mb_returns = np.zeros_like(mb_rewards) + # discount/bootstrap off value fn mb_advs = np.zeros_like(mb_rewards) - lastgaelam = 0 - for t in reversed(range(self.nsteps)): - if t == self.nsteps - 1: + last_gae_lam = 0 + for step in reversed(range(self.n_steps)): + if step == self.n_steps - 1: nextnonterminal = 1.0 - self.dones nextvalues = last_values else: - nextnonterminal = 1.0 - mb_dones[t+1] - nextvalues = mb_values[t+1] - delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] - mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam + nextnonterminal = 1.0 - mb_dones[step + 1] + nextvalues = mb_values[step + 1] + delta = mb_rewards[step] + self.gamma * nextvalues * nextnonterminal - mb_values[step] + mb_advs[step] = last_gae_lam = delta + self.gamma * self.lam * nextnonterminal * last_gae_lam mb_returns = mb_advs + mb_values - return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), - mb_states, epinfos) + return (*map(swap_and_flatten, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), mb_states, + ep_infos) + + # obs, returns, masks, actions, values, neglogpacs, states = runner.run() -def sf01(arr): +def swap_and_flatten(arr): """ swap and then flatten axes 0 and 1 + + :param arr: (numpy array) + :return: (numpy array) """ - s = arr.shape - return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) + shape = arr.shape + return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:]) + def constfn(val): - def f(_): + """ + Create a function that returns a constant + It is useful for learning rate schedule (to avoid code duplication) + + :param val: (float) + :return: (function) + """ + + def func(_): return val - return f -def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, - vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, - log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, - save_interval=0, load_path=None): + return func - if isinstance(lr, float): lr = constfn(lr) - else: assert callable(lr) - if isinstance(cliprange, float): cliprange = constfn(cliprange) - else: assert callable(cliprange) + +def learn(*, policy, env, n_steps, total_timesteps, ent_coef, learning_rate, + vf_coef=0.5, max_grad_norm=0.5, 
gamma=0.99, lam=0.95, + log_interval=10, nminibatches=4, noptepochs=4, + cliprange=0.2, save_interval=0, load_path=None): + """ + Return a trained PPO2 model. + + :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) + :param env: (Gym environment) The environment to learn from + :param n_steps: (int) The number of steps to run for each environment + :param total_timesteps: (int) The total number of samples + :param ent_coef: (float) Entropy coefficient for the loss calculation + :param learning_rate: (float or callable) The learning rate; it can be a function + :param vf_coef: (float) Value function coefficient for the loss calculation + :param max_grad_norm: (float) The maximum value for the gradient clipping + :param gamma: (float) Discount factor + :param lam: (float) Factor for trade-off of bias vs variance for Generalized Advantage Estimator + :param nminibatches: (int) Number of training minibatches per update + :param noptepochs: (int) Number of epochs when optimizing the surrogate + :param cliprange: (float or callable) Clipping parameter; it can be a function + :param log_interval: (int) The number of updates between logging outputs. + :param save_interval: (int) The number of updates between checkpoint saves. + :param load_path: (str) Path to a trained PPO2 model; if None, training starts from scratch + :return: (Model) PPO2 model + """ + if isinstance(learning_rate, float): + learning_rate = constfn(learning_rate) + else: + assert callable(learning_rate) + if isinstance(cliprange, float): + cliprange = constfn(cliprange) + else: + assert callable(cliprange) total_timesteps = int(total_timesteps) - nenvs = env.num_envs + n_envs = env.num_envs ob_space = env.observation_space ac_space = env.action_space - nbatch = nenvs * nsteps - nbatch_train = nbatch // nminibatches + n_batch = n_envs * n_steps + n_batch_train = n_batch // nminibatches - make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, - nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, - max_grad_norm=max_grad_norm) + make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_batch_act=n_envs, + n_batch_train=n_batch_train, n_steps=n_steps, ent_coef=ent_coef, vf_coef=vf_coef, + max_grad_norm=max_grad_norm) if save_interval and logger.get_dir(): import cloudpickle - with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: - fh.write(cloudpickle.dumps(make_model)) + with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as file_handler: + file_handler.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) - runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) + runner = Runner(env=env, model=model, n_steps=n_steps, gamma=gamma, lam=lam) - epinfobuf = deque(maxlen=100) - tfirststart = time.time() + ep_info_buf = deque(maxlen=100) + t_first_start = time.time() - nupdates = total_timesteps//nbatch - for update in range(1, nupdates+1): - assert nbatch % nminibatches == 0 - nbatch_train = nbatch // nminibatches - tstart = time.time() + nupdates = total_timesteps // n_batch + for update in range(1, nupdates + 1): + assert n_batch % nminibatches == 0 + n_batch_train = n_batch // nminibatches + t_start = time.time() frac = 1.0 - (update - 1.0) / nupdates - lrnow = lr(frac) + lr_now = learning_rate(frac) cliprangenow = cliprange(frac) - obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 -
epinfobuf.extend(epinfos) - mblossvals = [] - if states is None: # nonrecurrent version - inds = np.arange(nbatch) + obs, returns, masks, actions, values, neglogpacs, states, ep_infos = runner.run() # pylint: disable=E0632 + ep_info_buf.extend(ep_infos) + mb_loss_vals = [] + if states is None: # nonrecurrent version + inds = np.arange(n_batch) for _ in range(noptepochs): np.random.shuffle(inds) - for start in range(0, nbatch, nbatch_train): - end = start + nbatch_train + for start in range(0, n_batch, n_batch_train): + end = start + n_batch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mblossvals.append(model.train(lrnow, cliprangenow, *slices)) - else: # recurrent version - assert nenvs % nminibatches == 0 - envsperbatch = nenvs // nminibatches - envinds = np.arange(nenvs) - flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) - envsperbatch = nbatch_train // nsteps + mb_loss_vals.append(model.train(lr_now, cliprangenow, *slices)) + else: # recurrent version + assert n_envs % nminibatches == 0 + envinds = np.arange(n_envs) + flatinds = np.arange(n_envs * n_steps).reshape(n_envs, n_steps) + envsperbatch = n_batch_train // n_steps for _ in range(noptepochs): np.random.shuffle(envinds) - for start in range(0, nenvs, envsperbatch): + for start in range(0, n_envs, envsperbatch): end = start + envsperbatch - mbenvinds = envinds[start:end] - mbflatinds = flatinds[mbenvinds].ravel() - slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) - mbstates = states[mbenvinds] - mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) - - lossvals = np.mean(mblossvals, axis=0) - tnow = time.time() - fps = int(nbatch / (tnow - tstart)) + mb_env_inds = envinds[start:end] + mb_flat_inds = flatinds[mb_env_inds].ravel() + slices = (arr[mb_flat_inds] for arr in (obs, returns, masks, actions, values, neglogpacs)) + mb_states = states[mb_env_inds] + mb_loss_vals.append(model.train(lr_now, cliprangenow, *slices, mb_states)) + + loss_vals = np.mean(mb_loss_vals, axis=0) + t_now = time.time() + fps = int(n_batch / (t_now - t_start)) if update % log_interval == 0 or update == 1: - ev = explained_variance(values, returns) - logger.logkv("serial_timesteps", update*nsteps) + explained_var = explained_variance(values, returns) + logger.logkv("serial_timesteps", update * n_steps) logger.logkv("nupdates", update) - logger.logkv("total_timesteps", update*nbatch) + logger.logkv("total_timesteps", update * n_batch) logger.logkv("fps", fps) - logger.logkv("explained_variance", float(ev)) - logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) - logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) - logger.logkv('time_elapsed', tnow - tfirststart) - for (lossval, lossname) in zip(lossvals, model.loss_names): - logger.logkv(lossname, lossval) + logger.logkv("explained_variance", float(explained_var)) + logger.logkv('ep_rewmean', safe_mean([ep_info['r'] for ep_info in ep_info_buf])) + logger.logkv('eplenmean', safe_mean([ep_info['l'] for ep_info in ep_info_buf])) + logger.logkv('time_elapsed', t_start - t_first_start) + for (loss_val, loss_name) in zip(loss_vals, model.loss_names): + logger.logkv(loss_name, loss_val) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): - checkdir = osp.join(logger.get_dir(), 'checkpoints') + checkdir = os.path.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, 
exist_ok=True) - savepath = osp.join(checkdir, '%.5i'%update) - print('Saving to', savepath) - model.save(savepath) + save_path = os.path.join(checkdir, '%.5i' % update) + print('Saving to', save_path) + model.save(save_path) env.close() return model -def safemean(xs): - return np.nan if len(xs) == 0 else np.mean(xs) + +def safe_mean(arr): + """ + Compute the mean of an array if there is at least one element. + For an empty array, return NaN. It is used for logging only. + + :param arr: (numpy array) + :return: (float) + """ + return np.nan if len(arr) == 0 else np.mean(arr) diff --git a/baselines/ppo2/run_atari.py b/baselines/ppo2/run_atari.py index 322837ac86..a6239d852e 100644 --- a/baselines/ppo2/run_atari.py +++ b/baselines/ppo2/run_atari.py @@ -1,40 +1,42 @@ #!/usr/bin/env python3 -import sys from baselines import logger from baselines.common.cmd_util import make_atari_env, atari_arg_parser from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.ppo2 import ppo2 -from baselines.ppo2.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy -import multiprocessing -import tensorflow as tf +from baselines.a2c.policies import CnnPolicy, LstmPolicy, LnLstmPolicy, MlpPolicy def train(env_id, num_timesteps, seed, policy): + """ + Train PPO2 model for atari environment, for testing purposes - ncpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': ncpu //= 2 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - config.gpu_options.allow_growth = True #pylint: disable=E1101 - tf.Session(config=config).__enter__() + :param env_id: (str) the environment id string + :param num_timesteps: (int) the number of timesteps to run + :param seed: (int) Used to seed the random generator. + :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
+ """ env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) - policy = {'cnn' : CnnPolicy, 'lstm' : LstmPolicy, 'lnlstm' : LnLstmPolicy, 'mlp': MlpPolicy}[policy] - ppo2.learn(policy=policy, env=env, nsteps=128, nminibatches=4, - lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, - ent_coef=.01, - lr=lambda f : f * 2.5e-4, - cliprange=lambda f : f * 0.1, - total_timesteps=int(num_timesteps * 1.1)) + policy = {'cnn': CnnPolicy, 'lstm': LstmPolicy, 'lnlstm': LnLstmPolicy, 'mlp': MlpPolicy}[policy] + ppo2.learn(policy=policy, env=env, n_steps=128, nminibatches=4, + lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, + ent_coef=.01, + learning_rate=lambda f: f * 2.5e-4, + cliprange=lambda f: f * 0.1, + total_timesteps=int(num_timesteps * 1.1)) + def main(): + """ + Runs the test + """ parser = atari_arg_parser() parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') args = parser.parse_args() logger.configure() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, - policy=args.policy) + policy=args.policy) + if __name__ == '__main__': main() diff --git a/baselines/ppo2/run_mujoco.py b/baselines/ppo2/run_mujoco.py index 282aa3f134..790c24e2a3 100644 --- a/baselines/ppo2/run_mujoco.py +++ b/baselines/ppo2/run_mujoco.py @@ -1,37 +1,38 @@ #!/usr/bin/env python3 import numpy as np +import gym + from baselines.common.cmd_util import mujoco_arg_parser from baselines import bench, logger +from baselines.common import set_global_seeds +from baselines.common.vec_env.vec_normalize import VecNormalize +from baselines.ppo2 import ppo2 +from baselines.a2c.policies import MlpPolicy +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv def train(env_id, num_timesteps, seed): - from baselines.common import set_global_seeds - from baselines.common.vec_env.vec_normalize import VecNormalize - from baselines.ppo2 import ppo2 - from baselines.ppo2.policies import MlpPolicy - import gym - import tensorflow as tf - from baselines.common.vec_env.dummy_vec_env import DummyVecEnv - ncpu = 1 - config = tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=ncpu, - inter_op_parallelism_threads=ncpu) - tf.Session(config=config).__enter__() + """ + Train PPO2 model for Mujoco environment, for testing purposes + :param env_id: (str) the environment id string + :param num_timesteps: (int) the number of timesteps to run + :param seed: (int) Used to seed the random generator. 
+ """ def make_env(): - env = gym.make(env_id) - env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True) - return env + env_out = gym.make(env_id) + env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) + return env_out env = DummyVecEnv([make_env]) env = VecNormalize(env) set_global_seeds(seed) policy = MlpPolicy - model = ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32, + model = ppo2.learn(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, log_interval=1, ent_coef=0.0, - lr=3e-4, + learning_rate=3e-4, cliprange=0.2, total_timesteps=num_timesteps) @@ -39,6 +40,9 @@ def make_env(): def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() logger.configure() model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) @@ -49,7 +53,7 @@ def main(): obs[:] = env.reset() while True: actions = model.step(obs)[0] - obs[:] = env.step(actions)[0] + obs[:] = env.step(actions)[0] env.render() diff --git a/baselines/results_plotter.py b/baselines/results_plotter.py index 051420474a..589d407da1 100644 --- a/baselines/results_plotter.py +++ b/baselines/results_plotter.py @@ -1,53 +1,90 @@ import numpy as np import matplotlib -matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode - import matplotlib.pyplot as plt -plt.rcParams['svg.fonttype'] = 'none' from baselines.bench.monitor import load_results +matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode +plt.rcParams['svg.fonttype'] = 'none' + X_TIMESTEPS = 'timesteps' X_EPISODES = 'episodes' X_WALLTIME = 'walltime_hrs' POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] EPISODES_WINDOW = 100 COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', - 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', - 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', + 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] + + +def rolling_window(array, window): + """ + apply a rolling window to a numpy array + + :param array: (numpy Any) the input Array + :param window: (int) length of the rolling window + :return: (numpy Any) rolling window on the input array + """ + shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) + strides = array.strides + (array.strides[-1],) + return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) + + +def window_func(var_1, var_2, window, func): + """ + apply a function to the rolling window of 2 arrays -def rolling_window(a, window): - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + :param var_1: (numpy Any) variable 1 + :param var_2: (numpy Any) variable 2 + :param window: (int) length of the rolling window + :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) + :return: (numpy Any, numpy Any) the rolling output with applied function + """ + var_2_window = rolling_window(var_2, window) + function_on_var2 = func(var_2_window, axis=-1) + return var_1[window - 1:], function_on_var2 -def window_func(x, y, window, func): - yw = rolling_window(y, window) - yw_func = func(yw, axis=-1) - return x[window-1:], yw_func -def ts2xy(ts, xaxis): +def ts2xy(timesteps, xaxis): + """ + 
Decompose a timesteps variable to x and y values + + :param timesteps: (Pandas DataFrame) the input data + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :return: (numpy Number, numpy Number) the x and y output + """ if xaxis == X_TIMESTEPS: - x = np.cumsum(ts.l.values) - y = ts.r.values + x_var = np.cumsum(timesteps.l.values) + y_var = timesteps.r.values elif xaxis == X_EPISODES: - x = np.arange(len(ts)) - y = ts.r.values + x_var = np.arange(len(timesteps)) + y_var = timesteps.r.values elif xaxis == X_WALLTIME: - x = ts.t.values / 3600. - y = ts.r.values + x_var = timesteps.t.values / 3600. + y_var = timesteps.r.values else: raise NotImplementedError - return x, y + return x_var, y_var + def plot_curves(xy_list, xaxis, title): - plt.figure(figsize=(8,2)) + """ + plot the curves + + :param xy_list: ([(numpy Number, numpy Number)]) the x and y coordinates to plot + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param title: (str) the title of the plot + """ + + plt.figure(figsize=(8, 2)) maxx = max(xy[0][-1] for xy in xy_list) minx = 0 for (i, (x, y)) in enumerate(xy_list): color = COLORS[i] plt.scatter(x, y, s=2) - x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes + x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # So returns average of last EPISODE_WINDOW episodes plt.plot(x, y_mean, color=color) plt.xlim(minx, maxx) plt.title(title) @@ -55,33 +92,47 @@ def plot_curves(xy_list, xaxis, title): plt.ylabel("Episode Rewards") plt.tight_layout() + def plot_results(dirs, num_timesteps, xaxis, task_name): + """ + plot the results + + :param dirs: ([str]) the save location of the results to plot + :param num_timesteps: (int) only plot the points below this value + :param xaxis: (str) the axis for the x and y output + (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') + :param task_name: (str) the title of the task to plot + """ + tslist = [] - for dir in dirs: - ts = load_results(dir) - ts = ts[ts.l.cumsum() <= num_timesteps] - tslist.append(ts) - xy_list = [ts2xy(ts, xaxis) for ts in tslist] + for folder in dirs: + timesteps = load_results(folder) + timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] + tslist.append(timesteps) + xy_list = [ts2xy(timesteps_item, xaxis) for timesteps_item in tslist] plot_curves(xy_list, xaxis, task_name) -# Example usage in jupyter-notebook -# from baselines import log_viewer -# %matplotlib inline -# log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") -# Here ./log is a directory containing the monitor.csv files def main(): + """ + Example usage in jupyter-notebook + from baselines import log_viewer + %matplotlib inline + log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") + Here ./log is a directory containing the monitor.csv files + """ import argparse import os parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) + parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log']) parser.add_argument('--num_timesteps', type=int, default=int(10e6)) - parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) - parser.add_argument('--task_name', help =
'Title of plot', default = 'Breakout') + parser.add_argument('--xaxis', help='Varible on X-axis', default=X_TIMESTEPS) + parser.add_argument('--task_name', help='Title of plot', default='Breakout') args = parser.parse_args() - args.dirs = [os.path.abspath(dir) for dir in args.dirs] + args.dirs = [os.path.abspath(folder) for folder in args.dirs] plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) plt.show() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/baselines/trpo_mpi/nosharing_cnn_policy.py b/baselines/trpo_mpi/nosharing_cnn_policy.py index 97b2dcd0b5..9133bba956 100644 --- a/baselines/trpo_mpi/nosharing_cnn_policy.py +++ b/baselines/trpo_mpi/nosharing_cnn_policy.py @@ -1,56 +1,63 @@ -import baselines.common.tf_util as U import tensorflow as tf -import gym -from baselines.common.distributions import make_pdtype -class CnnPolicy(object): +import baselines.common.tf_util as tf_utils +from baselines.ppo1.mlp_policy import BasePolicy + + +class CnnPolicy(BasePolicy): recurrent = False - def __init__(self, name, ob_space, ac_space): - with tf.variable_scope(name): - self._init(ob_space, ac_space) - self.scope = tf.get_variable_scope().name + + def __init__(self, name, ob_space, ac_space, sess=None, reuse=False, placeholders=None): + """ + A CNN policy object for TRPO + + :param name: (str) type of the policy (lin, logits, value) + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + :param sess: (TensorFlow session) The current TensorFlow session containing the variables. + :param reuse: (bool) If the policy is reusable or not + :param placeholders: (dict) To feed existing placeholders if needed + """ + super(CnnPolicy, self).__init__(placeholders=placeholders) + self.sess = sess + self.reuse = reuse + self.name = name + self._init(ob_space, ac_space) + self.scope = tf.get_variable_scope().name def _init(self, ob_space, ac_space): - assert isinstance(ob_space, gym.spaces.Box) - - self.pdtype = pdtype = make_pdtype(ac_space) - sequence_length = None - - ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) - - obscaled = ob / 255.0 - - with tf.variable_scope("pol"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) - self.pd = pdtype.pdfromflat(logits) - with tf.variable_scope("vf"): - x = obscaled - x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID")) - x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID")) - x = U.flattenallbut0(x) - x = tf.nn.relu(tf.layers.dense(x, 128, name='lin', kernel_initializer=U.normc_initializer(1.0))) - self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0)) + """ + + :param ob_space: (Gym Space) The observation space of the environment + :param ac_space: (Gym Space) The action space of the environment + """ + obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) + + obs_normalized = obs / 255.0 + + with tf.variable_scope(self.name + "/pol", reuse=self.reuse): + layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = 
tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID")) + layer_2 = tf_utils.flattenallbut0(layer_2) + layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin', + kernel_initializer=tf_utils.normc_initializer(1.0))) + logits = tf.layers.dense(layer_3, pdtype.param_shape()[0], name='logits', + kernel_initializer=tf_utils.normc_initializer(0.01)) + self.proba_distribution = pdtype.proba_distribution_from_flat(logits) + with tf.variable_scope(self.name + "/vf", reuse=self.reuse): + layer_1 = tf.nn.relu(tf_utils.conv2d(obs_normalized, 8, "l1", [8, 8], [4, 4], pad="VALID")) + layer_2 = tf.nn.relu(tf_utils.conv2d(layer_1, 16, "l2", [4, 4], [2, 2], pad="VALID")) + layer_2 = tf_utils.flattenallbut0(layer_2) + layer_3 = tf.nn.relu(tf.layers.dense(layer_2, 128, name='lin', + kernel_initializer=tf_utils.normc_initializer(1.0))) + self.vpred = tf.layers.dense(layer_3, 1, name='value', + kernel_initializer=tf_utils.normc_initializer(1.0)) self.vpredz = self.vpred self.state_in = [] self.state_out = [] - stochastic = tf.placeholder(dtype=tf.bool, shape=()) - ac = self.pd.sample() - self._act = U.function([stochastic, ob], [ac, self.vpred]) - - def act(self, stochastic, ob): - ac1, vpred1 = self._act(stochastic, ob[None]) - return ac1[0], vpred1[0] - def get_variables(self): - return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) - def get_trainable_variables(self): - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) - def get_initial_state(self): - return [] - + if self.stochastic_ph is None: + self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) + action = self.proba_distribution.sample() + self._act = tf_utils.function([self.stochastic_ph, obs], [action, self.vpred]) diff --git a/baselines/trpo_mpi/run_atari.py b/baselines/trpo_mpi/run_atari.py index f31ebfd7c5..305b333b27 100644 --- a/baselines/trpo_mpi/run_atari.py +++ b/baselines/trpo_mpi/run_atari.py @@ -1,20 +1,26 @@ - #!/usr/bin/env python3 +#!/usr/bin/env python3 +import os + from mpi4py import MPI + from baselines.common import set_global_seeds -import os.path as osp -import gym, logging -from baselines import logger -from baselines import bench +from baselines import bench, logger from baselines.common.atari_wrappers import make_atari, wrap_deepmind from baselines.common.cmd_util import atari_arg_parser +from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy +from baselines.trpo_mpi import trpo_mpi + def train(env_id, num_timesteps, seed): - from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy - from baselines.trpo_mpi import trpo_mpi - import baselines.common.tf_util as U + """ + Train TRPO model for the atari environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ rank = MPI.COMM_WORLD.Get_rank() - sess = U.single_threaded_session() - sess.__enter__() + if rank == 0: logger.configure() else: @@ -23,21 +29,29 @@ def train(env_id, num_timesteps, seed): workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_atari(env_id) - def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 - return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space) - env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 + return CnnPolicy(name=name, 
ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) + + env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) env = wrap_deepmind(env) env.seed(workerseed) trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, - max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00) + max_timesteps=int(num_timesteps * 1.1), gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, + entcoeff=0.00) env.close() + def main(): + """ + Runs the test + """ args = atari_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) + if __name__ == "__main__": main() diff --git a/baselines/trpo_mpi/run_mujoco.py b/baselines/trpo_mpi/run_mujoco.py index 220bb91aba..4bf36efa28 100644 --- a/baselines/trpo_mpi/run_mujoco.py +++ b/baselines/trpo_mpi/run_mujoco.py @@ -1,36 +1,48 @@ #!/usr/bin/env python3 # noinspection PyUnresolvedReferences from mpi4py import MPI + from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser from baselines import logger from baselines.ppo1.mlp_policy import MlpPolicy from baselines.trpo_mpi import trpo_mpi +import baselines.common.tf_util as tf_util + def train(env_id, num_timesteps, seed): - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - env = make_mujoco_env(env_id, workerseed) - trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, - max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) - env.close() + """ + Train TRPO model for the mujoco environment, for testing purposes + + :param env_id: (str) Environment ID + :param num_timesteps: (int) The total number of samples + :param seed: (int) The initial seed for training + """ + with tf_util.single_threaded_session(): + rank = MPI.COMM_WORLD.Get_rank() + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) + logger.set_level(logger.DISABLED) + workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() + + def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): + return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=32, num_hid_layers=2, sess=sess, + placeholders=placeholders) + + env = make_mujoco_env(env_id, workerseed) + trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, + max_timesteps=num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) + env.close() + def main(): + """ + Runs the test + """ args = mujoco_arg_parser().parse_args() train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) if __name__ == '__main__': main() - diff --git a/baselines/trpo_mpi/trpo_mpi.py b/baselines/trpo_mpi/trpo_mpi.py index e23d9ac793..7edf702a26 100644 --- a/baselines/trpo_mpi/trpo_mpi.py +++ b/baselines/trpo_mpi/trpo_mpi.py @@ -1,291 +1,36 @@ -from baselines.common import explained_variance, zipsame, dataset -from baselines import logger -import baselines.common.tf_util as U -import tensorflow as tf, numpy as np -import time -from baselines.common 
import colorize -from mpi4py import MPI -from collections import deque -from baselines.common.mpi_adam import MpiAdam -from baselines.common.cg import cg -from contextlib import contextmanager +from baselines.gail.trpo_mpi import learn as base_learn -def traj_segment_generator(pi, env, horizon, stochastic): - # Initialize state variables - t = 0 - ac = env.action_space.sample() - new = True - rew = 0.0 - ob = env.reset() - - cur_ep_ret = 0 - cur_ep_len = 0 - ep_rets = [] - ep_lens = [] - - # Initialize history arrays - obs = np.array([ob for _ in range(horizon)]) - rews = np.zeros(horizon, 'float32') - vpreds = np.zeros(horizon, 'float32') - news = np.zeros(horizon, 'int32') - acs = np.array([ac for _ in range(horizon)]) - prevacs = acs.copy() - - while True: - prevac = ac - ac, vpred = pi.act(stochastic, ob) - # Slight weirdness here because we need value function at time T - # before returning segment [0, T-1] so we get the correct - # terminal value - if t > 0 and t % horizon == 0: - yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, - "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), - "ep_rets" : ep_rets, "ep_lens" : ep_lens} - _, vpred = pi.act(stochastic, ob) - # Be careful!!! if you change the downstream algorithm to aggregate - # several of these batches, then be sure to do a deepcopy - ep_rets = [] - ep_lens = [] - i = t % horizon - obs[i] = ob - vpreds[i] = vpred - news[i] = new - acs[i] = ac - prevacs[i] = prevac - - ob, rew, new, _ = env.step(ac) - rews[i] = rew - - cur_ep_ret += rew - cur_ep_len += 1 - if new: - ep_rets.append(cur_ep_ret) - ep_lens.append(cur_ep_len) - cur_ep_ret = 0 - cur_ep_len = 0 - ob = env.reset() - t += 1 - -def add_vtarg_and_adv(seg, gamma, lam): - new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 - vpred = np.append(seg["vpred"], seg["nextvpred"]) - T = len(seg["rew"]) - seg["adv"] = gaelam = np.empty(T, 'float32') - rew = seg["rew"] - lastgaelam = 0 - for t in reversed(range(T)): - nonterminal = 1-new[t+1] - delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] - gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam - seg["tdlamret"] = seg["adv"] + seg["vpred"] def learn(env, policy_fn, *, - timesteps_per_batch, # what to train on - max_kl, cg_iters, - gamma, lam, # advantage estimation - entcoeff=0.0, - cg_damping=1e-2, - vf_stepsize=3e-4, - vf_iters =3, - max_timesteps=0, max_episodes=0, max_iters=0, # time constraint - callback=None - ): - nworkers = MPI.COMM_WORLD.Get_size() - rank = MPI.COMM_WORLD.Get_rank() - np.set_printoptions(precision=3) - # Setup losses and stuff - # ---------------------------------------- - ob_space = env.observation_space - ac_space = env.action_space - pi = policy_fn("pi", ob_space, ac_space) - oldpi = policy_fn("oldpi", ob_space, ac_space) - atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) - ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return - - ob = U.get_placeholder_cached(name="ob") - ac = pi.pdtype.sample_placeholder([None]) - - kloldnew = oldpi.pd.kl(pi.pd) - ent = pi.pd.entropy() - meankl = tf.reduce_mean(kloldnew) - meanent = tf.reduce_mean(ent) - entbonus = entcoeff * meanent - - vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) - - ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold - surrgain = tf.reduce_mean(ratio * atarg) - - optimgain = surrgain + entbonus - losses = [optimgain, meankl, 
entbonus, surrgain, meanent] - loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] - - dist = meankl - - all_var_list = pi.get_trainable_variables() - var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] - vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] - vfadam = MpiAdam(vf_var_list) - - get_flat = U.GetFlat(var_list) - set_from_flat = U.SetFromFlat(var_list) - klgrads = tf.gradients(dist, var_list) - flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") - shapes = [var.get_shape().as_list() for var in var_list] - start = 0 - tangents = [] - for shape in shapes: - sz = U.intprod(shape) - tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) - start += sz - gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 - fvp = U.flatgrad(gvp, var_list) - - assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) - for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) - compute_losses = U.function([ob, ac, atarg], losses) - compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) - compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) - compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) - - @contextmanager - def timed(msg): - if rank == 0: - print(colorize(msg, color='magenta')) - tstart = time.time() - yield - print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) - else: - yield - - def allmean(x): - assert isinstance(x, np.ndarray) - out = np.empty_like(x) - MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) - out /= nworkers - return out - - U.initialize() - th_init = get_flat() - MPI.COMM_WORLD.Bcast(th_init, root=0) - set_from_flat(th_init) - vfadam.sync() - print("Init param sum", th_init.sum(), flush=True) - - # Prepare for rollouts - # ---------------------------------------- - seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) - - episodes_so_far = 0 - timesteps_so_far = 0 - iters_so_far = 0 - tstart = time.time() - lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths - rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards - - assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 - - while True: - if callback: callback(locals(), globals()) - if max_timesteps and timesteps_so_far >= max_timesteps: - break - elif max_episodes and episodes_so_far >= max_episodes: - break - elif max_iters and iters_so_far >= max_iters: - break - logger.log("********** Iteration %i ************"%iters_so_far) - - with timed("sampling"): - seg = seg_gen.__next__() - add_vtarg_and_adv(seg, gamma, lam) - - # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) - ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] - vpredbefore = seg["vpred"] # predicted value function before udpate - atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - - if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) - if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy - - args = seg["ob"], seg["ac"], atarg - fvpargs = [arr[::5] for arr in args] - def fisher_vector_product(p): - return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p - - assign_old_eq_new() # set old parameter values to new parameter values - with timed("computegrad"): - *lossbefore, g = 
compute_lossandgrad(*args) - lossbefore = allmean(np.array(lossbefore)) - g = allmean(g) - if np.allclose(g, 0): - logger.log("Got zero gradient. not updating") - else: - with timed("cg"): - stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) - assert np.isfinite(stepdir).all() - shs = .5*stepdir.dot(fisher_vector_product(stepdir)) - lm = np.sqrt(shs / max_kl) - # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) - fullstep = stepdir / lm - expectedimprove = g.dot(fullstep) - surrbefore = lossbefore[0] - stepsize = 1.0 - thbefore = get_flat() - for _ in range(10): - thnew = thbefore + fullstep * stepsize - set_from_flat(thnew) - meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) - improve = surr - surrbefore - logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) - if not np.isfinite(meanlosses).all(): - logger.log("Got non-finite value of losses -- bad!") - elif kl > max_kl * 1.5: - logger.log("violated KL constraint. shrinking step.") - elif improve < 0: - logger.log("surrogate didn't improve. shrinking step.") - else: - logger.log("Stepsize OK!") - break - stepsize *= .5 - else: - logger.log("couldn't compute a good step") - set_from_flat(thbefore) - if nworkers > 1 and iters_so_far % 20 == 0: - paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples - assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) - - for (lossname, lossval) in zip(loss_names, meanlosses): - logger.record_tabular(lossname, lossval) - - with timed("vf"): - - for _ in range(vf_iters): - for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), - include_final_partial_batch=False, batch_size=64): - g = allmean(compute_vflossandgrad(mbob, mbret)) - vfadam.update(g, vf_stepsize) - - logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) - - lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values - listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples - lens, rews = map(flatten_lists, zip(*listoflrpairs)) - lenbuffer.extend(lens) - rewbuffer.extend(rews) - - logger.record_tabular("EpLenMean", np.mean(lenbuffer)) - logger.record_tabular("EpRewMean", np.mean(rewbuffer)) - logger.record_tabular("EpThisIter", len(lens)) - episodes_so_far += len(lens) - timesteps_so_far += sum(lens) - iters_so_far += 1 - - logger.record_tabular("EpisodesSoFar", episodes_so_far) - logger.record_tabular("TimestepsSoFar", timesteps_so_far) - logger.record_tabular("TimeElapsed", time.time() - tstart) - - if rank==0: - logger.dump_tabular() - -def flatten_lists(listoflists): - return [el for list_ in listoflists for el in list_] \ No newline at end of file + timesteps_per_batch, # what to train on + max_kl, cg_iters, + gamma, lam, # advantage estimation + entcoeff=0.0, + cg_damping=1e-2, + vf_stepsize=3e-4, + vf_iters=3, + max_timesteps=0, max_episodes=0, max_iters=0, # time constraint + callback=None): + """ + learns a TRPO policy using the given environment + + :param env: (Gym Environment) the environment + :param policy_fn: (function (str, Gym Space, Gym Space, bool): MLPPolicy) policy generator + :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) + :param max_kl: (float) the kullback leiber loss threashold + :param cg_iters: (int) the number of iterations for the conjugate gradient calculation + :param gamma: (float) the discount value + :param lam: (float) GAE factor + :param entcoeff: (float) the weight for the entropy loss + 
:param cg_damping: (float) the compute gradient dampening factor + :param vf_stepsize: (float) the value function stepsize + :param vf_iters: (int) the value function's number iterations for learning + :param max_timesteps: (int) the maximum number of timesteps before halting + :param max_episodes: (int) the maximum number of episodes before halting + :param max_iters: (int) the maximum number of training iterations before halting + :param callback: (function (dict, dict)) the call back function, takes the local and global attribute dictionary + """ + base_learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch, max_kl=max_kl, cg_iters=cg_iters, gamma=gamma, + lam=lam, entcoeff=entcoeff, cg_damping=cg_damping, vf_stepsize=vf_stepsize, vf_iters=vf_iters, + max_timesteps=max_timesteps, max_episodes=max_episodes, max_iters=max_iters, callback=callback, + using_gail=False) diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000..21ec52f09b --- /dev/null +++ b/conftest.py @@ -0,0 +1,14 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption("--rungpu", action="store_true", default=False, help="run gpu tests") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--rungpu"): + return + skip_gpu = pytest.mark.skip(reason="need --rungpu option to run") + for item in items: + if "gpu" in item.keywords: + item.add_marker(skip_gpu) diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000000..bbb432ba68 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python -m pytest --cov-config .coveragerc --cov-report html --cov-report term --cov=. --rungpu diff --git a/setup.py b/setup.py index bf8badcf60..c0a495a043 100644 --- a/setup.py +++ b/setup.py @@ -21,10 +21,16 @@ 'cloudpickle', 'tensorflow>=1.4.0', 'click', - 'opencv-python' + 'opencv-python', + 'numpy', + 'pandas', + 'pytest', + 'matplotlib', + 'seaborn', + 'glob2' ], description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', author='OpenAI', url='https://github.com/openai/baselines', author_email='gym@openai.com', - version='0.1.5') + version='0.1.6') diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/test_atari.py b/tests/test_atari.py new file mode 100644 index 0000000000..56b3092119 --- /dev/null +++ b/tests/test_atari.py @@ -0,0 +1,112 @@ +import pytest + +import tensorflow as tf + +from baselines import deepq, bench, logger +from baselines.common import set_global_seeds +from baselines.common.atari_wrappers import make_atari +import baselines.a2c.run_atari as a2c_atari +import baselines.acer.run_atari as acer_atari +import baselines.acktr.run_atari as acktr_atari +import baselines.ppo1.run_atari as ppo1_atari +import baselines.ppo2.run_atari as ppo2_atari +import baselines.trpo_mpi.run_atari as trpo_atari + + +ENV_ID = 'BreakoutNoFrameskip-v4' +SEED = 3 +NUM_TIMESTEPS = 2500 +NUM_CPU = 4 + + +def clear_tf_session(): + """ + clears the Tensorflow session, this is needed for sequential testing of the baselines + """ + tf.reset_default_graph() + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm']) +def test_a2c(policy): + """ + test A2C on atari + + :param policy: (str) the policy to test for A2C + """ + clear_tf_session() + a2c_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, + policy=policy, lr_schedule='constant', num_env=NUM_CPU) + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", 
['cnn', 'lstm']) +def test_acer(policy): + """ + test ACER on atari + + :param policy: (str) the policy to test for ACER + """ + clear_tf_session() + acer_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, + policy=policy, lr_schedule='constant', num_cpu=NUM_CPU) + + +@pytest.mark.slow +def test_acktr(): + """ + test ACKTR on atari + """ + clear_tf_session() + acktr_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, num_cpu=NUM_CPU) + + +@pytest.mark.slow +def test_deepq(): + """ + test DeepQ on atari + """ + clear_tf_session() + logger.configure() + set_global_seeds(SEED) + env = make_atari(ENV_ID) + env = bench.Monitor(env, logger.get_dir()) + env = deepq.wrap_atari_dqn(env) + model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[256], dueling=True) + + deepq.learn(env, q_func=model, learning_rate=1e-4, max_timesteps=NUM_TIMESTEPS, buffer_size=10000, + exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4, learning_starts=10000, + target_network_update_freq=1000, gamma=0.99, prioritized_replay=True, prioritized_replay_alpha=0.6, + checkpoint_freq=10000) + + env.close() + + +@pytest.mark.slow +def test_ppo1(): + """ + test PPO1 on atari + """ + clear_tf_session() + ppo1_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) + + +@pytest.mark.slow +@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm', 'mlp']) +def test_ppo2(policy): + """ + test PPO2 on atari + + :param policy: (str) the policy to test for PPO2 + """ + clear_tf_session() + ppo2_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, policy=policy) + + +@pytest.mark.slow +def test_trpo(): + """ + test TRPO on atari + """ + clear_tf_session() + trpo_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) diff --git a/tests/test_common.py b/tests/test_common.py new file mode 100644 index 0000000000..7fccd367a9 --- /dev/null +++ b/tests/test_common.py @@ -0,0 +1,6 @@ +def _assert_eq(left, right): + assert left == right, '{} != {}'.format(left, right) + + +def _assert_neq(left, right): + assert left != right, '{} == {}'.format(left, right) diff --git a/tests/test_continuous.py b/tests/test_continuous.py new file mode 100644 index 0000000000..4daec6bc19 --- /dev/null +++ b/tests/test_continuous.py @@ -0,0 +1,12 @@ +import subprocess + +from .test_common import _assert_eq + +ENV_ID = 'Pendulum-v0' + + +def test_ddpg(): + args = ['--env-id', ENV_ID, '--nb-epochs', 2, '--nb-epoch-cycles', 2, '--nb-rollout-steps', 100] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.ddpg.main'] + args) + _assert_eq(return_code, 0) diff --git a/tests/test_deepq.py b/tests/test_deepq.py new file mode 100644 index 0000000000..8be95a045f --- /dev/null +++ b/tests/test_deepq.py @@ -0,0 +1,28 @@ +import subprocess + +from .test_common import _assert_eq + + +def test_custom_cartpole(): + args = ['--no-render', '--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.custom_cartpole'] + args) + _assert_eq(return_code, 0) + +def test_cartpole(): + args = ['--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.train_cartpole'] + args) + _assert_eq(return_code, 0) + + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.enjoy_cartpole', '--no-render']) + _assert_eq(return_code, 0) + +def test_mountaincar(): + args = 
['--max-timesteps', 1000] + args = list(map(str, args)) + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.train_mountaincar'] + args) + _assert_eq(return_code, 0) + + return_code = subprocess.call(['python', '-m', 'baselines.deepq.experiments.enjoy_mountaincar', '--no-render']) + _assert_eq(return_code, 0) diff --git a/tests/test_distri.py b/tests/test_distri.py new file mode 100644 index 0000000000..735b06239e --- /dev/null +++ b/tests/test_distri.py @@ -0,0 +1,68 @@ +import numpy as np +import tensorflow as tf + +import baselines.common.tf_util as tf_util +from baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ + CategoricalProbabilityDistributionType, \ + MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType + + +@tf_util.in_session +def test_probtypes(): + """ + test probability distribution types + """ + np.random.seed(0) + + pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) + diag_gauss = DiagGaussianProbabilityDistributionType(pdparam_diag_gauss.size // 2) + validate_probtype(diag_gauss, pdparam_diag_gauss) + + pdparam_categorical = np.array([-.2, .3, .5]) + categorical = CategoricalProbabilityDistributionType(pdparam_categorical.size) + validate_probtype(categorical, pdparam_categorical) + + nvec = [1, 2, 3] + pdparam_multicategorical = np.array([-.2, .3, .5, .1, 1, -.1]) + multicategorical = MultiCategoricalProbabilityDistributionType(nvec) + validate_probtype(multicategorical, pdparam_multicategorical) + + pdparam_bernoulli = np.array([-.2, .3, .5]) + bernoulli = BernoulliProbabilityDistributionType(pdparam_bernoulli.size) + validate_probtype(bernoulli, pdparam_bernoulli) + + +def validate_probtype(probtype, pdparam): + """ + validate probability distribution types + + :param probtype: (ProbabilityDistributionType) the type to validate + :param pdparam: ([float]) the flat probabilities to test + """ + number_samples = 100000 + # Check to see if mean negative log likelihood == differential entropy + mval = np.repeat(pdparam[None, :], number_samples, axis=0) + mval_ph = probtype.param_placeholder([number_samples]) + xval_ph = probtype.sample_placeholder([number_samples]) + proba_distribution = probtype.proba_distribution_from_flat(mval_ph) + calcloglik = tf_util.function([xval_ph, mval_ph], proba_distribution.logp(xval_ph)) + calcent = tf_util.function([mval_ph], proba_distribution.entropy()) + xval = tf.get_default_session().run(proba_distribution.sample(), feed_dict={mval_ph: mval}) + logliks = calcloglik(xval, mval) + entval_ll = - logliks.mean() + entval_ll_stderr = logliks.std() / np.sqrt(number_samples) + entval = calcent(mval).mean() + assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas + + # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] + mval2_ph = probtype.param_placeholder([number_samples]) + pd2 = probtype.proba_distribution_from_flat(mval2_ph) + tmp = pdparam + np.random.randn(pdparam.size) * 0.1 + mval2 = np.repeat(tmp[None, :], number_samples, axis=0) + calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2)) + klval = calckl(mval, mval2).mean() + logliks = calcloglik(xval, mval2) + klval_ll = - entval - logliks.mean() + klval_ll_stderr = logliks.std() / np.sqrt(number_samples) + assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas + print('ok on', probtype, pdparam) diff --git a/baselines/common/test_identity.py b/tests/test_identity.py similarity index 66% rename from 
baselines/common/test_identity.py rename to tests/test_identity.py index a429e0c27b..38ca72be35 100644 --- a/baselines/common/test_identity.py +++ b/tests/test_identity.py @@ -1,29 +1,32 @@ +import random + import pytest import tensorflow as tf -import random import numpy as np -from gym.spaces import np_random +from gym.spaces.prng import np_random from baselines.a2c import a2c from baselines.ppo2 import ppo2 from baselines.common.identity_env import IdentityEnv from baselines.common.vec_env.dummy_vec_env import DummyVecEnv -from baselines.ppo2.policies import MlpPolicy +from baselines.a2c.policies import MlpPolicy learn_func_list = [ lambda e: a2c.learn(policy=MlpPolicy, env=e, seed=0, total_timesteps=50000), - lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.01) + lambda e: ppo2.learn(policy=MlpPolicy, env=e, total_timesteps=50000, learning_rate=1e-3, n_steps=128, ent_coef=0.01) ] @pytest.mark.slow @pytest.mark.parametrize("learn_func", learn_func_list) def test_identity(learn_func): - ''' + """ Test if the algorithm (with a given policy) can learn an identity transformation (i.e. return observation as an action) - ''' + + :param learn_func: (lambda (Gym Environment): A2CPolicy) the policy generator + """ np.random.seed(0) np_random.seed(0) random.seed(0) @@ -34,11 +37,11 @@ def test_identity(learn_func): tf.set_random_seed(0) model = learn_func(env) - N_TRIALS = 1000 - sum_rew = 0 + n_trials = 1000 + reward_sum = 0 obs = env.reset() - for i in range(N_TRIALS): - obs, rew, done, _ = env.step(model.step(obs)[0]) - sum_rew += rew + for _ in range(n_trials): + obs, reward, _, _ = env.step(model.step(obs)[0]) + reward_sum += reward - assert sum_rew > 0.9 * N_TRIALS + assert reward_sum > 0.9 * n_trials diff --git a/tests/test_logger.py b/tests/test_logger.py new file mode 100644 index 0000000000..fe3e9eee0f --- /dev/null +++ b/tests/test_logger.py @@ -0,0 +1,50 @@ +import subprocess + +import pytest + +from baselines.logger import make_output_format, read_tb, read_csv, read_json + +KEY_VALUES = {'test': 1, 'b': -3.14, '8': 9.9} +LOG_DIR = '/tmp/openai_baselines/' + + +def _assert_eq(left, right): + assert left == right, '{} != {}'.format(left, right) + + +def _assert_neq(left, right): + assert left != right, '{} == {}'.format(left, right) + + +def test_main(): + """ + Dry-run python -m baselines.logger + """ + return_code = subprocess.call(['python', 'baselines/logger.py']) + _assert_eq(return_code, 0) + + +@pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) +def test_make_output(_format): + """ + test make output + + :param _format: (str) output format + """ + writer = make_output_format(_format, LOG_DIR) + writer.writekvs(KEY_VALUES) + if _format == 'tensorboard': + read_tb(LOG_DIR) + elif _format == "csv": + read_csv(LOG_DIR + 'progress.csv') + elif _format == 'json': + read_json(LOG_DIR + 'progress.json') + writer.close() + + +def test_make_output_fail(): + """ + test value error on logger + """ + with pytest.raises(ValueError): + make_output_format('dummy_format', LOG_DIR) diff --git a/tests/test_math_util.py b/tests/test_math_util.py new file mode 100644 index 0000000000..b1db323b22 --- /dev/null +++ b/tests/test_math_util.py @@ -0,0 +1,15 @@ +import numpy as np + +from baselines.common.math_util import discount_with_boundaries + + +def test_discount_with_boundaries(): + """ + test the discount_with_boundaries function + """ + gamma = 0.9 + rewards = np.array([1.0, 2.0, 3.0, 4.0], 'float32') + 
episode_starts = [1.0, 0.0, 0.0, 1.0] + discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma) + assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4]) + return diff --git a/tests/test_mpi_adam.py b/tests/test_mpi_adam.py new file mode 100644 index 0000000000..bc0c1337b7 --- /dev/null +++ b/tests/test_mpi_adam.py @@ -0,0 +1,10 @@ +import subprocess + +from .test_common import _assert_eq + + +def test_mpi_adam(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'baselines.common.mpi_adam']) + _assert_eq(return_code, 0) diff --git a/tests/test_running_stat.py b/tests/test_running_stat.py new file mode 100644 index 0000000000..cda4eda7f0 --- /dev/null +++ b/tests/test_running_stat.py @@ -0,0 +1,20 @@ +import numpy as np + +from baselines.common.running_stat import RunningStat + + +def test_running_stat(): + """ + test RunningStat object + """ + for shape in ((), (3,), (3, 4)): + hist = [] + running_stat = RunningStat(shape) + for _ in range(5): + val = np.random.randn(*shape) + running_stat.push(val) + hist.append(val) + _mean = np.mean(hist, axis=0) + assert np.allclose(running_stat.mean, _mean) + _var = np.square(_mean) if (len(hist) == 1) else np.var(hist, ddof=1, axis=0) + assert np.allclose(running_stat.var, _var) diff --git a/tests/test_schedules.py b/tests/test_schedules.py new file mode 100644 index 0000000000..849b84a155 --- /dev/null +++ b/tests/test_schedules.py @@ -0,0 +1,33 @@ +import numpy as np + +from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule + + +def test_piecewise_schedule(): + """ + test PiecewiseSchedule + """ + piecewise_sched = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], + outside_value=500) + + assert np.isclose(piecewise_sched.value(-10), 500) + assert np.isclose(piecewise_sched.value(0), 150) + assert np.isclose(piecewise_sched.value(5), 200) + assert np.isclose(piecewise_sched.value(9), 80) + assert np.isclose(piecewise_sched.value(50), 50) + assert np.isclose(piecewise_sched.value(80), 50) + assert np.isclose(piecewise_sched.value(150), 0) + assert np.isclose(piecewise_sched.value(175), -25) + assert np.isclose(piecewise_sched.value(201), 500) + assert np.isclose(piecewise_sched.value(500), 500) + + assert np.isclose(piecewise_sched.value(200 - 1e-10), -50) + + +def test_constant_schedule(): + """ + test ConstantSchedule + """ + constant_sched = ConstantSchedule(5) + for i in range(-100, 100): + assert np.isclose(constant_sched.value(i), 5) diff --git a/baselines/common/tests/test_segment_tree.py b/tests/test_segment_tree.py similarity index 91% rename from baselines/common/tests/test_segment_tree.py rename to tests/test_segment_tree.py index 700e0bb456..4e8de75540 100644 --- a/baselines/common/tests/test_segment_tree.py +++ b/tests/test_segment_tree.py @@ -4,6 +4,9 @@ def test_tree_set(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -18,6 +21,9 @@ def test_tree_set(): def test_tree_set_overlap(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -31,6 +37,9 @@ def test_tree_set_overlap(): def test_prefixsum_idx(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[2] = 1.0 @@ -45,6 +54,9 @@ def test_prefixsum_idx(): def test_prefixsum_idx2(): + """ + test Segment Tree data structure + """ tree = SumSegmentTree(4) tree[0] = 0.5 @@ -61,6 +73,9 @@ def 
test_prefixsum_idx2(): def test_max_interval_tree(): + """ + test Segment Tree data structure + """ tree = MinSegmentTree(4) tree[0] = 1.0 diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py new file mode 100644 index 0000000000..e810e21d90 --- /dev/null +++ b/tests/test_tf_util.py @@ -0,0 +1,43 @@ +# tests for tf_util +import tensorflow as tf + +from baselines.common.tf_util import function, initialize, single_threaded_session + + +def test_function(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + y_ph = tf.placeholder(tf.int32, (), name="y") + z_ph = 3 * x_ph + 2 * y_ph + linear_fn = function([x_ph, y_ph], z_ph, givens={y_ph: 0}) + + with single_threaded_session(): + initialize() + + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +def test_multikwargs(): + """ + test the function function in tf_util + """ + with tf.Graph().as_default(): + x_ph = tf.placeholder(tf.int32, (), name="x") + with tf.variable_scope("other"): + x2_ph = tf.placeholder(tf.int32, (), name="x") + z_ph = 3 * x_ph + 2 * x2_ph + + linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0}) + with single_threaded_session(): + initialize() + assert linear_fn(2) == 6 + assert linear_fn(2, 2) == 10 + + +if __name__ == '__main__': + test_function() + test_multikwargs() diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py new file mode 100644 index 0000000000..9c4f615048 --- /dev/null +++ b/tests/test_vec_normalize.py @@ -0,0 +1,58 @@ +import subprocess + +import gym +import numpy as np + +from baselines.common.running_mean_std import RunningMeanStd +from baselines.common.vec_env.dummy_vec_env import DummyVecEnv +from baselines.common.vec_env.vec_normalize import VecNormalize +from .test_common import _assert_eq + +ENV_ID = 'BreakoutNoFrameskip-v4' + + +def test_runningmeanstd(): + """Test RunningMeanStd object""" + for (x_1, x_2, x_3) in [ + (np.random.randn(3), np.random.randn(4), np.random.randn(5)), + (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))]: + rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:]) + + x_cat = np.concatenate([x_1, x_2, x_3], axis=0) + moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)] + rms.update(x_1) + rms.update(x_2) + rms.update(x_3) + moments_2 = [rms.mean, rms.var] + + assert np.allclose(moments_1, moments_2) + + +def test_vec_env(): + """Test VecNormalize Object""" + + def make_env(): + return gym.make(ENV_ID) + + env = DummyVecEnv([make_env]) + env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.) + _, done = env.reset(), [False] + while not done[0]: + actions = [env.action_space.sample()] + obs, _, done, _ = env.step(actions) + assert np.max(obs) <= 10 + + +def test_mpi_runningmeanstd(): + """Test RunningMeanStd object for MPI""" + return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', + 'python', '-m', 'baselines.common.mpi_running_mean_std']) + _assert_eq(return_code, 0) + + +def test_mpi_moments(): + """ + test running mean std function + """ + subprocess.check_call(['mpirun', '--allow-run-as-root', '-np', '3', 'python', '-c', + 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
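The last_gae_lam loop added to Runner.run earlier in this patch is the Generalized Advantage Estimation recurrence: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t), accumulated as adv_t = delta_t + gamma * lam * (1 - done_{t+1}) * adv_{t+1}. The sketch below reproduces that recurrence on toy NumPy arrays so the bootstrapping can be checked in isolation; it is an illustration only, and the array sizes and random values are invented for the example.

import numpy as np

# Toy rollout buffers shaped (n_steps, n_envs); all numbers here are invented.
n_steps, n_envs = 4, 2
gamma, lam = 0.99, 0.95
rewards = np.random.randn(n_steps, n_envs).astype(np.float32)
values = np.random.randn(n_steps, n_envs).astype(np.float32)
dones = np.zeros((n_steps, n_envs), dtype=np.float32)      # per-step terminal flags
last_values = np.random.randn(n_envs).astype(np.float32)   # V(s) after the final step
last_dones = np.zeros(n_envs, dtype=np.float32)            # terminal flags after the final step

mb_advs = np.zeros_like(rewards)
last_gae_lam = 0
for step in reversed(range(n_steps)):
    if step == n_steps - 1:
        nextnonterminal = 1.0 - last_dones
        nextvalues = last_values
    else:
        nextnonterminal = 1.0 - dones[step + 1]
        nextvalues = values[step + 1]
    # one-step TD residual, then the exponentially weighted (gamma * lam) accumulation
    delta = rewards[step] + gamma * nextvalues * nextnonterminal - values[step]
    mb_advs[step] = last_gae_lam = delta + gamma * lam * nextnonterminal * last_gae_lam

mb_returns = mb_advs + values   # targets for the value-function loss, as in the patch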
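swap_and_flatten (the renamed sf01 helper) turns a rollout batch laid out as (n_steps, n_envs, ...) into a flat batch of (n_steps * n_envs, ...) samples, grouped by environment first. A small shape check, using made-up dimensions, illustrates the transformation; the helper body is copied from the patch.

import numpy as np

def swap_and_flatten(arr):
    # Same logic as the helper in the patch: (n_steps, n_envs, *rest) -> (n_steps * n_envs, *rest)
    shape = arr.shape
    return arr.swapaxes(0, 1).reshape(shape[0] * shape[1], *shape[2:])

# Hypothetical Atari rollout: 128 steps, 8 envs, 84x84 frames with 4 stacked channels
batch = np.zeros((128, 8, 84, 84, 4), dtype=np.uint8)
print(swap_and_flatten(batch).shape)   # (1024, 84, 84, 4): one row per (env, step) sample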
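plot_curves smooths the scattered episode rewards with window_func(x, y, EPISODES_WINDOW, np.mean), a rolling mean over the last EPISODES_WINDOW episodes built from a NumPy stride trick. The following sketch shows the mechanics on a tiny, invented series; rolling_window and window_func are copied from results_plotter.py as rewritten above.

import numpy as np

def rolling_window(array, window):
    # Stride trick from results_plotter.py: view of shape (..., n - window + 1, window)
    shape = array.shape[:-1] + (array.shape[-1] - window + 1, window)
    strides = array.strides + (array.strides[-1],)
    return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides)

def window_func(var_1, var_2, window, func):
    # Apply func over a rolling window of var_2 and trim var_1 to match
    var_2_window = rolling_window(var_2, window)
    return var_1[window - 1:], func(var_2_window, axis=-1)

episodes = np.arange(6)                        # x values, e.g. episode index
rewards = np.array([0., 1., 2., 3., 4., 5.])   # y values, e.g. episode rewards (invented)
x, y_mean = window_func(episodes, rewards, 3, np.mean)
print(x)        # [2 3 4 5]
print(y_mean)   # [1. 2. 3. 4.]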
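Finally, the vectorized-environment wrapping that run_mujoco.py and tests/test_vec_normalize.py rely on (DummyVecEnv around an env factory, then VecNormalize for observation and reward normalization) looks roughly as follows. This is a hedged usage sketch, not part of the diff: it assumes baselines and gym are importable, and it uses Pendulum-v0 (already used in tests/test_continuous.py) purely as an example environment.

import gym

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

def make_env():
    # Pendulum-v0 is only an example; any Gym env (optionally Monitor-wrapped) works here
    return gym.make('Pendulum-v0')

env = DummyVecEnv([make_env])   # single-process vectorized wrapper expected by ppo2.learn
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

obs = env.reset()
obs, rewards, dones, infos = env.step([env.action_space.sample()])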