Merge pull request #1 from hill-a/fixes_cleanup
Fixes and cleanup
hill-a authored Jul 27, 2018
2 parents 978e116 + 3a4dcbd commit 5f11927
Showing 129 changed files with 8,684 additions and 5,300 deletions.
16 changes: 16 additions & 0 deletions .coveragerc
@@ -0,0 +1,16 @@
[run]
branch = False
omit =
baselines/common/tests/*
# Mujoco requires a licence
baselines/*/run_mujoco.py
baselines/ppo1/run_humanoid.py
baselines/ppo1/run_robotics.py
# HER requires mpi and Mujoco
baselines/her/experiment/

[report]
exclude_lines =
pragma: no cover
raise NotImplementedError()
if KFAC_DEBUG:
6 changes: 4 additions & 2 deletions .gitignore
@@ -2,9 +2,13 @@
*.pyc
*.pkl
*.py~
*.bak
.pytest_cache
.DS_Store
.idea
.coverage
.coverage.*
__pycache__/

# Setuptools distribution and build folders.
/dist/
@@ -34,5 +38,3 @@ src
.cache

MUJOCO_LOG.TXT


5 changes: 4 additions & 1 deletion .travis.yml
@@ -2,6 +2,9 @@ language: python
python:
- "3.6"

notifications:
email: false

services:
- docker

@@ -11,4 +14,4 @@ install:

script:
- flake8 --select=F baselines/common
- docker run baselines-test pytest
- docker run --env CODACY_PROJECT_TOKEN=$CODACY_PROJECT_TOKEN baselines-test sh -c 'pytest --cov-config .coveragerc --cov-report term --cov-report xml --cov=. && python-codacy-coverage -r coverage.xml --token=$CODACY_PROJECT_TOKEN'
33 changes: 29 additions & 4 deletions Dockerfile
@@ -1,18 +1,43 @@
FROM ubuntu:16.04

RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake
RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake libglib2.0-0 libsm6 libxext6 libfontconfig1 libxrender1
ENV CODE_DIR /root/code
ENV VENV /root/venv

COPY . $CODE_DIR/baselines
RUN \
pip install virtualenv && \
virtualenv $VENV --python=python3 && \
. $VENV/bin/activate && \
mkdir $CODE_DIR && \
cd $CODE_DIR && \
pip install --upgrade pip && \
pip install -e baselines && \
pip install pytest
pip install pytest && \
pip install pytest-cov && \
pip install codacy-coverage && \
pip install scipy && \
pip install tqdm && \
pip install joblib && \
pip install zmq && \
pip install dill && \
pip install progressbar2 && \
pip install mpi4py && \
pip install cloudpickle && \
pip install tensorflow>=1.4.0 && \
pip install click && \
pip install opencv-python && \
pip install numpy && \
pip install pandas && \
pip install pytest && \
pip install matplotlib && \
pip install seaborn && \
pip install glob2 && \
pip install gym[mujoco,atari,classic_control,robotics]

COPY . $CODE_DIR/baselines
RUN \
. $VENV/bin/activate && \
cd $CODE_DIR && \
pip install -e baselines

ENV PATH=$VENV/bin:$PATH
WORKDIR $CODE_DIR/baselines
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
<img src="data/logo.jpg" width=25% align="right" /> [![Build status](https://travis-ci.org/openai/baselines.svg?branch=master)](https://travis-ci.org/openai/baselines)
<img src="data/logo.jpg" width=25% align="right" /> [![Build Status](https://travis-ci.org/hill-a/stable-baselines.svg?branch=master)](https://travis-ci.org/hill-a/stable-baselines) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=hill-a/stable-baselines&amp;utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/3bcb4cd6d76a4270acb16b5fe6dd9efa)](https://www.codacy.com/app/baselines_janitors/stable-baselines?utm_source=github.com&utm_medium=referral&utm_content=hill-a/stable-baselines&utm_campaign=Badge_Coverage)

# Baselines

175 changes: 115 additions & 60 deletions baselines/a2c/a2c.py
@@ -1,75 +1,91 @@
import os.path as osp
import os
import time
import joblib

import numpy as np
import tensorflow as tf
from baselines import logger

from baselines.common import set_global_seeds, explained_variance
from baselines import logger
from baselines.common import set_global_seeds, explained_variance, tf_util
from baselines.common.runners import AbstractEnvRunner
from baselines.common import tf_util
from baselines.a2c.utils import discount_with_dones, Scheduler, make_path, find_trainable_variables, calc_entropy, mse

from baselines.a2c.utils import discount_with_dones
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.utils import cat_entropy, mse

class Model(object):

def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
def __init__(self, policy, ob_space, ac_space, n_envs, n_steps,
ent_coef=0.01, vf_coef=0.25, max_grad_norm=0.5, learning_rate=7e-4,
alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lr_schedule='linear'):
"""
The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783
:param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
:param ob_space: (Gym Space) Observation space
:param ac_space: (Gym Space) Action space
:param n_envs: (int) The number of environments
:param n_steps: (int) The number of steps to run for each environment
:param ent_coef: (float) Entropy coefficient for the loss calculation
:param vf_coef: (float) Value function coefficient for the loss calculation
:param max_grad_norm: (float) The maximum value for the gradient clipping
:param learning_rate: (float) The learning rate
:param alpha: (float) RMS prop optimizer decay
:param epsilon: (float) RMS prop optimizer epsilon
:param total_timesteps: (int) The total number of samples
:param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
'double_linear_con', 'middle_drop' or 'double_middle_drop')
"""

sess = tf_util.make_session()
nbatch = nenvs*nsteps
n_batch = n_envs * n_steps

A = tf.placeholder(tf.int32, [nbatch])
ADV = tf.placeholder(tf.float32, [nbatch])
R = tf.placeholder(tf.float32, [nbatch])
LR = tf.placeholder(tf.float32, [])
actions_ph = tf.placeholder(tf.int32, [n_batch])
advs_ph = tf.placeholder(tf.float32, [n_batch])
rewards_ph = tf.placeholder(tf.float32, [n_batch])
learning_rate_ph = tf.placeholder(tf.float32, [])

step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False)
train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True)

neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
pg_loss = tf.reduce_mean(ADV * neglogpac)
vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
entropy = tf.reduce_mean(cat_entropy(train_model.pi))
loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.policy, labels=actions_ph)
pg_loss = tf.reduce_mean(advs_ph * neglogpac)
vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
entropy = tf.reduce_mean(calc_entropy(train_model.policy))
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

params = find_trainable_variables("model")
grads = tf.gradients(loss, params)
if max_grad_norm is not None:
grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
grads = list(zip(grads, params))
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph, decay=alpha, epsilon=epsilon)
_train = trainer.apply_gradients(grads)

lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule)

def train(obs, states, rewards, masks, actions, values):
advs = rewards - values
for step in range(len(obs)):
cur_lr = lr.value()
td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
for _ in range(len(obs)):
cur_lr = learning_rate.value()
td_map = {train_model.obs_ph: obs, actions_ph: actions, advs_ph: advs,
rewards_ph: rewards, learning_rate_ph: cur_lr}
if states is not None:
td_map[train_model.S] = states
td_map[train_model.M] = masks
td_map[train_model.states_ph] = states
td_map[train_model.masks_ph] = masks
policy_loss, value_loss, policy_entropy, _ = sess.run(
[pg_loss, vf_loss, entropy, _train],
td_map
)
return policy_loss, value_loss, policy_entropy

def save(save_path):
ps = sess.run(params)
make_path(osp.dirname(save_path))
joblib.dump(ps, save_path)
parameters = sess.run(params)
make_path(os.path.dirname(save_path))
joblib.dump(parameters, save_path)

def load(load_path):
loaded_params = joblib.load(load_path)
restores = []
for p, loaded_p in zip(params, loaded_params):
restores.append(p.assign(loaded_p))
for param, loaded_p in zip(params, loaded_params):
restores.append(param.assign(loaded_p))
sess.run(restores)

self.train = train
@@ -82,16 +98,30 @@ def load(load_path):
self.load = load
tf.global_variables_initializer().run(session=sess)

class Runner(AbstractEnvRunner):

def __init__(self, env, model, nsteps=5, gamma=0.99):
super().__init__(env=env, model=model, nsteps=nsteps)
class Runner(AbstractEnvRunner):
def __init__(self, env, model, n_steps=5, gamma=0.99):
"""
A runner to learn the policy of an environment for a model
:param env: (Gym environment) The environment to learn from
:param model: (Model) The model to learn
:param n_steps: (int) The number of steps to run for each environment
:param gamma: (float) Discount factor
"""
super(Runner, self).__init__(env=env, model=model, n_steps=n_steps)
self.gamma = gamma

def run(self):
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
"""
Run a learning step of the model
:return: ([float], [float], [float], [bool], [float], [float])
observations, states, rewards, masks, actions, values
"""
mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
mb_states = self.states
for n in range(self.nsteps):
for _ in range(self.n_steps):
actions, values, states, _ = self.model.step(self.obs, self.states, self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
@@ -102,11 +132,11 @@ def run(self):
self.dones = dones
for n, done in enumerate(dones):
if done:
self.obs[n] = self.obs[n]*0
self.obs[n] = self.obs[n] * 0
self.obs = obs
mb_rewards.append(rewards)
mb_dones.append(self.dones)
#batch of steps to batch of rollouts
# batch of steps to batch of rollouts
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0).reshape(self.batch_ob_shape)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
@@ -115,12 +145,12 @@
mb_masks = mb_dones[:, :-1]
mb_dones = mb_dones[:, 1:]
last_values = self.model.value(self.obs, self.states, self.dones).tolist()
#discount/bootstrap off value fn
# discount/bootstrap off value fn
for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
rewards = rewards.tolist()
dones = dones.tolist()
if dones[-1] == 0:
rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
rewards = discount_with_dones(rewards + [value], dones + [0], self.gamma)[:-1]
else:
rewards = discount_with_dones(rewards, dones, self.gamma)
mb_rewards[n] = rewards
@@ -130,31 +160,56 @@
mb_masks = mb_masks.flatten()
return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values

def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):

def learn(policy, env, seed, n_steps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5,
learning_rate=7e-4, lr_schedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
"""
Return a trained A2C model.
:param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
:param env: (Gym environment) The environment to learn from
:param seed: (int) The initial seed for training
:param n_steps: (int) The number of steps to run for each environment
:param total_timesteps: (int) The total number of samples
:param vf_coef: (float) Value function coefficient for the loss calculation
:param ent_coef: (float) Entropy coefficient for the loss calculation
:param max_grad_norm: (float) The maximum value for the gradient clipping
:param learning_rate: (float) The learning rate
:param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
'double_linear_con', 'middle_drop' or 'double_middle_drop')
:param epsilon: (float) RMS prop optimizer epsilon
:param alpha: (float) RMS prop optimizer decay
:param gamma: (float) Discount factor
:param log_interval: (int) The number of timesteps before logging.
:return: (Model) A2C model
"""
set_global_seeds(seed)

nenvs = env.num_envs
n_envs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

nbatch = nenvs*nsteps
tstart = time.time()
for update in range(1, total_timesteps//nbatch+1):
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, n_envs=n_envs,
n_steps=n_steps, ent_coef=ent_coef,
vf_coef=vf_coef, max_grad_norm=max_grad_norm, learning_rate=learning_rate,
alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
lr_schedule=lr_schedule)
runner = Runner(env, model, n_steps=n_steps, gamma=gamma)

n_batch = n_envs * n_steps
t_start = time.time()
for update in range(1, total_timesteps // n_batch + 1):
obs, states, rewards, masks, actions, values = runner.run()
policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
nseconds = time.time()-tstart
fps = int((update*nbatch)/nseconds)
_, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
n_seconds = time.time() - t_start
fps = int((update * n_batch) / n_seconds)
if update % log_interval == 0 or update == 1:
ev = explained_variance(values, rewards)
explained_var = explained_variance(values, rewards)
logger.record_tabular("nupdates", update)
logger.record_tabular("total_timesteps", update*nbatch)
logger.record_tabular("total_timesteps", update * n_batch)
logger.record_tabular("fps", fps)
logger.record_tabular("policy_entropy", float(policy_entropy))
logger.record_tabular("value_loss", float(value_loss))
logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("explained_variance", float(explained_var))
logger.dump_tabular()
env.close()
return model
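
A minimal usage sketch (not part of the commit) showing how the renamed learn() API above would be called. The keyword arguments follow the new signature in this diff; the import paths for CnnPolicy, make_atari_env and VecFrameStack are assumptions carried over from upstream baselines and may differ in this fork.

from baselines.a2c.a2c import learn
from baselines.a2c.policies import CnnPolicy                        # assumed location, as in upstream baselines
from baselines.common.cmd_util import make_atari_env                # assumed helper for vectorized Atari envs
from baselines.common.vec_env.vec_frame_stack import VecFrameStack  # assumed frame-stacking wrapper

seed = 0
# 16 parallel environments with 4-frame stacking, a common A2C setup for Atari
env = VecFrameStack(make_atari_env('BreakoutNoFrameskip-v4', num_env=16, seed=seed), 4)

# keyword names follow the signature renamed in this commit (n_steps, learning_rate, lr_schedule, ...)
model = learn(CnnPolicy, env, seed,
              n_steps=5,
              total_timesteps=int(1e6),
              learning_rate=7e-4,
              lr_schedule='linear',
              log_interval=100)
# learn() closes the environment itself and returns the trained Model instance
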
(Diffs for the remaining changed files are not shown in this view.)