add command line options for hid1 size and initial policy variance
pat-coady committed Oct 29, 2017
1 parent 29ce990 commit 7c61906
Showing 4 changed files with 27 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -7,6 +7,10 @@ src/__pycache__/
src/log-files/
src/.ipynb_checkpoints/
tmp/
doc/
.mlt/
.mlt


# generated files
exportToHTML/
10 changes: 7 additions & 3 deletions src/policy.py
@@ -9,16 +9,20 @@

class Policy(object):
""" NN-based policy approximation """
def __init__(self, obs_dim, act_dim, kl_targ):
def __init__(self, obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar):
"""
Args:
obs_dim: num observation dimensions (int)
act_dim: num action dimensions (int)
kl_targ: target KL divergence between pi_old and pi_new
hid1_mult: size of first hidden layer, multiplier of obs_dim
policy_logvar: natural log of initial policy variance
"""
self.beta = 1.0 # dynamically adjusted D_KL loss multiplier
self.eta = 50 # multiplier for D_KL-kl_targ hinge-squared loss
self.kl_targ = kl_targ
self.hid1_mult = hid1_mult
self.policy_logvar = policy_logvar
self.epochs = 20
self.lr = None
self.lr_multiplier = 1.0 # dynamically adjust lr when D_KL out of control
@@ -62,7 +66,7 @@ def _policy_nn(self):
for each action dimension (i.e. variances not determined by NN).
"""
# hidden layer sizes determined by obs_dim and act_dim (hid2 is geometric mean)
hid1_size = self.obs_dim * 10 # 10 empirically determined
hid1_size = self.obs_dim * self.hid1_mult # default multiplier of 10 empirically determined
hid3_size = self.act_dim * 10 # 10 empirically determined
hid2_size = int(np.sqrt(hid1_size * hid3_size))
# heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
@@ -85,7 +89,7 @@ def _policy_nn(self):
logvar_speed = (10 * hid3_size) // 48
log_vars = tf.get_variable('logvars', (logvar_speed, self.act_dim), tf.float32,
tf.constant_initializer(0.0))
self.log_vars = tf.reduce_sum(log_vars, axis=0) - 1.0
self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar

print('Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, logvar_speed: {}'
.format(hid1_size, hid2_size, hid3_size, self.lr, logvar_speed))
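
The sizing arithmetic in this hunk is easy to sanity-check. Below is a minimal numpy sketch, not the author's TensorFlow graph; obs_dim and act_dim are illustrative values (roughly Hopper-v1 sized) and the two parameters use the new defaults:

import numpy as np

obs_dim, act_dim = 11, 3                 # illustrative; real values come from the Gym env
hid1_mult, policy_logvar = 10, -1.0      # the new command line defaults

hid1_size = obs_dim * hid1_mult                   # was hard-coded as obs_dim * 10 before this commit
hid3_size = act_dim * 10
hid2_size = int(np.sqrt(hid1_size * hid3_size))   # geometric mean of hid1 and hid3

# 'logvars' is initialized to zeros, so the reduce_sum contributes 0 and the
# initial per-dimension log variance is just policy_logvar.
init_logvar = 0.0 + policy_logvar
print(hid1_size, hid2_size, hid3_size, round(float(np.exp(init_logvar)), 3))  # 110 57 30 0.368

With the defaults (hid1_mult=10, policy_logvar=-1.0) this reproduces the previously hard-coded sizes and the old "- 1.0" offset, so existing runs behave the same.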
15 changes: 12 additions & 3 deletions src/train.py
@@ -260,7 +260,7 @@ def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode
})


def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
""" Main training loop
Args:
@@ -270,6 +270,8 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
lam: lambda from Generalized Advantage Estimate
kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
batch_size: number of episodes per policy training batch
hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
policy_logvar: natural log of initial policy variance
"""
killer = GracefulKiller()
env, obs_dim, act_dim = init_gym(env_name)
@@ -279,8 +281,8 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
aigym_path = os.path.join('/tmp', env_name, now)
env = wrappers.Monitor(env, aigym_path, force=True)
scaler = Scaler(obs_dim)
val_func = NNValueFunction(obs_dim)
policy = Policy(obs_dim, act_dim, kl_targ)
val_func = NNValueFunction(obs_dim, hid1_mult)
policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
# run a few episodes of untrained policy to initialize scaler:
run_policy(env, policy, scaler, logger, episodes=5)
episode = 0
@@ -320,6 +322,13 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
parser.add_argument('-b', '--batch_size', type=int,
help='Number of episodes per training batch',
default=20)
parser.add_argument('-m', '--hid1_mult', type=int,
help='Size of first hidden layer for value and policy NNs'
' (integer multiplier of observation dimension)',
default=10)
parser.add_argument('-v', '--policy_logvar', type=float,
help='Initial policy log-variance (natural log of variance)',
default=-1.0)

args = parser.parse_args()
main(**vars(args))
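
As a rough sketch of how the two new flags reach main() via main(**vars(args)), here is a self-contained argparse fragment; the parsed values below are made up for illustration, and env_name plus the other existing arguments are omitted:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--hid1_mult', type=int, default=10)
parser.add_argument('-v', '--policy_logvar', type=float, default=-1.0)

args = parser.parse_args(['-m', '5', '-v', '-2.0'])
print(vars(args))   # {'hid1_mult': 5, 'policy_logvar': -2.0}

Leaving both flags off keeps the defaults of 10 and -1.0, which match the values previously hard-coded in policy.py and value_function.py.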
6 changes: 4 additions & 2 deletions src/value_function.py
@@ -11,14 +11,16 @@

class NNValueFunction(object):
""" NN-based state-value function """
def __init__(self, obs_dim):
def __init__(self, obs_dim, hid1_mult):
"""
Args:
obs_dim: number of dimensions in observation vector (int)
hid1_mult: size of first hidden layer, multiplier of obs_dim
"""
self.replay_buffer_x = None
self.replay_buffer_y = None
self.obs_dim = obs_dim
self.hid1_mult = hid1_mult
self.epochs = 10
self.lr = None # learning rate set in _build_graph()
self._build_graph()
@@ -32,7 +34,7 @@ def _build_graph(self):
self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc')
self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc')
# hid1 layer size is obs_dim * hid1_mult, hid3 size is 5, and hid2 is geometric mean
hid1_size = self.obs_dim * 10 # 10 chosen empirically on 'Hopper-v1'
hid1_size = self.obs_dim * self.hid1_mult # default multiplier 10 chosen empirically on 'Hopper-v1'
hid3_size = 5 # 5 chosen empirically on 'Hopper-v1'
hid2_size = int(np.sqrt(hid1_size * hid3_size))
# heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
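
The value function uses the same hid1_mult scaling but keeps a small fixed hid3 of 5 units; a quick worked example with an assumed obs_dim of 11:

import numpy as np

obs_dim, hid1_mult = 11, 10               # illustrative values
hid1_size = obs_dim * hid1_mult           # 110
hid3_size = 5
hid2_size = int(np.sqrt(hid1_size * hid3_size))
print(hid1_size, hid2_size, hid3_size)    # 110 23 5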
