diff --git a/.gitignore b/.gitignore
index 63d747d..af25086 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,10 @@ src/__pycache__/
 src/log-files/
 src/.ipynb_checkpoints/
 tmp/
+doc/
+.mlt/
+.mlt
+
 # generated files
 exportToHTML/
diff --git a/src/policy.py b/src/policy.py
index f44e2d7..947e4d2 100644
--- a/src/policy.py
+++ b/src/policy.py
@@ -9,16 +9,20 @@ class Policy(object):
     """ NN-based policy approximation """
-    def __init__(self, obs_dim, act_dim, kl_targ):
+    def __init__(self, obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar):
         """
         Args:
             obs_dim: num observation dimensions (int)
             act_dim: num action dimensions (int)
             kl_targ: target KL divergence between pi_old and pi_new
+            hid1_mult: size of first hidden layer, multiplier of obs_dim
+            policy_logvar: natural log of initial policy variance
         """
         self.beta = 1.0  # dynamically adjusted D_KL loss multiplier
         self.eta = 50  # multiplier for D_KL-kl_targ hinge-squared loss
         self.kl_targ = kl_targ
+        self.hid1_mult = hid1_mult
+        self.policy_logvar = policy_logvar
         self.epochs = 20
         self.lr = None
         self.lr_multiplier = 1.0  # dynamically adjust lr when D_KL out of control
@@ -62,7 +66,7 @@ def _policy_nn(self):
         for each action dimension (i.e. variances not determined by NN).
         """
         # hidden layer sizes determined by obs_dim and act_dim (hid2 is geometric mean)
-        hid1_size = self.obs_dim * 10  # 10 empirically determined
+        hid1_size = self.obs_dim * self.hid1_mult  # default multiplier 10 empirically determined
         hid3_size = self.act_dim * 10  # 10 empirically determined
         hid2_size = int(np.sqrt(hid1_size * hid3_size))
         # heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
@@ -85,7 +89,7 @@ def _policy_nn(self):
         logvar_speed = (10 * hid3_size) // 48
         log_vars = tf.get_variable('logvars', (logvar_speed, self.act_dim), tf.float32,
                                    tf.constant_initializer(0.0))
-        self.log_vars = tf.reduce_sum(log_vars, axis=0) - 1.0
+        self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar
         print('Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, logvar_speed: {}'
               .format(hid1_size, hid2_size, hid3_size, self.lr, logvar_speed))
diff --git a/src/train.py b/src/train.py
index 35156b6..854ce82 100755
--- a/src/train.py
+++ b/src/train.py
@@ -260,7 +260,7 @@ def log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode
                 })


-def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
+def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar):
     """ Main training loop

     Args:
@@ -270,6 +270,8 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
         lam: lambda from Generalized Advantage Estimate
         kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
         batch_size: number of episodes per policy training batch
+        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
+        policy_logvar: natural log of initial policy variance
     """
     killer = GracefulKiller()
     env, obs_dim, act_dim = init_gym(env_name)
@@ -279,8 +281,8 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
     aigym_path = os.path.join('/tmp', env_name, now)
     env = wrappers.Monitor(env, aigym_path, force=True)
     scaler = Scaler(obs_dim)
-    val_func = NNValueFunction(obs_dim)
-    policy = Policy(obs_dim, act_dim, kl_targ)
+    val_func = NNValueFunction(obs_dim, hid1_mult)
+    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar)
     # run a few episodes of untrained policy to initialize scaler:
     run_policy(env, policy, scaler, logger, episodes=5)
     episode = 0
@@ -320,6 +322,13 @@ def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
     parser.add_argument('-b', '--batch_size', type=int,
                         help='Number of episodes per training batch',
                         default=20)
+    parser.add_argument('-m', '--hid1_mult', type=int,
+                        help='Size of first hidden layer for value and policy NNs'
+                             ' (integer multiplier of observation dimension)',
+                        default=10)
+    parser.add_argument('-v', '--policy_logvar', type=float,
+                        help='Initial policy log-variance (natural log of variance)',
+                        default=-1.0)

     args = parser.parse_args()
     main(**vars(args))
diff --git a/src/value_function.py b/src/value_function.py
index 1941e1b..1719b21 100644
--- a/src/value_function.py
+++ b/src/value_function.py
@@ -11,14 +11,16 @@ class NNValueFunction(object):
     """ NN-based state-value function """
-    def __init__(self, obs_dim):
+    def __init__(self, obs_dim, hid1_mult):
         """
         Args:
             obs_dim: number of dimensions in observation vector (int)
+            hid1_mult: size of first hidden layer, multiplier of obs_dim
         """
         self.replay_buffer_x = None
         self.replay_buffer_y = None
         self.obs_dim = obs_dim
+        self.hid1_mult = hid1_mult
         self.epochs = 10
         self.lr = None  # learning rate set in _build_graph()
         self._build_graph()
@@ -32,7 +34,7 @@ def _build_graph(self):
             self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs_valfunc')
             self.val_ph = tf.placeholder(tf.float32, (None,), 'val_valfunc')
             # hid1 layer size is 10x obs_dim, hid3 size is 10, and hid2 is geometric mean
-            hid1_size = self.obs_dim * 10  # 10 chosen empirically on 'Hopper-v1'
+            hid1_size = self.obs_dim * self.hid1_mult  # default multiplier 10 chosen empirically on 'Hopper-v1'
             hid3_size = 5  # 5 chosen empirically on 'Hopper-v1'
             hid2_size = int(np.sqrt(hid1_size * hid3_size))
             # heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
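
For context, a minimal sketch follows (not part of the patch; the Hopper-v1 dimensions and the numpy check are illustrative assumptions) of how the two new hyperparameters enter the code above: hid1_mult scales the first hidden layer of both networks, and policy_logvar becomes the initial per-action log-variance because the 'logvars' variable is initialized to zero.

import numpy as np

# Illustrative values only: obs_dim/act_dim assume Hopper-v1; the flag values
# match the new argparse defaults (-m 10, -v -1.0).
obs_dim, act_dim = 11, 3
hid1_mult = 10
policy_logvar = -1.0

# Hidden-layer sizes, mirroring the formulas in Policy._policy_nn()
hid1_size = obs_dim * hid1_mult                   # 110
hid3_size = act_dim * 10                          # 30
hid2_size = int(np.sqrt(hid1_size * hid3_size))   # 57 (geometric mean)

# 'logvars' starts at zero, so the initial log-variance equals policy_logvar
# and the initial per-action standard deviation is exp(policy_logvar / 2).
init_std = np.exp(policy_logvar / 2.0)

print(hid1_size, hid2_size, hid3_size)  # 110 57 30
print(round(init_std, 3))               # 0.607

Assuming the existing positional env_name argument in train.py, a run with the new flags would look like: python src/train.py Hopper-v1 -m 10 -v -1.0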