baseline1_train.py
#!/usr/bin/python3
import tensorflow as tf;
import tensorflow_probability as tfp;
from tf_agents.environments import tf_py_environment, suite_gym; # environment and problem
from tf_agents.agents.ppo import ppo_agent; # ppo agent
from tf_agents.networks import network, utils;
from tf_agents.specs import tensor_spec, distribution_spec;
from tf_agents.trajectories import time_step, trajectory;
from tf_agents.replay_buffers import tf_uniform_replay_buffer; # replay buffer
from tf_agents.policies import random_tf_policy, policy_saver; # random policy
from FuturesEnv import FuturesEnv;
batch_size = 64;
class ActorNetwork(network.DistributionNetwork):
  # Actor network: encodes the observation with a GRU and outputs a joint categorical
  # distribution over the three discrete action components (lever, sell price, buy price).
  def __init__(self, obs_spec, action_spec, logits_init_output_factor = 0.1, name = 'ActorNetwork'):
    output_spec = distribution_spec.DistributionSpec(tfp.distributions.JointDistributionSequential, None, action_spec);
    super(ActorNetwork, self).__init__(
      input_tensor_spec = obs_spec,
      state_spec = (),
      output_spec = output_spec,
      name = name);
    num_actions = action_spec.maximum - action_spec.minimum + 1;
    self.gru = tf.keras.layers.GRU(units = 100, name = "gru");
    self.lever_dense = tf.keras.layers.Dense(
      num_actions[0],
      kernel_initializer = tf.keras.initializers.VarianceScaling(scale = logits_init_output_factor),
      bias_initializer = tf.keras.initializers.Zeros(),
      name = 'lever_logits');
    self.sellprice_dense = tf.keras.layers.Dense(
      num_actions[1],
      kernel_initializer = tf.keras.initializers.VarianceScaling(scale = logits_init_output_factor),
      bias_initializer = tf.keras.initializers.Zeros(),
      name = 'sell_price_logits');
    self.buyprice_dense = tf.keras.layers.Dense(
      num_actions[2],
      kernel_initializer = tf.keras.initializers.VarianceScaling(scale = logits_init_output_factor),
      bias_initializer = tf.keras.initializers.Zeros(),
      name = 'buy_price_logits');
  def call(self, inputs, step_type = None, network_state = ()):
    flatten = tf.keras.layers.Flatten()(inputs);
    # add a length-1 time dimension so the GRU input has shape (batch, 1, features)
    flatten = tf.keras.layers.Lambda(lambda x: tf.expand_dims(tf.cast(x, dtype = tf.float32), axis = 1))(flatten);
    embedding = self.gru(flatten);
    lever_logits = self.lever_dense(embedding);
    sellprice_logits = self.sellprice_dense(embedding);
    buyprice_logits = self.buyprice_dense(embedding);
    action = tfp.distributions.JointDistributionSequential([
      tfp.distributions.Categorical(lever_logits),
      tfp.distributions.Categorical(sellprice_logits),
      tfp.distributions.Categorical(buyprice_logits)
    ]);
    # tf_agents networks must return (output, network_state)
    return action, network_state;
class ValueNetwork(network.Network):
  # Value network: maps the flattened observation to a single state-value estimate.
  def __init__(self, obs_spec, name = "ValueNetwork"):
    super(ValueNetwork, self).__init__(
      input_tensor_spec = obs_spec,
      state_spec = (),
      name = name);
    self.dense = tf.keras.layers.Dense(
      1,
      kernel_initializer = tf.keras.initializers.Constant([2,1]),
      bias_initializer = tf.keras.initializers.Constant([5]));
  def call(self, inputs, step_type = None, network_state = ()):
    flatten = tf.keras.layers.Flatten()(inputs);
    flatten = tf.keras.layers.Lambda(lambda x: tf.cast(x, dtype = tf.float32))(flatten);
    logits = self.dense(flatten);
    return logits, network_state;
def main():
  # the environment serves as the dataset in reinforcement learning
  train_env = tf_py_environment.TFPyEnvironment(FuturesEnv());
  eval_env = tf_py_environment.TFPyEnvironment(FuturesEnv());
  # create agent
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = 1e-3);
  tf_agent = ppo_agent.PPOAgent(
    time_step.time_step_spec(train_env.observation_spec()),
    train_env.action_spec(),
    optimizer = optimizer,
    actor_net = ActorNetwork(train_env.observation_spec(),
                             train_env.action_spec()),
    value_net = ValueNetwork(train_env.observation_spec()),
    normalize_observations = False,
    use_gae = True,
    lambda_value = 0.95
  );
  tf_agent.initialize();
  # replay buffer
  replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec = tf_agent.collect_data_spec,
    batch_size = train_env.batch_size,
    max_length = 100000);
  # each sample has shape (batch_size, 2, ...) because num_steps = 2
  dataset = replay_buffer.as_dataset(num_parallel_calls = 3, sample_batch_size = batch_size, num_steps = 2).prefetch(3);
  iterator = iter(dataset);
  # policy saver
  saver = policy_saver.PolicySaver(tf_agent.policy);
  # training
  print("training...");
  for train_iter in range(20000):
    # collect initial trajectories with a random policy to avoid a cold start
    if train_iter == 0:
      random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec());
      for _ in range(1000):
        status = train_env.current_time_step();
        action = random_policy.action(status);
        next_status = train_env.step(action.action);
        traj = trajectory.from_transition(status, action, next_status);
        replay_buffer.add_batch(traj);
    # collect one environment step with the collect policy every training iteration
    for _ in range(1):
      status = train_env.current_time_step();
      action = tf_agent.collect_policy.action(status);
      next_status = train_env.step(action.action);
      traj = trajectory.from_transition(status, action, next_status);
      replay_buffer.add_batch(traj);
    # sample a batch of experience and update the agent
    experience, unused_info = next(iterator);
    train_loss = tf_agent.train(experience);
    if tf_agent.train_step_counter.numpy() % 200 == 0:
      print('step = {0}: loss = {1}'.format(tf_agent.train_step_counter.numpy(), train_loss.loss));
    if tf_agent.train_step_counter.numpy() % 1000 == 0:
      # save policy
      saver.save('checkpoints/policy_%d' % tf_agent.train_step_counter.numpy());
      # get the average return of the updated policy over 10 evaluation episodes
      total_return = 0.0;
      for _ in range(10):
        status = eval_env.reset();
        episode_return = 0.0;
        while not status.is_last():
          action = tf_agent.policy.action(status);
          status = eval_env.step(action.action);
          episode_return += status.reward;
        total_return += episode_return;
      avg_return = total_return / 10;
      print('step = {0}: Average Return = {1}'.format(tf_agent.train_step_counter.numpy(), avg_return));
  # play the futures environment one last time
  print("evaluating...");
  status = eval_env.reset();
  total_reward = 0;
  while not status.is_last():
    action = tf_agent.policy.action(status);
    status = eval_env.step(action.action);
    total_reward += status.reward;
  print("total reward = " + str(total_reward));
if __name__ == "__main__":
  assert tf.executing_eagerly();
  main();
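
A minimal reload sketch (not part of baseline1_train.py): the training loop above exports a SavedModel every 1000 steps via PolicySaver, so a directory such as checkpoints/policy_1000 (an example of a path the loop produces; adjust to an existing checkpoint) could be loaded back with tf.saved_model.load and driven through the environment the same way the evaluation loop does.

import tensorflow as tf;
from tf_agents.environments import tf_py_environment;
from FuturesEnv import FuturesEnv;

# load a policy exported by baseline1_train.py (example checkpoint directory)
saved_policy = tf.saved_model.load('checkpoints/policy_1000');
eval_env = tf_py_environment.TFPyEnvironment(FuturesEnv());
status = eval_env.reset();
total_reward = 0;
while not status.is_last():
  action = saved_policy.action(status);
  status = eval_env.step(action.action);
  total_reward += status.reward;
print("total reward = " + str(total_reward));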