Update for the latest library environment #61

Closed · wants to merge 1 commit
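
These diffs update each script from the old Gym API to the Gym >= 0.26 / Gymnasium API, where env.reset() returns an (observation, info) tuple and env.step() returns five values. A minimal sketch of the new interface for reference (illustrative only, not part of the diff; note the fourth and fifth return values are the truncation flag and the info dict):

import gym

env = gym.make("CartPole-v1")

s, info = env.reset()                # reset() now returns (observation, info)
done = False
while not done:
    a = env.action_space.sample()    # random action, just to exercise the API
    s_prime, r, terminated, truncated, info = env.step(a)
    done = terminated or truncated   # combine both episode-ending signals
    s = s_prime
env.close()
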
4 changes: 2 additions & 2 deletions REINFORCE.py
@@ -44,14 +44,14 @@ def main():


for n_epi in range(10000):
s = env.reset()
s = env.reset()[0]
done = False

while not done: # CartPole-v1 is forced to terminate at step 500.
prob = pi(torch.from_numpy(s).float())
m = Categorical(prob)
a = m.sample()
s_prime, r, done, info = env.step(a.item())
s_prime, r, done, info, _ = env.step(a.item())
pi.put_data((r,prob[a]))
s = s_prime
score += r
23 changes: 17 additions & 6 deletions acer.py
@@ -1,6 +1,7 @@
import gym
import random
import collections
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -34,7 +35,7 @@ def sample(self, on_policy=False):

s_lst, a_lst, r_lst, prob_lst, done_lst, is_first_lst = [], [], [], [], [], []
for seq in mini_batch:
is_first = True # Flag for indicating whether the transition is the first item from a sequence
is_first = True
for transition in seq:
s, a, r, prob, done = transition

@@ -47,9 +48,19 @@ def sample(self, on_policy=False):
is_first_lst.append(is_first)
is_first = False

s,a,r,prob,done_mask,is_first = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
r_lst, torch.tensor(prob_lst, dtype=torch.float), done_lst, \
is_first_lst
s_np = np.float32(s_lst)
a_np = np.int64(a_lst)
r_np = np.float32(r_lst)
prob_np = np.float32(prob_lst)
done_mask_np = np.float32(done_lst)
is_first_np = np.float32(is_first_lst)

s = torch.tensor(s_np)
a = torch.tensor(a_np, dtype=torch.int64)
r = torch.tensor(r_np)
prob = torch.tensor(prob_np)
done_mask = torch.tensor(done_mask_np)
is_first = torch.tensor(is_first_np)
return s,a,r,prob,done_mask,is_first

def size(self):
@@ -118,15 +129,15 @@ def main():
print_interval = 20

for n_epi in range(10000):
s = env.reset()
s = env.reset()[0]
done = False

while not done:
seq_data = []
for t in range(rollout_len):
prob = model.pi(torch.from_numpy(s).float())
a = Categorical(prob).sample().item()
s_prime, r, done, info = env.step(a)
s_prime, r, done, info, _ = env.step(a)
seq_data.append((s, a, r/100.0, prob.detach().numpy(), done))

score +=r
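
The rewritten sample() above converts each Python list to a NumPy array before building the tensors; recent PyTorch versions warn that constructing a tensor directly from a list of numpy.ndarrays is slow. A minimal sketch of the pattern (hypothetical data, illustrative only):

import numpy as np
import torch

# Hypothetical minibatch of 32 CartPole states (4-dimensional each).
s_lst = [np.random.rand(4).astype(np.float32) for _ in range(32)]

s_slow = torch.tensor(s_lst, dtype=torch.float)            # works, but newer PyTorch warns this path is slow
s_fast = torch.tensor(np.array(s_lst), dtype=torch.float)  # build one ndarray first, then convert once
print(s_fast.shape)                                         # torch.Size([32, 4])
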
37 changes: 19 additions & 18 deletions actor_critic.py
@@ -35,22 +35,22 @@ def put_data(self, transition):
self.data.append(transition)

def make_batch(self):
s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
for transition in self.data:
s,a,r,s_prime,done = transition
s_lst.append(s)
a_lst.append([a])
r_lst.append([r/100.0])
s_prime_lst.append(s_prime)
done_mask = 0.0 if done else 1.0
done_lst.append([done_mask])

s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float)
transposed_data = list(map(list, zip(*self.data)))
s_lst = transposed_data[0]
a_lst = list(map(lambda x: [x], transposed_data[1]))
r_lst = list(map(lambda x: [x/100.0], transposed_data[2]))
s_prime_lst = transposed_data[3]
done_lst = list(map(lambda x: [0.0] if x else [1.0], transposed_data[4]))

s_batch = torch.tensor(s_lst, dtype=torch.float)
a_batch = torch.tensor(a_lst)
r_batch = torch.tensor(r_lst, dtype=torch.float)
s_prime_batch = torch.tensor(s_prime_lst, dtype=torch.float)
done_batch = torch.tensor(done_lst, dtype=torch.float)

self.data = []
return s_batch, a_batch, r_batch, s_prime_batch, done_batch

def train_net(self):
s, a, r, s_prime, done = self.make_batch()
td_target = r + gamma * self.v(s_prime) * done
@@ -61,8 +61,9 @@ def train_net(self):
loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach())

self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
loss = loss.mean()
loss.backward()
self.optimizer.step()

def main():
env = gym.make('CartPole-v1')
@@ -72,13 +73,13 @@ def main():

for n_epi in range(10000):
done = False
s = env.reset()
s = env.reset()[0]
while not done:
for t in range(n_rollout):
prob = model.pi(torch.from_numpy(s).float())
m = Categorical(prob)
a = m.sample().item()
s_prime, r, done, info = env.step(a)
s_prime, r, done, info, _ = env.step(a)
model.put_data((s,a,r,s_prime,done))

s = s_prime
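
The new make_batch() above replaces the per-transition loop with a zip(*...) transpose of the stored tuples. A small standalone illustration of that pattern (dummy transitions, names hypothetical):

import numpy as np
import torch

# Two hypothetical (s, a, r, s_prime, done) transitions.
data = [
    (np.zeros(4, dtype=np.float32), 0, 1.0, np.ones(4, dtype=np.float32), False),
    (np.ones(4, dtype=np.float32), 1, 1.0, np.zeros(4, dtype=np.float32), True),
]

# zip(*data) groups the tuples field by field: states, actions, rewards, next states, dones.
s_lst, a_lst, r_lst, s_prime_lst, done_lst = map(list, zip(*data))

s_batch = torch.tensor(np.array(s_lst), dtype=torch.float)               # shape (2, 4)
a_batch = torch.tensor(a_lst).unsqueeze(1)                               # shape (2, 1)
r_batch = torch.tensor([[r / 100.0] for r in r_lst], dtype=torch.float)  # scaled as in the script
done_batch = torch.tensor([[0.0] if d else [1.0] for d in done_lst])     # 0.0 where the episode ended
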
16 changes: 10 additions & 6 deletions dqn.py
@@ -1,6 +1,7 @@
import gym
import collections
import random
import numpy as np

import torch
import torch.nn as nn
@@ -27,14 +28,17 @@ def sample(self, n):
for transition in mini_batch:
s, a, r, s_prime, done_mask = transition
s_lst.append(s)
a_lst.append([a])
a_lst.append([a])
r_lst.append([r])
s_prime_lst.append(s_prime)
done_mask_lst.append([done_mask])

return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_mask_lst)
return (torch.tensor(np.array(s_lst), dtype=torch.float),
torch.tensor(np.array(a_lst), dtype=torch.int64),
torch.tensor(np.array(r_lst), dtype=torch.float),
torch.tensor(np.array(s_prime_lst), dtype=torch.float),
torch.tensor(np.array(done_mask_lst), dtype=torch.float))


def size(self):
return len(self.buffer)
@@ -87,12 +91,12 @@ def main():

for n_epi in range(10000):
epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
s = env.reset()
s = env.reset()[0]
done = False

while not done:
a = q.sample_action(torch.from_numpy(s).float(), epsilon)
s_prime, r, done, info = env.step(a)
s_prime, r, done, info, _ = env.step(a)
done_mask = 0.0 if done else 1.0
memory.put((s,a,r/100.0,s_prime, done_mask))
s = s_prime
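
The sample() change above also casts the action batch to int64; DQN-style training typically indexes the Q-value table with gather(), which requires integer (int64) indices. A minimal illustration (hypothetical values):

import torch

# Hypothetical Q-values for a batch of 3 states with 2 actions each.
q_out = torch.tensor([[0.1, 0.9],
                      [0.4, 0.6],
                      [0.8, 0.2]])
a = torch.tensor([[1], [0], [0]], dtype=torch.int64)   # chosen actions, must be int64 for gather
q_a = q_out.gather(1, a)                               # Q-value of each chosen action, shape (3, 1)
print(q_a.squeeze(1))                                  # tensor([0.9000, 0.4000, 0.8000])
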
19 changes: 10 additions & 9 deletions ppo-lstm.py
@@ -49,7 +49,7 @@ def make_batch(self):
s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, h_in_lst, h_out_lst, done_lst = [], [], [], [], [], [], [], []
for transition in self.data:
s, a, r, s_prime, prob_a, h_in, h_out, done = transition

s_lst.append(s)
a_lst.append([a])
r_lst.append([r])
@@ -59,12 +59,14 @@ def make_batch(self):
h_out_lst.append(h_out)
done_mask = 0 if done else 1
done_lst.append([done_mask])

s,a,r,s_prime,done_mask,prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)

s, r, s_prime, prob_a, done_mask = map(torch.from_numpy,
[np.stack(s_lst), np.stack(r_lst),
np.stack(s_prime_lst), np.stack(prob_a_lst), np.stack(done_lst)])
a = torch.tensor(a_lst)

self.data = []
return s,a,r,s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0]
return s, a, r, s_prime, done_mask, prob_a, h_in_lst[0], h_out_lst[0]

def train_net(self):
s,a,r,s_prime,done_mask, prob_a, (h1_in, h2_in), (h1_out, h2_out) = self.make_batch()
@@ -106,7 +108,7 @@ def main():

for n_epi in range(10000):
h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float))
s = env.reset()
s = env.reset()[0]
done = False

while not done:
@@ -116,8 +118,7 @@ def main():
prob = prob.view(-1)
m = Categorical(prob)
a = m.sample().item()
s_prime, r, done, info = env.step(a)

s_prime, r, done, info, _ = env.step(a)
model.put_data((s, a, r/100.0, s_prime, prob[a].item(), h_in, h_out, done))
s = s_prime

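
In the make_batch() change above, the per-step arrays are first combined with np.stack and then wrapped with torch.from_numpy, which reuses the stacked array's memory and dtype instead of converting element by element. A short illustration (hypothetical rollout):

import numpy as np
import torch

# Hypothetical 20-step rollout of 4-dimensional float32 states.
s_lst = [np.random.rand(4).astype(np.float32) for _ in range(20)]

s = torch.from_numpy(np.stack(s_lst))   # shape (20, 4); shares memory with the stacked array
print(s.shape, s.dtype)                 # torch.Size([20, 4]) torch.float32
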
11 changes: 6 additions & 5 deletions ppo.py
@@ -1,4 +1,5 @@
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
@@ -50,9 +51,9 @@ def make_batch(self):
done_mask = 0 if done else 1
done_lst.append([done_mask])

s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
s,a,r,s_prime,done_mask, prob_a = torch.tensor(np.array(s_lst), dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(np.array(s_prime_lst), dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
self.data = []
return s, a, r, s_prime, done_mask, prob_a

@@ -91,14 +92,14 @@ def main():
print_interval = 20

for n_epi in range(10000):
s = env.reset()
s = env.reset()[0]
done = False
while not done:
for t in range(T_horizon):
prob = model.pi(torch.from_numpy(s).float())
m = Categorical(prob)
a = m.sample().item()
s_prime, r, done, info = env.step(a)
s_prime, r, done, info, _ = env.step(a)

model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done))
s = s_prime
20 changes: 11 additions & 9 deletions vtrace.py
@@ -44,20 +44,22 @@ def put_data(self, transition):
def make_batch(self):
s_lst, a_lst, r_lst, s_prime_lst, mu_a_lst, done_lst = [], [], [], [], [], []
for transition in self.data:
s, a, r, s_prime, mu_a, done = transition

s, a, r, s_prime, mu_a, done, info = transition
s_lst.append(s)
a_lst.append([a])
r_lst.append([r])
s_prime_lst.append(s_prime)
mu_a_lst.append([mu_a])
done_mask = 0 if done else 1
done_lst.append([done_mask])

s,a,r,s_prime,done_mask, mu_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
torch.tensor(done_lst, dtype=torch.float), torch.tensor(mu_a_lst)

s, r, s_prime, mu_a, done_mask = map(torch.from_numpy,
[np.stack(s_lst), np.stack(r_lst),
np.stack(s_prime_lst), np.stack(mu_a_lst), np.stack(done_lst)])
a = torch.tensor(a_lst)

self.data = []

return s, a, r, s_prime, done_mask, mu_a

def vtrace(self, s, a, r, s_prime, done_mask, mu_a):
Expand Down Expand Up @@ -109,16 +111,16 @@ def main():
score = 0.0

for n_epi in range(10000):
s = env.reset()
s = env.reset()[0]
done = False
while not done:
for t in range(T_horizon):
prob = model.pi(torch.from_numpy(s).float())
m = Categorical(prob)
a = m.sample().item()
s_prime, r, done, info = env.step(a)
s_prime, r, done, info, _ = env.step(a)

model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done))
model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done, info))
s = s_prime

score += r