policy.py
import numpy as np
import torch
class policy():
    def __init__(self, network):
        self.network = network

    def initialize(self):
        # Xavier initialization; assumes the wrapped module exposes a single
        # `weight` tensor (e.g. it is one nn.Linear layer).
        torch.nn.init.xavier_uniform_(self.network.weight)

    def inference(self, state):
        # Run the network without tracking gradients; move the input to the
        # GPU and the outputs back to the CPU before converting to NumPy.
        with torch.no_grad():
            a, p = self.network(state.cuda())
        a1 = a.cpu().numpy()
        p1 = p.cpu().numpy()
        # Indices of the maximal action logits (3-action task).
        result = np.where(a1 == np.amax(a1))
        temp = np.array([0, 0, 0])
        if len(result[0]) == 1:
            temp[result[0][0]] = 1
        elif len(result[0]) == 2:
            # Two-way tie: keep either maximal entry with probability 1/2.
            dice = np.random.uniform()
            if dice > 0.5:
                temp[result[0][0]] = 1
            else:
                temp[result[0][1]] = 1
        else:
            # Three-way tie: keep each maximal entry with probability 1/3.
            dice = np.random.uniform()
            if dice < 1 / 3:
                temp[result[0][0]] = 1
            elif dice < 2 / 3:
                temp[result[0][1]] = 1
            else:
                temp[result[0][2]] = 1
        # Mask the logits down to the selected action and append the second head.
        a1 = a1 * temp
        vec = np.concatenate([a1, p1])
        return vec
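
A minimal usage sketch, assuming the wrapped module returns a pair of tensors (action logits for a 3-action task plus a second output head) and that a CUDA device is available. The toy `Net` module, its layer sizes, and the input shape are illustrative assumptions, not part of the original file.

# Usage sketch (hypothetical module and shapes):
import torch
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.action_head = nn.Linear(4, 3)
        self.value_head = nn.Linear(4, 1)

    def forward(self, x):
        return self.action_head(x), self.value_head(x)

net = Net().cuda()
pol = policy(net)
state = torch.randn(4)
vec = pol.inference(state)  # masked action logits followed by the value output

Note that initialize() assumes a module with a single `weight` tensor, so it is skipped for this two-headed toy module. As a design note, the three tie-breaking branches are equivalent to sampling one index uniformly from result[0], e.g. temp[np.random.choice(result[0])] = 1, which also generalizes beyond three actions.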