qlearning.py
from RobotHoop import RobotHoop
import random
import time

import numpy as np
import matplotlib.pyplot as plt

# Tabular Q-learning on the RobotHoop environment.
# The Q table is indexed by the discretized state (four continuous components
# scaled by 10, one binary flag) and by the action index.
Q = np.zeros((40, 40, 21, 21, 2, 10))
discount_factor = .2
alpha = .6      # learning rate
epsilon = .2    # exploration probability for epsilon-greedy action selection
iterations = 100000
maxsteps = 1000

# The 10 discrete actions; each tuple is unpacked as the arguments to env.step().
actions = [('f', 'f', False), ('f', 's', False), ('f', '', False),
           ('s', 'f', False), ('s', 's', False), ('s', '', False),
           ('', 'f', False), ('', 's', False), ('', '', False),
           ('', '', True)]

rewards = np.zeros(iterations)   # total reward collected in each episode
scores = np.zeros(iterations)    # running win rate, recorded on winning episodes
scoretot = 0                     # number of wins so far


def q_index(state):
    """Discretize a continuous state into an index tuple for the Q table."""
    return (int(10 * state[0]), int(10 * state[1]),
            int(10 * state[2]), int(10 * state[3]), int(state[4]))


for i in range(iterations):
    # Visualize every 1000th episode.
    dovis = (i % 1000 == 999)
    env = RobotHoop(-2, .5, .5, dovis)
    # Optional decaying schedules (disabled):
    # alpha = 1 / (i + 1)
    # epsilon = 1 / math.sqrt(i + 1)
    for j in range(maxsteps):
        initial_state = env.state()
        actions_q = Q[q_index(initial_state)]

        # Epsilon-greedy action selection.
        if random.uniform(0, 1) > epsilon:
            action = int(np.argmax(actions_q))
        else:
            action = random.randrange(len(actions))
        old_q = actions_q[action]

        res = env.step(*actions[action])
        rewards[i] += res['reward']

        # Q-learning update:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        new_actions_q = Q[q_index(env.state())]
        new_q = (1 - alpha) * old_q + alpha * (res['reward'] + discount_factor * new_actions_q.max())
        Q[q_index(initial_state) + (action,)] = new_q

        if res['end']:
            if res['reward'] == 100:
                print('win')
                scoretot += 1
                # Fraction of episodes won so far; i + 1 avoids division by zero.
                scores[i] = scoretot / (i + 1)
            break

        if dovis:
            env.vis()
            time.sleep(.1)
    if dovis:
        env.closeFig()

print(max(rewards))
plt.plot(scores)
plt.show()
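

# --- Optional: greedy rollout of the learned policy (a sketch added here, not part
# of the original training script). It assumes the same RobotHoop constructor and the
# step()/state()/vis()/closeFig() calls used above, with exploration turned off so the
# agent always takes its best known action.
def evaluate(episodes=10, render=False):
    wins = 0
    for _ in range(episodes):
        env = RobotHoop(-2, .5, .5, render)
        for _ in range(maxsteps):
            state = env.state()
            action = int(np.argmax(Q[q_index(state)]))   # greedy action, epsilon = 0
            res = env.step(*actions[action])
            if render:
                env.vis()
                time.sleep(.1)
            if res['end']:
                wins += res['reward'] == 100
                break
        if render:
            env.closeFig()
    return wins / episodes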