-
Notifications
You must be signed in to change notification settings - Fork 6
/
MonteCarloPlayer.py
75 lines (59 loc) · 2.81 KB
/
MonteCarloPlayer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
from numpy import random
from BlackJack import BlackJack
from Player import Player, HIT, STICK
from copy import deepcopy
class MonteCarloPlayer(Player):
    """Blackjack player that learns an action-value function with
    every-visit Monte Carlo control under an epsilon-greedy policy.

    Q and N are 4-D arrays indexed by
    (dealer's first card - 1, player total - 1, usable-ace flag, action),
    where action is HIT or STICK from the Player module.
    """

    def __init__(self):
        Player.__init__(self)
        # Action-value estimates, randomly initialised to break symmetry.
        self.Q = np.random.randn(11, 21, 2, 2)
        # Visit counts per (state, action); drive both the MC step size
        # (alpha = 1/N) and the epsilon decay schedule.
        self.N = np.zeros([11, 21, 2, 2])
        self.epsilon = 1.      # exploration rate, decayed as visits accumulate
        self.last_reward = 0   # most recent reward delivered by the game

    def _state_index(self, dealers_first_card, total, n_aces):
        """Map raw game observations to a (dealer, total, usable_ace) index
        tuple into Q/N.  The ace count is clamped to a binary usable-ace flag."""
        return (dealers_first_card - 1, total - 1, min(n_aces, 1))

    def choose_action(self, state):
        """Return HIT or STICK for ``state`` using an epsilon-greedy policy.

        With probability epsilon a uniformly random action is taken; ties
        in Q are also broken randomly.

        Raises:
            ValueError: if ``state`` is None (no game associated).
            (Py3 fix: the original raised StandardError, which was removed
            in Python 3 and would itself produce a NameError.)
        """
        if state is None:
            raise ValueError("No game associated to player")
        random_action = random.randint(0, 2)
        choice = random.choice(['RAND', 'GREEDY'], p=[self.epsilon, 1 - self.epsilon])
        if choice == 'RAND':
            return random_action
        idx = self._state_index(state[0], self.current_total, self.number_of_aces_used)
        q_hit = self.Q[idx][HIT]
        q_stick = self.Q[idx][STICK]
        if q_hit == q_stick:
            return random_action  # break exact ties randomly
        return HIT if q_hit > q_stick else STICK

    def receive_reward(self, reward):
        """Record the reward from the most recent environment step."""
        self.last_reward = reward

    def run_episode(self):
        """Play one game of blackjack, updating Q and N online.

        Updates use the incremental MC rule with step size
        alpha = 1 / N(s, a), and epsilon decays as 100 / (100 + N(s, a)).
        Returns the final reward of the episode.
        """
        game = BlackJack([self])
        self.current_total = 0
        self.number_of_aces_used = 0
        action = self.choose_action(game.get_current_state())
        old_state = deepcopy(game.get_current_state())
        # Plain assignment suffices for ints (no deepcopy needed).
        old_total = self.current_total
        old_n_aces = self.number_of_aces_used
        reward = 0
        # Accumulate updates in a copy so choose_action keeps seeing the
        # pre-episode Q for the whole episode.
        new_q = deepcopy(self.Q)
        while not game.game_over:
            game.step([action])
            reward = self.last_reward
            idx = self._state_index(old_state[0], old_total, old_n_aces) + (action,)
            self.N[idx] += 1
            alpha = 1. / self.N[idx]  # MC step size 1/N(s, a)
            new_q[idx] += alpha * (reward - self.Q[idx])
            # Decay exploration per state-action visit.
            self.epsilon = 100. / (100 + self.N[idx])
            action = self.choose_action(game.get_current_state()) if not game.game_over else 0
            old_state = deepcopy(game.get_current_state())
            old_total = self.current_total
            old_n_aces = self.number_of_aces_used
        self.Q = new_q
        return reward

    def run_episodes(self, n):
        """Run ``n`` training episodes and return the learned (Q, N) arrays."""
        for _ in range(n):
            self.run_episode()
        return self.Q, self.N