# mabpsol.py
# https://www.offerzen.com/blog/how-to-build-a-product-recommender-using-multi-armed-bandit-algorithms
# https://github.com/WhatIThinkAbout/BabyRobot/blob/master/Multi_Armed_Bandits/Part%205%20-%20Thompson%20Sampling.ipynb
import random

import numpy as np


class Bandit:
    def __init__(self, payoff_probs):
        self.actions = range(len(payoff_probs))
        self.pay_offs = payoff_probs

    def sample(self, action):
        selector = random.random()
        # 1 or 0 are the rewards
        return 1 if selector <= self.pay_offs[action] else 0
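
# Usage sketch (the payoff probabilities below are made-up example values):
#   bandit = Bandit([0.1, 0.3, 0.8])
#   bandit.sample(2)  # -> 1 with probability 0.8, otherwise 0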


def explore_greedy_agent(bandit, iterations, initial_rounds=10, random_agent=False):
    """Initially explore initial_rounds times and then stick to the best action.
    """
    pay_offs = dict()
    best_action = -1
    actions = []
    rewards = []

    if random_agent:
        initial_rounds = iterations

    for t in range(iterations):
        # for the initial rounds pick a random action
        if t < initial_rounds:
            a = random.choice(bandit.actions)
            r = bandit.sample(a)

            # update rewards
            if a in pay_offs:
                pay_offs[a].append(r)
            else:
                pay_offs[a] = [r]
        # otherwise pick the best one thus far
        else:
            if best_action == -1:
                # check for the action with the best average payoff
                mean_dict = {}
                for key, val in pay_offs.items():
                    mean_dict[key] = np.mean(val)
                best_action = max(mean_dict, key=mean_dict.get)
            a = best_action
            r = bandit.sample(a)

        actions.append(a)
        rewards.append(r)

    return actions, rewards


def random_agent(bandit, iterations):
    """Call explore_greedy_agent with random_agent=True so that every action is chosen at random.
    """
    return explore_greedy_agent(bandit, iterations, random_agent=True)


def epsilon_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=10, decay=1, optimistic=False):
    """Use the epsilon-greedy algorithm: with probability (1 - epsilon) perform the action
    with the best average payoff so far, otherwise pick a random action to keep exploring.
    """
    pay_offs = dict()
    actions = []
    rewards = []

    if optimistic:
        # optimistic initialisation: pretend every action has already paid off once
        pay_offs = {a: [1] for a in bandit.actions}

    for t in range(iterations):
        # sometimes randomly pick an action to explore
        if random.random() < epsilon or t < initial_rounds:
            a = random.choice(bandit.actions)
        # otherwise pick the best one thus far
        else:
            # check for the action with the best average payoff
            new_dict = {}
            for key, val in pay_offs.items():
                new_dict[key] = np.mean(val)
            a = max(new_dict, key=new_dict.get)
        r = bandit.sample(a)

        # update rewards
        if a in pay_offs:
            pay_offs[a].append(r)
        else:
            pay_offs[a] = [r]

        epsilon *= decay

        actions.append(a)
        rewards.append(r)

    return actions, rewards


def decaying_epsilon_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=1, decay=0.99):
    """Use epsilon greedy with decay.
    """
    return epsilon_greedy_agent(bandit, iterations, epsilon=epsilon, initial_rounds=initial_rounds, decay=decay)
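
# Worked example (a sketch with made-up numbers): with epsilon=0.2 and decay=0.99,
# after 100 steps epsilon has shrunk to roughly 0.2 * 0.99**100 ≈ 0.073, so the
# agent explores less and exploits more as time goes on.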


def optimistic_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=1, decay=0.99):
    """Start with high initial Q-values to make the agent explore more in the beginning.
    """
    return epsilon_greedy_agent(bandit, iterations, epsilon=epsilon, initial_rounds=initial_rounds, decay=decay, optimistic=True)


def optimal_agent(bandit, iterations):
    """Always choose the best action. This agent isn't realistic, because it has
    knowledge that the other agents don't have.
    """
    a = np.argmax(bandit.pay_offs)
    actions = []
    rewards = []

    for _ in range(iterations):
        r = bandit.sample(a)
        actions.append(a)
        rewards.append(r)

    return actions, rewards


def ucb_agent(bandit, iterations, c=0.3):
    """Use the Q-value of each action (total reward so far divided by the number of
    times it was chosen) as the exploitation term, plus c * sqrt(ln(t) / N_t(a)) as
    the exploration bonus.
    """
    pay_offs = dict()
    no_of_times_action_chosen = {a: 0 for a in bandit.actions}
    q_values = {a: 0 for a in bandit.actions}
    actions = []
    rewards = []

    # first play all the actions twice to avoid a division-by-zero error
    for a in bandit.actions:
        for _ in range(2):
            r = bandit.sample(a)
            if a in pay_offs:
                pay_offs[a].append(r)
            else:
                pay_offs[a] = [r]
            actions.append(a)
            rewards.append(r)
            no_of_times_action_chosen[a] += 1
            q_values[a] = sum(pay_offs[a]) / no_of_times_action_chosen[a]

    for t in range(2 * len(bandit.actions), iterations):
        values = [q_values[a] + c * np.sqrt(np.log(t) / no_of_times_action_chosen[a]) for a in bandit.actions]
        a = np.argmax(values)
        r = bandit.sample(a)
        pay_offs[a].append(r)
        actions.append(a)
        rewards.append(r)

        # update q values and no_of_times_action_chosen
        no_of_times_action_chosen[a] += 1
        q_values[a] = sum(pay_offs[a]) / no_of_times_action_chosen[a]

    return actions, rewards
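
# Worked example (a sketch with made-up numbers): with c = 0.3, at step t = 10 an arm
# with q-value 0.5 that has been chosen 4 times scores 0.5 + 0.3 * sqrt(ln(10) / 4) ≈ 0.73,
# so a rarely-tried arm can temporarily outrank one with a higher empirical mean.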


def thompson_agent(bandit, iterations):
    """Build up a probability model (a Beta distribution per action) from the obtained
    rewards, and then sample from it to choose an action. The model provides a level of
    confidence in the rewards, and that confidence increases as more samples are collected.
    """
    pay_offs = dict()
    actions = []
    rewards = []
    alphas = {a: 1 for a in bandit.actions}
    betas = {a: 1 for a in bandit.actions}

    for t in range(iterations):
        # draw one sample per action from its Beta posterior and pick the highest draw
        a = np.argmax([np.random.beta(alphas[i], betas[i]) for i in bandit.actions])
        r = bandit.sample(a)

        if a in pay_offs:
            pay_offs[a].append(r)
        else:
            pay_offs[a] = [r]

        actions.append(a)
        rewards.append(r)

        # successes raise alpha, failures raise beta
        alphas[a] += r
        betas[a] += (1 - r)

    return actions, rewards
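

if __name__ == "__main__":
    # Minimal demo sketch (not part of the original agents above): the payoff
    # probabilities and iteration count are made-up values chosen only for illustration.
    probs = [0.10, 0.25, 0.70]
    bandit = Bandit(probs)
    iterations = 1000

    agents = {
        "random": random_agent,
        "explore-then-greedy": explore_greedy_agent,
        "epsilon-greedy": epsilon_greedy_agent,
        "decaying epsilon-greedy": decaying_epsilon_greedy_agent,
        "optimistic greedy": optimistic_greedy_agent,
        "UCB": ucb_agent,
        "Thompson sampling": thompson_agent,
        "optimal (oracle)": optimal_agent,
    }

    for name, agent in agents.items():
        _, rewards = agent(bandit, iterations)
        print(f"{name:>25}: total reward = {sum(rewards)}")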