mabpagents.py

import random
import numpy as np


class Bandit:
    def __init__(self, payoff_probs):
        self.actions = range(len(payoff_probs))
        self.pay_offs = payoff_probs

    def sample(self, action):
        selector = random.random()
        # the reward is 1 with probability pay_offs[action], otherwise 0
        return 1 if selector <= self.pay_offs[action] else 0

def optimal_agent(bandit, iterations):
    """Always choose the best action. This agent isn't realistic, because it
    knows the true payoff probabilities, which the other agents don't.
    """
    a = np.argmax(bandit.pay_offs)
    actions = []
    rewards = []
    for _ in range(iterations):
        r = bandit.sample(a)
        actions.append(a)
        rewards.append(r)
    return actions, rewards

def explore_greedy_agent(bandit, iterations, initial_rounds=10, random_agent=False):
    """Initially explore initial_rounds times and then stick to the best action.
    """
    pay_offs = dict()
    best_action = None
    actions = []
    rewards = []

    if random_agent:
        initial_rounds = iterations

    for t in range(iterations):
        # for the initial rounds pick a random action
        if t < initial_rounds:
            a = random.choice(bandit.actions)
            r = bandit.sample(a)
            # update the rewards observed for this action
            pay_offs.setdefault(a, []).append(r)
        # otherwise pick the best one thus far
        else:
            if best_action is None:
                # find the action with the best average payoff
                best_action = max(pay_offs, key=lambda k: np.mean(pay_offs[k]))
            a = best_action
            r = bandit.sample(a)
        actions.append(a)
        rewards.append(r)
    return actions, rewards

def random_agent(bandit, iterations):
    """Choose every action at random by calling explore_greedy_agent with
    random_agent=True, which keeps it exploring for all rounds.
    """
    return explore_greedy_agent(bandit, iterations, random_agent=True)

def epsilon_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=10, decay=1, optimistic=False):
    """Use the epsilon-greedy algorithm: perform the action with the best
    average payoff with probability (1 - epsilon), otherwise pick a random
    action to keep exploring.
    """
    pay_offs = dict()
    actions = []
    rewards = []
    return actions, rewards
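
# epsilon_greedy_agent above is left as a stub. The function below is a
# minimal sketch of one possible implementation, not the original author's:
# the name epsilon_greedy_agent_sketch is hypothetical, and treating
# initial_rounds as a forced exploration phase is an assumption.
def epsilon_greedy_agent_sketch(bandit, iterations, epsilon=0.2, initial_rounds=10):
    pay_offs = dict()
    actions = []
    rewards = []
    for t in range(iterations):
        # explore: forced during the initial rounds, then with probability epsilon
        if t < initial_rounds or random.random() < epsilon:
            a = random.choice(bandit.actions)
        # exploit: the action with the best average payoff so far
        else:
            a = max(pay_offs, key=lambda k: np.mean(pay_offs[k]))
        r = bandit.sample(a)
        pay_offs.setdefault(a, []).append(r)
        actions.append(a)
        rewards.append(r)
    return actions, rewards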

def decaying_epsilon_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=1, decay=0.99):
    """Use epsilon greedy with decaying epsilon.
    """
    pay_offs = dict()
    actions = []
    rewards = []
    return actions, rewards
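
# decaying_epsilon_greedy_agent above is left as a stub. A minimal sketch,
# assuming epsilon is multiplied by decay after every round; the name
# decaying_epsilon_greedy_agent_sketch is hypothetical.
def decaying_epsilon_greedy_agent_sketch(bandit, iterations, epsilon=0.2, initial_rounds=1, decay=0.99):
    pay_offs = dict()
    actions = []
    rewards = []
    for t in range(iterations):
        if t < initial_rounds or random.random() < epsilon:
            a = random.choice(bandit.actions)
        else:
            a = max(pay_offs, key=lambda k: np.mean(pay_offs[k]))
        r = bandit.sample(a)
        pay_offs.setdefault(a, []).append(r)
        actions.append(a)
        rewards.append(r)
        # shrink epsilon so the agent explores less as estimates improve
        epsilon *= decay
    return actions, rewards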

def optimistic_greedy_agent(bandit, iterations, epsilon=0.2, initial_rounds=1, decay=0.99):
    """Start with high initial Q values to make the agent explore more at the beginning.
    """
    actions = []
    rewards = []
    return actions, rewards
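
# optimistic_greedy_agent above is left as a stub. A minimal sketch of
# optimistic initialisation, assuming every Q value starts at 1.0 (the maximum
# possible reward) and the agent then acts purely greedily; the function name
# and the initial_q parameter are assumptions.
def optimistic_greedy_agent_sketch(bandit, iterations, initial_q=1.0):
    q = {a: initial_q for a in bandit.actions}   # optimistic initial estimates
    counts = {a: 0 for a in bandit.actions}
    actions = []
    rewards = []
    for _ in range(iterations):
        # greedy choice; the inflated initial Q values force early exploration
        a = max(q, key=q.get)
        r = bandit.sample(a)
        counts[a] += 1
        # incremental average: Q <- Q + (r - Q) / n
        q[a] += (r - q[a]) / counts[a]
        actions.append(a)
        rewards.append(r)
    return actions, rewards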

def ucb_agent(bandit, iterations, c=0.3):
    """Use the Q-value of the action (average reward so far, i.e. total reward
    divided by the number of times the action was chosen) for exploitation,
    and c * sqrt(ln(t) / N_t(a)) for exploration.
    """
    pay_offs = dict()
    actions = []
    rewards = []
    return actions, rewards
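
# ucb_agent above is left as a stub. A minimal UCB1-style sketch of the
# docstring: exploitation via the average reward, exploration via
# c * sqrt(ln(t) / N_t(a)). The function name is hypothetical, and playing
# each action once first (so that N_t(a) > 0) is an assumption.
def ucb_agent_sketch(bandit, iterations, c=0.3):
    q = {a: 0.0 for a in bandit.actions}      # average reward per action
    counts = {a: 0 for a in bandit.actions}   # N_t(a)
    actions = []
    rewards = []
    for t in range(1, iterations + 1):
        if t <= len(q):
            # play every action once before trusting the confidence bound
            a = list(bandit.actions)[t - 1]
        else:
            a = max(q, key=lambda k: q[k] + c * np.sqrt(np.log(t) / counts[k]))
        r = bandit.sample(a)
        counts[a] += 1
        q[a] += (r - q[a]) / counts[a]
        actions.append(a)
        rewards.append(r)
    return actions, rewards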

def thompson_agent(bandit, iterations):
    """Build a probability model from the obtained rewards and sample from it
    to choose an action. The model provides a level of confidence in the
    rewards, and that confidence increases as more samples are collected.
    """
    pay_offs = dict()
    actions = []
    rewards = []
    return actions, rewards
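
# thompson_agent above is left as a stub. A minimal Thompson-sampling sketch
# for Bernoulli rewards, assuming a Beta(1, 1) prior per action; the function
# name and the choice of prior are assumptions.
def thompson_agent_sketch(bandit, iterations):
    successes = {a: 1 for a in bandit.actions}  # alpha: prior + observed 1s
    failures = {a: 1 for a in bandit.actions}   # beta: prior + observed 0s
    actions = []
    rewards = []
    for _ in range(iterations):
        # sample a plausible payoff for each action from its Beta posterior
        # and play the action whose sample is highest
        samples = {a: np.random.beta(successes[a], failures[a]) for a in bandit.actions}
        a = max(samples, key=samples.get)
        r = bandit.sample(a)
        if r == 1:
            successes[a] += 1
        else:
            failures[a] += 1
        actions.append(a)
        rewards.append(r)
    return actions, rewards

# Example usage (hypothetical payoff probabilities):
#   bandit = Bandit([0.3, 0.5, 0.8])
#   actions, rewards = thompson_agent_sketch(bandit, 1000)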