-
Notifications
You must be signed in to change notification settings - Fork 1.7k
/
q_learning_agent.py
112 lines (88 loc) · 3.8 KB
/
q_learning_agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from collections import defaultdict
import random
import math
import numpy as np
class QLearningAgent:
def __init__(self, alpha, epsilon, discount, get_legal_actions):
"""
Q-Learning Agent
based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html
Instance variables you have access to
- self.epsilon (exploration prob)
- self.alpha (learning rate)
- self.discount (discount rate aka gamma)
Functions you should use
- self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
which returns legal actions for a state
- self.get_qvalue(state,action)
which returns Q(state,action)
- self.set_qvalue(state,action,value)
which sets Q(state,action) := value
!!!Important!!!
Note: please avoid using self._qValues directly.
There's a special self.get_qvalue/set_qvalue for that.
"""
self.get_legal_actions = get_legal_actions
self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
self.alpha = alpha
self.epsilon = epsilon
self.discount = discount
def get_qvalue(self, state, action):
""" Returns Q(state,action) """
return self._qvalues[state][action]
def set_qvalue(self, state, action, value):
""" Sets the Qvalue for [state,action] to the given value """
self._qvalues[state][action] = value
def get_value(self, state):
"""
Compute your agent's estimate of V(s) using current q-values
V(s) = max_over_action Q(state,action) over possible actions.
Note: please take into account that q-values can be negative.
"""
possible_actions = self.get_legal_actions(state)
# If there are no legal actions, return 0.0
if len(possible_actions) == 0:
return 0.0
value = max([self.get_qvalue(state, a) for a in possible_actions])
return value
def update(self, state, action, reward, next_state, done):
"""
You should do your Q-Value update here:
Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
"""
# agent parameters
gamma = self.discount
learning_rate = self.alpha
q = reward + gamma * (1 - done) * self.get_value(next_state)
q = (1 - learning_rate) * self.get_qvalue(state, action) + learning_rate * q
self.set_qvalue(state, action, q)
def get_best_action(self, state):
"""
Compute the best action to take in a state (using current q-values).
"""
possible_actions = self.get_legal_actions(state)
# If there are no legal actions, return None
if len(possible_actions) == 0:
return None
idx = np.argmax([self.get_qvalue(state, a) for a in possible_actions])
return possible_actions[idx]
def get_action(self, state):
"""
Compute the action to take in the current state, including exploration.
With probability self.epsilon, we should take a random action.
otherwise - the best policy action (self.get_best_action).
Note: To pick randomly from a list, use random.choice(list).
To pick True or False with a given probablity, generate uniform number in [0, 1]
and compare it with your probability
"""
# Pick Action
possible_actions = self.get_legal_actions(state)
action = None
# If there are no legal actions, return None
if len(possible_actions) == 0:
return None
# agent parameters:
epsilon = self.epsilon
if np.random.rand() < epsilon:
return np.random.choice(possible_actions)
return self.get_best_action(state)