Deep Q-Learning
Value iteration algorithm: use the Bellman equation as an iterative update
$$ Q_{k+1}(s,a) \; \leftarrow \; \mathbb{E}_{s' \sim P\left(s'\mid s,a\right)} \left[ R(s) + \gamma \max_{a'} Q_{k}\left(s',a'\right) \right] $$
$Q_k$ will converge to $Q_*$ as $k$ goes to $\infty$
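As an aside (not part of the original notes), the tabular version of this update is easy to write directly in NumPy; P and R below are placeholder arrays for a known transition model and reward:

import numpy as np

# Illustrative tabular Q-value iteration, assuming the model is known:
# P[s, a, s'] holds transition probabilities and R[s] the reward (placeholder arrays)
def q_value_iteration(P, R, gamma = 0.9, n_iter = 100):
    n_states, n_actions, _ = P.shape
    Q = np.zeros((n_states, n_actions))
    for _ in range(n_iter):
        # Bellman update: Q(s,a) <- R(s) + gamma * sum_s' P(s'|s,a) * max_a' Q(s',a')
        Q = R[:, None] + gamma*(P @ np.max(Q, axis = 1))
    return Q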
Instead of a table, we have a parameterized Q function: $Q_{\omega}(s,a) \approx Q_*(s,a)$
$$Q_{\omega}(s,a) = \omega_0 f_0(s,a) + \omega_1 f_1(s,a) + \cdots + \omega_n f_n(s,a)$$
Learning rule: given an observed transition $\left(s,a,r,s'\right)$, form the target

$$\text{target}\left(s'\right) = R(s) + \gamma \max_{a'}Q_{\omega}{\left(s',a'\right)}$$

and take a gradient step on the squared error between the target and $Q_{\omega}(s,a)$:
$$
\begin{align*}
\omega_{k+1} \; &\leftarrow \; \omega_k - \alpha \; \nabla_{\omega} \ell(\omega) \qquad \qquad \implies \text{Deep Learning Framework}\\ \\
\; &\leftarrow \; \omega_k - \alpha \; \nabla_{\omega} \left[ \frac{1}{2} \left(\text{target}\left(s'\right) - Q_{\omega}(s,a)\right)^2 \right] \\ \\
\; &\leftarrow \; \omega_k + \alpha \; \left[ \left(\text{target}\left(s'\right) - Q_{\omega}(s,a)\right) \right] \nabla_{\omega} Q_{\omega}(s,a)
\end{align*}
$$
Example: with the linear parameterization of $Q_{\omega}$ above, this update is exactly a stochastic gradient descent step for linear regression onto the target.
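As an illustrative sketch (not from the original notes) of one such update step, with a placeholder feature map phi and a placeholder action set:

import numpy as np

# One Q-learning step with the linear model Q_w(s,a) = w . phi(s,a).
# The feature map `phi` and the list of actions are placeholders for illustration.
def q_learning_step(w, phi, transition, actions, gamma = 0.9, alpha = 0.01):
    s, a, r, s_next = transition
    target = r + gamma*max(w @ phi(s_next, a2) for a2 in actions)   # target(s')
    td_error = target - w @ phi(s, a)                               # target - Q_w(s,a)
    return w + alpha*td_error*phi(s, a)                             # gradient step on 0.5*td_error^2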
The DQN method (Q-learning with a deep neural network as the Q-function approximator) became famous in 2013 for learning to play a wide variety of Atari games, many of them better than human players.
%%html
<center><iframe src="https://www.youtube.com/embed/V1eYniJ0Rnk?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
# !pip install gym
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
import gym
env = gym.make('CartPole-v1')
n_input = env.observation_space.shape[0] # 4
n_actions = env.action_space.n # 2
n_output = n_actions
n_hidden1 = 100
n_hidden2 = 50
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
weights = {
    'hidden1' : tf.Variable(tf.random_normal([n_input, n_hidden1], stddev = 0.01)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden1, n_hidden2], stddev = 0.01)),
    'output' : tf.Variable(tf.random_normal([n_hidden2, n_output], stddev = 0.01))
}

biases = {
    'hidden1' : tf.Variable(tf.random_normal([n_hidden1], stddev = 0.01)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden2], stddev = 0.01)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.01))
}
def Qnet(x, weights, biases):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.relu(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.relu(hidden2)
    output = tf.add(tf.matmul(hidden2, weights['output']), biases['output'])
    return output
Qpred = Qnet(x, weights, biases)
loss = tf.reduce_mean(tf.square(y - Qpred))
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
gamma = 0.9
for episode in range(801):
    done = False
    state = env.reset()
    count = 0

    while not done:
        count += 1
        state = np.reshape(state, [1, n_input])
        Q = sess.run(Qpred, feed_dict = {x: state})

        # Exploration vs. Exploitation for action
        epsilon = 0.1
        if np.random.uniform() < epsilon:
            a = env.action_space.sample()
        else:
            a = np.argmax(Q)

        # next state and reward
        next_state, reward, done, _ = env.step(a)

        if done:
            # penalize termination (pole fell or cart left the track)
            reward = -200
            Q[0][a] = reward
        else:
            next_state = np.reshape(next_state, [1, n_input])
            next_Q = sess.run(Qpred, feed_dict = {x: next_state})
            # Q-learning target: r + gamma * max_a' Q(s', a')
            Q[0][a] = reward + gamma*np.max(next_Q)

        # one gradient step toward the target for this single transition
        sess.run(optm, feed_dict = {x: state, y: Q})
        state = next_state

    if episode % 100 == 0:
        print("Episode: {} steps: {}".format(episode, count))
env.close()
state = env.reset()
done = False
while not done:
    env.render()
    state = np.reshape(state, [1, n_input])
    Q = sess.run(Qpred, feed_dict = {x: state})
    action = np.argmax(Q)
    next_state, reward, done, _ = env.step(action)
    state = next_state
env.close()
sess.close()
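In the loop above, the network is updated on each transition immediately after it is observed, so consecutive updates are highly correlated and training can be unstable. The next cell adds experience replay: transitions are stored in a buffer and the network is trained on randomly sampled minibatches. A minimal standalone sketch of such a buffer is shown here (the cell below implements the same idea with a plain Python list and pop(0)):

import random
from collections import deque

# Minimal replay-buffer sketch: a bounded store of transitions from which
# minibatches are sampled for training.
buffer = deque(maxlen = 5000)                       # oldest transitions are dropped automatically

def store(state, action, reward, next_state, done):
    buffer.append((state, action, reward, next_state, done))

def sample_minibatch(n_batch = 10):
    return random.sample(list(buffer), n_batch)     # uniform sampling breaks temporal correlation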
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import random
from collections import deque
import gym
%matplotlib inline
env = gym.make('CartPole-v1')
n_input = env.observation_space.shape[0] # 4
n_actions = env.action_space.n # 2
n_output = n_actions
n_hidden1 = 100
n_hidden2 = 50
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
weights = {
    'hidden1' : tf.Variable(tf.random_normal([n_input, n_hidden1], stddev = 0.01)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden1, n_hidden2], stddev = 0.01)),
    'output' : tf.Variable(tf.random_normal([n_hidden2, n_output], stddev = 0.01))
}

biases = {
    'hidden1' : tf.Variable(tf.random_normal([n_hidden1], stddev = 0.01)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden2], stddev = 0.01)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.01))
}
def DQN(x, weights, biases):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.relu(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.relu(hidden2)
    output = tf.add(tf.matmul(hidden2, weights['output']), biases['output'])
    return output
Qpred = DQN(x, weights, biases)
loss = tf.reduce_mean(tf.square(y - Qpred))
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
replay_buffer = []
n_buffer = 5000
n_batch = 10
gamma = 0.9
for episode in range(2001):
    done = False
    state = env.reset()
    count = 0

    while not done:
        count += 1
        Q = sess.run(Qpred, feed_dict = {x: np.reshape(state, [1, n_input])})

        # Exploration vs. Exploitation for action
        epsilon = 0.1
        if np.random.uniform() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q)

        next_state, reward, done, _ = env.step(action)

        if done:
            # penalize termination
            reward = -200
            #next_state = np.zeros(state.shape)

        # store the transition in the replay buffer (drop the oldest when full)
        replay_buffer.append([state, action, reward, next_state, done])
        if len(replay_buffer) > n_buffer:
            replay_buffer.pop(0)

        state = next_state

    # every n_batch episodes, train on random minibatches drawn from the buffer
    if episode % n_batch == 1:
        for _ in range(50):
            if len(replay_buffer) < n_batch:
                break

            minibatch = random.sample(replay_buffer, n_batch)

            x_stack = np.empty(0).reshape(0, n_input)
            y_stack = np.empty(0).reshape(0, n_output)

            for state, action, reward, next_state, done in minibatch:
                Q = sess.run(Qpred, feed_dict = {x: np.reshape(state, [1, n_input])})

                if done:
                    Q[0][action] = reward
                else:
                    next_Q = sess.run(Qpred, feed_dict = {x: np.reshape(next_state, [1, n_input])})
                    Q[0][action] = reward + gamma*np.max(next_Q)

                x_stack = np.vstack([x_stack, state])
                y_stack = np.vstack([y_stack, Q])

            sess.run(optm, feed_dict = {x: x_stack, y: y_stack})

    if episode % 200 == 1:
        print("Episode: {} steps: {}".format(episode, count))
env.close()
state = env.reset()
done = False
while not done:
    env.render()
    state = np.reshape(state, [1, n_input])
    Q = sess.run(Qpred, feed_dict = {x: state})
    action = np.argmax(Q)
    next_state, reward, done, _ = env.step(action)
    state = next_state
env.close()
sess.close()
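Both training loops above keep $\epsilon$ fixed at 0.1. A common refinement, not implemented in these cells, is to decay $\epsilon$ over episodes so that the agent explores heavily at first and exploits more as the Q estimates improve; one simple, purely illustrative schedule:

# Illustrative epsilon-decay schedule (not used in the cells above)
def epsilon_by_episode(episode, eps_start = 1.0, eps_end = 0.05, decay = 0.995):
    return max(eps_end, eps_start*decay**episode)

# epsilon_by_episode(0) -> 1.0, epsilon_by_episode(500) -> ~0.08, epsilon_by_episode(2000) -> 0.05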
%%html
<center><iframe src="https://www.youtube.com/embed/PDkT3gYFl-4?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/9YGfMOq7zaY?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/CVyi3qi4iCI?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/9EEqapUaX9E?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>