Advanced ANN
The Vanishing Gradient Problem
As more layers using certain activation functions are added to a neural network, the gradients of the loss function approach zero, making the network hard to train.
For example, the sigmoid squashes its input into the range (0, 1) and its derivative is at most 0.25, so backpropagating through many sigmoid layers multiplies many small factors together and the gradient that reaches the early layers becomes vanishingly small.
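As a quick numerical illustration (a minimal NumPy sketch, not part of the original notebook; the pre-activation value 0.5 and the layer counts are arbitrary choices):
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# sigma'(z) = sigma(z) * (1 - sigma(z)), which is at most 0.25 (at z = 0)
z = 0.5                                   # an arbitrary pre-activation value
d_sigma = sigmoid(z) * (1 - sigmoid(z))

# backpropagating through n sigmoid layers multiplies roughly n such factors,
# so the gradient reaching the first layers shrinks geometrically with depth
for n_layers in [1, 5, 10, 20]:
    print(n_layers, d_sigma ** n_layers)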
Batch normalization is a technique for improving the performance and stability of artificial neural networks.
It normalizes the inputs to a layer by adjusting and scaling the activations.
During training, batch normalization shifts and rescales according to the mean and variance estimated on the current mini-batch.
At test time, it simply shifts and rescales according to the empirical moments (running averages) estimated during training.
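That train/test distinction can be sketched in plain NumPy (an illustration only, not from the original notebook; the momentum and epsilon values are assumptions, and the learnable scale and shift are held fixed):
import numpy as np

gamma, beta = 1.0, 0.0              # learnable scale and shift (kept fixed here for simplicity)
running_mean, running_var = 0.0, 1.0
momentum, eps = 0.9, 1e-5           # assumed hyperparameter values

def batch_norm(x, training):
    global running_mean, running_var
    if training:
        mu, var = x.mean(), x.var()                                # batch statistics
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var  = momentum * running_var  + (1 - momentum) * var
    else:
        mu, var = running_mean, running_var                        # moments stored during training
    return gamma * (x - mu) / np.sqrt(var + eps) + beta

x_batch = np.random.randn(50) * 3 + 10
print(batch_norm(x_batch, training = True).mean())    # ~0: normalized with batch statistics
print(batch_norm(x_batch, training = False).mean())   # uses the running averages instead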
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf   # TensorFlow 1.x graph-mode API (tf.placeholder, tf.Session) is used throughout
Overfitting in Regression
N = 10                           # number of training points
data_x = np.linspace(-4.5, 4.5, N)
data_y = np.array([0.9819, 0.7973, 1.9737, 0.1838, 1.3180, -0.8361, -0.6591, -2.4701, -2.8122, -6.2512])
data_x = data_x.reshape(-1,1)    # shape (N, 1) to match the network input
data_y = data_y.reshape(-1,1)
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.grid(alpha = 0.3)
plt.show()
# a deliberately large network for only 10 data points, so it can easily overfit
n_input = 1
n_hidden1 = 30
n_hidden2 = 100
n_hidden3 = 100
n_hidden4 = 30
n_output = 1
weights = {
    'hidden1' : tf.Variable(tf.random_normal([n_input, n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden1, n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden2, n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden3, n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden4, n_output], stddev = 0.1)),
}
biases = {
    'hidden1' : tf.Variable(tf.random_normal([n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1)),
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.sigmoid(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y})
        loss_record.append(c)
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
Batch Normalization Implementation
is_training = tf.placeholder(tf.bool)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases, is_training):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.layers.batch_normalization(hidden1, training = is_training)
    hidden1 = tf.nn.sigmoid(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.layers.batch_normalization(hidden2, training = is_training)
    hidden2 = tf.nn.sigmoid(hidden2)
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.layers.batch_normalization(hidden3, training = is_training)
    hidden3 = tf.nn.sigmoid(hidden3)
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.layers.batch_normalization(hidden4, training = is_training)
    hidden4 = tf.nn.sigmoid(hidden4)
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases, is_training)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
# batch normalization keeps moving averages of the mean and variance; the ops that
# update them are collected in UPDATE_OPS and must run together with the training step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optm = tf.train.AdamOptimizer(LR).minimize(loss)
# inspect the graph collections: the batch-normalization moving mean/variance appear
# under 'variables' but not 'trainable_variables', and their update ops under 'update_ops'
tf.get_default_graph().get_all_collection_keys()
tf.get_collection('trainable_variables')
tf.get_collection('variables')
tf.get_collection('update_ops')
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y, is_training: True})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, is_training: True})
        loss_record.append(c)
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, is_training: False})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
Often, overfitting is associated with very large estimated parameters $\omega$.
We want to balance two things: how well the function fits the data, and the magnitude of the coefficients.
$$
\begin{align*}
\text{Total loss } = \;&\underbrace{\text{measure of fit}}_{RSS(\omega)} + \;\lambda \cdot \underbrace{\text{measure of magnitude of coefficients}}_{\lVert \omega \rVert_d} \\ \\
\implies &\min_{\omega}\; \lVert h_{\omega} (x_i) - y \rVert_2^2 + \lambda \lVert \omega \rVert_d
\end{align*}
$$
where $ RSS(\omega) = \lVert h_{\omega} (x_i) - y \rVert^2_2 $ (the residual sum of squares) and $\lambda$ is a tuning parameter to be determined separately.
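As a sketch of how such a penalty could be attached to the network above in the same TF 1.x style (the value of $\lambda$ and the use of the squared L2 norm via tf.nn.l2_loss are choices made here, not taken from the original):
lamb = 0.01                                                        # assumed tuning parameter lambda
l2_penalty = tf.add_n([tf.nn.l2_loss(w) for w in weights.values()])   # sum of ||w||^2 / 2 over all layers
reg_loss = tf.reduce_mean(tf.square(pred - y)) + lamb * l2_penalty    # data fit + weight magnitude
optm_reg = tf.train.AdamOptimizer(LR).minimize(reg_loss)
Minimizing reg_loss instead of loss trades off the data fit against the size of the weights.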
Dropout is another way to regularize a network: during training, each hidden unit is dropped (set to zero) with probability $p$ and the surviving activations are scaled by $1/(1-p)$; at test time no units are dropped. In TensorFlow it is applied with
tf.nn.dropout(layer, rate = p)
Dropout Implementation
p = tf.placeholder(tf.float32)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases, p):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.sigmoid(hidden1)
    dropout1 = tf.nn.dropout(hidden1, rate = p)
    hidden2 = tf.add(tf.matmul(dropout1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)
    dropout2 = tf.nn.dropout(hidden2, rate = p)
    hidden3 = tf.add(tf.matmul(dropout2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)
    dropout3 = tf.nn.dropout(hidden3, rate = p)
    hidden4 = tf.add(tf.matmul(dropout3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)
    dropout4 = tf.nn.dropout(hidden4, rate = p)
    output = tf.add(tf.matmul(dropout4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases, p)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y, p: 0.2})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, p: 0.2})
        loss_record.append(c)
        #print ("Iter : {}".format(epoch))
        #print ("Train Cost : {}".format(c))
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, p: 0})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()