Advanced ANN


By Prof. Seungchul Lee
http://iai.postech.ac.kr/
Industrial AI Lab at POSTECH

Table of Contents

1. Nonlinear Activation Function

  • The Vanishing Gradient Problem

  • As more layers using certain activation functions are added to neural networks, the gradients of the loss function approach zero, making the network hard to train.

  • For example,

$$\frac{\partial z}{\partial u} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x} \cdot \frac{\partial x}{\partial \omega} \cdot \frac{\partial \omega}{\partial u} $$

  • Each factor in this chain-rule product is an activation derivative; when every factor is smaller than one (as for sigmoid or tanh), the product shrinks exponentially with depth.




  • Rectifiers
  • The use of the ReLU activation function was a great improvement over the historically used tanh.




  • This can be explained by the derivative of ReLU itself not vanishing for positive inputs, and by the resulting representations being sparse (Glorot et al., 2011).
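The effect can be checked numerically. Below is a minimal NumPy sketch (not part of the original notebook) comparing the backpropagated gradient magnitude through stacked sigmoid layers versus ReLU layers; the depths and the pre-activation value z = 0.5 are arbitrary choices for illustration, and the weights are taken to be 1 so that only the activation derivatives matter.

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def d_sigmoid(z):
    s = sigmoid(z)
    return s * (1 - s)              # never exceeds 0.25

def d_relu(z):
    return (z > 0).astype(float)    # exactly 1 for positive inputs

z = 0.5                             # arbitrary pre-activation value
for depth in [2, 5, 10, 20]:
    print(depth, d_sigmoid(z)**depth, d_relu(z)**depth)

The sigmoid product decays roughly like $0.235^{\text{depth}}$ and is already negligible at 20 layers, while the ReLU product stays at 1: this is the non-vanishing behavior referred to above.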




2. Batch Normalization

Batch normalization is a technique for improving the performance and stability of artificial neural networks.

It normalizes the inputs to each layer by re-centering and re-scaling the activations.




  • During training, batch normalization shifts and rescales the activations according to the mean and variance estimated on the current mini-batch.

  • During testing, it shifts and rescales according to the empirical moments (running mean and variance) accumulated during training.
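A minimal NumPy sketch of this forward pass is given below. It is an illustration, not the exact implementation used by tf.layers.batch_normalization; the momentum and eps values are assumptions. During training it uses the mini-batch statistics and updates the running moments; during testing it uses the accumulated moments.

import numpy as np

def batch_norm_forward(h, gamma, beta, moving_mean, moving_var,
                       is_training, momentum = 0.99, eps = 1e-3):
    # h: (batch, features) pre-activations
    if is_training:
        mu = h.mean(axis = 0)                       # mini-batch mean
        var = h.var(axis = 0)                       # mini-batch variance
        moving_mean = momentum*moving_mean + (1 - momentum)*mu
        moving_var = momentum*moving_var + (1 - momentum)*var
    else:
        mu, var = moving_mean, moving_var           # moments estimated during training

    h_hat = (h - mu) / np.sqrt(var + eps)           # shift and rescale
    return gamma*h_hat + beta, moving_mean, moving_var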

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

Overfitting in Regression

In [2]:
N = 10
data_x = np.linspace(-4.5, 4.5, N)
data_y = np.array([0.9819, 0.7973, 1.9737, 0.1838, 1.3180, -0.8361, -0.6591, -2.4701, -2.8122, -6.2512])

data_x = data_x.reshape(-1,1)
data_y = data_y.reshape(-1,1)

plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.grid(alpha = 0.3)
plt.show()
In [3]:
n_input = 1
n_hidden1 = 30
n_hidden2 = 100
n_hidden3 = 100
n_hidden4 = 30
n_output = 1
In [4]:
weights = {
    'hidden1' : tf.Variable(tf.random_normal([n_input, n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden1, n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden2, n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden3, n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden4, n_output], stddev = 0.1)),
}

biases = {
    'hidden1' : tf.Variable(tf.random_normal([n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1)),
}
In [5]:
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
In [6]:
def build_model(x, weights, biases):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.sigmoid(hidden1)
    
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)
 
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)
    
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)
    
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
In [7]:
pred = build_model(x, weights, biases)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)

LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
In [8]:
n_batch = 50    
n_iter = 10000 
n_prt = 1000    

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

loss_record = []

for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    
    sess.run(optm, feed_dict = {x: train_x,  y: train_y})
    
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y})
        loss_record.append(c)
In [9]:
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show() 
In [10]:
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp})

plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()

Batch Normalization Implementation

In [11]:
is_training = tf.placeholder(tf.bool)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
In [12]:
def build_model(x, weights, biases, is_training):    
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])    
    hidden1 = tf.layers.batch_normalization(hidden1, training = is_training)
    hidden1 = tf.nn.sigmoid(hidden1)    
    
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.layers.batch_normalization(hidden2, training = is_training)
    hidden2 = tf.nn.sigmoid(hidden2)    
    
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.layers.batch_normalization(hidden3, training = is_training)
    hidden3 = tf.nn.sigmoid(hidden3)    
    
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.layers.batch_normalization(hidden4, training = is_training)
    hidden4 = tf.nn.sigmoid(hidden4)
    
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
In [13]:
pred = build_model(x, weights, biases, is_training)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)

LR = 0.001
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optm = tf.train.AdamOptimizer(LR).minimize(loss)
WARNING: Logging before flag parsing goes to stderr.
W1011 19:15:51.129559  9360 deprecation.py:323] From <ipython-input-12-bf173161ddb7>:3: batch_normalization (from tensorflow.python.layers.normalization) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).
In [14]:
tf.get_default_graph().get_all_collection_keys()
Out[14]:
['update_ops', 'trainable_variables', 'variables', 'train_op', 'cond_context']
In [15]:
tf.get_collection('trainable_variables')
Out[15]:
[<tf.Variable 'Variable:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_3:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_4:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_5:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_6:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_8:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_9:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/gamma:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/beta:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/gamma:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/beta:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/gamma:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/beta:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/gamma:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/beta:0' shape=(30,) dtype=float32_ref>]
In [16]:
tf.get_collection('variables')
Out[16]:
[<tf.Variable 'Variable:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_1:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_3:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_4:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_5:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_6:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_8:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_9:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'beta1_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power:0' shape=() dtype=float32_ref>,
 <tf.Variable 'Variable/Adam:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable/Adam_1:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_1/Adam:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_1/Adam_1:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2/Adam:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2/Adam_1:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_3/Adam:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_3/Adam_1:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_4/Adam:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_4/Adam_1:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_5/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_5/Adam_1:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_6/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_6/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_8/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_8/Adam_1:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_9/Adam:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'Variable_9/Adam_1:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/gamma:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/beta:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/moving_mean:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/moving_variance:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/gamma:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/beta:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/moving_mean:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/moving_variance:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/gamma:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/beta:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/moving_mean:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/moving_variance:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/gamma:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/beta:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/moving_mean:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/moving_variance:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'beta1_power_1:0' shape=() dtype=float32_ref>,
 <tf.Variable 'beta2_power_1:0' shape=() dtype=float32_ref>,
 <tf.Variable 'Variable/Adam_2:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable/Adam_3:0' shape=(1, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_1/Adam_2:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_1/Adam_3:0' shape=(30, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2/Adam_2:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_2/Adam_3:0' shape=(100, 100) dtype=float32_ref>,
 <tf.Variable 'Variable_3/Adam_2:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_3/Adam_3:0' shape=(100, 30) dtype=float32_ref>,
 <tf.Variable 'Variable_4/Adam_2:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_4/Adam_3:0' shape=(30, 1) dtype=float32_ref>,
 <tf.Variable 'Variable_5/Adam_2:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_5/Adam_3:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_6/Adam_2:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_6/Adam_3:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7/Adam_2:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_7/Adam_3:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'Variable_8/Adam_2:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_8/Adam_3:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'Variable_9/Adam_2:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'Variable_9/Adam_3:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/gamma/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/gamma/Adam_1:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/beta/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization/beta/Adam_1:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/gamma/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/gamma/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/beta/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_1/beta/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/gamma/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/gamma/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/beta/Adam:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_2/beta/Adam_1:0' shape=(100,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/gamma/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/gamma/Adam_1:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/beta/Adam:0' shape=(30,) dtype=float32_ref>,
 <tf.Variable 'batch_normalization_3/beta/Adam_1:0' shape=(30,) dtype=float32_ref>]
In [17]:
tf.get_collection('update_ops')
Out[17]:
[<tf.Operation 'batch_normalization/cond_2/Merge' type=Merge>,
 <tf.Operation 'batch_normalization/cond_3/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_1/cond_2/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_1/cond_3/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_2/cond_2/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_2/cond_3/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_3/cond_2/Merge' type=Merge>,
 <tf.Operation 'batch_normalization_3/cond_3/Merge' type=Merge>]
In [18]:
n_batch = 50 
n_iter = 10000
n_prt = 1000  

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    
    sess.run(optm, feed_dict = {x: train_x,  y: train_y, is_training: True})
    
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, is_training: True})
        loss_record.append(c)
In [19]:
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
In [20]:
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, is_training: False})

plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()

3. Dropout as Regularization

3.1. Regularization (Shrinkage Methods)

Often, overfitting is associated with very large estimated parameters $\omega$

We want to balance

  • how well the function fits the data

  • the magnitude of the coefficients

    $$ \begin{align*} \text{Total loss } = \;&\underbrace{\text{measure of fit}}_{RSS(\omega)} + \;\lambda \cdot \underbrace{\text{measure of magnitude of coefficients}}_{\lambda \cdot \lVert \omega \rVert_d} \\ \\ \implies &\min\; \lVert h_{\omega} (x_i) - y \rVert_2^2 + \lambda \lVert \omega \rVert_d \end{align*} $$
    where $ RSS(\omega) = \lVert h_{\omega} (x_i) - y \rVert^2_2 $ (the Residual Sum of Squares) and $\lambda$ is a tuning parameter to be determined separately


  • The second term, $\lambda \, \lVert \omega \rVert_d$, called a shrinkage penalty, is small when $\omega_1, \cdots,\omega_n$ are close to zero, and so it has the effect of shrinking the estimates of $\omega_j$ towards zero
  • The tuning parameter $\lambda$ serves to control the relative impact of these two terms on the weight estimates
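As a concrete example, a ridge ($\ell_2$, i.e. $d = 2$) penalty could be added to the regression loss used earlier in this notebook. This is only a sketch: the value of lam is an arbitrary assumption, and this cell is not part of the training runs above.

lam = 0.01                                          # tuning parameter lambda (assumed value)

mse = tf.reduce_mean(tf.square(pred - y))           # measure of fit, RSS
l2_penalty = tf.add_n([tf.nn.l2_loss(w) for w in weights.values()])  # magnitude of coefficients
loss_reg = mse + lam*l2_penalty                     # total loss

optm_reg = tf.train.AdamOptimizer(LR).minimize(loss_reg)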

3.2. Different Regularization Techniques

  • Big Data
  • Data augmentation
    • The simplest way to reduce overfitting is to increase the size of the training data




  • Early stopping
    • When we see that the performance on the validation set is getting worse, we immediately stop training the model (a minimal sketch follows below)
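The sketch below follows the style of the training loops above; it assumes a held-out validation set (val_x, val_y) and a patience value, none of which are defined in this notebook.

best_val, patience, wait = np.inf, 10, 0            # patience is an assumed value

for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    sess.run(optm, feed_dict = {x: data_x[idx,:], y: data_y[idx,:]})

    val_loss = sess.run(loss, feed_dict = {x: val_x, y: val_y})
    if val_loss < best_val:
        best_val, wait = val_loss, 0                # improvement: reset the counter
    else:
        wait += 1
        if wait > patience:                         # no improvement for 'patience' checks
            break                                   # stop training early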




3.3. Dropout

  • This is one of the most interesting regularization techniques.
  • It also produces very good results and is consequently one of the most frequently used regularization techniques in the field of deep learning.
  • At every iteration, it randomly selects some nodes and removes them.
  • It can also be thought of as an ensemble technique in machine learning.




  • tf.nn.dropout(layer, rate = p)
  • For training
    • rate: the probability that each element is dropped. For example, setting rate = 0.1 would drop 10% of the input elements. Elements that are kept are scaled up by $\frac{1}{1-\text{rate}}$ so that the expected sum is unchanged; dropped elements are set to 0.
  • For testing
    • All the elements are kept
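Below is a minimal NumPy sketch (an illustration, not the TensorFlow implementation) of the inverted-dropout behavior described above; the rate and the all-ones input are arbitrary.

import numpy as np

def dropout(h, rate, training):
    # drop each element with probability 'rate';
    # scale survivors by 1/(1 - rate) so the expected sum is unchanged
    if not training or rate == 0.0:
        return h                                    # testing: all elements are kept
    keep = (np.random.rand(*h.shape) >= rate).astype(h.dtype)
    return h*keep / (1.0 - rate)

h = np.ones((4, 5))
print(dropout(h, rate = 0.2, training = True))      # ~20% zeros, survivors scaled to 1.25
print(dropout(h, rate = 0.2, training = False))     # unchanged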

Dropout Implementation

In [21]:
p = tf.placeholder(tf.float32)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
In [22]:
def build_model(x, weights, biases, p):    
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])    
    hidden1 = tf.nn.sigmoid(hidden1)    
    dropout1 = tf.nn.dropout(hidden1, rate = p)
    
    hidden2 = tf.add(tf.matmul(dropout1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)    
    dropout2 = tf.nn.dropout(hidden2, rate = p)
    
    hidden3 = tf.add(tf.matmul(dropout2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)    
    dropout3 = tf.nn.dropout(hidden3, rate = p)
    
    hidden4 = tf.add(tf.matmul(dropout3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)    
    dropout4 = tf.nn.dropout(hidden4, rate = p)
    
    output = tf.add(tf.matmul(dropout4, weights['output']), biases['output'])
    return output
In [23]:
pred = build_model(x, weights, biases, p)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)

LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
In [24]:
n_batch = 50 
n_iter = 10000
n_prt = 1000  

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    
    sess.run(optm, feed_dict = {x: train_x,  y: train_y, p: 0.2})
    
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, p: 0.2})
        loss_record.append(c)
        #print ("Iter : {}".format(epoch))
        #print ("Train Cost : {}".format(c))        
In [25]:
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()        
In [26]:
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, p: 0})

plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
In [27]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')