Advanced ANN
The Vanishing Gradient Problem
As more layers using certain activation functions are added to a neural network, the gradients of the loss function approach zero, making the network hard to train.
For example, the sigmoid squashes its input into the range (0, 1) and its derivative is at most 0.25, so backpropagating through many sigmoid layers multiplies many small factors together and the gradient that reaches the early layers becomes vanishingly small.
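As a quick numerical illustration (a minimal NumPy sketch, not part of the original notebook; the pre-activation value 0.5 and the layer counts are arbitrary choices):
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# sigma'(z) = sigma(z) * (1 - sigma(z)), which is at most 0.25 (at z = 0)
z = 0.5                                   # an arbitrary pre-activation value
d_sigma = sigmoid(z) * (1 - sigmoid(z))

# backpropagating through n sigmoid layers multiplies roughly n such factors,
# so the gradient reaching the first layers shrinks geometrically with depth
for n_layers in [1, 5, 10, 20]:
    print(n_layers, d_sigma ** n_layers)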
Batch normalization is a technique for improving the performance and stability of artificial neural networks.
It normalizes the inputs to a layer by adjusting and scaling the activations.
During training, batch normalization shifts and rescales according to the mean and variance estimated on the current mini-batch.
At test time, it simply shifts and rescales according to the empirical moments (running averages) estimated during training.
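That train/test distinction can be sketched in plain NumPy (an illustration only, not from the original notebook; the momentum and epsilon values are assumptions, and the learnable scale and shift are held fixed):
import numpy as np

gamma, beta = 1.0, 0.0              # learnable scale and shift (kept fixed here for simplicity)
running_mean, running_var = 0.0, 1.0
momentum, eps = 0.9, 1e-5           # assumed hyperparameter values

def batch_norm(x, training):
    global running_mean, running_var
    if training:
        mu, var = x.mean(), x.var()                                # batch statistics
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var  = momentum * running_var  + (1 - momentum) * var
    else:
        mu, var = running_mean, running_var                        # moments stored during training
    return gamma * (x - mu) / np.sqrt(var + eps) + beta

x_batch = np.random.randn(50) * 3 + 10
print(batch_norm(x_batch, training = True).mean())    # ~0: normalized with batch statistics
print(batch_norm(x_batch, training = False).mean())   # uses the running averages instead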
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf   # TensorFlow 1.x graph-mode API (tf.placeholder, tf.Session) is used throughout
Overfitting in Regression
N = 10                           # number of training points
data_x = np.linspace(-4.5, 4.5, N)
data_y = np.array([0.9819, 0.7973, 1.9737, 0.1838, 1.3180, -0.8361, -0.6591, -2.4701, -2.8122, -6.2512])
data_x = data_x.reshape(-1,1)    # shape (N, 1) to match the network input
data_y = data_y.reshape(-1,1)
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.grid(alpha = 0.3)
plt.show()
# a deliberately large network for only 10 data points, so it can easily overfit
n_input = 1
n_hidden1 = 30
n_hidden2 = 100
n_hidden3 = 100
n_hidden4 = 30
n_output = 1
weights = {
    'hidden1' : tf.Variable(tf.random_normal([n_input, n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden1, n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden2, n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden3, n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_hidden4, n_output], stddev = 0.1)),
}
biases = {
    'hidden1' : tf.Variable(tf.random_normal([n_hidden1], stddev = 0.1)),
    'hidden2' : tf.Variable(tf.random_normal([n_hidden2], stddev = 0.1)),
    'hidden3' : tf.Variable(tf.random_normal([n_hidden3], stddev = 0.1)),
    'hidden4' : tf.Variable(tf.random_normal([n_hidden4], stddev = 0.1)),
    'output' : tf.Variable(tf.random_normal([n_output], stddev = 0.1)),
}
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.sigmoid(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y})
        loss_record.append(c)
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
Batch Normalization Implementation
is_training = tf.placeholder(tf.bool)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases, is_training):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.layers.batch_normalization(hidden1, training = is_training)
    hidden1 = tf.nn.sigmoid(hidden1)
    hidden2 = tf.add(tf.matmul(hidden1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.layers.batch_normalization(hidden2, training = is_training)
    hidden2 = tf.nn.sigmoid(hidden2)
    hidden3 = tf.add(tf.matmul(hidden2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.layers.batch_normalization(hidden3, training = is_training)
    hidden3 = tf.nn.sigmoid(hidden3)
    hidden4 = tf.add(tf.matmul(hidden3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.layers.batch_normalization(hidden4, training = is_training)
    hidden4 = tf.nn.sigmoid(hidden4)
    output = tf.add(tf.matmul(hidden4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases, is_training)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
# batch normalization keeps moving averages of the mean and variance; the ops that
# update them are collected in UPDATE_OPS and must run together with the training step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    optm = tf.train.AdamOptimizer(LR).minimize(loss)
# inspect the graph collections: the batch-normalization moving mean/variance appear
# under 'variables' but not 'trainable_variables', and their update ops under 'update_ops'
tf.get_default_graph().get_all_collection_keys()
tf.get_collection('trainable_variables')
tf.get_collection('variables')
tf.get_collection('update_ops')
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y, is_training: True})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, is_training: True})
        loss_record.append(c)
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, is_training: False})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
Often, overfitting is associated with very large estimated parameters $\omega$.
We want to balance two things: how well the function fits the data, and the magnitude of the coefficients.
$$
\begin{align*}
\text{Total loss } = \;&\underbrace{\text{measure of fit}}_{RSS(\omega)} + \;\lambda \cdot \underbrace{\text{measure of magnitude of coefficients}}_{\lVert \omega \rVert_d} \\ \\
\implies &\min_{\omega}\; \lVert h_{\omega} (x_i) - y \rVert_2^2 + \lambda \lVert \omega \rVert_d
\end{align*}
$$
where $ RSS(\omega) = \lVert h_{\omega} (x_i) - y \rVert^2_2 $ (the residual sum of squares) and $\lambda$ is a tuning parameter to be determined separately.
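As a sketch of how such a penalty could be attached to the network above in the same TF 1.x style (the value of $\lambda$ and the use of the squared L2 norm via tf.nn.l2_loss are choices made here, not taken from the original):
lamb = 0.01                                                        # assumed tuning parameter lambda
l2_penalty = tf.add_n([tf.nn.l2_loss(w) for w in weights.values()])   # sum of ||w||^2 / 2 over all layers
reg_loss = tf.reduce_mean(tf.square(pred - y)) + lamb * l2_penalty    # data fit + weight magnitude
optm_reg = tf.train.AdamOptimizer(LR).minimize(reg_loss)
Minimizing reg_loss instead of loss trades off the data fit against the size of the weights.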
Dropout is another way to regularize a network: during training, each hidden unit is dropped (set to zero) with probability $p$ and the surviving activations are scaled by $1/(1-p)$; at test time no units are dropped. In TensorFlow it is applied with
tf.nn.dropout(layer, rate = p)
Dropout Implementation
p = tf.placeholder(tf.float32)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_output])
def build_model(x, weights, biases, p):
    hidden1 = tf.add(tf.matmul(x, weights['hidden1']), biases['hidden1'])
    hidden1 = tf.nn.sigmoid(hidden1)
    dropout1 = tf.nn.dropout(hidden1, rate = p)
    hidden2 = tf.add(tf.matmul(dropout1, weights['hidden2']), biases['hidden2'])
    hidden2 = tf.nn.sigmoid(hidden2)
    dropout2 = tf.nn.dropout(hidden2, rate = p)
    hidden3 = tf.add(tf.matmul(dropout2, weights['hidden3']), biases['hidden3'])
    hidden3 = tf.nn.sigmoid(hidden3)
    dropout3 = tf.nn.dropout(hidden3, rate = p)
    hidden4 = tf.add(tf.matmul(dropout3, weights['hidden4']), biases['hidden4'])
    hidden4 = tf.nn.sigmoid(hidden4)
    dropout4 = tf.nn.dropout(hidden4, rate = p)
    output = tf.add(tf.matmul(dropout4, weights['output']), biases['output'])
    return output
pred = build_model(x, weights, biases, p)
loss = tf.square(pred - y)
loss = tf.reduce_mean(loss)
LR = 0.001
optm = tf.train.AdamOptimizer(LR).minimize(loss)
n_batch = 50
n_iter = 10000
n_prt = 1000
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
loss_record = []
for epoch in range(n_iter):
    idx = np.random.randint(N, size = n_batch)
    train_x = data_x[idx,:]
    train_y = data_y[idx,:]
    sess.run(optm, feed_dict = {x: train_x, y: train_y, p: 0.2})
    if epoch % n_prt == 0:
        c = sess.run(loss, feed_dict = {x: train_x, y: train_y, p: 0.2})
        loss_record.append(c)
        #print ("Iter : {}".format(epoch))
        #print ("Train Cost : {}".format(c))
plt.figure(figsize = (10,8))
plt.plot(np.arange(len(loss_record))*n_prt, loss_record, label = 'training')
plt.xlabel('iteration', fontsize = 15)
plt.ylabel('loss', fontsize = 15)
plt.grid(alpha = 0.3)
plt.legend(fontsize = 12)
plt.ylim([0, 10])
plt.show()
xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = sess.run(pred, feed_dict = {x: xp, p: 0})
plt.figure(figsize = (10,8))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()