Artificial Neural Networks (ANN)
Table of Contents
Perceptron
XOR Problem
Neurons compute the weighted sum of their inputs
A neuron is activated or fired when the sum $a$ is positive
$$
\begin{align*}
a &= \omega_0 + \omega_1 x_1 + \omega_2 x_2 \\ \\
\hat{y} &= g(a) =
\begin{cases}
1 & a > 0\\
0 & \text{otherwise}
\end{cases}
\end{align*}
$$
Multi-neurons
Differentiable activation function
In a compact representation
Multi-layer perceptron
We can represent this “neuron” as follows:
The main weakness of linear predictors is their lack of capacity. For classification, the populations have to be linearly separable.
The XOR example can be solved by pre-processing the data to make the two populations linearly separable.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
# Training data generation: two classes separated by the line
# 0.8*x1 + x2 - 3 = 0, so they are linearly separable by construction.
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3

C1 = np.where(g >= 0)[0]   # indices of class 1 (on/above the line)
C0 = np.where(g < 0)[0]    # indices of class 0 (below the line)
N = C1.shape[0]
M = C0.shape[0]
m = N + M

X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N, 1]), np.zeros([M, 1])])
# NOTE: np.asmatrix was dropped here — np.matrix is deprecated (removed in
# NumPy 2.0) and tf.keras works directly with plain ndarrays.

plt.figure(figsize = (6, 4))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
# A single sigmoid neuron == logistic regression.
LogisticRegression = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units = 1,
                          input_dim = 2,
                          activation = 'sigmoid')
])
LogisticRegression.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
    loss = 'binary_crossentropy')
loss = LogisticRegression.fit(train_X, train_y, epochs = 10)

# Learned parameters: weight vector w (2x1) and bias b (length 1).
w, b = LogisticRegression.layers[0].get_weights()
print(w)
print(b)
# Decision boundary of the learned neuron:
# w0*x1 + w1*x2 + b = 0  =>  x2 = -(w0/w1)*x1 - b/w1.
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w[0,0]/w[1,0]*x1p - b[0]/w[1,0]

# Data overlaid with the separating line (green).
plt.figure(figsize = (6, 4))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# Training data generation: classes separated by the parabola
# -0.5*(x1 - 1)^2 + 2*x2 + 5 = 0, so they are NOT linearly separable.
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1-1)**2 + 2*x2 + 5

C1 = np.where(g >= 0)[0]   # class 1 indices
C0 = np.where(g < 0)[0]    # class 0 indices
N = C1.shape[0]
M = C0.shape[0]
m = N + M

X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
# Kept as np.matrix on purpose: a later cell computes H = train_X*w1 + b1
# and relies on '*' meaning matrix product for np.matrix operands.
train_X = np.asmatrix(np.vstack([X1, X0]))
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])

plt.figure(figsize = (6, 4))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# 2-2-1 MLP: the two hidden sigmoid units re-map the inputs so that the
# classes become linearly separable for the final sigmoid unit.
LogisticRegression = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units = 2, input_dim = 2, activation = 'sigmoid'),
    tf.keras.layers.Dense(units = 1, activation = 'sigmoid')
])
LogisticRegression.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
    loss = 'binary_crossentropy')
loss = LogisticRegression.fit(train_X, train_y, epochs = 10)

# Hidden-layer parameters (w1: 2x2, b1: 2) and output-layer parameters
# (w2: 2x1, b2: 1).
w1, b1 = LogisticRegression.layers[0].get_weights()
w2, b2 = LogisticRegression.layers[1].get_weights()
# Hidden-layer activations: H = sigmoid(train_X w1 + b1).
# train_X is an np.matrix, so '*' is a true matrix product here.
H = train_X*w1 + b1
H = 1/(1 + np.exp(-H))

# The two classes plotted in the hidden space (z1, z2): after training
# they should be (nearly) linearly separable.
plt.figure(figsize = (6, 4))
plt.plot(H[:N, 0], H[:N, 1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m, 0], H[N:m, 1], 'bo', alpha = 0.4, label = 'C0')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

# Output-layer decision boundary drawn in the same hidden space (black).
x1p = np.arange(0, 1, 0.01).reshape(-1, 1)
x2p = - w2[0,0]/w2[1,0]*x1p - b2[0]/w2[1,0]

plt.figure(figsize = (6, 4))
plt.plot(H[:N, 0], H[:N, 1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m, 0], H[N:m, 1], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# The two hidden-unit boundaries drawn back in the original input space:
# each hidden sigmoid implements one line w1[:,k]^T x + b1[k] = 0.
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w1[0,0]/w1[1,0]*x1p - b1[0]/w1[1,0]   # first hidden unit (black)
x3p = - w1[0,1]/w1[1,1]*x1p - b1[1]/w1[1,1]   # second hidden unit (green)

plt.figure(figsize = (6, 4))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
One of the central ideas of computer science
Depends on solutions to smaller instances of the same problem ( = subproblem)
A function that calls itself — something with no direct counterpart in the physical world
%%html
<center><iframe src="https://www.youtube.com/embed/t4MSwiqfLaY?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
$$n ! = n \cdot (n-1) \cdots 2 \cdot 1$$
# Iterative factorial: m accumulates 1*2*...*n (here 5! = 120).
n = 5
m = 1
for k in range(1, n + 1):
    m *= k
print(m)
def fac(n):
    """Return n! computed recursively.

    The base case is ``n <= 1`` so that ``fac(0) == 1`` (0! = 1 by
    definition); the original ``n == 1`` base case recursed forever
    for any input <= 0.
    """
    if n <= 1:
        return 1
    else:
        return n*fac(n-1)
# recursive
fac(5)
Dynamic Programming: general, powerful algorithm design technique
Fibonacci numbers:
# Naive recursive Fibonacci — exponential time; deliberately left
# unmemoized so it serves as the baseline for the %timeit comparison
# against the memoized version below.
def fib(n):
    return 1 if n <= 2 else fib(n-1) + fib(n-2)

fib(10)
# Memoized (top-down dynamic programming) Fibonacci.
# Results are cached in the module-level array `memo`, where index n-1
# holds fib(n) and 0 means "not computed yet". The caller must create
# `memo = np.zeros(n)` large enough before the first call.
def mfib(n):
    global memo
    if memo[n-1] != 0:      # cache hit: reuse the stored value
        return memo[n-1]
    if n <= 2:              # base cases: fib(1) = fib(2) = 1
        memo[n-1] = 1
    else:                   # fill the cache bottom of the recursion up
        memo[n-1] = mfib(n-1) + mfib(n-2)
    return memo[n-1]
import numpy as np
# Seed the cache with zeros ("not yet computed") and evaluate fib(10)
# through the memoized version.
n = 10
memo = np.zeros(n)
mfib(n)
# Timing comparison at n = 30: naive exponential recursion vs memoized DP.
n = 30
%timeit fib(30)
# Reset the cache so mfib's timing includes filling it.
memo = np.zeros(n)
%timeit mfib(30)
$=$ Learning or estimating weights and biases of multi-layer perceptron from training data
3 key components
In mathematical expression
$$ \min_{\omega} \sum_{i=1}^{m}\ell\left( h_{\omega}\left(x^{(i)}\right),y^{(i)}\right)$$
Learning weights and biases from data using gradient descent
Backpropagation
Chain Rule
Computing the derivative of the composition of functions
$\space f(g(x))' = f'(g(x))g'(x)$
$\space {dz \over dx} = {dz \over dy} \bullet {dy \over dx}$
$\space {dz \over dw} = ({dz \over dy} \bullet {dy \over dx}) \bullet {dx \over dw}$
$\space {dz \over du} = ({dz \over dy} \bullet {dy \over dx} \bullet {dx \over dw}) \bullet {dw \over du}$
Optimization procedure
Summary
%%html
<center><iframe src="https://www.youtube.com/embed/aircAruvnKk?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/IHZwWFHWa-w?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/Ilg3gGewQ5U?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/tIeHLnjs5U8?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
From Wikipedia
More here
We will be using MNIST to create a multinomial classifier that can detect whether a given MNIST image belongs to class 0, 1, 2, 3, 4, 5, 6, 7, 8, or 9. Succinctly, we're teaching a computer to recognize handwritten digits.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
Let's download and load the dataset.
# Download MNIST and scale pixel intensities from [0, 255] to [0, 1].
mnist = tf.keras.datasets.mnist
(train_x, train_y), (test_x, test_y) = mnist.load_data()
train_x = train_x/255.0
test_x = test_x/255.0

# Shapes: 60,000 training and 10,000 test images of 28x28 pixels.
print ("The training data set is:\n")
print (train_x.shape)
print (train_y.shape)
print ("The test data set is:")
print (test_x.shape)
print (test_y.shape)
Display a few random samples from it:
# So now we have a 28x28 matrix, where each element is an intensity level from 0 to 1.
img = train_x[5]
# Expected: (28, 28)
img.shape
Let's visualize what some of these images and their corresponding training labels look like.
# Show the sample in grayscale with the axis ticks removed, then echo
# its training label.
plt.figure(figsize = (4, 4))
plt.imshow(img, 'gray')
plt.xticks([])
plt.yticks([])
plt.show()

train_y[5]
# Fresh imports and data load so this section runs standalone.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

mnist = tf.keras.datasets.mnist
(train_x, train_y), (test_x, test_y) = mnist.load_data()
# Rescale pixels to [0, 1].
train_x = train_x/255.0
test_x = test_x/255.0
First, the layer performs several matrix multiplication to produce a set of linear activations
Second, each linear activation is running through a nonlinear activation function
Third, predict values with an affine transformation
# Fully connected classifier: flatten 28x28 -> 784, one ReLU hidden
# layer of 100 units, softmax over the 10 digit classes.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape = (28, 28)),
    tf.keras.layers.Dense(100, activation = 'relu'),
    tf.keras.layers.Dense(10, activation = 'softmax')
])
Loss
Optimizer
n_batch: batch size for mini-batch gradient descent
n_iter: the number of iteration steps per epoch
n_epoch: the number of iterations over the entire x and y data provided
Initializer
# Integer class labels -> sparse categorical cross-entropy loss.
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy'])

# Train, then measure generalization on the held-out test set.
loss = model.fit(train_x, train_y, epochs = 5)
test_loss, test_acc = model.evaluate(test_x, test_y)
# Pick one random test image, predict its class distribution, and show
# the image next to the softmax output.
idx = np.random.choice(test_x.shape[0], 1)
test_img = test_x[idx]
predict = model.predict_on_batch(test_img)
mypred = np.argmax(predict, axis = 1)   # most probable digit

plt.figure(figsize = (8,4))
plt.subplot(1,2,1)
plt.imshow(test_img.reshape(28, 28), 'gray')
plt.axis('off')
plt.subplot(1,2,2)
plt.stem(predict[0])                    # probability per class 0..9
plt.show()

print('Prediction : {}'.format(mypred[0]))
You may observe that the accuracy on the test dataset is a little lower than the accuracy on the training dataset. This gap between training accuracy and test accuracy is an example of overfitting, when a machine learning model performs worse on new data than on its training data.
What is the highest accuracy you can achieve with this first fully connected model? Since the handwritten digit classification task is pretty straightforward, you may be wondering how we can do better...
$\Rightarrow$ As we saw in lecture, convolutional neural networks (CNNs) are particularly well-suited for a variety of tasks in computer vision, and have achieved near-perfect accuracies on the MNIST dataset. We will build a CNN and ultimately output a probability distribution over the 10 digit classes (0-9) in the next lectures.
The Vanishing Gradient Problem
As more layers using certain activation functions are added to a neural network, the gradients of the loss function approach zero, making the network hard to train.
For example,
Batch normalization is a technique for improving the performance and stability of artificial neural networks.
It is used to normalize the input layer by adjusting and scaling the activations.
During training batch normalization shifts and rescales according to the mean and variance estimated on the batch.
During test, it simply shifts and rescales according to the empirical moments estimated during training.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
Overfitting in Regression
# Ten noisy samples of a decreasing trend — deliberately few points so
# an over-parameterized network can overfit them badly.
N = 10
data_x = np.linspace(-4.5, 4.5, N).reshape(-1, 1)
data_y = np.array([0.9819, 0.7973, 1.9737, 0.1838, 1.3180,
                   -0.8361, -0.6591, -2.4701, -2.8122, -6.2512]).reshape(-1, 1)

plt.figure(figsize = (6, 4))
plt.plot(data_x, data_y, 'o')
plt.grid(alpha = 0.3)
plt.show()
# Deliberately over-parameterized MLP (30-100-100-30 sigmoid layers,
# linear output) that will interpolate the 10 data points and overfit.
base_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation = 'sigmoid', input_shape = (1,)),
    tf.keras.layers.Dense(100, activation = 'sigmoid'),
    tf.keras.layers.Dense(100, activation = 'sigmoid'),
    tf.keras.layers.Dense(30, activation = 'sigmoid'),
    tf.keras.layers.Dense(1, activation = None)
])
base_model.compile(optimizer = tf.keras.optimizers.Adam(0.001),
                   loss = 'mse',
                   metrics = ['mse'])
# Train long enough to overfit, then plot the fitted curve on a dense grid.
training = base_model.fit(data_x, data_y, epochs = 5000, verbose = 0)

xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = base_model.predict(xp)

plt.figure(figsize = (6, 4))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')   # overfitted prediction curve
plt.grid(alpha = 0.3)
plt.show()
Batch Normalization Implementation
# Same architecture with batch normalization inserted between each linear
# layer and its sigmoid: Dense (no activation) -> BatchNorm -> Activation.
def _dense_bn_sigmoid(units):
    # One "Dense + BatchNorm + sigmoid" stage of the network.
    return [tf.keras.layers.Dense(units = units, activation = None),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Activation('sigmoid')]

bn_model = tf.keras.models.Sequential(
    [tf.keras.layers.Dense(units = 30, activation = None, input_shape = (1,)),
     tf.keras.layers.BatchNormalization(),
     tf.keras.layers.Activation('sigmoid')]
    + _dense_bn_sigmoid(100)
    + _dense_bn_sigmoid(100)
    + _dense_bn_sigmoid(30)
    + [tf.keras.layers.Dense(units = 1, activation = None)]
)
bn_model.compile(optimizer = tf.keras.optimizers.Adam(0.001),
                 loss = 'mse',
                 metrics = ['mse'])
# Train the batch-normalized model and plot its fit on a dense grid.
training = bn_model.fit(data_x, data_y, epochs = 4000, verbose = 0)

xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = bn_model.predict(xp)

plt.figure(figsize = (6, 4))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
Often, overfitting associated with very large estimated parameters $\omega$
We want to balance
how well function fits data
magnitude of coefficients
$$
\begin{align*}
\text{Total loss } = \;&\underbrace{\text{measure of fit}}_{RSS(\omega)} + \;\lambda \cdot \underbrace{\text{measure of magnitude of coefficients}}_{\lambda \cdot \lVert \omega \rVert_d} \\ \\
\implies &\min\; \lVert h_{\omega} (x_i) - y \rVert_2^2 + \lambda \lVert \omega \rVert_d
\end{align*}
$$
where $ RSS(\omega) = \lVert h_{\omega} (x_i) - y \rVert^2_2 $, ( = Residual Sum of Squares) and $\lambda$ is a tuning parameter to be determined separately
tf.keras.layers.Dropout(rate = p)
Dropout Implementation
# Same MLP with Dropout(rate = 0.2) after every hidden layer; dropout
# randomly zeroes activations during training only, acting as a regularizer.
dropout_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(30, activation = 'sigmoid', input_shape = (1,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(100, activation = 'sigmoid'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(100, activation = 'sigmoid'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(30, activation = 'sigmoid'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation = None)
])
dropout_model.compile(optimizer = tf.keras.optimizers.Adam(0.001),
                      loss = 'mse',
                      metrics = ['mse'])
# Train the dropout-regularized model and plot its (smoother) fit.
training = dropout_model.fit(data_x, data_y, epochs = 200, verbose = 0)

xp = np.linspace(-4.5, 4.5, 100).reshape(-1,1)
my_pred = dropout_model.predict(xp)

plt.figure(figsize = (6, 4))
plt.plot(data_x, data_y, 'o')
plt.plot(xp, my_pred, 'r')
plt.grid(alpha = 0.3)
plt.show()
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')