Artificial Neural Networks (ANN)
Table of Contents
%%html
<center><iframe src="https://www.youtube.com/embed/blDtzUuJtiE?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/6O_WHmBUff4?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/DZgihzTgVQ8?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
Perceptron
XOR Problem
$x_1$ | $x_2$ | $x_1$ XOR $x_2$ |
---|---|---|
0 | 0 | 0 |
0 | 1 | 1 |
1 | 0 | 1 |
1 | 1 | 0 |
Neurons compute the weighted sum of their inputs
A neuron is activated or fired when the sum $a$ is positive
$$
\begin{align*}
a &= \omega_0 + \omega_1 x_1 + \omega_2 x_2 \\ \\
\hat{y} &= g(a) =
\begin{cases}
1 & a > 0\\
0 & \text{otherwise}
\end{cases}
\end{align*}
$$
Differentiable activation function
In a compact representation
Multi-layer perceptron
We can represent this “neuron” as follows:
The main weakness of linear predictors is their lack of capacity. For classification, the populations have to be linearly separable.
The XOR example can be solved by pre-processing the data to make the two populations linearly separable.
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline
#training data gerneration
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
# X1 = np.hstack([np.ones([N,1]), x1[C1], x2[C1]])
# X0 = np.hstack([np.ones([M,1]), x1[C0], x2[C0]])
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
train_X = np.asmatrix(train_X)
train_y = np.asmatrix(train_y)
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
LogisticRegression = tf.keras.models.Sequential([
tf.keras.layers.Dense(units = 1, input_dim = 2, activation = 'sigmoid')
])
LogisticRegression.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
loss = 'binary_crossentropy')
loss = LogisticRegression.fit(train_X, train_y, epochs = 10)
w = LogisticRegression.layers[0].get_weights()[0]
b = LogisticRegression.layers[0].get_weights()[1]
print(w)
print(b)
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w[0,0]/w[1,0]*x1p - b[0]/w[1,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = '')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# training data gerneration
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1-1)**2 + 2*x2 + 5
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
# ohe = OneHotEncoder(handle_unknown='ignore')
# train_y = ohe.fit_transform(train_y).toarray()
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
LogisticRegression = tf.keras.models.Sequential([
tf.keras.layers.Dense(units = 2, input_dim = 2, activation = 'sigmoid'),
tf.keras.layers.Dense(units = 1, activation = 'sigmoid')
])
LogisticRegression.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
loss = 'binary_crossentropy')
loss = LogisticRegression.fit(train_X, train_y, epochs = 10)
w1 = LogisticRegression.layers[0].get_weights()[0]
b1 = LogisticRegression.layers[0].get_weights()[1]
print(w1)
print(b1)
w2 = LogisticRegression.layers[1].get_weights()[0]
b2 = LogisticRegression.layers[1].get_weights()[1]
print(w2)
print(b2)
H = train_X*w1 + b1
H = 1/(1 + np.exp(-H))
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
x1p = np.arange(0, 1, 0.01).reshape(-1, 1)
x2p = - w2[0,0]/w2[1,0]*x1p - b2[0]/w2[1,0]
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.xlabel('$z_1$', fontsize = 15)
plt.ylabel('$z_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w1[0,0]/w1[1,0]*x1p - b1[0]/w1[1,0]
x3p = - w1[0,1]/w1[1,1]*x1p - b1[1]/w1[1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = '')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = '')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
$=$ Learning or estimating weights and biases of multi-layer perceptron from training data
Loss Function
$$ \min_{\omega} \sum_{i=1}^{m}\ell\left( h_{\omega}\left(x^{(i)}\right),y^{(i)}\right)$$
Learning
Learning weights and biases from data using gradient descent
Backpropagation
We will be using MNIST to create a Multinomial Classifier that can detect if the MNIST image shown is a member of class 0,1,2,3,4,5,6,7,8 or 9. Susinctly, we're teaching a computer to recognize hand written digets.
Let's download and load the dataset.
mnist = tf.keras.datasets.mnist
(train_x, train_y), (test_x, test_y) = mnist.load_data()
train_x, test_x = train_x/255.0, test_x/255.0
print ("The training data set is:\n")
print (train_x.shape)
print (train_y.shape)
print ("The test data set is:")
print (test_x.shape)
print (test_y.shape)
Let's visualize what some of these images and their corresponding training labels look like.
print('label :', train_y[5])
plt.figure(figsize = (6,6))
plt.imshow(train_x[5], 'gray')
plt.xticks([])
plt.yticks([])
plt.show()
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape = (28, 28)),
tf.keras.layers.Dense(units = 100, activation = 'relu'),
tf.keras.layers.Dense(units = 10, activation = 'softmax')
])
model.compile(optimizer = 'adam',
loss = 'sparse_categorical_crossentropy',
metrics = ['accuracy'])
loss = model.fit(train_x, train_y, epochs = 5)
test_loss, test_acc = model.evaluate(test_x, test_y)
test_img = test_x[np.random.choice(test_x.shape[0], 1)]
predict = model.predict_on_batch(test_img)
mypred = np.argmax(predict, axis = 1)
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.imshow(test_img.reshape(28, 28), 'gray')
plt.axis('off')
plt.subplot(1,2,2)
plt.stem(predict[0])
plt.show()
print('Prediction : {}'.format(mypred[0]))
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')