Artificial Neural Networks (ANN)
Table of Contents
Perceptron
XOR Problem
| $x_1$ | $x_2$ | $x_1$ XOR $x_2$ |
|---|---|---|
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 1 | 0 | 1 |
| 1 | 1 | 0 |
Neurons compute the weighted sum of their inputs
A neuron is activated, or "fires", when the sum $a$ is positive
Note that no single line in the $(x_1, x_2)$ plane separates the XOR outputs above, so one such unit alone cannot represent XOR
$$
\begin{align*}
a &= \omega_0 + \omega_1 x_1 + \omega_2 x_2 \\ \\
\hat{y} &= g(a) =
\begin{cases}
1 & a > 0\\
0 & \text{otherwise}
\end{cases}
\end{align*}
$$
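As a quick check (a minimal sketch, not from the original notebook; the weights below are illustrative, not fitted), we can evaluate this hard-threshold unit on the four XOR inputs. However the weights are chosen, the XOR column $[0, 1, 1, 0]$ never appears:

import numpy as np

def perceptron_out(x1, x2, w0, w1, w2):
    # hard-threshold unit from the equations above: y_hat = 1 if a > 0 else 0
    a = w0 + w1*x1 + w2*x2
    return (a > 0).astype(int)

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# illustrative weights (an OR-like unit); no setting yields [0, 1, 1, 0]
print(perceptron_out(X[:, 0], X[:, 1], w0 = -0.5, w1 = 1.0, w2 = 1.0))   # -> [0 1 1 1]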
Differentiable activation function
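For gradient-based training, the hard threshold $g$ is replaced by a smooth surrogate such as the logistic sigmoid, whose derivative has a convenient closed form. A minimal sketch (this is the activation `MLPClassifier` uses below as `activation = 'logistic'`):

import numpy as np

def sigmoid(a):
    # smooth, differentiable surrogate for the hard threshold g(a)
    return 1/(1 + np.exp(-a))

def sigmoid_grad(a):
    # g'(a) = g(a)(1 - g(a)): the form used by backpropagation
    s = sigmoid(a)
    return s*(1 - s)

print(sigmoid(0.0), sigmoid_grad(0.0))   # 0.5 0.25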
In a compact representation
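$$\hat{y} = g\left(\omega^T x\right), \qquad \omega = \begin{bmatrix} \omega_0 \\ \omega_1 \\ \omega_2 \end{bmatrix}, \quad x = \begin{bmatrix} 1 \\ x_1 \\ x_2 \end{bmatrix}$$

(the bias $\omega_0$ is absorbed by prepending a constant $1$ to the input, exactly as the `np.ones` column does in the perceptron code below).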
Multi-layer perceptron
%%html
<center><iframe src="https://www.youtube.com/embed/3liCbRZPrZA?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
We can represent this “neuron” as follows:
The main weakness of linear predictors is their lack of capacity. For classification, the populations have to be linearly separable.
The XOR example can be solved by pre-processing the data to make the two populations linearly separable.
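For instance (a sketch, not the original notebook's code): appending the product feature $x_1 x_2$ makes XOR linearly separable, since $x_1 + x_2 - 2\,x_1 x_2$ reproduces the XOR column exactly.

import numpy as np

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])   # XOR labels

# feature map phi(x1, x2) = (x1, x2, x1*x2): one extra product feature
Phi = np.hstack([X, (X[:, 0]*X[:, 1]).reshape(-1, 1)])

# the linear rule x1 + x2 - 2*x1*x2 > 1/2 reproduces XOR exactly
w = np.array([1.0, 1.0, -2.0])
print((Phi @ w > 0.5).astype(int))   # -> [0 1 1 0]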
Kernel
Often we want to capture nonlinear patterns in the data
Linear models (e.g., linear regression, linear SVM) are just not rich enough
Kernels: make linear models work in nonlinear settings
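A small illustration (a sketch with invented data, not part of the original notebook): an SVM with an RBF kernel separates two concentric rings that defeat a linear SVM, because the kernel implicitly maps the inputs into a feature space where a separating hyperplane exists.

import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
theta = rng.uniform(0, 2*np.pi, 200)
r = np.hstack([np.ones(100), 3*np.ones(100)])   # inner ring (C0), outer ring (C1)
X = np.c_[r*np.cos(theta), r*np.sin(theta)]
y = np.hstack([np.zeros(100), np.ones(100)])

print(SVC(kernel = 'linear').fit(X, y).score(X, y))   # poor: no separating line exists
print(SVC(kernel = 'rbf').fit(X, y).score(X, y))      # ~1.0: linear in the kernel feature space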
Kernel + Neuron
We can generalize an MLP
Universal function approximator
Universal function classifier
Parameterized
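As a quick sanity check of this capacity (a sketch; the hyperparameters are illustrative, not tuned): a single hidden layer of tanh units can approximate a smooth function such as $\sin x$ on an interval.

import numpy as np
from sklearn.neural_network import MLPRegressor

x = np.linspace(-np.pi, np.pi, 200).reshape(-1, 1)
y = np.sin(x).ravel()

# one hidden layer of 20 tanh units, fit with L-BFGS
mlp = MLPRegressor(hidden_layer_sizes = (20,), activation = 'tanh',
                   solver = 'lbfgs', max_iter = 5000, random_state = 0)
mlp.fit(x, y)
print(np.max(np.abs(mlp.predict(x) - y)))   # small residual over the interval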
Example: Linear Classifier
Example: Neural Networks
colah's blog
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# training data generation
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([np.ones([N,1]), x1[C1], x2[C1]])
X0 = np.hstack([np.ones([M,1]), x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N,1]), -np.ones([M,1])])
train_X = np.asmatrix(train_X)
train_y = np.asmatrix(train_y)
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.title('Linearly Separable Classes', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
# perceptron: update w whenever a sample is misclassified
w = np.random.randn(3,1)
w = np.asmatrix(w)
n_iter = m
for k in range(n_iter):
    for i in range(n_iter):
        # update rule: w <- w + y_i * x_i on a misclassified sample
        if train_y[i,0] != np.sign(train_X[i,:]*w)[0,0]:
            w += train_y[i,0]*train_X[i,:].T
x1p = np.linspace(0,8,100).reshape(-1,1)
x2p = - w[1,0]/w[2,0]*x1p - w[0,0]/w[2,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = 'perceptron')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# training data generation
m = 1000
x1 = 8*np.random.rand(m, 1)
x2 = 7*np.random.rand(m, 1) - 4
g = 0.8*x1 + x2 - 3
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.title('Linearly Separable Classes', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
from sklearn.neural_network import MLPClassifier

# hidden_layer_sizes = () means no hidden layer: the model is a plain linear classifier
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(), activation = 'identity', max_iter = 15000, learning_rate_init = 0.05)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'g', linewidth = 3, label = 'decision boundary')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
Weights and Bias
$$y^{(i)} \in \{1,0\} \Rightarrow y^{(i)} \in \{[0,1],[1,0]\}$$
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown = 'ignore')
train_y = ohe.fit_transform(train_y).toarray()
print(train_y)
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(), activation = 'identity', max_iter = 15000, learning_rate_init = 0.05)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
print(w_hat)
x1p = np.arange(0, 8, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'output unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'output unit 2')
plt.xlim([0, 8])
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.show()
# training data generation
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1-1)**2 + 2*x2 + 5
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.title('Nonlinearly Distributed Data', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(2,), activation = 'logistic', max_iter = 50000, learning_rate_init = 0.01)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'hidden unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'hidden unit 2')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# project the data through the first layer: hidden activations H = sigmoid(X W + b)
H = train_X*w_hat + b_hat
H = 1/(1 + np.exp(-H))
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.xlabel('$h_1$', fontsize = 15)
plt.ylabel('$h_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
# second-layer (output) weights: linear boundaries in the hidden-unit space
w_hat = clf.coefs_[1]
b_hat = clf.intercepts_[1]
x1p = np.arange(0, 1, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
plt.figure(figsize=(10, 8))
plt.plot(H[0:N,0], H[0:N,1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(H[N:m,0], H[N:m,1], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'output unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'output unit 2')
plt.xlabel('$h_1$', fontsize = 15)
plt.ylabel('$h_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(3,), activation = 'logistic', max_iter = 50000, learning_rate_init = 0.01)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
x4p = - w_hat[0,2]/w_hat[1,2]*x1p - b_hat[2]/w_hat[1,2]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'hidden unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'hidden unit 2')
plt.plot(x1p, x4p, 'm', linewidth = 3, label = 'hidden unit 3')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# training data generation
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1*x2-1)**2 + 2*x2 + 5
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.title('Nonlinearly Distributed Data', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(4,), activation = 'logistic', max_iter = 80000, learning_rate_init = 0.01)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
x4p = - w_hat[0,2]/w_hat[1,2]*x1p - b_hat[2]/w_hat[1,2]
x5p = - w_hat[0,3]/w_hat[1,3]*x1p - b_hat[3]/w_hat[1,3]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'hidden unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'hidden unit 2')
plt.plot(x1p, x4p, 'm', linewidth = 3, label = 'hidden unit 3')
plt.plot(x1p, x5p, 'c', linewidth = 3, label = 'hidden unit 4')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
# training data generation
m = 1000
x1 = 10*np.random.rand(m, 1) - 5
x2 = 8*np.random.rand(m, 1) - 4
g = - 0.5*(x1-1)**2 + 2*x2*x1 + 5
C1 = np.where(g >= 0)[0]
C0 = np.where(g < 0)[0]
N = C1.shape[0]
M = C0.shape[0]
m = N + M
X1 = np.hstack([x1[C1], x2[C1]])
X0 = np.hstack([x1[C0], x2[C0]])
train_X = np.vstack([X1, X0])
train_X = np.asmatrix(train_X)
train_y = np.vstack([np.ones([N,1]), np.zeros([M,1])])
ohe = OneHotEncoder(handle_unknown='ignore')
train_y = ohe.fit_transform(train_y).toarray()
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.title('Nonlinearly Distributed Data', fontsize = 15)
plt.legend(loc = 1, fontsize = 15)
plt.xlabel(r'$x_1$', fontsize = 15)
plt.ylabel(r'$x_2$', fontsize = 15)
plt.show()
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(4,), activation = 'logistic', max_iter = 80000, learning_rate_init = 0.01)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
x4p = - w_hat[0,2]/w_hat[1,2]*x1p - b_hat[2]/w_hat[1,2]
x5p = - w_hat[0,3]/w_hat[1,3]*x1p - b_hat[3]/w_hat[1,3]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'hidden unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'hidden unit 2')
plt.plot(x1p, x4p, 'm', linewidth = 3, label = 'hidden unit 3')
plt.plot(x1p, x5p, 'c', linewidth = 3, label = 'hidden unit 4')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(2,), activation = 'logistic', max_iter = 80000, tol = 0.0000001, learning_rate_init = 0.01)
clf.fit(train_X, train_y)
w_hat = clf.coefs_[0]
b_hat = clf.intercepts_[0]
x1p = np.arange(-5, 5, 0.01).reshape(-1, 1)
x2p = - w_hat[0,0]/w_hat[1,0]*x1p - b_hat[0]/w_hat[1,0]
x3p = - w_hat[0,1]/w_hat[1,1]*x1p - b_hat[1]/w_hat[1,1]
plt.figure(figsize=(10, 8))
plt.plot(x1[C1], x2[C1], 'ro', alpha = 0.4, label = 'C1')
plt.plot(x1[C0], x2[C0], 'bo', alpha = 0.4, label = 'C0')
plt.plot(x1p, x2p, 'k', linewidth = 3, label = 'hidden unit 1')
plt.plot(x1p, x3p, 'g', linewidth = 3, label = 'hidden unit 2')
plt.xlabel('$x_1$', fontsize = 15)
plt.ylabel('$x_2$', fontsize = 15)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.xlim([-5, 5])
plt.ylim([-4, 4])
plt.show()
%%html
<center><iframe src="https://www.youtube.com/embed/BR9h47Jtqyw?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')