Overfitting
Table of Contents
Nonlinear regression
(= linear regression for non-linearly distributed data)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Toy dataset: n observations sampled on an evenly spaced grid over [-4.5, 4.5].
n = 10
x = np.linspace(-4.5, 4.5, n)
y = np.array([
    0.9819, 0.7973, 1.9737, 0.1838, 1.3180,
    -0.8361, -0.6591, -2.4701, -2.8122, -6.2512,
])

# Scatter plot of the raw observations.
plt.figure(figsize=(10, 8))
plt.plot(x, y, 'o', label='Data')
plt.grid(alpha=0.3)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.show()
# Least-squares fit of a first-degree polynomial (a straight line) to (x, y).
p = np.polyfit(x, y, deg=1)

# Dense grid on which the fitted polynomial is drawn.  np.linspace is used
# rather than np.arange because, with a non-integer step, np.arange excludes
# the right endpoint, so the curve would stop just short of x = 4.5.
xp = np.linspace(-4.5, 4.5, 901)

# Overlay the fitted line on the raw data.
plt.figure(figsize=(10, 8))
plt.plot(x, y, 'o', label='Data')
plt.plot(xp, np.polyval(p, xp), linewidth=2, label='Polynomial')
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.legend(fontsize=15)
plt.grid(alpha=0.3)
plt.show()
# Least-squares fit of a ninth-degree polynomial to (x, y).  A degree-9
# polynomial has ten coefficients, so on a small dataset it has enough freedom
# to chase every point -- this cell illustrates overfitting.
p = np.polyfit(x, y, deg=9)

# Dense grid on which the fitted polynomial is drawn.  np.linspace is used
# rather than np.arange because, with a non-integer step, np.arange excludes
# the right endpoint, so the curve would stop just short of x = 4.5.
xp = np.linspace(-4.5, 4.5, 901)

# Overlay the fitted curve on the raw data.
plt.figure(figsize=(10, 8))
plt.plot(x, y, 'o', label='Data')
plt.plot(xp, np.polyval(p, xp), linewidth=2, label='Polynomial')
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.legend(fontsize=15)
plt.grid(alpha=0.3)
plt.show()
# Least-squares fit of a third-degree polynomial to (x, y).
p = np.polyfit(x, y, deg=3)

# Dense grid on which the fitted polynomial is drawn.  np.linspace is used
# rather than np.arange because, with a non-integer step, np.arange excludes
# the right endpoint, so the curve would stop just short of x = 4.5.
xp = np.linspace(-4.5, 4.5, 901)

# Overlay the fitted curve on the raw data.
plt.figure(figsize=(10, 8))
plt.plot(x, y, 'o', label='Data')
plt.plot(xp, np.polyval(p, xp), linewidth=2, label='Polynomial')
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.legend(fontsize=15)
plt.grid(alpha=0.3)
plt.show()
Construct explicit feature vectors
Consider linear combinations of fixed nonlinear functions of the input variables, of the form
$$
\begin{bmatrix}
1 & x_{1} & x_1^2\\1 & x_{2} & x_2^2\\\vdots & \vdots & \vdots\\1 & x_{m} & x_m^2
\end{bmatrix}
\begin{bmatrix}\theta_0\\\theta_1 \\ \theta_2 \end{bmatrix} \quad \Rightarrow \quad
\begin{bmatrix}
\mid & \mid & \mid \\
b_0(x) & b_1(x) & b_2(x)\\
\mid & \mid & \mid
\end{bmatrix}
\begin{bmatrix}\theta_0\\\theta_1 \\ \theta_2 \end{bmatrix}
$$
$$ \hat{y}=\sum_{i=0}^d{\theta_i b_i(x)} = \Phi \theta$$
$$b_i(x) = x^i, \quad i = 0,\cdots,d$$
from sklearn.preprocessing import MaxAbsScaler
# Training set: 10 data points, stored as column vectors of shape (10, 1).
m = 10
train_x = np.linspace(-4.5, 4.5, 10).reshape(-1, 1)
train_y = np.array([
    0.9819, 0.7973, 1.9737, 0.1838, 1.3180,
    -0.8361, -0.6591, -2.4701, -2.8122, -6.2512,
]).reshape(-1, 1)

# Polynomial feature matrix: one column per power x^1 ... x^d.  No constant
# column is built here.
d = 9
train_X = np.hstack([train_x**power for power in range(1, d + 1)])

# Rescale every column by its largest absolute value, then wrap as np.matrix.
train_X = np.asmatrix(MaxAbsScaler().fit_transform(train_X))

# Draw each scaled basis column against the sample index.
plt.figure(figsize=(10, 8))
for k in range(d):
    plt.plot(train_X[:, k], label='$x^{}$'.format(k + 1))
plt.title('Polynomial Basis', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.grid(alpha=0.3)
plt.legend(fontsize=12)
plt.show()
import tensorflow as tf
# Fit the degree-d polynomial model by gradient-based optimisation (no
# regularisation), then plot the fitted curve over the training data.
LR = 0.4
n_iter = 300000

# Graph inputs: the scaled feature matrix (m x d) and the targets (m x 1).
x = tf.placeholder(tf.float32, [m, d])
y = tf.placeholder(tf.float32, [m, 1])

# Trainable parameters: one weight per polynomial feature plus a bias.
w = tf.Variable(tf.random_normal([d, 1]))
b = tf.Variable(tf.random_normal([1, 1]))

# Linear model on the polynomial features, trained on the mean squared error.
y_pred = tf.add(tf.matmul(x, w), b)
loss = tf.square(y_pred - y)
loss = tf.reduce_mean(loss)

optm = tf.train.AdamOptimizer(LR).minimize(loss)
init = tf.global_variables_initializer()

loss_record = []
feed = {x: train_X, y: train_y}  # the same full batch is fed at every step

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_iter):
        sess.run(optm, feed_dict=feed)
        # Loss is re-evaluated after the update so the record reflects the
        # parameters produced by this step.
        loss_record.append(sess.run(loss, feed_dict=feed))

    # Pull the learned parameters out of the session before it closes.
    w_val, b_val = sess.run([w, b])

# Dense evaluation grid and its polynomial features.
xp = np.linspace(-4.5, 4.5, 100).reshape(-1, 1)
Xp = np.hstack([xp**(i+1) for i in range(d)])

# The evaluation features must be scaled with the factors derived from the
# TRAINING features, not with factors re-estimated from the evaluation grid;
# otherwise the learned weights are applied to differently scaled inputs
# whenever the two ranges differ.
feature_scaler = MaxAbsScaler().fit(np.hstack([train_x**(i+1) for i in range(d)]))
Xp = np.asmatrix(feature_scaler.transform(Xp))

# Xp is an np.matrix, so `*` is a matrix product here.
yp = Xp*w_val + b_val

plt.figure(figsize=(10, 8))
plt.plot(train_x, train_y, 'ko')
plt.plot(xp, yp, 'r')
plt.title('Data', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.grid(alpha=0.3)
plt.show()
Overfitting problem
Have you come across a situation where your model performed exceptionally well on the training data but was not able to predict the test data?
One of the most common problems data science professionals face is overfitting.
Issue with rich representation
Generalization Error
With many features, the prediction function becomes very expressive (model complexity)
Often, overfitting is associated with very large estimated parameters $\theta$
We want to balance
how well function fits data
magnitude of coefficients
$$
\begin{align*}
\text{Total cost } = \;&\underbrace{\text{measure of fit}}_{RSS(\theta)} + \;\lambda \cdot \underbrace{\text{measure of magnitude of coefficients}}_{\lVert \theta \rVert_2^2} \\ \\
\implies &\min\; \lVert \Phi \theta - y \rVert_2^2 + \lambda \lVert \theta \rVert_2^2
\end{align*}
$$
where $ RSS(\theta) = \lVert \Phi\theta - y \rVert^2_2 $ ( = Residual Sum of Squares) and $\lambda$ is a tuning parameter to be determined separately
import tensorflow as tf
# Fit the degree-d polynomial model with an L2 penalty on the weights, then
# plot the fitted curve over the training data.
LR = 0.4
n_iter = 3000

# Graph inputs: the scaled feature matrix (m x d) and the targets (m x 1).
x = tf.placeholder(tf.float32, [m, d])
y = tf.placeholder(tf.float32, [m, 1])

# Trainable parameters: one weight per polynomial feature plus a bias.
w = tf.Variable(tf.random_normal([d, 1]))
b = tf.Variable(tf.random_normal([1, 1]))

# Regularisation strength (lambda in the cost function).
lamb = 0.1

y_pred = tf.add(tf.matmul(x, w), b)

# Data-fit term: mean squared error over the m samples.
mse = tf.reduce_mean(tf.square(y_pred - y))
# Penalty term: MEAN of the squared weights, i.e. ||w||^2 / d rather than the
# plain sum ||w||^2, so the effective penalty is lamb / d per squared weight.
# The bias b is not penalised.
reg = tf.reduce_mean(tf.square(w))
# Both terms are already scalars, so no further reduction is needed.
loss = mse + lamb*reg

optm = tf.train.AdamOptimizer(LR).minimize(loss)
init = tf.global_variables_initializer()

loss_record = []
feed = {x: train_X, y: train_y}  # the same full batch is fed at every step

with tf.Session() as sess:
    sess.run(init)

    for epoch in range(n_iter):
        sess.run(optm, feed_dict=feed)
        # Loss is re-evaluated after the update so the record reflects the
        # parameters produced by this step.
        loss_record.append(sess.run(loss, feed_dict=feed))

    # Pull the learned parameters out of the session before it closes.
    w_val, b_val = sess.run([w, b])

# Dense evaluation grid and its polynomial features.
xp = np.linspace(-4.5, 4.5, 100).reshape(-1, 1)
Xp = np.hstack([xp**(i+1) for i in range(d)])

# The evaluation features must be scaled with the factors derived from the
# TRAINING features, not with factors re-estimated from the evaluation grid;
# otherwise the learned weights are applied to differently scaled inputs
# whenever the two ranges differ.
feature_scaler = MaxAbsScaler().fit(np.hstack([train_x**(i+1) for i in range(d)]))
Xp = np.asmatrix(feature_scaler.transform(Xp))

# Xp is an np.matrix, so `*` is a matrix product here.
yp = Xp*w_val + b_val

plt.figure(figsize=(10, 8))
plt.plot(train_x, train_y, 'ko')
plt.plot(xp, yp, 'r')
plt.title('Data', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.grid(alpha=0.3)
plt.show()
%%javascript
// Load and run a remote helper script (ipython_notebook_toc.js); presumably it builds the notebook's table of contents -- confirm before relying on it.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')