K-Nearest Neighbor (KNN) and Decision Tree
Table of Contents
1. Supervised Learning
- Given training set $\left\{ \left(x^{(1)}, y^{(1)}\right), \left(x^{(2)}, y^{(2)}\right),\cdots,\left(x^{(m)}, y^{(m)}\right) \right\}$
- Want to find a function $f_{\omega}$ with learnable parameters $\omega$
- $f_{\omega}(x)$ should be as close as possible to $y$ for future pairs $(x,y)$
- i.e., $f_{\omega}(x) \approx y$
- Define a loss function
$$\ell \left(f_{\omega} \left(x^{(i)}\right), y^{(i)}\right)$$
- Solve the following optimization problem:
$$ \begin{align*} \text{minimize} &\quad \frac{1}{m} \sum_{i=1}^{m} \ell \left(f_{\omega} \left(x^{(i)}\right), y^{(i)}\right)\\ \text{subject to} &\quad \omega \in \Omega \end{align*} $$
- Function approximation between inputs and outputs
- Once it is learned, $f_{\omega}$ can be used to predict $y$ for new inputs $x$ (a minimal sketch follows this list)
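As a minimal sketch of this formulation (assuming, purely for illustration, a linear model $f_{\omega}(x) = \omega_1 x + \omega_0$ and a squared loss; the framework itself does not require either choice):
import numpy as np

# toy training set {(x_i, y_i)}
x_train = np.array([0.0, 1.0, 2.0, 3.0])
y_train = np.array([2.1, 2.6, 2.9, 3.6])

def f(w, x):
    # linear model f_w(x) = w[1]*x + w[0]
    return w[1]*x + w[0]

def empirical_risk(w):
    # (1/m) * sum of squared losses over the training set
    return np.mean((f(w, x_train) - y_train)**2)

# minimize the empirical risk with plain gradient descent
w = np.zeros(2)
for _ in range(2000):
    residual = f(w, x_train) - y_train
    grad = np.array([np.mean(2*residual), np.mean(2*residual*x_train)])
    w = w - 0.05*grad

print(w, empirical_risk(w))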
2. K-Nearest Neighbor (KNN)
2.1. K-Nearest Neighbor (KNN) Regression
Non-parametric method
We write our model as
$$y = f(x) + \varepsilon$$
where $\varepsilon$ captures measurement errors and other discrepancies.
Then, with a good $f$, we can make predictions of $y$ at new points $x_{\text{new}}$. One possible approach, the so-called "nearest neighbor" method, is:
$$\hat y = \text{avg} \left(y \mid x \in \mathcal{N}(x_{\text{new}}) \right)$$
where $\mathcal{N}(x)$ is some neighborhood of $x$
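A minimal hand-rolled sketch of this averaging rule (assuming $\mathcal{N}(x_{\text{new}})$ is the $k$ closest training points; the scikit-learn implementation used below is the practical choice):
import numpy as np

def knn_regress(x_train, y_train, x_new, k = 3):
    # distances from the query point to every training point
    dist = np.abs(x_train - x_new)
    # indices of the k nearest neighbors
    nearest = np.argsort(dist)[:k]
    # prediction = average of the neighbors' target values
    return np.mean(y_train[nearest])

# toy 1-D example
x_train = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y_train = np.array([0.1, 1.2, 1.9, 3.2, 3.9])
print(knn_regress(x_train, y_train, x_new = 2.4, k = 3))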
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
N = 100
w1 = 0.5
w0 = 2
x = np.random.normal(0, 15, N).reshape(-1,1)
y = w1*x + w0 + 5*np.random.normal(0, 1, N).reshape(-1,1)
plt.figure(figsize = (6, 4))
plt.title('Data Set', fontsize = 12)
plt.plot(x, y, '.', label = 'Data')
plt.xlabel('X', fontsize = 12)
plt.ylabel('Y', fontsize = 12)
plt.legend()
plt.axis('equal')
plt.axis([-40, 40, -30, 30])
plt.grid(alpha = 0.3)
plt.show()
from sklearn import neighbors
reg = neighbors.KNeighborsRegressor(n_neighbors = 1)
reg.fit(x, y)
x_new = np.array([[5]])
pred = reg.predict(x_new)[0,0]
print(pred)
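As a quick sanity check of the $k = 1$ rule (a sketch using the arrays already defined above), the prediction should equal the $y$ value of the single closest training point:
# index of the training point closest to x_new
idx = np.argmin(np.abs(x - x_new[0,0]))
print(y[idx, 0], pred)   # the two values should match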
xp = np.linspace(-30, 30, 100).reshape(-1,1)
yp = reg.predict(xp)
plt.figure(figsize = (6, 4))
plt.title('k-Nearest Neighbor Regression', fontsize = 12)
plt.plot(x, y, '.', label = 'Original Data')
plt.plot(xp, yp, label = 'kNN')
plt.plot(x_new, pred, 'o', label = 'Prediction')
plt.plot([x_new[0,0], x_new[0,0]], [-30, pred], 'k--', alpha = 0.5)
plt.plot([-40, x_new[0,0]], [pred, pred], 'k--', alpha = 0.5)
plt.xlabel('X', fontsize = 12)
plt.ylabel('Y', fontsize = 12)
plt.legend()
plt.axis('equal')
plt.axis([-40, 40, -30, 30])
plt.grid(alpha = 0.3)
plt.show()
reg = neighbors.KNeighborsRegressor(n_neighbors = 21)
reg.fit(x, y)
pred = reg.predict(x_new)[0,0]   # re-predict at x_new with the k = 21 model
xp = np.linspace(-30, 30, 100).reshape(-1,1)
yp = reg.predict(xp)
plt.figure(figsize = (6, 4))
plt.title('k-Nearest Neighbor Regression', fontsize = 12)
plt.plot(x, y, '.', label='Original Data')
plt.plot(xp, yp, label = 'Regression Result')
plt.plot(x_new, pred, 'o', label='Prediction')
plt.plot([x_new[0,0], x_new[0,0]], [-30, pred], 'k--', alpha = 0.5)
plt.plot([-40, x_new[0,0]], [pred, pred], 'k--', alpha = 0.5)
plt.xlabel('X', fontsize = 12)
plt.ylabel('Y', fontsize = 12)
plt.legend()
plt.axis('equal')
plt.axis([-40, 40, -30, 30])
plt.grid(alpha = 0.3)
plt.show()
2.2. K-Nearest Neighbor (KNN) Classification
Non-parametric method
In k-NN classification, an object is assigned to the class most common among its $k$ nearest neighbors ($k$ is a positive integer, typically small).
If $k = 1$, then the object is simply assigned to the class of that single nearest neighbor.
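A minimal sketch of this majority-vote rule (assuming Euclidean distance and a toy data set; the scikit-learn classifier below is what the rest of this section uses):
import numpy as np
from collections import Counter

def knn_classify(X_train, y_train, x_new, k = 3):
    # Euclidean distances from the query point to every training point
    dist = np.linalg.norm(X_train - x_new, axis = 1)
    # labels of the k nearest neighbors
    nearest_labels = y_train[np.argsort(dist)[:k]]
    # assign the most common class among the neighbors
    return Counter(nearest_labels).most_common(1)[0][0]

# toy 2-D example: class 1 near the origin, class 0 farther away
X_train = np.array([[0, 0], [0.1, 0.2], [-0.2, 0.1], [2, 2], [2.2, 1.8]])
y_train = np.array([1, 1, 1, 0, 0])
print(knn_classify(X_train, y_train, np.array([0.2, 0.0]), k = 3))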
m = 1000
X = -1.5 + 3*np.random.uniform(size = (m,2))
y = np.zeros([m,1])
for i in range(m):
    if np.linalg.norm(X[i,:], 2) <= 1:
        y[i] = 1
C1 = np.where(y == 1)[0]
C0 = np.where(y == 0)[0]
theta = np.linspace(0, 2*np.pi, 100)
plt.figure(figsize = (6, 6))
plt.plot(X[C1,0], X[C1,1], 'o', label = 'C1', markerfacecolor = "k", markeredgecolor = 'k', markersize = 4)
plt.plot(X[C0,0], X[C0,1], 'o', label = 'C0', markerfacecolor = "None", alpha = 0.3, markeredgecolor = 'k', markersize = 4)
plt.plot(np.cos(theta), np.sin(theta), '--', color = 'orange')
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.axis('equal')
plt.axis('off')
plt.show()
from sklearn import neighbors
clf = neighbors.KNeighborsClassifier(n_neighbors = 1)
clf.fit(X, np.ravel(y))
X_new = np.array([1,1]).reshape(1,-1)
result = clf.predict(X_new)[0]
print(result)
res = 0.01
[X1gr, X2gr] = np.meshgrid(np.arange(-1.5, 1.5, res), np.arange(-1.5, 1.5, res))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
inC1 = clf.predict(Xp).reshape(-1,1)
inCircle = np.where(inC1 == 1)[0]
plt.figure(figsize = (6, 6))
plt.plot(X[C1,0], X[C1,1], 'o', label = 'C1', markerfacecolor = "k", alpha = 0.5, markeredgecolor = 'k', markersize = 4)
plt.plot(X[C0,0], X[C0,1], 'o', label = 'C0', markerfacecolor = "None", alpha = 0.3, markeredgecolor='k', markersize = 4)
plt.plot(np.cos(theta), np.sin(theta), '--', color = 'orange')
plt.plot(Xp[inCircle][:,0], Xp[inCircle][:,1], 's', alpha = 0.5, color = 'r', markersize = 1)
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.axis('equal')
plt.axis('off')
plt.show()
When outliers exist
m = 1000
X = -1.5 + 3*np.random.uniform(size = (m,2))
y = np.zeros([m,1])
for i in range(m):
    if np.linalg.norm(X[i,:], 2) <= 1:
        if np.random.uniform() < 0.05:
            y[i] = 0
        else:
            y[i] = 1
    else:
        if np.random.uniform() < 0.05:
            y[i] = 1
        else:
            y[i] = 0
C1 = np.where(y == 1)[0]
C0 = np.where(y == 0)[0]
theta = np.linspace(0, 2*np.pi, 100)
plt.figure(figsize = (6, 6))
plt.plot(X[C1,0], X[C1,1], 'o', label = 'C1', markerfacecolor = "k", markeredgecolor = 'k', markersize = 4)
plt.plot(X[C0,0], X[C0,1], 'o', label = 'C0', markerfacecolor = "None", alpha = 0.3, markeredgecolor = 'k', markersize = 4)
plt.plot(np.cos(theta), np.sin(theta), '--', color = 'orange')
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.axis('equal')
plt.axis('off')
plt.show()
$k = 1$
clf = neighbors.KNeighborsClassifier(n_neighbors = 1)
clf.fit(X, np.ravel(y))
res = 0.01
[X1gr, X2gr] = np.meshgrid(np.arange(-1.5, 1.5, res), np.arange(-1.5, 1.5, res))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
inC1 = clf.predict(Xp).reshape(-1,1)
inCircle = np.where(inC1 == 1)[0]
plt.figure(figsize = (6, 6))
plt.plot(X[C1,0], X[C1,1], 'o', label = 'C1', markerfacecolor = "k", alpha = 0.5, markeredgecolor = 'k', markersize = 4)
plt.plot(X[C0,0], X[C0,1], 'o', label = 'C0', markerfacecolor = "None", alpha = 0.3, markeredgecolor='k', markersize = 4)
plt.plot(np.cos(theta), np.sin(theta), '--', color = 'orange')
plt.plot(Xp[inCircle][:,0], Xp[inCircle][:,1], 's', alpha = 0.5, color = 'r', markersize = 1)
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.axis('equal')
plt.axis('off')
plt.show()
$k = 11$
clf = neighbors.KNeighborsClassifier(n_neighbors = 11)
clf.fit(X, np.ravel(y))
res = 0.01
[X1gr, X2gr] = np.meshgrid(np.arange(-1.5, 1.5, res), np.arange(-1.5, 1.5, res))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
inC1 = clf.predict(Xp).reshape(-1,1)
inCircle = np.where(inC1 == 1)[0]
plt.figure(figsize = (6, 6))
plt.plot(Xp[inCircle][:,0], Xp[inCircle][:,1], 's', alpha = 0.5, color = 'r', markersize = 1)
plt.plot(X[C1,0], X[C1,1], 'o', label = 'C1', markerfacecolor = "k", alpha = 0.5, markeredgecolor = 'k', markersize = 4)
plt.plot(X[C0,0], X[C0,1], 'o', label = 'C0', markerfacecolor = "None", alpha = 0.3, markeredgecolor='k', markersize = 4)
plt.plot(np.cos(theta), np.sin(theta), '--', color = 'orange')
# plt.legend(fontsize = 12)
plt.axis([-1.5, 1.5, -1.5, 1.5])
plt.axis('equal')
plt.axis('off')
plt.show()
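To see the smoothing effect of a larger $k$ more quantitatively, here is a small sketch that holds out part of the noisy data generated above and compares test accuracy for a few values of $k$ (the 30% split and the particular $k$ values are arbitrary choices for illustration):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# hold out 30% of the noisy data to compare a few values of k
X_train, X_test, y_train, y_test = train_test_split(X, np.ravel(y), test_size = 0.3, random_state = 0)

for k in [1, 5, 11, 21]:
    clf_k = neighbors.KNeighborsClassifier(n_neighbors = k)
    clf_k.fit(X_train, y_train)
    print('k = {:2d}, test accuracy = {:.3f}'.format(k, accuracy_score(y_test, clf_k.predict(X_test))))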
3. Decision Tree
- A decision tree is a flowchart-like structure in which each internal node represents a "test" on an attribute (e.g. whether a coin flip comes up heads or tails), each branch represents the outcome of the test, and each leaf node represents a class label (decision taken after computing all attributes). The paths from root to leaf represent classification rules.
- Source: Artificial Intelligence at MIT by Prof. Patrick Henry Winston
%%html
<center><iframe src="https://www.youtube.com/embed/SXBG3RGr_Rc?rel=0"
width="560" height="315" frameborder="0" allowfullscreen></iframe></center>
3.1. Decision Tree Algorithm
Feature test
Homogeneous set
Issue: with a large data set, a single feature test will most likely produce no completely homogeneous sets
Disorder of a single set
$$\begin{align*} D & = -x \log_2 x - (1-x) \log_2 (1-x)\\ \\ D & = -\frac{G}{T} \log_2 \frac{G}{T} - \frac{B}{T} \log_2 \frac{B}{T} \qquad \text{where }\; G: \text{Good}, \; B: \text{Bad},\; T: \text{Total}\\ \\ \text{When }\; \frac{G}{T} & = \frac{1}{2} \Rightarrow \left( -\frac{1}{2} \log_2 \frac{1}{2} \right) \times 2 = 1 \\ \\ \text{When }\;\frac{G}{T} & = 1 \Rightarrow -1 \log_2 1 - 0 \log_2 0 = 0 \end{align*} $$
Note: $D$ is the binary entropy function
Quality of test
$$Q(\text{test}) = \sum D(\text{set}) \times \frac{\text{# of samples in set}}{\text{# of samples in all sets}}$$
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
x = np.linspace(0.01, 0.99, 100)
y = -x*np.log2(x) - (1-x)*np.log2(1-x)
plt.figure(figsize = (6, 4))
plt.plot(x, y, linewidth = 3)
plt.xlabel(r'$x$', fontsize = 12)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.show()
# Quality of test
def D(x):
    y = -x*np.log2(x) - (1-x)*np.log2(1-x)
    return y
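Using D above, a small worked example of the quality-of-test formula (the split counts below are hypothetical, chosen only to illustrate the weighted sum; a lower $Q$ means the test produces more homogeneous sets):
# Hypothetical feature test splitting 8 samples into two sets:
#   set A: 3 good, 1 bad  (4 samples)
#   set B: 1 good, 3 bad  (4 samples)
D_A = D(3/4)                   # disorder of set A
D_B = D(1/4)                   # disorder of set B
Q = D_A*(4/8) + D_B*(4/8)      # each set weighted by its share of the samples
print(Q)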
from sklearn import tree
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image
data = np.array([[0, 0, 1, 0, 0],
[1, 0, 2, 0, 0],
[0, 1, 2, 0, 1],
[2, 1, 0, 2, 1],
[0, 1, 0, 1, 1],
[1, 1, 1, 2, 0],
[1, 1, 0, 2, 0],
[0, 0, 2, 1, 0]])
x = data[:,0:4]
y = data[:,4]
print(x, '\n')
print(y)
clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, random_state = 0)
clf.fit(x, y)
clf.predict([[0, 0, 1, 0]])
dot_data = export_graphviz(clf)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# The tree may differ from the one computed by hand because "tree.DecisionTreeClassifier" uses binary splits only
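If graphviz/pydotplus is not available, scikit-learn's built-in plot_tree (added in scikit-learn 0.21) is one alternative way to visualize the same fitted tree:
# alternative visualization of the fitted tree without graphviz
plt.figure(figsize = (8, 6))
tree.plot_tree(clf, filled = True)
plt.show()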
3.2. Nonlinear Classification
X1 = np.array([[-1.1,0],[-0.3,0.1],[-0.9,1],[0.8,0.4],[0.4,0.9],[0.3,-0.6],
[-0.5,0.3],[-0.8,0.6],[-0.5,-0.5]])
X0 = np.array([[-1,-1.3], [-1.6,2.2],[0.9,-0.7],[1.6,0.5],[1.8,-1.1],[1.6,1.6],
[-1.6,-1.7],[-1.4,1.8],[1.6,-0.9],[0,-1.6],[0.3,1.7],[-1.6,0],[-2.1,0.2]])
X1 = np.asmatrix(X1)
X0 = np.asmatrix(X0)
plt.figure(figsize = (6, 4))
plt.plot(X1[:,0], X1[:,1], 'ro', label = 'C1')
plt.plot(X0[:,0], X0[:,1], 'bo', label = 'C0')
plt.title('Nonlinear Data', fontsize = 12)
plt.xlabel(r'$x_1$', fontsize = 12)
plt.ylabel(r'$x_2$', fontsize = 12)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.show()
N = X1.shape[0]
M = X0.shape[0]
X = np.asarray(np.vstack([X1,X0]))
y = np.asarray(np.vstack([np.ones([N,1]), np.zeros([M,1])]))
clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 4, random_state = 0)
clf.fit(X,y)
clf.predict([[0,1]])
# to plot
[X1gr, X2gr] = np.meshgrid(np.arange(-3, 3, 0.1), np.arange(-3, 3, 0.1))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
q = clf.predict(Xp)
q = np.asmatrix(q).reshape(-1,1)
C1 = np.where(q == 1)[0]
plt.figure(figsize = (6, 4))
plt.plot(X1[:,0], X1[:,1], 'ro', label = 'C1')
plt.plot(X0[:,0], X0[:,1], 'bo', label = 'C0')
plt.plot(Xp[C1,0], Xp[C1,1], 'gs', markersize = 8, alpha = 0.1, label = 'Decision Tree')
plt.xlabel(r'$x_1$', fontsize = 12)
plt.ylabel(r'$x_2$', fontsize = 12)
plt.legend(loc = 1, fontsize = 12)
plt.axis('equal')
plt.show()
3.3. Multiclass Classification
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
## generate three simulated clusters
mu1 = np.array([1, 7])
mu2 = np.array([3, 4])
mu3 = np.array([6, 5])
SIGMA1 = 0.8*np.array([[1, 1.5],
[1.5, 3]])
SIGMA2 = 0.5*np.array([[2, 0],
[0, 2]])
SIGMA3 = 0.5*np.array([[1, -1],
[-1, 2]])
X1 = np.random.multivariate_normal(mu1, SIGMA1, 100)
X2 = np.random.multivariate_normal(mu2, SIGMA2, 100)
X3 = np.random.multivariate_normal(mu3, SIGMA3, 100)
y1 = 1*np.ones([100,1])
y2 = 2*np.ones([100,1])
y3 = 3*np.ones([100,1])
plt.figure(figsize = (6, 4))
plt.title('Generated Data', fontsize = 12)
plt.plot(X1[:,0], X1[:,1], '.', label = 'C1')
plt.plot(X2[:,0], X2[:,1], '.', label = 'C2')
plt.plot(X3[:,0], X3[:,1], '.', label = 'C3')
plt.xlabel('$X_1$', fontsize = 12)
plt.ylabel('$X_2$', fontsize = 12)
plt.legend(fontsize = 12)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.axis([-2, 10, 0, 12])
plt.show()
X = np.vstack([X1, X2, X3])
y = np.vstack([y1, y2, y3])
clf = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 3, random_state = 0)
clf.fit(X,y)
res = 0.3
[X1gr, X2gr] = np.meshgrid(np.arange(-2, 10, res), np.arange(0, 12, res))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
q = clf.predict(Xp)
q = np.asmatrix(q).reshape(-1,1)
C1 = np.where(q == 1)[0]
C2 = np.where(q == 2)[0]
C3 = np.where(q == 3)[0]
plt.figure(figsize = (6, 4))
plt.plot(X1[:,0], X1[:,1], '.', label = 'C1')
plt.plot(X2[:,0], X2[:,1], '.', label = 'C2')
plt.plot(X3[:,0], X3[:,1], '.', label = 'C3')
plt.plot(Xp[C1,0], Xp[C1,1], 's', color = 'blue', markersize = 8, alpha = 0.1)
plt.plot(Xp[C2,0], Xp[C2,1], 's', color = 'orange', markersize = 8, alpha = 0.1)
plt.plot(Xp[C3,0], Xp[C3,1], 's', color = 'green', markersize = 8, alpha = 0.1)
plt.xlabel('$X_1$', fontsize = 12)
plt.ylabel('$X_2$', fontsize = 12)
plt.legend(fontsize = 12)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.axis([-2, 10, 0, 12])
plt.show()
3.4. Random Forest
- An ensemble learning method for classification, regression, and other tasks that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees (a minimal bagging sketch is given below)
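As a minimal sketch of the bagging idea behind a random forest (bootstrap resamples of the training data, one depth-limited tree per resample, majority vote across trees; note that the actual RandomForestClassifier below additionally randomizes the features considered at each split):
from collections import Counter

# bagging sketch: train a few trees on bootstrap resamples of (X, y) from above
n_trees = 25
rng = np.random.default_rng(0)
trees = []
for _ in range(n_trees):
    idx = rng.integers(0, X.shape[0], X.shape[0])       # sample rows with replacement
    t = tree.DecisionTreeClassifier(max_depth = 3, random_state = 0)
    t.fit(X[idx], np.ravel(y)[idx])
    trees.append(t)

# classify one query point by majority vote over the individual trees
x_query = np.array([[3, 4]])
votes = [int(t.predict(x_query)[0]) for t in trees]
print(Counter(votes).most_common(1)[0][0])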
from sklearn import ensemble
clf = ensemble.RandomForestClassifier(n_estimators = 100, max_depth = 3, random_state = 0)
clf.fit(X,np.ravel(y))
res = 0.3
[X1gr, X2gr] = np.meshgrid(np.arange(-2, 10, res), np.arange(0, 12, res))
Xp = np.hstack([X1gr.reshape(-1,1), X2gr.reshape(-1,1)])
q = clf.predict(Xp)
q = np.asmatrix(q).reshape(-1,1)
C1 = np.where(q == 1)[0]
C2 = np.where(q == 2)[0]
C3 = np.where(q == 3)[0]
plt.figure(figsize = (6, 4))
plt.plot(X1[:,0], X1[:,1], '.', label = 'C1')
plt.plot(X2[:,0], X2[:,1], '.', label = 'C2')
plt.plot(X3[:,0], X3[:,1], '.', label = 'C3')
plt.plot(Xp[C1,0], Xp[C1,1], 's', color = 'blue', markersize = 8, alpha = 0.1)
plt.plot(Xp[C2,0], Xp[C2,1], 's', color = 'orange', markersize = 8, alpha = 0.1)
plt.plot(Xp[C3,0], Xp[C3,1], 's', color = 'green', markersize = 8, alpha = 0.1)
plt.xlabel('$X_1$', fontsize = 12)
plt.ylabel('$X_2$', fontsize = 12)
plt.legend(fontsize = 12)
plt.axis('equal')
plt.grid(alpha = 0.3)
plt.axis([-2, 10, 0, 12])
plt.show()
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')