Unsupervised Learning: K-means Clustering
Table of Contents
Supervised Learning
Learn a model from labeled pairs $(x, y)$: every training example comes with a target output
Unsupervised Learning
Learn structure from the inputs $x$ alone: no target outputs are given
Data clustering is an unsupervised learning problem
Given: a set of unlabeled examples $\{x^{(1)},\cdots,x^{(m)}\}$ and the number of desired clusters $k$
The only information clustering uses is the similarity between examples
Clustering groups examples based on their mutual similarities
A good clustering is one that achieves high within-cluster similarity and low between-cluster similarity
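For intuition, the (dis)similarity K-means works with is simply the Euclidean distance between feature vectors; a minimal sketch (the two points below are made up for illustration):

import numpy as np

# two hypothetical examples in R^2; a smaller distance means more similar
x1 = np.array([1., 1.])
x2 = np.array([3., 5.])
print(np.linalg.norm(x1 - x2))   # 4.4721...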
1) Initialization
Input: the number of clusters $k$ and the unlabeled training examples
The $k$ cluster centroids $\mu_1,\cdots,\mu_k$ are randomly initialized anywhere in $\mathbb{R}^n$
2) Iteration
$$
\begin{align*}
c_k &= \{n: k = \arg \min_j \, \lVert x_n - \mu_j \rVert^2\}\\
\mu_k &= \frac{1}{\lvert c_k \rvert} \sum_{n \in c_k}x_n
\end{align*}
$$
Repeat until convergence
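Both steps can be viewed as coordinate descent on the K-means objective, the within-cluster sum of squares

$$
J = \sum_{k}\sum_{n \in c_k} \lVert x_n - \mu_k \rVert^2
$$

Each iteration can only decrease $J$, so the algorithm always converges, although only to a local minimum that depends on the initialization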
3) Output
Output: the trained model, i.e., the final centroids $\mu_1,\cdots,\mu_k$ and the resulting cluster assignment of each example
$ \,\text{Randomly initialize } k \,\text{cluster centroids } \mu_1,\mu_2,\cdots,\mu_k \in \mathbb{R}^n$
$ \begin{align*} \text{Repeat}\;&\{ \\ &\text{for $i=1$ to $m$} \\ &\quad \text{$c_i$ := index (from 1 to $k$) of cluster centroid closest to $x^{(i)}$} \\ &\text{for $j=1$ to $k$} \\ &\quad \text{$\mu_j$ := average (mean) of points assigned to cluster $j$} \\ &\} \end{align*} $
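This pseudocode maps directly onto NumPy; below is a minimal vectorized sketch for a general $k$ (the function name and details are illustrative, not from this notebook, and it assumes no cluster goes empty):

import numpy as np

def kmeans_fit(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # initialize centroids as k distinct, randomly chosen data points
    mu = X[rng.choice(len(X), k, replace=False)].astype(float)
    for _ in range(n_iter):
        # assignment step: (m, k) matrix of point-to-centroid distances
        d = np.linalg.norm(X[:, None, :] - mu[None, :, :], axis=2)
        y = d.argmin(axis=1)
        # update step: move each centroid to the mean of its points
        new_mu = np.array([X[y == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_mu, mu):   # converged: centroids stopped moving
            break
        mu = new_mu
    return mu, y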
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# data generation: three Gaussian blobs with identity covariance, 100 points each
G0 = np.random.multivariate_normal([1, 1], np.eye(2), 100)
G1 = np.random.multivariate_normal([3, 5], np.eye(2), 100)
G2 = np.random.multivariate_normal([9, 9], np.eye(2), 100)
X = np.vstack([G0, G1, G2])
print(X.shape)
plt.figure(figsize = (10, 8))
plt.plot(X[:,0], X[:,1], 'b.')
plt.axis('equal')
plt.show()
# The number of clusters and data
k = 3
m = X.shape[0]
# randomly choose k distinct data points as the initial centroids
mu = X[np.random.choice(m, k, replace=False), :]
pre_mu = mu.copy()
print(mu)
plt.figure(figsize = (10, 8))
plt.plot(X[:,0], X[:,1], 'b.')
plt.plot(mu[:,0], mu[:,1], 'ko')
plt.axis('equal')
plt.show()
y = np.empty([m,1])   # y[i] will hold the cluster index assigned to X[i]
# Run K-means
for n_iter in range(500):
    # assignment step: label each point with the index of its nearest centroid
    for i in range(m):
        d0 = np.linalg.norm(X[i,:] - mu[0,:], 2)
        d1 = np.linalg.norm(X[i,:] - mu[1,:], 2)
        d2 = np.linalg.norm(X[i,:] - mu[2,:], 2)
        y[i] = np.argmin([d0, d1, d2])

    # update step: move each centroid to the mean of its assigned points
    # (assumes no cluster goes empty)
    err = 0
    for i in range(k):
        mu[i,:] = np.mean(X[np.where(y == i)[0]], axis = 0)
        err += np.linalg.norm(pre_mu[i,:] - mu[i,:], 2)

    pre_mu = mu.copy()

    # stop once the centroids no longer move
    if err < 1e-10:
        print("Iteration:", n_iter)
        break
X0 = X[np.where(y==0)[0]]
X1 = X[np.where(y==1)[0]]
X2 = X[np.where(y==2)[0]]
plt.figure(figsize = (10, 8))
plt.plot(X0[:,0], X0[:,1], 'b.', label = 'C0')
plt.plot(X1[:,0], X1[:,1], 'g.', label = 'C1')
plt.plot(X2[:,0], X2[:,1], 'r.', label = 'C2')
plt.axis('equal')
plt.legend(fontsize = 12)
plt.show()
# use kmeans from the scikit-learn module
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3, random_state = 0, n_init = 10)   # n_init pinned for consistent behavior across scikit-learn versions
kmeans.fit(X)
plt.figure(figsize = (10,8))
plt.plot(X[kmeans.labels_ == 0,0],X[kmeans.labels_ == 0,1], 'b.', label = 'C0')
plt.plot(X[kmeans.labels_ == 1,0],X[kmeans.labels_ == 1,1], 'g.', label = 'C1')
plt.plot(X[kmeans.labels_ == 2,0],X[kmeans.labels_ == 2,1], 'r.', label = 'C2')
plt.axis('equal')
plt.legend(fontsize = 12)
plt.show()
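The fitted model also exposes the learned centroids and can label new points by nearest-centroid assignment; a short usage sketch (the query points are made up):

print(kmeans.cluster_centers_)            # (3, 2) array of learned centroids
print(kmeans.predict([[1, 1], [9, 9]]))   # cluster indices for new points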
How to choose $k$? Idea: pick the value of $k$ beyond which adding another cluster does not give much better modeling of the data
One way to select $k$ for the K-means algorithm is to try several values of $k$, plot the K-means objective $J$ (the within-cluster sum of squares) versus $k$, and look for the 'elbow point' in the plot
# data generation
G0 = np.random.multivariate_normal([1, 1], np.eye(2), 100)
G1 = np.random.multivariate_normal([3, 5], np.eye(2), 100)
G2 = np.random.multivariate_normal([9, 9], np.eye(2), 100)
X = np.vstack([G0, G1, G2])
cost = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, random_state = 0, n_init = 10).fit(X)
    cost.append(kmeans.inertia_)   # within-cluster sum of squares J (= abs of kmeans.score(X))
plt.figure(figsize = (10,8))
plt.stem(range(1,11), cost)
plt.xticks(np.arange(11))
plt.xlim([0.5, 10.5])
plt.xlabel('number of clusters k')
plt.ylabel('K-means objective J')
plt.grid(alpha = 0.3)
plt.show()
%%html
<center><iframe src="https://www.youtube.com/embed/gd8IhnWqjSo?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/t-ectMGYe-4?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/nBfPsNg6F_k?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/8f4-TD8-rPA?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/tk9LGtOYVL8?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')