Using Scikit-Learn



By Prof. Seungchul Lee
http://iai.postech.ac.kr/
Industrial AI Lab at POSTECH

Table of Contents

1. Linear Regression
2. Nonlinear Regression
3. Support Vector Machine
4. Logistic Regression
5. K-means
6. PCA
7. LDA

1. Linear Regression

1.1 Data Generation

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


# data points in column vector [input, output]
x = np.array([0.1, 0.4, 0.7, 1.2, 1.3, 1.7, 2.2, 2.8, 3.0, 4.0, 4.3, 4.4, 4.9]).reshape(-1, 1)
y = np.array([0.5, 0.9, 1.1, 1.5, 1.5, 2.0, 2.2, 2.8, 2.7, 3.0, 3.5, 3.7, 3.9]).reshape(-1, 1)

# to plot
plt.figure(figsize=(10, 6))
plt.title('Linear Regression', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.plot(x, y, 'ko', label="data")
plt.xlim([0, 5])
plt.grid(alpha=0.3)
plt.axis('scaled')
plt.show()

1.2 Solve with sklearn

In [2]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(x,y)
Out[2]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [3]:
print(reg.coef_)       # Coef
print(reg.intercept_)  # Bias
[[ 0.67129519]]
[ 0.65306531]
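As a quick sanity check, the same slope and intercept can be recovered with NumPy's closed-form least-squares solver. This is a minimal sketch (not part of the original cells) that appends a column of ones to the input so the intercept is estimated as well:

# sanity check: closed-form least squares on the design matrix [x, 1]
A = np.hstack([x, np.ones_like(x)])
theta, _, _, _ = np.linalg.lstsq(A, y, rcond=None)
print(theta)    # [slope, intercept] -- should match reg.coef_ and reg.intercept_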
In [4]:
# to plot
plt.figure(figsize=(10, 6))
plt.title('Linear Regression', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.plot(x, y, 'ko', label="data")

# to plot a straight line (fitted line)
xp = np.arange(0, 5, 0.01).reshape(-1, 1)
yp = reg.coef_*xp + reg.intercept_

plt.plot(xp, yp, 'r', linewidth=2, label="$L_2$")
plt.legend(fontsize=15)
plt.axis('scaled')
plt.grid(alpha=0.3)
plt.xlim([0, 5])
plt.show()
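The fitted line above is computed manually from reg.coef_ and reg.intercept_; the same values can be obtained directly from the model with predict, which is usually more convenient:

yp = reg.predict(xp)    # equivalent to reg.coef_*xp + reg.intercept_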

2. Nonlinear Regression

2.1 Data Generation

In [5]:
n = 100            
x = -5 + 15*np.random.rand(n, 1)
noise = 10*np.random.randn(n, 1)
y = 10 + 1*x + 2*x**2 + noise

plt.figure(figsize=(10, 6))
plt.title('Nonlinear Regression', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.plot(x, y, 'o', markersize=4, label='actual')
plt.xlim([np.min(x), np.max(x)])
plt.grid(alpha=0.3)
plt.legend(fontsize=15)
plt.show()

2.2 Solve with sklearn

In [6]:
from sklearn.kernel_ridge import KernelRidge

reg = KernelRidge(kernel='rbf', gamma=0.1)
reg.fit(x, y)
Out[6]:
KernelRidge(alpha=1, coef0=1, degree=3, gamma=0.1, kernel='rbf',
      kernel_params=None)
In [7]:
p = reg.predict(x)
In [8]:
plt.figure(figsize=(10, 6))
plt.title('Nonlinear Regression', fontsize=15)
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.plot(x, y, 'o', markersize=4, label='actual')
plt.plot(x, p, 'ro', markersize=4, label='predict')
plt.grid(alpha=0.3)
plt.legend(fontsize=15)
plt.xlim([np.min(x), np.max(x)])
plt.show()
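Because x is unsorted, the predictions above are drawn as individual points. A minimal sketch for drawing the fitted curve as a smooth line is to predict on a dense, sorted grid (the limits below assume x lies roughly in [-5, 10], matching the data generation above):

xp = np.linspace(-5, 10, 200).reshape(-1, 1)    # dense, sorted grid over the input range
yp = reg.predict(xp)

plt.figure(figsize=(10, 6))
plt.plot(x, y, 'o', markersize=4, label='actual')
plt.plot(xp, yp, 'r', linewidth=2, label='fitted curve')
plt.grid(alpha=0.3)
plt.legend(fontsize=15)
plt.show()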

3. Support Vector Machine

3.1 Data generation

In [9]:
x1 = 8*np.random.rand(100, 1)
x2 = 7*np.random.rand(100, 1) - 4

g0 = 0.8*x1 + x2 - 3
g1 = g0 - 1
g2 = g0 + 1

C1 = np.where(g1 >= 0)[0]
C2 = np.where(g2 < 0)[0]

X1 = np.hstack([x1[C1],x2[C1]])
X2 = np.hstack([x1[C2],x2[C2]])
n = X1.shape[0]
m = X2.shape[0]
X = np.vstack([X1, X2])
y = np.vstack([np.zeros([n, 1]), np.ones([m, 1])])

plt.figure(figsize=(10, 6))
plt.plot(x1[C1], x2[C1], 'ro', label='C1')
plt.plot(x1[C2], x2[C2], 'bo', label='C2')
plt.xlabel('$x_1$', fontsize = 20)
plt.ylabel('$x_2$', fontsize = 20)
plt.legend(loc = 4)
plt.xlim([0, 8])
plt.ylim([-4, 3])
plt.show()

3.2 Solve with sklearn

In [10]:
from sklearn.svm import SVC

clf = SVC(kernel='linear')
clf.fit(X, np.ravel(y))
Out[10]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [11]:
print(clf.coef_)
print(clf.intercept_)
[[-0.72805514 -0.88381608]]
[ 2.69436036]
In [12]:
xp = np.linspace(0,8,100).reshape(-1,1)
yp = -clf.coef_[0,0]/clf.coef_[0,1]*xp - clf.intercept_/clf.coef_[0,1]

plt.figure(figsize=(10, 6))
plt.plot(X[0:n, 0], X[0:n, 1], 'ro', label='C1')
plt.plot(X[n:, 0], X[n:, 1], 'bo', label='C2')
plt.plot(xp, yp, '--k', label='SVM')
plt.xlabel('$x_1$', fontsize = 20)
plt.ylabel('$x_2$', fontsize = 20)
plt.legend(loc = 4)
plt.xlim([0, 8])
plt.ylim([-4, 3])
plt.show()
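For a linear kernel, the support vectors and the margin boundaries (where the decision function equals +1 or -1) can also be drawn from the fitted model. A sketch reusing the xp grid and the decision boundary yp from the cell above:

w0, w1 = clf.coef_[0]
b = clf.intercept_[0]
yp_up = -(w0*xp + b - 1)/w1    # margin: w^T x + b = +1
yp_dn = -(w0*xp + b + 1)/w1    # margin: w^T x + b = -1

plt.figure(figsize=(10, 6))
plt.plot(X[0:n, 0], X[0:n, 1], 'ro', label='C1')
plt.plot(X[n:, 0], X[n:, 1], 'bo', label='C2')
plt.plot(xp, yp, '--k', label='SVM')
plt.plot(xp, yp_up, ':k', xp, yp_dn, ':k')
plt.plot(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], 'ks',
         markerfacecolor='none', markersize=12, label='support vectors')
plt.xlabel('$x_1$', fontsize=20)
plt.ylabel('$x_2$', fontsize=20)
plt.legend(loc=4)
plt.xlim([0, 8])
plt.ylim([-4, 3])
plt.show()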

4. Logistic Regression

4.1 Data Generation

In [13]:
m = 500

X0 = np.random.multivariate_normal([0, 0], np.eye(2), m)
X1 = np.random.multivariate_normal([10, 10], np.eye(2), m)

X = np.vstack([X0, X1])
y = np.vstack([np.zeros([m,1]), np.ones([m,1])])

plt.figure(figsize=(10, 6))
plt.plot(X0[:,0], X0[:,1], '.b', label='Class 0')
plt.plot(X1[:,0], X1[:,1], '.k', label='Class 1')

plt.title('Data Classes', fontsize=15)
plt.legend(loc='lower right', fontsize=15)
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15)
plt.xlim([-10,20])
plt.ylim([-4,14])
plt.grid(alpha=0.3)
plt.show()

4.2 Solve with sklearn

In [14]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X, np.ravel(y))
Out[14]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
In [15]:
print(clf.coef_)
print(clf.intercept_)
[[ 0.59553674  0.64840718]]
[-4.89457084]
In [16]:
xp = np.linspace(-10,20,100).reshape(-1,1)
yp = -clf.coef_[0,0]/clf.coef_[0,1]*xp - clf.intercept_/clf.coef_[0,1]

plt.figure(figsize=(10, 6))
plt.plot(X0[:,0], X0[:,1], '.b', label='Class 0')
plt.plot(X1[:,0], X1[:,1], '.k', label='Class 1')
plt.plot(xp, yp, '--k', label='Logistic')
plt.xlim([-10,20])
plt.ylim([-4,14])

plt.title('Data Classes', fontsize=15)
plt.legend(loc='lower right', fontsize=15)
plt.xlabel('X1', fontsize=15)
plt.ylabel('X2', fontsize=15)
plt.grid(alpha=0.3)
plt.show()
In [17]:
pred = clf.predict_proba([[0,6]])
pred
Out[17]:
array([[ 0.73186937,  0.26813063]])
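The probability above is just the logistic (sigmoid) function applied to the linear score w^T x + b, so it can be reproduced by hand; a minimal sketch:

z = clf.decision_function([[0, 6]])    # linear score w^T x + b
print(1/(1 + np.exp(-z)))              # sigmoid of the score = probability of class 1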

5. K-means

5.1 Data generation

In [18]:
m = 200

X0 = np.random.multivariate_normal([-1, 1], np.eye(2), m)
X1 = np.random.multivariate_normal([15, 10], np.eye(2), m)
X2 = np.random.multivariate_normal([0, 6], np.eye(2), m)
X = np.vstack([X0, X1, X2])

plt.figure(figsize=(10, 6))
plt.plot(X[:,0], X[:,1], '.b')

plt.xlim([-10,20])
plt.ylim([-4,14])
plt.grid(alpha=0.3)
plt.show()
In [19]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 3, random_state = 0)
kmeans.fit(X)
Out[19]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)
In [20]:
print(kmeans.labels_)
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
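Rather than inspecting the full label array, the cluster sizes can be summarized in one line; a small sketch:

labels, counts = np.unique(kmeans.labels_, return_counts=True)
print(dict(zip(labels, counts)))    # number of points assigned to each cluster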
In [21]:
plt.figure(figsize=(10,6))

plt.plot(X[kmeans.labels_ == 0,0],X[kmeans.labels_ == 0,1],'g.', label=0)
plt.plot(X[kmeans.labels_ == 1,0],X[kmeans.labels_ == 1,1],'k.', label=1)
plt.plot(X[kmeans.labels_ == 2,0],X[kmeans.labels_ == 2,1],'r.', label=2)

plt.xlim([-10,20])
plt.ylim([-4,14])
plt.grid(alpha=0.3)
plt.legend(loc='lower right', fontsize=15)
plt.show()
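The estimated cluster centers are stored in kmeans.cluster_centers_; a sketch overlaying them on the clustered data:

centers = kmeans.cluster_centers_

plt.figure(figsize=(10, 6))
plt.plot(X[kmeans.labels_ == 0, 0], X[kmeans.labels_ == 0, 1], 'g.', label=0)
plt.plot(X[kmeans.labels_ == 1, 0], X[kmeans.labels_ == 1, 1], 'k.', label=1)
plt.plot(X[kmeans.labels_ == 2, 0], X[kmeans.labels_ == 2, 1], 'r.', label=2)
plt.plot(centers[:, 0], centers[:, 1], 'b*', markersize=15, label='centers')

plt.xlim([-10, 20])
plt.ylim([-4, 14])
plt.grid(alpha=0.3)
plt.legend(loc='lower right', fontsize=15)
plt.show()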

6. PCA

6.1 Data generation

In [22]:
m = 5000
mu = np.array([0, 0])
sigma = np.array([[3, 1.5], 
                  [1.5, 1]])

X = np.random.multivariate_normal(mu, sigma, m)

fig = plt.figure(figsize=(10, 6))
plt.plot(X[:,0], X[:,1], 'k.')
plt.axis('equal')
plt.show()

6.2 Solve with sklearn

In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X)
Out[23]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
In [24]:
plt.figure()
plt.stem(range(1,3),pca.explained_variance_ratio_)

plt.xlim([0.5, 2.5])
plt.ylim([0, 1])
plt.title('Explained Variance Ratio')
plt.show()
In [25]:
principal_axis = pca.components_[0, :]
u1 = principal_axis/(np.linalg.norm(principal_axis)) 
h = u1[1]/u1[0]

xp = np.linspace(-6,6,200)
yp = h*xp    # line through the origin along the first principal direction

plt.figure(figsize=(10,6))
plt.plot(X[:, 0], X[:, 1],'k.')
plt.plot(xp, yp, 'r.')
plt.axis('equal')
plt.show()
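PCA can also be used for dimensionality reduction: transform projects the data onto the leading principal component(s), and inverse_transform maps the scores back to the original space. A minimal sketch with one component:

pca1 = PCA(n_components=1)
Z = pca1.fit_transform(X)             # 1-D scores along the first principal component
X_rec = pca1.inverse_transform(Z)     # reconstruction in the original 2-D space

plt.figure(figsize=(10, 6))
plt.plot(X[:, 0], X[:, 1], 'k.', alpha=0.3)
plt.plot(X_rec[:, 0], X_rec[:, 1], 'r.')
plt.axis('equal')
plt.show()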

7. LDA

7.1 Data generation

In [26]:
n0 = 200
n1 = 200

mu = [0, 0]
sigma = [[0.9, -0.4],
         [-0.4, 0.3]]

x0 = np.random.multivariate_normal([2.5,2.5], sigma, n0)        # data in class 0
x1 = np.random.multivariate_normal([1,1], sigma, n1)            # data in class 1

X = np.vstack([x0, x1])
y = np.vstack([np.zeros([n0, 1]), np.ones([n1, 1])])

plt.figure(figsize = (10, 6))
plt.plot(x0[:,0],x0[:,1],'r.')
plt.plot(x1[:,0],x1[:,1],'b.')

plt.axis('equal')
plt.ylim([-2, 6])
plt.xlim([-4, 8])
plt.show()

7.2 Solve with sklearn

In [27]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X, np.ravel(y))
Out[27]:
LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
In [28]:
print(lda.coef_)
print(lda.intercept_)
[[ -9.1242776  -16.85725469]]
[ 45.91203963]
In [29]:
xp = np.linspace(-4,8,100).reshape(-1,1)
yp = -lda.coef_[0,0]/lda.coef_[0,1]*xp - lda.intercept_/lda.coef_[0,1]
projection_line = lda.coef_[0,1]/lda.coef_[0,0]*xp

plt.figure(figsize = (10, 6))
plt.plot(x0[:,0],x0[:,1],'r.')
plt.plot(x1[:,0],x1[:,1],'b.')
plt.plot(xp, yp, '--k', label='Decision Boundary')
plt.plot(xp, projection_line, 'k', label='Projection Line')
plt.axis('equal')
plt.ylim([-2, 6])
plt.xlim([-4, 8])
plt.legend(loc='lower right', fontsize=15)
plt.show()
In [30]:
X_new = lda.transform(X)

plt.figure(figsize = (10, 6))
plt.hist(X_new[:n0], 21, color='r', rwidth=0.5)
plt.hist(X_new[n0:], 21, color='b', rwidth=0.5)
plt.show()
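Since LDA is also a classifier, the quality of the separation can be checked directly; a small sketch reporting the training accuracy and the predicted classes of the two means used to generate the data:

print(lda.score(X, np.ravel(y)))           # training accuracy
print(lda.predict([[2.5, 2.5], [1, 1]]))   # predictions at the two class means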