Ensemble Methods: Bagging and Boosting
Table of Contents
Weighting the error
$\hat{y}$ is the prediction
Simple classification example
import numpy as np
import matplotlib.pyplot as plt
Data generation
%matplotlib inline
## simulate three 2-D Gaussian clusters and show them in one scatter plot
m = 100  # number of samples drawn for each cluster

# Cluster centres.
mu1 = np.array([1, 7])
mu2 = np.array([3, 4])
mu3 = np.array([6, 5])

# Cluster covariances (each a scaled symmetric 2x2 matrix).
SIGMA1 = 0.9 * np.array([[1, 1.5], [1.5, 3]])
SIGMA2 = 0.6 * np.array([[2, 0], [0, 2]])
SIGMA3 = 0.8 * np.array([[1, -1], [-1, 2]])

# Samples are drawn in the order X1, X2, X3 from the global NumPy RNG.
X1 = np.random.multivariate_normal(mean=mu1, cov=SIGMA1, size=m)
X2 = np.random.multivariate_normal(mean=mu2, cov=SIGMA2, size=m)
X3 = np.random.multivariate_normal(mean=mu3, cov=SIGMA3, size=m)

# Class labels 1, 2, 3 stored as (m, 1) float column vectors.
y1, y2, y3 = (k * np.ones((m, 1)) for k in (1, 2, 3))

plt.figure(figsize=(10, 8))
plt.title('Generated Data', fontsize=15)
for pts, tag in ((X1, 'C1'), (X2, 'C2'), (X3, 'C3')):
    plt.plot(pts[:, 0], pts[:, 1], '.', label=tag)
plt.xlabel('$X_1$', fontsize=15)
plt.ylabel('$X_2$', fontsize=15)
plt.legend(fontsize=12)
plt.axis('equal')
plt.grid(alpha=0.3)
# Explicit limits are applied last, after the 'equal' request.
plt.axis([-2, 10, 0, 12])
plt.show()
Single Decision Tree vs Bagging
import sklearn
from sklearn.ensemble import BaggingClassifier
Decision Tree
# Fit a depth-2, entropy-based decision tree on all three clusters, then
# classify every node of a regular grid so the decision regions can be drawn.

# Explicit import: a bare `import sklearn` does not guarantee that the
# `sklearn.tree` submodule has been loaded.
from sklearn.tree import DecisionTreeClassifier

X = np.vstack([X1, X2, X3])
y = np.vstack([y1, y2, y3])  # stacked label column (values 1, 2, 3)

clf = DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0, max_features=1.0)
# Pass 1-D targets, matching how the other classifiers in this file are fitted.
clf.fit(X, np.ravel(y))

# Evaluation grid covering the plotting window with step `res`.
res = 0.5
[X1gr, X2gr] = np.meshgrid(np.arange(-2, 10, res), np.arange(0, 12, res))
# Keep the grid as a plain ndarray: np.matrix inputs are rejected by recent
# scikit-learn releases (and np.matrix itself is discouraged by NumPy).
Xp = np.hstack([X1gr.reshape(-1, 1), X2gr.reshape(-1, 1)])

# Predicted label for each grid point, as a column so that np.where(...)[0]
# yields grid-row indices.
q = clf.predict(Xp).reshape(-1, 1)

# Row indices of the grid points assigned to each class.
C1 = np.where(q == 1)[0]
C2 = np.where(q == 2)[0]
C3 = np.where(q == 3)[0]
Bagging
# Bagging ensemble: 50 estimators, each trained on a bootstrap sample the same
# size as the training set (max_samples=1.0, bootstrap=True).
# The former `base_estimator=None` argument is omitted: it only restated the
# default, and that keyword was renamed to `estimator` and later removed from
# scikit-learn, so passing it breaks on current releases.
bclf = BaggingClassifier(n_estimators=50, max_samples=1.0, bootstrap=True)
bclf.fit(X, np.ravel(y))

# Evaluation grid as a plain ndarray (np.matrix inputs are rejected by recent
# scikit-learn releases).
Xp1 = np.hstack([X1gr.reshape(-1, 1), X2gr.reshape(-1, 1)])

# Predicted label per grid point, kept as a column so np.where(...)[0] yields
# grid-row indices.
q1 = bclf.predict(Xp1).reshape(-1, 1)

# Row indices of the grid points assigned to each class.
C11 = np.where(q1 == 1)[0]
C21 = np.where(q1 == 2)[0]
C31 = np.where(q1 == 3)[0]
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Gradient boosting: 300 depth-6 trees with a small learning rate, each fitted
# on the full training set (subsample=1.0).
GB = GradientBoostingClassifier(learning_rate=0.05, n_estimators=300, subsample=1.0, max_depth=6)
GB.fit(X, np.ravel(y))

# Evaluation grid as a plain ndarray (np.matrix inputs are rejected by recent
# scikit-learn releases).
Xp2 = np.hstack([X1gr.reshape(-1, 1), X2gr.reshape(-1, 1)])

# Predicted label per grid point, kept as a column so np.where(...)[0] yields
# grid-row indices.
q2 = GB.predict(Xp2).reshape(-1, 1)

# Row indices of the grid points assigned to each class.
C12 = np.where(q2 == 1)[0]
C22 = np.where(q2 == 2)[0]
C32 = np.where(q2 == 3)[0]
Plot results
# Side-by-side decision regions of the single tree, the bagging ensemble and
# the gradient-boosting model. Each panel shows the training points plus
# translucent squares on the grid nodes, coloured by predicted class.
# The three panels differ only in title, grid and index sets, so they are
# described as data and drawn by one loop instead of three copied stanzas.
clusters = ((X1, 'C1'), (X2, 'C2'), (X3, 'C3'))
region_colors = ('blue', 'orange', 'green')
panels = (
    ('Single Decision Tree', Xp, (C1, C2, C3)),
    ('Bagging (Low variance)', Xp1, (C11, C21, C31)),
    ('Boosting (Low bias)', Xp2, (C12, C22, C32)),
)

plt.figure(figsize=(16, 5))
for col, (title, grid_pts, class_rows) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), col)

    # Training samples.
    for pts, tag in clusters:
        plt.plot(pts[:, 0], pts[:, 1], '.', label=tag)

    # Predicted regions. np.asarray makes the indexing work whether the grid
    # is an ndarray or an np.matrix.
    grid_pts = np.asarray(grid_pts)
    for rows, color in zip(class_rows, region_colors):
        plt.plot(grid_pts[rows, 0], grid_pts[rows, 1], 's',
                 color=color, markersize=8, alpha=0.1)

    plt.title(title, fontsize=20)
    # Axis labels now match the '$X_1$' / '$X_2$' used for the same axes in
    # the generated-data figure (previously '$X11$' / '$X12$').
    plt.xlabel('$X_1$', fontsize=15)
    plt.ylabel('$X_2$', fontsize=15)
    plt.legend(fontsize=12, loc=1)
    plt.axis('equal')
    plt.grid(alpha=0.3)
    # Explicit limits are applied last, after the 'equal' request.
    plt.axis([-2, 10, 0, 12])
plt.show()
# ! pip install xgboost
import xgboost
# XGBoost classifier. `reg_lambda` is the weight of the L2 regularization
# term (see the xgboost parameter documentation).
# `scoring` is not an XGBClassifier parameter (it belongs to scikit-learn's
# model-selection utilities), so it is no longer passed.
XGB = xgboost.XGBClassifier(n_estimators=300, learning_rate=0.2, max_depth=5, reg_lambda=0.05)

# Current xgboost releases require class labels 0..K-1, but the labels here
# are 1, 2, 3. Encode them as indices into `xgb_classes` for training and map
# predictions back afterwards.
xgb_classes, y_idx = np.unique(np.ravel(y), return_inverse=True)
XGB.fit(X, y_idx)

# Evaluation grid as a plain ndarray (np.matrix is discouraged by NumPy).
Xp3 = np.hstack([X1gr.reshape(-1, 1), X2gr.reshape(-1, 1)])

# Decode predicted class indices back to the original labels; keep a column so
# np.where(...)[0] yields grid-row indices.
q3 = xgb_classes[XGB.predict(Xp3).astype(int)].reshape(-1, 1)

# Row indices of the grid points assigned to each class.
C13 = np.where(q3 == 1)[0]
C23 = np.where(q3 == 2)[0]
C33 = np.where(q3 == 3)[0]
# Side-by-side decision regions of the gradient-boosting model (left) and the
# XGBoost model (right). Each panel shows the training points plus translucent
# squares on the grid nodes, coloured by predicted class.
# Both panels share one drawing loop instead of two copied stanzas.
clusters = ((X1, 'C1'), (X2, 'C2'), (X3, 'C3'))
region_colors = ('blue', 'orange', 'green')
panels = (
    ('Boosting (Overfitting)', Xp2, (C12, C22, C32)),
    ('XGBoosting (Reduce overfitting)', Xp3, (C13, C23, C33)),
)

plt.figure(figsize=(16, 5))
for col, (title, grid_pts, class_rows) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), col)

    # Training samples.
    for pts, tag in clusters:
        plt.plot(pts[:, 0], pts[:, 1], '.', label=tag)

    # Predicted regions. np.asarray makes the indexing work whether the grid
    # is an ndarray or an np.matrix.
    grid_pts = np.asarray(grid_pts)
    for rows, color in zip(class_rows, region_colors):
        plt.plot(grid_pts[rows, 0], grid_pts[rows, 1], 's',
                 color=color, markersize=8, alpha=0.1)

    plt.title(title, fontsize=20)
    # Axis labels now match the '$X_1$' / '$X_2$' used for the same axes in
    # the generated-data figure (previously '$X11$' / '$X12$').
    plt.xlabel('$X_1$', fontsize=15)
    plt.ylabel('$X_2$', fontsize=15)
    plt.legend(fontsize=12, loc=1)
    plt.axis('equal')
    plt.grid(alpha=0.3)
    # Explicit limits are applied last, after the 'equal' request.
    plt.axis([-2, 10, 0, 12])
plt.show()
%%html
<center><iframe src="https://www.youtube.com/embed/9R7tee22XLA?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/eOgSBLIAKLY?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/3LCnnFoduAo?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/7AsL5vaOIjg?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%html
<center><iframe src="https://www.youtube.com/embed/7cS7pVIb878?rel=0"
width="420" height="315" frameborder="0" allowfullscreen></iframe></center>
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')