# Mount Google Drive so the Excel datasets under /content/drive are readable
# (Colab-only; prompts for an authorization code on first run).
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import random  # NOTE(review): appears unused in this file — confirm before removing
Regression analysis is the process of finding a function $f(x)$ whose output $\hat{y}$ is as close as possible to the dependent variable $y$ corresponding to the independent variable $x$.
$$ \hat{y} = f(x) \approx y $$
If $f(x)$ is a linear function, then this function is called a linear regression model.
$$ \hat{y} = \omega_0 + \omega_1 x_1 + \omega_2 x_2 + \dots + \omega_D x_D = \omega_0 + \omega^T x $$
In the above equation, the independent variable $x = (x_1, x_2, \dots, x_D)$ is a $D$-dimensional vector. The weight vector $\omega = (\omega_1, \dots, \omega_D)$, together with the bias $\omega_0$, gives the coefficients of the function $f(x)$ and constitutes the parameters of this linear regression model.
A network of multiple neurons acts as a complex, nonlinear universal function approximator.
# Load the training set from Google Drive; the first spreadsheet column is
# the row index.
train_dataset = pd.read_excel(
    '/content/drive/MyDrive/tutorials/산학협동강좌/data/train_dataset.xlsx',
    index_col=0,
    engine='openpyxl',
)
# If the files were copied locally (e.g. from USB), use instead:
# train_dataset = pd.read_excel('train_dataset.xlsx', index_col=0, engine='openpyxl')
train_dataset
train_dataset.describe()
# Split into features (all columns but the target) and the target column.
train_x = train_dataset.drop(columns='Weight (g)')
train_y = train_dataset['Weight (g)']
# Load the held-out test set from Google Drive; the first spreadsheet column
# is the row index.
test_dataset = pd.read_excel(
    '/content/drive/MyDrive/tutorials/산학협동강좌/data/test_dataset.xlsx',
    index_col=0,
    engine='openpyxl',
)
# If the files were copied locally (e.g. from USB), use instead:
# test_dataset = pd.read_excel('test_dataset.xlsx', index_col=0, engine='openpyxl')
test_x = test_dataset.drop(columns='Weight (g)')
test_y = test_dataset['Weight (g)']
# Sanity-check the shapes of all four splits.
print(f'train_x: {train_x.shape}, train_y: {train_y.shape}')
print(f'test_x: {test_x.shape}, test_y: {test_y.shape}')
# Ordinary least-squares baseline: fit on the training split, then predict
# the target for the test split.
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(train_x, train_y)
print(f'Regression Coefficient: \n{reg.coef_}\n')
print(f'Regression Bias: \n{reg.intercept_}')
pred_lr = reg.predict(test_x)
print(pred_lr)
# Overlay the model's predictions and the ground truth on the same axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred_lr, 'ro--', label='Prediction')
ax.plot(np.array(test_y), 'bo--', label='Ground Truth')
ax.legend(fontsize=13)
ax.set_ylabel('Weight (g)', fontsize=13)
ax.set_title('Linear Regression', fontsize=13)
ax.set_ylim([21, 31])
ax.tick_params(axis='both', labelsize=12)
plt.show()
`n_estimators`: how many trees to include in the ensemble?
from sklearn.ensemble import RandomForestRegressor
# Random-forest baseline. n_estimators is the number of trees in the
# ensemble; max_depth=None lets each tree grow until its leaves are pure
# (the original max_depth=10000 was an effectively-unreachable bound that
# obscured this intent). random_state fixes bootstrap/feature sampling so
# the run is reproducible.
reg = RandomForestRegressor(n_estimators=100,
                            max_depth=None,
                            random_state=42)
reg.fit(train_x, train_y)
pred_rf = reg.predict(test_x)
# Overlay the random-forest predictions and the ground truth.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred_rf, 'ro--', label='Prediction')
ax.plot(np.array(test_y), 'bo--', label='Ground Truth')
ax.legend(fontsize=13)
ax.set_ylabel('Weight (g)', fontsize=13)
ax.set_title('Random Forest', fontsize=13)
ax.set_ylim([21, 31])
ax.tick_params(axis='both', labelsize=12)
plt.show()
# Fix TensorFlow's RNG so weight initialisation is reproducible.
tf.random.set_seed(42)

# Fully-connected regression network: five hidden ReLU layers of 10 units
# each, followed by one linear output unit.
hidden_layers = [tf.keras.layers.Dense(units=10, activation='relu')
                 for _ in range(5)]
model = tf.keras.models.Sequential(
    [tf.keras.layers.Input(shape=(train_x.shape[1],))]
    + hidden_layers
    + [tf.keras.layers.Dense(units=1, activation=None)]
)
model.summary()

# Mean-squared-error objective with the Adam optimiser.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse')
# fit() actually returns a History object; kept under the original name.
loss = model.fit(train_x, train_y, epochs=50, verbose=0)
pred_dnn = model.predict(test_x)
# Overlay the network's predictions and the ground truth.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred_dnn, 'ro--', label='Prediction')
ax.plot(np.array(test_y), 'bo--', label='Ground Truth')
ax.legend(fontsize=13, loc=1)
ax.set_ylabel('Weight (g)', fontsize=13)
ax.set_title('Deep Neural Network', fontsize=13)
ax.set_ylim([21, 31])
ax.tick_params(axis='both', labelsize=12)
plt.show()
Unlike toy examples (e.g., the MNIST dataset), applying artificial intelligence to industrial applications raises a variety of issues; the curse of dimensionality is one of them. Because several physical phenomena overlap in industrial data, it is difficult to tell from domain knowledge alone which features matter for the predicted values.
To ensure statistical stability, the number of features should be kept commensurate with the number of data points. We therefore use correlation analysis to select important features for data-driven prediction.
# Build an upper-triangle mask: True above (and on) the diagonal = hidden,
# False below = shown, so each correlation appears only once.
corr = train_dataset.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(16, 16))
sns.heatmap(corr,
            cmap='RdYlBu_r',
            annot=True,               # print the correlation value in each cell
            mask=mask,                # hide the redundant (upper) triangle
            linewidths=1,             # thin separating lines between cells
            cbar_kws={"shrink": .5},  # half-size colour bar
            vmin=-1, vmax=1           # colour bar spans the full range -1..1
            )
plt.show()
# Correlation of every column with the target 'Weight (g)'.
weight_corr = train_dataset.corr()['Weight (g)']
# Displayed as notebook cell output in descending order; the sorted result
# itself is not stored.
weight_corr.sort_values(ascending = False)
# Candidate columns to drop: those only weakly correlated with the target
# (|r| < 0.8).
del_features = weight_corr[weight_corr.abs() < 0.8]
del_features.keys()
# NOTE(review): only the weak columns from position 5 onward are dropped,
# i.e. the first five weakly-correlated columns are kept — presumably to
# retain a fixed feature budget; confirm this slice is intended.
train_x_fs = train_x.drop(list(del_features.keys()[5:]), axis = 1)
test_x_fs = test_x.drop(list(del_features.keys()[5:]), axis = 1)
print('train_x_fs: {}, train_y: {}'.format(train_x_fs.shape, train_y.shape))
print('test_x_fs: {}, test_y: {}'.format(test_x_fs.shape, test_y.shape))
# Refit the linear baseline on the reduced (feature-selected) inputs and
# plot its predictions against the ground truth.
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(train_x_fs, train_y)
print(f'Regression Coefficient: \n{reg.coef_}\n')
print(f'Regression Bias: \n{reg.intercept_}')
pred_s_lr = reg.predict(test_x_fs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred_s_lr, 'ro--', label='Prediction')
ax.plot(np.array(test_y), 'bo--', label='Ground Truth')
ax.legend(fontsize=13)
ax.set_ylabel('Weight (g)', fontsize=13)
ax.set_title('Linear Regression with FS', fontsize=13)
ax.set_ylim([21, 31])
ax.tick_params(axis='both', labelsize=12)
plt.show()
# Random forest on the reduced (feature-selected) inputs. max_depth=None
# lets each tree grow until its leaves are pure (the original
# max_depth=10000 was an effectively-unreachable bound with the same
# effect); random_state=42 keeps the run reproducible.
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(n_estimators=100,
                            max_depth=None,
                            random_state=42).fit(train_x_fs, train_y)
pred_s_rf = reg.predict(test_x_fs)

# Plot predictions against the ground truth.
plt.figure(figsize=(8, 6))
plt.plot(pred_s_rf, 'ro--', label='Prediction')
plt.plot(np.array(test_y), 'bo--', label='Ground Truth')
plt.legend(fontsize=13)
plt.ylabel('Weight (g)', fontsize=13)
plt.title('Random Forest with FS', fontsize=13)
plt.ylim([21, 31])
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
# Reproducible initialisation, then the same 5x10-unit ReLU network as
# before, trained on the reduced (feature-selected) inputs.
tf.random.set_seed(42)

hidden_layers = [tf.keras.layers.Dense(units=10, activation='relu')
                 for _ in range(5)]
model = tf.keras.models.Sequential(
    [tf.keras.layers.Input(shape=(train_x_fs.shape[1],))]
    + hidden_layers
    + [tf.keras.layers.Dense(units=1, activation=None)]
)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mse')
# fit() actually returns a History object; kept under the original name.
loss = model.fit(train_x_fs, train_y, epochs=50, verbose=0)
pred_s_dnn = model.predict(test_x_fs)

# Plot predictions against the ground truth.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pred_s_dnn, 'ro--', label='Prediction')
ax.plot(np.array(test_y), 'bo--', label='Ground Truth')
ax.legend(fontsize=13)
ax.set_ylabel('Weight (g)', fontsize=13)
ax.set_title('Deep Neural Network with FS', fontsize=13)
ax.set_ylim([21, 31])
ax.tick_params(axis='both', labelsize=12)
plt.show()