Table of Contents
Why XAI?
Model-Specific XAI
Model-Agnostic XAI
!pip install shap lime
from google.colab import drive
drive.mount('/content/drive')
Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
Define Input Data
df = pd.read_csv("/content/drive/MyDrive/kstp/data_files/datafile.csv")
# Min-max normalization: rescale every column onto [0, 1]
normalized_df = (df - df.min())/(df.max() - df.min())
df = normalized_df
df
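Min-max scaling should map every column onto [0, 1]; a one-line sanity check (a minimal sketch) confirms the rescaling before any modelling:
# Sanity check: after min-max normalization every column spans [0, 1]
print(df.min().min(), df.max().max())  # expect 0.0 and 1.0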
# Target: Weight; every remaining column is used as a descriptor
y = df['Weight'].values
X = df.drop(["Weight"], axis = 1)
print("There are {} possible descriptors:".format(len(X.columns)))
print(X.columns)
Split Train-Test Set and Train Random Forest Regressor
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)
rf = RandomForestRegressor(random_state = 1)
rf.fit(X_train, y_train)
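Before turning to the XAI methods, the forest's own impurity-based importances give a useful baseline to compare against the SHAP rankings below (a minimal sketch using only objects defined above):
# Impurity-based feature importances built into the trained forest
importances = pd.Series(rf.feature_importances_, index = X.columns)
print(importances.sort_values(ascending = False))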
Evaluate the Random Forest Regressor on the Test Set
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('RMSE = {:.3f} '.format(np.sqrt(mse)))
r2 = r2_score(y_test, y_pred)
print('R2 = {:.3f} '.format(r2))
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], "r--", lw=4)
ax.set_xlabel("Measured")
ax.set_ylabel("Predicted")
plt.show()
SHAP Implementation
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)
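SHAP values satisfy local accuracy: the explainer's base value plus the sum of a sample's per-feature SHAP values reproduces the model's prediction for that sample. A quick check on the first test sample (a minimal sketch; agreement should hold up to numerical precision):
# Local accuracy check: base value + sum(SHAP values) == model prediction
base_value = np.ravel(explainer.expected_value)[0]
print("base + sum(SHAP) :", base_value + shap_values[0].sum())
print("rf.predict       :", rf.predict(X_test.iloc[[0]])[0])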
SHAP Values of a Single Test Sample (Local Feature Importance)
shap.bar_plot(shap_values[0], features = X_test.iloc[0, :], feature_names = X.columns)
Mean Absolute SHAP Values over the Entire Test Set (Global Feature Importance)
shap.summary_plot(shap_values, X_test, plot_type = "bar")
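Dropping plot_type gives the default beeswarm view, which adds the direction of each feature's effect to the importance ranking:
# Beeswarm variant: per-sample SHAP values, colored by feature value
shap.summary_plot(shap_values, X_test)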
LIME Implementation
from lime import lime_tabular
# LIME's tabular explainer expects NumPy arrays rather than DataFrames
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
explainer = lime_tabular.LimeTabularExplainer(X_train,
mode = "regression",
feature_names = X.columns)
explainer
LIME Result
explanation = explainer.explain_instance(X_test[0], rf.predict, num_features = len(X.columns))
explanation
explanation.show_in_notebook()
with plt.style.context("ggplot"):
    explanation.as_pyplot_figure()
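The same explanation can also be read programmatically; as_list() returns the (feature rule, weight) pairs fitted by LIME's local surrogate model, which is handy for logging or downstream analysis:
# (feature rule, weight) pairs from LIME's local linear surrogate
for rule, weight in explanation.as_list():
    print("{:<30s} {:+.4f}".format(rule, weight))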
Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
Define Input Data
df = pd.read_csv("/content/drive/MyDrive/kstp/data_files/datafile.csv")
normalized_df = (df - df.min())/(df.max() - df.min())
df = normalized_df
df
y = df['Weight'].values
X = df.drop(["Weight"], axis = 1)
print("There are {} possible descriptors:".format(len(X.columns)))
print(X.columns)
Split Train-Test Set
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.30, random_state = 10)
X_train.shape
Define ANN Model
model = tf.keras.models.Sequential([
tf.keras.Input(shape = (X_train.shape[1],)),
tf.keras.layers.Dense(1024, activation = 'relu'),
tf.keras.layers.Dense(128, activation = 'relu'),
tf.keras.layers.Dense(1)
])
model.summary()
model.compile(loss = tf.keras.losses.MeanSquaredError(),
optimizer = tf.keras.optimizers.Adam())
Train ANN Model
history = model.fit(X_train,
Y_train,
epochs = 150,
batch_size = 64,
validation_split = 0.1)
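fit returns a History object (captured as history above) whose per-epoch losses make it easy to spot over- or under-fitting; plotting training against validation loss is a quick diagnostic:
# Learning curves from the History object returned by fit
plt.plot(history.history['loss'], label = 'training loss')
plt.plot(history.history['val_loss'], label = 'validation loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
plt.show()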
Evaluate ANN Model
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
print('RMSE = {:.3f} '.format(np.sqrt(mse)))
r2 = r2_score(Y_test, y_pred)
print('R2 = {:.3f} '.format(r2))
fig, ax = plt.subplots(figsize = (10, 10))
ax.scatter(Y_test, y_pred, edgecolors = (0, 0, 0))
ax.plot([y.min(), y.max()], [y.min(), y.max()], "r--", lw = 4)
ax.set_xlabel("Measured")
ax.set_ylabel("Predicted")
plt.show()
SHAP Implementation
import shap
# SHAP's DeepExplainer expects NumPy arrays
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
# The full training set serves as the background distribution here; a
# subsample (e.g., X_train[:100]) is considerably faster on larger data
explainer_shap = shap.DeepExplainer(model = model, data = X_train)
shap_values = explainer_shap.shap_values(X_test)
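DeepExplainer hooks into framework internals and can fail on newer TensorFlow releases; if the call above errors out, shap.GradientExplainer is a commonly used substitute (it estimates expected gradients, so the values will differ slightly from DeepExplainer's):
# Fallback for TF versions where DeepExplainer is incompatible
# explainer_shap = shap.GradientExplainer(model, X_train)
# shap_values = explainer_shap.shap_values(X_test)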
SHAP Values of a Single Test Sample (Local Feature Importance)
# For a single-output regressor, shap_values is a one-element list;
# [0][1] selects the SHAP values of test sample 1
shap.bar_plot(shap_values[0][1], features = X_test[1], feature_names = X.columns)
Mean Absolute SHAP Values over the Entire Test Set (Global Feature Importance)
shap.summary_plot(shap_values, X_test, feature_names = X.columns)
LIME Implementation
from lime import lime_tabular
explainer = lime_tabular.LimeTabularExplainer(X_train,
mode = "regression",
feature_names = X.columns)
explainer
print("Prediction : ", model.predict(X_test[1].reshape(1,-1)))
print("Actual : ", [[Y_test[1]]])
# Flatten Keras's (n, 1) output: LIME's regression mode expects 1-D predictions
predict_fn = lambda x: model.predict(x, verbose = 0).flatten()
explanation = explainer.explain_instance(X_test[1], predict_fn, num_features = len(X.columns))
explanation
LIME Result
explanation.show_in_notebook()
with plt.style.context("ggplot"):
    explanation.as_pyplot_figure()
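Besides the inline widgets, the explanation can be exported as a standalone HTML page for sharing outside Colab (the file name below is just an example):
# Export the interactive explanation as standalone HTML (example path)
explanation.save_to_file("lime_explanation.html")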
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')