"""
This module contains functions to assess feature importance.
"""
import sys
import numpy as np
import pyprind
from sklearn import clone
import pandas as pd
from sklearn.metrics import mean_squared_error
# Names exported by a star-import of this module.
__all__ = ['drop_col_feat_imp']
def drop_col_feat_imp(model, X_train, y_train, X_test, y_test, random_state=43):
    """
    Compute drop-column feature importances for ``model``.

    A clone of ``model`` is fitted on the full training data to obtain a
    benchmark sum of squared errors on the test data. Then, for every column
    of ``X_train``, another clone is fitted with that column removed from both
    ``X_train`` and ``X_test``. The importance of a column is the error of the
    reduced model minus the benchmark error, so a larger positive value means
    that removing the column degraded the test predictions more.

    Progress is reported on stdout (a header line plus a ``pyprind`` bar).

    Adapted from https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e.

    Args:
        model: Estimator that can be passed to ``sklearn.clone`` and whose
            clones provide ``fit`` and ``predict``. Only clones are fitted;
            ``fit`` is never called on the passed instance.
        X_train: Training features. Must provide ``columns``, ``shape`` and
            ``drop(columns=...)`` (e.g. a pandas DataFrame).
        y_train: Training targets, passed unchanged to ``fit``.
        X_test: Test features with the same columns as ``X_train``.
        y_test: Test targets the predictions are compared against.
        random_state: Value assigned to the ``random_state`` attribute of
            every clone so that all fits are comparable.

    Returns:
        pandas.Series indexed by ``X_train.columns`` holding, per column,
        ``error_without_column - benchmark_error``.
    """
    # Reference error of the model trained on all columns.
    benchmark_error = _fit_and_sse(
        model, X_train, y_train, X_test, y_test, random_state
    )

    print('Computing feature importances...')
    # One progress tick per dropped column.
    bar = pyprind.ProgBar(X_train.shape[1], stream=sys.stdout)

    importances = []
    for col in X_train.columns:
        drop_col_error = _fit_and_sse(
            model,
            X_train.drop(columns=col),
            y_train,
            X_test.drop(columns=col),
            y_test,
            random_state,
        )
        # Importance = increase in error caused by removing the column.
        importances.append(drop_col_error - benchmark_error)
        bar.update()

    return pd.Series(data=importances, index=X_train.columns)


def _fit_and_sse(model, X_train, y_train, X_test, y_test, random_state):
    """Fit a seeded clone of ``model`` and return its sum of squared errors on the test data."""
    # Clone so every fit starts from the same specification as ``model``.
    model_clone = clone(model)
    # Set random_state for comparability between fits.
    model_clone.random_state = random_state
    model_clone.fit(X_train, y_train)
    return sum_squared_error(model_clone.predict(X_test), y_test)
def sum_squared_error(y_pred, y_true):
    """
    Return the sum of squared differences between predictions and targets.

    Args:
        y_pred: Predicted values (anything ``numpy.asarray`` accepts).
        y_true: Reference values (anything ``numpy.asarray`` accepts).

    Returns:
        A scalar equal to ``sum((y_pred - y_true) ** 2)`` over all elements.
    """
    pred = np.asarray(y_pred)
    true = np.asarray(y_true)
    if pred.shape != true.shape and pred.size == true.size:
        # Same number of elements but different layout, e.g. (n, 1) vs (n,):
        # pair the values element-wise instead of broadcasting to (n, n).
        pred = pred.reshape(true.shape)
    # Flatten so multi-dimensional residuals still reduce to one scalar;
    # np.inner on 2-D input would return a matrix instead.
    residual = (pred - true).ravel()
    return np.inner(residual, residual)