Source code for nnsa.training.feature_importance

"""
This module contains functions to assess feature importance.
"""
import sys
import numpy as np
import pyprind
from sklearn import clone
import pandas as pd
from sklearn.metrics import mean_squared_error

__all__ = [
    'drop_col_feat_imp',
]


def drop_col_feat_imp(model, X_train, y_train, X_test, y_test, random_state=43):
    """
    Compute drop-column feature importances for a model.

    A clone of `model` is fitted on all columns of `X_train` to obtain a benchmark error on the
    test set. Then, for every column, a new clone is fitted with that column removed. The
    importance of a column is the increase in the sum of squared errors on the test set that
    results from dropping it (error without the column minus the benchmark error), so larger
    values mean the model relies more on that column.

    Adapted from
    https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e.

    Args:
        model: estimator that can be copied with sklearn's `clone` and that implements `fit` and
            `predict`. Only clones are fitted in this function.
        X_train (pd.DataFrame): training features. Must support `.columns` and
            `.drop(columns=...)`.
        y_train: training targets, passed to `fit`.
        X_test (pd.DataFrame): test features, containing the same columns as `X_train`.
        y_test: test targets, compared with the predictions on `X_test`.
        random_state (int): value assigned to the `random_state` attribute of every clone, so
            that all fitted models are comparable.

    Returns:
        importances (pd.Series): drop-column importance per feature, indexed by the columns of
            `X_train`.
    """
    def _test_error(train_features, test_features):
        # Clone the model to have the exact same specification as the one passed in.
        model_clone = clone(model)
        # Fix the random state so the models only differ in the columns they are given.
        model_clone.random_state = random_state
        model_clone.fit(train_features, y_train)
        return sum_squared_error(model_clone.predict(test_features), y_test)

    # Error of the benchmark model that uses all columns.
    benchmark_error = _test_error(X_train, X_test)

    print('Computing feature importances...')

    # Initialize progress bar (one tick per column).
    bar = pyprind.ProgBar(X_train.shape[1], stream=sys.stdout)

    importances = []
    for col in X_train.columns:
        # Refit without this column and measure how much the test error increases.
        drop_col_error = _test_error(X_train.drop(columns=col), X_test.drop(columns=col))
        importances.append(drop_col_error - benchmark_error)
        bar.update()

    return pd.Series(data=importances, index=X_train.columns)
def sum_squared_error(y_pred, y_true):
    """
    Compute the sum of squared errors between predictions and true values.

    Both inputs are flattened before they are subtracted, so the result is always a single
    scalar: same-shape multi-dimensional inputs yield the total over all elements, and an
    (n, 1) array compared with an (n,) array is treated element-wise instead of being
    broadcast to an (n, n) array.

    Args:
        y_pred (array-like): predicted values.
        y_true (array-like): true values, with the same number of elements as `y_pred`
            (or a single value).

    Returns:
        sse (numpy scalar): sum over all elements of (y_pred - y_true)**2.
    """
    # Flatten first: np.inner only reduces to a scalar for 1-D input.
    d = np.ravel(y_pred) - np.ravel(y_true)
    sse = np.inner(d, d)
    return sse