# Source code for nnsa.training.feature_importance

"""
This module contains functions to assess feature importance.
"""
import sys
import numpy as np
import pyprind
from sklearn import clone
import pandas as pd
from sklearn.metrics import mean_squared_error

# Public API of this module: the names exported by a star-import of the module.
# NOTE(review): sum_squared_error is defined in this module but is not listed
# here, so a star-import does not export it -- confirm that this is intended.
__all__ = [
    'drop_col_feat_imp',
]


def drop_col_feat_imp(model, X_train, y_train, X_test, y_test, random_state=43):
    """
    Compute drop-column feature importances.

    A clone of `model` is fitted on all columns of `X_train` and its sum of squared errors on the test set
    is used as the benchmark. Then, for each column in turn, a fresh clone is fitted with that column removed
    from both `X_train` and `X_test`. The importance of a column is the increase in test error caused by
    removing it, i.e. `drop_col_error - benchmark_error` (a larger value means a more important feature).

    Adapted from https://towardsdatascience.com/explaining-feature-importance-by-example-of-a-random-forest-d9166011959e.

    Args:
        model: estimator that can be copied with `sklearn.clone` and that provides `fit` and `predict`.
            Only clones are fitted; `model` itself is left untouched.
        X_train: training features. Must provide `.columns`, `.shape` and `.drop(columns=...)`
            (e.g. a pandas DataFrame).
        y_train: training targets.
        X_test: test features, containing the same columns as `X_train`.
        y_test: test targets.
        random_state: value assigned to the `random_state` attribute of every clone, so that all fits
            are comparable.

    Returns:
        (pd.Series): the importance of each feature, indexed by the column names of `X_train`.
    """
    def _test_error(features_train, features_test):
        # Fit a fresh clone (same specification as `model`) and return its sum of squared errors on the test set.
        model_clone = clone(model)
        # Set random_state for comparability between the benchmark and the drop-column models.
        model_clone.random_state = random_state
        model_clone.fit(features_train, y_train)
        return sum_squared_error(model_clone.predict(features_test), y_test)

    # Benchmark: the model trained on all features.
    benchmark_error = _test_error(X_train, X_test)

    print('Computing feature importances...')

    # Initialize progress bar (one tick per feature).
    bar = pyprind.ProgBar(X_train.shape[1], stream=sys.stdout)

    # Importance of a feature = increase in test error when the model is retrained without it.
    importances = []
    for col in X_train.columns:
        drop_col_error = _test_error(X_train.drop(columns=col), X_test.drop(columns=col))
        importances.append(drop_col_error - benchmark_error)

        bar.update()

    return pd.Series(data=importances, index=X_train.columns)


def sum_squared_error(y_pred, y_true):
    """
    Compute the sum of squared errors between predictions and true values.

    Both inputs are flattened before they are subtracted, so the result is always a scalar. This matters for
    2D inputs (e.g. column vectors with shape (n, 1)): `np.inner` of a 2D difference is an (n, n) matrix
    rather than a sum, and subtracting an (n, 1) array from an (n,) array would broadcast to (n, n).

    Args:
        y_pred: predicted values (array-like).
        y_true: true values (array-like) with the same number of elements as `y_pred`.

    Returns:
        sse: scalar sum over all elements of `(y_pred - y_true) ** 2`.
    """
    # Flatten first so that differing layouts of the same data, e.g. (n,) vs (n, 1), are compared elementwise.
    d = np.asarray(y_pred).ravel() - np.asarray(y_true).ravel()
    sse = np.dot(d, d)
    return sse