import numpy as np
import copy
import pyprind
# Names exported by a star-import of this module.
# NOTE(review): cross_validate_permutation_importance is defined in this file
# but is not listed here -- confirm whether that omission is intentional.
__all__ = [
'cross_validate',
'kfold_generator',
]
def cross_validate(X, y, model, n_folds=5, how='stratified', verbose=0):
    """
    Cross validate a regression model using (stratified) k-fold cross validation.

    Every fold gets its own deep copy of `model`, which is fitted on the train part of the fold
    and then used to predict the held-out part. For leave-one-out cross validation, set n_folds
    to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5. If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross
            validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        verbose (int, optional): verbosity level. Any truthy value shows a progress bar.
            Defaults to 0.

    Returns:
        y_pred_val (np.ndarray): predicted scores on the validation set, which is the same size as
            the train set, but the predictions are made when the sample was not included in the
            training. If `y` has an integer or boolean dtype the output is float64, so that
            predictions are not truncated; otherwise the dtype of `y` is kept.
        all_models (list): list of all models, each trained on a specific part of the data.
    """
    if n_folds is None:
        n_folds = len(X)

    # Generator that yields the train and validation parts (plus their indices) per fold.
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how, return_idx=True)

    # Initialize output. An integer/boolean `y` would otherwise silently truncate the
    # (floating point) predictions when they are written into the array.
    y_dtype = np.asarray(y).dtype
    if np.issubdtype(y_dtype, np.integer) or np.issubdtype(y_dtype, np.bool_):
        y_pred_val = np.zeros_like(y, dtype=np.float64)
    else:
        y_pred_val = np.zeros_like(y)

    # Only create the progress bar when requested: constructing it already writes to the
    # output stream, which would break the silence promised by verbose=0.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    for X_train, X_val, y_train, _, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so that folds do not influence each other.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold and store at the original sample positions.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Append to output list.
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    return y_pred_val, all_models
def cross_validate_permutation_importance(
        X, y, model, n_folds=5, how='stratified', n_repeats=5, verbose=0, **kwargs):
    """
    Cross validate a regression model using (stratified) k-fold cross validation to compute
    feature importance.

    Uses a permutation method for assessing feature importance: for every fold, a deep copy of
    `model` is fitted on the train part and sklearn's permutation_importance is evaluated on the
    held-out part. For leave-one-out cross validation, set n_folds to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5. If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross
            validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        n_repeats (int): number of permutations to do per feature. See sklearn's
            permutation_importance.
        verbose (int, optional): verbosity level. Any truthy value shows a progress bar.
            Defaults to 0.
        **kwargs: for sklearn's permutation_importance.

    Returns:
        importances (np.ndarray): importances (score decrease in left-out fold) with shape
            (n_features, n_repeats, n_folds).
        y_pred_val (np.ndarray): predicted scores on the validation set, which is the same size as
            the train set, but the predictions are made when the sample was not included in the
            training. If `y` has an integer or boolean dtype the output is float64, so that
            predictions are not truncated; otherwise the dtype of `y` is kept.
        all_models (list): list of all models, each trained on a specific part of the data.
    """
    # Imported here so that sklearn is only required when this function is actually used.
    from sklearn.inspection import permutation_importance

    if n_folds is None:
        n_folds = len(X)

    # Generator that yields the train and validation parts (plus their indices) per fold.
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how, return_idx=True)

    # Initialize output. An integer/boolean `y` would otherwise silently truncate the
    # (floating point) predictions when they are written into the array.
    y_dtype = np.asarray(y).dtype
    if np.issubdtype(y_dtype, np.integer) or np.issubdtype(y_dtype, np.bool_):
        y_pred_val = np.zeros_like(y, dtype=np.float64)
    else:
        y_pred_val = np.zeros_like(y)

    # Only create the progress bar when requested: constructing it already writes to the
    # output stream, which would break the silence promised by verbose=0.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    importances_all = []
    for X_train, X_val, y_train, y_val, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so that folds do not influence each other.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold and store at the original sample positions.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Compute permutation feature importance on the held-out samples.
        result = permutation_importance(
            model_i, X_val, y_val, n_repeats=n_repeats, **kwargs,
        )

        # Append to output lists. The fitted model has to be stored as well, otherwise the
        # returned `all_models` would always be empty.
        importances_all.append(result.importances)
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    # To shape (n_features, n_repeats, n_folds).
    importances = np.dstack(importances_all)
    # Internal sanity check; np.shape also works when X is array-like instead of an ndarray.
    assert importances.shape == (np.shape(X)[1], n_repeats, n_folds)

    return importances, y_pred_val, all_models
def kfold_generator(X, y, n_folds=5, how='stratified', return_idx=False):
    """
    Generate train and validation sets from k folds.

    The samples are first reordered (see `how`). For fold `k`, every `n_folds`th sample of the
    reordered data is put in the validation set, starting at k; all other samples form the
    train set. Because this is a generator, input errors are raised when the first fold is
    requested.

    Args:
        X (np.ndarray): input data with len (n_samples).
        y (np.ndarray): output data with len (n_samples). Sorted with np.argsort when
            how='stratified', so it is assumed to be one-dimensional in that case.
        n_folds (int, optional): number of folds, between 1 and n_samples.
            Defaults to 5. If n_folds is None, takes n_folds=n_samples, i.e., leave-one-out.
        how (str, optional): how to divide the data into folds. Choose from:
            'random': random division in folds.
            'stratified': sorts the data based on y, so that the folds are stratified (for
                regression).
            Defaults to 'stratified'.
        return_idx (bool, optional): if True, return/yield the indices for train and validation.
            Defaults to False.

    Yields:
        X_train (np.ndarray): train input data for current fold.
        X_val (np.ndarray): validation input data for current fold.
        y_train (np.ndarray): train output data for current fold.
        y_val (np.ndarray): validation output data for current fold.
        idx_train (np.ndarray, optional): integer indices into the original (not reordered) data
            of the train samples (if return_idx is True).
        idx_val (np.ndarray, optional): integer indices into the original (not reordered) data
            of the validation samples (if return_idx is True).

    Raises:
        ValueError: if X and y differ in length, if n_folds is not between 1 and n_samples, or
            if `how` is not a valid choice.
    """
    # Check inputs.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError('X ({}) and y ({}) should have the same length.'.format(len(X), len(y)))
    n_samples = len(X)

    # None means leave-one-out, matching the convention of the cross validation functions.
    if n_folds is None:
        n_folds = n_samples

    # Fewer than one fold would silently yield nothing; more folds than samples would yield
    # folds with an empty validation set.
    if not 1 <= n_folds <= n_samples:
        raise ValueError(
            'n_folds ({}) should be between 1 and the number of samples ({}).'
            .format(n_folds, n_samples))

    if how == 'random':
        # Randomly shuffle the data before dividing into folds.
        reorder_idx = np.random.permutation(n_samples)
    elif how == 'stratified':
        # Sort X and y based on y before dividing into folds.
        reorder_idx = np.argsort(y)
    else:
        raise ValueError('Invalid choice for `how`. Choose from: {}.'
                         .format(['random', 'stratified']))

    # Reorder the data.
    X = X[reorder_idx]
    y = y[reorder_idx]

    for k in range(n_folds):
        # Boolean mask over the reordered samples: every n_folds-th sample, starting at k.
        mask_val = np.zeros(n_samples, dtype=bool)
        mask_val[k::n_folds] = True
        mask_train = ~mask_val

        # Select the train and validation data for the current fold.
        fold = (X[mask_train], X[mask_val], y[mask_train], y[mask_val])

        if return_idx:
            # Map positions in the reordered data back to indices in the original data.
            fold += (reorder_idx[mask_train], reorder_idx[mask_val])

        yield fold