Source code for nnsa.training.cross_validation

import numpy as np
import copy
import pyprind


# Public names exported by a star-import of this module.
# NOTE(review): cross_validate_permutation_importance is defined in this
# module but is not listed here -- confirm whether the omission is intended.
__all__ = [
    'cross_validate',
    'kfold_generator',
]


def cross_validate(X, y, model, n_folds=5, how='stratified', verbose=0):
    """
    Cross validate a regression model using (stratified) k-fold cross validation.

    Every sample is predicted exactly once, by a copy of `model` that was
    fitted on the folds that do not contain that sample.

    For leave-one-out cross validation, set n_folds to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods.
            The object itself is never fitted; a deep copy is made per fold.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5.
            If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross
            validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        verbose (int, optional): verbosity level. A progress bar is shown if
            this is truthy.
            Defaults to 0.

    Returns:
        y_pred_val (np.ndarray): predicted scores on the validation set, which
            is the same size as the train set, but the predictions are made
            when the sample was not included in the training. Integer-valued
            `y` yields a float array, so that real-valued predictions are not
            truncated.
        all_models (list): list of all models, each trained on a specific part
            of the data (one per fold, in fold order).
    """
    if n_folds is None:
        n_folds = len(X)

    # Generator that generates train and validation folds.
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how,
                                     return_idx=True)

    # Initialize output. The buffer must not inherit an integer dtype from y:
    # assigning float predictions into an integer array silently truncates them.
    y_arr = np.asarray(y)
    out_dtype = float if y_arr.dtype.kind in 'iu' else y_arr.dtype
    y_pred_val = np.zeros_like(y_arr, dtype=out_dtype)

    # Only build the progress bar when it is requested, so that a silent run
    # (verbose=0) never touches the progress-bar output stream.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    for X_train, X_val, y_train, _, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so folds do not share state.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold; idx_val maps the fold back to the
        # positions of these samples in the original (unsorted) data.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Append to output list.
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    return y_pred_val, all_models
def cross_validate_permutation_importance(
        X, y, model, n_folds=5, how='stratified', n_repeats=5, verbose=0,
        **kwargs):
    """
    Cross validate a regression model using (stratified) k-fold cross
    validation to compute feature importance.

    Uses a permutation method for assessing feature importance: for every
    fold, a copy of `model` is fitted on the train folds and sklearn's
    permutation_importance is evaluated on the left-out fold.

    For leave-one-out cross validation, set n_folds to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods.
            The object itself is never fitted; a deep copy is made per fold.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5.
            If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross
            validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        n_repeats (int): number of permutations to do per feature.
            See sklearn's permutation_importance.
        verbose (int, optional): verbosity level. A progress bar is shown if
            this is truthy.
            Defaults to 0.
        **kwargs: for sklearn's permutation_importance.

    Returns:
        importances (np.ndarray): permutation importances computed on the
            left-out fold, with shape (n_features, n_repeats, n_folds).
        y_pred_val (np.ndarray): predicted scores on the validation set, which
            is the same size as the train set, but the predictions are made
            when the sample was not included in the training. Integer-valued
            `y` yields a float array, so that real-valued predictions are not
            truncated.
        all_models (list): list of all models, each trained on a specific part
            of the data (one per fold, in fold order).
    """
    # Imported here so that sklearn is only required when this function is used.
    from sklearn.inspection import permutation_importance

    if n_folds is None:
        n_folds = len(X)

    # Generator that generates train and validation folds.
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how,
                                     return_idx=True)

    # Initialize output. The buffer must not inherit an integer dtype from y:
    # assigning float predictions into an integer array silently truncates them.
    y_arr = np.asarray(y)
    out_dtype = float if y_arr.dtype.kind in 'iu' else y_arr.dtype
    y_pred_val = np.zeros_like(y_arr, dtype=out_dtype)

    # Only build the progress bar when it is requested, so that a silent run
    # (verbose=0) never touches the progress-bar output stream.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    importances_all = []
    for X_train, X_val, y_train, y_val, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so folds do not share state.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold; idx_val maps the fold back to the
        # positions of these samples in the original (unsorted) data.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Compute permutation feature importance on the left-out fold.
        result = permutation_importance(
            model_i, X_val, y_val,
            n_repeats=n_repeats,
            **kwargs,
        )

        # Append to output lists. The fitted model must be collected as well,
        # otherwise the returned all_models would always be empty.
        importances_all.append(result.importances)
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    # Stack the per-fold (n_features, n_repeats) arrays along a third axis:
    # to shape (n_features, n_repeats, n_folds).
    importances = np.dstack(importances_all)

    # Internal sanity check. np.shape also handles array-like (non-ndarray) X.
    assert importances.shape == (np.shape(X)[1], n_repeats, n_folds)

    return importances, y_pred_val, all_models
def kfold_generator(X, y, n_folds=5, how='stratified', return_idx=False):
    """
    Yield the train/validation split of every fold of a k-fold division.

    The samples are first reordered (see `how`). Fold `k` then takes every
    `n_folds`-th sample of the reordered data, starting at position `k`, as
    validation data; all remaining samples form the train data.

    Args:
        X (np.ndarray): input data with len (n_samples).
        y (np.ndarray): output data with len (n_samples).
        n_folds (int, optional): number of folds.
            Defaults to 5.
        how (str, optional): how to divide the data into folds. Choose from:
            'random': random division in folds.
            'stratified': sorts the data based on y, so that the folds are
                stratified (for regression).
            Defaults to 'stratified'.
        return_idx (bool, optional): if True, additionally yield the indices
            of the train and validation samples.
            Defaults to False.

    Yields:
        X_train (np.ndarray): train input data for current fold.
        X_val (np.ndarray): validation input data for current fold.
        y_train (np.ndarray): train output data for current fold.
        y_val (np.ndarray): validation output data for current fold.
        idx_train (np.ndarray, optional): indices into the original
            (not reordered) data of the train samples (if return_idx is True).
        idx_val (np.ndarray, optional): indices into the original
            (not reordered) data of the validation samples (if return_idx is
            True).

    Raises:
        ValueError: if X and y differ in length, or if `how` is not a valid
            choice. Raised when the first fold is requested, since this is a
            generator.
    """
    # Work on arrays, so that index and mask selection is available.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError('X ({}) and y ({}) should have the same length.'.format(len(X), len(y)))

    choices = ['random', 'stratified']
    if how not in choices:
        raise ValueError('Invalid choice for `how`. Choose from: {}.'
                         .format(choices))

    # Determine the order in which the samples are dealt over the folds.
    if how == 'random':
        order = np.random.permutation(n_samples)
    else:
        # Sorting on y spreads the outcome values evenly over the folds.
        order = np.argsort(y)

    X_ordered = X[order]
    y_ordered = y[order]

    for fold in range(n_folds):
        # Every n_folds-th position, starting at `fold`, is validation data.
        is_val = np.zeros(n_samples, dtype=bool)
        is_val[fold::n_folds] = True
        is_train = np.logical_not(is_val)

        split = (
            X_ordered[is_train],
            X_ordered[is_val],
            y_ordered[is_train],
            y_ordered[is_val],
        )
        if return_idx:
            # Map positions in the reordered data back to original indices.
            split += (order[is_train], order[is_val])

        yield split