# Source code for nnsa.training.cross_validation

import numpy as np
import copy
import pyprind


# Public API of this module: the names exported by a star-import.
# NOTE(review): cross_validate_permutation_importance is defined in this module but is not
# listed here - confirm whether that omission is intentional.
__all__ = [
    'cross_validate',
    'kfold_generator',
]


def cross_validate(X, y, model, n_folds=5, how='stratified', verbose=0):
    """
    Cross validate a regression model using (stratified) k-fold cross validation.

    For every fold, a fresh deep copy of `model` is fitted on the train part and used to predict
    the validation part, so that every sample gets exactly one out-of-fold prediction.

    For leave-one-out cross validation, set n_folds to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods. The object itself
            is not fitted, only deep copies of it are.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5. If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        verbose (int, optional): verbosity level. If nonzero, a progress bar is shown.
            Defaults to 0.

    Returns:
        y_pred_val (np.ndarray): predicted scores on the validation set, which is the same size as the
            train set, but the predictions are made when the sample was not included in the training.
            Integer or boolean `y` gives a float64 output, otherwise the dtype of `y` is kept.
        all_models (list): list of all models, each trained on a specific part of the data.
    """
    if n_folds is None:
        n_folds = len(X)

    # Generator that generates train and validation folds (with the original sample indices).
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how, return_idx=True)

    # Initialize output. An integer/boolean buffer would silently truncate the (float) predictions
    # of a regression model, so promote those dtypes to float; keep any other dtype as is.
    y = np.asarray(y)
    if y.dtype.kind in 'biu':
        y_pred_val = np.zeros_like(y, dtype=np.float64)
    else:
        y_pred_val = np.zeros_like(y)

    # Only create the progress bar when it is actually going to be used.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    for X_train, X_val, y_train, _, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so that folds do not influence each other.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold and store at the original positions of those samples.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Append to output list.
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    return y_pred_val, all_models


def cross_validate_permutation_importance(
        X, y, model, n_folds=5, how='stratified', n_repeats=5, verbose=0, **kwargs):
    """
    Cross validate a regression model using (stratified) k-fold cross validation to compute feature importance.

    Uses a permutation method for assessing feature importance: for every fold, a deep copy of
    `model` is fitted on the train part and sklearn's permutation_importance is evaluated on the
    left-out (validation) part.

    For leave-one-out cross validation, set n_folds to len(X) or None.

    Args:
        X (np.ndarray): train data with shape (n_samples, n_features).
        y (np.ndarray): outcome values with shape (n_samples).
        model: (untrained) model that implements fit() and predict() methods. The object itself
            is not fitted, only deep copies of it are.
        n_folds (int, optional): number of folds for k-fold cross validation.
            Defaults to 5. If n_folds is None, takes n_folds=len(X), i.e., leave-one-out cross validation.
        how (str, optional): how to divide in folds. See kfold_generator.
        n_repeats (int): number of permutations to do per feature. See sklearn's permutation_importance.
        verbose (int, optional): verbosity level. If nonzero, a progress bar is shown.
            Defaults to 0.
        **kwargs: for sklearn's permutation_importance.

    Returns:
        importances (np.ndarray): raw permutation importances computed on the left-out fold, with shape
            (n_features, n_repeats, n_folds).
        y_pred_val (np.ndarray): predicted scores on the validation set, which is the same size as the
            train set, but the predictions are made when the sample was not included in the training.
            Integer or boolean `y` gives a float64 output, otherwise the dtype of `y` is kept.
        all_models (list): list of all models, each trained on a specific part of the data.
    """
    from sklearn.inspection import permutation_importance

    if n_folds is None:
        n_folds = len(X)

    # Generator that generates train and validation folds (with the original sample indices).
    fold_generator = kfold_generator(X, y, n_folds=n_folds, how=how, return_idx=True)

    # Initialize output. An integer/boolean buffer would silently truncate the (float) predictions
    # of a regression model, so promote those dtypes to float; keep any other dtype as is.
    y = np.asarray(y)
    if y.dtype.kind in 'biu':
        y_pred_val = np.zeros_like(y, dtype=np.float64)
    else:
        y_pred_val = np.zeros_like(y)

    # Only create the progress bar when it is actually going to be used.
    bar = pyprind.ProgBar(n_folds) if verbose else None

    # Loop over folds.
    all_models = []
    importances_all = []
    for X_train, X_val, y_train, y_val, _, idx_val in fold_generator:
        # Copy the initial (untrained) model, so that folds do not influence each other.
        model_i = copy.deepcopy(model)

        # Train model on train folds.
        model_i.fit(X_train, y_train)

        # Predict on validation fold and store at the original positions of those samples.
        y_pred_val[idx_val] = model_i.predict(X_val)

        # Compute permutation feature importance on the left-out fold.
        result = permutation_importance(
            model_i, X_val, y_val, n_repeats=n_repeats, **kwargs,
        )

        # Append to output lists. The fitted model must be stored too, otherwise the returned
        # all_models would always be empty.
        importances_all.append(result.importances)
        all_models.append(model_i)

        if bar is not None:
            bar.update()

    # To shape (n_features, n_repeats, n_folds).
    importances = np.dstack(importances_all)

    # Sanity check of the stacked shape; np.shape also works when X is not an ndarray.
    assert importances.shape == (np.shape(X)[1], n_repeats, n_folds)

    return importances, y_pred_val, all_models


def kfold_generator(X, y, n_folds=5, how='stratified', return_idx=False):
    """
    Generate train and validation sets from k folds.

    The samples are first reordered (see `how`). For fold `k`, every `n_folds`th sample of the
    reordered data is put in the validation set, starting at position `k`. All remaining samples
    form the train set, so every sample is in the validation set of exactly one fold.

    Since this is a generator, input errors are raised when the first fold is requested.

    Args:
        X (np.ndarray): input data with len (n_samples).
        y (np.ndarray): output data with len (n_samples).
        n_folds (int, optional): number of folds, with 1 <= n_folds <= n_samples.
            If None, takes n_folds=n_samples, i.e., leave-one-out.
            Defaults to 5.
        how (str, optional): how to divide the data into folds. Choose from:
            'random': random division in folds.
            'stratified': sorts the data based on y, so that the folds are stratified (for regression).
            Defaults to 'stratified'.
        return_idx (bool, optional): if True, also yield the indices for train and validation.
            Defaults to False.

    Yields:
        X_train (np.ndarray): train input data for current fold.
        X_val (np.ndarray): validation input data for current fold.
        y_train (np.ndarray): train output data for current fold.
        y_val (np.ndarray): validation output data for current fold.
        idx_train (np.ndarray, optional): integer indices of the train samples in the original
            (not reordered) X and y (only if return_idx is True).
        idx_val (np.ndarray, optional): integer indices of the validation samples in the original
            (not reordered) X and y (only if return_idx is True).

    Raises:
        ValueError: if X and y differ in length, if `how` is not a valid choice, or if n_folds is
            smaller than 1 or larger than the number of samples.
    """
    # Check inputs.
    X = np.asarray(X)
    y = np.asarray(y)

    # Get shape.
    if len(X) != len(y):
        raise ValueError('X ({}) and y ({}) should have the same length.'.format(len(X), len(y)))
    n_samples = len(X)

    if how == 'random':
        # Randomly shuffle the data before dividing into folds.
        reorder_idx = np.random.permutation(n_samples)
    elif how == 'stratified':
        # Sort X and y based on y before dividing into folds, so that taking every n_folds-th
        # sample gives each fold a similar spread of y values.
        reorder_idx = np.argsort(y)
    else:
        raise ValueError('Invalid choice for `how`. Choose from: {}.'
                         .format(['random', 'stratified']))

    # None means leave-one-out: one fold per sample.
    if n_folds is None:
        n_folds = n_samples

    # More folds than samples would give empty validation folds, fewer than one fold would
    # silently yield nothing.
    if not 1 <= n_folds <= n_samples:
        raise ValueError('n_folds ({}) should be at least 1 and at most the number of samples ({}).'
                         .format(n_folds, n_samples))

    # Reorder the data.
    X = X[reorder_idx]
    y = y[reorder_idx]

    for k in range(n_folds):
        # Boolean mask (in the reordered data) that selects every n_folds-th sample, starting at k.
        mask_val = np.zeros(n_samples, dtype=bool)
        mask_val[k::n_folds] = True
        mask_train = ~mask_val

        # Select the train and validation data for the current fold.
        X_train = X[mask_train]
        X_val = X[mask_val]
        y_train = y[mask_train]
        y_val = y[mask_val]

        if return_idx:
            # Map the positions in the reordered data back to indices in the original data.
            idx_train = reorder_idx[mask_train]
            idx_val = reorder_idx[mask_val]
            yield X_train, X_val, y_train, y_val, idx_train, idx_val
        else:
            yield X_train, X_val, y_train, y_val