# Source code for nnsa.training.utils

import warnings

import numpy as np

# Public API of this module: the names exported by a star-import.
__all__ = [
    'dropna',
    'split_data',
]

def _rows_with_nan(arr):
    """Return a boolean mask over the first axis marking rows that contain a nan."""
    isnan = np.isnan(arr)
    if isnan.ndim > 1:
        # Collapse every axis except the sample axis.
        isnan = isnan.any(axis=tuple(range(1, isnan.ndim)))
    return isnan


def dropna(X, y, verbose=False):
    """
    Drop rows (samples) with nans in X or y.

    A sample is removed from both X and y if any of its entries in X or in y is nan.
    The first axis of X and y must correspond to the samples; any trailing axes are
    reduced, so y may be (n_samples,), (n_samples, 1) or (n_samples, n_targets) and X
    may have any number of feature axes.

    Args:
        X (np.ndarray): data array with dimensions (n_samples, n_features).
        y (np.ndarray): data array with dimensions (n_samples,) or (n_samples, 1).
        verbose (bool, optional): if True, prints the percentage of rows that is removed.

    Returns:
        X, y: the input arrays restricted to the samples without nans.
    """
    nanmask = _rows_with_nan(X) | _rows_with_nan(y)
    keep = ~nanmask

    # Boolean indexing on the first axis only, so this works for any number of dimensions.
    X = X[keep]
    y = y[keep]

    if verbose:
        # Avoid taking the mean of an empty mask (which yields nan and a RuntimeWarning).
        removed_frac = np.mean(nanmask) if nanmask.size else 0.0
        print('Removed {:.2f} % of the samples.'.format(removed_frac*100))

    return X, y


def split_data(x, y, train_frac=0.75, shuffle=True):
    """
    Split the data samples into a train set and test set.

    Deprecated: use sklearn.model_selection.train_test_split instead.

    Args:
        x (np.ndarray): array with (feature) data. The first axis must correspond to the samples.
        y (np.ndarray): array with labels. The first axis must correspond to the samples.
        train_frac (float, optional): fraction of data to keep in train set.
            Defaults to 0.75.
        shuffle (bool, optional): if True, the data is shuffled before splitting.
            The shuffle uses a fixed seed (0), so repeated calls give the same split.
            If False, the data is not shuffled.
            Defaults to True.

    Returns:
        x_train (np.ndarray): train set features.
        y_train (np.ndarray): train set labels.
        x_test (np.ndarray): test set features.
        y_test (np.ndarray): test set labels.

    Warns:
        DeprecationWarning: on every call.
    """
    # Emit a warning rather than raising, so that the function still performs the split.
    warnings.warn(
        'split_data() is deprecated. Use sklearn.model_selection.train_test_split instead.',
        DeprecationWarning,
        stacklevel=2,
    )

    # Compute number of samples in train set.
    n = len(y)
    num_train = int(np.floor(train_frac * n))

    # Split.
    if shuffle:
        # A private generator seeded with 0 gives the same permutation as seeding the
        # global generator with 0, without resetting the caller's global random state.
        random_idx = np.random.RandomState(0).permutation(n)
        train_idx = random_idx[:num_train]
        test_idx = random_idx[num_train:]
        x_train = x[train_idx]
        y_train = y[train_idx]
        x_test = x[test_idx]
        y_test = y[test_idx]
    else:
        x_train = x[:num_train]
        y_train = y[:num_train]
        x_test = x[num_train:]
        y_test = y[num_train:]

    return x_train, y_train, x_test, y_test