Source code for nnsa.training.utils

import numpy as np

# Public API of this module: the names exported by a star-import.
__all__ = [
    'dropna',
    'split_data',
]

def dropna(X, y, verbose=False):
    """
    Remove every sample (row) for which X or y contains a nan.

    Args:
        X (np.ndarray): feature array with dimensions (n_samples, n_features).
        y (np.ndarray): target array with dimensions (n_samples,) or (n_samples, 1).
        verbose (bool, optional): if True, prints the percentage of samples that was removed.

    Returns:
        X, y: the input arrays restricted to the samples that contain no nans.
    """
    # A sample is dropped when any of its features is nan, or when its target is nan.
    feature_has_nan = np.isnan(X).any(axis=-1)
    # squeeze() flattens a (n_samples, 1) target so it lines up with the per-row feature mask.
    target_has_nan = np.isnan(y).squeeze()
    drop = feature_has_nan | target_has_nan

    valid = ~drop
    X_clean = X[valid, :]
    y_clean = y[valid]

    if verbose:
        removed_pct = np.mean(drop) * 100
        print('Removed {:.2f} % of the samples.'.format(removed_pct))

    return X_clean, y_clean
def split_data(x, y, train_frac=0.75, shuffle=True):
    """
    Split the data samples into a train set and test set.

    Deprecated: use sklearn.model_selection.train_test_split instead. Calling this function emits a
    DeprecationWarning, but the split is still performed and returned.

    Args:
        x (np.ndarray): array with (feature) data. The first axis must correspond to the samples.
        y (np.ndarray): array with labels. The first axis must correspond to the samples.
        train_frac (float, optional): fraction of data to keep in train set.
            Defaults to 0.75.
        shuffle (bool, optional): if True, the data is shuffled (with a fixed seed of 0, so the split is
            reproducible) before splitting. If False, the data is not shuffled.
            Defaults to True.

    Returns:
        x_train (np.ndarray): train set features.
        y_train (np.ndarray): train set labels.
        x_test (np.ndarray): test set features.
        y_test (np.ndarray): test set labels.
    """
    # Function-scope import: keeps this function self-contained with respect to the module's imports.
    import warnings

    # Emit a warning rather than raising it: raising made the function unusable and left all of the
    # code below unreachable. stacklevel=2 attributes the warning to the caller's line.
    warnings.warn(
        'split_data() is deprecated. Use sklearn.model_selection.train_test_split instead.',
        DeprecationWarning,
        stacklevel=2,
    )

    # Compute number of samples in train set.
    n = len(y)
    num_train = int(np.floor(train_frac * n))

    # Split.
    if shuffle:
        # A private generator seeded with 0 yields the same permutation as seeding the global RNG
        # with 0, without resetting the global random state as a side effect.
        random_idx = np.random.RandomState(0).permutation(n)
        train_idx = random_idx[:num_train]
        test_idx = random_idx[num_train:]
    else:
        # Plain slices keep the original sample order.
        train_idx = slice(None, num_train)
        test_idx = slice(num_train, None)

    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]