import numpy as np
# Public API of this module: the names exported by a star-import.
__all__ = [
'dropna',
'split_data',
]
def _rows_with_nan(arr):
    """Return a 1-D boolean mask that is True for each first-axis entry containing a nan."""
    nan = np.isnan(arr)
    if nan.ndim > 1:
        # Collapse every non-sample axis so the mask has one entry per sample.
        nan = nan.any(axis=tuple(range(1, nan.ndim)))
    return nan


def dropna(X, y, verbose=False):
    """
    Drop rows with nans in X or y.

    A sample (row) is removed when any value belonging to it is nan, either
    in X or in y.

    Args:
        X (np.ndarray): data array with dimensions (n_samples, n_features).
        y (np.ndarray): data array with dimensions (n_samples,) or
            (n_samples, 1). Arrays with more trailing columns, e.g.
            (n_samples, n_targets), are also accepted.
        verbose (bool, optional): if True, prints the percentage of rows that
            is removed.

    Returns:
        X, y: the inputs restricted to the rows without nans. The number of
        dimensions of each array is preserved.

    Raises:
        ValueError: if X and y do not have the same number of samples.
    """
    x_nan = _rows_with_nan(X)
    y_nan = _rows_with_nan(y)
    if x_nan.ndim != 1 or x_nan.shape != y_nan.shape:
        raise ValueError(
            'X and y must have the same number of samples, got shapes '
            '{} and {}.'.format(np.shape(X), np.shape(y))
        )

    nanmask = x_nan | y_nan
    keep = ~nanmask

    # Boolean indexing on the first axis keeps all trailing axes untouched.
    X = X[keep]
    y = y[keep]

    if verbose:
        # Guard against the mean of an empty mask, which would be nan.
        removed_frac = float(np.mean(nanmask)) if nanmask.size else 0.0
        print('Removed {:.2f} % of the samples.'.format(removed_frac * 100))
    return X, y
def split_data(x, y, train_frac=0.75, shuffle=True):
    """
    Split the data samples into a train set and test set.

    Deprecated: use sklearn.model_selection.train_test_split instead. Calling
    this function emits a DeprecationWarning but still performs the split.

    Args:
        x (np.ndarray): array with (feature) data. The first axis must correspond to the samples.
        y (np.ndarray): array with labels. The first axis must correspond to the samples.
        train_frac (float, optional): fraction of data to keep in train set.
            Must lie in [0, 1]. Defaults to 0.75.
        shuffle (bool, optional): if True, the data is shuffled before splitting,
            using a fixed seed (0) so the split is reproducible.
            If False, the data is not shuffled.
            Defaults to True.

    Returns:
        x_train (np.ndarray): train set features.
        y_train (np.ndarray): train set labels.
        x_test (np.ndarray): test set features.
        y_test (np.ndarray): test set labels.

    Raises:
        ValueError: if train_frac is outside [0, 1].
    """
    # Function-scope import so the block does not depend on file-level imports.
    import warnings

    # Warn instead of raising: raising made the rest of the function unreachable.
    warnings.warn(
        'split_data() is deprecated. Use sklearn.model_selection.train_test_split instead.',
        DeprecationWarning,
        stacklevel=2,
    )

    if not 0 <= train_frac <= 1:
        raise ValueError('train_frac must be in [0, 1], got {}.'.format(train_frac))

    # Compute number of samples in train set.
    n = len(y)
    num_train = int(np.floor(train_frac * n))

    # Split.
    if shuffle:
        # A local generator gives a reproducible permutation without
        # resetting the global numpy random state as a side effect.
        random_idx = np.random.RandomState(0).permutation(n)
        train_idx = random_idx[:num_train]
        test_idx = random_idx[num_train:]
        return x[train_idx], y[train_idx], x[test_idx], y[test_idx]

    return x[:num_train], y[:num_train], x[num_train:], y[num_train:]