Source code for nnsa.training.feature_selection


from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Public API of this module: the names exported by `from <module> import *`.
__all__ = [
    'get_uncorrelating_features',
    'select_features',
    'select_features_rfr',
]


def get_uncorrelating_features(df, features=None, max_corr=0.8, match_class=False, verbose=1):
    """
    Get a subset of features, which are present in df, and uncorrelated with each other.

    The order of the features in `features` corresponds to the priority that the features have.
    E.g. if 2 features correlate significantly, the feature that is located closer to the
    beginning of the `features` list will be kept, the other one not.

    Args:
        df (pd.DataFrame): DataFrame containing the features (columns) for a number of
            samples (index).
        features (list, optional): list of features in `df` to find an uncorrelating subset of.
            This list must be sorted in order of priority. I.e., put features that you want to
            keep in the beginning of the list. The list itself is not modified.
            If None, all numeric columns of the df are used as features, favouring features
            that correlate with most other features.
            Defaults to None.
        max_corr (float, optional): the maximum allowable absolute correlation coefficient
            between two features. If the absolute correlation between two features is higher
            than `max_corr`, the one appearing latest in `features` will be removed.
            Defaults to 0.8.
        match_class (bool, optional): if True, each feature is only correlated with features of
            the same type. The type of a feature is the part of its name before the first
            underscore, e.g. 'POW' for 'POW_delta'.
            If False, considers correlations between features of different types.
            Defaults to False.
        verbose (int, optional): verbosity level. Progress is printed if larger than 0.
            Defaults to 1.

    Returns:
        uncorrelating_features (dict): dict where the keys are a subset of `features` containing
            only features whose mutual correlation coefficient is not higher than `max_corr`.
            The values corresponding to the keys are lists with the features that were removed
            due to significant correlation with the key feature.

    Raises:
        ValueError: if `features` contains a feature that is not in `df`.
        AssertionError: if no correlation could be computed for a non-empty set of candidates.
    """
    def find_correlating(candidates, feature):
        # Return the candidates whose absolute correlation with `feature` exceeds `max_corr`.
        if match_class:
            # Only consider features of the same type. Compare the complete type prefix, so
            # that e.g. 'POW_x' is not mistaken for a feature of type 'POWER'.
            f_type = feature.split('_')[0]
            candidates = [fi for fi in candidates if fi.split('_')[0] == f_type]

        if not candidates:
            # Nothing to compare with, so nothing can correlate.
            return []

        corr_scores = df[candidates].corrwith(df[feature]).abs()
        if len(corr_scores) == 0:
            raise AssertionError(
                'No correlations could be computed for feature {}.'.format(feature))
        return corr_scores[corr_scores > max_corr].index.tolist()

    if features is None:
        # Use numeric columns of the df.
        features = df.select_dtypes(include='number').columns.to_list()

        # Sort based on the number of features that correlate. Each feature is also compared
        # with itself, which adds the same constant to every count and thus does not change
        # the ranking.
        counts = [len(find_correlating(features, f)) for f in features]
        features = [features[idx] for idx in np.argsort(counts)[::-1]]  # Descending order.
    else:
        # Check if all features are in df.
        for f in features:
            if f not in df:
                raise ValueError('Feature {} not in `df`.'.format(f))

    if verbose > 0:
        print('Removing uncorrelating features...')

    n_features = len(features)

    # Work on a copy, so that the list of the caller is left untouched.
    remaining = list(features)
    uncorrelating_features = dict()

    # Repeatedly keep the feature with the highest priority and drop everything that correlates
    # with it. Continue until no feature is left, so that the last remaining feature (which by
    # construction does not correlate with any kept feature) ends up in the output as well.
    while remaining:
        feature_i = remaining.pop(0)

        # Features that correlate too much with the kept feature.
        corr_features = find_correlating(remaining, feature_i)
        uncorrelating_features[feature_i] = corr_features

        # Remove them from the candidates.
        dropped = set(corr_features)
        remaining = [f for f in remaining if f not in dropped]

    if verbose > 0:
        print('Found {} uncorrelating features out of {}.'.format(
            len(uncorrelating_features), n_features))

    return uncorrelating_features
def select_features(X, y=None, how='RFR', max_corr=0.8):
    """
    Select features in X.

    Args:
        X (pd.DataFrame): feature data, X.values.shape = (n_samples, n_features).
        y (np.ndarray, optional): outcome data with shape (n_samples,).
            Only needed for the random forest based options of `how`.
            Defaults to None.
        how (str, optional): how to select the features (case-insensitive). Choose from:
            'all' or None: use all available features.
            'RFR' or 'RandomForestRegressor': do a feature selection step using a random forest
                regressor.
            'RFR+' or 'RandomForestRegressor+': same as 'RFR', but additionally only keep
                features whose importance is at least the median importance.
            Defaults to 'RFR'.
        max_corr (float, optional): maximum allowed correlation between selected features, used
            by the random forest based options.
            Defaults to 0.8.

    Returns:
        X (pd.DataFrame): data array with selected data.

    Raises:
        ValueError: if `how` is not a valid option, or if `y` is None while `how` needs it.
    """
    if how is not None:
        how = how.lower()

    if how is None or how == 'all':
        # Select all features.
        return X

    uses_rfr = how in ('rfr', 'randomforestregressor')
    uses_rfr_plus = how in ('rfr+', 'randomforestregressor+')
    if not (uses_rfr or uses_rfr_plus):
        raise ValueError('Invalid input for `how` ("{}"). Select from {}.'
                         .format(how, ['all', 'RFR', 'RFR+']))

    # Fail early with a clear message instead of deep inside the fit of the regressor.
    if y is None:
        raise ValueError('Outcome data `y` is required when `how` is "{}".'.format(how))

    if uses_rfr:
        # Select features using the importance of a random forest regressor.
        return select_features_rfr(X, y, max_corr=max_corr)

    # Same, but also require a minimum importance (the median of all importances).
    return select_features_rfr(X, y, max_corr=max_corr, min_imp='q50')
def select_features_rfr(X, y, max_corr=0.8, min_imp=0, **kwargs):
    """
    Remove correlating features, keeping features most important for predicting y according to
    a RandomForestRegressor.

    Args:
        X (pd.DataFrame): feature data with shape (n_samples, n_features).
        y (np.ndarray): outcome data with shape (n_samples,).
        max_corr (float, optional): maximum allowed correlation. Features are removed that
            correlate more than this with a more important feature.
            Defaults to 0.8.
        min_imp (float or str, optional): if a float, only features are selected that have this
            minimum importance. If a str, it must be 'q' followed by a percentile, e.g. 'q5' or
            'q10' to use the 5th or 10th percentile of all importances as the threshold.
            Defaults to 0.
        **kwargs (optional): optional keyword parameters to pass to RandomForestRegressor.
            These override the defaults that are set in this function.

    Returns:
        X (pd.DataFrame): data array with selected data.

    Raises:
        ValueError: if `min_imp` is a str that is not of the form 'q<percentile>'.
    """
    feature_labels = X.columns.to_list()

    # Default parameters for the RandomForestRegressor.
    # The split criterion is deliberately left at the default of the regressor (mean squared
    # error): its name changed from 'mse' to 'squared_error' between scikit-learn releases, so
    # hard-coding either name breaks on some versions. It can still be set via **kwargs.
    rfr_kwargs = {
        'random_state': 43,
        'n_estimators': 1500,
        'max_features': X.shape[1],
    }

    # Update parameters with user specified parameters.
    rfr_kwargs.update(kwargs)

    # Train RFR on all features to compute feature importance.
    model = RandomForestRegressor(**rfr_kwargs).fit(X, y)

    # Sort feature indices on importance, from high to low.
    feature_importances = model.feature_importances_
    sort_idx = np.argsort(feature_importances)[::-1]

    if isinstance(min_imp, str):
        # Translate a percentile specification like 'q50' into an importance threshold.
        if not min_imp.startswith('q'):
            raise ValueError('Invalid input for `min_imp`.')
        try:
            q = float(min_imp[1:])
        except ValueError:
            raise ValueError('Invalid input for `min_imp`.')
        min_imp = np.percentile(feature_importances, q)

    # Remove features with low importance.
    sort_idx = [idx for idx in sort_idx if feature_importances[idx] >= min_imp]

    # Get the labels, most important feature first.
    feature_labels_sorted = [feature_labels[idx] for idx in sort_idx]

    # Get labels of subset of mutually uncorrelating features. The order of the labels gives
    # the more important feature priority when two features correlate.
    features_to_keep = list(get_uncorrelating_features(
        df=X, features=feature_labels_sorted, max_corr=max_corr).keys())

    # Select those features.
    X = X[features_to_keep]

    return X