Source code for nnsa.stats.statistics

"""
Author: Tim Hermans (tim-hermans@hotmail.com).
"""
import warnings

import pandas as pd
import pyprind
from scipy import stats
from scipy.stats import pearsonr, spearmanr, kendalltau, mannwhitneyu
from scipy.stats._stats_py import SignificanceResult

from nnsa.utils.arrays import mirror_boundaries, remove_nans
from nnsa.utils.segmentation import segment_generator
import numpy as np
import statsmodels.formula.api as smf


__all__ = [
    'compute_mad',
]

[docs]def compute_mad(x, k=1.4826, n=None, axis=-1, keepdims=False):
    """
    Compute the (running) median absolute deviation.

    Ignores nan values.

    Args:
        x (np.ndarray): input array.
        k (float): scale factor for MAD. Defaults to 1.4826 to approximate SD of normal distribution.
            See also: https://en.wikipedia.org/wiki/Median_absolute_deviation.
        n (int): Deprecated. Do not use. This is only here to provide Deprecation warnings.
        axis (int): axis along which to compute the MAD.
        keepdims (bool): whether to keep dims or not (see np.median()).

    Returns:
        mad (float or np.ndarray): MAD value(s) along the specified axis.

    """
    if n is not None:
        deprc_msg = 'Use moving_mad() instead of compute_mad()[0].'
        raise DeprecationWarning(deprc_msg)

    # To array.
    x = np.asarray(x)

    # Median.
    med = np.nanmedian(x, axis=axis, keepdims=True)

    # Median absolute deviation.
    mad = k * np.nanmedian(np.abs(x - med), axis=axis, keepdims=keepdims)

    return mad


def pairwise_multiple_regression(x_columns, y_columns, df, formula='{y}~{x}', verbose=1):
    """
    Compute R2-adjusted scores and p-values for pairwise y~x regressions.

    Args:
        x_columns (list): list of columns to use for x-variables (one at a time).
        y_columns (list): list of columns to use for y-variables (one at a time).
        df (pd.DataFrame): dataframe with the data.
        formula (str): formula in which to insert the x and y variables.
            Indicate where the variables should be placed using {x} and {y}, e.g. {y} ~ {x} + covariate_column
        verbose (int): verbosity level (show a progress bar (1) or not (0)).

    Returns:
        df_results (pd.DataFrame): dataframe with complex values, where the real part is the R2-adjusted score,
            and the imaginary part the p-value for the contribution of {x} on the model for {y}.
    """
    # Check inputs.
    if '{x}' not in formula:
        raise ValueError('"{x}" missing in formula.')
    if '{y}' not in formula:
        raise ValueError('"{y}" missing in formula.')

    # Loop over x and y columns.
    output = np.zeros((len(x_columns), len(y_columns)), np.complex)
    bar = pyprind.ProgBar(len(x_columns) * len(y_columns))
    for ix, x in enumerate(x_columns):
        for iy, y in enumerate(y_columns):
            # Create formula.
            formula_i = formula.format(x=x, y=y)

            # Fit multiple linear regression.
            model = smf.ols(formula=formula_i, data=df, missing='drop')
            results = model.fit()

            # Score and p-value.
            score = results.rsquared_adj
            pvalue = results.pvalues[x]

            # Save in output (pvalue as imaginary part).
            output[ix, iy] = score + 1j * pvalue

            if verbose:
                bar.update()

    df_results = pd.DataFrame(data=output, index=x_columns, columns=y_columns)

    return df_results


def partial_corr(x, y, z, method='pearson'):
    """
    Calculate the partial correlation coefficient and its p-value
    between x and y, controlling for the effects of z.

    Args:
        x : array-like, shape (n_samples,)
            First variable of interest.
        y : array-like, shape (n_samples,)
            Second variable of interest.
        z : array-like, shape (n_samples,)
            Control variable.
        method : str, optional
            Correlation method to use: 'pearson', 'spearman', or 'kendall'.

    Returns:
        res : SignificanceResult containing the partial correlation coefficient (statistic)
            between x and y, controlling for z, and its two-tailed pvalue.

    Examples:
        # Create x and y with confounfing z.
        >>> rng = np.random.RandomState(43)
        >>> z = rng.random(100)
        >>> x = z + 0.2*rng.random(100)
        >>> y = z + 0.2*rng.random(100)

        >>> pearsonr(x, y)
        PearsonRResult(statistic=0.9617349300921725, pvalue=6.584320485068616e-57)
        >>> spearmanr(x, y)
        SignificanceResult(statistic=0.9613921392139213, pvalue=1.0109548429778636e-56)

        >>> partial_corr(x, y, z, method='pearson')
        SignificanceResult(statistic=0.04490137980266458, pvalue=0.6589872916398769)
        >>> partial_corr(x, y, z, method='spearman')
        SignificanceResult(statistic=0.12897318724509949, pvalue=0.20326996612845738)
    """
    x = np.asarray(x)
    y = np.asarray(y)
    z = np.asarray(z)
    method = method.lower()

    # Force z to be 2D.
    if x.ndim != 1 or y.ndim != 1 or z.ndim != 1:
        raise ValueError('All input arrays (x, y, z) must be 1-dimensional.')

    # Calculate correlation matrices.
    if method == 'pearson':
        r_XY = np.corrcoef(x, y)[0, 1]
        r_XZ = np.corrcoef(x, z)[0, 1]
        r_YZ = np.corrcoef(y, z)[0, 1]
    elif method == 'spearman':
        r_XY = spearmanr(x, y)[0]
        r_XZ = spearmanr(x, z)[0]
        r_YZ = spearmanr(y, z)[0]
    elif method == 'kendall':
        r_XY = kendalltau(x, y)[0]
        r_XZ = kendalltau(x, z)[0]
        r_YZ = kendalltau(y, z)[0]
    else:
        raise ValueError(f"Invalid method '{method}'. Choose 'pearson', 'spearman', or 'kendall'.")

    # Compute partial correlation coefficients.
    r_XY_Z = (r_XY - r_XZ * r_YZ) / np.sqrt((1 - r_XZ ** 2) * (1 - r_YZ ** 2))

    # Compute degrees of freedom.
    n_samples = x.shape[0]
    n_controls = 1
    df = n_samples - n_controls - 2

    # Compute p-value.
    t = r_XY_Z * np.sqrt(df / (1 - r_XY_Z ** 2))
    p = 2 * (1 - stats.t.cdf(abs(t), df))

    # Named tuple, like the result of stats.spearmanr.
    res = SignificanceResult(r_XY_Z, p)

    return res


def test_groups_pairwise(x, group, data=None, test_fun=mannwhitneyu):
    """
    Do pairwise test between unique groups.
    """
    # Test between each group.
    if data is not None and isinstance(x, str):
        x = data[x]
    if data is not None and isinstance(group, str):
        group = data[group]

    x = np.asarray(x)
    group = np.asarray(group)

    nan_mask = np.isnan(x)
    x = x[~nan_mask]
    group = group[~nan_mask]

    unique_groups = np.unique(group)
    stat_matrix = np.full((len(unique_groups), len(unique_groups)), fill_value=np.nan)
    pval_matrix = np.full_like(stat_matrix, fill_value=np.nan)
    for ii, group_i in enumerate(unique_groups):
        mask_i = group == group_i
        a_i = x[mask_i]
        for jj, group_j in enumerate(unique_groups):
            if jj <= ii:
                continue
            mask_j = group == group_j
            b_j = x[mask_j]
            stat, pval = test_fun(a_i, b_j)
            stat_matrix[ii, jj] = stat
            pval_matrix[ii, jj] = pval

            # Symmetric.
            stat_matrix[jj, ii] = stat
            pval_matrix[jj, ii] = pval

    df_stat = pd.DataFrame(stat_matrix, columns=unique_groups, index=unique_groups)
    df_pval = pd.DataFrame(pval_matrix, columns=unique_groups, index=unique_groups)

    return df_stat, df_pval