# Source code for nnsa.stats.paired

import os
import warnings

import pandas as pd
import numpy as np
import scipy.stats


# Public API of this module: the names exported by ``from <module> import *``.
__all__ = ['wilcoxon']


def wilcoxon(x, y=None, fill_outside_table=np.nan, **kwargs):
    """
    Calculate the Wilcoxon signed-rank test (deprecated).

    This function is deprecated, as SciPy has implemented a 'mode' parameter that allows for exact p value
    computation. Calling it emits a DeprecationWarning and then computes the result as described below.

    The scipy.stats.wilcoxon() function is used to compute the test statistic. When this function was written, the
    scipy function used a normal approximation for the p-value, which is only valid for larger sample sizes (see docs
    scipy.stats.wilcoxon()). Therefore, for len(x) > 20, the scipy.stats.wilcoxon() function is also used to compute
    the p-value. For len(x) <= 20, a lookup table is used to determine the lowest p-value for which the statistic is
    equal or lower.

    Args:
        x (np.ndarray): see scipy.stats.wilcoxon().
        y (np.ndarray, optional): see scipy.stats.wilcoxon().
        fill_outside_table (float, optional): p-value to return if the test statistic does not fall in the range of the
            lookup table (for computing the p-value if len(x) <= 20).
            Defaults to np.nan.
        **kwargs (optional): keyword arguments for scipy.stats.wilcoxon().

    Returns:
        statistic (float): see scipy.stats.wilcoxon(). np.nan if all differences are zero.
        pvalue (float): see scipy.stats.wilcoxon(). If len(x) <= 20, this is the lowest p-value for which the statistic
            is equal or lower. np.nan if all differences are zero.

    Raises:
        NotImplementedError: if len(x) <= 20 and `alternative` is passed with a value other than 'two-sided'.

    Warns:
        DeprecationWarning: always, since scipy.stats.wilcoxon() should be used directly instead.

    Examples:
        Create random data with different means.
        >>> np.random.seed(65)
        >>> n = 20
        >>> x = np.random.normal(2, 1, n)
        >>> y = np.random.normal(2.5, 1, n)

        Compute Wilcoxon p-value using scipy (uses the normal equation).
        >>> scipy.stats.wilcoxon(x, y, mode='approx')[1]
        0.020633435105949553

        Compute the lowest Wilcoxon p-value in the look up table for which the statistic is equal or lower. It should
        be similar to the wilcoxon computed above using the normal equation with n = 20.
        >>> wilcoxon(x, y)[1]
        0.025
    """
    # Warn instead of raising: raising here made the whole implementation below unreachable.
    # stacklevel=2 attributes the warning to the caller's line rather than to this function.
    warnings.warn(
        "This function is deprecated. Use the scipy.stats.wilcoxon with mode='auto' instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # Coerce to arrays so that plain sequences (lists, tuples) support the element-wise difference.
    x = np.asarray(x)
    d = x - np.asarray(y) if y is not None else x

    # If all differences are zero, return np.nan (otherwise, scipy will raise an error).
    if np.all(d == 0):
        return np.nan, np.nan

    # Compute statistic and p-value.
    statistic, pvalue = scipy.stats.wilcoxon(x, y, **kwargs)

    n = len(x)
    if n > 20:
        # Large sample: the p-value computed by scipy is used as is.
        return statistic, pvalue

    # Small sample: replace the scipy p-value with one from a lookup table of critical values for the statistic.
    # NOTE(review): scipy's default zero_method='wilcox' discards zero differences, so the effective sample size
    # may be smaller than len(x) - confirm which n the table expects.
    alternative = kwargs.get('alternative', 'two-sided')
    if alternative != 'two-sided':
        raise NotImplementedError('Look-up Wilcoxon p-value not implemented for `alternative` == "{}"'
                                  .format(alternative))

    # Filepath to Excel file with critical values, located next to this module.
    # (copied from http://www.real-statistics.com/statistics-tables/wilcoxon-signed-ranks-table/)
    module_dir = os.path.dirname(os.path.realpath(__file__))
    file = os.path.join(module_dir, 'wilcoxon_signed_ranks_critical_values.xlsx')
    df = pd.read_excel(file).set_index('n')

    # Find the p-values (row labels after selecting row n) for which the statistic is significantly small.
    critical_values = df.loc[n, :].dropna().sort_index()
    significant = critical_values[statistic <= critical_values]

    if len(significant) == 0:
        # No p-value match in table (p-value is higher than the ones in the table).
        pvalue = fill_outside_table
    else:
        # Select the label of the smallest matching critical value.
        # presumably labels are p-values and critical values shrink with them, giving the smallest p-value - verify
        pvalue = significant.idxmin()

    return statistic, pvalue