Source code for nnsa.stats.partially_paired

"""
Statistical test for comparing groups in partially paired data.
"""
import numpy as np
import scipy.stats


__all__ = [
    'opt_pooled_ttest',
]


[docs]def opt_pooled_ttest(a, b):
    """
    Calculate the optimal pooled t-test on 2 partially paired samples.

    This is a two-sided test for the nul hypothesis that 2 sets of partially paired data have identical averages.
    The partially paired data refers to e.g. the case where there is a pre-treatment and a post-treatment group.
    Some subjects may have values for both pre and post treament, while other subjects may only be measured pre or post
    treatment. This test is suitable to compare the means of such partially paired pre and post groups, when the data
    is not normally distributed or when the sample size is low. See the reference for more detail and for other test
    comparing partially paired data.

    Reference:
        B. Guo and Y. Yuan, "A comparative review of methods for comparing means using partially paired data,"
        Statistical Methods in Medical Research, vol. 26, no. 3, pp. 1323-1340, Apr. 2015.

    Args:
        a (np.ndarray): sample of the first group. Must have the same shape as b. a[i] and b[i] must correspond to
            the same subject. Missing values must be indicated by np.nan.
        b (np.ndarray): sample of the second group. Must have the same shape as a. a[i] and b[i] must correspond to
            the same subject. Missing values must be indicated by np.nan.

    Returns:
        statistic (float): t-statistic.
        pvalue (float): two-tailed p-value.

    Examples:
        Compute the t-statstic and p-value for the example case in the paper by Guo et al. 2015 (see Table 5).
        >>> paired_data = np.asarray(((20, 10), (30, 20), (25, 10), (20, 20), (25, 20), (10, 10), (15, 15), (20, 20), (30, 30)))
        >>> pre_data = np.array((10,20,25,30,20,30,15,20,30,15,15,20,10,25,30,20,20,30,25,30,20,20,10,25,20,10,20,20))
        >>> post_data = np.array((15,25,30,20,10,20,10,30,10,10,10,25,15,20,20,20,20,10,10,10,20,30,10))
        >>> pre = paired_data[:, 0]
        >>> post = paired_data[:, 1]
        >>> pre = np.concatenate((pre, pre_data, np.full(len(post_data), np.nan)))
        >>> post = np.concatenate((post, np.full(len(pre_data), np.nan), post_data))
        >>> statistic, pvalue = opt_pooled_ttest(pre, post)
        >>> print("Test statistic: {:.3f}, p-value: {:.4f}".format(statistic, pvalue))
        Test statistic: 2.697, p-value: 0.0119

        Shuffle the data.
        >>> idx = np.random.permutation(len(pre))
        >>> statistic_2, pvalue_2 = opt_pooled_ttest(pre[idx], post[idx])
        >>> pvalue == pvalue_2
        True

    """
    # Input check.
    if len(a) != len(b):
        raise ValueError('a and b arrays should have the same shape, but len(a)={} and len(b)={}.'
                         .format(len(a), len(b)))

    # Extract paired and unpaired data.
    paired_mask = np.logical_and(~np.isnan(a), ~np.isnan(b))
    pre_paired = a[paired_mask]
    post_paired = b[paired_mask]
    pre_unpaired = a[np.logical_and(~np.isnan(a), ~paired_mask)]
    post_unpaired = b[np.logical_and(~np.isnan(b), ~paired_mask)]

    # See section 2 Guo et al. 2015.
    n = len(pre_paired)
    n_0 = len(pre_unpaired)
    n_1 = len(post_unpaired)

    if n_0 == 0 or n_1 == 0:
        raise ValueError('All values are paired. Test not suitable for purely paired data.')

    # Section 2.1.2
    D = pre_paired - post_paired
    S_D = np.std(D, ddof=1)

    # Section 2.4
    S_0_u = np.std(pre_unpaired, ddof=1)
    S_1_u = np.std(post_unpaired, ddof=1)

    # Section 2.5
    # Compute mu, a weighted sum of the paired and unpaired differences.
    mu_1 = np.mean(D)
    mu_2 = np.mean(pre_unpaired) - np.mean(post_unpaired)
    S_1 = S_D/np.sqrt(n)
    S_2 = np.sqrt(S_0_u**2/n_0 + S_1_u**2/n_1)
    omega = S_1**(-2) + S_2**(-2)
    omega_1 = S_1**(-2)/omega
    omega_2 = S_2**(-2)/omega
    mu = omega_1*mu_1 + omega_2*mu_2

    # Compute standard deviation/variance of mu.
    df_1 = n - 1
    nom = (S_0_u**2/n_0 + S_1_u**2/n_1)**2
    den = (S_0_u**2/n_0)**2/(n_0 - 1) + (S_1_u**2/n_1)**2/(n_1 - 1)
    df_2 = nom/den
    S_mu = np.sqrt((1 + 4*omega_1*(1 - omega_1)/df_1 + 4*omega_2*(1 - omega_2)/df_2)/omega)

    # T test statistic.
    statistic = mu/S_mu

    # Degrees of freedom for t-distribution.
    df = 1/(omega_1**2/df_1 + omega_2**2/df_2)

    # Compute 2-sided p-value.
    pvalue = 2*(1 - scipy.stats.t.cdf(abs(statistic), df=df))

    return statistic, pvalue