Source code for nnsa.stats.partially_paired

"""
Statistical test for comparing groups in partially paired data.
"""
import numpy as np
import scipy.stats


__all__ = [
    'opt_pooled_ttest',
]


[docs]def opt_pooled_ttest(a, b): """ Calculate the optimal pooled t-test on 2 partially paired samples. This is a two-sided test for the nul hypothesis that 2 sets of partially paired data have identical averages. The partially paired data refers to e.g. the case where there is a pre-treatment and a post-treatment group. Some subjects may have values for both pre and post treament, while other subjects may only be measured pre or post treatment. This test is suitable to compare the means of such partially paired pre and post groups, when the data is not normally distributed or when the sample size is low. See the reference for more detail and for other test comparing partially paired data. Reference: B. Guo and Y. Yuan, "A comparative review of methods for comparing means using partially paired data," Statistical Methods in Medical Research, vol. 26, no. 3, pp. 1323-1340, Apr. 2015. Args: a (np.ndarray): sample of the first group. Must have the same shape as b. a[i] and b[i] must correspond to the same subject. Missing values must be indicated by np.nan. b (np.ndarray): sample of the second group. Must have the same shape as a. a[i] and b[i] must correspond to the same subject. Missing values must be indicated by np.nan. Returns: statistic (float): t-statistic. pvalue (float): two-tailed p-value. Examples: Compute the t-statstic and p-value for the example case in the paper by Guo et al. 2015 (see Table 5). >>> paired_data = np.asarray(((20, 10), (30, 20), (25, 10), (20, 20), (25, 20), (10, 10), (15, 15), (20, 20), (30, 30))) >>> pre_data = np.array((10,20,25,30,20,30,15,20,30,15,15,20,10,25,30,20,20,30,25,30,20,20,10,25,20,10,20,20)) >>> post_data = np.array((15,25,30,20,10,20,10,30,10,10,10,25,15,20,20,20,20,10,10,10,20,30,10)) >>> pre = paired_data[:, 0] >>> post = paired_data[:, 1] >>> pre = np.concatenate((pre, pre_data, np.full(len(post_data), np.nan))) >>> post = np.concatenate((post, np.full(len(pre_data), np.nan), post_data)) >>> statistic, pvalue = opt_pooled_ttest(pre, post) >>> print("Test statistic: {:.3f}, p-value: {:.4f}".format(statistic, pvalue)) Test statistic: 2.697, p-value: 0.0119 Shuffle the data. >>> idx = np.random.permutation(len(pre)) >>> statistic_2, pvalue_2 = opt_pooled_ttest(pre[idx], post[idx]) >>> pvalue == pvalue_2 True """ # Input check. if len(a) != len(b): raise ValueError('a and b arrays should have the same shape, but len(a)={} and len(b)={}.' .format(len(a), len(b))) # Extract paired and unpaired data. paired_mask = np.logical_and(~np.isnan(a), ~np.isnan(b)) pre_paired = a[paired_mask] post_paired = b[paired_mask] pre_unpaired = a[np.logical_and(~np.isnan(a), ~paired_mask)] post_unpaired = b[np.logical_and(~np.isnan(b), ~paired_mask)] # See section 2 Guo et al. 2015. n = len(pre_paired) n_0 = len(pre_unpaired) n_1 = len(post_unpaired) if n_0 == 0 or n_1 == 0: raise ValueError('All values are paired. Test not suitable for purely paired data.') # Section 2.1.2 D = pre_paired - post_paired S_D = np.std(D, ddof=1) # Section 2.4 S_0_u = np.std(pre_unpaired, ddof=1) S_1_u = np.std(post_unpaired, ddof=1) # Section 2.5 # Compute mu, a weighted sum of the paired and unpaired differences. mu_1 = np.mean(D) mu_2 = np.mean(pre_unpaired) - np.mean(post_unpaired) S_1 = S_D/np.sqrt(n) S_2 = np.sqrt(S_0_u**2/n_0 + S_1_u**2/n_1) omega = S_1**(-2) + S_2**(-2) omega_1 = S_1**(-2)/omega omega_2 = S_2**(-2)/omega mu = omega_1*mu_1 + omega_2*mu_2 # Compute standard deviation/variance of mu. df_1 = n - 1 nom = (S_0_u**2/n_0 + S_1_u**2/n_1)**2 den = (S_0_u**2/n_0)**2/(n_0 - 1) + (S_1_u**2/n_1)**2/(n_1 - 1) df_2 = nom/den S_mu = np.sqrt((1 + 4*omega_1*(1 - omega_1)/df_1 + 4*omega_2*(1 - omega_2)/df_2)/omega) # T test statistic. statistic = mu/S_mu # Degrees of freedom for t-distribution. df = 1/(omega_1**2/df_1 + omega_2**2/df_2) # Compute 2-sided p-value. pvalue = 2*(1 - scipy.stats.t.cdf(abs(statistic), df=df)) return statistic, pvalue