"""
Statistical test for comparing groups in partially paired data.
"""
import numpy as np
import scipy.stats
__all__ = [
'opt_pooled_ttest',
]
[docs]def opt_pooled_ttest(a, b):
"""
Calculate the optimal pooled t-test on 2 partially paired samples.
This is a two-sided test for the nul hypothesis that 2 sets of partially paired data have identical averages.
The partially paired data refers to e.g. the case where there is a pre-treatment and a post-treatment group.
Some subjects may have values for both pre and post treament, while other subjects may only be measured pre or post
treatment. This test is suitable to compare the means of such partially paired pre and post groups, when the data
is not normally distributed or when the sample size is low. See the reference for more detail and for other test
comparing partially paired data.
Reference:
B. Guo and Y. Yuan, "A comparative review of methods for comparing means using partially paired data,"
Statistical Methods in Medical Research, vol. 26, no. 3, pp. 1323-1340, Apr. 2015.
Args:
a (np.ndarray): sample of the first group. Must have the same shape as b. a[i] and b[i] must correspond to
the same subject. Missing values must be indicated by np.nan.
b (np.ndarray): sample of the second group. Must have the same shape as a. a[i] and b[i] must correspond to
the same subject. Missing values must be indicated by np.nan.
Returns:
statistic (float): t-statistic.
pvalue (float): two-tailed p-value.
Examples:
Compute the t-statstic and p-value for the example case in the paper by Guo et al. 2015 (see Table 5).
>>> paired_data = np.asarray(((20, 10), (30, 20), (25, 10), (20, 20), (25, 20), (10, 10), (15, 15), (20, 20), (30, 30)))
>>> pre_data = np.array((10,20,25,30,20,30,15,20,30,15,15,20,10,25,30,20,20,30,25,30,20,20,10,25,20,10,20,20))
>>> post_data = np.array((15,25,30,20,10,20,10,30,10,10,10,25,15,20,20,20,20,10,10,10,20,30,10))
>>> pre = paired_data[:, 0]
>>> post = paired_data[:, 1]
>>> pre = np.concatenate((pre, pre_data, np.full(len(post_data), np.nan)))
>>> post = np.concatenate((post, np.full(len(pre_data), np.nan), post_data))
>>> statistic, pvalue = opt_pooled_ttest(pre, post)
>>> print("Test statistic: {:.3f}, p-value: {:.4f}".format(statistic, pvalue))
Test statistic: 2.697, p-value: 0.0119
Shuffle the data.
>>> idx = np.random.permutation(len(pre))
>>> statistic_2, pvalue_2 = opt_pooled_ttest(pre[idx], post[idx])
>>> pvalue == pvalue_2
True
"""
# Input check.
if len(a) != len(b):
raise ValueError('a and b arrays should have the same shape, but len(a)={} and len(b)={}.'
.format(len(a), len(b)))
# Extract paired and unpaired data.
paired_mask = np.logical_and(~np.isnan(a), ~np.isnan(b))
pre_paired = a[paired_mask]
post_paired = b[paired_mask]
pre_unpaired = a[np.logical_and(~np.isnan(a), ~paired_mask)]
post_unpaired = b[np.logical_and(~np.isnan(b), ~paired_mask)]
# See section 2 Guo et al. 2015.
n = len(pre_paired)
n_0 = len(pre_unpaired)
n_1 = len(post_unpaired)
if n_0 == 0 or n_1 == 0:
raise ValueError('All values are paired. Test not suitable for purely paired data.')
# Section 2.1.2
D = pre_paired - post_paired
S_D = np.std(D, ddof=1)
# Section 2.4
S_0_u = np.std(pre_unpaired, ddof=1)
S_1_u = np.std(post_unpaired, ddof=1)
# Section 2.5
# Compute mu, a weighted sum of the paired and unpaired differences.
mu_1 = np.mean(D)
mu_2 = np.mean(pre_unpaired) - np.mean(post_unpaired)
S_1 = S_D/np.sqrt(n)
S_2 = np.sqrt(S_0_u**2/n_0 + S_1_u**2/n_1)
omega = S_1**(-2) + S_2**(-2)
omega_1 = S_1**(-2)/omega
omega_2 = S_2**(-2)/omega
mu = omega_1*mu_1 + omega_2*mu_2
# Compute standard deviation/variance of mu.
df_1 = n - 1
nom = (S_0_u**2/n_0 + S_1_u**2/n_1)**2
den = (S_0_u**2/n_0)**2/(n_0 - 1) + (S_1_u**2/n_1)**2/(n_1 - 1)
df_2 = nom/den
S_mu = np.sqrt((1 + 4*omega_1*(1 - omega_1)/df_1 + 4*omega_2*(1 - omega_2)/df_2)/omega)
# T test statistic.
statistic = mu/S_mu
# Degrees of freedom for t-distribution.
df = 1/(omega_1**2/df_1 + omega_2**2/df_2)
# Compute 2-sided p-value.
pvalue = 2*(1 - scipy.stats.t.cdf(abs(statistic), df=df))
return statistic, pvalue