368 lines
13 KiB
Python
368 lines
13 KiB
Python
|
"""
|
||
|
|
||
|
Created on Wed Mar 28 15:34:18 2012
|
||
|
|
||
|
Author: Josef Perktold
|
||
|
"""
|
||
|
from statsmodels.compat.python import asbytes
|
||
|
|
||
|
from io import BytesIO
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
import pytest
|
||
|
from numpy.testing import assert_, assert_allclose, assert_almost_equal, assert_equal, \
|
||
|
assert_raises
|
||
|
|
||
|
from statsmodels.stats.libqsturng import qsturng
|
||
|
from statsmodels.stats.multicomp import (tukeyhsd, pairwise_tukeyhsd,
|
||
|
MultiComparison)
|
||
|
|
||
|
ss = '''\
|
||
|
43.9 1 1
|
||
|
39.0 1 2
|
||
|
46.7 1 3
|
||
|
43.8 1 4
|
||
|
44.2 1 5
|
||
|
47.7 1 6
|
||
|
43.6 1 7
|
||
|
38.9 1 8
|
||
|
43.6 1 9
|
||
|
40.0 1 10
|
||
|
89.8 2 1
|
||
|
87.1 2 2
|
||
|
92.7 2 3
|
||
|
90.6 2 4
|
||
|
87.7 2 5
|
||
|
92.4 2 6
|
||
|
86.1 2 7
|
||
|
88.1 2 8
|
||
|
90.8 2 9
|
||
|
89.1 2 10
|
||
|
68.4 3 1
|
||
|
69.3 3 2
|
||
|
68.5 3 3
|
||
|
66.4 3 4
|
||
|
70.0 3 5
|
||
|
68.1 3 6
|
||
|
70.6 3 7
|
||
|
65.2 3 8
|
||
|
63.8 3 9
|
||
|
69.2 3 10
|
||
|
36.2 4 1
|
||
|
45.2 4 2
|
||
|
40.7 4 3
|
||
|
40.5 4 4
|
||
|
39.3 4 5
|
||
|
40.3 4 6
|
||
|
43.2 4 7
|
||
|
38.7 4 8
|
||
|
40.9 4 9
|
||
|
39.7 4 10'''
|
||
|
|
||
|
# Whitespace-separated raw table with the columns
# idx, Treatment, StressReduction.
# It is parsed further below into ``dta2``.
ss2 = '''\
1 mental 2
2 mental 2
3 mental 3
4 mental 4
5 mental 4
6 mental 5
7 mental 3
8 mental 4
9 mental 4
10 mental 4
11 physical 4
12 physical 4
13 physical 3
14 physical 5
15 physical 4
16 physical 1
17 physical 1
18 physical 2
19 physical 3
20 physical 3
21 medical 1
22 medical 2
23 medical 2
24 medical 2
25 medical 3
26 medical 2
27 medical 3
28 medical 1
29 medical 3
30 medical 1'''
|
||
|
|
||
|
ss3 = '''\
|
||
|
1 24.5
|
||
|
1 23.5
|
||
|
1 26.4
|
||
|
1 27.1
|
||
|
1 29.9
|
||
|
2 28.4
|
||
|
2 34.2
|
||
|
2 29.5
|
||
|
2 32.2
|
||
|
2 30.1
|
||
|
3 26.1
|
||
|
3 28.3
|
||
|
3 24.3
|
||
|
3 26.2
|
||
|
3 27.8'''
|
||
|
|
||
|
ss5 = '''\
|
||
|
2 - 3\t4.340\t0.691\t7.989\t***
|
||
|
2 - 1\t4.600\t0.951\t8.249\t***
|
||
|
3 - 2\t-4.340\t-7.989\t-0.691\t***
|
||
|
3 - 1\t0.260\t-3.389\t3.909\t-
|
||
|
1 - 2\t-4.600\t-8.249\t-0.951\t***
|
||
|
1 - 3\t-0.260\t-3.909\t3.389\t-
|
||
|
'''
|
||
|
|
||
|
# Observations used as ``endog`` by TestTuckeyHSD4.
cylinders = np.array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, 4,
                      4, 4, 4, 4, 6, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6,
                      6, 6, 4, 4, 4, 4, 4, 8, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                      4, 4, 4, 4, 4, 4, 4, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4])
# Group labels used as ``groups`` by TestTuckeyHSD4 (paired with ``cylinders``).
cyl_labels = np.array(['USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'France',
    'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Japan',
    'Germany', 'France', 'Germany', 'Sweden', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany',
    'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany',
    'Japan', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'Sweden', 'USA', 'France',
    'Japan', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA',
    'Germany', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'USA',
    'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'USA'])
|
||
|
|
||
|
# The raw tables are converted to bytes so that each one can be wrapped in a
# BytesIO buffer and handed to pd.read_csv.
ss, ss2, ss3, ss5 = (asbytes(raw) for raw in (ss, ss2, ss3, ss5))

dta = pd.read_csv(BytesIO(ss), sep=r'\s+', header=None, engine='python')
dta.columns = ["Rust", "Brand", "Replication"]

dta2 = pd.read_csv(BytesIO(ss2), sep=r'\s+', header=None, engine='python')
dta2.columns = ["idx", "Treatment", "StressReduction"]
# the treatment labels are kept as bytes; the tests compare against b'...'
dta2["Treatment"] = dta2["Treatment"].map(lambda label: label.encode('utf-8'))

dta3 = pd.read_csv(BytesIO(ss3), sep=r'\s+', header=None, engine='python')
dta3.columns = ["Brand", "Relief"]

dta5 = pd.read_csv(BytesIO(ss5), sep=r'\t', header=None, engine='python')
dta5.columns = ['pair', 'mean', 'lower', 'upper', 'sig']
# the two text columns are stored as bytes as well
for col in ('pair', 'sig'):
    dta5[col] = dta5[col].map(lambda text: text.encode('utf-8'))

# rows 1, 3 and 2 (in that order) serve as the reference for TestTuckeyHSD3
sas_ = dta5.iloc[[1, 3, 2]]
|
||
|
|
||
|
|
||
|
def get_thsd(mci, alpha=0.05):
    """Run ``tukeyhsd`` directly on the group statistics of ``mci``.

    Parameters
    ----------
    mci : object
        Provides ``groupsunique`` and ``groupstats`` (with ``groupdemean``,
        ``groupmean``, ``groupnobs`` and ``groupvarwithin``); the tests pass
        a ``MultiComparison`` instance.
    alpha : float
        Significance level forwarded to ``qsturng`` and ``tukeyhsd``.

    Returns
    -------
    The value returned by ``tukeyhsd``.

    Notes
    -----
    As a consistency check, the variance of the demeaned data is asserted to
    agree (to 14 decimals) with the nobs-1 weighted average of the
    within-group variances.
    """
    gstats = mci.groupstats
    n_groups = len(mci.groupsunique)
    pooled_var = np.var(gstats.groupdemean(), ddof=n_groups)

    group_means = gstats.groupmean
    group_nobs = gstats.groupnobs
    q_critical = qsturng(1-alpha, len(group_means), (group_nobs-1).sum())
    result = tukeyhsd(group_means, group_nobs, pooled_var, df=None,
                      alpha=alpha, q_crit=q_critical)

    # cross-check the pooled variance against the weighted within variances
    dof = group_nobs - 1.
    weighted_var = (gstats.groupvarwithin() * dof).sum() / dof.sum()
    assert_almost_equal(pooled_var, weighted_var, decimal=14)

    return result
|
||
|
|
||
|
class CheckTuckeyHSDMixin:
    """Shared Tukey HSD checks.

    Subclasses set ``endog``, ``groups`` and ``alpha`` before calling
    ``setup_class_`` and supply the reference values ``meandiff2``,
    ``confint2`` and ``reject2``.
    """

    @classmethod
    def setup_class_(cls):
        """Create the MultiComparison instance and its Tukey HSD result."""
        comparison = MultiComparison(cls.endog, cls.groups)
        cls.mc = comparison
        cls.res = comparison.tukeyhsd(alpha=cls.alpha)

    def test_multicomptukey(self):
        """Mean differences, intervals and rejections match the references."""
        result = self.res
        assert_almost_equal(result.meandiffs, self.meandiff2, decimal=14)
        assert_almost_equal(result.confint, self.confint2, decimal=2)
        assert_equal(result.reject, self.reject2)

    def test_group_tukey(self):
        """Element 4 of the ``get_thsd`` result matches the reference intervals."""
        confint = get_thsd(self.mc, alpha=self.alpha)[4]
        assert_almost_equal(confint, self.confint2, decimal=2)

    def test_shortcut_function(self):
        """The ``pairwise_tukeyhsd`` wrapper gives the same intervals."""
        shortcut = pairwise_tukeyhsd(self.endog, self.groups,
                                     alpha=self.alpha)
        assert_almost_equal(shortcut.confint, self.res.confint, decimal=14)

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_plot_simultaneous_ci(self, close_figures):
        """Smoke test: the simultaneous interval plot runs without error."""
        result = self.res
        result._simultaneous_ci()
        # use the second unique group as the comparison group
        result.plot_simultaneous(comparison_name=result.groupsunique[1])
|
||
|
|
||
|
|
||
|
class TestTuckeyHSD2(CheckTuckeyHSDMixin):
    """Balanced case on the ``dta2`` data with reference values from R."""

    @classmethod
    def setup_class(cls):
        # balanced case
        cls.alpha = 0.05
        cls.endog = dta2['StressReduction']
        cls.groups = dta2['Treatment']
        cls.setup_class_()  # defined on the mixin

        # from R; one inner list per column of the reference table
        r_table = np.array([
            [1.5, 1, -0.5],                      # mean differences
            [0.3214915, -0.1785085, -1.678509],  # first interval bound
            [2.678509, 2.178509, 0.6785085],     # second interval bound
            [0.01056279, 0.1079035, 0.5513904],  # p-values
        ]).T
        cls.meandiff2 = r_table[:, 0]
        cls.confint2 = r_table[:, 1:3]
        cls.reject2 = r_table[:, 3] < 0.05

    def test_table_names_default_group_order(self):
        # without the group_order parameter the groups should be
        # reported in alphabetical order
        table = self.res._results_table
        expected_order = [(b'medical', b'mental'),
                          (b'medical', b'physical'),
                          (b'mental', b'physical')]
        # row 0 of the table is skipped; the pairs start at row 1
        for row_idx, expected in enumerate(expected_order, start=1):
            row = table[row_idx]
            assert_((row[0].data, row[1].data) == expected)

    def test_table_names_custom_group_order(self):
        # with the group_order parameter the groups should be
        # reported in the specified order
        custom = MultiComparison(self.endog, self.groups,
                                 group_order=[b'physical', b'medical', b'mental'])
        table = custom.tukeyhsd(alpha=self.alpha)._results_table
        expected_order = [(b'physical', b'medical'),
                          (b'physical', b'mental'),
                          (b'medical', b'mental')]
        # row 0 of the table is skipped; the pairs start at row 1
        for row_idx, expected in enumerate(expected_order, start=1):
            row = table[row_idx]
            assert_((row[0].data, row[1].data) == expected)
|
||
|
|
||
|
|
||
|
class TestTuckeyHSD2Pandas(TestTuckeyHSD2):
    """Variant of TestTuckeyHSD2 whose inputs are wrapped in pandas Series."""

    @classmethod
    def setup_class(cls):
        super().setup_class()

        # NOTE(review): ``cls.mc`` and ``cls.res`` were already built by the
        # parent setup before this conversion; only tests that read
        # ``endog``/``groups`` directly see the Series created here.
        cls.endog = pd.Series(cls.endog)
        # we are working with bytes on python 3, not with strings in this case
        cls.groups = pd.Series(cls.groups, dtype=object)

    def test_incorrect_output(self):
        """Invalid inputs raise; a partial group_order warns and subsets."""
        # too few groups
        with pytest.raises(ValueError):
            MultiComparison(np.array([1] * 10), [1, 2] * 4)
        # too many groups
        with pytest.raises(ValueError):
            MultiComparison(np.array([1] * 10), [1, 2] * 6)
        # just one group
        with pytest.raises(ValueError):
            MultiComparison(np.array([1] * 10), [1] * 10)

        # group_order does not select all observations, only one group left
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            with pytest.raises(ValueError):
                MultiComparison(np.array([1] * 10), [1, 2] * 5,
                                group_order=[1])

        # group_order does not select all observations,
        # we do tukey_hsd with reduced set of observations
        data = np.arange(15)
        groups = np.repeat([1, 2, 3], 5)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('always')
            mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2])
            assert_equal(len(w), 1)
            assert issubclass(w[0].category, UserWarning)

        res1 = mod1.tukeyhsd(alpha=0.01)
        mod2 = MultiComparison(np.array(data[:10]), groups[:10])
        res2 = mod2.tukeyhsd(alpha=0.01)

        res_attributes = ['confint', 'data', 'df_total', 'groups',
                          'groupsunique', 'meandiffs', 'q_crit', 'reject',
                          'reject2', 'std_pairs', 'variance']
        mod_attributes = ['data', 'datali', 'groupintlab', 'groups',
                          'groupsunique', 'ngroups', 'nobs', 'pairindices']

        # results first, then the model instances, as in the two separate
        # loops this replaces
        for subset, full, attributes in ((res1, res2, res_attributes),
                                         (mod1, mod2, mod_attributes)):
            for att in attributes:
                # the space keeps the attribute name readable in the message
                assert_allclose(getattr(subset, att), getattr(full, att),
                                rtol=1e-14, err_msg=att + ' failed')
|
||
|
|
||
|
|
||
|
class TestTuckeyHSD2s(CheckTuckeyHSDMixin):
    """Unbalanced case: rows 3..28 of ``dta2``, reference values from R."""

    @classmethod
    def setup_class(cls):
        # unbalanced case
        cls.alpha = 0.01
        cls.endog = dta2['StressReduction'][3:29]
        cls.groups = dta2['Treatment'][3:29]
        cls.setup_class_()

        # from R; one inner list per column of the reference table
        r_table = np.array([
            [1.8888888888888889, 0.888888888888889, -1],  # mean differences
            [0.2658549, -0.5908785, -2.587133],           # first interval bound
            [3.511923, 2.368656, 0.5871331],              # second interval bound
            [0.002837638, 0.150456, 0.1266072],           # p-values
        ]).T
        cls.meandiff2 = r_table[:, 0]
        cls.confint2 = r_table[:, 1:3]
        cls.reject2 = r_table[:, 3] < 0.01
|
||
|
|
||
|
|
||
|
class TestTuckeyHSD3(CheckTuckeyHSDMixin):
    """SAS case: ``dta3`` data checked against the ``sas_`` reference rows."""

    @classmethod
    def setup_class(cls):
        # SAS case
        cls.alpha = 0.05
        cls.endog = dta3['Relief']
        cls.groups = dta3['Brand']
        cls.setup_class_()

        bounds = sas_[['lower', 'upper']]
        significant = asbytes('***')
        cls.meandiff2 = sas_['mean']
        cls.confint2 = bounds.astype(float).values.reshape((3, 2))
        # a pair counts as rejected when its 'sig' entry is the '***' marker
        cls.reject2 = sas_['sig'] == significant
|
||
|
|
||
|
|
||
|
class TestTuckeyHSD4(CheckTuckeyHSDMixin):
    """Unbalanced ``cylinders`` data with reference values from Matlab."""

    @classmethod
    def setup_class(cls):
        # unbalanced case verified in Matlab
        cls.alpha = 0.05
        cls.endog = cylinders
        cls.groups = cyl_labels
        cls.setup_class_()
        # run before test_hochberg_intervals reads ``res.halfwidths``
        cls.res._simultaneous_ci()

        # from Matlab
        cls.halfwidth2 = np.array([
            1.5228335685980883,
            0.9794949704444682,
            0.78673802805533644,
            2.3321237694566364,
            0.57355135882752939,
        ])
        cls.meandiff2 = np.array([
            0.22222222222222232,
            0.13333333333333375,
            0.0,
            2.2898550724637685,
            -0.088888888888888573,
            -0.22222222222222232,
            2.0676328502415462,
            -0.13333333333333375,
            2.1565217391304348,
            2.2898550724637685,
        ])
        # one pair of interval bounds per pairwise comparison
        cls.confint2 = np.array([
            [-2.32022210717, 2.76466655161],
            [-2.247517583, 2.51418424967],
            [-3.66405224956, 3.66405224956],
            [0.113960166573, 4.46574997835],
            [-1.87278583908, 1.6950080613],
            [-3.529655688, 3.08521124356],
            [0.568180988881, 3.5670847116],
            [-3.31822643175, 3.05155976508],
            [0.951206924521, 3.36183655374],
            [-0.74487911754, 5.32458926247],
        ])
        cls.reject2 = np.array([False, False, False, True, False,
                                False, True, False, True, False])

    def test_hochberg_intervals(self):
        """Half-widths agree with the Matlab values to 4 decimals."""
        assert_almost_equal(self.res.halfwidths, self.halfwidth2, 4)
|