""" Created on Wed Mar 28 15:34:18 2012 Author: Josef Perktold """ from statsmodels.compat.python import asbytes from io import BytesIO import warnings import numpy as np import pandas as pd import pytest from numpy.testing import assert_, assert_allclose, assert_almost_equal, assert_equal, \ assert_raises from statsmodels.stats.libqsturng import qsturng from statsmodels.stats.multicomp import (tukeyhsd, pairwise_tukeyhsd, MultiComparison) ss = '''\ 43.9 1 1 39.0 1 2 46.7 1 3 43.8 1 4 44.2 1 5 47.7 1 6 43.6 1 7 38.9 1 8 43.6 1 9 40.0 1 10 89.8 2 1 87.1 2 2 92.7 2 3 90.6 2 4 87.7 2 5 92.4 2 6 86.1 2 7 88.1 2 8 90.8 2 9 89.1 2 10 68.4 3 1 69.3 3 2 68.5 3 3 66.4 3 4 70.0 3 5 68.1 3 6 70.6 3 7 65.2 3 8 63.8 3 9 69.2 3 10 36.2 4 1 45.2 4 2 40.7 4 3 40.5 4 4 39.3 4 5 40.3 4 6 43.2 4 7 38.7 4 8 40.9 4 9 39.7 4 10''' #idx Treatment StressReduction ss2 = '''\ 1 mental 2 2 mental 2 3 mental 3 4 mental 4 5 mental 4 6 mental 5 7 mental 3 8 mental 4 9 mental 4 10 mental 4 11 physical 4 12 physical 4 13 physical 3 14 physical 5 15 physical 4 16 physical 1 17 physical 1 18 physical 2 19 physical 3 20 physical 3 21 medical 1 22 medical 2 23 medical 2 24 medical 2 25 medical 3 26 medical 2 27 medical 3 28 medical 1 29 medical 3 30 medical 1''' ss3 = '''\ 1 24.5 1 23.5 1 26.4 1 27.1 1 29.9 2 28.4 2 34.2 2 29.5 2 32.2 2 30.1 3 26.1 3 28.3 3 24.3 3 26.2 3 27.8''' ss5 = '''\ 2 - 3\t4.340\t0.691\t7.989\t*** 2 - 1\t4.600\t0.951\t8.249\t*** 3 - 2\t-4.340\t-7.989\t-0.691\t*** 3 - 1\t0.260\t-3.389\t3.909\t- 1 - 2\t-4.600\t-8.249\t-0.951\t*** 1 - 3\t-0.260\t-3.909\t3.389\t- ''' cylinders = np.array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 6, 6, 6, 4, 4, 4, 4, 4, 4, 6, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 4, 4, 4, 4, 4, 8, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 4, 6, 4, 4, 4, 4, 4, 4, 4, 4]) cyl_labels = np.array(['USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Japan', 'Germany', 'France', 'Germany', 'Sweden', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'France', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'Sweden', 'USA', 'France', 'Japan', 'Germany', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'USA', 'Germany', 'Japan', 'Japan', 'USA', 'USA', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'Japan', 'USA', 'USA', 'USA', 'USA', 'Japan', 'USA', 'USA', 'USA', 'Germany', 'USA', 'USA', 'USA']) #accommodate recfromtxt for python 3.2, requires bytes ss = asbytes(ss) ss2 = asbytes(ss2) ss3 = asbytes(ss3) ss5 = asbytes(ss5) dta = pd.read_csv(BytesIO(ss), sep=r'\s+', header=None, engine='python') dta.columns = "Rust", "Brand", "Replication" dta2 = pd.read_csv(BytesIO(ss2), sep=r'\s+', header=None, engine='python') dta2.columns = "idx", "Treatment", "StressReduction" dta2["Treatment"] = dta2["Treatment"].map(lambda v: v.encode('utf-8')) dta3 = pd.read_csv(BytesIO(ss3), sep=r'\s+', header=None, engine='python') dta3.columns = ["Brand", "Relief"] dta5 = pd.read_csv(BytesIO(ss5), sep=r'\t', header=None, engine='python') dta5.columns = ['pair', 'mean', 'lower', 'upper', 'sig'] for col in ('pair', 'sig'): dta5[col] = dta5[col].map(lambda v: v.encode('utf-8')) sas_ = dta5.iloc[[1, 3, 2]] def get_thsd(mci, alpha=0.05): var_ = np.var(mci.groupstats.groupdemean(), ddof=len(mci.groupsunique)) means = mci.groupstats.groupmean nobs = mci.groupstats.groupnobs resi = tukeyhsd(means, nobs, var_, df=None, alpha=alpha, q_crit=qsturng(1-alpha, len(means), (nobs-1).sum())) #print resi[4] var2 = (mci.groupstats.groupvarwithin() * (nobs - 1.)).sum() \ / (nobs - 1.).sum() #print nobs, (nobs - 1).sum() #print mci.groupstats.groupvarwithin() assert_almost_equal(var_, var2, decimal=14) return resi class CheckTuckeyHSDMixin: @classmethod def setup_class_(cls): cls.mc = MultiComparison(cls.endog, cls.groups) cls.res = cls.mc.tukeyhsd(alpha=cls.alpha) def test_multicomptukey(self): assert_almost_equal(self.res.meandiffs, self.meandiff2, decimal=14) assert_almost_equal(self.res.confint, self.confint2, decimal=2) assert_equal(self.res.reject, self.reject2) def test_group_tukey(self): res_t = get_thsd(self.mc, alpha=self.alpha) assert_almost_equal(res_t[4], self.confint2, decimal=2) def test_shortcut_function(self): #check wrapper function res = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha) assert_almost_equal(res.confint, self.res.confint, decimal=14) @pytest.mark.smoke @pytest.mark.matplotlib def test_plot_simultaneous_ci(self, close_figures): self.res._simultaneous_ci() reference = self.res.groupsunique[1] self.res.plot_simultaneous(comparison_name=reference) class TestTuckeyHSD2(CheckTuckeyHSDMixin): @classmethod def setup_class(cls): #balanced case cls.endog = dta2['StressReduction'] cls.groups = dta2['Treatment'] cls.alpha = 0.05 cls.setup_class_() #in super #from R tukeyhsd2s = np.array([ 1.5,1,-0.5,0.3214915, -0.1785085,-1.678509,2.678509,2.178509, 0.6785085,0.01056279,0.1079035,0.5513904] ).reshape(3,4, order='F') cls.meandiff2 = tukeyhsd2s[:, 0] cls.confint2 = tukeyhsd2s[:, 1:3] pvals = tukeyhsd2s[:, 3] cls.reject2 = pvals < 0.05 def test_table_names_default_group_order(self): t = self.res._results_table # if the group_order parameter is not used, the groups should # be reported in alphabetical order expected_order = [(b'medical', b'mental'), (b'medical', b'physical'), (b'mental', b'physical')] for i in range(1, 4): first_group = t[i][0].data second_group = t[i][1].data assert_((first_group, second_group) == expected_order[i - 1]) def test_table_names_custom_group_order(self): # if the group_order parameter is used, the groups should # be reported in the specified order mc = MultiComparison(self.endog, self.groups, group_order=[b'physical', b'medical', b'mental']) res = mc.tukeyhsd(alpha=self.alpha) #print(res) t = res._results_table expected_order = [(b'physical',b'medical'), (b'physical',b'mental'), (b'medical', b'mental')] for i in range(1, 4): first_group = t[i][0].data second_group = t[i][1].data assert_((first_group, second_group) == expected_order[i - 1]) class TestTuckeyHSD2Pandas(TestTuckeyHSD2): @classmethod def setup_class(cls): super().setup_class() cls.endog = pd.Series(cls.endog) # we are working with bytes on python 3, not with strings in this case cls.groups = pd.Series(cls.groups, dtype=object) def test_incorrect_output(self): # too few groups with pytest.raises(ValueError): MultiComparison(np.array([1] * 10), [1, 2] * 4) # too many groups with pytest.raises(ValueError): MultiComparison(np.array([1] * 10), [1, 2] * 6) # just one group with pytest.raises(ValueError): MultiComparison(np.array([1] * 10), [1] * 10) # group_order does not select all observations, only one group left with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') assert_raises(ValueError, MultiComparison, np.array([1] * 10), [1, 2] * 5, group_order=[1]) # group_order does not select all observations, # we do tukey_hsd with reduced set of observations data = np.arange(15) groups = np.repeat([1, 2, 3], 5) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') mod1 = MultiComparison(np.array(data), groups, group_order=[1, 2]) assert_equal(len(w), 1) assert issubclass(w[0].category, UserWarning) res1 = mod1.tukeyhsd(alpha=0.01) mod2 = MultiComparison(np.array(data[:10]), groups[:10]) res2 = mod2.tukeyhsd(alpha=0.01) attributes = ['confint', 'data', 'df_total', 'groups', 'groupsunique', 'meandiffs', 'q_crit', 'reject', 'reject2', 'std_pairs', 'variance'] for att in attributes: err_msg = att + 'failed' assert_allclose(getattr(res1, att), getattr(res2, att), rtol=1e-14, err_msg=err_msg) attributes = ['data', 'datali', 'groupintlab', 'groups', 'groupsunique', 'ngroups', 'nobs', 'pairindices'] for att in attributes: err_msg = att + 'failed' assert_allclose(getattr(mod1, att), getattr(mod2, att), rtol=1e-14, err_msg=err_msg) class TestTuckeyHSD2s(CheckTuckeyHSDMixin): @classmethod def setup_class(cls): #unbalanced case cls.endog = dta2['StressReduction'][3:29] cls.groups = dta2['Treatment'][3:29] cls.alpha = 0.01 cls.setup_class_() #from R tukeyhsd2s = np.array( [1.8888888888888889, 0.888888888888889, -1, 0.2658549, -0.5908785, -2.587133, 3.511923, 2.368656, 0.5871331, 0.002837638, 0.150456, 0.1266072] ).reshape(3,4, order='F') cls.meandiff2 = tukeyhsd2s[:, 0] cls.confint2 = tukeyhsd2s[:, 1:3] pvals = tukeyhsd2s[:, 3] cls.reject2 = pvals < 0.01 class TestTuckeyHSD3(CheckTuckeyHSDMixin): @classmethod def setup_class(cls): #SAS case cls.endog = dta3['Relief'] cls.groups = dta3['Brand'] cls.alpha = 0.05 cls.setup_class_() #super(cls, cls).setup_class_() #CheckTuckeyHSD.setup_class_() cls.meandiff2 = sas_['mean'] cls.confint2 = sas_[['lower','upper']].astype(float).values.reshape((3, 2)) cls.reject2 = sas_['sig'] == asbytes('***') class TestTuckeyHSD4(CheckTuckeyHSDMixin): @classmethod def setup_class(cls): #unbalanced case verified in Matlab cls.endog = cylinders cls.groups = cyl_labels cls.alpha = 0.05 cls.setup_class_() cls.res._simultaneous_ci() #from Matlab cls.halfwidth2 = np.array([1.5228335685980883, 0.9794949704444682, 0.78673802805533644, 2.3321237694566364, 0.57355135882752939]) cls.meandiff2 = np.array([0.22222222222222232, 0.13333333333333375, 0.0, 2.2898550724637685, -0.088888888888888573, -0.22222222222222232, 2.0676328502415462, -0.13333333333333375, 2.1565217391304348, 2.2898550724637685]) cls.confint2 = np.array([-2.32022210717, 2.76466655161, -2.247517583, 2.51418424967, -3.66405224956, 3.66405224956, 0.113960166573, 4.46574997835, -1.87278583908, 1.6950080613, -3.529655688, 3.08521124356, 0.568180988881, 3.5670847116, -3.31822643175, 3.05155976508, 0.951206924521, 3.36183655374, -0.74487911754, 5.32458926247]).reshape(10,2) cls.reject2 = np.array([False, False, False, True, False, False, True, False, True, False]) def test_hochberg_intervals(self): assert_almost_equal(self.res.halfwidths, self.halfwidth2, 4)