"""Tests for ``statsmodels.othermod.betareg.BetaModel``.

Fitted models are compared with reference numbers produced by R's
``betareg`` (embedded below as small text tables, and in the
``results_betareg`` results module) for the food-expenditure and
methylation data sets stored in the ``results`` directory.
"""
import io
import os

import numpy as np
from numpy.testing import assert_allclose, assert_equal
import pandas as pd
import patsy
import pytest

from statsmodels.api import families
from statsmodels.othermod.betareg import BetaModel
from statsmodels.tools.sm_exceptions import (
    ValueWarning,
)

from .results import results_betareg as resultsb

links = families.links

cur_dir = os.path.dirname(os.path.abspath(__file__))
res_dir = os.path.join(cur_dir, "results")


# Reference tables copied from R output. They are parsed with a
# whitespace separator, so column alignment is cosmetic only.

# betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
_income_estimates_mean = """\
varname        Estimate  StdError     zvalue    Pr(>|z|)
(Intercept) -0.62254806  0.223853539 -2.781051  5.418326e-03
income      -0.01229884  0.003035585 -4.051556  5.087819e-05
persons      0.11846210  0.035340667  3.352005  8.022853e-04"""

_income_estimates_precision = """\
varname  Estimate  StdError  zvalue    Pr(>|z|)
(phi)    35.60975  8.079598  4.407366  1.046351e-05
"""

_methylation_estimates_mean = """\
varname      Estimate  StdError  zvalue  Pr(>|z|)
(Intercept)  1.44224   0.03401   42.404  2e-16
genderM      0.06986   0.04359    1.603  0.109
CpGCpG_1     0.60735   0.04834   12.563  2e-16
CpGCpG_2     0.97355   0.05311   18.331  2e-16"""

_methylation_estimates_precision = """\
varname       Estimate  StdError  zvalue  Pr(>|z|)
(Intercept)   8.22829   1.79098    4.594  4.34e-06
age          -0.03471   0.03276   -1.059  0.289"""

expected_income_mean = pd.read_table(
    io.StringIO(_income_estimates_mean), sep=r"\s+")
expected_income_precision = pd.read_table(
    io.StringIO(_income_estimates_precision), sep=r"\s+")
expected_methylation_mean = pd.read_table(
    io.StringIO(_methylation_estimates_mean), sep=r"\s+")
expected_methylation_precision = pd.read_table(
    io.StringIO(_methylation_estimates_precision), sep=r"\s+")

income = pd.read_csv(os.path.join(res_dir, 'foodexpenditure.csv'))
methylation = pd.read_csv(os.path.join(res_dir, 'methylation-test.csv'))


def check_same(a, b, eps, name):
    """Assert ``a`` and ``b`` are close; ``name`` labels the failure.

    Uses ``np.allclose`` with a fixed ``rtol=0.01`` and ``atol=eps``.
    Currently only referenced from commented-out legacy checks.
    """
    assert np.allclose(a, b, rtol=0.01, atol=eps), \
        ("different from expected", name, list(a), list(b))


def assert_close(a, b, eps):
    """Assert ``a`` and ``b`` agree within ``rtol=0.01`` and ``atol=eps``.

    On failure both inputs are reported as lists.
    """
    assert np.allclose(a, b, rtol=0.01, atol=eps), (list(a), list(b))


class TestBetaModel:
    """Loose-tolerance checks of BetaModel against printed R tables."""

    @classmethod
    def setup_class(cls):
        """Fit the income model and two methylation models once per class.

        The methylation models share the mean formula and the ``~ age``
        precision design and differ only in the precision link
        (identity vs. log).
        """
        model = "I(food/income) ~ income + persons"
        cls.income_fit = BetaModel.from_formula(model, income).fit()

        model = cls.model = "methylation ~ gender + CpG"
        Z = cls.Z = patsy.dmatrix("~ age", methylation)
        mod = BetaModel.from_formula(model, methylation, exog_precision=Z,
                                     link_precision=links.Identity())
        cls.meth_fit = mod.fit()
        mod = BetaModel.from_formula(model, methylation, exog_precision=Z,
                                     link_precision=links.Log())
        cls.meth_log_fit = mod.fit()

    def test_income_coefficients(self):
        """Mean-equation estimates, z-values and p-values match R."""
        rslt = self.income_fit
        # the last parameter is the precision parameter, checked separately
        assert_close(rslt.params[:-1], expected_income_mean['Estimate'], 1e-3)
        assert_close(rslt.tvalues[:-1], expected_income_mean['zvalue'], 0.1)
        assert_close(rslt.pvalues[:-1],
                     expected_income_mean['Pr(>|z|)'], 1e-3)

    def test_income_precision(self):
        """Precision estimate (after ``exp``) and its p-value match R."""
        rslt = self.income_fit
        # note that we have to exp the phi results for now.
        assert_close(np.exp(rslt.params[-1:]),
                     expected_income_precision['Estimate'], 1e-3)
        # yield check_same, rslt.tvalues[-1:],
        #     expected_income_precision['zvalue'], 0.1, "z-score"
        assert_close(rslt.pvalues[-1:],
                     expected_income_precision['Pr(>|z|)'], 1e-3)

    def test_methylation_coefficients(self):
        """Mean-equation results of the identity-link fit match R."""
        rslt = self.meth_fit
        # the last two parameters belong to the precision equation
        assert_close(rslt.params[:-2],
                     expected_methylation_mean['Estimate'], 1e-2)
        assert_close(rslt.tvalues[:-2],
                     expected_methylation_mean['zvalue'], 0.1)
        assert_close(rslt.pvalues[:-2],
                     expected_methylation_mean['Pr(>|z|)'], 1e-2)

    def test_methylation_precision(self):
        """Precision-equation estimates of the log-link fit match R."""
        # R results are from log link_precision
        rslt = self.meth_log_fit
        assert_allclose(rslt.params[-2:],
                        expected_methylation_precision['Estimate'],
                        atol=1e-5, rtol=1e-10)
        #    expected_methylation_precision['Estimate']
        # yield check_same, links.logit()(rslt.params[-2:]),
        #     expected_methylation_precision['Estimate'], 1e-3, "estimate"
        # yield check_same, rslt.tvalues[-2:],
        #     expected_methylation_precision['zvalue'], 0.1, "z-score"

    def test_precision_formula(self):
        """``exog_precision_formula`` reproduces the explicit-design fit.

        Also checks that params stay a pandas Series and that an unknown
        keyword triggers a ``ValueWarning``.
        """
        m = BetaModel.from_formula(self.model, methylation,
                                   exog_precision_formula='~ age',
                                   link_precision=links.Identity())
        rslt = m.fit()
        assert_close(rslt.params, self.meth_fit.params, 1e-10)
        assert isinstance(rslt.params, pd.Series)

        with pytest.warns(ValueWarning, match="unknown kwargs"):
            BetaModel.from_formula(self.model, methylation,
                                   exog_precision_formula='~ age',
                                   link_precision=links.Identity(),
                                   junk=False)

    def test_scores(self):
        """``score`` agrees with ``_score_check`` for both precision links."""
        model, Z = self.model, self.Z
        for link in (links.Identity(), links.Log()):
            mod2 = BetaModel.from_formula(model, methylation,
                                          exog_precision=Z,
                                          link_precision=link)
            rslt_m = mod2.fit()

            # evaluate away from optimum to get larger score
            analytical = rslt_m.model.score(rslt_m.params * 1.01)
            numerical = rslt_m.model._score_check(rslt_m.params * 1.01)
            assert_allclose(analytical, numerical, rtol=1e-6, atol=1e-6)
            assert_allclose(link.inverse(analytical[3:]),
                            link.inverse(numerical[3:]),
                            rtol=5e-7, atol=5e-6)

    def test_results_other(self):
        """Fitted values, variance and residuals match the distribution."""
        rslt = self.meth_fit
        distr = rslt.get_distribution()
        mean, var = distr.stats()
        assert_allclose(rslt.fittedvalues, mean, rtol=1e-13)
        assert_allclose(rslt.model._predict_var(rslt.params), var,
                        rtol=1e-13)
        resid = rslt.model.endog - mean
        assert_allclose(rslt.resid, resid, rtol=1e-12)
        assert_allclose(rslt.resid_pearson, resid / np.sqrt(var),
                        rtol=1e-12)


class TestBetaMeth:
    """Tight-tolerance checks of the methylation model against R."""

    @classmethod
    def setup_class(cls):
        """Fit the log-precision-link model with ``cov_type="eim"``."""
        formula = "methylation ~ gender + CpG"
        mod = BetaModel.from_formula(formula, methylation,
                                     exog_precision_formula="~ age",
                                     link_precision=links.Log())
        cls.res1 = mod.fit(cov_type="eim")
        cls.res2 = resultsb.results_meth

    def test_basic(self):
        """Parameter tables, fit statistics and the LR test match R."""
        res1 = self.res1
        res2 = self.res2

        # number of mean-equation parameters; the rest are precision
        k_mean = 4
        p, se, zv, pv = res2.table_mean.T
        assert_allclose(res1.params[:k_mean], p, rtol=1e-6)
        assert_allclose(res1.bse[:k_mean], se, rtol=1e-6)
        assert_allclose(res1.tvalues[:k_mean], zv, rtol=1e-6)
        assert_allclose(res1.pvalues[:k_mean], pv, rtol=1e-5)

        p, se, zv, pv = res2.table_precision.T
        assert_allclose(res1.params[k_mean:], p, rtol=1e-6)
        assert_allclose(res1.bse[k_mean:], se, rtol=1e-6)
        assert_allclose(res1.tvalues[k_mean:], zv, rtol=1e-6)
        assert_allclose(res1.pvalues[k_mean:], pv, rtol=1e-5)

        assert_allclose(res1.llf, res2.loglik, rtol=1e-10)
        assert_allclose(res1.aic, res2.aic, rtol=1e-10)
        assert_allclose(res1.bic, res2.bic, rtol=1e-10)
        # different definitions for prsquared
        assert_allclose(res1.prsquared, res2.pseudo_r_squared, atol=0.01)
        assert_equal(res1.df_resid, res2.df_residual)
        assert_equal(res1.nobs, res2.nobs)

        # null model compared to R betareg and lmtest
        df_c = res1.df_resid_null - res1.df_resid
        assert_equal(res1.k_null, 2)

        # > lrt = lrtest(res_meth_null, res_meth)
        # results from R
        pv = 7.21872953868659e-18
        lln = 60.88809589492269
        llf = 104.14802840534323
        chisq = 86.51986502084107
        dfc = 4
        # stats.chi2.sf(86.51986502093865, 4)
        assert_equal(df_c, dfc)
        assert_allclose(res1.llf, llf, rtol=1e-10)
        assert_allclose(res1.llnull, lln, rtol=1e-10)
        assert_allclose(res1.llr, chisq, rtol=1e-10)
        assert_allclose(res1.llr_pvalue, pv, rtol=1e-6)

    def test_resid(self):
        """Fitted values and response residuals match R."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.fittedvalues, res2.resid['fittedvalues'],
                        rtol=1e-8)
        assert_allclose(res1.resid, res2.resid['response'],
                        atol=1e-8, rtol=1e-8)

    def test_oim(self):
        """Refit with the default covariance and compare with R's tables."""
        # estimate with default oim, cov_type nonrobust
        res1 = self.res1.model.fit()
        res2 = self.res2

        k_mean = 4
        # R betareg uses numerical derivatives from bfgs, has lower precision
        p, se, zv, pv = res2.table_mean_oim.T
        assert_allclose(res1.params[:k_mean], p, rtol=1e-6)
        assert_allclose(res1.bse[:k_mean], se, rtol=1e-5)
        assert_allclose(res1.tvalues[:k_mean], zv, rtol=1e-5)
        assert_allclose(res1.pvalues[:k_mean], pv, atol=1e-6, rtol=1e-5)

        p, se, zv, pv = res2.table_precision_oim.T
        assert_allclose(res1.params[k_mean:], p, rtol=1e-6)
        assert_allclose(res1.bse[k_mean:], se, rtol=1e-3)
        assert_allclose(res1.tvalues[k_mean:], zv, rtol=1e-3)
        assert_allclose(res1.pvalues[k_mean:], pv, rtol=1e-2)

    def test_predict_distribution(self):
        """predict, get_distribution and get_prediction are consistent.

        The in-sample variance of the first six observations is also
        compared with R's ``predict(..., type="variance")``.
        """
        res1 = self.res1
        mean = res1.predict()
        var_ = res1.model._predict_var(res1.params)
        distr = res1.get_distribution()
        m2, v2 = distr.stats()
        assert_allclose(mean, m2, rtol=1e-13)
        assert_allclose(var_, v2, rtol=1e-13)

        # from R: > predict(res_meth, type="variance")
        var_r6 = [
            3.1090848852102e-04, 2.4509604000073e-04, 3.7199753140565e-04,
            2.8088261358738e-04, 2.7561111800350e-04, 3.3929220526847e-04]
        n = 6
        assert_allclose(v2[:n], var_r6, rtol=1e-7)

        # out-of-sample style prediction using the first n design rows
        ex = res1.model.exog[:n]
        ex_prec = res1.model.exog_precision[:n]
        mean6 = res1.predict(ex, transform=False)

        prec = res1.predict(which="precision")
        # todo: prec6 wrong exog if not used as keyword, no exception raised
        prec6 = res1.predict(exog_precision=ex_prec, which="precision",
                             transform=False)
        var6 = res1.model._predict_var(res1.params, exog=ex,
                                       exog_precision=ex_prec)

        assert_allclose(mean6, mean[:n], rtol=1e-13)
        assert_allclose(prec6, prec[:n], rtol=1e-13)
        assert_allclose(var6, var_[:n], rtol=1e-13)
        assert_allclose(var6, var_r6, rtol=1e-7)

        distr6 = res1.model.get_distribution(res1.params, exog=ex,
                                             exog_precision=ex_prec)
        m26, v26 = distr6.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)

        distr6f = res1.get_distribution(exog=ex, exog_precision=ex_prec,
                                        transform=False)
        m26, v26 = distr6f.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)

        # check formula transform works for predict, currently mean only
        df6 = methylation.iloc[:n]
        mean6f = res1.predict(df6)
        # todo: prec6 wrong exog if not used as keyword, no exception raised
        # formula not supported for exog_precision in predict
        # prec6f = res1.predict(exog_precision=ex_prec, which="precision")
        assert_allclose(mean6f, mean[:n], rtol=1e-13)
        # assert_allclose(prec6f, prec[:n], rtol=1e-13)

        distr6f = res1.get_distribution(exog=df6, exog_precision=ex_prec)
        m26, v26 = distr6f.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)
        # check that we don't have pandas in distr
        assert isinstance(distr6f.args[0], np.ndarray)

        # minimal checks for get_prediction
        pma = res1.get_prediction(which="mean", average=True)
        dfma = pma.summary_frame()
        assert_allclose(pma.predicted, mean.mean(), rtol=1e-13)
        assert_equal(dfma.shape, (1, 4))
        pm = res1.get_prediction(exog=df6, which="mean", average=False)
        dfm = pm.summary_frame()
        assert_allclose(pm.predicted, mean6, rtol=1e-13)
        assert_equal(dfm.shape, (n, 4))
        pv = res1.get_prediction(exog=df6, exog_precision=ex_prec,
                                 which="var", average=False)
        dfv = pv.summary_frame()
        assert_allclose(pv.predicted, var6, rtol=1e-13)
        assert_equal(dfv.shape, (n, 4))
        # smoke tests
        res1.get_prediction(which="linear", average=False)
        res1.get_prediction(which="precision", average=True)
        res1.get_prediction(exog_precision=ex_prec, which="precision",
                            average=False)
        res1.get_prediction(which="linear-precision", average=True)

        # test agg_weights: weights that select the first n observations
        # (normalized to mean one) must reproduce the average prediction
        # computed directly on those n rows
        aw = np.zeros(len(res1.model.endog))
        aw[:n] = 1
        aw /= aw.mean()
        pm6 = res1.get_prediction(exog=df6, which="mean", average=True)
        dfm6 = pm6.summary_frame()
        pmw = res1.get_prediction(which="mean", average=True,
                                  agg_weights=aw)
        dfmw = pmw.summary_frame()
        assert_allclose(pmw.predicted, pm6.predicted, rtol=1e-13)
        assert_allclose(dfmw, dfm6, rtol=1e-13)


class TestBetaIncome:
    """Score test and influence checks on the food-expenditure model."""

    @classmethod
    def setup_class(cls):
        """Fit a full model and a restricted model with Newton's method.

        The full model passes a ``~ persons`` precision design; the
        restricted model passes no ``exog_precision`` at all.
        """
        formula = "I(food/income) ~ income + persons"
        exog_prec = patsy.dmatrix("~ persons", income)
        mod_income = BetaModel.from_formula(
            formula,
            income,
            exog_precision=exog_prec,
            link_precision=links.Log()
        )
        res_income = mod_income.fit(method="newton")

        mod_restricted = BetaModel.from_formula(
            formula,
            income,
            link_precision=links.Log()
        )
        res_restricted = mod_restricted.fit(method="newton")

        cls.res1 = res_income
        cls.resr = res_restricted

    def test_score_test(self):
        """Both ways of calling ``score_test`` give the same result.

        The test is run once from the full model at the constrained
        parameters and once from the restricted model with the extra
        precision column, for the default and the ``HC0`` covariance.
        """
        res1 = self.res1
        resr = self.resr
        # restricted estimate padded with zero for the extra precision term
        params_restr = np.concatenate([resr.params, [0]])
        r_matrix = np.zeros((1, len(params_restr)))
        r_matrix[0, -1] = 1
        # precision design without its first column
        exog_prec_extra = res1.model.exog_precision[:, 1:]

        from statsmodels.base._parameter_inference import score_test
        sc1 = score_test(res1, params_constrained=params_restr,
                         k_constraints=1)
        sc2 = score_test(resr, exog_extra=(None, exog_prec_extra))
        assert_allclose(sc2[:2], sc1[:2])

        sc1_hc = score_test(res1, params_constrained=params_restr,
                            k_constraints=1, r_matrix=r_matrix,
                            cov_type="HC0")
        sc2_hc = score_test(resr, exog_extra=(None, exog_prec_extra),
                            cov_type="HC0")
        assert_allclose(sc2_hc[:2], sc1_hc[:2])

    def test_influence(self):
        """Influence attributes can be accessed and summary frames agree."""
        # currently only smoke test
        res1 = self.res1
        from statsmodels.stats.outliers_influence import MLEInfluence

        influ0 = MLEInfluence(res1)
        influ = res1.get_influence()
        attrs = ['cooks_distance', 'd_fittedvalues', 'd_fittedvalues_scaled',
                 'd_params', 'dfbetas', 'hat_matrix_diag',
                 'resid_studentized']
        for attr in attrs:
            # attribute access alone must not raise
            getattr(influ, attr)

        frame = influ.summary_frame()
        frame0 = influ0.summary_frame()
        assert_allclose(frame, frame0, rtol=1e-13, atol=1e-13)