AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/othermod/tests/test_beta.py
2024-10-02 22:15:59 +04:00

400 lines
16 KiB
Python

import io
import os
import pytest
import numpy as np
from numpy.testing import assert_allclose, assert_equal
import pandas as pd
import patsy
from statsmodels.api import families
from statsmodels.tools.sm_exceptions import (
ValueWarning,
)
from statsmodels.othermod.betareg import BetaModel
from .results import results_betareg as resultsb
# Short alias for the link-function namespace; the tests below instantiate
# links.Identity() and links.Log() from it.
links = families.links
# Absolute directory that contains this test module.
cur_dir = os.path.dirname(os.path.abspath(__file__))
# "results" subdirectory next to this module; the CSV datasets are read
# from here.
res_dir = os.path.join(cur_dir, "results")
def _parse_r_table(text):
    """Parse a whitespace-delimited coefficient table into a DataFrame.

    The first line of ``text`` supplies the column names.
    """
    return pd.read_table(io.StringIO(text), sep=r"\s+")


# betareg(I(food/income) ~ income + persons, data = FoodExpenditure)
_income_estimates_mean = """\
varname Estimate StdError zvalue Pr(>|z|)
(Intercept) -0.62254806 0.223853539 -2.781051 5.418326e-03
income -0.01229884 0.003035585 -4.051556 5.087819e-05
persons 0.11846210 0.035340667 3.352005 8.022853e-04"""

_income_estimates_precision = """\
varname Estimate StdError zvalue Pr(>|z|)
(phi) 35.60975 8.079598 4.407366 1.046351e-05
"""

_methylation_estimates_mean = """\
varname Estimate StdError zvalue Pr(>|z|)
(Intercept) 1.44224 0.03401 42.404 2e-16
genderM 0.06986 0.04359 1.603 0.109
CpGCpG_1 0.60735 0.04834 12.563 2e-16
CpGCpG_2 0.97355 0.05311 18.331 2e-16"""

_methylation_estimates_precision = """\
varname Estimate StdError zvalue Pr(>|z|)
(Intercept) 8.22829 1.79098 4.594 4.34e-06
age -0.03471 0.03276 -1.059 0.289"""

# Expected coefficient tables, one DataFrame per text table above.
expected_income_mean = _parse_r_table(_income_estimates_mean)
expected_income_precision = _parse_r_table(_income_estimates_precision)
expected_methylation_mean = _parse_r_table(_methylation_estimates_mean)
expected_methylation_precision = _parse_r_table(
    _methylation_estimates_precision)
# Datasets used by the fits below, read once at import time from res_dir.
# `income` is passed to the "I(food/income) ~ income + persons" models and
# `methylation` to the "methylation ~ gender + CpG" models.
income = pd.read_csv(os.path.join(res_dir, 'foodexpenditure.csv'))
methylation = pd.read_csv(os.path.join(res_dir, 'methylation-test.csv'))
def check_same(a, b, eps, name):
    """Assert that ``a`` is close to ``b``, tagging failures with ``name``.

    Closeness is decided by ``np.allclose`` with a fixed relative tolerance
    of 0.01 and ``eps`` as the absolute tolerance. On failure the assertion
    message carries ``name`` and both inputs converted to lists.
    """
    is_close = np.allclose(a, b, rtol=0.01, atol=eps)
    assert is_close, ("different from expected", name, list(a), list(b))
def assert_close(a, b, eps):
    """Assert that ``a`` is close to ``b``.

    Uses ``np.allclose`` with a fixed relative tolerance of 0.01 and ``eps``
    as the absolute tolerance. On failure the assertion message holds both
    inputs converted to lists.
    """
    within_tol = np.allclose(a, b, rtol=0.01, atol=eps)
    assert within_tol, (list(a), list(b))
class TestBetaModel:
    """Compare BetaModel fits with the expected coefficient tables."""

    @classmethod
    def setup_class(cls):
        """Fit the income model and two methylation models once."""
        income_formula = "I(food/income) ~ income + persons"
        cls.income_fit = BetaModel.from_formula(income_formula, income).fit()

        cls.model = "methylation ~ gender + CpG"
        cls.Z = patsy.dmatrix("~ age", methylation)

        # Same mean and precision design, identity link for the precision.
        identity_mod = BetaModel.from_formula(
            cls.model, methylation, exog_precision=cls.Z,
            link_precision=links.Identity())
        cls.meth_fit = identity_mod.fit()

        # Same designs again, log link for the precision.
        log_mod = BetaModel.from_formula(
            cls.model, methylation, exog_precision=cls.Z,
            link_precision=links.Log())
        cls.meth_log_fit = log_mod.fit()

    def test_income_coefficients(self):
        """Mean-model estimates, z-values and p-values of the income fit."""
        fit = self.income_fit
        expected = expected_income_mean
        # The last parameter is checked separately in test_income_precision.
        assert_close(fit.params[:-1], expected['Estimate'], 1e-3)
        assert_close(fit.tvalues[:-1], expected['zvalue'], 0.1)
        assert_close(fit.pvalues[:-1], expected['Pr(>|z|)'], 1e-3)

    def test_income_precision(self):
        """Precision estimate and p-value of the income fit."""
        fit = self.income_fit
        expected = expected_income_precision
        # note that we have to exp the phi results for now.
        phi = np.exp(fit.params[-1:])
        assert_close(phi, expected['Estimate'], 1e-3)
        # The z-value comparison (tolerance 0.1) is currently disabled:
        # yield check_same, rslt.tvalues[-1:],
        #     expected_income_precision['zvalue'], 0.1, "z-score"
        assert_close(fit.pvalues[-1:], expected['Pr(>|z|)'], 1e-3)

    def test_methylation_coefficients(self):
        """Mean-model estimates, z-values and p-values, identity-link fit."""
        fit = self.meth_fit
        expected = expected_methylation_mean
        # The last two parameters belong to the precision model.
        assert_close(fit.params[:-2], expected['Estimate'], 1e-2)
        assert_close(fit.tvalues[:-2], expected['zvalue'], 0.1)
        assert_close(fit.pvalues[:-2], expected['Pr(>|z|)'], 1e-2)

    def test_methylation_precision(self):
        """Precision-model estimates of the log-link methylation fit."""
        # R results are from log link_precision
        fit = self.meth_log_fit
        expected_estimates = expected_methylation_precision['Estimate']
        assert_allclose(fit.params[-2:], expected_estimates,
                        atol=1e-5, rtol=1e-10)
        # Further comparisons are currently disabled:
        # yield check_same, links.logit()(rslt.params[-2:]),
        #     expected_methylation_precision['Estimate'], 1e-3, "estimate"
        # yield check_same, rslt.tvalues[-2:],
        #     expected_methylation_precision['zvalue'], 0.1, "z-score"

    def test_precision_formula(self):
        """A precision formula must give the same fit as the design matrix."""
        formula_mod = BetaModel.from_formula(
            self.model, methylation,
            exog_precision_formula='~ age',
            link_precision=links.Identity())
        formula_fit = formula_mod.fit()
        assert_close(formula_fit.params, self.meth_fit.params, 1e-10)
        assert isinstance(formula_fit.params, pd.Series)

        # An unrecognized keyword has to trigger a ValueWarning.
        with pytest.warns(ValueWarning, match="unknown kwargs"):
            BetaModel.from_formula(
                self.model, methylation,
                exog_precision_formula='~ age',
                link_precision=links.Identity(),
                junk=False)

    def test_scores(self):
        """Analytical score must agree with the model's ``_score_check``."""
        formula, precision_design = self.model, self.Z
        for link in (links.Identity(), links.Log()):
            candidate = BetaModel.from_formula(
                formula, methylation, exog_precision=precision_design,
                link_precision=link)
            fitted = candidate.fit()
            # evaluate away from optimum to get larger score
            analytical = fitted.model.score(fitted.params * 1.01)
            numerical = fitted.model._score_check(fitted.params * 1.01)
            assert_allclose(analytical, numerical, rtol=1e-6, atol=1e-6)
            assert_allclose(link.inverse(analytical[3:]),
                            link.inverse(numerical[3:]),
                            rtol=5e-7, atol=5e-6)

    def test_results_other(self):
        """Fitted values, variance and residuals agree with the distribution."""
        fit = self.meth_fit
        dist_mean, dist_var = fit.get_distribution().stats()
        assert_allclose(fit.fittedvalues, dist_mean, rtol=1e-13)

        model_var = fit.model._predict_var(fit.params)
        assert_allclose(model_var, dist_var, rtol=1e-13)

        raw_resid = fit.model.endog - dist_mean
        assert_allclose(fit.resid, raw_resid, rtol=1e-12)
        assert_allclose(fit.resid_pearson, raw_resid / np.sqrt(dist_var),
                        rtol=1e-12)
class TestBetaMeth:
    """Compare the methylation fit with the stored reference results.

    ``res1`` is the BetaModel fit with log precision link and
    ``cov_type="eim"``; ``res2`` is ``results_betareg.results_meth``.
    """

    @classmethod
    def setup_class(cls):
        """Fit the model once and attach the reference results."""
        formula = "methylation ~ gender + CpG"
        mod = BetaModel.from_formula(formula, methylation,
                                     exog_precision_formula="~ age",
                                     link_precision=links.Log())
        cls.res1 = mod.fit(cov_type="eim")
        cls.res2 = resultsb.results_meth

    def test_basic(self):
        """Check parameter tables, fit statistics and the null-model LR test."""
        res1 = self.res1
        res2 = self.res2

        # The first k_mean parameters are the mean model, the rest precision.
        k_mean = 4
        p, se, zv, pv = res2.table_mean.T
        assert_allclose(res1.params[:k_mean], p, rtol=1e-6)
        assert_allclose(res1.bse[:k_mean], se, rtol=1e-6)
        assert_allclose(res1.tvalues[:k_mean], zv, rtol=1e-6)
        assert_allclose(res1.pvalues[:k_mean], pv, rtol=1e-5)

        p, se, zv, pv = res2.table_precision.T
        assert_allclose(res1.params[k_mean:], p, rtol=1e-6)
        assert_allclose(res1.bse[k_mean:], se, rtol=1e-6)
        assert_allclose(res1.tvalues[k_mean:], zv, rtol=1e-6)
        assert_allclose(res1.pvalues[k_mean:], pv, rtol=1e-5)

        assert_allclose(res1.llf, res2.loglik, rtol=1e-10)
        assert_allclose(res1.aic, res2.aic, rtol=1e-10)
        assert_allclose(res1.bic, res2.bic, rtol=1e-10)
        # different definitions for prsquared
        assert_allclose(res1.prsquared, res2.pseudo_r_squared, atol=0.01)
        assert_equal(res1.df_resid, res2.df_residual)
        assert_equal(res1.nobs, res2.nobs)

        # null model compared to R betareg and lmtest
        df_constraints = res1.df_resid_null - res1.df_resid
        assert_equal(res1.k_null, 2)
        # > lrt = lrtest(res_meth_null, res_meth)  # results from R
        lr_pvalue = 7.21872953868659e-18
        lln = 60.88809589492269
        llf = 104.14802840534323
        chisq = 86.51986502084107
        df_expected = 4
        # stats.chi2.sf(86.51986502093865, 4)
        assert_equal(df_constraints, df_expected)
        assert_allclose(res1.llf, llf, rtol=1e-10)
        assert_allclose(res1.llnull, lln, rtol=1e-10)
        assert_allclose(res1.llr, chisq, rtol=1e-10)
        assert_allclose(res1.llr_pvalue, lr_pvalue, rtol=1e-6)

    def test_resid(self):
        """Fitted values and response residuals match the reference."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.fittedvalues, res2.resid['fittedvalues'],
                        rtol=1e-8)
        assert_allclose(res1.resid, res2.resid['response'],
                        atol=1e-8, rtol=1e-8)

    def test_oim(self):
        """Refit with the default covariance and compare with the oim tables."""
        # estimate with default oim, cov_type nonrobust
        res1 = self.res1.model.fit()
        res2 = self.res2
        k_mean = 4

        # R betareg uses numerical derivatives from bfgs, has lower precision
        p, se, zv, pv = res2.table_mean_oim.T
        assert_allclose(res1.params[:k_mean], p, rtol=1e-6)
        assert_allclose(res1.bse[:k_mean], se, rtol=1e-5)
        assert_allclose(res1.tvalues[:k_mean], zv, rtol=1e-5)
        assert_allclose(res1.pvalues[:k_mean], pv, atol=1e-6, rtol=1e-5)

        p, se, zv, pv = res2.table_precision_oim.T
        assert_allclose(res1.params[k_mean:], p, rtol=1e-6)
        assert_allclose(res1.bse[k_mean:], se, rtol=1e-3)
        assert_allclose(res1.tvalues[k_mean:], zv, rtol=1e-3)
        assert_allclose(res1.pvalues[k_mean:], pv, rtol=1e-2)

    def test_predict_distribution(self):
        """Check predict, get_distribution and get_prediction for consistency."""
        res1 = self.res1
        mean = res1.predict()
        var_ = res1.model._predict_var(res1.params)
        distr = res1.get_distribution()
        m2, v2 = distr.stats()
        assert_allclose(mean, m2, rtol=1e-13)
        assert_allclose(var_, v2, rtol=1e-13)

        # from R: > predict(res_meth, type="variance")
        var_r6 = [
            3.1090848852102e-04, 2.4509604000073e-04, 3.7199753140565e-04,
            2.8088261358738e-04, 2.7561111800350e-04, 3.3929220526847e-04]
        # number of leading observations used for the subset checks
        n = 6
        assert_allclose(v2[:n], var_r6, rtol=1e-7)

        # Predictions from explicit design-matrix rows must equal the
        # corresponding leading rows of the full-sample predictions.
        ex = res1.model.exog[:n]
        ex_prec = res1.model.exog_precision[:n]
        mean6 = res1.predict(ex, transform=False)
        prec = res1.predict(which="precision")
        # todo: prec6 wrong exog if not used as keyword, no exception raised
        prec6 = res1.predict(exog_precision=ex_prec, which="precision",
                             transform=False)
        var6 = res1.model._predict_var(res1.params, exog=ex,
                                       exog_precision=ex_prec)
        assert_allclose(mean6, mean[:n], rtol=1e-13)
        assert_allclose(prec6, prec[:n], rtol=1e-13)
        assert_allclose(var6, var_[:n], rtol=1e-13)
        assert_allclose(var6, var_r6, rtol=1e-7)

        distr6 = res1.model.get_distribution(res1.params,
                                             exog=ex, exog_precision=ex_prec)
        m26, v26 = distr6.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)

        distr6f = res1.get_distribution(exog=ex, exog_precision=ex_prec,
                                        transform=False)
        m26, v26 = distr6f.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)

        # check formula transform works for predict, currently mean only
        df6 = methylation.iloc[:n]
        mean6f = res1.predict(df6)
        # todo: prec6 wrong exog if not used as keyword, no exception raised
        # formula not supported for exog_precision in predict
        # prec6f = res1.predict(exog_precision=ex_prec, which="precision")
        assert_allclose(mean6f, mean[:n], rtol=1e-13)
        # assert_allclose(prec6f, prec[:n], rtol=1e-13)

        distr6f = res1.get_distribution(exog=df6, exog_precision=ex_prec)
        m26, v26 = distr6f.stats()
        assert_allclose(m26, m2[:n], rtol=1e-13)
        assert_allclose(v26, v2[:n], rtol=1e-13)
        # check that we don't have pandas in distr
        assert isinstance(distr6f.args[0], np.ndarray)

        # minimal checks for get_prediction
        pma = res1.get_prediction(which="mean", average=True)
        dfma = pma.summary_frame()
        assert_allclose(pma.predicted, mean.mean(), rtol=1e-13)
        assert_equal(dfma.shape, (1, 4))

        pm = res1.get_prediction(exog=df6, which="mean", average=False)
        dfm = pm.summary_frame()
        assert_allclose(pm.predicted, mean6, rtol=1e-13)
        assert_equal(dfm.shape, (n, 4))

        pv = res1.get_prediction(exog=df6, exog_precision=ex_prec,
                                 which="var", average=False)
        dfv = pv.summary_frame()
        assert_allclose(pv.predicted, var6, rtol=1e-13)
        assert_equal(dfv.shape, (n, 4))

        # smoke tests
        res1.get_prediction(which="linear", average=False)
        res1.get_prediction(which="precision", average=True)
        res1.get_prediction(exog_precision=ex_prec, which="precision",
                            average=False)
        res1.get_prediction(which="linear-precision", average=True)

        # test agg_weights: weights that select the first n observations
        # (rescaled to mean one) must reproduce the average over df6.
        aw = np.zeros(len(res1.model.endog))
        aw[:n] = 1
        aw /= aw.mean()
        pm6 = res1.get_prediction(exog=df6, which="mean", average=True)
        dfm6 = pm6.summary_frame()
        pmw = res1.get_prediction(which="mean", average=True, agg_weights=aw)
        dfmw = pmw.summary_frame()
        assert_allclose(pmw.predicted, pm6.predicted, rtol=1e-13)
        assert_allclose(dfmw, dfm6, rtol=1e-13)
class TestBetaIncome():
    """Score-test and influence checks on the food expenditure models.

    ``res1`` is fit with ``~ persons`` as the precision design; ``resr`` is
    the same mean model fit without an explicit precision design.
    """

    @classmethod
    def setup_class(cls):
        """Fit the full and the restricted model with the newton optimizer."""
        formula = "I(food/income) ~ income + persons"
        precision_design = patsy.dmatrix("~ persons", income)

        full_model = BetaModel.from_formula(
            formula,
            income,
            exog_precision=precision_design,
            link_precision=links.Log(),
        )
        full_fit = full_model.fit(method="newton")

        restricted_model = BetaModel.from_formula(
            formula,
            income,
            link_precision=links.Log(),
        )
        restricted_fit = restricted_model.fit(method="newton")

        cls.res1 = full_fit
        cls.resr = restricted_fit

    def test_score_test(self):
        """Both ways of calling ``score_test`` must give the same result."""
        from statsmodels.base._parameter_inference import score_test

        full = self.res1
        restricted = self.resr

        # Restricted estimates padded with a zero for the extra parameter.
        params_restr = np.concatenate([restricted.params, [0]])
        # Restriction matrix that selects the last parameter only.
        r_matrix = np.zeros((1, len(params_restr)))
        r_matrix[0, -1] = 1
        # Precision design columns of the full model after the first one.
        exog_prec_extra = full.model.exog_precision[:, 1:]

        via_full = score_test(full, params_constrained=params_restr,
                              k_constraints=1)
        via_restricted = score_test(restricted,
                                    exog_extra=(None, exog_prec_extra))
        assert_allclose(via_restricted[:2], via_full[:2])

        # Same comparison with cov_type="HC0".
        via_full_hc = score_test(full, params_constrained=params_restr,
                                 k_constraints=1, r_matrix=r_matrix,
                                 cov_type="HC0")
        via_restricted_hc = score_test(restricted,
                                       exog_extra=(None, exog_prec_extra),
                                       cov_type="HC0")
        assert_allclose(via_restricted_hc[:2], via_full_hc[:2])

    def test_influence(self):
        """Smoke-test influence measures and compare the summary frames."""
        # currently only smoke test
        from statsmodels.stats.outliers_influence import MLEInfluence

        fit = self.res1
        direct = MLEInfluence(fit)
        from_results = fit.get_influence()

        attribute_names = (
            'cooks_distance', 'd_fittedvalues', 'd_fittedvalues_scaled',
            'd_params', 'dfbetas', 'hat_matrix_diag', 'resid_studentized',
        )
        # Accessing each attribute only verifies that it can be computed.
        for attribute in attribute_names:
            getattr(from_results, attribute)

        frame = from_results.summary_frame()
        frame0 = direct.summary_frame()
        assert_allclose(frame, frame0, rtol=1e-13, atol=1e-13)