2664 lines
100 KiB
Python
2664 lines
100 KiB
Python
"""
|
|
Test functions for models.GLM
|
|
"""
|
|
import os
|
|
import warnings
|
|
|
|
import numpy as np
|
|
from numpy.testing import (
|
|
assert_,
|
|
assert_allclose,
|
|
assert_almost_equal,
|
|
assert_array_less,
|
|
assert_equal,
|
|
assert_raises,
|
|
)
|
|
import pandas as pd
|
|
from pandas.testing import assert_series_equal
|
|
import pytest
|
|
from scipy import stats
|
|
|
|
import statsmodels.api as sm
|
|
from statsmodels.compat.scipy import SP_LT_17
|
|
from statsmodels.datasets import cpunish, longley
|
|
from statsmodels.discrete import discrete_model as discrete
|
|
from statsmodels.genmod.generalized_linear_model import GLM, SET_USE_BIC_LLF
|
|
from statsmodels.tools.numdiff import (
|
|
approx_fprime,
|
|
approx_fprime_cs,
|
|
approx_hess,
|
|
approx_hess_cs,
|
|
)
|
|
from statsmodels.tools.sm_exceptions import (
|
|
DomainWarning,
|
|
PerfectSeparationWarning,
|
|
ValueWarning,
|
|
)
|
|
from statsmodels.tools.tools import add_constant
|
|
|
|
# Test Precisions: number of decimals handed to the assert_almost_equal
# style checks below.
DECIMAL_4 = 4
DECIMAL_3 = 3
DECIMAL_2 = 2
DECIMAL_1 = 1
DECIMAL_0 = 0

# Set to True to open "test_glm.pdf" through matplotlib's PdfPages.
pdf_output = False

# ``pdf`` stays None unless PDF output is switched on; matplotlib is only
# imported in that case.
pdf = None
if pdf_output:
    from matplotlib.backends.backend_pdf import PdfPages

    pdf = PdfPages("test_glm.pdf")
|
|
|
|
|
|
def close_or_save(pdf, fig):
    """Save ``fig`` into ``pdf`` when the module-level ``pdf_output`` is set.

    Does nothing when ``pdf_output`` is falsy.
    """
    if not pdf_output:
        return
    pdf.savefig(fig)
|
|
|
|
|
|
def teardown_module():
    """Close the module-level ``pdf`` object when ``pdf_output`` is set."""
    if not pdf_output:
        return
    pdf.close()
|
|
|
|
|
|
@pytest.fixture(scope="module")
def iris():
    """Read ``results/iris.csv`` (relative to this file) with genfromtxt.

    The first row of the file is skipped as a header.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(here, 'results', 'iris.csv')
    return np.genfromtxt(csv_path, delimiter=",", skip_header=1)
|
|
|
|
|
|
class CheckModelResultsMixin:
    """
    Common assertions comparing fitted results with reference results.

    Concrete test classes provide ``res1`` (the results of a fitted GLM)
    and ``res2``, which should be either the results from RModelWrap
    or the results as defined in model_results_data.
    """

    # Precision of the individual checks. Subclasses override these in
    # ``setup_class`` when a looser (or tighter) comparison is needed.
    decimal_params = DECIMAL_4
    decimal_bse = DECIMAL_4
    decimal_resids = DECIMAL_4
    decimal_aic_R = DECIMAL_4
    decimal_aic_Stata = DECIMAL_4
    decimal_deviance = DECIMAL_4
    decimal_scale = DECIMAL_4
    decimal_loglike = DECIMAL_4
    decimal_null_deviance = DECIMAL_4
    decimal_bic = DECIMAL_4
    decimal_fittedvalues = DECIMAL_4

    def _llf_unit_scale(self):
        """Return the family log-likelihood of ``res1`` with ``scale=1``."""
        model = self.res1.model
        return model.family.loglike(model.endog,
                                    self.res1.mu,
                                    model.var_weights,
                                    model.freq_weights,
                                    scale=1)

    def _is_stata_llf_family(self):
        """True for the families whose llf is compared with ``scale=1``."""
        return isinstance(self.res1.model.family,
                          (sm.families.Gamma,
                           sm.families.InverseGaussian,
                           sm.families.NegativeBinomial))

    def test_params(self):
        assert_almost_equal(self.res1.params, self.res2.params,
                            decimal=self.decimal_params)

    def test_standard_errors(self):
        tol = 10**(-self.decimal_bse)
        assert_allclose(self.res1.bse, self.res2.bse, atol=tol, rtol=1e-5)

    def test_residuals(self):
        # fix incorrect numbers in resid_working results
        # residuals for Poisson are also tested in test_glm_weights.py
        import copy

        res1 = self.res1
        # new numpy would have copy method
        expected = copy.copy(self.res2.resids)
        expected[:, 2] *= res1.family.link.deriv(res1.mu)**2

        anscombe = res1.resid_anscombe_unscaled
        computed = np.column_stack((
            res1.resid_pearson,
            res1.resid_deviance,
            res1.resid_working,
            anscombe,
            res1.resid_response,
        ))
        assert_allclose(computed, expected, rtol=1e-6,
                        atol=10**(-self.decimal_resids))

    def test_aic_R(self):
        # R includes the estimation of the scale as a lost dof
        # Does not with Gamma though
        res1 = self.res1
        dof = 2 if res1.scale != 1 else 0

        if isinstance(res1.model.family, sm.families.NegativeBinomial):
            llf = self._llf_unit_scale()
            aic = -2 * llf + 2 * (res1.df_model + 1)
        else:
            aic = res1.aic
        assert_almost_equal(aic + dof, self.res2.aic_R,
                            decimal=self.decimal_aic_R)

    def test_aic_Stata(self):
        # Stata uses the below llf for aic definition for these families
        res1 = self.res1
        if self._is_stata_llf_family():
            llf = self._llf_unit_scale()
            aic = (-2 * llf + 2 * (res1.df_model + 1)) / res1.nobs
        else:
            aic = res1.aic / res1.nobs
        assert_almost_equal(aic, self.res2.aic_Stata,
                            decimal=self.decimal_aic_Stata)

    def test_deviance(self):
        assert_almost_equal(self.res1.deviance, self.res2.deviance,
                            decimal=self.decimal_deviance)

    def test_scale(self):
        assert_almost_equal(self.res1.scale, self.res2.scale,
                            decimal=self.decimal_scale)

    def test_loglike(self):
        # Stata uses the below llf for these families
        # We differ with R for them
        if self._is_stata_llf_family():
            llf = self._llf_unit_scale()
        else:
            llf = self.res1.llf
        assert_almost_equal(llf, self.res2.llf, decimal=self.decimal_loglike)

    def test_null_deviance(self):
        # DomainWarning is silenced while null_deviance is evaluated.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DomainWarning)

            assert_almost_equal(self.res1.null_deviance,
                                self.res2.null_deviance,
                                decimal=self.decimal_null_deviance)

    def test_bic(self):
        # All warnings are silenced while bic is evaluated.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            assert_almost_equal(self.res1.bic,
                                self.res2.bic_Stata,
                                decimal=self.decimal_bic)

    def test_degrees(self):
        assert_equal(self.res1.model.df_resid, self.res2.df_resid)

    def test_fittedvalues(self):
        assert_almost_equal(self.res1.fittedvalues, self.res2.fittedvalues,
                            decimal=self.decimal_fittedvalues)

    def test_tpvalues(self):
        # test comparing tvalues and pvalues with normal implementation
        # make sure they use normal distribution (inherited in results class)
        res1 = self.res1
        params = res1.params
        tvalues = params / res1.bse
        pvalues = stats.norm.sf(np.abs(tvalues)) * 2
        half_width = stats.norm.isf(0.025) * res1.bse
        conf_int = np.column_stack((params - half_width, params + half_width))

        if isinstance(tvalues, pd.Series):
            assert_series_equal(res1.tvalues, tvalues)
        else:
            assert_almost_equal(res1.tvalues, tvalues)
        assert_almost_equal(res1.pvalues, pvalues)
        assert_almost_equal(res1.conf_int(), conf_int)

    def test_pearson_chi2(self):
        # Only checked when the reference results provide the statistic.
        if not hasattr(self.res2, 'pearson_chi2'):
            return
        assert_allclose(self.res1.pearson_chi2, self.res2.pearson_chi2,
                        atol=1e-6, rtol=1e-6)

    def test_prsquared(self):
        # Each pseudo R-squared is only checked when the reference has it.
        if hasattr(self.res2, 'prsquared'):
            mcfadden = self.res1.pseudo_rsquared(kind="mcf")
            assert_allclose(mcfadden, self.res2.prsquared, rtol=0.05)

        if hasattr(self.res2, 'prsquared_cox_snell'):
            cox_snell = float(self.res1.pseudo_rsquared(kind="cs"))
            assert_allclose(cox_snell, self.res2.prsquared_cox_snell,
                            rtol=0.05)

    @pytest.mark.smoke
    def test_summary(self):
        self.res1.summary()

    @pytest.mark.smoke
    def test_summary2(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DomainWarning)
            self.res1.summary2()

    def test_get_distribution(self):
        res1 = self.res1
        family = res1.model.family
        if not hasattr(family, "get_distribution"):
            # only Tweedie has not get_distribution
            pytest.skip("get_distribution not available")

        if isinstance(family, sm.families.NegativeBinomial):
            res_scale = 1  # QMLE scale can differ from 1
        else:
            res_scale = res1.scale

        distr = family.get_distribution(res1.fittedvalues, res_scale)
        var_endog = family.variance(res1.fittedvalues) * res_scale
        mean, var = distr.stats()
        assert_allclose(res1.fittedvalues, mean, rtol=1e-13)
        assert_allclose(var_endog, var, rtol=1e-13)

        # check model method
        distr2 = res1.model.get_distribution(res1.params, res_scale)
        for key in distr2.kwds:
            assert_allclose(distr.kwds[key], distr2.kwds[key], rtol=1e-13)

        # compare var with predict
        var_unscaled = res1.predict(which="var_unscaled")
        assert_allclose(var_unscaled * res_scale, var_endog, rtol=1e-13)

        # check get_distribution of results instance
        if getattr(self, "has_edispersion", False):
            with pytest.warns(UserWarning, match="using scale=1"):
                distr3 = res1.get_distribution()
        else:
            distr3 = res1.get_distribution()
        for key in distr2.kwds:
            assert_allclose(distr3.kwds[key], distr2.kwds[key], rtol=1e-13)
|
|
|
|
|
|
class CheckComparisonMixin:
    """
    Checks comparing GLM results ``res1`` with a second fit ``resd``.

    The visible subclasses set ``resd`` from ``discrete.Logit`` or
    ``discrete.Poisson`` fitted on the same data.
    """

    def test_compare_discrete(self):
        glm_res = self.res1
        disc_res = self.resd
        # Evaluate derivatives slightly away from the optimum.
        glm_point = glm_res.params * 0.98
        disc_point = disc_res.params * 0.98

        assert_allclose(glm_res.llf, disc_res.llf, rtol=1e-10)
        score_obs_glm = glm_res.model.score_obs(glm_point)
        score_obs_disc = disc_res.model.score_obs(disc_point)
        assert_allclose(score_obs_glm, score_obs_disc, rtol=1e-10)

        # score
        score_away = glm_res.model.score(glm_point)
        assert_allclose(score_away, score_obs_glm.sum(0), atol=1e-20)
        score_at_opt = glm_res.model.score(glm_res.params)
        assert_allclose(score_at_opt, np.zeros(score_obs_glm.shape[1]),
                        atol=5e-7)

        # Both hessian variants of the GLM are compared with the same
        # hessian of the comparison model.
        for observed, rtol in ((False, 1e-10), (True, 1e-9)):
            hess_glm = glm_res.model.hessian(glm_point, observed=observed)
            hess_disc = disc_res.model.hessian(disc_point)
            assert_allclose(hess_glm, hess_disc, rtol=rtol)

    def test_score_test(self):
        glm_res = self.res1
        model = glm_res.model

        # fake example, should be zero, k_constraint should be 0
        stat, pval, dof = model.score_test(glm_res.params, k_constraints=1)
        assert_allclose(stat, 0, atol=1e-20)
        assert_allclose(pval, 1, atol=1e-10)
        assert_equal(dof, 1)

        stat, pval, dof = model.score_test(glm_res.params, k_constraints=0)
        assert_allclose(stat, 0, atol=1e-20)
        assert_(np.isnan(pval), msg=repr(pval))
        assert_equal(dof, 0)

        # TODO: no verified numbers largely SMOKE test
        squared_col = model.exog[:, 1]**2
        stat, pval, dof = model.score_test(glm_res.params,
                                           exog_extra=squared_col)
        assert_array_less(0.1, stat)
        assert_array_less(0.1, pval)
        assert_equal(dof, 1)

    def test_get_prediction(self):
        # Default GLM prediction exposes predicted_mean / se_mean.
        pred_glm = self.res1.get_prediction()  # GLM
        pred_disc = self.resd.get_prediction()  # discrete class
        assert_allclose(pred_disc.predicted, pred_glm.predicted_mean,
                        rtol=1e-11)
        assert_allclose(pred_disc.se, pred_glm.se_mean, rtol=1e-6)
        assert_allclose(pred_disc.summary_frame().values,
                        pred_glm.summary_frame().values, rtol=1e-6)

        # With which="mean" the attributes are predicted / se.
        pred_glm = self.res1.get_prediction(which="mean")  # GLM
        pred_disc = self.resd.get_prediction()  # discrete class
        assert_allclose(pred_disc.predicted, pred_glm.predicted, rtol=1e-11)
        assert_allclose(pred_disc.se, pred_glm.se, rtol=1e-6)
        assert_allclose(pred_disc.summary_frame().values,
                        pred_glm.summary_frame().values, rtol=1e-6)
|
|
|
|
|
|
class TestGlmGaussian(CheckModelResultsMixin):
    """Gaussian GLM on the longley data against the ``Longley`` results."""

    @classmethod
    def setup_class(cls):
        """
        Test Gaussian family with canonical identity link
        """
        from statsmodels.datasets.longley import load

        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_3

        data = load()
        data.endog = np.require(data.endog, requirements="W")
        data.exog = np.require(data.exog, requirements="W")
        data.exog = add_constant(data.exog, prepend=False)
        cls.data = data

        gaussian_model = GLM(data.endog, data.exog,
                             family=sm.families.Gaussian())
        cls.res1 = gaussian_model.fit()

        from .results.results_glm import Longley
        cls.res2 = Longley()

    def test_compare_OLS(self):
        """Compare llf, score, hessian and predictions with an OLS fit."""
        glm_res = self.res1
        # OLS does not define score_obs
        from statsmodels.regression.linear_model import OLS
        ols_res = OLS(self.data.endog, self.data.exog).fit(use_t=False)
        self.resd = ols_res  # attach to access from the outside

        assert_allclose(glm_res.llf, ols_res.llf, rtol=1e-10)

        resid_col = ols_res.resid[:, None]
        ols_exog = ols_res.model.exog

        score_obs_glm = glm_res.model.score_obs(glm_res.params, scale=None)
        score_obs_ols = resid_col / ols_res.scale * ols_exog
        # low precision because of badly scaled exog
        assert_allclose(score_obs_glm, score_obs_ols, rtol=1e-8)

        score_obs_glm = glm_res.model.score_obs(glm_res.params, scale=1)
        score_obs_ols = resid_col * ols_exog
        assert_allclose(score_obs_glm, score_obs_ols, rtol=1e-8)

        hess_glm = glm_res.model.hessian(glm_res.params, scale=None)
        hess_ols = -1. / ols_res.scale * ols_exog.T.dot(ols_exog)
        # low precision because of badly scaled exog
        assert_allclose(hess_glm, hess_ols, rtol=1e-8)

        # Only the first four columns of the OLS summary frame are compared
        # with the GLM summary frame.
        pred_glm = glm_res.get_prediction()  # GLM
        pred_ols = ols_res.get_prediction()  # OLS
        assert_allclose(pred_ols.predicted, pred_glm.predicted_mean,
                        rtol=1e-11)
        assert_allclose(pred_ols.se, pred_glm.se_mean, rtol=1e-6)
        assert_allclose(pred_ols.summary_frame().values[:, :4],
                        pred_glm.summary_frame().values, rtol=1e-6)

        pred_glm = self.res1.get_prediction(which="mean")  # GLM
        pred_ols = self.resd.get_prediction()  # OLS
        assert_allclose(pred_ols.predicted, pred_glm.predicted, rtol=1e-11)
        assert_allclose(pred_ols.se, pred_glm.se, rtol=1e-6)
        assert_allclose(pred_ols.summary_frame().values[:, :4],
                        pred_glm.summary_frame().values, rtol=1e-6)
|
|
|
|
# FIXME: enable or delete
|
|
# def setup_method(self):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# Gauss = r.gaussian
|
|
# self.res2 = RModel(self.data.endog, self.data.exog, r.glm, family=Gauss)
|
|
# self.res2.resids = np.array(self.res2.resid)[:,None]*np.ones((1,5))
|
|
# self.res2.null_deviance = 185008826 # taken from R. Rpy bug?
|
|
|
|
|
|
class TestGlmGaussianGradient(TestGlmGaussian):
    """Same checks as ``TestGlmGaussian`` with ``fit(method='newton')``."""

    @classmethod
    def setup_class(cls):
        """
        Test Gaussian family with canonical identity link
        """
        from statsmodels.datasets.longley import load

        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_2

        data = load()
        data.endog = np.require(data.endog, requirements="W")
        data.exog = np.require(data.exog, requirements="W")
        data.exog = add_constant(data.exog, prepend=False)
        cls.data = data

        gaussian_model = GLM(data.endog, data.exog,
                             family=sm.families.Gaussian())
        cls.res1 = gaussian_model.fit(method='newton')

        from .results.results_glm import Longley
        cls.res2 = Longley()
|
|
|
|
|
|
class TestGaussianLog(CheckModelResultsMixin):
    """Gaussian family with a log link on seeded synthetic data."""

    @classmethod
    def setup_class(cls):
        """Generate the data, fit the model, load ``GaussianLog`` results."""
        # Test Precision
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_aic_Stata = DECIMAL_2
        cls.decimal_loglike = DECIMAL_0
        cls.decimal_null_deviance = DECIMAL_1

        nobs = 100
        x = np.arange(nobs)
        np.random.seed(54321)
        # y = 1.0 - .02*x - .001*x**2 + 0.001 * np.random.randn(nobs)
        cls.X = np.c_[np.ones((nobs, 1)), x, x**2]
        signal = np.exp(-(-1.0 + 0.02*x + 0.0001*x**2))
        noise = 0.001 * np.random.randn(nobs)
        cls.lny = signal + noise

        log_link_family = sm.families.Gaussian(sm.families.links.Log())
        GaussLog_Model = GLM(cls.lny, cls.X, family=log_link_family)
        cls.res1 = GaussLog_Model.fit()

        from .results.results_glm import GaussianLog
        cls.res2 = GaussianLog()
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed"
|
|
# GaussLogLink = r.gaussian(link = "log")
|
|
# GaussLog_Res_R = RModel(cls.lny, cls.X, r.glm, family=GaussLogLink)
|
|
# cls.res2 = GaussLog_Res_R
|
|
|
|
class TestGaussianInverse(CheckModelResultsMixin):
    """Gaussian family with an inverse-power link on seeded synthetic data."""

    @classmethod
    def setup_class(cls):
        """Generate the data, fit the model, load ``GaussianInverse`` results."""
        # Test Precisions
        cls.decimal_bic = DECIMAL_1
        cls.decimal_aic_R = DECIMAL_1
        cls.decimal_aic_Stata = DECIMAL_3
        cls.decimal_loglike = DECIMAL_1
        cls.decimal_resids = DECIMAL_3

        nobs = 100
        x = np.arange(nobs)
        np.random.seed(54321)
        # The first draw after seeding is not used for the data (originally
        # y = 1.0 + 2.0 * x + x**2 + 0.1 * randn), but it must stay: it
        # advances the random state before the noise of ``y_inv`` is drawn.
        np.random.randn(nobs)
        cls.X = np.c_[np.ones((nobs, 1)), x, x**2]
        signal = (1. + .02*x + .001*x**2)**-1
        noise = .001 * np.random.randn(nobs)
        cls.y_inv = signal + noise

        inverse_family = sm.families.Gaussian(sm.families.links.InversePower())
        InverseLink_Model = GLM(cls.y_inv, cls.X, family=inverse_family)
        cls.res1 = InverseLink_Model.fit()

        from .results.results_glm import GaussianInverse
        cls.res2 = GaussianInverse()
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# InverseLink = r.gaussian(link = "inverse")
|
|
# InverseLink_Res_R = RModel(cls.y_inv, cls.X, r.glm, family=InverseLink)
|
|
# cls.res2 = InverseLink_Res_R
|
|
|
|
class TestGlmBinomial(CheckModelResultsMixin):
    """Binomial GLM on the star98 data against the ``Star98`` results."""

    @classmethod
    def setup_class(cls):
        """
        Test Binomial family with canonical logit link using star98 dataset.
        """
        from statsmodels.datasets.star98 import load

        from .results.results_glm import Star98

        cls.decimal_resids = DECIMAL_1
        cls.decimal_bic = DECIMAL_2

        star98 = load()
        star98.endog = np.require(star98.endog, requirements="W")
        star98.exog = np.require(star98.exog, requirements="W")
        star98.exog = add_constant(star98.exog, prepend=False)
        binomial_model = GLM(star98.endog, star98.exog,
                             family=sm.families.Binomial())
        cls.res1 = binomial_model.fit()
        # NOTE: if you want to replicate with RModel
        # res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
        #        family=r.binomial, weights=trials)

        cls.res2 = Star98()

    def test_endog_dtype(self):
        """Params must not depend on endog being cast to int or double."""
        from statsmodels.datasets.star98 import load

        star98 = load()
        star98.exog = add_constant(star98.exog, prepend=False)
        for dtype in (int, np.double):
            endog = star98.endog.astype(dtype)
            refit = GLM(endog, star98.exog,
                        family=sm.families.Binomial()).fit()
            assert_allclose(refit.params, self.res1.params)

    def test_invalid_endog(self, reset_randomstate):
        """A three-column endog is rejected with a ValueError."""
        # GH2733 inspired check
        endog = np.random.randint(0, 100, size=(1000, 3))
        exog = np.random.standard_normal((1000, 2))
        with pytest.raises(ValueError, match='endog has more than 2 columns'):
            GLM(endog, exog, family=sm.families.Binomial())

    def test_invalid_endog_formula(self, reset_randomstate):
        """A string-valued endog in the formula interface raises ValueError."""
        # GH2733
        n = 200
        exog = np.random.normal(size=(n, 2))
        endog = np.random.randint(0, 3, size=n).astype(str)
        # formula interface
        frame = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]})
        with pytest.raises(ValueError, match='array with multiple columns'):
            sm.GLM.from_formula("y ~ x1 + x2", frame,
                                family=sm.families.Binomial())

    def test_get_distribution_binom_count(self):
        """Mean and variance of the distribution scale with ``n_trials``."""
        # test for binomial counts with n_trials > 1
        res1 = self.res1
        family = res1.model.family
        res_scale = 1  # QMLE scale can differ from 1

        mu_prob = res1.fittedvalues
        n_trials = res1.model.n_trials
        distr = family.get_distribution(mu_prob, res_scale,
                                        n_trials=n_trials)
        var_endog = family.variance(mu_prob) * res_scale
        mean, var = distr.stats()
        assert_allclose(mu_prob * n_trials, mean, rtol=1e-13)
        assert_allclose(var_endog * n_trials, var, rtol=1e-13)

        # check model method
        distr2 = res1.model.get_distribution(res1.params, res_scale,
                                             n_trials=n_trials)
        for key in distr2.kwds:
            assert_allclose(distr.kwds[key], distr2.kwds[key], rtol=1e-13)
|
|
|
|
|
|
# FIXME: enable/xfail/skip or delete
|
|
# TODO:
|
|
# Non-Canonical Links for the Binomial family require the algorithm to be
|
|
# slightly changed
|
|
# class TestGlmBinomialLog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialLogit(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialProbit(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialCloglog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialPower(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialLoglog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBinomialLogc(CheckModelResultsMixin):
|
|
# TODO: need include logc link
|
|
# pass
|
|
|
|
|
|
class TestGlmBernoulli(CheckModelResultsMixin, CheckComparisonMixin):
    """Binomial GLM on the ``Lbw`` data, compared with ``discrete.Logit``."""

    @classmethod
    def setup_class(cls):
        """Fit the GLM and a Logit model started close to the GLM params."""
        from .results.results_glm import Lbw

        cls.res2 = Lbw()
        cls.res1 = GLM(cls.res2.endog, cls.res2.exog,
                       family=sm.families.Binomial()).fit()

        modd = discrete.Logit(cls.res2.endog, cls.res2.exog)
        cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False)

    def test_score_r(self):
        """Compare score test statistics with hard-coded reference numbers.

        The R commands recorded at the end of this method are kept as the
        provenance note for those numbers.
        """
        res1 = self.res1
        res2 = self.res2

        # Only the statistic is checked for the squared-regressor tests.
        st, _, _ = res1.model.score_test(res1.params,
                                         exog_extra=res1.model.exog[:, 1]**2)
        st_res = 0.2837680293459376  # (-0.5326988167303712)**2
        assert_allclose(st, st_res, rtol=1e-4)

        st, _, _ = res1.model.score_test(res1.params,
                                         exog_extra=res1.model.exog[:, 0]**2)
        st_res = 0.6713492821514992  # (-0.8193590679009413)**2
        assert_allclose(st, st_res, rtol=1e-4)

        # Refit without column 7 and test adding back the second to last
        # column of the full design matrix.
        select = [idx for idx in range(9) if idx != 7]

        res1b = GLM(res2.endog, res2.exog.iloc[:, select],
                    family=sm.families.Binomial()).fit()
        tres = res1b.model.score_test(res1b.params,
                                      exog_extra=res1.model.exog[:, -2])
        tres = np.asarray(tres[:2]).ravel()
        tres_r = (2.7864148487452, 0.0950667)
        assert_allclose(tres, tres_r, rtol=1e-4)

        # R commands (kept as a comment; previously an unused string local):
        #
        # data = read.csv("...statsmodels\statsmodels\genmod\tests\results\stata_lbw_glm.csv")
        #
        # data["race_black"] = data["race"] == "black"
        # data["race_other"] = data["race"] == "other"
        # mod = glm(low ~ age + lwt + race_black + race_other + smoke + ptl + ht + ui, family=binomial, data=data)
        # options(digits=16)
        # anova(mod, test="Rao")
        #
        # library(statmod)
        # s = glm.scoretest(mod, data["age"]**2)
        # s**2
        # s = glm.scoretest(mod, data["lwt"]**2)
        # s**2
|
|
|
|
# class TestGlmBernoulliIdentity(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBernoulliLog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBernoulliProbit(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBernoulliCloglog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBernoulliPower(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class TestGlmBernoulliLoglog(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# class test_glm_bernoulli_logc(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
|
|
class TestGlmGamma(CheckModelResultsMixin):
    """Gamma GLM on the scotland data against the ``Scotvote`` results."""

    @classmethod
    def setup_class(cls):
        """
        Tests Gamma family with canonical inverse link (power -1)
        """
        from statsmodels.datasets.scotland import load

        from .results.results_glm import Scotvote

        # Test Precisions
        cls.decimal_aic_R = -1  # TODO: off by about 1, we are right with Stata
        cls.decimal_resids = DECIMAL_2

        scotland = load()
        scotland.exog = add_constant(scotland.exog, prepend=False)
        # All warnings raised while fitting are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            gamma_fit = GLM(scotland.endog, scotland.exog,
                            family=sm.families.Gamma()).fit()
        cls.res1 = gamma_fit

        # res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
        reference = Scotvote()
        # R does not count degree of freedom for scale with gamma
        reference.aic_R += 2
        cls.res2 = reference
|
|
|
|
|
|
class TestGlmGammaLog(CheckModelResultsMixin):
    """Gamma family with a log link against the ``CancerLog`` results."""

    @classmethod
    def setup_class(cls):
        """Fit on the data stored with the reference results."""
        from .results.results_glm import CancerLog

        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_fittedvalues = DECIMAL_3

        reference = CancerLog()
        gamma_log = sm.families.Gamma(link=sm.families.links.Log())
        cls.res1 = GLM(reference.endog, reference.exog,
                       family=gamma_log).fit()
        cls.res2 = reference
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
|
|
# family=r.Gamma(link="log"))
|
|
# cls.res2.null_deviance = 27.92207137420696 # From R (bug in rpy)
|
|
# cls.res2.bic = -154.1582089453923 # from Stata
|
|
|
|
|
|
class TestGlmGammaIdentity(CheckModelResultsMixin):
    """Gamma family with identity link against ``CancerIdentity`` results."""

    @classmethod
    def setup_class(cls):
        """Fit on the data stored with the reference results."""
        from .results.results_glm import CancerIdentity

        # Test Precisions
        cls.decimal_resids = -100  # TODO Very off from Stata?
        cls.decimal_params = DECIMAL_2
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_loglike = DECIMAL_1

        reference = CancerIdentity()
        # All warnings from creating the family and fitting are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fam = sm.families.Gamma(link=sm.families.links.Identity())
            cls.res1 = GLM(reference.endog, reference.exog, family=fam).fit()
        cls.res2 = reference
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
|
|
# family=r.Gamma(link="identity"))
|
|
# cls.res2.null_deviance = 27.92207137420696 # from R, Rpy bug
|
|
|
|
class TestGlmPoisson(CheckModelResultsMixin, CheckComparisonMixin):
    """Poisson GLM on cpunish, compared with R and ``discrete.Poisson``."""

    @classmethod
    def setup_class(cls):
        """
        Tests Poisson family with canonical log link.

        Test results were obtained by R.
        """
        from .results.results_glm import Cpunish

        data = cpunish.load()
        data.endog = np.require(data.endog, requirements="W")
        data.exog = np.require(data.exog, requirements="W")
        # Column 3 enters the model in logs.
        data.exog[:, 3] = np.log(data.exog[:, 3])
        data.exog = add_constant(data.exog, prepend=False)
        cls.data = data

        poisson_glm = GLM(data.endog, data.exog,
                          family=sm.families.Poisson())
        cls.res1 = poisson_glm.fit()
        cls.res2 = Cpunish()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(data.endog, data.exog)
        cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False)
|
|
|
|
#class TestGlmPoissonIdentity(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
#class TestGlmPoissonPower(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
|
|
class TestGlmInvgauss(CheckModelResultsMixin):
    """Inverse Gaussian GLM against the ``InvGauss`` reference results."""

    @classmethod
    def setup_class(cls):
        """
        Tests the Inverse Gaussian family in GLM.

        Notes
        -----
        Used the rndivgx.ado file provided by Hardin and Hilbe to
        generate the data.  Results are read from model_results, which
        were obtained by running R_ig.s
        """
        from .results.results_glm import InvGauss

        # Test Precisions
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_loglike = DECIMAL_0

        reference = InvGauss()
        invgauss_model = GLM(reference.endog, reference.exog,
                             family=sm.families.InverseGaussian())
        cls.res1 = invgauss_model.fit()
        cls.res2 = reference

    def test_get_distribution(self):
        """Check mean and variance of the family distribution only.

        Overrides the mixin version with this shorter check.
        """
        res1 = self.res1
        family = res1.model.family
        fitted = res1.fittedvalues
        distr = family.get_distribution(fitted, res1.scale)
        var_endog = family.variance(fitted) * res1.scale
        mean, var = distr.stats()
        assert_allclose(fitted, mean, rtol=1e-13)
        assert_allclose(var_endog, var, rtol=1e-13)
|
|
|
|
|
|
class TestGlmInvgaussLog(CheckModelResultsMixin):
    """Inverse Gaussian family with a log link against ``InvGaussLog``."""

    @classmethod
    def setup_class(cls):
        """Fit on the data stored with the reference results."""
        from .results.results_glm import InvGaussLog

        # Test Precisions
        cls.decimal_aic_R = -10  # Big difference vs R.
        cls.decimal_resids = DECIMAL_3

        reference = InvGaussLog()
        log_link_family = sm.families.InverseGaussian(
            link=sm.families.links.Log())
        cls.res1 = GLM(reference.endog, reference.exog,
                       family=log_link_family).fit()
        cls.res2 = reference
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
|
|
# family=r.inverse_gaussian(link="log"))
|
|
# cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug
|
|
# cls.res2.llf = -12162.72308 # from Stata, R's has big rounding diff
|
|
|
|
|
|
class TestGlmInvgaussIdentity(CheckModelResultsMixin):
    """Inverse Gaussian family with identity link on the ``Medpar1`` data."""

    @classmethod
    def setup_class(cls):
        """Fit on ``Medpar1`` and load the ``InvGaussIdentity`` results."""
        from .results.results_glm import Medpar1

        # Test Precisions
        cls.decimal_aic_R = -10  # TODO: Big difference vs R
        cls.decimal_fittedvalues = DECIMAL_3
        cls.decimal_params = DECIMAL_3

        medpar = Medpar1()
        # All warnings raised while building and fitting are silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            identity_family = sm.families.InverseGaussian(
                link=sm.families.links.Identity())
            cls.res1 = GLM(medpar.endog, medpar.exog,
                           family=identity_family).fit()

        from .results.results_glm import InvGaussIdentity
        cls.res2 = InvGaussIdentity()
|
|
|
|
# FIXME: enable or delete
|
|
# def setup(cls):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed."
|
|
# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
|
|
# family=r.inverse_gaussian(link="identity"))
|
|
# cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug
|
|
# cls.res2.llf = -12163.25545 # from Stata, big diff with R
|
|
|
|
|
|
class TestGlmNegbinomial(CheckModelResultsMixin):
    """Negative Binomial GLM on the committee data against ``Committee``."""

    @classmethod
    def setup_class(cls):
        """
        Test Negative Binomial family with log link
        """
        from statsmodels.datasets.committee import load

        # Test Precision
        # NOTE(review): ``decimal_resid`` is not read by any check visible in
        # this part of the file (the mixin reads ``decimal_resids``, set two
        # lines below) -- possibly a leftover misspelling; confirm before
        # removing.
        cls.decimal_resid = DECIMAL_1
        cls.decimal_params = DECIMAL_3
        cls.decimal_resids = -1  # 1 % mismatch at 0
        cls.decimal_fittedvalues = DECIMAL_1

        data = load()
        data.endog = np.require(data.endog, requirements="W")
        data.exog = np.require(data.exog, requirements="W")
        # Column 2 enters in logs; its product with column 1 is appended.
        data.exog[:, 2] = np.log(data.exog[:, 2])
        interaction = data.exog[:, 2] * data.exog[:, 1]
        data.exog = np.column_stack((data.exog, interaction))
        data.exog = add_constant(data.exog, prepend=False)
        cls.data = data

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DomainWarning)
            # Creating the family is expected to emit a UserWarning.
            with pytest.warns(UserWarning):
                fam = sm.families.NegativeBinomial()

            negbin_model = GLM(data.endog, data.exog, family=fam)
            cls.res1 = negbin_model.fit(scale='x2')

        from .results.results_glm import Committee
        reference = Committee()
        # They do not count a degree of freedom for the scale
        reference.aic_R += 2
        cls.res2 = reference
        cls.has_edispersion = True
|
|
|
|
# FIXME: enable or delete
|
|
# def setup_method(self):
|
|
# if skipR:
|
|
# raise SkipTest, "Rpy not installed"
|
|
# r.library('MASS') # this does not work when done in rmodelwrap?
|
|
# self.res2 = RModel(self.data.endog, self.data.exog, r.glm,
|
|
# family=r.negative_binomial(1))
|
|
# self.res2.null_deviance = 27.8110469364343
|
|
|
|
# FIXME: enable/xfail/skip or delete
|
|
#class TestGlmNegbinomial_log(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# FIXME: enable/xfail/skip or delete
|
|
#class TestGlmNegbinomial_power(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
# FIXME: enable/xfail/skip or delete
|
|
#class TestGlmNegbinomial_nbinom(CheckModelResultsMixin):
|
|
# pass
|
|
|
|
|
|
class TestGlmPoissonOffset(CheckModelResultsMixin):
|
|
@classmethod
|
|
def setup_class(cls):
|
|
from .results.results_glm import Cpunish_offset
|
|
cls.decimal_params = DECIMAL_4
|
|
cls.decimal_bse = DECIMAL_4
|
|
cls.decimal_aic_R = 3
|
|
data = cpunish.load()
|
|
data.endog = np.asarray(data.endog)
|
|
data.exog = np.asarray(data.exog)
|
|
data.exog[:, 3] = np.log(data.exog[:, 3])
|
|
data.exog = add_constant(data.exog, prepend=True)
|
|
exposure = [100] * len(data.endog)
|
|
cls.data = data
|
|
cls.exposure = exposure
|
|
cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(),
|
|
exposure=exposure).fit()
|
|
cls.res2 = Cpunish_offset()
|
|
|
|
def test_missing(self):
|
|
# make sure offset is dropped correctly
|
|
endog = self.data.endog.copy()
|
|
endog[[2,4,6,8]] = np.nan
|
|
mod = GLM(endog, self.data.exog, family=sm.families.Poisson(),
|
|
exposure=self.exposure, missing='drop')
|
|
assert_equal(mod.exposure.shape[0], 13)
|
|
|
|
def test_offset_exposure(self):
|
|
# exposure=x and offset=log(x) should have the same effect
|
|
np.random.seed(382304)
|
|
endog = np.random.randint(0, 10, 100)
|
|
exog = np.random.normal(size=(100,3))
|
|
exposure = np.random.uniform(1, 2, 100)
|
|
offset = np.random.uniform(1, 2, 100)
|
|
mod1 = GLM(endog, exog, family=sm.families.Poisson(),
|
|
offset=offset, exposure=exposure).fit()
|
|
offset2 = offset + np.log(exposure)
|
|
mod2 = GLM(endog, exog, family=sm.families.Poisson(),
|
|
offset=offset2).fit()
|
|
assert_almost_equal(mod1.params, mod2.params)
|
|
assert_allclose(mod1.null, mod2.null, rtol=1e-10)
|
|
|
|
# test recreating model
|
|
mod1_ = mod1.model
|
|
kwds = mod1_._get_init_kwds()
|
|
assert_allclose(kwds['exposure'], exposure, rtol=1e-14)
|
|
assert_allclose(kwds['offset'], mod1_.offset, rtol=1e-14)
|
|
mod3 = mod1_.__class__(mod1_.endog, mod1_.exog, **kwds)
|
|
assert_allclose(mod3.exposure, mod1_.exposure, rtol=1e-14)
|
|
assert_allclose(mod3.offset, mod1_.offset, rtol=1e-14)
|
|
|
|
# test fit_regularized exposure, see #4605
|
|
resr1 = mod1.model.fit_regularized()
|
|
resr2 = mod2.model.fit_regularized()
|
|
assert_allclose(resr1.params, resr2.params, rtol=1e-10)
|
|
|
|
|
|
def test_predict(self):
    """Exercise ``predict`` with explicit and default exposure/offset."""
    np.random.seed(382304)
    y = np.random.randint(0, 10, 100)
    x = np.random.normal(size=(100, 3))
    expo = np.random.uniform(1, 2, 100)
    res_expo = GLM(
        y, x, family=sm.families.Poisson(), exposure=expo,
    ).fit()
    x_new = np.random.normal(size=(10, 3))
    expo_new = np.random.uniform(1, 2, 10)

    # Doubling exposure time should double expected response
    single = res_expo.predict(exog=x_new, exposure=expo_new)
    doubled = res_expo.predict(exog=x_new, exposure=2*expo_new)
    assert_almost_equal(doubled, 2*single)

    # Check exposure defaults
    implicit_all = res_expo.predict()
    implicit_exog = res_expo.predict(exposure=expo)
    explicit_all = res_expo.predict(exog=x, exposure=expo)
    assert_almost_equal(implicit_all, implicit_exog)
    assert_almost_equal(implicit_exog, explicit_all)

    # Check offset defaults
    train_offset = np.random.uniform(1, 2, 100)
    res_offset = GLM(
        y, x, offset=train_offset, family=sm.families.Poisson(),
    ).fit()
    implicit_all = res_offset.predict()
    implicit_exog = res_offset.predict(which="mean", offset=train_offset)
    explicit_all = res_offset.predict(
        exog=x, which="mean", offset=train_offset,
    )
    assert_almost_equal(implicit_all, implicit_exog)
    assert_almost_equal(implicit_exog, explicit_all)

    # Check that offset shifts the linear predictor
    res_plain = GLM(y, x, family=sm.families.Poisson()).fit()
    new_offset = np.random.uniform(1, 2, 10)
    with pytest.warns(FutureWarning):
        # deprecation warning for linear keyword
        lin_single = res_plain.predict(
            exog=x_new, offset=new_offset, linear=True,
        )
    lin_doubled = res_plain.predict(
        exog=x_new, offset=2*new_offset, which="linear",
    )
    assert_almost_equal(lin_doubled, lin_single+new_offset)

    # Passing exposure as a pandas series should not affect output type
    from_series = res_expo.predict(
        exog=x_new, exposure=pd.Series(expo_new),
    )
    assert isinstance(from_series, np.ndarray)
|
|
|
|
|
|
def test_perfect_pred(iris):
    """Fitting a Binomial GLM on the two-class iris subset must warn.

    Rows whose last column equals 2 are removed before fitting; the fit is
    expected to emit ``PerfectSeparationWarning``.
    """
    labels = iris[:, -1]
    keep = labels != 2
    features = add_constant(iris[:, :-1][keep], prepend=True)
    binary_mod = GLM(labels[keep], features, family=sm.families.Binomial())

    with pytest.warns(PerfectSeparationWarning):
        binary_mod.fit()
|
|
|
|
|
|
def test_score_test_ols():
    """Compare the Gaussian GLM score test with the OLS LM test."""
    # nicer example than Longley
    from statsmodels.regression.linear_model import OLS

    np.random.seed(5)
    nobs = 100
    sige = 0.5
    design = np.random.uniform(0, 1, size=(nobs, 5))
    design[:, 0] = 1
    coefs = 1. / np.arange(1., design.shape[1] + 1)
    response = design.dot(coefs) + sige * np.random.randn(nobs)

    # The restricted design omits the last two columns.
    restricted = design[:, :-2]
    extra = design[:, -2:]

    full_ols = OLS(response, design).fit()
    restricted_ols = OLS(response, restricted).fit()
    lm_ols = full_ols.compare_lm_test(restricted_ols, demean=False)

    restricted_glm = GLM(
        response, restricted, family=sm.families.Gaussian(),
    ).fit()
    lm_glm = restricted_glm.model.score_test(
        restricted_glm.params, exog_extra=extra,
    )
    # difference in df_resid versus nobs in scale see #1786
    assert_allclose(lm_ols[0] * 97 / 100., lm_glm[0], rtol=1e-13)
|
|
|
|
|
|
def test_attribute_writable_resettable():
    """Regression test for mutables and class constructors.

    Changing ``family.link.power`` on one default-constructed model must
    not leak into a model constructed afterwards.
    """
    longley_data = sm.datasets.longley.load()
    y, x = longley_data.endog, longley_data.exog

    first = sm.GLM(y, x)
    assert_equal(first.family.link.power, 1.0)
    first.family.link.power = 2.
    assert_equal(first.family.link.power, 2.0)

    second = sm.GLM(y, x)
    assert_equal(second.family.link.power, 1.0)
|
|
|
|
|
|
class TestStartParams(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gaussian family with identity link, fitted from OLS start values.

        The Longley data (constant appended as last column) is fitted by
        OLS first; those estimates are passed as ``start_params`` to the
        GLM fit that is stored in ``res1``.
        '''
        from statsmodels.datasets.longley import load

        from .results.results_glm import Longley

        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_3

        cls.data = load()
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        ols_start = sm.OLS(cls.data.endog, cls.data.exog).fit().params
        gaussian_mod = GLM(
            cls.data.endog, cls.data.exog, family=sm.families.Gaussian(),
        )
        cls.res1 = gaussian_mod.fit(start_params=ols_start)
        cls.res2 = Longley()
|
|
|
|
|
|
def test_glm_start_params():
    """Binomial fit started from user ``start_params``, see 1604."""
    counts = np.array([50, 1, 50, 1, 5, 10])
    outcomes = np.repeat(np.array('0 1 0 0 0 1'.split(), int), counts)
    regressor = np.repeat([0, 0, 0.001, 100, -1, -1], counts)

    binom_mod = sm.GLM(
        outcomes, sm.add_constant(regressor), family=sm.families.Binomial(),
    )
    fitted = binom_mod.fit(start_params=[-4, -5])
    expected = [-4.60305022, -5.29634545]
    np.testing.assert_almost_equal(fitted.params, expected, 6)
|
|
|
|
|
|
def test_loglike_no_opt():
    """With ``maxiter=0`` the reported llf is the llf at ``start_params``.

    The reference value is a hand-written logistic log-likelihood.
    See 1728.
    """
    y = np.asarray([0, 1, 0, 0, 1, 1, 0, 1, 1, 1])
    x = np.arange(10, dtype=np.float64)

    def logit_llf(beta):
        # Bernoulli log-likelihood with a logistic mean function.
        eta = beta[0] + beta[1]*x
        prob = 1 / (1 + np.exp(-eta))
        return np.sum(y*np.log(prob) + (1-y)*np.log(1-prob))

    candidates = ([0, 0], [0, 1], [0.5, 0.5])
    for start in candidates:
        binom_mod = sm.GLM(
            y, sm.add_constant(x), family=sm.families.Binomial(),
        )
        unoptimized = binom_mod.fit(start_params=start, maxiter=0)
        expected = logit_llf(start)
        assert_almost_equal(expected, unoptimized.llf)
|
|
|
|
|
|
def test_formula_missing_exposure():
    """Exposure handling with missing data in the formula API, see 2083.

    The stored exposure must be a plain ndarray, and an exposure whose
    length does not match the data must raise ``ValueError`` for both the
    formula and the array interface.
    """
    import statsmodels.formula.api as smf

    frame = pd.DataFrame({
        'Foo': [1, 2, 10, 149],
        'Bar': [1, 2, 3, np.nan],
        'constant': [1] * 4,
        'exposure': np.random.uniform(size=4),
        'x': [1, 3, 2, 1.5],
    })

    log_gaussian = sm.families.Gaussian(link=sm.families.links.Log())

    formula_mod = smf.glm(
        "Foo ~ Bar", data=frame, exposure=frame.exposure,
        family=log_gaussian,
    )
    assert_(type(formula_mod.exposure) is np.ndarray,
            msg='Exposure is not ndarray')

    too_long = pd.Series(np.random.uniform(size=5))
    # nan not relevant for ValueError for shape mismatch
    frame.loc[3, 'Bar'] = 4
    with assert_raises(ValueError):
        smf.glm("Foo ~ Bar", data=frame, exposure=too_long,
                family=log_gaussian)
    with assert_raises(ValueError):
        GLM(frame.Foo, frame[['constant', 'Bar']], exposure=too_long,
            family=log_gaussian)
|
|
|
|
|
|
@pytest.mark.matplotlib
def test_plots(close_figures):
    """Smoke test of the GLM residual plots for array and formula models.

    For each regressor the added-variable, partial-residual and CERES
    plots are drawn, a lowess line is added, and the figure is handed to
    ``close_or_save``.
    """
    import pandas as pd

    from statsmodels.graphics.regressionplots import add_lowess

    np.random.seed(378)
    n = 200
    exog = np.random.normal(size=(n, 2))
    lin_pred = exog[:, 0] + exog[:, 1]**2
    prob = 1 / (1 + np.exp(-lin_pred))
    endog = 1 * (np.random.uniform(size=n) < prob)

    def draw_all(res, focus):
        # Same three plots, in the same order, for one focus variable.
        fig = res.plot_added_variable(focus)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = res.plot_partial_residuals(focus)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = res.plot_ceres_residuals(focus)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)

    # array interface
    array_result = sm.GLM(endog, exog, family=sm.families.Binomial()).fit()
    for column in (0, 1):
        draw_all(array_result, column)

    # formula interface
    data = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]})
    formula_mod = sm.GLM.from_formula(
        "y ~ x1 + x2", data, family=sm.families.Binomial(),
    )
    formula_result = formula_mod.fit()
    for xname in ("x1", "x2"):
        draw_all(formula_result, xname)
|
|
|
|
def gen_endog(lin_pred, family_class, link, binom_version=0):
    """Simulate a response for ``family_class`` from a linear predictor.

    Parameters
    ----------
    lin_pred : ndarray
        Linear predictor; mapped to the mean via ``link().inverse``.
    family_class : class
        One of the ``sm.families`` classes handled below.
    link : class
        Link class; it is instantiated without arguments.
    binom_version : int
        Only read for the Binomial family: 0 produces a 0/1 response, any
        other value a two-column (successes, failures) array out of 10
        trials.

    Returns
    -------
    ndarray
        The simulated endog.

    Raises
    ------
    ValueError
        If ``family_class`` is none of the handled families.

    Notes
    -----
    The global NumPy random state is re-seeded on every call.
    """
    np.random.seed(872)

    fam = sm.families
    mu = link().inverse(lin_pred)
    nobs = len(lin_pred)

    if family_class == fam.Binomial:
        if binom_version == 0:
            return 1*(np.random.uniform(size=nobs) < mu)
        trials = 10
        counts = np.empty((nobs, 2))
        draws = np.random.uniform(size=(nobs, trials))
        counts[:, 0] = (draws < mu[:, None]).sum(1)
        counts[:, 1] = trials - counts[:, 0]
        return counts

    if family_class == fam.Poisson:
        return np.random.poisson(mu)

    if family_class == fam.Gamma:
        return np.random.gamma(2, mu)

    if family_class == fam.Gaussian:
        return mu + 2 * np.random.normal(size=nobs)

    if family_class == fam.NegativeBinomial:
        from scipy.stats.distributions import nbinom
        return nbinom.rvs(mu, 0.5)

    if family_class == fam.InverseGaussian:
        from scipy.stats.distributions import invgauss
        return invgauss.rvs(mu, scale=20)

    raise ValueError
|
|
|
|
|
|
@pytest.mark.smoke
def test_summary():
    """Smoke test: ``summary()`` runs for IRLS and gradient ("cg") fits."""
    np.random.seed(4323)

    nobs = 100
    design = np.random.normal(size=(nobs, 2))
    design[:, 0] = 1
    response = np.random.normal(size=nobs)

    for optimizer in ("irls", "cg"):
        gaussian = sm.families.Gaussian()
        fitted = sm.GLM(response, design, family=gaussian).fit(
            method=optimizer,
        )
        # Only checks that building the summary does not raise.
        fitted.summary()
|
|
|
|
|
|
def check_score_hessian(results):
    """Compare the model's score and Hessian with numerical derivatives.

    Parameters
    ----------
    results : results instance
        Fitted results; ``results.model`` must provide ``loglike``,
        ``score`` and ``hessian`` accepting a ``scale`` keyword.
    """
    model = results.model
    params = results.params
    # avoid checking score at MLE, score close to zero
    shifted = params * 0.98

    def loglike_at(x):
        return model.loglike(x, scale=1)

    def score_at(x):
        return model.score(x, scale=1)

    analytic_score = model.score(shifted, scale=1)
    # cs currently (0.9) does not work for all families
    numeric_score = approx_fprime(shifted, loglike_at)
    assert_allclose(analytic_score, numeric_score, rtol=1e-4, atol=1e-4)

    analytic_hess = model.hessian(params, scale=1)
    # Hessian from second differences of the log-likelihood ...
    hess_from_llf = approx_hess(params, loglike_at)
    assert_allclose(analytic_hess, hess_from_llf, rtol=1e-4)
    # ... and from first differences of the analytic score.
    hess_from_score = approx_fprime(params, score_at)
    assert_allclose(analytic_hess, hess_from_score, rtol=1e-4)
|
|
|
|
|
|
def test_gradient_irls():
    """Compare the results when using gradient optimization and IRLS.

    For every family/link combination below, data is simulated with
    ``gen_endog``, the model is fitted by IRLS and by Newton (once started
    at the IRLS estimate, once with 3 IRLS warm-up iterations), and
    params, llf, scale and standard errors are compared.
    """
    # TODO: Find working examples for inverse_squared link

    np.random.seed(87342)

    fam = sm.families
    lnk = sm.families.links
    families = [
        (fam.Binomial,
         [lnk.Logit, lnk.Probit, lnk.CLogLog, lnk.Log, lnk.Cauchy]),
        (fam.Poisson, [lnk.Log, lnk.Identity, lnk.Sqrt]),
        (fam.Gamma, [lnk.Log, lnk.Identity, lnk.InversePower]),
        (fam.Gaussian, [lnk.Identity, lnk.Log, lnk.InversePower]),
        (fam.InverseGaussian,
         [lnk.Log, lnk.Identity, lnk.InversePower, lnk.InverseSquared]),
        (fam.NegativeBinomial,
         [lnk.Log, lnk.InversePower, lnk.InverseSquared, lnk.Identity]),
    ]

    # Combinations for which the numerical score/Hessian check is not run.
    no_derivative_check = [
        (fam.Poisson, lnk.Sqrt),
        (fam.Gamma, lnk.InversePower),
        (fam.InverseGaussian, lnk.Identity),
    ]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    # NOTE(review): once set to True below, skip_one is never reset, so
    # the (3, None) start is also skipped for every later family/link.
    # Left unchanged here; confirm whether that is intended.
    skip_one = False
    for family_class, family_links in families:
        for link in family_links:
            for binom_version in 0, 1:

                if family_class != fam.Binomial and binom_version == 1:
                    continue

                if (family_class, link) == (fam.Poisson, lnk.Identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (fam.Binomial, lnk.Log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (fam.Poisson, lnk.Sqrt):
                    lin_pred = 2 + exog.sum(1)
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.Log):
                    # skip_zero = True
                    lin_pred = -1 + exog.sum(1)
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.Identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.InverseSquared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.InversePower):
                    lin_pred = 1 + exog.sum(1) / 5
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.Identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.InverseSquared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.InversePower):
                    lin_pred = 1 + exog.sum(1) / 5

                elif (family_class, link) == (fam.Gaussian,
                                              lnk.InversePower):
                    # adding skip because of convergence failure
                    # (lin_pred from the previous iteration is reused)
                    skip_one = True
                # the following fails with Identity link, because endog < 0
                # elif family_class == fam.Gamma:
                #     lin_pred = (0.5 * exog.sum(1)
                #                 + np.random.uniform(size=exog.shape[0]))
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link,
                                  binom_version)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog,
                                      family=family_class(link=link()))
                    rslt_irls = mod_irls.fit(method="IRLS")

                if (family_class, link) not in no_derivative_check:
                    check_score_hessian(rslt_irls)

                # Try with and without starting values.
                for max_start_irls, start_params in ((0, rslt_irls.params),
                                                     (3, None)):
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(
                            endog, exog, family=family_class(link=link()))
                        rslt_gradient = mod_gradient.fit(
                            max_start_irls=max_start_irls,
                            start_params=start_params,
                            method="newton", maxiter=300)

                    assert_allclose(rslt_gradient.params,
                                    rslt_irls.params, rtol=1e-6, atol=5e-5)

                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)

                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)
                    # rslt_irls.bse corresponds to observed=True
                    assert_allclose(rslt_gradient.bse, rslt_irls.bse,
                                    rtol=0.2, atol=5e-5)

                    rslt_gradient_eim = mod_gradient.fit(
                        max_start_irls=0,
                        cov_type='eim',
                        start_params=rslt_gradient.params,
                        method="newton", maxiter=300)
                    assert_allclose(rslt_gradient_eim.bse, rslt_irls.bse,
                                    rtol=5e-5, atol=0)
|
|
|
|
|
|
def test_gradient_irls_eim():
    """Compare Newton with the EIM Hessian (``optim_hessian='eim'``) to IRLS.

    Same family/link grid and simulated data as the plain gradient test;
    params, llf, scale and expected-information standard errors of both
    fits are compared.
    """
    # TODO: Find working examples for inverse_squared link

    np.random.seed(87342)

    fam = sm.families
    lnk = sm.families.links
    families = [
        (fam.Binomial,
         [lnk.Logit, lnk.Probit, lnk.CLogLog, lnk.Log, lnk.Cauchy]),
        (fam.Poisson, [lnk.Log, lnk.Identity, lnk.Sqrt]),
        (fam.Gamma, [lnk.Log, lnk.Identity, lnk.InversePower]),
        (fam.Gaussian, [lnk.Identity, lnk.Log, lnk.InversePower]),
        (fam.InverseGaussian,
         [lnk.Log, lnk.Identity, lnk.InversePower, lnk.InverseSquared]),
        (fam.NegativeBinomial,
         [lnk.Log, lnk.InversePower, lnk.InverseSquared, lnk.Identity]),
    ]

    nobs = 100
    k_vars = 3
    exog = np.random.normal(size=(nobs, k_vars))
    exog[:, 0] = 1

    # NOTE(review): skip_one is never reset after being set below, so it
    # stays True for all later family/link combinations.
    skip_one = False
    for family_class, family_links in families:
        for link in family_links:
            # The (family, link) pair selects the linear predictor.
            combo = (family_class, link)
            for binom_version in (0, 1):

                if binom_version == 1 and family_class != fam.Binomial:
                    continue

                if combo == (fam.Poisson, lnk.Identity):
                    lin_pred = 20 + exog.sum(1)
                elif combo == (fam.Binomial, lnk.Log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif combo == (fam.Poisson, lnk.Sqrt):
                    lin_pred = 2 + exog.sum(1)
                elif combo == (fam.InverseGaussian, lnk.Log):
                    # skip_zero = True
                    lin_pred = -1 + exog.sum(1)
                elif combo == (fam.InverseGaussian, lnk.Identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif combo == (fam.InverseGaussian, lnk.InverseSquared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif combo == (fam.InverseGaussian, lnk.InversePower):
                    lin_pred = 1 + exog.sum(1) / 5
                elif combo == (fam.NegativeBinomial, lnk.Identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif combo == (fam.NegativeBinomial, lnk.InverseSquared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif combo == (fam.NegativeBinomial, lnk.InversePower):
                    lin_pred = 1 + exog.sum(1) / 5

                elif combo == (fam.Gaussian, lnk.InversePower):
                    # adding skip because of convergence failure
                    skip_one = True
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link,
                                  binom_version)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog,
                                      family=family_class(link=link()))
                    rslt_irls = mod_irls.fit(method="IRLS")

                # Try with and without starting values.
                starts = ((0, rslt_irls.params), (3, None))
                for max_start_irls, start_params in starts:
                    # TODO: skip convergence failures for now
                    if skip_one and max_start_irls > 0:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(
                            endog, exog, family=family_class(link=link()))
                        rslt_gradient = mod_gradient.fit(
                            max_start_irls=max_start_irls,
                            start_params=start_params,
                            method="newton",
                            optim_hessian='eim'
                        )

                    assert_allclose(rslt_gradient.params, rslt_irls.params,
                                    rtol=1e-6, atol=5e-5)

                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)

                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    cov_expected = np.linalg.inv(ehess)
                    gradient_bse = np.sqrt(-np.diag(cov_expected))

                    assert_allclose(gradient_bse, rslt_irls.bse, rtol=1e-6,
                                    atol=5e-5)
|
|
|
|
|
|
def test_glm_irls_method():
    """Check the ``wls_method`` options of IRLS and the stored settings."""
    nobs, k_vars = 50, 4
    np.random.seed(987126)
    regressors = np.random.randn(nobs, k_vars - 1)
    exog = add_constant(regressors, has_constant='add')
    y = exog.sum(1) + np.random.randn(nobs)

    mod = GLM(y, exog)
    res_default = mod.fit()
    res_pinv = mod.fit(wls_method='pinv', attach_wls=True)
    res_qr = mod.fit(wls_method='qr', attach_wls=True)
    # fit_gradient does not attach mle_settings
    res_bfgs = mod.fit(start_params=res_default.params, method='bfgs')

    expected_wls = (
        (res_default, 'lstsq'),
        (res_pinv, 'pinv'),
        (res_qr, 'qr'),
    )
    for res, _ in expected_wls:
        assert_equal(res.mle_settings['optimizer'], 'IRLS')
        assert_equal(res.method, 'IRLS')

    for res, wls_name in expected_wls:
        assert_equal(res.mle_settings['wls_method'], wls_name)

    assert_(hasattr(res_pinv.results_wls.model, 'pinv_wexog'))
    assert_(hasattr(res_qr.results_wls.model, 'exog_Q'))

    # fit_gradient currently does not attach mle_settings
    assert_equal(res_bfgs.method, 'bfgs')
|
|
|
|
|
|
class CheckWtdDuplicationMixin:
    """Checks that ``freq_weights`` behave like physically repeated rows.

    ``setup_class`` prepares the cpunish data, integer weights and the
    row-repeated ("big") arrays. The test methods read ``res1`` and
    ``res2``, which this mixin does not set itself.
    """

    decimal_params = DECIMAL_4
    decimal_bse = DECIMAL_4
    decimal_resids = DECIMAL_4
    decimal_null_deviance = DECIMAL_4
    decimal_bic = DECIMAL_4
    decimal_fittedvalues = DECIMAL_4
    decimal_tpvalues = DECIMAL_4

    @classmethod
    def setup_class(cls):
        data = cpunish.load()
        data.endog = np.asarray(data.endog)
        data.exog = np.asarray(data.exog)
        cls.data = data
        cls.endog = data.endog
        cls.exog = data.exog
        np.random.seed(1234)
        cls.weight = np.random.randint(5, 100, len(cls.endog))
        # Repeat every observation ``weight`` times.
        cls.endog_big = np.repeat(cls.endog, cls.weight)
        cls.exog_big = np.repeat(cls.exog, cls.weight, axis=0)

    def test_params(self):
        assert_allclose(self.res1.params, self.res2.params,
                        atol=1e-6, rtol=1e-6)

    def test_standard_errors(self):
        assert_allclose(self.res1.bse, self.res2.bse,
                        rtol=1e-5, atol=1e-6)

    # TODO: This does not work... Arrays are of different shape.
    # Perhaps we use self.res1.model.family.resid_XXX()?
    # Disabled test, kept for reference:
    #
    # def test_residuals(self):
    #     resids1 = np.column_stack((self.res1.resid_pearson,
    #                                self.res1.resid_deviance,
    #                                self.res1.resid_working,
    #                                self.res1.resid_anscombe,
    #                                self.res1.resid_response))
    #     resids2 = np.column_stack((self.res1.resid_pearson,
    #                                self.res2.resid_deviance,
    #                                self.res2.resid_working,
    #                                self.res2.resid_anscombe,
    #                                self.res2.resid_response))
    #     assert_allclose(resids1, resids2, self.decimal_resids)

    def test_aic(self):
        # R includes the estimation of the scale as a lost dof
        # Does not with Gamma though
        assert_allclose(self.res1.aic, self.res2.aic,
                        atol=1e-6, rtol=1e-6)

    def test_deviance(self):
        assert_allclose(self.res1.deviance, self.res2.deviance,
                        atol=1e-6, rtol=1e-6)

    def test_scale(self):
        assert_allclose(self.res1.scale, self.res2.scale,
                        atol=1e-6, rtol=1e-6)

    def test_loglike(self):
        # Stata uses the below llf for these families
        # We differ with R for them
        assert_allclose(self.res1.llf, self.res2.llf, rtol=1e-6)

    def test_null_deviance(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DomainWarning)

            assert_allclose(self.res1.null_deviance,
                            self.res2.null_deviance,
                            atol=1e-6,
                            rtol=1e-6)

    def test_bic(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            assert_allclose(self.res1.bic, self.res2.bic,
                            atol=1e-6, rtol=1e-6)

    def test_fittedvalues(self):
        # Predict with the duplicated-data fit on the weighted fit's exog
        # so both arrays have the same length.
        expected = self.res2.predict(self.res1.model.exog)
        assert_allclose(self.res1.fittedvalues, expected,
                        atol=1e-5, rtol=1e-5)

    def test_tpvalues(self):
        # test comparing tvalues and pvalues with normal implementation
        # make sure they use normal distribution (inherited in results class)
        assert_allclose(self.res1.tvalues, self.res2.tvalues,
                        atol=1e-6, rtol=2e-4)
        assert_allclose(self.res1.pvalues, self.res2.pvalues,
                        atol=1e-6, rtol=1e-6)
        assert_allclose(self.res1.conf_int(), self.res2.conf_int(),
                        atol=1e-6, rtol=1e-6)
|
|
|
|
|
|
class TestWtdGlmPoisson(CheckWtdDuplicationMixin):

    @classmethod
    def setup_class(cls):
        '''
        Poisson family, default link: weighted fit vs. repeated rows.
        '''
        super().setup_class()
        cls.endog = np.asarray(cls.endog)
        cls.exog = np.asarray(cls.exog)

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=sm.families.Poisson(),
        )
        repeated_mod = GLM(
            cls.endog_big, cls.exog_big,
            family=sm.families.Poisson(),
        )
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmPoissonNewton(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Poisson family fitted with the Newton optimizer.

        The repeated-rows fit is started from fixed ``start_params``; the
        weighted fit uses the default start.
        '''
        super().setup_class()

        start_params = np.array([1.82794424e-04, -4.76785037e-02,
                                 -9.48249717e-02, -2.92293226e-04,
                                 2.63728909e+00, -2.05934384e+01])

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=sm.families.Poisson(),
        )
        cls.res1 = weighted_mod.fit(method='newton')

        repeated_mod = GLM(
            cls.endog_big, cls.exog_big,
            family=sm.families.Poisson(),
        )
        cls.res2 = repeated_mod.fit(method='newton',
                                    start_params=start_params)
|
|
|
|
|
|
class TestWtdGlmPoissonHC0(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Poisson family with ``cov_type='HC0'`` for both fits.

        The repeated-rows fit is started from fixed ``start_params``.
        '''
        super().setup_class()

        start_params = np.array([1.82794424e-04, -4.76785037e-02,
                                 -9.48249717e-02, -2.92293226e-04,
                                 2.63728909e+00, -2.05934384e+01])

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=sm.families.Poisson(),
        )
        cls.res1 = weighted_mod.fit(cov_type='HC0')

        repeated_mod = GLM(
            cls.endog_big, cls.exog_big,
            family=sm.families.Poisson(),
        )
        cls.res2 = repeated_mod.fit(cov_type='HC0',
                                    start_params=start_params)
|
|
|
|
|
|
class TestWtdGlmPoissonClu(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Poisson family with cluster-robust covariance.

        Group labels are repeated with the same weights as the rows for
        the repeated-rows fit; ``use_correction`` is off for both fits.
        '''
        import warnings

        super().setup_class()

        start_params = np.array([1.82794424e-04, -4.76785037e-02,
                                 -9.48249717e-02, -2.92293226e-04,
                                 2.63728909e+00, -2.05934384e+01])

        gid = np.arange(1, len(cls.endog) + 1) // 2
        weighted_kwds = {
            'cov_type': 'cluster',
            'cov_kwds': {'groups': gid, 'use_correction': False},
        }

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            weighted_mod = GLM(
                cls.endog, cls.exog,
                freq_weights=cls.weight,
                family=sm.families.Poisson(),
            )
            cls.res1 = weighted_mod.fit(**weighted_kwds)

            gidr = np.repeat(gid, cls.weight)
            repeated_kwds = {
                'cov_type': 'cluster',
                'cov_kwds': {'groups': gidr, 'use_correction': False},
            }
            repeated_mod = GLM(
                cls.endog_big, cls.exog_big,
                family=sm.families.Poisson(),
            )
            cls.res2 = repeated_mod.fit(start_params=start_params,
                                        **repeated_kwds)
|
|
|
|
|
|
class TestWtdGlmBinomial(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Binomial family, default link; both endog arrays are divided
        by 100 before fitting.
        '''
        super().setup_class()
        cls.endog = cls.endog / 100
        cls.endog_big = cls.endog_big / 100

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=sm.families.Binomial(),
        )
        repeated_mod = GLM(
            cls.endog_big, cls.exog_big,
            family=sm.families.Binomial(),
        )
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmNegativeBinomial(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Negative Binomial family with the NegativeBinomial link
        g(p) = log(p/(p + 1/alpha)), using alpha = 1.

        ``DomainWarning`` is silenced while the family is built and the
        models are fitted.
        '''
        super().setup_class()
        alpha = 1.

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DomainWarning)
            nb_link = sm.families.links.NegativeBinomial(alpha=alpha)
            family_link = sm.families.NegativeBinomial(link=nb_link,
                                                       alpha=alpha)
            weighted_mod = GLM(
                cls.endog, cls.exog,
                freq_weights=cls.weight,
                family=family_link,
            )
            cls.res1 = weighted_mod.fit()
            repeated_mod = GLM(
                cls.endog_big, cls.exog_big,
                family=family_link,
            )
            cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmGamma(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gamma family with log link: weighted fit vs. repeated rows.
        '''
        super().setup_class()
        gamma_log = sm.families.Gamma(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=gamma_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big, family=gamma_log)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmGaussian(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gaussian family with log link: weighted fit vs. repeated rows.
        '''
        super().setup_class()
        gaussian_log = sm.families.Gaussian(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=gaussian_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big,
                           family=gaussian_log)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmInverseGaussian(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        InverseGaussian family with log link: weighted fit vs. repeated
        rows.
        '''
        super().setup_class()
        invgauss_log = sm.families.InverseGaussian(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=invgauss_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big,
                           family=invgauss_log)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdGlmGammaNewton(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gamma family with log link, both fits using ``method='newton'``.
        '''
        super().setup_class()
        gamma_log = sm.families.Gamma(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=gamma_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big, family=gamma_log)
        cls.res1 = weighted_mod.fit(method='newton')
        cls.res2 = repeated_mod.fit(method='newton')

    def test_init_kwargs(self):
        """An unknown constructor keyword must raise a ``ValueWarning``."""
        gamma_log = sm.families.Gamma(sm.families.links.Log())

        with pytest.warns(ValueWarning, match="unknown kwargs"):
            GLM(
                self.endog,
                self.exog,
                family=gamma_log,
                weights=self.weight,  # incorrect keyword
            )
|
|
|
|
|
|
class TestWtdGlmGammaScale_X2(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gamma family with log link, both fits using ``scale='X2'``.
        '''
        super().setup_class()
        gamma_log = sm.families.Gamma(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=gamma_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big, family=gamma_log)
        cls.res1 = weighted_mod.fit(scale='X2')
        cls.res2 = repeated_mod.fit(scale='X2')
|
|
|
|
|
|
class TestWtdGlmGammaScale_dev(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Gamma family with log link, both fits using ``scale='dev'``.
        '''
        super().setup_class()
        gamma_log = sm.families.Gamma(sm.families.links.Log())

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=gamma_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big, family=gamma_log)
        cls.res1 = weighted_mod.fit(scale='dev')
        cls.res2 = repeated_mod.fit(scale='dev')

    def test_missing(self):
        """``freq_weights`` must be trimmed with rows dropped for NaNs."""
        y_with_gaps = self.data.endog.copy()
        x_with_gaps = self.data.exog.copy()
        x_with_gaps[0, 0] = np.nan
        y_with_gaps[[2, 4, 6, 8]] = np.nan

        mod_missing = GLM(
            y_with_gaps, x_with_gaps,
            family=self.res1.model.family,
            freq_weights=self.weight,
            missing='drop',
        )
        n_weights = mod_missing.freq_weights.shape[0]
        assert_equal(n_weights, mod_missing.endog.shape[0])
        assert_equal(n_weights, mod_missing.exog.shape[0])

        # Rows 0, 2, 4, 6 and 8 contain a NaN in exog or endog.
        keep_idx = np.array([1, 3, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16])
        assert_equal(mod_missing.freq_weights, self.weight[keep_idx])
|
|
|
|
|
|
class TestWtdTweedieLog(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with log link and var_power=1.
        '''
        super().setup_class()
        tweedie_log = sm.families.Tweedie(
            link=sm.families.links.Log(), var_power=1,
        )

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=tweedie_log,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big,
                           family=tweedie_log)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdTweediePower2(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with the default ``Power()`` link and var_power=2.

        Uses the pandas version of cpunish restricted to the INCOME and
        SOUTH columns, so the parent ``setup_class`` is not called.
        '''
        cls.data = cpunish.load_pandas()
        cls.endog = cls.data.endog
        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
        np.random.seed(1234)
        cls.weight = np.random.randint(5, 100, len(cls.endog))
        cls.endog_big = np.repeat(cls.endog.values, cls.weight)
        cls.exog_big = np.repeat(cls.exog.values, cls.weight, axis=0)

        tweedie_power = sm.families.Tweedie(
            link=sm.families.links.Power(), var_power=2,
        )
        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=tweedie_power,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big,
                           family=tweedie_power)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
class TestWtdTweediePower15(CheckWtdDuplicationMixin):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with Power(0.5) link and var_power=1.5.
        '''
        super().setup_class()
        tweedie_sqrt = sm.families.Tweedie(
            link=sm.families.links.Power(0.5), var_power=1.5,
        )

        weighted_mod = GLM(
            cls.endog, cls.exog,
            freq_weights=cls.weight,
            family=tweedie_sqrt,
        )
        repeated_mod = GLM(cls.endog_big, cls.exog_big,
                           family=tweedie_sqrt)
        cls.res1 = weighted_mod.fit()
        cls.res2 = repeated_mod.fit()
|
|
|
|
|
|
def test_wtd_patsy_missing():
    """``freq_weights`` must follow rows dropped by the formula interface.

    One exog entry and four endog entries are set to NaN; the model built
    with ``from_formula`` is expected to keep 12 rows and the matching
    weights.
    """
    import pandas as pd

    data = cpunish.load()
    # Writable arrays are required for the in-place NaN assignments.
    data.endog = np.require(data.endog, requirements="W")
    data.exog = np.require(data.exog, requirements="W")
    data.exog[0, 0] = np.nan
    data.endog[[2, 4, 6, 8]] = np.nan
    data.pandas = pd.DataFrame(data.exog, columns=data.exog_name)
    data.pandas['EXECUTIONS'] = data.endog

    weights = np.arange(1, len(data.endog)+1)
    formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 +
                 SOUTH + DEGREE"""
    mod_missing = GLM.from_formula(formula, data=data.pandas,
                                   freq_weights=weights)

    n_weights = mod_missing.freq_weights.shape[0]
    assert_equal(n_weights, mod_missing.endog.shape[0])
    assert_equal(n_weights, mod_missing.exog.shape[0])
    assert_equal(n_weights, 12)

    keep_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17])
    assert_equal(mod_missing.freq_weights, keep_weights)
|
|
|
|
|
|
class CheckTweedie:
    """
    Shared checks for Tweedie GLM fits.

    Subclasses set ``res1`` (the results under test) and ``res2`` (the
    reference results) in ``setup_class``.  The element-wise checks only
    compare the first 17 entries plus the last entry of each array.
    """

    @staticmethod
    def _head_and_last(values, n_head=17):
        """Return the first ``n_head`` entries of ``values`` plus its last.

        The input is converted with ``np.asarray`` first so that indexing
        is positional for lists, arrays and pandas objects alike.
        """
        values = np.asarray(values)
        return np.concatenate((values[:n_head], values[-1:]))

    def test_resid(self):
        # Compare every residual type on the same subset of observations.
        for name in ("resid_response", "resid_pearson",
                     "resid_deviance", "resid_working"):
            assert_allclose(self._head_and_last(getattr(self.res1, name)),
                            self._head_and_last(getattr(self.res2, name)),
                            rtol=1e-5, atol=1e-5, err_msg=name)

    def test_bse(self):
        # The original used rtol=1e6 (positive exponent), which accepts
        # any value; use the same relative tolerance as test_params.
        assert_allclose(self.res1.bse, self.res2.bse, atol=1e-6, rtol=1e-5)

    def test_params(self):
        assert_allclose(self.res1.params, self.res2.params, atol=1e-5,
                        rtol=1e-5)

    def test_deviance(self):
        assert_allclose(self.res1.deviance, self.res2.deviance, atol=1e-6,
                        rtol=1e-6)

    def test_df(self):
        assert_equal(self.res1.df_model, self.res2.df_model)
        assert_equal(self.res1.df_resid, self.res2.df_resid)

    def test_fittedvalues(self):
        # The last entry is taken from each fittedvalues array itself
        # (the original derived the reference index from resid_response).
        assert_allclose(self._head_and_last(self.res1.fittedvalues),
                        self._head_and_last(self.res2.fittedvalues),
                        atol=1e-4, rtol=1e-4)

    def test_summary(self):
        # Smoke test: both summary variants must run without raising.
        self.res1.summary()
        self.res1.summary2()
|
|
|
|
|
|
class TestTweediePower15(CheckTweedie):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with Power(1) link and var_power=1.5 on cpunish.

        ``res1`` is the fit on the INCOME and SOUTH columns; ``res2`` is the
        stored ``CpunishTweediePower15`` reference object.
        '''
        from .results.results_glm import CpunishTweediePower15

        dataset = cpunish.load_pandas()
        cls.data = dataset
        cls.endog = dataset.endog
        cls.exog = dataset.exog[['INCOME', 'SOUTH']]

        tweedie = sm.families.Tweedie(var_power=1.5,
                                      link=sm.families.links.Power(1))
        model = sm.GLM(endog=cls.endog, exog=cls.exog, family=tweedie)
        cls.res1 = model.fit()
        cls.res2 = CpunishTweediePower15()
|
|
|
|
|
|
class TestTweediePower2(CheckTweedie):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with Power(1) link and var_power=2 on cpunish.

        ``res1`` is the fit on the INCOME and SOUTH columns; ``res2`` is the
        stored ``CpunishTweediePower2`` reference object.
        '''
        from .results.results_glm import CpunishTweediePower2

        dataset = cpunish.load_pandas()
        cls.data = dataset
        cls.endog = dataset.endog
        cls.exog = dataset.exog[['INCOME', 'SOUTH']]

        tweedie = sm.families.Tweedie(var_power=2.,
                                      link=sm.families.links.Power(1))
        model = sm.GLM(endog=cls.endog, exog=cls.exog, family=tweedie)
        cls.res1 = model.fit()
        cls.res2 = CpunishTweediePower2()
|
|
|
|
|
|
class TestTweedieLog1(CheckTweedie):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with Log link and var_power=1 on cpunish.

        ``res1`` is the fit on the INCOME and SOUTH columns; ``res2`` is the
        stored ``CpunishTweedieLog1`` reference object.
        '''
        from .results.results_glm import CpunishTweedieLog1

        dataset = cpunish.load_pandas()
        cls.data = dataset
        cls.endog = dataset.endog
        cls.exog = dataset.exog[['INCOME', 'SOUTH']]

        tweedie = sm.families.Tweedie(var_power=1.,
                                      link=sm.families.links.Log())
        model = sm.GLM(endog=cls.endog, exog=cls.exog, family=tweedie)
        cls.res1 = model.fit()
        cls.res2 = CpunishTweedieLog1()
|
|
|
|
|
|
class TestTweedieLog15Fair(CheckTweedie):
    @classmethod
    def setup_class(cls):
        '''
        Tweedie family with Log link and var_power=1.5 on the fair dataset.

        ``res1`` is the fit on the rate_marriage, age and yrs_married
        columns; ``res2`` is the stored ``FairTweedieLog15`` reference
        object.
        '''
        from statsmodels.datasets.fair import load_pandas

        from .results.results_glm import FairTweedieLog15

        fair = load_pandas()
        regressors = fair.exog[['rate_marriage', 'age', 'yrs_married']]
        tweedie = sm.families.Tweedie(var_power=1.5,
                                      link=sm.families.links.Log())
        model = sm.GLM(endog=fair.endog, exog=regressors, family=tweedie)
        cls.res1 = model.fit()
        cls.res2 = FairTweedieLog15()
|
|
|
|
|
|
class CheckTweedieSpecial:
    """
    Shared checks that two fitted results, ``res1`` and ``res2``, agree.

    Subclasses set both attributes in ``setup_class``; the fitted means and
    five residual types are compared with rtol=atol=1e-5.
    """

    def test_mu(self):
        fitted, reference = self.res1.mu, self.res2.mu
        assert_allclose(fitted, reference, rtol=1e-5, atol=1e-5)

    def test_resid(self):
        # Checked in this order; the first mismatch aborts the test.
        resid_kinds = (
            "resid_response",
            "resid_pearson",
            "resid_deviance",
            "resid_working",
            "resid_anscombe_unscaled",
        )
        for kind in resid_kinds:
            assert_allclose(getattr(self.res1, kind),
                            getattr(self.res2, kind),
                            rtol=1e-5, atol=1e-5)
|
|
|
|
|
|
class TestTweedieSpecialLog0(CheckTweedieSpecial):
    @classmethod
    def setup_class(cls):
        '''
        Gaussian family with Log link (``res1``) versus Tweedie family with
        Log link and var_power=0 (``res2``), both on cpunish INCOME/SOUTH.
        '''
        cls.data = cpunish.load_pandas()
        cls.endog = cls.data.endog
        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]

        gaussian = sm.families.Gaussian(link=sm.families.links.Log())
        cls.res1 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=gaussian).fit()

        tweedie = sm.families.Tweedie(var_power=0,
                                      link=sm.families.links.Log())
        cls.res2 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=tweedie).fit()
|
|
|
|
|
|
class TestTweedieSpecialLog1(CheckTweedieSpecial):
    @classmethod
    def setup_class(cls):
        '''
        Poisson family with Log link (``res1``) versus Tweedie family with
        Log link and var_power=1 (``res2``), both on cpunish INCOME/SOUTH.
        '''
        cls.data = cpunish.load_pandas()
        cls.endog = cls.data.endog
        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]

        poisson = sm.families.Poisson(link=sm.families.links.Log())
        cls.res1 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=poisson).fit()

        tweedie = sm.families.Tweedie(var_power=1,
                                      link=sm.families.links.Log())
        cls.res2 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=tweedie).fit()
|
|
|
|
|
|
class TestTweedieSpecialLog2(CheckTweedieSpecial):
    @classmethod
    def setup_class(cls):
        '''
        Gamma family with Log link (``res1``) versus Tweedie family with
        Log link and var_power=2 (``res2``), both on cpunish INCOME/SOUTH.
        '''
        cls.data = cpunish.load_pandas()
        cls.endog = cls.data.endog
        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]

        gamma = sm.families.Gamma(link=sm.families.links.Log())
        cls.res1 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=gamma).fit()

        tweedie = sm.families.Tweedie(var_power=2,
                                      link=sm.families.links.Log())
        cls.res2 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=tweedie).fit()
|
|
|
|
|
|
class TestTweedieSpecialLog3(CheckTweedieSpecial):
    @classmethod
    def setup_class(cls):
        '''
        InverseGaussian family with Log link (``res1``) versus Tweedie
        family with Log link and var_power=3 (``res2``), both on cpunish
        INCOME/SOUTH.
        '''
        cls.data = cpunish.load_pandas()
        cls.endog = cls.data.endog
        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]

        inv_gauss = sm.families.InverseGaussian(link=sm.families.links.Log())
        cls.res1 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=inv_gauss).fit()

        tweedie = sm.families.Tweedie(var_power=3,
                                      link=sm.families.links.Log())
        cls.res2 = sm.GLM(endog=cls.endog, exog=cls.exog,
                          family=tweedie).fit()
|
|
|
|
def gen_tweedie(p):
    """
    Simulate responses as Poisson-many sums of gamma draws.

    Parameters
    ----------
    p : float
        Variance power used in the simulation formulas.  They divide by
        ``2 - p`` and ``p - 1``, so ``p`` must differ from 1 and 2.

    Returns
    -------
    y : ndarray
        500 simulated responses; entry ``i`` is the sum of ``N[i]`` gamma
        draws, where ``N[i]`` is itself a Poisson draw.
    x : ndarray
        500 x 4 array of standard normal covariates.

    Notes
    -----
    Reseeds the global numpy random state with 3242 on every call, so the
    output depends only on ``p``.
    """
    np.random.seed(3242)
    nobs = 500
    x = np.random.normal(size=(nobs, 4))
    mu = np.exp(np.dot(x, np.r_[1, -1, 0, 0.5]))

    poisson_mean = 10 * mu**(2 - p) / (2 - p)
    gamma_shape = (2 - p) / (p - 1)
    gamma_rate = 10 * mu**(1 - p) / (p - 1)

    # Generate Tweedie values using a compound Poisson distribution:
    # draw the number of gamma terms first, then sum the gamma draws.
    counts = np.random.poisson(poisson_mean)
    y = np.array([np.random.gamma(gamma_shape, 1 / rate, k).sum()
                  for rate, k in zip(gamma_rate, counts)])

    return y, x
|
|
|
|
@pytest.mark.filterwarnings("ignore:GLM ridge optimization")
def test_tweedie_EQL():
    """Regression checks for Tweedie EQL fits: plain, lasso and ridge."""
    # All expected values below are regression values, but they are very
    # close to the population values.
    var_power = 1.5
    y, x = gen_tweedie(var_power)
    n_params = x.shape[1]
    fam = sm.families.Tweedie(var_power=var_power, eql=True)

    # Un-regularized fit using gradients
    newton_fit = sm.GLM(y, x, family=fam).fit(method="newton")
    newton_expected = np.array(
        [1.00350497, -0.99656954, 0.00802702, 0.50713209])
    assert_allclose(newton_fit.params, newton_expected,
                    rtol=1e-5, atol=1e-5)

    # Un-regularized fit using IRLS must agree with the gradient fit
    irls_fit = sm.GLM(y, x, family=fam).fit(method="irls")
    assert_allclose(newton_fit.params, irls_fit.params)
    assert_allclose(newton_fit.bse, irls_fit.bse, rtol=1e-2)

    # Lasso fit using coordinate-wise descent
    # TODO: The search gets trapped in an infinite oscillation, so use
    # a slack convergence tolerance.
    lasso_fit = sm.GLM(y, x, family=fam).fit_regularized(
        L1_wt=1, alpha=0.07, maxiter=200, cnvrg_tol=0.01)

    rtol, atol = 1e-2, 1e-4
    lasso_expected = np.array([0.976831, -0.952854, 0., 0.470171])
    assert_allclose(lasso_fit.params, lasso_expected, rtol=rtol, atol=atol)

    # Series of ridge fits using gradients
    ridge_alphas = [0.05, 0.5, 0.7]
    ridge_expected = (
        np.array([1.001778, -0.99388, 0.00797, 0.506183]),
        np.array([0.98586638, -0.96953481, 0.00749983, 0.4975267]),
        np.array([0.206429, -0.164547, 0.000235, 0.102489]),
    )
    for alpha, expected in zip(ridge_alphas, ridge_expected):
        ridge_model = sm.GLM(y, x, family=fam)
        scalar_fit = ridge_model.fit_regularized(L1_wt=0, alpha=alpha)
        assert_allclose(scalar_fit.params, expected, rtol=rtol, atol=atol)

        # A constant penalty vector must reproduce the scalar penalty.
        uniform_alpha = alpha * np.ones(n_params)
        vector_fit = ridge_model.fit_regularized(L1_wt=0,
                                                 alpha=uniform_alpha)
        assert_allclose(vector_fit.params, scalar_fit.params,
                        rtol=rtol, atol=atol)

        # Dropping the penalty on the first coefficient must change the fit.
        partial_alpha = alpha * np.ones(n_params)
        partial_alpha[0] = 0
        partial_fit = ridge_model.fit_regularized(L1_wt=0,
                                                  alpha=partial_alpha)
        assert not np.allclose(partial_fit.params, vector_fit.params)
|
|
|
|
def test_tweedie_elastic_net():
    """Check that coefficients vanish one-by-one under the elastic net."""
    var_power = 1.5  # Tweedie variance exponent
    y, x = gen_tweedie(var_power)

    fam = sm.families.Tweedie(var_power=var_power, eql=True)
    model = sm.GLM(y, x, family=fam)

    def nonzero_count(alpha):
        # Number of coefficients left non-zero at this penalty weight.
        fit = model.fit_regularized(L1_wt=0.5, alpha=alpha)
        return (np.abs(fit.params) > 0).sum()

    nnz = np.unique([nonzero_count(alpha)
                     for alpha in np.linspace(0, 10, 20)])
    # Five distinct counts must appear along the penalty path.
    assert len(nnz) == 5
|
|
|
|
|
|
def test_tweedie_EQL_poisson_limit():
    """
    Test the limiting Poisson case of the Nelder/Pregibon/Tweedie EQL.

    A Tweedie EQL model with var_power=1 and a Poisson GLM are fitted to
    the same simulated counts and must give matching params and bse.
    """
    np.random.seed(3242)
    nobs = 500

    x = np.random.normal(size=(nobs, 3))
    x[:, 0] = 1  # the first column becomes the intercept
    mean = np.exp(4 + x[:, 1:].sum(1))
    y = np.random.poisson(mean)

    for scale in (1.0, 'x2', 'dev'):
        # Un-regularized fit using gradients not IRLS
        tweedie = sm.families.Tweedie(var_power=1, eql=True)
        eql_fit = sm.GLM(y, x, family=tweedie).fit(method="newton",
                                                  scale=scale)

        # Poisson GLM
        poisson_fit = sm.GLM(y, x, family=sm.families.Poisson()).fit(
            method="newton", scale=scale)

        assert_allclose(eql_fit.params, poisson_fit.params,
                        atol=1e-6, rtol=1e-6)
        assert_allclose(eql_fit.bse, poisson_fit.bse, rtol=1e-6, atol=1e-6)
|
|
|
|
|
|
def test_tweedie_EQL_upper_limit():
    """
    Test the limiting case of the Nelder/Pregibon/Tweedie EQL with
    var = mean^2.

    These are tests against population values so accuracy is not high.
    """
    np.random.seed(3242)
    nobs = 500

    x = np.random.normal(size=(nobs, 3))
    x[:, 0] = 1  # the first column becomes the intercept
    mean = np.exp(4 + x[:, 1:].sum(1))
    y = np.random.poisson(mean)

    # Coefficients used to simulate the data above.
    population_params = np.r_[4, 1, 1]

    for scale in ('x2', 'dev', 1.0):
        # Un-regularized fit using gradients not IRLS
        tweedie = sm.families.Tweedie(var_power=2, eql=True)
        eql_fit = sm.GLM(y, x, family=tweedie).fit(method="newton",
                                                  scale=scale)
        assert_allclose(eql_fit.params, population_params,
                        atol=1e-3, rtol=1e-1)
|
|
|
|
|
|
def testTweediePowerEstimate():
    # Test the Pearson estimate of the Tweedie variance and scale parameters.
    #
    # Ideally, this would match the following R code, but I cannot make it
    # work...
    #
    # setwd('c:/workspace')
    # data <- read.csv('cpunish.csv', sep=",")
    #
    # library(tweedie)
    #
    # y <- c(1.00113835e+05,   6.89668315e+03,   6.15726842e+03,
    #        1.41718806e+03,   5.11776456e+02,   2.55369154e+02,
    #        1.07147443e+01,   3.56874698e+00,   4.06797842e-02,
    #        7.06996731e-05,   2.10165106e-07,   4.34276938e-08,
    #        1.56354040e-09,   0.00000000e+00,   0.00000000e+00,
    #        0.00000000e+00,   0.00000000e+00)
    #
    # data$NewY <- y
    #
    # out <- tweedie.profile( NewY ~ INCOME + SOUTH - 1,
    #                         p.vec=c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8,
    #                                 1.9), link.power=0,
    #                         data=data,do.plot = TRUE)
    data = cpunish.load_pandas()
    y = [1.00113835e+05, 6.89668315e+03, 6.15726842e+03,
         1.41718806e+03, 5.11776456e+02, 2.55369154e+02,
         1.07147443e+01, 3.56874698e+00, 4.06797842e-02,
         7.06996731e-05, 2.10165106e-07, 4.34276938e-08,
         1.56354040e-09, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00]

    # First stage: Tweedie fit of the synthetic response.
    tweedie = sm.families.Tweedie(link=sm.families.links.Log(),
                                  var_power=1.5)
    model1 = sm.GLM(y, data.exog[['INCOME', 'SOUTH']], family=tweedie)
    res1 = model1.fit()
    mu = res1.mu

    # Second stage: Gamma regression of the squared residuals on an
    # intercept and log(mu); its slope is compared with the power estimate.
    squared_resid = (y - mu) ** 2
    design = np.column_stack((np.ones(len(mu)), np.log(mu)))
    model2 = sm.GLM(squared_resid, design,
                    family=sm.families.Gamma(sm.families.links.Log()))
    res2 = model2.fit()

    # Sample may be too small for this...
    # assert_allclose(res1.scale, np.exp(res2.params[0]), rtol=0.25)
    p = model1.estimate_tweedie_power(mu)
    assert_allclose(p, res2.params[1], rtol=0.25)
|
|
|
|
def test_glm_lasso_6431():
    """
    Zero-penalty regularized fits must recover the expected coefficients.

    Based on issue #6431; fails with newton-cg as optimizer.
    """
    from statsmodels.regression.linear_model import OLS

    np.random.seed(123)

    nobs = 50
    trend = np.arange(0, nobs)
    x = np.column_stack((np.ones(nobs), trend))
    y = 1000 + x[:, 1] + np.random.normal(0, 1, nobs)

    params = np.r_[999.82244338, 1.0077889]

    for method in ("bfgs", None):
        for model_cls in (OLS, GLM):
            # Changing the L1 weight from 0 to 1e-9 changes
            # the algorithm from scipy gradient optimization
            # to statsmodels coordinate descent
            for l1_weight in (0, 1e-9):
                model = model_cls(y, x)
                if model_cls is OLS:
                    fit = model.fit_regularized(alpha=0, L1_wt=l1_weight)
                else:
                    fit = model._fit_ridge(alpha=0, start_params=None,
                                           method=method)
                assert_allclose(params, fit.params, atol=1e-6, rtol=1e-6)
|
|
|
|
class TestRegularized:

    def test_regularized(self):
        """
        Compare elastic net fits with stored glmnet results.

        For each family, nine stored results (``rslt_<family>_<j>``) supply
        ``L1_wt``, ``alpha`` and the glmnet coefficients.  The statsmodels
        fit must be roughly equal to them and must reach a strictly higher
        penalized log-likelihood.
        """
        from .results import glmnet_r_results

        results_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "results")
        families = {"binomial": sm.families.Binomial,
                    "poisson": sm.families.Poisson}

        for dtype, family_cls in families.items():
            data = np.loadtxt(
                os.path.join(results_dir, "enet_%s.csv" % dtype),
                delimiter=",")
            endog, exog = data[:, 0], data[:, 1:]

            for j in range(9):
                r_result = getattr(glmnet_r_results,
                                   "rslt_%s_%d" % (dtype, j))
                L1_wt, alpha = r_result[0], r_result[1]
                params = r_result[2:]

                model = GLM(endog, exog, family=family_cls())
                sm_result = model.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                # Agreement is OK, see below for further check
                assert_allclose(params, sm_result.params,
                                atol=1e-2, rtol=0.3)

                # The penalized log-likelihood that we are maximizing.
                def plf(coefs):
                    ridge = (1 - L1_wt) * np.sum(coefs**2) / 2
                    lasso = L1_wt * np.sum(np.abs(coefs))
                    llf = model.loglike(coefs) / len(endog)
                    return llf - alpha * (ridge + lasso)

                # Confirm that we are doing better than glmnet.
                llf_r = plf(params)
                llf_sm = plf(sm_result.params)
                assert_equal(np.sign(llf_sm - llf_r), 1)
|
|
|
|
|
|
class TestConvergence:
    """
    Check that the iteration count reported by ``GLM.fit`` matches the
    point at which the recorded fit history meets the requested tolerances.
    """

    @classmethod
    def setup_class(cls):
        '''
        Test Binomial family with canonical logit link using star98 dataset.
        '''
        from statsmodels.datasets.star98 import load
        data = load()
        data.exog = add_constant(data.exog, prepend=False)
        cls.model = GLM(data.endog, data.exog,
                        family=sm.families.Binomial())

    def _when_converged(self, atol=1e-8, rtol=0, tol_criterion='deviance'):
        """
        Return the index of the first history entry whose successor is
        within tolerance of it.

        Parameters
        ----------
        atol, rtol : float
            Tolerances passed to ``np.allclose``.
        tol_criterion : str
            Key of ``self.res.fit_history`` to scan.

        Raises
        ------
        ValueError
            If no pair of consecutive entries is within tolerance.
        """
        history = self.res.fit_history[tol_criterion]
        # Pair each entry with its successor.  Stopping one short of the
        # end avoids indexing past the list, which previously raised
        # IndexError and made the ValueError below unreachable.
        for i, (orig, new) in enumerate(zip(history[:-1], history[1:])):
            if np.allclose(orig, new, atol=atol, rtol=rtol):
                return i
        raise ValueError('CONVERGENCE CHECK: It seems this doens\'t converge!')

    def _check_convergence(self, atol, rtol, tol_criterion='deviance'):
        """Fit with the given tolerances and verify the iteration count."""
        self.res = self.model.fit(atol=atol, rtol=rtol,
                                  tol_criterion=tol_criterion)
        expected_iterations = self._when_converged(
            atol=atol, rtol=rtol, tol_criterion=tol_criterion)
        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual "first
        # iteration"
        assert_equal(expected_iterations, actual_iterations)
        assert_equal(len(self.res.fit_history['deviance']) - 2,
                     actual_iterations)

    def test_convergence_atol_only(self):
        self._check_convergence(atol=1e-8, rtol=0)

    def test_convergence_rtol_only(self):
        self._check_convergence(atol=0, rtol=1e-8)

    def test_convergence_atol_rtol(self):
        self._check_convergence(atol=1e-8, rtol=1e-8)

    def test_convergence_atol_only_params(self):
        self._check_convergence(atol=1e-8, rtol=0, tol_criterion='params')

    def test_convergence_rtol_only_params(self):
        self._check_convergence(atol=0, rtol=1e-8, tol_criterion='params')

    def test_convergence_atol_rtol_params(self):
        self._check_convergence(atol=1e-8, rtol=1e-8,
                                tol_criterion='params')
|
|
|
|
|
|
def test_poisson_deviance():
    """
    Check that the Poisson deviance equals both the sum of squared deviance
    residuals and twice the log-likelihood difference to the saturated fit.

    see #3355 missing term in deviance if resid_response.sum() != 0
    """
    np.random.seed(123987)
    nobs, k_vars = 50, 3-1
    x = sm.add_constant(np.random.randn(nobs, k_vars))

    mu_true = np.exp(x.sum(1))
    y = np.random.poisson(mu_true, size=nobs)

    def check_deviance(exog):
        mod = sm.GLM(y, exog, family=sm.genmod.families.Poisson())
        res = mod.fit()

        d_i = res.resid_deviance
        d = res.deviance
        # Each case uses the family of its own model (the original reused
        # the first model's family for the second case).  The tiny offset
        # on y is kept from the original evaluation at mu = y.
        lr = (mod.family.loglike(y, y+1e-20) -
              mod.family.loglike(y, res.fittedvalues)) * 2

        assert_allclose(d, (d_i**2).sum(), rtol=1e-12)
        assert_allclose(d, lr, rtol=1e-12)

    # case with constant
    check_deviance(x)

    # case without constant, resid_response.sum() != 0
    check_deviance(x[:, 1:])
|
|
|
|
|
|
def test_non_invertible_hessian_fails_summary():
    """Test that the summary is still available when the hessian fails."""
    data = cpunish.load_pandas()

    # Overwrite the whole response with the constant 1.
    data.endog[:] = 1
    gamma = sm.families.Gamma()
    with warnings.catch_warnings():
        # we filter DomainWarning, the convergence problems
        # and warnings in summary
        warnings.simplefilter("ignore")
        model = sm.GLM(data.endog, data.exog, family=gamma)
        results = model.fit(maxiter=1, method='bfgs', max_start_irls=0)
        results.summary()
|
|
|
|
|
|
def test_int_scale():
    """GH-6627, make sure fitting works with an int scale."""
    longley_data = longley.load()
    model = GLM(longley_data.endog, longley_data.exog,
                family=sm.families.Gaussian())
    results = model.fit(scale=1)

    assert isinstance(results.params, pd.Series)
    # The integer scale must be stored as a float64 value.
    assert results.scale.dtype == np.float64
|
|
|
|
|
|
@pytest.mark.parametrize("dtype", [np.int8, np.int16, np.int32, np.int64])
def test_int_exog(dtype):
    """GH-6627, make use of floats internally for integer exog."""
    counts = [60, 30]
    exposure = np.asarray([51477.5, 54308.7])
    design = np.asarray([[1, 1], [1, 0]]).astype(dtype)

    model = GLM(counts, design, exposure=exposure,
                family=sm.families.Poisson())
    results = model.fit(method='bfgs', max_start_irls=0)
    assert isinstance(results.params, np.ndarray)
|
|
|
|
|
|
def test_glm_bic(iris):
    """
    Check ``bic`` under both settings of the ``SET_USE_BIC_LLF`` switch.

    With the switch set to True, ``bic`` and ``bic_llf`` must match the
    value from R's glm(); with it set to False, ``bic`` must equal
    ``bic_deviance``.
    """
    X = np.c_[np.ones(100), iris[50:, :4]]
    y = np.array(iris)[50:, 4].astype(np.int32)
    y -= 1
    SET_USE_BIC_LLF(True)
    try:
        model = GLM(y, X, family=sm.families.Binomial()).fit()
        # 34.9244 is what glm() of R yields
        assert_almost_equal(model.bic, 34.9244, decimal=3)
        assert_almost_equal(model.bic_llf, 34.9244, decimal=3)
        SET_USE_BIC_LLF(False)
        assert_almost_equal(model.bic, model.bic_deviance, decimal=3)
    finally:
        # The switch is module-wide state, so reset it even when an
        # assertion above fails; otherwise later tests would inherit it.
        SET_USE_BIC_LLF(None)
|
|
|
|
|
|
def test_glm_bic_warning(iris):
    """Accessing ``bic`` must emit a FutureWarning matching "The bic"."""
    features = iris[50:, :4]
    X = np.c_[np.ones(100), features]
    y = np.array(iris)[50:, 4].astype(np.int32)
    y -= 1

    results = GLM(y, X, family=sm.families.Binomial()).fit()
    with pytest.warns(FutureWarning, match="The bic"):
        assert isinstance(results.bic, float)
|
|
|
|
|
|
def test_output_exposure_null(reset_randomstate):
    """
    GH 6953: ``llnull`` of a model fitted with exposure must equal the
    log-likelihood of a constant-only model fitted with the same exposure.
    """
    x0 = [np.sin(i / 20) + 2 for i in range(1000)]
    rs = np.random.RandomState(0)
    # Variable exposures for each observation
    exposure = rs.randint(100, 200, size=1000)
    y = [np.sum(rs.poisson(rate, size=count))
         for rate, count in zip(x0, exposure)]
    x = add_constant(x0)
    constant_only = x[:, 0]

    def fit_poisson(exog, **model_kwargs):
        # Every fit gets its own Poisson family instance.
        model = GLM(endog=y, exog=exog, family=sm.families.Poisson(),
                    **model_kwargs)
        return model.fit()

    model = fit_poisson(x, exposure=exposure)
    null_model = fit_poisson(constant_only, exposure=exposure)
    null_model_without_exposure = fit_poisson(constant_only)

    assert_allclose(model.llnull, null_model.llf)
    # Check that they are different
    assert np.abs(null_model_without_exposure.llf - model.llnull) > 1
|
|
|
|
|
|
def test_qaic():
    """Check the QAIC information criterion with and without ``dk_params``."""
    # Example from documentation of R package MuMIn
    import patsy

    doses = np.arange(6)
    df = pd.DataFrame({
        "ldose": np.concatenate((doses, doses)),
        "sex": ["M"]*6 + ["F"]*6,
        "numdead": [10, 4, 9, 12, 18, 20, 0, 2, 6, 10, 12, 16],
    })
    df["numalive"] = 20 - df["numdead"]
    df["SF"] = df["numdead"]

    y = df[["numalive", "numdead"]].values
    x = patsy.dmatrix("sex*ldose", data=df, return_type='dataframe')
    results = GLM(y, x, family=sm.families.Binomial()).fit()
    scale = 2.412699

    # R gives 31.13266 because it uses a df that is 1 greater,
    # presumably because they count the scale parameter in df.
    # This won't matter when comparing models by differencing
    # QAICs.
    # Binomial doesn't have a scale parameter, so adding +1 is not correct.
    qaic = results.info_criteria(crit="qaic", scale=scale)
    assert_allclose(qaic, 29.13266, rtol=1e-5, atol=1e-5)

    # With one extra parameter counted, the value reported by R is matched.
    qaic1 = results.info_criteria(crit="qaic", scale=scale, dk_params=1)
    assert_allclose(qaic1, 31.13266, rtol=1e-5, atol=1e-5)
|
|
|
|
|
|
def test_tweedie_score():
|
|
|
|
np.random.seed(3242)
|
|
n = 500
|
|
x = np.random.normal(size=(n, 4))
|
|
lpr = np.dot(x, np.r_[1, -1, 0, 0.5])
|
|
mu = np.exp(lpr)
|
|
|
|
p0 = 1.5
|
|
lam = 10 * mu**(2 - p0) / (2 - p0)
|
|
alp = (2 - p0) / (p0 - 1)
|
|
bet = 10 * mu**(1 - p0) / (p0 - 1)
|
|
y = np.empty(n)
|
|
N = np.random.poisson(lam)
|
|
for i in range(n):
|
|
y[i] = np.random.gamma(alp, 1 / bet[i], N[i]).sum()
|
|
|
|
for eql in [True, False]:
|
|
for p in [1, 1.5, 2]:
|
|
if eql is False and SP_LT_17:
|
|
pytest.skip('skip, scipy too old, no bessel_wright')
|
|
|
|
fam = sm.families.Tweedie(var_power=p, eql=eql)
|
|
model = GLM(y, x, family=fam)
|
|
result = model.fit()
|
|
|
|
pa = result.params + 0.2*np.random.normal(size=result.params.size)
|
|
|
|
ngrad = approx_fprime_cs(pa, lambda x: model.loglike(x, scale=1))
|
|
agrad = model.score(pa, scale=1)
|
|
assert_allclose(ngrad, agrad, atol=1e-8, rtol=1e-8)
|
|
|
|
nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1))
|
|
ahess = model.hessian(pa, scale=1)
|
|
assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8)
|