"""
Test functions for models.regression
"""
# TODO: Test for LM
from statsmodels.compat.python import lrange
import warnings
import numpy as np
from numpy.testing import (
assert_,
assert_allclose,
assert_almost_equal,
assert_equal,
assert_raises,
)
import pandas as pd
import pytest
from scipy.linalg import toeplitz
from scipy.stats import t as student_t
from statsmodels.datasets import longley
from statsmodels.regression.linear_model import (
GLS,
OLS,
WLS,
burg,
yule_walker,
)
from statsmodels.tools.tools import add_constant
DECIMAL_7 = 7
DECIMAL_4 = 4
DECIMAL_3 = 3
DECIMAL_2 = 2
DECIMAL_1 = 1
DECIMAL_0 = 0
try:
import cvxopt # noqa:F401
has_cvxopt = True
except ImportError:
has_cvxopt = False
class CheckRegressionResults:
"""
res2 contains results from Rmodelwrap or were obtained from a statistical
packages such as R, Stata, or SAS and were written to model_results
"""
decimal_params = DECIMAL_4
def test_params(self):
assert_almost_equal(
self.res1.params, self.res2.params, self.decimal_params
)
decimal_standarderrors = DECIMAL_4
    def test_standarderrors(self):
        assert_allclose(
            self.res1.bse,
            self.res2.bse,
            rtol=10 ** -self.decimal_standarderrors,
        )
decimal_confidenceintervals = DECIMAL_4
def test_confidenceintervals(self):
        # NOTE: Stata rounds residuals (at least) to a fixed number of
        # significant digits, so compare with a relative tolerance
conf1 = self.res1.conf_int()
conf2 = self.res2.conf_int()
for i in range(len(conf1)):
assert_allclose(
conf1[i][0],
conf2[i][0],
rtol=10 ** -self.decimal_confidenceintervals,
)
assert_allclose(
conf1[i][1],
conf2[i][1],
rtol=10 ** -self.decimal_confidenceintervals,
)
decimal_conf_int_subset = DECIMAL_4
def test_conf_int_subset(self):
if len(self.res1.params) > 1:
with pytest.warns(FutureWarning, match="cols is"):
ci1 = self.res1.conf_int(cols=(1, 2))
ci2 = self.res1.conf_int()[1:3]
assert_almost_equal(ci1, ci2, self.decimal_conf_int_subset)
decimal_scale = DECIMAL_4
def test_scale(self):
assert_almost_equal(
self.res1.scale, self.res2.scale, self.decimal_scale
)
decimal_rsquared = DECIMAL_4
def test_rsquared(self):
assert_almost_equal(
self.res1.rsquared, self.res2.rsquared, self.decimal_rsquared
)
decimal_rsquared_adj = DECIMAL_4
def test_rsquared_adj(self):
assert_almost_equal(
self.res1.rsquared_adj,
self.res2.rsquared_adj,
self.decimal_rsquared_adj,
)
def test_degrees(self):
assert_equal(self.res1.model.df_model, self.res2.df_model)
assert_equal(self.res1.model.df_resid, self.res2.df_resid)
decimal_ess = DECIMAL_4
def test_ess(self):
# Explained Sum of Squares
assert_almost_equal(self.res1.ess, self.res2.ess, self.decimal_ess)
decimal_ssr = DECIMAL_4
def test_sumof_squaredresids(self):
assert_almost_equal(self.res1.ssr, self.res2.ssr, self.decimal_ssr)
    decimal_mse_resid = DECIMAL_4
    def test_mse_resid(self):
        # Mean squared error of residuals
        assert_almost_equal(
            self.res1.mse_resid, self.res2.mse_resid, self.decimal_mse_resid
        )
    decimal_mse_model = DECIMAL_4
    def test_mse_model(self):
        assert_almost_equal(
            self.res1.mse_model, self.res2.mse_model, self.decimal_mse_model
        )
decimal_mse_total = DECIMAL_4
def test_mse_total(self):
assert_almost_equal(
self.res1.mse_total,
self.res2.mse_total,
self.decimal_mse_total,
err_msg="Test class %s" % self,
)
decimal_fvalue = DECIMAL_4
def test_fvalue(self):
        # NOTE: left unchanged; it is unclear whether this should fail when
        # both fvalues are -inf (see the commented-out guard below)
        # if not (np.isinf(self.res1.fvalue) and np.isinf(self.res2.fvalue)):
assert_almost_equal(
self.res1.fvalue, self.res2.fvalue, self.decimal_fvalue
)
decimal_loglike = DECIMAL_4
def test_loglike(self):
assert_almost_equal(self.res1.llf, self.res2.llf, self.decimal_loglike)
decimal_aic = DECIMAL_4
def test_aic(self):
assert_almost_equal(self.res1.aic, self.res2.aic, self.decimal_aic)
# the following just checks the definition
aicc1 = self.res1.info_criteria("aicc")
k = self.res1.df_model + self.res1.model.k_constant
nobs = self.res1.model.nobs
aicc2 = self.res1.aic + 2 * (k**2 + k) / (nobs - k - 1)
assert_allclose(aicc1, aicc2, rtol=1e-10)
hqic1 = self.res1.info_criteria("hqic")
hqic2 = (self.res1.aic - 2 * k) + 2 * np.log(np.log(nobs)) * k
assert_allclose(hqic1, hqic2, rtol=1e-10)
decimal_bic = DECIMAL_4
def test_bic(self):
assert_almost_equal(self.res1.bic, self.res2.bic, self.decimal_bic)
decimal_pvalues = DECIMAL_4
def test_pvalues(self):
assert_almost_equal(
self.res1.pvalues, self.res2.pvalues, self.decimal_pvalues
)
decimal_wresid = DECIMAL_4
def test_wresid(self):
assert_almost_equal(
self.res1.wresid, self.res2.wresid, self.decimal_wresid
)
decimal_resids = DECIMAL_4
def test_resids(self):
assert_almost_equal(
self.res1.resid, self.res2.resid, self.decimal_resids
)
decimal_norm_resids = DECIMAL_4
def test_norm_resids(self):
assert_almost_equal(
self.res1.resid_pearson,
self.res2.resid_pearson,
self.decimal_norm_resids,
)
# TODO: test fittedvalues and what else?
class TestOLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
from .results.results_regression import Longley
data = longley.load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
exog = add_constant(exog, prepend=False)
res1 = OLS(endog, exog).fit()
res2 = Longley()
res2.wresid = res1.wresid # workaround hack
cls.res1 = res1
cls.res2 = res2
res_qr = OLS(endog, exog).fit(method="qr")
model_qr = OLS(endog, exog)
Q, R = np.linalg.qr(exog)
model_qr.exog_Q, model_qr.exog_R = Q, R
model_qr.normalized_cov_params = np.linalg.inv(np.dot(R.T, R))
model_qr.rank = np.linalg.matrix_rank(R)
res_qr2 = model_qr.fit(method="qr")
cls.res_qr = res_qr
cls.res_qr_manual = res_qr2
def test_eigenvalues(self):
eigenval_perc_diff = (
self.res_qr.eigenvals - self.res_qr_manual.eigenvals
)
eigenval_perc_diff /= self.res_qr.eigenvals
zeros = np.zeros_like(eigenval_perc_diff)
assert_almost_equal(eigenval_perc_diff, zeros, DECIMAL_7)
# Robust error tests. Compare values computed with SAS
def test_HC0_errors(self):
        # The comparison is split because the copied reference values do not
        # carry DECIMAL_4 digits for the last coefficient.
assert_almost_equal(
self.res1.HC0_se[:-1], self.res2.HC0_se[:-1], DECIMAL_4
)
assert_allclose(np.round(self.res1.HC0_se[-1]), self.res2.HC0_se[-1])
def test_HC1_errors(self):
assert_almost_equal(
self.res1.HC1_se[:-1], self.res2.HC1_se[:-1], DECIMAL_4
)
# Note: tolerance is tight; rtol=3e-7 fails while 4e-7 passes
assert_allclose(self.res1.HC1_se[-1], self.res2.HC1_se[-1], rtol=4e-7)
def test_HC2_errors(self):
assert_almost_equal(
self.res1.HC2_se[:-1], self.res2.HC2_se[:-1], DECIMAL_4
)
# Note: tolerance is tight; rtol=4e-7 fails while 5e-7 passes
assert_allclose(self.res1.HC2_se[-1], self.res2.HC2_se[-1], rtol=5e-7)
def test_HC3_errors(self):
assert_almost_equal(
self.res1.HC3_se[:-1], self.res2.HC3_se[:-1], DECIMAL_4
)
# Note: tolerance is tight; rtol=1e-7 fails while 1.5e-7 passes
assert_allclose(
self.res1.HC3_se[-1], self.res2.HC3_se[-1], rtol=1.5e-7
)
def test_qr_params(self):
assert_almost_equal(self.res1.params, self.res_qr.params, 6)
def test_qr_normalized_cov_params(self):
        # TODO: compare directly with assert_allclose
assert_almost_equal(
np.ones_like(self.res1.normalized_cov_params),
self.res1.normalized_cov_params
/ self.res_qr.normalized_cov_params,
5,
)
def test_missing(self):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
data.endog[[3, 7, 14]] = np.nan
mod = OLS(data.endog, data.exog, missing="drop")
assert_equal(mod.endog.shape[0], 13)
assert_equal(mod.exog.shape[0], 13)
def test_rsquared_adj_overfit(self):
        # Test that if df_resid = 0, rsquared_adj is nan.
# This is a regression test for user issue:
# https://github.com/statsmodels/statsmodels/issues/868
with warnings.catch_warnings(record=True):
x = np.random.randn(5)
y = np.random.randn(5, 6)
results = OLS(x, y).fit()
rsquared_adj = results.rsquared_adj
assert_equal(rsquared_adj, np.nan)
def test_qr_alternatives(self):
assert_allclose(
self.res_qr.params, self.res_qr_manual.params, rtol=5e-12
)
def test_norm_resid(self):
resid = self.res1.wresid
norm_resid = resid / np.sqrt(np.sum(resid ** 2.0) / self.res1.df_resid)
model_norm_resid = self.res1.resid_pearson
assert_almost_equal(model_norm_resid, norm_resid, DECIMAL_7)
def test_summary_slim(self):
# check that slim summary is smaller, does not verify content
with warnings.catch_warnings():
msg = "kurtosistest only valid for n>=20"
warnings.filterwarnings("ignore", message=msg,
category=UserWarning)
summ = self.res1.summary(slim=True)
assert len(summ.tables) == 2
assert len(str(summ)) < 6700
def test_norm_resid_zero_variance(self):
with warnings.catch_warnings(record=True):
y = self.res1.model.endog
res = OLS(y, y).fit()
assert_allclose(res.scale, 0, atol=1e-20)
assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)
class TestRTO(CheckRegressionResults):
@classmethod
def setup_class(cls):
from .results.results_regression import LongleyRTO
data = longley.load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
res1 = OLS(endog, exog).fit()
res2 = LongleyRTO()
res2.wresid = res1.wresid # workaround hack
cls.res1 = res1
cls.res2 = res2
res_qr = OLS(endog, exog).fit(method="qr")
cls.res_qr = res_qr
class TestFtest:
"""
    Tests f_test against the overall F statistic from RegressionResults.
"""
@classmethod
def setup_class(cls):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = OLS(data.endog, data.exog).fit()
R = np.identity(7)[:-1, :]
cls.Ftest = cls.res1.f_test(R)
def test_F(self):
assert_almost_equal(self.Ftest.fvalue, self.res1.fvalue, DECIMAL_4)
def test_p(self):
assert_almost_equal(self.Ftest.pvalue, self.res1.f_pvalue, DECIMAL_4)
def test_Df_denom(self):
assert_equal(self.Ftest.df_denom, self.res1.model.df_resid)
def test_Df_num(self):
assert_equal(self.Ftest.df_num, 6)
class TestFTest2:
"""
A joint test that the coefficient on
GNP = the coefficient on UNEMP and that the coefficient on
POP = the coefficient on YEAR for the Longley dataset.
Ftest1 is from statsmodels. Results are from Rpy using R's car library.
"""
@classmethod
def setup_class(cls):
data = longley.load()
columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)]
data.exog.columns = columns
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]]
cls.Ftest1 = res1.f_test(R2)
hyp = "x2 = x3, x5 = x6"
cls.NewFtest1 = res1.f_test(hyp)
def test_new_ftest(self):
assert_equal(self.NewFtest1.fvalue, self.Ftest1.fvalue)
def test_fvalue(self):
assert_almost_equal(self.Ftest1.fvalue, 9.7404618732968196, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(
self.Ftest1.pvalue, 0.0056052885317493459, DECIMAL_4
)
def test_df_denom(self):
assert_equal(self.Ftest1.df_denom, 9)
def test_df_num(self):
assert_equal(self.Ftest1.df_num, 2)
class TestFtestQ:
"""
A joint hypothesis test that Rb = q. Coefficient tests are essentially
made up. Test values taken from Stata.
"""
@classmethod
def setup_class(cls):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
R = np.array(
[
[0, 1, 1, 0, 0, 0, 0],
[0, 1, 0, 1, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 1, 0],
]
)
q = np.array([0, 0, 0, 1, 0])
cls.Ftest1 = res1.f_test((R, q))
def test_fvalue(self):
assert_almost_equal(self.Ftest1.fvalue, 70.115557, 5)
def test_pvalue(self):
assert_almost_equal(self.Ftest1.pvalue, 6.229e-07, 10)
def test_df_denom(self):
assert_equal(self.Ftest1.df_denom, 9)
def test_df_num(self):
assert_equal(self.Ftest1.df_num, 5)
class TestTtest:
"""
    Test individual t-tests, i.e., whether the coefficients are
    significantly different from zero.
"""
@classmethod
def setup_class(cls):
data = longley.load()
columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)]
data.exog.columns = columns
data.exog = add_constant(data.exog, prepend=False)
cls.res1 = OLS(data.endog, data.exog).fit()
R = np.identity(7)
cls.Ttest = cls.res1.t_test(R)
hyp = "x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0"
cls.NewTTest = cls.res1.t_test(hyp)
def test_new_tvalue(self):
assert_equal(self.NewTTest.tvalue, self.Ttest.tvalue)
def test_tvalue(self):
assert_almost_equal(self.Ttest.tvalue, self.res1.tvalues, DECIMAL_4)
def test_sd(self):
assert_almost_equal(self.Ttest.sd, self.res1.bse, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(
self.Ttest.pvalue,
student_t.sf(np.abs(self.res1.tvalues), self.res1.model.df_resid)
* 2,
DECIMAL_4,
)
def test_df_denom(self):
assert_equal(self.Ttest.df_denom, self.res1.model.df_resid)
def test_effect(self):
assert_almost_equal(self.Ttest.effect, self.res1.params)
class TestTtest2:
"""
Tests the hypothesis that the coefficients on POP and YEAR
are equal.
Results from RPy using 'car' package.
"""
@classmethod
def setup_class(cls):
R = np.zeros(7)
R[4:6] = [1, -1]
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
res1 = OLS(data.endog, data.exog).fit()
cls.Ttest1 = res1.t_test(R)
def test_tvalue(self):
assert_almost_equal(self.Ttest1.tvalue, -4.0167754636397284, DECIMAL_4)
def test_sd(self):
assert_almost_equal(self.Ttest1.sd, 455.39079425195314, DECIMAL_4)
def test_pvalue(self):
assert_almost_equal(
self.Ttest1.pvalue, 2 * 0.0015163772380932246, DECIMAL_4
)
def test_df_denom(self):
assert_equal(self.Ttest1.df_denom, 9)
def test_effect(self):
assert_almost_equal(self.Ttest1.effect, -1829.2025687186533, DECIMAL_4)
class TestGLS:
"""
These test results were obtained by replication with R.
"""
@classmethod
def setup_class(cls):
from .results.results_regression import LongleyGls
data = longley.load()
exog = add_constant(
np.column_stack((data.exog.iloc[:, 1], data.exog.iloc[:, 4])),
prepend=False,
)
tmp_results = OLS(data.endog, exog).fit()
        # lag-1 residual autocorrelation, used as the AR(1) rho by assumption
        rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][1]
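        # build sigma[i, j] = rho ** |i - j|, the AR(1) error correlation
        # matrix for the 16 Longley observations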
order = toeplitz(np.arange(16))
sigma = rho ** order
GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
cls.res1 = GLS_results
cls.res2 = LongleyGls()
# attach for test_missing
cls.sigma = sigma
cls.exog = exog
cls.endog = data.endog
def test_aic(self):
# Note: tolerance is tight; rtol=3e-3 fails while 4e-3 passes
assert_allclose(self.res1.aic + 2, self.res2.aic, rtol=4e-3)
def test_bic(self):
# Note: tolerance is tight; rtol=1e-2 fails while 1.5e-2 passes
assert_allclose(self.res1.bic, self.res2.bic, rtol=1.5e-2)
def test_loglike(self):
assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_0)
def test_params(self):
assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_1)
def test_resid(self):
assert_almost_equal(self.res1.resid, self.res2.resid, DECIMAL_4)
def test_scale(self):
assert_almost_equal(self.res1.scale, self.res2.scale, DECIMAL_4)
def test_tvalues(self):
assert_almost_equal(self.res1.tvalues, self.res2.tvalues, DECIMAL_4)
def test_standarderrors(self):
assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4)
def test_fittedvalues(self):
assert_almost_equal(
self.res1.fittedvalues, self.res2.fittedvalues, DECIMAL_4
)
def test_pvalues(self):
assert_almost_equal(self.res1.pvalues, self.res2.pvalues, DECIMAL_4)
def test_missing(self):
        endog = self.endog.copy()  # copy so other tests do not see the NaNs
endog[[4, 7, 14]] = np.nan
mod = GLS(endog, self.exog, sigma=self.sigma, missing="drop")
assert_equal(mod.endog.shape[0], 13)
assert_equal(mod.exog.shape[0], 13)
assert_equal(mod.sigma.shape, (13, 13))
class TestGLS_alt_sigma(CheckRegressionResults):
"""
Test that GLS with no argument is equivalent to OLS.
"""
@classmethod
def setup_class(cls):
data = longley.load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
exog = add_constant(exog, prepend=False)
ols_res = OLS(endog, exog).fit()
gls_res = GLS(endog, exog).fit()
gls_res_scalar = GLS(endog, exog, sigma=1)
cls.endog = endog
cls.exog = exog
cls.res1 = gls_res
cls.res2 = ols_res
cls.res3 = gls_res_scalar
# self.res2.conf_int = self.res2.conf_int()
def test_wrong_size_sigma_1d(self):
n = len(self.endog)
assert_raises(
ValueError, GLS, self.endog, self.exog, sigma=np.ones(n - 1)
)
def test_wrong_size_sigma_2d(self):
n = len(self.endog)
assert_raises(
ValueError,
GLS,
self.endog,
self.exog,
sigma=np.ones((n - 1, n - 1)),
)
@pytest.mark.skip("Test does not raise but should")
def test_singular_sigma(self):
n = len(self.endog)
sigma = np.ones((n, n)) + np.diag(np.ones(n))
sigma[0, 1] = sigma[1, 0] = 2
assert np.linalg.matrix_rank(sigma) == n - 1
with pytest.raises(np.linalg.LinAlgError):
GLS(self.endog, self.exog, sigma=sigma)
# FIXME: do not leave commented-out, use or move/remove
# def check_confidenceintervals(self, conf1, conf2):
# assert_almost_equal(conf1, conf2, DECIMAL_4)
class TestLM:
@classmethod
def setup_class(cls):
# TODO: Test HAC method
rs = np.random.RandomState(1234)
x = rs.randn(100, 3)
b = np.ones((3, 1))
e = rs.randn(100, 1)
y = np.dot(x, b) + e
        # Cases covered: homoskedastic (nonrobust) and HC0-robust covariance
cls.res1_full = OLS(y, x).fit()
cls.res1_restricted = OLS(y, x[:, 0]).fit()
cls.res2_full = cls.res1_full.get_robustcov_results("HC0")
cls.res2_restricted = cls.res1_restricted.get_robustcov_results("HC0")
cls.x = x
cls.Y = y
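    # Each LM test below computes the average score s = mean(x * resid) under
    # the restricted model and an estimate S of its covariance, then checks
    # that n * s' S^{-1} s matches compare_lm_test; under H0 the statistic is
    # asymptotically chi2 with df equal to the number of restrictions.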
def test_LM_homoskedastic(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
x = self.x
S = np.dot(resid, resid) / n * np.dot(x.T, x) / n
Sinv = np.linalg.inv(S)
s = np.mean(x * resid[:, None], 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res1_full.compare_lm_test(self.res1_restricted)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_nodemean(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
x = self.x
scores = x * resid[:, None]
S = np.dot(scores.T, scores) / n
Sinv = np.linalg.inv(S)
s = np.mean(scores, 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(
self.res2_restricted, demean=False
)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_demean(self):
resid = self.res1_restricted.wresid
n = resid.shape[0]
x = self.x
scores = x * resid[:, None]
scores_demean = scores - scores.mean(0)
S = np.dot(scores_demean.T, scores_demean) / n
Sinv = np.linalg.inv(S)
s = np.mean(scores, 0)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_heteroskedastic_LRversion(self):
resid = self.res1_restricted.wresid
resid_full = self.res1_full.wresid
n = resid.shape[0]
x = self.x
scores = x * resid[:, None]
s = np.mean(scores, 0)
scores = x * resid_full[:, None]
S = np.dot(scores.T, scores) / n
Sinv = np.linalg.inv(S)
LMstat = n * np.dot(np.dot(s, Sinv), s.T)
LMstat_OLS = self.res2_full.compare_lm_test(
self.res2_restricted, use_lr=True
)
LMstat2 = LMstat_OLS[0]
assert_almost_equal(LMstat, LMstat2, DECIMAL_7)
def test_LM_nonnested(self):
assert_raises(
ValueError, self.res2_restricted.compare_lm_test, self.res2_full
)
class TestOLS_GLS_WLS_equivalence:
@classmethod
def setup_class(cls):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
y = data.endog
x = data.exog
n = y.shape[0]
w = np.ones(n)
cls.results = []
cls.results.append(OLS(y, x).fit())
cls.results.append(WLS(y, x, w).fit())
# scaling weights does not change main results (except scale)
cls.results.append(GLS(y, x, 100 * w).fit())
cls.results.append(GLS(y, x, np.diag(0.1 * w)).fit())
def test_ll(self):
llf = np.array([r.llf for r in self.results])
llf_1 = np.ones_like(llf) * self.results[0].llf
assert_almost_equal(llf, llf_1, DECIMAL_7)
ic = np.array([r.aic for r in self.results])
ic_1 = np.ones_like(ic) * self.results[0].aic
assert_almost_equal(ic, ic_1, DECIMAL_7)
ic = np.array([r.bic for r in self.results])
ic_1 = np.ones_like(ic) * self.results[0].bic
assert_almost_equal(ic, ic_1, DECIMAL_7)
def test_params(self):
params = np.array([r.params for r in self.results])
params_1 = np.array([self.results[0].params] * len(self.results))
assert_allclose(params, params_1)
def test_ss(self):
bse = np.array([r.bse for r in self.results])
bse_1 = np.array([self.results[0].bse] * len(self.results))
assert_allclose(bse, bse_1)
def test_rsquared(self):
rsquared = np.array([r.rsquared for r in self.results])
rsquared_1 = np.array([self.results[0].rsquared] * len(self.results))
assert_almost_equal(rsquared, rsquared_1, DECIMAL_7)
class TestGLS_WLS_equivalence(TestOLS_GLS_WLS_equivalence):
# reuse test methods
@classmethod
def setup_class(cls):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
y = data.endog
x = data.exog
n = y.shape[0]
np.random.seed(5)
w = np.random.uniform(0.5, 1, n)
w_inv = 1.0 / w
cls.results = []
cls.results.append(WLS(y, x, w).fit())
# scaling weights does not change main results (except scale)
cls.results.append(WLS(y, x, 0.01 * w).fit())
cls.results.append(GLS(y, x, 100 * w_inv).fit())
cls.results.append(GLS(y, x, np.diag(0.1 * w_inv)).fit())
class TestNonFit:
@classmethod
def setup_class(cls):
data = longley.load()
data.exog = add_constant(data.exog, prepend=False)
cls.endog = data.endog
cls.exog = data.exog
cls.ols_model = OLS(data.endog, data.exog)
    def test_df_resid(self):
        df_resid = self.endog.shape[0] - self.exog.shape[1]
        assert_equal(df_resid, 9)
        assert_equal(self.ols_model.df_resid, df_resid)
class TestWLS_CornerCases:
@classmethod
def setup_class(cls):
cls.exog = np.ones((1,))
cls.endog = np.ones((1,))
weights = 1
cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit()
def test_wrong_size_weights(self):
weights = np.ones((10, 10))
assert_raises(ValueError, WLS, self.endog, self.exog, weights=weights)
class TestWLSExogWeights(CheckRegressionResults):
# Test WLS with Greene's credit card data
# reg avgexp age income incomesq ownrent [aw=1/incomesq]
@classmethod
def setup_class(cls):
from statsmodels.datasets.ccard import load
from .results.results_regression import CCardWLS
dta = load()
endog = np.asarray(dta.endog)
exog = np.asarray(dta.exog)
exog = add_constant(exog, prepend=False)
nobs = 72.0
weights = 1 / exog[:, 2]
# for comparison with stata analytic weights
scaled_weights = (weights * nobs) / weights.sum()
cls.res1 = WLS(endog, exog, weights=scaled_weights).fit()
cls.res2 = CCardWLS()
cls.res2.wresid = scaled_weights ** 0.5 * cls.res2.resid
        # correct the reference values: statsmodels and Stata use different
        # definitions of the WLS loglikelihood (llf)
corr_ic = 2 * (cls.res1.llf - cls.res2.llf)
cls.res2.aic -= corr_ic
cls.res2.bic -= corr_ic
cls.res2.llf += 0.5 * np.sum(np.log(cls.res1.model.weights))
def test_wls_example():
    # example from the WLS docstring; the bug noted there has been fixed
Y = [1, 3, 4, 5, 2, 3, 4]
x = lrange(1, 8)
x = add_constant(x, prepend=False)
wls_model = WLS(Y, x, weights=lrange(1, 8)).fit()
# taken from R lm.summary
assert_almost_equal(wls_model.fvalue, 0.127337843215, 6)
assert_almost_equal(wls_model.scale, 2.44608530786 ** 2, 6)
def test_wls_tss():
y = np.array([22, 22, 22, 23, 23, 23])
x = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]]
ols_mod = OLS(y, add_constant(x, prepend=False)).fit()
yw = np.array([22, 22, 23.0])
Xw = [[1, 0], [1, 1], [0, 1]]
w = np.array([2, 1, 3.0])
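    # yw, Xw, w collapse the duplicated rows of y and x above, so the
    # weighted model must reproduce the OLS centered total sum of squares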
wls_mod = WLS(yw, add_constant(Xw, prepend=False), weights=w).fit()
assert_equal(ols_mod.centered_tss, wls_mod.centered_tss)
class TestWLSScalarVsArray(CheckRegressionResults):
@classmethod
def setup_class(cls):
from statsmodels.datasets.longley import load
dta = load()
endog = np.asarray(dta.endog)
exog = np.asarray(dta.exog)
exog = add_constant(exog, prepend=True)
wls_scalar = WLS(endog, exog, weights=1.0 / 3).fit()
weights = [1 / 3.0] * len(endog)
wls_array = WLS(endog, exog, weights=weights).fit()
cls.res1 = wls_scalar
cls.res2 = wls_array
class TestWLS_GLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
from statsmodels.datasets.ccard import load
data = load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
sigma = exog[:, 2]
cls.res1 = WLS(endog, exog, weights=1 / sigma).fit()
cls.res2 = GLS(endog, exog, sigma=sigma).fit()
def check_confidenceintervals(self, conf1, conf2): # FIXME: never called
assert_almost_equal(conf1, conf2(), DECIMAL_4)
def test_wls_missing():
from statsmodels.datasets.ccard import load
data = load()
    endog = data.endog
    # setting NaNs mutates data.endog in place; the model below relies on it
    endog[[10, 25]] = np.nan
mod = WLS(
data.endog, data.exog, weights=1 / data.exog.iloc[:, 2], missing="drop"
)
assert_equal(mod.endog.shape[0], 70)
assert_equal(mod.exog.shape[0], 70)
assert_equal(mod.weights.shape[0], 70)
class TestWLS_OLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
data = longley.load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
exog = add_constant(exog, prepend=False)
cls.res1 = OLS(endog, exog).fit()
cls.res2 = WLS(endog, exog).fit()
def check_confidenceintervals(self, conf1, conf2): # FIXME: never called
assert_almost_equal(conf1, conf2(), DECIMAL_4)
class TestGLS_OLS(CheckRegressionResults):
@classmethod
def setup_class(cls):
data = longley.load()
endog = np.asarray(data.endog)
exog = np.asarray(data.exog)
exog = add_constant(exog, prepend=False)
cls.res1 = GLS(endog, exog).fit()
cls.res2 = OLS(endog, exog).fit()
def check_confidenceintervals(self, conf1, conf2): # FIXME: never called
assert_almost_equal(conf1, conf2(), DECIMAL_4)
# FIXME: do not leave this commented-out sitting here
# TODO: test AR
# why the two-stage in AR?
# class TestAR:
# from statsmodels.datasets.sunspots import load
# data = load()
# model = AR(data.endog, rho=4).fit()
# R_res = RModel(data.endog, aic="FALSE", order_max=4)#
# def test_params(self):
# assert_almost_equal(self.model.rho,
# pass
# def test_order(self):
# In R this can be defined or chosen by minimizing the AIC if aic=True
# pass
class TestYuleWalker:
@classmethod
def setup_class(cls):
from statsmodels.datasets.sunspots import load
data = load()
cls.rho, cls.sigma = yule_walker(data.endog, order=4, method="mle")
cls.R_params = [
1.2831003105694765,
-0.45240924374091945,
-0.20770298557575195,
0.047943648089542337,
]
def test_params(self):
assert_almost_equal(self.rho, self.R_params, DECIMAL_4)
class TestDataDimensions(CheckRegressionResults):
@classmethod
def setup_class(cls):
np.random.seed(54321)
cls.endog_n_ = np.random.uniform(0, 20, size=30)
cls.endog_n_one = cls.endog_n_[:, None]
cls.exog_n_ = np.random.uniform(0, 20, size=30)
cls.exog_n_one = cls.exog_n_[:, None]
cls.degen_exog = cls.exog_n_one[:-1]
cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one)
cls.mod1.df_model += 1
cls.res1 = cls.mod1.fit()
        # NOTE: these are re-created for every subclass, which adds a little
        # overhead
cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
def check_confidenceintervals(self, conf1, conf2): # FIXME: never called
assert_almost_equal(conf1, conf2(), DECIMAL_4)
class TestGLS_large_data(TestDataDimensions):
@classmethod
def setup_class(cls):
super().setup_class()
nobs = 1000
y = np.random.randn(nobs, 1)
x = np.random.randn(nobs, 20)
sigma = np.ones_like(y)
cls.gls_res = GLS(y, x, sigma=sigma).fit()
cls.gls_res_scalar = GLS(y, x, sigma=1).fit()
cls.gls_res_none = GLS(y, x).fit()
cls.ols_res = OLS(y, x).fit()
def test_large_equal_params(self):
assert_almost_equal(
self.ols_res.params, self.gls_res.params, DECIMAL_7
)
def test_large_equal_loglike(self):
assert_almost_equal(self.ols_res.llf, self.gls_res.llf, DECIMAL_7)
def test_large_equal_params_none(self):
assert_almost_equal(
self.gls_res.params, self.gls_res_none.params, DECIMAL_7
)
class TestNxNx(TestDataDimensions):
@classmethod
def setup_class(cls):
super().setup_class()
cls.mod2 = OLS(cls.endog_n_, cls.exog_n_)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
class TestNxOneNx(TestDataDimensions):
@classmethod
def setup_class(cls):
super().setup_class()
cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
class TestNxNxOne(TestDataDimensions):
@classmethod
def setup_class(cls):
super().setup_class()
cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one)
cls.mod2.df_model += 1
cls.res2 = cls.mod2.fit()
def test_bad_size():
np.random.seed(54321)
data = np.random.uniform(0, 20, 31)
assert_raises(ValueError, OLS, data, data[1:])
def test_const_indicator():
rs = np.random.RandomState(12345)
x = rs.randint(0, 3, size=30)
x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False,
dtype=float)
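    # the three dummy columns sum to one in every row, so the design spans a
    # constant; hasconst=True tells OLS to treat it as such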
y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30)
resc = OLS(y, add_constant(x.iloc[:, 1:], prepend=True)).fit()
res = OLS(y, x, hasconst=True).fit()
assert_almost_equal(resc.rsquared, res.rsquared, 12)
assert res.model.data.k_constant == 1
assert resc.model.data.k_constant == 1
def test_fvalue_const_only():
rs = np.random.RandomState(12345)
x = rs.randint(0, 3, size=30)
x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False,
dtype=float)
x[x.columns[0]] = 1
y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30)
res = OLS(y, x, hasconst=True).fit(cov_type="HC1")
assert not np.isnan(res.fvalue)
assert isinstance(res.fvalue, float)
assert isinstance(res.f_pvalue, float)
def test_conf_int_single_regressor():
    # GH#706: a single-regressor model (i.e. no intercept) with 1D exog
    # should still produce a DataFrame from conf_int
y = pd.Series(np.random.randn(10))
x = pd.Series(np.ones(10))
res = OLS(y, x).fit()
conf_int = res.conf_int()
np.testing.assert_equal(conf_int.shape, (1, 2))
np.testing.assert_(isinstance(conf_int, pd.DataFrame))
def test_summary_as_latex():
# GH#734
import re
dta = longley.load_pandas()
x = dta.exog
x["constant"] = 1
y = dta.endog
res = OLS(y, x).fit()
with pytest.warns(UserWarning):
table = res.summary().as_latex()
# replace the date and time
table = re.sub(
"(?<=\n\\\\textbf\\{Date:\\} &).+?&",
" Sun, 07 Apr 2013 &",
table,
)
table = re.sub(
"(?<=\n\\\\textbf\\{Time:\\} &).+?&",
" 13:46:07 &",
table,
)
expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\
\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\
\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\
\\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\
\\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\
\\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\
\\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\
\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\
\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
& \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\
\\midrule
\\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\
\\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\
\\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\
\\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\
\\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\
\\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\
\\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\
\\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\
\\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\
\\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}
Notes: \\newline
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
[2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
strong multicollinearity or other numerical problems."""
assert_equal(table, expected)
class TestRegularizedFit:
# Make sure there are no problems when no variables are selected.
def test_empty_model(self):
np.random.seed(742)
n = 100
endog = np.random.normal(size=n)
exog = np.random.normal(size=(n, 3))
for cls in OLS, WLS, GLS:
model = cls(endog, exog)
result = model.fit_regularized(alpha=1000)
assert_equal(result.params, 0.0)
def test_regularized(self):
import os
from .results import glmnet_r_results
cur_dir = os.path.dirname(os.path.abspath(__file__))
data = np.loadtxt(
os.path.join(cur_dir, "results", "lasso_data.csv"), delimiter=","
)
tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]
for test in tests:
vec = getattr(glmnet_r_results, test)
n = vec[0]
p = vec[1]
L1_wt = float(vec[2])
lam = float(vec[3])
params = vec[4:].astype(np.float64)
endog = data[0 : int(n), 0]
exog = data[0 : int(n), 1 : (int(p) + 1)]
endog = endog - endog.mean()
endog /= endog.std(ddof=1)
exog = exog - exog.mean(0)
exog /= exog.std(0, ddof=1)
for cls in OLS, WLS, GLS:
mod = cls(endog, exog)
rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
assert_almost_equal(rslt.params, params, decimal=3)
# Smoke test for profile likelihood
mod.fit_regularized(L1_wt=L1_wt, alpha=lam, profile_scale=True)
def test_regularized_weights(self):
np.random.seed(1432)
exog1 = np.random.normal(size=(100, 3))
endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
exog2 = np.random.normal(size=(100, 3))
endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)
exog_a = np.vstack((exog1, exog1, exog2))
endog_a = np.concatenate((endog1, endog1, endog2))
# Should be equivalent to exog_a, endog_a.
exog_b = np.vstack((exog1, exog2))
endog_b = np.concatenate((endog1, endog2))
wgts = np.ones(200)
wgts[0:100] = 2
sigma = np.diag(1 / wgts)
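        # weight 2 on the first block is equivalent to stacking those rows
        # twice, so the weighted fits should match the fit on (endog_a, exog_a)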
for L1_wt in 0, 0.5, 1:
for alpha in 0, 1:
mod1 = OLS(endog_a, exog_a)
rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod2 = WLS(endog_b, exog_b, weights=wgts)
rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod3 = GLS(endog_b, exog_b, sigma=sigma)
rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)
assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
def test_regularized_weights_list(self):
np.random.seed(132)
exog1 = np.random.normal(size=(100, 3))
endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
exog2 = np.random.normal(size=(100, 3))
endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)
exog_a = np.vstack((exog1, exog1, exog2))
endog_a = np.concatenate((endog1, endog1, endog2))
# Should be equivalent to exog_a, endog_a.
exog_b = np.vstack((exog1, exog2))
endog_b = np.concatenate((endog1, endog2))
wgts = np.ones(200)
wgts[0:100] = 2
sigma = np.diag(1 / wgts)
for L1_wt in 0, 0.5, 1:
for alpha_element in 0, 1:
alpha = [
alpha_element,
] * 3
mod1 = OLS(endog_a, exog_a)
rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod2 = WLS(endog_b, exog_b, weights=wgts)
rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)
mod3 = GLS(endog_b, exog_b, sigma=sigma)
rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)
assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
def test_formula_missing_cat():
# gh-805
from patsy import PatsyError
import statsmodels.api as sm
from statsmodels.formula.api import ols
dta = sm.datasets.grunfeld.load_pandas().data
dta.loc[dta.index[0], "firm"] = np.nan
mod = ols(
formula="value ~ invest + capital + firm + year", data=dta.dropna()
)
res = mod.fit()
mod2 = ols(formula="value ~ invest + capital + firm + year", data=dta)
res2 = mod2.fit()
assert_almost_equal(res.params.values, res2.params.values)
assert_raises(
PatsyError,
ols,
"value ~ invest + capital + firm + year",
data=dta,
missing="raise",
)
def test_missing_formula_predict():
    # see GH#2171
nsample = 30
data = np.linspace(0, 10, nsample)
null = np.array([np.nan])
data = pd.DataFrame({"x": np.concatenate((data, null))})
beta = np.array([1, 0.1])
e = np.random.normal(size=nsample + 1)
data["y"] = beta[0] + beta[1] * data["x"] + e
model = OLS.from_formula("y ~ x", data=data)
fit = model.fit()
fit.predict(exog=data[:-1])
def test_fvalue_implicit_constant():
    # if the constant is implicit, fvalue should be nan, see GH#2444
nobs = 100
np.random.seed(2)
x = np.random.randn(nobs, 1)
x = ((x > 0) == [True, False]).astype(int)
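    # the two indicator columns are complementary and sum to one in every
    # row, so the design contains an implicit constant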
y = x.sum(1) + np.random.randn(nobs)
res = OLS(y, x).fit(cov_type="HC1")
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
res = WLS(y, x).fit(cov_type="HC1")
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
def test_fvalue_only_constant():
    # if the model contains only a constant, fvalue should be nan, see GH#3642
nobs = 20
np.random.seed(2)
x = np.ones(nobs)
y = np.random.randn(nobs)
res = OLS(y, x).fit(cov_type="hac", cov_kwds={"maxlags": 3})
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
res = WLS(y, x).fit(cov_type="HC1")
assert_(np.isnan(res.fvalue))
assert_(np.isnan(res.f_pvalue))
res.summary()
def test_ridge():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
v = np.ones(p)
v[0] = 0
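    # penalties tried: scalar, uniform vector, and a per-coefficient vector
    # that leaves the first coefficient unpenalized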
for a in (0, 1, 10):
for alpha in (a, a * np.ones(p), a * v):
model1 = OLS(yvec, xmat)
result1 = model1._fit_ridge(alpha=alpha)
model2 = OLS(yvec, xmat)
result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
assert_allclose(result1.params, result2.params)
model3 = OLS(yvec, xmat)
result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
assert_allclose(result1.params, result3.params)
fv1 = result1.fittedvalues
fv2 = np.dot(xmat, result1.params)
assert_allclose(fv1, fv2)
def test_regularized_refit():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
# covariates 0 and 2 matter
yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
model1 = OLS(yvec, xmat)
result1 = model1.fit_regularized(alpha=2.0, L1_wt=0.5, refit=True)
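    # refit=True re-estimates the unpenalized model on the selected support,
    # so the nonzero coefficients and bse should match OLS on columns 0 and 2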
model2 = OLS(yvec, xmat[:, [0, 2]])
result2 = model2.fit()
ii = [0, 2]
assert_allclose(result1.params[ii], result2.params)
assert_allclose(result1.bse[ii], result2.bse)
def test_regularized_predict():
# this also compares WLS with GLS
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
wgt = np.random.uniform(1, 2, n)
model_wls = WLS(yvec, xmat, weights=wgt)
    # TODO: params is not the same in GLS if sigma=1 / wgt, i.e. 1-dim, GH#7755
model_gls1 = GLS(yvec, xmat, sigma=np.diag(1 / wgt))
model_gls2 = GLS(yvec, xmat, sigma=1 / wgt)
res = []
for model1 in [model_wls, model_gls1, model_gls2]:
result1 = model1.fit_regularized(alpha=20.0, L1_wt=0.5, refit=True)
res.append(result1)
params = result1.params
fittedvalues = np.dot(xmat, params)
pr = model1.predict(result1.params)
assert_allclose(fittedvalues, pr)
assert_allclose(result1.fittedvalues, pr)
pr = result1.predict()
assert_allclose(fittedvalues, pr)
assert_allclose(res[0].model.wendog, res[1].model.wendog, rtol=1e-10)
assert_allclose(res[0].model.wexog, res[1].model.wexog, rtol=1e-10)
assert_allclose(res[0].fittedvalues, res[1].fittedvalues, rtol=1e-10)
assert_allclose(res[0].params, res[1].params, rtol=1e-10)
assert_allclose(res[0].model.wendog, res[2].model.wendog, rtol=1e-10)
assert_allclose(res[0].model.wexog, res[2].model.wexog, rtol=1e-10)
assert_allclose(res[0].fittedvalues, res[2].fittedvalues, rtol=1e-10)
assert_allclose(res[0].params, res[2].params, rtol=1e-10)
def test_regularized_options():
n = 100
p = 5
np.random.seed(3132)
xmat = np.random.normal(size=(n, p))
yvec = xmat.sum(1) + np.random.normal(size=n)
model1 = OLS(yvec - 1, xmat)
result1 = model1.fit_regularized(alpha=1.0, L1_wt=0.5)
model2 = OLS(yvec, xmat, offset=1)
result2 = model2.fit_regularized(
alpha=1.0, L1_wt=0.5, start_params=np.zeros(5)
)
assert_allclose(result1.params, result2.params)
def test_burg():
rnd = np.random.RandomState(12345)
e = rnd.randn(10001)
y = e[1:] + 0.5 * e[:-1]
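    # y is an MA(1) process; Burg AR(p) fits for p = 1..5 are compared below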
# R, ar.burg
expected = [
[0.3909931],
[0.4602607, -0.1771582],
[0.47473245, -0.21475602, 0.08168813],
[0.4787017, -0.2251910, 0.1047554, -0.0485900],
[0.47975462, -0.22746106, 0.10963527, -0.05896347, 0.02167001],
]
for i in range(1, 6):
ar, _ = burg(y, i)
assert_allclose(ar, expected[i - 1], atol=1e-6)
as_nodemean, _ = burg(1 + y, i, False)
assert np.all(ar != as_nodemean)
def test_burg_errors():
with pytest.raises(ValueError):
burg(np.ones((100, 2)))
with pytest.raises(ValueError):
burg(np.random.randn(100), 0)
with pytest.raises(ValueError):
burg(np.random.randn(100), "apple")
@pytest.mark.skipif(not has_cvxopt, reason="sqrt_lasso requires cvxopt")
def test_sqrt_lasso():
np.random.seed(234923)
# Based on the example in the Belloni paper
n = 100
p = 500
ii = np.arange(p)
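    # regressor covariance: corr(x[:, i], x[:, j]) = 0.5 ** |i - j|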
cx = 0.5 ** np.abs(np.subtract.outer(ii, ii))
cxr = np.linalg.cholesky(cx)
x = np.dot(np.random.normal(size=(n, p)), cxr.T)
b = np.zeros(p)
b[0:5] = [1, 1, 1, 1, 1]
from scipy.stats.distributions import norm
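    # penalty level from the Belloni et al. example:
    # alpha = 1.1 * sqrt(n) * Phi^{-1}(1 - 0.05 / (2 * p))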
alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))
# Use very low noise level for a unit test
y = np.dot(x, b) + 0.25 * np.random.normal(size=n)
# At low noise levels, the sqrt lasso should be around a
# factor of 3 from the oracle without refit, and should
# almost equal the oracle with refit.
expected_oracle = {False: 3, True: 1}
# Used for regression testing
expected_params = {
False: np.r_[
0.87397122, 0.96051874, 0.9905915, 0.93868953, 0.90771773
],
True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403],
}
for refit in False, True:
rslt = OLS(y, x).fit_regularized(
method="sqrt_lasso", alpha=alpha, refit=refit
)
err = rslt.params - b
numer = np.sqrt(np.dot(err, np.dot(cx, err)))
oracle = OLS(y, x[:, 0:5]).fit()
oracle_err = np.zeros(p)
oracle_err[0:5] = oracle.params - b[0:5]
denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))
        # Check performance relative to the oracle; the ratio should be close
        # to expected_oracle[refit]
assert_allclose(
numer / denom, expected_oracle[refit], rtol=0.5, atol=0.1
)
# Regression test the parameters
assert_allclose(
rslt.params[0:5], expected_params[refit], rtol=1e-5, atol=1e-5
)
def test_bool_regressor(reset_randomstate):
exog = np.random.randint(0, 2, size=(100, 2)).astype(bool)
endog = np.random.standard_normal(100)
bool_res = OLS(endog, exog).fit()
res = OLS(endog, exog.astype(np.double)).fit()
assert_allclose(bool_res.params, res.params)
def test_ols_constant(reset_randomstate):
y = np.random.standard_normal(200)
x = np.ones((200, 1))
res = OLS(y, x).fit()
with warnings.catch_warnings(record=True) as recording:
assert np.isnan(res.fvalue)
assert np.isnan(res.f_pvalue)
assert len(recording) == 0
def test_summary_no_constant():
rs = np.random.RandomState(0)
x = rs.standard_normal((100, 2))
y = rs.standard_normal(100)
summary = OLS(y, x).fit().summary()
assert "R² is computed " in summary.as_text()
def test_condition_number(reset_randomstate):
y = np.random.standard_normal(100)
x = np.random.standard_normal((100, 1))
x = x + np.random.standard_normal((100, 5))
res = OLS(y, x).fit()
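    # cond(x) equals sqrt(cond(x.T @ x)): the singular values of x are the
    # square roots of the eigenvalues of x.T @ x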
assert_allclose(res.condition_number, np.sqrt(np.linalg.cond(x.T @ x)))
assert_allclose(res.condition_number, np.linalg.cond(x))
def test_slim_summary(reset_randomstate):
y = np.random.standard_normal(100)
x = np.random.standard_normal((100, 1))
x = x + np.random.standard_normal((100, 5))
res = OLS(y, x).fit()
import copy
summ = copy.deepcopy(res.summary())
slim_summ = copy.deepcopy(res.summary(slim=True))
assert len(summ.tables) == 3
assert len(slim_summ.tables) == 2
assert summ.tables[0].as_text() != slim_summ.tables[0].as_text()
assert slim_summ.tables[1].as_text() == summ.tables[1].as_text()