# Source: statsmodels/stats/tests/test_diagnostic.py
"""Tests for Regression Diagnostics and Specification Tests
Created on Thu Feb 09 13:19:47 2012
Author: Josef Perktold
License: BSD-3
currently all tests are against R
"""
import json
import os
import numpy as np
from numpy.testing import (
assert_,
assert_allclose,
assert_almost_equal,
assert_array_equal,
assert_equal,
)
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest
from statsmodels.datasets import macrodata, sunspots
from statsmodels.regression.linear_model import OLS
import statsmodels.stats.diagnostic as smsdia
import statsmodels.stats.outliers_influence as oi
import statsmodels.stats.sandwich_covariance as sw
from statsmodels.tools.tools import Bunch, add_constant
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
# Directory containing this test module; used below to locate the
# "results" subdirectory with stored R/SAS reference output files.
cur_dir = os.path.abspath(os.path.dirname(__file__))
@pytest.fixture(scope="module")
def diagnostic_data():
    """Module-scoped fixture: DataFrame with y = x1 + x2 + x3 + noise.

    Columns are ["y", "c", "x1", "x2", "x3"], where "c" is a constant
    column of ones.  Deterministic via a fixed RandomState seed.
    """
    rng = np.random.RandomState(93674328)
    noise = rng.standard_normal(500)
    regressors = rng.standard_normal((500, 3))
    response = regressors.sum(1) + noise
    const = np.ones_like(response)
    columns = ["y", "c", "x1", "x2", "x3"]
    return pd.DataFrame(np.c_[response, const, regressors], columns=columns)
def compare_to_reference(sp, sp_dict, decimal=(12, 12)):
    """Compare a (statistic, pvalue) pair against stored reference values.

    Parameters
    ----------
    sp : sequence
        ``sp[0]`` is the test statistic and ``sp[1]`` the p-value.
    sp_dict : dict
        Reference values with keys "statistic" and "pvalue".
    decimal : tuple of int, default (12, 12)
        Number of decimals used to build the absolute tolerances for the
        statistic and the p-value, respectively.
    """
    stat_tol = 10 ** -decimal[0]
    pval_atol = 10 ** -decimal[1]
    assert_allclose(sp[0], sp_dict["statistic"], atol=stat_tol, rtol=stat_tol)
    # NOTE(review): rtol deliberately reuses decimal[0] here (not decimal[1]),
    # matching the original code; presumably a looser relative bound is wanted
    # for very small p-values -- confirm before tightening.
    assert_allclose(sp[1], sp_dict["pvalue"], atol=pval_atol, rtol=stat_tol)
def test_gq():
    """Goldfeld-Quandt test on macro investment data against R's gqtest."""
    data = macrodata.load().data
    endog = data["realinv"]
    exog = add_constant(np.c_[data["realgdp"], data["realint"]])

    # goldfeld-quandt reference from R:
    # > gq = gqtest(fm, alternative='greater')
    het_gq_greater = dict(
        statistic=13.20512768685082,
        df1=99,
        df2=98,
        pvalue=1.246141976112324e-30,
        distr="f",
    )

    result = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_to_reference(result, het_gq_greater, decimal=(12, 12))
    assert_equal(result[-1], "increasing")
class TestDiagnosticG:
    """Diagnostics and specification tests on OLS growth-rate regressions.

    Uses the macrodata dataset (ndarray inputs); reference values were
    computed with R packages (lmtest, strucchange, nortest, FinTS) -- see
    the per-test R snippets in the comments.
    """

    @classmethod
    def setup_class(cls):
        d = macrodata.load_pandas().data
        # growth rates (annualized log-differences, in percent)
        gs_l_realinv = 400 * np.diff(np.log(d["realinv"].values))
        gs_l_realgdp = 400 * np.diff(np.log(d["realgdp"].values))
        # rates are truncated by one observation to align with the
        # differenced growth-rate series
        lint = d["realint"][:-1].values
        tbilrate = d["tbilrate"][:-1].values

        endogg = gs_l_realinv
        exogg = add_constant(np.c_[gs_l_realgdp, lint])
        exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])
        exogg3 = add_constant(np.c_[gs_l_realgdp])

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()
        res_ols3 = OLS(endogg, exogg3).fit()

        cls.res = res_ols
        cls.res2 = res_ols2
        cls.res3 = res_ols3
        cls.endog = cls.res.model.endog
        cls.exog = cls.res.model.exog

    def test_basic(self):
        # mainly to check I got the right regression
        # > mkarray(fm$coefficients, "params")
        params = np.array(
            [-9.48167277465485, 4.3742216647032, -0.613996969478989]
        )
        assert_almost_equal(self.res.params, params, decimal=12)

    def test_hac(self):
        """HAC (Newey-West) covariance against R's sandwich::NeweyWest."""
        res = self.res
        # > nw = NeweyWest(fm, lag = 4, prewhite = FALSE, verbose=TRUE)
        # > nw2 = NeweyWest(fm, lag=10, prewhite = FALSE, verbose=TRUE)
        # > mkarray(nw, "cov_hac_4")
        cov_hac_4 = np.array(
            [
                1.385551290884014, -0.3133096102522685, -0.0597207976835705,
                -0.3133096102522685, 0.1081011690351306, 0.000389440793564336,
                -0.0597207976835705, 0.000389440793564339, 0.0862118527405036,
            ]
        ).reshape((3, 3), order="F")
        # > mkarray(nw2, "cov_hac_10")
        cov_hac_10 = np.array(
            [
                1.257386180080192, -0.2871560199899846, -0.03958300024627573,
                -0.2871560199899845, 0.1049107028987101, 0.0003896205316866944,
                -0.03958300024627578, 0.0003896205316866961, 0.0985539340694839,
            ]
        ).reshape((3, 3), order="F")
        cov = sw.cov_hac_simple(res, nlags=4, use_correction=False)
        bse_hac = sw.se_cov(cov)
        assert_almost_equal(cov, cov_hac_4, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)
        cov = sw.cov_hac_simple(res, nlags=10, use_correction=False)
        bse_hac = sw.se_cov(cov)
        assert_almost_equal(cov, cov_hac_10, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)

    def test_het_goldfeldquandt(self):
        """Goldfeld-Quandt test, all alternatives, against R's gqtest."""
        # TODO: test options missing
        # > gq = gqtest(fm, alternative='greater')
        # > mkhtest_f(gq, 'het_gq_greater', 'f')
        het_gq_greater = dict(
            statistic=0.5313259064778423,
            pvalue=0.9990217851193723,
            parameters=(98, 98),
            distr="f",
        )
        # > gq = gqtest(fm, alternative='less')
        # > mkhtest_f(gq, 'het_gq_less', 'f')
        het_gq_less = dict(
            statistic=0.5313259064778423,
            pvalue=0.000978214880627621,
            parameters=(98, 98),
            distr="f",
        )
        # > gq = gqtest(fm, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided', 'f')
        het_gq_two_sided = dict(
            statistic=0.5313259064778423,
            pvalue=0.001956429761255241,
            parameters=(98, 98),
            distr="f",
        )
        # > gq = gqtest(fm, fraction=0.1, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided_01', 'f')
        het_gq_two_sided_01 = dict(
            statistic=0.5006976835928314,
            pvalue=0.001387126702579789,
            parameters=(88, 87),
            distr="f",
        )

        endogg, exogg = self.endog, self.exog
        # tests
        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5)
        compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
        assert_equal(gq[-1], "increasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="decreasing"
        )
        compare_to_reference(gq, het_gq_less, decimal=(12, 12))
        assert_equal(gq[-1], "decreasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")

        # TODO: forcing the same split as R 202-90-90-1=21
        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=90, drop=21, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided_01, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")
        # TODO other options ???

    def test_het_breusch_pagan(self):
        """Breusch-Pagan (studentized/robust variant) against R's bptest."""
        res = self.res
        bptest = dict(
            statistic=0.709924388395087,
            pvalue=0.701199952134347,
            parameters=(2,),
            distr="f",
        )
        bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_breusch_pagan_1d_err(self):
        """Degenerate exog (1d, constant-only, zeroed constant) must raise."""
        res = self.res
        x = np.asarray(res.model.exog)[:, -1]
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)
        x = np.ones_like(x)
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)
        x = np.asarray(res.model.exog).copy()
        x[:, 0] = 0
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)

    def test_het_breusch_pagan_nonrobust(self):
        """Breusch-Pagan without the Koenker studentization."""
        res = self.res
        bptest = dict(
            statistic=1.302014063483341,
            pvalue=0.5215203247110649,
            parameters=(2,),
            distr="f",
        )
        bp = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_white(self):
        res = self.res
        # TODO: regressiontest, compare with Greene or Gretl or Stata
        hw = smsdia.het_white(res.resid, res.model.exog)
        # (lm statistic, lm pvalue, f statistic, f pvalue)
        hw_values = (
            33.503722896538441,
            2.9887960597830259e-06,
            7.7945101228430946,
            1.0354575277704231e-06,
        )
        assert_almost_equal(hw, hw_values)

    def test_het_white_error(self):
        """White's test requires more than a single regressor column."""
        res = self.res
        with pytest.raises(ValueError, match="White's heteroskedasticity"):
            smsdia.het_white(res.resid, res.model.exog[:, :1])

    def test_het_arch(self):
        # test het_arch and indirectly het_lm against R
        # > library(FinTS)
        # > at = ArchTest(residuals(fm), lags=4)
        # > mkhtest(at, 'archtest_4', 'chi2')
        archtest_4 = dict(
            statistic=3.43473400836259,
            pvalue=0.487871315392619,
            parameters=(4,),
            distr="chi2",
        )
        # > at = ArchTest(residuals(fm), lags=12)
        # > mkhtest(at, 'archtest_12', 'chi2')
        archtest_12 = dict(
            statistic=8.648320999014171,
            pvalue=0.732638635007718,
            parameters=(12,),
            distr="chi2",
        )
        at4 = smsdia.het_arch(self.res.resid, nlags=4)
        at12 = smsdia.het_arch(self.res.resid, nlags=12)
        compare_to_reference(at4[:2], archtest_4, decimal=(12, 13))
        compare_to_reference(at12[:2], archtest_12, decimal=(12, 13))

    def test_het_arch2(self):
        # check that the store option and repeated calls give consistent
        # results; this also tests het_lm indirectly
        resid = self.res.resid

        res1 = smsdia.het_arch(resid, nlags=5, store=True)
        rs1 = res1[-1]
        res2 = smsdia.het_arch(resid, nlags=5, store=True)
        rs2 = res2[-1]

        assert_almost_equal(rs2.resols.params, rs1.resols.params, decimal=12)
        assert_almost_equal(res2[:4], res1[:4], decimal=12)

        # NOTE(review): comment in earlier versions claimed this checks
        # nlags=1, but nlags=5 is used -- it only verifies the no-store path
        # matches the store=True path
        res3 = smsdia.het_arch(resid, nlags=5)
        assert_almost_equal(res3[:4], res1[:4], decimal=12)

    def test_acorr_breusch_godfrey(self):
        res = self.res

        # bgf = bgtest(fm, order = 4, type="F")
        breuschgodfrey_f = dict(
            statistic=1.179280833676792,
            pvalue=0.321197487261203,
            parameters=(
                4,
                195,
            ),
            distr="f",
        )

        # > bgc = bgtest(fm, order = 4, type="Chisq")
        # > mkhtest(bgc, "breuschpagan_c", "chi2")
        breuschgodfrey_c = dict(
            statistic=4.771042651230007,
            pvalue=0.3116067133066697,
            parameters=(4,),
            distr="chi2",
        )

        bg = smsdia.acorr_breusch_godfrey(res, nlags=4)
        bg_r = [
            breuschgodfrey_c["statistic"],
            breuschgodfrey_c["pvalue"],
            breuschgodfrey_f["statistic"],
            breuschgodfrey_f["pvalue"],
        ]
        assert_almost_equal(bg, bg_r, decimal=11)

        # check that lag choice works
        bg2 = smsdia.acorr_breusch_godfrey(res, nlags=None)
        bg3 = smsdia.acorr_breusch_godfrey(res, nlags=10)
        assert_almost_equal(bg2, bg3, decimal=12)

    def test_acorr_breusch_godfrey_multidim(self):
        """2d residuals are rejected with an informative error."""
        res = Bunch(resid=np.empty((100, 2)))
        with pytest.raises(ValueError, match="Model resid must be a 1d array"):
            smsdia.acorr_breusch_godfrey(res)

    def test_acorr_breusch_godfrey_exogs(self):
        # smoke test on a results object without an exog constant (AR model)
        data = sunspots.load_pandas().data["SUNACTIVITY"]
        res = ARIMA(data, order=(1, 0, 0), trend="n").fit()
        smsdia.acorr_breusch_godfrey(res, nlags=1)

    def test_acorr_ljung_box(self):
        # unit-test which may be useful later
        # ddof correction for fitted parameters in ARMA(p,q) fitdf=p+q
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
        # mkhtest(bt, "ljung_box_4df2", "chi2")
        # ljung_box_4df2 = dict(statistic=5.23587172795227,
        #                       pvalue=0.0729532930400377,
        #                       parameters=(2,), distr='chi2')
        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
        # mkhtest(bt, "ljung_box_bp_4df2", "chi2")
        # ljung_box_bp_4df2 = dict(statistic=5.12462932741681,
        #                          pvalue=0.0771260128929921,
        #                          parameters=(2,), distr='chi2')

        res = self.res
        # general test
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_4", "chi2")
        ljung_box_4 = dict(
            statistic=5.23587172795227,
            pvalue=0.263940335284713,
            parameters=(4,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_4", "chi2")
        ljung_box_bp_4 = dict(
            statistic=5.12462932741681,
            pvalue=0.2747471266820692,
            parameters=(4,),
            distr="chi2",
        )

        df = smsdia.acorr_ljungbox(res.resid, 4, boxpierce=True)
        compare_to_reference(
            [df.loc[4, "lb_stat"], df.loc[4, "lb_pvalue"]],
            ljung_box_4,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[4, "bp_stat"], df.loc[4, "bp_pvalue"]],
            ljung_box_bp_4,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_big_default(self):
        res = self.res
        # R test with big dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_none", "chi2")
        ljung_box_none = dict(
            statistic=51.03724531797195, pvalue=0.11334744923390, distr="chi2"
        )
        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_none", "chi2")
        ljung_box_bp_none = dict(
            statistic=45.12238537034000, pvalue=0.26638168491464, distr="chi2"
        )
        # mimic R's default lag choice: min(40, n/2 - 2)
        lags = min(40, res.resid.shape[0] // 2 - 2)
        df = smsdia.acorr_ljungbox(res.resid, boxpierce=True, lags=lags)
        idx = df.index.max()
        compare_to_reference(
            [df.loc[idx, "lb_stat"], df.loc[idx, "lb_pvalue"]],
            ljung_box_none,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[idx, "bp_stat"], df.loc[idx, "bp_pvalue"]],
            ljung_box_bp_none,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_small_default(self):
        res = self.res
        # R test with small dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_small", "chi2")
        ljung_box_small = dict(
            statistic=9.61503968281915,
            pvalue=0.72507000996945,
            parameters=(0,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_small", "chi2")
        ljung_box_bp_small = dict(
            statistic=7.41692150864936,
            pvalue=0.87940785887006,
            parameters=(0,),
            distr="chi2",
        )

        # res.resid is an ndarray in this class, a Series in the pandas
        # subclass, hence the branch
        if isinstance(res.resid, np.ndarray):
            resid = res.resid[:30]
        else:
            resid = res.resid.iloc[:30]
        df = smsdia.acorr_ljungbox(
            resid, boxpierce=True, lags=13
        )
        idx = df.index.max()
        compare_to_reference(
            [df.loc[idx, "lb_stat"], df.loc[idx, "lb_pvalue"]],
            ljung_box_small,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[idx, "bp_stat"], df.loc[idx, "bp_pvalue"]],
            ljung_box_bp_small,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_against_r(self, reset_randomstate):
        """Ljung-Box / Box-Pierce on simulated white noise and ARMA(1,1)."""
        rs = np.random.RandomState(9876543)
        y1 = rs.standard_normal(100)
        e = rs.standard_normal(201)
        y2 = np.zeros_like(e)
        y2[0] = e[0]
        # ARMA(1,1): y[t] = 0.5*y[t-1] - 0.4*e[t-1] + e[t]
        for i in range(1, 201):
            y2[i] = 0.5 * y2[i - 1] - 0.4 * e[i - 1] + e[i]
        y2 = y2[-100:]

        # [statistic, df, pvalue] triples from R's Box.test
        r_results_y1_lb = [
            [0.15685, 1, 0.6921],
            [5.4737, 5, 0.3608],
            [10.508, 10, 0.3971],
        ]

        r_results_y2_lb = [
            [2.8764, 1, 0.08989],
            [3.8104, 5, 0.577],
            [8.4779, 10, 0.5823],
        ]
        res_y1 = smsdia.acorr_ljungbox(y1, 10)
        res_y2 = smsdia.acorr_ljungbox(y2, 10)
        for i, loc in enumerate((1, 5, 10)):
            row = res_y1.loc[loc]
            assert_allclose(
                r_results_y1_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y1_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

            row = res_y2.loc[loc]
            assert_allclose(
                r_results_y2_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y2_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True)
        assert_allclose(res.loc[10, "bp_stat"], 7.8935, rtol=1e-3)
        assert_allclose(res.loc[10, "bp_pvalue"], 0.639, rtol=1e-3)

        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, model_df=1)
        assert_allclose(res.loc[10, "bp_pvalue"], 0.5449, rtol=1e-3)

    def test_harvey_collier(self):
        # > hc = harvtest(fm, order.by = NULL, data = list())
        # > mkhtest_f(hc, 'harvey_collier', 't')
        harvey_collier = dict(
            statistic=0.494432160939874,
            pvalue=0.6215491310408242,
            parameters=198,
            distr="t",
        )
        hc = smsdia.linear_harvey_collier(self.res)
        compare_to_reference(hc, harvey_collier, decimal=(12, 12))
        # skipping initial observations must change the statistic
        hc_skip = smsdia.linear_harvey_collier(self.res, skip=20)
        assert not np.allclose(hc[0], hc_skip[0])

    def test_rainbow(self):
        # rainbow test
        # > rt = raintest(fm)
        # > mkhtest_f(rt, 'raintest', 'f')
        raintest = dict(
            statistic=0.6809600116739604,
            pvalue=0.971832843583418,
            parameters=(101, 98),
            distr="f",
        )

        # > rt = raintest(fm, fraction=0.4)
        # > mkhtest_f(rt, 'raintest_fraction_04', 'f')
        raintest_fraction_04 = dict(
            statistic=0.565551237772662,
            pvalue=0.997592305968473,
            parameters=(122, 77),
            distr="f",
        )

        rb = smsdia.linear_rainbow(self.res)
        compare_to_reference(rb, raintest, decimal=(12, 12))
        rb = smsdia.linear_rainbow(self.res, frac=0.4)
        compare_to_reference(rb, raintest_fraction_04, decimal=(12, 12))

    def test_compare_lr(self):
        """LR and Wald (F) tests for the nested model, against R's lrtest."""
        res = self.res
        res3 = self.res3  # nested within res
        # lrtest
        # lrt = lrtest(fm, fm2)
        # Model 1: ginv ~ ggdp + lint
        # Model 2: ginv ~ ggdp
        lrtest = dict(
            loglike1=-763.9752181602237,
            loglike2=-766.3091902020184,
            chi2value=4.66794408358942,
            pvalue=0.03073069384028677,
            df=(4, 3, 1),
        )
        lrt = res.compare_lr_test(res3)
        assert_almost_equal(lrt[0], lrtest["chi2value"], decimal=11)
        assert_almost_equal(lrt[1], lrtest["pvalue"], decimal=11)

        waldtest = dict(
            fvalue=4.65216373312492,
            pvalue=0.03221346195239025,
            df=(199, 200, 1),
        )
        wt = res.compare_f_test(res3)
        assert_almost_equal(wt[0], waldtest["fvalue"], decimal=11)
        assert_almost_equal(wt[1], waldtest["pvalue"], decimal=11)

    def test_compare_j(self):
        """Davidson-MacKinnon J test against R's jtest, both directions."""
        res = self.res
        res2 = self.res2
        # jt = jtest(fm, lm(ginv ~ ggdp + tbilrate))
        # Estimate         Std. Error  t value Pr(>|t|)
        jtest = [
            (
                "M1 + fitted(M2)",
                1.591505670785873,
                0.7384552861695823,
                2.155182176352370,
                0.032354572525314450,
                "*",
            ),
            (
                "M2 + fitted(M1)",
                1.305687653016899,
                0.4808385176653064,
                2.715438978051544,
                0.007203854534057954,
                "**",
            ),
        ]

        jt1 = smsdia.compare_j(res2, res)
        assert_almost_equal(jt1, jtest[0][3:5], decimal=12)

        jt2 = smsdia.compare_j(res, res2)
        assert_almost_equal(jt2, jtest[1][3:5], decimal=12)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_error(self, comp, diagnostic_data):
        """Different endog in the two results objects must raise."""
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.x3, data[["c", "x2"]]).fit()
        with pytest.raises(ValueError, match="endogenous variables"):
            comp(res1, res2)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_nested(self, comp, diagnostic_data):
        """Nested models are rejected by the non-nested tests, both orders."""
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.y, data[["c", "x1", "x2"]]).fit()
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res1, res2)
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res2, res1)

    def test_compare_cox(self):
        """Cox non-nested test against R's coxtest, both directions."""
        res = self.res
        res2 = self.res2
        # Estimate        Std. Error  z value   Pr(>|z|)
        coxtest = [
            (
                "fitted(M1) ~ M2",
                -0.782030488930356,
                0.599696502782265,
                -1.304043770977755,
                1.922186587840554e-01,
                " ",
            ),
            (
                "fitted(M2) ~ M1",
                -2.248817107408537,
                0.392656854330139,
                -5.727181590258883,
                1.021128495098556e-08,
                "***",
            ),
        ]

        ct1 = smsdia.compare_cox(res, res2)
        assert_almost_equal(ct1, coxtest[0][3:5], decimal=12)

        ct2 = smsdia.compare_cox(res2, res)
        assert_almost_equal(ct2, coxtest[1][3:5], decimal=12)

        _, _, store = smsdia.compare_cox(res, res2, store=True)
        assert isinstance(store, smsdia.ResultsStore)

    def test_cusum_ols(self):
        # R library(strucchange)
        # > sc = sctest(ginv ~ ggdp + lint, type="OLS-CUSUM")
        # > mkhtest(sc, 'cusum_ols', 'BB')
        cusum_ols = dict(
            statistic=1.055750610401214,
            pvalue=0.2149567397376543,
            parameters=(),
            distr="BB",
        )  # Brownian Bridge

        k_vars = 3
        cs_ols = smsdia.breaks_cusumolsresid(self.res.resid, ddof=k_vars)
        compare_to_reference(cs_ols, cusum_ols, decimal=(12, 12))

    def test_breaks_hansen(self):
        # > sc = sctest(ginv ~ ggdp + lint, type="Nyblom-Hansen")
        # > mkhtest(sc, 'breaks_nyblom_hansen', 'BB')
        breaks_nyblom_hansen = dict(
            statistic=1.0300792740544484,
            pvalue=0.1136087530212015,
            parameters=(),
            distr="BB",
        )

        bh = smsdia.breaks_hansen(self.res)
        assert_almost_equal(
            bh[0], breaks_nyblom_hansen["statistic"], decimal=12
        )
        # TODO: breaks_hansen does not return pvalues

    def test_recursive_residuals(self):
        """Recursive OLS residuals/CUSUM against stored reference values
        (visually checked against gretl) plus an explicit OLS loop."""
        # standardized cumulative recursive residuals, reference values
        reccumres_standardize = np.array(
            [
                -2.151, -3.748, -3.114, -3.096, -1.865, -2.230, -1.194, -3.500,
                -3.638, -4.447, -4.602, -4.631, -3.999, -4.830, -5.429, -5.435,
                -6.554, -8.093, -8.567, -7.532, -7.079, -8.468, -9.320, -12.256,
                -11.932, -11.454, -11.690, -11.318, -12.665, -12.842, -11.693,
                -10.803, -12.113, -12.109, -13.002, -11.897, -10.787, -10.159,
                -9.038, -9.007, -8.634, -7.552, -7.153, -6.447, -5.183, -3.794,
                -3.511, -3.979, -3.236, -3.793, -3.699, -5.056, -5.724, -4.888,
                -4.309, -3.688, -3.918, -3.735, -3.452, -2.086, -6.520, -7.959,
                -6.760, -6.855, -6.032, -4.405, -4.123, -4.075, -3.235, -3.115,
                -3.131, -2.986, -1.813, -4.824, -4.424, -4.796, -4.000, -3.390,
                -4.485, -4.669, -4.560, -3.834, -5.507, -3.792, -2.427, -1.756,
                -0.354, 1.150, 0.586, 0.643, 1.773, -0.830, -0.388, 0.517,
                0.819, 2.240, 3.791, 3.187, 3.409, 2.431, 0.668, 0.957,
                -0.928, 0.327, -0.285, -0.625, -2.316, -1.986, -0.744, -1.396,
                -1.728, -0.646, -2.602, -2.741, -2.289, -2.897, -1.934, -2.532,
                -3.175, -2.806, -3.099, -2.658, -2.487, -2.515, -2.224, -2.416,
                -1.141, 0.650, -0.947, 0.725, 0.439, 0.885, 2.419, 2.642,
                2.745, 3.506, 4.491, 5.377, 4.624, 5.523, 6.488, 6.097,
                5.390, 6.299, 6.656, 6.735, 8.151, 7.260, 7.846, 8.771,
                8.400, 8.717, 9.916, 9.008, 8.910, 8.294, 8.982, 8.540,
                8.395, 7.782, 7.794, 8.142, 8.362, 8.400, 7.850, 7.643,
                8.228, 6.408, 7.218, 7.699, 7.895, 8.725, 8.938, 8.781,
                8.350, 9.136, 9.056, 10.365, 10.495, 10.704, 10.784, 10.275,
                10.389, 11.586, 11.033, 11.335, 11.661, 10.522, 10.392, 10.521,
                10.126, 9.428, 9.734, 8.954, 9.949, 10.595, 8.016, 6.636,
                6.975,
            ]
        )

        rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
        assert_equal(
            np.round(rr[5][1:], 3), reccumres_standardize
        )  # extra zero in front
        # assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
        assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
        assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)

        # an already-sorted order_by must reproduce the default result
        order = np.arange(self.res.model.endog.shape[0])
        rr_order = smsdia.recursive_olsresiduals(
            self.res, skip=3, alpha=0.95, order_by=order
        )
        assert_allclose(rr[0], rr_order[0], atol=1e-6)

        # regression number, visually checked with graph from gretl
        ub0 = np.array(
            [13.37318571, 13.50758959, 13.64199346, 13.77639734, 13.91080121]
        )
        ub1 = np.array(
            [39.44753774, 39.58194162, 39.7163455, 39.85074937, 39.98515325]
        )
        lb, ub = rr[6]
        assert_almost_equal(ub[:5], ub0, decimal=7)
        assert_almost_equal(lb[:5], -ub0, decimal=7)
        assert_almost_equal(ub[-5:], ub1, decimal=7)
        assert_almost_equal(lb[-5:], -ub1, decimal=7)

        # test a few values with explicit OLS
        endog = self.res.model.endog
        exog = self.res.model.exog
        params = []
        ypred = []
        for i in range(3, 10):
            resi = OLS(endog[:i], exog[:i]).fit()
            ypred.append(resi.model.predict(resi.params, exog[i]))
            params.append(resi.params)
        assert_almost_equal(rr[2][3:10], ypred, decimal=12)
        assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
        assert_almost_equal(rr[1][2:9], params, decimal=12)

    def test_normality(self):
        res = self.res

        # > library(nortest) #Lilliefors (Kolmogorov-Smirnov) normality test
        # > lt = lillie.test(residuals(fm))
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors1 = dict(
            statistic=0.0723390908786589,
            pvalue=0.01204113540102896,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)**2)
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors2 = dict(
            statistic=0.301311621898024,
            pvalue=1.004305736618051e-51,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)[1:20])
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors3 = dict(
            statistic=0.1333956004203103,
            pvalue=0.455683,
            parameters=(),
            distr="-",
        )

        lf1 = smsdia.lilliefors(res.resid, pvalmethod="approx")
        lf2 = smsdia.lilliefors(res.resid ** 2, pvalmethod="approx")
        # ndarray in this class, Series in the pandas subclass
        if isinstance(res.resid, np.ndarray):
            resid = res.resid[:20]
        else:
            resid = res.resid.iloc[:20]
        lf3 = smsdia.lilliefors(resid, pvalmethod="approx")

        compare_to_reference(lf1, lilliefors1, decimal=(12, 12))
        compare_to_reference(
            lf2, lilliefors2, decimal=(12, 12)
        )  # pvalue very small
        assert_allclose(lf2[1], lilliefors2["pvalue"], rtol=1e-10)
        compare_to_reference(lf3, lilliefors3, decimal=(12, 1))
        # R uses different approximation for pvalue in last case

        # > ad = ad.test(residuals(fm))
        # > mkhtest(ad, "ad3", "-")
        adr1 = dict(
            statistic=1.602209621518313,
            pvalue=0.0003937979149362316,
            parameters=(),
            distr="-",
        )

        # > ad = ad.test(residuals(fm)**2)
        # > mkhtest(ad, "ad3", "-")

        # > ad = ad.test(residuals(fm)[1:20])
        # > mkhtest(ad, "ad3", "-")
        adr3 = dict(
            statistic=0.3017073732210775,
            pvalue=0.5443499281265933,
            parameters=(),
            distr="-",
        )

        ad1 = smsdia.normal_ad(res.resid)
        compare_to_reference(ad1, adr1, decimal=(11, 13))
        ad2 = smsdia.normal_ad(res.resid ** 2)
        assert_(np.isinf(ad2[0]))
        ad3 = smsdia.normal_ad(resid)
        compare_to_reference(ad3, adr3, decimal=(11, 12))

    def test_influence(self):
        """OLSInfluence measures against R's lsdiag / influence.measures."""
        res = self.res

        # this test is slow
        infl = oi.OLSInfluence(res)

        path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
        with open(path, encoding="utf-8") as fp:
            lsdiag = json.load(fp)

        # basic
        assert_almost_equal(
            np.array(lsdiag["cov.scaled"]).reshape(3, 3),
            res.cov_params(),
            decimal=12,
        )
        assert_almost_equal(
            np.array(lsdiag["cov.unscaled"]).reshape(3, 3),
            res.normalized_cov_params,
            decimal=12,
        )

        c0, c1 = infl.cooks_distance  # TODO: what's c1
        assert_almost_equal(c0, lsdiag["cooks"], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, lsdiag["hat"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_internal, lsdiag["std.res"], decimal=12
        )

        # slow:
        # infl._get_all_obs()  #slow, nobs estimation loop, called implicitly
        dffits, dffth = infl.dffits
        assert_almost_equal(dffits, lsdiag["dfits"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_external, lsdiag["stud.res"], decimal=12
        )

        fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
        infl_r = pd.read_csv(fn, index_col=0)
        # not used yet:
        # infl_bool_r = pandas.read_csv(fn, index_col=0,
        #                  converters=dict(zip(lrange(7),[conv]*7)))
        infl_r2 = np.asarray(infl_r)
        assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
        assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)
        # duplicates
        assert_almost_equal(dffits, infl_r2[:, 3], decimal=12)
        assert_almost_equal(c0, infl_r2[:, 5], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=12)

        # TODO: finish and check thresholds and pvalues
        # Note: for dffits, R uses a threshold around 0.36,
        #       mine: dffits[1]=0.24373
        # R has
        # >>> np.nonzero(np.asarray(infl_bool_r["dffit"]))[0]
        # array([  6,  26,  63,  76,  90, 199])
        # >>> np.nonzero(np.asarray(infl_bool_r["cov.r"]))[0]
        # array([  4,  26,  59,  61,  63,  72,  76,  84,  91,  92,  94,  95,
        #        108, 197, 198])
        # >>> np.nonzero(np.asarray(infl_bool_r["hat"]))[0]
        # array([ 62,  76,  84,  90,  91,  92,  95, 108, 197, 199])
class TestDiagnosticGPandas(TestDiagnosticG):
    """Re-run all TestDiagnosticG tests with pandas Series/DataFrame inputs."""

    @classmethod
    def setup_class(cls):
        d = macrodata.load_pandas().data
        # growth rates (annualized log-differences, in percent)
        d["gs_l_realinv"] = 400 * np.log(d["realinv"]).diff()
        d["gs_l_realgdp"] = 400 * np.log(d["realgdp"]).diff()
        # lagged rates; shift(1) + dropna presumably reproduces the [:-1]
        # truncation used in the ndarray-based parent setup -- the base-class
        # tests passing on this subclass is what confirms the alignment
        d["lint"] = d["realint"].shift(1)
        d["tbilrate"] = d["tbilrate"].shift(1)

        d = d.dropna()
        cls.d = d
        endogg = d["gs_l_realinv"]
        exogg = add_constant(d[["gs_l_realgdp", "lint"]])
        exogg2 = add_constant(d[["gs_l_realgdp", "tbilrate"]])
        exogg3 = add_constant(d[["gs_l_realgdp"]])

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()
        res_ols3 = OLS(endogg, exogg3).fit()

        cls.res = res_ols
        cls.res2 = res_ols2
        cls.res3 = res_ols3
        cls.endog = cls.res.model.endog
        cls.exog = cls.res.model.exog
def test_spec_white():
    """White specification test on four stored datasets vs. SAS 9.3 output.

    Each expected entry is [statistic, pvalue, df]; wspec3/wspec4 contain
    dummy variables.
    """
    resdir = os.path.join(cur_dir, "results")
    expected = {
        "wspec1.csv": [3.251, 0.661, 5],
        "wspec2.csv": [6.070, 0.733, 9],
        "wspec3.csv": [6.767, 0.454, 7],
        "wspec4.csv": [8.462, 0.671, 11],
    }
    for fname, ref in expected.items():
        mdl = np.asarray(pd.read_csv(os.path.join(resdir, fname)))
        # dependent variable is stored in the last column
        last = mdl.shape[1] - 1
        dv = mdl[:, last]
        # design matrix: constant + all remaining columns
        design = np.concatenate(
            (np.ones((mdl.shape[0], 1)), np.delete(mdl, last, 1)), axis=1
        )
        # OLS residuals via least squares
        resids = dv - np.dot(design, np.linalg.lstsq(design, dv, rcond=-1)[0])
        wsres = smsdia.spec_white(resids, design)
        assert_almost_equal(wsres, ref, decimal=3)
def test_spec_white_error(reset_randomstate):
    """spec_white must reject design matrices with fewer than 3 columns."""
    for ncols in (1, 2):
        resid = np.random.standard_normal(100)
        design = np.random.standard_normal((100, ncols))
        with pytest.raises(ValueError, match="White's specification test "):
            smsdia.spec_white(resid, design)
def test_linear_lm_direct(reset_randomstate):
    """linear_lm LM statistic equals n * R^2 of the auxiliary regression."""
    n = 500
    endog = np.random.standard_normal(n)
    exog = add_constant(np.random.standard_normal((n, 3)))
    res = OLS(endog, exog).fit()
    lm_res = smsdia.linear_lm(res.resid, exog)
    # auxiliary regression of residuals on [exog, squared non-constant cols]
    augmented = np.hstack([exog, exog[:, 1:] ** 2])
    aux_fit = OLS(res.resid, augmented).fit()
    expected_stat = aux_fit.rsquared * augmented.shape[0]
    assert_allclose(lm_res[0], expected_stat)
# TODO: make this a test or move/remove
def grangertest():
    """Stub recording R reference values for a Granger-causality test.

    Not collected by pytest (no ``test_`` prefix) and has no assertions;
    the dict below is built and immediately discarded.  Kept only so the
    R output is not lost.
    """
    # > gt = grangertest(ginv, ggdp, order=4)
    # > gt
    # Granger causality test
    #
    # Model 1: ggdp ~ Lags(ggdp, 1:4) + Lags(ginv, 1:4)
    # Model 2: ggdp ~ Lags(ggdp, 1:4)
    dict(fvalue=1.589672703015157, pvalue=0.178717196987075, df=(198, 193))
@pytest.mark.smoke
def test_outlier_influence_funcs(reset_randomstate):
    """Smoke test summary_table / get_influence; alpha must widen intervals."""
    exog = add_constant(np.random.randn(10, 2))
    endog = exog.sum(1) + np.random.randn(10)
    res = OLS(endog, exog).fit()
    table_05 = oi.summary_table(res)
    # GH3344 : Check alpha has an effect
    table_01 = oi.summary_table(res, alpha=0.01)
    # the 99% interval must contain the 95% interval
    assert_(np.all(table_01[1][:, 6] <= table_05[1][:, 6]))
    assert_(np.all(table_01[1][:, 7] >= table_05[1][:, 7]))

    res_const = OLS(endog, exog[:, 0]).fit()
    oi.summary_table(res_const, alpha=0.05)
    infl = res_const.get_influence()
    infl.summary_table()
def test_influence_wrapped():
    """OLSInfluence on a pandas-based OLS fit, against R reference files.

    Mirrors TestDiagnosticG.test_influence but with DataFrame exog, also
    smoke-testing summary_frame().
    """
    from pandas import DataFrame

    d = macrodata.load_pandas().data
    # growth rates (annualized log-differences, in percent)
    gs_l_realinv = 400 * np.log(d["realinv"]).diff().dropna()
    gs_l_realgdp = 400 * np.log(d["realgdp"]).diff().dropna()
    lint = d["realint"][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    # order is important
    exog = DataFrame(data, columns=["const", "lrealgdp", "lint"])

    res = OLS(gs_l_realinv, exog).fit()

    # basic
    # already tested
    # assert_almost_equal(lsdiag['cov.scaled'],
    #                     res.cov_params().values.ravel(), decimal=12)
    # assert_almost_equal(lsdiag['cov.unscaled'],
    #                     res.normalized_cov_params.values.ravel(), decimal=12)

    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    # this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, encoding="utf-8") as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  # TODO: what's c1, it's pvalues? -ss

    # NOTE: we get a hard-cored 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag["cooks"], 12)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag["hat"]), 12)
    assert_almost_equal(infl.resid_studentized_internal, lsdiag["std.res"], 12)

    # slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag["dfits"], 12)
    assert_almost_equal(
        infl.resid_studentized_external, lsdiag["stud.res"], 12
    )

    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pd.read_csv(fn, index_col=0)
    # not used yet:
    # infl_bool_r = pandas.read_csv(fn, index_col=0,
    #                  converters=dict(zip(lrange(7),[conv]*7)))
    infl_r2 = np.asarray(infl_r)
    # TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)
def test_influence_dtype():
    """cov_ratio is unaffected by multiplying endog by 1.0 (GH#2148)."""
    endog = np.ones(20)
    np.random.seed(123)
    exog = np.random.randn(20, 3)
    cr1 = OLS(endog, exog).fit().get_influence().cov_ratio
    cr2 = OLS(endog * 1.0, exog).fit().get_influence().cov_ratio
    assert_allclose(cr1, cr2, rtol=1e-14)
    # regression test for values
    cr3 = np.array(
        [
            1.22239215, 1.31551021, 1.52671069, 1.05003921, 0.89099323,
            1.57405066, 1.03230092, 0.95844196, 1.15531836, 1.21963623,
            0.87699564, 1.16707748, 1.10481391, 0.98839447, 1.08999334,
            1.35680102, 1.46227715, 1.45966708, 1.13659521, 1.22799038,
        ]
    )
    assert_almost_equal(cr1, cr3, decimal=8)
def get_duncan_data():
    """Return endog, exog and labels for Duncan's prestige data (car).

    Reference results were computed in R with NA -> 1; this data is used
    only to exercise the ``outlier_test`` wrapper interface.
    """
    labels = (
        "accountant pilot architect author chemist minister professor "
        "dentist reporter engineer undertaker lawyer physician "
        "welfare.worker teacher conductor contractor factory.owner "
        "store.manager banker bookkeeper mail.carrier insurance.agent "
        "store.clerk carpenter electrician RR.engineer machinist "
        "auto.repairman plumber gas.stn.attendant coal.miner "
        "streetcar.motorman taxi.driver truck.driver machine.operator "
        "barber bartender shoe.shiner cook soda.clerk watchman janitor "
        "policeman waiter"
    ).split()
    # One row per occupation: (income, education, prestige)
    rows = [
        (62, 86, 82), (72, 76, 83), (75, 92, 90), (55, 90, 76),
        (64, 86, 90), (21, 84, 87), (64, 93, 93), (80, 100, 90),
        (67, 87, 52), (72, 86, 88), (42, 74, 57), (76, 98, 89),
        (76, 97, 97), (41, 84, 59), (48, 91, 73), (76, 34, 38),
        (53, 45, 76), (60, 56, 81), (42, 44, 45), (78, 82, 92),
        (29, 72, 39), (48, 55, 34), (55, 71, 41), (29, 50, 16),
        (21, 23, 33), (47, 39, 53), (81, 28, 67), (36, 32, 57),
        (22, 22, 26), (44, 25, 29), (15, 29, 10), (7, 7, 15),
        (42, 26, 19), (9, 19, 10), (21, 15, 13), (21, 20, 24),
        (16, 26, 20), (16, 28, 7), (9, 17, 3), (14, 22, 16),
        (12, 30, 6), (17, 25, 11), (7, 20, 8), (34, 47, 41),
        (8, 32, 10),
    ]
    # exog carries a leading constant column; prestige is the endog
    exog = [[1.0, float(income), float(educ)] for income, educ, _ in rows]
    endog = [float(prestige) for _, _, prestige in rows]
    return endog, exog, labels
def test_outlier_test():
    """Check outlier_test (Bonferroni outlier test) against R reference values.

    The reference numbers (studentized residuals, unadjusted p-values and
    Bonferroni p-values, plus the sorted observation labels) were computed
    in R on Duncan's prestige data.  Mostly tests the wrapper interface:
    labels, sort order, pandas formula models and the cutoff keyword.
    """
    endog, exog, labels = get_duncan_data()
    ndarray_mod = OLS(endog, exog).fit()
    # externally studentized residuals from R, sorted by decreasing |value|
    rstudent = [
        3.1345185839,
        -2.3970223990,
        2.0438046359,
        -1.9309187757,
        1.8870465798,
        -1.7604905300,
        -1.7040324156,
        1.6024285876,
        -1.4332485037,
        -1.1044851583,
        1.0688582315,
        1.0185271840,
        -0.9024219332,
        -0.9023876471,
        -0.8830953936,
        0.8265782334,
        0.8089220547,
        0.7682770197,
        0.7319491074,
        -0.6665962829,
        0.5227352794,
        -0.5135016547,
        0.5083881518,
        0.4999224372,
        -0.4980818221,
        -0.4759717075,
        -0.4293565820,
        -0.4114056499,
        -0.3779540862,
        0.3556874030,
        0.3409200462,
        0.3062248646,
        0.3038999429,
        -0.3030815773,
        -0.1873387893,
        0.1738050251,
        0.1424246593,
        -0.1292266025,
        0.1272066463,
        -0.0798902878,
        0.0788467222,
        0.0722556991,
        0.0505098280,
        0.0233215136,
        0.0007112055,
    ]
    # two-sided p-values of rstudent, before multiple-testing adjustment
    unadj_p = [
        0.003177202,
        0.021170298,
        0.047432955,
        0.060427645,
        0.066248120,
        0.085783008,
        0.095943909,
        0.116738318,
        0.159368890,
        0.275822623,
        0.291386358,
        0.314400295,
        0.372104049,
        0.372122040,
        0.382333561,
        0.413260793,
        0.423229432,
        0.446725370,
        0.468363101,
        0.508764039,
        0.603971990,
        0.610356737,
        0.613905871,
        0.619802317,
        0.621087703,
        0.636621083,
        0.669911674,
        0.682917818,
        0.707414459,
        0.723898263,
        0.734904667,
        0.760983108,
        0.762741124,
        0.763360242,
        0.852319039,
        0.862874018,
        0.887442197,
        0.897810225,
        0.899398691,
        0.936713197,
        0.937538115,
        0.942749758,
        0.959961394,
        0.981506948,
        0.999435989,
    ]
    # raw Bonferroni p-values (unadj_p * nobs), clipped at 1 below
    bonf_p = [
        0.1429741,
        0.9526634,
        2.1344830,
        2.7192440,
        2.9811654,
        3.8602354,
        4.3174759,
        5.2532243,
        7.1716001,
        12.4120180,
        13.1123861,
        14.1480133,
        16.7446822,
        16.7454918,
        17.2050103,
        18.5967357,
        19.0453245,
        20.1026416,
        21.0763395,
        22.8943818,
        27.1787396,
        27.4660532,
        27.6257642,
        27.8911043,
        27.9489466,
        28.6479487,
        30.1460253,
        30.7313018,
        31.8336506,
        32.5754218,
        33.0707100,
        34.2442399,
        34.3233506,
        34.3512109,
        38.3543568,
        38.8293308,
        39.9348989,
        40.4014601,
        40.4729411,
        42.1520939,
        42.1892152,
        42.4237391,
        43.1982627,
        44.1678127,
        44.9746195,
    ]
    bonf_p = np.array(bonf_p)
    # Bonferroni correction can exceed 1; outlier_test reports it clipped
    bonf_p[bonf_p > 1] = 1
    # observation labels in the same order as the reference values above
    sorted_labels = [
        "minister",
        "reporter",
        "contractor",
        "insurance.agent",
        "machinist",
        "store.clerk",
        "conductor",
        "factory.owner",
        "mail.carrier",
        "streetcar.motorman",
        "carpenter",
        "coal.miner",
        "bartender",
        "bookkeeper",
        "soda.clerk",
        "chemist",
        "RR.engineer",
        "professor",
        "electrician",
        "gas.stn.attendant",
        "auto.repairman",
        "watchman",
        "banker",
        "machine.operator",
        "dentist",
        "waiter",
        "shoe.shiner",
        "welfare.worker",
        "plumber",
        "physician",
        "pilot",
        "engineer",
        "accountant",
        "lawyer",
        "undertaker",
        "barber",
        "store.manager",
        "truck.driver",
        "cook",
        "janitor",
        "policeman",
        "architect",
        "teacher",
        "taxi.driver",
        "author",
    ]
    res2 = np.c_[rstudent, unadj_p, bonf_p]
    # ndarray model: values and sort order must match the R reference
    res = oi.outlier_test(ndarray_mod, method="b", labels=labels, order=True)
    np.testing.assert_almost_equal(res.values, res2, 7)
    np.testing.assert_equal(
        res.index.tolist(), sorted_labels
    )  # pylint: disable-msg=E1103
    data = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns="y const var1 var2".split(),
        index=labels,
    )
    # check `order` with pandas bug in #3971
    res_pd = OLS.from_formula("y ~ const + var1 + var2 - 0", data).fit()
    res_outl2 = oi.outlier_test(res_pd, method="b", order=True)
    assert_almost_equal(res_outl2.values, res2, 7)
    assert_equal(res_outl2.index.tolist(), sorted_labels)
    # sorting by unadjusted p-value must reproduce the ordered output
    res_outl1 = res_pd.outlier_test(method="b")
    res_outl1 = res_outl1.sort_values(["unadj_p"], ascending=True)
    assert_almost_equal(res_outl1.values, res2, 7)
    assert_equal(res_outl1.index.tolist(), sorted_labels)
    assert_array_equal(res_outl2.index, res_outl1.index)
    # additional keywords in method
    res_outl3 = res_pd.outlier_test(method="b", order=True)
    assert_equal(res_outl3.index.tolist(), sorted_labels)
    # only "minister" has a Bonferroni p-value below the 0.15 cutoff
    res_outl4 = res_pd.outlier_test(method="b", order=True, cutoff=0.15)
    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])
def test_ljungbox_dof_adj():
    """model_df must leave the Q statistic unchanged but shift p-values."""
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    resid = AutoReg(data, 4, old_names=False).fit().resid
    plain = smsdia.acorr_ljungbox(resid, lags=10)
    adjusted = smsdia.acorr_ljungbox(resid, lags=10, model_df=4)
    # statistic column is identical with and without the df adjustment
    assert_allclose(plain.iloc[:, 0], adjusted.iloc[:, 0])
    # the first model_df lags have no valid p-value after adjustment
    assert np.all(np.isnan(adjusted.iloc[:4, 1]))
    # remaining p-values can only become more significant
    assert np.all(adjusted.iloc[4:, 1] <= plain.iloc[4:, 1])
def test_ljungbox_auto_lag_selection(reset_randomstate):
    """Smoke test automatic lag selection with and without model_df."""
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    resid = AutoReg(data, 4, old_names=False).fit().resid
    plain = smsdia.acorr_ljungbox(resid, auto_lag=True)
    adjusted = smsdia.acorr_ljungbox(resid, model_df=4, auto_lag=True)
    assert_allclose(plain.iloc[:, 0], adjusted.iloc[:, 0])
    # TODO: compare selected lags with Stata/R to confirm that the
    #  correct auto_lag is selected
    assert plain.shape[0] >= 1
    assert adjusted.shape[0] >= 1
    assert np.all(np.isnan(adjusted.iloc[:4, 1]))
    assert np.all(adjusted.iloc[4:, 1] <= plain.iloc[4:, 1])
def test_ljungbox_auto_lag_whitenoise(reset_randomstate):
    """auto_lag on white noise must still select at least one lag."""
    noise = np.random.randn(1000)  # white noise process
    result = smsdia.acorr_ljungbox(noise, auto_lag=True)
    # TODO: compare selected lags with Stata/R to confirm that the
    #  correct auto_lag is selected
    assert result.shape[0] >= 1  # auto lag selected must be at least 1
def test_ljungbox_errors_warnings():
    """Invalid model_df/period arguments must raise informative errors."""
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    with pytest.raises(ValueError, match="model_df must"):
        smsdia.acorr_ljungbox(data, model_df=-1)
    # period is validated before model_df, so both calls hit "period must"
    for bad_period in (1, -2):
        with pytest.raises(ValueError, match="period must"):
            smsdia.acorr_ljungbox(data, model_df=-1, period=bad_period)
    # valid calls succeed and return a DataFrame
    smsdia.acorr_ljungbox(data)
    result = smsdia.acorr_ljungbox(data, lags=10)
    assert isinstance(result, pd.DataFrame)
def test_ljungbox_period():
    """period=13 must select the same lags as an explicit lags=26."""
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    resid = AutoReg(data, 4, old_names=False).fit().resid
    by_period = smsdia.acorr_ljungbox(resid, period=13)
    by_lags = smsdia.acorr_ljungbox(resid, lags=26)
    assert_frame_equal(by_period, by_lags)
@pytest.mark.parametrize("cov_type", ["nonrobust", "HC0"])
def test_encompasing_direct(cov_type, reset_randomstate):
    """compare_encompassing must match directly computed Wald tests."""
    nobs = 500
    base = np.random.standard_normal((nobs, 2))
    err = np.random.standard_normal((nobs, 1))
    extra_x = np.random.standard_normal((nobs, 2))
    extra_z = np.random.standard_normal((nobs, 3))
    y = base @ np.ones((2, 1)) + err
    # the two non-nested competing models
    x_exog = np.hstack([base[:, :1], extra_x])
    z_exog = np.hstack([base, extra_z])
    fit_x = OLS(y, x_exog).fit()
    fit_z = OLS(y, z_exog).fit()
    df = smsdia.compare_encompassing(fit_x, fit_z, cov_type=cov_type)

    def _direct_wald(exog, n_restr):
        # Wald test that the last n_restr augmenting columns are zero
        aug = OLS(y, exog).fit(cov_type=cov_type)
        restr = np.zeros((n_restr, exog.shape[1]))
        restr[:, -n_restr:] = np.eye(n_restr)
        wt = aug.wald_test(restr, use_f=True, scalar=True)
        return (
            float(np.squeeze(wt.statistic)),
            float(np.squeeze(wt.pvalue)),
            int(wt.df_num),
            int(wt.df_denom),
        )

    # augment model x with everything only in model z (4 columns)
    expected_x = _direct_wald(np.hstack([x_exog, base[:, 1:], extra_z]), 4)
    assert_allclose(np.asarray(df.loc["x"]), expected_x, atol=1e-8)
    # augment model z with everything only in model x (2 columns)
    expected_z = _direct_wald(np.hstack([z_exog, extra_x]), 2)
    assert_allclose(np.asarray(df.loc["z"]), expected_z, atol=1e-8)
def test_encompasing_error(reset_randomstate):
    """compare_encompassing must reject nested models and non-OLS inputs."""
    base = np.random.standard_normal((500, 2))
    err = np.random.standard_normal((500, 1))
    extra = np.random.standard_normal((500, 3))
    y = base @ np.ones((2, 1)) + err
    res_small = OLS(y, base).fit()
    res_large = OLS(y, np.hstack([base, extra])).fit()
    # x-model exog is nested in the z-model exog
    with pytest.raises(ValueError, match="The exog in results_x"):
        smsdia.compare_encompassing(res_small, res_large)
    with pytest.raises(TypeError, match="results_z must come from a linear"):
        smsdia.compare_encompassing(res_small, 2)
    with pytest.raises(TypeError, match="results_x must come from a linear"):
        smsdia.compare_encompassing(4, 2)
@pytest.mark.smoke
@pytest.mark.parametrize("power", [2, 3])
@pytest.mark.parametrize("test_type", ["fitted", "exog", "princomp"])
@pytest.mark.parametrize("use_f", [True, False])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_reset_smoke(power, test_type, use_f, cov, reset_randomstate):
    """Smoke test linear_reset across all option combinations."""
    nobs = 1000
    base = add_constant(np.random.standard_normal((nobs, 3)))
    err = np.random.standard_normal((nobs, 1))
    full = np.hstack([base, base[:, 1:] ** 2])
    y = full @ np.ones((7, 1)) + err
    # fit deliberately omits the squared regressors
    fitted = OLS(y, full[:, :4]).fit()
    smsdia.linear_reset(
        fitted, power=power, test_type=test_type, use_f=use_f, **cov
    )
@pytest.mark.smoke
@pytest.mark.parametrize("store", [True, False])
@pytest.mark.parametrize("ddof", [0, 2])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_acorr_lm_smoke(store, ddof, cov, reset_randomstate):
    """Smoke test acorr_lm with fixed and period-based lag choices."""
    resid = np.random.standard_normal(250)
    smsdia.acorr_lm(resid, nlags=6, store=store, ddof=ddof, **cov)
    smsdia.acorr_lm(
        resid, nlags=None, store=store, period=12, ddof=ddof, **cov
    )
def test_acorr_lm_smoke_no_autolag(reset_randomstate):
    """Smoke test acorr_lm with a fixed lag length (no auto selection)."""
    resid = np.random.standard_normal(250)
    smsdia.acorr_lm(resid, nlags=6, store=False, ddof=0)
@pytest.mark.parametrize("frac", [0.25, 0.5, 0.75])
@pytest.mark.parametrize(
    "order_by",
    [
        None,
        np.arange(500),
        np.random.choice(500, size=500, replace=False),
        "x0",
        ["x0", "x2"],
    ],
)
def test_rainbow_smoke_order_by(frac, order_by, reset_randomstate):
    """Smoke test linear_rainbow over the supported order_by variants."""
    nobs = 500
    err = pd.DataFrame(np.random.standard_normal((nobs, 1)))
    exog = pd.DataFrame(
        np.random.standard_normal((nobs, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    endog = exog @ np.ones((3, 1)) + err
    fitted = OLS(endog, exog).fit()
    smsdia.linear_rainbow(fitted, frac=frac, order_by=order_by)
@pytest.mark.parametrize("center", [None, 0.33, 300])
def test_rainbow_smoke_centered(center, reset_randomstate):
    """Smoke test linear_rainbow with distance-based centering options."""
    err = pd.DataFrame(np.random.standard_normal((500, 1)))
    exog = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    endog = exog @ np.ones((3, 1)) + err
    fitted = OLS(endog, exog).fit()
    smsdia.linear_rainbow(fitted, use_distance=True, center=center)
def test_rainbow_exception(reset_randomstate):
    """Invalid order_by values must raise a TypeError."""
    err = pd.DataFrame(np.random.standard_normal((500, 1)))
    exog = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    endog = exog @ np.ones((3, 1)) + err
    fitted = OLS(endog, exog).fit()
    # column label that does not exist in the exog
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(fitted, order_by="x5")
    # column labels are not usable when the model was fit on ndarrays
    fitted_arr = OLS(np.asarray(endog), np.asarray(exog)).fit()
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(fitted_arr, order_by=("x0",))
def test_small_skip(reset_randomstate):
    """A singular initial regressor block must raise a ValueError."""
    endog = np.random.standard_normal(10)
    exog = np.random.standard_normal((10, 3))
    # duplicate the first row so the initial regressor matrix is singular
    exog[:3] = exog[:1]
    fitted = OLS(endog, exog).fit()
    with pytest.raises(ValueError, match="The initial regressor matrix,"):
        smsdia.recursive_olsresiduals(fitted)
# R code used in testing
# J test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
# Estimate Std. Error t value Pr(>|t|)
# M1 + fitted(M2) 1.591505670785873 0.7384552861695823 2.15518 0.0323546 *
# M2 + fitted(M1) 1.305687653016899 0.4808385176653064 2.71544 0.0072039 **
# ---
# Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
#
#
# > fm3 = lm(ginv ~ ggdp + tbilrate)
# > ct = coxtest(fm, fm3)
# > ct
# Cox test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
# Estimate Std. Error z value Pr(>|z|)
# fitted(M1) ~ M2 -0.782030488930356 0.599696502782265 -1.30404 0.19222
# fitted(M2) ~ M1 -2.248817107408537 0.392656854330139 -5.72718 1.0211e-08 ***
# ---
# Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
#
#
#
# > et = encomptest(fm, fm3)
# > et
# Encompassing test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
# Model E: ginv ~ ggdp + lint + tbilrate
# Res.Df Df F Pr(>F)
# M1 vs. ME 198 -1 4.64481 0.0323546 *
# M2 vs. ME 198 -1 7.37361 0.0072039 **
# ---
# Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
#
#
# > fm4 = lm(realinv ~ realgdp + realint, data=d)
# > fm5 = lm(log(realinv) ~ realgdp + realint, data=d)
# > pet = petest(fm4, fm5)
# > pet
# PE test
#
# Model 1: realinv ~ realgdp + realint
# Model 2: log(realinv) ~ realgdp + realint
# Estimate Std. Error t value
# M1 + log(fit(M1))-fit(M2) -229.281878354594596 44.5087822087058598 -5.15139
# M2 + fit(M1)-exp(fit(M2)) 0.000634664704814 0.0000462387010349 13.72583
# Pr(>|t|)
# M1 + log(fit(M1))-fit(M2) 6.2013e-07 ***
# M2 + fit(M1)-exp(fit(M2)) < 2.22e-16 ***
# ---
# Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
@pytest.mark.smoke
def test_diagnostics_pandas(reset_randomstate):
    """Smoke test that the diagnostics accept pandas inputs (GH 8879)."""
    nobs = 100
    frame = pd.DataFrame(
        {
            "y": np.random.rand(nobs),
            "x": np.random.rand(nobs),
            "z": np.random.rand(nobs),
        }
    )
    y, x = frame["y"], add_constant(frame["x"])
    res = OLS(frame["y"], add_constant(frame[["x"]])).fit()
    res_large = OLS(frame["y"], add_constant(frame[["x", "z"]])).fit()
    res_other = OLS(frame["y"], add_constant(frame[["z"]])).fit()
    smsdia.linear_reset(res_large)
    for reset_type in ("fitted", "exog", "princomp"):
        smsdia.linear_reset(res_large, test_type=reset_type)
    smsdia.het_goldfeldquandt(y, x)
    smsdia.het_breuschpagan(res.resid, x)
    smsdia.het_white(res.resid, x)
    smsdia.het_arch(res.resid)
    smsdia.acorr_breusch_godfrey(res)
    smsdia.acorr_ljungbox(y)
    smsdia.linear_rainbow(res)
    smsdia.linear_lm(res.resid, x)
    smsdia.linear_harvey_collier(res)
    smsdia.acorr_lm(res.resid)
    smsdia.breaks_cusumolsresid(res.resid)
    smsdia.breaks_hansen(res)
    smsdia.compare_cox(res, res_other)
    smsdia.compare_encompassing(res, res_other)
    smsdia.compare_j(res, res_other)
    smsdia.recursive_olsresiduals(res)
    # order_by given in reverse observation order
    smsdia.recursive_olsresiduals(
        res, order_by=np.arange(y.shape[0] - 1, 0 - 1, -1)
    )
    smsdia.spec_white(res.resid, x)