"""Tests for Regression Diagnostics and Specification Tests Created on Thu Feb 09 13:19:47 2012 Author: Josef Perktold License: BSD-3 currently all tests are against R """ import json import os import numpy as np from numpy.testing import ( assert_, assert_allclose, assert_almost_equal, assert_array_equal, assert_equal, ) import pandas as pd from pandas.testing import assert_frame_equal import pytest from statsmodels.datasets import macrodata, sunspots from statsmodels.regression.linear_model import OLS import statsmodels.stats.diagnostic as smsdia import statsmodels.stats.outliers_influence as oi import statsmodels.stats.sandwich_covariance as sw from statsmodels.tools.tools import Bunch, add_constant from statsmodels.tsa.ar_model import AutoReg from statsmodels.tsa.arima.model import ARIMA cur_dir = os.path.abspath(os.path.dirname(__file__)) @pytest.fixture(scope="module") def diagnostic_data(): rs = np.random.RandomState(93674328) e = rs.standard_normal(500) x = rs.standard_normal((500, 3)) y = x.sum(1) + e c = np.ones_like(y) data = pd.DataFrame(np.c_[y, c, x], columns=["y", "c", "x1", "x2", "x3"]) return data def compare_to_reference(sp, sp_dict, decimal=(12, 12)): assert_allclose( sp[0], sp_dict["statistic"], atol=10 ** -decimal[0], rtol=10 ** -decimal[0], ) assert_allclose( sp[1], sp_dict["pvalue"], atol=10 ** -decimal[1], rtol=10 ** -decimal[0], ) def test_gq(): d = macrodata.load().data realinv = d["realinv"] realgdp = d["realgdp"] realint = d["realint"] endog = realinv exog = add_constant(np.c_[realgdp, realint]) # goldfeld-quandt het_gq_greater = dict( statistic=13.20512768685082, df1=99, df2=98, pvalue=1.246141976112324e-30, distr="f", ) gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5) compare_to_reference(gq, het_gq_greater, decimal=(12, 12)) assert_equal(gq[-1], "increasing") class TestDiagnosticG: @classmethod def setup_class(cls): d = macrodata.load_pandas().data # growth rates gs_l_realinv = 400 * np.diff(np.log(d["realinv"].values)) gs_l_realgdp = 400 * np.diff(np.log(d["realgdp"].values)) lint = d["realint"][:-1].values tbilrate = d["tbilrate"][:-1].values endogg = gs_l_realinv exogg = add_constant(np.c_[gs_l_realgdp, lint]) exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate]) exogg3 = add_constant(np.c_[gs_l_realgdp]) res_ols = OLS(endogg, exogg).fit() res_ols2 = OLS(endogg, exogg2).fit() res_ols3 = OLS(endogg, exogg3).fit() cls.res = res_ols cls.res2 = res_ols2 cls.res3 = res_ols3 cls.endog = cls.res.model.endog cls.exog = cls.res.model.exog def test_basic(self): # mainly to check I got the right regression # > mkarray(fm$coefficients, "params") params = np.array( [-9.48167277465485, 4.3742216647032, -0.613996969478989] ) assert_almost_equal(self.res.params, params, decimal=12) def test_hac(self): res = self.res # > nw = NeweyWest(fm, lag = 4, prewhite = FALSE, verbose=TRUE) # > nw2 = NeweyWest(fm, lag=10, prewhite = FALSE, verbose=TRUE) # > mkarray(nw, "cov_hac_4") cov_hac_4 = np.array( [ 1.385551290884014, -0.3133096102522685, -0.0597207976835705, -0.3133096102522685, 0.1081011690351306, 0.000389440793564336, -0.0597207976835705, 0.000389440793564339, 0.0862118527405036, ] ).reshape((3, 3), order="F") # > mkarray(nw2, "cov_hac_10") cov_hac_10 = np.array( [ 1.257386180080192, -0.2871560199899846, -0.03958300024627573, -0.2871560199899845, 0.1049107028987101, 0.0003896205316866944, -0.03958300024627578, 0.0003896205316866961, 0.0985539340694839, ] ).reshape((3, 3), order="F") cov = sw.cov_hac_simple(res, nlags=4, use_correction=False) bse_hac = sw.se_cov(cov) 
        assert_almost_equal(cov, cov_hac_4, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)

        cov = sw.cov_hac_simple(res, nlags=10, use_correction=False)
        bse_hac = sw.se_cov(cov)
        assert_almost_equal(cov, cov_hac_10, decimal=12)
        assert_almost_equal(bse_hac, np.sqrt(np.diag(cov)), decimal=12)

    def test_het_goldfeldquandt(self):
        # TODO: test options missing

        # > gq = gqtest(fm, alternative='greater')
        # > mkhtest_f(gq, 'het_gq_greater', 'f')
        het_gq_greater = dict(
            statistic=0.5313259064778423,
            pvalue=0.9990217851193723,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, alternative='less')
        # > mkhtest_f(gq, 'het_gq_less', 'f')
        het_gq_less = dict(
            statistic=0.5313259064778423,
            pvalue=0.000978214880627621,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided', 'f')
        het_gq_two_sided = dict(
            statistic=0.5313259064778423,
            pvalue=0.001956429761255241,
            parameters=(98, 98),
            distr="f",
        )

        # > gq = gqtest(fm, fraction=0.1, alternative='two.sided')
        # > mkhtest_f(gq, 'het_gq_two_sided_01', 'f')
        het_gq_two_sided_01 = dict(
            statistic=0.5006976835928314,
            pvalue=0.001387126702579789,
            parameters=(88, 87),
            distr="f",
        )

        endogg, exogg = self.endog, self.exog
        # tests
        gq = smsdia.het_goldfeldquandt(endogg, exogg, split=0.5)
        compare_to_reference(gq, het_gq_greater, decimal=(12, 12))
        assert_equal(gq[-1], "increasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="decreasing"
        )
        compare_to_reference(gq, het_gq_less, decimal=(12, 12))
        assert_equal(gq[-1], "decreasing")

        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=0.5, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")

        # TODO: forcing the same split as R 202-90-90-1=21
        gq = smsdia.het_goldfeldquandt(
            endogg, exogg, split=90, drop=21, alternative="two-sided"
        )
        compare_to_reference(gq, het_gq_two_sided_01, decimal=(12, 12))
        assert_equal(gq[-1], "two-sided")
        # TODO: other options ???

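    # The next tests exercise het_breuschpagan: the studentized (robust=True)
    # LM statistic is nobs * R^2 from the auxiliary regression of the squared
    # residuals on exog; an equivalent F statistic is also returned.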
    def test_het_breusch_pagan(self):
        res = self.res

        bptest = dict(
            statistic=0.709924388395087,
            pvalue=0.701199952134347,
            parameters=(2,),
            distr="f",
        )

        bp = smsdia.het_breuschpagan(res.resid, res.model.exog)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_breusch_pagan_1d_err(self):
        res = self.res
        x = np.asarray(res.model.exog)[:, -1]
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)
        x = np.ones_like(x)
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)
        x = np.asarray(res.model.exog).copy()
        x[:, 0] = 0
        with pytest.raises(ValueError, match="The Breusch-Pagan"):
            smsdia.het_breuschpagan(res.resid, x)

    def test_het_breusch_pagan_nonrobust(self):
        res = self.res

        bptest = dict(
            statistic=1.302014063483341,
            pvalue=0.5215203247110649,
            parameters=(2,),
            distr="f",
        )

        bp = smsdia.het_breuschpagan(res.resid, res.model.exog, robust=False)
        compare_to_reference(bp, bptest, decimal=(12, 12))

    def test_het_white(self):
        res = self.res

        # TODO: regressiontest, compare with Greene or Gretl or Stata
        hw = smsdia.het_white(res.resid, res.model.exog)
        hw_values = (
            33.503722896538441,
            2.9887960597830259e-06,
            7.7945101228430946,
            1.0354575277704231e-06,
        )
        assert_almost_equal(hw, hw_values)

    def test_het_white_error(self):
        res = self.res
        with pytest.raises(ValueError, match="White's heteroskedasticity"):
            smsdia.het_white(res.resid, res.model.exog[:, :1])

    def test_het_arch(self):
        # test het_arch and indirectly het_lm against R
        # > library(FinTS)
        # > at = ArchTest(residuals(fm), lags=4)
        # > mkhtest(at, 'archtest_4', 'chi2')
        archtest_4 = dict(
            statistic=3.43473400836259,
            pvalue=0.487871315392619,
            parameters=(4,),
            distr="chi2",
        )

        # > at = ArchTest(residuals(fm), lags=12)
        # > mkhtest(at, 'archtest_12', 'chi2')
        archtest_12 = dict(
            statistic=8.648320999014171,
            pvalue=0.732638635007718,
            parameters=(12,),
            distr="chi2",
        )

        at4 = smsdia.het_arch(self.res.resid, nlags=4)
        at12 = smsdia.het_arch(self.res.resid, nlags=12)
        compare_to_reference(at4[:2], archtest_4, decimal=(12, 13))
        compare_to_reference(at12[:2], archtest_12, decimal=(12, 13))

    def test_het_arch2(self):
        # test autolag options, this also tests het_lm
        # unfortunately optimal lag=1 for this data
        resid = self.res.resid

        res1 = smsdia.het_arch(resid, nlags=5, store=True)
        rs1 = res1[-1]
        res2 = smsdia.het_arch(resid, nlags=5, store=True)
        rs2 = res2[-1]

        assert_almost_equal(rs2.resols.params, rs1.resols.params, decimal=12)
        assert_almost_equal(res2[:4], res1[:4], decimal=12)

        # check that the default store=False returns the same statistics
        res3 = smsdia.het_arch(resid, nlags=5)
        assert_almost_equal(res3[:4], res1[:4], decimal=12)

    def test_acorr_breusch_godfrey(self):
        res = self.res

        # bgf = bgtest(fm, order = 4, type="F")
        breuschgodfrey_f = dict(
            statistic=1.179280833676792,
            pvalue=0.321197487261203,
            parameters=(4, 195),
            distr="f",
        )

        # > bgc = bgtest(fm, order = 4, type="Chisq")
        # > mkhtest(bgc, "breuschpagan_c", "chi2")
        breuschgodfrey_c = dict(
            statistic=4.771042651230007,
            pvalue=0.3116067133066697,
            parameters=(4,),
            distr="chi2",
        )

        bg = smsdia.acorr_breusch_godfrey(res, nlags=4)
        bg_r = [
            breuschgodfrey_c["statistic"],
            breuschgodfrey_c["pvalue"],
            breuschgodfrey_f["statistic"],
            breuschgodfrey_f["pvalue"],
        ]
        assert_almost_equal(bg, bg_r, decimal=11)

        # check that lag choice works
        bg2 = smsdia.acorr_breusch_godfrey(res, nlags=None)
        bg3 = smsdia.acorr_breusch_godfrey(res, nlags=10)
        assert_almost_equal(bg2, bg3, decimal=12)

    def test_acorr_breusch_godfrey_multidim(self):
        res = Bunch(resid=np.empty((100, 2)))
        with pytest.raises(
            ValueError, match="Model resid must be a 1d array"
        ):
            smsdia.acorr_breusch_godfrey(res)

    def test_acorr_breusch_godfrey_exogs(self):
        data = sunspots.load_pandas().data["SUNACTIVITY"]
        res = ARIMA(data, order=(1, 0, 0), trend="n").fit()
        smsdia.acorr_breusch_godfrey(res, nlags=1)

    def test_acorr_ljung_box(self):
        # unit-test which may be useful later
        # ddof correction for fitted parameters in ARMA(p,q) fitdf=p+q
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box", fitdf=2)
        # mkhtest(bt, "ljung_box_4df2", "chi2")
        # ljung_box_4df2 = dict(statistic=5.23587172795227,
        #                       pvalue=0.0729532930400377,
        #                       parameters=(2,), distr='chi2')
        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce", fitdf=2)
        # mkhtest(bt, "ljung_box_bp_4df2", "chi2")
        # ljung_box_bp_4df2 = dict(statistic=5.12462932741681,
        #                          pvalue=0.0771260128929921,
        #                          parameters=(2,), distr='chi2')

        res = self.res

        # general test
        # bt = Box.test(residuals(fm), lag=4, type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_4", "chi2")
        ljung_box_4 = dict(
            statistic=5.23587172795227,
            pvalue=0.263940335284713,
            parameters=(4,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), lag=4, type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_4", "chi2")
        ljung_box_bp_4 = dict(
            statistic=5.12462932741681,
            pvalue=0.2747471266820692,
            parameters=(4,),
            distr="chi2",
        )

        df = smsdia.acorr_ljungbox(res.resid, 4, boxpierce=True)
        compare_to_reference(
            [df.loc[4, "lb_stat"], df.loc[4, "lb_pvalue"]],
            ljung_box_4,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[4, "bp_stat"], df.loc[4, "bp_pvalue"]],
            ljung_box_bp_4,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_big_default(self):
        res = self.res
        # R test with big dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_none", "chi2")
        ljung_box_none = dict(
            statistic=51.03724531797195, pvalue=0.11334744923390, distr="chi2"
        )

        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_none", "chi2")
        ljung_box_bp_none = dict(
            statistic=45.12238537034000, pvalue=0.26638168491464, distr="chi2"
        )

        lags = min(40, res.resid.shape[0] // 2 - 2)
        df = smsdia.acorr_ljungbox(res.resid, boxpierce=True, lags=lags)
        idx = df.index.max()
        compare_to_reference(
            [df.loc[idx, "lb_stat"], df.loc[idx, "lb_pvalue"]],
            ljung_box_none,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[idx, "bp_stat"], df.loc[idx, "bp_pvalue"]],
            ljung_box_bp_none,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_small_default(self):
        res = self.res
        # R test with small dataset and default lag
        # bt = Box.test(residuals(fm), type = "Ljung-Box")
        # mkhtest(bt, "ljung_box_small", "chi2")
        ljung_box_small = dict(
            statistic=9.61503968281915,
            pvalue=0.72507000996945,
            parameters=(0,),
            distr="chi2",
        )

        # bt = Box.test(residuals(fm), type = "Box-Pierce")
        # mkhtest(bt, "ljung_box_bp_small", "chi2")
        ljung_box_bp_small = dict(
            statistic=7.41692150864936,
            pvalue=0.87940785887006,
            parameters=(0,),
            distr="chi2",
        )

        if isinstance(res.resid, np.ndarray):
            resid = res.resid[:30]
        else:
            resid = res.resid.iloc[:30]
        df = smsdia.acorr_ljungbox(resid, boxpierce=True, lags=13)
        idx = df.index.max()
        compare_to_reference(
            [df.loc[idx, "lb_stat"], df.loc[idx, "lb_pvalue"]],
            ljung_box_small,
            decimal=(12, 12),
        )
        compare_to_reference(
            [df.loc[idx, "bp_stat"], df.loc[idx, "bp_pvalue"]],
            ljung_box_bp_small,
            decimal=(12, 12),
        )

    def test_acorr_ljung_box_against_r(self, reset_randomstate):
        rs = np.random.RandomState(9876543)
        y1 = rs.standard_normal(100)
        e = rs.standard_normal(201)
        y2 = np.zeros_like(e)
        y2[0] = e[0]
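        # ARMA(1, 1) recursion: y2[t] = 0.5 * y2[t-1] - 0.4 * e[t-1] + e[t]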
        for i in range(1, 201):
            y2[i] = 0.5 * y2[i - 1] - 0.4 * e[i - 1] + e[i]
        y2 = y2[-100:]

        r_results_y1_lb = [
            [0.15685, 1, 0.6921],
            [5.4737, 5, 0.3608],
            [10.508, 10, 0.3971],
        ]
        r_results_y2_lb = [
            [2.8764, 1, 0.08989],
            [3.8104, 5, 0.577],
            [8.4779, 10, 0.5823],
        ]

        res_y1 = smsdia.acorr_ljungbox(y1, 10)
        res_y2 = smsdia.acorr_ljungbox(y2, 10)
        for i, loc in enumerate((1, 5, 10)):
            row = res_y1.loc[loc]
            assert_allclose(
                r_results_y1_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y1_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

            row = res_y2.loc[loc]
            assert_allclose(
                r_results_y2_lb[i][0], row.loc["lb_stat"], rtol=1e-3
            )
            assert_allclose(
                r_results_y2_lb[i][2], row.loc["lb_pvalue"], rtol=1e-3
            )

        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True)
        assert_allclose(res.loc[10, "bp_stat"], 7.8935, rtol=1e-3)
        assert_allclose(res.loc[10, "bp_pvalue"], 0.639, rtol=1e-3)

        res = smsdia.acorr_ljungbox(y2, 10, boxpierce=True, model_df=1)
        assert_allclose(res.loc[10, "bp_pvalue"], 0.5449, rtol=1e-3)

    def test_harvey_collier(self):
        # > hc = harvtest(fm, order.by = NULL, data = list())
        # > mkhtest_f(hc, 'harvey_collier', 't')
        harvey_collier = dict(
            statistic=0.494432160939874,
            pvalue=0.6215491310408242,
            parameters=198,
            distr="t",
        )

        hc = smsdia.linear_harvey_collier(self.res)
        compare_to_reference(hc, harvey_collier, decimal=(12, 12))
        hc_skip = smsdia.linear_harvey_collier(self.res, skip=20)
        assert not np.allclose(hc[0], hc_skip[0])

    def test_rainbow(self):
        # rainbow test
        # > rt = raintest(fm)
        # > mkhtest_f(rt, 'raintest', 'f')
        raintest = dict(
            statistic=0.6809600116739604,
            pvalue=0.971832843583418,
            parameters=(101, 98),
            distr="f",
        )

        # > rt = raintest(fm, fraction=0.4)
        # > mkhtest_f(rt, 'raintest_fraction_04', 'f')
        raintest_fraction_04 = dict(
            statistic=0.565551237772662,
            pvalue=0.997592305968473,
            parameters=(122, 77),
            distr="f",
        )

        rb = smsdia.linear_rainbow(self.res)
        compare_to_reference(rb, raintest, decimal=(12, 12))
        rb = smsdia.linear_rainbow(self.res, frac=0.4)
        compare_to_reference(rb, raintest_fraction_04, decimal=(12, 12))

    def test_compare_lr(self):
        res = self.res
        res3 = self.res3  # nested within res
        # lrtest
        # lrt = lrtest(fm, fm2)
        # Model 1: ginv ~ ggdp + lint
        # Model 2: ginv ~ ggdp
        lrtest = dict(
            loglike1=-763.9752181602237,
            loglike2=-766.3091902020184,
            chi2value=4.66794408358942,
            pvalue=0.03073069384028677,
            df=(4, 3, 1),
        )
        lrt = res.compare_lr_test(res3)
        assert_almost_equal(lrt[0], lrtest["chi2value"], decimal=11)
        assert_almost_equal(lrt[1], lrtest["pvalue"], decimal=11)

        waldtest = dict(
            fvalue=4.65216373312492,
            pvalue=0.03221346195239025,
            df=(199, 200, 1),
        )

        wt = res.compare_f_test(res3)
        assert_almost_equal(wt[0], waldtest["fvalue"], decimal=11)
        assert_almost_equal(wt[1], waldtest["pvalue"], decimal=11)

    def test_compare_j(self):
        res = self.res
        res2 = self.res2

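        # Davidson-MacKinnon J test: each model is augmented with the fitted
        # values of the rival model, and the added regressor is t-tested.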
        # jt = jtest(fm, lm(ginv ~ ggdp + tbilrate))
        #                    Estimate   Std. Error   t value   Pr(>|t|)
        jtest = [
            (
                "M1 + fitted(M2)",
                1.591505670785873,
                0.7384552861695823,
                2.155182176352370,
                0.032354572525314450,
                "*",
            ),
            (
                "M2 + fitted(M1)",
                1.305687653016899,
                0.4808385176653064,
                2.715438978051544,
                0.007203854534057954,
                "**",
            ),
        ]

        jt1 = smsdia.compare_j(res2, res)
        assert_almost_equal(jt1, jtest[0][3:5], decimal=12)

        jt2 = smsdia.compare_j(res, res2)
        assert_almost_equal(jt2, jtest[1][3:5], decimal=12)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_error(self, comp, diagnostic_data):
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.x3, data[["c", "x2"]]).fit()
        with pytest.raises(ValueError, match="endogenous variables"):
            comp(res1, res2)

    @pytest.mark.parametrize("comp", [smsdia.compare_cox, smsdia.compare_j])
    def test_compare_nested(self, comp, diagnostic_data):
        data = diagnostic_data
        res1 = OLS(data.y, data[["c", "x1"]]).fit()
        res2 = OLS(data.y, data[["c", "x1", "x2"]]).fit()
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res1, res2)
        with pytest.raises(ValueError, match="The exog in results_x"):
            comp(res2, res1)

    def test_compare_cox(self):
        res = self.res
        res2 = self.res2

        #                    Estimate   Std. Error   z value   Pr(>|z|)
        coxtest = [
            (
                "fitted(M1) ~ M2",
                -0.782030488930356,
                0.599696502782265,
                -1.304043770977755,
                1.922186587840554e-01,
                " ",
            ),
            (
                "fitted(M2) ~ M1",
                -2.248817107408537,
                0.392656854330139,
                -5.727181590258883,
                1.021128495098556e-08,
                "***",
            ),
        ]

        ct1 = smsdia.compare_cox(res, res2)
        assert_almost_equal(ct1, coxtest[0][3:5], decimal=12)

        ct2 = smsdia.compare_cox(res2, res)
        assert_almost_equal(ct2, coxtest[1][3:5], decimal=12)

        _, _, store = smsdia.compare_cox(res, res2, store=True)
        assert isinstance(store, smsdia.ResultsStore)

    def test_cusum_ols(self):
        # R library(strucchange)
        # > sc = sctest(ginv ~ ggdp + lint, type="OLS-CUSUM")
        # > mkhtest(sc, 'cusum_ols', 'BB')
        cusum_ols = dict(
            statistic=1.055750610401214,
            pvalue=0.2149567397376543,
            parameters=(),
            distr="BB",
        )  # Brownian Bridge

        k_vars = 3
        cs_ols = smsdia.breaks_cusumolsresid(self.res.resid, ddof=k_vars)
        compare_to_reference(cs_ols, cusum_ols, decimal=(12, 12))

    def test_breaks_hansen(self):
        # > sc = sctest(ginv ~ ggdp + lint, type="Nyblom-Hansen")
        # > mkhtest(sc, 'breaks_nyblom_hansen', 'BB')
        breaks_nyblom_hansen = dict(
            statistic=1.0300792740544484,
            pvalue=0.1136087530212015,
            parameters=(),
            distr="BB",
        )

        bh = smsdia.breaks_hansen(self.res)
        assert_almost_equal(
            bh[0], breaks_nyblom_hansen["statistic"], decimal=12
        )
        # TODO: breaks_hansen does not return pvalues

    def test_recursive_residuals(self):
        reccumres_standardize = np.array(
            [
                -2.151, -3.748, -3.114, -3.096, -1.865, -2.230,
                -1.194, -3.500, -3.638, -4.447, -4.602, -4.631,
                -3.999, -4.830, -5.429, -5.435, -6.554, -8.093,
                -8.567, -7.532, -7.079, -8.468, -9.320, -12.256,
                -11.932, -11.454, -11.690, -11.318, -12.665, -12.842,
                -11.693, -10.803, -12.113, -12.109, -13.002, -11.897,
                -10.787, -10.159, -9.038, -9.007, -8.634, -7.552,
                -7.153, -6.447, -5.183, -3.794, -3.511, -3.979,
                -3.236, -3.793, -3.699, -5.056, -5.724, -4.888,
                -4.309, -3.688, -3.918, -3.735, -3.452, -2.086,
                -6.520, -7.959, -6.760, -6.855, -6.032, -4.405,
                -4.123, -4.075, -3.235, -3.115, -3.131, -2.986,
                -1.813, -4.824, -4.424, -4.796, -4.000, -3.390,
                -4.485, -4.669, -4.560, -3.834, -5.507, -3.792,
                -2.427, -1.756, -0.354, 1.150, 0.586, 0.643,
                1.773, -0.830, -0.388, 0.517, 0.819, 2.240,
                3.791, 3.187, 3.409, 2.431, 0.668, 0.957,
                -0.928, 0.327, -0.285, -0.625, -2.316, -1.986,
                -0.744, -1.396, -1.728, -0.646, -2.602,
                -2.741, -2.289, -2.897, -1.934, -2.532, -3.175,
                -2.806, -3.099, -2.658, -2.487, -2.515, -2.224,
                -2.416, -1.141, 0.650, -0.947, 0.725, 0.439,
                0.885, 2.419, 2.642, 2.745, 3.506, 4.491,
                5.377, 4.624, 5.523, 6.488, 6.097, 5.390,
                6.299, 6.656, 6.735, 8.151, 7.260, 7.846,
                8.771, 8.400, 8.717, 9.916, 9.008, 8.910,
                8.294, 8.982, 8.540, 8.395, 7.782, 7.794,
                8.142, 8.362, 8.400, 7.850, 7.643, 8.228,
                6.408, 7.218, 7.699, 7.895, 8.725, 8.938,
                8.781, 8.350, 9.136, 9.056, 10.365, 10.495,
                10.704, 10.784, 10.275, 10.389, 11.586, 11.033,
                11.335, 11.661, 10.522, 10.392, 10.521, 10.126,
                9.428, 9.734, 8.954, 9.949, 10.595, 8.016,
                6.636, 6.975,
            ]
        )

        rr = smsdia.recursive_olsresiduals(self.res, skip=3, alpha=0.95)
        assert_equal(np.round(rr[5][1:], 3), reccumres_standardize)
        # extra zero in front
        # assert_equal(np.round(rr[3][4:], 3), np.diff(reccumres_standardize))
        assert_almost_equal(rr[3][4:], np.diff(reccumres_standardize), 3)
        assert_almost_equal(rr[4][3:].std(ddof=1), 10.7242, decimal=4)

        order = np.arange(self.res.model.endog.shape[0])
        rr_order = smsdia.recursive_olsresiduals(
            self.res, skip=3, alpha=0.95, order_by=order
        )
        assert_allclose(rr[0], rr_order[0], atol=1e-6)

        # regression number, visually checked with graph from gretl
        ub0 = np.array(
            [13.37318571, 13.50758959, 13.64199346, 13.77639734, 13.91080121]
        )
        ub1 = np.array(
            [39.44753774, 39.58194162, 39.7163455, 39.85074937, 39.98515325]
        )
        lb, ub = rr[6]
        assert_almost_equal(ub[:5], ub0, decimal=7)
        assert_almost_equal(lb[:5], -ub0, decimal=7)
        assert_almost_equal(ub[-5:], ub1, decimal=7)
        assert_almost_equal(lb[-5:], -ub1, decimal=7)

        # test a few values with explicit OLS
        endog = self.res.model.endog
        exog = self.res.model.exog
        params = []
        ypred = []
        for i in range(3, 10):
            resi = OLS(endog[:i], exog[:i]).fit()
            ypred.append(resi.model.predict(resi.params, exog[i]))
            params.append(resi.params)

        assert_almost_equal(rr[2][3:10], ypred, decimal=12)
        assert_almost_equal(rr[0][3:10], endog[3:10] - ypred, decimal=12)
        assert_almost_equal(rr[1][2:9], params, decimal=12)

    def test_normality(self):
        res = self.res

        # > library(nortest)
        # Lilliefors (Kolmogorov-Smirnov) normality test
        # > lt = lillie.test(residuals(fm))
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors1 = dict(
            statistic=0.0723390908786589,
            pvalue=0.01204113540102896,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)**2)
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors2 = dict(
            statistic=0.301311621898024,
            pvalue=1.004305736618051e-51,
            parameters=(),
            distr="-",
        )

        # > lt = lillie.test(residuals(fm)[1:20])
        # > mkhtest(lt, "lilliefors", "-")
        lilliefors3 = dict(
            statistic=0.1333956004203103,
            pvalue=0.455683,
            parameters=(),
            distr="-",
        )

        lf1 = smsdia.lilliefors(res.resid, pvalmethod="approx")
        lf2 = smsdia.lilliefors(res.resid ** 2, pvalmethod="approx")
        if isinstance(res.resid, np.ndarray):
            resid = res.resid[:20]
        else:
            resid = res.resid.iloc[:20]
        lf3 = smsdia.lilliefors(resid, pvalmethod="approx")

        compare_to_reference(lf1, lilliefors1, decimal=(12, 12))
        # pvalue very small
        compare_to_reference(lf2, lilliefors2, decimal=(12, 12))
        assert_allclose(lf2[1], lilliefors2["pvalue"], rtol=1e-10)
        # R uses a different approximation for the pvalue in the last case
        compare_to_reference(lf3, lilliefors3, decimal=(12, 1))

        # > ad = ad.test(residuals(fm))
        # > mkhtest(ad, "ad3", "-")
        adr1 = dict(
            statistic=1.602209621518313,
            pvalue=0.0003937979149362316,
            parameters=(),
            distr="-",
        )

        # > ad = ad.test(residuals(fm)**2)
        # > mkhtest(ad, "ad3", "-")

        # > ad = ad.test(residuals(fm)[1:20])
        # > mkhtest(ad, "ad3", "-")
        adr3 = dict(
            statistic=0.3017073732210775,
            pvalue=0.5443499281265933,
            parameters=(),
            distr="-",
        )

        ad1 = smsdia.normal_ad(res.resid)
        compare_to_reference(ad1, adr1, decimal=(11, 13))
        ad2 = smsdia.normal_ad(res.resid ** 2)
        assert_(np.isinf(ad2[0]))
        ad3 = smsdia.normal_ad(resid)
        compare_to_reference(ad3, adr3, decimal=(11, 12))

    def test_influence(self):
        res = self.res

        # this test is slow
        infl = oi.OLSInfluence(res)

        path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
        with open(path, encoding="utf-8") as fp:
            lsdiag = json.load(fp)

        # basic
        assert_almost_equal(
            np.array(lsdiag["cov.scaled"]).reshape(3, 3),
            res.cov_params(),
            decimal=12,
        )
        assert_almost_equal(
            np.array(lsdiag["cov.unscaled"]).reshape(3, 3),
            res.normalized_cov_params,
            decimal=12,
        )

        c0, c1 = infl.cooks_distance  # TODO: what's c1
        assert_almost_equal(c0, lsdiag["cooks"], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, lsdiag["hat"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_internal, lsdiag["std.res"], decimal=12
        )

        # slow:
        # infl._get_all_obs()  # slow, nobs estimation loop, called implicitly
        dffits, dffth = infl.dffits
        assert_almost_equal(dffits, lsdiag["dfits"], decimal=12)
        assert_almost_equal(
            infl.resid_studentized_external, lsdiag["stud.res"], decimal=12
        )

        fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
        infl_r = pd.read_csv(fn, index_col=0)
        # not used yet:
        # infl_bool_r = pandas.read_csv(fn, index_col=0,
        #                               converters=dict(zip(lrange(7),[conv]*7)))
        infl_r2 = np.asarray(infl_r)
        assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
        assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)

        # duplicates
        assert_almost_equal(dffits, infl_r2[:, 3], decimal=12)
        assert_almost_equal(c0, infl_r2[:, 5], decimal=12)
        assert_almost_equal(infl.hat_matrix_diag, infl_r2[:, 6], decimal=12)

        # TODO: finish and check thresholds and pvalues
        # Note: for dffits, R uses a threshold around 0.36,
        #       mine: dffits[1]=0.24373
        # R has
        # >>> np.nonzero(np.asarray(infl_bool_r["dffit"]))[0]
        # array([  6,  26,  63,  76,  90, 199])
        # >>> np.nonzero(np.asarray(infl_bool_r["cov.r"]))[0]
        # array([  4,  26,  59,  61,  63,  72,  76,  84,  91,  92,  94,  95,
        #        108, 197, 198])
        # >>> np.nonzero(np.asarray(infl_bool_r["hat"]))[0]
        # array([ 62,  76,  84,  90,  91,  92,  95, 108, 197, 199])


class TestDiagnosticGPandas(TestDiagnosticG):
    @classmethod
    def setup_class(cls):
        d = macrodata.load_pandas().data
        # growth rates
        d["gs_l_realinv"] = 400 * np.log(d["realinv"]).diff()
        d["gs_l_realgdp"] = 400 * np.log(d["realgdp"]).diff()
        d["lint"] = d["realint"].shift(1)
        d["tbilrate"] = d["tbilrate"].shift(1)

        d = d.dropna()
        cls.d = d
        endogg = d["gs_l_realinv"]
        exogg = add_constant(d[["gs_l_realgdp", "lint"]])
        exogg2 = add_constant(d[["gs_l_realgdp", "tbilrate"]])
        exogg3 = add_constant(d[["gs_l_realgdp"]])

        res_ols = OLS(endogg, exogg).fit()
        res_ols2 = OLS(endogg, exogg2).fit()
        res_ols3 = OLS(endogg, exogg3).fit()

        cls.res = res_ols
        cls.res2 = res_ols2
        cls.res3 = res_ols3
        cls.endog = cls.res.model.endog
        cls.exog = cls.res.model.exog


def test_spec_white():
    resdir = os.path.join(cur_dir, "results")
    wsfiles = ["wspec1.csv", "wspec2.csv", "wspec3.csv", "wspec4.csv"]
    for file in wsfiles:
        mdlfile = os.path.join(resdir, file)
        mdl = np.asarray(pd.read_csv(mdlfile))
        # DV is in last column
        lastcol = mdl.shape[1] - 1
        dv = mdl[:, lastcol]
        # create design matrix
        design = np.concatenate(
            (np.ones((mdl.shape[0], 1)), np.delete(mdl, lastcol, 1)), axis=1
        )
        # perform OLS and generate residuals
        resids = dv - np.dot(
            design, np.linalg.lstsq(design, dv, rcond=-1)[0]
        )
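        # spec_white regresses the squared residuals on the regressors, their
        # squares and cross products; it returns the nobs * R^2 statistic,
        # its chi2 p-value and the degrees of freedom.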
        # perform White spec test
        # wspec3/wspec4 contain dummies
        wsres = smsdia.spec_white(resids, design)
        # compare results to SAS 9.3 output
        if file == "wspec1.csv":
            assert_almost_equal(wsres, [3.251, 0.661, 5], decimal=3)
        elif file == "wspec2.csv":
            assert_almost_equal(wsres, [6.070, 0.733, 9], decimal=3)
        elif file == "wspec3.csv":
            assert_almost_equal(wsres, [6.767, 0.454, 7], decimal=3)
        else:
            assert_almost_equal(wsres, [8.462, 0.671, 11], decimal=3)


def test_spec_white_error(reset_randomstate):
    with pytest.raises(ValueError, match="White's specification test "):
        smsdia.spec_white(
            np.random.standard_normal(100),
            np.random.standard_normal((100, 1)),
        )
    with pytest.raises(ValueError, match="White's specification test "):
        smsdia.spec_white(
            np.random.standard_normal(100),
            np.random.standard_normal((100, 2)),
        )


def test_linear_lm_direct(reset_randomstate):
    endog = np.random.standard_normal(500)
    exog = add_constant(np.random.standard_normal((500, 3)))
    res = OLS(endog, exog).fit()
    lm_res = smsdia.linear_lm(res.resid, exog)
    aug = np.hstack([exog, exog[:, 1:] ** 2])
    res_aug = OLS(res.resid, aug).fit()
    stat = res_aug.rsquared * aug.shape[0]
    assert_allclose(lm_res[0], stat)


# TODO: make this a test or move/remove
def grangertest():
    # > gt = grangertest(ginv, ggdp, order=4)
    # > gt
    # Granger causality test
    #
    # Model 1: ggdp ~ Lags(ggdp, 1:4) + Lags(ginv, 1:4)
    # Model 2: ggdp ~ Lags(ggdp, 1:4)
    dict(fvalue=1.589672703015157, pvalue=0.178717196987075, df=(198, 193))


@pytest.mark.smoke
def test_outlier_influence_funcs(reset_randomstate):
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    out_05 = oi.summary_table(res)
    # GH3344 : Check alpha has an effect
    out_01 = oi.summary_table(res, alpha=0.01)
    assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
    assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()


def test_influence_wrapped():
    from pandas import DataFrame

    d = macrodata.load_pandas().data
    # growth rates
    gs_l_realinv = 400 * np.log(d["realinv"]).diff().dropna()
    gs_l_realgdp = 400 * np.log(d["realgdp"]).diff().dropna()
    lint = d["realint"][:-1]

    # re-index these because they will not conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    # order is important
    exog = DataFrame(data, columns=["const", "lrealgdp", "lint"])

    res = OLS(gs_l_realinv, exog).fit()
    # basic
    # already tested
    # assert_almost_equal(lsdiag['cov.scaled'],
    #                     res.cov_params().values.ravel(), decimal=12)
    # assert_almost_equal(lsdiag['cov.unscaled'],
    #                     res.normalized_cov_params.values.ravel(), decimal=12)

    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    # this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, encoding="utf-8") as fp:
        lsdiag = json.load(fp)

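    # lsdiag mirrors R's ls.diag output: Cook's distance, hat values,
    # studentized residuals, dfits and the scaled/unscaled covariances.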
    c0, c1 = infl.cooks_distance  # TODO: what's c1, it's pvalues? -ss

    # NOTE: we get a hard-coded 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag["cooks"], 12)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag["hat"]), 12)
    assert_almost_equal(
        infl.resid_studentized_internal, lsdiag["std.res"], 12
    )

    # slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag["dfits"], 12)
    assert_almost_equal(
        infl.resid_studentized_external, lsdiag["stud.res"], 12
    )

    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pd.read_csv(fn, index_col=0)
    # not used yet:
    # infl_bool_r = pandas.read_csv(fn, index_col=0,
    #                               converters=dict(zip(lrange(7),[conv]*7)))
    infl_r2 = np.asarray(infl_r)
    # TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=12)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=12)


def test_influence_dtype():
    # see #2148 bug when endog is integer
    y = np.ones(20)
    np.random.seed(123)
    x = np.random.randn(20, 3)
    res1 = OLS(y, x).fit()

    res2 = OLS(y * 1.0, x).fit()
    cr1 = res1.get_influence().cov_ratio
    cr2 = res2.get_influence().cov_ratio
    assert_allclose(cr1, cr2, rtol=1e-14)
    # regression test for values
    cr3 = np.array(
        [
            1.22239215, 1.31551021, 1.52671069, 1.05003921, 0.89099323,
            1.57405066, 1.03230092, 0.95844196, 1.15531836, 1.21963623,
            0.87699564, 1.16707748, 1.10481391, 0.98839447, 1.08999334,
            1.35680102, 1.46227715, 1.45966708, 1.13659521, 1.22799038,
        ]
    )
    assert_almost_equal(cr1, cr3, decimal=8)


def get_duncan_data():
    # results from R with NA -> 1. Just testing interface here because
    # outlier_test is just a wrapper
    labels = [
        "accountant", "pilot", "architect", "author", "chemist",
        "minister", "professor", "dentist", "reporter", "engineer",
        "undertaker", "lawyer", "physician", "welfare.worker", "teacher",
        "conductor", "contractor", "factory.owner", "store.manager",
        "banker", "bookkeeper", "mail.carrier", "insurance.agent",
        "store.clerk", "carpenter", "electrician", "RR.engineer",
        "machinist", "auto.repairman", "plumber", "gas.stn.attendant",
        "coal.miner", "streetcar.motorman", "taxi.driver", "truck.driver",
        "machine.operator", "barber", "bartender", "shoe.shiner", "cook",
        "soda.clerk", "watchman", "janitor", "policeman", "waiter",
    ]
    # Duncan's prestige data from car
    exog = [
        [1.0, 62.0, 86.0], [1.0, 72.0, 76.0], [1.0, 75.0, 92.0],
        [1.0, 55.0, 90.0], [1.0, 64.0, 86.0], [1.0, 21.0, 84.0],
        [1.0, 64.0, 93.0], [1.0, 80.0, 100.0], [1.0, 67.0, 87.0],
        [1.0, 72.0, 86.0], [1.0, 42.0, 74.0], [1.0, 76.0, 98.0],
        [1.0, 76.0, 97.0], [1.0, 41.0, 84.0], [1.0, 48.0, 91.0],
        [1.0, 76.0, 34.0], [1.0, 53.0, 45.0], [1.0, 60.0, 56.0],
        [1.0, 42.0, 44.0], [1.0, 78.0, 82.0], [1.0, 29.0, 72.0],
        [1.0, 48.0, 55.0], [1.0, 55.0, 71.0], [1.0, 29.0, 50.0],
        [1.0, 21.0, 23.0], [1.0, 47.0, 39.0], [1.0, 81.0, 28.0],
        [1.0, 36.0, 32.0], [1.0, 22.0, 22.0], [1.0, 44.0, 25.0],
        [1.0, 15.0, 29.0], [1.0, 7.0, 7.0], [1.0, 42.0, 26.0],
        [1.0, 9.0, 19.0], [1.0, 21.0, 15.0], [1.0, 21.0, 20.0],
        [1.0, 16.0, 26.0], [1.0, 16.0, 28.0], [1.0, 9.0, 17.0],
        [1.0, 14.0, 22.0], [1.0, 12.0, 30.0], [1.0, 17.0, 25.0],
        [1.0, 7.0, 20.0], [1.0, 34.0, 47.0], [1.0, 8.0, 32.0],
    ]

    endog = [
        82.0, 83.0, 90.0, 76.0, 90.0, 87.0, 93.0, 90.0, 52.0, 88.0,
        57.0, 89.0, 97.0, 59.0, 73.0, 38.0, 76.0, 81.0, 45.0, 92.0,
        39.0, 34.0, 41.0, 16.0, 33.0, 53.0, 67.0, 57.0, 26.0, 29.0,
        10.0, 15.0, 19.0, 10.0, 13.0, 24.0, 20.0, 7.0, 3.0, 16.0,
        6.0, 11.0, 8.0, 41.0, 10.0,
    ]

    return endog, exog, labels


def test_outlier_test():
    endog, exog, labels = get_duncan_data()
    ndarray_mod = OLS(endog, exog).fit()
    rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359,
        -1.9309187757, 1.8870465798, -1.7604905300, -1.7040324156,
        1.6024285876, -1.4332485037, -1.1044851583, 1.0688582315,
        1.0185271840, -0.9024219332, -0.9023876471, -0.8830953936,
        0.8265782334, 0.8089220547, 0.7682770197, 0.7319491074,
        -0.6665962829, 0.5227352794, -0.5135016547, 0.5083881518,
        0.4999224372, -0.4980818221, -0.4759717075, -0.4293565820,
        -0.4114056499, -0.3779540862, 0.3556874030, 0.3409200462,
        0.3062248646, 0.3038999429, -0.3030815773, -0.1873387893,
        0.1738050251, 0.1424246593, -0.1292266025, 0.1272066463,
        -0.0798902878, 0.0788467222, 0.0722556991, 0.0505098280,
        0.0233215136, 0.0007112055,
    ]
    unadj_p = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645,
        0.066248120, 0.085783008, 0.095943909, 0.116738318,
        0.159368890, 0.275822623, 0.291386358, 0.314400295,
        0.372104049, 0.372122040, 0.382333561, 0.413260793,
        0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317,
        0.621087703, 0.636621083, 0.669911674, 0.682917818,
        0.707414459, 0.723898263, 0.734904667, 0.760983108,
        0.762741124, 0.763360242, 0.852319039, 0.862874018,
        0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948,
        0.999435989,
    ]
    bonf_p = [
        0.1429741, 0.9526634, 2.1344830, 2.7192440,
        2.9811654, 3.8602354, 4.3174759, 5.2532243,
        7.1716001, 12.4120180, 13.1123861, 14.1480133,
        16.7446822, 16.7454918, 17.2050103, 18.5967357,
        19.0453245, 20.1026416, 21.0763395, 22.8943818,
        27.1787396, 27.4660532, 27.6257642, 27.8911043,
        27.9489466, 28.6479487, 30.1460253, 30.7313018,
        31.8336506, 32.5754218, 33.0707100, 34.2442399,
        34.3233506, 34.3512109, 38.3543568, 38.8293308,
        39.9348989, 40.4014601, 40.4729411, 42.1520939,
        42.1892152, 42.4237391, 43.1982627, 44.1678127,
        44.9746195,
    ]
    bonf_p = np.array(bonf_p)
    bonf_p[bonf_p > 1] = 1
    sorted_labels = [
        "minister", "reporter", "contractor", "insurance.agent",
        "machinist", "store.clerk", "conductor", "factory.owner",
        "mail.carrier", "streetcar.motorman", "carpenter", "coal.miner",
        "bartender", "bookkeeper", "soda.clerk", "chemist", "RR.engineer",
        "professor", "electrician", "gas.stn.attendant", "auto.repairman",
        "watchman", "banker", "machine.operator", "dentist", "waiter",
        "shoe.shiner", "welfare.worker", "plumber", "physician", "pilot",
        "engineer", "accountant", "lawyer", "undertaker", "barber",
        "store.manager", "truck.driver", "cook", "janitor", "policeman",
        "architect", "teacher", "taxi.driver", "author",
    ]

    res2 = np.c_[rstudent, unadj_p, bonf_p]
    res = oi.outlier_test(ndarray_mod, method="b", labels=labels, order=True)
    np.testing.assert_almost_equal(res.values, res2, 7)
    np.testing.assert_equal(
        res.index.tolist(), sorted_labels
    )  # pylint: disable-msg=E1103

    data = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns="y const var1 var2".split(),
        index=labels,
    )

    # check `order` with pandas bug in #3971
    res_pd = OLS.from_formula("y ~ const + var1 + var2 - 0", data).fit()

    res_outl2 = oi.outlier_test(res_pd, method="b", order=True)
    assert_almost_equal(res_outl2.values, res2, 7)
    assert_equal(res_outl2.index.tolist(), sorted_labels)

    res_outl1 = res_pd.outlier_test(method="b")
    res_outl1 = res_outl1.sort_values(["unadj_p"], ascending=True)
    assert_almost_equal(res_outl1.values, res2, 7)
    assert_equal(res_outl1.index.tolist(), sorted_labels)
    assert_array_equal(res_outl2.index, res_outl1.index)

    # additional keywords in method
    res_outl3 = res_pd.outlier_test(method="b", order=True)
    assert_equal(res_outl3.index.tolist(), sorted_labels)
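    # cutoff appears to drop observations whose Bonferroni p-value exceeds
    # it; at 0.15 only "minister" (bonf_p ~ 0.143) should remain.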
    res_outl4 = res_pd.outlier_test(method="b", order=True, cutoff=0.15)
    assert_equal(res_outl4.index.tolist(), sorted_labels[:1])


def test_ljungbox_dof_adj():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    res = AutoReg(data, 4, old_names=False).fit()
    resid = res.resid
    res1 = smsdia.acorr_ljungbox(resid, lags=10)
    res2 = smsdia.acorr_ljungbox(resid, lags=10, model_df=4)
    assert_allclose(res1.iloc[:, 0], res2.iloc[:, 0])
    assert np.all(np.isnan(res2.iloc[:4, 1]))
    assert np.all(res2.iloc[4:, 1] <= res1.iloc[4:, 1])


def test_ljungbox_auto_lag_selection(reset_randomstate):
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    res = AutoReg(data, 4, old_names=False).fit()
    resid = res.resid
    res1 = smsdia.acorr_ljungbox(resid, auto_lag=True)
    res2 = smsdia.acorr_ljungbox(resid, model_df=4, auto_lag=True)
    assert_allclose(res1.iloc[:, 0], res2.iloc[:, 0])
    # TODO: compare selected lags with Stata/R to confirm
    #       that the correct auto_lag is selected
    assert res1.shape[0] >= 1
    assert res2.shape[0] >= 1
    assert np.all(np.isnan(res2.iloc[:4, 1]))
    assert np.all(res2.iloc[4:, 1] <= res1.iloc[4:, 1])


def test_ljungbox_auto_lag_whitenoise(reset_randomstate):
    data = np.random.randn(1000)  # white noise process
    res = smsdia.acorr_ljungbox(data, auto_lag=True)
    # TODO: compare selected lags with Stata/R to confirm
    #       that the correct auto_lag is selected
    assert res.shape[0] >= 1  # auto lag selected must be at least 1


def test_ljungbox_errors_warnings():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    with pytest.raises(ValueError, match="model_df must"):
        smsdia.acorr_ljungbox(data, model_df=-1)
    with pytest.raises(ValueError, match="period must"):
        smsdia.acorr_ljungbox(data, model_df=-1, period=1)
    with pytest.raises(ValueError, match="period must"):
        smsdia.acorr_ljungbox(data, model_df=-1, period=-2)
    smsdia.acorr_ljungbox(data)
    ret = smsdia.acorr_ljungbox(data, lags=10)
    assert isinstance(ret, pd.DataFrame)


def test_ljungbox_period():
    data = sunspots.load_pandas().data["SUNACTIVITY"]
    ar_res = AutoReg(data, 4, old_names=False).fit()
    res = smsdia.acorr_ljungbox(ar_res.resid, period=13)
    res2 = smsdia.acorr_ljungbox(ar_res.resid, lags=26)
    assert_frame_equal(res, res2)


@pytest.mark.parametrize("cov_type", ["nonrobust", "HC0"])
def test_encompasing_direct(cov_type, reset_randomstate):
    x = np.random.standard_normal((500, 2))
    e = np.random.standard_normal((500, 1))
    x_extra = np.random.standard_normal((500, 2))
    z_extra = np.random.standard_normal((500, 3))
    y = x @ np.ones((2, 1)) + e
    x1 = np.hstack([x[:, :1], x_extra])
    z1 = np.hstack([x, z_extra])
    res1 = OLS(y, x1).fit()
    res2 = OLS(y, z1).fit()
    df = smsdia.compare_encompassing(res1, res2, cov_type=cov_type)

    direct1 = OLS(y, np.hstack([x1, x[:, 1:], z_extra])).fit(
        cov_type=cov_type
    )
    r1 = np.zeros((4, 3 + 1 + 3))
    r1[:, -4:] = np.eye(4)
    direct_test_1 = direct1.wald_test(r1, use_f=True, scalar=True)
    expected = (
        float(np.squeeze(direct_test_1.statistic)),
        float(np.squeeze(direct_test_1.pvalue)),
        int(direct_test_1.df_num),
        int(direct_test_1.df_denom),
    )
    assert_allclose(np.asarray(df.loc["x"]), expected, atol=1e-8)

    direct2 = OLS(y, np.hstack([z1, x_extra])).fit(cov_type=cov_type)
    r2 = np.zeros((2, 2 + 3 + 2))
    r2[:, -2:] = np.eye(2)
    direct_test_2 = direct2.wald_test(r2, use_f=True, scalar=True)
    expected = (
        float(np.squeeze(direct_test_2.statistic)),
        float(np.squeeze(direct_test_2.pvalue)),
        int(direct_test_2.df_num),
        int(direct_test_2.df_denom),
    )
    assert_allclose(np.asarray(df.loc["z"]), expected, atol=1e-8)
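
# compare_encompassing augments each model with the regressors that are
# unique to the other model and Wald-tests the added block, as replicated
# directly above in test_encompasing_direct.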


def test_encompasing_error(reset_randomstate):
    x = np.random.standard_normal((500, 2))
    e = np.random.standard_normal((500, 1))
    z_extra = np.random.standard_normal((500, 3))
    y = x @ np.ones((2, 1)) + e
    z = np.hstack([x, z_extra])
    res1 = OLS(y, x).fit()
    res2 = OLS(y, z).fit()

    with pytest.raises(ValueError, match="The exog in results_x"):
        smsdia.compare_encompassing(res1, res2)
    with pytest.raises(TypeError, match="results_z must come from a linear"):
        smsdia.compare_encompassing(res1, 2)
    with pytest.raises(TypeError, match="results_x must come from a linear"):
        smsdia.compare_encompassing(4, 2)


@pytest.mark.smoke
@pytest.mark.parametrize("power", [2, 3])
@pytest.mark.parametrize("test_type", ["fitted", "exog", "princomp"])
@pytest.mark.parametrize("use_f", [True, False])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_reset_smoke(power, test_type, use_f, cov, reset_randomstate):
    x = add_constant(np.random.standard_normal((1000, 3)))
    e = np.random.standard_normal((1000, 1))
    x = np.hstack([x, x[:, 1:] ** 2])
    y = x @ np.ones((7, 1)) + e
    res = OLS(y, x[:, :4]).fit()
    smsdia.linear_reset(
        res, power=power, test_type=test_type, use_f=use_f, **cov
    )


@pytest.mark.smoke
@pytest.mark.parametrize("store", [True, False])
@pytest.mark.parametrize("ddof", [0, 2])
@pytest.mark.parametrize(
    "cov",
    [
        dict(cov_type="nonrobust", cov_kwargs={}),
        dict(cov_type="HC0", cov_kwargs={}),
    ],
)
def test_acorr_lm_smoke(store, ddof, cov, reset_randomstate):
    e = np.random.standard_normal(250)
    smsdia.acorr_lm(e, nlags=6, store=store, ddof=ddof, **cov)
    smsdia.acorr_lm(e, nlags=None, store=store, period=12, ddof=ddof, **cov)


def test_acorr_lm_smoke_no_autolag(reset_randomstate):
    e = np.random.standard_normal(250)
    smsdia.acorr_lm(e, nlags=6, store=False, ddof=0)


@pytest.mark.parametrize("frac", [0.25, 0.5, 0.75])
@pytest.mark.parametrize(
    "order_by",
    [
        None,
        np.arange(500),
        np.random.choice(500, size=500, replace=False),
        "x0",
        ["x0", "x2"],
    ],
)
def test_rainbow_smoke_order_by(frac, order_by, reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    smsdia.linear_rainbow(res, frac=frac, order_by=order_by)


@pytest.mark.parametrize("center", [None, 0.33, 300])
def test_rainbow_smoke_centered(center, reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    smsdia.linear_rainbow(res, use_distance=True, center=center)


def test_rainbow_exception(reset_randomstate):
    e = pd.DataFrame(np.random.standard_normal((500, 1)))
    x = pd.DataFrame(
        np.random.standard_normal((500, 3)),
        columns=[f"x{i}" for i in range(3)],
    )
    y = x @ np.ones((3, 1)) + e
    res = OLS(y, x).fit()
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(res, order_by="x5")
    res = OLS(np.asarray(y), np.asarray(x)).fit()
    with pytest.raises(TypeError, match="order_by must contain"):
        smsdia.linear_rainbow(res, order_by=("x0",))


def test_small_skip(reset_randomstate):
    y = np.random.standard_normal(10)
    x = np.random.standard_normal((10, 3))
    x[:3] = x[:1]
    with pytest.raises(ValueError, match="The initial regressor matrix,"):
        smsdia.recursive_olsresiduals(OLS(y, x).fit())


# R code used in testing
# J test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
#                          Estimate         Std. Error t value  Pr(>|t|)
# M1 + fitted(M2) 1.591505670785873 0.7384552861695823 2.15518 0.0323546 *
# M2 + fitted(M1) 1.305687653016899 0.4808385176653064 2.71544 0.0072039 **
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
# > fm3 = lm(ginv ~ ggdp + tbilrate)
# > ct = coxtest(fm, fm3)
# > ct
# Cox test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
#                           Estimate        Std. Error  z value   Pr(>|z|)
# fitted(M1) ~ M2 -0.782030488930356 0.599696502782265 -1.30404    0.19222
# fitted(M2) ~ M1 -2.248817107408537 0.392656854330139 -5.72718 1.0211e-08 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
# > et = encomptest(fm, fm3)
# > et
# Encompassing test
#
# Model 1: ginv ~ ggdp + lint
# Model 2: ginv ~ ggdp + tbilrate
# Model E: ginv ~ ggdp + lint + tbilrate
#           Res.Df Df       F    Pr(>F)
# M1 vs. ME    198 -1 4.64481 0.0323546 *
# M2 vs. ME    198 -1 7.37361 0.0072039 **
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#
# > fm4 = lm(realinv ~ realgdp + realint, data=d)
# > fm5 = lm(log(realinv) ~ realgdp + realint, data=d)
# > pet = petest(fm4, fm5)
# > pet
# PE test
#
# Model 1: realinv ~ realgdp + realint
# Model 2: log(realinv) ~ realgdp + realint
#                                       Estimate          Std. Error  t value
# M1 + log(fit(M1))-fit(M2) -229.281878354594596 44.5087822087058598 -5.15139
# M2 + fit(M1)-exp(fit(M2))    0.000634664704814  0.0000462387010349 13.72583
#                             Pr(>|t|)
# M1 + log(fit(M1))-fit(M2) 6.2013e-07 ***
# M2 + fit(M1)-exp(fit(M2)) < 2.22e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1


@pytest.mark.smoke
def test_diagnostics_pandas(reset_randomstate):
    # GH 8879
    n = 100
    df = pd.DataFrame(
        {
            "y": np.random.rand(n),
            "x": np.random.rand(n),
            "z": np.random.rand(n),
        }
    )
    y, x = df["y"], add_constant(df["x"])
    res = OLS(df["y"], add_constant(df[["x"]])).fit()
    res_large = OLS(df["y"], add_constant(df[["x", "z"]])).fit()
    res_other = OLS(df["y"], add_constant(df[["z"]])).fit()

    smsdia.linear_reset(res_large)
    smsdia.linear_reset(res_large, test_type="fitted")
    smsdia.linear_reset(res_large, test_type="exog")
    smsdia.linear_reset(res_large, test_type="princomp")
    smsdia.het_goldfeldquandt(y, x)
    smsdia.het_breuschpagan(res.resid, x)
    smsdia.het_white(res.resid, x)
    smsdia.het_arch(res.resid)
    smsdia.acorr_breusch_godfrey(res)
    smsdia.acorr_ljungbox(y)
    smsdia.linear_rainbow(res)
    smsdia.linear_lm(res.resid, x)
    smsdia.linear_harvey_collier(res)
    smsdia.acorr_lm(res.resid)
    smsdia.breaks_cusumolsresid(res.resid)
    smsdia.breaks_hansen(res)
    smsdia.compare_cox(res, res_other)
    smsdia.compare_encompassing(res, res_other)
    smsdia.compare_j(res, res_other)
    smsdia.recursive_olsresiduals(res)
    smsdia.recursive_olsresiduals(
        res, order_by=np.arange(y.shape[0] - 1, 0 - 1, -1)
    )
    smsdia.spec_white(res.resid, x)