"""
|
|
Created on Mon May 05 17:29:56 2014
|
|
|
|
Author: Josef Perktold
|
|
"""
|
|
|
|

import os

import numpy as np
import pandas as pd
import pytest

from numpy.testing import assert_allclose

from statsmodels.regression.linear_model import OLS, GLS
from statsmodels.sandbox.regression.penalized import TheilGLS
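
# For orientation: TheilGLS implements Theil-Goldberger mixed estimation,
# least squares augmented with the stochastic restriction
#     q_matrix = r_matrix @ beta + v,    v ~ N(0, sigma_prior).
# A minimal sketch of the call pattern exercised below (toy names, not part
# of the test suite):
#
#     mod = TheilGLS(y, x, r_matrix=R, q_matrix=q, sigma_prior=cov_prior)
#     res = mod.fit(pen_weight=1., cov_type='data-prior')
#
# Larger pen_weight pulls the estimate more strongly toward the prior mean.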


class TestTheilTextile:

    @classmethod
    def setup_class(cls):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(cur_dir, "results",
                                "theil_textile_predict.csv")
        cls.res_predict = pd.read_csv(filepath, sep=",")

        names = "year lconsump lincome lprice".split()

        data = np.array('''\
1923 1.99651 1.98543 2.00432
1924 1.99564 1.99167 2.00043
1925 2 2 2
1926 2.04766 2.02078 1.95713
1927 2.08707 2.02078 1.93702
1928 2.07041 2.03941 1.95279
1929 2.08314 2.04454 1.95713
1930 2.13354 2.05038 1.91803
1931 2.18808 2.03862 1.84572
1932 2.18639 2.02243 1.81558
1933 2.20003 2.00732 1.78746
1934 2.14799 1.97955 1.79588
1935 2.13418 1.98408 1.80346
1936 2.22531 1.98945 1.72099
1937 2.18837 2.0103 1.77597
1938 2.17319 2.00689 1.77452
1939 2.2188 2.0162 1.78746'''.split(), float).reshape(-1, 4)

        endog = data[:, 1]
        # constant at the end to match Stata
        exog = np.column_stack((data[:, 2:], np.ones(endog.shape[0])))
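
        # The Stata prior below maps onto the TheilGLS arguments as follows:
        # prior means 1 (lincome) and -0.7 (lprice) become q_matrix, the
        # prior standard deviations of 0.15 each and the prior covariance of
        # -0.01 become sigma_prior, and r_matrix picks out the two slope
        # coefficients (the constant is the last column of exog).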
        # prior(lprice -0.7 0.15 lincome 1 0.15) cov(lprice lincome -0.01)
        r_matrix = np.array([[1, 0, 0], [0, 1, 0]])
        r_mean = [1, -0.7]

        cov_r = np.array([[0.15**2, -0.01], [-0.01, 0.15**2]])
        mod = TheilGLS(endog, exog, r_matrix, q_matrix=r_mean,
                       sigma_prior=cov_r)
        cls.res1 = mod.fit(cov_type='data-prior', use_t=True)
        # cls.res1._cache['scale'] = 0.0001852252884817586  # from tg_mixed
        cls.res1._cache['scale'] = 0.00018334123641580062  # from OLS
        from .results import results_theil_textile as resmodule
        cls.res2 = resmodule.results_theil_textile

    def test_basic(self):
        pt = self.res2.params_table[:, :6].T
        params2, bse2, tvalues2, pvalues2, ci_low, ci_upp = pt
        assert_allclose(self.res1.params, params2, rtol=2e-6)

        # TODO: tgmixed seems to use the scale from the initial OLS, not
        # from the final result; see mse_resid below, which equals scale.
        # Earlier candidate values, np.sqrt(res.scale / res_ols.scale):
        # corr_fact = 0.9836026210570028
        # corr_fact = 0.97376865041463734
        corr_fact = 1
        assert_allclose(self.res1.bse / corr_fact, bse2, rtol=2e-6)
        assert_allclose(self.res1.tvalues * corr_fact, tvalues2, rtol=2e-6)
        # pvalues are very small
        # assert_allclose(self.res1.pvalues, pvalues2, atol=2e-6)
        # assert_allclose(self.res1.pvalues, pvalues2, rtol=0.7)
        ci = self.res1.conf_int()
        # not scale corrected
        assert_allclose(ci[:, 0], ci_low, rtol=0.01)
        assert_allclose(ci[:, 1], ci_upp, rtol=0.01)
        assert_allclose(self.res1.rsquared, self.res2.r2, rtol=2e-6)
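
        # The rmse comparison below needs a df correction:
        # mse_resid = SSR / df_resid, so mse_resid * (df_resid / df_r)
        # equals SSR / df_r, the convention used by tgmixed.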
        # Note: tgmixed is using k_exog for df_resid
        corr_fact = self.res1.df_resid / self.res2.df_r
        assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact),
                        self.res2.rmse, rtol=2e-6)

        assert_allclose(self.res1.fittedvalues,
                        self.res_predict['fittedvalues'], atol=5e-7)

    def test_other(self):
        tc = self.res1.test_compatibility()
        assert_allclose(np.squeeze(tc[0]), self.res2.compat, rtol=2e-6)
        assert_allclose(np.squeeze(tc[1]), self.res2.pvalue, rtol=2e-6)

        frac = self.res1.share_data()
        # TODO: check again; tgmixed presumably uses the final scale in the
        # hat matrix.  This passed in a previous version, but we now
        # override scale with the OLS scale.
        # assert_allclose(frac, self.res2.frac_sample, rtol=2e-6)
        # regression test:
        assert_allclose(frac, 0.6946116246864239, rtol=2e-6)

    def test_no_penalization(self):
        res_ols = OLS(self.res1.model.endog, self.res1.model.exog).fit()
        res_theil = self.res1.model.fit(pen_weight=0, cov_type='data-prior')
        assert_allclose(res_theil.params, res_ols.params, rtol=1e-10)
        assert_allclose(res_theil.bse, res_ols.bse, rtol=1e-10)

    @pytest.mark.smoke
    def test_summary(self):
        with pytest.warns(UserWarning):
            self.res1.summary()


class CheckEquivalenceMixin:

    tol = {'default': (1e-4, 1e-20)}
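    # tol maps attribute name -> (rtol, atol) for assert_allclose; subclasses
    # override entries for attributes where the two models agree less closely.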

    @classmethod
    def get_sample(cls):
        np.random.seed(987456)
        nobs, k_vars = 200, 5
        beta = 0.5 * np.array([0.1, 1, 1, 0, 0])
        x = np.random.randn(nobs, k_vars)
        x[:, 0] = 1
        y = np.dot(x, beta) + 2 * np.random.randn(nobs)
        return y, x

    def test_attributes(self):
        attributes_fit = ['params', 'rsquared', 'df_resid', 'df_model',
                          'llf', 'aic', 'bic',
                          # 'fittedvalues', 'resid',  # compared separately
                          ]
        attributes_inference = ['bse', 'tvalues', 'pvalues']
        import copy
        attributes = copy.copy(attributes_fit)

        if not getattr(self, 'skip_inference', False):
            attributes.extend(attributes_inference)

        for att in attributes:
            r1 = getattr(self.res1, att)
            r2 = getattr(self.res2, att)
            if not np.size(r1) == 1:
                r1 = r1[:len(r2)]

            # check if we have overwritten tolerance
            rtol, atol = self.tol.get(att, self.tol['default'])
            message = 'attribute: ' + att  # + '\n%r\n%r' % (r1, r2)
            assert_allclose(r1, r2, rtol=rtol, atol=atol, err_msg=message)

        # models are not close enough for some attributes at high precision,
        # so compare fitted values and residuals at lower tolerance
        assert_allclose(self.res1.fittedvalues, self.res2.fittedvalues,
                        rtol=1e-3, atol=1e-4)
        assert_allclose(self.res1.resid, self.res2.resid,
                        rtol=1e-3, atol=1e-4)


class TestTheil1(CheckEquivalenceMixin):
    # penalize last two parameters to zero
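    # Note: x contains a constant, so the default r_matrix covers the four
    # slope coefficients.  Given the OLS comparison on x[:, :3], the
    # sigma_prior pattern [0, 0, 1, 1] evidently marks the last two slopes
    # as the ones shrunk toward the zero prior mean by the large pen_weight
    # (zero prior variance drops out via the pseudo-inverse).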

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x[:, :3]).fit()


class TestTheil2(CheckEquivalenceMixin):
    # no penalization = same as OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(0)
        cls.res2 = OLS(y, x).fit()


class TestTheil3(CheckEquivalenceMixin):
    # perfect multicollinearity = same as OLS in terms of fit;
    # inference (bse, ...) is different

    @classmethod
    def setup_class(cls):
        cls.skip_inference = True
        y, x = cls.get_sample()
        xd = np.column_stack((x, x))
        # sp = np.zeros(5), np.ones(5)
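        # np.eye(5, 10, 5) is a 5 x 10 selector with ones in columns 5-9,
        # i.e. the restrictions penalize the duplicated block of parameters
        # toward zero, so the fit matches OLS on the original columns.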
        r_matrix = np.eye(5, 10, 5)
        mod1 = TheilGLS(y, xd, r_matrix=r_matrix)
        cls.res1 = mod1.fit(0.001, cov_type='data-prior')
        cls.res2 = OLS(y, x).fit()


class TestTheilGLS(CheckEquivalenceMixin):
    # penalize last two parameters to zero

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        nobs = len(y)
        weights = (np.arange(nobs) < (nobs // 2)) + 0.5
        mod1 = TheilGLS(y, x, sigma=weights, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = GLS(y, x[:, :3], sigma=weights).fit()


class TestTheilLinRestriction(CheckEquivalenceMixin):
    # impose a linear restriction with small uncertainty - close to OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
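
        # worked check: the restriction r_matrix [0, 1, -1] with q = 0
        # imposes beta_1 = beta_2 = b, so y = b * (x_1 + x_2) + ..., i.e.
        # OLS on the merged regressor x2; the huge pen_weight enforces the
        # restriction almost exactly.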
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision; careful: cls.tol is mutable
        tol = {'pvalues': (1e-4, 2e-7),
               'tvalues': (5e-4, 0)}
        tol.update(cls.tol)
        cls.tol = tol


class TestTheilLinRestrictionApprox(CheckEquivalenceMixin):
    # impose a linear restriction with some uncertainty

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(100)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision: with pen_weight=100 the restriction holds only
        # approximately, so tolerances are much looser;
        # careful: cls.tol is mutable
        import copy
        tol = copy.copy(cls.tol)
        tol2 = {'default': (0.15, 0),
                'params': (0.05, 0),
                'pvalues': (0.02, 0.001),
                }
        tol.update(tol2)
        cls.tol = tol


class TestTheilPanel:

    @classmethod
    def setup_class(cls):
        # example 3
        nobs = 300
        nobs_i = 5
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups, seed=303305)
        # add a random intercept, using the same RandomState
        dgp.group_means = 2 + dgp.random_state.randn(n_groups)
        print('seed', dgp.seed)
        y = dgp.generate_panel()
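        # broadcasting groups against np.arange builds one 0/1 dummy column
        # per group, so x holds the two slopes plus a full set of group
        # dummies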
        x = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        cls.dgp = dgp
        cls.endog = y
        cls.exog = x
        cls.res_ols = OLS(y, x).fit()

    def test_regression(self):
        y = self.endog
        x = self.exog
        n_groups, k_vars = self.dgp.n_groups, self.dgp.k_vars

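        # The restrictions shrink the group-specific intercepts toward a
        # common level: Rg demeans the coefficients of the last n_groups - 1
        # dummies, and the -1 entries tie them to the first group's dummy
        # (column k_vars - 1), a random-intercept-style prior with q = 0.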
        Rg = (np.eye(n_groups - 1) - 1. / n_groups *
              np.ones((n_groups - 1, n_groups - 1)))
        R = np.c_[np.zeros((n_groups - 1, k_vars)), Rg]
        r = np.zeros(n_groups - 1)
        R[:, k_vars - 1] = -1

        lambd = 1  # alternative: 1e-4
        mod = TheilGLS(y, x, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()

        # regression test
        params1 = np.array([
            0.9751655, 1.05215277, 0.37135028, 2.0492626, 2.82062503,
            2.82139775, 1.92940468, 2.96942081, 2.86349583, 3.20695368,
            4.04516422, 3.04918839, 4.54748808, 3.49026961, 3.15529618,
            4.25552932, 2.65471759, 3.62328747, 3.07283053, 3.49485898,
            3.42301424, 2.94677593, 2.81549427, 2.24895113, 2.29222784,
            2.89194946, 3.17052308, 2.37754241, 3.54358533, 3.79838425,
            1.91189071, 1.15976407, 4.05629691, 1.58556827, 4.49941666,
            4.08608599, 3.1889269, 2.86203652, 3.06785013, 1.9376162,
            2.90657681, 3.71910592, 3.15607617, 3.58464547, 2.15466323,
            4.87026717, 2.92909833, 2.64998337, 2.891171, 4.04422964,
            3.54616122, 4.12135273, 3.70232028, 3.8314497, 2.2591451,
            2.39321422, 3.13064532, 2.1569678, 2.04667506, 3.92064689,
            3.66243644, 3.11742725])
        assert_allclose(res.params, params1)
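
        # select_pen_weight picks the penalization weight by minimizing the
        # given criterion; the selected values are checked as regression
        # numbers below.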
        pen_weight_aicc = mod.select_pen_weight(method='aicc')
        pen_weight_gcv = mod.select_pen_weight(method='gcv')
        pen_weight_cv = mod.select_pen_weight(method='cv')
        pen_weight_bic = mod.select_pen_weight(method='bic')
        assert_allclose(pen_weight_gcv, pen_weight_aicc, rtol=0.1)
        # regression tests:
        assert_allclose(pen_weight_aicc, 4.77333984, rtol=1e-4)
        assert_allclose(pen_weight_gcv, 4.45546875, rtol=1e-4)
        assert_allclose(pen_weight_bic, 9.35957031, rtol=1e-4)
        assert_allclose(pen_weight_cv, 1.99277344, rtol=1e-4)

    def test_combine_subset_regression(self):
        # split sample into two, use first sample as prior for second
        endog = self.endog
        exog = self.exog
        nobs = len(endog)

        n05 = nobs // 2
        np.random.seed(987125)
        # shuffle to get random subsamples
        shuffle_idx = np.random.permutation(np.arange(nobs))
        ys = endog[shuffle_idx]
        xs = exog[shuffle_idx]
        k = 10
        res_ols0 = OLS(ys[:n05], xs[:n05, :k]).fit()
        res_ols1 = OLS(ys[n05:], xs[n05:, :k]).fit()

        w = res_ols1.scale / res_ols0.scale  # approximately 1.01
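
        # Using the first-half estimate and its covariance as the prior for
        # the second half makes the Theil estimator a precision-weighted
        # combination of the two subsample estimates, which should reproduce
        # OLS on the full sample (checked against res_olsf below).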
        mod_1 = TheilGLS(ys[n05:], xs[n05:, :k], r_matrix=np.eye(k),
                         q_matrix=res_ols0.params,
                         sigma_prior=w * res_ols0.cov_params())
        res_1p = mod_1.fit(cov_type='data-prior')
        res_1s = mod_1.fit(cov_type='sandwich')
        res_olsf = OLS(ys, xs[:, :k]).fit()

        assert_allclose(res_1p.params, res_olsf.params, rtol=1e-9)
        corr_fact = np.sqrt(res_1p.scale / res_olsf.scale)
        # correct for differences in scale computation
        assert_allclose(res_1p.bse, res_olsf.bse * corr_fact, rtol=1e-3)

        # regression test, does not verify the numbers; it is not clear why
        # these are smaller than for OLS on the full sample.  In a larger
        # sample (nobs=600) they were close to full-sample OLS.
        bse1 = np.array([
            0.26589869, 0.15224812, 0.38407399, 0.75679949, 0.66084200,
            0.54174080, 0.53697607, 0.66006377, 0.38228551, 0.53920485])
        assert_allclose(res_1s.bse, bse1, rtol=1e-7)