AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/nonparametric/kernel_regression.py

964 lines
34 KiB
Python
Raw Normal View History

2024-10-02 22:15:59 +04:00
"""
Multivariate Conditional and Unconditional Kernel Density Estimation
with Mixed Data Types
References
----------
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
Princeton University Press. (2007)
[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundations
and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J., Li, Q. "Kernel Estimation of Multivariate Conditional
Distributions." Annals of Economics and Finance 5, 211-235 (2004)
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
cumulative distribution function."
Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
with Categorical and Continuous Data." Working Paper
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
regression" Statistica Sinica 14(2004), pp. 485-512
[8] Racine, J.: "Consistent Significance Testing for Nonparametric
Regression" Journal of Business & Economics Statistics
[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
Categorical Predictor Variables in Nonparametric Regression
Models", 2006, Econometric Reviews 25, 523-544
"""
# TODO: make default behavior efficient=True above a certain n_obs
import copy
import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles
from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \
LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func
# Names exported by ``from <this module> import *``.
__all__ = ['KernelReg', 'KernelCensoredReg']
class KernelReg(GenericKDE):
    """
    Nonparametric kernel regression class.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``.
    Note that the "local constant" type of regression provided here is also
    known as Nadaraya-Watson kernel regression; "local linear" is an extension
    of that which suffers less from bias issues at the edge of the support. Note
    that specifying a custom kernel works only with "local linear" kernel
    regression. For example, a custom ``tricube`` kernel yields LOESS regression.

    Parameters
    ----------
    endog : array_like
        This is the dependent variable.
    exog : array_like
        The training data for the independent variable(s)
        Each element in the list is a separate variable
    var_type : str
        The type of the variables, one character per variable:

            - c: continuous
            - u: unordered (discrete)
            - o: ordered (discrete)

    reg_type : {'lc', 'll'}, optional
        Type of regression estimator. 'lc' means local constant and
        'll' local Linear estimator.  Default is 'll'
    bw : str or array_like, optional
        Either a user-specified bandwidth or the method for bandwidth
        selection. If a string, valid values are 'cv_ls' (least-squares
        cross-validation) and 'aic' (AIC Hurvich bandwidth estimation).
        Default is 'cv_ls'. User specified bandwidth must have as many
        entries as the number of variables.
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation.

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters.
    """

    def __init__(self, endog, exog, var_type, reg_type='ll', bw='cv_ls',
                 ckertype='gaussian', okertype='wangryzin',
                 ukertype='aitchisonaitken', defaults=None):
        """
        Store the data and settings, then compute the bandwidth.

        Raises
        ------
        ValueError
            If one of the kernel names is not a key of ``kernel_func``, or
            if a user-specified `bw` does not have one entry per variable.
        """
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        # First column is endog, remaining columns are exog.
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        # Dispatch table from `reg_type` to the estimator method.
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        if not isinstance(bw, str):
            bw = np.asarray(bw)
            if len(bw) != self.k_vars:
                raise ValueError('bw must have the same dimension as the '
                                 'number of variables.')
        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def _compute_reg_bw(self, bw):
        """
        Return the bandwidth, estimating it when `bw` names a method.

        Parameters
        ----------
        bw : str or array_like
            A user-specified bandwidth (returned as an ndarray) or the name
            of a selection method.  'cv_ls' selects `cv_loo`; any other
            string selects `aic_hurvich`.

        Returns
        -------
        ndarray
            The user-specified or estimated bandwidth.

        Notes
        -----
        Also records the method in ``self._bw_method``.
        """
        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            return np.asarray(bw)
        else:
            # The user specified a bandwidth selection method e.g. 'cv_ls'
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == 'cv_ls':
                res = self.cv_loo
            else:  # bw == 'aic'
                res = self.aic_hurvich
            # Starting values: 1.06 * std * nobs ** (-1 / (4 + k_vars)).
            X = np.std(self.exog, axis=0)
            h0 = 1.06 * X * \
                self.nobs ** (- 1. / (4 + np.size(self.exog, axis=1)))

            func = self.est[self.reg_type]
            bw_estimated = optimize.fmin(res, x0=h0, args=(func, ),
                                         maxiter=1e3, maxfun=1e3, disp=0)
            return bw_estimated

    def _est_loc_linear(self, bw, endog, exog, data_predict):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D array_like of length K, where K is the number of variables.
            The point at which the density is estimated.

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at `data_predict`.
        mfx : ndarray
            The marginal effects (slope coefficients of the local fit).

        Notes
        -----
        See p. 81 in [1] and p.38 in [2] for the formulas.
        Unlike other methods, this one requires that `data_predict` be 1D.
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype,
                   tosum=False) / float(nobs)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]
        # All variables (not only the continuous ones) enter the local
        # linear expansion, so no column selection is applied to exog.

        # Convert ker to a 2-D array to make matrix operations below work
        ker = ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        # The pseudo-inverse is used to solve the weighted normal equations.
        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def _est_loc_constant(self, bw, endog, exog, data_predict):
        """
        Local constant estimator of g(x) in the regression
        y = g(x) + e

        Parameters
        ----------
        bw : array_like
            Array of bandwidth value(s).
        endog : 1D array_like
            The dependent variable.
        exog : 1D or 2D array_like
            The independent variable(s).
        data_predict : 1D or 2D array_like
            The point(s) at which the density is estimated.

        Returns
        -------
        G : ndarray
            The value of the conditional mean at `data_predict`.
        B_x : ndarray
            The marginal effects.
        """
        ker_x = gpke(bw, data=exog, data_predict=data_predict,
                     var_type=self.var_type,
                     ckertype=self.ckertype,
                     ukertype=self.ukertype,
                     okertype=self.okertype,
                     tosum=False)
        ker_x = np.reshape(ker_x, np.shape(endog))

        # Kernel-weighted average of endog.
        G_numer = (ker_x * endog).sum(axis=0)
        G_denom = ker_x.sum(axis=0)
        G = G_numer / G_denom

        nobs = exog.shape[0]
        # Second kernel pass with the 'd_gaussian' continuous kernel; the
        # discrete kernels are left at gpke's defaults here.
        ker_xc = gpke(bw, data=exog, data_predict=data_predict,
                      var_type=self.var_type,
                      ckertype='d_gaussian',
                      tosum=False)
        ker_xc = ker_xc[:, np.newaxis]
        d_mx = -(endog * ker_xc).sum(axis=0) / float(nobs)
        d_fx = -ker_xc.sum(axis=0) / float(nobs)
        # A previous, immediately overwritten assignment to B_x (and the
        # f_x value it alone used) was dead code and has been removed.
        B_x = (G_numer * d_fx - G_denom * d_mx) / (G_denom**2)
        return G, B_x

    def aic_hurvich(self, bw, func=None):
        """
        Computes the AIC Hurvich criteria for the estimation of the bandwidth.

        Parameters
        ----------
        bw : str or array_like
            See the ``bw`` parameter of `KernelReg` for details.
        func : None
            Unused here, needed in signature because it's used in `cv_loo`.

        Returns
        -------
        aic : ndarray
            The AIC Hurvich criteria.

        References
        ----------
        See ch.2 in [1] and p.35 in [2].
        """
        H = np.empty((self.nobs, self.nobs))
        for j in range(self.nobs):
            H[:, j] = gpke(bw, data=self.exog, data_predict=self.exog[j, :],
                           ckertype=self.ckertype, ukertype=self.ukertype,
                           okertype=self.okertype, var_type=self.var_type,
                           tosum=False)

        denom = H.sum(axis=1)
        H = H / denom
        # Fit with the same kernels that were used to build H; otherwise the
        # residual variance and trace(H) would describe different smoothers
        # whenever non-default kernels are configured.
        gx = KernelReg(endog=self.endog, exog=self.exog,
                       var_type=self.var_type, reg_type=self.reg_type, bw=bw,
                       ckertype=self.ckertype, okertype=self.okertype,
                       ukertype=self.ukertype,
                       defaults=EstimatorSettings(efficient=False)).fit()[0]
        gx = np.reshape(gx, (self.nobs, 1))
        sigma = ((self.endog - gx)**2).sum(axis=0) / float(self.nobs)

        trace_H = np.trace(H)
        frac = (1 + trace_H / float(self.nobs)) / \
            (1 - (trace_H + 2) / float(self.nobs))
        aic = np.log(sigma) + frac
        return aic

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable function
            Returns the estimator of g(x).  Can be either ``_est_loc_constant``
            (local constant) or ``_est_loc_linear`` (local_linear).

        Returns
        -------
        L : float
            The value of the CV function.

        Notes
        -----
        Calculates the cross-validation least-squares function. This function
        is minimized by compute_bw to calculate the optimal value of `bw`.

        For details see p.35 in [2]

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths
        """
        L = 0
        loo_pairs = zip(LeaveOneOut(self.exog), LeaveOneOut(self.endog))
        for ii, (X_not_i, Y_not_i) in enumerate(loo_pairs):
            # Both the sample and the evaluation point are negated, exactly
            # as before; only the mean (element 0) of the result is used.
            G = func(bw, endog=Y_not_i, exog=-X_not_i,
                     data_predict=-self.exog[ii, :])[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def r_squared(self):
        r"""
        Returns the R-Squared for the nonparametric regression.

        Notes
        -----
        For more details see p.45 in [2]
        The R-Squared is calculated by:

        .. math:: R^{2}=\frac{\left[\sum_{i=1}^{n}
            (Y_{i}-\bar{y})(\hat{Y_{i}}-\bar{y})\right]^{2}}{\sum_{i=1}^{n}
            (Y_{i}-\bar{y})^{2}\sum_{i=1}^{n}(\hat{Y_{i}}-\bar{y})^{2}},

        where :math:`\hat{Y_{i}}` is the mean calculated in `fit` at the exog
        points.
        """
        Y = np.squeeze(self.endog)
        Yhat = self.fit()[0]
        # NOTE(review): the centering value is the mean of the fitted
        # values, not of Y -- confirm against the reference formula.
        Y_bar = np.mean(Yhat)
        R2_numer = (((Y - Y_bar) * (Yhat - Y_bar)).sum())**2
        R2_denom = ((Y - Y_bar)**2).sum(axis=0) * \
            ((Yhat - Y_bar)**2).sum(axis=0)
        return R2_numer / R2_denom

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the `data_predict` points.

        Parameters
        ----------
        data_predict : array_like, optional
            Points at which to return the mean and marginal effects.  If not
            given, ``data_predict == exog``.

        Returns
        -------
        mean : ndarray
            The regression result for the mean (i.e. the actual curve).
        mfx : ndarray
            The marginal effects, i.e. the partial derivatives of the mean.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        # The estimators are evaluated one prediction point at a time.
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :])
            mean[i] = np.squeeze(mean_mfx[0])
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx

    def sig_test(self, var_pos, nboot=50, nested_res=25, pivot=False):
        """
        Significance test for the variables in the regression.

        Parameters
        ----------
        var_pos : sequence
            The position of the variable in exog to be tested.
        nboot : int, optional
            Number of bootstrap samples passed to the test class.
        nested_res : int, optional
            Number of nested resamples; passed on only for the test of
            continuous variables.
        pivot : bool, optional
            Whether to pivot the statistic; passed on only for the test of
            continuous variables.

        Returns
        -------
        sig : str
            The level of significance:

                - `*` : at 90% confidence level
                - `**` : at 95% confidence level
                - `***` : at 99% confidence level
                - "Not Significant" : if not significant

        Raises
        ------
        ValueError
            If `var_pos` mixes continuous and discrete variables.
        """
        var_pos = np.asarray(var_pos)
        ix_cont, ix_ord, ix_unord = _get_type_pos(self.var_type)
        if np.any(ix_cont[var_pos]):
            if np.any(ix_ord[var_pos]) or np.any(ix_unord[var_pos]):
                raise ValueError("Discrete variable in hypothesis. Must be continuous")

            Sig = TestRegCoefC(self, var_pos, nboot, nested_res, pivot)
        else:
            Sig = TestRegCoefD(self, var_pos, nboot)

        return Sig.sig

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   N = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _get_class_vars_type(self):
        """Helper method to be able to pass needed vars to _compute_subset."""
        class_type = 'KernelReg'
        class_vars = (self.var_type, self.k_vars, self.reg_type)
        return class_type, class_vars

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion.

        The minimum of the standard deviation and interquartile range / 1.349

        References
        ----------
        See the user guide for the np package in R.
        In the notes on bwscaling option in npreg, npudens, npcdens there is
        a discussion on the measure of dispersion
        """
        # Drop the first column before measuring dispersion (self.data is
        # built with endog in column 0 followed by exog).
        data = data[:, 1:]
        return _compute_min_std_IQR(data)
class KernelCensoredReg(KernelReg):
    """
    Nonparametric censored regression.

    Calculates the conditional mean ``E[y|X]`` where ``y = g(X) + e``,
    where y is left-censored.  Left censored variable Y is defined as
    ``Y = min {Y', L}`` where ``L`` is the value at which ``Y`` is censored
    and ``Y'`` is the true value of the variable.

    Parameters
    ----------
    endog : list with one element which is array_like
        This is the dependent variable.
    exog : list
        The training data for the independent variable(s)
        Each element in the list is a separate variable
    var_type : str
        The type of the variables, one character per variable:

            - c: Continuous
            - u: Unordered (Discrete)
            - o: Ordered (Discrete)

    reg_type : str
        Type of regression estimator

            - lc: Local Constant Estimator
            - ll: Local Linear Estimator

        NOTE(review): only ``_est_loc_linear`` is overridden here to accept
        the censoring weights ``W``; 'lc' appears unsupported -- confirm.
    bw : str or array_like
        Either a user-specified bandwidth or
        the method for bandwidth selection.

            - cv_ls: cross-validation least squares
            - aic: AIC Hurvich Estimator

    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    censor_val : float
        Value at which the dependent variable is censored.  If None, every
        observation gets a weight of one.
    defaults : EstimatorSettings instance, optional
        The default values for the efficient bandwidth estimation

    Attributes
    ----------
    bw : array_like
        The bandwidth parameters
    """

    def __init__(self, endog, exog, var_type, reg_type, bw='cv_ls',
                 ckertype='gaussian',
                 ukertype='aitchison_aitken_reg',
                 okertype='wangryzin_reg',
                 censor_val=0, defaults=None):
        """
        Store the data and settings, build the weights, compute the bandwidth.

        Raises
        ------
        ValueError
            If one of the kernel names is not a key of ``kernel_func``.
        """
        self.var_type = var_type
        self.data_type = var_type
        self.reg_type = reg_type
        self.ckertype = ckertype
        self.okertype = okertype
        self.ukertype = ukertype
        if not (self.ckertype in kernel_func and self.ukertype in kernel_func
                and self.okertype in kernel_func):
            raise ValueError('user specified kernel must be a supported '
                             'kernel from statsmodels.nonparametric.kernels.')

        self.k_vars = len(self.var_type)
        self.endog = _adjust_shape(endog, 1)
        self.exog = _adjust_shape(exog, self.k_vars)
        self.data = np.column_stack((self.endog, self.exog))
        self.nobs = np.shape(self.exog)[0]
        self.est = dict(lc=self._est_loc_constant, ll=self._est_loc_linear)
        defaults = EstimatorSettings() if defaults is None else defaults
        self._set_defaults(defaults)
        self.censor_val = censor_val
        if self.censor_val is not None:
            self.censored(censor_val)
        else:
            self.W_in = np.ones((self.nobs, 1))

        if not self.efficient:
            self.bw = self._compute_reg_bw(bw)
        else:
            self.bw = self._compute_efficient(bw)

    def censored(self, censor_val):
        """
        Sort the sample by `endog` and compute the censoring weights.

        Sets ``self.d`` (1.0 where `endog` differs from `censor_val`, else
        0.0), ``self.sortix`` / ``self.sortix_rev`` (the sort permutation and
        its inverse), reorders ``self.endog`` and ``self.exog`` accordingly,
        and fills ``self.W_in`` with one weight per observation.

        Parameters
        ----------
        censor_val : float
            Value at which the dependent variable is censored.

        Notes
        -----
        See pp. 341-344 in [1].
        """
        self.d = (self.endog != censor_val) * 1.
        ix = np.argsort(np.squeeze(self.endog))
        self.sortix = ix
        self.sortix_rev = np.zeros(ix.shape, int)
        self.sortix_rev[ix] = np.arange(len(ix))
        self.endog = np.squeeze(self.endog[ix])
        self.endog = _adjust_shape(self.endog, 1)
        # Keep exog two-dimensional: squeezing it would collapse a single
        # regressor to 1-D and break the ``exog.shape`` unpacking and the
        # ``data_predict[i, :]`` indexing done by the estimators.
        self.exog = self.exog[ix]
        self.d = np.squeeze(self.d[ix])
        self.W_in = np.empty((self.nobs, 1))
        nobs = self.nobs
        # Running product over the preceding observations; same factors and
        # multiplication order as the former nested loop, in a single pass.
        P = 1
        for i in range(1, nobs + 1):
            self.W_in[i - 1, 0] = P * self.d[i - 1] / (float(nobs) - i + 1)
            P *= ((nobs - i) / (float(nobs) - i + 1)) ** self.d[i - 1]

    def __repr__(self):
        """Provide something sane to print."""
        rpr = "KernelCensoredReg instance\n"
        rpr += "Number of variables: k_vars = " + str(self.k_vars) + "\n"
        rpr += "Number of samples:   nobs = " + str(self.nobs) + "\n"
        rpr += "Variable types:      " + self.var_type + "\n"
        rpr += "BW selection method: " + self._bw_method + "\n"
        rpr += "Estimator type: " + self.reg_type + "\n"
        return rpr

    def _est_loc_linear(self, bw, endog, exog, data_predict, W):
        """
        Local linear estimator of g(x) in the regression ``y = g(x) + e``.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth value(s)
        endog : 1D array_like
            The dependent variable
        exog : 1D or 2D array_like
            The independent variable(s)
        data_predict : 1D array_like of length K, where K is
            the number of variables. The point at which
            the density is estimated
        W : array_like
            Weights multiplied into the kernel weights, one per observation.

        Returns
        -------
        mean : ndarray
            The value of the conditional mean at data_predict
        mfx : ndarray
            The marginal effects (slope coefficients of the local fit).

        Notes
        -----
        See p. 81 in [1] and p.38 in [2] for the formulas
        Unlike other methods, this one requires that data_predict be 1D
        """
        nobs, k_vars = exog.shape
        ker = gpke(bw, data=exog, data_predict=data_predict,
                   var_type=self.var_type,
                   ckertype=self.ckertype,
                   ukertype=self.ukertype,
                   okertype=self.okertype, tosum=False)
        # Create the matrix on p.492 in [7], after the multiplication w/ K_h,ij
        # See also p. 38 in [2]

        # Convert ker to a 2-D array to make matrix operations below work
        ker = W * ker[:, np.newaxis]

        M12 = exog - data_predict
        M22 = np.dot(M12.T, M12 * ker)
        M12 = (M12 * ker).sum(axis=0)
        M = np.empty((k_vars + 1, k_vars + 1))
        M[0, 0] = ker.sum()
        M[0, 1:] = M12
        M[1:, 0] = M12
        M[1:, 1:] = M22

        ker_endog = ker * endog
        V = np.empty((k_vars + 1, 1))
        V[0, 0] = ker_endog.sum()
        V[1:, 0] = ((exog - data_predict) * ker_endog).sum(axis=0)

        mean_mfx = np.dot(np.linalg.pinv(M), V)
        mean = mean_mfx[0]
        mfx = mean_mfx[1:, :]
        return mean, mfx

    def cv_loo(self, bw, func):
        r"""
        The cross-validation function with leave-one-out
        estimator

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values
        func : callable function
            Returns the estimator of g(x).
            Can be either ``_est_loc_constant`` (local constant) or
            ``_est_loc_linear`` (local_linear).

        Returns
        -------
        L : float
            The value of the CV function

        Notes
        -----
        Calculates the cross-validation least-squares
        function. This function is minimized by compute_bw
        to calculate the optimal value of bw

        For details see p.35 in [2]

        .. math:: CV(h)=n^{-1}\sum_{i=1}^{n}(Y_{i}-g_{-i}(X_{i}))^{2}

        where :math:`g_{-i}(X_{i})` is the leave-one-out estimator of g(X)
        and :math:`h` is the vector of bandwidths
        """
        L = 0
        loo_triples = zip(LeaveOneOut(self.exog), LeaveOneOut(self.endog),
                          LeaveOneOut(self.W_in))
        for ii, (X_not_i, Y_not_i, W_not_i) in enumerate(loo_triples):
            # Both the sample and the evaluation point are negated, exactly
            # as before; only the mean (element 0) of the result is used.
            G = func(bw, endog=Y_not_i, exog=-X_not_i,
                     data_predict=-self.exog[ii, :], W=W_not_i)[0]
            L += (self.endog[ii] - G) ** 2

        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def fit(self, data_predict=None):
        """
        Returns the mean and marginal effects at the data_predict points.

        Parameters
        ----------
        data_predict : array_like, optional
            Points at which to evaluate the estimator.  If not given, the
            (sorted) ``exog`` of the model is used.

        Returns
        -------
        mean : ndarray
            The regression result for the mean.
        mfx : ndarray
            The marginal effects.
        """
        func = self.est[self.reg_type]
        if data_predict is None:
            data_predict = self.exog
        else:
            data_predict = _adjust_shape(data_predict, self.k_vars)

        N_data_predict = np.shape(data_predict)[0]
        mean = np.empty((N_data_predict,))
        mfx = np.empty((N_data_predict, self.k_vars))
        for i in range(N_data_predict):
            mean_mfx = func(self.bw, self.endog, self.exog,
                            data_predict=data_predict[i, :],
                            W=self.W_in)
            mean[i] = np.squeeze(mean_mfx[0])
            mfx_c = np.squeeze(mean_mfx[1])
            mfx[i, :] = mfx_c

        return mean, mfx
class TestRegCoefC:
    """
    Significance test for continuous variables in a nonparametric regression.

    The null hypothesis is ``dE(Y|X)/dX_not_i = 0``, the alternative hypothesis
    is ``dE(Y|X)/dX_not_i != 0``.

    Parameters
    ----------
    model : KernelReg instance
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of integers, array_like
        index of position of the continuous variables to be tested
        for significance. E.g. (1,3,5) jointly tests variables at
        position 1,3 and 5 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400
    nested_res : int
        Number of nested resamples used to calculate lambda.
        Must enable the pivot option
    pivot : bool
        Pivot the test statistic by dividing by its standard error
        Significantly increases computational time. But pivot statistics
        have more desirable properties
        (See references)

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested
        "Not Significant": Not significant at the 90% confidence level
                            Fails to reject the null
        "*": Significant at the 90% confidence level
        "**": Significant at the 95% confidence level
        "***": Significant at the 99% confidence level

    Notes
    -----
    This class allows testing of joint hypothesis as long as all variables
    are continuous.

    References
    ----------
    Racine, J.: "Consistent Significance Testing for Nonparametric Regression"
    Journal of Business & Economics Statistics.

    Chapter 12 in [1].
    """

    def __init__(self, model, test_vars, nboot=400, nested_res=400,
                 pivot=False):
        """Copy the settings and model data, then run the test."""
        self.nboot = nboot
        self.nres = nested_res
        self.test_vars = test_vars
        self.model = model
        self.bw = model.bw
        self.var_type = model.var_type
        self.k_vars = len(self.var_type)
        self.endog = model.endog
        self.exog = model.exog
        self.gx = model.est[model.reg_type]
        self.pivot = pivot
        self.run()

    def run(self):
        """Compute the sample test statistic, then its significance."""
        self.test_stat = self._compute_test_stat(self.endog, self.exog)
        self.sig = self._compute_sig()

    def _compute_test_stat(self, Y, X):
        """
        Computes the test statistic.  See p.371 in [8].

        Returns lambda itself, or lambda divided by its resampled standard
        error when ``self.pivot`` is set.
        """
        lam = self._compute_lambda(Y, X)
        if not self.pivot:
            return lam
        return lam / float(self._compute_se_lambda(Y, X))

    def _compute_lambda(self, Y, X):
        """Computes only lambda -- the main part of the test statistic"""
        n = np.shape(X)[0]
        Y = _adjust_shape(Y, 1)
        X = _adjust_shape(X, self.k_vars)
        refit = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                          defaults=EstimatorSettings(efficient=False))
        # Marginal effects of the tested variables, one row per observation.
        effects = refit.fit()[1][:, self.test_vars]
        effects = np.reshape(effects, (n, len(self.test_vars)))
        # Mean squared marginal effect; no scaling factor is applied here.
        return (effects ** 2).sum() / float(n)

    def _compute_se_lambda(self, Y, X):
        """
        Calculates the SE of lambda by nested resampling
        Used to pivot the statistic.
        Bootstrapping works better with estimating pivotal statistics
        but slows down computation significantly.
        """
        n = np.shape(Y)[0]
        draws = np.empty(shape=(self.nres,))
        for k in range(self.nres):
            # Resample observations with replacement.
            rows = np.random.randint(0, n, size=(n, 1))
            draws[k] = self._compute_lambda(Y[rows, 0], X[rows, :])
        return np.std(draws)

    def _compute_sig(self):
        """
        Computes the significance value for the variable(s) tested.

        The empirical distribution of the test statistic is obtained through
        bootstrapping the sample.  The null hypothesis is rejected if the test
        statistic is larger than the 90, 95, 99 percentiles.
        """
        t_dist = np.empty(shape=(self.nboot, ))
        Y = self.endog
        n = np.shape(Y)[0]
        # Restricted regressors: tested columns replaced by their means.
        X_restricted = copy.deepcopy(self.exog)
        X_restricted[:, self.test_vars] = np.mean(
            X_restricted[:, self.test_vars], axis=0)
        # Calculate the restricted mean. See p. 372 in [8]
        restricted = KernelReg(Y, X_restricted, self.var_type,
                               self.model.reg_type, self.bw,
                               defaults=EstimatorSettings(efficient=False))
        M = np.reshape(restricted.fit()[0], (n, 1))
        resid = Y - M
        resid = resid - np.mean(resid)  # recenter residuals
        for b in range(self.nboot):
            rows = np.random.randint(0, n, size=(n, 1))
            Y_boot = M + resid[rows, 0]
            t_dist[b] = self._compute_test_stat(Y_boot, self.exog)
        self.t_dist = t_dist

        # Each level is checked in increasing order; the last one exceeded
        # determines the label.
        sig = "Not Significant"
        for prob, stars in ((0.9, "*"), (0.95, "**"), (0.99, "***")):
            if self.test_stat > mquantiles(t_dist, prob):
                sig = stars
        return sig
class TestRegCoefD(TestRegCoefC):
    """
    Significance test for the categorical variables in a nonparametric
    regression.

    Parameters
    ----------
    model : Instance of KernelReg class
        This is the nonparametric regression model whose elements
        are tested for significance.
    test_vars : tuple, list of one element
        index of position of the discrete variable to be tested
        for significance. E.g. (3) tests variable at
        position 3 for significance.
    nboot : int
        Number of bootstrap samples used to determine the distribution
        of the test statistic in a finite sample. Default is 400

    Attributes
    ----------
    sig : str
        The significance level of the variable(s) tested
        "Not Significant": Not significant at the 90% confidence level
                            Fails to reject the null
        "*": Significant at the 90% confidence level
        "**": Significant at the 95% confidence level
        "***": Significant at the 99% confidence level
    t_dist : ndarray
        The bootstrap distribution of the test statistic, shape (nboot, 1).

    Notes
    -----
    This class currently does not allow joint hypothesis.
    Only one variable can be tested at a time

    References
    ----------
    See [9] and chapter 12 in [1].
    """

    def _compute_test_stat(self, Y, X):
        """
        Computes the test statistic.

        The statistic is the average, over observations, of the summed
        squared differences between the conditional mean with the tested
        variable set to each non-baseline category and the conditional mean
        with it set to the baseline category.  The baseline is the smallest
        observed category, so the statistic is also well defined when the
        categories do not start at 0.
        """
        # np.unique already returns the categories in sorted order.
        dom_x = np.unique(self.exog[:, self.test_vars])

        n = np.shape(X)[0]
        model = KernelReg(Y, X, self.var_type, self.model.reg_type, self.bw,
                          defaults=EstimatorSettings(efficient=False))
        X1 = copy.deepcopy(X)
        # Baseline: smallest observed category (identical to the former
        # hard-coded 0 whenever the categories start at 0).
        X1[:, self.test_vars] = dom_x[0]

        m0 = model.fit(data_predict=X1)[0]
        m0 = np.reshape(m0, (n, 1))
        zvec = np.zeros((n, 1))
        for category in dom_x[1:]:
            X1[:, self.test_vars] = category
            m1 = model.fit(data_predict=X1)[0]
            m1 = np.reshape(m1, (n, 1))
            zvec += (m1 - m0) ** 2

        avg = zvec.sum(axis=0) / float(n)
        return avg

    def _compute_sig(self):
        """
        Calculates the significance level of the variable tested.

        Residuals around the category-averaged conditional mean are
        recentered and resampled with a two-point scheme (each residual is
        multiplied by one of two fixed factors), and the test statistic is
        recomputed on every bootstrap sample.
        """
        m = self._est_cond_mean()
        Y = self.endog
        X = self.exog
        n = np.shape(X)[0]
        u = Y - m
        u = u - np.mean(u)  # center
        # Two-point factors and the probability of drawing the first one.
        fct1 = (1 - 5**0.5) / 2.
        fct2 = (1 + 5**0.5) / 2.
        u1 = fct1 * u
        u2 = fct2 * u
        r = fct2 / (5 ** 0.5)
        I_dist = np.empty((self.nboot, 1))
        for j in range(self.nboot):
            u_boot = copy.deepcopy(u2)

            prob = np.random.uniform(0, 1, size=(n, 1))
            ind = prob < r
            u_boot[ind] = u1[ind]
            Y_boot = m + u_boot
            I_dist[j] = self._compute_test_stat(Y_boot, X)

        # Expose the bootstrap distribution, as the parent class does.
        self.t_dist = I_dist

        # One quantile computation for all three levels.
        cutoffs = mquantiles(I_dist, [0.9, 0.95, 0.99])
        sig = "Not Significant"
        for cutoff, stars in zip(cutoffs, ("*", "**", "***")):
            if self.test_stat > cutoff:
                sig = stars
        return sig

    def _est_cond_mean(self):
        """
        Calculates the expected conditional mean
        m(X, Z=l) for all possible l

        Returns the fitted mean averaged over every observed category of
        the tested variable, as an (nobs, 1) array.  Also stores the sorted
        categories in ``self.dom_x``.
        """
        self.dom_x = np.sort(np.unique(self.exog[:, self.test_vars]))
        X = copy.deepcopy(self.exog)
        m = 0
        for i in self.dom_x:
            X[:, self.test_vars] = i
            m += self.model.fit(data_predict=X)[0]

        m = m / float(len(self.dom_x))
        m = np.reshape(m, (np.shape(self.exog)[0], 1))
        return m