"""Influence and Outlier Measures
|
||
|
|
||
|
Created on Sun Jan 29 11:16:09 2012
|
||
|
|
||
|
Author: Josef Perktold
|
||
|
License: BSD-3
|
||
|
"""
|
||
|
|
||
|
import warnings
|
||
|
|
||
|
from statsmodels.compat.pandas import Appender
|
||
|
from statsmodels.compat.python import lzip
|
||
|
|
||
|
from collections import defaultdict
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from statsmodels.graphics._regressionplots_doc import _plot_influence_doc
|
||
|
from statsmodels.regression.linear_model import OLS
|
||
|
from statsmodels.stats.multitest import multipletests
|
||
|
from statsmodels.tools.decorators import cache_readonly
|
||
|
from statsmodels.tools.tools import maybe_unwrap_results
|
||
|
|
||
|
# outliers test convenience wrapper
|
||
|
|
||
|


def outlier_test(model_results, method='bonf', alpha=.05, labels=None,
                 order=False, cutoff=None):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults
        Linear model results
    method : str
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    labels : None or array_like
        If `labels` is not None, then it will be used as index to the
        returned pandas DataFrame. See also Returns below
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals. If labels are provided they will also be
        sorted.
    cutoff : None or float in [0, 1]
        If cutoff is not None, then the return only includes observations
        with multiple testing corrected p-values strictly below the cutoff.
        The returned array or dataframe can be empty if there are no outlier
        candidates at the specified cutoff.

    Returns
    -------
    table : ndarray or DataFrame
        Returns either an ndarray or a DataFrame if labels is not None.
        Will attempt to get labels from model_results if available. The
        columns are the Studentized residuals, the unadjusted p-value,
        and the corrected p-value according to method.

    Notes
    -----
    The unadjusted p-value is stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    """
    from scipy import stats  # lazy import
    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)
    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError("model_results object %s does not have a "
                             "get_influence "
                             "method." % results.__class__.__name__)
    resid = infl().resid_studentized_external
    if order:
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.asarray(labels)[idx]
    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    adj_p = multipletests(unadj_p, alpha=alpha, method=method)

    data = np.c_[resid, unadj_p, adj_p[1]]
    if cutoff is not None:
        mask = data[:, -1] < cutoff
        data = data[mask]
    else:
        mask = slice(None)

    if labels is not None:
        from pandas import DataFrame
        return DataFrame(data,
                         columns=['student_resid', 'unadj_p', method + "(p)"],
                         index=np.asarray(labels)[mask])
    return data
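

# Example usage of ``outlier_test`` (illustrative sketch on simulated data;
# the variable names below are placeholders, not part of this module):
#
#     import numpy as np
#     import statsmodels.api as sm
#
#     rng = np.random.default_rng(12345)
#     exog = sm.add_constant(rng.standard_normal((50, 2)))
#     endog = exog @ [1.0, 0.5, -0.5] + rng.standard_normal(50)
#     endog[0] += 5                      # plant one outlying observation
#     res = sm.OLS(endog, exog).fit()
#     # Bonferroni-corrected two-sided outlier test, sorted by |studentized resid|
#     table = outlier_test(res, method='bonf', alpha=0.05, order=True)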


# influence measures

def reset_ramsey(res, degree=5):
    """Ramsey's RESET specification test for linear models

    This is a general specification test, for additional non-linear effects
    in a model.

    Parameters
    ----------
    res : RegressionResults
        Results instance of a fitted linear regression, e.g. OLS.
    degree : int
        Maximum power to include in the RESET test. Powers 0 and 1 are
        excluded, so that degree tests powers 2, ..., degree of the fitted
        values.

    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 to degree of the fitted values. Then it performs
    an F-test whether these additional terms are significant.

    If the p-value of the f-test is below a threshold, e.g. 0.1, then this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.

    References
    ----------
    https://en.wikipedia.org/wiki/Ramsey_RESET_test
    """
    order = degree + 1
    k_vars = res.model.exog.shape[1]
    # vander without constant and x, and drop constant
    norm_values = np.asarray(res.fittedvalues)
    norm_values = norm_values / np.sqrt((norm_values ** 2).mean())
    y_fitted_vander = np.vander(norm_values, order)[:, :-2]
    exog = np.column_stack((res.model.exog, y_fitted_vander))
    exog /= np.sqrt((exog ** 2).mean(0))
    endog = res.model.endog / (res.model.endog ** 2).mean()
    res_aux = OLS(endog, exog).fit()
    # r_matrix = np.eye(degree, exog.shape[1], k_vars)
    r_matrix = np.eye(degree - 1, exog.shape[1], k_vars)
    # df1 = degree - 1
    # df2 = exog.shape[0] - degree - res.df_model (without constant)
    return res_aux.f_test(r_matrix)  # , r_matrix, res_aux
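

# Example usage of ``reset_ramsey`` (illustrative sketch; ``res`` is any
# fitted OLS results instance, e.g. the one from the example above):
#
#     ftest = reset_ramsey(res, degree=3)
#     # a small p-value suggests neglected non-linearity in the fitted values
#     print(ftest)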


def variance_inflation_factor(exog, exog_idx):
    """
    Variance inflation factor, VIF, for one exogenous variable

    The variance inflation factor is a measure for the increase of the
    variance of the parameter estimates if an additional variable, given by
    exog_idx, is added to the linear regression. It is a measure for
    multicollinearity of the design matrix, exog.

    One recommendation is that if VIF is greater than 5, then the explanatory
    variable given by exog_idx is highly collinear with the other explanatory
    variables, and the parameter estimates will have large standard errors
    because of this.

    Parameters
    ----------
    exog : {ndarray, DataFrame}
        design matrix with all explanatory variables, as for example used in
        regression
    exog_idx : int
        index of the exogenous variable in the columns of exog

    Returns
    -------
    float
        variance inflation factor

    Notes
    -----
    This function does not save the auxiliary regression.

    See Also
    --------
    xxx : class for regression diagnostics  TODO: does not exist yet

    References
    ----------
    https://en.wikipedia.org/wiki/Variance_inflation_factor
    """
    k_vars = exog.shape[1]
    exog = np.asarray(exog)
    x_i = exog[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog[:, mask]
    r_squared_i = OLS(x_i, x_noti).fit().rsquared
    vif = 1. / (1. - r_squared_i)
    return vif
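

# Example usage of ``variance_inflation_factor`` (illustrative sketch;
# the design matrix below is simulated and includes a constant column):
#
#     import numpy as np
#     import statsmodels.api as sm
#
#     rng = np.random.default_rng(0)
#     x1 = rng.standard_normal(100)
#     x2 = x1 + 0.1 * rng.standard_normal(100)     # nearly collinear with x1
#     exog = sm.add_constant(np.column_stack([x1, x2]))
#     vifs = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]
#     # large VIFs for the x1 and x2 columns indicate strong collinearity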


class _BaseInfluenceMixin:
    """common methods between OLSInfluence and MLE/GLMInfluence
    """

    @Appender(_plot_influence_doc.format(**{'extra_params_doc': ""}))
    def plot_influence(self, external=None, alpha=.05, criterion="cooks",
                       size=48, plot_alpha=.75, ax=None, **kwargs):

        if external is None:
            external = hasattr(self, '_cache') and 'res_looo' in self._cache
        from statsmodels.graphics.regressionplots import _influence_plot
        if self.hat_matrix_diag is not None:
            res = _influence_plot(self.results, self, external=external,
                                  alpha=alpha,
                                  criterion=criterion, size=size,
                                  plot_alpha=plot_alpha, ax=ax, **kwargs)
        else:
            warnings.warn("Plot uses pearson residuals and exog hat matrix.")
            res = _influence_plot(self.results, self, external=external,
                                  alpha=alpha,
                                  criterion=criterion, size=size,
                                  leverage=self.hat_matrix_exog_diag,
                                  resid=self.resid,
                                  plot_alpha=plot_alpha, ax=ax, **kwargs)
        return res

    def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None,
                    **kwds):
        from statsmodels.graphics import utils
        fig, ax = utils.create_mpl_ax(ax)
        if title is None:
            title = "Index Plot"
        nobs = len(self.endog)
        index = np.arange(nobs)
        ax.scatter(index, y, **kwds)

        if threshold == 'all':
            large_points = np.ones(nobs, np.bool_)
        else:
            large_points = np.abs(y) > threshold
        psize = 3 * np.ones(nobs)
        # add point labels
        labels = self.results.model.data.row_labels
        if labels is None:
            labels = np.arange(nobs)
        ax = utils.annotate_axes(np.where(large_points)[0], labels,
                                 lzip(index, y),
                                 lzip(-psize, psize), "large",
                                 ax)

        font = {"fontsize": 16, "color": "black"}
        ax.set_ylabel(ylabel, **font)
        ax.set_xlabel("Observation", **font)
        ax.set_title(title, **font)
        return fig

    def plot_index(self, y_var='cooks', threshold=None, title=None, ax=None,
                   idx=None, **kwds):
        """index plot for influence attributes

        Parameters
        ----------
        y_var : str
            Name of attribute or shortcut for predefined attributes that will
            be plotted on the y-axis.
        threshold : None or float
            Threshold for adding annotation with observation labels.
            Observations for which the absolute value of the y_var is larger
            than the threshold will be annotated. Set to a negative number to
            label all observations or to a large number to have no annotation.
        title : str
            If provided, the title will replace the default "Index Plot"
            title.
        ax : matplotlib axes instance
            The plot will be added to the `ax` if provided, otherwise a new
            figure is created.
        idx : {None, int}
            Some attributes require an additional index to select the y-var.
            In dfbetas this refers to the column index.
        kwds : optional keywords
            Keywords will be used in the call to matplotlib scatter function.
        """
        criterion = y_var  # alias
        if threshold is None:
            # TODO: criterion specific defaults
            threshold = 'all'

        if criterion == 'dfbeta':
            y = self.dfbetas[:, idx]
            ylabel = 'DFBETA for ' + self.results.model.exog_names[idx]
        elif criterion.startswith('cook'):
            y = self.cooks_distance[0]
            ylabel = "Cook's distance"
        elif criterion.startswith('hat') or criterion.startswith('lever'):
            y = self.hat_matrix_diag
            ylabel = "Leverage (diagonal of hat matrix)"
        elif criterion.startswith('resid_stu'):
            y = self.resid_studentized
            ylabel = "Internally Studentized Residuals"
        else:
            # assume we have the name of an attribute
            y = getattr(self, y_var)
            if idx is not None:
                y = y[idx]
            ylabel = y_var

        fig = self._plot_index(y, ylabel, threshold=threshold, title=title,
                               ax=ax, **kwds)
        return fig
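

# Example usage of the plotting helpers (illustrative sketch; ``res`` is a
# fitted OLS results instance and matplotlib is installed):
#
#     infl = res.get_influence()
#     fig1 = infl.plot_influence()       # leverage vs. studentized residuals
#     fig2 = infl.plot_index(y_var="cooks", threshold=4 / res.nobs)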


class MLEInfluence(_BaseInfluenceMixin):
    """Global Influence and outlier measures (experimental)

    Parameters
    ----------
    results : instance of results class
        This only works for model and results classes that have the necessary
        helper methods.
    other arguments :
        Those are only available to override default behavior and are used
        instead of the corresponding attribute of the results class.
        By default resid_pearson is used as resid.

    Attributes
    ----------
    hat_matrix_diag (hii) : This is the generalized leverage computed as the
        local derivative of fittedvalues (predicted mean) with respect to the
        observed response for each observation.
        Not available for ZeroInflated models because of nondifferentiability.
    d_params : Change in parameters computed with one Newton step using the
        full Hessian corrected by division by (1 - hii).
        If hat_matrix_diag is not available, then the division by (1 - hii) is
        not included.
    dbetas : change in parameters divided by the standard error of parameters
        from the full model results, ``bse``.
    cooks_distance : quadratic form for change in parameters weighted by
        ``cov_params`` from the full model divided by the number of variables.
        It includes p-values based on the F-distribution which are only
        approximate outside of linear Gaussian models.
    resid_studentized : In the general MLE case resid_studentized are
        computed from the score residuals scaled by hessian factor and
        leverage. This does not use ``cov_params``.
    d_fittedvalues : local change of expected mean given the change in the
        parameters as computed in ``d_params``.
    d_fittedvalues_scaled : same as d_fittedvalues but scaled by the standard
        errors of a predicted mean of the response.
    params_one : is the one step parameter estimate computed as ``params``
        from the full sample minus ``d_params``.

    Notes
    -----
    MLEInfluence uses generic definitions based on maximum likelihood models.

    MLEInfluence produces the same results as GLMInfluence for canonical
    links (verified for GLM Binomial, Poisson and Gaussian). There will be
    some differences for non-canonical links or if a robust cov_type is used.
    For example, the generalized leverage differs from the definition of the
    GLM hat matrix in the case of Probit, which corresponds to family
    Binomial with a non-canonical link.

    The extension to non-standard models, e.g. multi-link models like
    BetaModel and the ZeroInflated models, is still experimental and might
    still change.
    Additionally, ZeroInflated and some threshold models have a
    nondifferentiability in the generalized leverage. How this case is
    treated might also change.

    Warning: This does currently not work for constrained or penalized
    models, e.g. models estimated with fit_constrained or fit_regularized.

    This has not yet been tested for correctness when offset or exposure
    are used, although they should be supported by the code.

    status: experimental,
    This class will need changes to support different kinds of models, e.g.
    extra parameters in discrete.NegativeBinomial or two-part models like
    ZeroInflatedPoisson.
    """

    def __init__(self, results, resid=None, endog=None, exog=None,
                 hat_matrix_diag=None, cov_params=None, scale=None):
        # this __init__ attaches attributes that we don't really need
        self.results = results = maybe_unwrap_results(results)
        # TODO: check for extra params in e.g. NegBin
        self.nobs, self.k_vars = results.model.exog.shape
        self.k_params = np.size(results.params)
        self.endog = endog if endog is not None else results.model.endog
        self.exog = exog if exog is not None else results.model.exog
        self.scale = scale if scale is not None else results.scale
        if resid is not None:
            self.resid = resid
        else:
            self.resid = getattr(results, "resid_pearson", None)
            if self.resid is not None:  # and scale != 1:
                # GLM and similar does not divide resid_pearson by scale
                self.resid = self.resid / np.sqrt(self.scale)

        self.cov_params = (cov_params if cov_params is not None
                           else results.cov_params())
        self.model_class = results.model.__class__

        self.hessian = self.results.model.hessian(self.results.params)
        self.score_obs = self.results.model.score_obs(self.results.params)
        if hat_matrix_diag is not None:
            self._hat_matrix_diag = hat_matrix_diag

    @cache_readonly
    def hat_matrix_diag(self):
        """Diagonal of the generalized leverage

        This is the analogue of the hat matrix diagonal for general MLE.
        """
        if hasattr(self, '_hat_matrix_diag'):
            return self._hat_matrix_diag

        try:
            dsdy = self.results.model._deriv_score_obs_dendog(
                self.results.params)
        except NotImplementedError:
            dsdy = None

        if dsdy is None:
            warnings.warn("hat matrix is not available, missing derivatives",
                          UserWarning)
            return None

        dmu_dp = self.results.model._deriv_mean_dparams(self.results.params)

        # dmu_dp = 1 /
        #     self.results.model.family.link.deriv(self.results.fittedvalues)
        h = (dmu_dp * np.linalg.solve(-self.hessian, dsdy.T).T).sum(1)
        return h

    @cache_readonly
    def hat_matrix_exog_diag(self):
        """Diagonal of the hat_matrix using only exog as in OLS

        """
        get_exogs = getattr(self.results.model, "_get_exogs", None)
        if get_exogs is not None:
            exog = np.column_stack(get_exogs())
        else:
            exog = self.exog
        return (exog * np.linalg.pinv(exog).T).sum(1)

    @cache_readonly
    def d_params(self):
        """Approximate change in parameter estimates when dropping observation.

        This uses one-step approximation of the parameter change to deleting
        one observation.
        """
        so_noti = self.score_obs.sum(0) - self.score_obs
        beta_i = np.linalg.solve(self.hessian, so_noti.T).T
        if self.hat_matrix_diag is not None:
            beta_i /= (1 - self.hat_matrix_diag)[:, None]

        return beta_i

    @cache_readonly
    def dfbetas(self):
        """Scaled change in parameter estimates.

        The one-step change of parameters in d_params is rescaled by dividing
        by the standard error of the parameter estimate given by results.bse.
        """

        beta_i = self.d_params / self.results.bse
        return beta_i

    @cache_readonly
    def params_one(self):
        """Parameter estimate based on one-step approximation.

        This is the one step parameter estimate computed as
        ``params`` from the full sample minus ``d_params``.
        """
        return self.results.params - self.d_params
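
    # Example (illustrative sketch; ``infl`` is an MLEInfluence instance):
    #
    #     params_loo = infl.params_one      # one-step leave-one-out estimates
    #     scaled = infl.dfbetas             # d_params scaled by results.bse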

    @cache_readonly
    def cooks_distance(self):
        """Cook's distance and p-values.

        Based on one step approximation d_params and on results.cov_params
        Cook's distance divides by the number of explanatory variables.

        p-values are based on the F-distribution which are only approximate
        outside of linear Gaussian models.

        Warning: The definition of p-values might change if we switch to
        using chi-square distribution instead of F-distribution, or if we
        make it dependent on the fit keyword use_t.
        """
        cooks_d2 = (self.d_params * np.linalg.solve(self.cov_params,
                                                    self.d_params.T).T).sum(1)
        cooks_d2 /= self.k_params
        from scipy import stats

        # alpha = 0.1
        # print stats.f.isf(1-alpha, n_params, res.df_modelwc)
        # TODO use chi2 # use_f option
        pvals = stats.f.sf(cooks_d2, self.k_params, self.results.df_resid)

        return cooks_d2, pvals

    @cache_readonly
    def resid_studentized(self):
        """studentized default residuals.

        This uses the residual in `resid` attribute, which is by default
        resid_pearson, and studentizes it using the generalized leverage.

        self.resid / np.sqrt(1 - self.hat_matrix_diag)

        Studentized residuals are not available if hat_matrix_diag is None.

        """
        return self.resid / np.sqrt(1 - self.hat_matrix_diag)

    def resid_score_factor(self):
        """Score residual divided by sqrt of hessian factor.

        experimental, agrees with GLMInfluence for Binomial and Gaussian.
        This corresponds to considering the linear predictors as parameters
        of the model.

        Note: This might have nan values if the second derivative,
        hessian_factor, is positive, i.e. the loglikelihood is not globally
        concave w.r.t. the linear predictor. (This occurred in an example
        for GeneralizedPoisson.)
        """
        from statsmodels.genmod.generalized_linear_model import GLM
        sf = self.results.model.score_factor(self.results.params)
        hf = self.results.model.hessian_factor(self.results.params)
        if isinstance(sf, tuple):
            sf = sf[0]
        if isinstance(hf, tuple):
            hf = hf[0]
        if not isinstance(self.results.model, GLM):
            # hessian_factor in GLM has wrong sign, is already positive
            hf = -hf

        return sf / np.sqrt(hf) / np.sqrt(1 - self.hat_matrix_diag)

    def resid_score(self, joint=True, index=None, studentize=False):
        """Score observations scaled by inverse hessian.

        Score residuals in resid_score are defined in analogy to a score test
        statistic for each observation.

        Parameters
        ----------
        joint : bool
            If joint is true, then a quadratic form similar to score_test is
            returned for each observation.
            If joint is false, then standardized score_obs are returned. The
            returned array is two-dimensional
        index : ndarray (optional)
            Optional index to select a subset of score_obs columns.
            By default, all columns of score_obs will be used.
        studentize : bool
            If studentize is true, then the scaled residuals are also
            studentized using the generalized leverage.

        Returns
        -------
        array : 1-D or 2-D residuals

        Notes
        -----
        Status: experimental

        Because of the one-step approximation of d_params, score residuals
        are identical to cooks_distance, except for

        - cooks_distance is normalized by the number of parameters
        - cooks_distance uses cov_params, resid_score is based on Hessian.
          This will make them differ in the case of robust cov_params.

        """
        # currently no caching
        score_obs = self.results.model.score_obs(self.results.params)
        hess = self.results.model.hessian(self.results.params)
        if index is not None:
            score_obs = score_obs[:, index]
            hess = hess[index[:, None], index]

        if joint:
            resid = (score_obs.T * np.linalg.solve(-hess, score_obs.T)).sum(0)
        else:
            resid = score_obs / np.sqrt(np.diag(-hess))

        if studentize:
            if joint:
                resid /= np.sqrt(1 - self.hat_matrix_diag)
            else:
                # 2-dim resid
                resid /= np.sqrt(1 - self.hat_matrix_diag[:, None])

        return resid
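
    # Example (illustrative sketch; ``infl`` is an MLEInfluence instance):
    #
    #     joint_stat = infl.resid_score(joint=True, studentize=True)
    #     per_param = infl.resid_score(joint=False)   # 2-D, one column per parameter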

    @cache_readonly
    def _get_prediction(self):
        # TODO: do we cache this or does it need to be a method
        # we only need unchanging parts, alpha for confint could change
        with warnings.catch_warnings():
            msg = 'linear keyword is deprecated, use which="linear"'
            warnings.filterwarnings("ignore", message=msg,
                                    category=FutureWarning)
            pred = self.results.get_prediction()
        return pred

    @cache_readonly
    def d_fittedvalues(self):
        """Change in expected response, fittedvalues.

        Local change of expected mean given the change in the parameters as
        computed in d_params.

        Notes
        -----
        This uses the one-step approximation of the parameter change to
        deleting one observation ``d_params``.
        """
        # results.params might be a pandas.Series
        params = np.asarray(self.results.params)
        deriv = self.results.model._deriv_mean_dparams(params)
        return (deriv * self.d_params).sum(1)

    @property
    def d_fittedvalues_scaled(self):
        """
        Change in fittedvalues scaled by standard errors.

        This uses one-step approximation of the parameter change to deleting
        one observation ``d_params``, and divides by the standard errors
        for the predicted mean provided by results.get_prediction.
        """
        # Note: this and the previous methods are for the response
        # and not for a weighted response, i.e. not the self.exog, self.endog
        # this will be relevant for WLS comparing fitted endog versus wendog
        return self.d_fittedvalues / self._get_prediction.se

    def summary_frame(self):
        """
        Creates a DataFrame with influence results.

        Returns
        -------
        frame : pandas DataFrame
            A DataFrame with selected results for each observation.
            The index will be the same as provided to the model.

        Notes
        -----
        The resultant DataFrame contains four variables in addition to the
        ``dfbetas``. These are:

        * cooks_d : Cook's Distance defined in ``cooks_distance``
        * standard_resid : Standardized residuals defined in
          `resid_studentized`
        * hat_diag : The diagonal of the projection, or hat, matrix defined in
          `hat_matrix_diag`. Not included if None.
        * dffits_internal : DFFITS statistics using internally Studentized
          residuals defined in `d_fittedvalues_scaled`
        """
        from pandas import DataFrame

        # row and column labels
        data = self.results.model.data
        row_labels = data.row_labels
        beta_labels = ['dfb_' + i for i in data.xnames]

        # grab the results
        if self.hat_matrix_diag is not None:
            summary_data = DataFrame(dict(
                cooks_d=self.cooks_distance[0],
                standard_resid=self.resid_studentized,
                hat_diag=self.hat_matrix_diag,
                dffits_internal=self.d_fittedvalues_scaled),
                index=row_labels)
        else:
            summary_data = DataFrame(dict(
                cooks_d=self.cooks_distance[0],
                # standard_resid=self.resid_studentized,
                # hat_diag=self.hat_matrix_diag,
                dffits_internal=self.d_fittedvalues_scaled),
                index=row_labels)

        # NOTE: if we do not give columns, order of above will be arbitrary
        dfbeta = DataFrame(self.dfbetas, columns=beta_labels,
                           index=row_labels)

        return dfbeta.join(summary_data)
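

# Example usage of ``MLEInfluence`` (illustrative sketch; the influence object
# is built directly from a fitted likelihood model, here a Poisson GLM with
# simulated data, which matches GLMInfluence for this canonical link):
#
#     import numpy as np
#     import statsmodels.api as sm
#
#     rng = np.random.default_rng(0)
#     exog = sm.add_constant(rng.standard_normal((200, 2)))
#     endog = rng.poisson(np.exp(exog @ [0.5, 0.3, -0.2]))
#     res = sm.GLM(endog, exog, family=sm.families.Poisson()).fit()
#     infl = MLEInfluence(res)
#     frame = infl.summary_frame()       # dfbetas, cooks_d, leverage, ...
#     cd, cd_pvals = infl.cooks_distance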


class OLSInfluence(_BaseInfluenceMixin):
    """class to calculate outlier and influence measures for OLS result

    Parameters
    ----------
    results : RegressionResults
        currently assumes the results are from an OLS regression

    Notes
    -----
    One part of the results can be calculated without any auxiliary
    regression (some of which have the `_internal` postfix in the name).
    Other statistics require a leave-one-observation-out (LOOO) auxiliary
    regression, and will be slower (mainly results with `_external` postfix
    in the name).
    For the auxiliary LOOO regressions only the required results are stored.

    Using the LOO measures is currently only recommended if the data set
    is not too large. One possible approach for LOOO measures would be to
    identify possible problem observations with the _internal measures, and
    then run the leave-one-observation-out only with observations that are
    possible outliers. (However, this is not yet available in an automated
    way.)

    This should be extended to general least squares.

    The leave-one-variable-out (LOVO) auxiliary regressions are currently not
    used.
    """

    def __init__(self, results):
        # check which model is allowed
        self.results = maybe_unwrap_results(results)
        self.nobs, self.k_vars = results.model.exog.shape
        self.endog = results.model.endog
        self.exog = results.model.exog
        self.resid = results.resid
        self.model_class = results.model.__class__

        # self.sigma_est = np.sqrt(results.mse_resid)
        self.scale = results.mse_resid

        self.aux_regression_exog = {}
        self.aux_regression_endog = {}

    @cache_readonly
    def hat_matrix_diag(self):
        """Diagonal of the hat_matrix for OLS

        Notes
        -----
        temporarily calculated here, this should go to model class
        """
        return (self.exog * self.results.model.pinv_wexog.T).sum(1)

    @cache_readonly
    def resid_press(self):
        """PRESS residuals
        """
        hii = self.hat_matrix_diag
        return self.resid / (1 - hii)

    @cache_readonly
    def influence(self):
        """Influence measure

        matches the influence measure that gretl reports
        u * h / (1 - h)
        where u are the residuals and h is the diagonal of the hat_matrix
        """
        hii = self.hat_matrix_diag
        return self.resid * hii / (1 - hii)

    @cache_readonly
    def hat_diag_factor(self):
        """Factor of diagonal of hat_matrix used in influence

        this might be useful for internal reuse
        h / (1 - h)
        """
        hii = self.hat_matrix_diag
        return hii / (1 - hii)

    @cache_readonly
    def ess_press(self):
        """Error sum of squares of PRESS residuals
        """
        return np.dot(self.resid_press, self.resid_press)

    @cache_readonly
    def resid_studentized(self):
        """Studentized residuals using variance from OLS

        alias for resid_studentized_internal for compatibility with
        MLEInfluence; this uses sigma from the original estimate and does
        not require a leave one out loop
        """
        return self.resid_studentized_internal

    @cache_readonly
    def resid_studentized_internal(self):
        """Studentized residuals using variance from OLS

        this uses sigma from original estimate
        does not require leave one out loop
        """
        return self.get_resid_studentized_external(sigma=None)
        # return self.results.resid / self.sigma_est

    @cache_readonly
    def resid_studentized_external(self):
        """Studentized residuals using LOOO variance

        this uses sigma from leave-one-out estimates

        requires leave one out loop for observations
        """
        sigma_looo = np.sqrt(self.sigma2_not_obsi)
        return self.get_resid_studentized_external(sigma=sigma_looo)

    def get_resid_studentized_external(self, sigma=None):
        """calculate studentized residuals

        Parameters
        ----------
        sigma : None or float
            estimate of the standard deviation of the residuals. If None,
            then the estimate from the regression results is used.

        Returns
        -------
        stzd_resid : ndarray
            studentized residuals

        Notes
        -----
        studentized residuals are defined as ::

           resid / sigma / np.sqrt(1 - hii)

        where resid are the residuals from the regression, sigma is an
        estimate of the standard deviation of the residuals, and hii is the
        diagonal of the hat_matrix.
        """
        hii = self.hat_matrix_diag
        if sigma is None:
            sigma2_est = self.scale
            # can be replaced by different estimators of sigma
            sigma = np.sqrt(sigma2_est)

        return self.resid / sigma / np.sqrt(1 - hii)
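
    # Example (illustrative sketch; ``infl`` is an OLSInfluence instance):
    #
    #     r_int = infl.resid_studentized_internal   # full-sample sigma
    #     r_ext = infl.resid_studentized_external   # leave-one-out sigma (slower)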

    # same computation as GLMInfluence
    @cache_readonly
    def cooks_distance(self):
        """
        Cooks distance

        Uses original results, no nobs loop

        References
        ----------
        .. [*] Eubank, R. L. (1999). Nonparametric regression and spline
           smoothing. CRC press.
        .. [*] Cook's distance. (n.d.). In Wikipedia. July 2019, from
           https://en.wikipedia.org/wiki/Cook%27s_distance
        """
        hii = self.hat_matrix_diag
        # Eubank p.93, 94
        cooks_d2 = self.resid_studentized ** 2 / self.k_vars
        cooks_d2 *= hii / (1 - hii)

        from scipy import stats

        # alpha = 0.1
        # print stats.f.isf(1-alpha, n_params, res.df_modelwc)
        pvals = stats.f.sf(cooks_d2, self.k_vars, self.results.df_resid)

        return cooks_d2, pvals

    @cache_readonly
    def dffits_internal(self):
        """dffits measure for influence of an observation

        based on resid_studentized_internal
        uses original results, no nobs loop
        """
        # TODO: do I want to use different sigma estimate in
        #       resid_studentized_external
        # -> move definition of sigma_error to the __init__
        hii = self.hat_matrix_diag
        dffits_ = self.resid_studentized_internal * np.sqrt(hii / (1 - hii))
        dffits_threshold = 2 * np.sqrt(self.k_vars * 1. / self.nobs)
        return dffits_, dffits_threshold

    @cache_readonly
    def dffits(self):
        """
        dffits measure for influence of an observation

        based on resid_studentized_external,
        uses results from leave-one-observation-out loop

        It is recommended that observations with dffits larger than a
        threshold of 2 sqrt{k / n}, where k is the number of parameters,
        should be investigated.

        Returns
        -------
        dffits : float
        dffits_threshold : float

        References
        ----------
        `Wikipedia <https://en.wikipedia.org/wiki/DFFITS>`_
        """
        # TODO: do I want to use different sigma estimate in
        #       resid_studentized_external
        # -> move definition of sigma_error to the __init__
        hii = self.hat_matrix_diag
        dffits_ = self.resid_studentized_external * np.sqrt(hii / (1 - hii))
        dffits_threshold = 2 * np.sqrt(self.k_vars * 1. / self.nobs)
        return dffits_, dffits_threshold
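
    # Example: flagging observations by the DFFITS rule of thumb (illustrative
    # sketch; ``infl`` is an OLSInfluence instance, e.g. ``res.get_influence()``):
    #
    #     dffits_vals, dffits_threshold = infl.dffits
    #     suspicious = np.where(np.abs(dffits_vals) > dffits_threshold)[0]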

    @cache_readonly
    def dfbetas(self):
        """dfbetas

        uses results from leave-one-observation-out loop
        """
        dfbetas = self.results.params - self.params_not_obsi  # [None,:]
        dfbetas /= np.sqrt(self.sigma2_not_obsi[:, None])
        dfbetas /= np.sqrt(np.diag(self.results.normalized_cov_params))
        return dfbetas

    @cache_readonly
    def dfbeta(self):
        """dfbeta, unscaled change in parameter estimates

        uses results from leave-one-observation-out loop
        """
        dfbeta = self.results.params - self.params_not_obsi
        return dfbeta

    @cache_readonly
    def sigma2_not_obsi(self):
        """error variance for all LOOO regressions

        This is 'mse_resid' from each auxiliary regression.

        uses results from leave-one-observation-out loop
        """
        return np.asarray(self._res_looo['mse_resid'])

    @property
    def params_not_obsi(self):
        """parameter estimates for all LOOO regressions

        uses results from leave-one-observation-out loop
        """
        return np.asarray(self._res_looo['params'])

    @property
    def det_cov_params_not_obsi(self):
        """determinant of cov_params of all LOOO regressions

        uses results from leave-one-observation-out loop
        """
        return np.asarray(self._res_looo['det_cov_params'])

    @cache_readonly
    def cov_ratio(self):
        """covariance ratio between LOOO and original

        This uses determinant of the estimate of the parameter covariance
        from leave-one-out estimates.
        requires leave one out loop for observations
        """
        # do not use inplace division / because then we change original
        cov_ratio = (self.det_cov_params_not_obsi
                     / np.linalg.det(self.results.cov_params()))
        return cov_ratio

    @cache_readonly
    def resid_var(self):
        """estimate of variance of the residuals

        ::

           sigma2 = sigma2_OLS * (1 - hii)

        where hii is the diagonal of the hat matrix
        """
        # TODO: check if correct outside of ols
        return self.scale * (1 - self.hat_matrix_diag)

    @cache_readonly
    def resid_std(self):
        """estimate of standard deviation of the residuals

        See Also
        --------
        resid_var
        """
        return np.sqrt(self.resid_var)

    def _ols_xnoti(self, drop_idx, endog_idx='endog', store=True):
        """regression results from LOVO auxiliary regression with cache


        The result instances are stored, which could use a large amount of
        memory if the datasets are large. There are too many combinations to
        store them all, except for small problems.

        Parameters
        ----------
        drop_idx : int
            index of exog that is dropped from the regression
        endog_idx : 'endog' or int
            If 'endog', then the endogenous variable of the result instance
            is regressed on the exogenous variables, excluding the one at
            drop_idx. If endog_idx is an integer, then the exog with that
            index is regressed with OLS on all other exogenous variables.
            (The latter is the auxiliary regression for the variance
            inflation factor.)

        this needs more thought, memory versus speed
        not yet used in any other parts, not sufficiently tested
        """
        # reverse the structure, access store, if fail calculate ?
        # this creates keys in store even if store = false ! bug
        if endog_idx == 'endog':
            stored = self.aux_regression_endog
            if drop_idx in stored:
                return stored[drop_idx]
            x_i = self.results.model.endog

        else:
            # nested dictionary
            try:
                self.aux_regression_exog[endog_idx][drop_idx]
            except KeyError:
                pass

            stored = self.aux_regression_exog[endog_idx]
            stored = {}

            x_i = self.exog[:, endog_idx]

        k_vars = self.exog.shape[1]
        mask = np.arange(k_vars) != drop_idx
        x_noti = self.exog[:, mask]
        res = OLS(x_i, x_noti).fit()
        if store:
            stored[drop_idx] = res

        return res

    def _get_drop_vari(self, attributes):
        """
        regress endog on exog without one of the variables

        This uses a k_vars loop, only attributes of the OLS instance are
        stored.

        Parameters
        ----------
        attributes : list[str]
            These are the names of the attributes of the auxiliary OLS results
            instance that are stored and returned.

        not yet used
        """
        from statsmodels.sandbox.tools.cross_val import LeaveOneOut

        endog = self.results.model.endog
        exog = self.exog

        cv_iter = LeaveOneOut(self.k_vars)
        res_loo = defaultdict(list)
        for inidx, outidx in cv_iter:
            # fit once per dropped variable, then collect the attributes
            res_i = self.model_class(endog, exog[:, inidx]).fit()
            for att in attributes:
                res_loo[att].append(getattr(res_i, att))

        return res_loo

    @cache_readonly
    def _res_looo(self):
        """collect required results from the LOOO loop

        all results will be attached.
        currently only 'params', 'mse_resid', 'det_cov_params' are stored

        regresses endog on exog dropping one observation at a time

        this uses a nobs loop, only attributes of the OLS instance are
        stored.
        """
        from statsmodels.sandbox.tools.cross_val import LeaveOneOut

        def get_det_cov_params(res):
            return np.linalg.det(res.cov_params())

        endog = self.results.model.endog
        exog = self.results.model.exog

        params = np.zeros(exog.shape, dtype=float)
        mse_resid = np.zeros(endog.shape, dtype=float)
        det_cov_params = np.zeros(endog.shape, dtype=float)

        cv_iter = LeaveOneOut(self.nobs)
        for inidx, outidx in cv_iter:
            res_i = self.model_class(endog[inidx], exog[inidx]).fit()
            params[outidx] = res_i.params
            mse_resid[outidx] = res_i.mse_resid
            det_cov_params[outidx] = get_det_cov_params(res_i)

        return dict(params=params, mse_resid=mse_resid,
                    det_cov_params=det_cov_params)

    def summary_frame(self):
        """
        Creates a DataFrame with all available influence results.

        Returns
        -------
        frame : DataFrame
            A DataFrame with all results.

        Notes
        -----
        The resultant DataFrame contains six variables in addition to the
        DFBETAS. These are:

        * cooks_d : Cook's Distance defined in `Influence.cooks_distance`
        * standard_resid : Standardized residuals defined in
          `Influence.resid_studentized_internal`
        * hat_diag : The diagonal of the projection, or hat, matrix defined in
          `Influence.hat_matrix_diag`
        * dffits_internal : DFFITS statistics using internally Studentized
          residuals defined in `Influence.dffits_internal`
        * dffits : DFFITS statistics using externally Studentized residuals
          defined in `Influence.dffits`
        * student_resid : Externally Studentized residuals defined in
          `Influence.resid_studentized_external`
        """
        from pandas import DataFrame

        # row and column labels
        data = self.results.model.data
        row_labels = data.row_labels
        beta_labels = ['dfb_' + i for i in data.xnames]

        # grab the results
        summary_data = DataFrame(dict(
            cooks_d=self.cooks_distance[0],
            standard_resid=self.resid_studentized_internal,
            hat_diag=self.hat_matrix_diag,
            dffits_internal=self.dffits_internal[0],
            student_resid=self.resid_studentized_external,
            dffits=self.dffits[0],
        ),
            index=row_labels)
        # NOTE: if we do not give columns, order of above will be arbitrary
        dfbeta = DataFrame(self.dfbetas, columns=beta_labels,
                           index=row_labels)

        return dfbeta.join(summary_data)

    def summary_table(self, float_fmt="%6.3f"):
        """create a summary table with all influence and outlier measures

        This does currently not distinguish between statistics that can be
        calculated from the original regression results and those for which a
        leave-one-observation-out loop is needed.

        Returns
        -------
        res : SimpleTable
            SimpleTable instance with the results, can be printed

        Notes
        -----
        This also attaches table_data to the instance.
        """
        # print self.dfbetas

        # table_raw = [ np.arange(self.nobs),
        #               self.endog,
        #               self.fittedvalues,
        #               self.cooks_distance(),
        #               self.resid_studentized_internal,
        #               self.hat_matrix_diag,
        #               self.dffits_internal,
        #               self.resid_studentized_external,
        #               self.dffits,
        #               self.dfbetas
        #               ]
        table_raw = [('obs', np.arange(self.nobs)),
                     ('endog', self.endog),
                     ('fitted\nvalue', self.results.fittedvalues),
                     ("Cook's\nd", self.cooks_distance[0]),
                     ("student.\nresidual", self.resid_studentized_internal),
                     ('hat diag', self.hat_matrix_diag),
                     ('dffits \ninternal', self.dffits_internal[0]),
                     ("ext.stud.\nresidual", self.resid_studentized_external),
                     ('dffits', self.dffits[0])
                     ]
        colnames, data = lzip(*table_raw)  # unzip
        data = np.column_stack(data)
        self.table_data = data
        from copy import deepcopy

        from statsmodels.iolib.table import SimpleTable, default_html_fmt
        from statsmodels.iolib.tableformatting import fmt_base
        fmt = deepcopy(fmt_base)
        fmt_html = deepcopy(default_html_fmt)
        fmt['data_fmts'] = ["%4d"] + [float_fmt] * (data.shape[1] - 1)
        # fmt_html['data_fmts'] = fmt['data_fmts']
        return SimpleTable(data, headers=colnames, txt_fmt=fmt,
                           html_fmt=fmt_html)
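

# Example usage of ``OLSInfluence`` (illustrative sketch; ``res`` is a fitted
# OLS results instance as in the earlier examples):
#
#     infl = OLSInfluence(res)            # equivalently res.get_influence()
#     frame = infl.summary_frame()        # dfbetas, cooks_d, dffits, ...
#     cooks_d, cooks_pvals = infl.cooks_distance
#     print(infl.summary_table())         # formatted per-observation table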


def summary_table(res, alpha=0.05):
    """
    Generate summary table of outlier and influence similar to SAS

    Parameters
    ----------
    res : RegressionResults
        results instance of a fitted (OLS) regression model
    alpha : float
        significance level for confidence interval

    Returns
    -------
    st : SimpleTable
        table with results that can be printed
    data : ndarray
        calculated measures and statistics for the table
    ss2 : list[str]
        column_names for table (Note: rows of table are observations)
    """

    from scipy import stats

    from statsmodels.sandbox.regression.predstd import wls_prediction_std

    infl = OLSInfluence(res)

    # standard error for predicted mean
    # Note: using hat_matrix only works for fitted values
    predict_mean_se = np.sqrt(infl.hat_matrix_diag * res.mse_resid)

    tppf = stats.t.isf(alpha / 2., res.df_resid)
    predict_mean_ci = np.column_stack([
        res.fittedvalues - tppf * predict_mean_se,
        res.fittedvalues + tppf * predict_mean_se])

    # standard error for predicted observation
    tmp = wls_prediction_std(res, alpha=alpha)
    predict_se, predict_ci_low, predict_ci_upp = tmp

    predict_ci = np.column_stack((predict_ci_low, predict_ci_upp))

    # standard deviation of residual
    resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag))

    table_sm = np.column_stack([
        np.arange(res.nobs) + 1,
        res.model.endog,
        res.fittedvalues,
        predict_mean_se,
        predict_mean_ci[:, 0],
        predict_mean_ci[:, 1],
        predict_ci[:, 0],
        predict_ci[:, 1],
        res.resid,
        resid_se,
        infl.resid_studentized_internal,
        infl.cooks_distance[0]
    ])

    # colnames, data = lzip(*table_raw)  # unzip
    data = table_sm
    ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue',
           'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp',
           'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual',
           'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"]
    colnames = ss2
    # self.table_data = data
    # data = np.column_stack(data)
    from copy import deepcopy

    from statsmodels.iolib.table import SimpleTable, default_html_fmt
    from statsmodels.iolib.tableformatting import fmt_base
    fmt = deepcopy(fmt_base)
    fmt_html = deepcopy(default_html_fmt)
    fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1)
    # fmt_html['data_fmts'] = fmt['data_fmts']
    st = SimpleTable(data, headers=colnames, txt_fmt=fmt,
                     html_fmt=fmt_html)

    return st, data, ss2
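

# Example usage of the module-level ``summary_table`` (illustrative sketch;
# ``res`` is a fitted OLS results instance):
#
#     st, data, colnames = summary_table(res, alpha=0.05)
#     print(st)                   # SAS-style per-observation diagnostics
#     cooks_d = data[:, -1]       # last column holds Cook's D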


class GLMInfluence(MLEInfluence):
    """Influence and outlier measures (experimental)

    This partly uses formulas specific to GLM, specifically cooks_distance
    is based on the hessian, i.e. observed or expected information matrix,
    and not on cov_params, in contrast to MLEInfluence.
    Standardization for changes in parameters, in fittedvalues and in
    the linear predictor are based on cov_params.

    Parameters
    ----------
    results : instance of results class
        This only works for model and results classes that have the necessary
        helper methods.
    other arguments are only to override default behavior and are used
    instead of the corresponding attribute of the results class.
    By default resid_pearson is used as resid.

    Attributes
    ----------
    dbetas
        change in parameters divided by the standard error of parameters from
        the full model results, ``bse``.
    d_fittedvalues_scaled
        same as d_fittedvalues but scaled by the standard errors of a
        predicted mean of the response.
    d_linpred
        local change in linear prediction.
    d_linpred_scale
        local change in linear prediction scaled by the standard errors for
        the prediction based on cov_params.

    Notes
    -----
    This has not yet been tested for correctness when offset or exposure
    are used, although they should be supported by the code.

    Some GLM specific measures like d_deviance are still missing.

    Computing an explicit leave-one-observation-out (LOOO) loop is included
    but no influence measures are currently computed from it.
    """

    @cache_readonly
    def hat_matrix_diag(self):
        """
        Diagonal of the hat_matrix for GLM

        Notes
        -----
        This returns the diagonal of the hat matrix that was provided as
        argument to GLMInfluence or computes it using the results method
        `get_hat_matrix`.
        """
        if hasattr(self, '_hat_matrix_diag'):
            return self._hat_matrix_diag
        else:
            return self.results.get_hat_matrix()

    @cache_readonly
    def d_params(self):
        """Change in parameter estimates

        Notes
        -----
        This uses one-step approximation of the parameter change to deleting
        one observation.
        """

        beta_i = np.linalg.pinv(self.exog) * self.resid_studentized
        beta_i /= np.sqrt(1 - self.hat_matrix_diag)
        return beta_i.T

    # same computation as OLS
    @cache_readonly
    def resid_studentized(self):
        """
        Internally studentized pearson residuals

        Notes
        -----
        residuals / sqrt( scale * (1 - hii))

        where residuals are those provided to GLMInfluence which are
        pearson residuals by default, and
        hii is the diagonal of the hat matrix.
        """
        # redundant with scaled resid_pearson, keep for docstring for now
        return super().resid_studentized

    # same computation as OLS
    @cache_readonly
    def cooks_distance(self):
        """Cook's distance

        Notes
        -----
        Based on one step approximation using resid_studentized and
        hat_matrix_diag for the computation.

        Cook's distance divides by the number of explanatory variables.

        Computed using formulas for GLM and does not use results.cov_params.
        It includes p-values based on the F-distribution which are only
        approximate outside of linear Gaussian models.
        """
        hii = self.hat_matrix_diag
        # Eubank p.93, 94
        cooks_d2 = self.resid_studentized ** 2 / self.k_vars
        cooks_d2 *= hii / (1 - hii)

        from scipy import stats

        # alpha = 0.1
        # print stats.f.isf(1-alpha, n_params, res.df_modelwc)
        pvals = stats.f.sf(cooks_d2, self.k_vars, self.results.df_resid)

        return cooks_d2, pvals

    @property
    def d_linpred(self):
        """
        Change in linear prediction

        This uses one-step approximation of the parameter change to deleting
        one observation ``d_params``.
        """
        # TODO: This will need adjustment for extra params in Poisson
        # use original model exog not transformed influence exog
        exog = self.results.model.exog
        return (exog * self.d_params).sum(1)

    @property
    def d_linpred_scaled(self):
        """
        Change in linpred scaled by standard errors

        This uses one-step approximation of the parameter change to deleting
        one observation ``d_params``, and divides by the standard errors
        for linpred provided by results.get_prediction.
        """
        # Note: this and the previous methods are for the response
        # and not for a weighted response, i.e. not the self.exog, self.endog
        # this will be relevant for WLS comparing fitted endog versus wendog
        return self.d_linpred / self._get_prediction.linpred.se

    @property
    def _fittedvalues_one(self):
        """experimental code
        """
        warnings.warn('this ignores offset and exposure', UserWarning)
        # TODO: we need to handle offset, exposure and weights
        # use original model exog not transformed influence exog
        exog = self.results.model.exog
        fitted = np.array([self.results.model.predict(pi, exog[i])
                           for i, pi in enumerate(self.params_one)])
        return fitted.squeeze()

    @property
    def _diff_fittedvalues_one(self):
        """experimental code
        """
        # in discrete we cannot reuse results.fittedvalues
        return self.results.predict() - self._fittedvalues_one

    @cache_readonly
    def _res_looo(self):
        """collect required results from the LOOO loop

        all results will be attached.
        currently only 'params', 'mse_resid', 'det_cov_params' are stored

        Reestimates the model with endog and exog dropping one observation
        at a time

        This uses a nobs loop, only attributes of the results instance are
        stored.

        Warning: This will need refactoring and API changes to be able to
        add options.
        """
        from statsmodels.sandbox.tools.cross_val import LeaveOneOut

        def get_det_cov_params(res):
            return np.linalg.det(res.cov_params())

        endog = self.results.model.endog
        exog = self.results.model.exog

        init_kwds = self.results.model._get_init_kwds()
        # We need to drop obs also from extra arrays
        freq_weights = init_kwds.pop('freq_weights')
        var_weights = init_kwds.pop('var_weights')
        offset = offset_ = init_kwds.pop('offset')
        exposure = exposure_ = init_kwds.pop('exposure')
        n_trials = init_kwds.pop('n_trials', None)
        # family Binomial creates `n` i.e. `n_trials`
        # we need to reset it
        # TODO: figure out how to do this properly
        if hasattr(init_kwds['family'], 'initialize'):
            # assume we have Binomial
            is_binomial = True
        else:
            is_binomial = False

        params = np.zeros(exog.shape, dtype=float)
        scale = np.zeros(endog.shape, dtype=float)
        det_cov_params = np.zeros(endog.shape, dtype=float)

        cv_iter = LeaveOneOut(self.nobs)
        for inidx, outidx in cv_iter:
            if offset is not None:
                offset_ = offset[inidx]
            if exposure is not None:
                exposure_ = exposure[inidx]
            if n_trials is not None:
                init_kwds['n_trials'] = n_trials[inidx]

            mod_i = self.model_class(endog[inidx], exog[inidx],
                                     offset=offset_,
                                     exposure=exposure_,
                                     freq_weights=freq_weights[inidx],
                                     var_weights=var_weights[inidx],
                                     **init_kwds)
            if is_binomial:
                mod_i.family.n = init_kwds['n_trials']
            res_i = mod_i.fit(start_params=self.results.params,
                              method='newton')
            params[outidx] = res_i.params.copy()
            scale[outidx] = res_i.scale
            det_cov_params[outidx] = get_det_cov_params(res_i)

        return dict(params=params, scale=scale, mse_resid=scale,
                    # alias for now
                    det_cov_params=det_cov_params)
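

# Example usage of ``GLMInfluence`` (illustrative sketch with simulated data;
# ``get_influence`` on GLM results returns an instance of this class):
#
#     import numpy as np
#     import statsmodels.api as sm
#
#     rng = np.random.default_rng(0)
#     exog = sm.add_constant(rng.standard_normal((100, 2)))
#     p = 1 / (1 + np.exp(-exog @ [0.25, 1.0, -1.0]))
#     endog = rng.binomial(1, p)
#     res = sm.GLM(endog, exog, family=sm.families.Binomial()).fit()
#     infl = res.get_influence()
#     frame = infl.summary_frame()        # dfbetas, cooks_d, leverage, ...
#     worst = frame["cooks_d"].nlargest(5)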