"""Influence and Outlier Measures Created on Sun Jan 29 11:16:09 2012 Author: Josef Perktold License: BSD-3 """ import warnings from statsmodels.compat.pandas import Appender from statsmodels.compat.python import lzip from collections import defaultdict import numpy as np from statsmodels.graphics._regressionplots_doc import _plot_influence_doc from statsmodels.regression.linear_model import OLS from statsmodels.stats.multitest import multipletests from statsmodels.tools.decorators import cache_readonly from statsmodels.tools.tools import maybe_unwrap_results # outliers test convenience wrapper def outlier_test(model_results, method='bonf', alpha=.05, labels=None, order=False, cutoff=None): """ Outlier Tests for RegressionResults instances. Parameters ---------- model_results : RegressionResults Linear model results method : str - `bonferroni` : one-step correction - `sidak` : one-step correction - `holm-sidak` : - `holm` : - `simes-hochberg` : - `hommel` : - `fdr_bh` : Benjamini/Hochberg - `fdr_by` : Benjamini/Yekutieli See `statsmodels.stats.multitest.multipletests` for details. alpha : float familywise error rate labels : None or array_like If `labels` is not None, then it will be used as index to the returned pandas DataFrame. See also Returns below order : bool Whether or not to order the results by the absolute value of the studentized residuals. If labels are provided they will also be sorted. cutoff : None or float in [0, 1] If cutoff is not None, then the return only includes observations with multiple testing corrected p-values strictly below the cutoff. The returned array or dataframe can be empty if there are no outlier candidates at the specified cutoff. Returns ------- table : ndarray or DataFrame Returns either an ndarray or a DataFrame if labels is not None. Will attempt to get labels from model_results if available. The columns are the Studentized residuals, the unadjusted p-value, and the corrected p-value according to method. Notes ----- The unadjusted p-value is stats.t.sf(abs(resid), df) where df = df_resid - 1. """ from scipy import stats # lazy import if labels is None: labels = getattr(model_results.model.data, 'row_labels', None) infl = getattr(model_results, 'get_influence', None) if infl is None: results = maybe_unwrap_results(model_results) raise AttributeError("model_results object %s does not have a " "get_influence " "method." % results.__class__.__name__) resid = infl().resid_studentized_external if order: idx = np.abs(resid).argsort()[::-1] resid = resid[idx] if labels is not None: labels = np.asarray(labels)[idx] df = model_results.df_resid - 1 unadj_p = stats.t.sf(np.abs(resid), df) * 2 adj_p = multipletests(unadj_p, alpha=alpha, method=method) data = np.c_[resid, unadj_p, adj_p[1]] if cutoff is not None: mask = data[:, -1] < cutoff data = data[mask] else: mask = slice(None) if labels is not None: from pandas import DataFrame return DataFrame(data, columns=['student_resid', 'unadj_p', method + "(p)"], index=np.asarray(labels)[mask]) return data # influence measures def reset_ramsey(res, degree=5): """Ramsey's RESET specification test for linear models This is a general specification test, for additional non-linear effects in a model. Parameters ---------- degree : int Maximum power to include in the RESET test. Powers 0 and 1 are excluded, so that degree tests powers 2, ..., degree of the fitted values. Notes ----- The test fits an auxiliary OLS regression where the design matrix, exog, is augmented by powers 2 to degree of the fitted values. Then it performs an F-test whether these additional terms are significant. If the p-value of the f-test is below a threshold, e.g. 0.1, then this indicates that there might be additional non-linear effects in the model and that the linear model is mis-specified. References ---------- https://en.wikipedia.org/wiki/Ramsey_RESET_test """ order = degree + 1 k_vars = res.model.exog.shape[1] # vander without constant and x, and drop constant norm_values = np.asarray(res.fittedvalues) norm_values = norm_values / np.sqrt((norm_values ** 2).mean()) y_fitted_vander = np.vander(norm_values, order)[:, :-2] exog = np.column_stack((res.model.exog, y_fitted_vander)) exog /= np.sqrt((exog ** 2).mean(0)) endog = res.model.endog / (res.model.endog ** 2).mean() res_aux = OLS(endog, exog).fit() # r_matrix = np.eye(degree, exog.shape[1], k_vars) r_matrix = np.eye(degree - 1, exog.shape[1], k_vars) # df1 = degree - 1 # df2 = exog.shape[0] - degree - res.df_model (without constant) return res_aux.f_test(r_matrix) # , r_matrix, res_aux def variance_inflation_factor(exog, exog_idx): """ Variance inflation factor, VIF, for one exogenous variable The variance inflation factor is a measure for the increase of the variance of the parameter estimates if an additional variable, given by exog_idx is added to the linear regression. It is a measure for multicollinearity of the design matrix, exog. One recommendation is that if VIF is greater than 5, then the explanatory variable given by exog_idx is highly collinear with the other explanatory variables, and the parameter estimates will have large standard errors because of this. Parameters ---------- exog : {ndarray, DataFrame} design matrix with all explanatory variables, as for example used in regression exog_idx : int index of the exogenous variable in the columns of exog Returns ------- float variance inflation factor Notes ----- This function does not save the auxiliary regression. See Also -------- xxx : class for regression diagnostics TODO: does not exist yet References ---------- https://en.wikipedia.org/wiki/Variance_inflation_factor """ k_vars = exog.shape[1] exog = np.asarray(exog) x_i = exog[:, exog_idx] mask = np.arange(k_vars) != exog_idx x_noti = exog[:, mask] r_squared_i = OLS(x_i, x_noti).fit().rsquared vif = 1. / (1. - r_squared_i) return vif class _BaseInfluenceMixin: """common methods between OLSInfluence and MLE/GLMInfluence """ @Appender(_plot_influence_doc.format(**{'extra_params_doc': ""})) def plot_influence(self, external=None, alpha=.05, criterion="cooks", size=48, plot_alpha=.75, ax=None, **kwargs): if external is None: external = hasattr(self, '_cache') and 'res_looo' in self._cache from statsmodels.graphics.regressionplots import _influence_plot if self.hat_matrix_diag is not None: res = _influence_plot(self.results, self, external=external, alpha=alpha, criterion=criterion, size=size, plot_alpha=plot_alpha, ax=ax, **kwargs) else: warnings.warn("Plot uses pearson residuals and exog hat matrix.") res = _influence_plot(self.results, self, external=external, alpha=alpha, criterion=criterion, size=size, leverage=self.hat_matrix_exog_diag, resid=self.resid, plot_alpha=plot_alpha, ax=ax, **kwargs) return res def _plot_index(self, y, ylabel, threshold=None, title=None, ax=None, **kwds): from statsmodels.graphics import utils fig, ax = utils.create_mpl_ax(ax) if title is None: title = "Index Plot" nobs = len(self.endog) index = np.arange(nobs) ax.scatter(index, y, **kwds) if threshold == 'all': large_points = np.ones(nobs, np.bool_) else: large_points = np.abs(y) > threshold psize = 3 * np.ones(nobs) # add point labels labels = self.results.model.data.row_labels if labels is None: labels = np.arange(nobs) ax = utils.annotate_axes(np.where(large_points)[0], labels, lzip(index, y), lzip(-psize, psize), "large", ax) font = {"fontsize": 16, "color": "black"} ax.set_ylabel(ylabel, **font) ax.set_xlabel("Observation", **font) ax.set_title(title, **font) return fig def plot_index(self, y_var='cooks', threshold=None, title=None, ax=None, idx=None, **kwds): """index plot for influence attributes Parameters ---------- y_var : str Name of attribute or shortcut for predefined attributes that will be plotted on the y-axis. threshold : None or float Threshold for adding annotation with observation labels. Observations for which the absolute value of the y_var is larger than the threshold will be annotated. Set to a negative number to label all observations or to a large number to have no annotation. title : str If provided, the title will replace the default "Index Plot" title. ax : matplolib axis instance The plot will be added to the `ax` if provided, otherwise a new figure is created. idx : {None, int} Some attributes require an additional index to select the y-var. In dfbetas this refers to the column indes. kwds : optional keywords Keywords will be used in the call to matplotlib scatter function. """ criterion = y_var # alias if threshold is None: # TODO: criterion specific defaults threshold = 'all' if criterion == 'dfbeta': y = self.dfbetas[:, idx] ylabel = 'DFBETA for ' + self.results.model.exog_names[idx] elif criterion.startswith('cook'): y = self.cooks_distance[0] ylabel = "Cook's distance" elif criterion.startswith('hat') or criterion.startswith('lever'): y = self.hat_matrix_diag ylabel = "Leverage (diagonal of hat matrix)" elif criterion.startswith('cook'): y = self.cooks_distance[0] ylabel = "Cook's distance" elif criterion.startswith('resid_stu'): y = self.resid_studentized ylabel = "Internally Studentized Residuals" else: # assume we have the name of an attribute y = getattr(self, y_var) if idx is not None: y = y[idx] ylabel = y_var fig = self._plot_index(y, ylabel, threshold=threshold, title=title, ax=ax, **kwds) return fig class MLEInfluence(_BaseInfluenceMixin): """Global Influence and outlier measures (experimental) Parameters ---------- results : instance of results class This only works for model and results classes that have the necessary helper methods. other arguments : Those are only available to override default behavior and are used instead of the corresponding attribute of the results class. By default resid_pearson is used as resid. Attributes ---------- hat_matrix_diag (hii) : This is the generalized leverage computed as the local derivative of fittedvalues (predicted mean) with respect to the observed response for each observation. Not available for ZeroInflated models because of nondifferentiability. d_params : Change in parameters computed with one Newton step using the full Hessian corrected by division by (1 - hii). If hat_matrix_diag is not available, then the division by (1 - hii) is not included. dbetas : change in parameters divided by the standard error of parameters from the full model results, ``bse``. cooks_distance : quadratic form for change in parameters weighted by ``cov_params`` from the full model divided by the number of variables. It includes p-values based on the F-distribution which are only approximate outside of linear Gaussian models. resid_studentized : In the general MLE case resid_studentized are computed from the score residuals scaled by hessian factor and leverage. This does not use ``cov_params``. d_fittedvalues : local change of expected mean given the change in the parameters as computed in ``d_params``. d_fittedvalues_scaled : same as d_fittedvalues but scaled by the standard errors of a predicted mean of the response. params_one : is the one step parameter estimate computed as ``params`` from the full sample minus ``d_params``. Notes ----- MLEInfluence uses generic definitions based on maximum likelihood models. MLEInfluence produces the same results as GLMInfluence for canonical links (verified for GLM Binomial, Poisson and Gaussian). There will be some differences for non-canonical links or if a robust cov_type is used. For example, the generalized leverage differs from the definition of the GLM hat matrix in the case of Probit, which corresponds to family Binomial with a non-canonical link. The extension to non-standard models, e.g. multi-link model like BetaModel and the ZeroInflated models is still experimental and might still change. Additonally, ZeroInflated and some threshold models have a nondifferentiability in the generalized leverage. How this case is treated might also change. Warning: This does currently not work for constrained or penalized models, e.g. models estimated with fit_constrained or fit_regularized. This has not yet been tested for correctness when offset or exposure are used, although they should be supported by the code. status: experimental, This class will need changes to support different kinds of models, e.g. extra parameters in discrete.NegativeBinomial or two-part models like ZeroInflatedPoisson. """ def __init__(self, results, resid=None, endog=None, exog=None, hat_matrix_diag=None, cov_params=None, scale=None): # this __init__ attaches attributes that we don't really need self.results = results = maybe_unwrap_results(results) # TODO: check for extra params in e.g. NegBin self.nobs, self.k_vars = results.model.exog.shape self.k_params = np.size(results.params) self.endog = endog if endog is not None else results.model.endog self.exog = exog if exog is not None else results.model.exog self.scale = scale if scale is not None else results.scale if resid is not None: self.resid = resid else: self.resid = getattr(results, "resid_pearson", None) if self.resid is not None: # and scale != 1: # GLM and similar does not divide resid_pearson by scale self.resid = self.resid / np.sqrt(self.scale) self.cov_params = (cov_params if cov_params is not None else results.cov_params()) self.model_class = results.model.__class__ self.hessian = self.results.model.hessian(self.results.params) self.score_obs = self.results.model.score_obs(self.results.params) if hat_matrix_diag is not None: self._hat_matrix_diag = hat_matrix_diag @cache_readonly def hat_matrix_diag(self): """Diagonal of the generalized leverage This is the analogue of the hat matrix diagonal for general MLE. """ if hasattr(self, '_hat_matrix_diag'): return self._hat_matrix_diag try: dsdy = self.results.model._deriv_score_obs_dendog( self.results.params) except NotImplementedError: dsdy = None if dsdy is None: warnings.warn("hat matrix is not available, missing derivatives", UserWarning) return None dmu_dp = self.results.model._deriv_mean_dparams(self.results.params) # dmu_dp = 1 / # self.results.model.family.link.deriv(self.results.fittedvalues) h = (dmu_dp * np.linalg.solve(-self.hessian, dsdy.T).T).sum(1) return h @cache_readonly def hat_matrix_exog_diag(self): """Diagonal of the hat_matrix using only exog as in OLS """ get_exogs = getattr(self.results.model, "_get_exogs", None) if get_exogs is not None: exog = np.column_stack(get_exogs()) else: exog = self.exog return (exog * np.linalg.pinv(exog).T).sum(1) @cache_readonly def d_params(self): """Approximate change in parameter estimates when dropping observation. This uses one-step approximation of the parameter change to deleting one observation. """ so_noti = self.score_obs.sum(0) - self.score_obs beta_i = np.linalg.solve(self.hessian, so_noti.T).T if self.hat_matrix_diag is not None: beta_i /= (1 - self.hat_matrix_diag)[:, None] return beta_i @cache_readonly def dfbetas(self): """Scaled change in parameter estimates. The one-step change of parameters in d_params is rescaled by dividing by the standard error of the parameter estimate given by results.bse. """ beta_i = self.d_params / self.results.bse return beta_i @cache_readonly def params_one(self): """Parameter estimate based on one-step approximation. This the one step parameter estimate computed as ``params`` from the full sample minus ``d_params``. """ return self.results.params - self.d_params @cache_readonly def cooks_distance(self): """Cook's distance and p-values. Based on one step approximation d_params and on results.cov_params Cook's distance divides by the number of explanatory variables. p-values are based on the F-distribution which are only approximate outside of linear Gaussian models. Warning: The definition of p-values might change if we switch to using chi-square distribution instead of F-distribution, or if we make it dependent on the fit keyword use_t. """ cooks_d2 = (self.d_params * np.linalg.solve(self.cov_params, self.d_params.T).T).sum(1) cooks_d2 /= self.k_params from scipy import stats # alpha = 0.1 # print stats.f.isf(1-alpha, n_params, res.df_modelwc) # TODO use chi2 # use_f option pvals = stats.f.sf(cooks_d2, self.k_params, self.results.df_resid) return cooks_d2, pvals @cache_readonly def resid_studentized(self): """studentized default residuals. This uses the residual in `resid` attribute, which is by default resid_pearson and studentizes is using the generalized leverage. self.resid / np.sqrt(1 - self.hat_matrix_diag) Studentized residuals are not available if hat_matrix_diag is None. """ return self.resid / np.sqrt(1 - self.hat_matrix_diag) def resid_score_factor(self): """Score residual divided by sqrt of hessian factor. experimental, agrees with GLMInfluence for Binomial and Gaussian. This corresponds to considering the linear predictors as parameters of the model. Note: Nhis might have nan values if second derivative, hessian_factor, is positive, i.e. loglikelihood is not globally concave w.r.t. linear predictor. (This occured in an example for GeneralizedPoisson) """ from statsmodels.genmod.generalized_linear_model import GLM sf = self.results.model.score_factor(self.results.params) hf = self.results.model.hessian_factor(self.results.params) if isinstance(sf, tuple): sf = sf[0] if isinstance(hf, tuple): hf = hf[0] if not isinstance(self.results.model, GLM): # hessian_factor in GLM has wrong sign, is already positive hf = -hf return sf / np.sqrt(hf) / np.sqrt(1 - self.hat_matrix_diag) def resid_score(self, joint=True, index=None, studentize=False): """Score observations scaled by inverse hessian. Score residual in resid_score are defined in analogy to a score test statistic for each observation. Parameters ---------- joint : bool If joint is true, then a quadratic form similar to score_test is returned for each observation. If joint is false, then standardized score_obs are returned. The returned array is two-dimensional index : ndarray (optional) Optional index to select a subset of score_obs columns. By default, all columns of score_obs will be used. studentize : bool If studentize is true, the the scaled residuals are also studentized using the generalized leverage. Returns ------- array : 1-D or 2-D residuals Notes ----- Status: experimental Because of the one srep approacimation of d_params, score residuals are identical to cooks_distance, except for - cooks_distance is normalized by the number of parameters - cooks_distance uses cov_params, resid_score is based on Hessian. This will make them differ in the case of robust cov_params. """ # currently no caching score_obs = self.results.model.score_obs(self.results.params) hess = self.results.model.hessian(self.results.params) if index is not None: score_obs = score_obs[:, index] hess = hess[index[:, None], index] if joint: resid = (score_obs.T * np.linalg.solve(-hess, score_obs.T)).sum(0) else: resid = score_obs / np.sqrt(np.diag(-hess)) if studentize: if joint: resid /= np.sqrt(1 - self.hat_matrix_diag) else: # 2-dim resid resid /= np.sqrt(1 - self.hat_matrix_diag[:, None]) return resid @cache_readonly def _get_prediction(self): # TODO: do we cache this or does it need to be a method # we only need unchanging parts, alpha for confint could change with warnings.catch_warnings(): msg = 'linear keyword is deprecated, use which="linear"' warnings.filterwarnings("ignore", message=msg, category=FutureWarning) pred = self.results.get_prediction() return pred @cache_readonly def d_fittedvalues(self): """Change in expected response, fittedvalues. Local change of expected mean given the change in the parameters as computed in d_params. Notes ----- This uses the one-step approximation of the parameter change to deleting one observation ``d_params``. """ # results.params might be a pandas.Series params = np.asarray(self.results.params) deriv = self.results.model._deriv_mean_dparams(params) return (deriv * self.d_params).sum(1) @property def d_fittedvalues_scaled(self): """ Change in fittedvalues scaled by standard errors. This uses one-step approximation of the parameter change to deleting one observation ``d_params``, and divides by the standard errors for the predicted mean provided by results.get_prediction. """ # Note: this and the previous methods are for the response # and not for a weighted response, i.e. not the self.exog, self.endog # this will be relevant for WLS comparing fitted endog versus wendog return self.d_fittedvalues / self._get_prediction.se def summary_frame(self): """ Creates a DataFrame with influence results. Returns ------- frame : pandas DataFrame A DataFrame with selected results for each observation. The index will be the same as provided to the model. Notes ----- The resultant DataFrame contains six variables in addition to the ``dfbetas``. These are: * cooks_d : Cook's Distance defined in ``cooks_distance`` * standard_resid : Standardized residuals defined in `resid_studentizedl` * hat_diag : The diagonal of the projection, or hat, matrix defined in `hat_matrix_diag`. Not included if None. * dffits_internal : DFFITS statistics using internally Studentized residuals defined in `d_fittedvalues_scaled` """ from pandas import DataFrame # row and column labels data = self.results.model.data row_labels = data.row_labels beta_labels = ['dfb_' + i for i in data.xnames] # grab the results if self.hat_matrix_diag is not None: summary_data = DataFrame(dict( cooks_d=self.cooks_distance[0], standard_resid=self.resid_studentized, hat_diag=self.hat_matrix_diag, dffits_internal=self.d_fittedvalues_scaled), index=row_labels) else: summary_data = DataFrame(dict( cooks_d=self.cooks_distance[0], # standard_resid=self.resid_studentized, # hat_diag=self.hat_matrix_diag, dffits_internal=self.d_fittedvalues_scaled), index=row_labels) # NOTE: if we do not give columns, order of above will be arbitrary dfbeta = DataFrame(self.dfbetas, columns=beta_labels, index=row_labels) return dfbeta.join(summary_data) class OLSInfluence(_BaseInfluenceMixin): """class to calculate outlier and influence measures for OLS result Parameters ---------- results : RegressionResults currently assumes the results are from an OLS regression Notes ----- One part of the results can be calculated without any auxiliary regression (some of which have the `_internal` postfix in the name. Other statistics require leave-one-observation-out (LOOO) auxiliary regression, and will be slower (mainly results with `_external` postfix in the name). The auxiliary LOOO regression only the required results are stored. Using the LOO measures is currently only recommended if the data set is not too large. One possible approach for LOOO measures would be to identify possible problem observations with the _internal measures, and then run the leave-one-observation-out only with observations that are possible outliers. (However, this is not yet available in an automated way.) This should be extended to general least squares. The leave-one-variable-out (LOVO) auxiliary regression are currently not used. """ def __init__(self, results): # check which model is allowed self.results = maybe_unwrap_results(results) self.nobs, self.k_vars = results.model.exog.shape self.endog = results.model.endog self.exog = results.model.exog self.resid = results.resid self.model_class = results.model.__class__ # self.sigma_est = np.sqrt(results.mse_resid) self.scale = results.mse_resid self.aux_regression_exog = {} self.aux_regression_endog = {} @cache_readonly def hat_matrix_diag(self): """Diagonal of the hat_matrix for OLS Notes ----- temporarily calculated here, this should go to model class """ return (self.exog * self.results.model.pinv_wexog.T).sum(1) @cache_readonly def resid_press(self): """PRESS residuals """ hii = self.hat_matrix_diag return self.resid / (1 - hii) @cache_readonly def influence(self): """Influence measure matches the influence measure that gretl reports u * h / (1 - h) where u are the residuals and h is the diagonal of the hat_matrix """ hii = self.hat_matrix_diag return self.resid * hii / (1 - hii) @cache_readonly def hat_diag_factor(self): """Factor of diagonal of hat_matrix used in influence this might be useful for internal reuse h / (1 - h) """ hii = self.hat_matrix_diag return hii / (1 - hii) @cache_readonly def ess_press(self): """Error sum of squares of PRESS residuals """ return np.dot(self.resid_press, self.resid_press) @cache_readonly def resid_studentized(self): """Studentized residuals using variance from OLS alias for resid_studentized_internal for compatibility with MLEInfluence this uses sigma from original estimate and does not require leave one out loop """ return self.resid_studentized_internal @cache_readonly def resid_studentized_internal(self): """Studentized residuals using variance from OLS this uses sigma from original estimate does not require leave one out loop """ return self.get_resid_studentized_external(sigma=None) # return self.results.resid / self.sigma_est @cache_readonly def resid_studentized_external(self): """Studentized residuals using LOOO variance this uses sigma from leave-one-out estimates requires leave one out loop for observations """ sigma_looo = np.sqrt(self.sigma2_not_obsi) return self.get_resid_studentized_external(sigma=sigma_looo) def get_resid_studentized_external(self, sigma=None): """calculate studentized residuals Parameters ---------- sigma : None or float estimate of the standard deviation of the residuals. If None, then the estimate from the regression results is used. Returns ------- stzd_resid : ndarray studentized residuals Notes ----- studentized residuals are defined as :: resid / sigma / np.sqrt(1 - hii) where resid are the residuals from the regression, sigma is an estimate of the standard deviation of the residuals, and hii is the diagonal of the hat_matrix. """ hii = self.hat_matrix_diag if sigma is None: sigma2_est = self.scale # can be replace by different estimators of sigma sigma = np.sqrt(sigma2_est) return self.resid / sigma / np.sqrt(1 - hii) # same computation as GLMInfluence @cache_readonly def cooks_distance(self): """ Cooks distance Uses original results, no nobs loop References ---------- .. [*] Eubank, R. L. (1999). Nonparametric regression and spline smoothing. CRC press. .. [*] Cook's distance. (n.d.). In Wikipedia. July 2019, from https://en.wikipedia.org/wiki/Cook%27s_distance """ hii = self.hat_matrix_diag # Eubank p.93, 94 cooks_d2 = self.resid_studentized ** 2 / self.k_vars cooks_d2 *= hii / (1 - hii) from scipy import stats # alpha = 0.1 # print stats.f.isf(1-alpha, n_params, res.df_modelwc) pvals = stats.f.sf(cooks_d2, self.k_vars, self.results.df_resid) return cooks_d2, pvals @cache_readonly def dffits_internal(self): """dffits measure for influence of an observation based on resid_studentized_internal uses original results, no nobs loop """ # TODO: do I want to use different sigma estimate in # resid_studentized_external # -> move definition of sigma_error to the __init__ hii = self.hat_matrix_diag dffits_ = self.resid_studentized_internal * np.sqrt(hii / (1 - hii)) dffits_threshold = 2 * np.sqrt(self.k_vars * 1. / self.nobs) return dffits_, dffits_threshold @cache_readonly def dffits(self): """ dffits measure for influence of an observation based on resid_studentized_external, uses results from leave-one-observation-out loop It is recommended that observations with dffits large than a threshold of 2 sqrt{k / n} where k is the number of parameters, should be investigated. Returns ------- dffits : float dffits_threshold : float References ---------- `Wikipedia `_ """ # TODO: do I want to use different sigma estimate in # resid_studentized_external # -> move definition of sigma_error to the __init__ hii = self.hat_matrix_diag dffits_ = self.resid_studentized_external * np.sqrt(hii / (1 - hii)) dffits_threshold = 2 * np.sqrt(self.k_vars * 1. / self.nobs) return dffits_, dffits_threshold @cache_readonly def dfbetas(self): """dfbetas uses results from leave-one-observation-out loop """ dfbetas = self.results.params - self.params_not_obsi # [None,:] dfbetas /= np.sqrt(self.sigma2_not_obsi[:, None]) dfbetas /= np.sqrt(np.diag(self.results.normalized_cov_params)) return dfbetas @cache_readonly def dfbeta(self): """dfbetas uses results from leave-one-observation-out loop """ dfbeta = self.results.params - self.params_not_obsi return dfbeta @cache_readonly def sigma2_not_obsi(self): """error variance for all LOOO regressions This is 'mse_resid' from each auxiliary regression. uses results from leave-one-observation-out loop """ return np.asarray(self._res_looo['mse_resid']) @property def params_not_obsi(self): """parameter estimates for all LOOO regressions uses results from leave-one-observation-out loop """ return np.asarray(self._res_looo['params']) @property def det_cov_params_not_obsi(self): """determinant of cov_params of all LOOO regressions uses results from leave-one-observation-out loop """ return np.asarray(self._res_looo['det_cov_params']) @cache_readonly def cov_ratio(self): """covariance ratio between LOOO and original This uses determinant of the estimate of the parameter covariance from leave-one-out estimates. requires leave one out loop for observations """ # do not use inplace division / because then we change original cov_ratio = (self.det_cov_params_not_obsi / np.linalg.det(self.results.cov_params())) return cov_ratio @cache_readonly def resid_var(self): """estimate of variance of the residuals :: sigma2 = sigma2_OLS * (1 - hii) where hii is the diagonal of the hat matrix """ # TODO:check if correct outside of ols return self.scale * (1 - self.hat_matrix_diag) @cache_readonly def resid_std(self): """estimate of standard deviation of the residuals See Also -------- resid_var """ return np.sqrt(self.resid_var) def _ols_xnoti(self, drop_idx, endog_idx='endog', store=True): """regression results from LOVO auxiliary regression with cache The result instances are stored, which could use a large amount of memory if the datasets are large. There are too many combinations to store them all, except for small problems. Parameters ---------- drop_idx : int index of exog that is dropped from the regression endog_idx : 'endog' or int If 'endog', then the endogenous variable of the result instance is regressed on the exogenous variables, excluding the one at drop_idx. If endog_idx is an integer, then the exog with that index is regressed with OLS on all other exogenous variables. (The latter is the auxiliary regression for the variance inflation factor.) this needs more thought, memory versus speed not yet used in any other parts, not sufficiently tested """ # reverse the structure, access store, if fail calculate ? # this creates keys in store even if store = false ! bug if endog_idx == 'endog': stored = self.aux_regression_endog if hasattr(stored, drop_idx): return stored[drop_idx] x_i = self.results.model.endog else: # nested dictionary try: self.aux_regression_exog[endog_idx][drop_idx] except KeyError: pass stored = self.aux_regression_exog[endog_idx] stored = {} x_i = self.exog[:, endog_idx] k_vars = self.exog.shape[1] mask = np.arange(k_vars) != drop_idx x_noti = self.exog[:, mask] res = OLS(x_i, x_noti).fit() if store: stored[drop_idx] = res return res def _get_drop_vari(self, attributes): """ regress endog on exog without one of the variables This uses a k_vars loop, only attributes of the OLS instance are stored. Parameters ---------- attributes : list[str] These are the names of the attributes of the auxiliary OLS results instance that are stored and returned. not yet used """ from statsmodels.sandbox.tools.cross_val import LeaveOneOut endog = self.results.model.endog exog = self.exog cv_iter = LeaveOneOut(self.k_vars) res_loo = defaultdict(list) for inidx, outidx in cv_iter: for att in attributes: res_i = self.model_class(endog, exog[:, inidx]).fit() res_loo[att].append(getattr(res_i, att)) return res_loo @cache_readonly def _res_looo(self): """collect required results from the LOOO loop all results will be attached. currently only 'params', 'mse_resid', 'det_cov_params' are stored regresses endog on exog dropping one observation at a time this uses a nobs loop, only attributes of the OLS instance are stored. """ from statsmodels.sandbox.tools.cross_val import LeaveOneOut def get_det_cov_params(res): return np.linalg.det(res.cov_params()) endog = self.results.model.endog exog = self.results.model.exog params = np.zeros(exog.shape, dtype=float) mse_resid = np.zeros(endog.shape, dtype=float) det_cov_params = np.zeros(endog.shape, dtype=float) cv_iter = LeaveOneOut(self.nobs) for inidx, outidx in cv_iter: res_i = self.model_class(endog[inidx], exog[inidx]).fit() params[outidx] = res_i.params mse_resid[outidx] = res_i.mse_resid det_cov_params[outidx] = get_det_cov_params(res_i) return dict(params=params, mse_resid=mse_resid, det_cov_params=det_cov_params) def summary_frame(self): """ Creates a DataFrame with all available influence results. Returns ------- frame : DataFrame A DataFrame with all results. Notes ----- The resultant DataFrame contains six variables in addition to the DFBETAS. These are: * cooks_d : Cook's Distance defined in `Influence.cooks_distance` * standard_resid : Standardized residuals defined in `Influence.resid_studentized_internal` * hat_diag : The diagonal of the projection, or hat, matrix defined in `Influence.hat_matrix_diag` * dffits_internal : DFFITS statistics using internally Studentized residuals defined in `Influence.dffits_internal` * dffits : DFFITS statistics using externally Studentized residuals defined in `Influence.dffits` * student_resid : Externally Studentized residuals defined in `Influence.resid_studentized_external` """ from pandas import DataFrame # row and column labels data = self.results.model.data row_labels = data.row_labels beta_labels = ['dfb_' + i for i in data.xnames] # grab the results summary_data = DataFrame(dict( cooks_d=self.cooks_distance[0], standard_resid=self.resid_studentized_internal, hat_diag=self.hat_matrix_diag, dffits_internal=self.dffits_internal[0], student_resid=self.resid_studentized_external, dffits=self.dffits[0], ), index=row_labels) # NOTE: if we do not give columns, order of above will be arbitrary dfbeta = DataFrame(self.dfbetas, columns=beta_labels, index=row_labels) return dfbeta.join(summary_data) def summary_table(self, float_fmt="%6.3f"): """create a summary table with all influence and outlier measures This does currently not distinguish between statistics that can be calculated from the original regression results and for which a leave-one-observation-out loop is needed Returns ------- res : SimpleTable SimpleTable instance with the results, can be printed Notes ----- This also attaches table_data to the instance. """ # print self.dfbetas # table_raw = [ np.arange(self.nobs), # self.endog, # self.fittedvalues, # self.cooks_distance(), # self.resid_studentized_internal, # self.hat_matrix_diag, # self.dffits_internal, # self.resid_studentized_external, # self.dffits, # self.dfbetas # ] table_raw = [('obs', np.arange(self.nobs)), ('endog', self.endog), ('fitted\nvalue', self.results.fittedvalues), ("Cook's\nd", self.cooks_distance[0]), ("student.\nresidual", self.resid_studentized_internal), ('hat diag', self.hat_matrix_diag), ('dffits \ninternal', self.dffits_internal[0]), ("ext.stud.\nresidual", self.resid_studentized_external), ('dffits', self.dffits[0]) ] colnames, data = lzip(*table_raw) # unzip data = np.column_stack(data) self.table_data = data from copy import deepcopy from statsmodels.iolib.table import SimpleTable, default_html_fmt from statsmodels.iolib.tableformatting import fmt_base fmt = deepcopy(fmt_base) fmt_html = deepcopy(default_html_fmt) fmt['data_fmts'] = ["%4d"] + [float_fmt] * (data.shape[1] - 1) # fmt_html['data_fmts'] = fmt['data_fmts'] return SimpleTable(data, headers=colnames, txt_fmt=fmt, html_fmt=fmt_html) def summary_table(res, alpha=0.05): """ Generate summary table of outlier and influence similar to SAS Parameters ---------- alpha : float significance level for confidence interval Returns ------- st : SimpleTable table with results that can be printed data : ndarray calculated measures and statistics for the table ss2 : list[str] column_names for table (Note: rows of table are observations) """ from scipy import stats from statsmodels.sandbox.regression.predstd import wls_prediction_std infl = OLSInfluence(res) # standard error for predicted mean # Note: using hat_matrix only works for fitted values predict_mean_se = np.sqrt(infl.hat_matrix_diag * res.mse_resid) tppf = stats.t.isf(alpha / 2., res.df_resid) predict_mean_ci = np.column_stack([ res.fittedvalues - tppf * predict_mean_se, res.fittedvalues + tppf * predict_mean_se]) # standard error for predicted observation tmp = wls_prediction_std(res, alpha=alpha) predict_se, predict_ci_low, predict_ci_upp = tmp predict_ci = np.column_stack((predict_ci_low, predict_ci_upp)) # standard deviation of residual resid_se = np.sqrt(res.mse_resid * (1 - infl.hat_matrix_diag)) table_sm = np.column_stack([ np.arange(res.nobs) + 1, res.model.endog, res.fittedvalues, predict_mean_se, predict_mean_ci[:, 0], predict_mean_ci[:, 1], predict_ci[:, 0], predict_ci[:, 1], res.resid, resid_se, infl.resid_studentized_internal, infl.cooks_distance[0] ]) # colnames, data = lzip(*table_raw) #unzip data = table_sm ss2 = ['Obs', 'Dep Var\nPopulation', 'Predicted\nValue', 'Std Error\nMean Predict', 'Mean ci\n95% low', 'Mean ci\n95% upp', 'Predict ci\n95% low', 'Predict ci\n95% upp', 'Residual', 'Std Error\nResidual', 'Student\nResidual', "Cook's\nD"] colnames = ss2 # self.table_data = data # data = np.column_stack(data) from copy import deepcopy from statsmodels.iolib.table import SimpleTable, default_html_fmt from statsmodels.iolib.tableformatting import fmt_base fmt = deepcopy(fmt_base) fmt_html = deepcopy(default_html_fmt) fmt['data_fmts'] = ["%4d"] + ["%6.3f"] * (data.shape[1] - 1) # fmt_html['data_fmts'] = fmt['data_fmts'] st = SimpleTable(data, headers=colnames, txt_fmt=fmt, html_fmt=fmt_html) return st, data, ss2 class GLMInfluence(MLEInfluence): """Influence and outlier measures (experimental) This uses partly formulas specific to GLM, specifically cooks_distance is based on the hessian, i.e. observed or expected information matrix and not on cov_params, in contrast to MLEInfluence. Standardization for changes in parameters, in fittedvalues and in the linear predictor are based on cov_params. Parameters ---------- results : instance of results class This only works for model and results classes that have the necessary helper methods. other arguments are only to override default behavior and are used instead of the corresponding attribute of the results class. By default resid_pearson is used as resid. Attributes ---------- dbetas change in parameters divided by the standard error of parameters from the full model results, ``bse``. d_fittedvalues_scaled same as d_fittedvalues but scaled by the standard errors of a predicted mean of the response. d_linpred local change in linear prediction. d_linpred_scale local change in linear prediction scaled by the standard errors for the prediction based on cov_params. Notes ----- This has not yet been tested for correctness when offset or exposure are used, although they should be supported by the code. Some GLM specific measures like d_deviance are still missing. Computing an explicit leave-one-observation-out (LOOO) loop is included but no influence measures are currently computed from it. """ @cache_readonly def hat_matrix_diag(self): """ Diagonal of the hat_matrix for GLM Notes ----- This returns the diagonal of the hat matrix that was provided as argument to GLMInfluence or computes it using the results method `get_hat_matrix`. """ if hasattr(self, '_hat_matrix_diag'): return self._hat_matrix_diag else: return self.results.get_hat_matrix() @cache_readonly def d_params(self): """Change in parameter estimates Notes ----- This uses one-step approximation of the parameter change to deleting one observation. """ beta_i = np.linalg.pinv(self.exog) * self.resid_studentized beta_i /= np.sqrt(1 - self.hat_matrix_diag) return beta_i.T # same computation as OLS @cache_readonly def resid_studentized(self): """ Internally studentized pearson residuals Notes ----- residuals / sqrt( scale * (1 - hii)) where residuals are those provided to GLMInfluence which are pearson residuals by default, and hii is the diagonal of the hat matrix. """ # redundant with scaled resid_pearson, keep for docstring for now return super().resid_studentized # same computation as OLS @cache_readonly def cooks_distance(self): """Cook's distance Notes ----- Based on one step approximation using resid_studentized and hat_matrix_diag for the computation. Cook's distance divides by the number of explanatory variables. Computed using formulas for GLM and does not use results.cov_params. It includes p-values based on the F-distribution which are only approximate outside of linear Gaussian models. """ hii = self.hat_matrix_diag # Eubank p.93, 94 cooks_d2 = self.resid_studentized ** 2 / self.k_vars cooks_d2 *= hii / (1 - hii) from scipy import stats # alpha = 0.1 # print stats.f.isf(1-alpha, n_params, res.df_modelwc) pvals = stats.f.sf(cooks_d2, self.k_vars, self.results.df_resid) return cooks_d2, pvals @property def d_linpred(self): """ Change in linear prediction This uses one-step approximation of the parameter change to deleting one observation ``d_params``. """ # TODO: This will need adjustment for extra params in Poisson # use original model exog not transformed influence exog exog = self.results.model.exog return (exog * self.d_params).sum(1) @property def d_linpred_scaled(self): """ Change in linpred scaled by standard errors This uses one-step approximation of the parameter change to deleting one observation ``d_params``, and divides by the standard errors for linpred provided by results.get_prediction. """ # Note: this and the previous methods are for the response # and not for a weighted response, i.e. not the self.exog, self.endog # this will be relevant for WLS comparing fitted endog versus wendog return self.d_linpred / self._get_prediction.linpred.se @property def _fittedvalues_one(self): """experimental code """ warnings.warn('this ignores offset and exposure', UserWarning) # TODO: we need to handle offset, exposure and weights # use original model exog not transformed influence exog exog = self.results.model.exog fitted = np.array([self.results.model.predict(pi, exog[i]) for i, pi in enumerate(self.params_one)]) return fitted.squeeze() @property def _diff_fittedvalues_one(self): """experimental code """ # in discrete we cannot reuse results.fittedvalues return self.results.predict() - self._fittedvalues_one @cache_readonly def _res_looo(self): """collect required results from the LOOO loop all results will be attached. currently only 'params', 'mse_resid', 'det_cov_params' are stored Reestimates the model with endog and exog dropping one observation at a time This uses a nobs loop, only attributes of the results instance are stored. Warning: This will need refactoring and API changes to be able to add options. """ from statsmodels.sandbox.tools.cross_val import LeaveOneOut get_det_cov_params = lambda res: np.linalg.det(res.cov_params()) endog = self.results.model.endog exog = self.results.model.exog init_kwds = self.results.model._get_init_kwds() # We need to drop obs also from extra arrays freq_weights = init_kwds.pop('freq_weights') var_weights = init_kwds.pop('var_weights') offset = offset_ = init_kwds.pop('offset') exposure = exposure_ = init_kwds.pop('exposure') n_trials = init_kwds.pop('n_trials', None) # family Binomial creates `n` i.e. `n_trials` # we need to reset it # TODO: figure out how to do this properly if hasattr(init_kwds['family'], 'initialize'): # assume we have Binomial is_binomial = True else: is_binomial = False params = np.zeros(exog.shape, dtype=float) scale = np.zeros(endog.shape, dtype=float) det_cov_params = np.zeros(endog.shape, dtype=float) cv_iter = LeaveOneOut(self.nobs) for inidx, outidx in cv_iter: if offset is not None: offset_ = offset[inidx] if exposure is not None: exposure_ = exposure[inidx] if n_trials is not None: init_kwds['n_trials'] = n_trials[inidx] mod_i = self.model_class(endog[inidx], exog[inidx], offset=offset_, exposure=exposure_, freq_weights=freq_weights[inidx], var_weights=var_weights[inidx], **init_kwds) if is_binomial: mod_i.family.n = init_kwds['n_trials'] res_i = mod_i.fit(start_params=self.results.params, method='newton') params[outidx] = res_i.params.copy() scale[outidx] = res_i.scale det_cov_params[outidx] = get_det_cov_params(res_i) return dict(params=params, scale=scale, mse_resid=scale, # alias for now det_cov_params=det_cov_params)