'''Partial Regression plot and residual plots to find misspecification


Author: Josef Perktold
License: BSD-3
Created: 2011-01-23

update
2011-06-05 : start to convert example to usable functions
2011-10-27 : docstrings

'''
from statsmodels.compat.pandas import Appender
from statsmodels.compat.python import lrange, lzip

import numpy as np
import pandas as pd
from patsy import dmatrix

from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.graphics import utils
from statsmodels.nonparametric.smoothers_lowess import lowess
from statsmodels.regression.linear_model import GLS, OLS, WLS
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.tools.tools import maybe_unwrap_results

from ._regressionplots_doc import (
    _plot_added_variable_doc,
    _plot_ceres_residuals_doc,
    _plot_influence_doc,
    _plot_leverage_resid2_doc,
    _plot_partial_residuals_doc,
)

__all__ = ['plot_fit', 'plot_regress_exog', 'plot_partregress', 'plot_ccpr',
           'plot_partregress_grid', 'plot_ccpr_grid',
           'add_lowess', 'abline_plot', 'influence_plot',
           'plot_leverage_resid2', 'added_variable_resids',
           'partial_resids', 'ceres_resids', 'plot_added_variable',
           'plot_partial_residuals', 'plot_ceres_residuals']


#TODO: consider moving to influence module
def _high_leverage(results):
    #TODO: replace 1 with k_constant
    return 2. * (results.df_model + 1) / results.nobs


def add_lowess(ax, lines_idx=0, frac=.2, **lowess_kwargs):
    """
    Add a lowess line to a plot.

    Parameters
    ----------
    ax : AxesSubplot
        The Axes to which to add the plot.
    lines_idx : int
        The index of the line on the existing plot to which you want to
        add a smoothed lowess line.
    frac : float
        The fraction of the points to use when doing the lowess fit.
    lowess_kwargs
        Additional keyword arguments are passed to lowess.

    Returns
    -------
    Figure
        The figure to which `ax` is connected.
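
    Examples
    --------
    A minimal sketch: plot residuals against a regressor, then smooth
    them. The data set and model are illustrative; any line already on
    the axes can be smoothed by index.

    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.api as sm
    >>> from statsmodels.graphics.regressionplots import add_lowess
    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> results = sm.OLS(data['murder'],
    ...                  sm.add_constant(data['poverty'])).fit()
    >>> fig, ax = plt.subplots()
    >>> ax.plot(data['poverty'], results.resid, 'o')
    >>> fig = add_lowess(ax, lines_idx=0, frac=0.5)
    >>> plt.show()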
    """
    y0 = ax.get_lines()[lines_idx]._y
    x0 = ax.get_lines()[lines_idx]._x
    lres = lowess(y0, x0, frac=frac, **lowess_kwargs)
    ax.plot(lres[:, 0], lres[:, 1], 'r', lw=1.5)
    return ax.figure


def plot_fit(results, exog_idx, y_true=None, ax=None, vlines=True, **kwargs):
    """
    Plot fit against one regressor.

    This creates one graph with the scatterplot of observed values
    compared to fitted values.

    Parameters
    ----------
    results : Results
        A result instance with resid, model.endog and model.exog as
        attributes.
    exog_idx : {int, str}
        Name or index of regressor in exog matrix.
    y_true : array_like, optional
        If this is not None, then the array is added to the plot.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    vlines : bool, optional
        If False, the uncertainty (pointwise prediction intervals) of the
        fit is not plotted.
    **kwargs
        The keyword arguments are passed to the plot command for the fitted
        values points.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    Examples
    --------
    Load the Statewide Crime data set and perform linear regression with
    `poverty` and `hs_grad` as variables and `murder` as the response

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> murder = data['murder']
    >>> X = data[['poverty', 'hs_grad']]

    >>> X["constant"] = 1
    >>> y = murder
    >>> model = sm.OLS(y, X)
    >>> results = model.fit()

    Create a plot just for the variable 'Poverty'.
    Note that vertical bars representing the uncertainty are plotted since
    `vlines` is True.

    >>> fig, ax = plt.subplots()
    >>> fig = sm.graphics.plot_fit(results, 0, ax=ax)
    >>> ax.set_ylabel("Murder Rate")
    >>> ax.set_xlabel("Poverty Level")
    >>> ax.set_title("Linear Regression")

    >>> plt.show()

    .. plot:: plots/graphics_plot_fit_ex.py
    """

    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y = results.model.endog
    x1 = results.model.exog[:, exog_idx]
    x1_argsort = np.argsort(x1)
    y = y[x1_argsort]
    x1 = x1[x1_argsort]

    ax.plot(x1, y, 'bo', label=results.model.endog_names)
    if y_true is not None:
        ax.plot(x1, y_true[x1_argsort], 'b-', label='True values')
    title = 'Fitted values versus %s' % exog_name

    ax.plot(x1, results.fittedvalues[x1_argsort], 'D', color='r',
            label='fitted', **kwargs)
    if vlines is True:
        _, iv_l, iv_u = wls_prediction_std(results)
        ax.vlines(x1, iv_l[x1_argsort], iv_u[x1_argsort], linewidth=1,
                  color='k', alpha=.7)
    #ax.fill_between(x1, iv_l[x1_argsort], iv_u[x1_argsort], alpha=0.1,
    #                color='k')
    ax.set_title(title)
    ax.set_xlabel(exog_name)
    ax.set_ylabel(results.model.endog_names)
    ax.legend(loc='best', numpoints=1)

    return fig


def plot_regress_exog(results, exog_idx, fig=None):
    """Plot regression results against one regressor.

    This plots four graphs in a 2 by 2 figure: 'endog versus exog',
    'residuals versus exog', 'fitted versus exog' and
    'fitted plus residual versus exog'.

    Parameters
    ----------
    results : result instance
        A result instance with resid, model.endog and model.exog as
        attributes.
    exog_idx : int or str
        Name or index of regressor in exog matrix.
    fig : Figure, optional
        If given, this figure is used to plot in. Otherwise a new figure
        is created.

    Returns
    -------
    Figure
        The value of `fig` if provided. Otherwise a new instance.

    Examples
    --------
    Load the Statewide Crime data set and build a model with regressors
    including the rate of high school graduation (hs_grad), population in
    urban areas (urban), households below poverty line (poverty), and
    single person households (single). Outcome variable is the murder rate
    (murder).

    Build a 2 by 2 figure based on poverty showing fitted versus actual
    murder rate, residuals versus the poverty rate, partial regression
    plot of poverty, and CCPR plot for poverty rate.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_regress_exog(results, 'poverty', fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_regress_exog.py
    """

    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    #maybe add option for wendog, wexog
    y_name = results.model.endog_names
    x1 = results.model.exog[:, exog_idx]
    prstd, iv_l, iv_u = wls_prediction_std(results)

    ax = fig.add_subplot(2, 2, 1)
    ax.plot(x1, results.model.endog, 'o', color='b', alpha=0.9, label=y_name)
    ax.plot(x1, results.fittedvalues, 'D', color='r', label='fitted',
            alpha=.5)
    ax.vlines(x1, iv_l, iv_u, linewidth=1, color='k', alpha=.7)
    ax.set_title('Y and Fitted vs. X', fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel(y_name)
    ax.legend(loc='best')

    ax = fig.add_subplot(2, 2, 2)
    ax.plot(x1, results.resid, 'o')
    ax.axhline(y=0, color='black')
    ax.set_title('Residuals versus %s' % exog_name, fontsize='large')
    ax.set_xlabel(exog_name)
    ax.set_ylabel("resid")

    ax = fig.add_subplot(2, 2, 3)
    exog_noti = np.ones(results.model.exog.shape[1], bool)
    exog_noti[exog_idx] = False
    exog_others = results.model.exog[:, exog_noti]
    from pandas import Series
    fig = plot_partregress(results.model.data.orig_endog,
                           Series(x1, name=exog_name,
                                  index=results.model.data.row_labels),
                           exog_others, obs_labels=False, ax=ax)
    ax.set_title('Partial regression plot', fontsize='large')
    #ax.set_ylabel("Fitted values")
    #ax.set_xlabel(exog_name)

    ax = fig.add_subplot(2, 2, 4)
    fig = plot_ccpr(results, exog_idx, ax=ax)
    ax.set_title('CCPR Plot', fontsize='large')
    #ax.set_xlabel(exog_name)
    #ax.set_ylabel("Fitted values + resids")

    fig.suptitle('Regression Plots for %s' % exog_name, fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.90)
    return fig


def _partial_regression(endog, exog_i, exog_others):
    """Partial regression.

    Regress endog on exog_i conditional on exog_others.

    Uses OLS.

    Parameters
    ----------
    endog : array_like
    exog_i : array_like
    exog_others : array_like

    Returns
    -------
    res1c : OLS results instance

    (res1a, res1b) : tuple of OLS results instances
        results from regression of endog on exog_others and of exog_i on
        exog_others
    """
    #FIXME: This function does not appear to be used.
    res1a = OLS(endog, exog_others).fit()
    res1b = OLS(exog_i, exog_others).fit()
    res1c = OLS(res1a.resid, res1b.resid).fit()

    return res1c, (res1a, res1b)


def plot_partregress(endog, exog_i, exog_others, data=None,
                     title_kwargs={}, obs_labels=True, label_kwargs={},
                     ax=None, ret_coords=False, eval_env=1, **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
        The endogenous or response variable. If a string is given, you can
        use arbitrary transformations as with a formula.
    exog_i : {ndarray, str}
        The exogenous, explanatory variable. If a string is given, you can
        use arbitrary transformations as with a formula.
    exog_others : {ndarray, list[str]}
        Any other exogenous, explanatory variables. If a list of strings
        is given, each item is a term in a formula. You can use arbitrary
        transformations as with a formula. The effect of these variables
        will be removed by OLS regression.
    data : {DataFrame, dict}
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : {bool, array_like}
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to
        do the right thing. First it will try to use the index of data,
        then fall back to the index of exog_i. Alternatively, you may give
        an array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation
        labels.
    ax : AxesSubplot, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    eval_env : int
        Patsy eval environment if user functions and formulas are used in
        defining endog or exog.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords,
        y_coords).

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of
        regressors.

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full multiple
    regression. The individual points can be used to assess the influence
    of points on the estimated coefficient.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the
    rate of high school graduation (hs_grad) on the murder rate (murder).

    The effects of the percent of the population living in urban areas
    (urban), below the poverty line (poverty), and in a single person
    household (single) are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty',
    ...                                           'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.
    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data, eval_env=eval_env)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data, eval_env=eval_env)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data, eval_env=eval_env)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data, eval_env=eval_env)

    # all arrays or pandas-like

    if RHS_isempty:
        endog = np.asarray(endog)
        exog_i = np.asarray(exog_i)
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i, np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, np.asarray(fitted_line.params)[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    # NOTE: if we want to get super fancy, we could annotate if a point is
    # clicked using this widget
    # http://stackoverflow.com/questions/4652439/
    # is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    # 4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)), obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels), "x-large",
                                 ax=ax, **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig


def plot_partregress_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Plot partial regression for a set of regressors.

    Parameters
    ----------
    results : Results instance
        A regression model results instance.
    exog_idx : {None, list[int], list[str]}
        The indices or column names of the exog used in the plot, default
        is all.
    grid : {None, tuple[int]}
        If grid is given, then it is used for the arrangement of the
        subplots. The format of grid is (nrows, ncols). If grid is None,
        then ncol is one, if there are only 2 subplots, and the number of
        columns is two otherwise.
    fig : Figure, optional
        If given, this figure is used to plot in. Otherwise a new figure
        is created.

    Returns
    -------
    Figure
        If `fig` is None, the created figure. Otherwise `fig` itself.

    See Also
    --------
    plot_partregress : Plot partial regression for a single regressor.
    plot_ccpr : Plot CCPR against one regressor.

    Notes
    -----
    A subplot is created for each explanatory variable given by exog_idx.
    The partial regression plot shows the relationship between the
    response and the given explanatory variable after removing the effect
    of all other explanatory variables in exog.

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/partregr.htm

    Examples
    --------
    Using the state crime dataset, separately plot the effect of each
    variable on the outcome (murder rate) while accounting for the effect
    of all other variables in the model, visualized with a grid of partial
    regression plots.

    >>> from statsmodels.graphics.regressionplots import plot_partregress_grid
    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 6))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> plot_partregress_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress_grid.py
    """
    import pandas
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    # TODO: maybe add option for using wendog, wexog instead
    y = pandas.Series(results.model.endog, name=results.model.endog_names)
    exog = results.model.exog

    k_vars = exog.shape[1]
    # this function does not make sense if k_vars=1

    nrows = (len(exog_idx) + 1) // 2
    ncols = 1 if nrows == len(exog_idx) else 2
    if grid is not None:
        nrows, ncols = grid
    # default, so title_kwargs is always defined when passed on below
    title_kwargs = {}
    if ncols > 1:
        title_kwargs = {"fontdict": {"fontsize": 'small'}}

    # for indexing purposes
    other_names = np.array(results.model.exog_names)
    for i, idx in enumerate(exog_idx):
        others = lrange(k_vars)
        others.pop(idx)
        exog_others = pandas.DataFrame(exog[:, others],
                                       columns=other_names[others])
        ax = fig.add_subplot(nrows, ncols, i + 1)
        plot_partregress(y, pandas.Series(exog[:, idx],
                                          name=other_names[idx]),
                         exog_others, ax=ax, title_kwargs=title_kwargs,
                         obs_labels=False)
        ax.set_title("")

    fig.suptitle("Partial Regression Plot", fontsize="large")
    fig.tight_layout()
    fig.subplots_adjust(top=.95)

    return fig


def plot_ccpr(results, exog_idx, ax=None):
    """
    Plot CCPR against one regressor.

    Generates a component and component-plus-residual (CCPR) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If a string is given, it should
        be the variable name that you want to use, and you can use
        arbitrary transformations as with a formula.
    ax : AxesSubplot, optional
        If given, it is used to plot in instead of a new figure being
        created.

    Returns
    -------
    Figure
        If `ax` is None, the created figure. Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot
        grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on
    the response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if
    X_i is highly correlated with any of the other independent variables.
    If this is the case, the variance evident in the plot will be an
    underestimate of the true variance.

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset, plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high
    school graduation rate ('hs_grad'), percentage of people in an urban
    area, and rate of poverty ('poverty').

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py
    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1 * results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, ax=ax)
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig


def plot_ccpr_grid(results, exog_idx=None, grid=None, fig=None):
    """
    Generate CCPR plots against a set of regressors, plot in a grid.

    Generates a grid of component and component-plus-residual (CCPR)
    plots.

    Parameters
    ----------
    results : result instance
        A results instance with exog and params.
    exog_idx : None or list of int
        The indices or column names of the exog used in the plot.
    grid : None or tuple of int (nrows, ncols)
        If grid is given, then it is used for the arrangement of the
        subplots. If grid is None, then ncol is one, if there are only 2
        subplots, and the number of columns is two otherwise.
    fig : Figure, optional
        If given, this figure is used to plot in. Otherwise a new figure
        is created.

    Returns
    -------
    Figure
        If `fig` is None, the created figure. Otherwise the figure to
        which the subplots are added.

    See Also
    --------
    plot_ccpr : Creates CCPR plot for a single regressor.

    Notes
    -----
    Partial residual plots are formed as::

        Res + Betahat(i)*Xi versus Xi

    and CCPR adds::

        Betahat(i)*Xi versus Xi

    References
    ----------
    See http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    Examples
    --------
    Using the state crime dataset, separately plot the effect of each
    variable on the outcome (murder rate) while accounting for the effect
    of all other variables in the model.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> fig = plt.figure(figsize=(8, 8))
    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr_grid(results, fig=fig)
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr_grid.py
    """
    fig = utils.create_mpl_fig(fig)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)

    if grid is not None:
        nrows, ncols = grid
    else:
        if len(exog_idx) > 2:
            nrows = int(np.ceil(len(exog_idx) / 2.))
            ncols = 2
        else:
            nrows = len(exog_idx)
            ncols = 1

    seen_constant = 0
    for i, idx in enumerate(exog_idx):
        if results.model.exog[:, idx].var() == 0:
            seen_constant = 1
            continue

        ax = fig.add_subplot(nrows, ncols, i + 1 - seen_constant)
        fig = plot_ccpr(results, exog_idx=idx, ax=ax)
        ax.set_title("")

    fig.suptitle("Component-Component Plus Residual Plot", fontsize="large")

    fig.tight_layout()

    fig.subplots_adjust(top=.95)
    return fig


def abline_plot(intercept=None, slope=None, horiz=None, vert=None,
                model_results=None, ax=None, **kwargs):
    """
    Plot a line given an intercept and slope.

    Parameters
    ----------
    intercept : float
        The intercept of the line.
    slope : float
        The slope of the line.
    horiz : float or array_like
        Data for horizontal lines on the y-axis.
    vert : array_like
        Data for vertical lines on the x-axis.
    model_results : statsmodels results instance
        Any object that has a two-value `params` attribute. Assumed that
        it is (intercept, slope).
    ax : axes, optional
        Matplotlib axes instance.
    **kwargs
        Options passed to matplotlib.pyplot.plot.

    Returns
    -------
    Figure
        The figure given by `ax.figure` or a new instance.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    >>> np.random.seed(12345)
    >>> X = sm.add_constant(np.random.normal(0, 20, size=30))
    >>> y = np.dot(X, [25, 3.5]) + np.random.normal(0, 30, size=30)
    >>> mod = sm.OLS(y, X).fit()
    >>> fig = sm.graphics.abline_plot(model_results=mod)
    >>> ax = fig.axes[0]
    >>> ax.scatter(X[:, 1], y)
    >>> ax.margins(.1)
    >>> import matplotlib.pyplot as plt
    >>> plt.show()

    .. plot:: plots/graphics_regression_abline.py
    """
    if ax is not None:  # get axis limits first thing, do not change these
        x = ax.get_xlim()
    else:
        x = None

    fig, ax = utils.create_mpl_ax(ax)

    if model_results:
        intercept, slope = model_results.params
        if x is None:
            x = [model_results.model.exog[:, 1].min(),
                 model_results.model.exog[:, 1].max()]
    else:
        if not (intercept is not None and slope is not None):
            raise ValueError("specify slope and intercept or model_results")
        if x is None:
            x = ax.get_xlim()

    data_y = [x[0] * slope + intercept, x[1] * slope + intercept]
    ax.set_xlim(x)
    #ax.set_ylim(y)

    from matplotlib.lines import Line2D

    class ABLine2D(Line2D):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.id_xlim_callback = None
            self.id_ylim_callback = None

        def remove(self):
            ax = self.axes
            if self.id_xlim_callback:
                ax.callbacks.disconnect(self.id_xlim_callback)
            if self.id_ylim_callback:
                ax.callbacks.disconnect(self.id_ylim_callback)
            super().remove()

        def update_datalim(self, ax):
            ax.set_autoscale_on(False)
            children = ax.get_children()
            ablines = [child for child in children if child is self]
            abline = ablines[0]
            x = ax.get_xlim()
            y = [x[0] * slope + intercept, x[1] * slope + intercept]
            abline.set_data(x, y)
            ax.figure.canvas.draw()

    # TODO: how to intercept something like a margins call and adjust?
    line = ABLine2D(x, data_y, **kwargs)
    ax.add_line(line)
    line.id_xlim_callback = ax.callbacks.connect('xlim_changed',
                                                 line.update_datalim)
    line.id_ylim_callback = ax.callbacks.connect('ylim_changed',
                                                 line.update_datalim)

    if horiz is not None:
        # Axes has no `hline` method; draw each requested horizontal line.
        for y_val in np.atleast_1d(horiz):
            ax.axhline(y_val)
    if vert is not None:
        # Axes has no `vline` method; draw each requested vertical line.
        for x_val in np.atleast_1d(vert):
            ax.axvline(x_val)
    return fig


@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results: object\n"
                        "    Results for a fitted regression model.\n"
                        "influence: instance\n"
                        "    The instance of Influence for model."}))
def _influence_plot(results, influence, external=True, alpha=.05,
                    criterion="cooks", size=48, plot_alpha=.75, ax=None,
                    leverage=None, resid=None,
                    **kwargs):
    # leverage and resid kwds are used only internally for MLEInfluence
    infl = influence
    fig, ax = utils.create_mpl_ax(ax)

    if criterion.lower().startswith('coo'):
        psize = infl.cooks_distance[0]
    elif criterion.lower().startswith('dff'):
        psize = np.abs(infl.dffits[0])
    else:
        raise ValueError("Criterion %s not understood" % criterion)

    # scale the variables
    #TODO: what is the correct scaling and the assumption here?
    #we want plots to be comparable across different plots
    #so we would need to use the expected distribution of criterion probably
    old_range = np.ptp(psize)
    new_range = size**2 - 8**2

    psize = (psize - psize.min()) * new_range / old_range + 8**2

    if leverage is None:
        leverage = infl.hat_matrix_diag
    if resid is None:
        ylabel = "Studentized Residuals"
        if external:
            resid = infl.resid_studentized_external
        else:
            resid = infl.resid_studentized
    else:
        resid = np.asarray(resid)
        ylabel = "Residuals"

    from scipy import stats

    cutoff = stats.t.ppf(1. - alpha / 2, results.df_resid)
    large_resid = np.abs(resid) > cutoff
    large_leverage = leverage > _high_leverage(results)
    large_points = np.logical_or(large_resid, large_leverage)

    ax.scatter(leverage, resid, s=psize, alpha=plot_alpha)

    # add point labels
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(len(resid))
    ax = utils.annotate_axes(np.where(large_points)[0], labels,
                             lzip(leverage, resid),
                             lzip(-(psize/2)**.5, (psize/2)**.5), "x-large",
                             ax)

    # TODO: make configurable or let people do it ex-post?
    font = {"fontsize": 16, "color": "black"}
    ax.set_ylabel(ylabel, **font)
    ax.set_xlabel("Leverage", **font)
    ax.set_title("Influence Plot", **font)
    return fig


@Appender(_plot_influence_doc.format(**{
    'extra_params_doc': "results : Results\n"
                        "    Results for a fitted regression model."}))
def influence_plot(results, external=True, alpha=.05, criterion="cooks",
                   size=48, plot_alpha=.75, ax=None, **kwargs):

    infl = results.get_influence()
    res = _influence_plot(results, infl, external=external, alpha=alpha,
                          criterion=criterion, size=size,
                          plot_alpha=plot_alpha, ax=ax, **kwargs)
    return res


@Appender(_plot_leverage_resid2_doc.format(**{
    'extra_params_doc': "results: object\n"
                        "    Results for a fitted regression model\n"
                        "influence: instance\n"
                        "    instance of Influence for model"}))
def _plot_leverage_resid2(results, influence, alpha=.05, ax=None,
                          **kwargs):

    from scipy.stats import norm, zscore
    fig, ax = utils.create_mpl_ax(ax)

    infl = influence
    leverage = infl.hat_matrix_diag
    resid = zscore(infl.resid)
    ax.plot(resid**2, leverage, 'o', **kwargs)
    ax.set_xlabel("Normalized residuals**2")
    ax.set_ylabel("Leverage")
    ax.set_title("Leverage vs. Normalized residuals squared")

    large_leverage = leverage > _high_leverage(results)
    #norm or t here if standardized?
    cutoff = norm.ppf(1. - alpha / 2)
    large_resid = np.abs(resid) > cutoff
    labels = results.model.data.row_labels
    if labels is None:
        labels = lrange(int(results.nobs))
    index = np.where(np.logical_or(large_leverage, large_resid))[0]
    ax = utils.annotate_axes(index, labels, lzip(resid**2, leverage),
                             [(0, 5)] * int(results.nobs), "large",
                             ax=ax, ha="center", va="bottom")
    ax.margins(.075, .075)
    return fig


@Appender(_plot_leverage_resid2_doc.format(**{
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"}))
def plot_leverage_resid2(results, alpha=.05, ax=None, **kwargs):

    infl = results.get_influence()
    return _plot_leverage_resid2(results, infl, alpha=alpha, ax=ax, **kwargs)


@Appender(_plot_added_variable_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_added_variable(results, focus_exog, resid_type=None,
                        use_glm_weights=True, fit_kwargs=None, ax=None):

    model = results.model

    fig, ax = utils.create_mpl_ax(ax)

    endog_resid, focus_exog_resid = \
        added_variable_resids(results, focus_exog,
                              resid_type=resid_type,
                              use_glm_weights=use_glm_weights,
                              fit_kwargs=fit_kwargs)

    ax.plot(focus_exog_resid, endog_resid, 'o', alpha=0.6)

    ax.set_title('Added variable plot', fontsize='large')

    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel(model.endog_names + " residuals", size=15)

    return fig


@Appender(_plot_partial_residuals_doc % {
    'extra_params_doc': "results : object\n"
                        "    Results for a fitted regression model"})
def plot_partial_residuals(results, focus_exog, ax=None):
    # Docstring attached below

    model = results.model

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    pr = partial_resids(results, focus_exog)
    focus_exog_vals = results.model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, pr, 'o', alpha=0.6)

    ax.set_title('Partial residuals plot', fontsize='large')

    if isinstance(focus_exog, str):
        xname = focus_exog
    else:
        xname = model.exog_names[focus_exog]
    ax.set_xlabel(xname, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig


@Appender(_plot_ceres_residuals_doc % {
    'extra_params_doc': "results : Results\n"
                        "    Results instance of a fitted regression "
                        "model."})
def plot_ceres_residuals(results, focus_exog, frac=0.66, cond_means=None,
                         ax=None):

    model = results.model

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    presid = ceres_resids(results, focus_exog, frac=frac,
                          cond_means=cond_means)

    focus_exog_vals = model.exog[:, focus_col]

    fig, ax = utils.create_mpl_ax(ax)
    ax.plot(focus_exog_vals, presid, 'o', alpha=0.6)

    ax.set_title('CERES residuals plot', fontsize='large')

    ax.set_xlabel(focus_exog, size=15)
    ax.set_ylabel("Component plus residual", size=15)

    return fig


def ceres_resids(results, focus_exog, frac=0.66, cond_means=None):
    """
    Calculate the CERES residuals (Conditional Expectation Partial
    Residuals) for a fitted model.

    Parameters
    ----------
    results : model results instance
        The fitted model for which the CERES residuals are calculated.
    focus_exog : {int, str}
        The column of results.model.exog used as the 'focus variable'.
    frac : float, optional
        Lowess smoothing parameter for estimating the conditional
        means. Not used if `cond_means` is provided.
    cond_means : array_like, optional
        If provided, the columns of this array are the conditional
        means E[exog | focus exog], where exog ranges over some
        or all of the columns of exog other than focus exog. If
        this is an empty nx0 array, the conditional means are
        treated as being zero. If None, the conditional means are
        estimated.

    Returns
    -------
    An array containing the CERES residuals.

    Notes
    -----
    If `cond_means` is not provided, it is obtained by smoothing each
    column of exog (except the focus column) against the focus column.

    Currently only supports GLM, GEE, and OLS models.
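
    Examples
    --------
    A minimal sketch; the state crime data set and the choice of
    'poverty' as the focus variable are illustrative.

    >>> import statsmodels.api as sm
    >>> from statsmodels.graphics.regressionplots import ceres_resids
    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> exog = sm.add_constant(data[['poverty', 'hs_grad', 'urban']])
    >>> results = sm.OLS(data['murder'], exog).fit()
    >>> presid = ceres_resids(results, 'poverty')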
    """

    model = results.model

    if not isinstance(model, (GLM, GEE, OLS)):
        raise ValueError("ceres residuals not available for %s" %
                         model.__class__.__name__)

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    # Indices of non-focus columns
    ix_nf = range(len(results.params))
    ix_nf = list(ix_nf)
    ix_nf.pop(focus_col)
    nnf = len(ix_nf)

    # Estimate the conditional means if not provided.
    if cond_means is None:

        # Below we calculate E[x | focus] where x is each column other
        # than the focus column. We do not want the intercept when we do
        # this so we remove it here.
        pexog = model.exog[:, ix_nf]
        pexog -= pexog.mean(0)
        u, s, vt = np.linalg.svd(pexog, 0)
        ii = np.flatnonzero(s > 1e-6)
        pexog = u[:, ii]

        fcol = model.exog[:, focus_col]
        cond_means = np.empty((len(fcol), pexog.shape[1]))
        for j in range(pexog.shape[1]):

            # Get the fitted values for column i given the other
            # columns (skip the intercept).
            y0 = pexog[:, j]

            cf = lowess(y0, fcol, frac=frac, return_sorted=False)

            cond_means[:, j] = cf

    new_exog = np.concatenate((model.exog[:, ix_nf], cond_means), axis=1)

    # Refit the model using the adjusted exog values
    klass = model.__class__
    init_kwargs = model._get_init_kwds()
    new_model = klass(model.endog, new_exog, **init_kwargs)
    new_result = new_model.fit()

    # The partial residual, with respect to l(x2) (notation of Cook 1998)
    presid = model.endog - new_result.fittedvalues
    if isinstance(model, (GLM, GEE)):
        presid *= model.family.link.deriv(new_result.fittedvalues)
    if new_exog.shape[1] > nnf:
        presid += np.dot(new_exog[:, nnf:], new_result.params[nnf:])

    return presid


def partial_resids(results, focus_exog):
    """
    Returns partial residuals for a fitted model with respect to a
    'focus predictor'.

    Parameters
    ----------
    results : results instance
        A fitted regression model.
    focus_exog : {int, str}
        The column index of model.exog, or a variable name, with respect
        to which the partial residuals are calculated.

    Returns
    -------
    An array of partial residuals.

    References
    ----------
    RD Cook and R Croos-Dabrera (1998). Partial residual plots in
    generalized linear models. Journal of the American Statistical
    Association, 93:442.
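
    Examples
    --------
    A minimal sketch; the data set and focus variable are the same
    illustrative choices used elsewhere in this module.

    >>> import statsmodels.api as sm
    >>> from statsmodels.graphics.regressionplots import partial_resids
    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> exog = sm.add_constant(data[['poverty', 'hs_grad', 'urban']])
    >>> results = sm.OLS(data['murder'], exog).fit()
    >>> pr = partial_resids(results, 'poverty')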
    """

    # TODO: could be a method of results
    # TODO: see Cook et al (1998) for a more general definition

    # The calculation follows equation (8) from Cook's paper.
    model = results.model
    resid = model.endog - results.predict()

    if isinstance(model, (GLM, GEE)):
        resid *= model.family.link.deriv(results.fittedvalues)
    elif isinstance(model, (OLS, GLS, WLS)):
        pass  # No need to do anything
    else:
        raise ValueError("Partial residuals for '%s' not implemented."
                         % type(model))

    if type(focus_exog) is str:
        focus_col = model.exog_names.index(focus_exog)
    else:
        focus_col = focus_exog

    focus_val = results.params[focus_col] * model.exog[:, focus_col]

    return focus_val + resid


def added_variable_resids(results, focus_exog, resid_type=None,
                          use_glm_weights=True, fit_kwargs=None):
    """
    Residualize the endog variable and a 'focus' exog variable in a
    regression model with respect to the other exog variables.

    Parameters
    ----------
    results : regression results instance
        A fitted model including the focus exog and all other
        predictors of interest.
    focus_exog : {int, str}
        The column of results.model.exog or a variable name that is
        to be residualized against the other predictors.
    resid_type : str
        The type of residuals to use for the dependent variable. If
        None, uses `resid_deviance` for GLM/GEE and `resid` otherwise.
    use_glm_weights : bool
        Only used if the model is a GLM or GEE. If True, the
        residuals for the focus predictor are computed using WLS, with
        the weights obtained from the IRLS calculations for fitting
        the GLM. If False, unweighted regression is used.
    fit_kwargs : dict, optional
        Keyword arguments to be passed to fit when refitting the
        model.

    Returns
    -------
    endog_resid : array_like
        The residuals for the original endog variable.
    focus_exog_resid : array_like
        The residuals for the focus predictor.

    Notes
    -----
    The 'focus variable' residuals are always obtained using linear
    regression.

    Currently only GLM, GEE, and OLS models are supported.
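
    Examples
    --------
    A minimal sketch using plain ndarrays; column 1 of `exog` (the
    poverty rate) is an arbitrary choice of focus variable.

    >>> import statsmodels.api as sm
    >>> from statsmodels.graphics.regressionplots import added_variable_resids
    >>> data = sm.datasets.statecrime.load_pandas().data
    >>> exog = sm.add_constant(data[['poverty', 'hs_grad', 'urban']].values)
    >>> results = sm.OLS(data['murder'].values, exog).fit()
    >>> endog_resid, focus_resid = added_variable_resids(results, 1)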
    """

    model = results.model
    if not isinstance(model, (GEE, GLM, OLS)):
        raise ValueError("model type %s not supported for added variable "
                         "residuals" % model.__class__.__name__)

    exog = model.exog
    endog = model.endog

    focus_exog, focus_col = utils.maybe_name_or_idx(focus_exog, model)

    focus_exog_vals = exog[:, focus_col]

    # Default residuals
    if resid_type is None:
        if isinstance(model, (GEE, GLM)):
            resid_type = "resid_deviance"
        else:
            resid_type = "resid"

    ii = range(exog.shape[1])
    ii = list(ii)
    ii.pop(focus_col)
    reduced_exog = exog[:, ii]
    start_params = results.params[ii]

    klass = model.__class__

    kwargs = model._get_init_kwds()
    new_model = klass(endog, reduced_exog, **kwargs)
    args = {"start_params": start_params}
    if fit_kwargs is not None:
        args.update(fit_kwargs)
    new_result = new_model.fit(**args)
    if not getattr(new_result, "converged", True):
        raise ValueError("fit did not converge when calculating added "
                         "variable residuals")

    try:
        endog_resid = getattr(new_result, resid_type)
    except AttributeError:
        raise ValueError("'%s' residual type not available" % resid_type)

    import statsmodels.regression.linear_model as lm

    if isinstance(model, (GLM, GEE)) and use_glm_weights:
        weights = model.family.weights(results.fittedvalues)
        if hasattr(model, "data_weights"):
            weights = weights * model.data_weights
        lm_results = lm.WLS(focus_exog_vals, reduced_exog, weights).fit()
    else:
        lm_results = lm.OLS(focus_exog_vals, reduced_exog).fit()
    focus_exog_resid = lm_results.resid

    return endog_resid, focus_exog_resid