# (extraction metadata, deduplicated: 3224 lines, 113 KiB, Python)
"""
|
||
Procedures for fitting marginal regression models to dependent data
|
||
using Generalized Estimating Equations.
|
||
|
||
References
|
||
----------
|
||
KY Liang and S Zeger. "Longitudinal data analysis using
|
||
generalized linear models". Biometrika (1986) 73 (1): 13-22.
|
||
|
||
S Zeger and KY Liang. "Longitudinal Data Analysis for Discrete and
|
||
Continuous Outcomes". Biometrics Vol. 42, No. 1 (Mar., 1986),
|
||
pp. 121-130
|
||
|
||
A Rotnitzky and NP Jewell (1990). "Hypothesis testing of regression
|
||
parameters in semiparametric generalized linear models for cluster
|
||
correlated data", Biometrika, 77, 485-497.
|
||
|
||
Xu Guo and Wei Pan (2002). "Small sample performance of the score
|
||
test in GEE".
|
||
http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
|
||
|
||
LA Mancl LA, TA DeRouen (2001). A covariance estimator for GEE with
|
||
improved small-sample properties. Biometrics. 2001 Mar;57(1):126-34.
|
||
"""
|
||
# Compatibility shims.
from statsmodels.compat.python import lzip
from statsmodels.compat.pandas import Appender

# Third-party numerics / data handling.
import numpy as np
from scipy import stats
import pandas as pd
import patsy
from collections import defaultdict
from statsmodels.tools.decorators import cache_readonly
import statsmodels.base.model as base
# used for wrapper:
import statsmodels.regression.linear_model as lm
import statsmodels.base.wrapper as wrap

# GLM machinery that GEE builds on.
from statsmodels.genmod import families
from statsmodels.genmod.generalized_linear_model import GLM, GLMResults
from statsmodels.genmod import cov_struct as cov_structs

import statsmodels.genmod.families.varfuncs as varfuncs
from statsmodels.genmod.families.links import Link

from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
                                             DomainWarning,
                                             IterationLimitWarning,
                                             ValueWarning)
import warnings

# Shared docstrings for the regression diagnostic plot methods.
from statsmodels.graphics._regressionplots_doc import (
    _plot_added_variable_doc,
    _plot_partial_residuals_doc,
    _plot_ceres_residuals_doc)
# Helpers for marginal-effects computations.
from statsmodels.discrete.discrete_margins import (
    _get_margeff_exog, _check_margeff_args, _effects_at, margeff_cov_with_se,
    _check_at_is_all, _transform_names, _check_discrete_args,
    _get_dummy_index, _get_count_index)
|
||
class ParameterConstraint:
    """
    A class for managing linear equality constraints for a parameter
    vector.

    Given a constraint ``lhs @ param = rhs``, this class decomposes the
    parameter space into the constrained directions (row space of `lhs`)
    and the free directions (its orthogonal complement), allowing the
    model to be fit in the reduced, unconstrained coordinates.
    """

    def __init__(self, lhs, rhs, exog):
        """
        Parameters
        ----------
        lhs : ndarray
            A q x p matrix which is the left hand side of the
            constraint lhs * param = rhs.  The number of constraints is
            q >= 1 and p is the dimension of the parameter vector.
        rhs : ndarray
            A 1-dimensional vector of length q which is the right hand
            side of the constraint equation.
        exog : ndarray
            The n x p exogenous data for the full model.

        Raises
        ------
        ValueError
            If `rhs` is not a vector, or its length does not match the
            number of rows of `lhs`.
        """

        # In case a row or column vector is passed (patsy linear
        # constraints passes a column vector).
        rhs = np.atleast_1d(rhs.squeeze())

        if rhs.ndim > 1:
            raise ValueError("The right hand side of the constraint "
                             "must be a vector.")

        if len(rhs) != lhs.shape[0]:
            raise ValueError("The number of rows of the left hand "
                             "side constraint matrix L must equal "
                             "the length of the right hand side "
                             "constraint vector R.")

        self.lhs = lhs
        self.rhs = rhs

        # The columns of lhs0 are an orthogonal basis for the
        # orthogonal complement to row(lhs), the columns of lhs1 are
        # an orthogonal basis for row(lhs).  The columns of lhsf =
        # [lhs0, lhs1] are mutually orthogonal.
        # (was `full_matrices=1`; use the boolean the API documents)
        lhs_u, lhs_s, lhs_vt = np.linalg.svd(lhs.T, full_matrices=True)
        self.lhs0 = lhs_u[:, len(lhs_s):]
        self.lhs1 = lhs_u[:, 0:len(lhs_s)]
        self.lhsf = np.hstack((self.lhs0, self.lhs1))

        # param0 is one solution to the underdetermined system
        # L * param = R.
        self.param0 = np.dot(self.lhs1, np.dot(lhs_vt, self.rhs) /
                             lhs_s)

        # The fixed part of the linear predictor implied by param0.
        self._offset_increment = np.dot(exog, self.param0)

        self.orig_exog = exog
        # exog expressed in the rotated [free | constrained] basis.
        self.exog_fulltrans = np.dot(exog, self.lhsf)

    def offset_increment(self):
        """
        Returns a vector that should be added to the offset vector to
        accommodate the constraint.

        Returns
        -------
        ndarray
            ``exog @ param0``, the contribution of the particular
            solution to the linear predictor.
        """

        return self._offset_increment

    def reduced_exog(self):
        """
        Returns a linearly transformed exog matrix whose columns span
        the constrained model space.

        Returns
        -------
        ndarray
            The first ``p - q`` columns of the rotated exog matrix,
            corresponding to the free parameter directions.
        """
        return self.exog_fulltrans[:, 0:self.lhs0.shape[1]]

    def restore_exog(self):
        """
        Returns the full exog matrix before it was reduced to
        satisfy the constraint.
        """
        return self.orig_exog

    def unpack_param(self, params):
        """
        Converts the parameter vector `params` from reduced to full
        coordinates.
        """

        return self.param0 + np.dot(self.lhs0, params)

    def unpack_cov(self, bcov):
        """
        Converts the covariance matrix `bcov` from reduced to full
        coordinates.
        """

        return np.dot(self.lhs0, np.dot(bcov, self.lhs0.T))
|
||
|
||
|
||
# Shared docstring templates.  These are interpolated (via %-formatting
# or string concatenation) into the __doc__ of GEE and its subclasses,
# so their contents are runtime data and must not be edited casually.
_gee_init_doc = """
    Marginal regression model fit using Generalized Estimating Equations.

    GEE can be used to fit Generalized Linear Models (GLMs) when the
    data have a grouped structure, and the observations are possibly
    correlated within groups but not between groups.

    Parameters
    ----------
    endog : array_like
        1d array of endogenous values (i.e. responses, outcomes,
        dependent variables, or 'Y' values).
    exog : array_like
        2d array of exogeneous values (i.e. covariates, predictors,
        independent variables, regressors, or 'X' values). A `nobs x
        k` array where `nobs` is the number of observations and `k` is
        the number of regressors. An intercept is not included by
        default and should be added by the user. See
        `statsmodels.tools.add_constant`.
    groups : array_like
        A 1d array of length `nobs` containing the group labels.
    time : array_like
        A 2d array of time (or other index) values, used by some
        dependence structures to define similarity relationships among
        observations within a cluster.
    family : family class instance
%(family_doc)s
    cov_struct : CovStruct class instance
        The default is Independence. To specify an exchangeable
        structure use cov_struct = Exchangeable(). See
        statsmodels.genmod.cov_struct.CovStruct for more
        information.
    offset : array_like
        An offset to be included in the fit. If provided, must be
        an array whose length is the number of rows in exog.
    dep_data : array_like
        Additional data passed to the dependence structure.
    constraint : (ndarray, ndarray)
        If provided, the constraint is a tuple (L, R) such that the
        model parameters are estimated under the constraint L *
        param = R, where L is a q x p matrix and R is a
        q-dimensional vector. If constraint is provided, a score
        test is performed to compare the constrained model to the
        unconstrained model.
    update_dep : bool
        If true, the dependence parameters are optimized, otherwise
        they are held fixed at their starting values.
    weights : array_like
        An array of case weights to use in the analysis.
    %(extra_params)s

    See Also
    --------
    statsmodels.genmod.families.family
    :ref:`families`
    :ref:`links`

    Notes
    -----
    Only the following combinations make sense for family and link ::

                   + ident log logit probit cloglog pow opow nbinom loglog logc
      Gaussian     |   x    x                        x
      inv Gaussian |   x    x                        x
      binomial     |   x    x    x     x       x     x   x           x      x
      Poisson      |   x    x                        x
      neg binomial |   x    x                        x          x
      gamma        |   x    x                        x

    Not all of these link functions are currently available.

    Endog and exog are references so that if the data they refer
    to are already arrays and these arrays are changed, endog and
    exog will change.

    The "robust" covariance type is the standard "sandwich estimator"
    (e.g. Liang and Zeger (1986)). It is the default here and in most
    other packages. The "naive" estimator gives smaller standard
    errors, but is only correct if the working correlation structure
    is correctly specified. The "bias reduced" estimator of Mancl and
    DeRouen (Biometrics, 2001) reduces the downward bias of the robust
    estimator.

    The robust covariance provided here follows Liang and Zeger (1986)
    and agrees with R's gee implementation. To obtain the robust
    standard errors reported in Stata, multiply by sqrt(N / (N - g)),
    where N is the total sample size, and g is the average group size.
    %(notes)s
    Examples
    --------
    %(example)s
"""

# Appended to the notes of the nominal/ordinal subclasses.
_gee_nointercept = """
    The nominal and ordinal GEE models should not have an intercept
    (either implicit or explicit). Use "0 + " in a formula to
    suppress the intercept.
"""

# `family` parameter description for the base GEE model.
_gee_family_doc = """\
        The default is Gaussian. To specify the binomial
        distribution use `family=sm.families.Binomial()`. Each family
        can take a link instance as an argument. See
        statsmodels.genmod.families.family for more information."""

# `family` parameter description for OrdinalGEE.
_gee_ordinal_family_doc = """\
        The only family supported is `Binomial`. The default `Logit`
        link may be replaced with `probit` if desired."""

# `family` parameter description for NominalGEE.
_gee_nominal_family_doc = """\
        The default value `None` uses a multinomial logit family
        specifically designed for use with GEE. Setting this
        argument to a non-default value is not currently supported."""

# Docstring for GEE.fit and subclass fit methods.
_gee_fit_doc = """
    Fits a marginal regression model using generalized estimating
    equations (GEE).

    Parameters
    ----------
    maxiter : int
        The maximum number of iterations
    ctol : float
        The convergence criterion for stopping the Gauss-Seidel
        iterations
    start_params : array_like
        A vector of starting values for the regression
        coefficients. If None, a default is chosen.
    params_niter : int
        The number of Gauss-Seidel updates of the mean structure
        parameters that take place prior to each update of the
        dependence structure.
    first_dep_update : int
        No dependence structure updates occur before this
        iteration number.
    cov_type : str
        One of "robust", "naive", or "bias_reduced".
    ddof_scale : scalar or None
        The scale parameter is estimated as the sum of squared
        Pearson residuals divided by `N - ddof_scale`, where N
        is the total sample size. If `ddof_scale` is None, the
        number of covariates (including an intercept if present)
        is used.
    scaling_factor : scalar
        The estimated covariance of the parameter estimates is
        scaled by this value. Default is 1, Stata uses N / (N - g),
        where N is the total sample size and g is the average group
        size.
    scale : str or float, optional
        `scale` can be None, 'X2', or a float
        If a float, its value is used as the scale parameter.
        The default value is None, which uses `X2` (Pearson's
        chi-square) for Gamma, Gaussian, and Inverse Gaussian.
        The default is 1 for the Binomial and Poisson families.

    Returns
    -------
    An instance of the GEEResults class or subclass

    Notes
    -----
    If convergence difficulties occur, increase the values of
    `first_dep_update` and/or `params_niter`. Setting
    `first_dep_update` to a greater value (e.g. ~10-20) causes the
    algorithm to move close to the GLM solution before attempting
    to identify the dependence structure.

    For the Gaussian family, there is no benefit to setting
    `params_niter` to a value greater than 1, since the mean
    structure parameters converge in one step.
    """

# Attributes section shared by the GEEResults classes.
_gee_results_doc = """
    Attributes
    ----------

    cov_params_default : ndarray
        default covariance of the parameter estimates. Is chosen among one
        of the following three based on `cov_type`
    cov_robust : ndarray
        covariance of the parameter estimates that is robust
    cov_naive : ndarray
        covariance of the parameter estimates that is not robust to
        correlation or variance misspecification
    cov_robust_bc : ndarray
        covariance of the parameter estimates that is robust and bias
        reduced
    converged : bool
        indicator for convergence of the optimization.
        True if the norm of the score is smaller than a threshold
    cov_type : str
        string indicating whether a "robust", "naive" or "bias_reduced"
        covariance is used as default
    fit_history : dict
        Contains information about the iterations.
    fittedvalues : ndarray
        Linear predicted values for the fitted model.
        dot(exog, params)
    model : class instance
        Pointer to GEE model instance that called `fit`.
    normalized_cov_params : ndarray
        See GEE docstring
    params : ndarray
        The coefficients of the fitted model. Note that
        interpretation of the coefficients often depends on the
        distribution family and the data.
    scale : float
        The estimate of the scale / dispersion for the model fit.
        See GEE.fit for more information.
    score_norm : float
        norm of the score at the end of the iterative estimation.
    bse : ndarray
        The standard errors of the fitted GEE parameters.
    """

# Examples section for the base GEE model.
_gee_example = """
    Logistic regression with autoregressive working dependence:

    >>> import statsmodels.api as sm
    >>> family = sm.families.Binomial()
    >>> va = sm.cov_struct.Autoregressive()
    >>> model = sm.GEE(endog, exog, group, family=family, cov_struct=va)
    >>> result = model.fit()
    >>> print(result.summary())

    Use formulas to fit a Poisson GLM with independent working
    dependence:

    >>> import statsmodels.api as sm
    >>> fam = sm.families.Poisson()
    >>> ind = sm.cov_struct.Independence()
    >>> model = sm.GEE.from_formula("y ~ age + trt + base", "subject",
                                    data, cov_struct=ind, family=fam)
    >>> result = model.fit()
    >>> print(result.summary())

    Equivalent, using the formula API:

    >>> import statsmodels.api as sm
    >>> import statsmodels.formula.api as smf
    >>> fam = sm.families.Poisson()
    >>> ind = sm.cov_struct.Independence()
    >>> model = smf.gee("y ~ age + trt + base", "subject",
                        data, cov_struct=ind, family=fam)
    >>> result = model.fit()
    >>> print(result.summary())
    """

# Examples section for OrdinalGEE.
_gee_ordinal_example = """
    Fit an ordinal regression model using GEE, with "global
    odds ratio" dependence:

    >>> import statsmodels.api as sm
    >>> gor = sm.cov_struct.GlobalOddsRatio("ordinal")
    >>> model = sm.OrdinalGEE(endog, exog, groups, cov_struct=gor)
    >>> result = model.fit()
    >>> print(result.summary())

    Using formulas:

    >>> import statsmodels.formula.api as smf
    >>> model = smf.ordinal_gee("y ~ 0 + x1 + x2", groups, data,
                                cov_struct=gor)
    >>> result = model.fit()
    >>> print(result.summary())
    """

# Examples section for NominalGEE.
_gee_nominal_example = """
    Fit a nominal regression model using GEE:

    >>> import statsmodels.api as sm
    >>> import statsmodels.formula.api as smf
    >>> gor = sm.cov_struct.GlobalOddsRatio("nominal")
    >>> model = sm.NominalGEE(endog, exog, groups, cov_struct=gor)
    >>> result = model.fit()
    >>> print(result.summary())

    Using formulas:

    >>> import statsmodels.api as sm
    >>> model = sm.NominalGEE.from_formula("y ~ 0 + x1 + x2", groups,
                                           data, cov_struct=gor)
    >>> result = model.fit()
    >>> print(result.summary())

    Using the formula API:

    >>> import statsmodels.formula.api as smf
    >>> model = smf.nominal_gee("y ~ 0 + x1 + x2", groups, data,
                                cov_struct=gor)
    >>> result = model.fit()
    >>> print(result.summary())
    """
|
||
|
||
|
||
def _check_args(endog, exog, groups, time, offset, exposure):
|
||
|
||
if endog.size != exog.shape[0]:
|
||
raise ValueError("Leading dimension of 'exog' should match "
|
||
"length of 'endog'")
|
||
|
||
if groups.size != endog.size:
|
||
raise ValueError("'groups' and 'endog' should have the same size")
|
||
|
||
if time is not None and (time.size != endog.size):
|
||
raise ValueError("'time' and 'endog' should have the same size")
|
||
|
||
if offset is not None and (offset.size != endog.size):
|
||
raise ValueError("'offset and 'endog' should have the same size")
|
||
|
||
if exposure is not None and (exposure.size != endog.size):
|
||
raise ValueError("'exposure' and 'endog' should have the same size")
|
||
|
||
|
||
class GEE(GLM):

    # Assemble the class docstring from the shared templates defined
    # above; the family/example/notes slots are GEE-specific.
    __doc__ = (
        "    Marginal Regression Model using Generalized Estimating "
        "Equations.\n" + _gee_init_doc %
        {'extra_params': base._missing_param_doc,
         'family_doc': _gee_family_doc,
         'example': _gee_example,
         'notes': ""})

    # Per-group cached (mean, linear predictor) values; populated during
    # fitting via update_cached_means and read by e.g. estimate_scale.
    cached_means = None
|
||
|
||
    def __init__(self, endog, exog, groups, time=None, family=None,
                 cov_struct=None, missing='none', offset=None,
                 exposure=None, dep_data=None, constraint=None,
                 update_dep=True, weights=None, **kwargs):
        """
        Construct a GEE model.

        See the class docstring for parameter descriptions.  Beyond
        storing the arguments, this initializer: validates the family
        and covariance structure, applies any linear constraint by
        reducing `exog` and augmenting the offset, and precomputes the
        per-group ("cluster") views of endog/exog/time/offset used by
        the fitting routines.
        """

        if type(self) is GEE:
            self._check_kwargs(kwargs)
        if family is not None:
            # Warn (rather than fail) when the link is outside the
            # family's safe domain.
            if not isinstance(family.link, tuple(family.safe_links)):
                msg = ("The {0} link function does not respect the "
                       "domain of the {1} family.")
                warnings.warn(msg.format(family.link.__class__.__name__,
                                         family.__class__.__name__),
                              DomainWarning)

        groups = np.asarray(groups)  # in case groups is pandas

        if "missing_idx" in kwargs and kwargs["missing_idx"] is not None:
            # If here, we are entering from super.from_formula; missing
            # has already been dropped from endog and exog, but not from
            # the other variables.
            ii = ~kwargs["missing_idx"]
            groups = groups[ii]
            if time is not None:
                time = time[ii]
            if offset is not None:
                offset = offset[ii]
            if exposure is not None:
                exposure = exposure[ii]
            del kwargs["missing_idx"]

        self.missing = missing
        self.dep_data = dep_data
        self.constraint = constraint
        self.update_dep = update_dep

        # Per-iteration diagnostics appended during fit.
        self._fit_history = defaultdict(list)

        # Pass groups, time, offset, and dep_data so they are
        # processed for missing data along with endog and exog.
        # Calling super creates self.exog, self.endog, etc. as
        # ndarrays and the original exog, endog, etc. are
        # self.data.endog, etc.
        super().__init__(endog, exog, groups=groups,
                         time=time, offset=offset,
                         exposure=exposure, weights=weights,
                         dep_data=dep_data, missing=missing,
                         family=family, **kwargs)

        # Size-consistency checks on the processed arrays.
        _check_args(
            self.endog,
            self.exog,
            self.groups,
            self.time,
            getattr(self, "offset", None),
            getattr(self, "exposure", None),
        )

        self._init_keys.extend(["update_dep", "constraint", "family",
                                "cov_struct"])
        # remove keys added by super that are not supported
        # NOTE(review): both removes share one try, so "var_weights" is
        # only removed when "freq_weights" was present — confirm intended.
        try:
            self._init_keys.remove("freq_weights")
            self._init_keys.remove("var_weights")
        except ValueError:
            pass

        # Handle the family argument
        if family is None:
            family = families.Gaussian()
        else:
            if not issubclass(family.__class__, families.Family):
                raise ValueError("GEE: `family` must be a genmod "
                                 "family instance")
        self.family = family

        # Handle the cov_struct argument
        if cov_struct is None:
            cov_struct = cov_structs.Independence()
        else:
            if not issubclass(cov_struct.__class__, cov_structs.CovStruct):
                raise ValueError("GEE: `cov_struct` must be a genmod "
                                 "cov_struct instance")

        self.cov_struct = cov_struct

        # Handle the constraint: reduce exog to the free directions and
        # absorb the particular solution into the offset.
        self.constraint = None
        if constraint is not None:
            if len(constraint) != 2:
                raise ValueError("GEE: `constraint` must be a 2-tuple.")
            if constraint[0].shape[1] != self.exog.shape[1]:
                raise ValueError(
                    "GEE: the left hand side of the constraint must have "
                    "the same number of columns as the exog matrix.")
            self.constraint = ParameterConstraint(constraint[0],
                                                  constraint[1],
                                                  self.exog)

            if self._offset_exposure is not None:
                self._offset_exposure += self.constraint.offset_increment()
            else:
                self._offset_exposure = (
                    self.constraint.offset_increment().copy())
            self.exog = self.constraint.reduced_exog()

        # Create list of row indices for each group
        group_labels, ix = np.unique(self.groups, return_inverse=True)
        se = pd.Series(index=np.arange(len(ix)), dtype="int")
        gb = se.groupby(ix).groups
        dk = [(lb, np.asarray(gb[k])) for k, lb in enumerate(group_labels)]
        self.group_indices = dict(dk)
        self.group_labels = group_labels

        # Convert the data to the internal representation, which is a
        # list of arrays, corresponding to the groups.
        self.endog_li = self.cluster_list(self.endog)
        self.exog_li = self.cluster_list(self.exog)

        if self.weights is not None:
            self.weights_li = self.cluster_list(self.weights)

        self.num_group = len(self.endog_li)

        # Time defaults to a 1d grid with equal spacing
        if self.time is not None:
            if self.time.ndim == 1:
                self.time = self.time[:, None]
            self.time_li = self.cluster_list(self.time)
        else:
            self.time_li = \
                [np.arange(len(y), dtype=np.float64)[:, None]
                 for y in self.endog_li]
            self.time = np.concatenate(self.time_li)

        # offset_li is None when there is no effective offset (absent,
        # or a scalar zero).
        if (self._offset_exposure is None or
            (np.isscalar(self._offset_exposure) and
             self._offset_exposure == 0.)):
            self.offset_li = None
        else:
            self.offset_li = self.cluster_list(self._offset_exposure)
        if constraint is not None:
            self.constraint.exog_fulltrans_li = \
                self.cluster_list(self.constraint.exog_fulltrans)

        self.family = family

        # Let the covariance structure see the assembled model.
        self.cov_struct.initialize(self)

        # Total sample size
        group_ns = [len(y) for y in self.endog_li]
        self.nobs = sum(group_ns)
        # The following are column based, not on rank see #1928
        self.df_model = self.exog.shape[1] - 1  # assumes constant
        self.df_resid = self.nobs - self.exog.shape[1]

        # Skip the covariance updates if all groups have a single
        # observation (reduces to fitting a GLM).
        maxgroup = max([len(x) for x in self.endog_li])
        if maxgroup == 1:
            self.update_dep = False
|
||
|
||
    # Override to allow groups and time to be passed as variable
    # names.
    @classmethod
    def from_formula(cls, formula, groups, data, subset=None,
                     time=None, offset=None, exposure=None,
                     *args, **kwargs):
        """
        Create a GEE model instance from a formula and dataframe.

        Parameters
        ----------
        formula : str or generic Formula object
            The formula specifying the model
        groups : array_like or string
            Array of grouping labels.  If a string, this is the name
            of a variable in `data` that contains the grouping labels.
        data : array_like
            The data for the model.
        subset : array_like
            An array-like object of booleans, integers, or index
            values that indicate the subset of the data to used when
            fitting the model.
        time : array_like or string
            The time values, used for dependence structures involving
            distances between observations.  If a string, this is the
            name of a variable in `data` that contains the time
            values.
        offset : array_like or string
            The offset values, added to the linear predictor.  If a
            string, this is the name of a variable in `data` that
            contains the offset values.
        exposure : array_like or string
            The exposure values, only used if the link function is the
            logarithm function, in which case the log of `exposure`
            is added to the offset (if any).  If a string, this is the
            name of a variable in `data` that contains the offset
            values.
        {missing_param_doc}
        args : extra arguments
            These are passed to the model
        kwargs : extra keyword arguments
            These are passed to the model with two exceptions. `dep_data`
            is processed as described below.  The ``eval_env`` keyword is
            passed to patsy. It can be either a
            :class:`patsy:patsy.EvalEnvironment` object or an integer
            indicating the depth of the namespace to use. For example, the
            default ``eval_env=0`` uses the calling namespace.
            If you wish to use a "clean" environment set ``eval_env=-1``.

        Optional arguments
        ------------------
        dep_data : str or array_like
            Data used for estimating the dependence structure.  See
            specific dependence structure classes (e.g. Nested) for
            details.  If `dep_data` is a string, it is interpreted as
            a formula that is applied to `data`. If it is an array, it
            must be an array of strings corresponding to column names in
            `data`.  Otherwise it must be an array-like with the same
            number of rows as data.

        Returns
        -------
        model : GEE model instance

        Notes
        -----
        `data` must define __getitem__ with the keys in the formula
        terms args and kwargs are passed on to the model
        instantiation. E.g., a numpy structured or rec array, a
        dictionary, or a pandas DataFrame.
        """.format(missing_param_doc=base._missing_param_doc)
        # NOTE(review): the .format(...) call above makes this an
        # expression statement rather than a literal docstring, so it is
        # not stored in __doc__ — confirm this is intended.

        # Resolve string arguments to columns of `data`.
        groups_name = "Groups"
        if isinstance(groups, str):
            groups_name = groups
            groups = data[groups]

        if isinstance(time, str):
            time = data[time]

        if isinstance(offset, str):
            offset = data[offset]

        if isinstance(exposure, str):
            exposure = data[exposure]

        # dep_data may be a formula (string), a list of column names,
        # or raw array data; normalize it to an ndarray.
        dep_data = kwargs.get("dep_data")
        dep_data_names = None
        if dep_data is not None:
            if isinstance(dep_data, str):
                dep_data = patsy.dmatrix(dep_data, data,
                                         return_type='dataframe')
                dep_data_names = dep_data.columns.tolist()
            else:
                dep_data_names = list(dep_data)
                dep_data = data[dep_data]
            kwargs["dep_data"] = np.asarray(dep_data)

        # Pull the family out of kwargs so it can be passed explicitly.
        family = None
        if "family" in kwargs:
            family = kwargs["family"]
            del kwargs["family"]

        model = super().from_formula(formula, data=data, subset=subset,
                                     groups=groups, time=time,
                                     offset=offset,
                                     exposure=exposure,
                                     family=family,
                                     *args, **kwargs)

        if dep_data_names is not None:
            model._dep_data_names = dep_data_names
        model._groups_name = groups_name

        return model
|
||
|
||
def cluster_list(self, array):
|
||
"""
|
||
Returns `array` split into subarrays corresponding to the
|
||
cluster structure.
|
||
"""
|
||
|
||
if array.ndim == 1:
|
||
return [np.array(array[self.group_indices[k]])
|
||
for k in self.group_labels]
|
||
else:
|
||
return [np.array(array[self.group_indices[k], :])
|
||
for k in self.group_labels]
|
||
|
||
def compare_score_test(self, submodel):
|
||
"""
|
||
Perform a score test for the given submodel against this model.
|
||
|
||
Parameters
|
||
----------
|
||
submodel : GEEResults instance
|
||
A fitted GEE model that is a submodel of this model.
|
||
|
||
Returns
|
||
-------
|
||
A dictionary with keys "statistic", "p-value", and "df",
|
||
containing the score test statistic, its chi^2 p-value,
|
||
and the degrees of freedom used to compute the p-value.
|
||
|
||
Notes
|
||
-----
|
||
The score test can be performed without calling 'fit' on the
|
||
larger model. The provided submodel must be obtained from a
|
||
fitted GEE.
|
||
|
||
This method performs the same score test as can be obtained by
|
||
fitting the GEE with a linear constraint and calling `score_test`
|
||
on the results.
|
||
|
||
References
|
||
----------
|
||
Xu Guo and Wei Pan (2002). "Small sample performance of the score
|
||
test in GEE".
|
||
http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
|
||
"""
|
||
|
||
# Since the model has not been fit, its scaletype has not been
|
||
# set. So give it the scaletype of the submodel.
|
||
self.scaletype = submodel.model.scaletype
|
||
|
||
# Check consistency between model and submodel (not a comprehensive
|
||
# check)
|
||
submod = submodel.model
|
||
if self.exog.shape[0] != submod.exog.shape[0]:
|
||
msg = "Model and submodel have different numbers of cases."
|
||
raise ValueError(msg)
|
||
if self.exog.shape[1] == submod.exog.shape[1]:
|
||
msg = "Model and submodel have the same number of variables"
|
||
warnings.warn(msg)
|
||
if not isinstance(self.family, type(submod.family)):
|
||
msg = "Model and submodel have different GLM families."
|
||
warnings.warn(msg)
|
||
if not isinstance(self.cov_struct, type(submod.cov_struct)):
|
||
warnings.warn("Model and submodel have different GEE covariance "
|
||
"structures.")
|
||
if not np.equal(self.weights, submod.weights).all():
|
||
msg = "Model and submodel should have the same weights."
|
||
warnings.warn(msg)
|
||
|
||
# Get the positions of the submodel variables in the
|
||
# parent model
|
||
qm, qc = _score_test_submodel(self, submodel.model)
|
||
if qm is None:
|
||
msg = "The provided model is not a submodel."
|
||
raise ValueError(msg)
|
||
|
||
# Embed the submodel params into a params vector for the
|
||
# parent model
|
||
params_ex = np.dot(qm, submodel.params)
|
||
|
||
# Attempt to preserve the state of the parent model
|
||
cov_struct_save = self.cov_struct
|
||
import copy
|
||
cached_means_save = copy.deepcopy(self.cached_means)
|
||
|
||
# Get the score vector of the submodel params in
|
||
# the parent model
|
||
self.cov_struct = submodel.cov_struct
|
||
self.update_cached_means(params_ex)
|
||
_, score = self._update_mean_params()
|
||
if score is None:
|
||
msg = "Singular matrix encountered in GEE score test"
|
||
warnings.warn(msg, ConvergenceWarning)
|
||
return None
|
||
|
||
if not hasattr(self, "ddof_scale"):
|
||
self.ddof_scale = self.exog.shape[1]
|
||
|
||
if not hasattr(self, "scaling_factor"):
|
||
self.scaling_factor = 1
|
||
|
||
_, ncov1, cmat = self._covmat()
|
||
score2 = np.dot(qc.T, score)
|
||
|
||
try:
|
||
amat = np.linalg.inv(ncov1)
|
||
except np.linalg.LinAlgError:
|
||
amat = np.linalg.pinv(ncov1)
|
||
|
||
bmat_11 = np.dot(qm.T, np.dot(cmat, qm))
|
||
bmat_22 = np.dot(qc.T, np.dot(cmat, qc))
|
||
bmat_12 = np.dot(qm.T, np.dot(cmat, qc))
|
||
|
||
amat_11 = np.dot(qm.T, np.dot(amat, qm))
|
||
amat_12 = np.dot(qm.T, np.dot(amat, qc))
|
||
|
||
try:
|
||
ab = np.linalg.solve(amat_11, bmat_12)
|
||
except np.linalg.LinAlgError:
|
||
ab = np.dot(np.linalg.pinv(amat_11), bmat_12)
|
||
|
||
score_cov = bmat_22 - np.dot(amat_12.T, ab)
|
||
|
||
try:
|
||
aa = np.linalg.solve(amat_11, amat_12)
|
||
except np.linalg.LinAlgError:
|
||
aa = np.dot(np.linalg.pinv(amat_11), amat_12)
|
||
|
||
score_cov -= np.dot(bmat_12.T, aa)
|
||
|
||
try:
|
||
ab = np.linalg.solve(amat_11, bmat_11)
|
||
except np.linalg.LinAlgError:
|
||
ab = np.dot(np.linalg.pinv(amat_11), bmat_11)
|
||
|
||
try:
|
||
aa = np.linalg.solve(amat_11, amat_12)
|
||
except np.linalg.LinAlgError:
|
||
aa = np.dot(np.linalg.pinv(amat_11), amat_12)
|
||
|
||
score_cov += np.dot(amat_12.T, np.dot(ab, aa))
|
||
|
||
# Attempt to restore state
|
||
self.cov_struct = cov_struct_save
|
||
self.cached_means = cached_means_save
|
||
|
||
from scipy.stats.distributions import chi2
|
||
try:
|
||
sc2 = np.linalg.solve(score_cov, score2)
|
||
except np.linalg.LinAlgError:
|
||
sc2 = np.dot(np.linalg.pinv(score_cov), score2)
|
||
score_statistic = np.dot(score2, sc2)
|
||
score_df = len(score2)
|
||
score_pvalue = 1 - chi2.cdf(score_statistic, score_df)
|
||
return {"statistic": score_statistic,
|
||
"df": score_df,
|
||
"p-value": score_pvalue}
|
||
|
||
def estimate_scale(self):
    """
    Estimate the dispersion/scale.

    Returns
    -------
    scale : float or ndarray
        1.0 for families with unit dispersion, the user-supplied
        fixed value if one was given, otherwise a Pearson-type
        estimate of the scale parameter.
    """

    if self.scaletype is None:
        # Families with theoretical unit dispersion use scale 1.
        if isinstance(self.family, (families.Binomial, families.Poisson,
                                    families.NegativeBinomial,
                                    _Multinomial)):
            return 1.
    elif isinstance(self.scaletype, (int, float)):
        # User-specified fixed scale.  Accept ints as well as floats;
        # previously an integer fell through and the scale was
        # silently re-estimated.
        return np.array(float(self.scaletype))
    # Otherwise (scaletype is None with a non-unit-dispersion family,
    # or scaletype == "X2"), estimate the scale from the Pearson
    # residuals.

    endog = self.endog_li
    cached_means = self.cached_means
    nobs = self.nobs
    varfunc = self.family.variance

    scale = 0.
    fsum = 0.
    for i in range(self.num_group):

        if len(endog[i]) == 0:
            continue

        expval, _ = cached_means[i]
        sdev = np.sqrt(varfunc(expval))
        resid = (endog[i] - expval) / sdev

        if self.weights is not None:
            f = self.weights_li[i]
            scale += np.sum(f * (resid ** 2))
            fsum += f.sum()
        else:
            scale += np.sum(resid ** 2)
            fsum += len(resid)

    # Normalize by the (weighted) sample size, adjusted for the
    # degrees of freedom used by the mean parameters.
    scale /= (fsum * (nobs - self.ddof_scale) / float(nobs))

    return scale
|
||
|
||
def mean_deriv(self, exog, lin_pred):
    """
    Derivative of the expected endog with respect to the parameters.

    Parameters
    ----------
    exog : array_like
        The exogeneous data at which the derivative is computed.
    lin_pred : array_like
        The values of the linear predictor.

    Returns
    -------
    The value of the derivative of the expected endog with respect
    to the parameter vector.

    Notes
    -----
    If there is an offset or exposure, it should be added to
    `lin_pred` prior to calling this function.
    """

    # Chain rule: d E[y] / d beta = (g^{-1})'(eta) * x, row-wise.
    link_deriv = self.family.link.inverse_deriv(lin_pred)
    return exog * link_deriv[:, None]
|
||
|
||
def mean_deriv_exog(self, exog, params, offset_exposure=None):
    """
    Derivative of the expected endog with respect to exog.

    Parameters
    ----------
    exog : array_like
        Values of the independent variables at which the derivative
        is calculated.
    params : array_like
        Parameter values at which the derivative is calculated.
    offset_exposure : array_like, optional
        Combined offset and exposure.

    Returns
    -------
    The derivative of the expected endog with respect to exog.
    """

    # Linear predictor, including any offset/exposure.
    eta = np.dot(exog, params)
    if offset_exposure is not None:
        eta = eta + offset_exposure

    # d E[y]/d x_j = (g^{-1})'(eta) * beta_j for every observation.
    link_deriv = self.family.link.inverse_deriv(eta)
    return np.outer(link_deriv, params)
|
||
|
||
def _update_mean_params(self):
    """
    Compute one Fisher-scoring step for the mean structure parameters.

    Returns
    -------
    update : array_like
        The update vector such that params + update is the next
        iterate when solving the score equations.
    score : array_like
        The current value of the score equations, not
        incorporating the scale parameter. If desired,
        multiply this vector by the scale parameter to
        incorporate the scale.

    Notes
    -----
    Returns (None, None) if a working covariance matrix is singular.
    """

    endog = self.endog_li
    exog = self.exog_li
    # Case weights; absent (None) when the model is unweighted.
    weights = getattr(self, "weights_li", None)

    cached_means = self.cached_means

    varfunc = self.family.variance

    bmat, score = 0, 0
    for i in range(self.num_group):

        # expval: fitted means, lpr: linear predictor for group i.
        expval, lpr = cached_means[i]
        resid = endog[i] - expval
        dmat = self.mean_deriv(exog[i], lpr)
        sdev = np.sqrt(varfunc(expval))

        if weights is not None:
            w = weights[i]
            wresid = resid * w
            wdmat = dmat * w[:, None]
        else:
            wresid = resid
            wdmat = dmat

        # Solve V^{-1} D and V^{-1} r against the working covariance.
        rslt = self.cov_struct.covariance_matrix_solve(
            expval, i, sdev, (wdmat, wresid))
        if rslt is None:
            # Singular working covariance for this group.
            return None, None
        vinv_d, vinv_resid = tuple(rslt)

        bmat += np.dot(dmat.T, vinv_d)
        score += np.dot(dmat.T, vinv_resid)

    # Fall back to the pseudo-inverse if the information matrix
    # is singular.
    try:
        update = np.linalg.solve(bmat, score)
    except np.linalg.LinAlgError:
        update = np.dot(np.linalg.pinv(bmat), score)

    self._fit_history["cov_adjust"].append(
        self.cov_struct.cov_adjust)

    return update, score
|
||
|
||
def update_cached_means(self, mean_params):
    """
    Recompute the cached group-wise mean vectors.

    `cached_means` must always reflect the most recent calculation of
    the group-wise means, so call this every time the regression
    parameters change.
    """

    endog = self.endog_li
    exog = self.exog_li
    offset = self.offset_li

    linkinv = self.family.link.inverse

    means = []
    for grp in range(self.num_group):

        # NOTE(review): empty groups are skipped, so cached_means can
        # be shorter than num_group when empty groups exist -- confirm
        # that callers index it accordingly.
        if len(endog[grp]) == 0:
            continue

        lin_pred = np.dot(exog[grp], mean_params)
        if offset is not None:
            lin_pred = lin_pred + offset[grp]

        means.append((linkinv(lin_pred), lin_pred))

    self.cached_means = means
|
||
|
||
def _covmat(self):
|
||
"""
|
||
Returns the sampling covariance matrix of the regression
|
||
parameters and related quantities.
|
||
|
||
Returns
|
||
-------
|
||
cov_robust : array_like
|
||
The robust, or sandwich estimate of the covariance, which
|
||
is meaningful even if the working covariance structure is
|
||
incorrectly specified.
|
||
cov_naive : array_like
|
||
The model-based estimate of the covariance, which is
|
||
meaningful if the covariance structure is correctly
|
||
specified.
|
||
cmat : array_like
|
||
The center matrix of the sandwich expression, used in
|
||
obtaining score test results.
|
||
"""
|
||
|
||
endog = self.endog_li
|
||
exog = self.exog_li
|
||
weights = getattr(self, "weights_li", None)
|
||
varfunc = self.family.variance
|
||
cached_means = self.cached_means
|
||
|
||
# Calculate the naive (model-based) and robust (sandwich)
|
||
# covariances.
|
||
bmat, cmat = 0, 0
|
||
for i in range(self.num_group):
|
||
|
||
expval, lpr = cached_means[i]
|
||
resid = endog[i] - expval
|
||
dmat = self.mean_deriv(exog[i], lpr)
|
||
sdev = np.sqrt(varfunc(expval))
|
||
|
||
if weights is not None:
|
||
w = weights[i]
|
||
wresid = resid * w
|
||
wdmat = dmat * w[:, None]
|
||
else:
|
||
wresid = resid
|
||
wdmat = dmat
|
||
|
||
rslt = self.cov_struct.covariance_matrix_solve(
|
||
expval, i, sdev, (wdmat, wresid))
|
||
if rslt is None:
|
||
return None, None, None, None
|
||
vinv_d, vinv_resid = tuple(rslt)
|
||
|
||
bmat += np.dot(dmat.T, vinv_d)
|
||
dvinv_resid = np.dot(dmat.T, vinv_resid)
|
||
cmat += np.outer(dvinv_resid, dvinv_resid)
|
||
|
||
scale = self.estimate_scale()
|
||
|
||
try:
|
||
bmati = np.linalg.inv(bmat)
|
||
except np.linalg.LinAlgError:
|
||
bmati = np.linalg.pinv(bmat)
|
||
|
||
cov_naive = bmati * scale
|
||
cov_robust = np.dot(bmati, np.dot(cmat, bmati))
|
||
|
||
cov_naive *= self.scaling_factor
|
||
cov_robust *= self.scaling_factor
|
||
return cov_robust, cov_naive, cmat
|
||
|
||
# Calculate the bias-corrected sandwich estimate of Mancl and
# DeRouen.
def _bc_covmat(self, cov_naive):
    """
    Compute the Mancl-DeRouen bias-corrected (small-sample) robust
    covariance estimate.

    Parameters
    ----------
    cov_naive : array_like
        The model-based covariance, as returned by `_covmat`
        (already multiplied by `scaling_factor`).

    Returns
    -------
    The bias-corrected robust covariance matrix, or None if a
    working covariance matrix is singular.
    """

    # Undo the scaling factor; it is re-applied at the end.
    cov_naive = cov_naive / self.scaling_factor
    endog = self.endog_li
    exog = self.exog_li
    varfunc = self.family.variance
    cached_means = self.cached_means
    scale = self.estimate_scale()

    bcm = 0
    for i in range(self.num_group):

        expval, lpr = cached_means[i]
        resid = endog[i] - expval
        dmat = self.mean_deriv(exog[i], lpr)
        sdev = np.sqrt(varfunc(expval))

        rslt = self.cov_struct.covariance_matrix_solve(
            expval, i, sdev, (dmat,))
        if rslt is None:
            return None
        vinv_d = rslt[0]
        vinv_d /= scale

        # hmat is the leverage ("hat") matrix for group i.
        hmat = np.dot(vinv_d, cov_naive)
        hmat = np.dot(hmat, dmat.T).T

        # Group weight (1 when the model is unweighted).
        f = self.weights_li[i] if self.weights is not None else 1.

        # Inflate residuals by (I - H)^{-1} per Mancl & DeRouen.
        aresid = np.linalg.solve(np.eye(len(resid)) - hmat, resid)
        rslt = self.cov_struct.covariance_matrix_solve(
            expval, i, sdev, (aresid,))
        if rslt is None:
            return None
        srt = rslt[0]
        srt = f * np.dot(dmat.T, srt) / scale
        bcm += np.outer(srt, srt)

    cov_robust_bc = np.dot(cov_naive, np.dot(bcm, cov_naive))
    cov_robust_bc *= self.scaling_factor

    return cov_robust_bc
|
||
|
||
def _starting_params(self):
    """
    Compute starting values for the mean parameters by fitting an
    independence-model GLM.
    """

    # A scalar _offset_exposure indicates that no offset/exposure
    # was provided.
    offset = (None if np.isscalar(self._offset_exposure)
              else self._offset_exposure)

    glm_mod = GLM(self.endog, self.exog, family=self.family,
                  offset=offset, freq_weights=self.weights)
    return glm_mod.fit().params
|
||
|
||
@Appender(_gee_fit_doc)
def fit(self, maxiter=60, ctol=1e-6, start_params=None,
        params_niter=1, first_dep_update=0,
        cov_type='robust', ddof_scale=None, scaling_factor=1.,
        scale=None):

    self.scaletype = scale

    # Subtract this number from the total sample size when
    # normalizing the scale parameter estimate.
    if ddof_scale is None:
        self.ddof_scale = self.exog.shape[1]
    else:
        if not ddof_scale >= 0:
            raise ValueError(
                "ddof_scale must be a non-negative number or None")
        self.ddof_scale = ddof_scale

    self.scaling_factor = scaling_factor

    self._fit_history = defaultdict(list)

    if self.weights is not None and cov_type == 'naive':
        raise ValueError("when using weights, cov_type may not be naive")

    if start_params is None:
        mean_params = self._starting_params()
    else:
        start_params = np.asarray(start_params)
        mean_params = start_params.copy()

    self.update_cached_means(mean_params)

    del_params = -1.
    num_assoc_updates = 0
    for itr in range(maxiter):

        update, score = self._update_mean_params()
        if update is None:
            warnings.warn("Singular matrix encountered in GEE update",
                          ConvergenceWarning)
            break
        mean_params += update
        self.update_cached_means(mean_params)

        # L2 norm of the score vector; used as the convergence
        # criterion for the mean structure parameters.
        del_params = np.sqrt(np.sum(score ** 2))

        self._fit_history['params'].append(mean_params.copy())
        self._fit_history['score'].append(score)
        self._fit_history['dep_params'].append(
            self.cov_struct.dep_params)

        # Do not exit until the association parameters have been
        # updated at least once.
        if (del_params < ctol and
                (num_assoc_updates > 0 or self.update_dep is False)):
            break

        # Update the dependence structure
        if (self.update_dep and (itr % params_niter) == 0
                and (itr >= first_dep_update)):
            self._update_assoc(mean_params)
            num_assoc_updates += 1

    if del_params >= ctol:
        warnings.warn("Iteration limit reached prior to convergence",
                      IterationLimitWarning)

    if mean_params is None:
        warnings.warn("Unable to estimate GEE parameters.",
                      ConvergenceWarning)
        return None

    bcov, ncov, _ = self._covmat()
    if bcov is None:
        warnings.warn("Estimated covariance structure for GEE "
                      "estimates is singular", ConvergenceWarning)
        return None
    bc_cov = None
    if cov_type == "bias_reduced":
        bc_cov = self._bc_covmat(ncov)

    if self.constraint is not None:
        # Expand the constrained estimates/covariances back to the
        # full-model coordinate system.
        x = mean_params.copy()
        mean_params, bcov = self._handle_constraint(mean_params, bcov)
        if mean_params is None:
            warnings.warn("Unable to estimate constrained GEE "
                          "parameters.", ConvergenceWarning)
            return None

        y, ncov = self._handle_constraint(x, ncov)
        if y is None:
            warnings.warn("Unable to estimate constrained GEE "
                          "parameters.", ConvergenceWarning)
            return None

        if bc_cov is not None:
            y, bc_cov = self._handle_constraint(x, bc_cov)
            # Check the value returned by _handle_constraint, not the
            # (never-reassigned) input `x`, to detect failure; the
            # previous `if x is None` test could never trigger.
            if y is None:
                warnings.warn("Unable to estimate constrained GEE "
                              "parameters.", ConvergenceWarning)
                return None

    scale = self.estimate_scale()

    # kwargs to add to results instance, need to be available in __init__
    res_kwds = dict(cov_type=cov_type,
                    cov_robust=bcov,
                    cov_naive=ncov,
                    cov_robust_bc=bc_cov)

    # The superclass constructor will multiply the covariance
    # matrix argument bcov by scale, which we do not want, so we
    # divide bcov by the scale parameter here
    results = GEEResults(self, mean_params, bcov / scale, scale,
                         cov_type=cov_type, use_t=False,
                         attr_kwds=res_kwds)

    # attributes not needed during results__init__
    results.fit_history = self._fit_history
    self.fit_history = defaultdict(list)
    results.score_norm = del_params
    results.converged = (del_params < ctol)
    results.cov_struct = self.cov_struct
    results.params_niter = params_niter
    results.first_dep_update = first_dep_update
    results.ctol = ctol
    results.maxiter = maxiter

    # These will be copied over to subclasses when upgrading.
    results._props = ["cov_type", "use_t",
                      "cov_params_default", "cov_robust",
                      "cov_naive", "cov_robust_bc",
                      "fit_history",
                      "score_norm", "converged", "cov_struct",
                      "params_niter", "first_dep_update", "ctol",
                      "maxiter"]

    return GEEResultsWrapper(results)
|
||
|
||
def _update_regularized(self, params, pen_wt, scad_param, eps):
    """
    One Newton-type step for SCAD-penalized GEE (Wang et al. 2012).

    Parameters
    ----------
    params : array_like
        Current regression parameter values.
    pen_wt : float
        The penalty weight (lambda).
    scad_param : float
        Shape parameter of the SCAD penalty.
    eps : float
        Small constant stabilizing the penalty denominator.

    Returns
    -------
    update : array_like
        The step to add to `params`.
    hm : array_like
        The penalized, scale-adjusted Hessian-type matrix.
    """

    sn, hm = 0, 0

    for i in range(self.num_group):

        expval, _ = self.cached_means[i]
        resid = self.endog_li[i] - expval
        sdev = np.sqrt(self.family.variance(expval))

        # Uses variance-scaled exog; per fit_regularized's docstring
        # this implementation assumes a canonical link.
        ex = self.exog_li[i] * sdev[:, None]**2
        rslt = self.cov_struct.covariance_matrix_solve(
            expval, i, sdev, (resid, ex))
        sn0 = rslt[0]
        sn += np.dot(ex.T, sn0)
        hm0 = rslt[1]
        hm += np.dot(ex.T, hm0)

    # Wang et al. divide sn here by num_group, but that
    # seems to be incorrect

    # Elementwise SCAD penalty derivative evaluated at |params|.
    ap = np.abs(params)
    clipped = np.clip(scad_param * pen_wt - ap, 0, np.inf)
    en = pen_wt * clipped * (ap > pen_wt)
    en /= (scad_param - 1) * pen_wt
    en += pen_wt * (ap <= pen_wt)
    en /= eps + ap

    # Add the penalty to the diagonal of hm and to the score.
    hm.flat[::hm.shape[0] + 1] += self.num_group * en
    sn -= self.num_group * en * params
    try:
        update = np.linalg.solve(hm, sn)
    except np.linalg.LinAlgError:
        # Fall back to the pseudo-inverse and warn.
        update = np.dot(np.linalg.pinv(hm), sn)
        msg = "Encountered singularity in regularized GEE update"
        warnings.warn(msg)
    hm *= self.estimate_scale()

    return update, hm
|
||
|
||
def _regularized_covmat(self, mean_params):
|
||
|
||
self.update_cached_means(mean_params)
|
||
|
||
ma = 0
|
||
|
||
for i in range(self.num_group):
|
||
|
||
expval, _ = self.cached_means[i]
|
||
resid = self.endog_li[i] - expval
|
||
sdev = np.sqrt(self.family.variance(expval))
|
||
|
||
ex = self.exog_li[i] * sdev[:, None]**2
|
||
rslt = self.cov_struct.covariance_matrix_solve(
|
||
expval, i, sdev, (resid,))
|
||
ma0 = np.dot(ex.T, rslt[0])
|
||
ma += np.outer(ma0, ma0)
|
||
|
||
return ma
|
||
|
||
def fit_regularized(self, pen_wt, scad_param=3.7, maxiter=100,
                    ddof_scale=None, update_assoc=5,
                    ctol=1e-5, ztol=1e-3, eps=1e-6, scale=None):
    """
    Regularized estimation for GEE.

    Parameters
    ----------
    pen_wt : float
        The penalty weight (a non-negative scalar).
    scad_param : float
        Non-negative scalar determining the shape of the Scad
        penalty.
    maxiter : int
        The maximum number of iterations.
    ddof_scale : int
        Value to subtract from `nobs` when calculating the
        denominator degrees of freedom for t-statistics, defaults
        to the number of columns in `exog`.
    update_assoc : int
        The dependence parameters are updated every `update_assoc`
        iterations of the mean structure parameter updates.
    ctol : float
        Convergence criterion, default is one order of magnitude
        smaller than proposed in section 3.1 of Wang et al.
    ztol : float
        Coefficients smaller than this value are treated as
        being zero, default is based on section 5 of Wang et al.
    eps : non-negative scalar
        Numerical constant, see section 3.2 of Wang et al.
    scale : float or string
        If a float, this value is used as the scale parameter.
        If "X2", the scale parameter is always estimated using
        Pearson's chi-square method (e.g. as in a quasi-Poisson
        analysis). If None, the default approach for the family
        is used to estimate the scale parameter.

    Returns
    -------
    GEEResults instance. Note that not all methods of the results
    class make sense when the model has been fit with regularization.

    Notes
    -----
    This implementation assumes that the link is canonical.

    References
    ----------
    Wang L, Zhou J, Qu A. (2012). Penalized generalized estimating
    equations for high-dimensional longitudinal data analysis.
    Biometrics. 2012 Jun;68(2):353-60.
    doi: 10.1111/j.1541-0420.2011.01678.x.
    https://www.ncbi.nlm.nih.gov/pubmed/21955051
    http://users.stat.umn.edu/~wangx346/research/GEE_selection.pdf
    """

    self.scaletype = scale

    # Start from the zero vector, as in Wang et al.
    mean_params = np.zeros(self.exog.shape[1])
    self.update_cached_means(mean_params)
    converged = False
    fit_history = defaultdict(list)

    # Subtract this number from the total sample size when
    # normalizing the scale parameter estimate.
    if ddof_scale is None:
        self.ddof_scale = self.exog.shape[1]
    else:
        if not ddof_scale >= 0:
            raise ValueError(
                "ddof_scale must be a non-negative number or None")
        self.ddof_scale = ddof_scale

    # Keep this private for now. In some cases the early steps are
    # very small so it seems necessary to ensure a certain minimum
    # number of iterations before testing for convergence.
    miniter = 20

    for itr in range(maxiter):

        update, hm = self._update_regularized(
            mean_params, pen_wt, scad_param, eps)
        if update is None:
            msg = "Singular matrix encountered in regularized GEE update"
            warnings.warn(msg, ConvergenceWarning)
            break
        if itr > miniter and np.sqrt(np.sum(update**2)) < ctol:
            converged = True
            break
        mean_params += update
        fit_history['params'].append(mean_params.copy())
        self.update_cached_means(mean_params)

        # Periodically refresh the dependence parameters.
        if itr != 0 and (itr % update_assoc == 0):
            self._update_assoc(mean_params)

    if not converged:
        msg = "GEE.fit_regularized did not converge"
        warnings.warn(msg)

    # Threshold small coefficients to exactly zero (section 5 of
    # Wang et al.).
    mean_params[np.abs(mean_params) < ztol] = 0

    self._update_assoc(mean_params)
    ma = self._regularized_covmat(mean_params)
    # Sandwich covariance: hm^{-1} ma hm^{-1}.
    cov = np.linalg.solve(hm, ma)
    cov = np.linalg.solve(hm, cov.T)

    # kwargs to add to results instance, need to be available in __init__
    res_kwds = dict(cov_type="robust", cov_robust=cov)

    scale = self.estimate_scale()
    rslt = GEEResults(self, mean_params, cov, scale,
                      regularized=True, attr_kwds=res_kwds)
    rslt.fit_history = fit_history

    return GEEResultsWrapper(rslt)
|
||
|
||
def _handle_constraint(self, mean_params, bcov):
    """
    Expand the parameter estimate `mean_params` and covariance matrix
    `bcov` to the coordinate system of the unconstrained model.

    Parameters
    ----------
    mean_params : array_like
        A parameter vector estimate for the reduced model.
    bcov : array_like
        The covariance matrix of mean_params.

    Returns
    -------
    mean_params : array_like
        The input parameter vector mean_params, expanded to the
        coordinate system of the full model
    bcov : array_like
        The input covariance matrix bcov, expanded to the
        coordinate system of the full model

    Notes
    -----
    As a side effect, stores a score test of the constraint in
    `self.score_test_results`.
    """

    # Sizes of the reduced (constrained) and full models.
    red_p = len(mean_params)
    full_p = self.constraint.lhs.shape[1]
    # Pad with zeros in the constrained-away directions.
    mean_params0 = np.r_[mean_params, np.zeros(full_p - red_p)]

    # Get the score vector under the full model.
    save_exog_li = self.exog_li
    self.exog_li = self.constraint.exog_fulltrans_li
    import copy
    save_cached_means = copy.deepcopy(self.cached_means)
    self.update_cached_means(mean_params0)
    _, score = self._update_mean_params()

    if score is None:
        warnings.warn("Singular matrix encountered in GEE score test",
                      ConvergenceWarning)
        return None, None

    _, ncov1, cmat = self._covmat()
    scale = self.estimate_scale()
    cmat = cmat / scale ** 2
    score2 = score[red_p:] / scale
    amat = np.linalg.inv(ncov1)

    # Partition the bread (amat) and meat (cmat) matrices into free
    # (indices < red_p) and constrained blocks.
    bmat_11 = cmat[0:red_p, 0:red_p]
    bmat_22 = cmat[red_p:, red_p:]
    bmat_12 = cmat[0:red_p, red_p:]
    amat_11 = amat[0:red_p, 0:red_p]
    amat_12 = amat[0:red_p, red_p:]

    # Covariance of the constrained-direction score.
    score_cov = bmat_22 - np.dot(amat_12.T,
                                 np.linalg.solve(amat_11, bmat_12))
    score_cov -= np.dot(bmat_12.T,
                        np.linalg.solve(amat_11, amat_12))
    score_cov += np.dot(amat_12.T,
                        np.dot(np.linalg.solve(amat_11, bmat_11),
                               np.linalg.solve(amat_11, amat_12)))

    from scipy.stats.distributions import chi2
    score_statistic = np.dot(score2,
                             np.linalg.solve(score_cov, score2))
    score_df = len(score2)
    score_pvalue = 1 - chi2.cdf(score_statistic, score_df)
    self.score_test_results = {"statistic": score_statistic,
                               "df": score_df,
                               "p-value": score_pvalue}

    # Map the reduced-model quantities back to the full coordinate
    # system and restore the saved model state.
    mean_params = self.constraint.unpack_param(mean_params)
    bcov = self.constraint.unpack_cov(bcov)

    self.exog_li = save_exog_li
    self.cached_means = save_cached_means
    self.exog = self.constraint.restore_exog()

    return mean_params, bcov
|
||
|
||
def _update_assoc(self, params):
|
||
"""
|
||
Update the association parameters
|
||
"""
|
||
|
||
self.cov_struct.update(params)
|
||
|
||
def _derivative_exog(self, params, exog=None, transform='dydx',
                     dummy_idx=None, count_idx=None):
    """
    For computing marginal effects, returns dF(XB) / dX where F(.)
    is the fitted mean.

    transform can be 'dydx', 'dyex', 'eydx', or 'eyex'.

    Not all of these make sense in the presence of discrete regressors,
    but checks are done in the results in get_margeff.
    """
    # This form should be appropriate for group 1 probit, logit,
    # logistic, cloglog, heckprob, xtprobit.
    offset_exposure = None
    if exog is None:
        # Default to the model's own design and offset/exposure.
        exog = self.exog
        offset_exposure = self._offset_exposure

    margeff = self.mean_deriv_exog(exog, params, offset_exposure)

    # 'ex'/'ey' transforms rescale to elasticities (in-place).
    if 'ex' in transform:
        margeff *= exog
    if 'ey' in transform:
        margeff /= self.predict(params, exog)[:, None]
    if count_idx is not None:
        from statsmodels.discrete.discrete_margins import (
            _get_count_effects)
        margeff = _get_count_effects(margeff, exog, count_idx, transform,
                                     self, params)
    if dummy_idx is not None:
        from statsmodels.discrete.discrete_margins import (
            _get_dummy_effects)
        margeff = _get_dummy_effects(margeff, exog, dummy_idx, transform,
                                     self, params)
    return margeff
|
||
|
||
def qic(self, params, scale, cov_params, n_step=1000):
    """
    Returns quasi-information criteria and quasi-likelihood values.

    Parameters
    ----------
    params : array_like
        The GEE estimates of the regression parameters.
    scale : scalar
        Estimated scale parameter
    cov_params : array_like
        An estimate of the covariance matrix for the
        model parameters. Conventionally this is the robust
        covariance matrix.
    n_step : integer
        The number of points in the trapezoidal approximation
        to the quasi-likelihood function.

    Returns
    -------
    ql : scalar
        The quasi-likelihood value
    qic : scalar
        A QIC that can be used to compare the mean and covariance
        structures of the model.
    qicu : scalar
        A simplified QIC that can be used to compare mean structures
        but not covariance structures

    Notes
    -----
    The quasi-likelihood used here is obtained by numerically evaluating
    Wedderburn's integral representation of the quasi-likelihood function.
    This approach is valid for all families and links. Many other
    packages use analytical expressions for quasi-likelihoods that are
    valid in special cases where the link function is canonical. These
    analytical expressions may omit additive constants that only depend
    on the data. Therefore, the numerical values of our QL and QIC values
    will differ from the values reported by other packages. However only
    the differences between two QIC values calculated for different models
    using the same data are meaningful. Our QIC should produce the same
    QIC differences as other software.

    When using the QIC for models with unknown scale parameter, use a
    common estimate of the scale parameter for all models being compared.

    References
    ----------
    .. [*] W. Pan (2001). Akaike's information criterion in generalized
           estimating equations. Biometrics (57) 1.
    """

    varfunc = self.family.variance

    means = []
    omega = 0.0
    # omega^-1 is the model-based covariance assuming independence

    for i in range(self.num_group):
        expval, lpr = self.cached_means[i]
        means.append(expval)
        dmat = self.mean_deriv(self.exog_li[i], lpr)
        omega += np.dot(dmat.T, dmat) / scale

    means = np.concatenate(means)

    # The quasi-likelihood, use change of variables so the integration is
    # from -1 to 1.
    endog_li = np.concatenate(self.endog_li)
    du = means - endog_li
    qv = np.empty(n_step)
    xv = np.linspace(-0.99999, 1, n_step)
    # Evaluate the integrand of Wedderburn's representation on the
    # grid; u traces the path from endog to the fitted mean.
    for i, g in enumerate(xv):
        u = endog_li + (g + 1) * du / 2.0
        vu = varfunc(u)
        qv[i] = -np.sum(du**2 * (g + 1) / vu)
    qv /= (4 * scale)

    try:
        from scipy.integrate import trapezoid
    except ImportError:
        # Remove after minimum is SciPy 1.7
        from scipy.integrate import trapz as trapezoid
    ql = trapezoid(qv, dx=xv[1] - xv[0])

    qicu = -2 * ql + 2 * self.exog.shape[1]
    qic = -2 * ql + 2 * np.trace(np.dot(omega, cov_params))

    return ql, qic, qicu
|
||
|
||
|
||
class GEEResults(GLMResults):
|
||
|
||
__doc__ = (
|
||
"This class summarizes the fit of a marginal regression model "
|
||
"using GEE.\n" + _gee_results_doc)
|
||
|
||
def __init__(self, model, params, cov_params, scale,
             cov_type='robust', use_t=False, regularized=False,
             **kwds):
    """
    Store the GEE fit and select the default covariance matrix.

    `cov_type` must be one of "robust", "naive", or "bias_reduced"
    (case-insensitive); subclasses may instead pre-set `cov_type`
    and `cov_params_default` through `attr_kwds`.
    """

    super().__init__(
        model, params, normalized_cov_params=cov_params, scale=scale
    )

    # not added by super
    self.df_resid = model.df_resid
    self.df_model = model.df_model
    self.family = model.family

    attr_kwds = kwds.pop('attr_kwds', {})
    self.__dict__.update(attr_kwds)

    # we do not do this if the cov_type has already been set
    # subclasses can set it through attr_kwds
    if not (hasattr(self, 'cov_type') and
            hasattr(self, 'cov_params_default')):
        self.cov_type = cov_type  # keep alias
        covariance_type = self.cov_type.lower()
        allowed_covariances = ["robust", "naive", "bias_reduced"]
        if covariance_type not in allowed_covariances:
            msg = ("GEE: `cov_type` must be one of " +
                   ", ".join(allowed_covariances))
            raise ValueError(msg)

        # Compare the normalized (lower-case) name; previously a
        # mixed-case cov_type passed validation but matched no branch,
        # leaving `cov` unbound (NameError).
        if covariance_type == "robust":
            cov = self.cov_robust
        elif covariance_type == "naive":
            cov = self.cov_naive
        elif covariance_type == "bias_reduced":
            cov = self.cov_robust_bc

        self.cov_params_default = cov
    else:
        if self.cov_type != cov_type:
            raise ValueError('cov_type in argument is different from '
                             'already attached cov_type')
|
||
|
||
@cache_readonly
def resid(self):
    """
    The response residuals.

    Alias for `resid_response` (observed endog minus fitted mean).
    """
    return self.resid_response
|
||
|
||
def standard_errors(self, cov_type="robust"):
    """
    This is a convenience function that returns the standard
    errors for any covariance type. The value of `bse` is the
    standard errors for whichever covariance type is specified as
    an argument to `fit` (defaults to "robust").

    Parameters
    ----------
    cov_type : str
        One of "robust", "naive", or "bias_reduced". Determines
        the covariance used to compute standard errors. Defaults
        to "robust".
    """

    # Validate the requested covariance type (case-insensitive).
    covariance_type = cov_type.lower()
    allowed_covariances = ["robust", "naive", "bias_reduced"]
    if covariance_type not in allowed_covariances:
        msg = ("GEE: `covariance_type` must be one of " +
               ", ".join(allowed_covariances))
        raise ValueError(msg)

    if covariance_type == "robust":
        cov = self.cov_robust
    elif covariance_type == "naive":
        cov = self.cov_naive
    else:  # "bias_reduced"
        if self.cov_robust_bc is None:
            raise ValueError(
                "GEE: `bias_reduced` covariance not available")
        cov = self.cov_robust_bc

    return np.sqrt(np.diag(cov))
|
||
|
||
# Need to override to allow for different covariance types.
@cache_readonly
def bse(self):
    """
    Standard errors of the parameters, computed under the covariance
    type selected when the model was fit (``self.cov_type``).
    """
    return self.standard_errors(self.cov_type)
|
||
|
||
def score_test(self):
    """
    Return the results of a score test for a linear constraint.

    Returns
    -------
    A dictionary containing the p-value, the test statistic,
    and the degrees of freedom for the score test.

    Notes
    -----
    See also GEE.compare_score_test for an alternative way to perform
    a score test. GEEResults.score_test is more general, in that it
    supports testing arbitrary linear equality constraints. However
    GEE.compare_score_test might be easier to use when comparing
    two explicit models.

    References
    ----------
    Xu Guo and Wei Pan (2002). "Small sample performance of the score
    test in GEE".
    http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
    """

    # The test results are only attached to the model when it was fit
    # with a constraint (see GEE._handle_constraint).
    if hasattr(self.model, "score_test_results"):
        return self.model.score_test_results

    msg = "score_test on results instance only available when "
    msg += " model was fit with constraints"
    raise ValueError(msg)
|
||
|
||
@cache_readonly
def resid_split(self):
    """
    Returns the residuals, the endogeneous data minus the fitted
    values from the model. The residuals are returned as a list
    of arrays containing the residuals for each cluster.
    """
    model = self.model
    return [self.resid[model.group_indices[label]]
            for label in model.group_labels]
|
||
|
||
@cache_readonly
def resid_centered(self):
    """
    Returns the residuals centered within each group (each cluster's
    mean residual is subtracted from its members).
    """
    cresid = self.resid.copy()
    for label in self.model.group_labels:
        idx = self.model.group_indices[label]
        cresid[idx] = cresid[idx] - cresid[idx].mean()
    return cresid
|
||
|
||
@cache_readonly
def resid_centered_split(self):
    """
    Returns the residuals centered within each group. The
    residuals are returned as a list of arrays containing the
    centered residuals for each cluster.
    """
    model = self.model
    return [self.centered_resid[model.group_indices[label]]
            for label in model.group_labels]
|
||
|
||
def qic(self, scale=None, n_step=1000):
    """
    Returns the QIC and QICu information criteria.

    See GEE.qic for documentation.
    """

    # It is easy to forget to set the scale parameter. Sometimes
    # this is intentional, so we warn.
    if scale is None:
        warnings.warn("QIC values obtained using scale=None are not "
                      "appropriate for comparing models")
        scale = self.scale

    _, qic_val, qicu_val = self.model.qic(self.params, scale,
                                          self.cov_params(),
                                          n_step=n_step)
    return qic_val, qicu_val
|
||
|
||
# FIXME: alias to be removed, temporary backwards compatibility
# Old attribute names kept so existing user code keeps working.
split_resid = resid_split
centered_resid = resid_centered
split_centered_resid = resid_centered_split
|
||
|
||
@Appender(_plot_added_variable_doc % {'extra_params_doc': ''})
|
||
def plot_added_variable(self, focus_exog, resid_type=None,
|
||
use_glm_weights=True, fit_kwargs=None,
|
||
ax=None):
|
||
|
||
from statsmodels.graphics.regressionplots import plot_added_variable
|
||
|
||
fig = plot_added_variable(self, focus_exog,
|
||
resid_type=resid_type,
|
||
use_glm_weights=use_glm_weights,
|
||
fit_kwargs=fit_kwargs, ax=ax)
|
||
|
||
return fig
|
||
|
||
@Appender(_plot_partial_residuals_doc % {'extra_params_doc': ''})
|
||
def plot_partial_residuals(self, focus_exog, ax=None):
|
||
|
||
from statsmodels.graphics.regressionplots import plot_partial_residuals
|
||
|
||
return plot_partial_residuals(self, focus_exog, ax=ax)
|
||
|
||
@Appender(_plot_ceres_residuals_doc % {'extra_params_doc': ''})
|
||
def plot_ceres_residuals(self, focus_exog, frac=0.66, cond_means=None,
|
||
ax=None):
|
||
|
||
from statsmodels.graphics.regressionplots import plot_ceres_residuals
|
||
|
||
return plot_ceres_residuals(self, focus_exog, frac,
|
||
cond_means=cond_means, ax=ax)
|
||
|
||
def conf_int(self, alpha=.05, cols=None, cov_type=None):
|
||
"""
|
||
Returns confidence intervals for the fitted parameters.
|
||
|
||
Parameters
|
||
----------
|
||
alpha : float, optional
|
||
The `alpha` level for the confidence interval. i.e., The
|
||
default `alpha` = .05 returns a 95% confidence interval.
|
||
cols : array_like, optional
|
||
`cols` specifies which confidence intervals to return
|
||
cov_type : str
|
||
The covariance type used for computing standard errors;
|
||
must be one of 'robust', 'naive', and 'bias reduced'.
|
||
See `GEE` for details.
|
||
|
||
Notes
|
||
-----
|
||
The confidence interval is based on the Gaussian distribution.
|
||
"""
|
||
# super does not allow to specify cov_type and method is not
|
||
# implemented,
|
||
# FIXME: remove this method here
|
||
if cov_type is None:
|
||
bse = self.bse
|
||
else:
|
||
bse = self.standard_errors(cov_type=cov_type)
|
||
params = self.params
|
||
dist = stats.norm
|
||
q = dist.ppf(1 - alpha / 2)
|
||
|
||
if cols is None:
|
||
lower = self.params - q * bse
|
||
upper = self.params + q * bse
|
||
else:
|
||
cols = np.asarray(cols)
|
||
lower = params[cols] - q * bse[cols]
|
||
upper = params[cols] + q * bse[cols]
|
||
return np.asarray(lzip(lower, upper))
|
||
|
||
    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """
        Summarize the GEE regression results

        Parameters
        ----------
        yname : str, optional
            Default is `y`
        xname : list[str], optional
            Names for the exogenous variables, default is `var_#` for ## in
            the number of regressors. Must match the number of parameters in
            the model
        title : str, optional
            Title for the top table. If not None, then this replaces
            the default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be
            printed or converted to various output formats.

        Notes
        -----
        Standard errors reflect the covariance type selected when the
        model was fit (reported as "Covariance type" in the table).

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary results
        """

        # Left column of the header table: model description.
        top_left = [('Dep. Variable:', None),
                    ('Model:', None),
                    ('Method:', ['Generalized']),
                    ('', ['Estimating Equations']),
                    ('Family:', [self.model.family.__class__.__name__]),
                    ('Dependence structure:',
                     [self.model.cov_struct.__class__.__name__]),
                    ('Date:', None),
                    ('Covariance type: ', [self.cov_type, ])
                    ]

        # Cluster sizes, used for the sample-size summaries below.
        NY = [len(y) for y in self.model.endog_li]

        top_right = [('No. Observations:', [sum(NY)]),
                     ('No. clusters:', [len(self.model.endog_li)]),
                     ('Min. cluster size:', [min(NY)]),
                     ('Max. cluster size:', [max(NY)]),
                     ('Mean cluster size:', ["%.1f" % np.mean(NY)]),
                     ('Num. iterations:', ['%d' %
                                           len(self.fit_history['params'])]),
                     ('Scale:', ["%.3f" % self.scale]),
                     ('Time:', None),
                     ]

        # The skew of the residuals
        skew1 = stats.skew(self.resid)
        kurt1 = stats.kurtosis(self.resid)
        skew2 = stats.skew(self.centered_resid)
        kurt2 = stats.kurtosis(self.centered_resid)

        diagn_left = [('Skew:', ["%12.4f" % skew1]),
                      ('Centered skew:', ["%12.4f" % skew2])]

        diagn_right = [('Kurtosis:', ["%12.4f" % kurt1]),
                       ('Centered kurtosis:', ["%12.4f" % kurt2])
                       ]

        if title is None:
            title = self.model.__class__.__name__ + ' ' +\
                    "Regression Results"

        # Override the exog variable names if xname is provided as an
        # argument.
        if xname is None:
            xname = self.model.exog_names

        if yname is None:
            yname = self.model.endog_names

        # Create summary table instance
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             yname=yname, xname=xname,
                             title=title)
        smry.add_table_params(self, yname=yname, xname=xname,
                              alpha=alpha, use_t=False)
        smry.add_table_2cols(self, gleft=diagn_left,
                             gright=diagn_right, yname=yname,
                             xname=xname, title="")

        return smry
|
||
|
||
    def get_margeff(self, at='overall', method='dydx', atexog=None,
                    dummy=False, count=False):
        """Get marginal effects of the fitted model.

        Parameters
        ----------
        at : str, optional
            Options are:

            - 'overall', The average of the marginal effects at each
              observation.
            - 'mean', The marginal effects at the mean of each regressor.
            - 'median', The marginal effects at the median of each regressor.
            - 'zero', The marginal effects at zero for each regressor.
            - 'all', The marginal effects at each observation. If `at` is 'all'
              only margeff will be available.

            Note that if `exog` is specified, then marginal effects for all
            variables not specified by `exog` are calculated using the `at`
            option.
        method : str, optional
            Options are:

            - 'dydx' - dy/dx - No transformation is made and marginal effects
              are returned.  This is the default.
            - 'eyex' - estimate elasticities of variables in `exog` --
              d(lny)/d(lnx)
            - 'dyex' - estimate semi-elasticity -- dy/d(lnx)
            - 'eydx' - estimate semi-elasticity -- d(lny)/dx

            Note that transformations are done after each observation is
            calculated.  Semi-elasticities for binary variables are computed
            using the midpoint method. 'dyex' and 'eyex' do not make sense
            for discrete variables.
        atexog : array_like, optional
            Optionally, you can provide the exogenous variables over which to
            get the marginal effects.  This should be a dictionary with the key
            as the zero-indexed column number and the value of the dictionary.
            Default is None for all independent variables less the constant.
        dummy : bool, optional
            If False, treats binary variables (if present) as continuous.  This
            is the default.  Else if True, treats binary variables as
            changing from 0 to 1.  Note that any variable that is either 0 or 1
            is treated as binary.  Each binary variable is treated separately
            for now.
        count : bool, optional
            If False, treats count variables (if present) as continuous.  This
            is the default.  Else if True, the marginal effect is the
            change in probabilities when each observation is increased by one.

        Returns
        -------
        effects : ndarray
            the marginal effect corresponding to the input options

        Notes
        -----
        When using after Poisson, returns the expected number of events
        per period, assuming that the model is loglinear.
        """

        # Constraints are not propagated into the marginal-effect
        # computation, so warn when they are present.
        if self.model.constraint is not None:
            warnings.warn("marginal effects ignore constraints",
                          ValueWarning)

        return GEEMargins(self, (at, method, atexog, dummy, count))
|
||
|
||
    def plot_isotropic_dependence(self, ax=None, xpoints=10,
                                  min_n=50):
        """
        Create a plot of the pairwise products of within-group
        residuals against the corresponding time differences.  This
        plot can be used to assess the possible form of an isotropic
        covariance structure.

        Parameters
        ----------
        ax : AxesSubplot
            An axes on which to draw the graph.  If None, new
            figure and axes objects are created
        xpoints : scalar or array_like
            If scalar, the number of points equally spaced points on
            the time difference axis used to define bins for
            calculating local means.  If an array, the specific points
            that define the bins.
        min_n : int
            The minimum sample size in a bin for the mean residual
            product to be included on the plot.

        Returns
        -------
        Matplotlib figure containing the plot.
        """

        from statsmodels.graphics import utils as gutils

        resid = self.model.cluster_list(self.resid)
        time = self.model.cluster_list(self.model.time)

        # All within-group pairwise time distances (xdt) and the
        # corresponding products of scaled residuals (xre).
        xre, xdt = [], []
        for re, ti in zip(resid, time):
            # Lower triangle including the diagonal, so the zero-lag
            # products (used below for normalization) are included.
            ix = np.tril_indices(re.shape[0], 0)
            re = re[ix[0]] * re[ix[1]] / self.scale ** 2
            xre.append(re)
            dists = np.sqrt(((ti[ix[0], :] - ti[ix[1], :]) ** 2).sum(1))
            xdt.append(dists)

        xre = np.concatenate(xre)
        xdt = np.concatenate(xdt)

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        # Convert to a correlation by dividing by the mean zero-lag
        # product (an estimate of the residual variance).
        ii = np.flatnonzero(xdt == 0)
        v0 = np.mean(xre[ii])
        xre /= v0

        # Use the simple average to smooth, since fancier smoothers
        # that trim and downweight outliers give biased results (we
        # need the actual mean of a skewed distribution).
        if np.isscalar(xpoints):
            xpoints = np.linspace(0, max(xdt), xpoints)
        dg = np.digitize(xdt, xpoints)
        dgu = np.unique(dg)
        # Drop bins with fewer than min_n pairs; their means are too
        # noisy to plot.
        hist = np.asarray([np.sum(dg == k) for k in dgu])
        ii = np.flatnonzero(hist >= min_n)
        dgu = dgu[ii]
        dgy = np.asarray([np.mean(xre[dg == k]) for k in dgu])
        dgx = np.asarray([np.mean(xdt[dg == k]) for k in dgu])

        ax.plot(dgx, dgy, '-', color='orange', lw=5)
        ax.set_xlabel("Time difference")
        ax.set_ylabel("Product of scaled residuals")

        return fig
|
||
|
||
    def sensitivity_params(self, dep_params_first,
                           dep_params_last, num_steps):
        """
        Refits the GEE model using a sequence of values for the
        dependence parameters.

        Parameters
        ----------
        dep_params_first : array_like
            The first dep_params in the sequence
        dep_params_last : array_like
            The last dep_params in the sequence
        num_steps : int
            The number of dep_params in the sequence

        Returns
        -------
        results : array_like
            The GEEResults objects resulting from the fits.
        """

        model = self.model

        import copy
        cov_struct = copy.deepcopy(self.model.cov_struct)

        # We are fixing the dependence structure in each run.
        update_dep = model.update_dep
        model.update_dep = False

        dep_params = []
        results = []
        # Interpolate linearly from the first to the last dep_params.
        for x in np.linspace(0, 1, num_steps):

            dp = x * dep_params_last + (1 - x) * dep_params_first
            dep_params.append(dp)

            # Use a fresh copy for each fit so state accumulated in
            # one fit does not leak into the next.
            model.cov_struct = copy.deepcopy(cov_struct)
            model.cov_struct.dep_params = dp
            rslt = model.fit(start_params=self.params,
                             ctol=self.ctol,
                             params_niter=self.params_niter,
                             first_dep_update=self.first_dep_update,
                             cov_type=self.cov_type)
            results.append(rslt)

        # Restore the model's original update_dep setting.
        model.update_dep = update_dep

        return results
|
||
|
||
    # FIXME: alias to be removed, temporary backwards compatibility
    params_sensitivity = sensitivity_params
|
||
|
||
|
||
class GEEResultsWrapper(lm.RegressionResultsWrapper):
    # Wrapper attaching pandas metadata to GEEResults outputs;
    # `centered_resid` is aligned with the rows of the original data.
    _attrs = {
        'centered_resid': 'rows',
    }
    _wrap_attrs = wrap.union_dicts(lm.RegressionResultsWrapper._wrap_attrs,
                                   _attrs)
wrap.populate_wrapper(GEEResultsWrapper, GEEResults)  # noqa:E305
|
||
|
||
|
||
class OrdinalGEE(GEE):

    # The class docstring is assembled from the shared GEE docstring
    # templates defined earlier in this module.
    __doc__ = (
        " Ordinal Response Marginal Regression Model using GEE\n" +
        _gee_init_doc % {'extra_params': base._missing_param_doc,
                         'family_doc': _gee_ordinal_family_doc,
                         'example': _gee_ordinal_example,
                         'notes': _gee_nointercept})
|
||
|
||
def __init__(self, endog, exog, groups, time=None, family=None,
|
||
cov_struct=None, missing='none', offset=None,
|
||
dep_data=None, constraint=None, **kwargs):
|
||
|
||
if family is None:
|
||
family = families.Binomial()
|
||
else:
|
||
if not isinstance(family, families.Binomial):
|
||
raise ValueError("ordinal GEE must use a Binomial family")
|
||
|
||
if cov_struct is None:
|
||
cov_struct = cov_structs.OrdinalIndependence()
|
||
|
||
endog, exog, groups, time, offset = self.setup_ordinal(
|
||
endog, exog, groups, time, offset)
|
||
|
||
super().__init__(endog, exog, groups, time,
|
||
family, cov_struct, missing,
|
||
offset, dep_data, constraint)
|
||
|
||
    def setup_ordinal(self, endog, exog, groups, time, offset):
        """
        Restructure ordinal data as binary indicators so that they can
        be analyzed using Generalized Estimating Equations.

        Each observation with response y is expanded into one row per
        threshold (the distinct response values except the greatest),
        holding the indicator I(y > threshold); a separate intercept
        column is created for each threshold.

        Returns
        -------
        tuple
            (endog_out, exog_out, groups_out, time_out, offset_out)
            for the expanded data.
        """

        # Keep the original data for reporting/prediction on the
        # original scale.
        self.endog_orig = endog.copy()
        self.exog_orig = exog.copy()
        self.groups_orig = groups.copy()
        if offset is not None:
            self.offset_orig = offset.copy()
        else:
            self.offset_orig = None
            offset = np.zeros(len(endog))
        if time is not None:
            self.time_orig = time.copy()
        else:
            self.time_orig = None
            time = np.zeros((len(endog), 1))

        exog = np.asarray(exog)
        endog = np.asarray(endog)
        groups = np.asarray(groups)
        time = np.asarray(time)
        offset = np.asarray(offset)

        # The unique outcomes, except the greatest one.
        self.endog_values = np.unique(endog)
        endog_cuts = self.endog_values[0:-1]
        ncut = len(endog_cuts)

        # Each original row expands into `ncut` indicator rows.
        nrows = ncut * len(endog)
        exog_out = np.zeros((nrows, exog.shape[1]),
                            dtype=np.float64)
        endog_out = np.zeros(nrows, dtype=np.float64)
        intercepts = np.zeros((nrows, ncut), dtype=np.float64)
        groups_out = np.zeros(nrows, dtype=groups.dtype)
        time_out = np.zeros((nrows, time.shape[1]),
                            dtype=np.float64)
        offset_out = np.zeros(nrows, dtype=np.float64)

        jrow = 0
        zipper = zip(exog, endog, groups, time, offset)
        for (exog_row, endog_value, group_value, time_value,
             offset_value) in zipper:

            # Loop over thresholds for the indicators
            for thresh_ix, thresh in enumerate(endog_cuts):

                exog_out[jrow, :] = exog_row
                endog_out[jrow] = int(np.squeeze(endog_value > thresh))
                intercepts[jrow, thresh_ix] = 1
                groups_out[jrow] = group_value
                time_out[jrow] = time_value
                offset_out[jrow] = offset_value
                jrow += 1

        # Threshold-specific intercepts come first in the design matrix.
        exog_out = np.concatenate((intercepts, exog_out), axis=1)

        # exog column names, including intercepts
        xnames = ["I(y>%.1f)" % v for v in endog_cuts]
        if type(self.exog_orig) is pd.DataFrame:
            xnames.extend(self.exog_orig.columns)
        else:
            xnames.extend(["x%d" % k for k in range(1, exog.shape[1] + 1)])
        exog_out = pd.DataFrame(exog_out, columns=xnames)

        # Preserve the endog name if there is one
        if type(self.endog_orig) is pd.Series:
            endog_out = pd.Series(endog_out, name=self.endog_orig.name)

        return endog_out, exog_out, groups_out, time_out, offset_out
|
||
|
||
def _starting_params(self):
|
||
exposure = getattr(self, "exposure", None)
|
||
model = GEE(self.endog, self.exog, self.groups,
|
||
time=self.time, family=families.Binomial(),
|
||
offset=self.offset, exposure=exposure)
|
||
result = model.fit()
|
||
return result.params
|
||
|
||
@Appender(_gee_fit_doc)
|
||
def fit(self, maxiter=60, ctol=1e-6, start_params=None,
|
||
params_niter=1, first_dep_update=0,
|
||
cov_type='robust'):
|
||
|
||
rslt = super().fit(maxiter, ctol, start_params,
|
||
params_niter, first_dep_update,
|
||
cov_type=cov_type)
|
||
|
||
rslt = rslt._results # use unwrapped instance
|
||
res_kwds = {k: getattr(rslt, k) for k in rslt._props}
|
||
# Convert the GEEResults to an OrdinalGEEResults
|
||
ord_rslt = OrdinalGEEResults(self, rslt.params,
|
||
rslt.cov_params() / rslt.scale,
|
||
rslt.scale,
|
||
cov_type=cov_type,
|
||
attr_kwds=res_kwds)
|
||
# for k in rslt._props:
|
||
# setattr(ord_rslt, k, getattr(rslt, k))
|
||
# TODO: document or delete
|
||
|
||
return OrdinalGEEResultsWrapper(ord_rslt)
|
||
|
||
|
||
class OrdinalGEEResults(GEEResults):

    __doc__ = (
        "This class summarizes the fit of a marginal regression model"
        " for an ordinal response using GEE.\n"
        + _gee_results_doc)
|
||
|
||
    def plot_distribution(self, ax=None, exog_values=None):
        """
        Plot the fitted probabilities of endog in an ordinal model,
        for specified values of the predictors.

        Parameters
        ----------
        ax : AxesSubplot
            An axes on which to draw the graph.  If None, new
            figure and axes objects are created
        exog_values : array_like
            A list of dictionaries, with each dictionary mapping
            variable names to values at which the variable is held
            fixed.  The values P(endog=y | exog) are plotted for all
            possible values of y, at the given exog value.  Variables
            not included in a dictionary are held fixed at the mean
            value.

        Examples
        --------
        We have a model with covariates 'age' and 'sex', and wish to
        plot the probabilities P(endog=y | exog) for males (sex=0) and
        for females (sex=1), as separate paths on the plot.  Since
        'age' is not included below in the map, it is held fixed at
        its mean value.

        >>> ev = [{"sex": 1}, {"sex": 0}]
        >>> rslt.distribution_plot(exog_values=ev)
        """

        from statsmodels.graphics import utils as gutils

        if ax is None:
            fig, ax = gutils.create_mpl_ax(ax)
        else:
            fig = ax.get_figure()

        # If no covariate patterns are specified, create one with all
        # variables set to their mean values.
        if exog_values is None:
            exog_values = [{}, ]

        exog_means = self.model.exog.mean(0)
        # Positions of the threshold intercept columns, named
        # "I(y>...)" by setup_ordinal.
        ix_icept = [i for i, x in enumerate(self.model.exog_names) if
                    x.startswith("I(")]

        for ev in exog_values:

            for k in ev.keys():
                if k not in self.model.exog_names:
                    raise ValueError("%s is not a variable in the model"
                                     % k)

            # Get the fitted probability for each level, at the given
            # covariate values.
            pr = []
            for j in ix_icept:

                xp = np.zeros_like(self.params)
                xp[j] = 1.
                for i, vn in enumerate(self.model.exog_names):
                    if i in ix_icept:
                        continue
                    # User-specified value
                    if vn in ev:
                        xp[i] = ev[vn]
                    # Mean value
                    else:
                        xp[i] = exog_means[i]

                # Inverse logit of the linear predictor gives the
                # exceedance probability P(y > threshold).
                p = 1 / (1 + np.exp(-np.dot(xp, self.params)))
                pr.append(p)

            # Difference the exceedance probabilities (padded with 1
            # and 0) to obtain per-category probabilities.
            pr.insert(0, 1)
            pr.append(0)
            pr = np.asarray(pr)
            prd = -np.diff(pr)

            ax.plot(self.model.endog_values, prd, 'o-')

        ax.set_xlabel("Response value")
        ax.set_ylabel("Probability")
        ax.set_ylim(0, 1)

        return fig
|
||
|
||
|
||
def _score_test_submodel(par, sub):
|
||
"""
|
||
Return transformation matrices for design matrices.
|
||
|
||
Parameters
|
||
----------
|
||
par : instance
|
||
The parent model
|
||
sub : instance
|
||
The sub-model
|
||
|
||
Returns
|
||
-------
|
||
qm : array_like
|
||
Matrix mapping the design matrix of the parent to the design matrix
|
||
for the sub-model.
|
||
qc : array_like
|
||
Matrix mapping the design matrix of the parent to the orthogonal
|
||
complement of the columnspace of the submodel in the columnspace
|
||
of the parent.
|
||
|
||
Notes
|
||
-----
|
||
Returns None, None if the provided submodel is not actually a submodel.
|
||
"""
|
||
|
||
x1 = par.exog
|
||
x2 = sub.exog
|
||
|
||
u, s, vt = np.linalg.svd(x1, 0)
|
||
v = vt.T
|
||
|
||
# Get the orthogonal complement of col(x2) in col(x1).
|
||
a, _ = np.linalg.qr(x2)
|
||
a = u - np.dot(a, np.dot(a.T, u))
|
||
x2c, sb, _ = np.linalg.svd(a, 0)
|
||
x2c = x2c[:, sb > 1e-12]
|
||
|
||
# x1 * qm = x2
|
||
ii = np.flatnonzero(np.abs(s) > 1e-12)
|
||
qm = np.dot(v[:, ii], np.dot(u[:, ii].T, x2) / s[ii, None])
|
||
|
||
e = np.max(np.abs(x2 - np.dot(x1, qm)))
|
||
if e > 1e-8:
|
||
return None, None
|
||
|
||
# x1 * qc = x2c
|
||
qc = np.dot(v[:, ii], np.dot(u[:, ii].T, x2c) / s[ii, None])
|
||
|
||
return qm, qc
|
||
|
||
|
||
class OrdinalGEEResultsWrapper(GEEResultsWrapper):
    # No attributes beyond those wrapped by GEEResultsWrapper.
    pass
wrap.populate_wrapper(OrdinalGEEResultsWrapper, OrdinalGEEResults)  # noqa:E305
|
||
|
||
|
||
class NominalGEE(GEE):

    # The class docstring is assembled from the shared GEE docstring
    # templates defined earlier in this module.
    __doc__ = (
        " Nominal Response Marginal Regression Model using GEE.\n" +
        _gee_init_doc % {'extra_params': base._missing_param_doc,
                         'family_doc': _gee_nominal_family_doc,
                         'example': _gee_nominal_example,
                         'notes': _gee_nointercept})
|
||
|
||
def __init__(self, endog, exog, groups, time=None, family=None,
|
||
cov_struct=None, missing='none', offset=None,
|
||
dep_data=None, constraint=None, **kwargs):
|
||
|
||
endog, exog, groups, time, offset = self.setup_nominal(
|
||
endog, exog, groups, time, offset)
|
||
|
||
if family is None:
|
||
family = _Multinomial(self.ncut + 1)
|
||
|
||
if cov_struct is None:
|
||
cov_struct = cov_structs.NominalIndependence()
|
||
|
||
super().__init__(
|
||
endog, exog, groups, time, family, cov_struct, missing,
|
||
offset, dep_data, constraint)
|
||
|
||
def _starting_params(self):
|
||
exposure = getattr(self, "exposure", None)
|
||
model = GEE(self.endog, self.exog, self.groups,
|
||
time=self.time, family=families.Binomial(),
|
||
offset=self.offset, exposure=exposure)
|
||
result = model.fit()
|
||
return result.params
|
||
|
||
def setup_nominal(self, endog, exog, groups, time, offset):
|
||
"""
|
||
Restructure nominal data as binary indicators so that they can
|
||
be analyzed using Generalized Estimating Equations.
|
||
"""
|
||
|
||
self.endog_orig = endog.copy()
|
||
self.exog_orig = exog.copy()
|
||
self.groups_orig = groups.copy()
|
||
if offset is not None:
|
||
self.offset_orig = offset.copy()
|
||
else:
|
||
self.offset_orig = None
|
||
offset = np.zeros(len(endog))
|
||
if time is not None:
|
||
self.time_orig = time.copy()
|
||
else:
|
||
self.time_orig = None
|
||
time = np.zeros((len(endog), 1))
|
||
|
||
exog = np.asarray(exog)
|
||
endog = np.asarray(endog)
|
||
groups = np.asarray(groups)
|
||
time = np.asarray(time)
|
||
offset = np.asarray(offset)
|
||
|
||
# The unique outcomes, except the greatest one.
|
||
self.endog_values = np.unique(endog)
|
||
endog_cuts = self.endog_values[0:-1]
|
||
ncut = len(endog_cuts)
|
||
self.ncut = ncut
|
||
|
||
nrows = len(endog_cuts) * exog.shape[0]
|
||
ncols = len(endog_cuts) * exog.shape[1]
|
||
exog_out = np.zeros((nrows, ncols), dtype=np.float64)
|
||
endog_out = np.zeros(nrows, dtype=np.float64)
|
||
groups_out = np.zeros(nrows, dtype=np.float64)
|
||
time_out = np.zeros((nrows, time.shape[1]),
|
||
dtype=np.float64)
|
||
offset_out = np.zeros(nrows, dtype=np.float64)
|
||
|
||
jrow = 0
|
||
zipper = zip(exog, endog, groups, time, offset)
|
||
for (exog_row, endog_value, group_value, time_value,
|
||
offset_value) in zipper:
|
||
|
||
# Loop over thresholds for the indicators
|
||
for thresh_ix, thresh in enumerate(endog_cuts):
|
||
|
||
u = np.zeros(len(endog_cuts), dtype=np.float64)
|
||
u[thresh_ix] = 1
|
||
exog_out[jrow, :] = np.kron(u, exog_row)
|
||
endog_out[jrow] = (int(endog_value == thresh))
|
||
groups_out[jrow] = group_value
|
||
time_out[jrow] = time_value
|
||
offset_out[jrow] = offset_value
|
||
jrow += 1
|
||
|
||
# exog names
|
||
if isinstance(self.exog_orig, pd.DataFrame):
|
||
xnames_in = self.exog_orig.columns
|
||
else:
|
||
xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)]
|
||
xnames = []
|
||
for tr in endog_cuts:
|
||
xnames.extend([f"{v}[{tr:.1f}]" for v in xnames_in])
|
||
exog_out = pd.DataFrame(exog_out, columns=xnames)
|
||
exog_out = pd.DataFrame(exog_out, columns=xnames)
|
||
|
||
# Preserve endog name if there is one
|
||
if isinstance(self.endog_orig, pd.Series):
|
||
endog_out = pd.Series(endog_out, name=self.endog_orig.name)
|
||
|
||
return endog_out, exog_out, groups_out, time_out, offset_out
|
||
|
||
    def mean_deriv(self, exog, lin_pred):
        """
        Derivative of the expected endog with respect to the parameters.

        Parameters
        ----------
        exog : array_like
            The exogeneous data at which the derivative is computed,
            number of rows must be a multiple of `ncut`.
        lin_pred : array_like
            The values of the linear predictor, length must be multiple
            of `ncut`.

        Returns
        -------
        The derivative of the expected endog with respect to the
        parameters.
        """

        expval = np.exp(lin_pred)

        # Reshape so that each row contains all the indicators
        # corresponding to one multinomial observation.
        expval_m = np.reshape(expval, (len(expval) // self.ncut,
                                       self.ncut))

        # The normalizing constant for the multinomial probabilities,
        # repeated to align with the expanded rows.
        denom = 1 + expval_m.sum(1)
        denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))

        # The multinomial probabilities
        mprob = expval / denom

        # First term of the derivative: denom * expval' / denom^2 =
        # expval' / denom.
        dmat = mprob[:, None] * exog

        # Second term of the derivative: -expval * denom' / denom^2
        ddenom = expval[:, None] * exog
        dmat -= mprob[:, None] * ddenom / denom[:, None]

        return dmat
|
||
|
||
    def mean_deriv_exog(self, exog, params, offset_exposure=None):
        """
        Derivative of the expected endog with respect to exog for the
        multinomial model, used in analyzing marginal effects.

        Parameters
        ----------
        exog : array_like
            The exogeneous data at which the derivative is computed,
            number of rows must be a multiple of `ncut`.
        params : array_like
            The model parameters; the implied linear predictor has
            length equal to a multiple of `ncut`.
        offset_exposure : None
            Must be None for the multinomial family; any other value
            triggers a warning and is ignored.

        Returns
        -------
        The value of the derivative of the expected endog with respect
        to exog.

        Notes
        -----
        offset_exposure must be set at None for the multinomial family.
        """

        if offset_exposure is not None:
            warnings.warn("Offset/exposure ignored for the multinomial family",
                          ValueWarning)

        lpr = np.dot(exog, params)
        expval = np.exp(lpr)

        # One row per multinomial observation, one column per level.
        expval_m = np.reshape(expval, (len(expval) // self.ncut,
                                       self.ncut))

        # Multinomial normalizing constant, repeated to align with the
        # expanded rows.
        denom = 1 + expval_m.sum(1)
        denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))

        bmat0 = np.outer(np.ones(exog.shape[0]), params)

        # Masking matrix: selects the coefficient block belonging to
        # each row's response level.
        qmat = []
        for j in range(self.ncut):
            ee = np.zeros(self.ncut, dtype=np.float64)
            ee[j] = 1
            qmat.append(np.kron(ee, np.ones(len(params) // self.ncut)))
        qmat = np.array(qmat)
        qmat = np.kron(np.ones((exog.shape[0] // self.ncut, 1)), qmat)
        bmat = bmat0 * qmat

        # Quotient-rule derivative of expval / denom with respect to
        # exog: first term expval' / denom ...
        dmat = expval[:, None] * bmat / denom[:, None]

        expval_mb = np.kron(expval_m, np.ones((self.ncut, 1)))
        expval_mb = np.kron(expval_mb, np.ones((1, self.ncut)))

        # ... minus expval * denom' / denom^2.
        dmat -= expval[:, None] * (bmat * expval_mb) / denom[:, None] ** 2

        return dmat
|
||
|
||
    @Appender(_gee_fit_doc)
    def fit(self, maxiter=60, ctol=1e-6, start_params=None,
            params_niter=1, first_dep_update=0,
            cov_type='robust'):

        rslt = super().fit(maxiter, ctol, start_params,
                           params_niter, first_dep_update,
                           cov_type=cov_type)
        # Propagate non-convergence (signalled by a None result) to
        # the caller with a warning rather than failing below.
        if rslt is None:
            warnings.warn("GEE updates did not converge",
                          ConvergenceWarning)
            return None

        rslt = rslt._results  # use unwrapped instance
        res_kwds = {k: getattr(rslt, k) for k in rslt._props}
        # Convert the GEEResults to a NominalGEEResults; the stored
        # covariance is divided by scale, matching OrdinalGEE.fit.
        nom_rslt = NominalGEEResults(self, rslt.params,
                                     rslt.cov_params() / rslt.scale,
                                     rslt.scale,
                                     cov_type=cov_type,
                                     attr_kwds=res_kwds)

        return NominalGEEResultsWrapper(nom_rslt)
|
||
|
||
|
||
class NominalGEEResults(GEEResults):

    __doc__ = (
        "This class summarizes the fit of a marginal regression model"
        " for a nominal response using GEE.\n"
        + _gee_results_doc)
|
||
|
||
def plot_distribution(self, ax=None, exog_values=None):
|
||
"""
|
||
Plot the fitted probabilities of endog in an nominal model,
|
||
for specified values of the predictors.
|
||
|
||
Parameters
|
||
----------
|
||
ax : AxesSubplot
|
||
An axes on which to draw the graph. If None, new
|
||
figure and axes objects are created
|
||
exog_values : array_like
|
||
A list of dictionaries, with each dictionary mapping
|
||
variable names to values at which the variable is held
|
||
fixed. The values P(endog=y | exog) are plotted for all
|
||
possible values of y, at the given exog value. Variables
|
||
not included in a dictionary are held fixed at the mean
|
||
value.
|
||
|
||
Example:
|
||
--------
|
||
We have a model with covariates 'age' and 'sex', and wish to
|
||
plot the probabilities P(endog=y | exog) for males (sex=0) and
|
||
for females (sex=1), as separate paths on the plot. Since
|
||
'age' is not included below in the map, it is held fixed at
|
||
its mean value.
|
||
|
||
>>> ex = [{"sex": 1}, {"sex": 0}]
|
||
>>> rslt.distribution_plot(exog_values=ex)
|
||
"""
|
||
|
||
from statsmodels.graphics import utils as gutils
|
||
|
||
if ax is None:
|
||
fig, ax = gutils.create_mpl_ax(ax)
|
||
else:
|
||
fig = ax.get_figure()
|
||
|
||
# If no covariate patterns are specified, create one with all
|
||
# variables set to their mean values.
|
||
if exog_values is None:
|
||
exog_values = [{}, ]
|
||
|
||
link = self.model.family.link.inverse
|
||
ncut = self.model.family.ncut
|
||
|
||
k = int(self.model.exog.shape[1] / ncut)
|
||
exog_means = self.model.exog.mean(0)[0:k]
|
||
exog_names = self.model.exog_names[0:k]
|
||
exog_names = [x.split("[")[0] for x in exog_names]
|
||
|
||
params = np.reshape(self.params,
|
||
(ncut, len(self.params) // ncut))
|
||
|
||
for ev in exog_values:
|
||
|
||
exog = exog_means.copy()
|
||
|
||
for k in ev.keys():
|
||
if k not in exog_names:
|
||
raise ValueError("%s is not a variable in the model"
|
||
% k)
|
||
|
||
ii = exog_names.index(k)
|
||
exog[ii] = ev[k]
|
||
|
||
lpr = np.dot(params, exog)
|
||
pr = link(lpr)
|
||
pr = np.r_[pr, 1 - pr.sum()]
|
||
|
||
ax.plot(self.model.endog_values, pr, 'o-')
|
||
|
||
ax.set_xlabel("Response value")
|
||
ax.set_ylabel("Probability")
|
||
ax.set_xticks(self.model.endog_values)
|
||
ax.set_xticklabels(self.model.endog_values)
|
||
ax.set_ylim(0, 1)
|
||
|
||
return fig
|
||
|
||
|
||
class NominalGEEResultsWrapper(GEEResultsWrapper):
    # No attributes beyond those wrapped by GEEResultsWrapper.
    pass
wrap.populate_wrapper(NominalGEEResultsWrapper, NominalGEEResults)  # noqa:E305
|
||
|
||
|
||
class _MultinomialLogit(Link):
    """
    The multinomial logit transform, only for use with GEE.

    Notes
    -----
    The data are assumed coded as binary indicators, where each
    observed multinomial value y is coded as I(y == S[0]), ..., I(y ==
    S[-1]), where S is the set of possible response labels, excluding
    the largest one.  Therefore functions in this class should only
    be called using vector arguments whose length is a multiple of |S|
    = ncut, which is an argument to be provided when initializing the
    class.
    """

    def __init__(self, ncut):
        # Number of response categories minus one, i.e. the number of
        # indicator rows per multinomial observation.
        self.ncut = ncut

    def inverse(self, lpr):
        """
        Inverse of the multinomial logit transform, which gives the
        expected values of the data as a function of the linear
        predictors.

        Parameters
        ----------
        lpr : array_like (length must be divisible by `ncut`)
            The linear predictors

        Returns
        -------
        prob : ndarray
            Probabilities, or expected values
        """

        expval = np.exp(lpr)

        # Sum exp(lpr) within each observation's block of ncut
        # indicators, then repeat so denom aligns with lpr elementwise.
        denom = 1 + np.reshape(expval, (len(expval) // self.ncut,
                                        self.ncut)).sum(1)
        denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))

        prob = expval / denom

        return prob
|
||
|
||
|
||
class _Multinomial(families.Family):
    """
    Pseudo-link function for fitting nominal multinomial models with
    GEE.  Not for use outside the GEE class.
    """

    links = [_MultinomialLogit, ]
    variance = varfuncs.binary
    safe_links = [_MultinomialLogit, ]

    def __init__(self, nlevels, check_link=True):
        """
        Parameters
        ----------
        nlevels : int
            The number of distinct categories for the multinomial
            distribution.
        check_link : bool
            Stored as ``_check_link``; presumably consulted by the
            families.Family link machinery — confirm against the base
            class.
        """
        # _check_link must be set before initialize(), in case assigning
        # self.link triggers validation on the base class.
        self._check_link = check_link
        self.initialize(nlevels)

    def initialize(self, nlevels):
        """Set up the pseudo-link for a response with `nlevels` categories."""
        # One linear predictor per non-reference category.
        ncut = nlevels - 1
        self.ncut = ncut
        self.link = _MultinomialLogit(ncut)
|
||
|
||
|
||
class GEEMargins:
    """
    Estimated marginal effects for a regression model fit with GEE.

    Parameters
    ----------
    results : GEEResults instance
        The results instance of a fitted discrete choice model
    args : tuple
        Args are passed to `get_margeff`. This is the same as
        results.get_margeff. See there for more information.
    kwargs : dict
        Keyword args are passed to `get_margeff`. This is the same as
        results.get_margeff. See there for more information.
    """

    def __init__(self, results, args, kwargs=None):
        # kwargs=None replaces the previous mutable `{}` default: a shared
        # dict default is the classic Python pitfall, and None is
        # backward-compatible (callers passing a dict are unaffected).
        if kwargs is None:
            kwargs = {}
        self._cache = {}
        self.results = results
        self.get_margeff(*args, **kwargs)

    def _reset(self):
        # NOTE(review): cache_readonly may memoize on the instance
        # __dict__ rather than in _cache — confirm this actually
        # invalidates `tvalues`/`pvalues` when get_margeff is re-run.
        self._cache = {}

    @cache_readonly
    def tvalues(self):
        # z-statistics for the marginal effects.
        _check_at_is_all(self.margeff_options)
        return self.margeff / self.margeff_se

    def summary_frame(self, alpha=.05):
        """
        Returns a DataFrame summarizing the marginal effects.

        Parameters
        ----------
        alpha : float
            Number between 0 and 1. The confidence intervals have the
            probability 1-alpha.

        Returns
        -------
        frame : DataFrames
            A DataFrame summarizing the marginal effects.
        """
        _check_at_is_all(self.margeff_options)
        from pandas import DataFrame
        # 'Cont. Int. Hi.' is a long-standing typo kept as-is: it is a
        # public column name that downstream code may index by.
        names = [_transform_names[self.margeff_options['method']],
                 'Std. Err.', 'z', 'Pr(>|z|)',
                 'Conf. Int. Low', 'Cont. Int. Hi.']
        ind = self.results.model.exog.var(0) != 0  # True if not a constant
        exog_names = self.results.model.exog_names
        var_names = [name for i, name in enumerate(exog_names) if ind[i]]
        table = np.column_stack((self.margeff, self.margeff_se, self.tvalues,
                                 self.pvalues, self.conf_int(alpha)))
        return DataFrame(table, columns=names, index=var_names)

    @cache_readonly
    def pvalues(self):
        # Two-sided p-values from the normal distribution.
        _check_at_is_all(self.margeff_options)
        return stats.norm.sf(np.abs(self.tvalues)) * 2

    def conf_int(self, alpha=.05):
        """
        Returns the confidence intervals of the marginal effects

        Parameters
        ----------
        alpha : float
            Number between 0 and 1. The confidence intervals have the
            probability 1-alpha.

        Returns
        -------
        conf_int : ndarray
            An array with lower, upper confidence intervals for the marginal
            effects.
        """
        _check_at_is_all(self.margeff_options)
        me_se = self.margeff_se
        # Normal-approximation (Wald) intervals.
        q = stats.norm.ppf(1 - alpha / 2)
        lower = self.margeff - q * me_se
        upper = self.margeff + q * me_se
        return np.asarray(lzip(lower, upper))

    def summary(self, alpha=.05):
        """
        Returns a summary table for marginal effects

        Parameters
        ----------
        alpha : float
            Number between 0 and 1. The confidence intervals have the
            probability 1-alpha.

        Returns
        -------
        Summary : SummaryTable
            A SummaryTable instance
        """
        _check_at_is_all(self.margeff_options)
        results = self.results
        model = results.model
        title = model.__class__.__name__ + " Marginal Effects"
        method = self.margeff_options['method']
        top_left = [('Dep. Variable:', [model.endog_names]),
                    ('Method:', [method]),
                    ('At:', [self.margeff_options['at']]), ]

        from statsmodels.iolib.summary import (Summary, summary_params,
                                               table_extend)
        exog_names = model.exog_names[:]  # copy
        smry = Summary()

        # Drop the constant column from the displayed names; marginal
        # effects are not reported for constants.
        const_idx = model.data.const_idx
        if const_idx is not None:
            exog_names.pop(const_idx)

        # J > 1 indicates a multi-equation model: one parameter table is
        # built per equation in that case.
        J = int(getattr(model, "J", 1))
        if J > 1:
            yname, yname_list = results._get_endog_name(model.endog_names,
                                                        None, all=True)
        else:
            yname = model.endog_names
            yname_list = [yname]

        smry.add_table_2cols(self, gleft=top_left, gright=[],
                             yname=yname, xname=exog_names, title=title)

        # NOTE: add_table_params is not general enough yet for margeff
        # could use a refactor with getattr instead of hard-coded params
        # tvalues etc.
        table = []
        conf_int = self.conf_int(alpha)
        margeff = self.margeff
        margeff_se = self.margeff_se
        tvalues = self.tvalues
        pvalues = self.pvalues
        if J > 1:
            for eq in range(J):
                restup = (results, margeff[:, eq], margeff_se[:, eq],
                          tvalues[:, eq], pvalues[:, eq], conf_int[:, :, eq])
                tble = summary_params(restup, yname=yname_list[eq],
                                      xname=exog_names, alpha=alpha,
                                      use_t=False,
                                      skip_header=True)
                tble.title = yname_list[eq]
                # overwrite coef with method name
                header = ['', _transform_names[method], 'std err', 'z',
                          'P>|z|',
                          '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)]
                tble.insert_header_row(0, header)
                table.append(tble)

            table = table_extend(table, keep_headers=True)
        else:
            restup = (results, margeff, margeff_se, tvalues, pvalues, conf_int)
            table = summary_params(restup, yname=yname, xname=exog_names,
                                   alpha=alpha, use_t=False, skip_header=True)
            header = ['', _transform_names[method], 'std err', 'z',
                      'P>|z|', '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)]
            table.insert_header_row(0, header)

        smry.tables.append(table)
        return smry

    def get_margeff(self, at='overall', method='dydx', atexog=None,
                    dummy=False, count=False):
        """
        Compute the marginal effects and store them on the instance
        (`margeff`, `margeff_se`, `margeff_cov`, `margeff_options`).

        Parameters mirror `results.get_margeff`; see there for details.
        """
        self._reset()  # always reset the cache when this is called
        # TODO: if at is not all or overall, we can also put atexog values
        # in summary table head
        method = method.lower()
        at = at.lower()
        _check_margeff_args(at, method)
        self.margeff_options = dict(method=method, at=at)
        results = self.results
        model = results.model
        params = results.params
        exog = model.exog.copy()  # copy because values are changed
        effects_idx = exog.var(0) != 0
        const_idx = model.data.const_idx

        if dummy:
            _check_discrete_args(at, method)
            # NOTE: `dummy` is rebound here from a bool flag to the
            # helper's second return value; same pattern for `count`.
            dummy_idx, dummy = _get_dummy_index(exog, const_idx)
        else:
            dummy_idx = None

        if count:
            _check_discrete_args(at, method)
            count_idx, count = _get_count_index(exog, const_idx)
        else:
            count_idx = None

        # get the exogenous variables
        exog = _get_margeff_exog(exog, at, atexog, effects_idx)

        # get base marginal effects, handled by sub-classes
        effects = model._derivative_exog(params, exog, method,
                                         dummy_idx, count_idx)
        effects = _effects_at(effects, at)

        if at == 'all':
            self.margeff = effects[:, effects_idx]
        else:
            # Set standard error of the marginal effects by Delta method.
            margeff_cov, margeff_se = margeff_cov_with_se(
                model, params, exog, results.cov_params(), at,
                model._derivative_exog, dummy_idx, count_idx,
                method, 1)

            # do not care about at constant
            self.margeff_cov = margeff_cov[effects_idx][:, effects_idx]
            self.margeff_se = margeff_se[effects_idx]
            self.margeff = effects[effects_idx]
|