525 lines
16 KiB
Python
525 lines
16 KiB
Python
|
import numpy as np
|
|||
|
from collections import defaultdict
|
|||
|
import statsmodels.base.model as base
|
|||
|
from statsmodels.genmod import families
|
|||
|
from statsmodels.genmod.generalized_linear_model import GLM
|
|||
|
from statsmodels.genmod.families import links
|
|||
|
from statsmodels.genmod.families import varfuncs
|
|||
|
import statsmodels.regression.linear_model as lm
|
|||
|
import statsmodels.base.wrapper as wrap
|
|||
|
from statsmodels.tools.decorators import cache_readonly
|
|||
|
|
|||
|
|
|||
|
class QIFCovariance:
    """
    Base class for working covariance models used in QIF regression.

    A covariance model supplies a family of basis matrices through the
    `mat` method, such that the inverse of the working covariance lies
    in the linear span of those basis matrices.

    Subclasses should set the attribute `num_terms` to the number of
    basis matrices, so that `mat(d, j)` for j=0, ..., num_terms-1
    gives the basis matrices of dimension d.
    """

    def mat(self, dim, term):
        """
        Return the basis matrix with index `term`.

        Parameters
        ----------
        dim : int
            The number of rows (and columns) of the basis matrix.
        term : int
            The index of the requested basis matrix.

        Raises
        ------
        NotImplementedError
            Always; this base implementation is a placeholder that
            subclasses override.
        """
        raise NotImplementedError()
|
|||
|
|
|||
|
|
|||
|
class QIFIndependence(QIFCovariance):
    """
    Independence working covariance for QIF regression.

    This covariance model gives identical results to GEE with the
    independence working covariance.  When QIFIndependence is used as
    the working covariance, the QIF value will be zero, so it cannot
    be used for chi^2 testing, or for model selection using AIC, BIC,
    etc.
    """

    def __init__(self):
        # The identity matrix is the only basis matrix of this model.
        self.num_terms = 1

    def mat(self, dim, term):
        """
        Return the dim x dim identity matrix for term 0.

        Any other value of `term` yields None.
        """
        return np.eye(dim) if term == 0 else None
|
|||
|
|
|||
|
|
|||
|
class QIFExchangeable(QIFCovariance):
    """
    Exchangeable working covariance for QIF regression.

    The model has two basis matrices: the identity matrix and the
    matrix whose entries are all equal to one.
    """

    def __init__(self):
        self.num_terms = 2

    def mat(self, dim, term):
        """
        Return the basis matrix with index `term`, a dim x dim matrix.

        Term 0 is the identity matrix and term 1 is the all-ones
        matrix.  Any other value of `term` yields None.
        """
        if term == 0:
            return np.eye(dim)
        if term == 1:
            return np.ones((dim, dim))
        return None
|
|||
|
|
|||
|
|
|||
|
class QIFAutoregressive(QIFCovariance):
    """
    Autoregressive working covariance for QIF regression.

    The model has three basis matrices: the identity matrix, the
    matrix with ones on the first super- and sub-diagonals, and the
    matrix with ones in the two extreme diagonal positions.
    """

    def __init__(self):
        self.num_terms = 3

    def mat(self, dim, term):
        """
        Return the basis matrix with index `term`, a dim x dim matrix.

        Any value of `term` other than 0, 1 or 2 yields None.

        Raises
        ------
        ValueError
            If `dim` is less than 3, whatever the value of `term`.
        """
        # The size check applies to every term, including the identity.
        if dim < 3:
            raise ValueError("Groups must have size at least 3 for "
                             "autoregressive covariance.")

        if term == 0:
            return np.eye(dim)

        if term == 1:
            # Ones immediately above the diagonal, then symmetrized so
            # the first sub-diagonal is filled in as well.
            upper = np.eye(dim, k=1)
            return upper + upper.T

        if term == 2:
            # Ones only in the top-left and bottom-right corners.
            corners = np.zeros((dim, dim))
            corners[[0, dim - 1], [0, dim - 1]] = 1
            return corners

        return None
|
|||
|
|
|||
|
|
|||
|
class QIF(base.Model):
    """
    Fit a regression model using quadratic inference functions (QIF).

    QIF is an alternative to GEE that can be more efficient, and that
    offers different approaches for model selection and inference.

    Parameters
    ----------
    endog : array_like
        The dependent variables of the regression.
    exog : array_like
        The independent variables of the regression.
    groups : array_like
        Labels indicating which group each observation belongs to.
        Observations in different groups should be independent.
    family : genmod family
        An instance of a GLM family.  Defaults to Gaussian.
    cov_struct : QIFCovariance instance
        An instance of a QIFCovariance.  Defaults to QIFIndependence.
    missing : str
        Passed through to the base model constructor.
    **kwargs
        Additional keyword arguments passed through to the base model
        constructor.

    References
    ----------
    A. Qu, B. Lindsay, B. Li (2000).  Improving Generalized Estimating
    Equations using Quadratic Inference Functions, Biometrika 87:4.
    www.jstor.org/stable/2673612
    """

    def __init__(self, endog, exog, groups, family=None,
                 cov_struct=None, missing='none', **kwargs):

        # Handle the family argument
        if family is None:
            family = families.Gaussian()
        elif not isinstance(family, families.Family):
            raise ValueError("QIF: `family` must be a genmod "
                             "family instance")
        self.family = family

        # Per-iteration diagnostics collected by `fit`.
        self._fit_history = defaultdict(list)

        # Handle the cov_struct argument
        if cov_struct is None:
            cov_struct = QIFIndependence()
        elif not isinstance(cov_struct, QIFCovariance):
            raise ValueError(
                "QIF: `cov_struct` must be a QIFCovariance instance")
        self.cov_struct = cov_struct

        groups = np.asarray(groups)

        super().__init__(
            endog, exog, groups=groups, missing=missing, **kwargs
        )

        # The distinct labels; their order follows set iteration order.
        self.group_names = list(set(groups))
        self.nobs = len(self.endog)

        # For each group (in group_names order), the list of row
        # positions of the observations that carry that label.
        groups_ix = defaultdict(list)
        for i, g in enumerate(groups):
            groups_ix[g].append(i)
        self.groups_ix = [groups_ix[na] for na in self.group_names]

        self._check_args(groups)

    def _check_args(self, groups):
        """
        Validate that groups, endog and exog have matching lengths.

        Raises
        ------
        ValueError
            If `groups` and endog differ in length, or if the length
            of endog differs from the number of rows of exog.
        """

        if len(groups) != len(self.endog):
            msg = "QIF: groups and endog should have the same length"
            raise ValueError(msg)

        if len(self.endog) != self.exog.shape[0]:
            msg = ("QIF: the length of endog should be equal to the "
                   "number of rows of exog.")
            raise ValueError(msg)

    def objective(self, params):
        """
        Calculate the QIF objective function and its gradient.

        Parameters
        ----------
        params : array_like
            The model parameters at which the objective is evaluated.

        Returns
        -------
        qif : float
            The value of the QIF objective function, the quadratic
            form of `gn` in the inverse of `cmat`.
        grad : array_like
            The gradient vector of the QIF objective function.
        cmat : array_like
            The sum over groups of the outer products of the per-group
            estimating function values, divided by the squared number
            of groups.
        gn : array_like
            The estimating function values averaged over groups.
        gn_deriv : array_like
            The gradients of each estimating equation with
            respect to the parameter, averaged over groups.
        """

        endog = self.endog
        exog = self.exog
        lpr = np.dot(exog, params)
        mean = self.family.link.inverse(lpr)
        va = self.family.variance(mean)

        # Mean derivative
        idl = self.family.link.inverse_deriv(lpr)
        idl2 = self.family.link.inverse_deriv2(lpr)
        vd = self.family.variance.deriv(mean)

        m = self.cov_struct.num_terms
        p = exog.shape[1]

        # One block of p estimating equations per basis matrix.
        d = p * m
        gn = np.zeros(d)
        gi = np.zeros(d)
        gi_deriv = np.zeros((d, p))
        gn_deriv = np.zeros((d, p))
        cn_deriv = [0] * p
        cmat = np.zeros((d, d))

        # With a constant variance function and an identity link the
        # second-order correction terms below are skipped entirely.
        fastvar = self.family.variance is varfuncs.constant
        fastlink = isinstance(
            self.family.link,
            # TODO: Remove links.identity after deprecation final
            (links.Identity, links.identity)
        )

        for ix in self.groups_ix:
            sd = np.sqrt(va[ix])
            resid = endog[ix] - mean[ix]
            sresid = resid / sd
            deriv = exog[ix, :] * idl[ix, None]

            jj = 0
            for j in range(m):
                # The derivative of each term in (5) of Qu et al.
                # There are four terms involving beta in a product.
                # Iterated application of the product rule gives
                # the gradient as a sum of four terms.
                c = self.cov_struct.mat(len(ix), j)
                crs1 = np.dot(c, sresid) / sd
                gi[jj:jj+p] = np.dot(deriv.T, crs1)
                crs2 = np.dot(c, -deriv / sd[:, None]) / sd[:, None]
                gi_deriv[jj:jj+p, :] = np.dot(deriv.T, crs2)
                if not (fastlink and fastvar):
                    for k in range(p):
                        m1 = np.dot(exog[ix, :].T,
                                    idl2[ix] * exog[ix, k] * crs1)
                        if not fastvar:
                            vx = -0.5 * vd[ix] * deriv[:, k] / va[ix]**1.5
                            m2 = np.dot(deriv.T, vx * np.dot(c, sresid))
                            m3 = np.dot(deriv.T, np.dot(c, vx * resid) / sd)
                        else:
                            m2, m3 = 0, 0
                        gi_deriv[jj:jj+p, k] += m1 + m2 + m3
                jj += p

            # Derivative of the outer product gi gi' with respect to
            # each parameter, accumulated over groups.
            for j in range(p):
                u = np.outer(gi, gi_deriv[:, j])
                cn_deriv[j] += u + u.T

            gn += gi
            gn_deriv += gi_deriv

            cmat += np.outer(gi, gi)

        ngrp = len(self.groups_ix)
        gn /= ngrp
        gn_deriv /= ngrp
        cmat /= ngrp**2

        qif = np.dot(gn, np.linalg.solve(cmat, gn))

        # Contribution to the gradient from the dependence of cmat on
        # the parameters.
        gcg = np.zeros(p)
        for j in range(p):
            cn_deriv[j] /= ngrp**2
            u = np.linalg.solve(cmat, cn_deriv[j]).T
            u = np.linalg.solve(cmat, u)
            gcg[j] = np.dot(gn, np.dot(u, gn))

        grad = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn)) - gcg

        return qif, grad, cmat, gn, gn_deriv

    def estimate_scale(self, params):
        """
        Estimate the dispersion/scale.

        The scale parameter for binomial and Poisson families is
        fixed at 1, otherwise it is estimated from the data as the
        residual sum of squares divided by `nobs - ddof_scale`.

        Parameters
        ----------
        params : array_like
            The model parameters at which the residuals are computed.

        Returns
        -------
        scale : float
            The estimated scale.
        """

        if isinstance(self.family, (families.Binomial, families.Poisson)):
            return 1.

        if hasattr(self, "ddof_scale"):
            ddof_scale = self.ddof_scale
        else:
            # `ddof_scale` is only set by `fit`; before that, fall back
            # to the number of columns of exog (the same default that
            # `fit` uses), not to a row of the design matrix.
            ddof_scale = self.exog.shape[1]

        lpr = np.dot(self.exog, params)
        mean = self.family.link.inverse(lpr)
        resid = self.endog - mean
        scale = np.sum(resid**2) / (self.nobs - ddof_scale)

        return scale

    @classmethod
    def from_formula(cls, formula, groups, data, subset=None,
                     *args, **kwargs):
        """
        Create a QIF model instance from a formula and dataframe.

        Parameters
        ----------
        formula : str or generic Formula object
            The formula specifying the model
        groups : array_like or string
            Array of grouping labels.  If a string, this is the name
            of a variable in `data` that contains the grouping labels.
        data : array_like
            The data for the model.
        subset : array_like
            An array_like object of booleans, integers, or index
            values that indicate the subset of the data to use when
            fitting the model.
        *args, **kwargs
            Passed through to the base class `from_formula`.

        Returns
        -------
        model : QIF model instance
        """

        if isinstance(groups, str):
            groups = data[groups]

        model = super().from_formula(
                   formula, data=data, subset=subset,
                   groups=groups, *args, **kwargs)

        return model

    def fit(self, maxiter=100, start_params=None, tol=1e-6, gtol=1e-4,
            ddof_scale=None):
        """
        Fit a GLM to correlated data using QIF.

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations.
        start_params : array_like, optional
            Starting values.  If not provided, the parameter estimates
            of a GLM fit to the same data are used.  The provided
            object is not modified.
        tol : float
            Convergence threshold for difference of successive
            estimates.
        gtol : float
            Convergence threshold for gradient.
        ddof_scale : int, optional
            Degrees of freedom for the scale parameter.  Defaults to
            the number of columns of exog.

        Returns
        -------
        QIFResults object
        """

        if ddof_scale is None:
            self.ddof_scale = self.exog.shape[1]
        else:
            self.ddof_scale = ddof_scale

        if start_params is None:
            model = GLM(self.endog, self.exog, family=self.family)
            result = model.fit()
            params = result.params
        else:
            params = start_params

        # Work on a private floating point copy: the update below is
        # done in place, which would otherwise overwrite the caller's
        # `start_params` and fail outright for integer arrays.
        params = np.array(params, dtype=np.float64)

        for _ in range(maxiter):

            qif, grad, cmat, _, gn_deriv = self.objective(params)

            gnorm = np.sqrt(np.sum(grad * grad))
            self._fit_history["qif"].append(qif)
            self._fit_history["gradnorm"].append(gnorm)

            if gnorm < gtol:
                break

            cjac = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv))
            step = np.linalg.solve(cjac, grad)

            snorm = np.sqrt(np.sum(step * step))
            self._fit_history["stepnorm"].append(snorm)
            if snorm < tol:
                break
            params -= step

        vcov = np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv))
        vcov = np.linalg.inv(vcov)
        scale = self.estimate_scale(params)

        rslt = QIFResults(self, params, vcov / scale, scale)
        rslt.fit_history = self._fit_history
        # Start a fresh history so a later call to `fit` does not
        # append to the history attached to this result.
        self._fit_history = defaultdict(list)

        return QIFResultsWrapper(rslt)
|
|||
|
|
|||
|
|
|||
|
class QIFResults(base.LikelihoodModelResults):
    """Results class for QIF Regression"""

    def __init__(self, model, params, cov_params, scale,
                 use_t=False, **kwds):

        super().__init__(
            model, params, normalized_cov_params=cov_params,
            scale=scale)

        # Only the objective value is kept; the remaining outputs of
        # `objective` are discarded.
        self.qif = self.model.objective(params)[0]

    def _require_informative_cov_struct(self, label):
        # Raise if the model uses QIFIndependence; `label` names the
        # statistic ("AIC" or "BIC") in the error message.
        if isinstance(self.model.cov_struct, QIFIndependence):
            raise ValueError(
                label + " not available with QIFIndependence covariance")

    @cache_readonly
    def aic(self):
        """
        An AIC-like statistic for models fit using QIF.

        Computed as the QIF value plus twice the number of columns
        of exog.

        Raises
        ------
        ValueError
            If the model uses the QIFIndependence covariance.
        """
        self._require_informative_cov_struct("AIC")
        n_params = self.model.exog.shape[1]
        return self.qif + 2*n_params

    @cache_readonly
    def bic(self):
        """
        A BIC-like statistic for models fit using QIF.

        Computed as the QIF value plus log(nobs) times the number of
        columns of exog.

        Raises
        ------
        ValueError
            If the model uses the QIFIndependence covariance.
        """
        self._require_informative_cov_struct("BIC")
        n_params = self.model.exog.shape[1]
        return self.qif + np.log(self.model.nobs)*n_params

    @cache_readonly
    def fittedvalues(self):
        """
        Returns the fitted values from the model.
        """
        linpred = np.dot(self.model.exog, self.params)
        return self.model.family.link.inverse(linpred)

    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """
        Summarize the QIF regression results

        Parameters
        ----------
        yname : str, optional
            Name of the dependent variable.  Defaults to the model's
            `endog_names`.
        xname : list[str], optional
            Names for the exogenous variables.  Defaults to the
            model's `exog_names`.  Must match the number of parameters
            in the model.
        title : str, optional
            Title for the top table.  If not None, then this replaces
            the default title.
        alpha : float
            Significance level for the confidence intervals.

        Returns
        -------
        smry : Summary instance
            This holds the summary tables and text, which can be
            printed or converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary results
        """
        # Imported here, as in the rest of this method's history,
        # rather than at module level.
        from statsmodels.iolib.summary import Summary

        model = self.model
        cluster_sizes = [len(ix) for ix in model.groups_ix]

        left_column = [
            ('Dep. Variable:', None),
            ('Method:', ['QIF']),
            ('Family:', [model.family.__class__.__name__]),
            ('Covariance structure:',
             [model.cov_struct.__class__.__name__]),
            ('Date:', None),
            ('Time:', None),
        ]

        right_column = [
            ('No. Observations:', [sum(cluster_sizes)]),
            ('No. clusters:', [len(cluster_sizes)]),
            ('Min. cluster size:', [min(cluster_sizes)]),
            ('Max. cluster size:', [max(cluster_sizes)]),
            ('Mean cluster size:', ["%.1f" % np.mean(cluster_sizes)]),
            ('Scale:', ["%.3f" % self.scale]),
        ]

        if title is None:
            title = model.__class__.__name__ + ' ' + "Regression Results"

        # Fall back to the names stored on the model when the caller
        # does not override them.
        if yname is None:
            yname = model.endog_names
        if xname is None:
            xname = model.exog_names

        # Create summary table instance
        smry = Summary()
        smry.add_table_2cols(self, gleft=left_column, gright=right_column,
                             yname=yname, xname=xname,
                             title=title)
        smry.add_table_params(self, yname=yname, xname=xname,
                              alpha=alpha, use_t=False)

        return smry
|
|||
|
|
|||
|
|
|||
|
class QIFResultsWrapper(lm.RegressionResultsWrapper):
    """
    Wrapper returned by `QIF.fit` around a `QIFResults` instance.

    Adds nothing of its own to `RegressionResultsWrapper`.
    """


# Register the wrapper/results pair with the wrapping machinery.
wrap.populate_wrapper(QIFResultsWrapper, QIFResults)
|