525 lines
16 KiB
Python
525 lines
16 KiB
Python
import numpy as np
|
||
from collections import defaultdict
|
||
import statsmodels.base.model as base
|
||
from statsmodels.genmod import families
|
||
from statsmodels.genmod.generalized_linear_model import GLM
|
||
from statsmodels.genmod.families import links
|
||
from statsmodels.genmod.families import varfuncs
|
||
import statsmodels.regression.linear_model as lm
|
||
import statsmodels.base.wrapper as wrap
|
||
from statsmodels.tools.decorators import cache_readonly
|
||
|
||
|
||
class QIFCovariance:
    """
    Base class for working covariance models used in quadratic
    inference function (QIF) regression.

    A covariance model supplies basis matrices through `mat`; the
    inverse of the working covariance is taken to lie in the linear
    span of these basis matrices.

    Subclasses should set the attribute `num_terms` to the number of
    basis matrices, so that `mat(d, j)` for j=0, ..., num_terms-1
    returns the j'th basis matrix of dimension d.
    """

    def mat(self, dim, term):
        """
        Return basis matrix number `term`, which is a dim x dim matrix.

        The base class provides no basis matrices; subclasses must
        override this method.
        """
        raise NotImplementedError()
|
||
|
||
|
||
class QIFIndependence(QIFCovariance):
    """
    Independence working covariance for QIF regression.

    This covariance model gives identical results to GEE with the
    independence working covariance.  With QIFIndependence as the
    working covariance the QIF value will be zero, so it cannot be
    used for chi^2 testing, or for model selection using AIC, BIC,
    etc.
    """

    def __init__(self):
        # The identity matrix is the only basis matrix.
        self.num_terms = 1

    def mat(self, dim, term):
        """
        Return the dim x dim identity for term 0, and None for any
        other term.
        """
        return np.eye(dim) if term == 0 else None
|
||
|
||
|
||
class QIFExchangeable(QIFCovariance):
    """
    Exchangeable working covariance for QIF regression.

    The basis consists of the identity matrix and the matrix of all
    ones.
    """

    def __init__(self):
        self.num_terms = 2

    def mat(self, dim, term):
        """
        Return the term'th dim x dim basis matrix: the identity for
        term 0, the all-ones matrix for term 1, and None otherwise.
        """
        if term == 0:
            return np.eye(dim)
        if term == 1:
            return np.ones((dim, dim))
        return None
|
||
|
||
|
||
class QIFAutoregressive(QIFCovariance):
    """
    Autoregressive working covariance for QIF regression.

    The basis consists of the identity matrix, the matrix with ones on
    the first super- and sub-diagonals, and the matrix with ones in
    the two corner positions of the main diagonal.
    """

    def __init__(self):
        self.num_terms = 3

    def mat(self, dim, term):
        """
        Return the term'th dim x dim basis matrix, or None if `term`
        is not 0, 1 or 2.

        Raises ValueError when `dim` is less than 3; the size check is
        made before `term` is inspected.
        """
        if dim < 3:
            raise ValueError("Groups must have size at least 3 for "
                             "autoregressive covariance.")

        if term == 0:
            return np.eye(dim)

        if term == 1:
            # Ones immediately above and below the main diagonal.
            return np.eye(dim, k=1) + np.eye(dim, k=-1)

        if term == 2:
            # Ones in the first and last diagonal positions only.
            corners = np.zeros(dim)
            corners[0] = 1
            corners[dim - 1] = 1
            return np.diag(corners)

        return None
|
||
|
||
|
||
class QIF(base.Model):
    """
    Fit a regression model using quadratic inference functions (QIF).

    QIF is an alternative to GEE that can be more efficient, and that
    offers different approaches for model selection and inference.

    Parameters
    ----------
    endog : array_like
        The dependent variables of the regression.
    exog : array_like
        The independent variables of the regression.
    groups : array_like
        Labels indicating which group each observation belongs to.
        Observations in different groups should be independent.
    family : genmod family
        An instance of a GLM family.  Defaults to Gaussian.
    cov_struct : QIFCovariance instance
        An instance of a QIFCovariance.  Defaults to QIFIndependence.
    missing : str
        Passed through to the base model constructor.
    **kwargs
        Passed through to the base model constructor.

    References
    ----------
    A. Qu, B. Lindsay, B. Li (2000).  Improving Generalized Estimating
    Equations using Quadratic Inference Functions, Biometrika 87:4.
    www.jstor.org/stable/2673612
    """

    def __init__(self, endog, exog, groups, family=None,
                 cov_struct=None, missing='none', **kwargs):

        # Handle the family argument
        if family is None:
            family = families.Gaussian()
        elif not isinstance(family, families.Family):
            raise ValueError("QIF: `family` must be a genmod "
                             "family instance")
        self.family = family

        self._fit_history = defaultdict(list)

        # Handle the cov_struct argument
        if cov_struct is None:
            cov_struct = QIFIndependence()
        elif not isinstance(cov_struct, QIFCovariance):
            raise ValueError(
                "QIF: `cov_struct` must be a QIFCovariance instance")
        self.cov_struct = cov_struct

        groups = np.asarray(groups)

        super().__init__(
            endog, exog, groups=groups, missing=missing, **kwargs
        )

        self.nobs = len(self.endog)
        self._check_args(groups)

        # Collect the row positions belonging to each group.  Groups are
        # kept in order of first appearance (dicts preserve insertion
        # order) so that the accumulation order in `objective`, and hence
        # the floating point result, is reproducible from run to run.
        groups_ix = defaultdict(list)
        for i, g in enumerate(groups):
            groups_ix[g].append(i)
        self.group_names = list(groups_ix)
        self.groups_ix = list(groups_ix.values())

    def _check_args(self, groups):
        """
        Check that `groups`, endog and exog have consistent lengths.

        Raises ValueError if the lengths disagree.
        """

        if len(groups) != len(self.endog):
            msg = "QIF: groups and endog should have the same length"
            raise ValueError(msg)

        if len(self.endog) != self.exog.shape[0]:
            msg = ("QIF: the length of endog should be equal to the "
                   "number of rows of exog.")
            raise ValueError(msg)

    def objective(self, params):
        """
        Calculate the QIF objective function and its gradient.

        Parameters
        ----------
        params : array_like
            The model parameters at which the objective is evaluated.

        Returns
        -------
        qif : float
            The value of the QIF objective function, the quadratic
            form of `gn` in the inverse of `cmat`.
        grad : array_like
            The gradient vector of the QIF objective function.
        cmat : array_like
            The sum over groups of the outer products of the per-group
            estimating functions, divided by the squared number of
            groups.
        gn : array_like
            The estimating functions averaged over groups.
        gn_deriv : array_like
            The gradients of each estimating equation with
            respect to the parameter, averaged over groups.
        """

        endog = self.endog
        exog = self.exog
        lpr = np.dot(exog, params)
        mean = self.family.link.inverse(lpr)
        va = self.family.variance(mean)

        # Mean derivative
        idl = self.family.link.inverse_deriv(lpr)
        idl2 = self.family.link.inverse_deriv2(lpr)
        vd = self.family.variance.deriv(mean)

        m = self.cov_struct.num_terms
        p = exog.shape[1]

        # One block of p estimating equations per basis matrix.
        d = p * m
        gn = np.zeros(d)
        gi = np.zeros(d)
        gi_deriv = np.zeros((d, p))
        gn_deriv = np.zeros((d, p))
        cn_deriv = [0] * p
        cmat = np.zeros((d, d))

        # With a constant variance function and an identity link the
        # second-order correction terms below are skipped.
        fastvar = self.family.variance is varfuncs.constant
        fastlink = isinstance(
            self.family.link,
            # TODO: Remove links.identity after deprecation final
            (links.Identity, links.identity)
        )

        for ix in self.groups_ix:
            sd = np.sqrt(va[ix])
            resid = endog[ix] - mean[ix]
            sresid = resid / sd
            deriv = exog[ix, :] * idl[ix, None]

            jj = 0
            for j in range(m):
                # The derivative of each term in (5) of Qu et al.
                # There are four terms involving beta in a product.
                # Iterated application of the product rule gives
                # the gradient as a sum of four terms.
                c = self.cov_struct.mat(len(ix), j)
                crs1 = np.dot(c, sresid) / sd
                gi[jj:jj+p] = np.dot(deriv.T, crs1)
                crs2 = np.dot(c, -deriv / sd[:, None]) / sd[:, None]
                gi_deriv[jj:jj+p, :] = np.dot(deriv.T, crs2)
                if not (fastlink and fastvar):
                    for k in range(p):
                        m1 = np.dot(exog[ix, :].T,
                                    idl2[ix] * exog[ix, k] * crs1)
                        if not fastvar:
                            vx = -0.5 * vd[ix] * deriv[:, k] / va[ix]**1.5
                            m2 = np.dot(deriv.T, vx * np.dot(c, sresid))
                            m3 = np.dot(deriv.T, np.dot(c, vx * resid) / sd)
                        else:
                            m2, m3 = 0, 0
                        gi_deriv[jj:jj+p, k] += m1 + m2 + m3
                jj += p

            # Derivative of this group's contribution to cmat with
            # respect to each parameter.
            for j in range(p):
                u = np.outer(gi, gi_deriv[:, j])
                cn_deriv[j] += u + u.T

            gn += gi
            gn_deriv += gi_deriv

            cmat += np.outer(gi, gi)

        ngrp = len(self.groups_ix)
        gn /= ngrp
        gn_deriv /= ngrp
        cmat /= ngrp**2

        qif = np.dot(gn, np.linalg.solve(cmat, gn))

        gcg = np.zeros(p)
        for j in range(p):
            cn_deriv[j] /= ngrp**2
            u = np.linalg.solve(cmat, cn_deriv[j]).T
            u = np.linalg.solve(cmat, u)
            gcg[j] = np.dot(gn, np.dot(u, gn))

        grad = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn)) - gcg

        return qif, grad, cmat, gn, gn_deriv

    def estimate_scale(self, params):
        """
        Estimate the dispersion/scale.

        The scale parameter for binomial and Poisson families is
        fixed at 1, otherwise it is estimated from the data.

        Parameters
        ----------
        params : array_like
            The model parameters at which the residuals are computed.

        Returns
        -------
        scale : float
            The sum of squared residuals divided by
            ``nobs - ddof_scale``.
        """

        if isinstance(self.family, (families.Binomial, families.Poisson)):
            return 1.

        # `ddof_scale` is set by `fit`; before that, fall back to the
        # number of columns of exog (not a row of exog).
        if hasattr(self, "ddof_scale"):
            ddof_scale = self.ddof_scale
        else:
            ddof_scale = self.exog.shape[1]

        lpr = np.dot(self.exog, params)
        mean = self.family.link.inverse(lpr)
        resid = self.endog - mean
        scale = np.sum(resid**2) / (self.nobs - ddof_scale)

        return scale

    @classmethod
    def from_formula(cls, formula, groups, data, subset=None,
                     *args, **kwargs):
        """
        Create a QIF model instance from a formula and dataframe.

        Parameters
        ----------
        formula : str or generic Formula object
            The formula specifying the model
        groups : array_like or string
            Array of grouping labels.  If a string, this is the name
            of a variable in `data` that contains the grouping labels.
        data : array_like
            The data for the model.
        subset : array_like
            An array_like object of booleans, integers, or index
            values that indicate the subset of the data to be used
            when fitting the model.
        *args, **kwargs
            Passed through to the base class `from_formula`.

        Returns
        -------
        model : QIF model instance
        """

        if isinstance(groups, str):
            groups = data[groups]

        model = super().from_formula(
            formula, *args, data=data, subset=subset,
            groups=groups, **kwargs)

        return model

    def fit(self, maxiter=100, start_params=None, tol=1e-6, gtol=1e-4,
            ddof_scale=None):
        """
        Fit a GLM to correlated data using QIF.

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations.  Must be at least 1.
        start_params : array_like, optional
            Starting values.  The values are copied, so the caller's
            object is not modified.  If omitted, the estimates from an
            ordinary GLM fit are used.
        tol : float
            Convergence threshold for difference of successive
            estimates.
        gtol : float
            Convergence threshold for gradient.
        ddof_scale : int, optional
            Degrees of freedom for the scale parameter.  Defaults to
            the number of columns of exog.

        Returns
        -------
        QIFResults object

        Raises
        ------
        ValueError
            If `maxiter` is less than 1.
        """

        if maxiter < 1:
            # At least one evaluation of the objective is needed to
            # produce the covariance matrix below.
            raise ValueError("QIF: `maxiter` must be at least 1")

        if ddof_scale is None:
            self.ddof_scale = self.exog.shape[1]
        else:
            self.ddof_scale = ddof_scale

        # Work on a private float copy: the iterations update `params`
        # in place, which must not leak into the caller's start_params
        # and must also work for lists and integer arrays.
        if start_params is None:
            model = GLM(self.endog, self.exog, family=self.family)
            result = model.fit()
            params = np.array(result.params, dtype=float)
        else:
            params = np.array(start_params, dtype=float)

        for _ in range(maxiter):

            qif, grad, cmat, _, gn_deriv = self.objective(params)

            gnorm = np.sqrt(np.sum(grad * grad))
            self._fit_history["qif"].append(qif)
            self._fit_history["gradnorm"].append(gnorm)

            if gnorm < gtol:
                break

            cjac = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv))
            step = np.linalg.solve(cjac, grad)

            snorm = np.sqrt(np.sum(step * step))
            self._fit_history["stepnorm"].append(snorm)
            if snorm < tol:
                break
            params -= step

        vcov = np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv))
        vcov = np.linalg.inv(vcov)
        scale = self.estimate_scale(params)

        # The results class receives the covariance divided by the
        # scale (as its normalized covariance) together with the scale.
        rslt = QIFResults(self, params, vcov / scale, scale)
        rslt.fit_history = self._fit_history
        self._fit_history = defaultdict(list)

        return QIFResultsWrapper(rslt)
|
||
|
||
|
||
class QIFResults(base.LikelihoodModelResults):
    """Results of a regression model fit using QIF."""

    def __init__(self, model, params, cov_params, scale,
                 use_t=False, **kwds):

        super().__init__(
            model, params, normalized_cov_params=cov_params,
            scale=scale)

        # The first element returned by the model's objective is the
        # QIF statistic; the remaining elements are not needed here.
        self.qif = self.model.objective(params)[0]

    @cache_readonly
    def aic(self):
        """
        An AIC-like statistic for models fit using QIF.

        Raises ValueError if the model uses the QIFIndependence
        covariance.
        """
        if isinstance(self.model.cov_struct, QIFIndependence):
            raise ValueError(
                "AIC not available with QIFIndependence covariance")
        n_params = self.model.exog.shape[1]
        return self.qif + 2*n_params

    @cache_readonly
    def bic(self):
        """
        A BIC-like statistic for models fit using QIF.

        Raises ValueError if the model uses the QIFIndependence
        covariance.
        """
        if isinstance(self.model.cov_struct, QIFIndependence):
            raise ValueError(
                "BIC not available with QIFIndependence covariance")
        n_params = self.model.exog.shape[1]
        return self.qif + np.log(self.model.nobs)*n_params

    @cache_readonly
    def fittedvalues(self):
        """
        Returns the fitted values from the model.
        """
        linpred = np.dot(self.model.exog, self.params)
        return self.model.family.link.inverse(linpred)

    def summary(self, yname=None, xname=None, title=None, alpha=.05):
        """
        Summarize the QIF regression results

        Parameters
        ----------
        yname : str, optional
            Name of the dependent variable.  Defaults to the model's
            `endog_names`.
        xname : list[str], optional
            Names for the exogenous variables.  Defaults to the model's
            `exog_names`.  Must match the number of parameters in the
            model.
        title : str, optional
            Title for the top table. If not None, then this replaces
            the default title
        alpha : float
            significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            this holds the summary tables and text, which can be
            printed or converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary results
        """
        from statsmodels.iolib.summary import Summary

        model = self.model

        # Fall back to the names stored on the model when the caller
        # does not supply any.
        if yname is None:
            yname = model.endog_names
        if xname is None:
            xname = model.exog_names
        if title is None:
            title = " ".join([type(model).__name__, "Regression Results"])

        cluster_sizes = [len(rows) for rows in model.groups_ix]

        top_left = [
            ('Dep. Variable:', None),
            ('Method:', ['QIF']),
            ('Family:', [type(model.family).__name__]),
            ('Covariance structure:', [type(model.cov_struct).__name__]),
            ('Date:', None),
            ('Time:', None),
        ]

        top_right = [
            ('No. Observations:', [sum(cluster_sizes)]),
            ('No. clusters:', [len(cluster_sizes)]),
            ('Min. cluster size:', [min(cluster_sizes)]),
            ('Max. cluster size:', [max(cluster_sizes)]),
            ('Mean cluster size:', ["%.1f" % np.mean(cluster_sizes)]),
            ('Scale:', ["%.3f" % self.scale]),
        ]

        # Assemble the header table followed by the parameter table.
        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             yname=yname, xname=xname,
                             title=title)
        smry.add_table_params(self, yname=yname, xname=xname,
                              alpha=alpha, use_t=False)

        return smry
|
||
|
||
|
||
class QIFResultsWrapper(lm.RegressionResultsWrapper):
    """Wrapper returned by `QIF.fit`; adds nothing to its base class."""


# Connect the wrapper class to the results class it wraps.
wrap.populate_wrapper(QIFResultsWrapper, QIFResults)
|