import numpy as np from collections import defaultdict import statsmodels.base.model as base from statsmodels.genmod import families from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod.families import links from statsmodels.genmod.families import varfuncs import statsmodels.regression.linear_model as lm import statsmodels.base.wrapper as wrap from statsmodels.tools.decorators import cache_readonly class QIFCovariance: """ A covariance model for quadratic inference function regression. The mat method returns a basis matrix B such that the inverse of the working covariance lies in the linear span of the basis matrices. Subclasses should set the number of basis matrices `num_terms`, so that `mat(d, j)` for j=0, ..., num_terms-1 gives the basis of dimension d.` """ def mat(self, dim, term): """ Returns the term'th basis matrix, which is a dim x dim matrix. """ raise NotImplementedError class QIFIndependence(QIFCovariance): """ Independent working covariance for QIF regression. This covariance model gives identical results to GEE with the independence working covariance. When using QIFIndependence as the working covariance, the QIF value will be zero, and cannot be used for chi^2 testing, or for model selection using AIC, BIC, etc. """ def __init__(self): self.num_terms = 1 def mat(self, dim, term): if term == 0: return np.eye(dim) else: return None class QIFExchangeable(QIFCovariance): """ Exchangeable working covariance for QIF regression. """ def __init__(self): self.num_terms = 2 def mat(self, dim, term): if term == 0: return np.eye(dim) elif term == 1: return np.ones((dim, dim)) else: return None class QIFAutoregressive(QIFCovariance): """ Autoregressive working covariance for QIF regression. """ def __init__(self): self.num_terms = 3 def mat(self, dim, term): if dim < 3: msg = ("Groups must have size at least 3 for " + "autoregressive covariance.") raise ValueError(msg) if term == 0: return np.eye(dim) elif term == 1: mat = np.zeros((dim, dim)) mat.flat[1::(dim+1)] = 1 mat += mat.T return mat elif term == 2: mat = np.zeros((dim, dim)) mat[0, 0] = 1 mat[dim-1, dim-1] = 1 return mat else: return None class QIF(base.Model): """ Fit a regression model using quadratic inference functions (QIF). QIF is an alternative to GEE that can be more efficient, and that offers different approaches for model selection and inference. Parameters ---------- endog : array_like The dependent variables of the regression. exog : array_like The independent variables of the regression. groups : array_like Labels indicating which group each observation belongs to. Observations in different groups should be independent. family : genmod family An instance of a GLM family. cov_struct : QIFCovariance instance An instance of a QIFCovariance. References ---------- A. Qu, B. Lindsay, B. Li (2000). Improving Generalized Estimating Equations using Quadratic Inference Functions, Biometrika 87:4. www.jstor.org/stable/2673612 """ def __init__(self, endog, exog, groups, family=None, cov_struct=None, missing='none', **kwargs): # Handle the family argument if family is None: family = families.Gaussian() else: if not issubclass(family.__class__, families.Family): raise ValueError("QIF: `family` must be a genmod " "family instance") self.family = family self._fit_history = defaultdict(list) # Handle the cov_struct argument if cov_struct is None: cov_struct = QIFIndependence() else: if not isinstance(cov_struct, QIFCovariance): raise ValueError( "QIF: `cov_struct` must be a QIFCovariance instance") self.cov_struct = cov_struct groups = np.asarray(groups) super().__init__( endog, exog, groups=groups, missing=missing, **kwargs ) self.group_names = list(set(groups)) self.nobs = len(self.endog) groups_ix = defaultdict(list) for i, g in enumerate(groups): groups_ix[g].append(i) self.groups_ix = [groups_ix[na] for na in self.group_names] self._check_args(groups) def _check_args(self, groups): if len(groups) != len(self.endog): msg = "QIF: groups and endog should have the same length" raise ValueError(msg) if len(self.endog) != self.exog.shape[0]: msg = ("QIF: the length of endog should be equal to the " "number of rows of exog.") raise ValueError(msg) def objective(self, params): """ Calculate the gradient of the QIF objective function. Parameters ---------- params : array_like The model parameters at which the gradient is evaluated. Returns ------- grad : array_like The gradient vector of the QIF objective function. gn_deriv : array_like The gradients of each estimating equation with respect to the parameter. """ endog = self.endog exog = self.exog lpr = np.dot(exog, params) mean = self.family.link.inverse(lpr) va = self.family.variance(mean) # Mean derivative idl = self.family.link.inverse_deriv(lpr) idl2 = self.family.link.inverse_deriv2(lpr) vd = self.family.variance.deriv(mean) m = self.cov_struct.num_terms p = exog.shape[1] d = p * m gn = np.zeros(d) gi = np.zeros(d) gi_deriv = np.zeros((d, p)) gn_deriv = np.zeros((d, p)) cn_deriv = [0] * p cmat = np.zeros((d, d)) fastvar = self.family.variance is varfuncs.constant fastlink = isinstance( self.family.link, # TODO: Remove links.identity after deprecation final (links.Identity, links.identity) ) for ix in self.groups_ix: sd = np.sqrt(va[ix]) resid = endog[ix] - mean[ix] sresid = resid / sd deriv = exog[ix, :] * idl[ix, None] jj = 0 for j in range(m): # The derivative of each term in (5) of Qu et al. # There are four terms involving beta in a product. # Iterated application of the product rule gives # the gradient as a sum of four terms. c = self.cov_struct.mat(len(ix), j) crs1 = np.dot(c, sresid) / sd gi[jj:jj+p] = np.dot(deriv.T, crs1) crs2 = np.dot(c, -deriv / sd[:, None]) / sd[:, None] gi_deriv[jj:jj+p, :] = np.dot(deriv.T, crs2) if not (fastlink and fastvar): for k in range(p): m1 = np.dot(exog[ix, :].T, idl2[ix] * exog[ix, k] * crs1) if not fastvar: vx = -0.5 * vd[ix] * deriv[:, k] / va[ix]**1.5 m2 = np.dot(deriv.T, vx * np.dot(c, sresid)) m3 = np.dot(deriv.T, np.dot(c, vx * resid) / sd) else: m2, m3 = 0, 0 gi_deriv[jj:jj+p, k] += m1 + m2 + m3 jj += p for j in range(p): u = np.outer(gi, gi_deriv[:, j]) cn_deriv[j] += u + u.T gn += gi gn_deriv += gi_deriv cmat += np.outer(gi, gi) ngrp = len(self.groups_ix) gn /= ngrp gn_deriv /= ngrp cmat /= ngrp**2 qif = np.dot(gn, np.linalg.solve(cmat, gn)) gcg = np.zeros(p) for j in range(p): cn_deriv[j] /= len(self.groups_ix)**2 u = np.linalg.solve(cmat, cn_deriv[j]).T u = np.linalg.solve(cmat, u) gcg[j] = np.dot(gn, np.dot(u, gn)) grad = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn)) - gcg return qif, grad, cmat, gn, gn_deriv def estimate_scale(self, params): """ Estimate the dispersion/scale. The scale parameter for binomial and Poisson families is fixed at 1, otherwise it is estimated from the data. """ if isinstance(self.family, (families.Binomial, families.Poisson)): return 1. if hasattr(self, "ddof_scale"): ddof_scale = self.ddof_scale else: ddof_scale = self.exog[1] lpr = np.dot(self.exog, params) mean = self.family.link.inverse(lpr) resid = self.endog - mean scale = np.sum(resid**2) / (self.nobs - ddof_scale) return scale @classmethod def from_formula(cls, formula, groups, data, subset=None, *args, **kwargs): """ Create a QIF model instance from a formula and dataframe. Parameters ---------- formula : str or generic Formula object The formula specifying the model groups : array_like or string Array of grouping labels. If a string, this is the name of a variable in `data` that contains the grouping labels. data : array_like The data for the model. subset : array_like An array_like object of booleans, integers, or index values that indicate the subset of the data to used when fitting the model. Returns ------- model : QIF model instance """ if isinstance(groups, str): groups = data[groups] model = super().from_formula( formula, data=data, subset=subset, groups=groups, *args, **kwargs) return model def fit(self, maxiter=100, start_params=None, tol=1e-6, gtol=1e-4, ddof_scale=None): """ Fit a GLM to correlated data using QIF. Parameters ---------- maxiter : int Maximum number of iterations. start_params : array_like, optional Starting values tol : float Convergence threshold for difference of successive estimates. gtol : float Convergence threshold for gradient. ddof_scale : int, optional Degrees of freedom for the scale parameter Returns ------- QIFResults object """ if ddof_scale is None: self.ddof_scale = self.exog.shape[1] else: self.ddof_scale = ddof_scale if start_params is None: model = GLM(self.endog, self.exog, family=self.family) result = model.fit() params = result.params else: params = start_params for _ in range(maxiter): qif, grad, cmat, _, gn_deriv = self.objective(params) gnorm = np.sqrt(np.sum(grad * grad)) self._fit_history["qif"].append(qif) self._fit_history["gradnorm"].append(gnorm) if gnorm < gtol: break cjac = 2 * np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv)) step = np.linalg.solve(cjac, grad) snorm = np.sqrt(np.sum(step * step)) self._fit_history["stepnorm"].append(snorm) if snorm < tol: break params -= step vcov = np.dot(gn_deriv.T, np.linalg.solve(cmat, gn_deriv)) vcov = np.linalg.inv(vcov) scale = self.estimate_scale(params) rslt = QIFResults(self, params, vcov / scale, scale) rslt.fit_history = self._fit_history self._fit_history = defaultdict(list) return QIFResultsWrapper(rslt) class QIFResults(base.LikelihoodModelResults): """Results class for QIF Regression""" def __init__(self, model, params, cov_params, scale, use_t=False, **kwds): super().__init__( model, params, normalized_cov_params=cov_params, scale=scale) self.qif, _, _, _, _ = self.model.objective(params) @cache_readonly def aic(self): """ An AIC-like statistic for models fit using QIF. """ if isinstance(self.model.cov_struct, QIFIndependence): msg = "AIC not available with QIFIndependence covariance" raise ValueError(msg) df = self.model.exog.shape[1] return self.qif + 2*df @cache_readonly def bic(self): """ A BIC-like statistic for models fit using QIF. """ if isinstance(self.model.cov_struct, QIFIndependence): msg = "BIC not available with QIFIndependence covariance" raise ValueError(msg) df = self.model.exog.shape[1] return self.qif + np.log(self.model.nobs)*df @cache_readonly def fittedvalues(self): """ Returns the fitted values from the model. """ return self.model.family.link.inverse( np.dot(self.model.exog, self.params)) def summary(self, yname=None, xname=None, title=None, alpha=.05): """ Summarize the QIF regression results Parameters ---------- yname : str, optional Default is `y` xname : list[str], optional Names for the exogenous variables, default is `var_#` for ## in the number of regressors. Must match the number of parameters in the model title : str, optional Title for the top table. If not None, then this replaces the default title alpha : float significance level for the confidence intervals Returns ------- smry : Summary instance this holds the summary tables and text, which can be printed or converted to various output formats. See Also -------- statsmodels.iolib.summary.Summary : class to hold summary results """ top_left = [('Dep. Variable:', None), ('Method:', ['QIF']), ('Family:', [self.model.family.__class__.__name__]), ('Covariance structure:', [self.model.cov_struct.__class__.__name__]), ('Date:', None), ('Time:', None), ] NY = [len(y) for y in self.model.groups_ix] top_right = [('No. Observations:', [sum(NY)]), ('No. clusters:', [len(NY)]), ('Min. cluster size:', [min(NY)]), ('Max. cluster size:', [max(NY)]), ('Mean cluster size:', ["%.1f" % np.mean(NY)]), ('Scale:', ["%.3f" % self.scale]), ] if title is None: title = self.model.__class__.__name__ + ' ' +\ "Regression Results" # Override the exog variable names if xname is provided as an # argument. if xname is None: xname = self.model.exog_names if yname is None: yname = self.model.endog_names # Create summary table instance from statsmodels.iolib.summary import Summary smry = Summary() smry.add_table_2cols(self, gleft=top_left, gright=top_right, yname=yname, xname=xname, title=title) smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha, use_t=False) return smry class QIFResultsWrapper(lm.RegressionResultsWrapper): pass wrap.populate_wrapper(QIFResultsWrapper, QIFResults)