1563 lines
50 KiB
Python
1563 lines
50 KiB
Python
|
"""
|
||
|
Covariance models and estimators for GEE.
|
||
|
|
||
|
Some details for the covariance calculations can be found in the Stata
|
||
|
docs:
|
||
|
|
||
|
http://www.stata.com/manuals13/xtxtgee.pdf
|
||
|
"""
|
||
|
from statsmodels.compat.pandas import Appender
|
||
|
|
||
|
from collections import defaultdict
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from scipy import linalg as spl
|
||
|
|
||
|
from statsmodels.stats.correlation_tools import cov_nearest
|
||
|
from statsmodels.tools.sm_exceptions import (
|
||
|
ConvergenceWarning,
|
||
|
NotImplementedWarning,
|
||
|
OutputWarning,
|
||
|
)
|
||
|
from statsmodels.tools.validation import bool_like
|
||
|
|
||
|
|
||
|
class CovStruct:
|
||
|
"""
|
||
|
Base class for correlation and covariance structures.
|
||
|
|
||
|
An implementation of this class takes the residuals from a
|
||
|
regression model that has been fit to grouped data, and uses
|
||
|
them to estimate the within-group dependence structure of the
|
||
|
random errors in the model.
|
||
|
|
||
|
The current state of the covariance structure is represented
|
||
|
through the value of the `dep_params` attribute.
|
||
|
|
||
|
The default state of a newly-created instance should always be
|
||
|
the identity correlation matrix.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, cov_nearest_method="clipped"):
|
||
|
|
||
|
# Parameters describing the dependency structure
|
||
|
self.dep_params = None
|
||
|
|
||
|
# Keep track of the number of times that the covariance was
|
||
|
# adjusted.
|
||
|
self.cov_adjust = []
|
||
|
|
||
|
# Method for projecting the covariance matrix if it is not
|
||
|
# PSD.
|
||
|
self.cov_nearest_method = cov_nearest_method
|
||
|
|
||
|
def initialize(self, model):
|
||
|
"""
|
||
|
Called by GEE, used by implementations that need additional
|
||
|
setup prior to running `fit`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
model : GEE class
|
||
|
A reference to the parent GEE class instance.
|
||
|
"""
|
||
|
self.model = model
|
||
|
|
||
|
def update(self, params):
|
||
|
"""
|
||
|
Update the association parameter values based on the current
|
||
|
regression coefficients.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
params : array_like
|
||
|
Working values for the regression parameters.
|
||
|
"""
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def covariance_matrix(self, endog_expval, index):
|
||
|
"""
|
||
|
Returns the working covariance or correlation matrix for a
|
||
|
given cluster of data.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog_expval : array_like
|
||
|
The expected values of endog for the cluster for which the
|
||
|
covariance or correlation matrix will be returned
|
||
|
index : int
|
||
|
The index of the cluster for which the covariance or
|
||
|
correlation matrix will be returned
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
M : matrix
|
||
|
The covariance or correlation matrix of endog
|
||
|
is_cor : bool
|
||
|
True if M is a correlation matrix, False if M is a
|
||
|
covariance matrix
|
||
|
"""
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def covariance_matrix_solve(self, expval, index, stdev, rhs):
|
||
|
"""
|
||
|
Solves matrix equations of the form `covmat * soln = rhs` and
|
||
|
returns the values of `soln`, where `covmat` is the covariance
|
||
|
matrix represented by this class.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
expval : array_like
|
||
|
The expected value of endog for each observed value in the
|
||
|
group.
|
||
|
index : int
|
||
|
The group index.
|
||
|
stdev : array_like
|
||
|
The standard deviation of endog for each observation in
|
||
|
the group.
|
||
|
rhs : list/tuple of array_like
|
||
|
A set of right-hand sides; each defines a matrix equation
|
||
|
to be solved.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
soln : list/tuple of array_like
|
||
|
The solutions to the matrix equations.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Returns None if the solver fails.
|
||
|
|
||
|
Some dependence structures do not use `expval` and/or `index`
|
||
|
to determine the correlation matrix. Some families
|
||
|
(e.g. binomial) do not use the `stdev` parameter when forming
|
||
|
the covariance matrix.
|
||
|
|
||
|
If the covariance matrix is singular or not SPD, it is
|
||
|
projected to the nearest such matrix. These projection events
|
||
|
are recorded in the fit_history attribute of the GEE model.
|
||
|
|
||
|
Systems of linear equations with the covariance matrix as the
|
||
|
left hand side (LHS) are solved for different right hand sides
|
||
|
(RHS); the LHS is only factorized once to save time.
|
||
|
|
||
|
This is a default implementation, it can be reimplemented in
|
||
|
subclasses to optimize the linear algebra according to the
|
||
|
structure of the covariance matrix.
|
||
|
"""
|
||
|
|
||
|
vmat, is_cor = self.covariance_matrix(expval, index)
|
||
|
if is_cor:
|
||
|
vmat *= np.outer(stdev, stdev)
|
||
|
|
||
|
# Factor the covariance matrix. If the factorization fails,
|
||
|
# attempt to condition it into a factorizable matrix.
|
||
|
threshold = 1e-2
|
||
|
success = False
|
||
|
cov_adjust = 0
|
||
|
for itr in range(20):
|
||
|
try:
|
||
|
vco = spl.cho_factor(vmat)
|
||
|
success = True
|
||
|
break
|
||
|
except np.linalg.LinAlgError:
|
||
|
vmat = cov_nearest(vmat, method=self.cov_nearest_method,
|
||
|
threshold=threshold)
|
||
|
threshold *= 2
|
||
|
cov_adjust += 1
|
||
|
msg = "At least one covariance matrix was not PSD "
|
||
|
msg += "and required projection."
|
||
|
warnings.warn(msg)
|
||
|
|
||
|
self.cov_adjust.append(cov_adjust)
|
||
|
|
||
|
# Last resort if we still cannot factor the covariance matrix.
|
||
|
if not success:
|
||
|
warnings.warn(
|
||
|
"Unable to condition covariance matrix to an SPD "
|
||
|
"matrix using cov_nearest", ConvergenceWarning)
|
||
|
vmat = np.diag(np.diag(vmat))
|
||
|
vco = spl.cho_factor(vmat)
|
||
|
|
||
|
soln = [spl.cho_solve(vco, x) for x in rhs]
|
||
|
return soln
|
||
|
|
||
|
def summary(self):
|
||
|
"""
|
||
|
Returns a text summary of the current estimate of the
|
||
|
dependence structure.
|
||
|
"""
|
||
|
raise NotImplementedError
|
||
|
|
||
|
|
||
|
class Independence(CovStruct):
|
||
|
"""
|
||
|
An independence working dependence structure.
|
||
|
"""
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
# Nothing to update
|
||
|
return
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, expval, index):
|
||
|
dim = len(expval)
|
||
|
return np.eye(dim, dtype=np.float64), True
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix_solve.__doc__)
|
||
|
def covariance_matrix_solve(self, expval, index, stdev, rhs):
|
||
|
v = stdev ** 2
|
||
|
rslt = []
|
||
|
for x in rhs:
|
||
|
if x.ndim == 1:
|
||
|
rslt.append(x / v)
|
||
|
else:
|
||
|
rslt.append(x / v[:, None])
|
||
|
return rslt
|
||
|
|
||
|
def summary(self):
|
||
|
return ("Observations within a cluster are modeled "
|
||
|
"as being independent.")
|
||
|
|
||
|
class Unstructured(CovStruct):
|
||
|
"""
|
||
|
An unstructured dependence structure.
|
||
|
|
||
|
To use the unstructured dependence structure, a `time`
|
||
|
argument must be provided when creating the GEE. The
|
||
|
time argument must be of integer dtype, and indicates
|
||
|
which position in a complete data vector is occupied
|
||
|
by each observed value.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, cov_nearest_method="clipped"):
|
||
|
|
||
|
super().__init__(cov_nearest_method)
|
||
|
|
||
|
def initialize(self, model):
|
||
|
|
||
|
self.model = model
|
||
|
|
||
|
import numbers
|
||
|
if not issubclass(self.model.time.dtype.type, numbers.Integral):
|
||
|
msg = "time must be provided and must have integer dtype"
|
||
|
raise ValueError(msg)
|
||
|
|
||
|
q = self.model.time[:, 0].max() + 1
|
||
|
|
||
|
self.dep_params = np.eye(q)
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, endog_expval, index):
|
||
|
|
||
|
if hasattr(self.model, "time"):
|
||
|
time_li = self.model.time_li
|
||
|
ix = time_li[index][:, 0]
|
||
|
return self.dep_params[np.ix_(ix, ix)],True
|
||
|
|
||
|
return self.dep_params, True
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
nobs = self.model.nobs
|
||
|
varfunc = self.model.family.variance
|
||
|
cached_means = self.model.cached_means
|
||
|
has_weights = self.model.weights is not None
|
||
|
weights_li = self.model.weights
|
||
|
|
||
|
time_li = self.model.time_li
|
||
|
q = self.model.time.max() + 1
|
||
|
csum = np.zeros((q, q))
|
||
|
wsum = 0.
|
||
|
cov = np.zeros((q, q))
|
||
|
|
||
|
scale = 0.
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
# Get the Pearson residuals
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
ix = time_li[i][:, 0]
|
||
|
m = np.outer(resid, resid)
|
||
|
ssr = np.sum(np.diag(m))
|
||
|
|
||
|
w = weights_li[i] if has_weights else 1.
|
||
|
csum[np.ix_(ix, ix)] += w
|
||
|
wsum += w * len(ix)
|
||
|
cov[np.ix_(ix, ix)] += w * m
|
||
|
scale += w * ssr
|
||
|
ddof = self.model.ddof_scale
|
||
|
scale /= wsum * (nobs - ddof) / float(nobs)
|
||
|
cov /= (csum - ddof)
|
||
|
|
||
|
sd = np.sqrt(np.diag(cov))
|
||
|
cov /= np.outer(sd, sd)
|
||
|
|
||
|
self.dep_params = cov
|
||
|
|
||
|
def summary(self):
|
||
|
print("Estimated covariance structure:")
|
||
|
print(self.dep_params)
|
||
|
|
||
|
|
||
|
class Exchangeable(CovStruct):
|
||
|
"""
|
||
|
An exchangeable working dependence structure.
|
||
|
"""
|
||
|
|
||
|
def __init__(self):
|
||
|
|
||
|
super().__init__()
|
||
|
|
||
|
# The correlation between any two values in the same cluster
|
||
|
self.dep_params = 0.
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
|
||
|
nobs = self.model.nobs
|
||
|
|
||
|
varfunc = self.model.family.variance
|
||
|
|
||
|
cached_means = self.model.cached_means
|
||
|
|
||
|
has_weights = self.model.weights is not None
|
||
|
weights_li = self.model.weights
|
||
|
|
||
|
residsq_sum, scale = 0, 0
|
||
|
fsum1, fsum2, n_pairs = 0., 0., 0.
|
||
|
for i in range(self.model.num_group):
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
f = weights_li[i] if has_weights else 1.
|
||
|
|
||
|
ssr = np.sum(resid * resid)
|
||
|
scale += f * ssr
|
||
|
fsum1 += f * len(endog[i])
|
||
|
|
||
|
residsq_sum += f * (resid.sum() ** 2 - ssr) / 2
|
||
|
ngrp = len(resid)
|
||
|
npr = 0.5 * ngrp * (ngrp - 1)
|
||
|
fsum2 += f * npr
|
||
|
n_pairs += npr
|
||
|
|
||
|
ddof = self.model.ddof_scale
|
||
|
scale /= (fsum1 * (nobs - ddof) / float(nobs))
|
||
|
residsq_sum /= scale
|
||
|
self.dep_params = residsq_sum / \
|
||
|
(fsum2 * (n_pairs - ddof) / float(n_pairs))
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, expval, index):
|
||
|
dim = len(expval)
|
||
|
dp = self.dep_params * np.ones((dim, dim), dtype=np.float64)
|
||
|
np.fill_diagonal(dp, 1)
|
||
|
return dp, True
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix_solve.__doc__)
|
||
|
def covariance_matrix_solve(self, expval, index, stdev, rhs):
|
||
|
|
||
|
k = len(expval)
|
||
|
c = self.dep_params / (1. - self.dep_params)
|
||
|
c /= 1. + self.dep_params * (k - 1)
|
||
|
|
||
|
rslt = []
|
||
|
for x in rhs:
|
||
|
if x.ndim == 1:
|
||
|
x1 = x / stdev
|
||
|
y = x1 / (1. - self.dep_params)
|
||
|
y -= c * sum(x1)
|
||
|
y /= stdev
|
||
|
else:
|
||
|
x1 = x / stdev[:, None]
|
||
|
y = x1 / (1. - self.dep_params)
|
||
|
y -= c * x1.sum(0)
|
||
|
y /= stdev[:, None]
|
||
|
rslt.append(y)
|
||
|
|
||
|
return rslt
|
||
|
|
||
|
def summary(self):
|
||
|
return ("The correlation between two observations in the " +
|
||
|
"same cluster is %.3f" % self.dep_params)
|
||
|
|
||
|
|
||
|
class Nested(CovStruct):
|
||
|
"""
|
||
|
A nested working dependence structure.
|
||
|
|
||
|
A nested working dependence structure captures unique variance
|
||
|
associated with each level in a hierarchy of partitions of the
|
||
|
cases. For each level of the hierarchy, there is a set of iid
|
||
|
random effects with mean zero, and with variance that is specific
|
||
|
to the level. These variance parameters are estimated from the
|
||
|
data using the method of moments.
|
||
|
|
||
|
The top level of the hierarchy is always defined by the required
|
||
|
`groups` argument to GEE.
|
||
|
|
||
|
The `dep_data` argument used to create the GEE defines the
|
||
|
remaining levels of the hierarchy. it should be either an array,
|
||
|
or if using the formula interface, a string that contains a
|
||
|
formula. If an array, it should contain a `n_obs x k` matrix of
|
||
|
labels, corresponding to the k levels of partitioning that are
|
||
|
nested under the top-level `groups` of the GEE instance. These
|
||
|
subgroups should be nested from left to right, so that two
|
||
|
observations with the same label for column j of `dep_data` should
|
||
|
also have the same label for all columns j' < j (this only applies
|
||
|
to observations in the same top-level cluster given by the
|
||
|
`groups` argument to GEE).
|
||
|
|
||
|
If `dep_data` is a formula, it should usually be of the form `0 +
|
||
|
a + b + ...`, where `a`, `b`, etc. contain labels defining group
|
||
|
membership. The `0 + ` should be included to prevent creation of
|
||
|
an intercept. The variable values are interpreted as labels for
|
||
|
group membership, but the variables should not be explicitly coded
|
||
|
as categorical, i.e. use `0 + a` not `0 + C(a)`.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The calculations for the nested structure involve all pairs of
|
||
|
observations within the top level `group` passed to GEE. Large
|
||
|
group sizes will result in slow iterations.
|
||
|
"""
|
||
|
|
||
|
def initialize(self, model):
|
||
|
"""
|
||
|
Called on the first call to update
|
||
|
|
||
|
`ilabels` is a list of n_i x n_i matrices containing integer
|
||
|
labels that correspond to specific correlation parameters.
|
||
|
Two elements of ilabels[i] with the same label share identical
|
||
|
variance components.
|
||
|
|
||
|
`designx` is a matrix, with each row containing dummy
|
||
|
variables indicating which variance components are associated
|
||
|
with the corresponding element of QY.
|
||
|
"""
|
||
|
|
||
|
super().initialize(model)
|
||
|
|
||
|
if self.model.weights is not None:
|
||
|
warnings.warn("weights not implemented for nested cov_struct, "
|
||
|
"using unweighted covariance estimate",
|
||
|
NotImplementedWarning)
|
||
|
|
||
|
# A bit of processing of the nest data
|
||
|
id_matrix = np.asarray(self.model.dep_data)
|
||
|
if id_matrix.ndim == 1:
|
||
|
id_matrix = id_matrix[:, None]
|
||
|
self.id_matrix = id_matrix
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
designx, ilabels = [], []
|
||
|
|
||
|
# The number of layers of nesting
|
||
|
n_nest = self.id_matrix.shape[1]
|
||
|
|
||
|
for i in range(self.model.num_group):
|
||
|
ngrp = len(endog[i])
|
||
|
glab = self.model.group_labels[i]
|
||
|
rix = self.model.group_indices[glab]
|
||
|
|
||
|
# Determine the number of common variance components
|
||
|
# shared by each pair of observations.
|
||
|
ix1, ix2 = np.tril_indices(ngrp, -1)
|
||
|
ncm = (self.id_matrix[rix[ix1], :] ==
|
||
|
self.id_matrix[rix[ix2], :]).sum(1)
|
||
|
|
||
|
# This is used to construct the working correlation
|
||
|
# matrix.
|
||
|
ilabel = np.zeros((ngrp, ngrp), dtype=np.int32)
|
||
|
ilabel[(ix1, ix2)] = ncm + 1
|
||
|
ilabel[(ix2, ix1)] = ncm + 1
|
||
|
ilabels.append(ilabel)
|
||
|
|
||
|
# This is used to estimate the variance components.
|
||
|
dsx = np.zeros((len(ix1), n_nest + 1), dtype=np.float64)
|
||
|
dsx[:, 0] = 1
|
||
|
for k in np.unique(ncm):
|
||
|
ii = np.flatnonzero(ncm == k)
|
||
|
dsx[ii, 1:k + 1] = 1
|
||
|
designx.append(dsx)
|
||
|
|
||
|
self.designx = np.concatenate(designx, axis=0)
|
||
|
self.ilabels = ilabels
|
||
|
|
||
|
svd = np.linalg.svd(self.designx, 0)
|
||
|
self.designx_u = svd[0]
|
||
|
self.designx_s = svd[1]
|
||
|
self.designx_v = svd[2].T
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
|
||
|
nobs = self.model.nobs
|
||
|
dim = len(params)
|
||
|
|
||
|
if self.designx is None:
|
||
|
self._compute_design(self.model)
|
||
|
|
||
|
cached_means = self.model.cached_means
|
||
|
|
||
|
varfunc = self.model.family.variance
|
||
|
|
||
|
dvmat = []
|
||
|
scale = 0.
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
expval, _ = cached_means[i]
|
||
|
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
ix1, ix2 = np.tril_indices(len(resid), -1)
|
||
|
dvmat.append(resid[ix1] * resid[ix2])
|
||
|
|
||
|
scale += np.sum(resid ** 2)
|
||
|
|
||
|
dvmat = np.concatenate(dvmat)
|
||
|
scale /= (nobs - dim)
|
||
|
|
||
|
# Use least squares regression to estimate the variance
|
||
|
# components
|
||
|
vcomp_coeff = np.dot(self.designx_v, np.dot(self.designx_u.T,
|
||
|
dvmat) / self.designx_s)
|
||
|
|
||
|
self.vcomp_coeff = np.clip(vcomp_coeff, 0, np.inf)
|
||
|
self.scale = scale
|
||
|
|
||
|
self.dep_params = self.vcomp_coeff.copy()
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, expval, index):
|
||
|
|
||
|
dim = len(expval)
|
||
|
|
||
|
# First iteration
|
||
|
if self.dep_params is None:
|
||
|
return np.eye(dim, dtype=np.float64), True
|
||
|
|
||
|
ilabel = self.ilabels[index]
|
||
|
|
||
|
c = np.r_[self.scale, np.cumsum(self.vcomp_coeff)]
|
||
|
vmat = c[ilabel]
|
||
|
vmat /= self.scale
|
||
|
return vmat, True
|
||
|
|
||
|
def summary(self):
|
||
|
"""
|
||
|
Returns a summary string describing the state of the
|
||
|
dependence structure.
|
||
|
"""
|
||
|
|
||
|
dep_names = ["Groups"]
|
||
|
if hasattr(self.model, "_dep_data_names"):
|
||
|
dep_names.extend(self.model._dep_data_names)
|
||
|
else:
|
||
|
dep_names.extend(["Component %d:" % (k + 1) for k in range(len(self.vcomp_coeff) - 1)])
|
||
|
if hasattr(self.model, "_groups_name"):
|
||
|
dep_names[0] = self.model._groups_name
|
||
|
dep_names.append("Residual")
|
||
|
|
||
|
vc = self.vcomp_coeff.tolist()
|
||
|
vc.append(self.scale - np.sum(vc))
|
||
|
|
||
|
smry = pd.DataFrame({"Variance": vc}, index=dep_names)
|
||
|
|
||
|
return smry
|
||
|
|
||
|
|
||
|
class Stationary(CovStruct):
|
||
|
"""
|
||
|
A stationary covariance structure.
|
||
|
|
||
|
The correlation between two observations is an arbitrary function
|
||
|
of the distance between them. Distances up to a given maximum
|
||
|
value are included in the covariance model.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
max_lag : float
|
||
|
The largest distance that is included in the covariance model.
|
||
|
grid : bool
|
||
|
If True, the index positions in the data (after dropping missing
|
||
|
values) are used to define distances, and the `time` variable is
|
||
|
ignored.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, max_lag=1, grid=None):
|
||
|
|
||
|
super().__init__()
|
||
|
grid = bool_like(grid, "grid", optional=True)
|
||
|
if grid is None:
|
||
|
warnings.warn(
|
||
|
"grid=True will become default in a future version",
|
||
|
FutureWarning
|
||
|
)
|
||
|
|
||
|
self.max_lag = max_lag
|
||
|
self.grid = bool(grid)
|
||
|
self.dep_params = np.zeros(max_lag + 1)
|
||
|
|
||
|
def initialize(self, model):
|
||
|
|
||
|
super().initialize(model)
|
||
|
|
||
|
# Time used as an index needs to be integer type.
|
||
|
if not self.grid:
|
||
|
time = self.model.time[:, 0].astype(np.int32)
|
||
|
self.time = self.model.cluster_list(time)
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
if self.grid:
|
||
|
self.update_grid(params)
|
||
|
else:
|
||
|
self.update_nogrid(params)
|
||
|
|
||
|
def update_grid(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
cached_means = self.model.cached_means
|
||
|
varfunc = self.model.family.variance
|
||
|
|
||
|
dep_params = np.zeros(self.max_lag + 1)
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
dep_params[0] += np.sum(resid * resid) / len(resid)
|
||
|
for j in range(1, self.max_lag + 1):
|
||
|
v = resid[j:]
|
||
|
dep_params[j] += np.sum(resid[0:-j] * v) / len(v)
|
||
|
|
||
|
dep_params /= dep_params[0]
|
||
|
self.dep_params = dep_params
|
||
|
|
||
|
def update_nogrid(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
cached_means = self.model.cached_means
|
||
|
varfunc = self.model.family.variance
|
||
|
|
||
|
dep_params = np.zeros(self.max_lag + 1)
|
||
|
dn = np.zeros(self.max_lag + 1)
|
||
|
resid_ssq = 0
|
||
|
resid_ssq_n = 0
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
j1, j2 = np.tril_indices(len(expval), -1)
|
||
|
dx = np.abs(self.time[i][j1] - self.time[i][j2])
|
||
|
ii = np.flatnonzero(dx <= self.max_lag)
|
||
|
j1 = j1[ii]
|
||
|
j2 = j2[ii]
|
||
|
dx = dx[ii]
|
||
|
|
||
|
vs = np.bincount(dx, weights=resid[j1] * resid[j2],
|
||
|
minlength=self.max_lag + 1)
|
||
|
vd = np.bincount(dx, minlength=self.max_lag + 1)
|
||
|
|
||
|
resid_ssq += np.sum(resid**2)
|
||
|
resid_ssq_n += len(resid)
|
||
|
|
||
|
ii = np.flatnonzero(vd > 0)
|
||
|
if len(ii) > 0:
|
||
|
dn[ii] += 1
|
||
|
dep_params[ii] += vs[ii] / vd[ii]
|
||
|
|
||
|
i0 = np.flatnonzero(dn > 0)
|
||
|
dep_params[i0] /= dn[i0]
|
||
|
resid_msq = resid_ssq / resid_ssq_n
|
||
|
dep_params /= resid_msq
|
||
|
self.dep_params = dep_params
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, endog_expval, index):
|
||
|
|
||
|
if self.grid:
|
||
|
return self.covariance_matrix_grid(endog_expval, index)
|
||
|
|
||
|
j1, j2 = np.tril_indices(len(endog_expval), -1)
|
||
|
dx = np.abs(self.time[index][j1] - self.time[index][j2])
|
||
|
ii = np.flatnonzero(dx <= self.max_lag)
|
||
|
j1 = j1[ii]
|
||
|
j2 = j2[ii]
|
||
|
dx = dx[ii]
|
||
|
|
||
|
cmat = np.eye(len(endog_expval))
|
||
|
cmat[j1, j2] = self.dep_params[dx]
|
||
|
cmat[j2, j1] = self.dep_params[dx]
|
||
|
|
||
|
return cmat, True
|
||
|
|
||
|
def covariance_matrix_grid(self, endog_expval, index):
|
||
|
|
||
|
from scipy.linalg import toeplitz
|
||
|
r = np.zeros(len(endog_expval))
|
||
|
r[0] = 1
|
||
|
r[1:self.max_lag + 1] = self.dep_params[1:]
|
||
|
return toeplitz(r), True
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix_solve.__doc__)
|
||
|
def covariance_matrix_solve(self, expval, index, stdev, rhs):
|
||
|
|
||
|
if not self.grid:
|
||
|
return super().covariance_matrix_solve(
|
||
|
expval, index, stdev, rhs)
|
||
|
|
||
|
from statsmodels.tools.linalg import stationary_solve
|
||
|
r = np.zeros(len(expval))
|
||
|
r[0:self.max_lag] = self.dep_params[1:]
|
||
|
|
||
|
rslt = []
|
||
|
for x in rhs:
|
||
|
if x.ndim == 1:
|
||
|
y = x / stdev
|
||
|
rslt.append(stationary_solve(r, y) / stdev)
|
||
|
else:
|
||
|
y = x / stdev[:, None]
|
||
|
rslt.append(stationary_solve(r, y) / stdev[:, None])
|
||
|
|
||
|
return rslt
|
||
|
|
||
|
def summary(self):
|
||
|
|
||
|
lag = np.arange(self.max_lag + 1)
|
||
|
return pd.DataFrame({"Lag": lag, "Cov": self.dep_params})
|
||
|
|
||
|
|
||
|
class Autoregressive(CovStruct):
|
||
|
"""
|
||
|
A first-order autoregressive working dependence structure.
|
||
|
|
||
|
The dependence is defined in terms of the `time` component of the
|
||
|
parent GEE class, which defaults to the index position of each
|
||
|
value within its cluster, based on the order of values in the
|
||
|
input data set. Time represents a potentially multidimensional
|
||
|
index from which distances between pairs of observations can be
|
||
|
determined.
|
||
|
|
||
|
The correlation between two observations in the same cluster is
|
||
|
dep_params^distance, where `dep_params` contains the (scalar)
|
||
|
autocorrelation parameter to be estimated, and `distance` is the
|
||
|
distance between the two observations, calculated from their
|
||
|
corresponding time values. `time` is stored as an n_obs x k
|
||
|
matrix, where `k` represents the number of dimensions in the time
|
||
|
index.
|
||
|
|
||
|
The autocorrelation parameter is estimated using weighted
|
||
|
nonlinear least squares, regressing each value within a cluster on
|
||
|
each preceding value in the same cluster.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
dist_func : function from R^k x R^k to R^+, optional
|
||
|
A function that computes the distance between the two
|
||
|
observations based on their `time` values.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
B Rosner, A Munoz. Autoregressive modeling for the analysis of
|
||
|
longitudinal data with unequally spaced examinations. Statistics
|
||
|
in medicine. Vol 7, 59-71, 1988.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, dist_func=None, grid=None):
|
||
|
|
||
|
super().__init__()
|
||
|
grid = bool_like(grid, "grid", optional=True)
|
||
|
# The function for determining distances based on time
|
||
|
if dist_func is None:
|
||
|
self.dist_func = lambda x, y: np.abs(x - y).sum()
|
||
|
else:
|
||
|
self.dist_func = dist_func
|
||
|
|
||
|
if grid is None:
|
||
|
warnings.warn(
|
||
|
"grid=True will become default in a future version",
|
||
|
FutureWarning
|
||
|
)
|
||
|
self.grid = bool(grid)
|
||
|
if not self.grid:
|
||
|
self.designx = None
|
||
|
|
||
|
# The autocorrelation parameter
|
||
|
self.dep_params = 0.
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
if self.model.weights is not None:
|
||
|
warnings.warn("weights not implemented for autoregressive "
|
||
|
"cov_struct, using unweighted covariance estimate",
|
||
|
NotImplementedWarning)
|
||
|
|
||
|
if self.grid:
|
||
|
self._update_grid(params)
|
||
|
else:
|
||
|
self._update_nogrid(params)
|
||
|
|
||
|
def _update_grid(self, params):
|
||
|
|
||
|
cached_means = self.model.cached_means
|
||
|
scale = self.model.estimate_scale()
|
||
|
varfunc = self.model.family.variance
|
||
|
endog = self.model.endog_li
|
||
|
|
||
|
lag0, lag1 = 0.0, 0.0
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(scale * varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
n = len(resid)
|
||
|
if n > 1:
|
||
|
lag1 += np.sum(resid[0:-1] * resid[1:]) / (n - 1)
|
||
|
lag0 += np.sum(resid**2) / n
|
||
|
|
||
|
self.dep_params = lag1 / lag0
|
||
|
|
||
|
def _update_nogrid(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
time = self.model.time_li
|
||
|
|
||
|
# Only need to compute this once
|
||
|
if self.designx is not None:
|
||
|
designx = self.designx
|
||
|
else:
|
||
|
designx = []
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
ngrp = len(endog[i])
|
||
|
if ngrp == 0:
|
||
|
continue
|
||
|
|
||
|
# Loop over pairs of observations within a cluster
|
||
|
for j1 in range(ngrp):
|
||
|
for j2 in range(j1):
|
||
|
designx.append(self.dist_func(time[i][j1, :],
|
||
|
time[i][j2, :]))
|
||
|
|
||
|
designx = np.array(designx)
|
||
|
self.designx = designx
|
||
|
|
||
|
scale = self.model.estimate_scale()
|
||
|
varfunc = self.model.family.variance
|
||
|
cached_means = self.model.cached_means
|
||
|
|
||
|
# Weights
|
||
|
var = 1. - self.dep_params ** (2 * designx)
|
||
|
var /= 1. - self.dep_params ** 2
|
||
|
wts = 1. / var
|
||
|
wts /= wts.sum()
|
||
|
|
||
|
residmat = []
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
expval, _ = cached_means[i]
|
||
|
stdev = np.sqrt(scale * varfunc(expval))
|
||
|
resid = (endog[i] - expval) / stdev
|
||
|
|
||
|
ngrp = len(resid)
|
||
|
for j1 in range(ngrp):
|
||
|
for j2 in range(j1):
|
||
|
residmat.append([resid[j1], resid[j2]])
|
||
|
|
||
|
residmat = np.array(residmat)
|
||
|
|
||
|
# Need to minimize this
|
||
|
def fitfunc(a):
|
||
|
dif = residmat[:, 0] - (a ** designx) * residmat[:, 1]
|
||
|
return np.dot(dif ** 2, wts)
|
||
|
|
||
|
# Left bracket point
|
||
|
b_lft, f_lft = 0., fitfunc(0.)
|
||
|
|
||
|
# Center bracket point
|
||
|
b_ctr, f_ctr = 0.5, fitfunc(0.5)
|
||
|
while f_ctr > f_lft:
|
||
|
b_ctr /= 2
|
||
|
f_ctr = fitfunc(b_ctr)
|
||
|
if b_ctr < 1e-8:
|
||
|
self.dep_params = 0
|
||
|
return
|
||
|
|
||
|
# Right bracket point
|
||
|
b_rgt, f_rgt = 0.75, fitfunc(0.75)
|
||
|
while f_rgt < f_ctr:
|
||
|
b_rgt = b_rgt + (1. - b_rgt) / 2
|
||
|
f_rgt = fitfunc(b_rgt)
|
||
|
if b_rgt > 1. - 1e-6:
|
||
|
raise ValueError(
|
||
|
"Autoregressive: unable to find right bracket")
|
||
|
|
||
|
from scipy.optimize import brent
|
||
|
self.dep_params = brent(fitfunc, brack=[b_lft, b_ctr, b_rgt])
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, endog_expval, index):
|
||
|
ngrp = len(endog_expval)
|
||
|
if self.dep_params == 0:
|
||
|
return np.eye(ngrp, dtype=np.float64), True
|
||
|
idx = np.arange(ngrp)
|
||
|
cmat = self.dep_params ** np.abs(idx[:, None] - idx[None, :])
|
||
|
return cmat, True
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix_solve.__doc__)
|
||
|
def covariance_matrix_solve(self, expval, index, stdev, rhs):
|
||
|
# The inverse of an AR(1) covariance matrix is tri-diagonal.
|
||
|
|
||
|
k = len(expval)
|
||
|
r = self.dep_params
|
||
|
soln = []
|
||
|
|
||
|
# RHS has 1 row
|
||
|
if k == 1:
|
||
|
return [x / stdev ** 2 for x in rhs]
|
||
|
|
||
|
# RHS has 2 rows
|
||
|
if k == 2:
|
||
|
mat = np.array([[1, -r], [-r, 1]])
|
||
|
mat /= (1. - r ** 2)
|
||
|
for x in rhs:
|
||
|
if x.ndim == 1:
|
||
|
x1 = x / stdev
|
||
|
else:
|
||
|
x1 = x / stdev[:, None]
|
||
|
x1 = np.dot(mat, x1)
|
||
|
if x.ndim == 1:
|
||
|
x1 /= stdev
|
||
|
else:
|
||
|
x1 /= stdev[:, None]
|
||
|
soln.append(x1)
|
||
|
return soln
|
||
|
|
||
|
# RHS has >= 3 rows: values c0, c1, c2 defined below give
|
||
|
# the inverse. c0 is on the diagonal, except for the first
|
||
|
# and last position. c1 is on the first and last position of
|
||
|
# the diagonal. c2 is on the sub/super diagonal.
|
||
|
c0 = (1. + r ** 2) / (1. - r ** 2)
|
||
|
c1 = 1. / (1. - r ** 2)
|
||
|
c2 = -r / (1. - r ** 2)
|
||
|
soln = []
|
||
|
for x in rhs:
|
||
|
flatten = False
|
||
|
if x.ndim == 1:
|
||
|
x = x[:, None]
|
||
|
flatten = True
|
||
|
x1 = x / stdev[:, None]
|
||
|
|
||
|
z0 = np.zeros((1, x1.shape[1]))
|
||
|
rhs1 = np.concatenate((x1[1:, :], z0), axis=0)
|
||
|
rhs2 = np.concatenate((z0, x1[0:-1, :]), axis=0)
|
||
|
|
||
|
y = c0 * x1 + c2 * rhs1 + c2 * rhs2
|
||
|
y[0, :] = c1 * x1[0, :] + c2 * x1[1, :]
|
||
|
y[-1, :] = c1 * x1[-1, :] + c2 * x1[-2, :]
|
||
|
|
||
|
y /= stdev[:, None]
|
||
|
|
||
|
if flatten:
|
||
|
y = np.squeeze(y)
|
||
|
|
||
|
soln.append(y)
|
||
|
|
||
|
return soln
|
||
|
|
||
|
def summary(self):
|
||
|
|
||
|
return ("Autoregressive(1) dependence parameter: %.3f\n" %
|
||
|
self.dep_params)
|
||
|
|
||
|
|
||
|
class CategoricalCovStruct(CovStruct):
|
||
|
"""
|
||
|
Parent class for covariance structure for categorical data models.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
nlevel : int
|
||
|
The number of distinct levels for the outcome variable.
|
||
|
ibd : list
|
||
|
A list whose i^th element ibd[i] is an array whose rows
|
||
|
contain integer pairs (a,b), where endog_li[i][a:b] is the
|
||
|
subvector of binary indicators derived from the same ordinal
|
||
|
value.
|
||
|
"""
|
||
|
|
||
|
def initialize(self, model):
|
||
|
|
||
|
super().initialize(model)
|
||
|
|
||
|
self.nlevel = len(model.endog_values)
|
||
|
self._ncut = self.nlevel - 1
|
||
|
|
||
|
from numpy.lib.stride_tricks import as_strided
|
||
|
b = np.dtype(np.int64).itemsize
|
||
|
|
||
|
ibd = []
|
||
|
for v in model.endog_li:
|
||
|
jj = np.arange(0, len(v) + 1, self._ncut, dtype=np.int64)
|
||
|
jj = as_strided(jj, shape=(len(jj) - 1, 2), strides=(b, b))
|
||
|
ibd.append(jj)
|
||
|
|
||
|
self.ibd = ibd
|
||
|
|
||
|
|
||
|
class GlobalOddsRatio(CategoricalCovStruct):
|
||
|
"""
|
||
|
Estimate the global odds ratio for a GEE with ordinal or nominal
|
||
|
data.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
PJ Heagerty and S Zeger. "Marginal Regression Models for Clustered
|
||
|
Ordinal Measurements". Journal of the American Statistical
|
||
|
Association Vol. 91, Issue 435 (1996).
|
||
|
|
||
|
Thomas Lumley. Generalized Estimating Equations for Ordinal Data:
|
||
|
A Note on Working Correlation Structures. Biometrics Vol. 52,
|
||
|
No. 1 (Mar., 1996), pp. 354-361
|
||
|
http://www.jstor.org/stable/2533173
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The following data structures are calculated in the class:
|
||
|
|
||
|
'ibd' is a list whose i^th element ibd[i] is a sequence of integer
|
||
|
pairs (a,b), where endog_li[i][a:b] is the subvector of binary
|
||
|
indicators derived from the same ordinal value.
|
||
|
|
||
|
`cpp` is a dictionary where cpp[group] is a map from cut-point
|
||
|
pairs (c,c') to the indices of all between-subject pairs derived
|
||
|
from the given cut points.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, endog_type):
|
||
|
super().__init__()
|
||
|
self.endog_type = endog_type
|
||
|
self.dep_params = 0.
|
||
|
|
||
|
def initialize(self, model):
|
||
|
|
||
|
super().initialize(model)
|
||
|
|
||
|
if self.model.weights is not None:
|
||
|
warnings.warn("weights not implemented for GlobalOddsRatio "
|
||
|
"cov_struct, using unweighted covariance estimate",
|
||
|
NotImplementedWarning)
|
||
|
|
||
|
# Need to restrict to between-subject pairs
|
||
|
cpp = []
|
||
|
for v in model.endog_li:
|
||
|
|
||
|
# Number of subjects in this group
|
||
|
m = int(len(v) / self._ncut)
|
||
|
i1, i2 = np.tril_indices(m, -1)
|
||
|
|
||
|
cpp1 = {}
|
||
|
for k1 in range(self._ncut):
|
||
|
for k2 in range(k1 + 1):
|
||
|
jj = np.zeros((len(i1), 2), dtype=np.int64)
|
||
|
jj[:, 0] = i1 * self._ncut + k1
|
||
|
jj[:, 1] = i2 * self._ncut + k2
|
||
|
cpp1[(k2, k1)] = jj
|
||
|
|
||
|
cpp.append(cpp1)
|
||
|
|
||
|
self.cpp = cpp
|
||
|
|
||
|
# Initialize the dependence parameters
|
||
|
self.crude_or = self.observed_crude_oddsratio()
|
||
|
if self.model.update_dep:
|
||
|
self.dep_params = self.crude_or
|
||
|
|
||
|
def pooled_odds_ratio(self, tables):
|
||
|
"""
|
||
|
Returns the pooled odds ratio for a list of 2x2 tables.
|
||
|
|
||
|
The pooled odds ratio is the inverse variance weighted average
|
||
|
of the sample odds ratios of the tables.
|
||
|
"""
|
||
|
|
||
|
if len(tables) == 0:
|
||
|
return 1.
|
||
|
|
||
|
# Get the sampled odds ratios and variances
|
||
|
log_oddsratio, var = [], []
|
||
|
for table in tables:
|
||
|
lor = np.log(table[1, 1]) + np.log(table[0, 0]) -\
|
||
|
np.log(table[0, 1]) - np.log(table[1, 0])
|
||
|
log_oddsratio.append(lor)
|
||
|
var.append((1 / table.astype(np.float64)).sum())
|
||
|
|
||
|
# Calculate the inverse variance weighted average
|
||
|
wts = [1 / v for v in var]
|
||
|
wtsum = sum(wts)
|
||
|
wts = [w / wtsum for w in wts]
|
||
|
log_pooled_or = sum([w * e for w, e in zip(wts, log_oddsratio)])
|
||
|
|
||
|
return np.exp(log_pooled_or)
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, expected_value, index):
|
||
|
|
||
|
vmat = self.get_eyy(expected_value, index)
|
||
|
vmat -= np.outer(expected_value, expected_value)
|
||
|
return vmat, False
|
||
|
|
||
|
def observed_crude_oddsratio(self):
|
||
|
"""
|
||
|
To obtain the crude (global) odds ratio, first pool all binary
|
||
|
indicators corresponding to a given pair of cut points (c,c'),
|
||
|
then calculate the odds ratio for this 2x2 table. The crude
|
||
|
odds ratio is the inverse variance weighted average of these
|
||
|
odds ratios. Since the covariate effects are ignored, this OR
|
||
|
will generally be greater than the stratified OR.
|
||
|
"""
|
||
|
|
||
|
cpp = self.cpp
|
||
|
endog = self.model.endog_li
|
||
|
|
||
|
# Storage for the contingency tables for each (c,c')
|
||
|
tables = {}
|
||
|
for ii in cpp[0].keys():
|
||
|
tables[ii] = np.zeros((2, 2), dtype=np.float64)
|
||
|
|
||
|
# Get the observed crude OR
|
||
|
for i in range(len(endog)):
|
||
|
|
||
|
# The observed joint values for the current cluster
|
||
|
yvec = endog[i]
|
||
|
endog_11 = np.outer(yvec, yvec)
|
||
|
endog_10 = np.outer(yvec, 1. - yvec)
|
||
|
endog_01 = np.outer(1. - yvec, yvec)
|
||
|
endog_00 = np.outer(1. - yvec, 1. - yvec)
|
||
|
|
||
|
cpp1 = cpp[i]
|
||
|
for ky in cpp1.keys():
|
||
|
ix = cpp1[ky]
|
||
|
tables[ky][1, 1] += endog_11[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][1, 0] += endog_10[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][0, 1] += endog_01[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][0, 0] += endog_00[ix[:, 0], ix[:, 1]].sum()
|
||
|
|
||
|
return self.pooled_odds_ratio(list(tables.values()))
|
||
|
|
||
|
def get_eyy(self, endog_expval, index):
|
||
|
"""
|
||
|
Returns a matrix V such that V[i,j] is the joint probability
|
||
|
that endog[i] = 1 and endog[j] = 1, based on the marginal
|
||
|
probabilities of endog and the global odds ratio `current_or`.
|
||
|
"""
|
||
|
|
||
|
current_or = self.dep_params
|
||
|
ibd = self.ibd[index]
|
||
|
|
||
|
# The between-observation joint probabilities
|
||
|
if current_or == 1.0:
|
||
|
vmat = np.outer(endog_expval, endog_expval)
|
||
|
else:
|
||
|
psum = endog_expval[:, None] + endog_expval[None, :]
|
||
|
pprod = endog_expval[:, None] * endog_expval[None, :]
|
||
|
pfac = np.sqrt((1. + psum * (current_or - 1.)) ** 2 +
|
||
|
4 * current_or * (1. - current_or) * pprod)
|
||
|
vmat = 1. + psum * (current_or - 1.) - pfac
|
||
|
vmat /= 2. * (current_or - 1)
|
||
|
|
||
|
# Fix E[YY'] for elements that belong to same observation
|
||
|
for bdl in ibd:
|
||
|
evy = endog_expval[bdl[0]:bdl[1]]
|
||
|
if self.endog_type == "ordinal":
|
||
|
vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
|
||
|
np.minimum.outer(evy, evy)
|
||
|
else:
|
||
|
vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] = np.diag(evy)
|
||
|
|
||
|
return vmat
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
"""
|
||
|
Update the global odds ratio based on the current value of
|
||
|
params.
|
||
|
"""
|
||
|
|
||
|
cpp = self.cpp
|
||
|
cached_means = self.model.cached_means
|
||
|
|
||
|
# This will happen if all the clusters have only
|
||
|
# one observation
|
||
|
if len(cpp[0]) == 0:
|
||
|
return
|
||
|
|
||
|
tables = {}
|
||
|
for ii in cpp[0]:
|
||
|
tables[ii] = np.zeros((2, 2), dtype=np.float64)
|
||
|
|
||
|
for i in range(self.model.num_group):
|
||
|
|
||
|
endog_expval, _ = cached_means[i]
|
||
|
|
||
|
emat_11 = self.get_eyy(endog_expval, i)
|
||
|
emat_10 = endog_expval[:, None] - emat_11
|
||
|
emat_01 = -emat_11 + endog_expval
|
||
|
emat_00 = 1. - (emat_11 + emat_10 + emat_01)
|
||
|
|
||
|
cpp1 = cpp[i]
|
||
|
for ky in cpp1.keys():
|
||
|
ix = cpp1[ky]
|
||
|
tables[ky][1, 1] += emat_11[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][1, 0] += emat_10[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][0, 1] += emat_01[ix[:, 0], ix[:, 1]].sum()
|
||
|
tables[ky][0, 0] += emat_00[ix[:, 0], ix[:, 1]].sum()
|
||
|
|
||
|
cor_expval = self.pooled_odds_ratio(list(tables.values()))
|
||
|
|
||
|
self.dep_params *= self.crude_or / cor_expval
|
||
|
if not np.isfinite(self.dep_params):
|
||
|
self.dep_params = 1.
|
||
|
warnings.warn("dep_params became inf, resetting to 1",
|
||
|
ConvergenceWarning)
|
||
|
|
||
|
def summary(self):
|
||
|
return "Global odds ratio: %.3f\n" % self.dep_params
|
||
|
|
||
|
|
||
|
class OrdinalIndependence(CategoricalCovStruct):
|
||
|
"""
|
||
|
An independence covariance structure for ordinal models.
|
||
|
|
||
|
The working covariance between indicators derived from different
|
||
|
observations is zero. The working covariance between indicators
|
||
|
derived form a common observation is determined from their current
|
||
|
mean values.
|
||
|
|
||
|
There are no parameters to estimate in this covariance structure.
|
||
|
"""
|
||
|
|
||
|
def covariance_matrix(self, expected_value, index):
|
||
|
|
||
|
ibd = self.ibd[index]
|
||
|
n = len(expected_value)
|
||
|
vmat = np.zeros((n, n))
|
||
|
|
||
|
for bdl in ibd:
|
||
|
ev = expected_value[bdl[0]:bdl[1]]
|
||
|
vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
|
||
|
np.minimum.outer(ev, ev) - np.outer(ev, ev)
|
||
|
|
||
|
return vmat, False
|
||
|
|
||
|
# Nothing to update
|
||
|
def update(self, params):
|
||
|
pass
|
||
|
|
||
|
|
||
|
class NominalIndependence(CategoricalCovStruct):
|
||
|
"""
|
||
|
An independence covariance structure for nominal models.
|
||
|
|
||
|
The working covariance between indicators derived from different
|
||
|
observations is zero. The working covariance between indicators
|
||
|
derived form a common observation is determined from their current
|
||
|
mean values.
|
||
|
|
||
|
There are no parameters to estimate in this covariance structure.
|
||
|
"""
|
||
|
|
||
|
def covariance_matrix(self, expected_value, index):
|
||
|
|
||
|
ibd = self.ibd[index]
|
||
|
n = len(expected_value)
|
||
|
vmat = np.zeros((n, n))
|
||
|
|
||
|
for bdl in ibd:
|
||
|
ev = expected_value[bdl[0]:bdl[1]]
|
||
|
vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
|
||
|
np.diag(ev) - np.outer(ev, ev)
|
||
|
|
||
|
return vmat, False
|
||
|
|
||
|
# Nothing to update
|
||
|
def update(self, params):
|
||
|
pass
|
||
|
|
||
|
|
||
|
class Equivalence(CovStruct):
|
||
|
"""
|
||
|
A covariance structure defined in terms of equivalence classes.
|
||
|
|
||
|
An 'equivalence class' is a set of pairs of observations such that
|
||
|
the covariance of every pair within the equivalence class has a
|
||
|
common value.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
pairs : dict-like
|
||
|
A dictionary of dictionaries, where `pairs[group][label]`
|
||
|
provides the indices of all pairs of observations in the group
|
||
|
that have the same covariance value. Specifically,
|
||
|
`pairs[group][label]` is a tuple `(j1, j2)`, where `j1` and `j2`
|
||
|
are integer arrays of the same length. `j1[i], j2[i]` is one
|
||
|
index pair that belongs to the `label` equivalence class. Only
|
||
|
one triangle of each covariance matrix should be included.
|
||
|
Positions where j1 and j2 have the same value are variance
|
||
|
parameters.
|
||
|
labels : array_like
|
||
|
An array of labels such that every distinct pair of labels
|
||
|
defines an equivalence class. Either `labels` or `pairs` must
|
||
|
be provided. When the two labels in a pair are equal two
|
||
|
equivalence classes are defined: one for the diagonal elements
|
||
|
(corresponding to variances) and one for the off-diagonal
|
||
|
elements (corresponding to covariances).
|
||
|
return_cov : bool
|
||
|
If True, `covariance_matrix` returns an estimate of the
|
||
|
covariance matrix, otherwise returns an estimate of the
|
||
|
correlation matrix.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Using `labels` to define the class is much easier than using
|
||
|
`pairs`, but is less general.
|
||
|
|
||
|
Any pair of values not contained in `pairs` will be assigned zero
|
||
|
covariance.
|
||
|
|
||
|
The index values in `pairs` are row indices into the `exog`
|
||
|
matrix. They are not updated if missing data are present. When
|
||
|
using this covariance structure, missing data should be removed
|
||
|
before constructing the model.
|
||
|
|
||
|
If using `labels`, after a model is defined using the covariance
|
||
|
structure it is possible to remove a label pair from the second
|
||
|
level of the `pairs` dictionary to force the corresponding
|
||
|
covariance to be zero.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
The following sets up the `pairs` dictionary for a model with two
|
||
|
groups, equal variance for all observations, and constant
|
||
|
covariance for all pairs of observations within each group.
|
||
|
|
||
|
>> pairs = {0: {}, 1: {}}
|
||
|
>> pairs[0][0] = (np.r_[0, 1, 2], np.r_[0, 1, 2])
|
||
|
>> pairs[0][1] = np.tril_indices(3, -1)
|
||
|
>> pairs[1][0] = (np.r_[3, 4, 5], np.r_[3, 4, 5])
|
||
|
>> pairs[1][2] = 3 + np.tril_indices(3, -1)
|
||
|
"""
|
||
|
|
||
|
def __init__(self, pairs=None, labels=None, return_cov=False):
|
||
|
|
||
|
super().__init__()
|
||
|
|
||
|
if (pairs is None) and (labels is None):
|
||
|
raise ValueError(
|
||
|
"Equivalence cov_struct requires either `pairs` or `labels`")
|
||
|
|
||
|
if (pairs is not None) and (labels is not None):
|
||
|
raise ValueError(
|
||
|
"Equivalence cov_struct accepts only one of `pairs` "
|
||
|
"and `labels`")
|
||
|
|
||
|
if pairs is not None:
|
||
|
import copy
|
||
|
self.pairs = copy.deepcopy(pairs)
|
||
|
|
||
|
if labels is not None:
|
||
|
self.labels = np.asarray(labels)
|
||
|
|
||
|
self.return_cov = return_cov
|
||
|
|
||
|
def _make_pairs(self, i, j):
|
||
|
"""
|
||
|
Create arrays containing all unique ordered pairs of i, j.
|
||
|
|
||
|
The arrays i and j must be one-dimensional containing non-negative
|
||
|
integers.
|
||
|
"""
|
||
|
|
||
|
mat = np.zeros((len(i) * len(j), 2), dtype=np.int32)
|
||
|
|
||
|
# Create the pairs and order them
|
||
|
f = np.ones(len(j))
|
||
|
mat[:, 0] = np.kron(f, i).astype(np.int32)
|
||
|
f = np.ones(len(i))
|
||
|
mat[:, 1] = np.kron(j, f).astype(np.int32)
|
||
|
mat.sort(1)
|
||
|
|
||
|
# Remove repeated rows
|
||
|
try:
|
||
|
dtype = np.dtype((np.void, mat.dtype.itemsize * mat.shape[1]))
|
||
|
bmat = np.ascontiguousarray(mat).view(dtype)
|
||
|
_, idx = np.unique(bmat, return_index=True)
|
||
|
except TypeError:
|
||
|
# workaround for old numpy that cannot call unique with complex
|
||
|
# dtypes
|
||
|
rs = np.random.RandomState(4234)
|
||
|
bmat = np.dot(mat, rs.uniform(size=mat.shape[1]))
|
||
|
_, idx = np.unique(bmat, return_index=True)
|
||
|
mat = mat[idx, :]
|
||
|
|
||
|
return mat[:, 0], mat[:, 1]
|
||
|
|
||
|
def _pairs_from_labels(self):
|
||
|
|
||
|
from collections import defaultdict
|
||
|
pairs = defaultdict(lambda: defaultdict(lambda: None))
|
||
|
|
||
|
model = self.model
|
||
|
|
||
|
df = pd.DataFrame({"labels": self.labels, "groups": model.groups})
|
||
|
gb = df.groupby(["groups", "labels"])
|
||
|
|
||
|
ulabels = np.unique(self.labels)
|
||
|
|
||
|
for g_ix, g_lb in enumerate(model.group_labels):
|
||
|
|
||
|
# Loop over label pairs
|
||
|
for lx1 in range(len(ulabels)):
|
||
|
for lx2 in range(lx1 + 1):
|
||
|
|
||
|
lb1 = ulabels[lx1]
|
||
|
lb2 = ulabels[lx2]
|
||
|
|
||
|
try:
|
||
|
i1 = gb.groups[(g_lb, lb1)]
|
||
|
i2 = gb.groups[(g_lb, lb2)]
|
||
|
except KeyError:
|
||
|
continue
|
||
|
|
||
|
i1, i2 = self._make_pairs(i1, i2)
|
||
|
|
||
|
clabel = str(lb1) + "/" + str(lb2)
|
||
|
|
||
|
# Variance parameters belong in their own equiv class.
|
||
|
jj = np.flatnonzero(i1 == i2)
|
||
|
if len(jj) > 0:
|
||
|
clabelv = clabel + "/v"
|
||
|
pairs[g_lb][clabelv] = (i1[jj], i2[jj])
|
||
|
|
||
|
# Covariance parameters
|
||
|
jj = np.flatnonzero(i1 != i2)
|
||
|
if len(jj) > 0:
|
||
|
i1 = i1[jj]
|
||
|
i2 = i2[jj]
|
||
|
pairs[g_lb][clabel] = (i1, i2)
|
||
|
|
||
|
self.pairs = pairs
|
||
|
|
||
|
def initialize(self, model):
|
||
|
|
||
|
super().initialize(model)
|
||
|
|
||
|
if self.model.weights is not None:
|
||
|
warnings.warn("weights not implemented for equalence cov_struct, "
|
||
|
"using unweighted covariance estimate",
|
||
|
NotImplementedWarning)
|
||
|
|
||
|
if not hasattr(self, 'pairs'):
|
||
|
self._pairs_from_labels()
|
||
|
|
||
|
# Initialize so that any equivalence class containing a
|
||
|
# variance parameter has value 1.
|
||
|
self.dep_params = defaultdict(float)
|
||
|
self._var_classes = set()
|
||
|
for gp in self.model.group_labels:
|
||
|
for lb in self.pairs[gp]:
|
||
|
j1, j2 = self.pairs[gp][lb]
|
||
|
if np.any(j1 == j2):
|
||
|
if not np.all(j1 == j2):
|
||
|
warnings.warn(
|
||
|
"equivalence class contains both variance "
|
||
|
"and covariance parameters", OutputWarning)
|
||
|
self._var_classes.add(lb)
|
||
|
self.dep_params[lb] = 1
|
||
|
|
||
|
# Need to start indexing at 0 within each group.
|
||
|
# rx maps olds indices to new indices
|
||
|
rx = -1 * np.ones(len(self.model.endog), dtype=np.int32)
|
||
|
for g_ix, g_lb in enumerate(self.model.group_labels):
|
||
|
ii = self.model.group_indices[g_lb]
|
||
|
rx[ii] = np.arange(len(ii), dtype=np.int32)
|
||
|
|
||
|
# Reindex
|
||
|
for gp in self.model.group_labels:
|
||
|
for lb in self.pairs[gp].keys():
|
||
|
a, b = self.pairs[gp][lb]
|
||
|
self.pairs[gp][lb] = (rx[a], rx[b])
|
||
|
|
||
|
@Appender(CovStruct.update.__doc__)
|
||
|
def update(self, params):
|
||
|
|
||
|
endog = self.model.endog_li
|
||
|
varfunc = self.model.family.variance
|
||
|
cached_means = self.model.cached_means
|
||
|
dep_params = defaultdict(lambda: [0., 0., 0.])
|
||
|
n_pairs = defaultdict(int)
|
||
|
dim = len(params)
|
||
|
|
||
|
for k, gp in enumerate(self.model.group_labels):
|
||
|
expval, _ = cached_means[k]
|
||
|
stdev = np.sqrt(varfunc(expval))
|
||
|
resid = (endog[k] - expval) / stdev
|
||
|
for lb in self.pairs[gp].keys():
|
||
|
if (not self.return_cov) and lb in self._var_classes:
|
||
|
continue
|
||
|
jj = self.pairs[gp][lb]
|
||
|
dep_params[lb][0] += np.sum(resid[jj[0]] * resid[jj[1]])
|
||
|
if not self.return_cov:
|
||
|
dep_params[lb][1] += np.sum(resid[jj[0]] ** 2)
|
||
|
dep_params[lb][2] += np.sum(resid[jj[1]] ** 2)
|
||
|
n_pairs[lb] += len(jj[0])
|
||
|
|
||
|
if self.return_cov:
|
||
|
for lb in dep_params.keys():
|
||
|
dep_params[lb] = dep_params[lb][0] / (n_pairs[lb] - dim)
|
||
|
else:
|
||
|
for lb in dep_params.keys():
|
||
|
den = np.sqrt(dep_params[lb][1] * dep_params[lb][2])
|
||
|
dep_params[lb] = dep_params[lb][0] / den
|
||
|
for lb in self._var_classes:
|
||
|
dep_params[lb] = 1.
|
||
|
|
||
|
self.dep_params = dep_params
|
||
|
self.n_pairs = n_pairs
|
||
|
|
||
|
@Appender(CovStruct.covariance_matrix.__doc__)
|
||
|
def covariance_matrix(self, expval, index):
|
||
|
dim = len(expval)
|
||
|
cmat = np.zeros((dim, dim))
|
||
|
g_lb = self.model.group_labels[index]
|
||
|
|
||
|
for lb in self.pairs[g_lb].keys():
|
||
|
j1, j2 = self.pairs[g_lb][lb]
|
||
|
cmat[j1, j2] = self.dep_params[lb]
|
||
|
|
||
|
cmat = cmat + cmat.T
|
||
|
np.fill_diagonal(cmat, cmat.diagonal() / 2)
|
||
|
|
||
|
return cmat, not self.return_cov
|