1814 lines
57 KiB
Python
1814 lines
57 KiB
Python
|
'''
|
||
|
The one parameter exponential family distributions used by GLM.
|
||
|
'''
|
||
|
# TODO: quasi, quasibinomial, quasipoisson
|
||
|
# see
|
||
|
# http://www.biostat.jhsph.edu/~qli/biostatistics_r_doc/library/stats/html/family.html
|
||
|
# for comparison to R, and McCullagh and Nelder
|
||
|
|
||
|
|
||
|
import inspect
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import special, stats
|
||
|
|
||
|
from statsmodels.compat.scipy import SP_LT_17
|
||
|
from statsmodels.tools.sm_exceptions import (
|
||
|
ValueWarning,
|
||
|
)
|
||
|
from . import links as L, varfuncs as V
|
||
|
|
||
|
FLOAT_EPS = np.finfo(float).eps
|
||
|
|
||
|
|
||
|
class Family:
|
||
|
"""
|
||
|
The parent class for one-parameter exponential families.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link function instance
|
||
|
Link is the linear transformation function.
|
||
|
See the individual families for available links.
|
||
|
variance : a variance function
|
||
|
Measures the variance as a function of the mean probabilities.
|
||
|
See the individual families for the default variance function.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
:ref:`links` : Further details on links.
|
||
|
"""
|
||
|
# TODO: change these class attributes, use valid somewhere...
|
||
|
valid = [-np.inf, np.inf]
|
||
|
links = []
|
||
|
|
||
|
def _setlink(self, link):
|
||
|
"""
|
||
|
Helper method to set the link for a family.
|
||
|
|
||
|
Raises a ``ValueError`` exception if the link is not available. Note
|
||
|
that the error message might not be that informative because it tells
|
||
|
you that the link should be in the base class for the link function.
|
||
|
|
||
|
See statsmodels.genmod.generalized_linear_model.GLM for a list of
|
||
|
appropriate links for each family but note that not all of these are
|
||
|
currently available.
|
||
|
"""
|
||
|
# TODO: change the links class attribute in the families to hold
|
||
|
# meaningful information instead of a list of links instances such as
|
||
|
# [<statsmodels.family.links.Log object at 0x9a4240c>,
|
||
|
# <statsmodels.family.links.Power object at 0x9a423ec>,
|
||
|
# <statsmodels.family.links.Power object at 0x9a4236c>]
|
||
|
# for Poisson...
|
||
|
self._link = link
|
||
|
if self._check_link:
|
||
|
if not isinstance(link, L.Link):
|
||
|
raise TypeError("The input should be a valid Link object.")
|
||
|
if hasattr(self, "links"):
|
||
|
validlink = max([isinstance(link, _) for _ in self.links])
|
||
|
if not validlink:
|
||
|
msg = "Invalid link for family, should be in %s. (got %s)"
|
||
|
raise ValueError(msg % (repr(self.links), link))
|
||
|
|
||
|
def _getlink(self):
|
||
|
"""
|
||
|
Helper method to get the link for a family.
|
||
|
"""
|
||
|
return self._link
|
||
|
|
||
|
# link property for each family is a pointer to link instance
|
||
|
link = property(_getlink, _setlink, doc="Link function for family")
|
||
|
|
||
|
def __init__(self, link, variance, check_link=True):
|
||
|
self._check_link = check_link
|
||
|
if inspect.isclass(link):
|
||
|
warnmssg = (
|
||
|
"Calling Family(..) with a link class is not allowed. Use an "
|
||
|
"instance of a link class instead."
|
||
|
)
|
||
|
raise TypeError(warnmssg)
|
||
|
|
||
|
self.link = link
|
||
|
self.variance = variance
|
||
|
|
||
|
def starting_mu(self, y):
|
||
|
r"""
|
||
|
Starting value for mu in the IRLS algorithm.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
y : ndarray
|
||
|
The untransformed response variable.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
mu_0 : ndarray
|
||
|
The first guess on the transformed response variable.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
\mu_0 = (Y + \overline{Y})/2
|
||
|
|
||
|
Only the Binomial family takes a different initial value.
|
||
|
"""
|
||
|
return (y + y.mean())/2.
|
||
|
|
||
|
def weights(self, mu):
|
||
|
r"""
|
||
|
Weights for IRLS steps
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : array_like
|
||
|
The transformed mean response variable in the exponential family
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
w : ndarray
|
||
|
The weights for the IRLS steps
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
w = 1 / (g'(\mu)^2 * Var(\mu))
|
||
|
"""
|
||
|
return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
|
||
|
|
||
|
def deviance(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The deviance function evaluated at (endog, mu, var_weights,
|
||
|
freq_weights, scale) for the distribution.
|
||
|
|
||
|
Deviance is usually defined as twice the loglikelihood ratio.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : array_like
|
||
|
The endogenous response variable
|
||
|
mu : array_like
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
freq_weights : array_like
|
||
|
1d array of frequency weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional scale argument. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Deviance : ndarray
|
||
|
The value of deviance function defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Deviance is defined
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
D = 2\sum_i (freq\_weights_i * var\_weights *
|
||
|
(llf(endog_i, endog_i) - llf(endog_i, \mu_i)))
|
||
|
|
||
|
where y is the endogenous variable. The deviance functions are
|
||
|
analytically defined for each family.
|
||
|
|
||
|
Internally, we calculate deviance as:
|
||
|
|
||
|
.. math::
|
||
|
D = \sum_i freq\_weights_i * var\_weights * resid\_dev_i / scale
|
||
|
"""
|
||
|
resid_dev = self._resid_dev(endog, mu)
|
||
|
return np.sum(resid_dev * freq_weights * var_weights / scale)
|
||
|
|
||
|
def resid_dev(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : array_like
|
||
|
The endogenous response variable
|
||
|
mu : array_like
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional scale argument. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The deviance residuals are defined by the contribution D_i of
|
||
|
observation i to the deviance as
|
||
|
|
||
|
.. math::
|
||
|
resid\_dev_i = sign(y_i-\mu_i) \sqrt{D_i}
|
||
|
|
||
|
D_i is calculated from the _resid_dev method in each family.
|
||
|
Distribution-specific documentation of the calculation is available
|
||
|
there.
|
||
|
"""
|
||
|
resid_dev = self._resid_dev(endog, mu)
|
||
|
resid_dev *= var_weights / scale
|
||
|
return np.sign(endog - mu) * np.sqrt(np.clip(resid_dev, 0., np.inf))
|
||
|
|
||
|
def fitted(self, lin_pred):
|
||
|
r"""
|
||
|
Fitted values based on linear predictors lin_pred.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
lin_pred : ndarray
|
||
|
Values of the linear predictor of the model.
|
||
|
:math:`X \cdot \beta` in a classical linear model.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
mu : ndarray
|
||
|
The mean response variables given by the inverse of the link
|
||
|
function.
|
||
|
"""
|
||
|
fits = self.link.inverse(lin_pred)
|
||
|
return fits
|
||
|
|
||
|
def predict(self, mu):
|
||
|
"""
|
||
|
Linear predictors based on given mu values.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
The mean response variables
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
lin_pred : ndarray
|
||
|
Linear predictors based on the mean response variables. The value
|
||
|
of the link function at the given mu.
|
||
|
"""
|
||
|
return self.link(mu)
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
This is defined for each family. endog and mu are not restricted to
|
||
|
``endog`` and ``mu`` respectively. For instance, you could call
|
||
|
both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
|
||
|
log-likelihood ratio.
|
||
|
"""
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def loglike(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function in terms of the fitted mean response.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
freq_weights : array_like
|
||
|
1d array of frequency weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, freq_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Where :math:`ll_i` is the by-observation log-likelihood:
|
||
|
|
||
|
.. math::
|
||
|
ll = \sum(ll_i * freq\_weights_i)
|
||
|
|
||
|
``ll_i`` is defined for each family. endog and mu are not restricted
|
||
|
to ``endog`` and ``mu`` respectively. For instance, you could call
|
||
|
both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
|
||
|
log-likelihood ratio.
|
||
|
"""
|
||
|
ll_obs = self.loglike_obs(endog, mu, var_weights, scale)
|
||
|
return np.sum(ll_obs * freq_weights)
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : `resid_anscombe` for the
|
||
|
individual families for more information
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Anscombe residuals are defined by
|
||
|
|
||
|
.. math::
|
||
|
resid\_anscombe_i = \frac{A(y)-A(\mu)}{A'(\mu)\sqrt{Var[\mu]}} *
|
||
|
\sqrt(var\_weights)
|
||
|
|
||
|
where :math:`A'(y)=v(y)^{-\frac{1}{3}}` and :math:`v(\mu)` is the
|
||
|
variance function :math:`Var[y]=\frac{\phi}{w}v(mu)`.
|
||
|
The transformation :math:`A(y)` makes the residuals more normal
|
||
|
distributed.
|
||
|
"""
|
||
|
raise NotImplementedError
|
||
|
|
||
|
def _clean(self, x):
|
||
|
"""
|
||
|
Helper function to trim the data so that it is in (0,inf)
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The need for this function was discovered through usage and its
|
||
|
possible that other families might need a check for validity of the
|
||
|
domain.
|
||
|
"""
|
||
|
return np.clip(x, FLOAT_EPS, np.inf)
|
||
|
|
||
|
|
||
|
class Poisson(Family):
|
||
|
"""
|
||
|
Poisson exponential family.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the Poisson family is the log link. Available
|
||
|
links are log, identity, and sqrt. See statsmodels.families.links for
|
||
|
more information.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
Poisson.link : a link instance
|
||
|
The link function of the Poisson instance.
|
||
|
Poisson.variance : varfuncs instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.mu
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
"""
|
||
|
links = [L.Log, L.Identity, L.Sqrt]
|
||
|
variance = V.mu
|
||
|
valid = [0, np.inf]
|
||
|
safe_links = [L.Log, ]
|
||
|
|
||
|
def __init__(self, link=None, check_link=True):
|
||
|
if link is None:
|
||
|
link = L.Log()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=Poisson.variance,
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Poisson deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = 2 * (endog_i * \ln(endog_i / \mu_i) -
|
||
|
(endog_i - \mu_i))
|
||
|
"""
|
||
|
endog_mu = self._clean(endog / mu)
|
||
|
resid_dev = endog * np.log(endog_mu) - (endog - mu)
|
||
|
return 2 * resid_dev
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Poisson distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
ll_i = var\_weights_i / scale * (endog_i * \ln(\mu_i) - \mu_i -
|
||
|
\ln \Gamma(endog_i + 1))
|
||
|
"""
|
||
|
return var_weights / scale * (endog * np.log(mu) - mu -
|
||
|
special.gammaln(endog + 1))
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals for the Poisson family defined below
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = (3/2) * (endog_i^{2/3} - \mu_i^{2/3}) /
|
||
|
\mu_i^{1/6} * \sqrt(var\_weights)
|
||
|
"""
|
||
|
resid = ((3 / 2.) * (endog**(2 / 3.) - mu**(2 / 3.)) /
|
||
|
(mu ** (1 / 6.) * scale ** 0.5))
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale=1., var_weights=1.):
|
||
|
r"""
|
||
|
Frozen Poisson distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is ignored.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
var_weights are ignored for Poisson.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
|
||
|
return stats.poisson(mu)
|
||
|
|
||
|
|
||
|
class Gaussian(Family):
|
||
|
"""
|
||
|
Gaussian exponential family distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the Gaussian family is the identity link.
|
||
|
Available links are log, identity, and inverse.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
Gaussian.link : a link instance
|
||
|
The link function of the Gaussian instance
|
||
|
Gaussian.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.constant
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
"""
|
||
|
|
||
|
links = [L.Log, L.Identity, L.InversePower]
|
||
|
variance = V.constant
|
||
|
safe_links = links
|
||
|
|
||
|
def __init__(self, link=None, check_link=True):
|
||
|
if link is None:
|
||
|
link = L.Identity()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=Gaussian.variance,
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Gaussian deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = (endog_i - \mu_i) ** 2
|
||
|
"""
|
||
|
return (endog - mu) ** 2
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Gaussian distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
If the link is the identity link function then the
|
||
|
loglikelihood function is the same as the classical OLS model.
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
llf = -nobs / 2 * (\log(SSR) + (1 + \log(2 \pi / nobs)))
|
||
|
|
||
|
where
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
SSR = \sum_i (Y_i - g^{-1}(\mu_i))^2
|
||
|
|
||
|
If the links is not the identity link then the loglikelihood
|
||
|
function is defined as
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
ll_i = -1 / 2 \sum_i * var\_weights * ((Y_i - mu_i)^2 / scale +
|
||
|
\log(2 * \pi * scale))
|
||
|
"""
|
||
|
ll_obs = -var_weights * (endog - mu) ** 2 / scale
|
||
|
ll_obs += -np.log(scale / var_weights) - np.log(2 * np.pi)
|
||
|
ll_obs /= 2
|
||
|
return ll_obs
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals for the Gaussian family defined below
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
For the Gaussian distribution, Anscombe residuals are the same as
|
||
|
deviance residuals.
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = (Y_i - \mu_i) / \sqrt{scale} *
|
||
|
\sqrt(var\_weights)
|
||
|
"""
|
||
|
resid = (endog - mu) / scale ** 0.5
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale, var_weights=1.):
|
||
|
r"""
|
||
|
Frozen Gaussian distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is required argument for get_distribution.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
|
||
|
scale_n = scale / var_weights
|
||
|
return stats.norm(loc=mu, scale=np.sqrt(scale_n))
|
||
|
|
||
|
|
||
|
class Gamma(Family):
|
||
|
"""
|
||
|
Gamma exponential family distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the Gamma family is the inverse link.
|
||
|
Available links are log, identity, and inverse.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
Gamma.link : a link instance
|
||
|
The link function of the Gamma instance
|
||
|
Gamma.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.family.varfuncs.mu_squared
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
"""
|
||
|
links = [L.Log, L.Identity, L.InversePower]
|
||
|
variance = V.mu_squared
|
||
|
safe_links = [L.Log, ]
|
||
|
|
||
|
def __init__(self, link=None, check_link=True):
|
||
|
if link is None:
|
||
|
link = L.InversePower()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=Gamma.variance,
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Gamma deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = 2 * ((endog_i - \mu_i) / \mu_i -
|
||
|
\log(endog_i / \mu_i))
|
||
|
"""
|
||
|
endog_mu = self._clean(endog / mu)
|
||
|
resid_dev = -np.log(endog_mu) + (endog - mu) / mu
|
||
|
return 2 * resid_dev
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Gamma distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
ll_i = var\_weights_i / scale * (\ln(var\_weights_i * endog_i /
|
||
|
(scale * \mu_i)) - (var\_weights_i * endog_i) /
|
||
|
(scale * \mu_i)) - \ln \Gamma(var\_weights_i / scale) - \ln(\mu_i)
|
||
|
"""
|
||
|
endog_mu = self._clean(endog / mu)
|
||
|
weight_scale = var_weights / scale
|
||
|
ll_obs = weight_scale * np.log(weight_scale * endog_mu)
|
||
|
ll_obs -= weight_scale * endog_mu
|
||
|
ll_obs -= special.gammaln(weight_scale) + np.log(endog)
|
||
|
return ll_obs
|
||
|
|
||
|
# in Stata scale is set to equal 1 for reporting llf
|
||
|
# in R it's the dispersion, though there is a loss of precision vs.
|
||
|
# our results due to an assumed difference in implementation
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals for the Gamma family defined below
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = 3 * (endog_i^{1/3} - \mu_i^{1/3}) / \mu_i^{1/3}
|
||
|
/ \sqrt{scale} * \sqrt(var\_weights)
|
||
|
"""
|
||
|
resid = 3 * (endog**(1/3.) - mu**(1/3.)) / mu**(1/3.) / scale ** 0.5
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale, var_weights=1.):
|
||
|
r"""
|
||
|
Frozen Gamma distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is required argument for get_distribution.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
# combine var_weights with scale
|
||
|
scale_ = scale / var_weights
|
||
|
shape = 1 / scale_
|
||
|
scale_g = mu * scale_
|
||
|
return stats.gamma(shape, scale=scale_g)
|
||
|
|
||
|
|
||
|
class Binomial(Family):
|
||
|
"""
|
||
|
Binomial exponential family distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the Binomial family is the logit link.
|
||
|
Available links are logit, probit, cauchy, log, loglog, and cloglog.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
Binomial.link : a link instance
|
||
|
The link function of the Binomial instance
|
||
|
Binomial.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.binary
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
endog for Binomial can be specified in one of three ways:
|
||
|
A 1d array of 0 or 1 values, indicating failure or success
|
||
|
respectively.
|
||
|
A 2d array, with two columns. The first column represents the
|
||
|
success count and the second column represents the failure
|
||
|
count.
|
||
|
A 1d array of proportions, indicating the proportion of
|
||
|
successes, with parameter `var_weights` containing the
|
||
|
number of trials for each row.
|
||
|
"""
|
||
|
|
||
|
links = [L.Logit, L.Probit, L.Cauchy, L.Log, L.LogC, L.CLogLog, L.LogLog,
|
||
|
L.Identity]
|
||
|
variance = V.binary # this is not used below in an effort to include n
|
||
|
|
||
|
# Other safe links, e.g. cloglog and probit are subclasses
|
||
|
safe_links = [L.Logit, L.CDFLink]
|
||
|
|
||
|
def __init__(self, link=None, check_link=True): # , n=1.):
|
||
|
if link is None:
|
||
|
link = L.Logit()
|
||
|
# TODO: it *should* work for a constant n>1 actually, if freq_weights
|
||
|
# is equal to n
|
||
|
self.n = 1
|
||
|
# overwritten by initialize if needed but always used to initialize
|
||
|
# variance since endog is assumed/forced to be (0,1)
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=V.Binomial(n=self.n),
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def starting_mu(self, y):
|
||
|
r"""
|
||
|
The starting values for the IRLS algorithm for the Binomial family.
|
||
|
A good choice for the binomial family is :math:`\mu_0 = (Y_i + 0.5)/2`
|
||
|
"""
|
||
|
return (y + .5)/2
|
||
|
|
||
|
def initialize(self, endog, freq_weights):
|
||
|
'''
|
||
|
Initialize the response variable.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Endogenous response variable
|
||
|
freq_weights : ndarray
|
||
|
1d array of frequency weights
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
If `endog` is binary, returns `endog`
|
||
|
|
||
|
If `endog` is a 2d array, then the input is assumed to be in the format
|
||
|
(successes, failures) and
|
||
|
successes/(success + failures) is returned. And n is set to
|
||
|
successes + failures.
|
||
|
'''
|
||
|
# if not np.all(np.asarray(freq_weights) == 1):
|
||
|
# self.variance = V.Binomial(n=freq_weights)
|
||
|
if endog.ndim > 1 and endog.shape[1] > 2:
|
||
|
raise ValueError('endog has more than 2 columns. The Binomial '
|
||
|
'link supports either a single response variable '
|
||
|
'or a paired response variable.')
|
||
|
elif endog.ndim > 1 and endog.shape[1] > 1:
|
||
|
y = endog[:, 0]
|
||
|
# overwrite self.freq_weights for deviance below
|
||
|
self.n = endog.sum(1)
|
||
|
return y*1./self.n, self.n
|
||
|
else:
|
||
|
return endog, np.ones(endog.shape[0])
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Binomial deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = 2 * n * (endog_i * \ln(endog_i /\mu_i) +
|
||
|
(1 - endog_i) * \ln((1 - endog_i) / (1 - \mu_i)))
|
||
|
"""
|
||
|
endog_mu = self._clean(endog / (mu + 1e-20))
|
||
|
n_endog_mu = self._clean((1. - endog) / (1. - mu + 1e-20))
|
||
|
resid_dev = endog * np.log(endog_mu) + (1 - endog) * np.log(n_endog_mu)
|
||
|
return 2 * self.n * resid_dev
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Binomial distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
If the endogenous variable is binary:
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
ll_i = \sum_i (y_i * \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)) *
|
||
|
var\_weights_i
|
||
|
|
||
|
If the endogenous variable is binomial:
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
ll_i = \sum_i var\_weights_i * (\ln \Gamma(n+1) -
|
||
|
\ln \Gamma(y_i + 1) - \ln \Gamma(n_i - y_i +1) + y_i *
|
||
|
\log(\mu_i / (n_i - \mu_i)) + n * \log(1 - \mu_i/n_i))
|
||
|
|
||
|
where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as
|
||
|
defined in Binomial initialize. This simply makes :math:`y_i` the
|
||
|
original number of successes.
|
||
|
"""
|
||
|
n = self.n # Number of trials
|
||
|
y = endog * n # Number of successes
|
||
|
|
||
|
# note that mu is still in (0,1), i.e. not converted back
|
||
|
return (
|
||
|
special.gammaln(n + 1) - special.gammaln(y + 1) -
|
||
|
special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
|
||
|
n * np.log(1 - mu + 1e-20)) * var_weights
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r'''
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
n^{2/3}*(cox\_snell(endog)-cox\_snell(mu)) /
|
||
|
(mu*(1-mu/n)*scale^3)^{1/6} * \sqrt(var\_weights)
|
||
|
|
||
|
where cox_snell is defined as
|
||
|
cox_snell(x) = betainc(2/3., 2/3., x)*betainc(2/3.,2/3.)
|
||
|
where betainc is the incomplete beta function as defined in scipy,
|
||
|
which uses a regularized version (with the unregularized version, one
|
||
|
would just have :math:`cox_snell(x) = Betainc(2/3., 2/3., x)`).
|
||
|
|
||
|
The name 'cox_snell' is idiosyncratic and is simply used for
|
||
|
convenience following the approach suggested in Cox and Snell (1968).
|
||
|
Further note that
|
||
|
:math:`cox\_snell(x) = \frac{3}{2}*x^{2/3} *
|
||
|
hyp2f1(2/3.,1/3.,5/3.,x)`
|
||
|
where hyp2f1 is the hypergeometric 2f1 function. The Anscombe
|
||
|
residuals are sometimes defined in the literature using the
|
||
|
hyp2f1 formulation. Both betainc and hyp2f1 can be found in scipy.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's
|
||
|
paper." Journal of the Royal Statistical Society B. 15, 229-30.
|
||
|
|
||
|
Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals."
|
||
|
Journal of the Royal Statistical Society B. 30, 248-75.
|
||
|
'''
|
||
|
endog = endog * self.n # convert back to successes
|
||
|
mu = mu * self.n # convert back to successes
|
||
|
|
||
|
def cox_snell(x):
|
||
|
return special.betainc(2/3., 2/3., x) * special.beta(2/3., 2/3.)
|
||
|
|
||
|
resid = (self.n ** (2/3.) * (cox_snell(endog * 1. / self.n) -
|
||
|
cox_snell(mu * 1. / self.n)) /
|
||
|
(mu * (1 - mu * 1. / self.n) * scale ** 3) ** (1 / 6.))
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale=1., var_weights=1., n_trials=1):
|
||
|
r"""
|
||
|
Frozen Binomial distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is ignored.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
var_weights are ignored for Poisson.
|
||
|
n_trials : int
|
||
|
Number of trials for the binomial distribution. The default is 1
|
||
|
which corresponds to a Bernoulli random variable.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
|
||
|
return stats.binom(n=n_trials, p=mu)
|
||
|
|
||
|
|
||
|
class InverseGaussian(Family):
|
||
|
"""
|
||
|
InverseGaussian exponential family.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the inverse Gaussian family is the
|
||
|
inverse squared link.
|
||
|
Available links are InverseSquared, Inverse, Log, and Identity.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
InverseGaussian.link : a link instance
|
||
|
The link function of the inverse Gaussian instance
|
||
|
InverseGaussian.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.mu_cubed
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
The inverse Gaussian distribution is sometimes referred to in the
|
||
|
literature as the Wald distribution.
|
||
|
"""
|
||
|
|
||
|
links = [L.InverseSquared, L.InversePower, L.Identity, L.Log]
|
||
|
variance = V.mu_cubed
|
||
|
safe_links = [L.InverseSquared, L.Log, ]
|
||
|
|
||
|
def __init__(self, link=None, check_link=True):
|
||
|
if link is None:
|
||
|
link = L.InverseSquared()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=InverseGaussian.variance,
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Inverse Gaussian deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = 1 / (endog_i * \mu_i^2) * (endog_i - \mu_i)^2
|
||
|
"""
|
||
|
return 1. / (endog * mu ** 2) * (endog - mu) ** 2
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Inverse Gaussian distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
ll_i = -1/2 * (var\_weights_i * (endog_i - \mu_i)^2 /
|
||
|
(scale * endog_i * \mu_i^2) + \ln(scale * \endog_i^3 /
|
||
|
var\_weights_i) - \ln(2 * \pi))
|
||
|
"""
|
||
|
ll_obs = -var_weights * (endog - mu) ** 2 / (scale * endog * mu ** 2)
|
||
|
ll_obs += -np.log(scale * endog ** 3 / var_weights) - np.log(2 * np.pi)
|
||
|
ll_obs /= 2
|
||
|
return ll_obs
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals for the inverse Gaussian distribution as
|
||
|
defined below
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = \log(Y_i / \mu_i) / \sqrt{\mu_i * scale} *
|
||
|
\sqrt(var\_weights)
|
||
|
"""
|
||
|
resid = np.log(endog / mu) / np.sqrt(mu * scale)
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale, var_weights=1.):
|
||
|
r"""
|
||
|
Frozen Inverse Gaussian distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is required argument for get_distribution.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
# combine var_weights with scale
|
||
|
scale_ = scale / var_weights
|
||
|
mu_ig = mu * scale_
|
||
|
return stats.invgauss(mu_ig, scale=1 / scale_)
|
||
|
|
||
|
|
||
|
class NegativeBinomial(Family):
|
||
|
r"""
|
||
|
Negative Binomial exponential family (corresponds to NB2).
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the negative binomial family is the log link.
|
||
|
Available links are log, cloglog, identity, nbinom and power.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
alpha : float, optional
|
||
|
The ancillary parameter for the negative binomial distribution.
|
||
|
For now ``alpha`` is assumed to be nonstochastic. The default value
|
||
|
is 1. Permissible values are usually assumed to be between .01 and 2.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
NegativeBinomial.link : a link instance
|
||
|
The link function of the negative binomial instance
|
||
|
NegativeBinomial.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.nbinom
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Power link functions are not yet supported.
|
||
|
|
||
|
Parameterization for :math:`y=0, 1, 2, \ldots` is
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
f(y) = \frac{\Gamma(y+\frac{1}{\alpha})}{y!\Gamma(\frac{1}{\alpha})}
|
||
|
\left(\frac{1}{1+\alpha\mu}\right)^{\frac{1}{\alpha}}
|
||
|
\left(\frac{\alpha\mu}{1+\alpha\mu}\right)^y
|
||
|
|
||
|
with :math:`E[Y]=\mu\,` and :math:`Var[Y]=\mu+\alpha\mu^2`.
|
||
|
"""
|
||
|
links = [L.Log, L.CLogLog, L.Identity, L.NegativeBinomial, L.Power]
|
||
|
# TODO: add the ability to use the power links with an if test
|
||
|
# similar to below
|
||
|
variance = V.nbinom
|
||
|
safe_links = [L.Log, ]
|
||
|
|
||
|
def __init__(self, link=None, alpha=1., check_link=True):
|
||
|
self.alpha = 1. * alpha # make it at least float
|
||
|
if alpha is self.__init__.__defaults__[1]: # `is` is intentional
|
||
|
warnings.warn("Negative binomial dispersion parameter alpha not "
|
||
|
f"set. Using default value alpha={alpha}.",
|
||
|
ValueWarning)
|
||
|
if link is None:
|
||
|
link = L.Log()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=V.NegativeBinomial(alpha=self.alpha),
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Negative Binomial deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
.. math::
|
||
|
|
||
|
resid_dev_i = 2 * (endog_i * \ln(endog_i /
|
||
|
\mu_i) - (endog_i + 1 / \alpha) * \ln((endog_i + 1 / \alpha) /
|
||
|
(\mu_i + 1 / \alpha)))
|
||
|
"""
|
||
|
endog_mu = self._clean(endog / mu)
|
||
|
endog_alpha = endog + 1 / self.alpha
|
||
|
mu_alpha = mu + 1 / self.alpha
|
||
|
resid_dev = endog * np.log(endog_mu)
|
||
|
resid_dev -= endog_alpha * np.log(endog_alpha / mu_alpha)
|
||
|
return 2 * resid_dev
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Negative Binomial distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Defined as:
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
llf = \sum_i var\_weights_i / scale * (Y_i * \log{(\alpha * \mu_i /
|
||
|
(1 + \alpha * \mu_i))} - \log{(1 + \alpha * \mu_i)}/
|
||
|
\alpha + Constant)
|
||
|
|
||
|
where :math:`Constant` is defined as:
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
Constant = \ln \Gamma{(Y_i + 1/ \alpha )} - \ln \Gamma(Y_i + 1) -
|
||
|
\ln \Gamma{(1/ \alpha )}
|
||
|
|
||
|
constant = (special.gammaln(endog + 1 / self.alpha) -
|
||
|
special.gammaln(endog+1)-special.gammaln(1/self.alpha))
|
||
|
return (endog * np.log(self.alpha * mu / (1 + self.alpha * mu)) -
|
||
|
np.log(1 + self.alpha * mu) / self.alpha +
|
||
|
constant) * var_weights / scale
|
||
|
"""
|
||
|
ll_obs = endog * np.log(self.alpha * mu)
|
||
|
ll_obs -= (endog + 1 / self.alpha) * np.log(1 + self.alpha * mu)
|
||
|
ll_obs += special.gammaln(endog + 1 / self.alpha)
|
||
|
ll_obs -= special.gammaln(1 / self.alpha)
|
||
|
ll_obs -= special.gammaln(endog + 1)
|
||
|
return var_weights / scale * ll_obs
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Anscombe residuals for Negative Binomial are the same as for Binomial
|
||
|
upon setting :math:`n=-\frac{1}{\alpha}`. Due to the negative value of
|
||
|
:math:`-\alpha*Y` the representation with the hypergeometric function
|
||
|
:math:`H2F1(x) = hyp2f1(2/3.,1/3.,5/3.,x)` is advantageous
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = \frac{3}{2} *
|
||
|
(Y_i^(2/3)*H2F1(-\alpha*Y_i) - \mu_i^(2/3)*H2F1(-\alpha*\mu_i))
|
||
|
/ (\mu_i * (1+\alpha*\mu_i) * scale^3)^(1/6) * \sqrt(var\_weights)
|
||
|
|
||
|
Note that for the (unregularized) Beta function, one has
|
||
|
:math:`Beta(z,a,b) = z^a/a * H2F1(a,1-b,a+1,z)`
|
||
|
"""
|
||
|
def hyp2f1(x):
|
||
|
return special.hyp2f1(2 / 3., 1 / 3., 5 / 3., x)
|
||
|
|
||
|
resid = (3 / 2. * (endog ** (2 / 3.) * hyp2f1(-self.alpha * endog) -
|
||
|
mu ** (2 / 3.) * hyp2f1(-self.alpha * mu)) /
|
||
|
(mu * (1 + self.alpha * mu) *
|
||
|
scale ** 3) ** (1 / 6.))
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|
||
|
|
||
|
def get_distribution(self, mu, scale=1., var_weights=1.):
|
||
|
r"""
|
||
|
Frozen NegativeBinomial distribution instance for given parameters
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
scale : float
|
||
|
The scale parameter is ignored.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
var_weights are ignored for NegativeBinomial.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
distribution instance
|
||
|
|
||
|
"""
|
||
|
size = 1. / self.alpha
|
||
|
prob = size / (size + mu)
|
||
|
return stats.nbinom(size, prob)
|
||
|
|
||
|
|
||
|
class Tweedie(Family):
|
||
|
"""
|
||
|
Tweedie family.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
link : a link instance, optional
|
||
|
The default link for the Tweedie family is the log link.
|
||
|
Available links are log, Power and any aliases of power.
|
||
|
See statsmodels.genmod.families.links for more information.
|
||
|
var_power : float, optional
|
||
|
The variance power. The default is 1.
|
||
|
eql : bool
|
||
|
If True, the Extended Quasi-Likelihood is used, else the
|
||
|
likelihood is used.
|
||
|
In both cases, for likelihood computations the var_power
|
||
|
must be between 1 and 2.
|
||
|
check_link : bool
|
||
|
If True (default), then and exception is raised if the link is invalid
|
||
|
for the family.
|
||
|
If False, then the link is not checked.
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
Tweedie.link : a link instance
|
||
|
The link function of the Tweedie instance
|
||
|
Tweedie.variance : varfunc instance
|
||
|
``variance`` is an instance of
|
||
|
statsmodels.genmod.families.varfuncs.Power
|
||
|
Tweedie.var_power : float
|
||
|
The power parameter of the variance function.
|
||
|
|
||
|
See Also
|
||
|
--------
|
||
|
statsmodels.genmod.families.family.Family : Parent class for all links.
|
||
|
:ref:`links` : Further details on links.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
Loglikelihood function not implemented because of the complexity of
|
||
|
calculating an infinite series of summations. The variance power can be
|
||
|
estimated using the ``estimate_tweedie_power`` function that is part of the
|
||
|
statsmodels.genmod.generalized_linear_model.GLM class.
|
||
|
"""
|
||
|
links = [L.Log, L.Power]
|
||
|
variance = V.Power(power=1.5)
|
||
|
safe_links = [L.Log, L.Power]
|
||
|
|
||
|
def __init__(self, link=None, var_power=1., eql=False, check_link=True):
|
||
|
self.var_power = var_power
|
||
|
self.eql = eql
|
||
|
if eql and (var_power < 1 or var_power > 2):
|
||
|
raise ValueError("Tweedie: if EQL=True then var_power must fall "
|
||
|
"between 1 and 2")
|
||
|
if link is None:
|
||
|
link = L.Log()
|
||
|
super().__init__(
|
||
|
link=link,
|
||
|
variance=V.Power(power=var_power * 1.),
|
||
|
check_link=check_link
|
||
|
)
|
||
|
|
||
|
def _resid_dev(self, endog, mu):
|
||
|
r"""
|
||
|
Tweedie deviance residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable.
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_dev : float
|
||
|
Deviance residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
When :math:`p = 1`,
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
dev_i = \mu_i
|
||
|
|
||
|
when :math:`endog_i = 0` and
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
dev_i = endog_i * \log(endog_i / \mu_i) + (\mu_i - endog_i)
|
||
|
|
||
|
otherwise.
|
||
|
|
||
|
When :math:`p = 2`,
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
dev_i = (endog_i - \mu_i) / \mu_i - \log(endog_i / \mu_i)
|
||
|
|
||
|
For all other p,
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
dev_i = endog_i^{2 - p} / ((1 - p) * (2 - p)) -
|
||
|
endog_i * \mu_i^{1 - p} / (1 - p) + \mu_i^{2 - p} /
|
||
|
(2 - p)
|
||
|
|
||
|
The deviance residual is then
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
resid\_dev_i = 2 * dev_i
|
||
|
"""
|
||
|
p = self.var_power
|
||
|
if p == 1:
|
||
|
dev = np.where(endog == 0,
|
||
|
mu,
|
||
|
endog * np.log(endog / mu) + (mu - endog))
|
||
|
elif p == 2:
|
||
|
endog1 = self._clean(endog)
|
||
|
dev = ((endog - mu) / mu) - np.log(endog1 / mu)
|
||
|
else:
|
||
|
dev = (endog ** (2 - p) / ((1 - p) * (2 - p)) -
|
||
|
endog * mu ** (1-p) / (1 - p) + mu ** (2 - p) / (2 - p))
|
||
|
return 2 * dev
|
||
|
|
||
|
def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The log-likelihood function for each observation in terms of the fitted
|
||
|
mean response for the Tweedie distribution.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
Usually the endogenous response variable.
|
||
|
mu : ndarray
|
||
|
Usually but not always the fitted mean response variable.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float
|
||
|
The scale parameter. The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
ll_i : float
|
||
|
The value of the loglikelihood evaluated at
|
||
|
(endog, mu, var_weights, scale) as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
If eql is True, the Extended Quasi-Likelihood is used. At present,
|
||
|
this method returns NaN if eql is False. When the actual likelihood
|
||
|
is implemented, it will be accessible by setting eql to False.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
R Kaas (2005). Compound Poisson Distributions and GLM's -- Tweedie's
|
||
|
Distribution.
|
||
|
https://core.ac.uk/download/pdf/6347266.pdf#page=11
|
||
|
|
||
|
JA Nelder, D Pregibon (1987). An extended quasi-likelihood function.
|
||
|
Biometrika 74:2, pp 221-232. https://www.jstor.org/stable/2336136
|
||
|
"""
|
||
|
p = self.var_power
|
||
|
endog = np.atleast_1d(endog)
|
||
|
if p == 1:
|
||
|
return Poisson().loglike_obs(
|
||
|
endog=endog,
|
||
|
mu=mu,
|
||
|
var_weights=var_weights,
|
||
|
scale=scale
|
||
|
)
|
||
|
elif p == 2:
|
||
|
return Gamma().loglike_obs(
|
||
|
endog=endog,
|
||
|
mu=mu,
|
||
|
var_weights=var_weights,
|
||
|
scale=scale
|
||
|
)
|
||
|
|
||
|
if not self.eql:
|
||
|
if p < 1 or p > 2:
|
||
|
# We have not yet implemented the actual likelihood
|
||
|
return np.nan
|
||
|
|
||
|
# scipy compat bessel_wright added in 1.7
|
||
|
if SP_LT_17:
|
||
|
# old return was nan
|
||
|
return np.nan
|
||
|
|
||
|
# See: Dunn, Smyth (2004) "Series evaluation of Tweedie
|
||
|
# exponential dispersion model densities"
|
||
|
# pdf(y, mu, p, phi) = f(y, theta, phi)
|
||
|
# = c(y, phi) * exp(1/phi (y theta - kappa(theta)))
|
||
|
# kappa = cumulant function
|
||
|
# theta = function of expectation mu and power p
|
||
|
# alpha = (2-p)/(1-p)
|
||
|
# phi = scale
|
||
|
# for 1<p<2:
|
||
|
# c(y, phi) = 1/y * wright_bessel(a, b, x)
|
||
|
# a = -alpha
|
||
|
# b = 0
|
||
|
# x = (p-1)**alpha/(2-p) / y**alpha / phi**(1-alpha)
|
||
|
scale = scale / var_weights
|
||
|
theta = mu ** (1 - p) / (1 - p)
|
||
|
kappa = mu ** (2 - p) / (2 - p)
|
||
|
alpha = (2 - p) / (1 - p)
|
||
|
|
||
|
ll_obs = (endog * theta - kappa) / scale
|
||
|
idx = endog > 0
|
||
|
if np.any(idx):
|
||
|
if not np.isscalar(endog):
|
||
|
endog = endog[idx]
|
||
|
if not np.isscalar(scale):
|
||
|
scale = scale[idx]
|
||
|
x = ((p - 1) * scale / endog) ** alpha
|
||
|
x /= (2 - p) * scale
|
||
|
wb = special.wright_bessel(-alpha, 0, x)
|
||
|
ll_obs[idx] += np.log(1/endog * wb)
|
||
|
return ll_obs
|
||
|
else:
|
||
|
# Equations 4 of Kaas
|
||
|
llf = np.log(2 * np.pi * scale) + p * np.log(endog)
|
||
|
llf -= np.log(var_weights)
|
||
|
llf /= -2
|
||
|
u = (endog ** (2 - p)
|
||
|
- (2 - p) * endog * mu ** (1 - p)
|
||
|
+ (1 - p) * mu ** (2 - p))
|
||
|
u *= var_weights / (scale * (1 - p) * (2 - p))
|
||
|
|
||
|
return llf - u
|
||
|
|
||
|
def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
|
||
|
r"""
|
||
|
The Anscombe residuals
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
endog : ndarray
|
||
|
The endogenous response variable
|
||
|
mu : ndarray
|
||
|
The inverse of the link function at the linear predicted values.
|
||
|
var_weights : array_like
|
||
|
1d array of variance (analytic) weights. The default is 1.
|
||
|
scale : float, optional
|
||
|
An optional argument to divide the residuals by sqrt(scale).
|
||
|
The default is 1.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
resid_anscombe : ndarray
|
||
|
The Anscombe residuals as defined below.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
When :math:`p = 3`, then
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = \log(endog_i / \mu_i) / \sqrt{\mu_i * scale} *
|
||
|
\sqrt(var\_weights)
|
||
|
|
||
|
Otherwise,
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
c = (3 - p) / 3
|
||
|
|
||
|
.. math::
|
||
|
|
||
|
resid\_anscombe_i = (1 / c) * (endog_i^c - \mu_i^c) / \mu_i^{p / 6}
|
||
|
/ \sqrt{scale} * \sqrt(var\_weights)
|
||
|
"""
|
||
|
if self.var_power == 3:
|
||
|
resid = np.log(endog / mu) / np.sqrt(mu * scale)
|
||
|
else:
|
||
|
c = (3. - self.var_power) / 3.
|
||
|
resid = ((1. / c) * (endog ** c - mu ** c) /
|
||
|
mu ** (self.var_power / 6.)) / scale ** 0.5
|
||
|
resid *= np.sqrt(var_weights)
|
||
|
return resid
|