453 lines
15 KiB
Python
453 lines
15 KiB
Python
|
import numpy as np
|
|||
|
|
|||
|
from scipy.stats import rv_discrete, poisson, nbinom
|
|||
|
from scipy.special import gammaln
|
|||
|
from scipy._lib._util import _lazywhere
|
|||
|
|
|||
|
from statsmodels.base.model import GenericLikelihoodModel
|
|||
|
|
|||
|
|
|||
|
class genpoisson_p_gen(rv_discrete):
    """Generalized Poisson distribution (GP-P parameterization).

    Shape parameters are ``mu``, ``alpha`` and ``p``.  For ``alpha = 0``
    the log-pmf below reduces to the Poisson log-pmf with mean ``mu``.
    """

    def _argcheck(self, mu, alpha, p):
        # ``alpha == alpha`` is False only for nan, so every non-nan alpha
        # (negative values included) passes the check.
        alpha_is_number = alpha == alpha
        return (mu >= 0) & alpha_is_number & (p > 0)

    def _logpmf(self, x, mu, alpha, p):
        # Both auxiliary terms are clipped from below at the smallest
        # positive float so that ``log(a1)`` and ``log(a2)`` stay finite.
        tiny = np.nextafter(0, 1)
        a1 = np.maximum(tiny, 1 + alpha * mu ** (p - 1.))
        a2 = np.maximum(tiny, mu + (a1 - 1.) * x)
        gained = np.log(mu) + (x - 1.) * np.log(a2)
        lost = x * np.log(a1) + gammaln(x + 1.) + a2 / a1
        return gained - lost

    def _pmf(self, x, mu, alpha, p):
        log_prob = self._logpmf(x, mu, alpha, p)
        return np.exp(log_prob)

    def mean(self, mu, alpha, p):
        """Return the mean, which is ``mu`` in this parameterization."""
        return mu

    def var(self, mu, alpha, p):
        """Return the variance ``mu * (1 + alpha * mu**(p - 1))**2``."""
        return (1 + alpha * mu**(p - 1))**2 * mu


genpoisson_p = genpoisson_p_gen(
    name='genpoisson_p',
    longname='Generalized Poisson',
)
|
|||
|
|
|||
|
|
|||
|
class zipoisson_gen(rv_discrete):
    """Zero-inflated Poisson distribution.

    Shape parameters are the Poisson mean ``mu`` and the zero-inflation
    probability ``w``.
    """

    def _argcheck(self, mu, w):
        return (mu > 0) & (w >= 0) & (w <= 1)

    def _logpmf(self, x, mu, w):
        def _positive_branch(x, mu, w):
            # Poisson log-pmf scaled by the non-inflated share (1 - w).
            return np.log(1. - w) + x * np.log(mu) - gammaln(x + 1.) - mu

        # At zero the inflation mass w is added to the Poisson mass.
        at_zero = np.log(w + (1. - w) * np.exp(-mu))
        return _lazywhere(x != 0, (x, mu, w), _positive_branch, at_zero)

    def _pmf(self, x, mu, w):
        log_prob = self._logpmf(x, mu, w)
        return np.exp(log_prob)

    def _cdf(self, x, mu, w):
        # Poisson cdf shrunk by (1 - w) and shifted by the inflation mass w.
        base_cdf = poisson.cdf(x, mu)
        return w + base_cdf * (1 - w)

    def _ppf(self, q, mu, w):
        # Undo the shift and shrink of ``_cdf`` and invert the Poisson cdf.
        rescaled = (q - w) / (1 - w)
        x = poisson.ppf(rescaled, mu)
        # Quantiles that fall inside the inflated zero mass map to zero.
        in_inflated_mass = q < w
        x[in_inflated_mass] = 0
        return x

    def mean(self, mu, w):
        """Return the mean ``(1 - w) * mu``."""
        return (1 - w) * mu

    def var(self, mu, w):
        """Return the variance ``(1 + w * mu) * (1 - w) * mu``."""
        return (1 + w * mu) * self.mean(mu, w)

    def _moment(self, n, mu, w):
        poisson_moment = poisson.moment(n, mu)
        return (1 - w) * poisson_moment


zipoisson = zipoisson_gen(
    name='zipoisson',
    longname='Zero Inflated Poisson',
)
|
|||
|
|
|||
|
class zigeneralizedpoisson_gen(rv_discrete):
    """Zero-inflated generalized Poisson distribution.

    Shape parameters are ``mu``, ``alpha`` and ``p`` of the generalized
    Poisson part (evaluated through ``genpoisson_p``) and the
    zero-inflation probability ``w``.
    """

    def _argcheck(self, mu, alpha, p, w):
        return (mu > 0) & (w >= 0) & (w <= 1)

    def _logpmf(self, x, mu, alpha, p, w):
        # Positive counts: generalized Poisson log-pmf plus log(1 - w).
        # Zero: the inflation mass w is added to the scaled pmf.
        return _lazywhere(
            x != 0, (x, mu, alpha, p, w),
            (lambda x, mu, alpha, p, w:
                np.log(1. - w) + genpoisson_p.logpmf(x, mu, alpha, p)),
            np.log(w + (1. - w) * genpoisson_p.pmf(x, mu, alpha, p)))

    def _pmf(self, x, mu, alpha, p, w):
        return np.exp(self._logpmf(x, mu, alpha, p, w))

    def mean(self, mu, alpha, p, w):
        """Return the mean ``(1 - w) * mu``."""
        return (1 - w) * mu

    def var(self, mu, alpha, p, w):
        """Return the variance of the zero-inflated distribution.

        Computed as ``((1 + alpha * mu**(p - 1))**2 + w * mu) * mean``.
        """
        # Keep ``p`` untouched: the previous code rebound it to ``p - 1``
        # and then handed the shifted value on to ``self.mean``.
        gp_dispersion = (1 + alpha * mu ** (p - 1)) ** 2
        dispersion_factor = gp_dispersion + w * mu
        return dispersion_factor * self.mean(mu, alpha, p, w)


zigenpoisson = zigeneralizedpoisson_gen(
    name='zigenpoisson',
    longname='Zero Inflated Generalized Poisson')
|
|||
|
|
|||
|
|
|||
|
class zinegativebinomial_gen(rv_discrete):
    """Zero-inflated generalized negative binomial (NB-P) distribution.

    Shape parameters are ``mu``, ``alpha`` and ``p`` of the negative
    binomial part and the zero-inflation probability ``w``.  The first
    three are mapped to scipy's ``nbinom`` parameters by
    ``convert_params``.
    """

    def _argcheck(self, mu, alpha, p, w):
        return (mu > 0) & (w >= 0) & (w <= 1)

    def _logpmf(self, x, mu, alpha, p, w):
        size, prob = self.convert_params(mu, alpha, p)

        def _positive_branch(x, size, prob, w):
            # nbinom log-pmf scaled by the non-inflated share (1 - w).
            return np.log(1. - w) + nbinom.logpmf(x, size, prob)

        # At zero the inflation mass w is added to the scaled pmf.
        at_zero = np.log(w + (1. - w) * nbinom.pmf(x, size, prob))
        return _lazywhere(x != 0, (x, size, prob, w),
                          _positive_branch, at_zero)

    def _pmf(self, x, mu, alpha, p, w):
        log_prob = self._logpmf(x, mu, alpha, p, w)
        return np.exp(log_prob)

    def _cdf(self, x, mu, alpha, p, w):
        size, prob = self.convert_params(mu, alpha, p)
        # Negative binomial cdf shrunk by (1 - w) and shifted by w.
        base_cdf = nbinom.cdf(x, size, prob)
        return w + base_cdf * (1 - w)

    def _ppf(self, q, mu, alpha, p, w):
        size, prob = self.convert_params(mu, alpha, p)
        # Undo the shift and shrink of ``_cdf`` and invert the nbinom cdf.
        rescaled = (q - w) / (1 - w)
        x = nbinom.ppf(rescaled, size, prob)
        # Quantiles that fall inside the inflated zero mass map to zero.
        in_inflated_mass = q < w
        x[in_inflated_mass] = 0
        return x

    def mean(self, mu, alpha, p, w):
        """Return the mean ``(1 - w) * mu``."""
        return (1 - w) * mu

    def var(self, mu, alpha, p, w):
        """Return ``(1 + alpha * mu**(p - 1) + w * mu) * mean``."""
        scale_up = 1 + alpha * mu ** (p - 1) + w * mu
        return scale_up * self.mean(mu, alpha, p, w)

    def _moment(self, n, mu, alpha, p, w):
        size, prob = self.convert_params(mu, alpha, p)
        return (1 - w) * nbinom.moment(n, size, prob)

    def convert_params(self, mu, alpha, p):
        """Map ``(mu, alpha, p)`` to the ``(n, p)`` arguments of ``nbinom``.

        Returns the tuple ``(size, prob)`` with
        ``size = mu**(2 - p) / alpha`` and ``prob = size / (size + mu)``.
        """
        size = 1. / alpha * mu**(2 - p)
        return (size, size / (size + mu))


zinegbin = zinegativebinomial_gen(
    name='zinegbin',
    longname='Zero Inflated Generalized Negative Binomial',
)
|
|||
|
|
|||
|
|
|||
|
class truncatedpoisson_gen(rv_discrete):
    """Left-truncated Poisson discrete random variable.

    Shape parameters are the Poisson mean ``mu`` and ``truncation``, the
    largest excluded count.  The support starts at ``truncation + 1``;
    ``truncation = -1`` means that nothing is truncated.
    """
    # TODO: need cdf, and rvs

    def _argcheck(self, mu, truncation):
        # Setting ``self.a`` here does not work: a vector-valued bound
        # breaks some generic methods, so the lower bound is provided by
        # ``_get_support`` instead.
        return (mu >= 0) & (truncation >= -1)

    def _get_support(self, mu, truncation):
        return truncation + 1, self.b

    def _logpmf(self, x, mu, truncation):
        """Return the Poisson log-pmf renormalized to the truncated support.

        The normalizing constant is ``P(X > truncation)``, taken from the
        Poisson survival function.  This is evaluated element-wise, so
        array-valued ``truncation`` is handled per element, and
        ``truncation = -1`` gives a constant of one (no truncation).
        Points below the support are not masked here.
        """
        tail_prob = poisson.sf(truncation, mu)
        # A tail mass that underflows to zero maps to -inf without a
        # divide-by-zero warning.
        with np.errstate(divide="ignore"):
            log_tail_prob = np.log(tail_prob)
        return poisson.logpmf(x, mu) - log_tail_prob

    def _pmf(self, x, mu, truncation):
        return np.exp(self._logpmf(x, mu, truncation))


truncatedpoisson = truncatedpoisson_gen(name='truncatedpoisson',
                                        longname='Truncated Poisson')
|
|||
|
|
|||
|
class truncatednegbin_gen(rv_discrete):
    """Left-truncated generalized negative binomial (NB-P) random variable.

    Shape parameters are ``mu``, ``alpha`` and ``p`` of the negative
    binomial part (mapped to scipy's ``nbinom`` by ``convert_params``)
    and ``truncation``, the largest excluded count.  The support starts
    at ``truncation + 1``; ``truncation = -1`` means no truncation.
    """

    def _argcheck(self, mu, alpha, p, truncation):
        return (mu >= 0) & (truncation >= -1)

    def _get_support(self, mu, alpha, p, truncation):
        return truncation + 1, self.b

    def _logpmf(self, x, mu, alpha, p, truncation):
        """Return the nbinom log-pmf renormalized to the truncated support.

        The normalizing constant is ``P(X > truncation)``, taken from the
        negative binomial survival function.  This is evaluated
        element-wise, so array-valued ``truncation`` is handled per
        element, and ``truncation = -1`` gives a constant of one.
        Points below the support are not masked here.
        """
        size, prob = self.convert_params(mu, alpha, p)
        tail_prob = nbinom.sf(truncation, size, prob)
        # A tail mass that underflows to zero maps to -inf without a
        # divide-by-zero warning.
        with np.errstate(divide="ignore"):
            log_tail_prob = np.log(tail_prob)
        return nbinom.logpmf(x, size, prob) - log_tail_prob

    def _pmf(self, x, mu, alpha, p, truncation):
        return np.exp(self._logpmf(x, mu, alpha, p, truncation))

    def convert_params(self, mu, alpha, p):
        """Map ``(mu, alpha, p)`` to the ``(n, p)`` arguments of ``nbinom``.

        Returns the tuple ``(size, prob)`` with
        ``size = mu**(2 - p) / alpha`` and ``prob = size / (size + mu)``.
        """
        size = 1. / alpha * mu**(2 - p)
        prob = size / (size + mu)
        return (size, prob)


truncatednegbin = truncatednegbin_gen(
    name='truncatednegbin',
    longname='Truncated Generalized Negative Binomial')
|
|||
|
|
|||
|
class DiscretizedCount(rv_discrete):
    """Count distribution based on discretized distribution

    Parameters
    ----------
    distr : distribution instance
        Continuous distribution that is discretized.  The code uses its
        ``shapes`` and ``_ctor_param`` attributes and its ``rvs``, ``sf``,
        ``cdf``, ``ppf`` and ``isf`` methods.
    d_offset : float
        Offset for integer interval, default is zero.
        The discrete random variable is ``y = floor(x + offset)`` where x is
        the continuous random variable.
        Warning: not verified for all methods.
    add_scale : bool
        If True (default), then the scale of the base distribution is added
        as parameter for the discrete distribution. The scale parameter is in
        the last position.
        If False, the discrete distribution has the same shape parameters
        as the base distribution and the scale is fixed at one.
    kwds : keyword arguments
        The extra keyword arguments are delegated to the ``__init__`` of
        the super class.
        Their usage has not been checked, e.g. currently the support of the
        distribution is assumed to be all non-negative integers.

    Notes
    -----
    `loc` argument is currently not supported, scale is not available for
    discrete distributions in scipy. The scale parameter of the underlying
    continuous distribution is the last shape parameter in this
    DiscretizedCount distribution if ``add_scale`` is True.

    The implementation was based mainly on [1]_ and [2]_. However, many new
    discrete distributions have been developed based on the approach that we
    use here. Note, that in many cases authors reparameterize the distribution,
    while this class inherits the parameterization from the underlying
    continuous distribution.

    References
    ----------
    .. [1] Chakraborty, Subrata, and Dhrubajyoti Chakravarty. "Discrete gamma
       distributions: Properties and parameter estimations." Communications in
       Statistics-Theory and Methods 41, no. 18 (2012): 3301-3324.

    .. [2] Alzaatreh, Ayman, Carl Lee, and Felix Famoye. 2012. “On the Discrete
       Analogues of Continuous Distributions.” Statistical Methodology 9 (6):
       589–603.

    """

    def __new__(cls, *args, **kwds):
        # rv_discrete.__new__ does not allow `kwds`, skip it
        # only does dispatch to multinomial
        return super(rv_discrete, cls).__new__(cls)

    def __init__(self, distr, d_offset=0, add_scale=True, **kwds):
        # kwds are extras in rv_discrete
        self.distr = distr
        self.d_offset = d_offset
        # NOTE(review): rv_discrete.__init__ appears to rebuild
        # ``_ctor_param`` from its own arguments - confirm whether this
        # assignment is still needed.
        self._ctor_param = distr._ctor_param
        self.add_scale = add_scale
        if distr.shapes is not None:
            self.k_shapes = len(distr.shapes.split(","))
            if add_scale:
                kwds.update({"shapes": distr.shapes + ", s"})
                self.k_shapes += 1
            else:
                # ``_pmf`` only has ``*args``, so the shape names cannot
                # be inferred from its signature and must be passed on
                # explicitly.
                kwds.update({"shapes": distr.shapes})
        else:
            # no shape parameters in underlying distribution
            if add_scale:
                kwds.update({"shapes": "s"})
                self.k_shapes = 1
            else:
                self.k_shapes = 0

        super().__init__(**kwds)

    def _updated_ctor_param(self):
        # Extend the constructor arguments reported by the super class
        # with every argument of ``__init__`` above, so that an instance
        # rebuilt from this dict keeps its offset and scale handling.
        dic = super()._updated_ctor_param()
        dic["distr"] = self.distr
        dic["d_offset"] = self.d_offset
        dic["add_scale"] = self.add_scale
        return dic

    def _unpack_args(self, args):
        # Split the shape arguments into those of the base distribution
        # and its scale; the scale is fixed at 1 without ``add_scale``.
        if self.add_scale:
            scale = args[-1]
            args = args[:-1]
        else:
            scale = 1
        return args, scale

    def _rvs(self, *args, size=None, random_state=None):
        args, scale = self._unpack_args(args)
        if size is None:
            size = getattr(self, "_size", 1)
        # NOTE(review): this uses ``trunc`` while the class docstring
        # describes ``floor``; both agree only for non-negative values.
        rv = np.trunc(self.distr.rvs(*args, scale=scale, size=size,
                                     random_state=random_state) +
                      self.d_offset)
        return rv

    def _pmf(self, x, *args):
        distr = self.distr
        if self.d_offset != 0:
            x = x + self.d_offset

        args, scale = self._unpack_args(args)

        # Probability mass of the base distribution on (x, x + 1].
        p = (distr.sf(x, *args, scale=scale) -
             distr.sf(x + 1, *args, scale=scale))
        return p

    def _cdf(self, x, *args):
        distr = self.distr
        args, scale = self._unpack_args(args)
        if self.d_offset != 0:
            x = x + self.d_offset
        p = distr.cdf(x + 1, *args, scale=scale)
        return p

    def _sf(self, x, *args):
        distr = self.distr
        args, scale = self._unpack_args(args)
        if self.d_offset != 0:
            x = x + self.d_offset
        p = distr.sf(x + 1, *args, scale=scale)
        return p

    def _ppf(self, p, *args):
        distr = self.distr
        args, scale = self._unpack_args(args)

        qc = distr.ppf(p, *args, scale=scale)
        if self.d_offset != 0:
            qc = qc + self.d_offset
        # Shrink by a relative 1e-15 before flooring.
        # NOTE(review): presumably so that a quantile that lands exactly
        # on an integer is assigned to the count below - confirm.
        q = np.floor(qc * (1 - 1e-15))
        return q

    def _isf(self, p, *args):
        distr = self.distr
        args, scale = self._unpack_args(args)

        qc = distr.isf(p, *args, scale=scale)
        if self.d_offset != 0:
            qc = qc + self.d_offset
        # Same shrink-then-floor step as in ``_ppf``.
        q = np.floor(qc * (1 - 1e-15))
        return q
|
|||
|
|
|||
|
|
|||
|
class DiscretizedModel(GenericLikelihoodModel):
    """Experimental model to fit a discretized distribution.

    Count models based on discretized distributions can be used to model
    data that is under- or over-dispersed relative to Poisson or that has
    heavier tails.

    Parameters
    ----------
    endog : array_like, 1-D
        Univariate data for fitting the distribution.
    exog : None
        Explanatory variables are not supported; any other value raises
        a ValueError. The ``exog`` argument is only included for
        consistency in the signature across models.
    distr : DiscretizedCount instance
        (required) Instance of a DiscretizedCount distribution. Its
        ``k_shapes`` attribute determines the number of parameters.

    See Also
    --------
    DiscretizedCount

    Examples
    --------
    >>> from scipy import stats
    >>> from statsmodels.distributions.discrete import (
    ...     DiscretizedCount, DiscretizedModel)

    >>> dd = DiscretizedCount(stats.gamma)
    >>> mod = DiscretizedModel(y, distr=dd)
    >>> res = mod.fit()
    >>> probs = res.predict(which="probs", k_max=5)

    """

    def __init__(self, endog, exog=None, distr=None):
        if exog is not None:
            raise ValueError("exog is not supported")

        super().__init__(endog, exog, distr=distr)
        self._init_keys.append('distr')

        # All model parameters are shape parameters of the distribution.
        n_shapes = distr.k_shapes
        self.df_resid = len(endog) - n_shapes
        self.df_model = 0
        self.k_extra = n_shapes  # no constant subtracted
        self.k_constant = 0
        self.nparams = n_shapes  # needed for start_params
        self.start_params = np.full(self.nparams, 0.5)

    def loglike(self, params):
        """Return the log-likelihood, summed over all observations.

        There is no ``exog`` yet, so the model ``params`` are passed
        unchanged as the distribution arguments. A regression model would
        need a conversion step here.
        """
        probs = self.distr._pmf(self.endog, *params)
        return np.log(probs).sum()

    def predict(self, params, exog=None, which=None, k_max=20):
        """Return the predicted probabilities for counts ``0, ..., k_max - 1``.

        Only ``which="probs"`` is implemented; any other value of ``which``
        and any ``exog`` other than None raise a ValueError.
        """
        if exog is not None:
            raise ValueError("exog is not supported")

        if which != "probs":
            raise ValueError('only which="probs" is currently implemented')

        counts = np.arange(k_max)
        return self.distr.pmf(counts, *params)

    def get_distr(self, params):
        """frozen distribution instance of the discrete distribution.
        """
        return self.distr(*params)
|