251 lines
8.9 KiB
Python
251 lines
8.9 KiB
Python
"""
|
|
Created on Wed Nov 18 15:17:58 2020
|
|
|
|
Author: Josef Perktold
|
|
License: BSD-3
|
|
|
|
"""
|
|
|
|
import warnings
|
|
|
|
import numpy as np
|
|
|
|
from statsmodels.tools.decorators import cache_readonly
|
|
|
|
from statsmodels.stats.diagnostic_gen import (
|
|
test_chisquare_binning
|
|
)
|
|
from statsmodels.discrete._diagnostics_count import (
|
|
test_poisson_dispersion,
|
|
# _test_poisson_dispersion_generic,
|
|
test_poisson_zeroinflation_jh,
|
|
test_poisson_zeroinflation_broek,
|
|
test_poisson_zeros,
|
|
test_chisquare_prob,
|
|
plot_probs
|
|
)
|
|
|
|
|
|
class CountDiagnostic:
    """Diagnostic and specification tests and plots for Count model

    status: experimental

    Parameters
    ----------
    results : Results instance of a count model.
    y_max : int
        Largest count to include when computing predicted probabilities for
        counts. Default is the largest observed count.

    """

    def __init__(self, results, y_max=None):
        self.results, self.y_max = results, y_max

    @cache_readonly
    def probs_predicted(self):
        """Predicted probabilities from ``results.predict(which="prob")``.

        If ``y_max`` was given, the counts ``0, ..., y_max`` are passed to
        ``predict`` as ``y_values``; otherwise ``predict`` is called without
        ``y_values`` and uses its own default.
        """
        if self.y_max is None:
            return self.results.predict(which="prob")
        count_values = np.arange(self.y_max + 1)
        return self.results.predict(which="prob", y_values=count_values)

    def test_chisquare_prob(self, bin_edges=None, method=None):
        """Moment test for binned probabilities using OPG.

        Parameters
        ----------
        bin_edges : array_like or None
            This defines which counts are included in the test on frequencies
            and how counts are combined in bins.
            The default if bin_edges is None will change in future.
            See Notes and Example sections below.
        method : str
            Currently only `method = "opg"` is available.
            If method is None, the OPG will be used, but the default might
            change in future versions.
            See Notes section below.

        Returns
        -------
        test result

        Notes
        -----
        Warning: The current default can have many empty or nearly empty bins.
        The default number of bins is given by max(endog).
        Currently it is recommended to limit the number of bins explicitly,
        see Examples below.
        Binning will change in future and automatic binning will be added.

        Currently only the outer product of gradient, OPG, method is
        implemented. In many cases, the OPG version of a specification test
        overrejects in small samples.
        Specialized tests that use observed or expected information matrix
        often have better small sample properties.
        The default method will change if better methods are added.

        Examples
        --------
        The following call is a test for the probability of zeros
        `test_chisquare_prob(bin_edges=np.arange(3))`

        `test_chisquare_prob(bin_edges=np.arange(10))` tests the hypothesis
        that the frequencies for counts up to 7 correspond to the estimated
        Poisson distributions.
        In this case, edges are 0, ..., 9 which defines 9 bins for
        counts 0 to 8. The last bin is dropped, so the joint test hypothesis is
        that the observed aggregated frequencies for counts 0 to 7 correspond
        to the model prediction for those frequencies. Predicted probabilities
        Prob(y_i = k | x) are aggregated over observations ``i``.

        """
        if bin_edges is None:
            predict_kwds = {}
        else:
            # TODO: verify upper bound, we drop last bin (may be open, inf)
            upper_count = bin_edges[-2] + 1
            predict_kwds = {"y_values": np.arange(upper_count)}

        probs = self.results.predict(which="prob", **predict_kwds)
        # Inside a method body this name resolves to the module-level
        # function imported at the top of the file, not to this method.
        return test_chisquare_prob(
            self.results, probs, bin_edges=bin_edges, method=method
        )

    def plot_probs(self, label='predicted', upp_xlim=None,
                   fig=None):
        """Plot observed versus predicted frequencies for entire sample.

        The cached ``probs_predicted`` are summed over axis 0 and compared
        with the ``np.bincount`` of the integer-cast ``endog``, truncated or
        zero-padded to the same length. ``label``, ``upp_xlim`` and ``fig``
        are forwarded unchanged to the module-level ``plot_probs`` helper,
        whose return value is returned.
        """
        predicted_freq = self.probs_predicted.sum(0)
        n_counts = len(predicted_freq)

        endog_int = self.results.model.endog.astype(int)
        # minlength pads short histograms, the slice trims long ones, so
        # observed and predicted frequencies always have equal length.
        observed_freq = np.bincount(endog_int, minlength=n_counts)[:n_counts]

        return plot_probs(
            observed_freq,
            predicted_freq,
            label=label,
            upp_xlim=upp_xlim,
            fig=fig,
        )
|
|
|
|
|
|
class PoissonDiagnostic(CountDiagnostic):
    """Diagnostic and specification tests and plots for Poisson model

    status: experimental

    Parameters
    ----------
    results : PoissonResults instance
    y_max : int, optional
        Forwarded to ``CountDiagnostic``; largest count to include when
        computing predicted probabilities for counts.

    """

    def __init__(self, results, y_max=None):
        # The original defined ``_init__`` (single leading underscore), which
        # Python never calls as a constructor, so the inherited
        # ``CountDiagnostic.__init__`` was what actually ran. Delegating
        # explicitly keeps that effective behavior (including ``y_max``,
        # which ``probs_predicted`` requires) and removes the dead method.
        super().__init__(results, y_max=y_max)

    def test_dispersion(self):
        """Test for excess (over or under) dispersion in Poisson.

        Returns
        -------
        dispersion results
        """
        return test_poisson_dispersion(self.results)

    def test_poisson_zeroinflation(self, method="prob", exog_infl=None):
        """Test for excess zeros, zero inflation or deflation.

        Parameters
        ----------
        method : str
            Two methods are available for the test:

             - "prob" : moment test for the probability of zeros
             - "broek" : score test against zero inflation with or without
                explanatory variables for inflation

        exog_infl : array_like or None
            Optional explanatory variables under the alternative of zero
            inflation, or deflation. Only used if method is "broek".
            A one-dimensional input is treated as a single column.

        Returns
        -------
        results

        Raises
        ------
        ValueError
            If ``method`` is neither "prob" nor "broek".

        Notes
        -----
        If method = "prob", then the moment test of He et al is used based
        on the explicit formula in Tang and Tang.

        If method = "broek" and exog_infl is None, then the test by Van den
        Broek is used. This is a score test against an alternative of
        constant zero inflation or deflation.

        If method = "broek" and exog_infl is provided, then the extension of
        the broek test to varying zero inflation or deflation by Jansakul and
        Hinde is used.

        Warning: The Broek and the Jansakul and Hinde tests are not numerically
        stable when the probability of zeros in Poisson is small, i.e. if the
        conditional means of the estimated Poisson distribution are large.
        In these cases, p-values will not be accurate.
        """
        if method == "prob":
            if exog_infl is not None:
                warnings.warn('exog_infl is only used if method = "broek"')
            res = test_poisson_zeros(self.results)
        elif method == "broek":
            if exog_infl is None:
                res = test_poisson_zeroinflation_broek(self.results)
            else:
                exog_infl = np.asarray(exog_infl)
                if exog_infl.ndim == 1:
                    # promote a single regressor to a column
                    exog_infl = exog_infl[:, None]
                res = test_poisson_zeroinflation_jh(self.results,
                                                    exog_infl=exog_infl)
        else:
            # Previously an unknown method fell through to ``return res``
            # and failed with an UnboundLocalError.
            raise ValueError('method must be "prob" or "broek"')

        return res

    def _chisquare_binned(self, sort_var=None, bins=10, k_max=None, df=None,
                          sort_method="quicksort", frac_upp=0.1,
                          alpha_nc=0.05):
        """Hosmer-Lemeshow style test for count data.

        Note, this does not take into account that parameters are estimated.
        The distribution of the test statistic is only an approximation.

        This corresponds to the Hosmer-Lemeshow type test for an ordinal
        response variable. The outcome space y = k is partitioned into bins
        and treated as ordinal variable.
        The observations are split into approximately equal sized groups
        of observations sorted according the ``sort_var``.

        Parameters
        ----------
        sort_var : array_like or None
            Variable passed to ``test_chisquare_binning`` for sorting. If
            None, ``results.predict(which="lin")`` is used.
        bins, df, sort_method, alpha_nc
            Forwarded unchanged to ``test_chisquare_binning``.
        k_max : int or None
            Number of leading count columns to keep. If None, it is derived
            from ``frac_upp``.
        frac_upp : float
            Only used if ``k_max`` is None. The cut point is taken from the
            first count at which the number of observations with a larger
            count falls below ``nobs * frac_upp``.

        Returns
        -------
        Result of ``test_chisquare_binning``.

        """

        if sort_var is None:
            sort_var = self.results.predict(which="lin")

        endog = self.results.model.endog
        # not sure yet how this is supposed to work
        # no option for max count in predict, so the number of columns of
        # the predicted probabilities defines the range of counts
        expected = self.results.predict(which="prob")
        # one-hot indicator of the observed count for each observation
        counts = (endog[:, None] == np.arange(expected.shape[1])).astype(int)

        # truncate upper tail
        if k_max is None:
            nobs = len(endog)
            # number of observations with a count larger than column index
            icumcounts_sum = nobs - counts.sum(0).cumsum(0)
            # NOTE(review): if the threshold is already crossed at index 0
            # or 1, k_max becomes -1 or 0 and the slices below do not select
            # the intended columns - confirm how this should be handled.
            k_max = np.argmax(icumcounts_sum < nobs * frac_upp) - 1
        expected = expected[:, :k_max]
        counts = counts[:, :k_max]
        # we should correct for or include truncated upper bin
        # inplace modification, we cannot reuse expected and counts anymore
        # the last kept column absorbs the truncated mass so rows sum to 1
        expected[:, -1] += 1 - expected.sum(1)
        counts[:, -1] += 1 - counts.sum(1)

        # TODO: what's the correct df, same as for multinomial/ordered ?
        res = test_chisquare_binning(counts, expected, sort_var=sort_var,
                                     bins=bins, df=df, ordered=True,
                                     sort_method=sort_method,
                                     alpha_nc=alpha_nc)
        return res
|