AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/discrete/diagnostic.py
2024-10-02 22:15:59 +04:00

251 lines
8.9 KiB
Python

"""
Created on Wed Nov 18 15:17:58 2020
Author: Josef Perktold
License: BSD-3
"""
import warnings
import numpy as np
from statsmodels.tools.decorators import cache_readonly
from statsmodels.stats.diagnostic_gen import (
test_chisquare_binning
)
from statsmodels.discrete._diagnostics_count import (
test_poisson_dispersion,
# _test_poisson_dispersion_generic,
test_poisson_zeroinflation_jh,
test_poisson_zeroinflation_broek,
test_poisson_zeros,
test_chisquare_prob,
plot_probs
)
class CountDiagnostic:
    """Diagnostic and specification tests and plots for count models.

    status: experimental

    Parameters
    ----------
    results : Results instance of a count model.
    y_max : int or None
        Largest count to include when computing predicted probabilities for
        counts. If None, no ``y_values`` are passed to ``results.predict``
        and its own default range is used.
    """

    def __init__(self, results, y_max=None):
        self.y_max = y_max
        self.results = results

    @cache_readonly
    def probs_predicted(self):
        """Predicted probabilities from ``results.predict(which="prob")``.

        When ``y_max`` was given, probabilities are requested for the counts
        ``0, 1, ..., y_max``.
        """
        predict_kwargs = {}
        if self.y_max is not None:
            predict_kwargs["y_values"] = np.arange(self.y_max + 1)
        return self.results.predict(which="prob", **predict_kwargs)

    def test_chisquare_prob(self, bin_edges=None, method=None):
        """Moment test for binned probabilities using OPG.

        Parameters
        ----------
        bin_edges : array_like or None
            This defines which counts are included in the test on
            frequencies and how counts are combined in bins.
            The default if bin_edges is None will change in future.
            See Notes and Examples sections below.
        method : str
            Currently only `method = "opg"` is available.
            If method is None, the OPG will be used, but the default might
            change in future versions.
            See Notes section below.

        Returns
        -------
        test result

        Notes
        -----
        Warning: The current default can have many empty or nearly empty
        bins. The default number of bins is given by max(endog).
        Currently it is recommended to limit the number of bins explicitly,
        see Examples below.
        Binning will change in future and automatic binning will be added.

        Currently only the outer product of gradient, OPG, method is
        implemented. In many cases, the OPG version of a specification test
        overrejects in small samples.
        Specialized tests that use observed or expected information matrix
        often have better small sample properties.
        The default method will change if better methods are added.

        Examples
        --------
        The following call is a test for the probability of zeros
        `test_chisquare_prob(bin_edges=np.arange(3))`

        `test_chisquare_prob(bin_edges=np.arange(10))` tests the hypothesis
        that the frequencies for counts up to 7 correspond to the estimated
        Poisson distributions.
        In this case, edges are 0, ..., 9 which defines 9 bins for
        counts 0 to 8. The last bin is dropped, so the joint test hypothesis
        is that the observed aggregated frequencies for counts 0 to 7
        correspond to the model prediction for those frequencies. Predicted
        probabilities Prob(y_i = k | x) are aggregated over observations
        ``i``.
        """
        predict_kwargs = {}
        if bin_edges is not None:
            # TODO: verify upper bound, we drop last bin (may be open, inf)
            n_values = bin_edges[-2] + 1
            predict_kwargs["y_values"] = np.arange(n_values)

        predicted = self.results.predict(which="prob", **predict_kwargs)
        # Inside a method body this name resolves to the module-level helper,
        # not to this method.
        return test_chisquare_prob(
            self.results,
            predicted,
            bin_edges=bin_edges,
            method=method,
        )

    def plot_probs(self, label='predicted', upp_xlim=None,
                   fig=None):
        """Plot observed versus predicted frequencies for entire sample.

        Parameters
        ----------
        label : str
            Forwarded to the module-level ``plot_probs`` helper.
        upp_xlim : optional
            Forwarded to the module-level ``plot_probs`` helper.
        fig : optional
            Forwarded to the module-level ``plot_probs`` helper.

        Returns
        -------
        The return value of the module-level ``plot_probs`` helper.
        """
        # Sum the per-observation probabilities over axis 0.
        predicted_total = self.probs_predicted.sum(0)
        n_counts = len(predicted_total)

        # Observed frequency of each count, padded/truncated to the same
        # length as the predicted totals.
        observed = self.results.model.endog.astype(int)
        freq = np.bincount(observed, minlength=n_counts)[:n_counts]

        return plot_probs(
            freq,
            predicted_total,
            label=label,
            upp_xlim=upp_xlim,
            fig=fig,
        )
class PoissonDiagnostic(CountDiagnostic):
    """Diagnostic and specification tests and plots for Poisson model

    status: experimental

    Parameters
    ----------
    results : PoissonResults instance
    y_max : int or None
        Passed through unchanged to ``CountDiagnostic.__init__``.
    """

    def __init__(self, results, y_max=None):
        # This method used to be spelled ``_init__`` and was therefore never
        # called; instances were always built by the inherited constructor.
        # Delegating keeps that behaviour (including ``y_max``) while making
        # the constructor explicit.
        super().__init__(results, y_max=y_max)

    def test_dispersion(self):
        """Test for excess (over or under) dispersion in Poisson.

        Returns
        -------
        dispersion results
        """
        return test_poisson_dispersion(self.results)

    def test_poisson_zeroinflation(self, method="prob", exog_infl=None):
        """Test for excess zeros, zero inflation or deflation.

        Parameters
        ----------
        method : str
            Two methods are available for the test:

             - "prob" : moment test for the probability of zeros
             - "broek" : score test against zero inflation with or without
               explanatory variables for inflation

        exog_infl : array_like or None
            Optional explanatory variables under the alternative of zero
            inflation, or deflation. Only used if method is "broek".
            A 1-dim array is converted to a single column.

        Returns
        -------
        results

        Raises
        ------
        ValueError
            If ``method`` is neither "prob" nor "broek".

        Notes
        -----
        If method = "prob", then the moment test of He et al. is used based
        on the explicit formula in Tang and Tang.

        If method = "broek" and exog_infl is None, then the test by Van den
        Broek is used. This is a score test against an alternative of
        constant zero inflation or deflation.

        If method = "broek" and exog_infl is provided, then the extension of
        the broek test to varying zero inflation or deflation by Jansakul and
        Hinde is used.

        Warning: The Broek and the Jansakul and Hinde tests are not
        numerically stable when the probability of zeros in Poisson is small,
        i.e. if the conditional means of the estimated Poisson distribution
        are large. In these cases, p-values will not be accurate.
        """
        if method == "prob":
            if exog_infl is not None:
                warnings.warn('exog_infl is only used if method = "broek"')
            res = test_poisson_zeros(self.results)
        elif method == "broek":
            if exog_infl is None:
                res = test_poisson_zeroinflation_broek(self.results)
            else:
                exog_infl = np.asarray(exog_infl)
                if exog_infl.ndim == 1:
                    # a single regressor is treated as one column
                    exog_infl = exog_infl[:, None]
                res = test_poisson_zeroinflation_jh(self.results,
                                                    exog_infl=exog_infl)
        else:
            # Previously an unknown method fell through to ``return res``
            # with ``res`` unbound and failed with an UnboundLocalError.
            raise ValueError(
                'method should be "prob" or "broek", got %r' % (method,)
            )
        return res

    def _chisquare_binned(self, sort_var=None, bins=10, k_max=None, df=None,
                          sort_method="quicksort", frac_upp=0.1,
                          alpha_nc=0.05):
        """Hosmer-Lemeshow style test for count data.

        Note, this does not take into account that parameters are estimated.
        The distribution of the test statistic is only an approximation.

        This corresponds to the Hosmer-Lemeshow type test for an ordinal
        response variable. The outcome space y = k is partitioned into bins
        and treated as ordinal variable.
        The observations are split into approximately equal sized groups
        of observations sorted according the ``sort_var``.

        Parameters
        ----------
        sort_var : array_like or None
            Variable used for sorting observations. If None, then
            ``results.predict(which="lin")`` is used.
        bins : int
            Forwarded to ``test_chisquare_binning``.
        k_max : int or None
            Number of leading count columns to keep; the last kept column
            absorbs the remainder so that each row sums to one.
            If None, it is derived from ``frac_upp``.
        df : optional
            Forwarded to ``test_chisquare_binning``.
        sort_method : str
            Forwarded to ``test_chisquare_binning``.
        frac_upp : float
            Only used if ``k_max`` is None. ``k_max`` is then set to one less
            than the first count index at which the number of observations
            above that count is smaller than ``nobs * frac_upp``.
        alpha_nc : float
            Forwarded to ``test_chisquare_binning``.

        Returns
        -------
        The result of ``test_chisquare_binning``.
        """
        if sort_var is None:
            sort_var = self.results.predict(which="lin")

        endog = self.results.model.endog
        # not sure yet how this is supposed to work
        # max_count = endog.max * 2
        # no option for max count in predict
        # counts = (endog == np.arange(max_count)).astype(int)
        expected = self.results.predict(which="prob")
        # one indicator column per count value covered by ``expected``
        counts = (endog[:, None] == np.arange(expected.shape[1])).astype(int)

        # truncate upper tail
        if k_max is None:
            nobs = len(endog)
            # number of observations strictly above each count value
            icumcounts_sum = nobs - counts.sum(0).cumsum(0)
            # NOTE(review): if the condition already holds at index 0 this
            # yields k_max == -1 and the slices below drop only the last
            # column - confirm that this is intended.
            k_max = np.argmax(icumcounts_sum < nobs * frac_upp) - 1
        expected = expected[:, :k_max]
        counts = counts[:, :k_max]
        # we should correct for or include truncated upper bin
        # inplace modification, we cannot reuse expected and counts anymore
        expected[:, -1] += 1 - expected.sum(1)
        counts[:, -1] += 1 - counts.sum(1)

        # TODO: what's the correct df, same as for multinomial/ordered ?
        res = test_chisquare_binning(counts, expected, sort_var=sort_var,
                                     bins=bins, df=df, ordered=True,
                                     sort_method=sort_method,
                                     alpha_nc=alpha_nc)
        return res