AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/discrete/diagnostic.py

"""
Created on Wed Nov 18 15:17:58 2020

Author: Josef Perktold
License: BSD-3

"""

import warnings

import numpy as np

from statsmodels.tools.decorators import cache_readonly

from statsmodels.stats.diagnostic_gen import (
    test_chisquare_binning
    )
from statsmodels.discrete._diagnostics_count import (
    test_poisson_dispersion,
    # _test_poisson_dispersion_generic,
    test_poisson_zeroinflation_jh,
    test_poisson_zeroinflation_broek,
    test_poisson_zeros,
    test_chisquare_prob,
    plot_probs
    )


class CountDiagnostic:
    """Diagnostic and specification tests and plots for count models.

    status: experimental

    Parameters
    ----------
    results : Results instance of a count model.
    y_max : int
        Largest count to include when computing predicted probabilities for
        counts. Default is the largest observed count.

    """

    def __init__(self, results, y_max=None):
        self.y_max = y_max
        self.results = results

    @cache_readonly
    def probs_predicted(self):
        """Predicted probabilities from ``results.predict(which="prob")``.

        If ``y_max`` was provided, probabilities are requested for the
        counts ``0, ..., y_max``. Otherwise ``predict`` is called without
        ``y_values`` and its own default set of counts is used.
        """
        predict_kwds = {}
        if self.y_max is not None:
            predict_kwds["y_values"] = np.arange(self.y_max + 1)
        return self.results.predict(which="prob", **predict_kwds)

    def test_chisquare_prob(self, bin_edges=None, method=None):
        """Moment test for binned probabilities using OPG.

        Parameters
        ----------
        bin_edges : array_like or None
            This defines which counts are included in the test on frequencies
            and how counts are combined in bins.
            The default if bin_edges is None will change in future.
            See Notes and Examples sections below.
        method : str
            Currently only `method = "opg"` is available.
            If method is None, the OPG will be used, but the default might
            change in future versions.
            See Notes section below.

        Returns
        -------
        test result

        Notes
        -----
        Warning: The current default can have many empty or nearly empty bins.
        The default number of bins is given by max(endog).
        Currently it is recommended to limit the number of bins explicitly,
        see Examples below.
        Binning will change in future and automatic binning will be added.

        Currently only the outer product of gradient, OPG, method is
        implemented. In many cases, the OPG version of a specification test
        overrejects in small samples.
        Specialized tests that use observed or expected information matrix
        often have better small sample properties.
        The default method will change if better methods are added.

        Examples
        --------
        The following call is a test for the probability of zeros
        `test_chisquare_prob(bin_edges=np.arange(3))`

        `test_chisquare_prob(bin_edges=np.arange(10))` tests the hypothesis
        that the frequencies for counts up to 7 correspond to the estimated
        Poisson distributions.
        In this case, edges are 0, ..., 9 which defines 9 bins for
        counts 0 to 8. The last bin is dropped, so the joint test hypothesis is
        that the observed aggregated frequencies for counts 0 to 7 correspond
        to the model prediction for those frequencies. Predicted probabilities
        Prob(y_i = k | x) are aggregated over observations ``i``.

        """
        if bin_edges is None:
            # No explicit bins: let predict choose the counts it evaluates.
            probs = self.results.predict(which="prob")
        else:
            # TODO: verify upper bound, we drop last bin (may be open, inf)
            # Probabilities are needed for counts 0 up to the second to last
            # edge, because the last bin is not part of the test.
            support = np.arange(bin_edges[-2] + 1)
            probs = self.results.predict(which="prob", y_values=support)

        return test_chisquare_prob(self.results, probs, bin_edges=bin_edges,
                                   method=method)

    def plot_probs(self, label='predicted', upp_xlim=None,
                   fig=None):
        """Plot observed versus predicted frequencies for entire sample.

        Parameters
        ----------
        label : str
            Forwarded as ``label`` to the module-level ``plot_probs`` helper.
        upp_xlim : None or number
            Forwarded as ``upp_xlim`` to the ``plot_probs`` helper.
        fig : None or figure
            Forwarded as ``fig`` to the ``plot_probs`` helper.

        Returns
        -------
        fig
            The object returned by the ``plot_probs`` helper.
        """
        # Summing over axis 0 aggregates the predicted probabilities into
        # one predicted frequency per count.
        expected_freq = self.probs_predicted.sum(0)
        n_counts = len(expected_freq)

        # Observed frequencies, padded or truncated to the same set of counts
        # as the predicted frequencies.
        observed = self.results.model.endog.astype(int)
        observed_freq = np.bincount(observed, minlength=n_counts)[:n_counts]

        return plot_probs(observed_freq, expected_freq,
                          label=label, upp_xlim=upp_xlim,
                          fig=fig)


class PoissonDiagnostic(CountDiagnostic):
    """Diagnostic and specification tests and plots for Poisson model

    status: experimental

    Parameters
    ----------
    results : PoissonResults instance
    y_max : int
        Largest count to include when computing predicted probabilities for
        counts. Default is the largest observed count.

    """

    def __init__(self, results, y_max=None):
        # Delegate to CountDiagnostic so that both ``results`` and ``y_max``
        # are stored; ``y_max`` is read by the inherited ``probs_predicted``.
        super().__init__(results, y_max=y_max)

    def test_dispersion(self):
        """Test for excess (over or under) dispersion in Poisson.

        Returns
        -------
        dispersion results
        """
        return test_poisson_dispersion(self.results)

    def test_poisson_zeroinflation(self, method="prob", exog_infl=None):
        """Test for excess zeros, zero inflation or deflation.

        Parameters
        ----------
        method : str
            Two methods are available for the test:

             - "prob" : moment test for the probability of zeros
             - "broek" : score test against zero inflation with or without
                explanatory variables for inflation

        exog_infl : array_like or None
            Optional explanatory variables under the alternative of zero
            inflation, or deflation. Only used if method is "broek".
            A one-dimensional array is treated as a single column.

        Returns
        -------
        results

        Raises
        ------
        ValueError
            If ``method`` is neither "prob" nor "broek".

        Notes
        -----
        If method = "prob", then the moment test of He et al 1_ is used based
        on the explicit formula in Tang and Tang 2_.

        If method = "broek" and exog_infl is None, then the test by Van den
        Broek 3_ is used. This is a score test against an alternative of
        constant zero inflation or deflation.

        If method = "broek" and exog_infl is provided, then the extension of
        the broek test to varying zero inflation or deflation by Jansakul and
        Hinde is used.

        Warning: The Broek and the Jansakul and Hinde tests are not numerically
        stable when the probability of zeros in Poisson is small, i.e. if the
        conditional means of the estimated Poisson distribution are large.
        In these cases, p-values will not be accurate.
        """
        if method == "prob":
            if exog_infl is not None:
                warnings.warn('exog_infl is only used if method = "broek"')
            res = test_poisson_zeros(self.results)
        elif method == "broek":
            if exog_infl is None:
                res = test_poisson_zeroinflation_broek(self.results)
            else:
                exog_infl = np.asarray(exog_infl)
                if exog_infl.ndim == 1:
                    # promote a single regressor to a 2-dim column
                    exog_infl = exog_infl[:, None]
                res = test_poisson_zeroinflation_jh(self.results,
                                                    exog_infl=exog_infl)
        else:
            # Without this branch ``res`` would be unbound and the return
            # statement would fail with an uninformative UnboundLocalError.
            raise ValueError('method must be "prob" or "broek"')

        return res

    def _chisquare_binned(self, sort_var=None, bins=10, k_max=None, df=None,
                          sort_method="quicksort", frac_upp=0.1,
                          alpha_nc=0.05):
        """Hosmer-Lemeshow style test for count data.

        Note, this does not take into account that parameters are estimated.
        The distribution of the test statistic is only an approximation.

        This corresponds to the Hosmer-Lemeshow type test for an ordinal
        response variable. The outcome space y = k is partitioned into bins
        and treated as ordinal variable.
        The observations are split into approximately equal sized groups
        of observations sorted according the ``sort_var``.

        Parameters
        ----------
        sort_var : array_like or None
            Variable forwarded as ``sort_var`` to ``test_chisquare_binning``.
            If None, then ``results.predict(which="lin")`` is used.
        bins : int
            Forwarded as ``bins`` to ``test_chisquare_binning``.
        k_max : int or None
            Number of leading count columns (counts 0, ..., k_max - 1) that
            are kept. The last kept column absorbs the truncated upper tail.
            If None, then ``k_max`` is derived from ``frac_upp``.
        df : int or None
            Forwarded as ``df`` to ``test_chisquare_binning``.
        sort_method : str
            Forwarded as ``sort_method`` to ``test_chisquare_binning``.
        frac_upp : float
            Only used if ``k_max`` is None. ``k_max`` is then one less than
            the first count ``k`` for which ``nobs`` minus the cumulative
            observed frequency of counts up to ``k`` is smaller than
            ``nobs * frac_upp``.
        alpha_nc : float
            Forwarded as ``alpha_nc`` to ``test_chisquare_binning``.

        Returns
        -------
        result of ``test_chisquare_binning``

        """

        if sort_var is None:
            sort_var = self.results.predict(which="lin")

        endog = self.results.model.endog
        # not sure yet how this is supposed to work
        # max_count = endog.max * 2
        # no option for max count in predict
        # counts = (endog == np.arange(max_count)).astype(int)
        expected = self.results.predict(which="prob")
        # indicator matrix, entry (i, k) is 1 if endog[i] == k
        counts = (endog[:, None] == np.arange(expected.shape[1])).astype(int)

        # truncate upper tail
        if k_max is None:
            nobs = len(endog)
            # nobs minus cumulative observed frequency of counts up to k
            icumcounts_sum = nobs - counts.sum(0).cumsum(0)
            k_max = np.argmax(icumcounts_sum < nobs * frac_upp) - 1
        expected = expected[:, :k_max]
        counts = counts[:, :k_max]
        # we should correct for or include truncated upper bin
        # inplace modification, we cannot reuse expected and counts anymore
        # The last kept column receives the remainder so each row sums to 1.
        expected[:, -1] += 1 - expected.sum(1)
        counts[:, -1] += 1 - counts.sum(1)

        # TODO: what's the correct df, same as for multinomial/ordered ?
        res = test_chisquare_binning(counts, expected, sort_var=sort_var,
                                     bins=bins, df=df, ordered=True,
                                     sort_method=sort_method,
                                     alpha_nc=alpha_nc)
        return res