AIM-PIbd-32-Kurbanova-A-A/aimenv/Lib/site-packages/statsmodels/discrete/_diagnostics_count.py
2024-10-02 22:15:59 +04:00

606 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Created on Fri Sep 15 12:53:45 2017
Author: Josef Perktold
"""
import numpy as np
from scipy import stats
import pandas as pd
from statsmodels.stats.base import HolderTuple
from statsmodels.discrete.discrete_model import Poisson
from statsmodels.regression.linear_model import OLS
def _combine_bins(edge_index, x):
    """Group columns into bins by summing them.

    This is mainly a helper function for combining probabilities into cells.
    It is similar to ``np.add.reduceat(x, edge_index, axis=-1)`` except for
    the treatment of the last index and last cell.

    Parameters
    ----------
    edge_index : array_like
        Zero-based column indices that delimit the bins. Each index except
        the last is the starting column of a bin; the bin ends just before
        the next index. At least two indices are required.
    x : 1d or 2d array
        Array whose columns are combined. A 1-dimensional `x` is treated as
        a 2-d row vector.

    Returns
    -------
    x_new : ndarray
        Binned sums. For 1-dimensional `x` the result is 1-dimensional with
        one entry per bin (also when there is only a single bin), otherwise
        the bins are the columns of a 2-d array.
    k_li : ndarray
        Width of each bin, computed as the difference of consecutive edges.
        It can therefore exceed the number of columns actually summed if the
        last edge lies beyond the last column of `x`.

    Raises
    ------
    ValueError
        If `edge_index` has fewer than two entries, i.e. defines no bin.

    Examples
    --------
    >>> _combine_bins([0, 1, 5], np.arange(4))
    (array([0, 6]), array([1, 4]))

    this aggregates to two bins with the sum of 1 and 4 elements

    >>> np.arange(4)[0].sum()
    0
    >>> np.arange(4)[1:5].sum()
    6

    If the rightmost index is smaller than the number of columns, then the
    remaining columns will not be included.

    >>> _combine_bins([0, 1, 3], np.arange(4))
    (array([0, 3]), array([1, 2]))
    """
    x = np.asarray(x)
    is_1d = x.ndim == 1
    if is_1d:
        x = x[None, :]

    if len(edge_index) < 2:
        raise ValueError(
            "edge_index needs at least two indices to define a bin"
        )

    xli = []
    kli = []
    for i, j in zip(edge_index[:-1], edge_index[1:]):
        xli.append(x[:, i:j].sum(1))
        kli.append(j - i)

    x_new = np.column_stack(xli)
    if is_1d:
        # Drop only the artificial row axis. ``squeeze()`` would also
        # collapse the bin axis when there is a single bin and return a
        # 0-d array.
        x_new = x_new[0]
    return x_new, np.asarray(kli)
def plot_probs(freq, probs_predicted, label='predicted', upp_xlim=None,
               fig=None):
    """Diagnostic plots for comparing two lists of discrete probabilities.

    Parameters
    ----------
    freq, probs_predicted : nd_arrays
        Two arrays of probabilities. These can be any probabilities for
        the same events; the defaults are designed for comparing observed
        and predicted probabilities.
    label : str or tuple or list
        If a string, then it is used as the label for `probs_predicted` and
        "freq" is used for the other probabilities.
        If a tuple or list of two strings, then the first is the label for
        `freq` and the second is the label for `probs_predicted`.
    upp_xlim : None or int
        If it is not None, then the xlim of the first two plots are set to
        (0, upp_xlim), otherwise the matplotlib default is used.
    fig : None or matplotlib figure instance
        If fig is provided, then the axes will be added to it in a (3,1)
        subplots, otherwise a matplotlib figure instance is created.

    Returns
    -------
    Figure
        The figure contains 3 subplots with probabilities, cumulative
        probabilities and a PP-plot.
    """
    # Accept tuples as well as lists; the docstring has always advertised
    # tuples, but only lists used to be unpacked.
    if isinstance(label, (list, tuple)):
        label0, label1 = label
    else:
        label0, label1 = 'freq', label

    if fig is None:
        # imported lazily so that matplotlib is only needed when plotting
        # into a new figure
        import matplotlib.pyplot as plt
        fig = plt.figure(figsize=(8, 12))

    ax1 = fig.add_subplot(311)
    ax1.plot(freq, '-o', label=label0)
    ax1.plot(probs_predicted, '-d', label=label1)
    if upp_xlim is not None:
        ax1.set_xlim(0, upp_xlim)
    ax1.legend()
    ax1.set_title('probabilities')

    cum_freq = np.cumsum(freq)
    cum_predicted = np.cumsum(probs_predicted)

    ax2 = fig.add_subplot(312)
    ax2.plot(cum_freq, '-o', label=label0)
    ax2.plot(cum_predicted, '-d', label=label1)
    if upp_xlim is not None:
        ax2.set_xlim(0, upp_xlim)
    ax2.legend()
    ax2.set_title('cumulative probabilities')

    ax3 = fig.add_subplot(313)
    ax3.plot(cum_predicted, cum_freq, 'o')
    # reference line on the diagonal
    diagonal = np.arange(len(freq)) / len(freq)
    ax3.plot(diagonal, diagonal)
    ax3.set_title('PP-plot')
    ax3.set_xlabel(label1)
    ax3.set_ylabel(label0)
    return fig
def test_chisquare_prob(results, probs, bin_edges=None, method=None):
    """Chisquare test for predicted probabilities using cmt-opg.

    Parameters
    ----------
    results : results instance
        Instance of a count regression results.
    probs : ndarray
        Array of predicted probabilities with observations
        in rows and event counts in columns.
    bin_edges : None or array
        Intervals to combine several counts into cells,
        see ``_combine_bins``.
    method : None
        Currently not used by the implementation.

    Returns
    -------
    (api not stable, replace by test-results class)
    statistic : float
        Chisquare statistic for the test.
    p-value : float
        P-value of the test.
    df : int
        Degrees of freedom for the chisquare distribution.
    extras : ???
        The returned holder additionally carries the intermediate results
        ``diff1`` and ``res_aux``.

    Notes
    -----
    Status : experimental, no verified unit tests, needs to be generalized
    currently only OPG version with auxiliary regression is implemented

    Assumes counts are np.arange(probs.shape[1]), i.e. consecutive
    integers starting at zero.

    Auxiliary regression drops the last column of binned probs to avoid
    that probabilities sum to 1.

    References
    ----------
    .. [1] Andrews, Donald W. K. 1988a. “Chi-Square Diagnostic Tests for
           Econometric Models: Theory.” Econometrica 56 (6): 1419–53.
           https://doi.org/10.2307/1913105.

    .. [2] Andrews, Donald W. K. 1988b. “Chi-Square Diagnostic Tests for
           Econometric Models.” Journal of Econometrics 37 (1): 135–56.
           https://doi.org/10.1016/0304-4076(88)90079-6.

    .. [3] Manjón, M., and O. Martínez. 2014. “The Chi-Squared Goodness-of-Fit
           Test for Count-Data Models.” Stata Journal 14 (4): 798–816.
    """
    model = results.model
    score_obs = model.score_obs(results.params)

    # indicator matrix: entry (i, k) is one if observation i has count k
    support = np.arange(probs.shape[1])
    d_ind = (model.endog[:, None] == support).astype(int)

    if bin_edges is None:
        d_ind_bins, probs_bins = d_ind, probs
    else:
        d_ind_bins, _ = _combine_bins(bin_edges, d_ind)
        probs_bins, _ = _combine_bins(bin_edges, probs)
    k_bins = probs_bins.shape[-1]

    diff1 = d_ind_bins - probs_bins

    # OPG auxiliary regression of ones on the scores and the cell
    # differences; the last cell is dropped
    x_aux = np.column_stack((score_obs, diff1[:, :-1]))
    nobs = x_aux.shape[0]
    res_aux = OLS(np.ones(nobs), x_aux).fit()

    # nobs times the uncentered R-squared of the auxiliary regression
    chi2_stat = nobs * (1 - res_aux.ssr / res_aux.uncentered_tss)
    df = res_aux.model.rank - score_obs.shape[1]
    if df < k_bins - 1:
        # not a problem in general, but it can be for OPG version
        import warnings
        # TODO: Warning shows up in Monte Carlo loop, skip for now
        warnings.warn('auxiliary model is rank deficient')

    return HolderTuple(
        statistic=chi2_stat,
        pvalue=stats.chi2.sf(chi2_stat, df),
        df=df,
        diff1=diff1,
        res_aux=res_aux,
        distribution="chi2",
    )
class DispersionResults(HolderTuple):
    """Holder for the results of the Poisson dispersion tests."""

    def summary_frame(self):
        """Return the test results as a pandas DataFrame.

        Returns
        -------
        DataFrame
            Frame with the columns "statistic", "pvalue", "method" and
            "alternative", taken from the attributes of the same name.
        """
        columns = ("statistic", "pvalue", "method", "alternative")
        return pd.DataFrame({name: getattr(self, name) for name in columns})
def test_poisson_dispersion(results, method="all", _old=False):
    """Score/LM type tests for Poisson variance assumptions

    Null Hypothesis is

    H0: var(y) = E(y) and assuming E(y) is correctly specified
    H1: var(y) ~= E(y)

    The tests are based on the constrained model, i.e. the Poisson model.
    The tests differ in their assumed alternatives, and in their maintained
    assumptions.

    Parameters
    ----------
    results : Poisson results instance
        This can be a results instance for either a discrete Poisson or a GLM
        with family Poisson.
    method : str
        Not used yet. Currently results for all methods are returned.
        Only "all" is accepted.
    _old : bool
        Temporary keyword for backwards compatibility, will be removed
        in future version of statsmodels.

    Returns
    -------
    res : instance
        The instance of DispersionResults has the hypothesis test results,
        statistic, pvalue, method, alternative, as main attributes and a
        summary_frame method that returns the results as pandas DataFrame.
        If `_old` is true, then a tuple ``(results_all, description)`` is
        returned instead, where ``results_all`` is a 2-d array with
        statistic and pvalue in columns and ``description`` is a list of
        ``[method, alternative]`` pairs.

    Raises
    ------
    ValueError
        If `method` is not "all".

    Notes
    -----
    The reported tests are, in order: Dean A, Dean B, Dean C, and the
    Cameron-Trivedi auxiliary regressions "CT nb2" and "CT nb1", each with
    nonrobust and with HC3 standard errors.

    The "CT nb1" auxiliary regression uses a constant as regressor, which
    corresponds to the alternative ``mu (1 + a)``. The "CT nb2" regression
    uses the fitted mean, which corresponds to ``mu (1 + a mu)``.
    """
    if method not in ["all"]:
        raise ValueError(f'unknown method "{method}"')

    if hasattr(results, '_results'):
        # unwrap a results wrapper to get plain ndarrays
        results = results._results

    endog = results.model.endog
    nobs = endog.shape[0]  # TODO: use attribute, may need to be added
    # predict() is used instead of fittedvalues because discrete models
    # have the linear prediction as fittedvalues
    fitted = results.predict()

    # this assumes Poisson
    resid2 = results.resid_response**2
    var_resid_endog = (resid2 - endog)
    var_resid_fitted = (resid2 - fitted)
    std1 = np.sqrt(2 * (fitted**2).sum())

    dean_a = var_resid_fitted.sum() / std1
    dean_b = var_resid_endog.sum() / std1
    dean_c = (var_resid_endog / fitted).sum() / np.sqrt(2 * nobs)

    results_all = [
        [stat, 2 * stats.norm.sf(np.abs(stat))]
        for stat in (dean_a, dean_b, dean_c)
    ]
    description = [['Dean A', 'mu (1 + a mu)'],
                   ['Dean B', 'mu (1 + a mu)'],
                   ['Dean C', 'mu (1 + a)']]

    # Cameron Trivedi auxiliary regression page 78 count book 1989
    endog_v = var_resid_endog / fitted
    const = np.ones(len(endog_v))

    # (method name, alternative, regressor, extra options for OLS.fit)
    # NB2 regresses on the fitted mean, NB1 on a constant. The nonrobust
    # "CT nb1" regression previously used the fitted mean and therefore
    # only duplicated the "CT nb2" result.
    ct_specs = [
        ('CT nb2', 'mu (1 + a mu)', fitted, {}),
        ('CT nb1', 'mu (1 + a)', const, {}),
        ('CT nb2 HC3', 'mu (1 + a mu)', fitted, {'cov_type': 'HC3'}),
        ('CT nb1 HC3', 'mu (1 + a)', const, {'cov_type': 'HC3'}),
    ]
    for name, alternative, exog_aux, fit_kwds in ct_specs:
        res_ols = OLS(endog_v, exog_aux).fit(use_t=False, **fit_kwds)
        results_all.append([res_ols.tvalues[0], res_ols.pvalues[0]])
        description.append([name, alternative])

    results_all = np.array(results_all)
    if _old:
        # for backwards compatibility in 0.14, remove in later versions
        return results_all, description

    return DispersionResults(
        statistic=results_all[:, 0],
        pvalue=results_all[:, 1],
        method=[i[0] for i in description],
        alternative=[i[1] for i in description],
        name="Poisson Dispersion Test"
    )
def _test_poisson_dispersion_generic(
    results,
    exog_new_test,
    exog_new_control=None,
    include_score=False,
    use_endog=True,
    cov_type='HC3',
    cov_kwds=None,
    use_t=False
):
    """A variable addition test for the variance function

    This uses an artificial regression to calculate a variant of an LM or
    generalized score test for the specification of the variance assumption
    in a Poisson model. The performed test is a Wald test on the coefficients
    of the `exog_new_test`.

    Warning: insufficiently tested, especially for options

    Parameters
    ----------
    results : Poisson results instance
        Results of the estimated Poisson model.
    exog_new_test : ndarray, 2-dim
        Explanatory variables of the auxiliary regression whose coefficients
        are tested. The number of columns is the number of constraints.
    exog_new_control : None or ndarray
        Additional explanatory variables that are included in the auxiliary
        regression as controls but are not tested.
    include_score : bool
        If true, then the score observations of the model are included as
        control variables in the auxiliary regression.
    use_endog : bool
        If true, then the dependent variable of the auxiliary regression is
        based on ``resid**2 - endog``, otherwise on ``resid**2 - fitted``.
        Both are divided by the fitted mean.
    cov_type : str
        Covariance type passed to the fit of the auxiliary OLS regression.
    cov_kwds : None or dict
        Covariance options passed to the fit of the auxiliary regression.
    use_t : bool
        Passed to the fit of the auxiliary regression.

    Returns
    -------
    stat_ols : float or ndarray
        Test statistic. If control variables are present, this is the
        statistic of the Wald test, otherwise nobs times the uncentered
        R-squared of the auxiliary regression.
    pval_ols : float or ndarray
        P-value of the test.
    """
    if hasattr(results, '_results'):
        # unwrap a results wrapper to get plain ndarrays
        results = results._results

    endog = results.model.endog
    # predict() is used because generic models have the linear prediction
    # as fittedvalues
    fitted = results.predict()
    resid2 = results.resid_response**2

    # the following assumes Poisson
    if use_endog:
        var_resid = (resid2 - endog)
    else:
        var_resid = (resid2 - fitted)
    endog_v = var_resid / fitted

    k_constraints = exog_new_test.shape[1]
    # the tested variables have to come first, the Wald restriction below
    # selects the leading ``k_constraints`` coefficients
    ex_list = [exog_new_test]
    if include_score:
        ex_list.append(results.model.score_obs(results.params))

    if exog_new_control is not None:
        # Append the control variables themselves. The score observations
        # used to be appended here, which ignored ``exog_new_control`` and
        # failed with a NameError if ``include_score`` was false.
        ex_list.append(exog_new_control)

    use_wald = len(ex_list) > 1
    if use_wald:
        ex = np.column_stack(ex_list)
    else:
        ex = ex_list[0]  # no control variables in exog

    res_ols = OLS(endog_v, ex).fit(cov_type=cov_type, cov_kwds=cov_kwds,
                                   use_t=use_t)

    if use_wald:
        # we have controls and need to test coefficients
        k_vars = ex.shape[1]
        constraints = np.eye(k_constraints, k_vars)
        ht = res_ols.wald_test(constraints)
        stat_ols = ht.statistic
        pval_ols = ht.pvalue
    else:
        # we do not have controls and can use overall fit
        nobs = endog_v.shape[0]
        rsquared_noncentered = 1 - res_ols.ssr/res_ols.uncentered_tss
        stat_ols = nobs * rsquared_noncentered
        pval_ols = stats.chi2.sf(stat_ols, k_constraints)

    return stat_ols, pval_ols
def test_poisson_zeroinflation_jh(results_poisson, exog_infl=None):
    """Score test for zero inflation or deflation in Poisson.

    This implements the Jansakul and Hinde (2002) [1]_ score test
    for excess zeros against a zero modified Poisson
    alternative. They use a linear link function for the
    inflation model to allow for zero deflation.

    Parameters
    ----------
    results_poisson: results instance
        The test is only valid if the results instance is a Poisson
        model. A warning is issued otherwise.
    exog_infl : ndarray
        Explanatory variables for the zero inflated or zero modified
        alternative. If exog_infl is None, then the inflation
        probability is assumed to be constant.

    Returns
    -------
    score test results based on chisquare distribution
        Holder with attributes ``statistic``, ``pvalue``, ``df``,
        ``rank_score`` and ``distribution``.

    Notes
    -----
    This is a score test based on the null hypothesis that
    the true model is Poisson. It will also reject for
    other deviations from a Poisson model if those affect
    the zero probabilities, e.g. in the direction of
    excess dispersion as in the Negative Binomial
    or Generalized Poisson model.
    Therefore, rejection in this test does not imply that
    zero-inflated Poisson is the appropriate model.

    Status: experimental, no verified unit tests,

    TODO: If the zero modification probability is assumed
    to be constant under the alternative, then we only have
    a scalar test score and we can use one-sided tests to
    distinguish zero inflation and deflation from the
    two-sided deviations. (The general one-sided case is
    difficult.)
    In this case the test specializes to the test by Broek

    References
    ----------
    .. [1] Jansakul, N., and J. P. Hinde. 2002. “Score Tests for Zero-Inflated
           Poisson Models.” Computational Statistics & Data Analysis 40 (1):
           75–96. https://doi.org/10.1016/S0167-9473(01)00104-9.
    """
    model = results_poisson.model
    if not isinstance(model, Poisson):
        # GLM Poisson would be also valid, not tried
        import warnings
        warnings.warn('Test is only valid if model is Poisson')

    endog, exog = model.endog, model.exog
    if exog_infl is None:
        # constant zero modification probability
        exog_infl = np.ones((endog.shape[0], 1))

    mu = results_poisson.predict()
    prob_zero = np.exp(-mu)
    cov_poi = results_poisson.cov_params()

    exog_infl_t = exog_infl.T
    cross_derivative = (exog_infl_t * (-mu)).dot(exog).T
    cov_infl = (exog_infl_t * ((1 - prob_zero) / prob_zero)).dot(exog_infl)

    # score of the inflation parameters, summed over observations
    score_weight = ((endog == 0) - prob_zero) / prob_zero
    score_infl = (exog_infl * score_weight[:, None]).sum(0)

    # covariance of the score, adjusted for the estimated Poisson params
    adjustment = cross_derivative.T.dot(cov_poi).dot(cross_derivative)
    cov_score_infl = cov_infl - adjustment
    cov_score_infl_inv = np.linalg.pinv(cov_score_infl)

    statistic = score_infl.dot(cov_score_infl_inv).dot(score_infl)
    # rank is more general than the column count, maybe not needed
    rank_score = np.linalg.matrix_rank(cov_score_infl)
    df = exog_infl.shape[1]

    return HolderTuple(
        statistic=statistic,
        pvalue=stats.chi2.sf(statistic, df),
        df=df,
        rank_score=rank_score,
        distribution="chi2",
    )
def test_poisson_zeroinflation_broek(results_poisson):
    """Score test for zero modification in Poisson, special case.

    This assumes that the Poisson model has a constant and that
    the zero modification probability is constant.

    This is a special case of test_poisson_zeroinflation derived by
    van den Broek 1995.

    The test reports two sided and one sided alternatives based on
    the normal distribution of the test statistic.

    Parameters
    ----------
    results_poisson : results instance
        Results of the estimated Poisson model.

    Returns
    -------
    Holder with the attributes ``statistic``, ``pvalue`` (two-sided),
    ``pvalue_smaller``, ``pvalue_larger``, ``chi2``, ``pvalue_chi2``,
    ``df_chi2`` and ``distribution``.

    Notes
    -----
    NOTE(review): ``pvalue_smaller`` holds the upper tail probability and
    ``pvalue_larger`` the lower tail probability of the statistic. Confirm
    that this naming is intended before relying on the one-sided p-values.

    References
    ----------
    .. [1] Broek, Jan van den. 1995. “A Score Test for Zero Inflation in a
           Poisson Distribution.” Biometrics 51 (2): 738–43.
           https://doi.org/10.2307/2532959.
    """
    endog = results_poisson.model.endog
    prob_zero = np.exp(-results_poisson.predict())

    # equivalent forms with nobs = len(endog):
    #   score = ((endog == 0) / prob_zero).sum() - nobs
    #   var_score = (1 / prob_zero).sum() - nobs - endog.sum()
    is_zero = (endog == 0)
    score = ((is_zero - prob_zero) / prob_zero).sum()
    var_score = ((1 - prob_zero) / prob_zero).sum() - endog.sum()
    statistic = score / np.sqrt(var_score)

    chi2_stat = statistic**2
    return HolderTuple(
        statistic=statistic,
        pvalue=2 * stats.norm.sf(np.abs(statistic)),
        pvalue_smaller=stats.norm.sf(statistic),
        pvalue_larger=stats.norm.cdf(statistic),
        chi2=chi2_stat,
        pvalue_chi2=stats.chi2.sf(chi2_stat, 1),
        df_chi2=1,
        distribution="normal",
    )
def test_poisson_zeros(results):
    """Test for excess zeros in Poisson regression model.

    The test is implemented following Tang and Tang [1]_ equ. (12) which is
    based on the test derived in He et al 2019 [2]_.

    Parameters
    ----------
    results : results instance
        Results of the estimated Poisson model.

    Returns
    -------
    Holder with the attributes ``statistic``, ``pvalue`` (two-sided),
    ``pvalue_smaller``, ``pvalue_larger``, ``chi2``, ``pvalue_chi2``,
    ``df_chi2`` and ``distribution``.

    Notes
    -----
    NOTE(review): ``pvalue_smaller`` holds the upper tail probability and
    ``pvalue_larger`` the lower tail probability of the statistic. Confirm
    that this naming is intended before relying on the one-sided p-values.

    References
    ----------
    .. [1] Tang, Yi, and Wan Tang. 2018. “Testing Modified Zeros for Poisson
           Regression Models:” Statistical Methods in Medical Research,
           September. https://doi.org/10.1177/0962280218796253.

    .. [2] He, Hua, Hui Zhang, Peng Ye, and Wan Tang. 2019. “A Test of Inflated
           Zeros for Poisson Regression Models.” Statistical Methods in
           Medical Research 28 (4): 1157–69.
           https://doi.org/10.1177/0962280217749991.
    """
    exog = results.model.exog
    mean = results.predict()
    prob0 = np.exp(-mean)

    # observed minus expected number of zeros
    n_zeros = (results.model.endog == 0).astype(int).sum()
    diff = n_zeros - prob0.sum()

    # variance of the zero indicators minus a correction term that
    # involves the regressors and the predicted mean
    var1 = prob0 @ (1 - prob0)
    weighted_inv = np.linalg.inv((exog.T * mean) @ exog)
    pmx = (prob0 * mean) @ exog
    var2 = pmx @ weighted_inv @ pmx
    statistic = diff / np.sqrt(var1 - var2)

    chi2_stat = statistic**2
    return HolderTuple(
        statistic=statistic,
        pvalue=2 * stats.norm.sf(np.abs(statistic)),
        pvalue_smaller=stats.norm.sf(statistic),
        pvalue_larger=stats.norm.cdf(statistic),
        chi2=chi2_stat,
        pvalue_chi2=stats.chi2.sf(chi2_stat, 1),
        df_chi2=1,
        distribution="normal",
    )