243 lines
8.0 KiB
Python
243 lines
8.0 KiB
Python
|
"""
Created on Tue Oct 6 12:42:11 2020

Author: Josef Perktold
License: BSD-3

"""
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from scipy import stats
|
|||
|
|
|||
|
from statsmodels.stats.base import HolderTuple
|
|||
|
from statsmodels.stats.effect_size import _noncentrality_chisquare
|
|||
|
|
|||
|
|
|||
|
def test_chisquare_binning(counts, expected, sort_var=None, bins=10,
                           df=None, ordered=False, sort_method="quicksort",
                           alpha_nc=0.05):
    """chisquare gof test with binning of data, Hosmer-Lemeshow type

    ``observed`` and ``expected`` are observation specific and should have
    observations in rows and choices in columns

    Parameters
    ----------
    counts : array_like
        Observed frequency, i.e. counts for all choices
    expected : array_like
        Expected counts or probability. If expected are counts, then they
        need to sum to the same total count as the sum of observed.
        If those sums are unequal and all expected values are smaller or equal
        to 1, then they are interpreted as probabilities and will be rescaled
        to match counts.
    sort_var : array_like
        1-dimensional array for binning. Groups will be formed according to
        quantiles of the sorted array ``sort_var``, so that group sizes have
        equal or approximately equal sizes.
        If None, then the observations are grouped in the order in which
        they are given.
    bins : int
        Number of groups. This is passed on to ``numpy.array_split`` as the
        ``indices_or_sections`` argument.
    df : int or None
        Degrees of freedom of the chisquare distribution used for the
        pvalue. If None, then ``df`` is computed from the number of groups
        ``g`` and the number of choices ``c``, see Notes.
    ordered : bool
        Only used if ``df`` is None. If ``ordered`` is ``True``, then the
        degrees of freedom for the ordinal case are used, otherwise those
        of the multinomial case.
    sort_method : str
        Sort algorithm, passed on as ``kind`` to ``numpy.argsort``.
    alpha_nc : float
        Passed on as ``alpha`` to ``_noncentrality_chisquare`` when the
        noncentrality result is computed.

    Returns
    -------
    Holdertuple instance
        This instance contains the results of the chisquare test and some
        information about the data

        - statistic : chisquare statistic of the goodness-of-fit test
        - pvalue : pvalue of the chisquare test
        - df : degrees of freedom of the test
        - freqs : observed counts summed within each group
        - probs : expected counts summed within each group
        - noncentrality : result of ``_noncentrality_chisquare``
        - resid_pearson : ``(freqs - probs) / sqrt(probs)``
        - chi2_stat_groups : contribution of each group to ``statistic``
        - indices : list of index arrays defining the groups

    Raises
    ------
    ValueError
        If the totals of ``counts`` and ``expected`` differ and ``expected``
        contains values larger than 1, i.e. does not look like
        probabilities.

    Notes
    -----
    Degrees of freedom for Hosmer-Lemeshow tests are given by

    g groups, c choices

    - binary: `df = (g - 2)` for insample,
         Stata uses `df = g` for outsample
    - multinomial: `df = (g - 2) * (c - 1)`, reduces to (g-2) for binary c=2,
         (Fagerland, Hosmer, Bofin SIM 2008)
    - ordinal: `df = (g - 2) * (c - 1) + (c - 2)`, reduces to (g-2) for c=2,
         (Hosmer, ... ?)

    Note: If there are ties in the ``sort_var`` array, then the split of
    observations into groups will depend on the sort algorithm.
    """

    observed = np.asarray(counts)
    expected = np.asarray(expected)
    # use the converted array, so that lists and other array_likes work
    n_observed = observed.sum()
    n_expected = expected.sum()
    if not np.allclose(n_observed, n_expected, atol=1e-13):
        if np.max(expected) < 1 + 1e-13:
            # expected seems to be probability, warn and rescale
            import warnings
            warnings.warn("sum of expected and of observed differ, "
                          "rescaling ``expected``")
            expected = expected / n_expected * n_observed
        else:
            # expected doesn't look like fractions or probabilities
            raise ValueError("total counts of expected and observed differ")

    # order of observations that is used to form the groups
    if sort_var is not None:
        argsort = np.argsort(sort_var, kind=sort_method)
    else:
        argsort = np.arange(observed.shape[0])

    indices = np.array_split(argsort, bins, axis=0)
    # aggregate observed and expected counts within each group
    freqs = np.array([observed[idx].sum(0) for idx in indices])
    probs = np.array([expected[idx].sum(0) for idx in indices])

    # chisquare test
    resid_pearson = (freqs - probs) / np.sqrt(probs)
    chi2_stat_groups = ((freqs - probs)**2 / probs).sum(1)
    chi2_stat = chi2_stat_groups.sum()
    if df is None:
        g, c = freqs.shape
        # identity check: only the literal ``True`` selects the ordinal df
        if ordered is True:
            df = (g - 2) * (c - 1) + (c - 2)
        else:
            df = (g - 2) * (c - 1)
    pvalue = stats.chi2.sf(chi2_stat, df)
    noncentrality = _noncentrality_chisquare(chi2_stat, df, alpha=alpha_nc)

    res = HolderTuple(statistic=chi2_stat,
                      pvalue=pvalue,
                      df=df,
                      freqs=freqs,
                      probs=probs,
                      noncentrality=noncentrality,
                      resid_pearson=resid_pearson,
                      chi2_stat_groups=chi2_stat_groups,
                      indices=indices
                      )
    return res
|
|||
|
|
|||
|
|
|||
|
def prob_larger_ordinal_choice(prob):
    """probability that observed category is larger than distribution prob

    This is a helper function for Ordinal models, where endog is a 1-dim
    categorical variable and predicted probabilities are 2-dimensional with
    observations in rows and choices in columns.

    The computation is vectorized with choices along the last axis, ``prob``
    can have any number of leading dimensions.

    Parameters
    ----------
    prob : array_like
        Expected probabilities for ordinal choices, e.g. from prediction of
        an ordinal model with observations in rows and choices in columns.

    Returns
    -------
    cdf_mid : ndarray
        mid cdf, i.e ``P(x < y) + 0.5 P(x=y)``, where ``x`` is distributed
        according to ``prob`` and ``y`` runs over all possible choices.
    r : ndarray
        Probability residual ``P(x < y) - P(x > y)`` for all possible choices.
        Computed as ``r = cdf_mid * 2 - 1``

    References
    ----------
    .. [2] Li, Chun, and Bryan E. Shepherd. 2012. “A New Residual for Ordinal
       Outcomes.” Biometrika 99 (2): 473–80.

    See Also
    --------
    `statsmodels.stats.nonparametric.rank_compare_2ordinal`

    """
    # similar to `nonparametric rank_compare_2ordinal`

    prob = np.asarray(prob)
    cdf = prob.cumsum(-1)
    # add zero at beginning of choices, so that adjacent cdf values can be
    # averaged; padding along the last axis works for any number of
    # leading dimensions
    zeros = np.zeros(cdf.shape[:-1] + (1,))
    cdf_ = np.concatenate((zeros, cdf), axis=-1)
    cdf_mid = (cdf_[..., 1:] + cdf_[..., :-1]) / 2
    r = cdf_mid * 2 - 1
    return cdf_mid, r
|
|||
|
|
|||
|
|
|||
|
def prob_larger_2ordinal(probs1, probs2):
    """Stochastically large probability for two ordinal distributions

    Computes Pr(x1 > x2) + 0.5 * Pr(x1 = x2) for two ordered multinomial
    (ordinal) distributed random variables x1 and x2.

    This is vectorized with choices along last axis.
    Broadcasting if ``probs2`` is 1-dim also seems to work correctly.

    Parameters
    ----------
    probs1 : array_like
        Probabilities of the ordinal choices of the first distribution,
        with choices along the last axis.
    probs2 : array_like
        Probabilities of the ordinal choices of the second distribution,
        with choices along the last axis.

    Returns
    -------
    prob1 : float or ndarray
        Probability that random draw from distribution 1 is larger than a
        random draw from distribution 2. Pr(x1 > x2) + 0.5 * Pr(x1 = x2)
    prob2 : float or ndarray
        prob2 = 1 - prob1 = Pr(x1 < x2) + 0.5 * Pr(x1 = x2)
    """

    def midrank_cdf(pmf):
        # add zero at beginning of choices for cdf computation
        leading_zero = np.zeros(pmf.shape[:-1] + (1,))
        padded = np.concatenate((leading_zero, pmf), axis=-1)
        cumulative = padded.cumsum(axis=-1)
        # mid rank cdf: average of cdf at a choice and at the previous one
        return (cumulative[..., 1:] + cumulative[..., :-1]) / 2

    pmf1 = np.asarray(probs1)
    pmf2 = np.asarray(probs2)

    # weight the mid rank cdf of the other distribution by own probabilities
    prob1 = (midrank_cdf(pmf2) * pmf1).sum(-1)
    prob2 = (midrank_cdf(pmf1) * pmf2).sum(-1)
    return prob1, prob2
|
|||
|
|
|||
|
|
|||
|
def cov_multinomial(probs):
    """covariance matrix of multinomial distribution

    This is vectorized with choices along last axis.

    cov = diag(probs) - outer(probs, probs)

    No scaling by a number of trials is applied.

    Parameters
    ----------
    probs : array_like
        Probabilities of the choices, with choices along the last axis.

    Returns
    -------
    cov : ndarray
        Covariance matrices with shape ``probs.shape + (k,)``, where ``k``
        is the number of choices, i.e. the length of the last axis of
        ``probs``.
    """
    # convert first, so that lists and other array_likes can be used
    probs = np.asarray(probs)

    k = probs.shape[-1]
    di = np.diag_indices(k, 2)
    # negative outer product over the last axis, this creates a new array
    cov = -(probs[..., None] * probs[..., None, :])
    # add probs to the diagonal of each k by k matrix
    cov[..., di[0], di[1]] += probs
    return cov
|
|||
|
|
|||
|
|
|||
|
def var_multinomial(probs):
    """variance of multinomial distribution

    The variance is computed elementwise from the probabilities of the
    choices as

    var = probs * (1 - probs)

    No scaling by a number of trials is applied.
    """
    return probs * (1 - probs)
|