"""
Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and
exponential distributions.

`kstest_fit` is provided as a top-level function to access both tests.
`kstest_normal` and `kstest_exponential` are provided as convenience functions
with the appropriate test as the default.
`lilliefors` is provided as an alias for `kstest_fit`.

Created on Sat Oct 01 13:16:49 2011

Author: Josef Perktold
License: BSD-3

pvalues for Lilliefors test are based on formula and table in

An Analytic Approximation to the Distribution of Lilliefors's Test Statistic
for Normality
Author(s): Gerard E. Dallal and Leland Wilkinson
Source: The American Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296
Published by: American Statistical Association
Stable URL: http://www.jstor.org/stable/2684607 .

On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown
Hubert W. Lilliefors
Journal of the American Statistical Association, Vol. 62, No. 318.
(Jun., 1967), pp. 399-402.

---

Updated 2017-07-23
Jacob C. Kimmel

Ref:
Lilliefors, H.W.
On the Kolmogorov-Smirnov test for the exponential distribution with mean
unknown. Journal of the American Statistical Association, Vol 64, No. 325.
(1969), pp. 387-389.
"""
|
|||
|
from functools import partial
|
|||
|
|
|||
|
import numpy as np
|
|||
|
from scipy import stats
|
|||
|
|
|||
|
from statsmodels.tools.validation import string_like
|
|||
|
from ._lilliefors_critical_values import (critical_values,
|
|||
|
asymp_critical_values,
|
|||
|
PERCENTILES)
|
|||
|
from .tabledist import TableDist
|
|||
|
|
|||
|
|
|||
|
def _make_asymptotic_function(params):
|
|||
|
"""
|
|||
|
Generates an asymptotic distribution callable from a param matrix
|
|||
|
|
|||
|
Polynomial is a[0] * x**(-1/2) + a[1] * x**(-1) + a[2] * x**(-3/2)
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
params : ndarray
|
|||
|
Array with shape (nalpha, 3) where nalpha is the number of
|
|||
|
significance levels
|
|||
|
"""
|
|||
|
|
|||
|
def f(n):
|
|||
|
poly = np.array([1, np.log(n), np.log(n) ** 2])
|
|||
|
return np.exp(poly.dot(params.T))
|
|||
|
|
|||
|
return f
|
|||
|
|
|||
|
|
|||
|
def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Calculate statistic for the Kolmogorov-Smirnov test for goodness of fit

    Computes the test statistic comparing the empirical distribution G(x)
    of the observations against a hypothesized distribution F(x).  Under
    the null hypothesis the two agree, G(x)=F(x).  The alternative may be
    'two_sided' (default), 'less' or 'greater'.  The KS test is only valid
    for continuous distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str or callable
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    args : tuple, sequence
        distribution parameters for call to cdf

    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----
    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    In contrast to scipy.stats.kstest, this function only calculates the
    statistic which can be used either as distance measure or to implement
    case specific p-values.
    """
    nobs = float(len(x))

    # Resolve the cdf argument: a scipy.stats name, a frozen/continuous
    # distribution object exposing .cdf, or a plain callable.
    if isinstance(cdf, str):
        cdf = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf = getattr(cdf, 'cdf')

    ordered = np.sort(x)
    cdfvals = cdf(ordered, *args)

    # Empirical CDF evaluated just after (upper) and just before (lower)
    # each order statistic.
    ecdf_upper = np.arange(1.0, nobs + 1) / nobs
    ecdf_lower = np.arange(0.0, nobs) / nobs
    d_plus = (ecdf_upper - cdfvals).max()
    d_minus = (cdfvals - ecdf_lower).max()

    if alternative == 'greater':
        return d_plus
    if alternative == 'less':
        return d_minus
    return np.max([d_plus, d_minus])
|
|||
|
|
|||
|
|
|||
|
def get_lilliefors_table(dist='norm'):
    """
    Generates tables for significance levels of Lilliefors test statistics

    Tables for available normal and exponential distribution testing,
    as specified in Lilliefors references above

    Parameters
    ----------
    dist : str
        distribution being tested in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values
    """
    # For this test alpha is an sf (right-tail) probability; PERCENTILES
    # are left-tail percentages, so convert and reverse to ascending alpha.
    sig_levels = (1.0 - np.array(PERCENTILES) / 100.0)[::-1]

    key = 'normal' if dist == 'norm' else dist
    if key not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")

    finite_cv = critical_values[key]
    asymp_cv = asymp_critical_values[key]

    sample_sizes = np.array(sorted(finite_cv), dtype=float)
    # Rows ordered by sample size; columns reversed to line up with the
    # reversed significance levels above.
    cv_table = np.array([finite_cv[n] for n in sorted(finite_cv)])[:, ::-1]

    # Asymptotic coefficients, one row per alpha, reversed the same way.
    coeffs = np.array([asymp_cv[k] for k in sorted(asymp_cv)])
    tail_fn = _make_asymptotic_function(coeffs[::-1])

    return TableDist(sig_levels, sample_sizes, cv_table, asymptotic=tail_fn)
|
|||
|
|
|||
|
|
|||
|
# Build both critical-value tables once at import time so repeated test
# calls do not pay the construction cost.
lilliefors_table_norm = get_lilliefors_table(dist='norm')
lilliefors_table_expon = get_lilliefors_table(dist='exp')
|
|||
|
|
|||
|
|
|||
|
def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1 which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore it does not check whether the pvalue is in
    the valid range.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    DallalWilkinson1986
    """
    # todo: check boundaries, valid range for n and Dmax
    if n > 100:
        # Rescale rather than `d_max *= ...`: the in-place form silently
        # mutated a caller-supplied ndarray.
        d_max = d_max * (n / 100.) ** 0.49
        n = 100
    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval
|
|||
|
|
|||
|
|
|||
|
def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality. if `dist = 'exp'` `table` is
        always used. 'approx' uses the approximation formula of Dalal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of `table` is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[1] and c[2] are
    coefficients that are shared all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are be returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see lilliefors_critical_value_simulation.py in
    the test directory.
    """
    pvalmethod = string_like(pvalmethod,
                             "pvalmethod",
                             options=("approx", "table"))

    # Accept a 1d array-like or a single-column 2d array (DataFrame-like).
    x = np.asarray(x)
    if x.ndim == 2 and x.shape[1] == 1:
        x = x[:, 0]
    elif x.ndim != 1:
        raise ValueError("Invalid parameter `x`: must be a one-dimensional"
                         " array-like or a single-column DataFrame")

    nobs = len(x)

    if dist == 'norm':
        # Standardize with the unbiased (ddof=1) standard deviation.
        standardized = (x - x.mean()) / x.std(ddof=1)
        null_cdf = stats.norm.cdf
        table = lilliefors_table_norm
        min_nobs = 4
    elif dist == 'exp':
        standardized = x / x.mean()
        null_cdf = stats.expon.cdf
        table = lilliefors_table_expon
        # The Dallal-Wilkinson approximation is normal-only, so force the
        # simulated table for the exponential case.
        pvalmethod = 'table'
        min_nobs = 3
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    if nobs < min_nobs:
        raise ValueError('Test for distribution {} requires at least {} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(standardized, null_cdf, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # The approximation is only valid below 0.1; fall back to the table.
        if pval > 0.1:
            pval = table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = table.prob(d_ks, nobs)

    return d_ks, pval
|
|||
|
|
|||
|
|
|||
|
# Public aliases: `lilliefors` is the literature name for `kstest_fit`;
# the kstest_* variants preset the distribution under test.
lilliefors = kstest_fit
kstest_normal = kstest_fit
kstest_exponential = partial(kstest_fit, dist='exp')
|