"""
|
||
Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and
|
||
exponential distributions.
|
||
|
||
`kstest_fit` is provided as a top-level function to access both tests.
|
||
`kstest_normal` and `kstest_exponential` are provided as convenience functions
|
||
with the appropriate test as the default.
|
||
`lilliefors` is provided as an alias for `kstest_fit`.
|
||
|
||
Created on Sat Oct 01 13:16:49 2011
|
||
|
||
Author: Josef Perktold
|
||
License: BSD-3
|
||
|
||
pvalues for Lilliefors test are based on formula and table in
|
||
|
||
An Analytic Approximation to the Distribution of Lilliefors's Test Statistic
|
||
for Normality
|
||
Author(s): Gerard E. Dallal and Leland WilkinsonSource: The American
|
||
Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296
|
||
Published by: American Statistical Association
|
||
Stable URL: http://www.jstor.org/stable/2684607 .
|
||
|
||
On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown
|
||
Hubert W. Lilliefors
|
||
Journal of the American Statistical Association, Vol. 62, No. 318.
|
||
(Jun., 1967), pp. 399-402.
|
||
|
||
---
|
||
|
||
Updated 2017-07-23
|
||
Jacob C. Kimmel
|
||
|
||
Ref:
|
||
Lilliefors, H.W.
|
||
On the Kolmogorov-Smirnov test for the exponential distribution with mean
|
||
unknown. Journal of the American Statistical Association, Vol 64, No. 325.
|
||
(1969), pp. 387–389.
|
||
"""
from functools import partial

import numpy as np
from scipy import stats

from statsmodels.tools.validation import string_like
from ._lilliefors_critical_values import (critical_values,
                                          asymp_critical_values,
                                          PERCENTILES)
from .tabledist import TableDist


def _make_asymptotic_function(params):
    """
    Generates an asymptotic distribution callable from a param matrix

    Each row of `params` holds coefficients (a[0], a[1], a[2]) for one
    significance level; the critical value is approximated as
    exp(a[0] + a[1] * log(n) + a[2] * log(n)**2).

    Parameters
    ----------
    params : ndarray
        Array with shape (nalpha, 3) where nalpha is the number of
        significance levels
    """

    def f(n):
        poly = np.array([1, np.log(n), np.log(n) ** 2])
        return np.exp(poly.dot(params.T))

    return f
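

# Illustrative sketch (editor-added, not part of the original module): how the
# asymptotic callable maps a sample size to per-alpha critical values. The
# parameter matrix below is made up for demonstration; the real coefficients
# live in `asymp_critical_values`.
def _example_asymptotic_function():
    # Two hypothetical significance levels, three coefficients each.
    demo_params = np.array([[-0.5, -0.4, 0.01],
                            [-0.3, -0.4, 0.01]])
    f = _make_asymptotic_function(demo_params)
    # Returns one approximate critical value per row of demo_params.
    return f(100)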


def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Calculate statistic for the Kolmogorov-Smirnov test for goodness of fit

    This calculates the test statistic for a test of the distribution G(x) of
    an observed variable against a given distribution F(x). Under the null
    hypothesis, the two distributions are identical, G(x)=F(x). The
    alternative hypothesis can be either 'two_sided' (default), 'less'
    or 'greater'. The KS test is only valid for continuous distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str or callable
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    args : tuple, sequence
        distribution parameters for call to cdf

    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----
    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    In contrast to scipy.stats.kstest, this function only calculates the
    statistic, which can be used either as a distance measure or to implement
    case-specific p-values.
    """
    nobs = float(len(x))

    if isinstance(cdf, str):
        cdf = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf = getattr(cdf, 'cdf')

    x = np.sort(x)
    cdfvals = cdf(x, *args)

    d_plus = (np.arange(1.0, nobs + 1) / nobs - cdfvals).max()
    d_min = (cdfvals - np.arange(0.0, nobs) / nobs).max()
    if alternative == 'greater':
        return d_plus
    elif alternative == 'less':
        return d_min

    return np.max([d_plus, d_min])
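

# Usage sketch (editor-added): computing the two-sided KS distance of a sample
# against a fully specified distribution. `ksstat` only returns the statistic;
# p-values are handled by the callers in this module.
def _example_ksstat():
    rng = np.random.default_rng(0)
    sample = rng.standard_normal(50)
    # Distance between the empirical CDF and the standard normal CDF.
    return ksstat(sample, 'norm', alternative='two_sided')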


def get_lilliefors_table(dist='norm'):
    """
    Generates tables for significance levels of Lilliefors test statistics

    Tables are available for the normal and exponential distributions,
    as specified in the Lilliefors references above.

    Parameters
    ----------
    dist : str
        distribution being tested, in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values
    """
    # function just to keep things together
    # for this test alpha is sf probability, i.e. right tail probability

    alpha = 1 - np.array(PERCENTILES) / 100.0
    alpha = alpha[::-1]
    dist = 'normal' if dist == 'norm' else dist
    if dist not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")
    cv_data = critical_values[dist]
    acv_data = asymp_critical_values[dist]

    size = np.array(sorted(cv_data), dtype=float)
    crit_lf = np.array([cv_data[key] for key in sorted(cv_data)])
    crit_lf = crit_lf[:, ::-1]

    asym_params = np.array([acv_data[key] for key in sorted(acv_data)])
    asymp_fn = _make_asymptotic_function(asym_params[::-1])

    lf = TableDist(alpha, size, crit_lf, asymptotic=asymp_fn)
    return lf


lilliefors_table_norm = get_lilliefors_table(dist='norm')
lilliefors_table_expon = get_lilliefors_table(dist='exp')
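

# Usage sketch (editor-added): the module-level tables map an observed test
# statistic and sample size to a right-tail probability, exactly as done in
# `kstest_fit` below.
def _example_table_lookup():
    # p-value for a hypothetical statistic of 0.1 with 50 observations,
    # under the normal-distribution table.
    return lilliefors_table_norm.prob(0.1, 50)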


def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1, which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore, it does not check whether the pvalue is
    in the valid range.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    Dallal, G. E. and Wilkinson, L. (1986). An Analytic Approximation to the
    Distribution of Lilliefors's Test Statistic for Normality. The American
    Statistician, Vol. 40, No. 4, pp. 294-296.
    """
    # todo: check boundaries, valid range for n and Dmax
    if n > 100:
        d_max *= (n / 100.) ** 0.49
        n = 100
    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval
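

# Worked sketch (editor-added): for n > 100 the statistic is rescaled by
# (n / 100) ** 0.49 and the formula is evaluated at n = 100, following
# Dallal and Wilkinson (1986). The result is only meaningful when it falls
# below 0.1; the dispatch on that bound happens in `kstest_fit`.
def _example_pval_lf():
    # Approximate p-value for a hypothetical statistic of 0.13 with n = 50.
    return pval_lf(0.13, 50)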


def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large
        simulation. 'approx' is only valid for normality; if `dist='exp'`,
        'table' is always used. 'approx' uses the approximation formula of
        Dallal and Wilkinson, valid for pvalues < 0.1. If the pvalue is
        larger than 0.1, then the result of 'table' is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the null hypothesis that the sample comes from a normal
        distribution.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using

        log(cv_alpha) = b_alpha + c[0] * log(n) + c[1] * log(n)**2

    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[0] and c[1] are
    coefficients that are shared across all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see lilliefors_critical_value_simulation.py
    in the test directory.
    """
    pvalmethod = string_like(pvalmethod,
                             "pvalmethod",
                             options=("approx", "table"))
    x = np.asarray(x)
    if x.ndim == 2 and x.shape[1] == 1:
        x = x[:, 0]
    elif x.ndim != 1:
        raise ValueError("Invalid parameter `x`: must be a one-dimensional"
                         " array-like or a single-column DataFrame")

    nobs = len(x)

    if dist == 'norm':
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {} requires at least {} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # check pval is in desired range
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval


lilliefors = kstest_fit
kstest_normal = kstest_fit
kstest_exponential = partial(kstest_fit, dist='exp')
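

# Usage sketch (editor-added): running the Lilliefors test through the public
# aliases. The data below are simulated for illustration only.
def _example_lilliefors():
    rng = np.random.default_rng(12345)
    x = rng.standard_normal(100)
    # Test against a normal distribution with estimated mean and variance.
    d_norm, p_norm = lilliefors(x, dist='norm', pvalmethod='table')
    # Test non-negative data against an exponential with estimated mean.
    y = rng.exponential(scale=2.0, size=100)
    d_exp, p_exp = kstest_exponential(y)
    return (d_norm, p_norm), (d_exp, p_exp)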