""" Statistical tests to be used in conjunction with the models Notes ----- These functions have not been formally tested. """ from scipy import stats import numpy as np from statsmodels.tools.sm_exceptions import ValueWarning def durbin_watson(resids, axis=0): r""" Calculates the Durbin-Watson statistic. Parameters ---------- resids : array_like Data for which to compute the Durbin-Watson statistic. Usually regression model residuals. axis : int, optional Axis to use if data has more than 1 dimension. Default is 0. Returns ------- dw : float, array_like The Durbin-Watson statistic. Notes ----- The null hypothesis of the test is that there is no serial correlation in the residuals. The Durbin-Watson test statistic is defined as: .. math:: \sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2 The test statistic is approximately equal to 2*(1-r) where ``r`` is the sample autocorrelation of the residuals. Thus, for r == 0, indicating no serial correlation, the test statistic equals 2. This statistic will always be between 0 and 4. The closer to 0 the statistic, the more evidence for positive serial correlation. The closer to 4, the more evidence for negative serial correlation. """ resids = np.asarray(resids) diff_resids = np.diff(resids, 1, axis=axis) dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis) return dw def omni_normtest(resids, axis=0): """ Omnibus test for normality Parameters ---------- resid : array_like axis : int, optional Default is 0 Returns ------- Chi^2 score, two-tail probability """ # TODO: change to exception in summary branch and catch in summary() # behavior changed between scipy 0.9 and 0.10 resids = np.asarray(resids) n = resids.shape[axis] if n < 8: from warnings import warn warn("omni_normtest is not valid with less than 8 observations; %i " "samples were given." % int(n), ValueWarning) return np.nan, np.nan return stats.normaltest(resids, axis=axis) def jarque_bera(resids, axis=0): r""" The Jarque-Bera test of normality. Parameters ---------- resids : array_like Data to test for normality. Usually regression model residuals that are mean 0. axis : int, optional Axis to use if data has more than 1 dimension. Default is 0. Returns ------- JB : {float, ndarray} The Jarque-Bera test statistic. JBpv : {float, ndarray} The pvalue of the test statistic. skew : {float, ndarray} Estimated skewness of the data. kurtosis : {float, ndarray} Estimated kurtosis of the data. Notes ----- Each output returned has 1 dimension fewer than data The Jarque-Bera test statistic tests the null that the data is normally distributed against an alternative that the data follow some other distribution. The test statistic is based on two moments of the data, the skewness, and the kurtosis, and has an asymptotic :math:`\chi^2_2` distribution. The test statistic is defined .. math:: JB = n(S^2/6+(K-3)^2/24) where n is the number of data points, S is the sample skewness, and K is the sample kurtosis of the data. """ resids = np.atleast_1d(np.asarray(resids, dtype=float)) if resids.size < 2: raise ValueError("resids must contain at least 2 elements") # Calculate residual skewness and kurtosis skew = stats.skew(resids, axis=axis) kurtosis = 3 + stats.kurtosis(resids, axis=axis) # Calculate the Jarque-Bera test for normality n = resids.shape[axis] jb = (n / 6.) * (skew ** 2 + (1 / 4.) * (kurtosis - 3) ** 2) jb_pv = stats.chi2.sf(jb, 2) return jb, jb_pv, skew, kurtosis def robust_skewness(y, axis=0): """ Calculates the four skewness measures in Kim & White Parameters ---------- y : array_like Data to compute use in the estimator. axis : int or None, optional Axis along which the skewness measures are computed. If `None`, the entire array is used. Returns ------- sk1 : ndarray The standard skewness estimator. sk2 : ndarray Skewness estimator based on quartiles. sk3 : ndarray Skewness estimator based on mean-median difference, standardized by absolute deviation. sk4 : ndarray Skewness estimator based on mean-median difference, standardized by standard deviation. Notes ----- The robust skewness measures are defined .. math:: SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right) -\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}} .. math:: SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}} {\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]} .. math:: SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}} .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if axis is None: y = y.ravel() axis = 0 y = np.sort(y, axis) q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis) mu = y.mean(axis) shape = (y.size,) if axis is not None: shape = list(mu.shape) shape.insert(axis, 1) shape = tuple(shape) mu_b = np.reshape(mu, shape) q2_b = np.reshape(q2, shape) sigma = np.sqrt(np.mean(((y - mu_b)**2), axis)) sk1 = stats.skew(y, axis=axis) sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1) sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis) sk4 = (mu - q2) / sigma return sk1, sk2, sk3, sk4 def _kr3(y, alpha=5.0, beta=50.0): """ KR3 estimator from Kim & White Parameters ---------- y : array_like, 1-d Data to compute use in the estimator. alpha : float, optional Lower cut-off for measuring expectation in tail. beta : float, optional Lower cut-off for measuring expectation in center. Returns ------- kr3 : float Robust kurtosis estimator based on standardized lower- and upper-tail expected values Notes ----- .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ perc = (alpha, 100.0 - alpha, beta, 100.0 - beta) lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc) l_alpha = np.mean(y[y < lower_alpha]) u_alpha = np.mean(y[y > upper_alpha]) l_beta = np.mean(y[y < lower_beta]) u_beta = np.mean(y[y > upper_beta]) return (u_alpha - l_alpha) / (u_beta - l_beta) def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)): """ Calculates the expected value of the robust kurtosis measures in Kim and White assuming the data are normally distributed. Parameters ---------- ab : iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db : iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure Returns ------- ekr : ndarray, 4-element Contains the expected values of the 4 robust kurtosis measures Notes ----- See `robust_kurtosis` for definitions of the robust kurtosis measures """ alpha, beta = ab delta, gamma = dg expected_value = np.zeros(4) ppf = stats.norm.ppf pdf = stats.norm.pdf q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8) expected_value[0] = 3 expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2) q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0))) expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta) q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0))) expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma) return expected_value def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True): """ Calculates the four kurtosis measures in Kim & White Parameters ---------- y : array_like Data to compute use in the estimator. axis : int or None, optional Axis along which the kurtosis are computed. If `None`, the entire array is used. a iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db : iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure excess : bool, optional If true (default), computed values are excess of those for a standard normal distribution. Returns ------- kr1 : ndarray The standard kurtosis estimator. kr2 : ndarray Kurtosis estimator based on octiles. kr3 : ndarray Kurtosis estimators based on exceedance expectations. kr4 : ndarray Kurtosis measure based on the spread between high and low quantiles. Notes ----- The robust kurtosis measures are defined .. math:: KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right) +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)} {\\hat{q}_{.75}-\\hat{q}_{.25}} .. math:: KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)} {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)} .. math:: KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}} {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}} where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`. .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if (axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)): y = y.ravel() axis = 0 alpha, beta = ab delta, gamma = dg perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5, delta, 100.0 - delta, gamma, 100.0 - gamma) e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc, axis=axis) expected_value = (expected_robust_kurtosis(ab, dg) if excess else np.zeros(4)) kr1 = stats.kurtosis(y, axis, False) - expected_value[0] kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1] if y.ndim == 1: kr3 = _kr3(y, alpha, beta) else: kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta) kr3 -= expected_value[2] kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3] return kr1, kr2, kr3, kr4 def _medcouple_1d(y): """ Calculates the medcouple robust measure of skew. Parameters ---------- y : array_like, 1-d Data to compute use in the estimator. Returns ------- mc : float The medcouple statistic Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ # Parameter changes the algorithm to the slower for large n y = np.squeeze(np.asarray(y)) if y.ndim != 1: raise ValueError("y must be squeezable to a 1-d array") y = np.sort(y) n = y.shape[0] if n % 2 == 0: mf = (y[n // 2 - 1] + y[n // 2]) / 2 else: mf = y[(n - 1) // 2] z = y - mf lower = z[z <= 0.0] upper = z[z >= 0.0] upper = upper[:, None] standardization = upper - lower is_zero = np.logical_and(lower == 0.0, upper == 0.0) standardization[is_zero] = np.inf spread = upper + lower h = spread / standardization # GH5395 num_ties = np.sum(lower == 0.0) if num_ties: # Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal, # and 1 below the anti-diagonal replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties) replacements -= 2 * np.triu(replacements) # Convert diagonal to anti-diagonal replacements = np.fliplr(replacements) # Always replace upper right block h[:num_ties, -num_ties:] = replacements return np.median(h) def medcouple(y, axis=0): """ Calculate the medcouple robust measure of skew. Parameters ---------- y : array_like Data to compute use in the estimator. axis : {int, None} Axis along which the medcouple statistic is computed. If `None`, the entire array is used. Returns ------- mc : ndarray The medcouple statistic with the same shape as `y`, with the specified axis removed. Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [*] M. Hubert and E. Vandervieren, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ y = np.asarray(y, dtype=np.double) # GH 4243 if axis is None: return _medcouple_1d(y.ravel()) return np.apply_along_axis(_medcouple_1d, axis, y)