""" Additional statistics functions with support for masked arrays. """ # Original author (2007): Pierre GF Gerard-Marchant __all__ = ['compare_medians_ms', 'hdquantiles', 'hdmedian', 'hdquantiles_sd', 'idealfourths', 'median_cihs','mjci','mquantiles_cimj', 'rsh', 'trimmed_mean_ci',] import numpy as np from numpy import float64, ndarray import numpy.ma as ma from numpy.ma import MaskedArray from . import _mstats_basic as mstats from scipy.stats.distributions import norm, beta, t, binom def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,): """ Computes quantile estimates with the Harrell-Davis method. The quantile estimates are calculated as a weighted linear combination of order statistics. Parameters ---------- data : array_like Data array. prob : sequence, optional Sequence of probabilities at which to compute the quantiles. axis : int or None, optional Axis along which to compute the quantiles. If None, use a flattened array. var : bool, optional Whether to return the variance of the estimate. Returns ------- hdquantiles : MaskedArray A (p,) array of quantiles (if `var` is False), or a (2,p) array of quantiles and variances (if `var` is True), where ``p`` is the number of quantiles. See Also -------- hdquantiles_sd Examples -------- >>> import numpy as np >>> from scipy.stats.mstats import hdquantiles >>> >>> # Sample data >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4]) >>> >>> # Probabilities at which to compute quantiles >>> probabilities = [0.25, 0.5, 0.75] >>> >>> # Compute Harrell-Davis quantile estimates >>> quantile_estimates = hdquantiles(data, prob=probabilities) >>> >>> # Display the quantile estimates >>> for i, quantile in enumerate(probabilities): ... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}") 25th percentile: 3.1505820231763066 # may vary 50th percentile: 5.194344084883956 75th percentile: 7.430626414674935 """ def _hd_1D(data,prob,var): "Computes the HD quantiles for a 1D array. Returns nan for invalid data." xsorted = np.squeeze(np.sort(data.compressed().view(ndarray))) # Don't use length here, in case we have a numpy scalar n = xsorted.size hd = np.empty((2,len(prob)), float64) if n < 2: hd.flat = np.nan if var: return hd return hd[0] v = np.arange(n+1) / float(n) betacdf = beta.cdf for (i,p) in enumerate(prob): _w = betacdf(v, (n+1)*p, (n+1)*(1-p)) w = _w[1:] - _w[:-1] hd_mean = np.dot(w, xsorted) hd[0,i] = hd_mean # hd[1,i] = np.dot(w, (xsorted-hd_mean)**2) # hd[0, prob == 0] = xsorted[0] hd[0, prob == 1] = xsorted[-1] if var: hd[1, prob == 0] = hd[1, prob == 1] = np.nan return hd return hd[0] # Initialization & checks data = ma.array(data, copy=False, dtype=float64) p = np.atleast_1d(np.asarray(prob)) # Computes quantiles along axis (or globally) if (axis is None) or (data.ndim == 1): result = _hd_1D(data, p, var) else: if data.ndim > 2: raise ValueError("Array 'data' must be at most two dimensional, " "but got data.ndim = %d" % data.ndim) result = ma.apply_along_axis(_hd_1D, axis, data, p, var) return ma.fix_invalid(result, copy=False) def hdmedian(data, axis=-1, var=False): """ Returns the Harrell-Davis estimate of the median along the given axis. Parameters ---------- data : ndarray Data array. axis : int, optional Axis along which to compute the quantiles. If None, use a flattened array. var : bool, optional Whether to return the variance of the estimate. Returns ------- hdmedian : MaskedArray The median values. If ``var=True``, the variance is returned inside the masked array. E.g. 
    """
    result = hdquantiles(data, [0.5], axis=axis, var=var)
    return result.squeeze()


def hdquantiles_sd(data, prob=[.25, .5, .75], axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float64)
        if n < 2:
            # Not enough data for a jackknife estimate.
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i, p) in enumerate(prob):
            _w = betacdf(vv, n*p, n*(1-p))
            w = _w[1:] - _w[:-1]
            # cumulative sum of weights and data points if
            # ith point is left out for jackknife
            mx_ = np.zeros_like(xsorted)
            mx_[1:] = np.cumsum(w * xsorted[:-1])
            # similar but from the right
            mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
            hdsd[i] = np.sqrt(mx_.var() * (n - 1))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float64)
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()


def trimmed_mean_ci(data, limits=(0.2, 0.2), inclusive=(True, True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two-item tuple.
        Tuple of the percentages to cut on each side of the array, with
        respect to the number of unmasked data, as floats between 0. and 1.
        If ``n`` is the number of unmasked data before trimming, then
        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
        largest data are masked.  The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open
        interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        Tuple indicating whether the number of data being masked on each side
        should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Significance level for the confidence interval; the confidence level
        is ``1 - alpha``.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence limits of the trimmed mean.
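    Examples
    --------
    A small sketch on a 15-point sample, rounding the limits for display
    (the last digit may vary across platforms):

    >>> import numpy as np
    >>> from scipy.stats.mstats import trimmed_mean_ci
    >>> data = np.array([545, 555, 558, 572, 575, 576, 578, 580,
    ...                  594, 605, 635, 651, 653, 661, 666])
    >>> np.round(trimmed_mean_ci(data, limits=(0.2, 0.2)), 1)
    array([561.8, 630.6])
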
    """
    data = ma.array(data, copy=False)
    trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(data, limits=limits, inclusive=inclusive,
                                axis=axis)
    df = trimmed.count(axis) - 1
    tppf = t.ppf(1 - alpha/2., df)
    return np.array((tmean - tppf*tstde, tmean + tppf*tstde))


def mjci(data, prob=[0.25, 0.5, 0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    mjci : ndarray
        The standard errors of the quantile estimates, one value per entry
        of `prob`.

    """
    def _mjci_1D(data, p):
        data = np.sort(data.compressed())
        n = data.size
        # Ranks of the order statistics closest to the requested quantiles.
        prob = (np.array(p) * n + 0.5).astype(int)
        betacdf = beta.cdf

        mj = np.empty(len(prob), float64)
        x = np.arange(1, n+1, dtype=float64) / n
        y = x - 1./n
        for (i, m) in enumerate(prob):
            W = betacdf(x, m-1, n-m) - betacdf(y, m-1, n-m)
            C1 = np.dot(W, data)
            C2 = np.dot(W, data**2)
            mj[i] = np.sqrt(C2 - C1**2)
        return mj

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)


def mquantiles_cimj(data, prob=[0.25, 0.50, 0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Significance level of the intervals; the confidence level is
        ``1 - alpha``.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval.  Of the same length
        as `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval.  Of the same length
        as `prob`.

    """
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha/2.)
    xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    smj = mjci(data, prob, axis=axis)
    return (xq - z * smj, xq + z * smj)


def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Significance level of the interval; the confidence level is
        ``1 - alpha``.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        The ``1 - alpha`` confidence interval for the median, as a
        (lower, upper) tuple.
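    Examples
    --------
    A minimal sketch on the integers 1 through 15, rounding for display
    (values may vary slightly across platforms):

    >>> import numpy as np
    >>> from scipy.stats.mstats import median_cihs
    >>> lo, hi = median_cihs(np.arange(1, 16))
    >>> round(float(lo), 3), round(float(hi), 3)
    (4.374, 11.626)
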
    """
    def _cihs_1D(data, alpha):
        data = np.sort(data.compressed())
        n = len(data)
        alpha = min(alpha, 1 - alpha)
        k = int(binom._ppf(alpha/2., n, 0.5))
        gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        if gk < 1 - alpha:
            k -= 1
            gk = binom.cdf(n-k, n, 0.5) - binom.cdf(k-1, n, 0.5)
        gkk = binom.cdf(n-k-1, n, 0.5) - binom.cdf(k, n, 0.5)
        # Interpolate between the order statistics bracketing the exact
        # coverage.
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims

    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result


def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.  Has to be of size >= 7.
    group_2 : array_like
        Second dataset.  Has to be of size >= 7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened.  If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    Examples
    --------
    >>> from scipy import stats
    >>> a = [1, 2, 3, 4, 5, 6, 7]
    >>> b = [8, 9, 10, 11, 12, 13, 14]
    >>> stats.mstats.compare_medians_ms(a, b, axis=None)
    1.0693225866553746e-05

    The function is vectorized to compute along a given axis.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(3, 7))
    >>> y = rng.random(size=(3, 8))
    >>> stats.mstats.compare_medians_ms(x, y, axis=1)
    array([0.36908985, 0.36092538, 0.2765313 ]) # may vary

    References
    ----------
    .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
       for studentizing the sample median." Communications in
       Statistics-Simulation and Computation 13.6 (1984): 751-773.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis),
                      ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)


def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        Returns the two internal values that divide `data` into four parts
        using the ideal fourths algorithm either along the flattened array
        (if `axis` is None) or along `axis` of `data`.

    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]
        # Ideal-fourths position: j is the integer part, h the fractional
        # weight used to interpolate between adjacent order statistics.
        (j, h) = divmod(n/4. + 5/12., 1)
        j = int(j)
        qlo = (1-h)*x[j-1] + h*x[j]
        k = n - j
        qup = (1-h)*x[k] + h*x[k-1]
        return [qlo, qup]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if (axis is None):
        return _idf(data)
    else:
        return ma.apply_along_axis(_idf, axis, data)


def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to
    the derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

    Returns
    -------
    rsh : ndarray
        The estimated density values at `points`.

    """
    data = ma.array(data, copy=False)
    if points is None:
        points = data
    else:
        points = np.atleast_1d(np.asarray(points))

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only!")

    n = data.count()
    r = idealfourths(data, axis=None)
    # Bandwidth based on the interquartile range estimated by the ideal
    # fourths.
    h = 1.2 * (r[-1]-r[0]) / n**(1./5)
    nhi = (data[:, None] <= points[None, :] + h).sum(0)
    nlo = (data[:, None] < points[None, :] - h).sum(0)
    return (nhi - nlo) / (2.*n*h)
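
if __name__ == "__main__":
    # Minimal usage sketch, an illustrative addition rather than part of the
    # public module: exercises `idealfourths` and `rsh` on synthetic data.
    # Because of the relative import above, run it as
    # ``python -m scipy.stats._mstats_extras``.  The sample is random, so
    # the printed values vary from run to run.
    rng = np.random.default_rng()
    sample = rng.normal(loc=0., scale=1., size=100)
    print("ideal fourths:", idealfourths(sample))
    print("rsh at 0.:", rsh(sample, points=[0.]))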