305 lines
9.6 KiB
Python
305 lines
9.6 KiB
Python
"""Anova k-sample comparison without and with trimming
|
|
|
|
Created on Sun Jun 09 23:51:34 2013
|
|
|
|
Author: Josef Perktold
|
|
"""
|
|
|
|
import numbers
|
|
import numpy as np
|
|
|
|
# trimboth and trim_mean are taken from scipy.stats.stats
# and extended with an ``axis`` argument
|
|
|
|
|
|
def trimboth(a, proportiontocut, axis=0):
    """
    Slices off a proportion of items from both ends of an array.

    Slices off the passed proportion of items from both ends of the passed
    array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
    rightmost 10% of scores). You must pre-sort the array if you want
    'proper' trimming. Slices off less if proportion results in a
    non-integer slice index (i.e., conservatively slices off
    `proportiontocut`).

    Parameters
    ----------
    a : array_like
        Data to trim.
    proportiontocut : float or int
        Proportion of data to trim at each end. Must be non-negative.
    axis : int or None
        Axis along which the observations are trimmed. The default is to trim
        along axis=0. If axis is None then the array will be flattened before
        trimming.

    Returns
    -------
    out : array-like
        Trimmed version of array `a`.

    Raises
    ------
    ValueError
        If `proportiontocut` is negative, or if it is so large that no
        observation would be left after trimming.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(20)
    >>> b = trimboth(a, 0.1)
    >>> b.shape
    (16,)

    """
    # A negative proportion would produce a negative ``lowercut`` which the
    # slice below interprets as "count from the end", silently returning the
    # wrong observations instead of failing.
    if proportiontocut < 0:
        raise ValueError("Proportion to cut must be non-negative.")

    a = np.asarray(a)
    if axis is None:
        a = a.ravel()
        axis = 0

    nobs = a.shape[axis]
    # int() truncates, so a fractional count trims fewer observations
    lowercut = int(proportiontocut * nobs)
    uppercut = nobs - lowercut
    if lowercut >= uppercut:
        raise ValueError("Proportion too big.")

    # full slices on every axis except the one that is trimmed
    sl = [slice(None)] * a.ndim
    sl[axis] = slice(lowercut, uppercut)
    # numpy requires a tuple, not a list, for multidimensional indexing
    return a[tuple(sl)]
|
|
|
|
|
|
def trim_mean(a, proportiontocut, axis=0):
    """
    Compute the mean after discarding observations in both tails.

    The data are sorted along `axis`, the fraction `proportiontocut` is
    removed at each end with `trimboth`, and the remaining observations are
    averaged. With `proportiontocut` = 0.1 the smallest and the largest 10%
    of the scores are dropped. If the fraction does not correspond to a whole
    number of observations, fewer observations are dropped (the count is
    truncated).

    Parameters
    ----------
    a : array_like
        Input array
    proportiontocut : float
        Fraction to cut off at each tail of the sorted observations.
    axis : int or None
        Axis along which the trimmed means are computed. The default is axis=0.
        If axis is None then the trimmed mean will be computed for the
        flattened array.

    Returns
    -------
    trim_mean : ndarray
        Mean of trimmed array.

    """
    # trimboth only slices, so the observations have to be ordered first
    ordered = np.sort(a, axis=axis)
    central = trimboth(ordered, proportiontocut, axis=axis)
    return np.mean(central, axis=axis)
|
|
|
|
|
|
class TrimmedMean:
    """
    Trimmed and winsorized statistics for a single sample.

    The observations are ordered along ``axis`` and the same number of
    observations, ``int(fraction * nobs)``, is marked for trimming in the
    lower and in the upper tail.

    ``axis=None``, i.e. ravelling the data, is not supported.

    Parameters
    ----------
    data : array-like
        The data, observations to analyze.
    fraction : float in (0, 0.5)
        The fraction of observations to trim at each tail.
        The number of observations trimmed at each tail is
        ``int(fraction * nobs)``
    is_sorted : boolean
        Set to True if ``data`` is already sorted along ``axis``; it is then
        used as is. With the default, False, a sorted copy is created.
    axis : int
        The axis of reduce operations. By default axis=0, that is observations
        are along the zero dimension, i.e. rows if 2-dim.

    Attributes
    ----------
    data : ndarray
        The data as provided, converted with ``np.asarray``.
    data_sorted : ndarray
        The data sorted along ``axis``.
    nobs : int
        Number of observations along ``axis``.
    nobs_reduced : int
        Number of observations that remain after trimming both tails.
    lowercut, uppercut : int
        Start and stop index of the untrimmed observations along ``axis``.
    sl : tuple of slice
        Index that selects the untrimmed observations of ``data_sorted``.
    lowerbound, upperbound : ndarray
        Smallest and largest untrimmed observation along ``axis``.

    Raises
    ------
    ValueError
        If ``fraction`` is so large that no observation would remain.
    """

    def __init__(self, data, fraction, is_sorted=False, axis=0):
        self.data = np.asarray(data)
        # TODO: add pandas handling, maybe not if this stays internal

        self.axis = axis
        self.fraction = fraction

        n_total = self.data.shape[axis]
        # int() truncates, a fractional count trims fewer observations
        n_tail = int(fraction * n_total)
        n_stop = n_total - n_tail
        self.nobs, self.lowercut, self.uppercut = n_total, n_tail, n_stop
        if not n_tail < n_stop:
            raise ValueError("Proportion too big.")
        self.nobs_reduced = n_total - 2 * n_tail

        # select everything on all axes except for the trimmed one;
        # numpy needs a tuple, not a list, as index
        index = [slice(None)] * self.data.ndim
        index[axis] = slice(n_tail, n_stop)
        self.sl = tuple(index)

        if is_sorted:
            self.data_sorted = self.data
        else:
            self.data_sorted = np.sort(self.data, axis=axis)

        # extreme values among the observations that are kept
        self.lowerbound = self.data_sorted.take(n_tail, axis=axis)
        self.upperbound = self.data_sorted.take(n_stop - 1, axis=axis)

    @property
    def data_trimmed(self):
        """sorted data with both tails removed, a view on ``data_sorted``
        """
        return self.data_sorted[self.sl]

    @property  # not cached, recomputed on each access
    def data_winsorized(self):
        """sorted data with tail values replaced by the bounds
        """
        # bounds need the reduced axis back to broadcast against the data
        lower = np.expand_dims(self.lowerbound, self.axis)
        upper = np.expand_dims(self.upperbound, self.axis)
        return np.clip(self.data_sorted, lower, upper)

    @property
    def mean_trimmed(self):
        """mean of the trimmed data along ``axis``
        """
        return np.mean(self.data_trimmed, axis=self.axis)

    @property
    def mean_winsorized(self):
        """mean of the winsorized data along ``axis``
        """
        return np.mean(self.data_winsorized, axis=self.axis)

    @property
    def var_winsorized(self):
        """variance of the winsorized data along ``axis``, using ddof=1
        """
        # ddof is fixed at 1
        return np.var(self.data_winsorized, axis=self.axis, ddof=1)

    @property
    def std_mean_trimmed(self):
        """standard error of the trimmed mean
        """
        # Trimming is based on order statistics of the data and creates
        # correlation across the trimmed observations, the standard error
        # is inflated for this, see Wilcox 2012, p.61
        inflation = np.sqrt(self.nobs / self.nobs_reduced)
        stderr = np.sqrt(self.var_winsorized / self.nobs_reduced)
        stderr *= inflation
        return stderr

    @property
    def std_mean_winsorized(self):
        """standard error of the winsorized mean
        """
        # this version matches Wilcox, WRS2
        stderr = np.sqrt(self.var_winsorized / self.nobs)
        stderr *= (self.nobs - 1) / (self.nobs_reduced - 1)
        return stderr

    def ttest_mean(self, value=0, transform='trimmed',
                   alternative='two-sided'):
        """
        One sample t-test for trimmed or Winsorized mean

        Parameters
        ----------
        value : float
            Value of the mean under the Null hypothesis
        transform : {'trimmed', 'winsorized'}
            Selects whether the test uses the trimmed mean or the winsorized
            mean, each together with its standard error.
        alternative : {'two-sided', 'larger', 'smaller'}
            Alternative hypothesis, handed over unchanged to the generic
            t-test helper of ``statsmodels.stats.weightstats``.

        Returns
        -------
        tuple
            The result tuple of ``_tstat_generic`` (presumably test statistic
            and p-value) extended by the degrees of freedom
            ``nobs_reduced - 1``.

        Raises
        ------
        ValueError
            If ``transform`` is neither 'trimmed' nor 'winsorized'.

        Notes
        -----
        p-value is based on the approximate t-distribution of the test
        statistic. The approximation is valid if the underlying distribution
        is symmetric.
        """
        import statsmodels.stats.weightstats as smws

        dof = self.nobs_reduced - 1
        if transform == 'trimmed':
            center, stderr = self.mean_trimmed, self.std_mean_trimmed
        elif transform == 'winsorized':
            center, stderr = self.mean_winsorized, self.std_mean_winsorized
        else:
            raise ValueError("transform can only be 'trimmed' or 'winsorized'")

        # one sample case: second mean is zero, null value goes in as diff
        result = smws._tstat_generic(center, 0, stderr, dof,
                                     alternative=alternative, diff=value)
        return result + (dof,)

    def reset_fraction(self, frac):
        """create a TrimmedMean instance with a new trimming fraction

        The already sorted array of this instance is reused, so the data
        are not sorted again. The ``data`` attribute of the new instance
        refers to the original, unsorted data of this instance.
        """
        other = TrimmedMean(self.data_sorted, frac, is_sorted=True,
                            axis=self.axis)
        other.data = self.data
        # TODO: this will not work if there is processing of meta-information
        #       in __init__,
        #       for example storing a pandas DataFrame or Series index
        return other
|
|
|
|
|
|
def scale_transform(data, center='median', transform='abs', trim_frac=0.2,
                    axis=0):
    """Transform data for variance comparison for Levene type tests

    The data are centered along `axis` and the centered observations are
    then passed through `transform`.

    Parameters
    ----------
    data : array_like
        Observations for the data.
    center : "median", "mean", "trimmed" or float
        Statistic used for centering observations. If a float, then this
        value is used to center. Default is median.
    transform : 'abs', 'square', 'identity' or a callable
        The transform for the centered data.
    trim_frac : float in [0, 0.5)
        Fraction of observations that are trimmed on each side of the sorted
        observations. This is only used if center is `trimmed`.
    axis : int
        Axis along which the data are transformed when centering.

    Returns
    -------
    res : ndarray
        transformed data in the same shape as the original data.

    Raises
    ------
    ValueError
        If `transform` is not one of the supported names or a callable, or
        if `center` is not one of the supported names or a number.

    """
    x = np.asarray(data)  # x is shorthand from earlier code

    if transform == 'abs':
        tfunc = np.abs
    elif transform == 'square':
        tfunc = lambda x: x * x  # noqa
    elif transform == 'identity':
        tfunc = lambda x: x  # noqa
    elif callable(transform):
        tfunc = transform
    else:
        raise ValueError("transform should be 'abs', 'square', 'identity' "
                         "or a callable")

    # The location statistics lose ``axis`` in the reduction; it is added
    # back with expand_dims so that the subtraction broadcasts against x.
    if center == 'median':
        location = np.expand_dims(np.median(x, axis=axis), axis)
    elif center == 'mean':
        location = np.expand_dims(np.mean(x, axis=axis), axis)
    elif center == 'trimmed':
        location = np.expand_dims(trim_mean(x, trim_frac, axis=axis), axis)
    elif isinstance(center, numbers.Number):
        # a user provided constant is subtracted from all observations
        location = center
    else:
        raise ValueError("center should be 'median', 'mean', 'trimmed' "
                         "or a number")

    res = tfunc(x - location)
    return res
|