""" Univariate lowess function, like in R. References ---------- Hastie, Tibshirani, Friedman. (2009) The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition: Chapter 6. Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing Scatterplots". Journal of the American Statistical Association 74 (368): 829-836. """ import numpy as np from numpy.linalg import lstsq def lowess(endog, exog, frac=2./3, it=3): """ LOWESS (Locally Weighted Scatterplot Smoothing) A lowess function that outs smoothed estimates of endog at the given exog values from points (exog, endog) Parameters ---------- endog : 1-D numpy array The y-values of the observed points exog : 1-D numpy array The x-values of the observed points frac : float Between 0 and 1. The fraction of the data used when estimating each y-value. it : int The number of residual-based reweightings to perform. Returns ------- out: numpy array A numpy array with two columns. The first column is the sorted x values and the second column the associated estimated y-values. Notes ----- This lowess function implements the algorithm given in the reference below using local linear estimates. Suppose the input data has N points. The algorithm works by estimating the true ``y_i`` by taking the frac*N closest points to ``(x_i,y_i)`` based on their x values and estimating ``y_i`` using a weighted linear regression. The weight for ``(x_j,y_j)`` is `_lowess_tricube` function applied to ``|x_i-x_j|``. If ``iter > 0``, then further weighted local linear regressions are performed, where the weights are the same as above times the `_lowess_bisquare` function of the residuals. Each iteration takes approximately the same amount of time as the original fit, so these iterations are expensive. They are most useful when the noise has extremely heavy tails, such as Cauchy noise. Noise with less heavy-tails, such as t-distributions with ``df > 2``, are less problematic. The weights downgrade the influence of points with large residuals. In the extreme case, points whose residuals are larger than 6 times the median absolute residual are given weight 0. Some experimentation is likely required to find a good choice of frac and iter for a particular dataset. References ---------- Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing Scatterplots". Journal of the American Statistical Association 74 (368): 829-836. Examples -------- The below allows a comparison between how different the fits from `lowess` for different values of frac can be. >>> import numpy as np >>> import statsmodels.api as sm >>> lowess = sm.nonparametric.lowess >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500) >>> y = np.sin(x) + np.random.normal(size=len(x)) >>> z = lowess(y, x) >>> w = lowess(y, x, frac=1./3) This gives a similar comparison for when it is 0 vs not. >>> import scipy.stats as stats >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500) >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x)) >>> z = lowess(y, x, frac= 1./3, it=0) >>> w = lowess(y, x, frac=1./3) """ x = exog if exog.ndim != 1: raise ValueError('exog must be a vector') if endog.ndim != 1: raise ValueError('endog must be a vector') if endog.shape[0] != x.shape[0] : raise ValueError('exog and endog must have same length') n = exog.shape[0] fitted = np.zeros(n) k = int(frac * n) index_array = np.argsort(exog) x_copy = np.array(exog[index_array]) #, dtype ='float32') y_copy = endog[index_array] fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n) for i in range(it): _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n) out = np.array([x_copy, fitted]).T out.shape = (n,2) return out def _lowess_initial_fit(x_copy, y_copy, k, n): """ The initial weighted local linear regression for lowess. Parameters ---------- x_copy : 1-d ndarray The x-values/exogenous part of the data being smoothed y_copy : 1-d ndarray The y-values/ endogenous part of the data being smoothed k : int The number of data points which affect the linear fit for each estimated point n : int The total number of points Returns ------- fitted : 1-d ndarray The fitted y-values weights : 2-d ndarray An n by k array. The contribution to the weights in the local linear fit coming from the distances between the x-values """ weights = np.zeros((n,k), dtype = x_copy.dtype) nn_indices = [0,k] X = np.ones((k,2)) fitted = np.zeros(n) for i in range(n): #note: all _lowess functions are inplace, no return left_width = x_copy[i] - x_copy[nn_indices[0]] right_width = x_copy[nn_indices[1]-1] - x_copy[i] width = max(left_width, right_width) _lowess_wt_standardize(weights[i,:], x_copy[nn_indices[0]:nn_indices[1]], x_copy[i], width) _lowess_tricube(weights[i,:]) weights[i,:] = np.sqrt(weights[i,:]) X[:,1] = x_copy[nn_indices[0]:nn_indices[1]] y_i = weights[i,:] * y_copy[nn_indices[0]:nn_indices[1]] beta = lstsq(weights[i,:].reshape(k,1) * X, y_i, rcond=-1)[0] fitted[i] = beta[0] + beta[1]*x_copy[i] _lowess_update_nn(x_copy, nn_indices, i+1) return fitted, weights def _lowess_wt_standardize(weights, new_entries, x_copy_i, width): """ The initial phase of creating the weights. Subtract the current x_i and divide by the width. Parameters ---------- weights : ndarray The memory where (new_entries - x_copy_i)/width will be placed new_entries : ndarray The x-values of the k closest points to x[i] x_copy_i : float x[i], the i'th point in the (sorted) x values width : float The maximum distance between x[i] and any point in new_entries Returns ------- Nothing. The modifications are made to weight in place. """ weights[:] = new_entries weights -= x_copy_i weights /= width def _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n): """ Additional weighted local linear regressions, performed if iter>0. They take into account the sizes of the residuals, to eliminate the effect of extreme outliers. Parameters ---------- x_copy : 1-d ndarray The x-values/exogenous part of the data being smoothed y_copy : 1-d ndarray The y-values/ endogenous part of the data being smoothed fitted : 1-d ndarray The fitted y-values from the previous iteration weights : 2-d ndarray An n by k array. The contribution to the weights in the local linear fit coming from the distances between the x-values k : int The number of data points which affect the linear fit for each estimated point n : int The total number of points Returns ------- Nothing. The fitted values are modified in place. """ nn_indices = [0,k] X = np.ones((k,2)) residual_weights = np.copy(y_copy) residual_weights.shape = (n,) residual_weights -= fitted residual_weights = np.absolute(residual_weights)#, out=residual_weights) s = np.median(residual_weights) residual_weights /= (6*s) too_big = residual_weights>=1 _lowess_bisquare(residual_weights) residual_weights[too_big] = 0 for i in range(n): total_weights = weights[i,:] * np.sqrt(residual_weights[nn_indices[0]: nn_indices[1]]) X[:,1] = x_copy[nn_indices[0]:nn_indices[1]] y_i = total_weights * y_copy[nn_indices[0]:nn_indices[1]] total_weights.shape = (k,1) beta = lstsq(total_weights * X, y_i, rcond=-1)[0] fitted[i] = beta[0] + beta[1] * x_copy[i] _lowess_update_nn(x_copy, nn_indices, i+1) def _lowess_update_nn(x, cur_nn,i): """ Update the endpoints of the nearest neighbors to the ith point. Parameters ---------- x : iterable The sorted points of x-values cur_nn : list of length 2 The two current indices between which are the k closest points to x[i]. (The actual value of k is irrelevant for the algorithm. i : int The index of the current value in x for which the k closest points are desired. Returns ------- Nothing. It modifies cur_nn in place. """ while True: if cur_nn[1]