307 lines
11 KiB
Python
307 lines
11 KiB
Python
|
"""
|
||
|
Lowess testing suite.

Expected outcomes are generated by R's lowess function given the same
arguments. The R script test_lowess_r_outputs.R can be used to
generate the expected outcomes.

The delta tests utilize Silverman's motorcycle collision data,
available in R's MASS package.
|
||
|
"""
|
||
|
|
||
|
import os
|
||
|
|
||
|
import numpy as np
|
||
|
from numpy.testing import (
|
||
|
assert_,
|
||
|
assert_allclose,
|
||
|
assert_almost_equal,
|
||
|
assert_equal,
|
||
|
assert_raises,
|
||
|
)
|
||
|
import pytest
|
||
|
|
||
|
from statsmodels.nonparametric.smoothers_lowess import lowess
|
||
|
import pandas as pd
|
||
|
|
||
|
# Comparisons against the R reference output use assert_almost_equal with
# 7 decimals (the numpy default) unless a test specifies otherwise.
|
||
|
# Absolute directory containing this test module.
curdir = os.path.split(os.path.abspath(__file__))[0]
# The reference CSV files read by the tests live in the "results" subdirectory.
rpath = os.path.join(curdir, "results")
|
||
|
|
||
|
|
||
|
class TestLowess:
    """Tests of ``lowess`` against reference values and internal consistency.

    Most cases load a CSV file from ``rpath`` whose columns hold the inputs
    and the expected smoothed values, and compare ``lowess`` output to them.
    """

    def test_import(self):
        """``sm.nonparametric.lowess`` is the very same object as ``lowess``."""
        # this does not work
        # from statsmodels.api.nonparametric import lowess as lowess1
        import statsmodels.api as sm

        lowess1 = sm.nonparametric.lowess
        assert_(lowess is lowess1)

    @pytest.mark.parametrize("use_pandas", [False, True])
    def test_flat(self, use_pandas):
        """An all-zero response is smoothed to all zeros (arrays or Series)."""
        test_data = {
            "x": np.arange(20),
            "y": np.zeros(20),
            "out": np.zeros(20),
        }
        if use_pandas:
            test_data = {k: pd.Series(test_data[k]) for k in test_data}
        expected_lowess = np.array([test_data["x"], test_data["out"]]).T
        actual_lowess = lowess(test_data["y"], test_data["x"])
        assert_almost_equal(expected_lowess, actual_lowess, 7)

    def test_range(self):
        """The identity line ``y = x`` is reproduced by the smoother."""
        test_data = {
            "x": np.arange(20),
            "y": np.arange(20),
            "out": np.arange(20),
        }
        expected_lowess = np.array([test_data["x"], test_data["out"]]).T
        actual_lowess = lowess(test_data["y"], test_data["x"])
        assert_almost_equal(expected_lowess, actual_lowess, 7)

    @staticmethod
    def generate(name, fname, x="x", y="y", out="out", kwargs=None, decimal=7):
        """Run ``lowess`` on a reference CSV and compare with its stored output.

        Parameters
        ----------
        name : str
            Label of the case; included in the assertion message on failure.
        fname : str
            File name of the CSV (with a header row) inside ``rpath``.
        x, y, out : str
            Column names of the exogenous values, the endogenous values and
            the expected smoothed values.
        kwargs : dict, callable or None
            Extra keyword arguments for ``lowess``. A callable is invoked
            with the loaded data and must return the keyword dict, which
            allows arguments that depend on the data.
        decimal : int
            Number of decimals passed to ``assert_almost_equal``.
        """
        kwargs = {} if kwargs is None else kwargs
        data = np.genfromtxt(
            os.path.join(rpath, fname), delimiter=",", names=True
        )
        if callable(kwargs):
            kwargs = kwargs(data)
        result = lowess(data[y], data[x], **kwargs)
        expect = np.array([data[x], data[out]]).T
        # Report the case name through err_msg rather than by setting an
        # attribute on numpy's shared assert_almost_equal function.
        assert_almost_equal(result, expect, decimal, err_msg=name)

    # TODO: Refactor as parametrized test once nose is permanently dropped
    def test_simple(self):
        """Default arguments on the simple reference data."""
        self.generate("test_simple", "test_lowess_simple.csv")

    def test_iter_0(self):
        """``it=0`` compared with the ``out_0`` column."""
        self.generate(
            "test_iter_0",
            "test_lowess_iter.csv",
            out="out_0",
            kwargs={"it": 0},
        )

    def test_iter_0_3(self):
        """``it=3`` compared with the ``out_3`` column."""
        self.generate(
            "test_iter_3",
            "test_lowess_iter.csv",
            out="out_3",
            kwargs={"it": 3},
        )

    def test_frac_2_3(self):
        """``frac=2/3`` compared with the ``out_2_3`` column."""
        self.generate(
            "test_frac_2_3",
            "test_lowess_frac.csv",
            out="out_2_3",
            kwargs={"frac": 2.0 / 3},
        )

    def test_frac_1_5(self):
        """``frac=1/5`` compared with the ``out_1_5`` column."""
        self.generate(
            "test_frac_1_5",
            "test_lowess_frac.csv",
            out="out_1_5",
            kwargs={"frac": 1.0 / 5},
        )

    def test_delta_0(self):
        """``frac=0.1`` without a ``delta`` argument, against ``out_0``."""
        self.generate(
            "test_delta_0",
            "test_lowess_delta.csv",
            out="out_0",
            kwargs={"frac": 0.1},
        )

    def test_delta_rdef(self):
        """``delta`` set to 1% of the range of x, against ``out_Rdef``."""
        self.generate(
            "test_delta_Rdef",
            "test_lowess_delta.csv",
            out="out_Rdef",
            # delta depends on the loaded data, hence the callable.
            kwargs=lambda data: {
                "frac": 0.1,
                "delta": 0.01 * np.ptp(data["x"]),
            },
        )

    def test_delta_1(self):
        """``delta`` marginally above 1, checked to 10 decimals."""
        self.generate(
            "test_delta_1",
            "test_lowess_delta.csv",
            out="out_1",
            kwargs={"frac": 0.1, "delta": 1 + 1e-10},
            decimal=10,
        )

    def test_options(self):
        """Check ``is_sorted``, ``missing``, ``return_sorted`` and ``xvals``.

        The later part of the test writes NaNs into the loaded ``x`` and
        ``y`` arrays, so the order of the sections matters.
        """
        # Seeded local generator so that the permutations are reproducible.
        rng = np.random.RandomState(0)

        rfile = os.path.join(rpath, "test_lowess_simple.csv")
        # Pass the path so that genfromtxt opens and closes the file itself.
        test_data = np.genfromtxt(rfile, delimiter=",", names=True)
        y, x = test_data["y"], test_data["x"]
        expected_lowess = np.array([test_data["x"], test_data["out"]]).T

        # check skip sorting
        actual_lowess1 = lowess(y, x, is_sorted=True)
        assert_almost_equal(actual_lowess1, expected_lowess, decimal=13)

        # check skip sorting - DataFrame
        df = pd.DataFrame({"y": y, "x": x})
        actual_lowess1 = lowess(df["y"], df["x"], is_sorted=True)
        assert_almost_equal(actual_lowess1, expected_lowess, decimal=13)

        # check skip missing
        actual_lowess = lowess(y, x, is_sorted=True, missing="none")
        assert_almost_equal(actual_lowess, actual_lowess1, decimal=13)

        # check order/index, returns yfitted only
        actual_lowess = lowess(y[::-1], x[::-1], return_sorted=False)
        assert_almost_equal(actual_lowess, actual_lowess1[::-1, 1], decimal=13)

        # check returns yfitted only
        actual_lowess = lowess(
            y, x, return_sorted=False, missing="none", is_sorted=True
        )
        assert_almost_equal(actual_lowess, actual_lowess1[:, 1], decimal=13)

        # check integer input
        actual_lowess = lowess(np.round(y).astype(int), x, is_sorted=True)
        actual_lowess1 = lowess(np.round(y), x, is_sorted=True)
        assert_almost_equal(actual_lowess, actual_lowess1, decimal=13)
        assert_(actual_lowess.dtype == np.dtype(float))
        # this will also have duplicate x
        actual_lowess = lowess(y, np.round(x).astype(int), is_sorted=True)
        actual_lowess1 = lowess(y, np.round(x), is_sorted=True)
        assert_almost_equal(actual_lowess, actual_lowess1, decimal=13)
        assert_(actual_lowess.dtype == np.dtype(float))

        # Test specifying xvals explicitly
        perm_idx = np.arange(len(x) // 2)
        rng.shuffle(perm_idx)
        actual_lowess2 = lowess(y, x, xvals=x[perm_idx], return_sorted=False)
        assert_almost_equal(
            actual_lowess[perm_idx, 1], actual_lowess2, decimal=13
        )

        # check with nans, this changes the arrays
        y[[5, 6]] = np.nan
        x[3] = np.nan
        mask_valid = np.isfinite(x) & np.isfinite(y)
        # actual_lowess1[[3, 5, 6], 1] = np.nan
        actual_lowess = lowess(y, x, is_sorted=True)
        actual_lowess1 = lowess(y[mask_valid], x[mask_valid], is_sorted=True)
        assert_almost_equal(actual_lowess, actual_lowess1, decimal=13)
        assert_raises(ValueError, lowess, y, x, missing="raise")

        perm_idx = np.arange(len(x))
        rng.shuffle(perm_idx)
        yperm = y[perm_idx]
        xperm = x[perm_idx]
        actual_lowess2 = lowess(yperm, xperm, is_sorted=False)
        assert_almost_equal(actual_lowess, actual_lowess2, decimal=13)

        actual_lowess3 = lowess(
            yperm, xperm, is_sorted=False, return_sorted=False
        )
        mask_valid = np.isfinite(xperm) & np.isfinite(yperm)
        assert_equal(np.isnan(actual_lowess3), ~mask_valid)
        # get valid sorted smoothed y from actual_lowess3
        sort_idx = np.argsort(xperm)
        yhat = actual_lowess3[sort_idx]
        yhat = yhat[np.isfinite(yhat)]
        assert_almost_equal(yhat, actual_lowess2[:, 1], decimal=13)

        # Test specifying xvals explicitly, now with nans
        perm_idx = np.arange(actual_lowess.shape[0])
        actual_lowess4 = lowess(
            y, x, xvals=actual_lowess[perm_idx, 0], return_sorted=False
        )
        assert_almost_equal(
            actual_lowess[perm_idx, 1], actual_lowess4, decimal=13
        )

    def test_duplicate_xs(self):
        """Many repeated x values with tiny noise are fitted back to x."""
        # see 2449
        # Generate cases with many duplicate x values
        rng = np.random.RandomState(0)  # seeded for reproducible noise
        x = [0] + [1] * 100 + [2] * 100 + [3]
        y = x + rng.normal(size=len(x)) * 1e-8
        result = lowess(y, x, frac=50 / len(x), it=1)
        # fit values should be approximately averages of values at
        # a particular fit, which in this case are just equal to x
        assert_almost_equal(result[1:-1, 1], x[1:-1], decimal=7)

    def test_spike(self):
        """Fitted values stay within 0.1 of the range of ``y``."""
        # see 7700
        # Create a curve that is easy to fit at first but gets
        # harder further along.
        # This used to give an outlier bad fit at position 961
        x = np.linspace(0, 10, 1001)
        y = np.cos(x ** 2 / 5)
        result = lowess(y, x, frac=11 / len(x), it=1)
        assert_(np.all(result[:, 1] > np.min(y) - 0.1))
        assert_(np.all(result[:, 1] < np.max(y) + 0.1))

    def test_exog_predict(self):
        """Check ``xvals`` against fitted values at the observed x.

        As in ``test_options``, NaNs are written into the loaded arrays
        part-way through, so the order of the sections matters.
        """
        # Seeded local generator so that the permutation is reproducible.
        rng = np.random.RandomState(0)

        rfile = os.path.join(rpath, "test_lowess_simple.csv")
        # Pass the path so that genfromtxt opens and closes the file itself.
        test_data = np.genfromtxt(rfile, delimiter=",", names=True)
        y, x = test_data["y"], test_data["x"]
        target = lowess(y, x, is_sorted=True)

        # Test specifying exog_predict explicitly
        perm_idx = np.arange(len(x) // 2)
        rng.shuffle(perm_idx)
        actual_lowess = lowess(y, x, xvals=x[perm_idx], missing="none")
        assert_almost_equal(target[perm_idx, 1], actual_lowess, decimal=13)

        target_it0 = lowess(y, x, return_sorted=False, it=0)
        actual_lowess2 = lowess(y, x, xvals=x[perm_idx], it=0)
        assert_almost_equal(target_it0[perm_idx], actual_lowess2, decimal=13)

        # Check nans in exog_predict
        with pytest.raises(ValueError):
            lowess(y, x, xvals=np.array([np.nan, 5, 3]), missing="raise")

        # With is_sorted=True
        actual_lowess3 = lowess(y, x, xvals=x, is_sorted=True)
        assert_equal(actual_lowess3, target[:, 1])

        # check with nans, this changes the arrays
        y[[5, 6]] = np.nan
        x[3] = np.nan
        target = lowess(y, x, is_sorted=True)

        # Test specifying exog_predict explicitly, now with nans
        perm_idx = np.arange(target.shape[0])
        actual_lowess1 = lowess(y, x, xvals=target[perm_idx, 0])
        assert_almost_equal(target[perm_idx, 1], actual_lowess1, decimal=13)

        # nans and missing='drop'
        actual_lowess2 = lowess(y, x, xvals=x, missing="drop")
        all_finite = np.isfinite(x) & np.isfinite(y)
        assert_equal(actual_lowess2[all_finite], target[:, 1])

        # Dimensional check
        with pytest.raises(ValueError):
            lowess(y, x, xvals=np.array([[5], [10]]))
|
||
|
|
||
|
|
||
|
def test_returns_inputs():
    """A 0/1 step response smoothed with ``frac=0.4`` equals its input."""
    # see 1960
    exog = np.arange(20)
    endog = [0] * 10 + [1] * 10
    expected = np.column_stack((exog, endog))
    smoothed = lowess(endog, exog, frac=0.4)
    assert_almost_equal(smoothed, expected)
|
||
|
|
||
|
|
||
|
def test_xvals_dtype(reset_randomstate):
    """``xvals`` taken from an ``np.arange`` array are accepted by ``lowess``."""
    exog = np.arange(20)
    endog = [0] * 10 + [1] * 10
    first_five = exog[:5]
    # Previously raised ValueError: Buffer dtype mismatch
    fitted = lowess(endog, exog, frac=0.4, xvals=first_five)
    assert_allclose(fitted, np.zeros(5), atol=1e-12)
|