"""Tests for the univariate kernel density estimator (``KDEUnivariate``)."""
import os

import numpy.testing as npt
import numpy as np
import pandas as pd
import pytest
from scipy import stats

from statsmodels.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric.kde import KDEUnivariate as KDE
import statsmodels.sandbox.nonparametric.kernels as kernels
import statsmodels.nonparametric.bandwidths as bandwidths

curdir = os.path.dirname(os.path.abspath(__file__))


def _load_results(filename, **kwargs):
    """Load a reference file from the ``results`` directory beside this module.

    Parameters
    ----------
    filename : str
        Base name of the file inside ``<this dir>/results``.
    **kwargs
        Forwarded unchanged to ``np.genfromtxt``.

    Returns
    -------
    ndarray
        Whatever ``np.genfromtxt`` produces for the given options.
    """
    path = os.path.join(curdir, 'results', filename)
    # Binary mode plus an explicit context manager: the handle is closed as
    # soon as genfromtxt has consumed it instead of leaking until GC.
    with open(path, 'rb') as fh:
        return np.genfromtxt(fh, **kwargs)


# get results from Stata
KDEResults = _load_results('results_kde.csv', delimiter=",", names=True)
KDEWResults = _load_results('results_kde_univ_weights.csv',
                            delimiter=",", names=True)

# get results from R
KCDEResults = _load_results('results_kcde.csv', delimiter=",", names=True)

# setup test data
np.random.seed(12345)
Xi = mixture_rvs([.25, .75], size=200, dist=[stats.norm, stats.norm],
                 kwargs=(dict(loc=-1, scale=.5), dict(loc=1, scale=.5)))


class TestKDEExceptions:
    """Invalid uses of ``KDE`` must raise the expected exception types."""

    @classmethod
    def setup_class(cls):
        cls.kde = KDE(Xi)
        # Xi has 200 observations: one weight vector matches, one does not.
        cls.weights_200 = np.linspace(1, 100, 200)
        cls.weights_100 = np.linspace(1, 100, 100)

    def test_check_is_fit_exception(self):
        with pytest.raises(ValueError):
            self.kde.evaluate(0)

    def test_non_weighted_fft_exception(self):
        with pytest.raises(NotImplementedError):
            self.kde.fit(kernel="gau", gridsize=50, weights=self.weights_200,
                         fft=True, bw="silverman")

    def test_wrong_weight_length_exception(self):
        with pytest.raises(ValueError):
            self.kde.fit(kernel="gau", gridsize=50, weights=self.weights_100,
                         fft=False, bw="silverman")

    def test_non_gaussian_fft_exception(self):
        with pytest.raises(NotImplementedError):
            self.kde.fit(kernel="epa", gridsize=50, fft=True, bw="silverman")


class CheckKDE:
    """Shared checks; subclasses provide ``res1`` and ``res_density``."""

    decimal_density = 7

    def test_density(self):
        npt.assert_almost_equal(self.res1.density, self.res_density,
                                self.decimal_density)

    def test_evaluate(self):
        # Historical note kept from the original author: this check used to
        # fail for Epan, Triangular and Biweight (only Gaussian was correct),
        # which is why TestKDEGauss also carries its own copy.
        # inDomain is not vectorized, hence the point-by-point evaluation
        # instead of
        # kde_vals = self.res1.evaluate(self.res1.support)
        kde_vals = [np.squeeze(self.res1.evaluate(xi))
                    for xi in self.res1.support]
        kde_vals = np.squeeze(kde_vals)  # kde_vals is a "column_list"
        mask_valid = np.isfinite(kde_vals)
        # TODO: nans at the boundaries
        kde_vals[~mask_valid] = 0
        npt.assert_almost_equal(kde_vals, self.res_density,
                                self.decimal_density)


class TestKDEGauss(CheckKDE):
    """Gaussian kernel, direct (non-FFT) evaluation, Silverman bandwidth."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(Xi)
        res1.fit(kernel="gau", fft=False, bw="silverman")
        cls.res1 = res1
        cls.res_density = KDEResults["gau_d"]

    def test_evaluate(self):
        # kde_vals = self.res1.evaluate(self.res1.support)
        kde_vals = [self.res1.evaluate(xi) for xi in self.res1.support]
        kde_vals = np.squeeze(kde_vals)  # kde_vals is a "column_list"
        mask_valid = np.isfinite(kde_vals)
        # TODO: nans at the boundaries
        kde_vals[~mask_valid] = 0
        npt.assert_almost_equal(kde_vals, self.res_density,
                                self.decimal_density)

    # The following tests are regression tests
    # Values have been checked to be very close to R 'ks' package (Dec 2013)
    def test_support_gridded(self):
        kde = self.res1
        support = KCDEResults['gau_support']
        npt.assert_allclose(support, kde.support)

    def test_cdf_gridded(self):
        kde = self.res1
        cdf = KCDEResults['gau_cdf']
        npt.assert_allclose(cdf, kde.cdf)

    def test_sf_gridded(self):
        kde = self.res1
        sf = KCDEResults['gau_sf']
        npt.assert_allclose(sf, kde.sf)

    def test_icdf_gridded(self):
        kde = self.res1
        icdf = KCDEResults['gau_icdf']
        npt.assert_allclose(icdf, kde.icdf)


class TestKDEGaussPandas(TestKDEGauss):
    """Same checks as ``TestKDEGauss`` with the data wrapped in a Series."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(pd.Series(Xi))
        res1.fit(kernel="gau", fft=False, bw="silverman")
        cls.res1 = res1
        cls.res_density = KDEResults["gau_d"]


class TestKDEEpanechnikov(CheckKDE):
    """Epanechnikov kernel compared with the ``epa2_d`` reference column."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(Xi)
        res1.fit(kernel="epa", fft=False, bw="silverman")
        cls.res1 = res1
        cls.res_density = KDEResults["epa2_d"]


class TestKDETriangular(CheckKDE):
    """Triangular kernel compared with the ``tri_d`` reference column."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(Xi)
        res1.fit(kernel="tri", fft=False, bw="silverman")
        cls.res1 = res1
        cls.res_density = KDEResults["tri_d"]


class TestKDEBiweight(CheckKDE):
    """Biweight kernel compared with the ``biw_d`` reference column."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(Xi)
        res1.fit(kernel="biw", fft=False, bw="silverman")
        cls.res1 = res1
        cls.res_density = KDEResults["biw_d"]


# FIXME: enable/xfail/skip or delete
# NOTE: This is a knownfailure due to a definitional difference of Cosine kernel
# class TestKDECosine(CheckKDE):
#     @classmethod
#     def setup_class(cls):
#         res1 = KDE(Xi)
#         res1.fit(kernel="cos", fft=False, bw="silverman")
#         cls.res1 = res1
#         cls.res_density = KDEResults["cos_d"]


# weighted estimates taken from matlab so we can allow len(weights) != gridsize
class TestKdeWeights(CheckKDE):
    """Weighted Gaussian KDE evaluated on a 50-point grid."""

    @classmethod
    def setup_class(cls):
        res1 = KDE(Xi)
        weights = np.linspace(1, 100, 200)
        res1.fit(kernel="gau", gridsize=50, weights=weights, fft=False,
                 bw="silverman")
        cls.res1 = res1
        cls.res_density = _load_results('results_kde_weights.csv',
                                        skip_header=1)

    def test_evaluate(self):
        # kde_vals = self.res1.evaluate(self.res1.support)
        kde_vals = [self.res1.evaluate(xi) for xi in self.res1.support]
        kde_vals = np.squeeze(kde_vals)  # kde_vals is a "column_list"
        mask_valid = np.isfinite(kde_vals)
        # TODO: nans at the boundaries
        kde_vals[~mask_valid] = 0
        npt.assert_almost_equal(kde_vals, self.res_density,
                                self.decimal_density)


class TestKDEGaussFFT(CheckKDE):
    """Gaussian kernel fitted through the FFT code path."""

    @classmethod
    def setup_class(cls):
        cls.decimal_density = 2  # low accuracy because binning is different
        res1 = KDE(Xi)
        res1.fit(kernel="gau", fft=True, bw="silverman")
        cls.res1 = res1
        cls.res_density = _load_results('results_kde_fft.csv')


class CheckKDEWeights:
    """Shared weighted-KDE checks.

    Subclasses set ``kernel_name`` (passed to ``KDE.fit``) and
    ``res_kernel_name`` (the column of ``KDEWResults`` to compare against).
    """

    decimal_density = 7

    @classmethod
    def setup_class(cls):
        cls.x = x = KDEWResults['x']
        weights = KDEWResults['weights']
        res1 = KDE(x)
        # default bandwidth was scott when reference values computed
        res1.fit(kernel=cls.kernel_name, weights=weights, fft=False,
                 bw="scott")
        cls.res1 = res1
        cls.res_density = KDEWResults[cls.res_kernel_name]

    @pytest.mark.xfail(reason="Not almost equal to 7 decimals",
                       raises=AssertionError, strict=True)
    def test_density(self):
        npt.assert_almost_equal(self.res1.density, self.res_density,
                                self.decimal_density)

    def test_evaluate(self):
        if self.kernel_name == 'cos':
            pytest.skip("Cosine kernel fails against Stata")
        kde_vals = [self.res1.evaluate(xi) for xi in self.x]
        kde_vals = np.squeeze(kde_vals)  # kde_vals is a "column_list"
        npt.assert_almost_equal(kde_vals, self.res_density,
                                self.decimal_density)

    def test_compare(self):
        xx = self.res1.support
        kde_vals = [np.squeeze(self.res1.evaluate(xi)) for xi in xx]
        kde_vals = np.squeeze(kde_vals)  # kde_vals is a "column_list"
        mask_valid = np.isfinite(kde_vals)
        # TODO: nans at the boundaries
        kde_vals[~mask_valid] = 0
        npt.assert_almost_equal(self.res1.density, kde_vals,
                                self.decimal_density)

        # regression test, not compared to another package
        nobs = len(self.res1.endog)
        kern = self.res1.kernel
        v = kern.density_var(kde_vals, nobs)
        v_direct = kde_vals * kern.L2Norm / kern.h / nobs
        npt.assert_allclose(v, v_direct, rtol=1e-10)

        ci = kern.density_confint(kde_vals, nobs)
        crit = 1.9599639845400545  # stats.norm.isf(0.05 / 2)
        # Both half-widths of the interval must equal crit * sqrt(variance).
        hw = kde_vals - ci[:, 0]
        npt.assert_allclose(hw, crit * np.sqrt(v), rtol=1e-10)
        hw = ci[:, 1] - kde_vals
        npt.assert_allclose(hw, crit * np.sqrt(v), rtol=1e-10)

    def test_kernel_constants(self):
        kern = self.res1.kernel

        nc = kern.norm_const
        # trigger numerical integration
        kern._norm_const = None
        nc2 = kern.norm_const
        npt.assert_allclose(nc, nc2, rtol=1e-10)

        l2n = kern.L2Norm
        # trigger numerical integration
        kern._L2Norm = None
        l2n2 = kern.L2Norm
        npt.assert_allclose(l2n, l2n2, rtol=1e-10)

        v = kern.kernel_var
        # trigger numerical integration
        kern._kernel_var = None
        v2 = kern.kernel_var
        npt.assert_allclose(v, v2, rtol=1e-10)


class TestKDEWGauss(CheckKDEWeights):
    kernel_name = "gau"
    res_kernel_name = "x_gau_wd"


class TestKDEWEpa(CheckKDEWeights):
    kernel_name = "epa"
    res_kernel_name = "x_epan2_wd"


class TestKDEWTri(CheckKDEWeights):
    kernel_name = "tri"
    res_kernel_name = "x_" + kernel_name + "_wd"


class TestKDEWBiw(CheckKDEWeights):
    kernel_name = "biw"
    res_kernel_name = "x_bi_wd"


class TestKDEWCos(CheckKDEWeights):
    kernel_name = "cos"
    res_kernel_name = "x_cos_wd"


class TestKDEWCos2(CheckKDEWeights):
    kernel_name = "cos2"
    res_kernel_name = "x_cos_wd"


# The leading underscore keeps pytest from collecting the next two classes.
class _TestKDEWRect(CheckKDEWeights):
    # TODO in docstring but not in kernel_switch
    kernel_name = "rect"
    res_kernel_name = "x_rec_wd"


class _TestKDEWPar(CheckKDEWeights):
    # TODO in docstring but not implemented in kernels
    kernel_name = "par"
    res_kernel_name = "x_par_wd"


class TestKdeRefit:
    """Two KDEs fitted on different samples must not share cached results."""

    def test_refit_gives_different_results(self):
        # A local RandomState seeded with 12345 yields the same draws as
        # seeding the global generator, without touching global RNG state.
        rng = np.random.RandomState(12345)
        data1 = rng.randn(100) * 100
        pdf = KDE(data1)
        pdf.fit()

        data2 = rng.randn(100) * 100
        pdf2 = KDE(data2)
        pdf2.fit()

        for attr in ['icdf', 'cdf', 'sf']:
            assert not np.allclose(getattr(pdf, attr)[:10],
                                   getattr(pdf2, attr)[:10])


class TestNormConstant:
    """Normalisation constant of a user-supplied kernel shape."""

    def test_norm_constant_calculation(self):
        custom_gauss = kernels.CustomKernel(lambda x: np.exp(-x ** 2 / 2.0))
        gauss_true_const = 0.3989422804014327
        npt.assert_almost_equal(gauss_true_const, custom_gauss.norm_const)


def test_kde_bw_positive():
    # GH 6679
    x = np.array([4.59511985, 4.59511985, 4.59511985, 4.59511985, 4.59511985,
                  4.59511985, 4.59511985, 4.59511985, 4.59511985, 4.59511985,
                  5.67332327, 6.19847872, 7.43189192])
    kde = KDE(x)
    kde.fit()
    assert kde.bw > 0


def test_fit_self(reset_randomstate):
    """``fit`` returns a ``KDE`` instance so calls can be chained."""
    x = np.random.standard_normal(100)
    kde = KDE(x)
    assert isinstance(kde, KDE)
    assert isinstance(kde.fit(), KDE)


class TestKDECustomBandwidth:
    """``bw`` may be a callable or a float, not only a string name."""

    decimal_density = 7

    @classmethod
    def setup_class(cls):
        cls.kde = KDE(Xi)
        cls.weights_200 = np.linspace(1, 100, 200)
        cls.weights_100 = np.linspace(1, 100, 100)

    def test_check_is_fit_ok_with_custom_bandwidth(self):
        def custom_bw(X, kern):
            return np.std(X) * len(X)

        kde = self.kde.fit(bw=custom_bw)
        assert isinstance(kde, KDE)

    def test_check_is_fit_ok_with_standard_custom_bandwidth(self):
        # Note, we are passing the function, not the string - this is intended
        kde = self.kde.fit(bw=bandwidths.bw_silverman)
        s1 = kde.support.copy()
        d1 = kde.density.copy()

        # Refit with the string name; results must match the callable's.
        kde = self.kde.fit(bw='silverman')

        npt.assert_almost_equal(s1, kde.support, self.decimal_density)
        npt.assert_almost_equal(d1, kde.density, self.decimal_density)

    @pytest.mark.parametrize("fft", [True, False])
    def test_check_is_fit_ok_with_float_bandwidth(self, fft):
        # First fit passes the function, not the string - this is intended
        kde = self.kde.fit(bw=bandwidths.bw_silverman, fft=fft)
        s1 = kde.support.copy()
        d1 = kde.density.copy()

        # Refit with the bandwidth value stored on the fitted object.
        kde = self.kde.fit(bw=kde.bw, fft=fft)

        npt.assert_almost_equal(s1, kde.support, self.decimal_density)
        npt.assert_almost_equal(d1, kde.density, self.decimal_density)