"""
Spline and other smoother classes for Generalized Additive Models

Author: Luca Puggini
Author: Josef Perktold

Created on Fri Jun  5 16:32:00 2015
"""

# import useful only for development
from abc import ABCMeta, abstractmethod
from statsmodels.compat.python import with_metaclass

import numpy as np
import pandas as pd
from patsy import dmatrix
from patsy.mgcv_cubic_splines import _get_all_sorted_knots

from statsmodels.tools.linalg import transf_constraints


# Obtain b splines from patsy

def _equally_spaced_knots(x, df):
    """Return ``df - 2`` equally spaced knots spanning the range of ``x``."""
    n_knots = df - 2
    x_min = x.min()
    x_max = x.max()
    knots = np.linspace(x_min, x_max, n_knots)
    return knots


def _R_compat_quantile(x, probs):
    """Return the quantiles of ``x`` at ``probs``, keeping the shape of probs.

    Each probability is evaluated separately with ``np.percentile``.
    """
    # return np.percentile(x, 100 * np.asarray(probs))
    probs = np.asarray(probs)
    quantiles = np.asarray([np.percentile(x, 100 * prob)
                            for prob in probs.ravel(order="C")])
    return quantiles.reshape(probs.shape, order="C")


# FIXME: is this copy/pasted?  If so, why do we need it?  If not, get
#    rid of the try/except for scipy import
# from patsy splines.py
def _eval_bspline_basis(x, knots, degree, deriv='all', include_intercept=True):
    """Evaluate a B-spline basis and/or its derivatives at ``x``.

    Parameters
    ----------
    x : array_like
        Evaluation points, 1-D or 2-D with a single column. All points
        have to lie within the range of the knots.
    knots : array_like
        Complete 1-D knot sequence, including repeated boundary knots.
        The knots are sorted internally; the input is not modified.
    degree : int
        Degree of the spline.
    deriv : {'all', 0, 1, 2}
        Which derivative of the basis to compute. 'all' computes the
        basis and its first and second derivative.
    include_intercept : bool
        If False, then the first basis function is dropped.

    Returns
    -------
    ndarray or tuple of ndarray
        If ``deriv`` is 'all', the tuple ``(basis, der1_basis, der2_basis)``,
        otherwise the single array for the requested derivative. Each
        array has one row per point in ``x`` and one column per basis
        function.

    Raises
    ------
    ValueError
        If ``deriv`` is not one of the supported options.
    NotImplementedError
        If some points of ``x`` fall outside of the outermost knots.
    """
    try:
        from scipy.interpolate import splev
    except ImportError as err:
        raise ImportError("spline functionality requires scipy") from err

    if deriv not in ('all', 0, 1, 2):
        # previously this ended in an UnboundLocalError at the return
        raise ValueError("deriv must be 'all', 0, 1 or 2 (not %r)" % (deriv,))

    # 'knots' are assumed to be already pre-processed. E.g. usually you
    # want to include duplicate copies of boundary knots; you should do
    # that *before* calling this constructor.
    knots = np.atleast_1d(np.asarray(knots, dtype=float))
    assert knots.ndim == 1
    # np.sort returns a copy, an in-place sort would reorder the array of
    # the caller if it already is a float ndarray
    knots = np.sort(knots)
    degree = int(degree)
    x = np.atleast_1d(x)
    if x.ndim == 2 and x.shape[1] == 1:
        x = x[:, 0]
    assert x.ndim == 1
    # XX FIXME: when points fall outside of the boundaries, splev and R seem
    # to handle them differently. I do not know why yet. So until we understand
    # this and decide what to do with it, I'm going to play it safe and
    # disallow such points.
    if np.min(x) < np.min(knots) or np.max(x) > np.max(knots):
        raise NotImplementedError("some data points fall outside the "
                                  "outermost knots, and I'm not sure how "
                                  "to handle them. (Patches accepted!)")
    # Thanks to Charles Harris for explaining splev. It's not well
    # documented, but basically it computes an arbitrary b-spline basis
    # given knots and degree on some specificed points (or derivatives
    # thereof, but we do not use that functionality), and then returns some
    # linear combination of these basis functions. To get out the basis
    # functions themselves, we use linear combinations like [1, 0, 0], [0,
    # 1, 0], [0, 0, 1].
    # NB: This probably makes it rather inefficient (though I have not checked
    # to be sure -- maybe the fortran code actually skips computing the basis
    # function for coefficients that are zero).
    # Note: the order of a spline is the same as its degree + 1.
    # Note: there are (len(knots) - order) basis functions.

    k_const = 1 - int(include_intercept)
    n_bases = len(knots) - (degree + 1) - k_const

    orders = (0, 1, 2) if deriv == 'all' else (int(deriv),)
    results = [np.empty((x.shape[0], n_bases), dtype=float) for _ in orders]

    for i in range(n_bases):
        coefs = np.zeros((n_bases + k_const,))
        # we are skipping the first column of the basis to drop constant
        coefs[i + k_const] = 1
        for res, der in zip(results, orders):
            res[:, i] = splev(x, (knots, coefs, degree), der=der)

    if deriv == 'all':
        return tuple(results)
    else:
        return results[0]


def compute_all_knots(x, df, degree):
    """Compute quantile based knots with repeated boundary knots.

    Returns
    -------
    all_knots : ndarray
        Boundary knots, each repeated ``degree + 1`` times, followed by the
        interior knots. The array is not sorted.
    lower_bound, upper_bound : float
        Minimum and maximum of ``x``.
    inner_knots : ndarray
        The ``df - (degree + 1)`` interior knots at quantiles of ``x``.
    """
    order = degree + 1
    n_inner_knots = df - order
    lower_bound = np.min(x)
    upper_bound = np.max(x)
    knot_quantiles = np.linspace(0, 1, n_inner_knots + 2)[1:-1]
    inner_knots = _R_compat_quantile(x, knot_quantiles)
    all_knots = np.concatenate(([lower_bound, upper_bound] * order,
                                inner_knots))
    return all_knots, lower_bound, upper_bound, inner_knots


def make_bsplines_basis(x, df, degree):
    ''' make a spline basis for x

    Returns the basis and its first and second derivative evaluated at x,
    using the knots from ``compute_all_knots``.
    '''

    all_knots, _, _, _ = compute_all_knots(x, df, degree)
    basis, der_basis, der2_basis = _eval_bspline_basis(x, all_knots, degree)
    return basis, der_basis, der2_basis


def get_knots_bsplines(x=None, df=None, knots=None, degree=3,
                       spacing='quantile', lower_bound=None,
                       upper_bound=None, all_knots=None):
    """knots for use in B-splines

    There are two main options for the knot placement

    - quantile spacing with multiplicity of boundary knots
    - equal spacing extended to boundary or exterior knots

    The first corresponds to splines as used by patsy. the second is the
    knot spacing for P-Splines.

    Parameters
    ----------
    x : ndarray
        Data used for the range and the quantiles. Required unless
        ``all_knots`` is provided.
    df : None or int
        Number of basis functions, implies ``df - (degree + 1)`` interior
        knots.
    knots : None or array_like
        Interior knots. Either ``df`` or ``knots`` has to be provided.
    degree : int
        Degree of the spline.
    spacing : {'quantile', 'equal'}
        Placement of interior knots if ``knots`` is None. With 'equal' the
        exterior knots continue the equal spacing of the interior knots,
        otherwise the boundary knots are repeated ``degree + 1`` times.
    lower_bound, upper_bound : None or float
        Location of the boundary knots, default is the range of ``x``.
    all_knots : None or array_like
        If provided, then it is returned unchanged and all other options
        are ignored.

    Returns
    -------
    ndarray
        Sorted array of all knots.

    Raises
    ------
    ValueError
        If options are missing, inconsistent or out of range.
    """
    # based on patsy memorize_finish
    if all_knots is not None:
        return all_knots

    x_min = x.min()
    x_max = x.max()

    if degree < 0:
        raise ValueError("degree must be greater than 0 (not %r)"
                         % (degree,))
    if int(degree) != degree:
        raise ValueError("degree must be an integer (not %r)"
                         % (degree,))

    # These are guaranteed to all be 1d vectors by the code above
    # x = np.concatenate(tmp["xs"])
    if df is None and knots is None:
        raise ValueError("must specify either df or knots")
    order = degree + 1
    if df is not None:
        n_inner_knots = df - order
        if n_inner_knots < 0:
            raise ValueError("df=%r is too small for degree=%r; must be >= %s"
                             % (df, degree,
                                # We know that n_inner_knots is negative;
                                # if df were that much larger, it would
                                # have been zero, and things would work.
                                df - n_inner_knots))
        if knots is not None:
            if len(knots) != n_inner_knots:
                raise ValueError("df=%s with degree=%r implies %s knots, "
                                 "but %s knots were provided"
                                 % (df, degree,
                                    n_inner_knots, len(knots)))
        elif spacing == 'quantile':
            # Need to compute inner knots
            knot_quantiles = np.linspace(0, 1, n_inner_knots + 2)[1:-1]
            inner_knots = _R_compat_quantile(x, knot_quantiles)
        elif spacing == 'equal':
            # Need to compute inner knots
            grid = np.linspace(0, 1, n_inner_knots + 2)[1:-1]
            inner_knots = x_min + grid * (x_max - x_min)
        else:
            raise ValueError("incorrect option for spacing")
    if knots is not None:
        inner_knots = knots
    if lower_bound is None:
        lower_bound = np.min(x)
    if upper_bound is None:
        upper_bound = np.max(x)

    if lower_bound > upper_bound:
        raise ValueError("lower_bound > upper_bound (%r > %r)"
                         % (lower_bound, upper_bound))
    inner_knots = np.asarray(inner_knots)
    if inner_knots.ndim > 1:
        raise ValueError("knots must be 1 dimensional")
    if np.any(inner_knots < lower_bound):
        raise ValueError("some knot values (%s) fall below lower bound "
                         "(%r)"
                         % (inner_knots[inner_knots < lower_bound],
                            lower_bound))
    if np.any(inner_knots > upper_bound):
        raise ValueError("some knot values (%s) fall above upper bound "
                         "(%r)"
                         % (inner_knots[inner_knots > upper_bound],
                            upper_bound))

    if spacing == "equal":
        # The knot distance is taken from the interior knots, so that it is
        # also available if the interior knots were provided by the user.
        if inner_knots.size < 2:
            raise ValueError("spacing='equal' requires at least 2 interior "
                             "knots to determine the knot distance")
        diff_knots = inner_knots[1] - inner_knots[0]
        diffs = np.arange(1, order + 1) * diff_knots
        lower_knots = inner_knots[0] - diffs[::-1]
        upper_knots = inner_knots[-1] + diffs
        all_knots = np.concatenate((lower_knots, inner_knots, upper_knots))
    else:
        all_knots = np.concatenate(([lower_bound, upper_bound] * order,
                                    inner_knots))
    all_knots.sort()

    return all_knots


def _get_integration_points(knots, k_points=3):
    """add points to each subinterval defined by knots

    inserts k_points between each two consecutive knots

    Duplicate knots are removed first. The returned 1-D array contains the
    unique knots and the equally spaced points between them in increasing
    order.
    """
    k_points = k_points + 1
    knots = np.unique(knots)
    dxi = np.arange(k_points) / k_points
    dxk = np.diff(knots)
    dx = dxk[:, None] * dxi
    x = np.concatenate(((knots[:-1, None] + dx).ravel(), [knots[-1]]))
    return x


def get_covder2(smoother, k_points=3, integration_points=None,
                skip_ctransf=False, deriv=2):
    """
    Approximate integral of cross product of second derivative of smoother

    This uses scipy.integrate simpson (simps in older scipy) to compute an
    approximation to the integral of the smoother derivative cross-product
    at knots plus k_points in between knots.

    Parameters
    ----------
    smoother : instance
        Needs a ``knots`` attribute and a ``transform`` method that accepts
        ``deriv`` and ``skip_ctransf`` keywords.
    k_points : int
        Number of points added between consecutive knots, only used if
        ``integration_points`` is None.
    integration_points : None or ndarray
        Points at which the derivative is evaluated.
    skip_ctransf : bool
        Forwarded to ``smoother.transform``.
    deriv : int
        Order of the derivative, forwarded to ``smoother.transform``.

    Returns
    -------
    ndarray
        Square matrix with the integrated cross-products.
    """
    try:
        from scipy.integrate import simpson
    except ImportError:
        # Remove after SciPy 1.7 is the minimum version
        from scipy.integrate import simps as simpson

    knots = smoother.knots
    if integration_points is None:
        x = _get_integration_points(knots, k_points=k_points)
    else:
        x = integration_points

    d2 = smoother.transform(x, deriv=deriv, skip_ctransf=skip_ctransf)
    covd2 = simpson(d2[:, :, None] * d2[:, None, :], x=x, axis=0)
    return covd2


# TODO: this function should be deleted
def make_poly_basis(x, degree, intercept=True):
    '''
    given a vector x returns poly=(1, x, x^2, ..., x^degree)
    and its first and second derivative

    If intercept is False, then the constant column is omitted.
    '''

    if intercept:
        start = 0
    else:
        start = 1

    nobs = len(x)
    basis = np.zeros(shape=(nobs, degree + 1 - start))
    der_basis = np.zeros(shape=(nobs, degree + 1 - start))
    der2_basis = np.zeros(shape=(nobs, degree + 1 - start))

    for i in range(start, degree + 1):
        col = i - start
        basis[:, col] = x ** i
        # Derivatives that vanish identically keep the zero initialization.
        # Evaluating ``0 * x ** -1`` would produce nan at x == 0 and fails
        # for integer arrays.
        if i >= 1:
            der_basis[:, col] = i * x ** (i - 1)
        if i >= 2:
            der2_basis[:, col] = i * (i - 1) * x ** (i - 2)

    return basis, der_basis, der2_basis


# TODO: try to include other kinds of splines from patsy
# x = np.linspace(0, 1, 30)
# df = 10
# degree = 3
# from patsy.mgcv_cubic_splines import cc, cr, te
# all_knots, lower, upper, inner = compute_all_knots(x, df, degree)
# result = cc(x, df=df, knots=all_knots, lower_bound=lower, upper_bound=upper,
#             constraints=None)
#
# import matplotlib.pyplot as plt
#
# result = np.array(result)
# print(result.shape)
# plt.plot(result.T)
# plt.show()

class UnivariateGamSmoother(with_metaclass(ABCMeta)):
    """Base Class for single smooth component

    Subclasses implement ``_smooth_basis_for_single_variable`` which returns
    the tuple ``(basis, der_basis, der2_basis, cov_der2)``. Entries that are
    not None are transformed by the constraint transformation ``ctransf`` if
    there is one.
    """
    def __init__(self, x, constraints=None, variable_name='x'):
        self.x = x
        self.constraints = constraints
        self.variable_name = variable_name
        self.nobs, self.k_variables = len(x), 1

        base4 = self._smooth_basis_for_single_variable()
        if constraints == 'center':
            # centering constraint uses the column means of the basis
            constraints = base4[0].mean(0)[None, :]

        if constraints is not None and not isinstance(constraints, str):
            ctransf = transf_constraints(constraints)
            self.ctransf = ctransf
        else:
            # subclasses might set ctransf directly
            # only used if constraints is None
            if not hasattr(self, 'ctransf'):
                self.ctransf = None

        self.basis, self.der_basis, self.der2_basis, self.cov_der2 = base4
        if self.ctransf is not None:
            ctransf = self.ctransf
            # transform attributes that are not None
            if base4[0] is not None:
                self.basis = base4[0].dot(ctransf)
            if base4[1] is not None:
                self.der_basis = base4[1].dot(ctransf)
            if base4[2] is not None:
                self.der2_basis = base4[2].dot(ctransf)
            if base4[3] is not None:
                self.cov_der2 = ctransf.T.dot(base4[3]).dot(ctransf)

        self.dim_basis = self.basis.shape[1]
        self.col_names = [self.variable_name + "_s" + str(i)
                          for i in range(self.dim_basis)]

    @abstractmethod
    def _smooth_basis_for_single_variable(self):
        return


class UnivariateGenericSmoother(UnivariateGamSmoother):
    """Generic single smooth component

    The basis, its derivatives and the penalty matrix are provided by the
    user.
    """
    def __init__(self, x, basis, der_basis, der2_basis, cov_der2,
                 variable_name='x'):
        self.basis = basis
        self.der_basis = der_basis
        self.der2_basis = der2_basis
        self.cov_der2 = cov_der2

        super().__init__(x, variable_name=variable_name)

    def _smooth_basis_for_single_variable(self):
        return self.basis, self.der_basis, self.der2_basis, self.cov_der2


class UnivariatePolynomialSmoother(UnivariateGamSmoother):
    """polynomial single smooth component
    """
    def __init__(self, x, degree, variable_name='x'):
        self.degree = degree
        super().__init__(x, variable_name=variable_name)

    def _smooth_basis_for_single_variable(self):
        # TODO: unclear description
        """
        given a vector x returns poly=(x, x^2, ..., x^degree)
        and its first and second derivative

        The constant is not included. The penalty matrix is the
        cross-product of the second derivative basis.
        """

        basis = np.zeros(shape=(self.nobs, self.degree))
        der_basis = np.zeros(shape=(self.nobs, self.degree))
        der2_basis = np.zeros(shape=(self.nobs, self.degree))
        for i in range(self.degree):
            dg = i + 1
            basis[:, i] = self.x ** dg
            der_basis[:, i] = dg * self.x ** (dg - 1)
            if dg > 1:
                der2_basis[:, i] = dg * (dg - 1) * self.x ** (dg - 2)
            else:
                der2_basis[:, i] = 0

        cov_der2 = np.dot(der2_basis.T, der2_basis)

        return basis, der_basis, der2_basis, cov_der2


class UnivariateBSplines(UnivariateGamSmoother):
    """B-Spline single smooth component

    This creates and holds the B-Spline basis function for one
    component.

    Parameters
    ----------
    x : ndarray, 1-D
        underlying explanatory variable for smooth terms.
    df : int
        number of basis functions or degrees of freedom
    degree : int
        degree of the spline
    include_intercept : bool
        If False, then the basis functions are transformed so that they
        do not include a constant. This avoids perfect collinearity if
        a constant or several components are included in the model.
    constraints : {None, str, array}
        Constraints are used to transform the basis functions to satisfy
        those constraints.
        `constraints = 'center'` applies a linear transform to remove the
        constant and center the basis functions.
    variable_name : {None, str}
        The name for the underlying explanatory variable, x, used in for
        creating the column and parameter names for the basis functions.
    covder2_kwds : {None, dict}
        options for computing the penalty matrix from the second derivative
        of the spline.
    **knot_kwds
        keyword options for the knot selection that are forwarded to
        `get_knots_bsplines`.
        By default knots are selected in the same way as in patsy, however the
        number of knots is independent of keeping or removing the constant.
        Interior knot selection is based on quantiles of the data and is the
        same in patsy and mgcv. Boundary points are at the limits of the data
        range.
        The available options use with `get_knots_bsplines` are

        - knots : None or array
          interior knots
        - spacing : 'quantile' or 'equal'
        - lower_bound : None or float
          location of lower boundary knots, all boundary knots are at the same
          point
        - upper_bound : None or float
          location of upper boundary knots, all boundary knots are at the same
          point
        - all_knots : None or array
          If all knots are provided, then those will be taken as given and
          all other options will be ignored.
    """
    def __init__(self, x, df, degree=3, include_intercept=False,
                 constraints=None, variable_name='x',
                 covder2_kwds=None, **knot_kwds):
        self.degree = degree
        self.df = df
        self.include_intercept = include_intercept
        self.knots = get_knots_bsplines(x, degree=degree, df=df, **knot_kwds)
        self.covder2_kwds = (covder2_kwds if covder2_kwds is not None
                             else {})
        super().__init__(
            x, constraints=constraints, variable_name=variable_name
        )

    def _smooth_basis_for_single_variable(self):
        basis, der_basis, der2_basis = _eval_bspline_basis(
            self.x, self.knots, self.degree,
            include_intercept=self.include_intercept)
        # cov_der2 = np.dot(der2_basis.T, der2_basis)

        cov_der2 = get_covder2(self, skip_ctransf=True,
                               **self.covder2_kwds)

        return basis, der_basis, der2_basis, cov_der2

    def transform(self, x_new, deriv=0, skip_ctransf=False):
        """create the spline basis for new observations

        The main use of this stateful transformation is for prediction
        using the same specification of the spline basis.

        Parameters
        ----------
        x_new : ndarray
            observations of the underlying explanatory variable.
            If None, then the data of the instance is used.
        deriv : int
            which derivative of the spline basis to compute
            This is an options for internal computation.
        skip_ctransf : bool
            whether to skip the constraint transform
            This is an options for internal computation.

        Returns
        -------
        basis : ndarray
            design matrix for the spline basis for given ``x_new``
        """

        if x_new is None:
            x_new = self.x
        exog = _eval_bspline_basis(x_new, self.knots, self.degree,
                                   deriv=deriv,
                                   include_intercept=self.include_intercept)

        # ctransf does not exist yet when cov_der2 is computed
        ctransf = getattr(self, 'ctransf', None)
        if ctransf is not None and not skip_ctransf:
            exog = exog.dot(ctransf)
        return exog


class UnivariateCubicSplines(UnivariateGamSmoother):
    """Cubic Spline single smooth component

    Cubic splines as described in the wood's book in chapter 3

    Parameters
    ----------
    x : ndarray, 1-D
        underlying explanatory variable for smooth terms.
    df : int
        determines the number of knots, ``df - 2`` equally spaced knots
        are used.
    constraints : {None, str, array}
        Constraints used to transform the basis functions. The strings
        'none' and 'no-const' are handled in this class, other options
        are handled by the base class.
    transform : {'domain', None, tuple}
        Transformation of ``x``. 'domain' rescales by the range of the
        data, a tuple ``(low, upp)`` rescales by the given bounds and None
        uses ``x`` unchanged.
    variable_name : str
        The name for the underlying explanatory variable, x, used for
        creating the column names of the basis functions.
    """

    def __init__(self, x, df, constraints=None, transform='domain',
                 variable_name='x'):

        self.degree = 3
        self.df = df
        self.transform_data_method = transform

        self.x = x = self.transform_data(x, initialize=True)
        self.knots = _equally_spaced_knots(x, df)
        super().__init__(
            x, constraints=constraints, variable_name=variable_name
        )

    def transform_data(self, x, initialize=False):
        """Rescale ``x`` using the stored domain bounds.

        If ``initialize`` is True, then the domain bounds are set from
        ``x`` or from the tuple given in the ``transform`` option.
        """
        tm = self.transform_data_method
        if tm is None:
            return x

        if initialize is True:
            if tm == 'domain':
                self.domain_low = x.min(0)
                self.domain_upp = x.max(0)
            elif isinstance(tm, tuple):
                self.domain_low = tm[0]
                self.domain_upp = tm[1]
                self.transform_data_method = 'domain'
            else:
                raise ValueError("transform should be None, 'domain' "
                                 "or a tuple")
            self.domain_diff = self.domain_upp - self.domain_low

        if self.transform_data_method == 'domain':
            x = (x - self.domain_low) / self.domain_diff
            return x
        else:
            raise ValueError("incorrect transform_data_method")

    def _smooth_basis_for_single_variable(self):

        # the column for the last knot is dropped in basis and penalty
        basis = self._splines_x()[:, :-1]
        # demean except for constant, does not affect derivatives
        if not self.constraints == 'none':
            self.transf_mean = basis[:, 1:].mean(0)
            basis[:, 1:] -= self.transf_mean
        else:
            # one entry per non-constant column so that ``transform`` can
            # subtract it in the same way as the column means
            self.transf_mean = np.zeros(basis.shape[1] - 1)
        s = self._splines_s()[:-1, :-1]
        if not self.constraints == 'none':
            ctransf = np.diag(1/np.max(np.abs(basis), axis=0))
        else:
            ctransf = np.eye(basis.shape[1])
        # use np.eye to avoid rescaling
        # ctransf = np.eye(basis.shape[1])

        if self.constraints == 'no-const':
            # drop the column that maps to the constant; dropping a row
            # would make ``basis.dot(ctransf)`` fail with a shape mismatch
            ctransf = ctransf[:, 1:]

        self.ctransf = ctransf

        return basis, None, None, s

    def _rk(self, x, z):
        """Kernel function of x and z used for the basis and the penalty."""
        p1 = ((z - 1 / 2) ** 2 - 1 / 12) * ((x - 1 / 2) ** 2 - 1 / 12) / 4
        p2 = ((np.abs(z - x) - 1 / 2) ** 4 -
              1 / 2 * (np.abs(z - x) - 1 / 2) ** 2 + 7 / 240) / 24.
        return p1 - p2

    def _splines_x(self, x=None):
        """Basis with columns constant, x and ``_rk(x, knot)`` per knot."""
        if x is None:
            x = self.x
        n_columns = len(self.knots) + 2
        nobs = x.shape[0]
        basis = np.ones(shape=(nobs, n_columns))
        basis[:, 1] = x
        # for loop equivalent to outer(x, xk, fun=rk)
        for i, xi in enumerate(x):
            for j, xkj in enumerate(self.knots):
                s_ij = self._rk(xi, xkj)
                basis[i, j + 2] = s_ij
        return basis

    def _splines_s(self):
        """Penalty matrix, zero for the constant and linear columns."""
        q = len(self.knots) + 2
        s = np.zeros(shape=(q, q))
        for i, x1 in enumerate(self.knots):
            for j, x2 in enumerate(self.knots):
                s[i + 2, j + 2] = self._rk(x1, x2)
        return s

    def transform(self, x_new):
        """create the spline basis for new observations

        The same steps as in the creation of the basis are applied, i.e.
        rescaling of the data, dropping the column of the last knot,
        subtracting the stored column means and the constraint transform.

        Parameters
        ----------
        x_new : ndarray
            observations of the underlying explanatory variable

        Returns
        -------
        basis : ndarray
            design matrix for the spline basis for given ``x_new``
        """
        x_new = self.transform_data(x_new, initialize=False)
        # drop the last column as in _smooth_basis_for_single_variable,
        # otherwise transf_mean and ctransf do not match in shape
        exog = self._splines_x(x_new)[:, :-1]
        exog[:, 1:] -= self.transf_mean
        if self.ctransf is not None:
            exog = exog.dot(self.ctransf)
        return exog


class UnivariateCubicCyclicSplines(UnivariateGamSmoother):
    """cyclic cubic regression spline single smooth component

    This creates and holds the Cyclic CubicSpline basis function for one
    component.

    Parameters
    ----------
    x : ndarray, 1-D
        underlying explanatory variable for smooth terms.
    df : int
        number of basis functions or degrees of freedom
    constraints : {None, str, array}
        Constraints are used to transform the basis functions to satisfy
        those constraints.
        `constraints = 'center'` applies a linear transform to remove the
        constant and center the basis functions.
    variable_name : None or str
        The name for the underlying explanatory variable, x, used in for
        creating the column and parameter names for the basis functions.
    """
    def __init__(self, x, df, constraints=None, variable_name='x'):
        self.degree = 3
        self.df = df
        self.x = x
        self.knots = _equally_spaced_knots(x, df)
        super().__init__(
            x, constraints=constraints, variable_name=variable_name
        )

    def _smooth_basis_for_single_variable(self):
        basis = dmatrix("cc(x, df=" + str(self.df) + ") - 1", {"x": self.x})
        # design_info is stored so that transform can rebuild the basis
        self.design_info = basis.design_info
        n_inner_knots = self.df - 2 + 1  # +n_constraints
        # TODO: from CubicRegressionSplines class
        all_knots = _get_all_sorted_knots(self.x, n_inner_knots=n_inner_knots,
                                          inner_knots=None,
                                          lower_bound=None, upper_bound=None)

        b, d = self._get_b_and_d(all_knots)
        s = self._get_s(b, d)

        return basis, None, None, s

    def _get_b_and_d(self, knots):
        """Returns mapping of cyclic cubic spline values to 2nd derivatives.

        .. note:: See 'Generalized Additive Models', Simon N. Wood, 2006,
           pp 146-147

        Parameters
        ----------
        knots : ndarray
            The 1-d array knots used for cubic spline parametrization,
            must be sorted in ascending order.

        Returns
        -------
        b : ndarray
            Array for mapping cyclic cubic spline values at knots to
            second derivatives.
        d : ndarray
            Array for mapping cyclic cubic spline values at knots to
            second derivatives.

        Notes
        -----
        The penalty matrix is equal to ``s = d.T.dot(b^-1).dot(d)``
        """
        h = knots[1:] - knots[:-1]
        n = knots.size - 1

        # b and d are defined such that the penalty matrix is equivalent to:
        # s = d.T.dot(b^-1).dot(d)
        # reference in particular to pag 146 of Wood's book
        b = np.zeros((n, n))  # the b matrix on page 146 of Wood's book
        d = np.zeros((n, n))  # the d matrix on page 146 of Wood's book

        b[0, 0] = (h[n - 1] + h[0]) / 3.
        b[0, n - 1] = h[n - 1] / 6.
        b[n - 1, 0] = h[n - 1] / 6.

        d[0, 0] = -1. / h[0] - 1. / h[n - 1]
        d[0, n - 1] = 1. / h[n - 1]
        d[n - 1, 0] = 1. / h[n - 1]

        for i in range(1, n):
            b[i, i] = (h[i - 1] + h[i]) / 3.
            b[i, i - 1] = h[i - 1] / 6.
            b[i - 1, i] = h[i - 1] / 6.

            d[i, i] = -1. / h[i - 1] - 1. / h[i]
            d[i, i - 1] = 1. / h[i - 1]
            d[i - 1, i] = 1. / h[i - 1]

        return b, d

    def _get_s(self, b, d):
        """Penalty matrix ``d.T.dot(b^-1).dot(d)``."""
        return d.T.dot(np.linalg.inv(b)).dot(d)

    def transform(self, x_new):
        """create the spline basis for new observations

        The basis is rebuilt from the stored patsy design_info and
        transformed by the constraint transformation if there is one.
        """
        exog = dmatrix(self.design_info, {"x": x_new})
        if self.ctransf is not None:
            exog = exog.dot(self.ctransf)
        return exog


class AdditiveGamSmoother(with_metaclass(ABCMeta)):
    """Base class for additive smooth components

    Subclasses implement ``_make_smoothers_list`` which returns one
    univariate smoother for each column of ``x``.
    """
    def __init__(self, x, variable_names=None, include_intercept=False,
                 **kwargs):

        # get pandas names before using asarray
        if isinstance(x, pd.DataFrame):
            data_names = x.columns.tolist()
        elif isinstance(x, pd.Series):
            data_names = [x.name]
        else:
            data_names = None

        x = np.asarray(x)

        if x.ndim == 1:
            # column vector that does not share memory with the input
            self.x = x.copy().reshape(len(x), 1)
        else:
            self.x = x

        self.nobs, self.k_variables = self.x.shape
        if isinstance(include_intercept, bool):
            self.include_intercept = [include_intercept] * self.k_variables
        else:
            self.include_intercept = include_intercept

        if variable_names is None:
            if data_names is not None:
                self.variable_names = data_names
            else:
                self.variable_names = ['x' + str(i)
                                       for i in range(self.k_variables)]
        else:
            self.variable_names = variable_names

        self.smoothers = self._make_smoothers_list()
        self.basis = np.hstack([smoother.basis
                                for smoother in self.smoothers])
        self.dim_basis = self.basis.shape[1]
        self.penalty_matrices = [smoother.cov_der2
                                 for smoother in self.smoothers]
        self.col_names = []
        for smoother in self.smoothers:
            self.col_names.extend(smoother.col_names)

        # one boolean mask per smoother that selects its basis columns
        self.mask = []
        last_column = 0
        for smoother in self.smoothers:
            mask = np.zeros(self.dim_basis, dtype=bool)
            mask[last_column:smoother.dim_basis + last_column] = True
            last_column = last_column + smoother.dim_basis
            self.mask.append(mask)

    @abstractmethod
    def _make_smoothers_list(self):
        pass

    def transform(self, x_new):
        """create the spline basis for new observations

        The main use of this stateful transformation is for prediction
        using the same specification of the spline basis.

        Parameters
        ----------
        x_new : array_like
            observations of the underlying explanatory variable, with
            variables in columns. Pandas objects and lists are converted
            to ndarray.

        Returns
        -------
        basis : ndarray
            design matrix for the spline basis for given ``x_new``.
        """
        # same conversion as in __init__, so that the inputs that can be
        # used for creating the smoother can also be used for prediction
        x_new = np.asarray(x_new)
        if x_new.ndim == 1 and self.k_variables == 1:
            x_new = x_new.reshape(-1, 1)
        exog = np.hstack([self.smoothers[i].transform(x_new[:, i])
                          for i in range(self.k_variables)])
        return exog


class GenericSmoothers(AdditiveGamSmoother):
    """generic class for additive smooth components for GAM

    The univariate smoothers are provided by the user.
    """
    def __init__(self, x, smoothers):
        self.smoothers = smoothers
        super().__init__(x, variable_names=None)

    def _make_smoothers_list(self):
        return self.smoothers


class PolynomialSmoother(AdditiveGamSmoother):
    """additive polynomial components for GAM

    ``degrees`` contains the degree of the polynomial for each column
    of ``x``.
    """
    def __init__(self, x, degrees, variable_names=None):
        self.degrees = degrees
        super().__init__(x, variable_names=variable_names)

    def _make_smoothers_list(self):
        smoothers = []
        for v in range(self.k_variables):
            uv_smoother = UnivariatePolynomialSmoother(
                self.x[:, v],
                degree=self.degrees[v],
                variable_name=self.variable_names[v])
            smoothers.append(uv_smoother)
        return smoothers


class BSplines(AdditiveGamSmoother):
    """additive smooth components using B-Splines

    This creates and holds the B-Spline basis function for several
    components.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        underlying explanatory variable for smooth terms.
        If 2-dimensional, then observations should be in rows and
        explanatory variables in columns.
    df :  {int, array_like[int]}
        number of basis functions or degrees of freedom; should be equal
        in length to the number of columns of `x`; may be an integer if
        `x` has one column or is 1-D.
    degree : {int, array_like[int]}
        degree(s) of the spline; the same length and type rules apply as
        to `df`
    include_intercept : bool
        If False, then the basis functions are transformed so that they
        do not include a constant. This avoids perfect collinearity if
        a constant or several components are included in the model.
    constraints : {None, str, array}
        Constraints are used to transform the basis functions to satisfy
        those constraints.
        `constraints = 'center'` applies a linear transform to remove the
        constant and center the basis functions.
    variable_names : {list[str], None}
        The names for the underlying explanatory variables, x used in for
        creating the column and parameter names for the basis functions.
        If ``x`` is a pandas object, then the names will be taken from it.
    knot_kwds : None or list of dict
        option for the knot selection.
        By default knots are selected in the same way as in patsy, however the
        number of knots is independent of keeping or removing the constant.
        Interior knot selection is based on quantiles of the data and is the
        same in patsy and mgcv. Boundary points are at the limits of the data
        range.
        The available options use with `get_knots_bsplines` are

        - knots : None or array
          interior knots
        - spacing : 'quantile' or 'equal'
        - lower_bound : None or float
          location of lower boundary knots, all boundary knots are at the same
          point
        - upper_bound : None or float
          location of upper boundary knots, all boundary knots are at the same
          point
        - all_knots : None or array
          If all knots are provided, then those will be taken as given and
          all other options will be ignored.


    Attributes
    ----------
    smoothers : list of univariate smooth component instances
    basis : design matrix, array of spline bases columns for all components
    penalty_matrices : list of penalty matrices, one for each smooth term
    dim_basis : number of columns in the basis
    k_variables : number of smooth components
    col_names : created names for the basis columns

    There are additional attributes about the specification of the splines
    and some attributes mainly for internal use.

    Notes
    -----
    A constant in the spline basis function can be removed in two different
    ways.
    The first is by dropping one basis column and normalizing the
    remaining columns. This is obtained by the default
    ``include_intercept=False, constraints=None``
    The second option is by using the centering transform which is a linear
    transformation of all basis functions. As a consequence of the
    transformation, the B-spline basis functions do not have locally bounded
    support anymore. This is obtained ``constraints='center'``. In this case
    ``include_intercept`` will be automatically set to True to avoid
    dropping an additional column.
    """
    def __init__(self, x, df, degree, include_intercept=False,
                 constraints=None, variable_names=None, knot_kwds=None):
        # numpy integer scalars are not instances of int, but have to be
        # wrapped in the same way so that they can be indexed by variable
        if isinstance(degree, (int, np.integer)):
            self.degrees = np.array([degree], dtype=int)
        else:
            self.degrees = degree
        if isinstance(df, (int, np.integer)):
            self.dfs = np.array([df], dtype=int)
        else:
            self.dfs = df
        self.knot_kwds = knot_kwds
        # TODO: move attaching constraints to super call
        self.constraints = constraints
        if constraints == 'center':
            include_intercept = True

        super().__init__(
            x, include_intercept=include_intercept,
            variable_names=variable_names
        )

    def _make_smoothers_list(self):
        smoothers = []
        for v in range(self.k_variables):
            kwds = self.knot_kwds[v] if self.knot_kwds else {}
            uv_smoother = UnivariateBSplines(
                self.x[:, v],
                df=self.dfs[v], degree=self.degrees[v],
                include_intercept=self.include_intercept[v],
                constraints=self.constraints,
                variable_name=self.variable_names[v], **kwds)
            smoothers.append(uv_smoother)

        return smoothers


class CubicSplines(AdditiveGamSmoother):
    """additive smooth components using cubic splines as in Wood 2006.

    Note, these splines do NOT use the same spline basis as
    ``Cubic Regression Splines``.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        underlying explanatory variable for smooth terms, with variables
        in columns.
    df : {int, array_like[int]}
        option ``df`` of the univariate smoothers. A scalar is used for all
        columns of ``x``, otherwise one entry per column is required.
    constraints : {None, str, array}
        Constraints that are forwarded to the univariate smoothers.
    transform : {'domain', None, tuple}
        Transformation of the data that is forwarded to the univariate
        smoothers.
    variable_names : {list[str], None}
        The names for the underlying explanatory variables. If ``x`` is a
        pandas object, then the names will be taken from it.
    """
    def __init__(self, x, df, constraints='center', transform='domain',
                 variable_names=None):
        self.dfs = df
        self.constraints = constraints
        self.transform = transform
        super().__init__(
            x, constraints=constraints, variable_names=variable_names
        )

    def _make_smoothers_list(self):
        dfs = self.dfs
        if np.ndim(dfs) == 0:
            # use a scalar df for each variable
            dfs = [dfs] * self.k_variables

        smoothers = []
        for v in range(self.k_variables):
            uv_smoother = UnivariateCubicSplines(
                self.x[:, v], df=dfs[v],
                constraints=self.constraints,
                transform=self.transform,
                variable_name=self.variable_names[v])
            smoothers.append(uv_smoother)

        return smoothers


class CyclicCubicSplines(AdditiveGamSmoother):
    """additive smooth components using cyclic cubic regression splines

    This spline basis is the same as in patsy.

    Parameters
    ----------
    x : array_like, 1-D or 2-D
        underlying explanatory variable for smooth terms.
        If 2-dimensional, then observations should be in rows and
        explanatory variables in columns.
    df : {int, array_like[int]}
        number of basis functions or degrees of freedom. A scalar is used
        for all columns of ``x``, otherwise one entry per column is
        required.
    constraints : {None, str, array}
        Constraints are used to transform the basis functions to satisfy
        those constraints.
    variable_names : {list[str], None}
        The names for the underlying explanatory variables, x used in for
        creating the column and parameter names for the basis functions.
        If ``x`` is a pandas object, then the names will be taken from it.
    """
    def __init__(self, x, df, constraints=None, variable_names=None):
        self.dfs = df
        # TODO: move attaching constraints to super call
        self.constraints = constraints
        super().__init__(x, variable_names=variable_names)

    def _make_smoothers_list(self):
        dfs = self.dfs
        if np.ndim(dfs) == 0:
            # use a scalar df for each variable
            dfs = [dfs] * self.k_variables

        smoothers = []
        for v in range(self.k_variables):
            uv_smoother = UnivariateCubicCyclicSplines(
                self.x[:, v], df=dfs[v],
                constraints=self.constraints,
                variable_name=self.variable_names[v])
            smoothers.append(uv_smoother)

        return smoothers

# class CubicRegressionSplines(BaseCubicSplines):
#     # TODO: this class is still not tested
#
#     def __init__(self, x, df=10):
#         import warnings
#         warnings.warn("This class is still not tested and it is probably"
#                       " not working properly. "
#                       "I suggest to use another smoother", Warning)
#
#         super(CubicRegressionSplines, self).__init__(x, df)
#
#         self.basis = dmatrix("cc(x, df=" + str(df) + ") - 1", {"x": x})
#         n_inner_knots = df - 2 + 1 # +n_constraints
#         # TODO: ACcording to CubicRegressionSplines class this should be
#         #  n_inner_knots = df - 2
#         all_knots = _get_all_sorted_knots(x, n_inner_knots=n_inner_knots,
#                                           inner_knots=None,
#                                           lower_bound=None, upper_bound=None)
#
#         b, d = self._get_b_and_d(all_knots)
#         self.s = self._get_s(b, d)
#
#         self.dim_basis = self.basis.shape[1]
#
#     def _get_b_and_d(self, knots):
#
#         h = knots[1:] - knots[:-1]
#         n = knots.size - 1
#
#         # b and d are defined such that the penalty matrix is equivalent to:
#         # s = d.T.dot(b^-1).dot(d)
#         # reference in particular to pag 146 of Wood's book
#         b = np.zeros((n, n)) # the b matrix on page 146 of Wood's book
#         d = np.zeros((n, n)) # the d matrix on page 146 of Wood's book
#
#         for i in range(n-2):
#             d[i, i] = 1/h[i]
#             d[i, i+1] = -1/h[i] - 1/h[i+1]
#             d[i, i+2] = 1/h[i+1]
#
#             b[i, i] = (h[i] + h[i+1])/3
#
#         for i in range(n-3):
#             b[i, i+1] = h[i+1]/6
#             b[i+1, i] = h[i+1]/6
#
#         return b, d
#
#     def _get_s(self, b, d):
#
#         return d.T.dot(np.linalg.pinv(b)).dot(d)