"""
A collection of smooth penalty functions.

Penalties on vectors take a vector argument and return a scalar
penalty. The gradient of the penalty is a vector with the same shape
as the input value.

Penalties on covariance matrices take two arguments: the matrix and
its inverse, both in unpacked (square) form. The returned penalty is
a scalar, and the gradient is returned as a vector that contains the
gradient with respect to the free elements in the lower triangle of
the covariance matrix.

All penalties are subtracted from the log-likelihood, so greater
penalty values correspond to a greater degree of penalization.

The penalties should be smooth so that they can be subtracted from
log-likelihood functions and optimized using standard methods (i.e. L1
penalties do not belong here).
"""
import numpy as np


class Penalty:
    """
    A class for representing a scalar-valued penalty.

    Parameters
    ----------
    weights : array_like
        A vector of weights that determines the weight of the penalty
        for each parameter.

    Notes
    -----
    The class has a member called `alpha` that scales the weights.
    """

    def __init__(self, weights=1.):
        self.weights = weights
        self.alpha = 1.

    def func(self, params):
        """
        A penalty function on a vector of parameters.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        A scalar penalty value; greater values imply greater
        penalization.
        """
        raise NotImplementedError

    def deriv(self, params):
        """
        The gradient of a penalty function.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        The gradient of the penalty with respect to each element in
        `params`.
        """
        raise NotImplementedError

    def _null_weights(self, params):
        """workaround for the Null model

        This will not be needed anymore when we can use `self._null_drop_keys`
        as in DiscreteModels.
        TODO: check other models
        """
        if np.size(self.weights) > 1:
            if len(params) == 1:
                raise  # raise to identify models where this would be needed
                return 0.

        return self.weights


class NonePenalty(Penalty):
    """
    A penalty that does not penalize.
    """

    def __init__(self, **kwds):
        super().__init__()
        if kwds:
            import warnings
            warnings.warn('keyword arguments will be ignored')

    def func(self, params):
        if params.ndim == 2:
            return np.zeros(params.shape[1:])
        else:
            return 0

    def deriv(self, params):
        return np.zeros(params.shape)

    def deriv2(self, params):
        # returns the diagonal of the Hessian
        return np.zeros(params.shape[0])


class L2(Penalty):
    """
    The L2 (ridge) penalty.
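
    Examples
    --------
    A minimal illustrative sketch (added here, not part of the original
    docstring); the weights and parameter values are arbitrary:

    >>> import numpy as np
    >>> pen = L2(weights=np.array([0., 1., 1.]))
    >>> float(pen.func(np.array([1., 2., 3.])))  # 0*1 + 1*4 + 1*9
    13.0
    >>> pen.deriv(np.array([1., 2., 3.]))  # 2 * weights * alpha * params
    array([0., 4., 6.])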
"""
|
|||
|
|
|||
|
def __init__(self, weights=1.):
|
|||
|
super().__init__(weights)
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
return np.sum(self.weights * self.alpha * params**2)
|
|||
|
|
|||
|
def deriv(self, params):
|
|||
|
return 2 * self.weights * self.alpha * params
|
|||
|
|
|||
|
def deriv2(self, params):
|
|||
|
return 2 * self.weights * self.alpha * np.ones(len(params))
|
|||
|
|
|||
|
|
|||
|


class L2Univariate(Penalty):
    """
    The L2 (ridge) penalty applied to each parameter.
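
    Unlike `L2`, `func` returns one penalty value per parameter instead of
    their sum, which is the vectorized form that `ConstraintsPenalty` expects.

    Examples
    --------
    Illustrative sketch (added, not part of the original docstring):

    >>> import numpy as np
    >>> pen = L2Univariate()
    >>> pen.func(np.array([1., 2., 3.]))  # elementwise, no summation
    array([1., 4., 9.])
    >>> pen.deriv(np.array([1., 2., 3.]))
    array([2., 4., 6.])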
"""
|
|||
|
|
|||
|
def __init__(self, weights=None):
|
|||
|
if weights is None:
|
|||
|
self.weights = 1.
|
|||
|
else:
|
|||
|
self.weights = weights
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
return self.weights * params**2
|
|||
|
|
|||
|
def deriv(self, params):
|
|||
|
return 2 * self.weights * params
|
|||
|
|
|||
|
def deriv2(self, params):
|
|||
|
return 2 * self.weights * np.ones(len(params))
|
|||
|
|
|||
|
|
|||
|


class PseudoHuber(Penalty):
    """
    The pseudo-Huber penalty.
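
    The penalty for a scalar x is ``dlt**2 * (sqrt(1 + (x / dlt)**2) - 1)``,
    which behaves like ``x**2 / 2`` for ``|x| << dlt`` and grows roughly
    linearly in ``|x|`` for ``|x| >> dlt``.

    Examples
    --------
    Illustrative sketch (added, not part of the original docstring):

    >>> import numpy as np
    >>> pen = PseudoHuber(dlt=1.)
    >>> round(float(pen.func(np.array([0.1]))), 6)   # ~ 0.1**2 / 2
    0.004988
    >>> round(float(pen.func(np.array([100.]))), 2)  # ~ dlt * |x| - dlt**2
    99.0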
"""
|
|||
|
|
|||
|
def __init__(self, dlt, weights=1.):
|
|||
|
super().__init__(weights)
|
|||
|
self.dlt = dlt
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
v = np.sqrt(1 + (params / self.dlt)**2)
|
|||
|
v -= 1
|
|||
|
v *= self.dlt**2
|
|||
|
return np.sum(self.weights * self.alpha * v, 0)
|
|||
|
|
|||
|
def deriv(self, params):
|
|||
|
v = np.sqrt(1 + (params / self.dlt)**2)
|
|||
|
return params * self.weights * self.alpha / v
|
|||
|
|
|||
|
def deriv2(self, params):
|
|||
|
v = np.power(1 + (params / self.dlt)**2, -3/2)
|
|||
|
return self.weights * self.alpha * v
|
|||
|
|
|||
|
|
|||
|


class SCAD(Penalty):
    """
    The SCAD penalty of Fan and Li.

    The SCAD penalty is linear around zero, like an L1 penalty, up to the
    threshold tau. It is constant for values larger than c*tau. The middle
    segment is quadratic and connects the two outer segments with a
    continuous derivative. The penalty is symmetric around zero.

    Parameterization follows Buu, Johnson, Li and Tan (2011).
    Fan and Li use lambda instead of tau, and a instead of c. Fan and Li
    recommend setting c=3.7.

    f(x) = { tau |x|                                       if 0 <= |x| < tau
           { -(|x|^2 - 2 c tau |x| + tau^2) / (2 (c - 1))  if tau <= |x| < c tau
           { (c + 1) tau^2 / 2                             if c tau <= |x|

    Parameters
    ----------
    tau : float
        Slope and threshold for the linear segment.
    c : float
        Factor for the second threshold, which is c * tau.
    weights : None or array
        Weights for the penalty of each parameter. If an entry is zero, then
        the corresponding parameter is not penalized.

    References
    ----------
    Buu, Anne, Norman J. Johnson, Runze Li, and Xianming Tan. "New variable
    selection methods for zero-inflated count data with applications to the
    substance abuse field."
    Statistics in Medicine 30, no. 18 (2011): 2326-2340.

    Fan, Jianqing, and Runze Li. "Variable selection via nonconcave penalized
    likelihood and its oracle properties."
    Journal of the American Statistical Association 96, no. 456 (2001):
    1348-1360.
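
    Examples
    --------
    Illustrative sketch of the three segments (added, not part of the
    original docstring), with tau=1 and the recommended c=3.7:

    >>> import numpy as np
    >>> pen = SCAD(tau=1., c=3.7)
    >>> float(pen.func(np.array([0.5])))       # linear segment: tau * |x|
    0.5
    >>> float(pen.func(np.array([10.])))       # flat segment: (c + 1) * tau**2 / 2
    2.35
    >>> float(pen.deriv(np.array([10.]))[0])   # zero slope beyond c * tau
    0.0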
"""
|
|||
|
|
|||
|
def __init__(self, tau, c=3.7, weights=1.):
|
|||
|
super().__init__(weights)
|
|||
|
self.tau = tau
|
|||
|
self.c = c
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
|
|||
|
# 3 segments in absolute value
|
|||
|
tau = self.tau
|
|||
|
p_abs = np.atleast_1d(np.abs(params))
|
|||
|
res = np.empty(p_abs.shape, p_abs.dtype)
|
|||
|
res.fill(np.nan)
|
|||
|
mask1 = p_abs < tau
|
|||
|
mask3 = p_abs >= self.c * tau
|
|||
|
res[mask1] = tau * p_abs[mask1]
|
|||
|
mask2 = ~mask1 & ~mask3
|
|||
|
p_abs2 = p_abs[mask2]
|
|||
|
tmp = (p_abs2**2 - 2 * self.c * tau * p_abs2 + tau**2)
|
|||
|
res[mask2] = -tmp / (2 * (self.c - 1))
|
|||
|
res[mask3] = (self.c + 1) * tau**2 / 2.
|
|||
|
|
|||
|
return (self.weights * res).sum(0)
|
|||
|
|
|||
|
    def deriv(self, params):

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        p_sign = np.sign(p)
        res = np.empty(p_abs.shape)
        res.fill(np.nan)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask1] = p_sign[mask1] * tau
        tmp = p_sign[mask2] * (p_abs[mask2] - self.c * tau)
        res[mask2] = -tmp / (self.c - 1)
        res[mask3] = 0

        return self.weights * res

    def deriv2(self, params):
        """Second derivative of the penalty function.

        This returns a scalar or a vector in the same shape as params, not a
        square Hessian. If the return value is 1-dimensional, then it is the
        diagonal of the Hessian.
        """

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        res = np.zeros(p_abs.shape)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask2] = -1 / (self.c - 1)

        return self.weights * res


class SCADSmoothed(SCAD):
    """
    The SCAD penalty of Fan and Li, quadratically smoothed around zero.

    This follows Fan and Li 2001, equation (3.7).

    Parameterization follows Buu, Johnson, Li and Tan (2011); see the
    docstring of SCAD.

    Parameters
    ----------
    tau : float
        Slope and threshold for the linear segment.
    c : float
        Factor for the second threshold.
    c0 : float
        Threshold for the quadratically smoothed segment.
    restriction : None or array
        If not None, then restriction defines a linear transformation of the
        parameters; the penalty is applied to the transformed parameters.

    Notes
    -----
    TODO: Use delegation instead of subclassing, so smoothing can be added to
    all penalty classes.
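
    Examples
    --------
    Illustrative sketch (added, not part of the original docstring); inside
    ``|x| < c0`` the penalty is the quadratic ``aq2 * x**2``, shifted so that
    ``func(0) == 0``:

    >>> import numpy as np
    >>> pen = SCADSmoothed(tau=1., c=3.7, c0=0.1)
    >>> float(pen.func(np.array([0.])))
    0.0
    >>> round(float(pen.deriv(np.array([0.05]))[0]), 6)  # 2 * aq2 * x
    0.5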
"""
|
|||
|
|
|||
|
def __init__(self, tau, c=3.7, c0=None, weights=1., restriction=None):
|
|||
|
super().__init__(tau, c=c, weights=weights)
|
|||
|
self.tau = tau
|
|||
|
self.c = c
|
|||
|
self.c0 = c0 if c0 is not None else tau * 0.1
|
|||
|
if self.c0 > tau:
|
|||
|
raise ValueError('c0 cannot be larger than tau')
|
|||
|
|
|||
|
# get coefficients for quadratic approximation
|
|||
|
c0 = self.c0
|
|||
|
# need to temporarily override weights for call to super
|
|||
|
weights = self.weights
|
|||
|
self.weights = 1.
|
|||
|
deriv_c0 = super().deriv(c0)
|
|||
|
value_c0 = super().func(c0)
|
|||
|
self.weights = weights
|
|||
|
|
|||
|
self.aq1 = value_c0 - 0.5 * deriv_c0 * c0
|
|||
|
self.aq2 = 0.5 * deriv_c0 / c0
|
|||
|
self.restriction = restriction
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
# workaround for Null model
|
|||
|
weights = self._null_weights(params)
|
|||
|
# TODO: `and np.size(params) > 1` is hack for llnull, need better solution
|
|||
|
if self.restriction is not None and np.size(params) > 1:
|
|||
|
params = self.restriction.dot(params)
|
|||
|
# need to temporarily override weights for call to super
|
|||
|
# Note: we have the same problem with `restriction`
|
|||
|
self_weights = self.weights
|
|||
|
self.weights = 1.
|
|||
|
value = super().func(params[None, ...])
|
|||
|
self.weights = self_weights
|
|||
|
|
|||
|
# shift down so func(0) == 0
|
|||
|
value -= self.aq1
|
|||
|
# change the segment corrsponding to quadratic approximation
|
|||
|
p_abs = np.atleast_1d(np.abs(params))
|
|||
|
mask = p_abs < self.c0
|
|||
|
p_abs_masked = p_abs[mask]
|
|||
|
value[mask] = self.aq2 * p_abs_masked**2
|
|||
|
|
|||
|
return (weights * value).sum(0)
|
|||
|
|
|||
|
    def deriv(self, params):
        # workaround for the Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for the call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv(params)
        self.weights = self_weights

        # change the segment corresponding to the quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2 * p[mask]

        if self.restriction is not None and np.size(params) > 1:
            return weights * value.dot(self.restriction)
        else:
            return weights * value

    def deriv2(self, params):
        # workaround for the Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for the call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv2(params)
        self.weights = self_weights

        # change the segment corresponding to the quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2

        if self.restriction is not None and np.size(params) > 1:
            # note: super returns a 1d array for the diagonal (hessian_diag)
            # TODO: weights are missing
            return (self.restriction.T * (weights * value)
                    ).dot(self.restriction)
        else:
            return weights * value


class ConstraintsPenalty:
    """
    Penalty applied to a linear transformation of the parameters.

    Parameters
    ----------
    penalty : instance of penalty function
        Currently this requires an instance of a univariate, vectorized
        penalty class.
    weights : None or ndarray
        Weights for adding the penalties of the transformed params.
    restriction : None or ndarray
        If it is not None, then restriction defines a linear transformation
        of the parameters. The penalty function is applied to each transformed
        parameter independently.

    Notes
    -----
    `restriction` allows us to impose penalization on contrasts or stochastic
    constraints of the original parameters.
    Examples of such contrasts are difference penalties or all-pairs
    penalties.
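
    Examples
    --------
    Illustrative sketch of a difference penalty on adjacent parameters
    (added, not part of the original docstring):

    >>> import numpy as np
    >>> restriction = np.array([[1., -1., 0.], [0., 1., -1.]])
    >>> pen = ConstraintsPenalty(L2Univariate(), restriction=restriction)
    >>> params = np.array([1., 2., 4.])
    >>> float(pen.func(params))       # (1 - 2)**2 + (2 - 4)**2
    5.0
    >>> pen.deriv(params).tolist()    # gradient mapped back to original params
    [-2.0, -2.0, 4.0]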
"""
|
|||
|
|
|||
|
def __init__(self, penalty, weights=None, restriction=None):
|
|||
|
|
|||
|
self.penalty = penalty
|
|||
|
if weights is None:
|
|||
|
self.weights = 1.
|
|||
|
else:
|
|||
|
self.weights = weights
|
|||
|
|
|||
|
if restriction is not None:
|
|||
|
restriction = np.asarray(restriction)
|
|||
|
|
|||
|
self.restriction = restriction
|
|||
|
|
|||
|
def func(self, params):
|
|||
|
"""evaluate penalty function at params
|
|||
|
|
|||
|
Parameter
|
|||
|
---------
|
|||
|
params : ndarray
|
|||
|
array of parameters at which derivative is evaluated
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
deriv2 : ndarray
|
|||
|
value(s) of penalty function
|
|||
|
"""
|
|||
|
# TODO: `and np.size(params) > 1` is hack for llnull, need better solution
|
|||
|
# Is this still needed? it seems to work without
|
|||
|
if self.restriction is not None:
|
|||
|
params = self.restriction.dot(params)
|
|||
|
|
|||
|
value = self.penalty.func(params)
|
|||
|
|
|||
|
return (self.weights * value.T).T.sum(0)
|
|||
|
|
|||
|
    def deriv(self, params):
        """First derivative of the penalty function w.r.t. params.

        Parameters
        ----------
        params : ndarray
            Array of parameters at which the derivative is evaluated.

        Returns
        -------
        deriv : ndarray
            Array of first partial derivatives.
        """
        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv(params)

        if self.restriction is not None:
            return self.weights * value.T.dot(self.restriction)
        else:
            return (self.weights * value.T)

    grad = deriv

    def deriv2(self, params):
        """Second derivative of the penalty function w.r.t. params.

        Parameters
        ----------
        params : ndarray
            Array of parameters at which the derivative is evaluated.

        Returns
        -------
        deriv2 : ndarray, 2-D
            Second derivative matrix.
        """

        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv2(params)

        if self.restriction is not None:
            # note: univariate penalty returns a 1d array for the diagonal,
            # i.e. hessian_diag
            v = (self.restriction.T * value * self.weights)
            value = v.dot(self.restriction)
        else:
            value = np.diag(self.weights * value)

        return value


class L2ConstraintsPenalty(ConstraintsPenalty):
    """Convenience class for ConstraintsPenalty with L2 penalization.
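
    Examples
    --------
    Illustrative sketch (added, not part of the original docstring); this is
    equivalent to ``ConstraintsPenalty(L2Univariate(), ...)``:

    >>> import numpy as np
    >>> pen = L2ConstraintsPenalty(restriction=np.eye(3))
    >>> float(pen.func(np.array([1., 2., 3.])))  # plain ridge penalty
    14.0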
"""
|
|||
|
|
|||
|
def __init__(self, weights=None, restriction=None, sigma_prior=None):
|
|||
|
|
|||
|
if sigma_prior is not None:
|
|||
|
raise NotImplementedError('sigma_prior is not implemented yet')
|
|||
|
|
|||
|
penalty = L2Univariate()
|
|||
|
|
|||
|
super().__init__(penalty, weights=weights,
|
|||
|
restriction=restriction)
|
|||
|
|
|||
|
|
|||
|


class CovariancePenalty:

    def __init__(self, weight):
        # weight should be a scalar
        self.weight = weight

    def func(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A scalar penalty value
        """
        raise NotImplementedError

    def deriv(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A vector containing the gradient of the penalty
        with respect to each element in the lower triangle
        of `mat`.
        """
        raise NotImplementedError


class PSD(CovariancePenalty):
    """
    A penalty that converges to +infinity as the argument matrix
    approaches the boundary of the domain of symmetric, positive
    definite matrices.
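
    The penalty equals ``-weight * log(det(mat))``, computed from the
    Cholesky factor, so it grows without bound as `mat` approaches
    singularity and is infinite when `mat` is not positive definite.

    Examples
    --------
    Illustrative sketch (added, not part of the original docstring):

    >>> import numpy as np
    >>> pen = PSD(weight=1.)
    >>> mat = np.array([[1., 0.], [0., 1e-8]])   # nearly singular
    >>> float(pen.func(mat, np.linalg.inv(mat))) > 18   # -log(det) = -log(1e-8)
    True
    >>> float(pen.func(np.diag([1., -1.]), np.diag([1., -1.])))  # not PD
    inf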
"""
|
|||
|
|
|||
|
def func(self, mat, mat_inv):
|
|||
|
try:
|
|||
|
cy = np.linalg.cholesky(mat)
|
|||
|
except np.linalg.LinAlgError:
|
|||
|
return np.inf
|
|||
|
return -2 * self.weight * np.sum(np.log(np.diag(cy)))
|
|||
|
|
|||
|
def deriv(self, mat, mat_inv):
|
|||
|
cy = mat_inv.copy()
|
|||
|
cy = 2*cy - np.diag(np.diag(cy))
|
|||
|
i,j = np.tril_indices(mat.shape[0])
|
|||
|
return -self.weight * cy[i,j]
|