"""
A collection of smooth penalty functions.
Penalties on vectors take a vector argument and return a scalar
penalty. The gradient of the penalty is a vector with the same shape
as the input value.
Penalties on covariance matrices take two arguments: the matrix and
its inverse, both in unpacked (square) form. The returned penalty is
a scalar, and the gradient is returned as a vector that contains the
gradient with respect to the free elements in the lower triangle of
the covariance matrix.
All penalties are subtracted from the log-likelihood, so greater
penalty values correspond to a greater degree of penalization.
The penalties should be smooth so that they can be subtracted from
log-likelihood functions and optimized using standard methods (i.e. L1
penalties do not belong here).
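
A minimal usage sketch of this pattern (``loglike`` is a hypothetical
log-likelihood function; any concrete subclass such as ``L2`` below can
serve as the penalty)::

    pen = L2(weights=1.)

    def penalized_loglike(params):
        # penalties are subtracted from the log-likelihood
        return loglike(params) - pen.func(params)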
"""
import numpy as np
class Penalty:
"""
A class for representing a scalar-valued penalty.
Parameters
----------
weights : array_like
A vector of weights that determines the weight of the penalty
for each parameter.
Notes
-----
The class has a member called `alpha` that scales the weights.
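
Examples
--------
An illustrative sketch with a concrete subclass; the effective weight of
each parameter is ``alpha * weights``:

>>> import numpy as np
>>> pen = L2(weights=np.array([0., 1.]))   # first parameter is unpenalized
>>> pen.alpha = 0.5
>>> value = pen.func(np.array([3., 2.]))   # 0.5 * (0 * 9 + 1 * 4) = 2.0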
"""
def __init__(self, weights=1.):
self.weights = weights
self.alpha = 1.
def func(self, params):
"""
A penalty function on a vector of parameters.
Parameters
----------
params : array_like
A vector of parameters.
Returns
-------
A scalar penalty value; greater values imply greater
penalization.
"""
raise NotImplementedError
def deriv(self, params):
"""
The gradient of a penalty function.
Parameters
----------
params : array_like
A vector of parameters
Returns
-------
The gradient of the penalty with respect to each element in
`params`.
"""
raise NotImplementedError
def _null_weights(self, params):
"""work around for Null model
This will not be needed anymore when we can use `self._null_drop_keys`
as in DiscreteModels.
TODO: check other models
"""
if np.size(self.weights) > 1:
if len(params) == 1:
raise ValueError("weights do not match a single parameter")  # raise to identify models where this would be needed
return 0.
return self.weights
class NonePenalty(Penalty):
"""
A penalty that does not penalize.
"""
def __init__(self, **kwds):
super().__init__()
if kwds:
import warnings
warnings.warn('keyword arguments are ignored')
def func(self, params):
if params.ndim == 2:
return np.zeros(params.shape[1:])
else:
return 0
def deriv(self, params):
return np.zeros(params.shape)
def deriv2(self, params):
# returns diagonal of hessian
return np.zeros(params.shape[0])
class L2(Penalty):
"""
The L2 (ridge) penalty.
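
Examples
--------
An illustrative sketch; with the default ``weights=1.`` and ``alpha=1.``
the penalty is simply the sum of squared parameters:

>>> import numpy as np
>>> pen = L2()
>>> value = pen.func(np.array([1., 2.]))   # 1 + 4 = 5
>>> grad = pen.deriv(np.array([1., 2.]))   # array([2., 4.])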
"""
def __init__(self, weights=1.):
super().__init__(weights)
def func(self, params):
return np.sum(self.weights * self.alpha * params**2)
def deriv(self, params):
return 2 * self.weights * self.alpha * params
def deriv2(self, params):
return 2 * self.weights * self.alpha * np.ones(len(params))
class L2Univariate(Penalty):
"""
The L2 (ridge) penalty applied to each parameter.
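
Unlike ``L2``, ``func`` returns one penalty value per parameter instead of
their sum, which is the vectorized form ``ConstraintsPenalty`` below expects.

Examples
--------
An illustrative sketch:

>>> import numpy as np
>>> pen = L2Univariate()
>>> values = pen.func(np.array([1., 2.]))   # array([1., 4.])
>>> grads = pen.deriv(np.array([1., 2.]))   # array([2., 4.])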
"""
def __init__(self, weights=None):
if weights is None:
self.weights = 1.
else:
self.weights = weights
def func(self, params):
return self.weights * params**2
def deriv(self, params):
return 2 * self.weights * params
def deriv2(self, params):
return 2 * self.weights * np.ones(len(params))
class PseudoHuber(Penalty):
"""
The pseudo-Huber penalty.
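
The penalty is approximately quadratic near zero and asymptotically linear,
with the transition scale set by `dlt`:

    f(x) = dlt**2 * (sqrt(1 + (x / dlt)**2) - 1)

Examples
--------
An illustrative sketch:

>>> import numpy as np
>>> pen = PseudoHuber(dlt=1.)
>>> value = pen.func(np.array([0., 3.]))
>>> grad = pen.deriv(np.array([0., 3.]))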
"""
def __init__(self, dlt, weights=1.):
super().__init__(weights)
self.dlt = dlt
def func(self, params):
v = np.sqrt(1 + (params / self.dlt)**2)
v -= 1
v *= self.dlt**2
return np.sum(self.weights * self.alpha * v, 0)
def deriv(self, params):
v = np.sqrt(1 + (params / self.dlt)**2)
return params * self.weights * self.alpha / v
def deriv2(self, params):
v = np.power(1 + (params / self.dlt)**2, -3/2)
return self.weights * self.alpha * v
class SCAD(Penalty):
"""
The SCAD penalty of Fan and Li.
The SCAD penalty is linear around zero, like an L1 penalty, up to the threshold tau.
The SCAD penalty is constant for values larger than c*tau.
The middle segment is quadratic and connects the two segments with a
continuous derivative.
The penalty is symmetric around zero.
Parameterization follows Buu, Johnson, Li and Tan (2011).
Fan and Li use lambda instead of tau, and a instead of c. Fan and Li
recommend setting c=3.7.
f(x) = { tau |x| if 0 <= |x| < tau
{ -(|x|^2 - 2 c tau |x| + tau^2) / (2 (c - 1)) if tau <= |x| < c tau
{ (c + 1) tau^2 / 2 if c tau <= |x|
Parameters
----------
tau : float
slope and threshold for linear segment
c : float
factor for second threshold which is c * tau
weights : None or array
weights for penalty of each parameter. If an entry is zero, then the
corresponding parameter will not be penalized.
References
----------
Buu, Anne, Norman J. Johnson, Runze Li, and Xianming Tan. "New variable
selection methods for zero-inflated count data with applications to the
substance abuse field."
Statistics in Medicine 30, no. 18 (2011): 2326-2340.
Fan, Jianqing, and Runze Li. "Variable selection via nonconcave penalized
likelihood and its oracle properties."
Journal of the American Statistical Association 96, no. 456 (2001):
1348-1360.
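
Examples
--------
An illustrative sketch using the recommended c=3.7; the three parameter
values fall in the linear, quadratic and constant segments respectively:

>>> import numpy as np
>>> pen = SCAD(tau=0.5, c=3.7)
>>> value = pen.func(np.array([0.1, 1.0, 3.0]))
>>> grad = pen.deriv(np.array([0.1, 1.0, 3.0]))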
"""
def __init__(self, tau, c=3.7, weights=1.):
super().__init__(weights)
self.tau = tau
self.c = c
def func(self, params):
# 3 segments in absolute value
tau = self.tau
p_abs = np.atleast_1d(np.abs(params))
res = np.empty(p_abs.shape, p_abs.dtype)
res.fill(np.nan)
mask1 = p_abs < tau
mask3 = p_abs >= self.c * tau
res[mask1] = tau * p_abs[mask1]
mask2 = ~mask1 & ~mask3
p_abs2 = p_abs[mask2]
tmp = (p_abs2**2 - 2 * self.c * tau * p_abs2 + tau**2)
res[mask2] = -tmp / (2 * (self.c - 1))
res[mask3] = (self.c + 1) * tau**2 / 2.
return (self.weights * res).sum(0)
def deriv(self, params):
# 3 segments in absolute value
tau = self.tau
p = np.atleast_1d(params)
p_abs = np.abs(p)
p_sign = np.sign(p)
res = np.empty(p_abs.shape)
res.fill(np.nan)
mask1 = p_abs < tau
mask3 = p_abs >= self.c * tau
mask2 = ~mask1 & ~mask3
res[mask1] = p_sign[mask1] * tau
tmp = p_sign[mask2] * (p_abs[mask2] - self.c * tau)
res[mask2] = -tmp / (self.c - 1)
res[mask3] = 0
return self.weights * res
def deriv2(self, params):
"""Second derivative of function
This returns scalar or vector in same shape as params, not a square
Hessian. If the return is 1 dimensional, then it is the diagonal of
the Hessian.
"""
# 3 segments in absolute value
tau = self.tau
p = np.atleast_1d(params)
p_abs = np.abs(p)
res = np.zeros(p_abs.shape)
mask1 = p_abs < tau
mask3 = p_abs >= self.c * tau
mask2 = ~mask1 & ~mask3
res[mask2] = -1 / (self.c - 1)
return self.weights * res
class SCADSmoothed(SCAD):
"""
The SCAD penalty of Fan and Li, quadratically smoothed around zero.
This follows Fan and Li 2001 equation (3.7).
Parameterization follows Buu, Johnson, Li and Tan (2011);
see the docstring of SCAD.
Parameters
----------
tau : float
slope and threshold for linear segment
c : float
factor for second threshold
c0 : float
threshold for quadratically smoothed segment
restriction : None or array
If not None, a matrix defining linear combinations (contrasts) of the
parameters to which the penalty is applied.
Notes
-----
TODO: Use delegation instead of subclassing, so smoothing can be added to
all penalty classes.
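
Examples
--------
An illustrative sketch; below the smoothing threshold `c0` the penalty
follows the quadratic approximation, which is constructed so that the
penalty is exactly zero at zero:

>>> import numpy as np
>>> pen = SCADSmoothed(tau=0.5, c=3.7, c0=0.05)
>>> value = pen.func(np.array([0.01, 1.0]))
>>> grad = pen.deriv(np.array([0.01, 1.0]))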
"""
def __init__(self, tau, c=3.7, c0=None, weights=1., restriction=None):
super().__init__(tau, c=c, weights=weights)
self.tau = tau
self.c = c
self.c0 = c0 if c0 is not None else tau * 0.1
if self.c0 > tau:
raise ValueError('c0 cannot be larger than tau')
# get coefficients for quadratic approximation
c0 = self.c0
# need to temporarily override weights for call to super
weights = self.weights
self.weights = 1.
deriv_c0 = super().deriv(c0)
value_c0 = super().func(c0)
self.weights = weights
self.aq1 = value_c0 - 0.5 * deriv_c0 * c0
self.aq2 = 0.5 * deriv_c0 / c0
self.restriction = restriction
def func(self, params):
# workaround for Null model
weights = self._null_weights(params)
# TODO: `and np.size(params) > 1` is hack for llnull, need better solution
if self.restriction is not None and np.size(params) > 1:
params = self.restriction.dot(params)
# need to temporarily override weights for call to super
# Note: we have the same problem with `restriction`
self_weights = self.weights
self.weights = 1.
value = super().func(params[None, ...])
self.weights = self_weights
# shift down so func(0) == 0
value -= self.aq1
# change the segment corresponding to the quadratic approximation
p_abs = np.atleast_1d(np.abs(params))
mask = p_abs < self.c0
p_abs_masked = p_abs[mask]
value[mask] = self.aq2 * p_abs_masked**2
return (weights * value).sum(0)
def deriv(self, params):
# workaround for Null model
weights = self._null_weights(params)
if self.restriction is not None and np.size(params) > 1:
params = self.restriction.dot(params)
# need to temporarily override weights for call to super
self_weights = self.weights
self.weights = 1.
value = super().deriv(params)
self.weights = self_weights
# change the segment corresponding to the quadratic approximation
p = np.atleast_1d(params)
mask = np.abs(p) < self.c0
value[mask] = 2 * self.aq2 * p[mask]
if self.restriction is not None and np.size(params) > 1:
return weights * value.dot(self.restriction)
else:
return weights * value
def deriv2(self, params):
# workaround for Null model
weights = self._null_weights(params)
if self.restriction is not None and np.size(params) > 1:
params = self.restriction.dot(params)
# need to temporarily override weights for call to super
self_weights = self.weights
self.weights = 1.
value = super().deriv2(params)
self.weights = self_weights
# change the segment corresponding to the quadratic approximation
p = np.atleast_1d(params)
mask = np.abs(p) < self.c0
value[mask] = 2 * self.aq2
if self.restriction is not None and np.size(params) > 1:
# note: super returns 1d array for diag, i.e. hessian_diag
# TODO: weights are missing
return (self.restriction.T * (weights * value)
).dot(self.restriction)
else:
return weights * value
class ConstraintsPenalty:
"""
Penalty applied to linear transformation of parameters
Parameters
----------
penalty: instance of penalty function
currently this requires an instance of a univariate, vectorized
penalty class
weights : None or ndarray
weights for adding penalties of transformed params
restriction : None or ndarray
If it is not None, then restriction defines a linear transformation
of the parameters. The penalty function is applied to each transformed
parameter independently.
Notes
-----
`restriction` allows us to impose penalization on contrasts or stochastic
constraints of the original parameters.
Examples of such contrasts are difference penalties or all-pairs
penalties.
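
Examples
--------
An illustrative sketch of a difference (fused-type) penalty on three
parameters, penalizing ``params[1] - params[0]`` and ``params[2] - params[1]``:

>>> import numpy as np
>>> restriction = np.array([[-1., 1., 0.], [0., -1., 1.]])
>>> pen = ConstraintsPenalty(L2Univariate(), restriction=restriction)
>>> value = pen.func(np.array([1., 1., 3.]))   # only the second difference is nonzero
>>> grad = pen.deriv(np.array([1., 1., 3.]))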
"""
def __init__(self, penalty, weights=None, restriction=None):
self.penalty = penalty
if weights is None:
self.weights = 1.
else:
self.weights = weights
if restriction is not None:
restriction = np.asarray(restriction)
self.restriction = restriction
def func(self, params):
"""evaluate penalty function at params
Parameters
----------
params : ndarray
array of parameters at which the penalty is evaluated
Returns
-------
value : ndarray
value(s) of the penalty function
"""
# TODO: `and np.size(params) > 1` is hack for llnull, need better solution
# Is this still needed? it seems to work without
if self.restriction is not None:
params = self.restriction.dot(params)
value = self.penalty.func(params)
return (self.weights * value.T).T.sum(0)
def deriv(self, params):
"""first derivative of penalty function w.r.t. params
Parameters
----------
params : ndarray
array of parameters at which the derivative is evaluated
Returns
-------
deriv : ndarray
array of first partial derivatives
"""
if self.restriction is not None:
params = self.restriction.dot(params)
value = self.penalty.deriv(params)
if self.restriction is not None:
return self.weights * value.T.dot(self.restriction)
else:
return (self.weights * value.T)
grad = deriv
def deriv2(self, params):
"""second derivative of penalty function w.r.t. params
Parameters
----------
params : ndarray
array of parameters at which derivative is evaluated
Returns
-------
deriv2 : ndarray, 2-D
second derivative matrix
"""
if self.restriction is not None:
params = self.restriction.dot(params)
value = self.penalty.deriv2(params)
if self.restriction is not None:
# note: univariate penalty returns 1d array for diag,
# i.e. hessian_diag
v = (self.restriction.T * value * self.weights)
value = v.dot(self.restriction)
else:
value = np.diag(self.weights * value)
return value
class L2ConstraintsPenalty(ConstraintsPenalty):
"""convenience class of ConstraintsPenalty with L2 penalization
"""
def __init__(self, weights=None, restriction=None, sigma_prior=None):
if sigma_prior is not None:
raise NotImplementedError('sigma_prior is not implemented yet')
penalty = L2Univariate()
super().__init__(penalty, weights=weights,
restriction=restriction)
class CovariancePenalty:
def __init__(self, weight):
# weight should be scalar
self.weight = weight
def func(self, mat, mat_inv):
"""
Parameters
----------
mat : square matrix
The matrix to be penalized.
mat_inv : square matrix
The inverse of `mat`.
Returns
-------
A scalar penalty value
"""
raise NotImplementedError
def deriv(self, mat, mat_inv):
"""
Parameters
----------
mat : square matrix
The matrix to be penalized.
mat_inv : square matrix
The inverse of `mat`.
Returns
-------
A vector containing the gradient of the penalty
with respect to each element in the lower triangle
of `mat`.
"""
raise NotImplementedError
class PSD(CovariancePenalty):
"""
A penalty that converges to +infinity as the argument matrix
approaches the boundary of the domain of symmetric, positive
definite matrices.
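
Examples
--------
An illustrative sketch; the penalty is finite for a positive definite
matrix and ``inf`` once the Cholesky factorization fails:

>>> import numpy as np
>>> pen = PSD(weight=1.)
>>> mat = np.array([[2., 0.], [0., 2.]])
>>> value = pen.func(mat, np.linalg.inv(mat))
>>> grad = pen.deriv(mat, np.linalg.inv(mat))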
"""
def func(self, mat, mat_inv):
try:
cy = np.linalg.cholesky(mat)
except np.linalg.LinAlgError:
return np.inf
return -2 * self.weight * np.sum(np.log(np.diag(cy)))
def deriv(self, mat, mat_inv):
cy = mat_inv.copy()
cy = 2 * cy - np.diag(np.diag(cy))
i, j = np.tril_indices(mat.shape[0])
return -self.weight * cy[i, j]