399 lines
15 KiB
Python
399 lines
15 KiB
Python
import numbers
|
|
import warnings
|
|
from abc import ABCMeta, abstractmethod
|
|
|
|
import numpy as np
|
|
import scipy.sparse as sp
|
|
import six
|
|
from joblib import Memory, Parallel, delayed
|
|
from scipy.interpolate import interp1d
|
|
from sklearn.preprocessing import normalize as f_normalize
|
|
from sklearn.base import BaseEstimator
|
|
from sklearn.exceptions import ConvergenceWarning
|
|
from sklearn.feature_selection import SelectorMixin
|
|
from sklearn.linear_model import lars_path, LassoLarsIC
|
|
from sklearn.utils import check_X_y, check_random_state, safe_mask, check_array
|
|
|
|
__all__ = ['RandomizedLasso']
|
|
|
|
from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
|
|
|
|
from sklearn.utils.validation import check_is_fitted, as_float_array, FLOAT_DTYPES
|
|
|
|
|
|
def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
|
|
sample_weight=None, return_mean=False):
|
|
"""
|
|
Centers data to have mean zero along axis 0. If fit_intercept=False or if
|
|
the X is a sparse matrix, no centering is done, but normalization can still
|
|
be applied. The function returns the statistics necessary to reconstruct
|
|
the input data, which are X_offset, y_offset, X_scale, such that the output
|
|
|
|
X = (X - X_offset) / X_scale
|
|
|
|
X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
|
|
then the weighted mean of X and y is zero, and not the mean itself. If
|
|
return_mean=True, the mean, eventually weighted, is returned, independently
|
|
of whether X was centered (option used for optimization with sparse data in
|
|
coordinate_descend).
|
|
|
|
This is here because nearly all linear models will want their data to be
|
|
centered. This function also systematically makes y consistent with X.dtype
|
|
"""
|
|
|
|
if isinstance(sample_weight, numbers.Number):
|
|
sample_weight = None
|
|
|
|
X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
|
|
dtype=FLOAT_DTYPES)
|
|
y = np.asarray(y, dtype=X.dtype)
|
|
|
|
if fit_intercept:
|
|
if sp.issparse(X):
|
|
X_offset, X_var = mean_variance_axis(X, axis=0)
|
|
if not return_mean:
|
|
X_offset[:] = X.dtype.type(0)
|
|
|
|
if normalize:
|
|
X_var *= X.shape[0]
|
|
X_scale = np.sqrt(X_var, X_var)
|
|
del X_var
|
|
X_scale[X_scale == 0] = 1
|
|
inplace_column_scale(X, 1. / X_scale)
|
|
else:
|
|
X_scale = np.ones(X.shape[1], dtype=X.dtype)
|
|
|
|
else:
|
|
X_offset = np.average(X, axis=0, weights=sample_weight)
|
|
X -= X_offset
|
|
if normalize:
|
|
X, X_scale = f_normalize(X, axis=0, copy=False,
|
|
return_norm=True)
|
|
else:
|
|
X_scale = np.ones(X.shape[1], dtype=X.dtype)
|
|
y_offset = np.average(y, axis=0, weights=sample_weight)
|
|
y = y - y_offset
|
|
else:
|
|
X_offset = np.zeros(X.shape[1], dtype=X.dtype)
|
|
X_scale = np.ones(X.shape[1], dtype=X.dtype)
|
|
if y.ndim == 1:
|
|
y_offset = X.dtype.type(0)
|
|
else:
|
|
y_offset = np.zeros(y.shape[1], dtype=X.dtype)
|
|
|
|
return X, y, X_offset, y_offset, X_scale
|
|
|
|
|
|
def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
|
|
n_jobs=1, verbose=False, pre_dispatch='3*n_jobs',
|
|
random_state=None, sample_fraction=.75, **params):
|
|
random_state = check_random_state(random_state)
|
|
# We are generating 1 - weights, and not weights
|
|
n_samples, n_features = X.shape
|
|
|
|
if not (0 < scaling < 1):
|
|
raise ValueError(
|
|
"'scaling' should be between 0 and 1. Got %r instead." % scaling)
|
|
|
|
scaling = 1. - scaling
|
|
scores_ = 0.0
|
|
for active_set in Parallel(n_jobs=n_jobs, verbose=verbose,
|
|
pre_dispatch=pre_dispatch)(
|
|
delayed(estimator_func)(
|
|
X, y, weights=scaling * random_state.randint(
|
|
0, 2, size=(n_features,)),
|
|
mask=(random_state.rand(n_samples) < sample_fraction),
|
|
verbose=max(0, verbose - 1),
|
|
**params)
|
|
for _ in range(n_resampling)):
|
|
scores_ += active_set
|
|
|
|
scores_ /= n_resampling
|
|
return scores_
|
|
|
|
|
|
class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
|
|
SelectorMixin)):
|
|
"""Base class to implement randomized linear models for feature selection
|
|
|
|
This implements the strategy by Meinshausen and Buhlman:
|
|
stability selection with randomized sampling, and random re-weighting of
|
|
the penalty.
|
|
"""
|
|
|
|
@abstractmethod
|
|
def __init__(self):
|
|
pass
|
|
|
|
_preprocess_data = staticmethod(_preprocess_data)
|
|
|
|
def fit(self, X, y):
|
|
"""Fit the model using X, y as training data.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like, shape = [n_samples, n_features]
|
|
Training data.
|
|
|
|
y : array-like, shape = [n_samples]
|
|
Target values. Will be cast to X's dtype if necessary
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
Returns an instance of self.
|
|
"""
|
|
X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
|
|
ensure_min_samples=2, estimator=self)
|
|
X = as_float_array(X, copy=False)
|
|
n_samples, n_features = X.shape
|
|
|
|
X, y, X_offset, y_offset, X_scale = \
|
|
self._preprocess_data(X, y, self.fit_intercept, self.normalize)
|
|
|
|
estimator_func, params = self._make_estimator_and_params(X, y)
|
|
memory = self.memory
|
|
if memory is None:
|
|
memory = Memory(cachedir=None, verbose=0)
|
|
elif isinstance(memory, six.string_types):
|
|
memory = Memory(cachedir=memory, verbose=0)
|
|
elif not isinstance(memory, Memory):
|
|
raise ValueError("'memory' should either be a string or"
|
|
" a sklearn.externals.joblib.Memory"
|
|
" instance, got 'memory={!r}' instead.".format(
|
|
type(memory)))
|
|
|
|
scores_ = memory.cache(
|
|
_resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
|
|
)(
|
|
estimator_func, X, y,
|
|
scaling=self.scaling, n_resampling=self.n_resampling,
|
|
n_jobs=self.n_jobs, verbose=self.verbose,
|
|
pre_dispatch=self.pre_dispatch, random_state=self.random_state,
|
|
sample_fraction=self.sample_fraction, **params)
|
|
|
|
if scores_.ndim == 1:
|
|
scores_ = scores_[:, np.newaxis]
|
|
self.all_scores_ = scores_
|
|
self.scores_ = np.max(self.all_scores_, axis=1)
|
|
return self
|
|
|
|
def _make_estimator_and_params(self, X, y):
|
|
"""Return the parameters passed to the estimator"""
|
|
raise NotImplementedError
|
|
|
|
def _get_support_mask(self):
|
|
"""Get the boolean mask indicating which features are selected.
|
|
|
|
Returns
|
|
-------
|
|
support : boolean array of shape [# input features]
|
|
An element is True iff its corresponding feature is selected
|
|
for retention.
|
|
"""
|
|
check_is_fitted(self, 'scores_')
|
|
return self.scores_ > self.selection_threshold
|
|
|
|
|
|
###############################################################################
|
|
# Randomized lasso: regression settings
|
|
|
|
def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
|
|
precompute=False, eps=np.finfo(np.float).eps,
|
|
max_iter=500):
|
|
X = X[safe_mask(X, mask)]
|
|
y = y[mask]
|
|
|
|
# Center X and y to avoid fit the intercept
|
|
X -= X.mean(axis=0)
|
|
y -= y.mean()
|
|
|
|
alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))
|
|
|
|
X = (1 - weights) * X
|
|
|
|
with warnings.catch_warnings():
|
|
warnings.simplefilter('ignore', ConvergenceWarning)
|
|
alphas_, _, coef_ = lars_path(X, y,
|
|
Gram=precompute, copy_X=False,
|
|
copy_Gram=False, alpha_min=np.min(alpha),
|
|
method='lasso', verbose=verbose,
|
|
max_iter=max_iter, eps=eps)
|
|
|
|
if len(alpha) > 1:
|
|
if len(alphas_) > 1: # np.min(alpha) < alpha_min
|
|
interpolator = interp1d(alphas_[::-1], coef_[:, ::-1],
|
|
bounds_error=False, fill_value=0.)
|
|
scores = (interpolator(alpha) != 0.0)
|
|
else:
|
|
scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool)
|
|
else:
|
|
scores = coef_[:, -1] != 0.0
|
|
return scores
|
|
|
|
|
|
class RandomizedLasso(BaseRandomizedLinearModel):
|
|
"""Randomized Lasso.
|
|
|
|
Randomized Lasso works by subsampling the training data and
|
|
computing a Lasso estimate where the penalty of a random subset of
|
|
coefficients has been scaled. By performing this double
|
|
randomization several times, the method assigns high scores to
|
|
features that are repeatedly selected across randomizations. This
|
|
is known as stability selection. In short, features selected more
|
|
often are considered good features.
|
|
|
|
Parameters
|
|
----------
|
|
alpha : float, 'aic', or 'bic', optional
|
|
The regularization parameter alpha parameter in the Lasso.
|
|
Warning: this is not the alpha parameter in the stability selection
|
|
article which is scaling.
|
|
|
|
scaling : float, optional
|
|
The s parameter used to randomly scale the penalty of different
|
|
features.
|
|
Should be between 0 and 1.
|
|
|
|
sample_fraction : float, optional
|
|
The fraction of samples to be used in each randomized design.
|
|
Should be between 0 and 1. If 1, all samples are used.
|
|
|
|
n_resampling : int, optional
|
|
Number of randomized models.
|
|
|
|
selection_threshold : float, optional
|
|
The score above which features should be selected.
|
|
|
|
fit_intercept : boolean, optional
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(e.g. data is expected to be already centered).
|
|
|
|
verbose : boolean or integer, optional
|
|
Sets the verbosity amount
|
|
|
|
normalize : boolean, optional, default True
|
|
If True, the regressors X will be normalized before regression.
|
|
This parameter is ignored when `fit_intercept` is set to False.
|
|
When the regressors are normalized, note that this makes the
|
|
hyperparameters learned more robust and almost independent of
|
|
the number of samples. The same property is not valid for
|
|
standardized data. However, if you wish to standardize, please
|
|
use `preprocessing.StandardScaler` before calling `fit` on an
|
|
estimator with `normalize=False`.
|
|
|
|
precompute : True | False | 'auto' | array-like
|
|
Whether to use a precomputed Gram matrix to speed up calculations.
|
|
If set to 'auto' let us decide.
|
|
The Gram matrix can also be passed as argument, but it will be used
|
|
only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.
|
|
|
|
max_iter : integer, optional
|
|
Maximum number of iterations to perform in the Lars algorithm.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. Unlike the 'tol' parameter in some iterative
|
|
optimization-based algorithms, this parameter does not control
|
|
the tolerance of the optimization.
|
|
|
|
random_state : int, RandomState instance or None, optional (default=None)
|
|
If int, random_state is the seed used by the random number generator;
|
|
If RandomState instance, random_state is the random number generator;
|
|
If None, the random number generator is the RandomState instance used
|
|
by `np.random`.
|
|
|
|
n_jobs : integer, optional
|
|
Number of CPUs to use during the resampling. If '-1', use
|
|
all the CPUs
|
|
|
|
pre_dispatch : int, or string, optional
|
|
Controls the number of jobs that get dispatched during parallel
|
|
execution. Reducing this number can be useful to avoid an
|
|
explosion of memory consumption when more jobs get dispatched
|
|
than CPUs can process. This parameter can be:
|
|
|
|
- None, in which case all the jobs are immediately
|
|
created and spawned. Use this for lightweight and
|
|
fast-running jobs, to avoid delays due to on-demand
|
|
spawning of the jobs
|
|
|
|
- An int, giving the exact number of total jobs that are
|
|
spawned
|
|
|
|
- A string, giving an expression as a function of n_jobs,
|
|
as in '2*n_jobs'
|
|
|
|
memory : None, str or object with the joblib.Memory interface, optional \
|
|
(default=None)
|
|
Used for internal caching. By default, no caching is done.
|
|
If a string is given, it is the path to the caching directory.
|
|
|
|
Attributes
|
|
----------
|
|
scores_ : array, shape = [n_features]
|
|
Feature scores between 0 and 1.
|
|
|
|
all_scores_ : array, shape = [n_features, n_reg_parameter]
|
|
Feature scores between 0 and 1 for all values of the regularization \
|
|
parameter. The reference article suggests ``scores_`` is the max of \
|
|
``all_scores_``.
|
|
|
|
|
|
References
|
|
----------
|
|
Stability selection
|
|
Nicolai Meinshausen, Peter Buhlmann
|
|
Journal of the Royal Statistical Society: Series B
|
|
Volume 72, Issue 4, pages 417-473, September 2010
|
|
DOI: 10.1111/j.1467-9868.2010.00740.x
|
|
|
|
See also
|
|
--------
|
|
RandomizedLogisticRegression, Lasso, ElasticNet
|
|
"""
|
|
def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
|
|
n_resampling=200, selection_threshold=.25,
|
|
fit_intercept=True, verbose=False,
|
|
normalize=True, precompute='auto',
|
|
max_iter=500,
|
|
eps=np.finfo(np.float).eps, random_state=None,
|
|
n_jobs=1, pre_dispatch='3*n_jobs',
|
|
memory=None):
|
|
self.alpha = alpha
|
|
self.scaling = scaling
|
|
self.sample_fraction = sample_fraction
|
|
self.n_resampling = n_resampling
|
|
self.fit_intercept = fit_intercept
|
|
self.max_iter = max_iter
|
|
self.verbose = verbose
|
|
self.normalize = normalize
|
|
self.precompute = precompute
|
|
self.eps = eps
|
|
self.random_state = random_state
|
|
self.n_jobs = n_jobs
|
|
self.selection_threshold = selection_threshold
|
|
self.pre_dispatch = pre_dispatch
|
|
self.memory = memory
|
|
|
|
def _make_estimator_and_params(self, X, y):
|
|
alpha = self.alpha
|
|
if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'):
|
|
model = LassoLarsIC(precompute=self.precompute,
|
|
criterion=self.alpha,
|
|
max_iter=self.max_iter,
|
|
eps=self.eps)
|
|
model.fit(X, y)
|
|
self.alpha_ = alpha = model.alpha_
|
|
|
|
precompute = self.precompute
|
|
# A precomputed Gram array is useless, since _randomized_lasso
|
|
# change X a each iteration
|
|
if hasattr(precompute, '__array__'):
|
|
precompute = 'auto'
|
|
assert precompute in (True, False, None, 'auto')
|
|
return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
|
|
eps=self.eps,
|
|
precompute=precompute) |