lab 2 is ready
Commit ae454ae9ef (parent 5d8a090a38)
lab2.py
@@ -1,17 +1,14 @@
-from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import LinearRegression, RandomizedLasso
 from sklearn.feature_selection import RFE
 from sklearn.preprocessing import MinMaxScaler
-from sklearn.svm import SVR
 from matplotlib import pyplot as plt
 import numpy as np
-import random
+import random as rand
 
-from alexandrov_dmitrii_lab_2.rand_lasso import RandomizedLasso
 
 figure = plt.figure(1, figsize=(16, 9))
 axis = figure.subplots(1, 4)
 col = 0
-y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
 
 
 def rank_to_dict(ranks, names, n_features):
@@ -35,26 +32,29 @@ def createView(key, val):
 
 
 def start():
-    np.random.seed(random.randrange(50))
+    np.random.seed(rand.randint(0, 50))
     size = 750
-    n_features = 10
+    n_features = 14
     X = np.random.uniform(0, 1, (size, n_features))
 
     Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
          10 * X[:, 3] + 5 * X[:, 4] ** 5 + np.random.normal(0, 1))
+    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
 
     lr = LinearRegression()
     rl = RandomizedLasso()
-    rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=n_features)
+    rfe = RFE(estimator=LinearRegression(), n_features_to_select=1)
     lr.fit(X, Y)
     rl.fit(X, Y)
     rfe.fit(X, Y)
 
     names = ["x%s" % i for i in range(1, n_features + 1)]
+    rfe_res = rfe.ranking_
+    for i in range(rfe_res.size):
+        rfe_res[i] = 14 - rfe_res[i]
     ranks = {"Linear regression": rank_to_dict(lr.coef_, names, n_features),
              "Random lasso": rank_to_dict(rl.scores_, names, n_features),
-             "RFE": rank_to_dict(rfe.estimator_.coef_, names, n_features)}
+             "RFE": rank_to_dict(rfe_res, names, n_features)}
 
     mean = {}
 
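The body of `rank_to_dict` lies outside the hunks shown here. A minimal sketch of what such a helper typically does with the imported `MinMaxScaler` — an assumption, not necessarily the lab's actual code — matching the 0-to-1 scores in the console output quoted further below:

```python
import numpy as np
from sklearn.preprocessing import MinMaxScaler


def rank_to_dict(ranks, names, n_features):
    # Rescale absolute scores to [0, 1] so rankings from different models are comparable.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(np.abs(np.asarray(ranks, dtype=float)).reshape(n_features, 1))
    return dict(zip(names, np.round(scaled.ravel(), 2)))
```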
rand_lasso.py (file deleted in this commit)
@@ -1,399 +0,0 @@
import numbers
import warnings
from abc import ABCMeta, abstractmethod

import numpy as np
import scipy.sparse as sp
import six
from joblib import Memory, Parallel, delayed
from scipy.interpolate import interp1d
from sklearn.preprocessing import normalize as f_normalize
from sklearn.base import BaseEstimator
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SelectorMixin
from sklearn.linear_model import lars_path, LassoLarsIC
from sklearn.utils import check_X_y, check_random_state, safe_mask, check_array

__all__ = ['RandomizedLasso']

from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis

from sklearn.utils.validation import check_is_fitted, as_float_array, FLOAT_DTYPES

def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False):
    """
    Centers data to have mean zero along axis 0. If fit_intercept=False or if
    the X is a sparse matrix, no centering is done, but normalization can still
    be applied. The function returns the statistics necessary to reconstruct
    the input data, which are X_offset, y_offset, X_scale, such that the output

        X = (X - X_offset) / X_scale

    X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
    then the weighted mean of X and y is zero, and not the mean itself. If
    return_mean=True, the mean, eventually weighted, is returned, independently
    of whether X was centered (option used for optimization with sparse data in
    coordinate_descend).

    This is here because nearly all linear models will want their data to be
    centered. This function also systematically makes y consistent with X.dtype
    """

    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)
    y = np.asarray(y, dtype=X.dtype)

    if fit_intercept:
        if sp.issparse(X):
            X_offset, X_var = mean_variance_axis(X, axis=0)
            if not return_mean:
                X_offset[:] = X.dtype.type(0)

            if normalize:
                X_var *= X.shape[0]
                X_scale = np.sqrt(X_var, X_var)
                del X_var
                X_scale[X_scale == 0] = 1
                inplace_column_scale(X, 1. / X_scale)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)

        else:
            X_offset = np.average(X, axis=0, weights=sample_weight)
            X -= X_offset
            if normalize:
                X, X_scale = f_normalize(X, axis=0, copy=False,
                                         return_norm=True)
            else:
                X_scale = np.ones(X.shape[1], dtype=X.dtype)
        y_offset = np.average(y, axis=0, weights=sample_weight)
        y = y - y_offset
    else:
        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
        X_scale = np.ones(X.shape[1], dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)

    return X, y, X_offset, y_offset, X_scale


def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
                    n_jobs=1, verbose=False, pre_dispatch='3*n_jobs',
                    random_state=None, sample_fraction=.75, **params):
    random_state = check_random_state(random_state)
    # We are generating 1 - weights, and not weights
    n_samples, n_features = X.shape

    if not (0 < scaling < 1):
        raise ValueError(
            "'scaling' should be between 0 and 1. Got %r instead." % scaling)

    scaling = 1. - scaling
    scores_ = 0.0
    for active_set in Parallel(n_jobs=n_jobs, verbose=verbose,
                               pre_dispatch=pre_dispatch)(
            delayed(estimator_func)(
                X, y, weights=scaling * random_state.randint(
                    0, 2, size=(n_features,)),
                mask=(random_state.rand(n_samples) < sample_fraction),
                verbose=max(0, verbose - 1),
                **params)
            for _ in range(n_resampling)):
        scores_ += active_set

    scores_ /= n_resampling
    return scores_


class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
                                                   SelectorMixin)):
    """Base class to implement randomized linear models for feature selection

    This implements the strategy by Meinshausen and Buhlman:
    stability selection with randomized sampling, and random re-weighting of
    the penalty.
    """

    @abstractmethod
    def __init__(self):
        pass

    _preprocess_data = staticmethod(_preprocess_data)

    def fit(self, X, y):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data.

        y : array-like, shape = [n_samples]
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
                         ensure_min_samples=2, estimator=self)
        X = as_float_array(X, copy=False)
        n_samples, n_features = X.shape

        X, y, X_offset, y_offset, X_scale = \
            self._preprocess_data(X, y, self.fit_intercept, self.normalize)

        estimator_func, params = self._make_estimator_and_params(X, y)
        memory = self.memory
        if memory is None:
            memory = Memory(cachedir=None, verbose=0)
        elif isinstance(memory, six.string_types):
            memory = Memory(cachedir=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a sklearn.externals.joblib.Memory"
                             " instance, got 'memory={!r}' instead.".format(
                                 type(memory)))

        scores_ = memory.cache(
            _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
        )(
            estimator_func, X, y,
            scaling=self.scaling, n_resampling=self.n_resampling,
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=self.pre_dispatch, random_state=self.random_state,
            sample_fraction=self.sample_fraction, **params)

        if scores_.ndim == 1:
            scores_ = scores_[:, np.newaxis]
        self.all_scores_ = scores_
        self.scores_ = np.max(self.all_scores_, axis=1)
        return self

    def _make_estimator_and_params(self, X, y):
        """Return the parameters passed to the estimator"""
        raise NotImplementedError

    def _get_support_mask(self):
        """Get the boolean mask indicating which features are selected.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected
            for retention.
        """
        check_is_fitted(self, 'scores_')
        return self.scores_ > self.selection_threshold


###############################################################################
# Randomized lasso: regression settings

def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
                      precompute=False, eps=np.finfo(np.float).eps,
                      max_iter=500):
    X = X[safe_mask(X, mask)]
    y = y[mask]

    # Center X and y to avoid fit the intercept
    X -= X.mean(axis=0)
    y -= y.mean()

    alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))

    X = (1 - weights) * X

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        alphas_, _, coef_ = lars_path(X, y,
                                      Gram=precompute, copy_X=False,
                                      copy_Gram=False, alpha_min=np.min(alpha),
                                      method='lasso', verbose=verbose,
                                      max_iter=max_iter, eps=eps)

    if len(alpha) > 1:
        if len(alphas_) > 1:  # np.min(alpha) < alpha_min
            interpolator = interp1d(alphas_[::-1], coef_[:, ::-1],
                                    bounds_error=False, fill_value=0.)
            scores = (interpolator(alpha) != 0.0)
        else:
            scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool)
    else:
        scores = coef_[:, -1] != 0.0
    return scores


class RandomizedLasso(BaseRandomizedLinearModel):
    """Randomized Lasso.

    Randomized Lasso works by subsampling the training data and
    computing a Lasso estimate where the penalty of a random subset of
    coefficients has been scaled. By performing this double
    randomization several times, the method assigns high scores to
    features that are repeatedly selected across randomizations. This
    is known as stability selection. In short, features selected more
    often are considered good features.

    Parameters
    ----------
    alpha : float, 'aic', or 'bic', optional
        The regularization parameter alpha parameter in the Lasso.
        Warning: this is not the alpha parameter in the stability selection
        article which is scaling.

    scaling : float, optional
        The s parameter used to randomly scale the penalty of different
        features.
        Should be between 0 and 1.

    sample_fraction : float, optional
        The fraction of samples to be used in each randomized design.
        Should be between 0 and 1. If 1, all samples are used.

    n_resampling : int, optional
        Number of randomized models.

    selection_threshold : float, optional
        The score above which features should be selected.

    fit_intercept : boolean, optional
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    verbose : boolean or integer, optional
        Sets the verbosity amount

    normalize : boolean, optional, default True
        If True, the regressors X will be normalized before regression.
        This parameter is ignored when `fit_intercept` is set to False.
        When the regressors are normalized, note that this makes the
        hyperparameters learned more robust and almost independent of
        the number of samples. The same property is not valid for
        standardized data. However, if you wish to standardize, please
        use `preprocessing.StandardScaler` before calling `fit` on an
        estimator with `normalize=False`.

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up calculations.
        If set to 'auto' let us decide.
        The Gram matrix can also be passed as argument, but it will be used
        only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.

    max_iter : integer, optional
        Maximum number of iterations to perform in the Lars algorithm.

    eps : float, optional
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the 'tol' parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_jobs : integer, optional
        Number of CPUs to use during the resampling. If '-1', use
        all the CPUs

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    memory : None, str or object with the joblib.Memory interface, optional \
            (default=None)
        Used for internal caching. By default, no caching is done.
        If a string is given, it is the path to the caching directory.

    Attributes
    ----------
    scores_ : array, shape = [n_features]
        Feature scores between 0 and 1.

    all_scores_ : array, shape = [n_features, n_reg_parameter]
        Feature scores between 0 and 1 for all values of the regularization \
        parameter. The reference article suggests ``scores_`` is the max of \
        ``all_scores_``.

    References
    ----------
    Stability selection
    Nicolai Meinshausen, Peter Buhlmann
    Journal of the Royal Statistical Society: Series B
    Volume 72, Issue 4, pages 417-473, September 2010
    DOI: 10.1111/j.1467-9868.2010.00740.x

    See also
    --------
    RandomizedLogisticRegression, Lasso, ElasticNet
    """

    def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
                 n_resampling=200, selection_threshold=.25,
                 fit_intercept=True, verbose=False,
                 normalize=True, precompute='auto',
                 max_iter=500,
                 eps=np.finfo(np.float).eps, random_state=None,
                 n_jobs=1, pre_dispatch='3*n_jobs',
                 memory=None):
        self.alpha = alpha
        self.scaling = scaling
        self.sample_fraction = sample_fraction
        self.n_resampling = n_resampling
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.verbose = verbose
        self.normalize = normalize
        self.precompute = precompute
        self.eps = eps
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.selection_threshold = selection_threshold
        self.pre_dispatch = pre_dispatch
        self.memory = memory

    def _make_estimator_and_params(self, X, y):
        alpha = self.alpha
        if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'):
            model = LassoLarsIC(precompute=self.precompute,
                                criterion=self.alpha,
                                max_iter=self.max_iter,
                                eps=self.eps)
            model.fit(X, y)
            self.alpha_ = alpha = model.alpha_

        precompute = self.precompute
        # A precomputed Gram array is useless, since _randomized_lasso
        # change X a each iteration
        if hasattr(precompute, '__array__'):
            precompute = 'auto'
        assert precompute in (True, False, None, 'auto')
        return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
                                       eps=self.eps,
                                       precompute=precompute)
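For context, a minimal usage sketch of the estimator defined in the file removed above; in scikit-learn releases before 0.21 the same class is available as `sklearn.linear_model.RandomizedLasso`, and the data below is made up purely for illustration:

```python
import numpy as np
from sklearn.linear_model import RandomizedLasso  # equivalent of the deleted class in scikit-learn < 0.21

X = np.random.uniform(0, 1, (200, 5))
y = 10 * X[:, 0] + np.random.normal(0, 1, 200)

rl = RandomizedLasso(n_resampling=200, selection_threshold=.25)
rl.fit(X, y)
print(rl.scores_)        # selection frequency of each feature, in [0, 1]
print(rl.get_support())  # boolean mask: scores_ > selection_threshold
```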
@@ -8,45 +8,42 @@
 * Recursive Feature Elimination (RFE)
 
 ### Running the program
-The file lab2.py contains and runs the program; ~~it seems~~ it needs no arguments or settings,
+The program runs on Python 3.7, since only there can the required version of the scikit-learn library, which still contains RandomizedLasso, be installed.
+The file lab2.py contains and runs the program; ~~it seems~~ it needs no arguments or settings.
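A start-up guard along these lines would make that requirement explicit; this is a sketch and not part of the lab's code:

```python
# RandomizedLasso was deprecated in scikit-learn 0.19 and removed in 0.21,
# so fail early with a clear message on newer installations.
try:
    from sklearn.linear_model import RandomizedLasso  # noqa: F401
except ImportError:
    raise SystemExit("This lab needs scikit-learn < 0.21 (e.g. an 0.19.x release), "
                     "which still provides RandomizedLasso.")
```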
 
 ### Program description
-The file rand_lasso.py contains an implementation of RandomizedLasso, which was deprecated as of scikit-learn 0.19 and removed in 0.21. The code is taken from their GitHub repository, version 0.19.
-Ready-made solutions from GitHub were also tried, but they were either not working at all or their results showed no correlation with the other models, which pointed to their poor quality.
 
 The file lab2.py contains the program itself.
 
 The program creates a data set with 10 features for subsequent ranking and processes it with the three models of the assignment variant.
 The program draws bar charts that show how the feature importance scores are distributed, and prints the features to the console sorted by decreasing importance.
 This makes it easy to identify the most important features.
 
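The plotting code itself lies outside the hunks shown above; a sketch of how the four panels created by `figure.subplots(1, 4)` in lab2.py could be filled, with the helper name and layout being assumptions:

```python
from matplotlib import pyplot as plt


def plot_rankings(ranks, names):
    """Draw one bar chart per ranking; `ranks` maps a title to {feature name: score}."""
    figure = plt.figure(1, figsize=(16, 9))
    axis = figure.subplots(1, len(ranks))
    for col, (title, scores) in enumerate(ranks.items()):
        axis[col].bar(range(len(names)), [scores[name] for name in names])
        axis[col].set_xticks(range(len(names)))
        axis[col].set_xticklabels(names, rotation=90)
        axis[col].set_title(title)
    plt.show()
```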
-At first, linear regression was used as the estimator in the RFE model. However, its results were then identical to those of the plain linear regression model.
-Therefore, the estimator was replaced with the SVR model suggested in the sklearn examples.
 
 ### Test results
 Based on the test results, the following can be said:
-* linear regression and recursive feature elimination give similar values, which nevertheless differ in the details.
+* linear regression performs well and identifies all 9 significant features.
-* randomized lasso gives strongly inflated scores, although they correlate more or less with the results of the other models.
+* randomized lasso does worse than the other models, sometimes ranking noise features as significant and significant features as noise.
-* the mean values give a balanced overall result.
+* recursive feature elimination performs well, correctly identifying the 9 most significant features.
-* it is impossible to say unambiguously which model is closest to the truth, because of the spread.
+* although linear regression and recursive feature elimination correctly identify the significant features, they assess the importance itself differently.
-* which model (or rather its implementation) is farthest from the truth is, on the contrary, fairly obvious.
+* the mean score makes it possible to determine the truly significant features with good confidence.
 
+In summary: if only a ranking is needed, the RFE model is sufficient; if, however, the features have to be analysed through their coefficients, i.e. with a quantitative measure, linear regression should be used. Randomized lasso is best avoided.
 
 Example console output:
 
 >Linear regression
 
->[('x4', 1.0), ('x1', 0.73), ('x2', 0.73), ('x5', 0.38), ('x10', 0.05), ('x6', 0.03), ('x9', 0.03), ('x3', 0.01), ('x7', 0.01), ('x8', 0.0)]
+>[('x1', 1.0), ('x4', 0.69), ('x2', 0.61), ('x11', 0.59), ('x3', 0.51), ('x13', 0.48), ('x5', 0.19), ('x12', 0.19), ('x14', 0.12), ('x8', 0.03), ('x6', 0.02), ('x10', 0.01), ('x7', 0.0), ('x9', 0.0)]
 
 >Random lasso
 
->[('x1', 1.0), ('x2', 1.0), ('x4', 1.0), ('x5', 1.0), ('x10', 0.97), ('x6', 0.89), ('x9', 0.82), ('x3', 0.55), ('x7', 0.36), ('x8', 0.0)]
+>[('x5', 1.0), ('x4', 0.76), ('x2', 0.74), ('x1', 0.72), ('x14', 0.44), ('x12', 0.32), ('x11', 0.28), ('x8', 0.22), ('x6', 0.17), ('x3', 0.08), ('x7', 0.02), ('x13', 0.02), ('x9', 0.01), ('x10', 0.0)]
 
 >RFE
 
->[('x4', 1.0), ('x1', 0.86), ('x2', 0.8), ('x5', 0.44), ('x10', 0.08), ('x6', 0.05), ('x7', 0.04), ('x3', 0.01), ('x8', 0.01), ('x9', 0.0)]
+>[('x4', 1.0), ('x1', 0.92), ('x11', 0.85), ('x2', 0.77), ('x3', 0.69), ('x13', 0.62), ('x5', 0.54), ('x12', 0.46), ('x14', 0.38), ('x8', 0.31), ('x6', 0.23), ('x10', 0.15), ('x7', 0.08), ('x9', 0.0)]
 
 >Mean
 
->[('x4', 1.0), ('x1', 0.86), ('x2', 0.84), ('x5', 0.61), ('x10', 0.37), ('x6', 0.32), ('x9', 0.28), ('x3', 0.19), ('x7', 0.14), ('x8', 0.0)]
+>[('x1', 0.88), ('x4', 0.82), ('x2', 0.71), ('x5', 0.58), ('x11', 0.57), ('x3', 0.43), ('x13', 0.37), ('x12', 0.32), ('x14', 0.31), ('x8', 0.19), ('x6', 0.14), ('x10', 0.05), ('x7', 0.03), ('x9', 0.0)]
 
-From these results we can conclude that the most influential features, in decreasing order of importance, are: x4, x1, x2, x5.
+From these results we can conclude that the most influential features, in decreasing order of importance, are: x1, x4, x2, x5.
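The "Mean" ranking above averages the three normalized rankings; a minimal sketch of such an aggregation over dicts like those returned by `rank_to_dict`, with the helper name being hypothetical:

```python
def mean_ranking(ranks):
    """Average per-model scores; `ranks` maps a model name to {feature name: score}."""
    mean = {}
    for scores in ranks.values():
        for name, value in scores.items():
            mean[name] = mean.get(name, 0.0) + value / len(ranks)
    return dict(sorted(mean.items(), key=lambda item: item[1], reverse=True))
```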