lab 2 is ready
This commit is contained in:
parent
06116369e5
commit
5d8a090a38
82
alexandrov_dmitrii_lab_2/lab2.py
Normal file
82
alexandrov_dmitrii_lab_2/lab2.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.feature_selection import RFE
|
||||||
|
from sklearn.preprocessing import MinMaxScaler
|
||||||
|
from sklearn.svm import SVR
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
|
||||||
|
from alexandrov_dmitrii_lab_2.rand_lasso import RandomizedLasso
|
||||||
|
|
||||||
|
# Shared plotting state consumed by createView(): a single figure holding
# four side-by-side axes that are filled from left to right.
figure = plt.figure(1, figsize=(16, 9))
axis = figure.subplots(1, 4)
# Index of the next unused subplot in ``axis``.
col = 0
# X positions of the bars (1..10).
y = list(range(1, 11))
|
||||||
|
|
||||||
|
|
||||||
|
def rank_to_dict(ranks, names, n_features):
    """Scale raw feature scores to [0, 1] and pair them with feature names.

    Parameters
    ----------
    ranks : array-like
        Raw scores holding exactly ``n_features`` values in total (any shape
        that can be reshaped to ``(n_features, 1)``). Only the absolute
        value of each score is used.
    names : iterable of str
        Feature names, in the same order as the scores.
    n_features : int
        Number of scores expected in ``ranks``.

    Returns
    -------
    dict
        Maps each name to its min-max scaled score, rounded to two decimals
        and stored as a builtin ``float``.
    """
    magnitudes = np.abs(ranks)
    column = np.array(magnitudes).reshape(n_features, 1)
    scaled = MinMaxScaler().fit_transform(column).ravel()
    # Convert the NumPy scalars to builtin floats: the values are unchanged,
    # but printed rankings stay readable (NumPy >= 2 shows its scalars as
    # ``np.float64(...)`` inside containers).
    scores = [float(round(value, 2)) for value in scaled]
    return dict(zip(names, scores))
|
||||||
|
|
||||||
|
|
||||||
|
def createView(key, val):
    """Draw one ranking as a bar chart in the next free subplot.

    Parameters
    ----------
    key : str
        Used as both the bar label and the subplot title.
    val : dict
        Scores to plot; the values are drawn in dict order at the x
        positions given by the module-level ``y``.

    Each call advances the module-level ``col`` so the following call draws
    into the next entry of ``axis``.
    """
    global col

    target = axis[col]
    heights = list(val.values())
    target.bar(y, heights, label=key)
    target.set_title(key)

    col += 1
|
||||||
|
|
||||||
|
|
||||||
|
def start():
    """Rank ten synthetic features with three models, then plot and print.

    Generates 750 samples of 10 uniform features, builds a target that
    depends only on the first five of them, fits LinearRegression,
    RandomizedLasso and RFE (with a linear SVR), converts each model's
    scores to a name -> score dict via ``rank_to_dict``, adds the per-feature
    mean over the models, draws every ranking with ``createView`` and prints
    each ranking sorted by descending score.
    """
    np.random.seed(random.randrange(50))
    size = 750
    n_features = 10
    X = np.random.uniform(0, 1, (size, n_features))

    # The noise is drawn once per sample. A single scalar draw would only
    # shift every target by the same constant, i.e. add no noise at all.
    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
         10 * X[:, 3] + 5 * X[:, 4] ** 5 + np.random.normal(0, 1, size))

    lr = LinearRegression()
    rl = RandomizedLasso()
    # NOTE(review): n_features_to_select equals the total number of features,
    # so RFE presumably eliminates nothing and estimator_.coef_ is simply the
    # linear SVR's coefficients - confirm this is intended.
    rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=n_features)
    lr.fit(X, Y)
    rl.fit(X, Y)
    rfe.fit(X, Y)

    names = ["x%s" % i for i in range(1, n_features + 1)]

    ranks = {"Linear regression": rank_to_dict(lr.coef_, names, n_features),
             "Random lasso": rank_to_dict(rl.scores_, names, n_features),
             "RFE": rank_to_dict(rfe.estimator_.coef_, names, n_features)}

    # Sum each feature's score over the models, then average.
    totals = {}
    for scores in ranks.values():
        for name, score in scores.items():
            totals[name] = totals.get(name, 0) + score

    n_models = len(ranks)  # taken before "Mean" itself is added
    ranks["Mean"] = {name: round(total / n_models, 2)
                     for name, total in totals.items()}

    # Plot in feature order first, then replace each dict by a list of
    # (name, score) pairs sorted by descending score for printing.
    for key, value in ranks.items():
        createView(key, value)
        ranks[key] = sorted(value.items(), key=lambda pair: pair[1],
                            reverse=True)
    for key, value in ranks.items():
        print(key)
        print(value)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry: compute, draw and print the rankings, then show the figure.
start()
plt.show()
|
399
alexandrov_dmitrii_lab_2/rand_lasso.py
Normal file
399
alexandrov_dmitrii_lab_2/rand_lasso.py
Normal file
@ -0,0 +1,399 @@
|
|||||||
|
import numbers
|
||||||
|
import warnings
|
||||||
|
from abc import ABCMeta, abstractmethod
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import scipy.sparse as sp
|
||||||
|
import six
|
||||||
|
from joblib import Memory, Parallel, delayed
|
||||||
|
from scipy.interpolate import interp1d
|
||||||
|
from sklearn.preprocessing import normalize as f_normalize
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from sklearn.feature_selection import SelectorMixin
|
||||||
|
from sklearn.linear_model import lars_path, LassoLarsIC
|
||||||
|
from sklearn.utils import check_X_y, check_random_state, safe_mask, check_array
|
||||||
|
|
||||||
|
__all__ = ['RandomizedLasso']
|
||||||
|
|
||||||
|
from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
|
||||||
|
|
||||||
|
from sklearn.utils.validation import check_is_fitted, as_float_array, FLOAT_DTYPES
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                     sample_weight=None, return_mean=False):
    """Center (and optionally normalize) X and y before fitting.

    When ``fit_intercept`` is false nothing is centered or scaled and the
    returned offsets are zeros and the scale is ones. Otherwise:

    * dense X is centered on its (optionally weighted) column means and,
      if ``normalize`` is true, passed through ``f_normalize`` along axis 0;
    * sparse X is never centered; if ``normalize`` is true its columns are
      scaled in place by ``sqrt(variance * n_samples)`` (zero scales are
      replaced by 1). The column means are reported only when
      ``return_mean`` is true, otherwise ``X_offset`` is zeroed;
    * y is centered on its (optionally weighted) mean.

    A scalar ``sample_weight`` is treated as "no weights". y is always
    converted to X's dtype.

    Returns
    -------
    X, y, X_offset, y_offset, X_scale
        The processed data followed by the statistics that were removed, so
        that ``X = (X - X_offset) / X_scale`` for the dense case.
    """
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None

    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
                    dtype=FLOAT_DTYPES)
    y = np.asarray(y, dtype=X.dtype)
    n_columns = X.shape[1]

    # No intercept: leave the data untouched and report neutral statistics.
    if not fit_intercept:
        X_offset = np.zeros(n_columns, dtype=X.dtype)
        X_scale = np.ones(n_columns, dtype=X.dtype)
        if y.ndim == 1:
            y_offset = X.dtype.type(0)
        else:
            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
        return X, y, X_offset, y_offset, X_scale

    if sp.issparse(X):
        X_offset, X_var = mean_variance_axis(X, axis=0)
        if not return_mean:
            X_offset[:] = X.dtype.type(0)

        if not normalize:
            X_scale = np.ones(n_columns, dtype=X.dtype)
        else:
            # Turn the variance into a norm, reusing the same buffer.
            X_var *= X.shape[0]
            X_scale = np.sqrt(X_var, X_var)
            del X_var
            X_scale[X_scale == 0] = 1
            inplace_column_scale(X, 1. / X_scale)
    else:
        X_offset = np.average(X, axis=0, weights=sample_weight)
        X -= X_offset
        if not normalize:
            X_scale = np.ones(n_columns, dtype=X.dtype)
        else:
            X, X_scale = f_normalize(X, axis=0, copy=False,
                                     return_norm=True)

    y_offset = np.average(y, axis=0, weights=sample_weight)
    y = y - y_offset

    return X, y, X_offset, y_offset, X_scale
|
||||||
|
|
||||||
|
|
||||||
|
def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
                    n_jobs=1, verbose=False, pre_dispatch='3*n_jobs',
                    random_state=None, sample_fraction=.75, **params):
    """Average the outputs of ``estimator_func`` over random perturbations.

    For each of ``n_resampling`` rounds the estimator is called with a random
    per-feature ``weights`` vector (each entry is either 0 or
    ``1 - scaling``) and a random boolean per-sample ``mask`` (an entry is
    true when a uniform draw falls below ``sample_fraction``). The results
    are summed and divided by ``n_resampling``.

    Raises
    ------
    ValueError
        If ``scaling`` is not strictly between 0 and 1.
    """
    random_state = check_random_state(random_state)
    n_samples, n_features = X.shape

    if not (0 < scaling < 1):
        raise ValueError(
            "'scaling' should be between 0 and 1. Got %r instead." % scaling)

    # We are generating 1 - weights, and not weights
    scaling = 1. - scaling

    def _perturbed_calls():
        # Draw order per round matters for reproducibility: first the
        # feature weights, then the sample mask.
        for _ in range(n_resampling):
            yield delayed(estimator_func)(
                X, y,
                weights=scaling * random_state.randint(
                    0, 2, size=(n_features,)),
                mask=(random_state.rand(n_samples) < sample_fraction),
                verbose=max(0, verbose - 1),
                **params)

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    scores_ = 0.0
    for active_set in runner(_perturbed_calls()):
        scores_ += active_set

    scores_ /= n_resampling
    return scores_
|
||||||
|
|
||||||
|
|
||||||
|
class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
                                                   SelectorMixin)):
    """Base class to implement randomized linear models for feature selection

    This implements the strategy by Meinshausen and Buhlman:
    stability selection with randomized sampling, and random re-weighting of
    the penalty.

    Subclasses must provide ``__init__`` (setting the attributes read by
    ``fit``: ``fit_intercept``, ``normalize``, ``memory``, ``scaling``,
    ``n_resampling``, ``n_jobs``, ``verbose``, ``pre_dispatch``,
    ``random_state``, ``sample_fraction``, ``selection_threshold``) and
    ``_make_estimator_and_params``.
    """

    @abstractmethod
    def __init__(self):
        pass

    _preprocess_data = staticmethod(_preprocess_data)

    def fit(self, X, y):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data.

        y : array-like, shape = [n_samples]
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If ``self.memory`` is neither None, a string nor a
            ``joblib.Memory`` instance.
        """
        X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
                         ensure_min_samples=2, estimator=self)
        X = as_float_array(X, copy=False)
        n_samples, n_features = X.shape

        X, y, X_offset, y_offset, X_scale = \
            self._preprocess_data(X, y, self.fit_intercept, self.normalize)

        estimator_func, params = self._make_estimator_and_params(X, y)

        # joblib renamed Memory's ``cachedir`` argument to ``location`` and
        # later removed the old keyword, so only ``location`` is used here.
        memory = self.memory
        if memory is None:
            memory = Memory(location=None, verbose=0)
        elif isinstance(memory, six.string_types):
            memory = Memory(location=memory, verbose=0)
        elif not isinstance(memory, Memory):
            raise ValueError("'memory' should either be a string or"
                             " a joblib.Memory"
                             " instance, got 'memory={!r}' instead.".format(
                                 type(memory)))

        scores_ = memory.cache(
            _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
        )(
            estimator_func, X, y,
            scaling=self.scaling, n_resampling=self.n_resampling,
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=self.pre_dispatch, random_state=self.random_state,
            sample_fraction=self.sample_fraction, **params)

        # Always store a 2-D array (one column per regularization value);
        # the per-feature score is the maximum over those columns.
        if scores_.ndim == 1:
            scores_ = scores_[:, np.newaxis]
        self.all_scores_ = scores_
        self.scores_ = np.max(self.all_scores_, axis=1)
        return self

    def _make_estimator_and_params(self, X, y):
        """Return the parameters passed to the estimator"""
        raise NotImplementedError

    def _get_support_mask(self):
        """Get the boolean mask indicating which features are selected.

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected
            for retention, i.e. its score exceeds ``selection_threshold``.
        """
        check_is_fitted(self, 'scores_')
        return self.scores_ > self.selection_threshold
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Randomized lasso: regression settings
|
||||||
|
|
||||||
|
def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
                      precompute=False, eps=np.finfo(float).eps,
                      max_iter=500):
    """Run one randomized Lasso fit and report which features were selected.

    Parameters
    ----------
    X, y : arrays
        Training data; only the rows selected by ``mask`` are used.
    weights : array
        Per-feature factors; each column of X is multiplied by
        ``1 - weights``.
    mask : boolean array
        Selects the samples used for this fit.
    alpha : float or array-like
        Regularization value(s) at which selection is evaluated.
    verbose, precompute, eps, max_iter
        Forwarded to ``lars_path`` (``precompute`` as its ``Gram`` argument).

    Returns
    -------
    scores : boolean array
        True where the Lasso coefficient is non-zero. Shape is
        ``(n_features,)`` for a single alpha and
        ``(n_features, len(alpha))`` for several.
    """
    X = X[safe_mask(X, mask)]
    y = y[mask]

    # Center X and y to avoid fit the intercept
    X -= X.mean(axis=0)
    y -= y.mean()

    alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))

    X = (1 - weights) * X

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        alphas_, _, coef_ = lars_path(X, y,
                                      Gram=precompute, copy_X=False,
                                      copy_Gram=False, alpha_min=np.min(alpha),
                                      method='lasso', verbose=verbose,
                                      max_iter=max_iter, eps=eps)

    if len(alpha) > 1:
        if len(alphas_) > 1:  # np.min(alpha) < alpha_min
            # lars_path returns alphas in decreasing order; interp1d is fed
            # the reversed (increasing) sequence.
            interpolator = interp1d(alphas_[::-1], coef_[:, ::-1],
                                    bounds_error=False, fill_value=0.)
            scores = (interpolator(alpha) != 0.0)
        else:
            # ``np.bool`` was removed from NumPy; the builtin is equivalent.
            scores = np.zeros((X.shape[1], len(alpha)), dtype=bool)
    else:
        scores = coef_[:, -1] != 0.0
    return scores
|
||||||
|
|
||||||
|
|
||||||
|
class RandomizedLasso(BaseRandomizedLinearModel):
    """Randomized Lasso.

    Randomized Lasso works by subsampling the training data and
    computing a Lasso estimate where the penalty of a random subset of
    coefficients has been scaled. By performing this double
    randomization several times, the method assigns high scores to
    features that are repeatedly selected across randomizations. This
    is known as stability selection. In short, features selected more
    often are considered good features.

    Parameters
    ----------
    alpha : float, 'aic', or 'bic', optional (default='aic')
        The regularization parameter alpha parameter in the Lasso.
        Warning: this is not the alpha parameter in the stability selection
        article which is scaling.

    scaling : float, optional (default=0.5)
        The s parameter used to randomly scale the penalty of different
        features.
        Should be between 0 and 1.

    sample_fraction : float, optional (default=0.75)
        The fraction of samples to be used in each randomized design.
        Should be between 0 and 1. If 1, all samples are used.

    n_resampling : int, optional (default=200)
        Number of randomized models.

    selection_threshold : float, optional (default=0.25)
        The score above which features should be selected.

    fit_intercept : boolean, optional (default=True)
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    verbose : boolean or integer, optional (default=False)
        Sets the verbosity amount

    normalize : boolean, optional, default True
        If True, the regressors X will be normalized before regression.
        This parameter is ignored when `fit_intercept` is set to False.
        When the regressors are normalized, note that this makes the
        hyperparameters learned more robust and almost independent of
        the number of samples. The same property is not valid for
        standardized data. However, if you wish to standardize, please
        use `preprocessing.StandardScaler` before calling `fit` on an
        estimator with `normalize=False`.

    precompute : True | False | 'auto' | array-like
        Whether to use a precomputed Gram matrix to speed up calculations.
        If set to 'auto' let us decide.
        The Gram matrix can also be passed as argument, but it will be used
        only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.

    max_iter : integer, optional (default=500)
        Maximum number of iterations to perform in the Lars algorithm.

    eps : float, optional (default=machine epsilon of ``float``)
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Unlike the 'tol' parameter in some iterative
        optimization-based algorithms, this parameter does not control
        the tolerance of the optimization.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    n_jobs : integer, optional (default=1)
        Number of CPUs to use during the resampling. If '-1', use
        all the CPUs

    pre_dispatch : int, or string, optional (default='3*n_jobs')
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    memory : None, str or object with the joblib.Memory interface, optional
        (default=None)
        Used for internal caching. By default, no caching is done.
        If a string is given, it is the path to the caching directory.

    Attributes
    ----------
    scores_ : array, shape = [n_features]
        Feature scores between 0 and 1.

    all_scores_ : array, shape = [n_features, n_reg_parameter]
        Feature scores between 0 and 1 for all values of the regularization
        parameter. The reference article suggests ``scores_`` is the max of
        ``all_scores_``.

    alpha_ : float
        Only set when ``alpha`` is 'aic' or 'bic': the value chosen by
        ``LassoLarsIC``.

    References
    ----------
    Stability selection
    Nicolai Meinshausen, Peter Buhlmann
    Journal of the Royal Statistical Society: Series B
    Volume 72, Issue 4, pages 417-473, September 2010
    DOI: 10.1111/j.1467-9868.2010.00740.x

    See also
    --------
    Lasso, ElasticNet
    """
    # ``np.float`` was removed from NumPy; ``float`` yields the same epsilon.
    def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
                 n_resampling=200, selection_threshold=.25,
                 fit_intercept=True, verbose=False,
                 normalize=True, precompute='auto',
                 max_iter=500,
                 eps=np.finfo(float).eps, random_state=None,
                 n_jobs=1, pre_dispatch='3*n_jobs',
                 memory=None):
        self.alpha = alpha
        self.scaling = scaling
        self.sample_fraction = sample_fraction
        self.n_resampling = n_resampling
        self.fit_intercept = fit_intercept
        self.max_iter = max_iter
        self.verbose = verbose
        self.normalize = normalize
        self.precompute = precompute
        self.eps = eps
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.selection_threshold = selection_threshold
        self.pre_dispatch = pre_dispatch
        self.memory = memory

    def _make_estimator_and_params(self, X, y):
        """Return ``_randomized_lasso`` and the keyword arguments to call it with.

        When ``alpha`` is 'aic' or 'bic', a ``LassoLarsIC`` model is fitted
        on X, y first and its selected ``alpha_`` is used (and stored on
        ``self.alpha_``).
        """
        alpha = self.alpha
        if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'):
            model = LassoLarsIC(precompute=self.precompute,
                                criterion=self.alpha,
                                max_iter=self.max_iter,
                                eps=self.eps)
            model.fit(X, y)
            self.alpha_ = alpha = model.alpha_

        precompute = self.precompute
        # A precomputed Gram array is useless, since _randomized_lasso
        # changes X at each iteration
        if hasattr(precompute, '__array__'):
            precompute = 'auto'
        assert precompute in (True, False, None, 'auto')
        return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
                                       eps=self.eps,
                                       precompute=precompute)
|
52
alexandrov_dmitrii_lab_2/readme.md
Normal file
52
alexandrov_dmitrii_lab_2/readme.md
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
### Задание
|
||||||
|
Выполнить ранжирование признаков с помощью указанных по варианту моделей. Отобразить получившиеся значения/оценки каждого признака каждым методом/моделью и среднюю оценку. Провести анализ получившихся результатов. Определить, какие четыре признака оказались самыми важными по среднему значению.
|
||||||
|
|
||||||
|
Вариант 1.
|
||||||
|
Модели:
|
||||||
|
* Линейная регрессия (LinearRegression)
|
||||||
|
* Случайное Лассо (RandomizedLasso)
|
||||||
|
* Рекурсивное сокращение признаков (Recursive Feature Elimination – RFE)
|
||||||
|
|
||||||
|
### Запуск программы
|
||||||
|
Файл lab2.py содержит и запускает программу; аргументов и настройки она не требует.
|
||||||
|
|
||||||
|
### Описание программы
|
||||||
|
Файл rand_lasso.py содержит реализацию RandomizedLasso, которая была объявлена устаревшей в sklearn 0.19 и удалена в версии 0.21. Код взят из репозитория scikit-learn, версия 0.19.
|
||||||
|
Пробовались готовые решения с гита, однако они были либо совсем нерабочими, либо у их результатов не прослеживалось корреляции с остальными моделями, что говорило об их некачественности.
|
||||||
|
|
||||||
|
Файл lab2.py содержит непосредственно программу.
|
||||||
|
|
||||||
|
Программа создаёт набор данных с 10 признаками для последующего их ранжирования, и обрабатывает тремя моделями по варианту.
|
||||||
|
Программа строит столбчатые диаграммы, которые показывают как распределились оценки важности признаков, и выводит в консоль отсортированные по убыванию важности признаки.
|
||||||
|
Таким образом можно легко определить наиважнейшие признаки.
|
||||||
|
|
||||||
|
Сперва в качестве оценщика в модели RFE использовалась линейная регрессия. Однако тогда результаты были идентичны с результатами обычной модели линейной регрессии.
|
||||||
|
Поэтому оценщик был заменён на предложенную в примерах sklearn модель SVR.
|
||||||
|
|
||||||
|
### Результаты тестирования
|
||||||
|
По результатам тестирования, можно сказать следующее:
|
||||||
|
* линейная регрессия и рекурсивное сокращение признаков показывают близкие значения, которые, тем не менее, расходятся в деталях.
|
||||||
|
* случайное лассо показывает сильно завышенные результаты, однако они более-менее коррелируют с результатами других моделей.
|
||||||
|
* средние значения позволяют выявить взвешенный результат.
|
||||||
|
* однозначно определить, какая модель ближе к действительности, невозможно из-за разброса результатов.
|
||||||
|
* напротив, достаточно очевидно, какая модель (точнее, её реализация) дальше всего от действительности.
|
||||||
|
|
||||||
|
Пример консольных результатов:
|
||||||
|
|
||||||
|
>Linear regression
|
||||||
|
|
||||||
|
>[('x4', 1.0), ('x1', 0.73), ('x2', 0.73), ('x5', 0.38), ('x10', 0.05), ('x6', 0.03), ('x9', 0.03), ('x3', 0.01), ('x7', 0.01), ('x8', 0.0)]
|
||||||
|
|
||||||
|
>Random lasso
|
||||||
|
|
||||||
|
>[('x1', 1.0), ('x2', 1.0), ('x4', 1.0), ('x5', 1.0), ('x10', 0.97), ('x6', 0.89), ('x9', 0.82), ('x3', 0.55), ('x7', 0.36), ('x8', 0.0)]
|
||||||
|
|
||||||
|
>RFE
|
||||||
|
|
||||||
|
>[('x4', 1.0), ('x1', 0.86), ('x2', 0.8), ('x5', 0.44), ('x10', 0.08), ('x6', 0.05), ('x7', 0.04), ('x3', 0.01), ('x8', 0.01), ('x9', 0.0)]
|
||||||
|
|
||||||
|
>Mean
|
||||||
|
|
||||||
|
>[('x4', 1.0), ('x1', 0.86), ('x2', 0.84), ('x5', 0.61), ('x10', 0.37), ('x6', 0.32), ('x9', 0.28), ('x3', 0.19), ('x7', 0.14), ('x8', 0.0)]
|
||||||
|
|
||||||
|
По данным результатам можно заключить, что наиболее влиятельные признаки по убыванию: x4, x1, x2, x5.
|
Loading…
x
Reference in New Issue
Block a user