diff --git a/alexandrov_dmitrii_lab_2/lab2.py b/alexandrov_dmitrii_lab_2/lab2.py
index d2da8f0..5416957 100644
--- a/alexandrov_dmitrii_lab_2/lab2.py
+++ b/alexandrov_dmitrii_lab_2/lab2.py
@@ -1,17 +1,14 @@
-from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import LinearRegression, RandomizedLasso
 from sklearn.feature_selection import RFE
 from sklearn.preprocessing import MinMaxScaler
-from sklearn.svm import SVR
 from matplotlib import pyplot as plt
 import numpy as np
-import random
-
-from alexandrov_dmitrii_lab_2.rand_lasso import RandomizedLasso
+import random as rand
 
 figure = plt.figure(1, figsize=(16, 9))
 axis = figure.subplots(1, 4)
 col = 0
-y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
 
 
 def rank_to_dict(ranks, names, n_features):
@@ -35,26 +32,29 @@ def createView(key, val):
 
 
 def start():
-    np.random.seed(random.randrange(50))
+    np.random.seed(rand.randint(0, 50))
     size = 750
-    n_features = 10
+    n_features = 14
     X = np.random.uniform(0, 1, (size, n_features))
     Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
          10 * X[:, 3] + 5 * X[:, 4] ** 5 + np.random.normal(0, 1))
+    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
 
     lr = LinearRegression()
     rl = RandomizedLasso()
-    rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=n_features)
+    rfe = RFE(estimator=LinearRegression(), n_features_to_select=1)
 
     lr.fit(X, Y)
     rl.fit(X, Y)
    rfe.fit(X, Y)
 
     names = ["x%s" % i for i in range(1, n_features + 1)]
-
+    rfe_res = rfe.ranking_
+    for i in range(rfe_res.size):
+        rfe_res[i] = 14 - rfe_res[i]
     ranks = {"Linear regression": rank_to_dict(lr.coef_, names, n_features),
              "Random lasso": rank_to_dict(rl.scores_, names, n_features),
-             "RFE": rank_to_dict(rfe.estimator_.coef_, names, n_features)}
+             "RFE": rank_to_dict(rfe_res, names, n_features)}
 
     mean = {}
diff --git a/alexandrov_dmitrii_lab_2/rand_lasso.py b/alexandrov_dmitrii_lab_2/rand_lasso.py
deleted file mode 100644
index ab14f85..0000000
--- a/alexandrov_dmitrii_lab_2/rand_lasso.py
+++ /dev/null
@@ -1,399 +0,0 @@
-import numbers
-import warnings
-from abc import ABCMeta, abstractmethod
-
-import numpy as np
-import scipy.sparse as sp
-import six
-from joblib import Memory, Parallel, delayed
-from scipy.interpolate import interp1d
-from sklearn.preprocessing import normalize as f_normalize
-from sklearn.base import BaseEstimator
-from sklearn.exceptions import ConvergenceWarning
-from sklearn.feature_selection import SelectorMixin
-from sklearn.linear_model import lars_path, LassoLarsIC
-from sklearn.utils import check_X_y, check_random_state, safe_mask, check_array
-
-__all__ = ['RandomizedLasso']
-
-from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
-
-from sklearn.utils.validation import check_is_fitted, as_float_array, FLOAT_DTYPES
-
-
-def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
-                     sample_weight=None, return_mean=False):
-    """
-    Centers data to have mean zero along axis 0. If fit_intercept=False or if
-    the X is a sparse matrix, no centering is done, but normalization can still
-    be applied. The function returns the statistics necessary to reconstruct
-    the input data, which are X_offset, y_offset, X_scale, such that the output
-
-        X = (X - X_offset) / X_scale
-
-    X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
-    then the weighted mean of X and y is zero, and not the mean itself. If
-    return_mean=True, the mean, eventually weighted, is returned, independently
-    of whether X was centered (option used for optimization with sparse data in
-    coordinate_descend).
-
-    This is here because nearly all linear models will want their data to be
-    centered. This function also systematically makes y consistent with X.dtype
-    """
-
-    if isinstance(sample_weight, numbers.Number):
-        sample_weight = None
-
-    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
-                    dtype=FLOAT_DTYPES)
-    y = np.asarray(y, dtype=X.dtype)
-
-    if fit_intercept:
-        if sp.issparse(X):
-            X_offset, X_var = mean_variance_axis(X, axis=0)
-            if not return_mean:
-                X_offset[:] = X.dtype.type(0)
-
-            if normalize:
-                X_var *= X.shape[0]
-                X_scale = np.sqrt(X_var, X_var)
-                del X_var
-                X_scale[X_scale == 0] = 1
-                inplace_column_scale(X, 1. / X_scale)
-            else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
-
-        else:
-            X_offset = np.average(X, axis=0, weights=sample_weight)
-            X -= X_offset
-            if normalize:
-                X, X_scale = f_normalize(X, axis=0, copy=False,
-                                         return_norm=True)
-            else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
-        y_offset = np.average(y, axis=0, weights=sample_weight)
-        y = y - y_offset
-    else:
-        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
-        X_scale = np.ones(X.shape[1], dtype=X.dtype)
-        if y.ndim == 1:
-            y_offset = X.dtype.type(0)
-        else:
-            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
-
-    return X, y, X_offset, y_offset, X_scale
-
-
-def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
-                    n_jobs=1, verbose=False, pre_dispatch='3*n_jobs',
-                    random_state=None, sample_fraction=.75, **params):
-    random_state = check_random_state(random_state)
-    # We are generating 1 - weights, and not weights
-    n_samples, n_features = X.shape
-
-    if not (0 < scaling < 1):
-        raise ValueError(
-            "'scaling' should be between 0 and 1. Got %r instead." % scaling)
-
-    scaling = 1. - scaling
-    scores_ = 0.0
-    for active_set in Parallel(n_jobs=n_jobs, verbose=verbose,
-                               pre_dispatch=pre_dispatch)(
-            delayed(estimator_func)(
-                X, y, weights=scaling * random_state.randint(
-                    0, 2, size=(n_features,)),
-                mask=(random_state.rand(n_samples) < sample_fraction),
-                verbose=max(0, verbose - 1),
-                **params)
-            for _ in range(n_resampling)):
-        scores_ += active_set
-
-    scores_ /= n_resampling
-    return scores_
-
-
-class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
-                                                   SelectorMixin)):
-    """Base class to implement randomized linear models for feature selection
-
-    This implements the strategy by Meinshausen and Buhlman:
-    stability selection with randomized sampling, and random re-weighting of
-    the penalty.
-    """
-
-    @abstractmethod
-    def __init__(self):
-        pass
-
-    _preprocess_data = staticmethod(_preprocess_data)
-
-    def fit(self, X, y):
-        """Fit the model using X, y as training data.
-
-        Parameters
-        ----------
-        X : array-like, shape = [n_samples, n_features]
-            Training data.
-
-        y : array-like, shape = [n_samples]
-            Target values. Will be cast to X's dtype if necessary
-
-        Returns
-        -------
-        self : object
-            Returns an instance of self.
- """ - X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True, - ensure_min_samples=2, estimator=self) - X = as_float_array(X, copy=False) - n_samples, n_features = X.shape - - X, y, X_offset, y_offset, X_scale = \ - self._preprocess_data(X, y, self.fit_intercept, self.normalize) - - estimator_func, params = self._make_estimator_and_params(X, y) - memory = self.memory - if memory is None: - memory = Memory(cachedir=None, verbose=0) - elif isinstance(memory, six.string_types): - memory = Memory(cachedir=memory, verbose=0) - elif not isinstance(memory, Memory): - raise ValueError("'memory' should either be a string or" - " a sklearn.externals.joblib.Memory" - " instance, got 'memory={!r}' instead.".format( - type(memory))) - - scores_ = memory.cache( - _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch'] - )( - estimator_func, X, y, - scaling=self.scaling, n_resampling=self.n_resampling, - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=self.pre_dispatch, random_state=self.random_state, - sample_fraction=self.sample_fraction, **params) - - if scores_.ndim == 1: - scores_ = scores_[:, np.newaxis] - self.all_scores_ = scores_ - self.scores_ = np.max(self.all_scores_, axis=1) - return self - - def _make_estimator_and_params(self, X, y): - """Return the parameters passed to the estimator""" - raise NotImplementedError - - def _get_support_mask(self): - """Get the boolean mask indicating which features are selected. - - Returns - ------- - support : boolean array of shape [# input features] - An element is True iff its corresponding feature is selected - for retention. - """ - check_is_fitted(self, 'scores_') - return self.scores_ > self.selection_threshold - - -############################################################################### -# Randomized lasso: regression settings - -def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False, - precompute=False, eps=np.finfo(np.float).eps, - max_iter=500): - X = X[safe_mask(X, mask)] - y = y[mask] - - # Center X and y to avoid fit the intercept - X -= X.mean(axis=0) - y -= y.mean() - - alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64)) - - X = (1 - weights) * X - - with warnings.catch_warnings(): - warnings.simplefilter('ignore', ConvergenceWarning) - alphas_, _, coef_ = lars_path(X, y, - Gram=precompute, copy_X=False, - copy_Gram=False, alpha_min=np.min(alpha), - method='lasso', verbose=verbose, - max_iter=max_iter, eps=eps) - - if len(alpha) > 1: - if len(alphas_) > 1: # np.min(alpha) < alpha_min - interpolator = interp1d(alphas_[::-1], coef_[:, ::-1], - bounds_error=False, fill_value=0.) - scores = (interpolator(alpha) != 0.0) - else: - scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool) - else: - scores = coef_[:, -1] != 0.0 - return scores - - -class RandomizedLasso(BaseRandomizedLinearModel): - """Randomized Lasso. - - Randomized Lasso works by subsampling the training data and - computing a Lasso estimate where the penalty of a random subset of - coefficients has been scaled. By performing this double - randomization several times, the method assigns high scores to - features that are repeatedly selected across randomizations. This - is known as stability selection. In short, features selected more - often are considered good features. - - Parameters - ---------- - alpha : float, 'aic', or 'bic', optional - The regularization parameter alpha parameter in the Lasso. - Warning: this is not the alpha parameter in the stability selection - article which is scaling. 
-
-    scaling : float, optional
-        The s parameter used to randomly scale the penalty of different
-        features.
-        Should be between 0 and 1.
-
-    sample_fraction : float, optional
-        The fraction of samples to be used in each randomized design.
-        Should be between 0 and 1. If 1, all samples are used.
-
-    n_resampling : int, optional
-        Number of randomized models.
-
-    selection_threshold : float, optional
-        The score above which features should be selected.
-
-    fit_intercept : boolean, optional
-        whether to calculate the intercept for this model. If set
-        to false, no intercept will be used in calculations
-        (e.g. data is expected to be already centered).
-
-    verbose : boolean or integer, optional
-        Sets the verbosity amount
-
-    normalize : boolean, optional, default True
-        If True, the regressors X will be normalized before regression.
-        This parameter is ignored when `fit_intercept` is set to False.
-        When the regressors are normalized, note that this makes the
-        hyperparameters learned more robust and almost independent of
-        the number of samples. The same property is not valid for
-        standardized data. However, if you wish to standardize, please
-        use `preprocessing.StandardScaler` before calling `fit` on an
-        estimator with `normalize=False`.
-
-    precompute : True | False | 'auto' | array-like
-        Whether to use a precomputed Gram matrix to speed up calculations.
-        If set to 'auto' let us decide.
-        The Gram matrix can also be passed as argument, but it will be used
-        only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.
-
-    max_iter : integer, optional
-        Maximum number of iterations to perform in the Lars algorithm.
-
-    eps : float, optional
-        The machine-precision regularization in the computation of the
-        Cholesky diagonal factors. Increase this for very ill-conditioned
-        systems. Unlike the 'tol' parameter in some iterative
-        optimization-based algorithms, this parameter does not control
-        the tolerance of the optimization.
-
-    random_state : int, RandomState instance or None, optional (default=None)
-        If int, random_state is the seed used by the random number generator;
-        If RandomState instance, random_state is the random number generator;
-        If None, the random number generator is the RandomState instance used
-        by `np.random`.
-
-    n_jobs : integer, optional
-        Number of CPUs to use during the resampling. If '-1', use
-        all the CPUs
-
-    pre_dispatch : int, or string, optional
-        Controls the number of jobs that get dispatched during parallel
-        execution. Reducing this number can be useful to avoid an
-        explosion of memory consumption when more jobs get dispatched
-        than CPUs can process. This parameter can be:
-
-            - None, in which case all the jobs are immediately
-              created and spawned. Use this for lightweight and
-              fast-running jobs, to avoid delays due to on-demand
-              spawning of the jobs
-
-            - An int, giving the exact number of total jobs that are
-              spawned
-
-            - A string, giving an expression as a function of n_jobs,
-              as in '2*n_jobs'
-
-    memory : None, str or object with the joblib.Memory interface, optional \
-            (default=None)
-        Used for internal caching. By default, no caching is done.
-        If a string is given, it is the path to the caching directory.
-
-    Attributes
-    ----------
-    scores_ : array, shape = [n_features]
-        Feature scores between 0 and 1.
-
-    all_scores_ : array, shape = [n_features, n_reg_parameter]
-        Feature scores between 0 and 1 for all values of the regularization \
-        parameter. The reference article suggests ``scores_`` is the max of \
-        ``all_scores_``.
-
-
-    References
-    ----------
-    Stability selection
-    Nicolai Meinshausen, Peter Buhlmann
-    Journal of the Royal Statistical Society: Series B
-    Volume 72, Issue 4, pages 417-473, September 2010
-    DOI: 10.1111/j.1467-9868.2010.00740.x
-
-    See also
-    --------
-    RandomizedLogisticRegression, Lasso, ElasticNet
-    """
-    def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
-                 n_resampling=200, selection_threshold=.25,
-                 fit_intercept=True, verbose=False,
-                 normalize=True, precompute='auto',
-                 max_iter=500,
-                 eps=np.finfo(np.float).eps, random_state=None,
-                 n_jobs=1, pre_dispatch='3*n_jobs',
-                 memory=None):
-        self.alpha = alpha
-        self.scaling = scaling
-        self.sample_fraction = sample_fraction
-        self.n_resampling = n_resampling
-        self.fit_intercept = fit_intercept
-        self.max_iter = max_iter
-        self.verbose = verbose
-        self.normalize = normalize
-        self.precompute = precompute
-        self.eps = eps
-        self.random_state = random_state
-        self.n_jobs = n_jobs
-        self.selection_threshold = selection_threshold
-        self.pre_dispatch = pre_dispatch
-        self.memory = memory
-
-    def _make_estimator_and_params(self, X, y):
-        alpha = self.alpha
-        if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'):
-            model = LassoLarsIC(precompute=self.precompute,
-                                criterion=self.alpha,
-                                max_iter=self.max_iter,
-                                eps=self.eps)
-            model.fit(X, y)
-            self.alpha_ = alpha = model.alpha_
-
-        precompute = self.precompute
-        # A precomputed Gram array is useless, since _randomized_lasso
-        # change X a each iteration
-        if hasattr(precompute, '__array__'):
-            precompute = 'auto'
-        assert precompute in (True, False, None, 'auto')
-        return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
-                                       eps=self.eps,
-                                       precompute=precompute)
\ No newline at end of file
diff --git a/alexandrov_dmitrii_lab_2/readme.md b/alexandrov_dmitrii_lab_2/readme.md
index 84e5190..236151b 100644
--- a/alexandrov_dmitrii_lab_2/readme.md
+++ b/alexandrov_dmitrii_lab_2/readme.md
@@ -8,45 +8,42 @@
 * Recursive Feature Elimination (RFE)
 
 ### Running the program
-The file lab2.py contains and runs the program; it ~~seemingly~~ requires no arguments or configuration,
+The program runs on Python 3.7, because only on that version can the required scikit-learn release, which still contains RandomizedLasso, be installed.
+The file lab2.py contains and runs the program; it ~~seemingly~~ requires no arguments or configuration.
 
 ### Program description
-The file rand_lasso.py contains the RandomizedLasso implementation that was deprecated in scikit-learn 0.19 and removed in 0.21. The code was taken from their git repository, version 0.19.
-Ready-made solutions from GitHub were also tried, but they were either completely broken, or their results showed no correlation with the other models, which pointed to their poor quality.
-
-The file lab2.py contains the program itself. It creates a dataset with 10 features for subsequent ranking and processes it with the three models required by the assignment variant.
+The file lab2.py contains the program itself. It creates a dataset with 14 features for subsequent ranking and processes it with the three models required by the assignment variant.
 The program draws bar charts that show how the feature-importance scores are distributed, and prints the features to the console sorted by descending importance, so the most important features are easy to identify.
 
-At first linear regression was used as the estimator inside the RFE model, but the results were then identical to those of the plain linear regression model.
-That is why the estimator was replaced with the SVR model suggested in the sklearn examples.
-
 ### Test results
 Based on the test results, the following can be said:
-* linear regression and recursive feature elimination produce similar scores that nevertheless differ in the details.
-* randomized lasso produces strongly inflated scores, although they more or less correlate with the results of the other models.
-* the mean values yield a balanced result.
-* because of the spread it is impossible to say unambiguously which model is closest to the truth.
-* which model (or rather its implementation) is furthest from the truth is, on the contrary, fairly obvious.
+* linear regression performs well and identifies all 9 significant features.
+* randomized lasso does worse than the other models, sometimes marking noise features as significant and significant features as noise.
+* recursive feature elimination performs well, correctly identifying the 9 most significant features.
+* although linear regression and recursive feature elimination both identify the significant features correctly, they estimate the degree of significance differently.
+* the mean score makes it possible to identify the truly significant features with good confidence.
+
+In summary: if a plain ranking is all that is needed, the RFE model is sufficient; if the features must be analysed through an actual measure (coefficients), linear regression is the better choice. Randomized lasso is best avoided.
 
 Example console output:
 >Linear regression
->[('x4', 1.0), ('x1', 0.73), ('x2', 0.73), ('x5', 0.38), ('x10', 0.05), ('x6', 0.03), ('x9', 0.03), ('x3', 0.01), ('x7', 0.01), ('x8', 0.0)]
+>[('x1', 1.0), ('x4', 0.69), ('x2', 0.61), ('x11', 0.59), ('x3', 0.51), ('x13', 0.48), ('x5', 0.19), ('x12', 0.19), ('x14', 0.12), ('x8', 0.03), ('x6', 0.02), ('x10', 0.01), ('x7', 0.0), ('x9', 0.0)]
 
 >Random lasso
->[('x1', 1.0), ('x2', 1.0), ('x4', 1.0), ('x5', 1.0), ('x10', 0.97), ('x6', 0.89), ('x9', 0.82), ('x3', 0.55), ('x7', 0.36), ('x8', 0.0)]
+>[('x5', 1.0), ('x4', 0.76), ('x2', 0.74), ('x1', 0.72), ('x14', 0.44), ('x12', 0.32), ('x11', 0.28), ('x8', 0.22), ('x6', 0.17), ('x3', 0.08), ('x7', 0.02), ('x13', 0.02), ('x9', 0.01), ('x10', 0.0)]
 
 >RFE
->[('x4', 1.0), ('x1', 0.86), ('x2', 0.8), ('x5', 0.44), ('x10', 0.08), ('x6', 0.05), ('x7', 0.04), ('x3', 0.01), ('x8', 0.01), ('x9', 0.0)]
+>[('x4', 1.0), ('x1', 0.92), ('x11', 0.85), ('x2', 0.77), ('x3', 0.69), ('x13', 0.62), ('x5', 0.54), ('x12', 0.46), ('x14', 0.38), ('x8', 0.31), ('x6', 0.23), ('x10', 0.15), ('x7', 0.08), ('x9', 0.0)]
 
 >Mean
->[('x4', 1.0), ('x1', 0.86), ('x2', 0.84), ('x5', 0.61), ('x10', 0.37), ('x6', 0.32), ('x9', 0.28), ('x3', 0.19), ('x7', 0.14), ('x8', 0.0)]
+>[('x1', 0.88), ('x4', 0.82), ('x2', 0.71), ('x5', 0.58), ('x11', 0.57), ('x3', 0.43), ('x13', 0.37), ('x12', 0.32), ('x14', 0.31), ('x8', 0.19), ('x6', 0.14), ('x10', 0.05), ('x7', 0.03), ('x9', 0.0)]
 
-From these results we can conclude that the most influential features, in descending order, are: x4, x1, x2, x5.
\ No newline at end of file
+From these results we can conclude that the most influential features, in descending order, are: x1, x4, x2, x5.
\ No newline at end of file
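
The readme pins the project to Python 3.7 only because the last scikit-learn releases that still ship `RandomizedLasso` stop there. If that pin is undesirable, the underlying stability-selection idea can be approximated on current scikit-learn with a short helper. This is a minimal sketch under stated assumptions, not the removed class's API: the name `stability_scores`, the fixed `alpha`, and the default values are illustrative, whereas the removed implementation picked alpha by AIC.

```python
# Stability-selection sketch for current scikit-learn releases (no RandomizedLasso):
# subsample rows, randomly weaken the penalty on a random subset of columns by
# rescaling them, fit a Lasso, and count how often each coefficient is nonzero.
import numpy as np
from sklearn.linear_model import Lasso


def stability_scores(X, Y, alpha=0.02, scaling=0.5, sample_fraction=0.75,
                     n_resampling=200, random_state=0):
    rng = np.random.RandomState(random_state)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        rows = rng.rand(n_samples) < sample_fraction              # row subsample
        weights = 1.0 - scaling * rng.randint(0, 2, n_features)   # 1 or 1 - scaling per column
        model = Lasso(alpha=alpha, max_iter=5000)
        model.fit(X[rows] * weights, Y[rows])
        counts += model.coef_ != 0                                # selected in this resample
    return counts / n_resampling                                  # selection frequency in [0, 1]
```

The returned frequencies play the same role as `rl.scores_`, so they could be fed to `rank_to_dict` unchanged.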
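In the updated lab2.py the RFE ranking is flipped before scoring, because `rfe.ranking_` assigns 1 to the strongest feature and larger numbers to weaker ones; the `14 - rank` loop turns it into a "bigger is better" score. A small illustration of that flip, under the assumption (suggested by the `MinMaxScaler` import) that `rank_to_dict`, which the diff does not show, rescales each model's scores to [0, 1]:

```python
# Illustration of the ranking flip: rfe.ranking_ gives 1 to the best feature,
# so it is inverted into a score where larger means more important.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

ranking = np.array([3, 1, 4, 2, 5])      # stand-in for rfe.ranking_ on 5 features
scores = ranking.max() - ranking         # equivalent to the 14 - rank loop for 14 features
scaled = MinMaxScaler().fit_transform(scores.reshape(-1, 1)).ravel()
for name, value in zip(["x%d" % i for i in range(1, 6)], scaled):
    print(name, round(float(value), 2))  # x2 -> 1.0 (best) ... x5 -> 0.0 (worst)
```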
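The conclusions in the readme also lean on the mean score across the three models, but the code that fills `mean` in lab2.py lies outside the hunks shown above. The snippet below is a hypothetical reconstruction of that averaging step, using two feature scores copied from the example console output:

```python
# Hypothetical averaging step: mean of the per-model scores for each feature.
ranks = {
    "Linear regression": {"x1": 1.0, "x4": 0.69},
    "Random lasso": {"x1": 0.72, "x4": 0.76},
    "RFE": {"x1": 0.92, "x4": 1.0},
}
mean = {}
for model_scores in ranks.values():
    for name, score in model_scores.items():
        mean[name] = mean.get(name, 0.0) + score / len(ranks)
print({name: round(value, 2) for name, value in mean.items()})
# {'x1': 0.88, 'x4': 0.82} -- consistent with the Mean line in the example output
```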