lab 2 is ready

2023-10-04 22:14:04 +04:00
parent 06116369e5
commit 5d8a090a38
3 changed files with 533 additions and 0 deletions
--- a/alexandrov_dmitrii_lab_2/lab2.py
+++ b/alexandrov_dmitrii_lab_2/lab2.py
@@ -0,0 +1,82 @@
+from sklearn.linear_model import LinearRegression
+from sklearn.feature_selection import RFE
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.svm import SVR
+from matplotlib import pyplot as plt
+import numpy as np
+import random
+
+from alexandrov_dmitrii_lab_2.rand_lasso import RandomizedLasso
+
+figure = plt.figure(1, figsize=(16, 9))
+axis = figure.subplots(1, 4)
+col = 0
+y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+
+def rank_to_dict(ranks, names, n_features):
+    ranks = np.abs(ranks)
+    minmax = MinMaxScaler()
+    ranks = minmax.fit_transform(np.array(ranks).reshape(n_features, 1)).ravel()
+    ranks = map(lambda x: round(x, 2), ranks)
+    return dict(zip(names, ranks))
+
+
+def createView(key, val):
+    global figure
+    global axis
+    global col
+    global y
+
+    axis[col].bar(y, list(val.values()), label=key)
+    axis[col].set_title(key)
+
+    col = col + 1
+
+
+def start():
+    np.random.seed(random.randrange(50))
+    size = 750
+    n_features = 10
+    X = np.random.uniform(0, 1, (size, n_features))
+
+    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
+         10 * X[:, 3] + 5 * X[:, 4] ** 5 + np.random.normal(0, 1))
+
+    lr = LinearRegression()
+    rl = RandomizedLasso()
+    rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=n_features)
+    lr.fit(X, Y)
+    rl.fit(X, Y)
+    rfe.fit(X, Y)
+
+    names = ["x%s" % i for i in range(1, n_features + 1)]
+
+    ranks = {"Linear regression": rank_to_dict(lr.coef_, names, n_features),
+             "Random lasso": rank_to_dict(rl.scores_, names, n_features),
+             "RFE": rank_to_dict(rfe.estimator_.coef_, names, n_features)}
+
+    mean = {}
+
+    for key, value in ranks.items():
+        for item in value.items():
+            if item[0] not in mean:
+                mean[item[0]] = 0
+            mean[item[0]] += item[1]
+
+    for key, value in mean.items():
+        res = value / len(ranks)
+        mean[key] = round(res, 2)
+
+    ranks["Mean"] = mean
+
+    for key, value in ranks.items():
+        createView(key, value)
+        ranks[key] = sorted(value.items(), key=lambda y: y[1], reverse=True)
+    for key, value in ranks.items():
+        print(key)
+        print(value)
+
+
+start()
+plt.show()
--- a/alexandrov_dmitrii_lab_2/rand_lasso.py
+++ b/alexandrov_dmitrii_lab_2/rand_lasso.py
@@ -0,0 +1,399 @@
+import numbers
+import warnings
+from abc import ABCMeta, abstractmethod
+
+import numpy as np
+import scipy.sparse as sp
+import six
+from joblib import Memory, Parallel, delayed
+from scipy.interpolate import interp1d
+from sklearn.preprocessing import normalize as f_normalize
+from sklearn.base import BaseEstimator
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.feature_selection import SelectorMixin
+from sklearn.linear_model import lars_path, LassoLarsIC
+from sklearn.utils import check_X_y, check_random_state, safe_mask, check_array
+
+__all__ = ['RandomizedLasso']
+
+from sklearn.utils.sparsefuncs import inplace_column_scale, mean_variance_axis
+
+from sklearn.utils.validation import check_is_fitted, as_float_array, FLOAT_DTYPES
+
+
+def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
+                     sample_weight=None, return_mean=False):
+    """
+    Centers data to have mean zero along axis 0. If fit_intercept=False or if
+    the X is a sparse matrix, no centering is done, but normalization can still
+    be applied. The function returns the statistics necessary to reconstruct
+    the input data, which are X_offset, y_offset, X_scale, such that the output
+
+        X = (X - X_offset) / X_scale
+
+    X_scale is the L2 norm of X - X_offset. If sample_weight is not None,
+    then the weighted mean of X and y is zero, and not the mean itself. If
+    return_mean=True, the mean, eventually weighted, is returned, independently
+    of whether X was centered (option used for optimization with sparse data in
+    coordinate_descend).
+
+    This is here because nearly all linear models will want their data to be
+    centered. This function also systematically makes y consistent with X.dtype
+    """
+
+    if isinstance(sample_weight, numbers.Number):
+        sample_weight = None
+
+    X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
+                    dtype=FLOAT_DTYPES)
+    y = np.asarray(y, dtype=X.dtype)
+
+    if fit_intercept:
+        if sp.issparse(X):
+            X_offset, X_var = mean_variance_axis(X, axis=0)
+            if not return_mean:
+                X_offset[:] = X.dtype.type(0)
+
+            if normalize:
+                X_var *= X.shape[0]
+                X_scale = np.sqrt(X_var, X_var)
+                del X_var
+                X_scale[X_scale == 0] = 1
+                inplace_column_scale(X, 1. / X_scale)
+            else:
+                X_scale = np.ones(X.shape[1], dtype=X.dtype)
+
+        else:
+            X_offset = np.average(X, axis=0, weights=sample_weight)
+            X -= X_offset
+            if normalize:
+                X, X_scale = f_normalize(X, axis=0, copy=False,
+                                         return_norm=True)
+            else:
+                X_scale = np.ones(X.shape[1], dtype=X.dtype)
+        y_offset = np.average(y, axis=0, weights=sample_weight)
+        y = y - y_offset
+    else:
+        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
+        X_scale = np.ones(X.shape[1], dtype=X.dtype)
+        if y.ndim == 1:
+            y_offset = X.dtype.type(0)
+        else:
+            y_offset = np.zeros(y.shape[1], dtype=X.dtype)
+
+    return X, y, X_offset, y_offset, X_scale
+
+
+def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
+                    n_jobs=1, verbose=False, pre_dispatch='3*n_jobs',
+                    random_state=None, sample_fraction=.75, **params):
+    random_state = check_random_state(random_state)
+    # We are generating 1 - weights, and not weights
+    n_samples, n_features = X.shape
+
+    if not (0 < scaling < 1):
+        raise ValueError(
+            "'scaling' should be between 0 and 1. Got %r instead." % scaling)
+
+    scaling = 1. - scaling
+    scores_ = 0.0
+    for active_set in Parallel(n_jobs=n_jobs, verbose=verbose,
+                               pre_dispatch=pre_dispatch)(
+            delayed(estimator_func)(
+                X, y, weights=scaling * random_state.randint(
+                    0, 2, size=(n_features,)),
+                mask=(random_state.rand(n_samples) < sample_fraction),
+                verbose=max(0, verbose - 1),
+                **params)
+            for _ in range(n_resampling)):
+        scores_ += active_set
+
+    scores_ /= n_resampling
+    return scores_
+
+
+class BaseRandomizedLinearModel(six.with_metaclass(ABCMeta, BaseEstimator,
+                                                   SelectorMixin)):
+    """Base class to implement randomized linear models for feature selection
+
+    This implements the strategy by Meinshausen and Buhlman:
+    stability selection with randomized sampling, and random re-weighting of
+    the penalty.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        pass
+
+    _preprocess_data = staticmethod(_preprocess_data)
+
+    def fit(self, X, y):
+        """Fit the model using X, y as training data.
+
+        Parameters
+        ----------
+        X : array-like, shape = [n_samples, n_features]
+            Training data.
+
+        y : array-like, shape = [n_samples]
+            Target values. Will be cast to X's dtype if necessary
+
+        Returns
+        -------
+        self : object
+               Returns an instance of self.
+        """
+        X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
+                         ensure_min_samples=2, estimator=self)
+        X = as_float_array(X, copy=False)
+        n_samples, n_features = X.shape
+
+        X, y, X_offset, y_offset, X_scale = \
+            self._preprocess_data(X, y, self.fit_intercept, self.normalize)
+
+        estimator_func, params = self._make_estimator_and_params(X, y)
+        memory = self.memory
+        if memory is None:
+            memory = Memory(cachedir=None, verbose=0)
+        elif isinstance(memory, six.string_types):
+            memory = Memory(cachedir=memory, verbose=0)
+        elif not isinstance(memory, Memory):
+            raise ValueError("'memory' should either be a string or"
+                             " a sklearn.externals.joblib.Memory"
+                             " instance, got 'memory={!r}' instead.".format(
+                                 type(memory)))
+
+        scores_ = memory.cache(
+            _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
+        )(
+            estimator_func, X, y,
+            scaling=self.scaling, n_resampling=self.n_resampling,
+            n_jobs=self.n_jobs, verbose=self.verbose,
+            pre_dispatch=self.pre_dispatch, random_state=self.random_state,
+            sample_fraction=self.sample_fraction, **params)
+
+        if scores_.ndim == 1:
+            scores_ = scores_[:, np.newaxis]
+        self.all_scores_ = scores_
+        self.scores_ = np.max(self.all_scores_, axis=1)
+        return self
+
+    def _make_estimator_and_params(self, X, y):
+        """Return the parameters passed to the estimator"""
+        raise NotImplementedError
+
+    def _get_support_mask(self):
+        """Get the boolean mask indicating which features are selected.
+
+        Returns
+        -------
+        support : boolean array of shape [# input features]
+                  An element is True iff its corresponding feature is selected
+                  for retention.
+        """
+        check_is_fitted(self, 'scores_')
+        return self.scores_ > self.selection_threshold
+
+
+###############################################################################
+# Randomized lasso: regression settings
+
+def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
+                      precompute=False, eps=np.finfo(np.float).eps,
+                      max_iter=500):
+    X = X[safe_mask(X, mask)]
+    y = y[mask]
+
+    # Center X and y to avoid fit the intercept
+    X -= X.mean(axis=0)
+    y -= y.mean()
+
+    alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))
+
+    X = (1 - weights) * X
+
+    with warnings.catch_warnings():
+        warnings.simplefilter('ignore', ConvergenceWarning)
+        alphas_, _, coef_ = lars_path(X, y,
+                                      Gram=precompute, copy_X=False,
+                                      copy_Gram=False, alpha_min=np.min(alpha),
+                                      method='lasso', verbose=verbose,
+                                      max_iter=max_iter, eps=eps)
+
+    if len(alpha) > 1:
+        if len(alphas_) > 1:  # np.min(alpha) < alpha_min
+            interpolator = interp1d(alphas_[::-1], coef_[:, ::-1],
+                                    bounds_error=False, fill_value=0.)
+            scores = (interpolator(alpha) != 0.0)
+        else:
+            scores = np.zeros((X.shape[1], len(alpha)), dtype=np.bool)
+    else:
+        scores = coef_[:, -1] != 0.0
+    return scores
+
+
+class RandomizedLasso(BaseRandomizedLinearModel):
+    """Randomized Lasso.
+
+    Randomized Lasso works by subsampling the training data and
+    computing a Lasso estimate where the penalty of a random subset of
+    coefficients has been scaled. By performing this double
+    randomization several times, the method assigns high scores to
+    features that are repeatedly selected across randomizations. This
+    is known as stability selection. In short, features selected more
+    often are considered good features.
+
+    Parameters
+    ----------
+    alpha : float, 'aic', or 'bic', optional
+        The regularization parameter alpha parameter in the Lasso.
+        Warning: this is not the alpha parameter in the stability selection
+        article which is scaling.
+
+    scaling : float, optional
+        The s parameter used to randomly scale the penalty of different
+        features.
+        Should be between 0 and 1.
+
+    sample_fraction : float, optional
+        The fraction of samples to be used in each randomized design.
+        Should be between 0 and 1. If 1, all samples are used.
+
+    n_resampling : int, optional
+        Number of randomized models.
+
+    selection_threshold : float, optional
+        The score above which features should be selected.
+
+    fit_intercept : boolean, optional
+        whether to calculate the intercept for this model. If set
+        to false, no intercept will be used in calculations
+        (e.g. data is expected to be already centered).
+
+    verbose : boolean or integer, optional
+        Sets the verbosity amount
+
+    normalize : boolean, optional, default True
+        If True, the regressors X will be normalized before regression.
+        This parameter is ignored when `fit_intercept` is set to False.
+        When the regressors are normalized, note that this makes the
+        hyperparameters learned more robust and almost independent of
+        the number of samples. The same property is not valid for
+        standardized data. However, if you wish to standardize, please
+        use `preprocessing.StandardScaler` before calling `fit` on an
+        estimator with `normalize=False`.
+
+    precompute : True | False | 'auto' | array-like
+        Whether to use a precomputed Gram matrix to speed up calculations.
+        If set to 'auto' let us decide.
+        The Gram matrix can also be passed as argument, but it will be used
+        only for the selection of parameter alpha, if alpha is 'aic' or 'bic'.
+
+    max_iter : integer, optional
+        Maximum number of iterations to perform in the Lars algorithm.
+
+    eps : float, optional
+        The machine-precision regularization in the computation of the
+        Cholesky diagonal factors. Increase this for very ill-conditioned
+        systems. Unlike the 'tol' parameter in some iterative
+        optimization-based algorithms, this parameter does not control
+        the tolerance of the optimization.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    n_jobs : integer, optional
+        Number of CPUs to use during the resampling. If '-1', use
+        all the CPUs
+
+    pre_dispatch : int, or string, optional
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+
+            - An int, giving the exact number of total jobs that are
+              spawned
+
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'
+
+    memory : None, str or object with the joblib.Memory interface, optional \
+            (default=None)
+        Used for internal caching. By default, no caching is done.
+        If a string is given, it is the path to the caching directory.
+
+    Attributes
+    ----------
+    scores_ : array, shape = [n_features]
+        Feature scores between 0 and 1.
+
+    all_scores_ : array, shape = [n_features, n_reg_parameter]
+        Feature scores between 0 and 1 for all values of the regularization \
+        parameter. The reference article suggests ``scores_`` is the max of \
+        ``all_scores_``.
+
+
+    References
+    ----------
+    Stability selection
+    Nicolai Meinshausen, Peter Buhlmann
+    Journal of the Royal Statistical Society: Series B
+    Volume 72, Issue 4, pages 417-473, September 2010
+    DOI: 10.1111/j.1467-9868.2010.00740.x
+
+    See also
+    --------
+    RandomizedLogisticRegression, Lasso, ElasticNet
+    """
+    def __init__(self, alpha='aic', scaling=.5, sample_fraction=.75,
+                 n_resampling=200, selection_threshold=.25,
+                 fit_intercept=True, verbose=False,
+                 normalize=True, precompute='auto',
+                 max_iter=500,
+                 eps=np.finfo(np.float).eps, random_state=None,
+                 n_jobs=1, pre_dispatch='3*n_jobs',
+                 memory=None):
+        self.alpha = alpha
+        self.scaling = scaling
+        self.sample_fraction = sample_fraction
+        self.n_resampling = n_resampling
+        self.fit_intercept = fit_intercept
+        self.max_iter = max_iter
+        self.verbose = verbose
+        self.normalize = normalize
+        self.precompute = precompute
+        self.eps = eps
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+        self.selection_threshold = selection_threshold
+        self.pre_dispatch = pre_dispatch
+        self.memory = memory
+
+    def _make_estimator_and_params(self, X, y):
+        alpha = self.alpha
+        if isinstance(alpha, six.string_types) and alpha in ('aic', 'bic'):
+            model = LassoLarsIC(precompute=self.precompute,
+                                criterion=self.alpha,
+                                max_iter=self.max_iter,
+                                eps=self.eps)
+            model.fit(X, y)
+            self.alpha_ = alpha = model.alpha_
+
+        precompute = self.precompute
+        # A precomputed Gram array is useless, since _randomized_lasso
+        # change X a each iteration
+        if hasattr(precompute, '__array__'):
+            precompute = 'auto'
+        assert precompute in (True, False, None, 'auto')
+        return _randomized_lasso, dict(alpha=alpha, max_iter=self.max_iter,
+                                       eps=self.eps,
+                                       precompute=precompute)
--- a/alexandrov_dmitrii_lab_2/readme.md
+++ b/alexandrov_dmitrii_lab_2/readme.md
@@ -0,0 +1,52 @@
+### Задание
+Выполнить ранжирование признаков с помощью указанных по варианту моделей. Отобразить получившиеся значения\оценки каждого признака каждым методом\моделью и среднюю оценку. Провести анализ получившихся результатов. Определить, какие четыре признака оказались самыми важными по среднему значению.
+
+Вариант 1.
+Модели:
+* Линейная регрессия (LinearRegression)
+* Случайное Лассо (RandomizedLasso)
+* Рекурсивное сокращение признаков (Recursive Feature Elimination – RFE) 
+
+### Запуск программы
+Файл lab2.py содержит и запускает программу, аргументов и настройки ~~вроде~~ не требует, 
+
+### Описание программы
+Файл rand_lasso.py содержит реализацию RandomizedLasso, которая была 'устарена' со skilearn 0.19 и удалена с 0.21. Код взят с их гита, версии 0.19.
+Пробовались готовые решения с гита, однако они были либо совсем нерабочими, либо у их результатов не прослеживалось корреляции с остальными моделями, что говорило о их некачественности.
+
+Файл lab2.py содержит непосредственно программу.
+
+Программа создаёт набор данных с 10 признаками для последующего их ранжирования, и обрабатывает тремя моделями по варианту.
+Программа строит столбчатые диаграммы, которые показывают как распределились оценки важности признаков, и выводит в консоль отсортированные по убыванию важности признаки.
+Таким образом можно легко определить наиважнейшие признаки.
+
+Сперва в качестве оценщика в модели RFE использовалась линейная регрессия. Однако тогда результаты были идентичны с результатами обычной модели линейной регрессии.
+Поэтому оценщик был заменён на предложенную в примерах sklearn модель SVR.
+
+### Результаты тестирования
+По результатам тестирования, можно сказать следующее:
+* линейная регрессия и рекурсивное сокращение признаков показывают близкие значения, которые, тем не менее, расходятся в деталях.
+* случайное лассо показывает сильно завышенные результаты, однако они более-менее коррелируют с результатами других моделей.
+* средние значения позволяют выявить взвешенный результат.
+* определить, какая модель ближе к действительности однозначно сказать невозможно из-за разброса.
+* какая модель (её реализация) дальше всего от действительности наоборот немного очевидно.
+
+Пример консольных результатов:
+
+>Linear regression
+
+>[('x4', 1.0), ('x1', 0.73), ('x2', 0.73), ('x5', 0.38), ('x10', 0.05), ('x6', 0.03), ('x9', 0.03), ('x3', 0.01), ('x7', 0.01), ('x8', 0.0)]
+
+>Random lasso
+
+>[('x1', 1.0), ('x2', 1.0), ('x4', 1.0), ('x5', 1.0), ('x10', 0.97), ('x6', 0.89), ('x9', 0.82), ('x3', 0.55), ('x7', 0.36), ('x8', 0.0)]
+
+>RFE
+
+>[('x4', 1.0), ('x1', 0.86), ('x2', 0.8), ('x5', 0.44), ('x10', 0.08), ('x6', 0.05), ('x7', 0.04), ('x3', 0.01), ('x8', 0.01), ('x9', 0.0)]
+
+>Mean
+
+>[('x4', 1.0), ('x1', 0.86), ('x2', 0.84), ('x5', 0.61), ('x10', 0.37), ('x6', 0.32), ('x9', 0.28), ('x3', 0.19), ('x7', 0.14), ('x8', 0.0)]
+
+По данным результатам можно заключить, что наиболее влиятельные признаки по убыванию: x4, x1, x2, x5.