Merge pull request 'abanin_daniil_lab_2' (#50) from abanin_daniil_lab_2 into main

Reviewed-on: http://student.git.athene.tech/Alexey/IIS_2023_1/pulls/50
2023-10-17 17:21:19 +04:00 · 2023-10-17 17:21:19 +04:00 · 5e00a83340
commit 5e00a83340
parent 2239c15572 abd650a641
5 changed files with 198 additions and 0 deletions
--- a/abanin_daniil_lab_2/README.md
+++ b/abanin_daniil_lab_2/README.md
@ -0,0 +1,41 @@
+## Лабораторная работа №2
+
+### Ранжирование признаков
+
+## ПИбд-41 Абанин Даниил
+
+### Как запустить лабораторную работу:
+
+* установить python, numpy, scipy, matplotlib, scikit-learn (импортируется как sklearn)
+* запустить проект (стартовая точка lab2)
+
+### Какие технологии использовались:
+
+* Язык программирования `Python`, библиотеки numpy, matplotlib, sklearn
+* Среда разработки `PyCharm`
+
+### Что делает лабораторная работа:
+
+* Генерирует данные и обучает такие модели, как: LinearRegression, RandomizedLasso, Recursive Feature Elimination (RFE)
+* Производится ранжирование признаков с помощью моделей LinearRegression, RandomizedLasso, Recursive Feature Elimination (RFE)
+* Отображение получившихся результатов: 4 самых важных признака по среднему значению, значения признаков для каждой модели
+
+
+### 4 самых важных признака по среднему значению
+* Параметр - x4, значение - 0.56
+* Параметр - x1, значение - 0.45
+* Параметр - x2, значение - 0.33
+* Параметр - x9, значение - 0.33
+
+#### Linear Regression
+[('x1', 1.0), ('x4', 0.69), ('x2', 0.61), ('x11', 0.59), ('x3', 0.51), ('x13', 0.48), ('x5', 0.19), ('x12', 0.19), ('x14', 0.12), ('x8', 0.03), ('x6', 0.02), ('x10', 0.01), ('x7', 0.0), ('x9', 0.0)]
+
+#### Recursive Feature Elimination
+[('x9', 1.0), ('x7', 0.86), ('x10', 0.71), ('x6', 0.57), ('x8', 0.43), ('x14', 0.29), ('x12', 0.14), ('x1', 0.0), ('x2', 0.0), ('x3', 0.0), ('x4', 0.0), ('x5', 0.0), ('x11', 0.0), ('x13', 0.0)]
+
+#### Randomize Lasso
+[('x4', 1.0), ('x2', 0.37), ('x1', 0.36), ('x5', 0.32), ('x6', 0.02), ('x8', 0.02), ('x3', 0.01), ('x7', 0.0), ('x9', 0.0), ('x10', 0.0), ('x11', 0.0), ('x12', 0.0), ('x13', 0.0), ('x14', 0.0)]
+
+#### Результаты:
+
+![Result](result.png)
--- a/abanin_daniil_lab_2/RadomizedLasso.py
+++ b/abanin_daniil_lab_2/RadomizedLasso.py
@ -0,0 +1,76 @@
+from sklearn.utils import check_X_y, check_random_state
+from sklearn.linear_model import Lasso
+from scipy.sparse import issparse
+from scipy import sparse
+
+
+def _rescale_data(x, weights):
+    """Scale every column of ``x`` by ``1 - weights``.
+
+    Parameters
+    ----------
+    x : array or scipy sparse matrix
+        Data whose columns are rescaled.
+    weights : ndarray
+        One weight per column; column ``j`` is multiplied by
+        ``1 - weights[j]``.
+
+    Returns
+    -------
+    The rescaled data. Dense input is scaled by broadcasting; sparse input
+    is multiplied by a diagonal matrix holding the same factors.
+    """
+    factors = 1 - weights
+    if not issparse(x):
+        return x * factors
+
+    # Sparse case: right-multiplying by a diagonal matrix scales the columns.
+    n = weights.shape[0]
+    return x * sparse.dia_matrix((factors, 0), (n, n))
+
+
+class RandomizedLasso(Lasso):
+    """
+    Randomized version of scikit-learn's Lasso class.
+
+    Randomized LASSO is a generalization of the LASSO. The LASSO penalises
+    the absolute value of the coefficients with a penalty term proportional
+    to `alpha`, but the randomized LASSO changes the penalty to a randomly
+    chosen value in the range `[alpha, alpha/weakness]`.
+
+    In this implementation the randomization happens once per call to
+    `fit`: every feature column is either left untouched or multiplied by
+    `weakness` (each with equal probability) and an ordinary Lasso is then
+    fitted on the rescaled data.
+
+    Parameters
+    ----------
+    weakness : float
+        Weakness value for randomized LASSO. Must be in (0, 1].
+        The integer ``1`` is accepted as well.
+
+    All remaining parameters are forwarded unchanged to
+    `sklearn.linear_model.Lasso`.
+
+    See also
+    --------
+    sklearn.linear_model.Lasso : the estimator this class extends.
+    """
+    def __init__(self, weakness=0.5, alpha=1.0, fit_intercept=True,
+                 precompute=False, copy_X=True, max_iter=1000,
+                 tol=1e-4, warm_start=False, positive=False,
+                 random_state=None, selection='cyclic'):
+        self.weakness = weakness
+        super().__init__(
+            alpha=alpha, fit_intercept=fit_intercept, precompute=precompute, copy_X=copy_X,
+            max_iter=max_iter, tol=tol, warm_start=warm_start,
+            positive=positive, random_state=random_state,
+            selection=selection)
+
+    def fit(self, X, y):
+        """Fit the model according to the given training data.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+            The training input samples.
+
+        y : array-like, shape = [n_samples]
+            The target values.
+
+        Returns
+        -------
+        Whatever `Lasso.fit` returns for the rescaled data.
+
+        Raises
+        ------
+        ValueError
+            If `weakness` is not a real number in (0, 1].
+        """
+        # Accept ints too (weakness=1 is a valid boundary value); bool is
+        # excluded explicitly because it is a subclass of int.
+        weakness_is_valid = (
+            isinstance(self.weakness, (int, float))
+            and not isinstance(self.weakness, bool)
+            and 0.0 < self.weakness <= 1.0
+        )
+        if not weakness_is_valid:
+            # The 1-tuple keeps the formatting safe if weakness is itself a tuple.
+            raise ValueError('weakness should be a float in (0, 1], got %s' % (self.weakness,))
+
+        X, y = check_X_y(X, y, accept_sparse=True)
+
+        n_features = X.shape[1]
+        weakness = 1. - self.weakness
+        random_state = check_random_state(self.random_state)
+
+        # Each weight is either 0 or (1 - weakness), so after rescaling a
+        # column is multiplied by either 1 or `self.weakness`.
+        weights = weakness * random_state.randint(0, 1 + 1, size=(n_features,))
+
+        # TODO: Check if this is a problem if it happens before standardization
+        X_rescaled = _rescale_data(X, weights)
+        return super().fit(X_rescaled, y)
--- a/abanin_daniil_lab_2/pycache/RadomizedLasso.cpython-39.pyc
+++ b/abanin_daniil_lab_2/pycache/RadomizedLasso.cpython-39.pyc
--- a/abanin_daniil_lab_2/lab2.py
+++ b/abanin_daniil_lab_2/lab2.py
@ -0,0 +1,81 @@
+from matplotlib import pyplot as plt
+from sklearn.linear_model import LinearRegression
+from RadomizedLasso import RandomizedLasso
+from sklearn.feature_selection import RFE
+from sklearn.preprocessing import MinMaxScaler
+import numpy as np
+
+# Feature labels "x1" .. "x14"; rank_to_dict pairs them with the scores in order.
+names = ["x%s" % i for i in range(1, 15)]
+
+
+def start_point():
+    """Fit the three models, rank the features with each and print the results.
+
+    The data comes from `generation_data`. Features are scored by linear
+    regression coefficients, by recursive feature elimination (RFE) and by
+    randomized Lasso coefficients; every score vector is normalised with
+    `rank_to_dict`, then the averages and the per-method rankings are printed.
+    """
+    X, Y = generation_data()
+    # Linear model
+    lr = LinearRegression()
+    lr.fit(X, Y)
+    # Recursive feature elimination
+    rfe = RFE(lr)
+    rfe.fit(X, Y)
+    # Randomized Lasso
+    randomized_lasso = RandomizedLasso(alpha=.01)
+    randomized_lasso.fit(X, Y)
+
+    # RFE's ranking_ assigns 1 to the selected (best) features and larger
+    # numbers to features that were eliminated earlier, i.e. smaller is
+    # better. Flip it so that, like the coefficient magnitudes, a larger
+    # value means a more important feature before it is min-max scaled.
+    rfe_importance = np.max(rfe.ranking_) - rfe.ranking_
+
+    ranks = {"Linear Regression": rank_to_dict(lr.coef_),
+             "Recursive Feature Elimination": rank_to_dict(rfe_importance),
+             "Randomize Lasso": rank_to_dict(randomized_lasso.coef_)}
+
+    get_estimation(ranks)
+    print_sorted_data(ranks)
+
+
+def generation_data():
+    """Generate the synthetic regression data set used by the lab.
+
+    The global NumPy random generator is re-seeded with 0 on every call, so
+    repeated calls return identical data.
+
+    Returns
+    -------
+    X : ndarray of shape (750, 14)
+        Columns 0-9 are drawn uniformly from [0, 1); columns 10-13 are
+        columns 0-3 plus Gaussian noise with standard deviation 0.025.
+    Y : ndarray of shape (750,)
+        Target built from columns 0-4 only, plus independent N(0, 1) noise
+        for every sample.
+    """
+    np.random.seed(0)
+    size = 750
+    X = np.random.uniform(0, 1, (size, 14))
+    # Draw one noise value per sample; without the size argument a single
+    # scalar would be added to every target as a constant offset.
+    noise = np.random.normal(0, 1, size)
+    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 +
+         10 * X[:, 3] + 5 * X[:, 4] ** 5 + noise)
+    # Make the last four features noisy copies of the first four.
+    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
+    return X, Y
+
+
+def rank_to_dict(ranks):
+    """Normalise raw feature scores to [0, 1] and label them with `names`.
+
+    Parameters
+    ----------
+    ranks : array-like
+        One raw score per feature (e.g. model coefficients); only the
+        absolute value is used.
+
+    Returns
+    -------
+    dict
+        Maps each feature name to its min-max scaled score, rounded to two
+        decimals. Pairing is positional and stops at the shorter of `names`
+        and `ranks`.
+    """
+    ranks = np.abs(ranks)
+    minmax = MinMaxScaler()
+    # reshape(-1, 1) turns the scores into the single column the scaler
+    # expects without hard-coding the number of features.
+    scaled = minmax.fit_transform(ranks.reshape(-1, 1)).ravel()
+    # Plain floats keep the printed output independent of NumPy's scalar repr.
+    return {name: round(float(score), 2) for name, score in zip(names, scaled)}
+
+
+def get_estimation(ranks: {}):
+    """Print the per-feature score averaged over all ranking methods.
+
+    `ranks` maps a method name to a dict of feature name -> score. The
+    averages (rounded to two decimals) are printed in descending order,
+    followed by the four highest-scoring features.
+    """
+    totals = {}
+    # Sum every feature's score across the ranking methods.
+    for scores in ranks.values():
+        for feature, score in scores.items():
+            totals[feature] = totals.get(feature, 0) + score
+
+    method_count = len(ranks)
+    for feature in totals:
+        totals[feature] = round(totals[feature] / method_count, 2)
+
+    ordered = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
+    print("Средние значения")
+    print(ordered)
+
+    print("4 самых важных признака по среднему значению")
+    for feature, score in ordered[:4]:
+        print('Параметр - {0}, значение - {1}'.format(feature, score))
+
+
+
+def print_sorted_data(ranks: dict):
+    """Print every method's feature scores, highest score first.
+
+    `ranks` maps a method name to a dict of feature name -> score. A blank
+    line is printed first, then for each method its name followed by the
+    list of (feature, score) pairs in descending score order.
+
+    The argument is left untouched: sorting is done on a temporary list, so
+    the caller can keep using the per-method dicts afterwards.
+    """
+    print()
+    for method, scores in ranks.items():
+        print(method)
+        print(sorted(scores.items(), key=lambda item: item[1], reverse=True))
+
+
+# Entry point: the whole experiment runs as soon as this module is executed
+# (note that importing the module triggers it as well).
+start_point()
--- a/abanin_daniil_lab_2/result.png
+++ b/abanin_daniil_lab_2/result.png