2023-12-05 22:59:00 +04:00
5 changed files with 110 additions and 0 deletions
--- a/kondrashin_mikhail_lab_2/README.md
+++ b/kondrashin_mikhail_lab_2/README.md
@ -0,0 +1,36 @@
+#### Кондрашин Михаил ПИбд-41
+
+## Лабораторная работа 2. Ранжирование признаков
+
+### Задание:
+
+* Линейная регрессия (LinearRegression)
+* Сокращение признаков случайными деревьями (Random Forest Regressor) 
+* Линейная корреляция (f_regression)
+
+### Запуск лабораторной работы:
+
+* установить `python` и библиотеки `numpy`, `matplotlib`, `scikit-learn` (импортируется как `sklearn`)
+* запустить проект (стартовая точка — файл `main.py`)
+
+### Используемые технологии:
+
+* Язык программирования `Python`,
+* Библиотеки `numpy`, `matplotlib`, `sklearn`
+* Среда разработки `IntelliJ IDEA` (В версии "Ultimate edition" можно писать на python)
+
+### Описание решения:
+
+Программа выполняет ранжирование признаков для регрессионной модели:
+* Линейная регрессия (LinearRegression)
+* Сокращение признаков Случайными деревьями (Random Forest Regressor)
+* Линейная корреляция (f_regression)
+
+* 14 признаков
+* 750 наблюдений
+
+### Результат:
+![Result](images/result.png)
+* Лучше всего показал себя метод линейной корреляции (x4, x14, x2, x12). Хотя признаки x1 и x3 не были выявлены, их влияние может быть учтено через скоррелированные параметры x12 и x14.
+* Самые важные признаки по среднему значению: x1, x4, x2, x11
+
--- a/kondrashin_mikhail_lab_2/data.py
+++ b/kondrashin_mikhail_lab_2/data.py
@ -0,0 +1,12 @@
+import numpy as np
+
+
+def generate_data(size=750, seed=0):
+    """Generate the synthetic regression dataset used for feature ranking.
+
+    Columns 0-4 of ``X`` drive the target through a fixed non-linear
+    formula, columns 5-9 are left as pure uniform noise, and columns 10-13
+    are overwritten with slightly perturbed copies of columns 0-3, which
+    makes them strongly correlated with those informative features.
+
+    Args:
+        size: Number of observations (rows) to generate. Defaults to 750.
+        seed: Value passed to ``np.random.seed``. Defaults to 0. Note that
+            this reseeds NumPy's *global* generator as a side effect.
+
+    Returns:
+        A tuple ``(X, Y)`` where ``X`` has shape ``(size, 14)`` and ``Y`` has
+        shape ``(size,)``.
+    """
+    # Seeding the global generator makes every call with the same arguments
+    # return identical data.
+    np.random.seed(seed)
+    X = np.random.uniform(0, 1, (size, 14))
+
+    # NOTE(review): np.random.normal(0, 1) is called without a size, so it
+    # draws a single scalar and the same offset is added to every target.
+    # Per-observation noise would need size=size; left unchanged here
+    # because that would alter the generated data - confirm the intent.
+    Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1])
+         + 20 * (X[:, 2] - .5) ** 2
+         + 10 * X[:, 3]
+         + 5 * X[:, 4] ** 5
+         + np.random.normal(0, 1))
+
+    # Replace the last four columns with noisy duplicates of the first four.
+    X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))
+
+    return X, Y
--- a/kondrashin_mikhail_lab_2/images/result.png
+++ b/kondrashin_mikhail_lab_2/images/result.png
--- a/kondrashin_mikhail_lab_2/main.py
+++ b/kondrashin_mikhail_lab_2/main.py
@ -0,0 +1,22 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.feature_selection import f_regression
+from sklearn.linear_model import LinearRegression
+
+from data import generate_data
+from ranks import mean_calc_and_sort, get_ranks
+
+
+if __name__ == '__main__':
+    # Synthetic dataset: feature matrix and target vector.
+    features, target = generate_data()
+
+    # Model 1: ordinary least squares; its coefficients are ranked later.
+    linear_model = LinearRegression().fit(features, target)
+
+    # Model 2: random forest; its feature importances are ranked later.
+    forest_model = RandomForestRegressor(bootstrap=True).fit(features, target)
+
+    # Model 3: univariate F-test scores; the p-values are not used.
+    f_scores, _p_values = f_regression(features, target, center=True)
+
+    # Normalise each method's scores, then print the per-feature average.
+    per_method_ranks = get_ranks(linear_model, forest_model, f_scores)
+    print("mean", mean_calc_and_sort(per_method_ranks))
--- a/kondrashin_mikhail_lab_2/ranks.py
+++ b/kondrashin_mikhail_lab_2/ranks.py
@ -0,0 +1,40 @@
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+
+
+def get_ranks(linear, rfr, f):
+    """Collect normalised feature scores from the three ranking methods.
+
+    Args:
+        linear: Fitted model exposing ``coef_`` (one coefficient per feature).
+        rfr: Fitted model exposing ``feature_importances_``.
+        f: Sequence of F-scores, one per feature.
+
+    Returns:
+        A dict with keys ``'Linear'``, ``'RFR'`` and ``'f_reg'``; each value
+        is the ``{feature_name: score}`` mapping built by ``rank_to_dict``.
+    """
+    # Derive the feature names (x1, x2, ...) from the number of scores
+    # instead of assuming exactly 14 features.
+    features = ["x%s" % i for i in range(1, len(f) + 1)]
+
+    return {
+        'Linear': rank_to_dict(linear.coef_, features),
+        'RFR': rank_to_dict(rfr.feature_importances_, features),
+        'f_reg': rank_to_dict(f, features),
+    }
+
+
+def rank_to_dict(ranks, names):
+    """Scale raw feature scores to [0, 1] and map them to feature names.
+
+    Args:
+        ranks: Array-like of raw scores, one per feature. Only magnitudes
+            are used, so negative coefficients rank by absolute value.
+        names: Feature names, in the same order as ``ranks``.
+
+    Returns:
+        A dict ``{name: score}`` where each score is min-max scaled and
+        rounded to two decimal places.
+    """
+    magnitudes = np.abs(ranks)
+
+    # MinMaxScaler works column-wise on 2-D input, so the scores are turned
+    # into a single column. -1 lets NumPy infer the row count, so any number
+    # of features is accepted rather than exactly 14.
+    column = np.asarray(magnitudes).reshape(-1, 1)
+    scaled = MinMaxScaler().fit_transform(column).ravel()
+
+    return dict(zip(names, (round(score, 2) for score in scaled)))
+
+
+def mean_calc_and_sort(ranks):
+    """Average each feature's score across all methods and sort the result.
+
+    Each method's score mapping is printed as it is processed.
+
+    Args:
+        ranks: Dict ``{method_name: {feature_name: score}}``.
+
+    Returns:
+        A dict ``{feature_name: mean_score}`` with means rounded to two
+        decimal places, ordered from the highest mean to the lowest. The
+        divisor is always the number of methods in ``ranks``.
+    """
+    totals = {}
+
+    for method, scores in ranks.items():
+        print(method, scores)
+        for feature, score in scores.items():
+            # Accumulate for EVERY method; previously the addition only ran
+            # the first time a feature was seen, so later methods were
+            # silently ignored.
+            totals[feature] = totals.get(feature, 0) + score
+
+    method_count = len(ranks)
+    means = {
+        feature: round(total / method_count, 2)
+        for feature, total in totals.items()
+    }
+
+    # Most important features first; ties keep their insertion order.
+    return dict(sorted(means.items(), key=lambda item: item[1], reverse=True))
+