IIS_2023_1/antonov_dmitry_lab_2/lab2.py
DmitriyAntonov 2065c480df реади
2023-10-08 14:01:37 +04:00

106 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
# Load the dataset from the working directory.
data = pd.read_csv('dataset.csv')

# The 'Target' column is the regression target; every other column is a feature.
y = data['Target']
X = data.drop(columns=['Target'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit the two models whose fitted attributes are read afterwards as
# per-feature importance scores.

# Linear regression (its coefficients are used as scores).
lr = LinearRegression()
lr.fit(X_train, y_train)

# Random forest (its feature_importances_ are used as scores).
# Seeded with the same value as the train/test split: an unseeded forest
# yields different importances, and therefore a different ranking, on every run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
def _print_scores(title, names, scores):
    """Print ``title``, then one ``name: score`` line per feature (rounded to 4 places)."""
    print(title)
    for name, score in zip(names, scores):
        print(f"{name}: {round(score, 4)}")


# Score every feature with each model/method.

# Linear regression: absolute value of each fitted coefficient.
# NOTE(review): no feature scaling is applied before the fit in this script, so
# coefficient magnitudes depend on each column's units — confirm this is intended.
lr_scores = np.abs(lr.coef_)

# Random forest: the fitted model's feature importances.
rf_scores = rf.feature_importances_

# Column names of X; paired positionally with each score array below.
feature_names = X.columns.tolist()

# Show the scores produced by the linear regression model.
_print_scores("оценки линейной регрессии:", feature_names, lr_scores)

# Show the scores produced by the random forest.
_print_scores("\nоценки Random Forest:", feature_names, rf_scores)

# f_regression scores (p-values are returned alongside but only the scores are printed).
# NOTE(review): this uses the full X / y, whereas lr and rf were fit on the
# training split only — confirm that mixing the two is intended.
f_scores, p_values = f_regression(X, y)

# Show the f_regression scores.
_print_scores("\nоценки f_regression:", feature_names, f_scores)
# Rescale each method's scores with MinMaxScaler so the three score sets can be
# averaged together. The scaler is re-fit for every score array in turn, and
# each 1-D array is reshaped to a single column because the scaler works on
# 2-D input.
scaler = MinMaxScaler()
lr_scores_scaled, rf_scores_scaled, f_scores_scaled = (
    scaler.fit_transform(raw_scores.reshape(-1, 1)).flatten()
    for raw_scores in (lr_scores, rf_scores, f_scores)
)
# Average the three scaled scores of every feature, keyed by feature name.
# The arrays are walked in lockstep with feature_names instead of calling
# feature_names.index() three times per feature (a linear search each time).
average_scores = {
    feature: (lr_score + rf_score + f_score) / 3
    for feature, lr_score, rf_score, f_score in zip(
        feature_names, lr_scores_scaled, rf_scores_scaled, f_scores_scaled
    )
}

# (feature, average score) pairs ordered from highest to lowest average.
sorted_features = sorted(average_scores.items(), key=lambda x: x[1], reverse=True)

# Keep the four features with the highest average score.
top_4_features = sorted_features[:4]

# Print those four features with their average score.
print("\n4 самых важных признака в среднем:")
for feature, score in top_4_features:
    print(f"Признак: {feature}, Оценка: {round(score, 4)}")
def _top_k(names, scores, k=4):
    """Return ``(indices, names, scores)`` for the ``k`` largest scores, largest first."""
    # argsort is ascending: take the last k positions, then reverse them.
    indices = np.argsort(scores)[-k:][::-1]
    return indices, [names[i] for i in indices], [scores[i] for i in indices]


def _print_top(title, names, scores):
    """Print ``title``, the list of names, then each score rounded to 4 places on its own line."""
    print(title)
    print(names)
    for value in scores:
        print(round(value, 4))


# Four highest-scoring features for each method, using the raw (unscaled) scores.
top_lr_indices, top_lr_features, top_lr_features_score = _top_k(feature_names, lr_scores)
top_rf_indices, top_rf_features, top_rf_features_score = _top_k(feature_names, rf_scores)
top_f_indices, top_f_features, top_f_features_score = _top_k(feature_names, f_scores)

_print_top("\n4 самых важных для lr_scores:", top_lr_features, top_lr_features_score)
_print_top("\n4 самых важных для rf_scores:", top_rf_features, top_rf_features_score)
_print_top("\n4 самых важных для f_scores:", top_f_features, top_f_features_score)