IIS_2023_1/tsyppo_anton_lab_2/main.py
2023-12-06 15:02:51 +04:00

64 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
# Build a reproducible synthetic regression problem: random features in
# [0, 1) and a linear target with known weights plus Gaussian noise.
np.random.seed(42)
n_samples, n_features = 100, 10
X = np.random.rand(n_samples, n_features)
# Ground-truth weights; the 4th and 9th entries are zero, so those two
# features contribute nothing to the target.
true_coefficients = np.array([3, -1, 2, 0, 1, 2, -3, 4, 0, -2])
noise = np.random.normal(loc=0, scale=0.1, size=n_samples)
y = X.dot(true_coefficients) + noise
# Standardize the feature matrix with StandardScaler (fit, then transform
# the same data).
scaler = StandardScaler().fit(X)
X_normalized = scaler.transform(X)
# Gather the standardized features and the target into a single DataFrame;
# feature columns are named Feature_1 .. Feature_<n_features>.
columns = [f'Feature_{i}' for i in range(1, n_features + 1)]
df = pd.DataFrame(X_normalized, columns=columns).assign(Target=y)
# NOTE(review): neither random forest below is given a random_state, so the
# results presumably depend on the np.random.seed call made earlier in the
# script and on the order of the fits -- confirm before reordering.

# 1. Lasso: score each feature by the magnitude of its fitted coefficient.
lasso = Lasso(alpha=0.1).fit(X_normalized, y)
lasso_coefs = np.absolute(lasso.coef_)

# 2. Recursive feature elimination (RFE) wrapped around a random forest.
#    Asking for a single selected feature makes RFE assign a rank to every
#    feature (rank 1 is the feature that is kept).
rfe = RFE(
    estimator=RandomForestRegressor(),
    n_features_to_select=1,
).fit(X_normalized, y)
rfe_ranks = rfe.ranking_

# 3. Random forest regressor: use its built-in feature importances.
rf = RandomForestRegressor().fit(X_normalized, y)
rf_importances = rf.feature_importances_
# 4. Combined score across the three methods.
def _min_max_scale(values):
    """Rescale *values* linearly onto the range [0, 1].

    A constant input has zero range; it is mapped to all zeros instead of
    dividing by zero and producing NaN.
    """
    values = np.asarray(values, dtype=float)
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        return np.zeros_like(values)
    return (values - lowest) / span


# RFE ranks are "lower is better"; the reciprocal turns them into a
# "higher is better" score like the other two methods.
rfe_scores = 1 / rfe_ranks

# The raw scores are on different scales (coefficient magnitudes, reciprocal
# ranks, forest importances). Bring each onto [0, 1] before averaging so that
# every method contributes equally to the combined score.
average_scores = (
    _min_max_scale(lasso_coefs)
    + _min_max_scale(rfe_scores)
    + _min_max_scale(rf_importances)
) / 3
# Rescale the averaged scores so they also lie between 0 and 1.
scaled_average_scores = _min_max_scale(average_scores)

# Summary table for inspection. The per-method columns show the raw scores;
# note that the 'RFE Rank' column holds the reciprocal rank (1.0 = best),
# not the rank itself.
results_df = pd.DataFrame({
    'Lasso': np.round(lasso_coefs, 2),
    'RFE Rank': np.round(rfe_scores, 2),
    'Random Forest': np.round(rf_importances, 2),
    'Average': np.round(scaled_average_scores, 2),
}, index=columns)

# Print the score table.
print("Оценки признаков:")
print(results_df)

# Four features with the highest combined score. Select on the unrounded
# scores: rounding to two decimals first can create artificial ties and
# change which features make the cut.
top_features = pd.Series(scaled_average_scores, index=columns).nlargest(4).index
print("\nСамые важные признаки:")
print(top_features)