IIS_2023_1/verina_daria_lab_2/main.py
2023-11-22 22:52:44 +04:00

72 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
def rank_to_dict(ranks, names):
    """Map feature names to importance scores scaled into [0, 1].

    The absolute values of ``ranks`` are min-max scaled, rounded to two
    decimals and paired with ``names`` in order (``zip`` semantics: surplus
    entries in the longer argument are ignored).

    Args:
        ranks: Per-feature scores (coefficients, rankings, ...), any length.
        names: Feature names, in the same order as ``ranks``.

    Returns:
        dict mapping each feature name to its scaled score as a ``float``.
    """
    magnitudes = np.abs(ranks)
    # MinMaxScaler works on 2-D input, so the scores become a single column.
    # reshape(-1, 1) infers the row count instead of hard-coding 14 features.
    column = np.array(magnitudes).reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(column).ravel()
    # float(...) turns NumPy scalars into built-in floats for clean printing.
    return {name: float(round(score, 2)) for name, score in zip(names, scaled)}
# Synthetic regression problem: 750 samples, 14 uniform features.
np.random.seed(0)
size = 750
X = np.random.uniform(low=0, high=1, size=(size, 14))

# The target depends only on the first five columns, plus unit Gaussian noise.
interaction_term = 10 * np.sin(np.pi * X[:, 0] * X[:, 1])
quadratic_term = 20 * (X[:, 2] - .5) ** 2
linear_term = 10 * X[:, 3]
quintic_term = 5 * X[:, 4] ** 5
target_noise = np.random.normal(0, 1, size)
Y = interaction_term + quadratic_term + linear_term + quintic_term + target_noise

# The last four columns become noisy copies of the first four.
copy_noise = np.random.normal(0, .025, (size, 4))
X[:, 10:] = X[:, :4] + copy_noise
# Lasso with the penalty chosen by 5-fold cross-validation over 100 alphas
# evenly spaced in [0.001, 1].
alpha_grid = np.linspace(0.001, 1, 100)
lasso_cv = LassoCV(alphas=alpha_grid, cv=5).fit(X, Y)

# Recursive feature elimination around a 100-tree random forest, dropping one
# feature per step until a single feature is left.
rf = RandomForestRegressor(n_estimators=100)
rfe = RFE(estimator=rf, n_features_to_select=1, step=1).fit(X, Y)
# Feature names x1..xN, one per column of X.
names = ["x%s" % i for i in range(1, X.shape[1] + 1)]

# Simulated "randomized Lasso": refit LassoCV many times on shuffled targets
# and score each feature by the spread (standard deviation) of its coefficient
# across the refits.
# NOTE(review): np.random.permutation(Y) shuffles the targets relative to the
# rows of X, so each refit sees targets unrelated to its features. Confirm
# this is intended rather than resampling (X, Y) row pairs together.
n_resampling = 200
alphas = np.linspace(0.001, 1, 100)  # loop-invariant grid, built once
rlasso_coefs = np.zeros((X.shape[1], n_resampling))
for i in range(n_resampling):
    Y_permuted = np.random.permutation(Y)
    rlasso = LassoCV(alphas=alphas, cv=5)
    rlasso.fit(X, Y_permuted)
    rlasso_coefs[:, i] = rlasso.coef_
rlasso_scores = np.std(rlasso_coefs, axis=1)

# RFE.ranking_ gives 1 to the best feature and larger numbers to worse ones.
# Flip it so that, after scaling, the best feature scores 1 instead of 0,
# matching the "higher is better" direction of the other methods.
rfe_importance = rfe.ranking_.max() - rfe.ranking_

# Per-method feature scores, each scaled by rank_to_dict.
ranks = {
    "Lasso": rank_to_dict(lasso_cv.coef_, names),
    "RFE": rank_to_dict(rfe_importance, names),
    "RandomizedLassoSim": rank_to_dict(rlasso_scores, names),
}
# Sum the scores of every feature across all ranking methods.
score_totals = {}
for method_scores in ranks.values():
    for feature, score in method_scores.items():
        score_totals[feature] = score_totals.get(feature, 0) + score

# Print the per-method ranking table.
df_ranks = pd.DataFrame(ranks)
print("ПО КАЖДОМУ МЕТОДУ:")
print(df_ranks)

# Average each feature's total over the number of methods.
method_count = len(ranks)
averaged = {
    feature: round(total / method_count, 2)
    for feature, total in score_totals.items()
}

# Averages sorted from highest to lowest.
mean = sorted(averaged.items(), key=lambda pair: pair[1], reverse=True)
print("СРЕДНИЕ")
print(mean)