# IIS_2023_1/romanova_adelina_lab_3/main.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   RobustScaler)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier


def str_features_to_numeric(data):
    # Converts all string features to numeric ones.
    # Identify the categorical features
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics:
            continue
        categorical_columns.append(col)
    # Encode the categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))
    return data
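
# Example for str_features_to_numeric() (hypothetical values): a 'sex' column
# holding ['Male', 'Female', 'Male'] is encoded as [1, 0, 1], since
# LabelEncoder assigns integer codes in sorted order of the distinct values.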


def fe_creation(df):
    # Feature engineering (FE)
    df['age2'] = df['age'] // 10
    df['trestbps2'] = df['trestbps'] // 10
    df['chol2'] = df['chol'] // 60
    df['thalch2'] = df['thalch'] // 40
    df['oldpeak2'] = df['oldpeak'] // 0.4
    for i in ['sex', 'age2', 'fbs', 'restecg', 'exang']:
        for j in ['cp', 'trestbps2', 'chol2', 'thalch2', 'oldpeak2', 'slope']:
            df[i + "_" + j] = df[i].astype('str') + "_" + df[j].astype('str')
    return df
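
# Example for fe_creation() (hypothetical row): sex=1 and cp=3 produce the
# combined string feature sex_cp = "1_3"; the later str_features_to_numeric()
# pass turns these pair features back into integer codes.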


def acc_d(y_meas, y_pred):
    # Relative error between the predicted values y_pred and the measured values y_meas
    return mean_absolute_error(y_meas, y_pred) * len(y_meas) / sum(abs(y_meas))


def acc_rmse(y_meas, y_pred):
    # Root mean squared error between the predicted values y_pred and the measured values y_meas
    return (mean_squared_error(y_meas, y_pred)) ** 0.5
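
# In formula form, the two helpers above compute:
#   acc_d(y, p)    = sum(|y_i - p_i|) / sum(|y_i|)   (the len(y) factor cancels the mean inside MAE)
#   acc_rmse(y, p) = sqrt(mean((y_i - p_i)^2))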


def plot_cm(train_target, train_target_pred, valid_target, valid_target_pred, title):
    # Plot confusion matrices for the train and validation predictions
    def cm_calc(y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
        cm_sum = np.sum(cm, axis=1, keepdims=True)
        cm_perc = cm / cm_sum.astype(float) * 100
        annot = np.empty_like(cm).astype(str)
        nrows, ncols = cm.shape
        for i in range(nrows):
            for j in range(ncols):
                c = cm[i, j]
                p = cm_perc[i, j]
                if i == j:
                    s = cm_sum[i, 0]
                    annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
                elif c == 0:
                    annot[i, j] = ''
                else:
                    annot[i, j] = '%.1f%%\n%d' % (p, c)
        cm = pd.DataFrame(cm, index=np.unique(y_true), columns=np.unique(y_true))
        cm.index.name = 'Actual'
        cm.columns.name = 'Predicted'
        return cm, annot
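    # Annotation format from cm_calc: diagonal cells read "p%\nc/s" (row
    # percentage, correct count / row total), non-zero off-diagonal cells read
    # "p%\nc", and zero cells stay blank.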
    # Draw both confusion matrices side by side
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), sharex=True)
    # Training data
    ax = axes[0]
    ax.set_title("for training data")
    cm0, annot0 = cm_calc(train_target, train_target_pred)
    sns.heatmap(cm0, cmap="YlGnBu", annot=annot0, fmt='', ax=ax)
    # Test (validation) data
    ax = axes[1]
    ax.set_title("for test (validation) data")
    cm1, annot1 = cm_calc(valid_target, valid_target_pred)
    sns.heatmap(cm1, cmap="YlGnBu", annot=annot1, fmt='', ax=ax)
    fig.suptitle(f'CONFUSION MATRICES for {title}')
    plt.savefig(f'CONFUSION MATRICES for {title}.png')
    plt.show()


def acc_metrics_calc(num, acc_all, model, train, valid, train_target, valid_target, title):
    # Model selection stage:
    # compute the model's accuracy according to several metrics
    ytrain = model.predict(train).astype(int)
    yvalid = model.predict(valid).astype(int)
    print('train_target = ', train_target[:5].values)
    print('ytrain = ', ytrain[:5])
    print('valid_target =', valid_target[:5].values)
    print('yvalid =', yvalid[:5])
    num_acc = 0
    for x in metrics_now:
        if x == 1:
            # accuracy score
            acc_train = round(metrics.accuracy_score(train_target, ytrain), 2)
            acc_valid = round(metrics.accuracy_score(valid_target, yvalid), 2)
        elif x == 2:
            # RMSE
            acc_train = round(acc_rmse(train_target, ytrain), 2)
            acc_valid = round(acc_rmse(valid_target, yvalid), 2)
        elif x == 3:
            # relative error
            acc_train = round(acc_d(train_target, ytrain) * 100, 2)
            acc_valid = round(acc_d(valid_target, yvalid) * 100, 2)
        print('acc of', metrics_all[x], 'for train =', acc_train)
        print('acc of', metrics_all[x], 'for valid =', acc_valid)
        acc_all[num_acc].append(acc_train)      # train
        acc_all[num_acc + 1].append(acc_valid)  # valid
        num_acc += 2
    # Plot the confusion matrices
    plot_cm(train_target, ytrain, valid_target, yvalid, title)
    return acc_all
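
# Row layout of acc_all after acc_metrics_calc(): rows alternate train/valid
# per metric in metrics_now order, i.e. [acc_train, acc_valid, rmse_train,
# rmse_valid, re_train, re_valid].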


def plot_feature_importance(feature_importances, feature_names, model_name):
    # Color palette for the bars
    colors = sns.color_palette('viridis', len(feature_importances))
    # Sort feature indices by importance, descending
    indices = feature_importances.argsort()[::-1]
    # Styled bar plot
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x=feature_importances[indices], y=feature_names[indices], palette=colors)
    # Decorations
    plt.xlabel('Feature importance', fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.title(f'Feature importances of the {model_name} model', fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    # Color bar with a caption
    cbar = plt.colorbar(plt.cm.ScalarMappable(cmap='viridis'), ax=ax)
    cbar.set_label('Importance level', rotation=270, labelpad=15, fontsize=12)
    # Grid for readability
    plt.grid(axis='x', linestyle='--', alpha=0.6)
    # Save the plot to a file and show it
    plt.savefig(f'feature_importances_{model_name}.png', bbox_inches='tight')
    plt.show()


def plot_learning_curve(estimator, title, X, y, cv=None, axes=None, ylim=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), random_state=0):
    if axes is None:
        _, axes = plt.subplots(2, 1, figsize=(20, 10))
    if cv is None:
        # Fall back to a shuffle-split CV built from the global settings
        cv = ShuffleSplit(n_splits=cv_n_split, test_size=test_train_split_part, random_state=random_state)
    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator=estimator, X=X, y=y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)
    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")
    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")
    plt.savefig(f'{title}.png')
    plt.show()
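
# Note: learning_curve() is called with return_times=True, so it also reports
# fit times; the second subplot uses them to show how training cost scales
# with the number of samples.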


if __name__ == "__main__":
    # Load the data
    # Transform and preprocess it
    # Train the Decision Tree Classifier and Random Forest Classifier models
    # Compute the metrics and draw the plots
    cv_n_split = 5
    random_state = 42
    test_train_split_part = 0.25
    metrics_all = {1: 'acc', 2: 'rmse', 3: 're'}
    metrics_now = [1, 2, 3]

    data = pd.read_csv("../heart_disease_uci.csv")
    data['target'] = data['num']
    data = data.drop(columns=['id', 'dataset', 'ca', 'thal', 'num'])
    data = data[(data['chol'] <= 420) & (data['oldpeak'] >= 0) & (data['oldpeak'] <= 4)].reset_index(drop=True)
    data = data.dropna().reset_index(drop=True)
    data.info()

    data = str_features_to_numeric(data)
    data = data[data['target'].isin([0, 1])]  # reduce the target column to binary values
    data = fe_creation(data)
    data = str_features_to_numeric(data)
    dataset = data.copy()  # original data
    target_name = 'target'
    target = data.pop(target_name)

    # Feature standardization
    # The standard score of a sample x is calculated as:
    # z = (x - mean) / (standard deviation)
    scaler = StandardScaler()
    data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
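    # Example: after fitting, a value equal to a column's mean maps to 0.0,
    # and a value one standard deviation above the mean maps to 1.0.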
    train, valid, train_target, valid_target = train_test_split(data, target, test_size=test_train_split_part, random_state=random_state)

    # Accuracy lists for all models: len(metrics_now) * 2 rows (train & valid datasets)
    num_models = 6
    acc_train = []
    acc_valid = []
    acc_all = np.empty((len(metrics_now) * 2, 0)).tolist()
    acc_all_pred = np.empty((len(metrics_now), 0)).tolist()
    cv_train = ShuffleSplit(n_splits=cv_n_split, test_size=test_train_split_part, random_state=random_state)
    decision_tree = DecisionTreeClassifier()
    param_grid = {'min_samples_leaf': list(range(2, 12))}
    decision_tree_CV = GridSearchCV(decision_tree, param_grid=param_grid, cv=cv_train, verbose=False)
    decision_tree_CV.fit(train, train_target)
    print(decision_tree_CV.best_params_)
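    # Note: GridSearchCV refits the best parameter set on the full training
    # data by default (refit=True), so the predictions below come from the
    # tuned decision_tree_CV.best_estimator_.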
    acc_all = acc_metrics_calc(0, acc_all, decision_tree_CV, train, valid, train_target, valid_target, title="Decision Tree Classifier")
    plot_learning_curve(decision_tree_CV, "Decision Tree", train, train_target, cv=cv_train)
    feature_importances_dt = decision_tree_CV.best_estimator_.feature_importances_
    plot_feature_importance(feature_importances_dt, data.columns, "Decision Tree")