
Lab 4

Pima Indians diabetes dataset

In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config

set_config(transform_output="pandas")
df = pd.read_csv(".//scv//diabetes.csv")
print(df.columns)
df
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
Out[47]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
... ... ... ... ... ... ... ... ... ...
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0

768 rows × 9 columns

Forming the samples

In [48]:
from typing import Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   
    # Compare with a tolerance: exact float equality is fragile for fractions like 0.6 and 0.15
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )
    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))
    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.
    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp
    # Split the temp dataframe into val and test dataframes.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="Outcome", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9
)

display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
196 1 105 58 0 0 24.3 0.187 21 0
69 4 146 85 27 100 28.9 0.189 27 0
494 3 80 0 0 0 0.0 0.174 22 0
463 5 88 78 30 0 27.6 0.258 37 0
653 2 120 54 0 0 26.8 0.455 27 0
... ... ... ... ... ... ... ... ... ...
322 0 124 70 20 0 27.4 0.254 36 1
109 0 95 85 25 36 37.4 0.247 24 1
27 1 97 66 15 140 23.2 0.487 22 0
651 1 117 60 23 106 33.8 0.466 27 0
197 3 107 62 13 48 22.9 0.678 23 1

614 rows × 9 columns

'y_train'
Outcome
196 0
69 0
494 0
463 0
653 0
... ...
322 1
109 1
27 0
651 0
197 1

614 rows × 1 columns

'X_test'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
669 9 154 78 30 100 30.9 0.164 45 0
379 0 93 100 39 72 43.4 1.021 35 0
640 0 102 86 17 105 29.3 0.695 27 0
658 11 127 106 0 0 39.0 0.190 51 0
304 3 150 76 0 0 21.0 0.207 37 0
... ... ... ... ... ... ... ... ... ...
203 2 99 70 16 44 20.4 0.235 27 0
605 1 124 60 32 0 35.8 0.514 21 0
561 0 198 66 32 274 41.3 0.502 28 1
280 0 146 70 0 0 37.9 0.334 28 1
103 1 81 72 18 40 26.6 0.283 24 0

154 rows × 9 columns

'y_test'
Outcome
669 0
379 0
640 0
658 0
304 0
... ...
203 0
605 0
561 1
280 1
103 0

154 rows × 1 columns
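
Since the split is stratified by Outcome, the class ratio should be nearly identical in the training and test samples. A minimal sanity check over the frames above:

print(y_train["Outcome"].value_counts(normalize=True))
print(y_test["Outcome"].value_counts(normalize=True))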

Data classification

In [49]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler



columns_to_drop = ["Glucose", "Age", "BloodPressure", "Outcome", "DiabetesPedigreeFunction"]
num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object"
]
cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype == "object"
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough"
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)


pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)

Checking the pipeline

In [50]:
preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[50]:
Pregnancies SkinThickness Insulin BMI
196 -0.838489 -1.297466 -0.688684 -0.946400
69 0.072181 0.395520 0.180416 -0.377190
494 -0.231376 -1.297466 -0.688684 -3.953317
463 0.375738 0.583630 -0.688684 -0.538054
653 -0.534932 -1.297466 -0.688684 -0.637047
... ... ... ... ...
322 -1.142046 -0.043402 -0.688684 -0.562802
109 -1.142046 0.270114 -0.375808 0.674613
27 -0.838489 -0.356918 0.528056 -1.082516
651 -0.838489 0.144708 0.232562 0.229143
197 -0.231376 -0.482325 -0.271516 -1.119638

614 rows × 4 columns

Building the set of classification models

In [51]:
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    # "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=9)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=9
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=9,
        )
    },
}
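
Several of the models above pass class_weight="balanced" because the classes are imbalanced (roughly 65% "Healthy" vs. 35% "Sick"). scikit-learn then weights class $c$ as

$$w_c = \frac{n_{\text{samples}}}{n_{\text{classes}} \cdot n_c},$$

so errors on the minority class cost proportionally more during training.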

Training the models on the training set and evaluating them on the test set

In [52]:
from sklearn import metrics

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict)
    class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict)
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )
Model: logistic
Model: ridge
Model: decision_tree
Model: knn
Model: naive_bayes
Model: gradient_boosting
Model: random_forest
Model: mlp

Summary table of quality metrics for the classification models used

Confusion matrices

In [53]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Healthy", "Sick"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
[Figure: confusion matrices for the eight classification models]

Precision, recall, accuracy, F-measure
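
For reference, with TP, FP, TN, and FN denoting the cells of the confusion matrix, the metrics in the table below are

$$\text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Recall} = \frac{TP}{TP + FN},$$

$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}, \qquad F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}.$$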

In [54]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[54]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
naive_bayes 0.564516 0.628571 0.327103 0.407407 0.677524 0.707792 0.414201 0.494382
ridge 0.494382 0.552632 0.616822 0.777778 0.646580 0.701299 0.548857 0.646154
knn 0.670807 0.551020 0.504673 0.500000 0.741042 0.681818 0.576000 0.524272
random_forest 0.955157 0.535714 0.995327 0.555556 0.982085 0.675325 0.974828 0.545455
gradient_boosting 0.920213 0.525000 0.808411 0.388889 0.908795 0.662338 0.860697 0.446809
logistic 0.618644 0.525000 0.341121 0.388889 0.697068 0.662338 0.439759 0.446809
decision_tree 0.718615 0.459016 0.775701 0.518519 0.815961 0.616883 0.746067 0.486957
mlp 0.409195 0.417391 0.831776 0.888889 0.522801 0.525974 0.548536 0.568047

ROC curve, Cohen's kappa, Matthews correlation coefficient
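
Cohen's kappa and the Matthews correlation coefficient both correct raw accuracy for chance agreement:

$$\kappa = \frac{p_o - p_e}{1 - p_e}, \qquad MCC = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}},$$

where $p_o$ is the observed accuracy and $p_e$ is the agreement expected by chance.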

In [55]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[55]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
ridge 0.701299 0.646154 0.767037 0.400271 0.417827
logistic 0.662338 0.446809 0.766296 0.211501 0.216434
naive_bayes 0.707792 0.494382 0.753704 0.301834 0.315869
knn 0.681818 0.524272 0.745556 0.286093 0.286855
mlp 0.525974 0.568047 0.729074 0.173747 0.240181
random_forest 0.675325 0.545455 0.715093 0.293059 0.293176
gradient_boosting 0.662338 0.446809 0.711296 0.211501 0.216434
decision_tree 0.616883 0.486957 0.612870 0.183061 0.183927
In [56]:
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)
'ridge'

Displaying rows with prediction errors for review

In [57]:
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["Outcome"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
'Error items count: 46'
Out[57]:
Pregnancies Predicted Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
30 5 1 109 75 26 0 36.0 0.546 60 0
82 7 1 83 78 26 71 29.3 0.767 36 0
86 13 1 106 72 54 0 36.6 0.178 45 0
91 4 1 123 80 15 176 32.0 0.443 34 0
95 6 1 144 72 27 228 33.9 0.255 40 0
176 6 1 85 78 0 0 31.2 0.382 42 0
201 1 1 138 82 0 0 40.1 0.236 28 0
204 6 1 103 72 32 190 37.7 0.324 55 0
223 7 1 142 60 33 190 28.8 0.687 61 0
228 4 1 197 70 39 744 36.7 2.329 31 0
233 4 1 122 68 0 0 35.0 0.394 29 0
266 0 0 138 0 0 0 36.3 0.933 25 1
274 13 1 106 70 0 0 34.2 0.251 52 0
280 0 0 146 70 0 0 37.9 0.334 28 1
282 7 1 133 88 15 155 32.4 0.262 37 0
302 5 1 77 82 41 42 35.8 0.156 35 0
309 2 0 124 68 28 205 32.9 0.875 30 1
335 0 1 165 76 43 255 47.9 0.259 26 0
358 12 1 88 74 40 54 35.3 0.378 48 0
364 4 1 147 74 25 293 34.9 0.385 30 0
379 0 1 93 100 39 72 43.4 1.021 35 0
397 0 0 131 66 40 0 34.3 0.196 22 1
405 2 1 123 48 32 165 42.1 0.520 26 0
406 4 0 115 72 0 0 28.9 0.376 46 1
442 4 1 117 64 27 120 33.2 0.230 24 0
486 1 1 139 62 41 480 40.7 0.536 21 0
515 3 0 163 70 18 105 31.6 0.268 28 1
517 7 1 125 86 0 0 37.6 0.304 51 0
583 8 1 100 76 0 0 38.7 0.190 42 0
594 6 1 123 72 45 230 33.6 0.733 34 0
622 6 1 183 94 0 0 40.8 1.461 45 0
630 7 0 114 64 0 0 27.4 0.732 34 1
634 10 1 92 62 0 0 25.9 0.167 31 0
646 1 0 167 74 17 144 23.4 0.447 33 1
658 11 1 127 106 0 0 39.0 0.190 51 0
669 9 1 154 78 30 100 30.9 0.164 45 0
674 8 1 91 82 0 0 35.6 0.587 68 0
676 9 0 156 86 0 0 24.8 0.230 53 1
682 0 1 95 64 39 105 44.6 0.366 22 0
699 4 1 118 70 0 0 44.5 0.904 26 0
702 1 0 168 88 29 0 35.0 0.905 52 1
723 5 1 117 86 30 105 39.1 0.251 42 0
725 4 1 112 78 40 0 39.4 0.236 38 0
730 3 0 130 78 23 79 28.4 0.323 34 1
744 13 1 153 88 37 140 40.6 1.174 39 0
750 4 0 136 70 0 0 31.2 1.182 22 1

Example of using the trained model (pipeline) for prediction

In [58]:
model = class_models[best_model]["pipeline"]

example_id = 450
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
450 1.0 82.0 64.0 13.0 95.0 21.2 0.415 23.0 0.0
Pregnancies SkinThickness Insulin BMI
450 -0.838489 -0.482325 0.136961 -1.329999
'predicted: 0 (proba: [0.81353825 0.18646175])'
'real: 0'

Hyperparameter tuning using grid search

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn import metrics
import pandas as pd


# Select the numeric features
# NOTE: X_train still contains the target column "Outcome", so it is scaled and
# fed back in as a feature (target leakage); this is why the "New" model below
# scores perfectly on the test sample.
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Set the random_state
random_state = 9

# Define the transformer
pipeline_end = ColumnTransformer([
    ('numeric', StandardScaler(), numeric_features),
    # Add other transformers here if needed
])

# Declare the model
optimized_model = RandomForestClassifier(
    random_state=random_state,
    criterion="gini",
    max_depth=5,
    max_features="sqrt",
    n_estimators=10,
)

# Container for the pipeline and its metrics
result = {}

# Train the model
result["pipeline"] = Pipeline([
    ("pipeline", pipeline_end),
    ("model", optimized_model)
]).fit(X_train, y_train.values.ravel())

# Predict and compute metrics
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

# Metrics for model evaluation
result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"])
result["Precision_test"] = metrics.precision_score(y_test, result["preds"])
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"])
result["Recall_test"] = metrics.recall_score(y_test, result["preds"])
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"])
result["F1_test"] = metrics.f1_score(y_test, result["preds"])
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])

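The hyperparameters above are hard-coded. The section heading mentions grid search, and a search over the same pipeline could produce them automatically; a minimal sketch, with an illustrative parameter grid mirroring the one used in the second task:

from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__n_estimators": [10, 50, 100],
    "model__max_depth": [5, 7, 10],
    "model__max_features": ["sqrt", "log2"],
    "model__criterion": ["gini", "entropy"],
}
search = GridSearchCV(result["pipeline"], param_grid, n_jobs=-1)
search.fit(X_train, y_train.values.ravel())
print(search.best_params_)
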
Preparing data to compare the old and new versions of the model

In [60]:
optimized_model_type = "random_forest"
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

Evaluating the old and new models

In [61]:
optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[61]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
Name                
Old 0.955157 0.535714 0.995327 0.555556 0.982085 0.675325 0.974828 0.545455
New 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [62]:
optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[62]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
Name          
Old 0.675325 0.545455 0.715093 0.293059 0.293176
New 1.000000 1.000000 1.000000 1.000000 1.000000
In [63]:
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)

for index in range(0, len(optimized_metrics)):
    c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Healthy", "Sick"]
    ).plot(ax=ax.flat[index])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()
[Figure: confusion matrices for the old and new models]

In the left matrix (the old model), the value 74 is the number of correctly classified "Healthy" objects (true negatives); only 30 of the 54 "Sick" objects in the test sample are recognized, with 24 false negatives and 26 false positives.

In the right matrix (the new model), all 100 "Healthy" and all 54 "Sick" objects are classified without a single error. Rather than better hyperparameters, this reflects the target leakage noted above: "Outcome" is among the scaled features, so the model reads the answer directly from its input.
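
The same counts can be read programmatically: scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]], so a minimal sketch over the optimized_metrics table built above is

for name, row in optimized_metrics.iterrows():
    tn, fp, fn, tp = row["Confusion_matrix"].ravel()
    print(f"{name}: TN={tn}, FP={fp}, FN={fn}, TP={tp}")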

Determining the achievable level of model quality for the second task

Data preparation

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config


random_state = 9
set_config(transform_output="pandas")
df = pd.read_csv(".//scv//diabetes.csv")
print(df.describe())
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  0.078000   21.000000    0.000000  
25%     27.300000                  0.243750   24.000000    0.000000  
50%     32.000000                  0.372500   29.000000    0.000000  
75%     36.600000                  0.626250   41.000000    1.000000  
max     67.100000                  2.420000   81.000000    1.000000  

Forming the samples

In [65]:
from typing import Optional, Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split


def split_stratified_into_train_val_test(
    df_input: DataFrame,
    stratify_colname: str = "y",
    frac_train: float = 0.6,
    frac_val: float = 0.15,
    frac_test: float = 0.25,
    random_state: Optional[int] = None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
   

    if not (0 < frac_train < 1) or not (0 <= frac_val <= 1) or not (0 <= frac_test <= 1):
        raise ValueError("Fractions must be between 0 and 1 and the sum must equal 1.")
    
    # Compare with a tolerance instead of exact float equality
    if abs(frac_train + frac_val + frac_test - 1.0) > 1e-9:
        raise ValueError("fractions %f, %f, %f do not add up to 1.0" %
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError(f"{stratify_colname} is not a column in the DataFrame.")

    X = df_input
    y = df_input[[stratify_colname]]

 
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    if frac_val == 0:
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    relative_frac_test = frac_test / (frac_val + frac_test)

    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    
    return df_train, df_val, df_test, y_train, y_val, y_test


X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    df, stratify_colname="Outcome", frac_train=0.80, frac_val=0.0, frac_test=0.20, random_state=random_state
)

display("X_train", X_train)
display("y_train", y_train)
display("X_test", X_test)
display("y_test", y_test)
'X_train'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
196 1 105 58 0 0 24.3 0.187 21 0
69 4 146 85 27 100 28.9 0.189 27 0
494 3 80 0 0 0 0.0 0.174 22 0
463 5 88 78 30 0 27.6 0.258 37 0
653 2 120 54 0 0 26.8 0.455 27 0
... ... ... ... ... ... ... ... ... ...
322 0 124 70 20 0 27.4 0.254 36 1
109 0 95 85 25 36 37.4 0.247 24 1
27 1 97 66 15 140 23.2 0.487 22 0
651 1 117 60 23 106 33.8 0.466 27 0
197 3 107 62 13 48 22.9 0.678 23 1

614 rows × 9 columns

'y_train'
Outcome
196 0
69 0
494 0
463 0
653 0
... ...
322 1
109 1
27 0
651 0
197 1

614 rows × 1 columns

'X_test'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
669 9 154 78 30 100 30.9 0.164 45 0
379 0 93 100 39 72 43.4 1.021 35 0
640 0 102 86 17 105 29.3 0.695 27 0
658 11 127 106 0 0 39.0 0.190 51 0
304 3 150 76 0 0 21.0 0.207 37 0
... ... ... ... ... ... ... ... ... ...
203 2 99 70 16 44 20.4 0.235 27 0
605 1 124 60 32 0 35.8 0.514 21 0
561 0 198 66 32 274 41.3 0.502 28 1
280 0 146 70 0 0 37.9 0.334 28 1
103 1 81 72 18 40 26.6 0.283 24 0

154 rows × 9 columns

'y_test'
Outcome
669 0
379 0
640 0
658 0
304 0
... ...
203 0
605 0
561 1
280 1
103 0

154 rows × 1 columns

Building the pipeline for data classification

In [66]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class DiabetFeatures(BaseEstimator, TransformerMixin):
    """Placeholder for custom feature engineering (currently unused)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # No derived features yet; pass the data through unchanged.
        return X

columns_to_drop = ["Pregnancies", "SkinThickness", "Insulin", "BMI"]
num_columns = ["Glucose", "Age", "BloodPressure", "Outcome", "DiabetesPedigreeFunction"]
cat_columns = []

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first")
preprocessing_cat = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_encoder),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_num", preprocessing_num, num_columns),
        ("prepocessing_cat", preprocessing_cat, cat_columns),
    ],
    remainder="passthrough"
)


drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)


pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)

Demonstrating the pipeline

In [67]:
preprocessing_result = pipeline_end.fit_transform(X_train)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

preprocessed_df
Out[67]:
Glucose Age BloodPressure Outcome DiabetesPedigreeFunction
196 -0.478144 -1.029257 -0.554050 -0.731437 -0.849205
69 0.818506 -0.522334 0.804885 -0.731437 -0.843172
494 -1.268784 -0.944770 -3.473244 -0.731437 -0.888421
463 -1.015779 0.322537 0.452568 -0.731437 -0.635028
653 -0.003760 -0.522334 -0.755374 -0.731437 -0.040763
... ... ... ... ... ...
322 0.122742 0.238050 0.049921 1.367172 -0.647095
109 -0.794400 -0.775796 0.804885 1.367172 -0.668211
27 -0.731149 -0.944770 -0.151403 -0.731437 0.055767
651 -0.098637 -0.522334 -0.453388 -0.731437 -0.007581
197 -0.414893 -0.860283 -0.352726 1.367172 0.631933

614 rows × 5 columns

Building the set of classification models

In [68]:
from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree

class_models = {
    "logistic": {"model": linear_model.LogisticRegression()},
    "ridge": {"model": linear_model.RidgeClassifierCV(cv=5, class_weight="balanced")},
    "ridge": {"model": linear_model.LogisticRegression(penalty="l2", class_weight="balanced")},
    "decision_tree": {
        "model": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsClassifier(n_neighbors=7)},
    "naive_bayes": {"model": naive_bayes.GaussianNB()},
    "gradient_boosting": {
        "model": ensemble.GradientBoostingClassifier(n_estimators=210)
    },
    "random_forest": {
        "model": ensemble.RandomForestClassifier(
            max_depth=11, class_weight="balanced", random_state=random_state
        )
    },
    "mlp": {
        "model": neural_network.MLPClassifier(
            hidden_layer_sizes=(7,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}

Training the models on the training set and evaluating them on the test set

In [69]:
import numpy as np
from sklearn import metrics

for model_name in class_models.keys():
    print(f"Model: {model_name}")
    model = class_models[model_name]["model"]

    model_pipeline = Pipeline([("pipeline", pipeline_end), ("model", model)])
    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())

    y_train_predict = model_pipeline.predict(X_train)
    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]
    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)

    class_models[model_name]["pipeline"] = model_pipeline
    class_models[model_name]["probs"] = y_test_probs
    class_models[model_name]["preds"] = y_test_predict

    class_models[model_name]["Precision_train"] = metrics.precision_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Precision_test"] = metrics.precision_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Recall_train"] = metrics.recall_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Recall_test"] = metrics.recall_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Accuracy_train"] = metrics.accuracy_score(
        y_train, y_train_predict
    )
    class_models[model_name]["Accuracy_test"] = metrics.accuracy_score(
        y_test, y_test_predict
    )
    class_models[model_name]["ROC_AUC_test"] = metrics.roc_auc_score(
        y_test, y_test_probs
    )
    class_models[model_name]["F1_train"] = metrics.f1_score(y_train, y_train_predict)
    class_models[model_name]["F1_test"] = metrics.f1_score(y_test, y_test_predict)
    class_models[model_name]["MCC_test"] = metrics.matthews_corrcoef(
        y_test, y_test_predict
    )
    class_models[model_name]["Cohen_kappa_test"] = metrics.cohen_kappa_score(
        y_test, y_test_predict
    )
    class_models[model_name]["Confusion_matrix"] = metrics.confusion_matrix(
        y_test, y_test_predict
    )
Model: logistic
Model: ridge
Model: decision_tree
Model: knn
Model: naive_bayes
Model: gradient_boosting
Model: random_forest
Model: mlp

Summary table of quality metrics for the classification models used

Confusion matrices

In [70]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)
for index, key in enumerate(class_models.keys()):
    c_matrix = class_models[key]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Healthy", "Sick"]
    ).plot(ax=ax.flat[index])
    disp.ax_.set_title(key)

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)
plt.show()
[Figure: confusion matrices for the eight classification models]
In [71]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
]
class_metrics.sort_values(
    by="Accuracy_test", ascending=False
).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[71]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
logistic 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
ridge 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
decision_tree 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
naive_bayes 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
random_forest 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
gradient_boosting 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
knn 1.000000 0.981818 0.990654 1.000000 0.996743 0.993506 0.995305 0.990826
mlp 0.729323 0.685714 0.453271 0.444444 0.750814 0.733766 0.559078 0.539326

Almost all models (logistic regression, the "ridge" stand-in, decision tree, naive Bayes, gradient boosting, and random forest) show perfect scores (1.000000) on both the training and the test samples, with KNN close behind. This is not ordinary overfitting: the preprocessing keeps the target column "Outcome" among the numeric features, so the models simply read the answer from their input (target leakage).
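
A quick guard against such leaks is to assert that the target never survives preprocessing. A minimal check against the fitted pipeline_end above (here the assertion fails, which is exactly the point):

feature_names = pipeline_end.get_feature_names_out()
assert "Outcome" not in feature_names, f"target leaked into features: {list(feature_names)}"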

ROC curve, Cohen's kappa, Matthews correlation coefficient

In [72]:
class_metrics = pd.DataFrame.from_dict(class_models, "index")[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
]
class_metrics.sort_values(by="ROC_AUC_test", ascending=False).style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[72]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
logistic 1.000000 1.000000 1.000000 1.000000 1.000000
ridge 1.000000 1.000000 1.000000 1.000000 1.000000
decision_tree 1.000000 1.000000 1.000000 1.000000 1.000000
knn 0.993506 0.990826 1.000000 0.985801 0.985901
naive_bayes 1.000000 1.000000 1.000000 1.000000 1.000000
gradient_boosting 1.000000 1.000000 1.000000 1.000000 1.000000
random_forest 1.000000 1.000000 1.000000 1.000000 1.000000
mlp 0.733766 0.539326 0.653148 0.363893 0.380814
In [73]:
best_model = str(class_metrics.sort_values(by="MCC_test", ascending=False).iloc[0].name)

display(best_model)
'logistic'

Displaying rows with prediction errors for review

In [74]:
preprocessing_result = pipeline_end.transform(X_test)
preprocessed_df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

y_pred = class_models[best_model]["preds"]

error_index = y_test[y_test["Outcome"] != y_pred].index.tolist()
display(f"Error items count: {len(error_index)}")

error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]
error_df = X_test.loc[error_index].copy()
error_df.insert(loc=1, column="Predicted", value=error_predicted)
error_df.sort_index()
'Error items count: 0'
Out[74]:
Pregnancies Predicted Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome

Example of using the trained model (pipeline) for prediction

In [75]:
model = class_models[best_model]["pipeline"]

example_id = 555
test = pd.DataFrame(X_test.loc[example_id, :]).T
test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T
display(test)
display(test_preprocessed)
result_proba = model.predict_proba(test)[0]
result = model.predict(test)[0]
real = int(y_test.loc[example_id].values[0])
display(f"predicted: {result} (proba: {result_proba})")
display(f"real: {real}")
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
555 7.0 124.0 70.0 33.0 215.0 25.5 0.161 37.0 0.0
Glucose Age BloodPressure Outcome DiabetesPedigreeFunction
555 0.122742 0.322537 0.049921 -0.731437 -0.927636
'predicted: 0 (proba: [0.99431769 0.00568231])'
'real: 0'

Hyperparameter tuning using grid search

In [76]:
from sklearn.model_selection import GridSearchCV

optimized_model_type = "random_forest"

random_forest_model = class_models[optimized_model_type]["pipeline"]

param_grid = {
    "model__n_estimators": [10, 50, 100],
    "model__max_features": ["sqrt", "log2"],
    "model__max_depth": [5, 7, 10],
    "model__criterion": ["gini", "entropy"],
}

gs_optimizer = GridSearchCV(
    estimator=random_forest_model, param_grid=param_grid, n_jobs=-1
)
gs_optimizer.fit(X_train, y_train.values.ravel())
gs_optimizer.best_params_
d:\5_semester\AIM\rep\AIM-PIbd-31-Razubaev-S-M\.venv\Lib\site-packages\numpy\ma\core.py:2881: RuntimeWarning: invalid value encountered in cast
  _data = np.array(data, dtype=dtype, copy=copy,
Out[76]:
{'model__criterion': 'gini',
 'model__max_depth': 5,
 'model__max_features': 'sqrt',
 'model__n_estimators': 10}
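
Beyond best_params_, the full search log is available through cv_results_; for example, to inspect the top-ranked configurations:

cv_results = pd.DataFrame(gs_optimizer.cv_results_)
cv_results.sort_values(by="rank_test_score")[["params", "mean_test_score", "std_test_score"]].head()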

Training the model with the new hyperparameters

In [77]:
optimized_model = ensemble.RandomForestClassifier(
    random_state=random_state,
    criterion="gini",
    max_depth=5,
    max_features="sqrt",  # best value found by the grid search above
    n_estimators=10,
)

result = {}

result["pipeline"] = Pipeline([("pipeline", pipeline_end), ("model", optimized_model)]).fit(X_train, y_train.values.ravel())
result["train_preds"] = result["pipeline"].predict(X_train)
result["probs"] = result["pipeline"].predict_proba(X_test)[:, 1]
result["preds"] = np.where(result["probs"] > 0.5, 1, 0)

result["Precision_train"] = metrics.precision_score(y_train, result["train_preds"])
result["Precision_test"] = metrics.precision_score(y_test, result["preds"])
result["Recall_train"] = metrics.recall_score(y_train, result["train_preds"])
result["Recall_test"] = metrics.recall_score(y_test, result["preds"])
result["Accuracy_train"] = metrics.accuracy_score(y_train, result["train_preds"])
result["Accuracy_test"] = metrics.accuracy_score(y_test, result["preds"])
result["ROC_AUC_test"] = metrics.roc_auc_score(y_test, result["probs"])
result["F1_train"] = metrics.f1_score(y_train, result["train_preds"])
result["F1_test"] = metrics.f1_score(y_test, result["preds"])
result["MCC_test"] = metrics.matthews_corrcoef(y_test, result["preds"])
result["Cohen_kappa_test"] = metrics.cohen_kappa_score(y_test, result["preds"])
result["Confusion_matrix"] = metrics.confusion_matrix(y_test, result["preds"])

Preparing data to compare the old and new versions of the model

In [78]:
optimized_metrics = pd.DataFrame(columns=list(result.keys()))
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=class_models[optimized_model_type]
)
optimized_metrics.loc[len(optimized_metrics)] = pd.Series(
    data=result
)
optimized_metrics.insert(loc=0, column="Name", value=["Old", "New"])
optimized_metrics = optimized_metrics.set_index("Name")

Evaluating the old and new models

In [79]:
optimized_metrics[
    [
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
        "Accuracy_train",
        "Accuracy_test",
        "F1_train",
        "F1_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=["Accuracy_train", "Accuracy_test", "F1_train", "F1_test"],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Precision_train",
        "Precision_test",
        "Recall_train",
        "Recall_test",
    ],
)
Out[79]:
  Precision_train Precision_test Recall_train Recall_test Accuracy_train Accuracy_test F1_train F1_test
Name                
Old 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
New 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [80]:
optimized_metrics[
    [
        "Accuracy_test",
        "F1_test",
        "ROC_AUC_test",
        "Cohen_kappa_test",
        "MCC_test",
    ]
].style.background_gradient(
    cmap="plasma",
    low=0.3,
    high=1,
    subset=[
        "ROC_AUC_test",
        "MCC_test",
        "Cohen_kappa_test",
    ],
).background_gradient(
    cmap="viridis",
    low=1,
    high=0.3,
    subset=[
        "Accuracy_test",
        "F1_test",
    ],
)
Out[80]:
  Accuracy_test F1_test ROC_AUC_test Cohen_kappa_test MCC_test
Name          
Old 1.000000 1.000000 1.000000 1.000000 1.000000
New 1.000000 1.000000 1.000000 1.000000 1.000000
In [81]:
_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False)

for index in range(0, len(optimized_metrics)):
    c_matrix = optimized_metrics.iloc[index]["Confusion_matrix"]
    disp = ConfusionMatrixDisplay(
        confusion_matrix=c_matrix, display_labels=["Healthy", "Sick"]
    ).plot(ax=ax.flat[index])

plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)
plt.show()
[Figure: confusion matrices for the old and new models]

Regression model

In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import set_config

random_state=9
set_config(transform_output="pandas")
df = pd.read_csv(".//scv//diabetes.csv")
print(df.columns)
df
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
Out[82]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
... ... ... ... ... ... ... ... ... ...
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0

768 rows × 9 columns

Splitting the dataset into training and test samples

In [83]:
from typing import Optional, Tuple
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split

def split_into_train_test(
    df_input: DataFrame,
    target_colname: str = "Outcome",
    frac_train: float = 0.8,
    random_state: Optional[int] = None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
    
    if not (0 < frac_train < 1):
        raise ValueError("Fraction must be between 0 and 1.")
    
    # Check that the target column exists
    if target_colname not in df_input.columns:
        raise ValueError(f"{target_colname} is not a column in the DataFrame.")
    
    # Split the data into features and target
    X = df_input.drop(columns=[target_colname])  # features
    y = df_input[[target_colname]]  # target

    # Split the data into training and test samples
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=(1.0 - frac_train),
        random_state=random_state
    )
    
    return X_train, X_test, y_train, y_test

# Apply the split function
X_train, X_test, y_train, y_test = split_into_train_test(
    df,
    target_colname="Outcome",
    frac_train=0.8,
    random_state=42,  # NB: differs from the global random_state = 9 set above
)

# Display the results
display("X_train", X_train)
display("y_train", y_train)

display("X_test", X_test)
display("y_test", y_test)
'X_train'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
60 2 84 0 0 0 0.0 0.304 21
618 9 112 82 24 0 28.2 1.282 50
346 1 139 46 19 83 28.7 0.654 22
294 0 161 50 0 0 21.9 0.254 65
231 6 134 80 37 370 46.2 0.238 46
... ... ... ... ... ... ... ... ...
71 5 139 64 35 140 28.6 0.411 26
106 1 96 122 0 0 22.4 0.207 27
270 10 101 86 37 0 45.6 1.136 38
435 0 141 0 0 0 42.4 0.205 29
102 0 125 96 0 0 22.5 0.262 21

614 rows × 8 columns

'y_train'
Outcome
60 0
618 1
346 0
294 0
231 1
... ...
71 0
106 0
270 1
435 1
102 0

614 rows × 1 columns

'X_test'
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
668 6 98 58 33 190 34.0 0.430 43
324 2 112 75 32 0 35.7 0.148 21
624 2 108 64 0 0 30.8 0.158 21
690 8 107 80 0 0 24.6 0.856 34
473 7 136 90 0 0 29.9 0.210 50
... ... ... ... ... ... ... ... ...
355 9 165 88 0 0 30.4 0.302 49
534 1 77 56 30 56 33.3 1.251 24
344 8 95 72 0 0 36.8 0.485 57
296 2 146 70 38 360 28.0 0.337 29
462 8 74 70 40 49 35.3 0.705 39

154 rows × 8 columns

'y_test'
Outcome
668 0
324 0
624 0
690 0
473 0
... ...
355 1
534 0
344 0
296 1
462 0

154 rows × 1 columns

Defining the list of algorithms for the approximation (regression) task

In [84]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, tree, neighbors, ensemble, neural_network

random_state = 9

models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    "linear_poly": {
        "model": make_pipeline(
            PolynomialFeatures(degree=2),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "linear_interact": {
        "model": make_pipeline(
            PolynomialFeatures(interaction_only=True),
            linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
        )
    },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
    "mlp": {
        "model": neural_network.MLPRegressor(
            activation="tanh",
            hidden_layer_sizes=(3,),
            max_iter=500,
            early_stopping=True,
            random_state=random_state,
        )
    },
}
In [85]:
import math
from pandas import DataFrame
from sklearn import metrics

for model_name in models.keys():
    print(f"Model: {model_name}")

    fitted_model = models[model_name]["model"].fit(
        X_train.values, y_train.values.ravel()
    )
    y_train_pred = fitted_model.predict(X_train.values)
    y_test_pred = fitted_model.predict(X_test.values)
    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred
    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(y_test, y_test_pred)
Model: linear
Model: linear_poly
Model: linear_interact
Model: ridge
Model: decision_tree
Model: knn
Model: random_forest
Model: mlp
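
For true values $y_i$ and predictions $\hat{y}_i$, the metrics computed above are (RMAE here denotes the square root of the mean absolute error, as in the code):

$$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}, \qquad RMAE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i \rvert}, \qquad R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}.$$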

Evaluation results

In [86]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[86]:
  RMSE_train RMSE_test RMAE_test R2_test
random_forest 0.240052 0.405871 0.559210 0.282505
linear 0.396793 0.413576 0.590024 0.255003
ridge 0.396822 0.414236 0.590431 0.252623
linear_poly 0.370076 0.422852 0.584147 0.221209
linear_interact 0.380128 0.426815 0.593532 0.206543
decision_tree 0.249880 0.445708 0.520376 0.134743
knn 0.373319 0.450285 0.592157 0.116883
mlp 0.623529 0.544323 0.658689 -0.290498

Actual vs. "predicted" values for the training and test samples

Selecting the best model

In [87]:
best_model = str(reg_metrics.sort_values(by="RMSE_test").iloc[0].name)

display(best_model)
'random_forest'

Output for the training sample

In [88]:
pd.concat(
    [
        X_train,
        y_train,
        pd.Series(
            models[best_model]["train_preds"],
            index=y_train.index,
            name="DiabetPred",
        ),
    ],
    axis=1,
).head(5)
Out[88]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome DiabetPred
60 2 84 0 0 0 0.0 0.304 21 0 0.001849
618 9 112 82 24 0 28.2 1.282 50 1 0.758997
346 1 139 46 19 83 28.7 0.654 22 0 0.149231
294 0 161 50 0 0 21.9 0.254 65 0 0.239564
231 6 134 80 37 370 46.2 0.238 46 1 0.773890

Output for the test sample

In [89]:
pd.concat(
    [
        X_test,
        y_test,
        pd.Series(
            models[best_model]["preds"],
            index=y_test.index,
            name="DiabetPred",
        ),
    ],
    axis=1,
).head(5)
Out[89]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome DiabetPred
668 6 98 58 33 190 34.0 0.430 43 0 0.516537
324 2 112 75 32 0 35.7 0.148 21 0 0.205507
624 2 108 64 0 0 30.8 0.158 21 0 0.047710
690 8 107 80 0 0 24.6 0.856 34 0 0.128867
473 7 136 90 0 0 29.9 0.210 50 0 0.438512
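
Since the target is binary while the regressors produce a continuous score, a scatter of predicted against actual values makes the fit easy to inspect visually. A minimal sketch using the fitted models dict above:

import matplotlib.pyplot as plt

plt.scatter(y_test["Outcome"], models[best_model]["preds"], alpha=0.4)
plt.xlabel("Outcome (actual)")
plt.ylabel("DiabetPred (predicted)")
plt.title(f"{best_model}: test predictions")
plt.show()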