{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Лабораторная 4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Информация о диабете индейцев Пима"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
".. ... ... ... \n",
"763 0.171 63 0 \n",
"764 0.340 27 0 \n",
"765 0.245 30 0 \n",
"766 0.349 47 1 \n",
"767 0.315 23 0 \n",
"\n",
"[768 rows x 9 columns]"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"\n",
"# Ask every scikit-learn transformer to return a pandas DataFrame from transform().\n",
"set_config(transform_output=\"pandas\")\n",
"\n",
"# Read the dataset, print its column index, then show the frame as the cell output.\n",
"df = pd.read_csv(filepath_or_buffer=\".//scv//diabetes.csv\")\n",
"print(df.columns)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование выборок"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" 1 | \n",
" 105 | \n",
" 58 | \n",
" 0 | \n",
" 0 | \n",
" 24.3 | \n",
" 0.187 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 69 | \n",
" 4 | \n",
" 146 | \n",
" 85 | \n",
" 27 | \n",
" 100 | \n",
" 28.9 | \n",
" 0.189 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 494 | \n",
" 3 | \n",
" 80 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.174 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 463 | \n",
" 5 | \n",
" 88 | \n",
" 78 | \n",
" 30 | \n",
" 0 | \n",
" 27.6 | \n",
" 0.258 | \n",
" 37 | \n",
" 0 | \n",
"
\n",
" \n",
" 653 | \n",
" 2 | \n",
" 120 | \n",
" 54 | \n",
" 0 | \n",
" 0 | \n",
" 26.8 | \n",
" 0.455 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" 0 | \n",
" 124 | \n",
" 70 | \n",
" 20 | \n",
" 0 | \n",
" 27.4 | \n",
" 0.254 | \n",
" 36 | \n",
" 1 | \n",
"
\n",
" \n",
" 109 | \n",
" 0 | \n",
" 95 | \n",
" 85 | \n",
" 25 | \n",
" 36 | \n",
" 37.4 | \n",
" 0.247 | \n",
" 24 | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" 1 | \n",
" 97 | \n",
" 66 | \n",
" 15 | \n",
" 140 | \n",
" 23.2 | \n",
" 0.487 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 651 | \n",
" 1 | \n",
" 117 | \n",
" 60 | \n",
" 23 | \n",
" 106 | \n",
" 33.8 | \n",
" 0.466 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 197 | \n",
" 3 | \n",
" 107 | \n",
" 62 | \n",
" 13 | \n",
" 48 | \n",
" 22.9 | \n",
" 0.678 | \n",
" 23 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"196 1 105 58 0 0 24.3 \n",
"69 4 146 85 27 100 28.9 \n",
"494 3 80 0 0 0 0.0 \n",
"463 5 88 78 30 0 27.6 \n",
"653 2 120 54 0 0 26.8 \n",
".. ... ... ... ... ... ... \n",
"322 0 124 70 20 0 27.4 \n",
"109 0 95 85 25 36 37.4 \n",
"27 1 97 66 15 140 23.2 \n",
"651 1 117 60 23 106 33.8 \n",
"197 3 107 62 13 48 22.9 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"196 0.187 21 0 \n",
"69 0.189 27 0 \n",
"494 0.174 22 0 \n",
"463 0.258 37 0 \n",
"653 0.455 27 0 \n",
".. ... ... ... \n",
"322 0.254 36 1 \n",
"109 0.247 24 1 \n",
"27 0.487 22 0 \n",
"651 0.466 27 0 \n",
"197 0.678 23 1 \n",
"\n",
"[614 rows x 9 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" 0 | \n",
"
\n",
" \n",
" 69 | \n",
" 0 | \n",
"
\n",
" \n",
" 494 | \n",
" 0 | \n",
"
\n",
" \n",
" 463 | \n",
" 0 | \n",
"
\n",
" \n",
" 653 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" 1 | \n",
"
\n",
" \n",
" 109 | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 651 | \n",
" 0 | \n",
"
\n",
" \n",
" 197 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"196 0\n",
"69 0\n",
"494 0\n",
"463 0\n",
"653 0\n",
".. ...\n",
"322 1\n",
"109 1\n",
"27 0\n",
"651 0\n",
"197 1\n",
"\n",
"[614 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 669 | \n",
" 9 | \n",
" 154 | \n",
" 78 | \n",
" 30 | \n",
" 100 | \n",
" 30.9 | \n",
" 0.164 | \n",
" 45 | \n",
" 0 | \n",
"
\n",
" \n",
" 379 | \n",
" 0 | \n",
" 93 | \n",
" 100 | \n",
" 39 | \n",
" 72 | \n",
" 43.4 | \n",
" 1.021 | \n",
" 35 | \n",
" 0 | \n",
"
\n",
" \n",
" 640 | \n",
" 0 | \n",
" 102 | \n",
" 86 | \n",
" 17 | \n",
" 105 | \n",
" 29.3 | \n",
" 0.695 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 658 | \n",
" 11 | \n",
" 127 | \n",
" 106 | \n",
" 0 | \n",
" 0 | \n",
" 39.0 | \n",
" 0.190 | \n",
" 51 | \n",
" 0 | \n",
"
\n",
" \n",
" 304 | \n",
" 3 | \n",
" 150 | \n",
" 76 | \n",
" 0 | \n",
" 0 | \n",
" 21.0 | \n",
" 0.207 | \n",
" 37 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 203 | \n",
" 2 | \n",
" 99 | \n",
" 70 | \n",
" 16 | \n",
" 44 | \n",
" 20.4 | \n",
" 0.235 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 605 | \n",
" 1 | \n",
" 124 | \n",
" 60 | \n",
" 32 | \n",
" 0 | \n",
" 35.8 | \n",
" 0.514 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 561 | \n",
" 0 | \n",
" 198 | \n",
" 66 | \n",
" 32 | \n",
" 274 | \n",
" 41.3 | \n",
" 0.502 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 280 | \n",
" 0 | \n",
" 146 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 37.9 | \n",
" 0.334 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 103 | \n",
" 1 | \n",
" 81 | \n",
" 72 | \n",
" 18 | \n",
" 40 | \n",
" 26.6 | \n",
" 0.283 | \n",
" 24 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"669 9 154 78 30 100 30.9 \n",
"379 0 93 100 39 72 43.4 \n",
"640 0 102 86 17 105 29.3 \n",
"658 11 127 106 0 0 39.0 \n",
"304 3 150 76 0 0 21.0 \n",
".. ... ... ... ... ... ... \n",
"203 2 99 70 16 44 20.4 \n",
"605 1 124 60 32 0 35.8 \n",
"561 0 198 66 32 274 41.3 \n",
"280 0 146 70 0 0 37.9 \n",
"103 1 81 72 18 40 26.6 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"669 0.164 45 0 \n",
"379 1.021 35 0 \n",
"640 0.695 27 0 \n",
"658 0.190 51 0 \n",
"304 0.207 37 0 \n",
".. ... ... ... \n",
"203 0.235 27 0 \n",
"605 0.514 21 0 \n",
"561 0.502 28 1 \n",
"280 0.334 28 1 \n",
"103 0.283 24 0 \n",
"\n",
"[154 rows x 9 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 669 | \n",
" 0 | \n",
"
\n",
" \n",
" 379 | \n",
" 0 | \n",
"
\n",
" \n",
" 640 | \n",
" 0 | \n",
"
\n",
" \n",
" 658 | \n",
" 0 | \n",
"
\n",
" \n",
" 304 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 203 | \n",
" 0 | \n",
"
\n",
" \n",
" 605 | \n",
" 0 | \n",
"
\n",
" \n",
" 561 | \n",
" 1 | \n",
"
\n",
" \n",
" 280 | \n",
" 1 | \n",
"
\n",
" \n",
" 103 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"669 0\n",
"379 0\n",
"640 0\n",
"658 0\n",
"304 0\n",
".. ...\n",
"203 0\n",
"605 0\n",
"561 1\n",
"280 1\n",
"103 0\n",
"\n",
"[154 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input,\n",
" stratify_colname=\"y\",\n",
" frac_train=0.6,\n",
" frac_val=0.15,\n",
" frac_test=0.25,\n",
" random_state=None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
" if frac_train + frac_val + frac_test != 1.0:\n",
" raise ValueError(\n",
" \"fractions %f, %f, %f do not add up to 1.0\"\n",
" % (frac_train, frac_val, frac_test)\n",
" )\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
" X = df_input # Contains all columns.\n",
" y = df_input[\n",
" [stratify_colname]\n",
" ] # Dataframe of just the column on which to stratify.\n",
" # Split original dataframe into train and temp dataframes.\n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
" if frac_val <= 0:\n",
" assert len(df_input) == len(df_train) + len(df_temp)\n",
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
" # Split the temp dataframe into val and test dataframes.\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"# 80/20 stratified split on the target column. frac_val=0 means no validation\n",
"# part is requested, so X_val and y_val come back as empty frames.\n",
"(\n",
"    X_train,\n",
"    X_val,\n",
"    X_test,\n",
"    y_train,\n",
"    y_val,\n",
"    y_test,\n",
") = split_stratified_into_train_val_test(\n",
"    df,\n",
"    stratify_colname=\"Outcome\",\n",
"    frac_train=0.80,\n",
"    frac_val=0,\n",
"    frac_test=0.20,\n",
"    random_state=9,\n",
")\n",
"\n",
"# Show each part under its label: training parts first, then test parts.\n",
"for part_name, part in (\n",
"    (\"X_train\", X_train),\n",
"    (\"y_train\", y_train),\n",
"    (\"X_test\", X_test),\n",
"    (\"y_test\", y_test),\n",
"):\n",
"    display(part_name, part)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Классификация данных"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"# StandardScaler is imported from its public module, sklearn.preprocessing.\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"# NOTE(review): presumably a project-local `transformers` module (the name shadows\n",
"# the Hugging Face package); DiabetFeatures is not referenced in this cell.\n",
"from transformers import DiabetFeatures\n",
"\n",
"\n",
"# Columns removed at the end of the pipeline: the target (\"Outcome\") and the\n",
"# features excluded from this experiment.\n",
"columns_to_drop = [\"Glucose\", \"Age\", \"BloodPressure\", \"Outcome\", \"DiabetesPedigreeFunction\"]\n",
"\n",
"# Remaining columns, separated by dtype: non-object columns are treated as\n",
"# numeric, object columns as categorical.\n",
"num_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype != \"object\"\n",
"]\n",
"cat_columns = [\n",
"    column\n",
"    for column in df.columns\n",
"    if column not in columns_to_drop and df[column].dtype == \"object\"\n",
"]\n",
"\n",
"# Numeric branch: fill gaps with the median, then standardise.\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
"    [\n",
"        (\"imputer\", num_imputer),\n",
"        (\"scaler\", num_scaler),\n",
"    ]\n",
")\n",
"\n",
"# Categorical branch: fill gaps with the constant \"unknown\", then one-hot encode.\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
"    [\n",
"        (\"imputer\", cat_imputer),\n",
"        (\"encoder\", cat_encoder),\n",
"    ]\n",
")\n",
"\n",
"# Apply each branch to its columns; every other column is passed through\n",
"# unchanged so that the next step can drop it by name.\n",
"features_preprocessing = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"prepocessing_num\", preprocessing_num, num_columns),\n",
"        (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"# Remove the excluded columns (including the target) and keep the rest as is.\n",
"drop_columns = ColumnTransformer(\n",
"    verbose_feature_names_out=False,\n",
"    transformers=[\n",
"        (\"drop_columns\", \"drop\", columns_to_drop),\n",
"    ],\n",
"    remainder=\"passthrough\",\n",
")\n",
"\n",
"\n",
"# Full preprocessing: transform the kept features, then drop the excluded columns.\n",
"pipeline_end = Pipeline(\n",
"    [\n",
"        (\"features_preprocessing\", features_preprocessing),\n",
"        (\"drop_columns\", drop_columns),\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка работы конвейера"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" -0.838489 | \n",
" -1.297466 | \n",
" -0.688684 | \n",
" -0.946400 | \n",
"
\n",
" \n",
" 69 | \n",
" 0.072181 | \n",
" 0.395520 | \n",
" 0.180416 | \n",
" -0.377190 | \n",
"
\n",
" \n",
" 494 | \n",
" -0.231376 | \n",
" -1.297466 | \n",
" -0.688684 | \n",
" -3.953317 | \n",
"
\n",
" \n",
" 463 | \n",
" 0.375738 | \n",
" 0.583630 | \n",
" -0.688684 | \n",
" -0.538054 | \n",
"
\n",
" \n",
" 653 | \n",
" -0.534932 | \n",
" -1.297466 | \n",
" -0.688684 | \n",
" -0.637047 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" -1.142046 | \n",
" -0.043402 | \n",
" -0.688684 | \n",
" -0.562802 | \n",
"
\n",
" \n",
" 109 | \n",
" -1.142046 | \n",
" 0.270114 | \n",
" -0.375808 | \n",
" 0.674613 | \n",
"
\n",
" \n",
" 27 | \n",
" -0.838489 | \n",
" -0.356918 | \n",
" 0.528056 | \n",
" -1.082516 | \n",
"
\n",
" \n",
" 651 | \n",
" -0.838489 | \n",
" 0.144708 | \n",
" 0.232562 | \n",
" 0.229143 | \n",
"
\n",
" \n",
" 197 | \n",
" -0.231376 | \n",
" -0.482325 | \n",
" -0.271516 | \n",
" -1.119638 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 4 columns
\n",
"
"
],
"text/plain": [
" Pregnancies SkinThickness Insulin BMI\n",
"196 -0.838489 -1.297466 -0.688684 -0.946400\n",
"69 0.072181 0.395520 0.180416 -0.377190\n",
"494 -0.231376 -1.297466 -0.688684 -3.953317\n",
"463 0.375738 0.583630 -0.688684 -0.538054\n",
"653 -0.534932 -1.297466 -0.688684 -0.637047\n",
".. ... ... ... ...\n",
"322 -1.142046 -0.043402 -0.688684 -0.562802\n",
"109 -1.142046 0.270114 -0.375808 0.674613\n",
"27 -0.838489 -0.356918 0.528056 -1.082516\n",
"651 -0.838489 0.144708 0.232562 0.229143\n",
"197 -0.231376 -0.482325 -0.271516 -1.119638\n",
"\n",
"[614 rows x 4 columns]"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"# Fit the preprocessing pipeline on the training part and inspect what it produces.\n",
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"output_feature_names = pipeline_end.get_feature_names_out()\n",
"preprocessed_df = pd.DataFrame(preprocessing_result, columns=output_feature_names)\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование набора моделей для классификации"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
"\n",
"# Candidate classifiers, keyed by the name used in the result tables.\n",
"# Every estimator that takes a random_state is seeded with 9 so that a\n",
"# re-run of the notebook reproduces the same scores.\n",
"class_models = {\n",
"    \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
"    # NOTE: despite its key, \"ridge\" is an L2-penalised logistic regression with\n",
"    # balanced class weights, not a RidgeClassifier.\n",
"    \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
"    \"decision_tree\": {\n",
"        \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=9)\n",
"    },\n",
"    \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
"    \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
"    \"gradient_boosting\": {\n",
"        \"model\": ensemble.GradientBoostingClassifier(n_estimators=210, random_state=9)\n",
"    },\n",
"    \"random_forest\": {\n",
"        \"model\": ensemble.RandomForestClassifier(\n",
"            max_depth=11, class_weight=\"balanced\", random_state=9\n",
"        )\n",
"    },\n",
"    \"mlp\": {\n",
"        \"model\": neural_network.MLPClassifier(\n",
"            hidden_layer_sizes=(7,),\n",
"            max_iter=500,\n",
"            early_stopping=True,\n",
"            random_state=9,\n",
"        )\n",
"    },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обучение моделей на обучающем наборе данных и оценка на тестовом"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: naive_bayes\n",
"Model: gradient_boosting\n",
"Model: random_forest\n",
"Model: mlp\n"
]
}
],
"source": [
"from sklearn import metrics\n",
"\n",
"# Fit every candidate on the training part and store its predictions and\n",
"# quality metrics next to the estimator in class_models.\n",
"for model_name, model_entry in class_models.items():\n",
"    print(f\"Model: {model_name}\")\n",
"    model = model_entry[\"model\"]\n",
"\n",
"    # The preprocessing step is the shared pipeline_end object; it is refit\n",
"    # together with the classifier on every iteration.\n",
"    model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
"    model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
"\n",
"    y_train_predict = model_pipeline.predict(X_train)\n",
"    # Probability of the positive class, thresholded at 0.5 for hard test labels.\n",
"    y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
"    y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
"\n",
"    model_entry.update(\n",
"        {\n",
"            \"pipeline\": model_pipeline,\n",
"            \"probs\": y_test_probs,\n",
"            \"preds\": y_test_predict,\n",
"            \"Precision_train\": metrics.precision_score(y_train, y_train_predict),\n",
"            \"Precision_test\": metrics.precision_score(y_test, y_test_predict),\n",
"            \"Recall_train\": metrics.recall_score(y_train, y_train_predict),\n",
"            \"Recall_test\": metrics.recall_score(y_test, y_test_predict),\n",
"            \"Accuracy_train\": metrics.accuracy_score(y_train, y_train_predict),\n",
"            \"Accuracy_test\": metrics.accuracy_score(y_test, y_test_predict),\n",
"            \"ROC_AUC_test\": metrics.roc_auc_score(y_test, y_test_probs),\n",
"            \"F1_train\": metrics.f1_score(y_train, y_train_predict),\n",
"            \"F1_test\": metrics.f1_score(y_test, y_test_predict),\n",
"            \"MCC_test\": metrics.matthews_corrcoef(y_test, y_test_predict),\n",
"            \"Cohen_kappa_test\": metrics.cohen_kappa_score(y_test, y_test_predict),\n",
"            \"Confusion_matrix\": metrics.confusion_matrix(y_test, y_test_predict),\n",
"        }\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Сводная таблица оценок качества для использованных моделей классификации\n",
"\n",
"Матрица неточностей"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Two matrices per row. The row count is rounded up so that an odd number of\n",
"# models still gets an axis for the last one, and squeeze=False keeps `ax`\n",
"# two-dimensional even when there is a single row.\n",
"n_rows = max(1, (len(class_models) + 1) // 2)\n",
"_, ax = plt.subplots(\n",
"    n_rows, 2, figsize=(12, 10), sharex=False, sharey=False, squeeze=False\n",
")\n",
"for index, key in enumerate(class_models.keys()):\n",
"    c_matrix = class_models[key][\"Confusion_matrix\"]\n",
"    disp = ConfusionMatrixDisplay(\n",
"        confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
"    ).plot(ax=ax.flat[index])\n",
"    disp.ax_.set_title(key)\n",
"\n",
"# Hide any grid cell left without a model (only possible for an odd model count).\n",
"for unused_ax in ax.flat[len(class_models):]:\n",
"    unused_ax.set_visible(False)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Точность, полнота, верность (аккуратность), F-мера"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Precision_train | \n",
" Precision_test | \n",
" Recall_train | \n",
" Recall_test | \n",
" Accuracy_train | \n",
" Accuracy_test | \n",
" F1_train | \n",
" F1_test | \n",
"
\n",
" \n",
" \n",
" \n",
" naive_bayes | \n",
" 0.564516 | \n",
" 0.628571 | \n",
" 0.327103 | \n",
" 0.407407 | \n",
" 0.677524 | \n",
" 0.707792 | \n",
" 0.414201 | \n",
" 0.494382 | \n",
"
\n",
" \n",
" ridge | \n",
" 0.494382 | \n",
" 0.552632 | \n",
" 0.616822 | \n",
" 0.777778 | \n",
" 0.646580 | \n",
" 0.701299 | \n",
" 0.548857 | \n",
" 0.646154 | \n",
"
\n",
" \n",
" knn | \n",
" 0.670807 | \n",
" 0.551020 | \n",
" 0.504673 | \n",
" 0.500000 | \n",
" 0.741042 | \n",
" 0.681818 | \n",
" 0.576000 | \n",
" 0.524272 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.955157 | \n",
" 0.535714 | \n",
" 0.995327 | \n",
" 0.555556 | \n",
" 0.982085 | \n",
" 0.675325 | \n",
" 0.974828 | \n",
" 0.545455 | \n",
"
\n",
" \n",
" logistic | \n",
" 0.618644 | \n",
" 0.525000 | \n",
" 0.341121 | \n",
" 0.388889 | \n",
" 0.697068 | \n",
" 0.662338 | \n",
" 0.439759 | \n",
" 0.446809 | \n",
"
\n",
" \n",
" gradient_boosting | \n",
" 0.920213 | \n",
" 0.512195 | \n",
" 0.808411 | \n",
" 0.388889 | \n",
" 0.908795 | \n",
" 0.655844 | \n",
" 0.860697 | \n",
" 0.442105 | \n",
"
\n",
" \n",
" decision_tree | \n",
" 0.718615 | \n",
" 0.459016 | \n",
" 0.775701 | \n",
" 0.518519 | \n",
" 0.815961 | \n",
" 0.616883 | \n",
" 0.746067 | \n",
" 0.486957 | \n",
"
\n",
" \n",
" mlp | \n",
" 0.409195 | \n",
" 0.417391 | \n",
" 0.831776 | \n",
" 0.888889 | \n",
" 0.522801 | \n",
" 0.525974 | \n",
" 0.548536 | \n",
" 0.568047 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per model with train/test precision, recall, accuracy and F1.\n",
"score_columns = [\n",
"    \"Precision_train\",\n",
"    \"Precision_test\",\n",
"    \"Recall_train\",\n",
"    \"Recall_test\",\n",
"    \"Accuracy_train\",\n",
"    \"Accuracy_test\",\n",
"    \"F1_train\",\n",
"    \"F1_test\",\n",
"]\n",
"class_metrics = pd.DataFrame.from_dict(class_models, orient=\"index\")[score_columns]\n",
"\n",
"# Best test accuracy first; each group of columns gets its own colour map.\n",
"ranked_by_accuracy = class_metrics.sort_values(by=\"Accuracy_test\", ascending=False)\n",
"styled_scores = ranked_by_accuracy.style.background_gradient(\n",
"    cmap=\"plasma\",\n",
"    low=0.3,\n",
"    high=1,\n",
"    subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
")\n",
"styled_scores = styled_scores.background_gradient(\n",
"    cmap=\"viridis\",\n",
"    low=1,\n",
"    high=0.3,\n",
"    subset=[\"Precision_train\", \"Precision_test\", \"Recall_train\", \"Recall_test\"],\n",
")\n",
"styled_scores"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Accuracy_test | \n",
" F1_test | \n",
" ROC_AUC_test | \n",
" Cohen_kappa_test | \n",
" MCC_test | \n",
"
\n",
" \n",
" \n",
" \n",
" ridge | \n",
" 0.701299 | \n",
" 0.646154 | \n",
" 0.767037 | \n",
" 0.400271 | \n",
" 0.417827 | \n",
"
\n",
" \n",
" logistic | \n",
" 0.662338 | \n",
" 0.446809 | \n",
" 0.766296 | \n",
" 0.211501 | \n",
" 0.216434 | \n",
"
\n",
" \n",
" naive_bayes | \n",
" 0.707792 | \n",
" 0.494382 | \n",
" 0.753704 | \n",
" 0.301834 | \n",
" 0.315869 | \n",
"
\n",
" \n",
" knn | \n",
" 0.681818 | \n",
" 0.524272 | \n",
" 0.745556 | \n",
" 0.286093 | \n",
" 0.286855 | \n",
"
\n",
" \n",
" mlp | \n",
" 0.525974 | \n",
" 0.568047 | \n",
" 0.729074 | \n",
" 0.173747 | \n",
" 0.240181 | \n",
"
\n",
" \n",
" random_forest | \n",
" 0.675325 | \n",
" 0.545455 | \n",
" 0.715093 | \n",
" 0.293059 | \n",
" 0.293176 | \n",
"
\n",
" \n",
" gradient_boosting | \n",
" 0.655844 | \n",
" 0.442105 | \n",
" 0.709630 | \n",
" 0.199961 | \n",
" 0.203926 | \n",
"
\n",
" \n",
" decision_tree | \n",
" 0.616883 | \n",
" 0.486957 | \n",
" 0.612870 | \n",
" 0.183061 | \n",
" 0.183927 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per model with the test-set accuracy, F1, ROC AUC, Cohen's kappa and MCC.\n",
"agreement_columns = [\n",
"    \"Accuracy_test\",\n",
"    \"F1_test\",\n",
"    \"ROC_AUC_test\",\n",
"    \"Cohen_kappa_test\",\n",
"    \"MCC_test\",\n",
"]\n",
"class_metrics = pd.DataFrame.from_dict(class_models, orient=\"index\")[agreement_columns]\n",
"\n",
"# Best ROC AUC first; each group of columns gets its own colour map.\n",
"ranked_by_roc_auc = class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False)\n",
"styled_agreement = ranked_by_roc_auc.style.background_gradient(\n",
"    cmap=\"plasma\",\n",
"    low=0.3,\n",
"    high=1,\n",
"    subset=[\"ROC_AUC_test\", \"MCC_test\", \"Cohen_kappa_test\"],\n",
")\n",
"styled_agreement = styled_agreement.background_gradient(\n",
"    cmap=\"viridis\",\n",
"    low=1,\n",
"    high=0.3,\n",
"    subset=[\"Accuracy_test\", \"F1_test\"],\n",
")\n",
"styled_agreement"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ridge'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The best model is the one with the highest Matthews correlation coefficient on the test part.\n",
"ranked_by_mcc = class_metrics.sort_values(by=\"MCC_test\", ascending=False)\n",
"best_model = str(ranked_by_mcc.index[0])\n",
"\n",
"display(best_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод данных с ошибкой предсказания для оценки"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Error items count: 46'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Predicted | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 30 | \n",
" 5 | \n",
" 1 | \n",
" 109 | \n",
" 75 | \n",
" 26 | \n",
" 0 | \n",
" 36.0 | \n",
" 0.546 | \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 82 | \n",
" 7 | \n",
" 1 | \n",
" 83 | \n",
" 78 | \n",
" 26 | \n",
" 71 | \n",
" 29.3 | \n",
" 0.767 | \n",
" 36 | \n",
" 0 | \n",
"
\n",
" \n",
" 86 | \n",
" 13 | \n",
" 1 | \n",
" 106 | \n",
" 72 | \n",
" 54 | \n",
" 0 | \n",
" 36.6 | \n",
" 0.178 | \n",
" 45 | \n",
" 0 | \n",
"
\n",
" \n",
" 91 | \n",
" 4 | \n",
" 1 | \n",
" 123 | \n",
" 80 | \n",
" 15 | \n",
" 176 | \n",
" 32.0 | \n",
" 0.443 | \n",
" 34 | \n",
" 0 | \n",
"
\n",
" \n",
" 95 | \n",
" 6 | \n",
" 1 | \n",
" 144 | \n",
" 72 | \n",
" 27 | \n",
" 228 | \n",
" 33.9 | \n",
" 0.255 | \n",
" 40 | \n",
" 0 | \n",
"
\n",
" \n",
" 176 | \n",
" 6 | \n",
" 1 | \n",
" 85 | \n",
" 78 | \n",
" 0 | \n",
" 0 | \n",
" 31.2 | \n",
" 0.382 | \n",
" 42 | \n",
" 0 | \n",
"
\n",
" \n",
" 201 | \n",
" 1 | \n",
" 1 | \n",
" 138 | \n",
" 82 | \n",
" 0 | \n",
" 0 | \n",
" 40.1 | \n",
" 0.236 | \n",
" 28 | \n",
" 0 | \n",
"
\n",
" \n",
" 204 | \n",
" 6 | \n",
" 1 | \n",
" 103 | \n",
" 72 | \n",
" 32 | \n",
" 190 | \n",
" 37.7 | \n",
" 0.324 | \n",
" 55 | \n",
" 0 | \n",
"
\n",
" \n",
" 223 | \n",
" 7 | \n",
" 1 | \n",
" 142 | \n",
" 60 | \n",
" 33 | \n",
" 190 | \n",
" 28.8 | \n",
" 0.687 | \n",
" 61 | \n",
" 0 | \n",
"
\n",
" \n",
" 228 | \n",
" 4 | \n",
" 1 | \n",
" 197 | \n",
" 70 | \n",
" 39 | \n",
" 744 | \n",
" 36.7 | \n",
" 2.329 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 233 | \n",
" 4 | \n",
" 1 | \n",
" 122 | \n",
" 68 | \n",
" 0 | \n",
" 0 | \n",
" 35.0 | \n",
" 0.394 | \n",
" 29 | \n",
" 0 | \n",
"
\n",
" \n",
" 266 | \n",
" 0 | \n",
" 0 | \n",
" 138 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 36.3 | \n",
" 0.933 | \n",
" 25 | \n",
" 1 | \n",
"
\n",
" \n",
" 274 | \n",
" 13 | \n",
" 1 | \n",
" 106 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 34.2 | \n",
" 0.251 | \n",
" 52 | \n",
" 0 | \n",
"
\n",
" \n",
" 280 | \n",
" 0 | \n",
" 0 | \n",
" 146 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 37.9 | \n",
" 0.334 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 282 | \n",
" 7 | \n",
" 1 | \n",
" 133 | \n",
" 88 | \n",
" 15 | \n",
" 155 | \n",
" 32.4 | \n",
" 0.262 | \n",
" 37 | \n",
" 0 | \n",
"
\n",
" \n",
" 302 | \n",
" 5 | \n",
" 1 | \n",
" 77 | \n",
" 82 | \n",
" 41 | \n",
" 42 | \n",
" 35.8 | \n",
" 0.156 | \n",
" 35 | \n",
" 0 | \n",
"
\n",
" \n",
" 309 | \n",
" 2 | \n",
" 0 | \n",
" 124 | \n",
" 68 | \n",
" 28 | \n",
" 205 | \n",
" 32.9 | \n",
" 0.875 | \n",
" 30 | \n",
" 1 | \n",
"
\n",
" \n",
" 335 | \n",
" 0 | \n",
" 1 | \n",
" 165 | \n",
" 76 | \n",
" 43 | \n",
" 255 | \n",
" 47.9 | \n",
" 0.259 | \n",
" 26 | \n",
" 0 | \n",
"
\n",
" \n",
" 358 | \n",
" 12 | \n",
" 1 | \n",
" 88 | \n",
" 74 | \n",
" 40 | \n",
" 54 | \n",
" 35.3 | \n",
" 0.378 | \n",
" 48 | \n",
" 0 | \n",
"
\n",
" \n",
" 364 | \n",
" 4 | \n",
" 1 | \n",
" 147 | \n",
" 74 | \n",
" 25 | \n",
" 293 | \n",
" 34.9 | \n",
" 0.385 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 379 | \n",
" 0 | \n",
" 1 | \n",
" 93 | \n",
" 100 | \n",
" 39 | \n",
" 72 | \n",
" 43.4 | \n",
" 1.021 | \n",
" 35 | \n",
" 0 | \n",
"
\n",
" \n",
" 397 | \n",
" 0 | \n",
" 0 | \n",
" 131 | \n",
" 66 | \n",
" 40 | \n",
" 0 | \n",
" 34.3 | \n",
" 0.196 | \n",
" 22 | \n",
" 1 | \n",
"
\n",
" \n",
" 405 | \n",
" 2 | \n",
" 1 | \n",
" 123 | \n",
" 48 | \n",
" 32 | \n",
" 165 | \n",
" 42.1 | \n",
" 0.520 | \n",
" 26 | \n",
" 0 | \n",
"
\n",
" \n",
" 406 | \n",
" 4 | \n",
" 0 | \n",
" 115 | \n",
" 72 | \n",
" 0 | \n",
" 0 | \n",
" 28.9 | \n",
" 0.376 | \n",
" 46 | \n",
" 1 | \n",
"
\n",
" \n",
" 442 | \n",
" 4 | \n",
" 1 | \n",
" 117 | \n",
" 64 | \n",
" 27 | \n",
" 120 | \n",
" 33.2 | \n",
" 0.230 | \n",
" 24 | \n",
" 0 | \n",
"
\n",
" \n",
" 486 | \n",
" 1 | \n",
" 1 | \n",
" 139 | \n",
" 62 | \n",
" 41 | \n",
" 480 | \n",
" 40.7 | \n",
" 0.536 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 515 | \n",
" 3 | \n",
" 0 | \n",
" 163 | \n",
" 70 | \n",
" 18 | \n",
" 105 | \n",
" 31.6 | \n",
" 0.268 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 517 | \n",
" 7 | \n",
" 1 | \n",
" 125 | \n",
" 86 | \n",
" 0 | \n",
" 0 | \n",
" 37.6 | \n",
" 0.304 | \n",
" 51 | \n",
" 0 | \n",
"
\n",
" \n",
" 583 | \n",
" 8 | \n",
" 1 | \n",
" 100 | \n",
" 76 | \n",
" 0 | \n",
" 0 | \n",
" 38.7 | \n",
" 0.190 | \n",
" 42 | \n",
" 0 | \n",
"
\n",
" \n",
" 594 | \n",
" 6 | \n",
" 1 | \n",
" 123 | \n",
" 72 | \n",
" 45 | \n",
" 230 | \n",
" 33.6 | \n",
" 0.733 | \n",
" 34 | \n",
" 0 | \n",
"
\n",
" \n",
" 622 | \n",
" 6 | \n",
" 1 | \n",
" 183 | \n",
" 94 | \n",
" 0 | \n",
" 0 | \n",
" 40.8 | \n",
" 1.461 | \n",
" 45 | \n",
" 0 | \n",
"
\n",
" \n",
" 630 | \n",
" 7 | \n",
" 0 | \n",
" 114 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 27.4 | \n",
" 0.732 | \n",
" 34 | \n",
" 1 | \n",
"
\n",
" \n",
" 634 | \n",
" 10 | \n",
" 1 | \n",
" 92 | \n",
" 62 | \n",
" 0 | \n",
" 0 | \n",
" 25.9 | \n",
" 0.167 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 646 | \n",
" 1 | \n",
" 0 | \n",
" 167 | \n",
" 74 | \n",
" 17 | \n",
" 144 | \n",
" 23.4 | \n",
" 0.447 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" 658 | \n",
" 11 | \n",
" 1 | \n",
" 127 | \n",
" 106 | \n",
" 0 | \n",
" 0 | \n",
" 39.0 | \n",
" 0.190 | \n",
" 51 | \n",
" 0 | \n",
"
\n",
" \n",
" 669 | \n",
" 9 | \n",
" 1 | \n",
" 154 | \n",
" 78 | \n",
" 30 | \n",
" 100 | \n",
" 30.9 | \n",
" 0.164 | \n",
" 45 | \n",
" 0 | \n",
"
\n",
" \n",
" 674 | \n",
" 8 | \n",
" 1 | \n",
" 91 | \n",
" 82 | \n",
" 0 | \n",
" 0 | \n",
" 35.6 | \n",
" 0.587 | \n",
" 68 | \n",
" 0 | \n",
"
\n",
" \n",
" 676 | \n",
" 9 | \n",
" 0 | \n",
" 156 | \n",
" 86 | \n",
" 0 | \n",
" 0 | \n",
" 24.8 | \n",
" 0.230 | \n",
" 53 | \n",
" 1 | \n",
"
\n",
" \n",
" 682 | \n",
" 0 | \n",
" 1 | \n",
" 95 | \n",
" 64 | \n",
" 39 | \n",
" 105 | \n",
" 44.6 | \n",
" 0.366 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 699 | \n",
" 4 | \n",
" 1 | \n",
" 118 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 44.5 | \n",
" 0.904 | \n",
" 26 | \n",
" 0 | \n",
"
\n",
" \n",
" 702 | \n",
" 1 | \n",
" 0 | \n",
" 168 | \n",
" 88 | \n",
" 29 | \n",
" 0 | \n",
" 35.0 | \n",
" 0.905 | \n",
" 52 | \n",
" 1 | \n",
"
\n",
" \n",
" 723 | \n",
" 5 | \n",
" 1 | \n",
" 117 | \n",
" 86 | \n",
" 30 | \n",
" 105 | \n",
" 39.1 | \n",
" 0.251 | \n",
" 42 | \n",
" 0 | \n",
"
\n",
" \n",
" 725 | \n",
" 4 | \n",
" 1 | \n",
" 112 | \n",
" 78 | \n",
" 40 | \n",
" 0 | \n",
" 39.4 | \n",
" 0.236 | \n",
" 38 | \n",
" 0 | \n",
"
\n",
" \n",
" 730 | \n",
" 3 | \n",
" 0 | \n",
" 130 | \n",
" 78 | \n",
" 23 | \n",
" 79 | \n",
" 28.4 | \n",
" 0.323 | \n",
" 34 | \n",
" 1 | \n",
"
\n",
" \n",
" 744 | \n",
" 13 | \n",
" 1 | \n",
" 153 | \n",
" 88 | \n",
" 37 | \n",
" 140 | \n",
" 40.6 | \n",
" 1.174 | \n",
" 39 | \n",
" 0 | \n",
"
\n",
" \n",
" 750 | \n",
" 4 | \n",
" 0 | \n",
" 136 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 31.2 | \n",
" 1.182 | \n",
" 22 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Predicted Glucose BloodPressure SkinThickness Insulin \\\n",
"30 5 1 109 75 26 0 \n",
"82 7 1 83 78 26 71 \n",
"86 13 1 106 72 54 0 \n",
"91 4 1 123 80 15 176 \n",
"95 6 1 144 72 27 228 \n",
"176 6 1 85 78 0 0 \n",
"201 1 1 138 82 0 0 \n",
"204 6 1 103 72 32 190 \n",
"223 7 1 142 60 33 190 \n",
"228 4 1 197 70 39 744 \n",
"233 4 1 122 68 0 0 \n",
"266 0 0 138 0 0 0 \n",
"274 13 1 106 70 0 0 \n",
"280 0 0 146 70 0 0 \n",
"282 7 1 133 88 15 155 \n",
"302 5 1 77 82 41 42 \n",
"309 2 0 124 68 28 205 \n",
"335 0 1 165 76 43 255 \n",
"358 12 1 88 74 40 54 \n",
"364 4 1 147 74 25 293 \n",
"379 0 1 93 100 39 72 \n",
"397 0 0 131 66 40 0 \n",
"405 2 1 123 48 32 165 \n",
"406 4 0 115 72 0 0 \n",
"442 4 1 117 64 27 120 \n",
"486 1 1 139 62 41 480 \n",
"515 3 0 163 70 18 105 \n",
"517 7 1 125 86 0 0 \n",
"583 8 1 100 76 0 0 \n",
"594 6 1 123 72 45 230 \n",
"622 6 1 183 94 0 0 \n",
"630 7 0 114 64 0 0 \n",
"634 10 1 92 62 0 0 \n",
"646 1 0 167 74 17 144 \n",
"658 11 1 127 106 0 0 \n",
"669 9 1 154 78 30 100 \n",
"674 8 1 91 82 0 0 \n",
"676 9 0 156 86 0 0 \n",
"682 0 1 95 64 39 105 \n",
"699 4 1 118 70 0 0 \n",
"702 1 0 168 88 29 0 \n",
"723 5 1 117 86 30 105 \n",
"725 4 1 112 78 40 0 \n",
"730 3 0 130 78 23 79 \n",
"744 13 1 153 88 37 140 \n",
"750 4 0 136 70 0 0 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"30 36.0 0.546 60 0 \n",
"82 29.3 0.767 36 0 \n",
"86 36.6 0.178 45 0 \n",
"91 32.0 0.443 34 0 \n",
"95 33.9 0.255 40 0 \n",
"176 31.2 0.382 42 0 \n",
"201 40.1 0.236 28 0 \n",
"204 37.7 0.324 55 0 \n",
"223 28.8 0.687 61 0 \n",
"228 36.7 2.329 31 0 \n",
"233 35.0 0.394 29 0 \n",
"266 36.3 0.933 25 1 \n",
"274 34.2 0.251 52 0 \n",
"280 37.9 0.334 28 1 \n",
"282 32.4 0.262 37 0 \n",
"302 35.8 0.156 35 0 \n",
"309 32.9 0.875 30 1 \n",
"335 47.9 0.259 26 0 \n",
"358 35.3 0.378 48 0 \n",
"364 34.9 0.385 30 0 \n",
"379 43.4 1.021 35 0 \n",
"397 34.3 0.196 22 1 \n",
"405 42.1 0.520 26 0 \n",
"406 28.9 0.376 46 1 \n",
"442 33.2 0.230 24 0 \n",
"486 40.7 0.536 21 0 \n",
"515 31.6 0.268 28 1 \n",
"517 37.6 0.304 51 0 \n",
"583 38.7 0.190 42 0 \n",
"594 33.6 0.733 34 0 \n",
"622 40.8 1.461 45 0 \n",
"630 27.4 0.732 34 1 \n",
"634 25.9 0.167 31 0 \n",
"646 23.4 0.447 33 1 \n",
"658 39.0 0.190 51 0 \n",
"669 30.9 0.164 45 0 \n",
"674 35.6 0.587 68 0 \n",
"676 24.8 0.230 53 1 \n",
"682 44.6 0.366 22 0 \n",
"699 44.5 0.904 26 0 \n",
"702 35.0 0.905 52 1 \n",
"723 39.1 0.251 42 0 \n",
"725 39.4 0.236 38 0 \n",
"730 28.4 0.323 34 1 \n",
"744 40.6 1.174 39 0 \n",
"750 31.2 1.182 22 1 "
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessing_result = pipeline_end.transform(X_test)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"y_pred = class_models[best_model][\"preds\"]\n",
"\n",
"error_index = y_test[y_test[\"Outcome\"] != y_pred].index.tolist()\n",
"display(f\"Error items count: {len(error_index)}\")\n",
"\n",
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
"error_df = X_test.loc[error_index].copy()\n",
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
"error_df.sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пример использования обученной модели (конвейера) для предсказания"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 450 | \n",
" 1.0 | \n",
" 82.0 | \n",
" 64.0 | \n",
" 13.0 | \n",
" 95.0 | \n",
" 21.2 | \n",
" 0.415 | \n",
" 23.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"450 1.0 82.0 64.0 13.0 95.0 21.2 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"450 0.415 23.0 0.0 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
"
\n",
" \n",
" \n",
" \n",
" 450 | \n",
" -0.838489 | \n",
" -0.482325 | \n",
" 0.136961 | \n",
" -1.329999 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies SkinThickness Insulin BMI\n",
"450 -0.838489 -0.482325 0.136961 -1.329999"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'predicted: 0 (proba: [0.81353825 0.18646175])'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'real: 0'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = class_models[best_model][\"pipeline\"]\n",
"\n",
"example_id = 450\n",
"test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
"test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
"display(test)\n",
"display(test_preprocessed)\n",
"result_proba = model.predict_proba(test)[0]\n",
"result = model.predict(test)[0]\n",
"real = int(y_test.loc[example_id].values[0])\n",
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
"display(f\"real: {real}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Подбор гиперпараметров методом поиска по сетке"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"import numpy as np\n",
"from sklearn import metrics\n",
"import pandas as pd\n",
"\n",
"\n",
"# Определяем числовые признаки\n",
"numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()\n",
"\n",
"# Установка random_state\n",
"random_state = 42\n",
"\n",
"# Определение трансформера\n",
"pipeline_end = ColumnTransformer([\n",
" ('numeric', StandardScaler(), numeric_features),\n",
" # Добавьте другие трансформеры, если требуется\n",
"])\n",
"\n",
"# Объявление модели\n",
"optimized_model = RandomForestClassifier(\n",
" random_state=random_state,\n",
" criterion=\"gini\",\n",
" max_depth=5,\n",
" max_features=\"sqrt\",\n",
" n_estimators=10,\n",
")\n",
"\n",
"# Создание пайплайна с корректными шагами\n",
"result = {}\n",
"\n",
"# Обучение модели\n",
"result[\"pipeline\"] = Pipeline([\n",
" (\"pipeline\", pipeline_end),\n",
" (\"model\", optimized_model)\n",
"]).fit(X_train, y_train.values.ravel())\n",
"\n",
"# Прогнозирование и расчет метрик\n",
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
"\n",
"# Метрики для оценки модели\n",
"result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
"result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
"result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
"result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
"result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
"result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
"result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
"result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
"result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
"result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
"result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование данных для оценки старой и новой версии модели"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=class_models[optimized_model_type]\n",
")\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=result\n",
")\n",
"optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
"optimized_metrics = optimized_metrics.set_index(\"Name\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка параметров старой и новой модели"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Precision_train | \n",
" Precision_test | \n",
" Recall_train | \n",
" Recall_test | \n",
" Accuracy_train | \n",
" Accuracy_test | \n",
" F1_train | \n",
" F1_test | \n",
"
\n",
" \n",
" Name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Old | \n",
" 0.955157 | \n",
" 0.535714 | \n",
" 0.995327 | \n",
" 0.555556 | \n",
" 0.982085 | \n",
" 0.675325 | \n",
" 0.974828 | \n",
" 0.545455 | \n",
"
\n",
" \n",
" New | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" \"Accuracy_train\",\n",
" \"Accuracy_test\",\n",
" \"F1_train\",\n",
" \"F1_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Accuracy_test | \n",
" F1_test | \n",
" ROC_AUC_test | \n",
" Cohen_kappa_test | \n",
" MCC_test | \n",
"
\n",
" \n",
" Name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Old | \n",
" 0.675325 | \n",
" 0.545455 | \n",
" 0.715093 | \n",
" 0.293059 | \n",
" 0.293176 | \n",
"
\n",
" \n",
" New | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" \"ROC_AUC_test\",\n",
" \"Cohen_kappa_test\",\n",
" \"MCC_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\n",
" \"ROC_AUC_test\",\n",
" \"MCC_test\",\n",
" \"Cohen_kappa_test\",\n",
" ],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"\n",
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
")\n",
"\n",
"for index in range(0, len(optimized_metrics)):\n",
" c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
" ).plot(ax=ax.flat[index])\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В желтом квадрате мы видим значение 74, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"Sick\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n",
"\n",
"В зеленом квадрате значение 54 указывает на количество правильно классифицированных объектов, отнесенных к классу \"Healthy\". Это также является показателем хорошей точности модели в определении объектов данного класса."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определение достижимого уровня качества модели для второй задачи"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Подготовка данных"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
"\n",
" BMI DiabetesPedigreeFunction Age Outcome \n",
"count 768.000000 768.000000 768.000000 768.000000 \n",
"mean 31.992578 0.471876 33.240885 0.348958 \n",
"std 7.884160 0.331329 11.760232 0.476951 \n",
"min 0.000000 0.078000 21.000000 0.000000 \n",
"25% 27.300000 0.243750 24.000000 0.000000 \n",
"50% 32.000000 0.372500 29.000000 0.000000 \n",
"75% 36.600000 0.626250 41.000000 1.000000 \n",
"max 67.100000 2.420000 81.000000 1.000000 \n"
]
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"\n",
"\n",
"random_state = 9\n",
"set_config(transform_output=\"pandas\")\n",
"df = pd.read_csv(\".//scv//diabetes.csv\")\n",
"print(df.describe())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование выборок"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" 1 | \n",
" 105 | \n",
" 58 | \n",
" 0 | \n",
" 0 | \n",
" 24.3 | \n",
" 0.187 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 69 | \n",
" 4 | \n",
" 146 | \n",
" 85 | \n",
" 27 | \n",
" 100 | \n",
" 28.9 | \n",
" 0.189 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 494 | \n",
" 3 | \n",
" 80 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.174 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 463 | \n",
" 5 | \n",
" 88 | \n",
" 78 | \n",
" 30 | \n",
" 0 | \n",
" 27.6 | \n",
" 0.258 | \n",
" 37 | \n",
" 0 | \n",
"
\n",
" \n",
" 653 | \n",
" 2 | \n",
" 120 | \n",
" 54 | \n",
" 0 | \n",
" 0 | \n",
" 26.8 | \n",
" 0.455 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" 0 | \n",
" 124 | \n",
" 70 | \n",
" 20 | \n",
" 0 | \n",
" 27.4 | \n",
" 0.254 | \n",
" 36 | \n",
" 1 | \n",
"
\n",
" \n",
" 109 | \n",
" 0 | \n",
" 95 | \n",
" 85 | \n",
" 25 | \n",
" 36 | \n",
" 37.4 | \n",
" 0.247 | \n",
" 24 | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" 1 | \n",
" 97 | \n",
" 66 | \n",
" 15 | \n",
" 140 | \n",
" 23.2 | \n",
" 0.487 | \n",
" 22 | \n",
" 0 | \n",
"
\n",
" \n",
" 651 | \n",
" 1 | \n",
" 117 | \n",
" 60 | \n",
" 23 | \n",
" 106 | \n",
" 33.8 | \n",
" 0.466 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 197 | \n",
" 3 | \n",
" 107 | \n",
" 62 | \n",
" 13 | \n",
" 48 | \n",
" 22.9 | \n",
" 0.678 | \n",
" 23 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"196 1 105 58 0 0 24.3 \n",
"69 4 146 85 27 100 28.9 \n",
"494 3 80 0 0 0 0.0 \n",
"463 5 88 78 30 0 27.6 \n",
"653 2 120 54 0 0 26.8 \n",
".. ... ... ... ... ... ... \n",
"322 0 124 70 20 0 27.4 \n",
"109 0 95 85 25 36 37.4 \n",
"27 1 97 66 15 140 23.2 \n",
"651 1 117 60 23 106 33.8 \n",
"197 3 107 62 13 48 22.9 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"196 0.187 21 0 \n",
"69 0.189 27 0 \n",
"494 0.174 22 0 \n",
"463 0.258 37 0 \n",
"653 0.455 27 0 \n",
".. ... ... ... \n",
"322 0.254 36 1 \n",
"109 0.247 24 1 \n",
"27 0.487 22 0 \n",
"651 0.466 27 0 \n",
"197 0.678 23 1 \n",
"\n",
"[614 rows x 9 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" 0 | \n",
"
\n",
" \n",
" 69 | \n",
" 0 | \n",
"
\n",
" \n",
" 494 | \n",
" 0 | \n",
"
\n",
" \n",
" 463 | \n",
" 0 | \n",
"
\n",
" \n",
" 653 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" 1 | \n",
"
\n",
" \n",
" 109 | \n",
" 1 | \n",
"
\n",
" \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 651 | \n",
" 0 | \n",
"
\n",
" \n",
" 197 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"196 0\n",
"69 0\n",
"494 0\n",
"463 0\n",
"653 0\n",
".. ...\n",
"322 1\n",
"109 1\n",
"27 0\n",
"651 0\n",
"197 1\n",
"\n",
"[614 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 669 | \n",
" 9 | \n",
" 154 | \n",
" 78 | \n",
" 30 | \n",
" 100 | \n",
" 30.9 | \n",
" 0.164 | \n",
" 45 | \n",
" 0 | \n",
"
\n",
" \n",
" 379 | \n",
" 0 | \n",
" 93 | \n",
" 100 | \n",
" 39 | \n",
" 72 | \n",
" 43.4 | \n",
" 1.021 | \n",
" 35 | \n",
" 0 | \n",
"
\n",
" \n",
" 640 | \n",
" 0 | \n",
" 102 | \n",
" 86 | \n",
" 17 | \n",
" 105 | \n",
" 29.3 | \n",
" 0.695 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 658 | \n",
" 11 | \n",
" 127 | \n",
" 106 | \n",
" 0 | \n",
" 0 | \n",
" 39.0 | \n",
" 0.190 | \n",
" 51 | \n",
" 0 | \n",
"
\n",
" \n",
" 304 | \n",
" 3 | \n",
" 150 | \n",
" 76 | \n",
" 0 | \n",
" 0 | \n",
" 21.0 | \n",
" 0.207 | \n",
" 37 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 203 | \n",
" 2 | \n",
" 99 | \n",
" 70 | \n",
" 16 | \n",
" 44 | \n",
" 20.4 | \n",
" 0.235 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 605 | \n",
" 1 | \n",
" 124 | \n",
" 60 | \n",
" 32 | \n",
" 0 | \n",
" 35.8 | \n",
" 0.514 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 561 | \n",
" 0 | \n",
" 198 | \n",
" 66 | \n",
" 32 | \n",
" 274 | \n",
" 41.3 | \n",
" 0.502 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 280 | \n",
" 0 | \n",
" 146 | \n",
" 70 | \n",
" 0 | \n",
" 0 | \n",
" 37.9 | \n",
" 0.334 | \n",
" 28 | \n",
" 1 | \n",
"
\n",
" \n",
" 103 | \n",
" 1 | \n",
" 81 | \n",
" 72 | \n",
" 18 | \n",
" 40 | \n",
" 26.6 | \n",
" 0.283 | \n",
" 24 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"669 9 154 78 30 100 30.9 \n",
"379 0 93 100 39 72 43.4 \n",
"640 0 102 86 17 105 29.3 \n",
"658 11 127 106 0 0 39.0 \n",
"304 3 150 76 0 0 21.0 \n",
".. ... ... ... ... ... ... \n",
"203 2 99 70 16 44 20.4 \n",
"605 1 124 60 32 0 35.8 \n",
"561 0 198 66 32 274 41.3 \n",
"280 0 146 70 0 0 37.9 \n",
"103 1 81 72 18 40 26.6 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"669 0.164 45 0 \n",
"379 1.021 35 0 \n",
"640 0.695 27 0 \n",
"658 0.190 51 0 \n",
"304 0.207 37 0 \n",
".. ... ... ... \n",
"203 0.235 27 0 \n",
"605 0.514 21 0 \n",
"561 0.502 28 1 \n",
"280 0.334 28 1 \n",
"103 0.283 24 0 \n",
"\n",
"[154 rows x 9 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 669 | \n",
" 0 | \n",
"
\n",
" \n",
" 379 | \n",
" 0 | \n",
"
\n",
" \n",
" 640 | \n",
" 0 | \n",
"
\n",
" \n",
" 658 | \n",
" 0 | \n",
"
\n",
" \n",
" 304 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 203 | \n",
" 0 | \n",
"
\n",
" \n",
" 605 | \n",
" 0 | \n",
"
\n",
" \n",
" 561 | \n",
" 1 | \n",
"
\n",
" \n",
" 280 | \n",
" 1 | \n",
"
\n",
" \n",
" 103 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"669 0\n",
"379 0\n",
"640 0\n",
"658 0\n",
"304 0\n",
".. ...\n",
"203 0\n",
"605 0\n",
"561 1\n",
"280 1\n",
"103 0\n",
"\n",
"[154 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"def split_stratified_into_train_val_test(\n",
" df_input: DataFrame,\n",
" stratify_colname: str = \"y\",\n",
" frac_train: float = 0.6,\n",
" frac_val: float = 0.15,\n",
" frac_test: float = 0.25,\n",
" random_state: int = None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
"\n",
" if not (0 < frac_train < 1) or not (0 <= frac_val <= 1) or not (0 <= frac_test <= 1):\n",
" raise ValueError(\"Fractions must be between 0 and 1 and the sum must equal 1.\")\n",
" \n",
" if not (frac_train + frac_val + frac_test == 1.0):\n",
" raise ValueError(\"fractions %f, %f, %f do not add up to 1.0\" %\n",
" (frac_train, frac_val, frac_test))\n",
"\n",
" if stratify_colname not in df_input.columns:\n",
" raise ValueError(f\"{stratify_colname} is not a column in the DataFrame.\")\n",
"\n",
" X = df_input\n",
" y = df_input[[stratify_colname]]\n",
"\n",
" \n",
" df_train, df_temp, y_train, y_temp = train_test_split(\n",
" X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
" )\n",
"\n",
" if frac_val == 0:\n",
" return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
"\n",
" relative_frac_test = frac_test / (frac_val + frac_test)\n",
"\n",
" df_val, df_test, y_val, y_test = train_test_split(\n",
" df_temp,\n",
" y_temp,\n",
" stratify=y_temp,\n",
" test_size=relative_frac_test,\n",
" random_state=random_state,\n",
" )\n",
"\n",
" assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
" \n",
" return df_train, df_val, df_test, y_train, y_val, y_test\n",
"\n",
"\n",
"X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
" df, stratify_colname=\"Outcome\", frac_train=0.80, frac_val=0.0, frac_test=0.20, random_state=random_state\n",
")\n",
"\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование конвейера для классификации данных"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.discriminant_analysis import StandardScaler\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"class DiabetFeatures(BaseEstimator, TransformerMixin):\n",
" def __init__(self):\n",
" pass\n",
" def fit(self, X, y=None):\n",
" return self\n",
" \n",
"\n",
"columns_to_drop = [\"Pregnancies\", \"SkinThickness\", \"Insulin\", \"BMI\"]\n",
"num_columns = [\"Glucose\", \"Age\", \"BloodPressure\", \"Outcome\", \"DiabetesPedigreeFunction\"]\n",
"cat_columns = []\n",
"\n",
"num_imputer = SimpleImputer(strategy=\"median\")\n",
"num_scaler = StandardScaler()\n",
"preprocessing_num = Pipeline(\n",
" [\n",
" (\"imputer\", num_imputer),\n",
" (\"scaler\", num_scaler),\n",
" ]\n",
")\n",
"\n",
"cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
"cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
"preprocessing_cat = Pipeline(\n",
" [\n",
" (\"imputer\", cat_imputer),\n",
" (\"encoder\", cat_encoder),\n",
" ]\n",
")\n",
"\n",
"features_preprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_num\", preprocessing_num, num_columns),\n",
" (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
" ],\n",
" remainder=\"passthrough\"\n",
")\n",
"\n",
"\n",
"drop_columns = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"drop_columns\", \"drop\", columns_to_drop),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"features_postprocessing = ColumnTransformer(\n",
" verbose_feature_names_out=False,\n",
" transformers=[\n",
" (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"\n",
"pipeline_end = Pipeline(\n",
" [\n",
" (\"features_preprocessing\", features_preprocessing),\n",
" (\"drop_columns\", drop_columns),\n",
" ]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Демонстрация работы конвейера"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Glucose | \n",
" Age | \n",
" BloodPressure | \n",
" Outcome | \n",
" DiabetesPedigreeFunction | \n",
"
\n",
" \n",
" \n",
" \n",
" 196 | \n",
" -0.478144 | \n",
" -1.029257 | \n",
" -0.554050 | \n",
" -0.731437 | \n",
" -0.849205 | \n",
"
\n",
" \n",
" 69 | \n",
" 0.818506 | \n",
" -0.522334 | \n",
" 0.804885 | \n",
" -0.731437 | \n",
" -0.843172 | \n",
"
\n",
" \n",
" 494 | \n",
" -1.268784 | \n",
" -0.944770 | \n",
" -3.473244 | \n",
" -0.731437 | \n",
" -0.888421 | \n",
"
\n",
" \n",
" 463 | \n",
" -1.015779 | \n",
" 0.322537 | \n",
" 0.452568 | \n",
" -0.731437 | \n",
" -0.635028 | \n",
"
\n",
" \n",
" 653 | \n",
" -0.003760 | \n",
" -0.522334 | \n",
" -0.755374 | \n",
" -0.731437 | \n",
" -0.040763 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 322 | \n",
" 0.122742 | \n",
" 0.238050 | \n",
" 0.049921 | \n",
" 1.367172 | \n",
" -0.647095 | \n",
"
\n",
" \n",
" 109 | \n",
" -0.794400 | \n",
" -0.775796 | \n",
" 0.804885 | \n",
" 1.367172 | \n",
" -0.668211 | \n",
"
\n",
" \n",
" 27 | \n",
" -0.731149 | \n",
" -0.944770 | \n",
" -0.151403 | \n",
" -0.731437 | \n",
" 0.055767 | \n",
"
\n",
" \n",
" 651 | \n",
" -0.098637 | \n",
" -0.522334 | \n",
" -0.453388 | \n",
" -0.731437 | \n",
" -0.007581 | \n",
"
\n",
" \n",
" 197 | \n",
" -0.414893 | \n",
" -0.860283 | \n",
" -0.352726 | \n",
" 1.367172 | \n",
" 0.631933 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 5 columns
\n",
"
"
],
"text/plain": [
" Glucose Age BloodPressure Outcome DiabetesPedigreeFunction\n",
"196 -0.478144 -1.029257 -0.554050 -0.731437 -0.849205\n",
"69 0.818506 -0.522334 0.804885 -0.731437 -0.843172\n",
"494 -1.268784 -0.944770 -3.473244 -0.731437 -0.888421\n",
"463 -1.015779 0.322537 0.452568 -0.731437 -0.635028\n",
"653 -0.003760 -0.522334 -0.755374 -0.731437 -0.040763\n",
".. ... ... ... ... ...\n",
"322 0.122742 0.238050 0.049921 1.367172 -0.647095\n",
"109 -0.794400 -0.775796 0.804885 1.367172 -0.668211\n",
"27 -0.731149 -0.944770 -0.151403 -0.731437 0.055767\n",
"651 -0.098637 -0.522334 -0.453388 -0.731437 -0.007581\n",
"197 -0.414893 -0.860283 -0.352726 1.367172 0.631933\n",
"\n",
"[614 rows x 5 columns]"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessing_result = pipeline_end.fit_transform(X_train)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"preprocessed_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование набора моделей для классификации"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
"\n",
"random_state = 9\n",
"\n",
"models = {\n",
" \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n",
" \"linear_poly\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(degree=2),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
" },\n",
" \"linear_interact\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(interaction_only=True),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
" },\n",
" \"ridge\": {\"model\": linear_model.RidgeCV()},\n",
" \"decision_tree\": {\n",
" \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n",
" },\n",
" \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n",
" \"random_forest\": {\n",
" \"model\": ensemble.RandomForestRegressor(\n",
" max_depth=7, random_state=random_state, n_jobs=-1\n",
" )\n",
" },\n",
" \"mlp\": {\n",
" \"model\": neural_network.MLPRegressor(\n",
" activation=\"tanh\",\n",
" hidden_layer_sizes=(3,),\n",
" max_iter=500,\n",
" early_stopping=True,\n",
" random_state=random_state,\n",
" )\n",
" },\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обучение моделей на обучающем наборе данных и оценка на тестовом¶"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: logistic\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: naive_bayes\n",
"Model: gradient_boosting\n",
"Model: random_forest\n",
"Model: mlp\n"
]
}
],
"source": [
"import numpy as np\n",
"from sklearn import metrics\n",
"\n",
"for model_name in class_models.keys():\n",
" print(f\"Model: {model_name}\")\n",
" model = class_models[model_name][\"model\"]\n",
"\n",
" model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
" model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
"\n",
" y_train_predict = model_pipeline.predict(X_train)\n",
" y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
" y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
"\n",
" class_models[model_name][\"pipeline\"] = model_pipeline\n",
" class_models[model_name][\"probs\"] = y_test_probs\n",
" class_models[model_name][\"preds\"] = y_test_predict\n",
"\n",
" class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
" y_train, y_train_predict\n",
" )\n",
" class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
" y_train, y_train_predict\n",
" )\n",
" class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
" y_train, y_train_predict\n",
" )\n",
" class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
" y_test, y_test_probs\n",
" )\n",
" class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
" class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
" class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
" y_test, y_test_predict\n",
" )\n",
" class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
" y_test, y_test_predict\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Сводная таблица оценок качества для использованных моделей классификации¶\n",
"\n",
"Матрица неточностей\n"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"import matplotlib.pyplot as plt\n",
"\n",
"_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
"for index, key in enumerate(class_models.keys()):\n",
" c_matrix = class_models[key][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
" ).plot(ax=ax.flat[index])\n",
" disp.ax_.set_title(key)\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Precision_train | \n",
" Precision_test | \n",
" Recall_train | \n",
" Recall_test | \n",
" Accuracy_train | \n",
" Accuracy_test | \n",
" F1_train | \n",
" F1_test | \n",
"
\n",
" \n",
" \n",
" \n",
" logistic | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" ridge | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" decision_tree | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" naive_bayes | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" random_forest | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" gradient_boosting | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" knn | \n",
" 1.000000 | \n",
" 0.981818 | \n",
" 0.990654 | \n",
" 1.000000 | \n",
" 0.996743 | \n",
" 0.993506 | \n",
" 0.995305 | \n",
" 0.990826 | \n",
"
\n",
" \n",
" mlp | \n",
" 0.729323 | \n",
" 0.685714 | \n",
" 0.453271 | \n",
" 0.444444 | \n",
" 0.750814 | \n",
" 0.733766 | \n",
" 0.559078 | \n",
" 0.539326 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
" [\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" \"Accuracy_train\",\n",
" \"Accuracy_test\",\n",
" \"F1_train\",\n",
" \"F1_test\",\n",
" ]\n",
"]\n",
"class_metrics.sort_values(\n",
" by=\"Accuracy_test\", ascending=False\n",
").style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"Почти все модели, включая логистическую регрессию, ридж-регрессию, KNN, наивный байесовский классификатор, многослойную перцептронную сеть, случайный лес, дерево решений и градиентный бустинг, демонстрируют 100% точность (1.000000) на обучающей выборке. Это указывает на то, что модели смогли подстроиться под обучающие данные, что может указывать на возможное переобучение.\n",
"\n",
"ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса\n"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Accuracy_test | \n",
" F1_test | \n",
" ROC_AUC_test | \n",
" Cohen_kappa_test | \n",
" MCC_test | \n",
"
\n",
" \n",
" \n",
" \n",
" logistic | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" ridge | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" decision_tree | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" knn | \n",
" 0.993506 | \n",
" 0.990826 | \n",
" 1.000000 | \n",
" 0.985801 | \n",
" 0.985901 | \n",
"
\n",
" \n",
" naive_bayes | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" gradient_boosting | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" random_forest | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" mlp | \n",
" 0.733766 | \n",
" 0.539326 | \n",
" 0.653148 | \n",
" 0.363893 | \n",
" 0.380814 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
" [\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" \"ROC_AUC_test\",\n",
" \"Cohen_kappa_test\",\n",
" \"MCC_test\",\n",
" ]\n",
"]\n",
"class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\n",
" \"ROC_AUC_test\",\n",
" \"MCC_test\",\n",
" \"Cohen_kappa_test\",\n",
" ],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'logistic'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
"\n",
"display(best_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод данных с ошибкой предсказания для оценки"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Error items count: 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Predicted | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Pregnancies, Predicted, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]\n",
"Index: []"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preprocessing_result = pipeline_end.transform(X_test)\n",
"preprocessed_df = pd.DataFrame(\n",
" preprocessing_result,\n",
" columns=pipeline_end.get_feature_names_out(),\n",
")\n",
"\n",
"y_pred = class_models[best_model][\"preds\"]\n",
"\n",
"error_index = y_test[y_test[\"Outcome\"] != y_pred].index.tolist()\n",
"display(f\"Error items count: {len(error_index)}\")\n",
"\n",
"error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
"error_df = X_test.loc[error_index].copy()\n",
"error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
"error_df.sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пример использования обученной модели (конвейера) для предсказания"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 555 | \n",
" 7.0 | \n",
" 124.0 | \n",
" 70.0 | \n",
" 33.0 | \n",
" 215.0 | \n",
" 25.5 | \n",
" 0.161 | \n",
" 37.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"555 7.0 124.0 70.0 33.0 215.0 25.5 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"555 0.161 37.0 0.0 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Glucose | \n",
" Age | \n",
" BloodPressure | \n",
" Outcome | \n",
" DiabetesPedigreeFunction | \n",
"
\n",
" \n",
" \n",
" \n",
" 555 | \n",
" 0.122742 | \n",
" 0.322537 | \n",
" 0.049921 | \n",
" -0.731437 | \n",
" -0.927636 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Glucose Age BloodPressure Outcome DiabetesPedigreeFunction\n",
"555 0.122742 0.322537 0.049921 -0.731437 -0.927636"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'predicted: 0 (proba: [0.99431769 0.00568231])'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'real: 0'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = class_models[best_model][\"pipeline\"]\n",
"\n",
"example_id = 555\n",
"test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
"test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
"display(test)\n",
"display(test_preprocessed)\n",
"result_proba = model.predict_proba(test)[0]\n",
"result = model.predict(test)[0]\n",
"real = int(y_test.loc[example_id].values[0])\n",
"display(f\"predicted: {result} (proba: {result_proba})\")\n",
"display(f\"real: {real}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Подбор гиперпараметров методом поиска по сетке"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
" _data = np.array(data, dtype=dtype, copy=copy,\n"
]
},
{
"data": {
"text/plain": [
"{'model__criterion': 'gini',\n",
" 'model__max_depth': 5,\n",
" 'model__max_features': 'sqrt',\n",
" 'model__n_estimators': 10}"
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"optimized_model_type = \"random_forest\"\n",
"\n",
"random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
"\n",
"param_grid = {\n",
" \"model__n_estimators\": [10, 50, 100],\n",
" \"model__max_features\": [\"sqrt\", \"log2\"],\n",
" \"model__max_depth\": [5, 7, 10],\n",
" \"model__criterion\": [\"gini\", \"entropy\"],\n",
"}\n",
"\n",
"gs_optomizer = GridSearchCV(\n",
" estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
")\n",
"gs_optomizer.fit(X_train, y_train.values.ravel())\n",
"gs_optomizer.best_params_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обучение модели с новыми гиперпараметрами"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"optimized_model = ensemble.RandomForestClassifier(\n",
" random_state=random_state,\n",
" criterion=\"gini\",\n",
" max_depth=5,\n",
" max_features=\"log2\",\n",
" n_estimators=10,\n",
")\n",
"\n",
"result = {}\n",
"\n",
"result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
"result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
"result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
"result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
"\n",
"result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
"result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
"result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
"result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
"result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
"result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
"result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
"result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
"result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
"result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
"result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
"result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Формирование данных для оценки старой и новой версии модели"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=class_models[optimized_model_type]\n",
")\n",
"optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
" data=result\n",
")\n",
"optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
"optimized_metrics = optimized_metrics.set_index(\"Name\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка параметров старой и новой модели"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Precision_train | \n",
" Precision_test | \n",
" Recall_train | \n",
" Recall_test | \n",
" Accuracy_train | \n",
" Accuracy_test | \n",
" F1_train | \n",
" F1_test | \n",
"
\n",
" \n",
" Name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Old | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" New | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" \"Accuracy_train\",\n",
" \"Accuracy_test\",\n",
" \"F1_train\",\n",
" \"F1_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Precision_train\",\n",
" \"Precision_test\",\n",
" \"Recall_train\",\n",
" \"Recall_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" Accuracy_test | \n",
" F1_test | \n",
" ROC_AUC_test | \n",
" Cohen_kappa_test | \n",
" MCC_test | \n",
"
\n",
" \n",
" Name | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Old | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" New | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"optimized_metrics[\n",
" [\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" \"ROC_AUC_test\",\n",
" \"Cohen_kappa_test\",\n",
" \"MCC_test\",\n",
" ]\n",
"].style.background_gradient(\n",
" cmap=\"plasma\",\n",
" low=0.3,\n",
" high=1,\n",
" subset=[\n",
" \"ROC_AUC_test\",\n",
" \"MCC_test\",\n",
" \"Cohen_kappa_test\",\n",
" ],\n",
").background_gradient(\n",
" cmap=\"viridis\",\n",
" low=1,\n",
" high=0.3,\n",
" subset=[\n",
" \"Accuracy_test\",\n",
" \"F1_test\",\n",
" ],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
")\n",
"\n",
"for index in range(0, len(optimized_metrics)):\n",
" c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
" disp = ConfusionMatrixDisplay(\n",
" confusion_matrix=c_matrix, display_labels=[\"Less\", \"More\"]\n",
" ).plot(ax=ax.flat[index])\n",
"\n",
"plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Регрессионная модель"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
" 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
" dtype='object')\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6 | \n",
" 148 | \n",
" 72 | \n",
" 35 | \n",
" 0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 85 | \n",
" 66 | \n",
" 29 | \n",
" 0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 183 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 89 | \n",
" 66 | \n",
" 23 | \n",
" 94 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 137 | \n",
" 40 | \n",
" 35 | \n",
" 168 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 763 | \n",
" 10 | \n",
" 101 | \n",
" 76 | \n",
" 48 | \n",
" 180 | \n",
" 32.9 | \n",
" 0.171 | \n",
" 63 | \n",
" 0 | \n",
"
\n",
" \n",
" 764 | \n",
" 2 | \n",
" 122 | \n",
" 70 | \n",
" 27 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.340 | \n",
" 27 | \n",
" 0 | \n",
"
\n",
" \n",
" 765 | \n",
" 5 | \n",
" 121 | \n",
" 72 | \n",
" 23 | \n",
" 112 | \n",
" 26.2 | \n",
" 0.245 | \n",
" 30 | \n",
" 0 | \n",
"
\n",
" \n",
" 766 | \n",
" 1 | \n",
" 126 | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 30.1 | \n",
" 0.349 | \n",
" 47 | \n",
" 1 | \n",
"
\n",
" \n",
" 767 | \n",
" 1 | \n",
" 93 | \n",
" 70 | \n",
" 31 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.315 | \n",
" 23 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
768 rows × 9 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
".. ... ... ... ... ... ... \n",
"763 10 101 76 48 180 32.9 \n",
"764 2 122 70 27 0 36.8 \n",
"765 5 121 72 23 112 26.2 \n",
"766 1 126 60 0 0 30.1 \n",
"767 1 93 70 31 0 30.4 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 \n",
".. ... ... ... \n",
"763 0.171 63 0 \n",
"764 0.340 27 0 \n",
"765 0.245 30 0 \n",
"766 0.349 47 1 \n",
"767 0.315 23 0 \n",
"\n",
"[768 rows x 9 columns]"
]
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import set_config\n",
"\n",
"random_state=9\n",
"set_config(transform_output=\"pandas\")\n",
"df = pd.read_csv(\".//scv//diabetes.csv\")\n",
"print(df.columns)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разделение набора данных на обучающую и тестовые выборки"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'X_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
" 60 | \n",
" 2 | \n",
" 84 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.304 | \n",
" 21 | \n",
"
\n",
" \n",
" 618 | \n",
" 9 | \n",
" 112 | \n",
" 82 | \n",
" 24 | \n",
" 0 | \n",
" 28.2 | \n",
" 1.282 | \n",
" 50 | \n",
"
\n",
" \n",
" 346 | \n",
" 1 | \n",
" 139 | \n",
" 46 | \n",
" 19 | \n",
" 83 | \n",
" 28.7 | \n",
" 0.654 | \n",
" 22 | \n",
"
\n",
" \n",
" 294 | \n",
" 0 | \n",
" 161 | \n",
" 50 | \n",
" 0 | \n",
" 0 | \n",
" 21.9 | \n",
" 0.254 | \n",
" 65 | \n",
"
\n",
" \n",
" 231 | \n",
" 6 | \n",
" 134 | \n",
" 80 | \n",
" 37 | \n",
" 370 | \n",
" 46.2 | \n",
" 0.238 | \n",
" 46 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 71 | \n",
" 5 | \n",
" 139 | \n",
" 64 | \n",
" 35 | \n",
" 140 | \n",
" 28.6 | \n",
" 0.411 | \n",
" 26 | \n",
"
\n",
" \n",
" 106 | \n",
" 1 | \n",
" 96 | \n",
" 122 | \n",
" 0 | \n",
" 0 | \n",
" 22.4 | \n",
" 0.207 | \n",
" 27 | \n",
"
\n",
" \n",
" 270 | \n",
" 10 | \n",
" 101 | \n",
" 86 | \n",
" 37 | \n",
" 0 | \n",
" 45.6 | \n",
" 1.136 | \n",
" 38 | \n",
"
\n",
" \n",
" 435 | \n",
" 0 | \n",
" 141 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 42.4 | \n",
" 0.205 | \n",
" 29 | \n",
"
\n",
" \n",
" 102 | \n",
" 0 | \n",
" 125 | \n",
" 96 | \n",
" 0 | \n",
" 0 | \n",
" 22.5 | \n",
" 0.262 | \n",
" 21 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 8 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"60 2 84 0 0 0 0.0 \n",
"618 9 112 82 24 0 28.2 \n",
"346 1 139 46 19 83 28.7 \n",
"294 0 161 50 0 0 21.9 \n",
"231 6 134 80 37 370 46.2 \n",
".. ... ... ... ... ... ... \n",
"71 5 139 64 35 140 28.6 \n",
"106 1 96 122 0 0 22.4 \n",
"270 10 101 86 37 0 45.6 \n",
"435 0 141 0 0 0 42.4 \n",
"102 0 125 96 0 0 22.5 \n",
"\n",
" DiabetesPedigreeFunction Age \n",
"60 0.304 21 \n",
"618 1.282 50 \n",
"346 0.654 22 \n",
"294 0.254 65 \n",
"231 0.238 46 \n",
".. ... ... \n",
"71 0.411 26 \n",
"106 0.207 27 \n",
"270 1.136 38 \n",
"435 0.205 29 \n",
"102 0.262 21 \n",
"\n",
"[614 rows x 8 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_train'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 60 | \n",
" 0 | \n",
"
\n",
" \n",
" 618 | \n",
" 1 | \n",
"
\n",
" \n",
" 346 | \n",
" 0 | \n",
"
\n",
" \n",
" 294 | \n",
" 0 | \n",
"
\n",
" \n",
" 231 | \n",
" 1 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 71 | \n",
" 0 | \n",
"
\n",
" \n",
" 106 | \n",
" 0 | \n",
"
\n",
" \n",
" 270 | \n",
" 1 | \n",
"
\n",
" \n",
" 435 | \n",
" 1 | \n",
"
\n",
" \n",
" 102 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
614 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"60 0\n",
"618 1\n",
"346 0\n",
"294 0\n",
"231 1\n",
".. ...\n",
"71 0\n",
"106 0\n",
"270 1\n",
"435 1\n",
"102 0\n",
"\n",
"[614 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'X_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
" 668 | \n",
" 6 | \n",
" 98 | \n",
" 58 | \n",
" 33 | \n",
" 190 | \n",
" 34.0 | \n",
" 0.430 | \n",
" 43 | \n",
"
\n",
" \n",
" 324 | \n",
" 2 | \n",
" 112 | \n",
" 75 | \n",
" 32 | \n",
" 0 | \n",
" 35.7 | \n",
" 0.148 | \n",
" 21 | \n",
"
\n",
" \n",
" 624 | \n",
" 2 | \n",
" 108 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 30.8 | \n",
" 0.158 | \n",
" 21 | \n",
"
\n",
" \n",
" 690 | \n",
" 8 | \n",
" 107 | \n",
" 80 | \n",
" 0 | \n",
" 0 | \n",
" 24.6 | \n",
" 0.856 | \n",
" 34 | \n",
"
\n",
" \n",
" 473 | \n",
" 7 | \n",
" 136 | \n",
" 90 | \n",
" 0 | \n",
" 0 | \n",
" 29.9 | \n",
" 0.210 | \n",
" 50 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 355 | \n",
" 9 | \n",
" 165 | \n",
" 88 | \n",
" 0 | \n",
" 0 | \n",
" 30.4 | \n",
" 0.302 | \n",
" 49 | \n",
"
\n",
" \n",
" 534 | \n",
" 1 | \n",
" 77 | \n",
" 56 | \n",
" 30 | \n",
" 56 | \n",
" 33.3 | \n",
" 1.251 | \n",
" 24 | \n",
"
\n",
" \n",
" 344 | \n",
" 8 | \n",
" 95 | \n",
" 72 | \n",
" 0 | \n",
" 0 | \n",
" 36.8 | \n",
" 0.485 | \n",
" 57 | \n",
"
\n",
" \n",
" 296 | \n",
" 2 | \n",
" 146 | \n",
" 70 | \n",
" 38 | \n",
" 360 | \n",
" 28.0 | \n",
" 0.337 | \n",
" 29 | \n",
"
\n",
" \n",
" 462 | \n",
" 8 | \n",
" 74 | \n",
" 70 | \n",
" 40 | \n",
" 49 | \n",
" 35.3 | \n",
" 0.705 | \n",
" 39 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 8 columns
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"668 6 98 58 33 190 34.0 \n",
"324 2 112 75 32 0 35.7 \n",
"624 2 108 64 0 0 30.8 \n",
"690 8 107 80 0 0 24.6 \n",
"473 7 136 90 0 0 29.9 \n",
".. ... ... ... ... ... ... \n",
"355 9 165 88 0 0 30.4 \n",
"534 1 77 56 30 56 33.3 \n",
"344 8 95 72 0 0 36.8 \n",
"296 2 146 70 38 360 28.0 \n",
"462 8 74 70 40 49 35.3 \n",
"\n",
" DiabetesPedigreeFunction Age \n",
"668 0.430 43 \n",
"324 0.148 21 \n",
"624 0.158 21 \n",
"690 0.856 34 \n",
"473 0.210 50 \n",
".. ... ... \n",
"355 0.302 49 \n",
"534 1.251 24 \n",
"344 0.485 57 \n",
"296 0.337 29 \n",
"462 0.705 39 \n",
"\n",
"[154 rows x 8 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'y_test'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Outcome | \n",
"
\n",
" \n",
" \n",
" \n",
" 668 | \n",
" 0 | \n",
"
\n",
" \n",
" 324 | \n",
" 0 | \n",
"
\n",
" \n",
" 624 | \n",
" 0 | \n",
"
\n",
" \n",
" 690 | \n",
" 0 | \n",
"
\n",
" \n",
" 473 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 355 | \n",
" 1 | \n",
"
\n",
" \n",
" 534 | \n",
" 0 | \n",
"
\n",
" \n",
" 344 | \n",
" 0 | \n",
"
\n",
" \n",
" 296 | \n",
" 1 | \n",
"
\n",
" \n",
" 462 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
154 rows × 1 columns
\n",
"
"
],
"text/plain": [
" Outcome\n",
"668 0\n",
"324 0\n",
"624 0\n",
"690 0\n",
"473 0\n",
".. ...\n",
"355 1\n",
"534 0\n",
"344 0\n",
"296 1\n",
"462 0\n",
"\n",
"[154 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from typing import Tuple\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"def split_into_train_test(\n",
" df_input: DataFrame,\n",
" target_colname: str = \"above_average_close\",\n",
" frac_train: float = 0.8,\n",
" random_state: int = None,\n",
") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n",
" \n",
" if not (0 < frac_train < 1):\n",
" raise ValueError(\"Fraction must be between 0 and 1.\")\n",
" \n",
" # Проверка наличия целевого признака\n",
" if target_colname not in df_input.columns:\n",
" raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n",
" \n",
" # Разделяем данные на признаки и целевую переменную\n",
" X = df_input.drop(columns=[target_colname]) # Признаки\n",
" y = df_input[[target_colname]] # Целевая переменная\n",
"\n",
" # Разделяем данные на обучающую и тестовую выборки\n",
" X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y,\n",
" test_size=(1.0 - frac_train),\n",
" random_state=random_state\n",
" )\n",
" \n",
" return X_train, X_test, y_train, y_test\n",
"\n",
"# Применение функции для разделения данных\n",
"X_train, X_test, y_train, y_test = split_into_train_test(\n",
" df, \n",
" target_colname=\"Outcome\", \n",
" frac_train=0.8, \n",
" random_state=42 # Убедитесь, что вы задали нужное значение random_state\n",
")\n",
"\n",
"# Для отображения результатов\n",
"display(\"X_train\", X_train)\n",
"display(\"y_train\", y_train)\n",
"\n",
"display(\"X_test\", X_test)\n",
"display(\"y_test\", y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Определение перечня алгоритмов решения задачи аппроксимации (регрессии)"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
"\n",
"random_state = 9\n",
"\n",
"models = {\n",
" \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n",
" \"linear_poly\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(degree=2),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
" },\n",
" \"linear_interact\": {\n",
" \"model\": make_pipeline(\n",
" PolynomialFeatures(interaction_only=True),\n",
" linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
" )\n",
" },\n",
" \"ridge\": {\"model\": linear_model.RidgeCV()},\n",
" \"decision_tree\": {\n",
" \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n",
" },\n",
" \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n",
" \"random_forest\": {\n",
" \"model\": ensemble.RandomForestRegressor(\n",
" max_depth=7, random_state=random_state, n_jobs=-1\n",
" )\n",
" },\n",
" \"mlp\": {\n",
" \"model\": neural_network.MLPRegressor(\n",
" activation=\"tanh\",\n",
" hidden_layer_sizes=(3,),\n",
" max_iter=500,\n",
" early_stopping=True,\n",
" random_state=random_state,\n",
" )\n",
" },\n",
"}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: linear\n",
"Model: linear_poly\n",
"Model: linear_interact\n",
"Model: ridge\n",
"Model: decision_tree\n",
"Model: knn\n",
"Model: random_forest\n",
"Model: mlp\n"
]
}
],
"source": [
"import math\n",
"from pandas import DataFrame\n",
"from sklearn import metrics\n",
"\n",
"for model_name in models.keys():\n",
" print(f\"Model: {model_name}\")\n",
"\n",
" fitted_model = models[model_name][\"model\"].fit(\n",
" X_train.values, y_train.values.ravel()\n",
" )\n",
" y_train_pred = fitted_model.predict(X_train.values)\n",
" y_test_pred = fitted_model.predict(X_test.values)\n",
" models[model_name][\"fitted\"] = fitted_model\n",
" models[model_name][\"train_preds\"] = y_train_pred\n",
" models[model_name][\"preds\"] = y_test_pred\n",
" models[model_name][\"RMSE_train\"] = math.sqrt(\n",
" metrics.mean_squared_error(y_train, y_train_pred)\n",
" )\n",
" models[model_name][\"RMSE_test\"] = math.sqrt(\n",
" metrics.mean_squared_error(y_test, y_test_pred)\n",
" )\n",
" models[model_name][\"RMAE_test\"] = math.sqrt(\n",
" metrics.mean_absolute_error(y_test, y_test_pred)\n",
" )\n",
" models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод результатов оценки"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" \n",
" \n",
" | \n",
" RMSE_train | \n",
" RMSE_test | \n",
" RMAE_test | \n",
" R2_test | \n",
"
\n",
" \n",
" \n",
" \n",
" random_forest | \n",
" 0.240052 | \n",
" 0.405871 | \n",
" 0.559210 | \n",
" 0.282505 | \n",
"
\n",
" \n",
" linear | \n",
" 0.396793 | \n",
" 0.413576 | \n",
" 0.590024 | \n",
" 0.255003 | \n",
"
\n",
" \n",
" ridge | \n",
" 0.396822 | \n",
" 0.414236 | \n",
" 0.590431 | \n",
" 0.252623 | \n",
"
\n",
" \n",
" linear_poly | \n",
" 0.370076 | \n",
" 0.422852 | \n",
" 0.584147 | \n",
" 0.221209 | \n",
"
\n",
" \n",
" linear_interact | \n",
" 0.380128 | \n",
" 0.426815 | \n",
" 0.593532 | \n",
" 0.206543 | \n",
"
\n",
" \n",
" decision_tree | \n",
" 0.249880 | \n",
" 0.445708 | \n",
" 0.520376 | \n",
" 0.134743 | \n",
"
\n",
" \n",
" knn | \n",
" 0.373319 | \n",
" 0.450285 | \n",
" 0.592157 | \n",
" 0.116883 | \n",
"
\n",
" \n",
" mlp | \n",
" 0.623529 | \n",
" 0.544323 | \n",
" 0.658689 | \n",
" -0.290498 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
""
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"reg_metrics = pd.DataFrame.from_dict(models, \"index\")[\n",
" [\"RMSE_train\", \"RMSE_test\", \"RMAE_test\", \"R2_test\"]\n",
"]\n",
"reg_metrics.sort_values(by=\"RMSE_test\").style.background_gradient(\n",
" cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n",
").background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"RMAE_test\", \"R2_test\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Вывод реального и \"спрогнозированного\" результата для обучающей и тестовой выборок\n",
"\n",
"Получение лучшей модели\n"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'random_forest'"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"best_model = str(reg_metrics.sort_values(by=\"RMSE_test\").iloc[0].name)\n",
"\n",
"display(best_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод для обучающей выборки"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
" DiabetPred | \n",
"
\n",
" \n",
" \n",
" \n",
" 60 | \n",
" 2 | \n",
" 84 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 0.304 | \n",
" 21 | \n",
" 0 | \n",
" 0.001849 | \n",
"
\n",
" \n",
" 618 | \n",
" 9 | \n",
" 112 | \n",
" 82 | \n",
" 24 | \n",
" 0 | \n",
" 28.2 | \n",
" 1.282 | \n",
" 50 | \n",
" 1 | \n",
" 0.758997 | \n",
"
\n",
" \n",
" 346 | \n",
" 1 | \n",
" 139 | \n",
" 46 | \n",
" 19 | \n",
" 83 | \n",
" 28.7 | \n",
" 0.654 | \n",
" 22 | \n",
" 0 | \n",
" 0.149231 | \n",
"
\n",
" \n",
" 294 | \n",
" 0 | \n",
" 161 | \n",
" 50 | \n",
" 0 | \n",
" 0 | \n",
" 21.9 | \n",
" 0.254 | \n",
" 65 | \n",
" 0 | \n",
" 0.239564 | \n",
"
\n",
" \n",
" 231 | \n",
" 6 | \n",
" 134 | \n",
" 80 | \n",
" 37 | \n",
" 370 | \n",
" 46.2 | \n",
" 0.238 | \n",
" 46 | \n",
" 1 | \n",
" 0.773890 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"60 2 84 0 0 0 0.0 \n",
"618 9 112 82 24 0 28.2 \n",
"346 1 139 46 19 83 28.7 \n",
"294 0 161 50 0 0 21.9 \n",
"231 6 134 80 37 370 46.2 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome DiabetPred \n",
"60 0.304 21 0 0.001849 \n",
"618 1.282 50 1 0.758997 \n",
"346 0.654 22 0 0.149231 \n",
"294 0.254 65 0 0.239564 \n",
"231 0.238 46 1 0.773890 "
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(\n",
" [\n",
" X_train,\n",
" y_train,\n",
" pd.Series(\n",
" models[best_model][\"train_preds\"],\n",
" index=y_train.index,\n",
" name=\"DiabetPred\",\n",
" ),\n",
" ],\n",
" axis=1,\n",
").head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Вывод для тестовой выборки"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pregnancies | \n",
" Glucose | \n",
" BloodPressure | \n",
" SkinThickness | \n",
" Insulin | \n",
" BMI | \n",
" DiabetesPedigreeFunction | \n",
" Age | \n",
" Outcome | \n",
" DiabetPred | \n",
"
\n",
" \n",
" \n",
" \n",
" 668 | \n",
" 6 | \n",
" 98 | \n",
" 58 | \n",
" 33 | \n",
" 190 | \n",
" 34.0 | \n",
" 0.430 | \n",
" 43 | \n",
" 0 | \n",
" 0.516537 | \n",
"
\n",
" \n",
" 324 | \n",
" 2 | \n",
" 112 | \n",
" 75 | \n",
" 32 | \n",
" 0 | \n",
" 35.7 | \n",
" 0.148 | \n",
" 21 | \n",
" 0 | \n",
" 0.205507 | \n",
"
\n",
" \n",
" 624 | \n",
" 2 | \n",
" 108 | \n",
" 64 | \n",
" 0 | \n",
" 0 | \n",
" 30.8 | \n",
" 0.158 | \n",
" 21 | \n",
" 0 | \n",
" 0.047710 | \n",
"
\n",
" \n",
" 690 | \n",
" 8 | \n",
" 107 | \n",
" 80 | \n",
" 0 | \n",
" 0 | \n",
" 24.6 | \n",
" 0.856 | \n",
" 34 | \n",
" 0 | \n",
" 0.128867 | \n",
"
\n",
" \n",
" 473 | \n",
" 7 | \n",
" 136 | \n",
" 90 | \n",
" 0 | \n",
" 0 | \n",
" 29.9 | \n",
" 0.210 | \n",
" 50 | \n",
" 0 | \n",
" 0.438512 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"668 6 98 58 33 190 34.0 \n",
"324 2 112 75 32 0 35.7 \n",
"624 2 108 64 0 0 30.8 \n",
"690 8 107 80 0 0 24.6 \n",
"473 7 136 90 0 0 29.9 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome DiabetPred \n",
"668 0.430 43 0 0.516537 \n",
"324 0.148 21 0 0.205507 \n",
"624 0.158 21 0 0.047710 \n",
"690 0.856 34 0 0.128867 \n",
"473 0.210 50 0 0.438512 "
]
},
"execution_count": 174,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat(\n",
" [\n",
" X_test,\n",
" y_test,\n",
" pd.Series(\n",
" models[best_model][\"preds\"],\n",
" index=y_test.index,\n",
" name=\"DiabetPred\",\n",
" ),\n",
" ],\n",
" axis=1,\n",
").head(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}