diff --git a/Lab4/lab4.ipynb b/Lab4/lab4.ipynb
new file mode 100644
index 0000000..84244af
--- /dev/null
+++ b/Lab4/lab4.ipynb
@@ -0,0 +1,7580 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Лабораторная 4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Информация о диабете индейцев Пима"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 267,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
+ " 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 148 | \n",
+ " 72 | \n",
+ " 35 | \n",
+ " 0 | \n",
+ " 33.6 | \n",
+ " 0.627 | \n",
+ " 50 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 85 | \n",
+ " 66 | \n",
+ " 29 | \n",
+ " 0 | \n",
+ " 26.6 | \n",
+ " 0.351 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 183 | \n",
+ " 64 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 23.3 | \n",
+ " 0.672 | \n",
+ " 32 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 89 | \n",
+ " 66 | \n",
+ " 23 | \n",
+ " 94 | \n",
+ " 28.1 | \n",
+ " 0.167 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 137 | \n",
+ " 40 | \n",
+ " 35 | \n",
+ " 168 | \n",
+ " 43.1 | \n",
+ " 2.288 | \n",
+ " 33 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 763 | \n",
+ " 10 | \n",
+ " 101 | \n",
+ " 76 | \n",
+ " 48 | \n",
+ " 180 | \n",
+ " 32.9 | \n",
+ " 0.171 | \n",
+ " 63 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 764 | \n",
+ " 2 | \n",
+ " 122 | \n",
+ " 70 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ " 36.8 | \n",
+ " 0.340 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 765 | \n",
+ " 5 | \n",
+ " 121 | \n",
+ " 72 | \n",
+ " 23 | \n",
+ " 112 | \n",
+ " 26.2 | \n",
+ " 0.245 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 766 | \n",
+ " 1 | \n",
+ " 126 | \n",
+ " 60 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30.1 | \n",
+ " 0.349 | \n",
+ " 47 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 767 | \n",
+ " 1 | \n",
+ " 93 | \n",
+ " 70 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 30.4 | \n",
+ " 0.315 | \n",
+ " 23 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
768 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "0 6 148 72 35 0 33.6 \n",
+ "1 1 85 66 29 0 26.6 \n",
+ "2 8 183 64 0 0 23.3 \n",
+ "3 1 89 66 23 94 28.1 \n",
+ "4 0 137 40 35 168 43.1 \n",
+ ".. ... ... ... ... ... ... \n",
+ "763 10 101 76 48 180 32.9 \n",
+ "764 2 122 70 27 0 36.8 \n",
+ "765 5 121 72 23 112 26.2 \n",
+ "766 1 126 60 0 0 30.1 \n",
+ "767 1 93 70 31 0 30.4 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "0 0.627 50 1 \n",
+ "1 0.351 31 0 \n",
+ "2 0.672 32 1 \n",
+ "3 0.167 21 0 \n",
+ "4 2.288 33 1 \n",
+ ".. ... ... ... \n",
+ "763 0.171 63 0 \n",
+ "764 0.340 27 0 \n",
+ "765 0.245 30 0 \n",
+ "766 0.349 47 1 \n",
+ "767 0.315 23 0 \n",
+ "\n",
+ "[768 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 267,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "df = pd.read_csv(\".//scv//diabetes.csv\")\n",
+ "print(df.columns)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование выборок"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 268,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " 1 | \n",
+ " 105 | \n",
+ " 58 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 24.3 | \n",
+ " 0.187 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 4 | \n",
+ " 146 | \n",
+ " 85 | \n",
+ " 27 | \n",
+ " 100 | \n",
+ " 28.9 | \n",
+ " 0.189 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " 3 | \n",
+ " 80 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.174 | \n",
+ " 22 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " 5 | \n",
+ " 88 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ " 27.6 | \n",
+ " 0.258 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " 2 | \n",
+ " 120 | \n",
+ " 54 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 26.8 | \n",
+ " 0.455 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 70 | \n",
+ " 20 | \n",
+ " 0 | \n",
+ " 27.4 | \n",
+ " 0.254 | \n",
+ " 36 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 0 | \n",
+ " 95 | \n",
+ " 85 | \n",
+ " 25 | \n",
+ " 36 | \n",
+ " 37.4 | \n",
+ " 0.247 | \n",
+ " 24 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 97 | \n",
+ " 66 | \n",
+ " 15 | \n",
+ " 140 | \n",
+ " 23.2 | \n",
+ " 0.487 | \n",
+ " 22 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " 1 | \n",
+ " 117 | \n",
+ " 60 | \n",
+ " 23 | \n",
+ " 106 | \n",
+ " 33.8 | \n",
+ " 0.466 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " 3 | \n",
+ " 107 | \n",
+ " 62 | \n",
+ " 13 | \n",
+ " 48 | \n",
+ " 22.9 | \n",
+ " 0.678 | \n",
+ " 23 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "196 1 105 58 0 0 24.3 \n",
+ "69 4 146 85 27 100 28.9 \n",
+ "494 3 80 0 0 0 0.0 \n",
+ "463 5 88 78 30 0 27.6 \n",
+ "653 2 120 54 0 0 26.8 \n",
+ ".. ... ... ... ... ... ... \n",
+ "322 0 124 70 20 0 27.4 \n",
+ "109 0 95 85 25 36 37.4 \n",
+ "27 1 97 66 15 140 23.2 \n",
+ "651 1 117 60 23 106 33.8 \n",
+ "197 3 107 62 13 48 22.9 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "196 0.187 21 0 \n",
+ "69 0.189 27 0 \n",
+ "494 0.174 22 0 \n",
+ "463 0.258 37 0 \n",
+ "653 0.455 27 0 \n",
+ ".. ... ... ... \n",
+ "322 0.254 36 1 \n",
+ "109 0.247 24 1 \n",
+ "27 0.487 22 0 \n",
+ "651 0.466 27 0 \n",
+ "197 0.678 23 1 \n",
+ "\n",
+ "[614 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "196 0\n",
+ "69 0\n",
+ "494 0\n",
+ "463 0\n",
+ "653 0\n",
+ ".. ...\n",
+ "322 1\n",
+ "109 1\n",
+ "27 0\n",
+ "651 0\n",
+ "197 1\n",
+ "\n",
+ "[614 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 669 | \n",
+ " 9 | \n",
+ " 154 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 100 | \n",
+ " 30.9 | \n",
+ " 0.164 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 379 | \n",
+ " 0 | \n",
+ " 93 | \n",
+ " 100 | \n",
+ " 39 | \n",
+ " 72 | \n",
+ " 43.4 | \n",
+ " 1.021 | \n",
+ " 35 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 640 | \n",
+ " 0 | \n",
+ " 102 | \n",
+ " 86 | \n",
+ " 17 | \n",
+ " 105 | \n",
+ " 29.3 | \n",
+ " 0.695 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 11 | \n",
+ " 127 | \n",
+ " 106 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 39.0 | \n",
+ " 0.190 | \n",
+ " 51 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 304 | \n",
+ " 3 | \n",
+ " 150 | \n",
+ " 76 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 21.0 | \n",
+ " 0.207 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 203 | \n",
+ " 2 | \n",
+ " 99 | \n",
+ " 70 | \n",
+ " 16 | \n",
+ " 44 | \n",
+ " 20.4 | \n",
+ " 0.235 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 605 | \n",
+ " 1 | \n",
+ " 124 | \n",
+ " 60 | \n",
+ " 32 | \n",
+ " 0 | \n",
+ " 35.8 | \n",
+ " 0.514 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 561 | \n",
+ " 0 | \n",
+ " 198 | \n",
+ " 66 | \n",
+ " 32 | \n",
+ " 274 | \n",
+ " 41.3 | \n",
+ " 0.502 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 0 | \n",
+ " 146 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37.9 | \n",
+ " 0.334 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 103 | \n",
+ " 1 | \n",
+ " 81 | \n",
+ " 72 | \n",
+ " 18 | \n",
+ " 40 | \n",
+ " 26.6 | \n",
+ " 0.283 | \n",
+ " 24 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "669 9 154 78 30 100 30.9 \n",
+ "379 0 93 100 39 72 43.4 \n",
+ "640 0 102 86 17 105 29.3 \n",
+ "658 11 127 106 0 0 39.0 \n",
+ "304 3 150 76 0 0 21.0 \n",
+ ".. ... ... ... ... ... ... \n",
+ "203 2 99 70 16 44 20.4 \n",
+ "605 1 124 60 32 0 35.8 \n",
+ "561 0 198 66 32 274 41.3 \n",
+ "280 0 146 70 0 0 37.9 \n",
+ "103 1 81 72 18 40 26.6 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "669 0.164 45 0 \n",
+ "379 1.021 35 0 \n",
+ "640 0.695 27 0 \n",
+ "658 0.190 51 0 \n",
+ "304 0.207 37 0 \n",
+ ".. ... ... ... \n",
+ "203 0.235 27 0 \n",
+ "605 0.514 21 0 \n",
+ "561 0.502 28 1 \n",
+ "280 0.334 28 1 \n",
+ "103 0.283 24 0 \n",
+ "\n",
+ "[154 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 669 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 379 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 640 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 304 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 203 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 605 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 561 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 103 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "669 0\n",
+ "379 0\n",
+ "640 0\n",
+ "658 0\n",
+ "304 0\n",
+ ".. ...\n",
+ "203 0\n",
+ "605 0\n",
+ "561 1\n",
+ "280 1\n",
+ "103 0\n",
+ "\n",
+ "[154 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from typing import Tuple\n",
+ "import pandas as pd\n",
+ "from pandas import DataFrame\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "def split_stratified_into_train_val_test(\n",
+ " df_input,\n",
+ " stratify_colname=\"y\",\n",
+ " frac_train=0.6,\n",
+ " frac_val=0.15,\n",
+ " frac_test=0.25,\n",
+ " random_state=None,\n",
+ ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+ " \n",
+ " if frac_train + frac_val + frac_test != 1.0:\n",
+ " raise ValueError(\n",
+ " \"fractions %f, %f, %f do not add up to 1.0\"\n",
+ " % (frac_train, frac_val, frac_test)\n",
+ " )\n",
+ " if stratify_colname not in df_input.columns:\n",
+ " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+ " X = df_input # Contains all columns.\n",
+ " y = df_input[\n",
+ " [stratify_colname]\n",
+ " ] # Dataframe of just the column on which to stratify.\n",
+ " # Split original dataframe into train and temp dataframes.\n",
+ " df_train, df_temp, y_train, y_temp = train_test_split(\n",
+ " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+ " )\n",
+ " if frac_val <= 0:\n",
+ " assert len(df_input) == len(df_train) + len(df_temp)\n",
+ " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+ " # Split the temp dataframe into val and test dataframes.\n",
+ " relative_frac_test = frac_test / (frac_val + frac_test)\n",
+ " df_val, df_test, y_val, y_test = train_test_split(\n",
+ " df_temp,\n",
+ " y_temp,\n",
+ " stratify=y_temp,\n",
+ " test_size=relative_frac_test,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+ " return df_train, df_val, df_test, y_train, y_val, y_test\n",
+ "\n",
+ "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+ " df, stratify_colname=\"Outcome\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=9\n",
+ ")\n",
+ "\n",
+ "display(\"X_train\", X_train)\n",
+ "display(\"y_train\", y_train)\n",
+ "\n",
+ "display(\"X_test\", X_test)\n",
+ "display(\"y_test\", y_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 269,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Пропущенные значения по столбцам:\n",
+ "Pregnancies 0\n",
+ "Glucose 0\n",
+ "BloodPressure 0\n",
+ "SkinThickness 0\n",
+ "Insulin 0\n",
+ "BMI 0\n",
+ "DiabetesPedigreeFunction 0\n",
+ "Age 0\n",
+ "Outcome 0\n",
+ "dtype: int64\n",
+ "\n",
+ "Статистический обзор данных:\n",
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
+ "count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
+ "mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
+ "std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
+ "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
+ "25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
+ "50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
+ "75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
+ "max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
+ "\n",
+ " BMI DiabetesPedigreeFunction Age Outcome \n",
+ "count 768.000000 768.000000 768.000000 768.000000 \n",
+ "mean 31.992578 0.471876 33.240885 0.348958 \n",
+ "std 7.884160 0.331329 11.760232 0.476951 \n",
+ "min 0.000000 0.078000 21.000000 0.000000 \n",
+ "25% 27.300000 0.243750 24.000000 0.000000 \n",
+ "50% 32.000000 0.372500 29.000000 0.000000 \n",
+ "75% 36.600000 0.626250 41.000000 1.000000 \n",
+ "max 67.100000 2.420000 81.000000 1.000000 \n"
+ ]
+ }
+ ],
+ "source": [
+ "null_values = df.isnull().sum()\n",
+ "print(\"Пропущенные значения по столбцам:\")\n",
+ "print(null_values)\n",
+ "\n",
+ "stat_summary = df.describe()\n",
+ "print(\"\\nСтатистический обзор данных:\")\n",
+ "print(stat_summary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 270,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Выбросы в датасете:\n",
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "4 0 137 40 35 168 43.1 \n",
+ "12 10 139 80 0 0 27.1 \n",
+ "39 4 111 72 47 207 37.1 \n",
+ "45 0 180 66 39 0 42.0 \n",
+ "58 0 146 82 0 0 40.5 \n",
+ "100 1 163 72 0 0 39.0 \n",
+ "147 2 106 64 35 119 30.5 \n",
+ "187 1 128 98 41 58 32.0 \n",
+ "218 5 85 74 22 0 29.0 \n",
+ "228 4 197 70 39 744 36.7 \n",
+ "243 6 119 50 22 176 27.1 \n",
+ "245 9 184 85 15 0 30.0 \n",
+ "259 11 155 76 28 150 33.3 \n",
+ "292 2 128 78 37 182 43.3 \n",
+ "308 0 128 68 19 180 30.5 \n",
+ "330 8 118 72 19 0 23.1 \n",
+ "370 3 173 82 48 465 38.4 \n",
+ "371 0 118 64 23 89 0.0 \n",
+ "383 1 90 62 18 59 25.1 \n",
+ "395 2 127 58 24 275 27.7 \n",
+ "445 0 180 78 63 14 59.4 \n",
+ "534 1 77 56 30 56 33.3 \n",
+ "593 2 82 52 22 115 28.5 \n",
+ "606 1 181 78 42 293 40.0 \n",
+ "618 9 112 82 24 0 28.2 \n",
+ "621 2 92 76 20 0 24.2 \n",
+ "622 6 183 94 0 0 40.8 \n",
+ "659 3 80 82 31 70 34.2 \n",
+ "661 1 199 76 43 0 42.9 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "4 2.288 33 1 \n",
+ "12 1.441 57 0 \n",
+ "39 1.390 56 1 \n",
+ "45 1.893 25 1 \n",
+ "58 1.781 44 0 \n",
+ "100 1.222 33 1 \n",
+ "147 1.400 34 0 \n",
+ "187 1.321 33 1 \n",
+ "218 1.224 32 1 \n",
+ "228 2.329 31 0 \n",
+ "243 1.318 33 1 \n",
+ "245 1.213 49 1 \n",
+ "259 1.353 51 1 \n",
+ "292 1.224 31 1 \n",
+ "308 1.391 25 1 \n",
+ "330 1.476 46 0 \n",
+ "370 2.137 25 1 \n",
+ "371 1.731 21 0 \n",
+ "383 1.268 25 0 \n",
+ "395 1.600 25 0 \n",
+ "445 2.420 25 1 \n",
+ "534 1.251 24 0 \n",
+ "593 1.699 25 0 \n",
+ "606 1.258 22 1 \n",
+ "618 1.282 50 1 \n",
+ "621 1.698 28 0 \n",
+ "622 1.461 45 0 \n",
+ "659 1.292 27 1 \n",
+ "661 1.394 22 1 \n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "Q1 = df[\"DiabetesPedigreeFunction\"].quantile(0.25)\n",
+ "Q3 = df[\"DiabetesPedigreeFunction\"].quantile(0.75)\n",
+ "\n",
+ "IQR = Q3 - Q1\n",
+ "\n",
+ "threshold = 1.5 * IQR\n",
+ "lower_bound = Q1 - threshold\n",
+ "upper_bound = Q3 + threshold\n",
+ "\n",
+ "outliers = (df[\"DiabetesPedigreeFunction\"] < lower_bound) | (df[\"DiabetesPedigreeFunction\"] > upper_bound)\n",
+ "\n",
+ "# Вывод выбросов\n",
+ "print(\"Выбросы в датасете:\")\n",
+ "print(df[outliers])\n",
+ "\n",
+ "# Заменяем выбросы на медианные значения\n",
+ "median_score = df[\"DiabetesPedigreeFunction\"].median()\n",
+ "df.loc[outliers, \"DiabetesPedigreeFunction\"] = median_score\n",
+ "\n",
+ "# Визуализация данных после обработки\n",
+ "plt.figure(figsize=(10, 6))\n",
+ "plt.scatter(df['DiabetesPedigreeFunction'], df['Age'])\n",
+ "plt.xlabel('Функция родословной диабета')\n",
+ "plt.ylabel('Возраст')\n",
+ "plt.title('Диаграмма рассеивания после чистки')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Классификация данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 271,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.discriminant_analysis import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "\n",
+ "\n",
+ "columns_to_drop = [\"Pregnancies\", \"SkinThickness\", \"BloodPressure\", \"Outcome\", \"DiabetesPedigreeFunction\"]\n",
+ "num_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype != \"object\"\n",
+ "]\n",
+ "cat_columns = [\n",
+ " column\n",
+ " for column in df.columns\n",
+ " if column not in columns_to_drop and df[column].dtype == \"object\"\n",
+ "]\n",
+ "\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", num_imputer),\n",
+ " (\"scaler\", num_scaler),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ "preprocessing_cat = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", cat_imputer),\n",
+ " (\"encoder\", cat_encoder),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ " ],\n",
+ " remainder=\"passthrough\"\n",
+ ")\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Проверка работы конвеера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 272,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Glucose | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " -0.478144 | \n",
+ " -0.688684 | \n",
+ " -0.946400 | \n",
+ " -1.029257 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 0.818506 | \n",
+ " 0.180416 | \n",
+ " -0.377190 | \n",
+ " -0.522334 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " -1.268784 | \n",
+ " -0.688684 | \n",
+ " -3.953317 | \n",
+ " -0.944770 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " -1.015779 | \n",
+ " -0.688684 | \n",
+ " -0.538054 | \n",
+ " 0.322537 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " -0.003760 | \n",
+ " -0.688684 | \n",
+ " -0.637047 | \n",
+ " -0.522334 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 0.122742 | \n",
+ " -0.688684 | \n",
+ " -0.562802 | \n",
+ " 0.238050 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " -0.794400 | \n",
+ " -0.375808 | \n",
+ " 0.674613 | \n",
+ " -0.775796 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " -0.731149 | \n",
+ " 0.528056 | \n",
+ " -1.082516 | \n",
+ " -0.944770 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " -0.098637 | \n",
+ " 0.232562 | \n",
+ " 0.229143 | \n",
+ " -0.522334 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " -0.414893 | \n",
+ " -0.271516 | \n",
+ " -1.119638 | \n",
+ " -0.860283 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Glucose Insulin BMI Age\n",
+ "196 -0.478144 -0.688684 -0.946400 -1.029257\n",
+ "69 0.818506 0.180416 -0.377190 -0.522334\n",
+ "494 -1.268784 -0.688684 -3.953317 -0.944770\n",
+ "463 -1.015779 -0.688684 -0.538054 0.322537\n",
+ "653 -0.003760 -0.688684 -0.637047 -0.522334\n",
+ ".. ... ... ... ...\n",
+ "322 0.122742 -0.688684 -0.562802 0.238050\n",
+ "109 -0.794400 -0.375808 0.674613 -0.775796\n",
+ "27 -0.731149 0.528056 -1.082516 -0.944770\n",
+ "651 -0.098637 0.232562 0.229143 -0.522334\n",
+ "197 -0.414893 -0.271516 -1.119638 -0.860283\n",
+ "\n",
+ "[614 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 272,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование набора моделей для классификации"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 273,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
+ "\n",
+ "class_models = {\n",
+ " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
+ " # \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
+ " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
+ " \"decision_tree\": {\n",
+ " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=9)\n",
+ " },\n",
+ " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
+ " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
+ " \"gradient_boosting\": {\n",
+ " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
+ " },\n",
+ " \"random_forest\": {\n",
+ " \"model\": ensemble.RandomForestClassifier(\n",
+ " max_depth=11, class_weight=\"balanced\", random_state=9\n",
+ " )\n",
+ " },\n",
+ " \"mlp\": {\n",
+ " \"model\": neural_network.MLPClassifier(\n",
+ " hidden_layer_sizes=(7,),\n",
+ " max_iter=500,\n",
+ " early_stopping=True,\n",
+ " random_state=9,\n",
+ " )\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Обучение моделей на обучающем наборе данных и оценка на тестовом"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 274,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: logistic\n",
+ "Model: ridge\n",
+ "Model: decision_tree\n",
+ "Model: knn\n",
+ "Model: naive_bayes\n",
+ "Model: gradient_boosting\n",
+ "Model: random_forest\n",
+ "Model: mlp\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn import metrics\n",
+ "\n",
+ "for model_name in class_models.keys():\n",
+ " print(f\"Model: {model_name}\")\n",
+ " model = class_models[model_name][\"model\"]\n",
+ "\n",
+ " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
+ " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ " y_train_predict = model_pipeline.predict(X_train)\n",
+ " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
+ " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
+ "\n",
+ " class_models[model_name][\"pipeline\"] = model_pipeline\n",
+ " class_models[model_name][\"probs\"] = y_test_probs\n",
+ " class_models[model_name][\"preds\"] = y_test_predict\n",
+ "\n",
+ " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
+ " y_test, y_test_probs\n",
+ " )\n",
+ " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
+ " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
+ " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
+ " y_test, y_test_predict\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Сводная таблица оценок качества для использованных моделей классификации\n",
+ "\n",
+ "Матрица неточностей"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 275,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import ConfusionMatrixDisplay\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
+ "for index, key in enumerate(class_models.keys()):\n",
+ " c_matrix = class_models[key][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ " disp.ax_.set_title(key)\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Точность, полнота, верность (аккуратность), F-мера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 276,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Precision_train | \n",
+ " Precision_test | \n",
+ " Recall_train | \n",
+ " Recall_test | \n",
+ " Accuracy_train | \n",
+ " Accuracy_test | \n",
+ " F1_train | \n",
+ " F1_test | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " logistic | \n",
+ " 0.710843 | \n",
+ " 0.714286 | \n",
+ " 0.551402 | \n",
+ " 0.648148 | \n",
+ " 0.765472 | \n",
+ " 0.785714 | \n",
+ " 0.621053 | \n",
+ " 0.679612 | \n",
+ "
\n",
+ " \n",
+ " random_forest | \n",
+ " 0.977169 | \n",
+ " 0.666667 | \n",
+ " 1.000000 | \n",
+ " 0.777778 | \n",
+ " 0.991857 | \n",
+ " 0.785714 | \n",
+ " 0.988453 | \n",
+ " 0.717949 | \n",
+ "
\n",
+ " \n",
+ " naive_bayes | \n",
+ " 0.702532 | \n",
+ " 0.708333 | \n",
+ " 0.518692 | \n",
+ " 0.629630 | \n",
+ " 0.755700 | \n",
+ " 0.779221 | \n",
+ " 0.596774 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " gradient_boosting | \n",
+ " 0.941463 | \n",
+ " 0.642857 | \n",
+ " 0.901869 | \n",
+ " 0.666667 | \n",
+ " 0.946254 | \n",
+ " 0.753247 | \n",
+ " 0.921241 | \n",
+ " 0.654545 | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " 0.716346 | \n",
+ " 0.584615 | \n",
+ " 0.696262 | \n",
+ " 0.703704 | \n",
+ " 0.798046 | \n",
+ " 0.720779 | \n",
+ " 0.706161 | \n",
+ " 0.638655 | \n",
+ "
\n",
+ " \n",
+ " ridge | \n",
+ " 0.610442 | \n",
+ " 0.561644 | \n",
+ " 0.710280 | \n",
+ " 0.759259 | \n",
+ " 0.741042 | \n",
+ " 0.707792 | \n",
+ " 0.656587 | \n",
+ " 0.645669 | \n",
+ "
\n",
+ " \n",
+ " decision_tree | \n",
+ " 0.793860 | \n",
+ " 0.552632 | \n",
+ " 0.845794 | \n",
+ " 0.777778 | \n",
+ " 0.869707 | \n",
+ " 0.701299 | \n",
+ " 0.819005 | \n",
+ " 0.646154 | \n",
+ "
\n",
+ " \n",
+ " mlp | \n",
+ " 0.379576 | \n",
+ " 0.376000 | \n",
+ " 0.920561 | \n",
+ " 0.870370 | \n",
+ " 0.447883 | \n",
+ " 0.448052 | \n",
+ " 0.537517 | \n",
+ " 0.525140 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 276,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(\n",
+ " by=\"Accuracy_test\", ascending=False\n",
+ ").style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 277,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Accuracy_test | \n",
+ " F1_test | \n",
+ " ROC_AUC_test | \n",
+ " Cohen_kappa_test | \n",
+ " MCC_test | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " random_forest | \n",
+ " 0.785714 | \n",
+ " 0.717949 | \n",
+ " 0.867222 | \n",
+ " 0.546816 | \n",
+ " 0.551041 | \n",
+ "
\n",
+ " \n",
+ " gradient_boosting | \n",
+ " 0.753247 | \n",
+ " 0.654545 | \n",
+ " 0.845741 | \n",
+ " 0.462725 | \n",
+ " 0.462910 | \n",
+ "
\n",
+ " \n",
+ " logistic | \n",
+ " 0.785714 | \n",
+ " 0.679612 | \n",
+ " 0.835556 | \n",
+ " 0.519205 | \n",
+ " 0.520588 | \n",
+ "
\n",
+ " \n",
+ " ridge | \n",
+ " 0.707792 | \n",
+ " 0.645669 | \n",
+ " 0.833889 | \n",
+ " 0.406373 | \n",
+ " 0.419772 | \n",
+ "
\n",
+ " \n",
+ " naive_bayes | \n",
+ " 0.779221 | \n",
+ " 0.666667 | \n",
+ " 0.822593 | \n",
+ " 0.502471 | \n",
+ " 0.504419 | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " 0.720779 | \n",
+ " 0.638655 | \n",
+ " 0.806296 | \n",
+ " 0.414293 | \n",
+ " 0.419023 | \n",
+ "
\n",
+ " \n",
+ " decision_tree | \n",
+ " 0.701299 | \n",
+ " 0.646154 | \n",
+ " 0.794167 | \n",
+ " 0.400271 | \n",
+ " 0.417827 | \n",
+ "
\n",
+ " \n",
+ " mlp | \n",
+ " 0.448052 | \n",
+ " 0.525140 | \n",
+ " 0.603333 | \n",
+ " 0.069387 | \n",
+ " 0.110298 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 277,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 278,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'random_forest'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
+ "\n",
+ "display(best_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Вывод данных с ошибкой предсказания для оценки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 279,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Error items count: 33'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Predicted | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 46 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 146 | \n",
+ " 56 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 29.7 | \n",
+ " 0.564 | \n",
+ " 29 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 86 | \n",
+ " 13 | \n",
+ " 1 | \n",
+ " 106 | \n",
+ " 72 | \n",
+ " 54 | \n",
+ " 0 | \n",
+ " 36.6 | \n",
+ " 0.178 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 91 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 123 | \n",
+ " 80 | \n",
+ " 15 | \n",
+ " 176 | \n",
+ " 32.0 | \n",
+ " 0.443 | \n",
+ " 34 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 95 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 144 | \n",
+ " 72 | \n",
+ " 27 | \n",
+ " 228 | \n",
+ " 33.9 | \n",
+ " 0.255 | \n",
+ " 40 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 125 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 88 | \n",
+ " 30 | \n",
+ " 42 | \n",
+ " 99 | \n",
+ " 55.0 | \n",
+ " 0.496 | \n",
+ " 26 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 167 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 120 | \n",
+ " 68 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 29.6 | \n",
+ " 0.709 | \n",
+ " 34 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 188 | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 109 | \n",
+ " 76 | \n",
+ " 39 | \n",
+ " 114 | \n",
+ " 27.9 | \n",
+ " 0.640 | \n",
+ " 31 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 204 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 103 | \n",
+ " 72 | \n",
+ " 32 | \n",
+ " 190 | \n",
+ " 37.7 | \n",
+ " 0.324 | \n",
+ " 55 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 228 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 197 | \n",
+ " 70 | \n",
+ " 39 | \n",
+ " 744 | \n",
+ " 36.7 | \n",
+ " 2.329 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 274 | \n",
+ " 13 | \n",
+ " 1 | \n",
+ " 106 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 34.2 | \n",
+ " 0.251 | \n",
+ " 52 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 146 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37.9 | \n",
+ " 0.334 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 282 | \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 133 | \n",
+ " 88 | \n",
+ " 15 | \n",
+ " 155 | \n",
+ " 32.4 | \n",
+ " 0.262 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 309 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 68 | \n",
+ " 28 | \n",
+ " 205 | \n",
+ " 32.9 | \n",
+ " 0.875 | \n",
+ " 30 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 335 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 165 | \n",
+ " 76 | \n",
+ " 43 | \n",
+ " 255 | \n",
+ " 47.9 | \n",
+ " 0.259 | \n",
+ " 26 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 363 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 146 | \n",
+ " 78 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 38.5 | \n",
+ " 0.520 | \n",
+ " 67 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 131 | \n",
+ " 66 | \n",
+ " 40 | \n",
+ " 0 | \n",
+ " 34.3 | \n",
+ " 0.196 | \n",
+ " 22 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 510 | \n",
+ " 12 | \n",
+ " 0 | \n",
+ " 84 | \n",
+ " 72 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 29.7 | \n",
+ " 0.297 | \n",
+ " 46 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 517 | \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 125 | \n",
+ " 86 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37.6 | \n",
+ " 0.304 | \n",
+ " 51 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 536 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 105 | \n",
+ " 90 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 29.6 | \n",
+ " 0.197 | \n",
+ " 46 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 541 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 128 | \n",
+ " 72 | \n",
+ " 25 | \n",
+ " 190 | \n",
+ " 32.4 | \n",
+ " 0.549 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 549 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 189 | \n",
+ " 110 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 28.5 | \n",
+ " 0.680 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 568 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 154 | \n",
+ " 72 | \n",
+ " 29 | \n",
+ " 126 | \n",
+ " 31.3 | \n",
+ " 0.338 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 577 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 118 | \n",
+ " 80 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 42.9 | \n",
+ " 0.693 | \n",
+ " 21 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 583 | \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 100 | \n",
+ " 76 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 38.7 | \n",
+ " 0.190 | \n",
+ " 42 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 590 | \n",
+ " 11 | \n",
+ " 0 | \n",
+ " 111 | \n",
+ " 84 | \n",
+ " 40 | \n",
+ " 0 | \n",
+ " 46.8 | \n",
+ " 0.925 | \n",
+ " 45 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 594 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 123 | \n",
+ " 72 | \n",
+ " 45 | \n",
+ " 230 | \n",
+ " 33.6 | \n",
+ " 0.733 | \n",
+ " 34 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 622 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 183 | \n",
+ " 94 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40.8 | \n",
+ " 1.461 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 630 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 114 | \n",
+ " 64 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 27.4 | \n",
+ " 0.732 | \n",
+ " 34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 11 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ " 106 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 39.0 | \n",
+ " 0.190 | \n",
+ " 51 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 669 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 154 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 100 | \n",
+ " 30.9 | \n",
+ " 0.164 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 725 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 112 | \n",
+ " 78 | \n",
+ " 40 | \n",
+ " 0 | \n",
+ " 39.4 | \n",
+ " 0.236 | \n",
+ " 38 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 744 | \n",
+ " 13 | \n",
+ " 1 | \n",
+ " 153 | \n",
+ " 88 | \n",
+ " 37 | \n",
+ " 140 | \n",
+ " 40.6 | \n",
+ " 1.174 | \n",
+ " 39 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 750 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 136 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 31.2 | \n",
+ " 1.182 | \n",
+ " 22 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Predicted Glucose BloodPressure SkinThickness Insulin \\\n",
+ "46 1 1 146 56 0 0 \n",
+ "86 13 1 106 72 54 0 \n",
+ "91 4 1 123 80 15 176 \n",
+ "95 6 1 144 72 27 228 \n",
+ "125 1 0 88 30 42 99 \n",
+ "167 4 1 120 68 0 0 \n",
+ "188 8 0 109 76 39 114 \n",
+ "204 6 1 103 72 32 190 \n",
+ "228 4 1 197 70 39 744 \n",
+ "274 13 1 106 70 0 0 \n",
+ "280 0 0 146 70 0 0 \n",
+ "282 7 1 133 88 15 155 \n",
+ "309 2 0 124 68 28 205 \n",
+ "335 0 1 165 76 43 255 \n",
+ "363 4 0 146 78 0 0 \n",
+ "397 0 0 131 66 40 0 \n",
+ "510 12 0 84 72 31 0 \n",
+ "517 7 1 125 86 0 0 \n",
+ "536 0 1 105 90 0 0 \n",
+ "541 3 0 128 72 25 190 \n",
+ "549 4 1 189 110 31 0 \n",
+ "568 4 1 154 72 29 126 \n",
+ "577 2 0 118 80 0 0 \n",
+ "583 8 1 100 76 0 0 \n",
+ "590 11 0 111 84 40 0 \n",
+ "594 6 1 123 72 45 230 \n",
+ "622 6 1 183 94 0 0 \n",
+ "630 7 0 114 64 0 0 \n",
+ "658 11 1 127 106 0 0 \n",
+ "669 9 1 154 78 30 100 \n",
+ "725 4 1 112 78 40 0 \n",
+ "744 13 1 153 88 37 140 \n",
+ "750 4 0 136 70 0 0 \n",
+ "\n",
+ " BMI DiabetesPedigreeFunction Age Outcome \n",
+ "46 29.7 0.564 29 0 \n",
+ "86 36.6 0.178 45 0 \n",
+ "91 32.0 0.443 34 0 \n",
+ "95 33.9 0.255 40 0 \n",
+ "125 55.0 0.496 26 1 \n",
+ "167 29.6 0.709 34 0 \n",
+ "188 27.9 0.640 31 1 \n",
+ "204 37.7 0.324 55 0 \n",
+ "228 36.7 2.329 31 0 \n",
+ "274 34.2 0.251 52 0 \n",
+ "280 37.9 0.334 28 1 \n",
+ "282 32.4 0.262 37 0 \n",
+ "309 32.9 0.875 30 1 \n",
+ "335 47.9 0.259 26 0 \n",
+ "363 38.5 0.520 67 1 \n",
+ "397 34.3 0.196 22 1 \n",
+ "510 29.7 0.297 46 1 \n",
+ "517 37.6 0.304 51 0 \n",
+ "536 29.6 0.197 46 0 \n",
+ "541 32.4 0.549 27 1 \n",
+ "549 28.5 0.680 37 0 \n",
+ "568 31.3 0.338 37 0 \n",
+ "577 42.9 0.693 21 1 \n",
+ "583 38.7 0.190 42 0 \n",
+ "590 46.8 0.925 45 1 \n",
+ "594 33.6 0.733 34 0 \n",
+ "622 40.8 1.461 45 0 \n",
+ "630 27.4 0.732 34 1 \n",
+ "658 39.0 0.190 51 0 \n",
+ "669 30.9 0.164 45 0 \n",
+ "725 39.4 0.236 38 0 \n",
+ "744 40.6 1.174 39 0 \n",
+ "750 31.2 1.182 22 1 "
+ ]
+ },
+ "execution_count": 279,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.transform(X_test)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "y_pred = class_models[best_model][\"preds\"]\n",
+ "\n",
+ "error_index = y_test[y_test[\"Outcome\"] != y_pred].index.tolist()\n",
+ "display(f\"Error items count: {len(error_index)}\")\n",
+ "\n",
+ "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
+ "error_df = X_test.loc[error_index].copy()\n",
+ "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
+ "error_df.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Пример использования обученной модели (конвейера) для предсказания"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 280,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 450 | \n",
+ " 1.0 | \n",
+ " 82.0 | \n",
+ " 64.0 | \n",
+ " 13.0 | \n",
+ " 95.0 | \n",
+ " 21.2 | \n",
+ " 0.415 | \n",
+ " 23.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "450 1.0 82.0 64.0 13.0 95.0 21.2 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "450 0.415 23.0 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Glucose | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 450 | \n",
+ " -1.205533 | \n",
+ " 0.136961 | \n",
+ " -1.329999 | \n",
+ " -0.860283 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Glucose Insulin BMI Age\n",
+ "450 -1.205533 0.136961 -1.329999 -0.860283"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'predicted: 0 (proba: [0.96 0.04])'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'real: 0'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = class_models[best_model][\"pipeline\"]\n",
+ "\n",
+ "example_id = 450\n",
+ "test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
+ "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
+ "display(test)\n",
+ "display(test_preprocessed)\n",
+ "result_proba = model.predict_proba(test)[0]\n",
+ "result = model.predict(test)[0]\n",
+ "real = int(y_test.loc[example_id].values[0])\n",
+ "display(f\"predicted: {result} (proba: {result_proba})\")\n",
+ "display(f\"real: {real}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Подбор гиперпараметров методом поиска по сетке"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 281,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "# Определяем числовые признаки\n",
+ "numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()\n",
+ "\n",
+ "# Установка random_state\n",
+ "random_state = 9\n",
+ "\n",
+ "# Определение трансформера\n",
+ "pipeline_end = ColumnTransformer([\n",
+ " ('numeric', StandardScaler(), numeric_features),\n",
+ " # Добавьте другие трансформеры, если требуется\n",
+ "])\n",
+ "\n",
+ "# Объявление модели\n",
+ "optimized_model = RandomForestClassifier(\n",
+ " random_state=random_state,\n",
+ " criterion=\"gini\",\n",
+ " max_depth=5,\n",
+ " max_features=\"sqrt\",\n",
+ " n_estimators=10,\n",
+ ")\n",
+ "\n",
+ "# Создание пайплайна с корректными шагами\n",
+ "result = {}\n",
+ "\n",
+ "# Обучение модели\n",
+ "result[\"pipeline\"] = Pipeline([\n",
+ " (\"pipeline\", pipeline_end),\n",
+ " (\"model\", optimized_model)\n",
+ "]).fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ "# Прогнозирование и расчет метрик\n",
+ "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
+ "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
+ "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
+ "\n",
+ "# Метрики для оценки модели\n",
+ "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
+ "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
+ "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
+ "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
+ "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
+ "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
+ "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
+ "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
+ "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование данных для оценки старой и новой версии модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 282,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_model_type = \"random_forest\"\n",
+ "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=class_models[optimized_model_type]\n",
+ ")\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=result\n",
+ ")\n",
+ "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
+ "optimized_metrics = optimized_metrics.set_index(\"Name\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Оценка параметров старой и новой модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 283,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Precision_train | \n",
+ " Precision_test | \n",
+ " Recall_train | \n",
+ " Recall_test | \n",
+ " Accuracy_train | \n",
+ " Accuracy_test | \n",
+ " F1_train | \n",
+ " F1_test | \n",
+ "
\n",
+ " \n",
+ " Name | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Old | \n",
+ " 0.977169 | \n",
+ " 0.666667 | \n",
+ " 1.000000 | \n",
+ " 0.777778 | \n",
+ " 0.991857 | \n",
+ " 0.785714 | \n",
+ " 0.988453 | \n",
+ " 0.717949 | \n",
+ "
\n",
+ " \n",
+ " New | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 283,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 284,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Accuracy_test | \n",
+ " F1_test | \n",
+ " ROC_AUC_test | \n",
+ " Cohen_kappa_test | \n",
+ " MCC_test | \n",
+ "
\n",
+ " \n",
+ " Name | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Old | \n",
+ " 0.785714 | \n",
+ " 0.717949 | \n",
+ " 0.867222 | \n",
+ " 0.546816 | \n",
+ " 0.551041 | \n",
+ "
\n",
+ " \n",
+ " New | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 284,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 285,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "\n",
+ "\n",
+ "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
+ ")\n",
+ "\n",
+ "for index in range(0, len(optimized_metrics)):\n",
+ " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
+ "plt.show()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "В желтом квадрате мы видим значение 79, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"Sick\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n",
+ "\n",
+ "В зеленом квадрате значение 42 указывает на количество правильно классифицированных объектов, отнесенных к классу \"Healthy\". Это также является показателем хорошей точности модели в определении объектов данного класса."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Определение достижимого уровня качества модели для второй задачи"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Подготовка данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 286,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin \\\n",
+ "count 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
+ "mean 3.845052 120.894531 69.105469 20.536458 79.799479 \n",
+ "std 3.369578 31.972618 19.355807 15.952218 115.244002 \n",
+ "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
+ "25% 1.000000 99.000000 62.000000 0.000000 0.000000 \n",
+ "50% 3.000000 117.000000 72.000000 23.000000 30.500000 \n",
+ "75% 6.000000 140.250000 80.000000 32.000000 127.250000 \n",
+ "max 17.000000 199.000000 122.000000 99.000000 846.000000 \n",
+ "\n",
+ " BMI DiabetesPedigreeFunction Age Outcome \n",
+ "count 768.000000 768.000000 768.000000 768.000000 \n",
+ "mean 31.992578 0.471876 33.240885 0.348958 \n",
+ "std 7.884160 0.331329 11.760232 0.476951 \n",
+ "min 0.000000 0.078000 21.000000 0.000000 \n",
+ "25% 27.300000 0.243750 24.000000 0.000000 \n",
+ "50% 32.000000 0.372500 29.000000 0.000000 \n",
+ "75% 36.600000 0.626250 41.000000 1.000000 \n",
+ "max 67.100000 2.420000 81.000000 1.000000 \n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "\n",
+ "random_state = 9\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "df = pd.read_csv(\".//scv//diabetes.csv\")\n",
+ "print(df.describe())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование выборок"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 287,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " 1 | \n",
+ " 105 | \n",
+ " 58 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 24.3 | \n",
+ " 0.187 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 4 | \n",
+ " 146 | \n",
+ " 85 | \n",
+ " 27 | \n",
+ " 100 | \n",
+ " 28.9 | \n",
+ " 0.189 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " 3 | \n",
+ " 80 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.174 | \n",
+ " 22 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " 5 | \n",
+ " 88 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ " 27.6 | \n",
+ " 0.258 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " 2 | \n",
+ " 120 | \n",
+ " 54 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 26.8 | \n",
+ " 0.455 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 70 | \n",
+ " 20 | \n",
+ " 0 | \n",
+ " 27.4 | \n",
+ " 0.254 | \n",
+ " 36 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 0 | \n",
+ " 95 | \n",
+ " 85 | \n",
+ " 25 | \n",
+ " 36 | \n",
+ " 37.4 | \n",
+ " 0.247 | \n",
+ " 24 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 97 | \n",
+ " 66 | \n",
+ " 15 | \n",
+ " 140 | \n",
+ " 23.2 | \n",
+ " 0.487 | \n",
+ " 22 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " 1 | \n",
+ " 117 | \n",
+ " 60 | \n",
+ " 23 | \n",
+ " 106 | \n",
+ " 33.8 | \n",
+ " 0.466 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " 3 | \n",
+ " 107 | \n",
+ " 62 | \n",
+ " 13 | \n",
+ " 48 | \n",
+ " 22.9 | \n",
+ " 0.678 | \n",
+ " 23 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "196 1 105 58 0 0 24.3 \n",
+ "69 4 146 85 27 100 28.9 \n",
+ "494 3 80 0 0 0 0.0 \n",
+ "463 5 88 78 30 0 27.6 \n",
+ "653 2 120 54 0 0 26.8 \n",
+ ".. ... ... ... ... ... ... \n",
+ "322 0 124 70 20 0 27.4 \n",
+ "109 0 95 85 25 36 37.4 \n",
+ "27 1 97 66 15 140 23.2 \n",
+ "651 1 117 60 23 106 33.8 \n",
+ "197 3 107 62 13 48 22.9 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "196 0.187 21 0 \n",
+ "69 0.189 27 0 \n",
+ "494 0.174 22 0 \n",
+ "463 0.258 37 0 \n",
+ "653 0.455 27 0 \n",
+ ".. ... ... ... \n",
+ "322 0.254 36 1 \n",
+ "109 0.247 24 1 \n",
+ "27 0.487 22 0 \n",
+ "651 0.466 27 0 \n",
+ "197 0.678 23 1 \n",
+ "\n",
+ "[614 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "196 0\n",
+ "69 0\n",
+ "494 0\n",
+ "463 0\n",
+ "653 0\n",
+ ".. ...\n",
+ "322 1\n",
+ "109 1\n",
+ "27 0\n",
+ "651 0\n",
+ "197 1\n",
+ "\n",
+ "[614 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 669 | \n",
+ " 9 | \n",
+ " 154 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 100 | \n",
+ " 30.9 | \n",
+ " 0.164 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 379 | \n",
+ " 0 | \n",
+ " 93 | \n",
+ " 100 | \n",
+ " 39 | \n",
+ " 72 | \n",
+ " 43.4 | \n",
+ " 1.021 | \n",
+ " 35 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 640 | \n",
+ " 0 | \n",
+ " 102 | \n",
+ " 86 | \n",
+ " 17 | \n",
+ " 105 | \n",
+ " 29.3 | \n",
+ " 0.695 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 11 | \n",
+ " 127 | \n",
+ " 106 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 39.0 | \n",
+ " 0.190 | \n",
+ " 51 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 304 | \n",
+ " 3 | \n",
+ " 150 | \n",
+ " 76 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 21.0 | \n",
+ " 0.207 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 203 | \n",
+ " 2 | \n",
+ " 99 | \n",
+ " 70 | \n",
+ " 16 | \n",
+ " 44 | \n",
+ " 20.4 | \n",
+ " 0.235 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 605 | \n",
+ " 1 | \n",
+ " 124 | \n",
+ " 60 | \n",
+ " 32 | \n",
+ " 0 | \n",
+ " 35.8 | \n",
+ " 0.514 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 561 | \n",
+ " 0 | \n",
+ " 198 | \n",
+ " 66 | \n",
+ " 32 | \n",
+ " 274 | \n",
+ " 41.3 | \n",
+ " 0.502 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 0 | \n",
+ " 146 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37.9 | \n",
+ " 0.334 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 103 | \n",
+ " 1 | \n",
+ " 81 | \n",
+ " 72 | \n",
+ " 18 | \n",
+ " 40 | \n",
+ " 26.6 | \n",
+ " 0.283 | \n",
+ " 24 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "669 9 154 78 30 100 30.9 \n",
+ "379 0 93 100 39 72 43.4 \n",
+ "640 0 102 86 17 105 29.3 \n",
+ "658 11 127 106 0 0 39.0 \n",
+ "304 3 150 76 0 0 21.0 \n",
+ ".. ... ... ... ... ... ... \n",
+ "203 2 99 70 16 44 20.4 \n",
+ "605 1 124 60 32 0 35.8 \n",
+ "561 0 198 66 32 274 41.3 \n",
+ "280 0 146 70 0 0 37.9 \n",
+ "103 1 81 72 18 40 26.6 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "669 0.164 45 0 \n",
+ "379 1.021 35 0 \n",
+ "640 0.695 27 0 \n",
+ "658 0.190 51 0 \n",
+ "304 0.207 37 0 \n",
+ ".. ... ... ... \n",
+ "203 0.235 27 0 \n",
+ "605 0.514 21 0 \n",
+ "561 0.502 28 1 \n",
+ "280 0.334 28 1 \n",
+ "103 0.283 24 0 \n",
+ "\n",
+ "[154 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 669 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 379 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 640 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 304 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 203 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 605 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 561 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 103 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "669 0\n",
+ "379 0\n",
+ "640 0\n",
+ "658 0\n",
+ "304 0\n",
+ ".. ...\n",
+ "203 0\n",
+ "605 0\n",
+ "561 1\n",
+ "280 1\n",
+ "103 0\n",
+ "\n",
+ "[154 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from typing import Tuple\n",
+ "import pandas as pd\n",
+ "from pandas import DataFrame\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "def split_stratified_into_train_val_test(\n",
+ " df_input: DataFrame,\n",
+ " stratify_colname: str = \"y\",\n",
+ " frac_train: float = 0.6,\n",
+ " frac_val: float = 0.15,\n",
+ " frac_test: float = 0.25,\n",
+ " random_state: int = None,\n",
+ ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+ " \n",
+ "\n",
+ " if not (0 < frac_train < 1) or not (0 <= frac_val <= 1) or not (0 <= frac_test <= 1):\n",
+ " raise ValueError(\"Fractions must be between 0 and 1 and the sum must equal 1.\")\n",
+ " \n",
+ " if not (frac_train + frac_val + frac_test == 1.0):\n",
+ " raise ValueError(\"fractions %f, %f, %f do not add up to 1.0\" %\n",
+ " (frac_train, frac_val, frac_test))\n",
+ "\n",
+ " if stratify_colname not in df_input.columns:\n",
+ " raise ValueError(f\"{stratify_colname} is not a column in the DataFrame.\")\n",
+ "\n",
+ " X = df_input\n",
+ " y = df_input[[stratify_colname]]\n",
+ "\n",
+ " \n",
+ " df_train, df_temp, y_train, y_temp = train_test_split(\n",
+ " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+ " )\n",
+ "\n",
+ " if frac_val == 0:\n",
+ " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+ "\n",
+ " relative_frac_test = frac_test / (frac_val + frac_test)\n",
+ "\n",
+ " df_val, df_test, y_val, y_test = train_test_split(\n",
+ " df_temp,\n",
+ " y_temp,\n",
+ " stratify=y_temp,\n",
+ " test_size=relative_frac_test,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ "\n",
+ " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+ " \n",
+ " return df_train, df_val, df_test, y_train, y_val, y_test\n",
+ "\n",
+ "\n",
+ "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+ " df, stratify_colname=\"Outcome\", frac_train=0.80, frac_val=0.0, frac_test=0.20, random_state=random_state\n",
+ ")\n",
+ "\n",
+ "display(\"X_train\", X_train)\n",
+ "display(\"y_train\", y_train)\n",
+ "display(\"X_test\", X_test)\n",
+ "display(\"y_test\", y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование конвейера для классификации данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 288,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.discriminant_analysis import StandardScaler\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "class DiabetFeatures(BaseEstimator, TransformerMixin):\n",
+ " def __init__(self):\n",
+ " pass\n",
+ " def fit(self, X, y=None):\n",
+ " return self\n",
+ " \n",
+ "\n",
+ "columns_to_drop = [\"Pregnancies\", \"SkinThickness\", \"Insulin\", \"BMI\", \"Outcome\"]\n",
+ "num_columns = [\"Glucose\", \"Age\", \"BloodPressure\", \"DiabetesPedigreeFunction\"]\n",
+ "cat_columns = []\n",
+ "\n",
+ "num_imputer = SimpleImputer(strategy=\"median\")\n",
+ "num_scaler = StandardScaler()\n",
+ "preprocessing_num = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", num_imputer),\n",
+ " (\"scaler\", num_scaler),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "cat_imputer = SimpleImputer(strategy=\"constant\", fill_value=\"unknown\")\n",
+ "cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=False, drop=\"first\")\n",
+ "preprocessing_cat = Pipeline(\n",
+ " [\n",
+ " (\"imputer\", cat_imputer),\n",
+ " (\"encoder\", cat_encoder),\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_num\", preprocessing_num, num_columns),\n",
+ " (\"prepocessing_cat\", preprocessing_cat, cat_columns),\n",
+ " ],\n",
+ " remainder=\"passthrough\"\n",
+ ")\n",
+ "\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "features_postprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"prepocessing_cat\", preprocessing_cat, [\"Cabin_type\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Демонстрация работы конвейера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 289,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Glucose | \n",
+ " Age | \n",
+ " BloodPressure | \n",
+ " DiabetesPedigreeFunction | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 196 | \n",
+ " -0.478144 | \n",
+ " -1.029257 | \n",
+ " -0.554050 | \n",
+ " -0.849205 | \n",
+ "
\n",
+ " \n",
+ " 69 | \n",
+ " 0.818506 | \n",
+ " -0.522334 | \n",
+ " 0.804885 | \n",
+ " -0.843172 | \n",
+ "
\n",
+ " \n",
+ " 494 | \n",
+ " -1.268784 | \n",
+ " -0.944770 | \n",
+ " -3.473244 | \n",
+ " -0.888421 | \n",
+ "
\n",
+ " \n",
+ " 463 | \n",
+ " -1.015779 | \n",
+ " 0.322537 | \n",
+ " 0.452568 | \n",
+ " -0.635028 | \n",
+ "
\n",
+ " \n",
+ " 653 | \n",
+ " -0.003760 | \n",
+ " -0.522334 | \n",
+ " -0.755374 | \n",
+ " -0.040763 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 322 | \n",
+ " 0.122742 | \n",
+ " 0.238050 | \n",
+ " 0.049921 | \n",
+ " -0.647095 | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " -0.794400 | \n",
+ " -0.775796 | \n",
+ " 0.804885 | \n",
+ " -0.668211 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " -0.731149 | \n",
+ " -0.944770 | \n",
+ " -0.151403 | \n",
+ " 0.055767 | \n",
+ "
\n",
+ " \n",
+ " 651 | \n",
+ " -0.098637 | \n",
+ " -0.522334 | \n",
+ " -0.453388 | \n",
+ " -0.007581 | \n",
+ "
\n",
+ " \n",
+ " 197 | \n",
+ " -0.414893 | \n",
+ " -0.860283 | \n",
+ " -0.352726 | \n",
+ " 0.631933 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Glucose Age BloodPressure DiabetesPedigreeFunction\n",
+ "196 -0.478144 -1.029257 -0.554050 -0.849205\n",
+ "69 0.818506 -0.522334 0.804885 -0.843172\n",
+ "494 -1.268784 -0.944770 -3.473244 -0.888421\n",
+ "463 -1.015779 0.322537 0.452568 -0.635028\n",
+ "653 -0.003760 -0.522334 -0.755374 -0.040763\n",
+ ".. ... ... ... ...\n",
+ "322 0.122742 0.238050 0.049921 -0.647095\n",
+ "109 -0.794400 -0.775796 0.804885 -0.668211\n",
+ "27 -0.731149 -0.944770 -0.151403 0.055767\n",
+ "651 -0.098637 -0.522334 -0.453388 -0.007581\n",
+ "197 -0.414893 -0.860283 -0.352726 0.631933\n",
+ "\n",
+ "[614 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 289,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование набора моделей для классификации"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 290,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
+ "\n",
+ "class_models = {\n",
+ " \"logistic\": {\"model\": linear_model.LogisticRegression()},\n",
+ " \"ridge\": {\"model\": linear_model.RidgeClassifierCV(cv=5, class_weight=\"balanced\")},\n",
+ " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\")},\n",
+ " \"decision_tree\": {\n",
+ " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
+ " },\n",
+ " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
+ " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
+ " \"gradient_boosting\": {\n",
+ " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210)\n",
+ " },\n",
+ " \"random_forest\": {\n",
+ " \"model\": ensemble.RandomForestClassifier(\n",
+ " max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
+ " )\n",
+ " },\n",
+ " \"mlp\": {\n",
+ " \"model\": neural_network.MLPClassifier(\n",
+ " hidden_layer_sizes=(7,),\n",
+ " max_iter=500,\n",
+ " early_stopping=True,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Обучение моделей на обучающем наборе данных и оценка на тестовом¶"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 291,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: logistic\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: ridge\n",
+ "Model: decision_tree\n",
+ "Model: knn\n",
+ "Model: naive_bayes\n",
+ "Model: gradient_boosting\n",
+ "Model: random_forest\n",
+ "Model: mlp\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "for model_name in class_models.keys():\n",
+ " print(f\"Model: {model_name}\")\n",
+ " model = class_models[model_name][\"model\"]\n",
+ "\n",
+ " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
+ " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ " y_train_predict = model_pipeline.predict(X_train)\n",
+ " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
+ " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
+ "\n",
+ " class_models[model_name][\"pipeline\"] = model_pipeline\n",
+ " class_models[model_name][\"probs\"] = y_test_probs\n",
+ " class_models[model_name][\"preds\"] = y_test_predict\n",
+ "\n",
+ " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
+ " y_test, y_test_probs\n",
+ " )\n",
+ " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
+ " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
+ " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
+ " y_test, y_test_predict\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Сводная таблица оценок качества для использованных моделей классификации¶\n",
+ "\n",
+ "Матрица неточностей\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 292,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import ConfusionMatrixDisplay\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "_, ax = plt.subplots(int(len(class_models) / 2), 2, figsize=(12, 10), sharex=False, sharey=False)\n",
+ "for index, key in enumerate(class_models.keys()):\n",
+ " c_matrix = class_models[key][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ " disp.ax_.set_title(key)\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 293,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Precision_train | \n",
+ " Precision_test | \n",
+ " Recall_train | \n",
+ " Recall_test | \n",
+ " Accuracy_train | \n",
+ " Accuracy_test | \n",
+ " F1_train | \n",
+ " F1_test | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " naive_bayes | \n",
+ " 0.678571 | \n",
+ " 0.734694 | \n",
+ " 0.532710 | \n",
+ " 0.666667 | \n",
+ " 0.749186 | \n",
+ " 0.798701 | \n",
+ " 0.596859 | \n",
+ " 0.699029 | \n",
+ "
\n",
+ " \n",
+ " logistic | \n",
+ " 0.696774 | \n",
+ " 0.717391 | \n",
+ " 0.504673 | \n",
+ " 0.611111 | \n",
+ " 0.750814 | \n",
+ " 0.779221 | \n",
+ " 0.585366 | \n",
+ " 0.660000 | \n",
+ "
\n",
+ " \n",
+ " gradient_boosting | \n",
+ " 0.949749 | \n",
+ " 0.673469 | \n",
+ " 0.883178 | \n",
+ " 0.611111 | \n",
+ " 0.942997 | \n",
+ " 0.759740 | \n",
+ " 0.915254 | \n",
+ " 0.640777 | \n",
+ "
\n",
+ " \n",
+ " random_forest | \n",
+ " 0.990741 | \n",
+ " 0.633333 | \n",
+ " 1.000000 | \n",
+ " 0.703704 | \n",
+ " 0.996743 | \n",
+ " 0.753247 | \n",
+ " 0.995349 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " 0.730159 | \n",
+ " 0.622642 | \n",
+ " 0.644860 | \n",
+ " 0.611111 | \n",
+ " 0.793160 | \n",
+ " 0.733766 | \n",
+ " 0.684864 | \n",
+ " 0.616822 | \n",
+ "
\n",
+ " \n",
+ " ridge | \n",
+ " 0.602459 | \n",
+ " 0.583333 | \n",
+ " 0.686916 | \n",
+ " 0.777778 | \n",
+ " 0.732899 | \n",
+ " 0.727273 | \n",
+ " 0.641921 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " decision_tree | \n",
+ " 0.848168 | \n",
+ " 0.612245 | \n",
+ " 0.757009 | \n",
+ " 0.555556 | \n",
+ " 0.868078 | \n",
+ " 0.720779 | \n",
+ " 0.800000 | \n",
+ " 0.582524 | \n",
+ "
\n",
+ " \n",
+ " mlp | \n",
+ " 0.513158 | \n",
+ " 0.532258 | \n",
+ " 0.546729 | \n",
+ " 0.611111 | \n",
+ " 0.661238 | \n",
+ " 0.675325 | \n",
+ " 0.529412 | \n",
+ " 0.568966 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 293,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(\n",
+ " by=\"Accuracy_test\", ascending=False\n",
+ ").style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "Почти все модели, включая логистическую регрессию, ридж-регрессию, KNN, наивный байесовский классификатор, многослойную перцептронную сеть, случайный лес, дерево решений и градиентный бустинг, демонстрируют 100% точность (1.000000) на обучающей выборке. Это указывает на то, что модели смогли подстроиться под обучающие данные, что может указывать на возможное переобучение.\n",
+ "\n",
+ "ROC-кривая, каппа Коэна, коэффициент корреляции Мэтьюса\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 294,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Accuracy_test | \n",
+ " F1_test | \n",
+ " ROC_AUC_test | \n",
+ " Cohen_kappa_test | \n",
+ " MCC_test | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " logistic | \n",
+ " 0.779221 | \n",
+ " 0.660000 | \n",
+ " 0.825370 | \n",
+ " 0.498083 | \n",
+ " 0.501593 | \n",
+ "
\n",
+ " \n",
+ " ridge | \n",
+ " 0.727273 | \n",
+ " 0.666667 | \n",
+ " 0.824444 | \n",
+ " 0.443756 | \n",
+ " 0.456930 | \n",
+ "
\n",
+ " \n",
+ " naive_bayes | \n",
+ " 0.798701 | \n",
+ " 0.699029 | \n",
+ " 0.820556 | \n",
+ " 0.548344 | \n",
+ " 0.549805 | \n",
+ "
\n",
+ " \n",
+ " gradient_boosting | \n",
+ " 0.759740 | \n",
+ " 0.640777 | \n",
+ " 0.815741 | \n",
+ " 0.460927 | \n",
+ " 0.462155 | \n",
+ "
\n",
+ " \n",
+ " random_forest | \n",
+ " 0.753247 | \n",
+ " 0.666667 | \n",
+ " 0.808704 | \n",
+ " 0.471650 | \n",
+ " 0.473300 | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " 0.733766 | \n",
+ " 0.616822 | \n",
+ " 0.776204 | \n",
+ " 0.412870 | \n",
+ " 0.412912 | \n",
+ "
\n",
+ " \n",
+ " decision_tree | \n",
+ " 0.720779 | \n",
+ " 0.582524 | \n",
+ " 0.719167 | \n",
+ " 0.373510 | \n",
+ " 0.374505 | \n",
+ "
\n",
+ " \n",
+ " mlp | \n",
+ " 0.675325 | \n",
+ " 0.568966 | \n",
+ " 0.719074 | \n",
+ " 0.310530 | \n",
+ " 0.312437 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 294,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 295,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'naive_bayes'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
+ "\n",
+ "display(best_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Вывод данных с ошибкой предсказания для оценки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 296,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Error items count: 31'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Predicted | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 64 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 114 | \n",
+ " 66 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 32.8 | \n",
+ " 0.258 | \n",
+ " 42 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 88 | \n",
+ " 15 | \n",
+ " 0 | \n",
+ " 136 | \n",
+ " 70 | \n",
+ " 32 | \n",
+ " 110 | \n",
+ " 37.1 | \n",
+ " 0.153 | \n",
+ " 43 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 125 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 88 | \n",
+ " 30 | \n",
+ " 42 | \n",
+ " 99 | \n",
+ " 55.0 | \n",
+ " 0.496 | \n",
+ " 26 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 143 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 108 | \n",
+ " 66 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 32.4 | \n",
+ " 0.272 | \n",
+ " 42 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 170 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 102 | \n",
+ " 82 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30.8 | \n",
+ " 0.180 | \n",
+ " 36 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 188 | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 109 | \n",
+ " 76 | \n",
+ " 39 | \n",
+ " 114 | \n",
+ " 27.9 | \n",
+ " 0.640 | \n",
+ " 31 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 199 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 148 | \n",
+ " 60 | \n",
+ " 27 | \n",
+ " 318 | \n",
+ " 30.9 | \n",
+ " 0.150 | \n",
+ " 29 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 214 | \n",
+ " 9 | \n",
+ " 0 | \n",
+ " 112 | \n",
+ " 82 | \n",
+ " 32 | \n",
+ " 175 | \n",
+ " 34.2 | \n",
+ " 0.260 | \n",
+ " 36 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 223 | \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 142 | \n",
+ " 60 | \n",
+ " 33 | \n",
+ " 190 | \n",
+ " 28.8 | \n",
+ " 0.687 | \n",
+ " 61 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 228 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 197 | \n",
+ " 70 | \n",
+ " 39 | \n",
+ " 744 | \n",
+ " 36.7 | \n",
+ " 2.329 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 146 | \n",
+ " 70 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 37.9 | \n",
+ " 0.334 | \n",
+ " 28 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 294 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 161 | \n",
+ " 50 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 21.9 | \n",
+ " 0.254 | \n",
+ " 65 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 304 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 150 | \n",
+ " 76 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 21.0 | \n",
+ " 0.207 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 309 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 124 | \n",
+ " 68 | \n",
+ " 28 | \n",
+ " 205 | \n",
+ " 32.9 | \n",
+ " 0.875 | \n",
+ " 30 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 335 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 165 | \n",
+ " 76 | \n",
+ " 43 | \n",
+ " 255 | \n",
+ " 47.9 | \n",
+ " 0.259 | \n",
+ " 26 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 395 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ " 58 | \n",
+ " 24 | \n",
+ " 275 | \n",
+ " 27.7 | \n",
+ " 1.600 | \n",
+ " 25 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 397 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 131 | \n",
+ " 66 | \n",
+ " 40 | \n",
+ " 0 | \n",
+ " 34.3 | \n",
+ " 0.196 | \n",
+ " 22 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 401 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 137 | \n",
+ " 61 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 24.2 | \n",
+ " 0.151 | \n",
+ " 55 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 406 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 115 | \n",
+ " 72 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 28.9 | \n",
+ " 0.376 | \n",
+ " 46 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 510 | \n",
+ " 12 | \n",
+ " 0 | \n",
+ " 84 | \n",
+ " 72 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 29.7 | \n",
+ " 0.297 | \n",
+ " 46 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 541 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 128 | \n",
+ " 72 | \n",
+ " 25 | \n",
+ " 190 | \n",
+ " 32.4 | \n",
+ " 0.549 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 549 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 189 | \n",
+ " 110 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 28.5 | \n",
+ " 0.680 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 568 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 154 | \n",
+ " 72 | \n",
+ " 29 | \n",
+ " 126 | \n",
+ " 31.3 | \n",
+ " 0.338 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 577 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 118 | \n",
+ " 80 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 42.9 | \n",
+ " 0.693 | \n",
+ " 21 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 622 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 183 | \n",
+ " 94 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40.8 | \n",
+ " 1.461 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 630 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 114 | \n",
+ " 64 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 27.4 | \n",
+ " 0.732 | \n",
+ " 34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 658 | \n",
+ " 11 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ " 106 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 39.0 | \n",
+ " 0.190 | \n",
+ " 51 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 669 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 154 | \n",
+ " 78 | \n",
+ " 30 | \n",
+ " 100 | \n",
+ " 30.9 | \n",
+ " 0.164 | \n",
+ " 45 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 693 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 129 | \n",
+ " 68 | \n",
+ " 49 | \n",
+ " 125 | \n",
+ " 38.5 | \n",
+ " 0.439 | \n",
+ " 43 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 730 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 130 | \n",
+ " 78 | \n",
+ " 23 | \n",
+ " 79 | \n",
+ " 28.4 | \n",
+ " 0.323 | \n",
+ " 34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 744 | \n",
+ " 13 | \n",
+ " 1 | \n",
+ " 153 | \n",
+ " 88 | \n",
+ " 37 | \n",
+ " 140 | \n",
+ " 40.6 | \n",
+ " 1.174 | \n",
+ " 39 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Predicted Glucose BloodPressure SkinThickness Insulin \\\n",
+ "64 7 0 114 66 0 0 \n",
+ "88 15 0 136 70 32 110 \n",
+ "125 1 0 88 30 42 99 \n",
+ "143 10 0 108 66 0 0 \n",
+ "170 6 0 102 82 0 0 \n",
+ "188 8 0 109 76 39 114 \n",
+ "199 4 0 148 60 27 318 \n",
+ "214 9 0 112 82 32 175 \n",
+ "223 7 1 142 60 33 190 \n",
+ "228 4 1 197 70 39 744 \n",
+ "280 0 0 146 70 0 0 \n",
+ "294 0 1 161 50 0 0 \n",
+ "304 3 1 150 76 0 0 \n",
+ "309 2 0 124 68 28 205 \n",
+ "335 0 1 165 76 43 255 \n",
+ "395 2 1 127 58 24 275 \n",
+ "397 0 0 131 66 40 0 \n",
+ "401 6 1 137 61 0 0 \n",
+ "406 4 0 115 72 0 0 \n",
+ "510 12 0 84 72 31 0 \n",
+ "541 3 0 128 72 25 190 \n",
+ "549 4 1 189 110 31 0 \n",
+ "568 4 1 154 72 29 126 \n",
+ "577 2 0 118 80 0 0 \n",
+ "622 6 1 183 94 0 0 \n",
+ "630 7 0 114 64 0 0 \n",
+ "658 11 1 127 106 0 0 \n",
+ "669 9 1 154 78 30 100 \n",
+ "693 7 0 129 68 49 125 \n",
+ "730 3 0 130 78 23 79 \n",
+ "744 13 1 153 88 37 140 \n",
+ "\n",
+ " BMI DiabetesPedigreeFunction Age Outcome \n",
+ "64 32.8 0.258 42 1 \n",
+ "88 37.1 0.153 43 1 \n",
+ "125 55.0 0.496 26 1 \n",
+ "143 32.4 0.272 42 1 \n",
+ "170 30.8 0.180 36 1 \n",
+ "188 27.9 0.640 31 1 \n",
+ "199 30.9 0.150 29 1 \n",
+ "214 34.2 0.260 36 1 \n",
+ "223 28.8 0.687 61 0 \n",
+ "228 36.7 2.329 31 0 \n",
+ "280 37.9 0.334 28 1 \n",
+ "294 21.9 0.254 65 0 \n",
+ "304 21.0 0.207 37 0 \n",
+ "309 32.9 0.875 30 1 \n",
+ "335 47.9 0.259 26 0 \n",
+ "395 27.7 1.600 25 0 \n",
+ "397 34.3 0.196 22 1 \n",
+ "401 24.2 0.151 55 0 \n",
+ "406 28.9 0.376 46 1 \n",
+ "510 29.7 0.297 46 1 \n",
+ "541 32.4 0.549 27 1 \n",
+ "549 28.5 0.680 37 0 \n",
+ "568 31.3 0.338 37 0 \n",
+ "577 42.9 0.693 21 1 \n",
+ "622 40.8 1.461 45 0 \n",
+ "630 27.4 0.732 34 1 \n",
+ "658 39.0 0.190 51 0 \n",
+ "669 30.9 0.164 45 0 \n",
+ "693 38.5 0.439 43 1 \n",
+ "730 28.4 0.323 34 1 \n",
+ "744 40.6 1.174 39 0 "
+ ]
+ },
+ "execution_count": 296,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.transform(X_test)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "y_pred = class_models[best_model][\"preds\"]\n",
+ "\n",
+ "error_index = y_test[y_test[\"Outcome\"] != y_pred].index.tolist()\n",
+ "display(f\"Error items count: {len(error_index)}\")\n",
+ "\n",
+ "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
+ "error_df = X_test.loc[error_index].copy()\n",
+ "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
+ "error_df.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Пример использования обученной модели (конвейера) для предсказания"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 297,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 555 | \n",
+ " 7.0 | \n",
+ " 124.0 | \n",
+ " 70.0 | \n",
+ " 33.0 | \n",
+ " 215.0 | \n",
+ " 25.5 | \n",
+ " 0.161 | \n",
+ " 37.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "555 7.0 124.0 70.0 33.0 215.0 25.5 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "555 0.161 37.0 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Glucose | \n",
+ " Age | \n",
+ " BloodPressure | \n",
+ " DiabetesPedigreeFunction | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 555 | \n",
+ " 0.122742 | \n",
+ " 0.322537 | \n",
+ " 0.049921 | \n",
+ " -0.927636 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Glucose Age BloodPressure DiabetesPedigreeFunction\n",
+ "555 0.122742 0.322537 0.049921 -0.927636"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'predicted: 0 (proba: [0.7669925 0.2330075])'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'real: 0'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = class_models[best_model][\"pipeline\"]\n",
+ "\n",
+ "example_id = 555\n",
+ "test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
+ "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
+ "display(test)\n",
+ "display(test_preprocessed)\n",
+ "result_proba = model.predict_proba(test)[0]\n",
+ "result = model.predict(test)[0]\n",
+ "real = int(y_test.loc[example_id].values[0])\n",
+ "display(f\"predicted: {result} (proba: {result_proba})\")\n",
+ "display(f\"real: {real}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Подбор гиперпараметров методом поиска по сетке"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 298,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'model__criterion': 'entropy',\n",
+ " 'model__max_depth': 7,\n",
+ " 'model__max_features': 'sqrt',\n",
+ " 'model__n_estimators': 50}"
+ ]
+ },
+ "execution_count": 298,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "optimized_model_type = \"random_forest\"\n",
+ "\n",
+ "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
+ "\n",
+ "param_grid = {\n",
+ " \"model__n_estimators\": [10, 50, 100],\n",
+ " \"model__max_features\": [\"sqrt\", \"log2\"],\n",
+ " \"model__max_depth\": [5, 7, 10],\n",
+ " \"model__criterion\": [\"gini\", \"entropy\"],\n",
+ "}\n",
+ "\n",
+ "gs_optomizer = GridSearchCV(\n",
+ " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
+ ")\n",
+ "gs_optomizer.fit(X_train, y_train.values.ravel())\n",
+ "gs_optomizer.best_params_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Обучение модели с новыми гиперпараметрами"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 299,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_model = ensemble.RandomForestClassifier(\n",
+ " random_state=random_state,\n",
+ " criterion=\"gini\",\n",
+ " max_depth=5,\n",
+ " max_features=\"log2\",\n",
+ " n_estimators=10,\n",
+ ")\n",
+ "\n",
+ "result = {}\n",
+ "\n",
+ "result[\"pipeline\"] = Pipeline([(\"pipeline\", pipeline_end), (\"model\", optimized_model)]).fit(X_train, y_train.values.ravel())\n",
+ "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
+ "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
+ "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
+ "\n",
+ "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
+ "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
+ "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
+ "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
+ "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
+ "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
+ "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
+ "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
+ "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование данных для оценки старой и новой версии модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 300,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=class_models[optimized_model_type]\n",
+ ")\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=result\n",
+ ")\n",
+ "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
+ "optimized_metrics = optimized_metrics.set_index(\"Name\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Оценка параметров старой и новой модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 301,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Precision_train | \n",
+ " Precision_test | \n",
+ " Recall_train | \n",
+ " Recall_test | \n",
+ " Accuracy_train | \n",
+ " Accuracy_test | \n",
+ " F1_train | \n",
+ " F1_test | \n",
+ "
\n",
+ " \n",
+ " Name | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Old | \n",
+ " 0.990741 | \n",
+ " 0.633333 | \n",
+ " 1.000000 | \n",
+ " 0.703704 | \n",
+ " 0.996743 | \n",
+ " 0.753247 | \n",
+ " 0.995349 | \n",
+ " 0.666667 | \n",
+ "
\n",
+ " \n",
+ " New | \n",
+ " 0.861842 | \n",
+ " 0.673913 | \n",
+ " 0.612150 | \n",
+ " 0.574074 | \n",
+ " 0.830619 | \n",
+ " 0.753247 | \n",
+ " 0.715847 | \n",
+ " 0.620000 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 301,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 302,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Accuracy_test | \n",
+ " F1_test | \n",
+ " ROC_AUC_test | \n",
+ " Cohen_kappa_test | \n",
+ " MCC_test | \n",
+ "
\n",
+ " \n",
+ " Name | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Old | \n",
+ " 0.753247 | \n",
+ " 0.666667 | \n",
+ " 0.808704 | \n",
+ " 0.471650 | \n",
+ " 0.473300 | \n",
+ "
\n",
+ " \n",
+ " New | \n",
+ " 0.753247 | \n",
+ " 0.620000 | \n",
+ " 0.846111 | \n",
+ " 0.439034 | \n",
+ " 0.442128 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 302,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 303,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2oAAAGjCAYAAABdU+ZeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYiUlEQVR4nO3df3zNdf/H8efZZj/sx5mJzbQNKT+KhGLyI8KSisuulHR9CV3lVyEllR9RJtdVpKRCU1dJuSqhqx+sKPkRihKt8qPNj40L24z2w875/iGnzjW0Y5/tnH0+j/vt9rldzufzOe/zPq7Zs/fn/fq8Pzan0+kUAAAAAMBn+Hm7AwAAAAAAdwzUAAAAAMDHMFADAAAAAB/DQA0AAAAAfAwDNQAAAADwMQzUAAAAAMDHMFADAAAAAB8T4O0OAAAqR0FBgYqKigxrLzAwUMHBwYa1BwCAJ8yeawzUAMACCgoKVD8hTFmHSgxrMyYmRnv27PGpUAMAWIMVco2BGgBYQFFRkbIOleiXLfUUEV7+qve84w4ltNqroqIinwk0AIB1WCHXGKgBgIWEhdsUFm4rdzsOlb8NAADKy8y5xkANACykxOlQidOYdgAA8DYz5xqrPgIAAACAj2FGDQAsxCGnHCr/pUcj2gAAoLzMnGsM1ADAQhxyyIjiDmNaAQCgfMyca5Q+AgAAAICPYUYNACykxOlUibP85R1GtAEAQHmZOdcYqAGAhZi5lh8AYD1mzjVKHwEAAADAxzCjBgAW4pBTJSa98ggAsB4z5xozagAAAADgY5hRAwALMXMtPwDAesycawzUAMBCzLw6FgDAesyca5Q+AgAAAICPYUYNACzE8dtmRDsAAHibmXONgRoAWEiJQatjGdEGAADlZeZco/QRAAAAAHwMM2oAYCElztObEe0AAOBtZs41BmoAYCFmruUHAFiPmXON0kcAAAAA8DHMqAGAhThkU4lshrQDAIC3mTnXGKgBgIU4nKc3I9oBAMDbzJxrlD4CAAAAgI9hoAYAFlLyW4mIERsAAN7mjVwrKSnRhAkTVL9+fYWEhOiSSy7R1KlT5XT+Pi3ndDo1ceJE1alTRyEhIeratat++uknj74bAzUAAAAAKKOnnnpKc+fO1fPPP6+dO3fqqaee0owZM/Tcc8+5zpkxY4Zmz56tF198URs3blRoaKiSkpJUUFBQ5s/hHjUAsBCjZsOYUQMA+AJv5Nq6devUq1cv9ezZU5JUr149vfnmm/rqq68knZ5NmzVrlh577DH16tVLkvTaa68pOjpaS5cu1e23316mz2FGDQAsxOG0GbYBAOBtRudaXl6e21ZYWFjqM9u1a6e0tDT9+OOPkqRt27Zp7dq16tGjhyRpz549ysrKUteuXV3vsdvtatOmjdavX1/m78aMGgAAAABIiouLc3s9adIkTZ482W3fww8/rLy8PDVu3Fj+/v4qKSnRk08+qf79+0uSsrKyJEnR0dFu74uOjnYdKwsGagBgIZQ+AgDMxOhcy8zMVEREhGt/UFBQqXPffvttvfHGG1q0aJEuv/xybd26VaNGjVJsbKwGDBhQ7r6cwUANACykRH4qMaDqvcSAvgAAUF5G51pERITbQO1sHnzwQT388MOue82aNWumX375RSkpKRowYIBiYmIkSdnZ2apTp47rfdnZ2WrRokWZ+8Q9agAAAABQRidPnpSfn/swyt/fXw6HQ5JUv359xcTEKC0tzXU8Ly9PGzduVGJiYpk/hxk1ALAQp0ELgThZTAQA4AO8kWs333yznnzyScXHx+vyyy/XN998o2eeeUaDBg2SJNlsNo0aNUpPPPGELr30UtWvX18TJkxQbGysevfuXebPYaAGABbCPWoAADPxRq4999xzmjBhgoYNG6ZDhw4pNjZW99xzjyZOnOg656GHHtKJEyf097//XTk5OWrfvr0++ugjBQcHl/lzbM4/PkIbAGBKeXl5stvt+uS7BIWGl7/q/cRxh7o3+0W5ubl/WssPAIDRrJBrzKgBgIWUOP1U4jTgpmsu8QEAfICZc43FRAAAAADAxzCjBgAW4pBNDgOu0Tnkg5ceAQCWY+ZcY6AGABbCYiIAADMxc65R+ggAAAAAPoYZNQCwEONuuva9EhEAgPWYOdcYqAGAhZyu5S9/eYcRbQAAUF5mzjVKHwEAAADAxzCjBgAW4pCfSky6OhYAwHrMnGsM1ADAQsxcyw8AsB4z5xqljwAAAADgY5hRAwALccjPtA8GBQBYj5lzjRk1ALCQEqfNsK2s6tWrJ5vNVmobPny4JKmgoEDDhw9XzZo1FRYWpuTkZGVnZ1fUXwEAwES8kWuVhYEaAKBCbdq0SQcPHnRtK1eulCTdeuutkqTRo0dr+fLlWrJkidasWaMDBw6oT58+3uwyAABeR+kjAFhIiUGrY5V4UCJSq1Ytt9fTp0/XJZdcok6dOik3N1cLFizQokWL1KVLF0lSamqqmjRpog0bNqht27bl7isAwLy8kWuVhRk1AMAFy8vLc9sKCwvPe35RUZFef/11DRo0SDabTVu2bFFxcbG6du3qOqdx48aKj4/X+vXrK7r7AAD4LAZqAGAhDqefYZskxcXFyW63u7aUlJTzfv7SpUuVk5OjgQMHSpKysrIUGBioyMhIt/Oio6OVlZVVEX8FAAATMTrXfAmljwBgIUaXiGRmZioiIsK1Pygo6LzvW7BggXr06KHY2Nhy9wEAADOXPjJQAwBcsIiICLeB2vn88ssvWrVqld59913XvpiYGBUVFSknJ8dtVi07O1sxMTFGdxcAgCrD9+b4AAAVxiFjljJ2XMBnp6amqnbt2urZs6drX6tWrVStWjWlpaW59qWnpysjI0OJiYnl/8IAAFPzZq5VNGbUAMBCjHswqGdtOBwOpaamasCAAQoI+D167Ha7Bg8erDFjxigqKkoREREaOXKkEhMTWfERAPCnvJVrlYGBGgCgwq1atUoZGRkaNGhQqWMzZ86Un5+fkpOTVVhYqKSkJL3wwgte6CUAAL6DgRoAWEiJ008lBqxs5Wkb3bt3l9N59hu1g4ODNWfOHM2ZM6fc/QIAWIu3cq0yMFADAAtxyCaHbIa0AwCAt5k513xv6AgAAAAAFseMGgBYiJlLRAAA1mPmXPO9HgEAAACAxTGjBgAWUiI/lRhwjc6INgAAKC8z5xoDtUrmcDh04MABhYeHy2bzvZsWAfgep9Op48ePKzY2Vn5+5QsSh9Mmh9OAm64NaAPmQK4B8BS5VjYM1CrZgQMHFBcX5+1uAKiCMjMzdfHFF3u7G4Abcg3AhSLXzo+BWiULDw+XJP3ydT1FhPneFCu869buN3q7C/BBpxxFWp3xsuv3R3k4DCoRcfhgiQi8g1zD+fzlsmbe7gJ80CkVa63+Q679CQZqlexMWUhEmJ8iwn3vBwLeFeAX5O0uwIcZUVbmcPrJYcDKVka0AXMg13A+AbZq3u4CfJHz9P+Qa+fnez0CAAAAAItjRg0ALKRENpWo/FcwjWgDAIDyMnOuMVADAAsxc4kIAMB6zJxrvtcjAAAAALA4ZtQAwEJKZEx5R0n5uwIAQLmZOdcYqAGAhZi5RAQAYD1mzjXf6xEAAAAAWBwzagBgISVOP5UYcNXQiDYAACgvM+ea7/UIAAAAACyOGTUAsBCnbHIYcNO10wefNwMAsB4z5xoDNQCwEDOXiAAArMfMueZ7PQIAAAAAi2NGDQAsxOG0yeEsf3mHEW0AAFBeZs41BmoAYCEl8lOJAcUURrQBAEB5mTnXfK9HAAAAAOCj6tWrJ5vNVmobPny4JKmgoEDDhw9XzZo1FRYWpuTkZGVnZ3v8OQzUAMBCzpSIGLEBAOBt3si1TZs26eDBg65t5cqVkqRbb71VkjR69GgtX75cS5Ys0Zo1a3TgwAH16dPH4+9G6SMAWIhDfnIYcI3OiDYAACgvb+RarVq13F5Pnz5dl1xyiTp16qTc3FwtWLBAixYtUpcuXSRJqampatKkiTZs2KC2bduW+XNIWgAAAACQlJeX57YVFhae9/yioiK9/vrrGjRokGw2m7Zs2aLi4mJ17drVdU7jxo0VHx+v9evXe9QXBmoAYCElTpthGwAA3mZ0rsXFxclut7u2lJSU837+0qVLlZOTo4EDB0qSsrKyFBgYqMjISLfzoqOjlZWV5dF3o/QRAAAAACRlZmYqIiLC9TooKOi85y9YsEA9evRQbGys4X1hoAYAFmLm580AAKzH6FyLiIhwG6idzy+//KJVq1bp3Xffde2LiYlRUVGRcnJy3GbVsrOzFRMT41GfKH0EAAtxOv3kMGBzOokPAID3eTPXUlNTVbt2bfXs2dO1r1WrVqpWrZrS0tJc+9LT05WRkaHExESP2mdGDQAAAAA84HA4lJqaqgEDBigg4Pchld1u1+DBgzVmzBhFRUUpIiJCI0eOVGJiokcrPkoM1ADAUkpkU4nKXyJiRBsAAJSXt3Jt1apVysjI0KBBg0odmzlzpvz8/JScnKzCwkIlJSXphRde8LhPDNQAwEIcTmPuL3M4DegMAADl5K1c6969u5zOs78pODhYc+bM0Zw5c8rVJ24yAAAAAAAfw4waAFjImZumjWgHAABvM3OuMVADAAtxyCaHAbX8RrQBAEB5mTnXfG/oCAAAAAAWx4waAFhIidOmEgNuujaiDQAAysvMucZADQAsxMy1/AAA6zFzrvlejwAAAADA4phRAwALcchmzPNmfPCmawCA9Zg515hRAwAAAAAfw4waAFiI06BljJ0+eOURAGA9Zs41BmoAYCEOp0ElIj64OhYAwHrMnGuUPgIAAACAj2FGDQAsxMzLGAMArMfMucZADQAsxMwlIgAA6zFzrvne0BEAAAAALI4ZNQCwEIdBq2P54vNmAADWY+ZcY6AGABZi5hIRAID1mDnXKH0EAAAAAB/DjBoAWIiZrzwCAKzHzLnGjBoAAAAA+BgGagBgIWeuPBqxeWL//v268847VbNmTYWEhKhZs2bavHmz67jT6dTEiRNVp04dhYSEqGvXrvrpp5+M/voAAJPxVq5VBgZqAGAh3gi0Y8eO6dprr1W1atX04YcfaseOHXr66adVo0YN1zkzZszQ7Nmz9eKLL2rjxo0KDQ1VUlKSCgoKKuKvAQBgEmYeqHGPGgDgguXl5bm9DgoKUlBQkNu+p556SnFxcUpNTXXtq1+/vuvPTqdTs2bN0mOPPaZevXpJkl577TVFR0dr6dKluv322yvwGwAA4JuYUQMAC3Hq92fOlGdz/tZeXFyc7Ha7a0tJSSn1mcuWLVPr1q116623qnbt2rrqqqs0b9481/E9e/YoKytLXbt2de2z2+1q06aN1q9fX8F/IwCAqszoXPMlzKgBgIUYvTpWZmamIiIiXPv/dzZNknbv3q25c+dqzJgxeuSRR7Rp0ybdd999CgwM1IABA5SVlSVJio6OdntfdHS06xgAAGdj5lUfGagBAC5YRESE20DtbBwOh1q3bq1p06ZJkq666ipt375dL774ogYMGFAZ3QQAoMqh9BEALMQbN13XqVNHTZs2ddvXpEkTZWRkSJJiYmIkSdnZ2W7nZGdnu44BAHA2Zl5MhIEaAFiINwLt2muvVXp6utu+H3/8UQkJCZJOLywSExOjtLQ01/G8vDxt3LhRiYmJxnxxAIApmXmgRukjAKBCjR49Wu3atdO0adPUt29fffXVV3r55Zf18ssvS5JsNptGjRqlJ554Qpdeeqnq16+vCRMmKDY2Vr179/Zu5wEA8BIGagBgId646frqq6/We++9p/Hjx2vKlCmqX7++Zs2apf79+7vOeeihh3TixAn9/e9/V05Ojtq3b6+PPvpIwcHB5e4rAMC8WEwEAGAKTqdNTgPCyNM2brrpJt10003nPG6z2TRlyhRNmTKlvF0DAFiIt3KtMnCPGgAAAAD4GGbUAMBCzjzY04h2AADwNjPnGjNqAAAAAOBjmFEDAAsx803XAADrMXOuWWKgtnr1anXu3FnHjh1TZGTkOc+rV6+eRo0apVGjRlVa36zg/65pqux9gaX23zzgsEak7NfRQwGaPzVWX38erpP5foq7pFC335+tDj1zvdBbVKZb//aT2nU6qIsTjquo0F87v4tS6tym2p8RJkkKCy/SnUPSddU1h1Qr+lflHgvShi9i9K95jXXyRDUv975qMvNN11ZCrnlXSYn0+tMxSnunho4drqaa0cXq1veo7hiVLdtv/zT+OSpeK9+Ocntfq+vyNG3Rbi/0GJXlijb5unXYYV3a7KRqxpzS5EH1tP4ju+v4AzMz1P22Y27v2fxZuB7t36Cyu2oaZs41rw7UBg4cqJycHC1dutRtf1kD6EItXLhQo0aNUk5OjuFto7TZH6bLUfL7D//eH4I1/vaG6nDz6YHYP+6LV36evyYv3CN71Cl99l4NTbunnp778Ec1bPart7qNStCsxX/1wbv19OPOSPn7OzXgnp16YuZ63du/swoLAlTzogJFXVSgBc9froy94aodfVIjHvxWURcVKOWxq73dfaAUcs0a3p5TWytevUhjn81QQqMC/bQtRE+PjldoeIl6D/mv67zWnfP0wMwM1+tqgU5vdBeVKLi6Q7u/D9bHb0Zp0it7z3rOpk/D9fToONfr4iLfGyDAN1hiRg3eFVmzxO31W8/bVadeoZon5kuSdmwO1cjp+9T4qpOSpDtGZevdebX007chDNRMbuIDiW6vn3nyKr35wcdq2ChX32+rqV/2RGjao78PyLL2h+q1l5to7MSv5efvkKOE22w9ZeYSEaCy7NgcqsSkXLXpmidJiokr0mdLjyt9a3W386oFOhVV+5Q3uggv2fxZhDZ/FnHec4qLbDp2mKoQo5g516rEf+WsXbtWHTp0UEhIiOLi4nTffffpxIkTruP/+te/1Lp1a4WHhysmJkZ33HGHDh06dNa2Vq9erbvuuku5ubmy2Wyy2WyaPHmy6/jJkyc1aNAghYeHKz4+Xi+//LLrWJcuXTRixAi39g4fPqzAwEClpaUZ+6VNqrjIpk/fqaGk24+4ykOatj6hNcsilXfMXw6HtHpppIoKbGreLt+7nUWlCw0tliTl5507wKqHFevkiQAGaRfoTImIERsuHLlWtTVtfUJb14Zr364gSdKu74P1/VehurrLcbfzvl0fpr7NLtfg9o01++GLlXfU3xvdhY9pnpivt779XvO/+EEjU/YpvAaD+fIwc675/H/p7Nq1SzfccIOSk5P17bff6q233tLatWvdgqW4uFhTp07Vtm3btHTpUu3du1cDBw48a3vt2rXTrFmzFBERoYMHD+rgwYMaO3as6/jTTz+t1q1b65tvvtGwYcM0dOhQpaenS5KGDBmiRYsWqbCw0HX+66+/rrp166pLly5n/bzCwkLl5eW5bVa27iO78vP81b3vUde+R1/6RSXFNt16eTPdVO9KPTsuTpMW7FXd+kVe7Ckqm83m1N/v/17fb4vSL3vOfjUywl6ofgN/1EfLEiq5d4BxyLWq77YRh9Sp1zEN6dhYN8ZfqeHdG+kvdx9Wlz6/33vU+ro8PfjsL3rq7V0a/OhBfbc+TI/e2UAlJedpGKa3eXW4/nF/vMb1baAFT9ZRs8R8Pfn6bvn5URaL0rxe+rhixQqFhYW57Sv5w2+xlJQU9e/f33Uj9KWXXqrZs2erU6dOmjt3roKDgzVo0CDX+Q0aNNDs2bN19dVXKz8/v1TbgYGBstvtstlsiomJKdWfG2+8UcOGDZMkjRs3TjNnztRnn32mRo0aqU+fPhoxYoTef/999e3bV9Lp+wIGDhwom+3so/CUlBQ9/vjjnv/FmNTHb0bp6s55qhnz+9WjV2fEKD/PX9Pf+lkRUae0/iO7nry3np5+7yfVb1Lgxd6iMg194FslNMjTg0Pbn/V4SPViTf7HRmXsCdcbCxpVcu/Mw2lQiYgvXnn0FeSa+X2+LFKfvltDD8/5RQmNCrTr+xC9OKnub4uKnB6sXdc7x3V+/SYFqt/0Vw1MbKpv14Xpqg5UjFjVmvdruP6894cQ7dkRrFc3/KDm7fK1dW24F3tWdZk517w+o9a5c2dt3brVbZs/f77r+LZt27Rw4UKFhYW5tqSkJDkcDu3Zs0eStGXLFt18882Kj49XeHi4OnXqJEnKyMg462eeT/PmzV1/PhN6Z8pNgoOD9be//U2vvPKKJOnrr7/W9u3bz3mVU5LGjx+v3Nxc15aZmelxn8wie181ffNFuG6444hr34G9gVqWWktjnsnUVR3ydcnlBbrzgWxd2vykli28yIu9RWW6d8y3uqZdtsaPbKcjh0NKHQ+pfkpTn9mgX08G6IlHrlYJZY8XzCnJ6TRg8/YX8WHkmvnNmxqr20Yc0nW9c1S/SYG6/vWY+tx9WIufiz7ne+okFMkedUoH9gZVYk/h67IygpRzxF+x9agiulBmzjWvz6iFhoaqYcOGbvv27dvn+nN+fr7uuece3XfffaXeGx8frxMnTigpKUlJSUl64403VKtWLWVkZCgpKUlFRZ7/0Fer5n5vjM1mk8PhcL0eMmSIWrRooX379ik1NVVdunRRQsK5y7CCgoIUFMQvZUn6ZHFNRV50ynXztSQV/nr6P7j/d8rf398pp0MwPafuHfOdEjtmafyIdso+GFrqjJDqxZo6c4OKi/w0Zdw1Ki7iHg/4NnLN/AoL/GT7n9zy83fKeZ7/0jt8oJryjvkrqnZxBfcOVclFdYoUUaNERw95/T/J4YN8/qeiZcuW2rFjR6nQO+O7777TkSNHNH36dMXFnV7qdPPmzedtMzAw0K0MxRPNmjVT69atNW/ePC1atEjPP//8BbVjNQ6H9MlbUep661H5/+GnLq5hgWLrF+rZh+J098QDiqhxSus+suvrz8M15TWeNWN2wx74Tp267dPUh6/RrycDVCPqdKnrifxqKiryV0j1Yj0xa4OCgk7pn1OuUfXQU6oeerpsNjcnSA6H75Up+DqHbLLJgNWxDGjDqsi1qq9ttzwtnh2t2nWLT5c+bg/Ruy/VVvfbT1eM/HrCT68/HaP2PXNUo/YpHdwbqPlPxCq2fqFaXXf8T1pHVRZcvUSxf7jHPiauSA0u/1XHc/x1/Ji/7nwgW2s/sOvYoWqqU69QQx47qAN7ArVlNWWPF8rMuebzA7Vx48apbdu2GjFihIYMGaLQ0FDt2LFDK1eu1PPPP6/4+HgFBgbqueee07333qvt27dr6tSp522zXr16ys/PV1pamq688kpVr15d1atXP+97/mjIkCEaMWKEQkND9Ze//KW8X9ESvvk8XIf2Byrp9qNu+wOqSU/8a5cWTIvVpAH19esJP8XWL9LYZzN0zfWEmdn17LNXkvTUnHVu+2c+2UKr/hOvho1y1fjy0/d7LHjbfQW6u5K76lBW2f/dAr6CXKv6hj2xT6/OqKPnx1+snCMBqhldrBv/9l/1H50t6XSVyJ6dwVq5pL5O5PmrZvQpteyUpwEPZSkwyBcLrGCUy678Vf94Z5fr9b2PH5AkffJWDT03/mLVb/Krut16TKERJTqSHaCv14Tr1RkxKi6ipB+l+fxArXnz5lqzZo0effRRdejQQU6nU5dccoluu+02SVKtWrW0cOFCPfLII5o9e7Zatmypf/7zn7rlllvO2Wa7du1077336rbbbtORI0c0adIkt6WM/0y/fv00atQo9evXT8HBweX9ipbQ6rrj+vjA1rMeq9ugSBPn763U/sA39Lz23P9OJem7by7603PgGaOWIPbFm66rCnKt6qse5tDQKfs1dMr+sx4PCnFq2ptUhVjRt+vDlBR75TmPP3rHJZXYG2swc67ZnM7zVVTjbPbu3atLLrlEmzZtUsuWLT16b15enux2u4792EAR4Vw9gbue1/bydhfgg045CrVq7/PKzc1VRMT5H6R6Lmd+91zx9oPyr17++4tKThZqe99/lKtP8B3kGipKUmwLb3cBPuiUs1ir9X6VzrX9+/dr3Lhx+vDDD3Xy5Ek1bNhQqampat26tSTJ6XRq0qRJmjdvnnJycnTttddq7ty5uvTSS8vcJ36jeqC4uFhZWVl67LHH1LZtW4/DDAAAX0KuAYDnjh07pmuvvVbVqlXThx9+qB07dujpp59WjRq/P35hxowZmj17tl588UVt3LhRoaGhSkpKUkFB2R895fOlj77kyy+/VOfOnXXZZZfp3//+t7e7AwAeO7MMsRHtoOoj1wBUdd7ItaeeekpxcXFKTU117atfv/4f2nJq1qxZeuyxx9Sr1+lqqddee03R0dFaunSpbr/99jJ9DgM1D1x33XWiUhRAVWbmWn54jlwDUNUZnWt5eXlu+8/2SJJly5YpKSlJt956q9asWaO6detq2LBhuvvuuyVJe/bsUVZWlrp27ep6j91uV5s2bbR+/foyD9QofQQAAAAASXFxcbLb7a4tJSWl1Dm7d+923W/28ccfa+jQobrvvvv06quvSpKysrIkSdHR0W7vi46Odh0rC2bUAMBCmFEDAJiJ0bmWmZnptpjI/86mSZLD4VDr1q01bdo0SdJVV12l7du368UXX9SAAQPK3ZczmFEDAAtxOG2GbQAAeJvRuRYREeG2nW2gVqdOHTVt2tRtX5MmTZSRkSFJiomJkSRlZ2e7nZOdne06VhYM1AAAAACgjK699lqlp6e77fvxxx+VkJAg6fTCIjExMUpLS3Mdz8vL08aNG5WYmFjmz6H0EQAshFUfAQBm4o1cGz16tNq1a6dp06apb9+++uqrr/Tyyy/r5ZdfliTZbDaNGjVKTzzxhC699FLVr19fEyZMUGxsrHr37l3mz2GgBgAWcjrQjKjlN6AzAACUkzdy7eqrr9Z7772n8ePHa8qUKapfv75mzZql/v37u8556KGHdOLECf39739XTk6O2rdvr48++kjBwcFl/hwGagAAAADggZtuukk33XTTOY/bbDZNmTJFU6ZMueDPYKAGABbCqo8AADMxc66xmAgAAAAA+Bhm1ADAQpy/bUa0AwCAt5k51xioAYCFmLlEBABgPWbONUofAQAAAMDHMKMGAFZi5hoRAID1mDjXGKgBgJUYVCIiHywRAQBYkIlzjdJHAAAAAPAxzKgBgIU4nac3I9oBAMDbzJxrDNQAwELMvDoWAMB6zJxrlD4CAAAAgI9hRg0ArMRpM+aGaR+88ggAsCAT5xozagAAAADgY5hRAwALMfNN1wAA6zFzrjFQAwArMfGDQQEAFmTiXKP0EQAAAAB8DDNqAGAhZl7GGABgPWbONQZqAGA1PljeAQDABTNprlH6CAAAAAA+pkwzasuWLStzg7fccssFdwYAULHMXCLiCXINAMzBzLlWpoFa7969y9SYzWZTSUlJefoDAKhIJl4dyxPkGgCYhIlzrUwDNYfDUdH9AACg0pBrAABfV6571AoKCozqBwCgUtgM3MyHXAOAqsa8uebxQK2kpERTp05V3bp1FRYWpt27d0uSJkyYoAULFhjeQQCAgZwGbiZBrgFAFWbiXPN4oPbkk09q4cKFmjFjhgIDA137r7jiCs2fP9/QzgEAUNHINQCAL/J4oPbaa6/p5ZdfVv/+/eXv7+/af+WVV+qHH34wtHMAAIOZ+MrjhSLXAKAKM3GueTxQ279/vxo2bFhqv8PhUHFxsSGdAgCgspBrAABf5PFArWnTpvriiy9K7f/3v/+tq666ypBOAQAqiNNm3FZGkydPls1mc9saN27sOl5QUKDhw4erZs2aCgsLU3JysrKzsyvi258VuQYAVZgXcq2ylGl5/j+aOHGiBgwYoP3798vhcOjdd99Venq6XnvtNa1YsaIi+ggAMIjTeXozoh1PXH755Vq1apXrdUDA7/EzevRoffDBB1qyZInsdrtGjBihPn366Msvvyx/R8uAXAOAqstbuVYZPJ5R69Wrl5YvX65Vq1YpNDRUEydO1M6dO7V8+XJ169atIvoIAKjiAgICFBMT49ouuugiSVJubq4WLFigZ555Rl26dFGrVq2UmpqqdevWacOGDZXSN3INAOCLPJ5Rk6QOHTpo5cqVRvcFAFDRjLph+rc28vLy3HYHBQUpKCio1Ok//fSTYmNjFRwcrMTERKWkpCg+Pl5btmxRcXGxunbt6jq3cePGio+P1/r169W2bVsDOvvnyDUAqKIMzjVfckEDNUnavHmzdu7cKel0fX+rVq0M6xQAoIIYVYf/WxtxcXFuuydNmqTJkye77WvTpo0WLlyoRo0a6eDBg3r88cfVoUMHbd++XVlZWQoMDFRkZKTbe6Kjo5WVlVX+fnqAXAOAKsjgXPMlHg/U9u3bp379+unLL790BWtOTo7atWunxYsX6+KLLza6jwAAH5WZmamIiAjX67PNpvXo0cP15+bNm6tNmzZKSEjQ22+/rZCQkErp5/mQawAAX+TxPWpDhgxRcXGxdu7cqaNHj+ro0aPauXOnHA6HhgwZUhF9BAAYxOY0bpOkiIgIt+1sA7X/FRkZqcsuu0w///yzYmJiVFRUpJycHLdzsrOzFRMTUwF/A6WRawBQdRmda77E44HamjVrNHfuXDVq1Mi1r1GjRnruuef0+eefG9o5AIDBfODBoPn5+dq1a5fq1KmjVq1aqVq1akpLS3MdT09PV0ZGhhITEy/8QzxArgFAFeYDuVZRPC59jIuLO+sDQEtKShQbG2tIpwAA5jF27FjdfPPNSkhI0IEDBzRp0iT5+/urX79+stvtGjx4sMaMGaOoqChFRERo5MiRSkxMrLSFRMg1AIAv8nhG7R//+IdGjhypzZs3u/Zt3rxZ999/v/75z38a2jkAgMG88GDQM/eANWrUSH379lXNmjW1YcMG1apVS5I0c+ZM3XTTTUpOTlbHjh0VExOjd999t6L+Bkoh1wCgCrP6A69r1Kghm+33zp84cUJt2rRxPbD01KlTCggI0KBBg9S7d+8K6SgAoGpavHjxeY8HBwdrzpw5mjNnTiX1iFwDAPi+Mg3UZs2aVcHdAABUChM/b8YT5BoAmISJc61MA7UBAwZUdD8AAJXBxIHmCXINAEzCxLl2wQ+8lqSCggIVFRW57fvj83QAAKhKyDUAgK/weDGREydOaMSIEapdu7ZCQ0NVo0YNtw0A4MNMvIzxhSLXAKAKM3GueTxQe+ihh/Tpp59q7ty5CgoK0vz58/X4448rNjZWr732WkX0EQBgFBOvjnWhyDUAqMJMnGselz4uX75cr732mq677jrddddd6tChgxo2bKiEhAS98cYb6t+/f0X0EwCACkGuAQB8kcczakePHlWDBg0kna7bP3r0qCSpffv2+vzzz43tHQDAUDancZtZkGsAUHWZOdc8Hqg1aNBAe/bskSQ1btxYb7/9tqTTVyQjIyMN7RwAwGAmruW/UOQaAFRhJs41jwdqd911l7Zt2yZJevjhhzVnzhwFBwdr9OjRevDBBw3vIAAAFYlcAwB4YvLkybLZbG5b48aNXccLCgo0fPhw1axZU2FhYUpOTlZ2drbHn+PxPWqjR492/blr16764YcftGXLFjVs2FDNmzf3uAMAAHgTuQYA8NTll1+uVatWuV4HBPw+rBo9erQ++OADLVmyRHa7XSNGjFCfPn305ZdfevQZ5XqOmiQlJCQoISGhvM0AACqBTcbU4fve2ljGIdcAoOrwVq4FBAQoJiam1P7c3FwtWLBAixYtUpcuXSRJqampatKkiTZs2KC2bduW/TPKctLs2bPL3OB9991X5nMBAPAGcg0AcDZ5eXlur4OCghQUFFTqvJ9++kmxsbEKDg5WYmKiUlJSFB8fry1btqi4uFhdu3Z1ndu4cWPFx8dr/fr1xg/UZs6cWabGbDYbgVZGf7msmQJs1bzdDfiYQ8PrersL8EElRQXSywY1ZtSzYnzweTOeINeM99eevRTgX/o/ZmBthTfy0HiUdqq4QPrkfWMaMzjX4uLi3HZPmjRJkydPdtvXpk0bLVy4UI0aNdLBgwf1+OOPq0OHDtq+fbuysrIUGBhYajGq6OhoZWVledSlMg3UzqyGBQCAGZBrAICzyczMVEREhOv12WbTevTo4fpz8+bN1aZNGyUkJOjtt99WSEiIYX3xeNVHAEAVZuJljAEAFmRwrkVERLhtZxuo/a/IyEhddtll+vnnnxUTE6OioiLl5OS4nZOdnX3We9rOh4EaAFgJAzUAgJn4QK7l5+dr165dqlOnjlq1aqVq1aopLS3NdTw9PV0ZGRlKTEz0qN1yr/oIAAAAAFYxduxY3XzzzUpISNCBAwc0adIk+fv7q1+/frLb7Ro8eLDGjBmjqKgoRUREaOTIkUpMTPRoIRGJgRoAWIrNadAyxsyoAQB8gDdybd++ferXr5+OHDmiWrVqqX379tqwYYNq1aol6fSCVX5+fkpOTlZhYaGSkpL0wgsveNwnBmoAYCVGlS0yUAMA+AIv5NrixYvPezw4OFhz5szRnDlzytWlC7pH7YsvvtCdd96pxMRE7d+/X5L0r3/9S2vXri1XZwAA8AZyDQDgazweqL3zzjtKSkpSSEiIvvnmGxUWFko6/RTuadOmGd5BAICBfOCma19DrgFAFWbiXPN4oPbEE0/oxRdf1Lx581St2u8PbL722mv19ddfG9o5AICxztTyG7GZBbkGAFWXmXPN44Faenq6OnbsWGq/3W4v9bwAAAB8HbkGAPBFHg/UYmJi9PPPP5fav3btWjVo0MCQTgEAKojTZtxmEuQaAFRhJs41jwdqd999t+6//35t3LhRNptNBw4c0BtvvKGxY8dq6NChFdFHAAAqDLkGAPBFHi/P//DDD8vhcOj666/XyZMn1bFjRwUFBWns2LEaOXJkRfQRAGAUlucvhVwDgCrMxLnm8UDNZrPp0Ucf1YMPPqiff/5Z+fn5atq0qcLCwiqifwAAA/HA69LINQCousycaxf8wOvAwEA1bdrUyL4AAOA15BoAwJd4PFDr3LmzbLZz32z36aeflqtDAIAKZOISkQtFrgFAFWbiXPN4oNaiRQu318XFxdq6dau2b9+uAQMGGNUvAEBFMOpZMT4YaBeKXAOAKszEuebxQG3mzJln3T958mTl5+eXu0MAAFQmcg0A4Is8Xp7/XO6880698sorRjUHAKgITgM3kyPXAKAKMHGuXfBiIv9r/fr1Cg4ONqo5AEBFMHEtv9HINQCoAkycax4P1Pr06eP22ul06uDBg9q8ebMmTJhgWMcAAKgM5BoAwBd5PFCz2+1ur/38/NSoUSNNmTJF3bt3N6xjAADjmfl5MxeKXAOAqsvMuebRQK2kpER33XWXmjVrpho1alRUnwAAqBTkGgDAV3m0mIi/v7+6d++unJycCuoOAACVh1wDAPgqj1d9vOKKK7R79+6K6AsAoKKZeHWsC0WuAUAVZuJc83ig9sQTT2js2LFasWKFDh48qLy8PLcNAICqhFwDAPiiMt+jNmXKFD3wwAO68cYbJUm33HKLbDab67jT6ZTNZlNJSYnxvQQAGMLMN117ilwDgKrPzLlW5oHa448/rnvvvVefffZZRfYHAFDRfDCMvIFcAwCTMGmulXmg5nSe/hvo1KlThXUGAIDKQq4BAHyZR8vz/7EkBABQBRl1w7RJrl6SawBQxZk41zwaqF122WV/GmpHjx4tV4cAABXHzLX8F4JcA4Cqzcy55tFA7fHHH5fdbq+ovgAAUKnINQCAr/JooHb77berdu3aFdUXAEBFM3GJyIUg1wCgijNxrpV5oEYdPwBUfWYuEfEUuQYAVZ+Zc63MD7w+szoWAABmQK4BAHxZmWfUHA5HRfYDAFAZTFwi4ilyDQBMwMS5VuYZNQAAAABA5fBoMREAQBVn4iuPAAALMnGuMVADAAsx803XAADrMXOuUfoIAAAAAD6GGTUAsBITl4gAACzIxLnGjBoAWInTwO0CTZ8+XTabTaNGjXLtKygo0PDhw1WzZk2FhYUpOTlZ2dnZF/4hAABr8IFcqygM1AAAlWbTpk166aWX1Lx5c7f9o0eP1vLly7VkyRKtWbNGBw4cUJ8+fbzUSwAAvI+BGgBYyJmbro3YJCkvL89tKywsPOdn5+fnq3///po3b55q1Kjh2p+bm6sFCxbomWeeUZcuXdSqVSulpqZq3bp12rBhQ0X/lQAAqjCjc82XMFADACsxuEQkLi5OdrvdtaWkpJzzo4cPH66ePXuqa9eubvu3bNmi4uJit/2NGzdWfHy81q9fb8CXBgCYlolLH1lMBABwwTIzMxUREeF6HRQUdNbzFi9erK+//lqbNm0qdSwrK0uBgYGKjIx02x8dHa2srCxD+wsAQFXBQA0ALMTo581ERES4DdTOJjMzU/fff79Wrlyp4ODg8n84AAC/4TlqAABz8EKJyJYtW3To0CG1bNlSAQEBCggI0Jo1azR79mwFBAQoOjpaRUVFysnJcXtfdna2YmJiyvNtAQBmR+kjAAAX5vrrr9d3333ntu+uu+5S48aNNW7cOMXFxalatWpKS0tTcnKyJCk9PV0ZGRlKTEz0RpcBAPA6BmoAYCVGXTX0oI3w8HBdccUVbvtCQ0NVs2ZN1/7BgwdrzJgxioqKUkREhEaOHKnExES1bdvWgM4CAEzLC7lWWRioAQC8bubMmfLz81NycrIKCwuVlJSkF154wdvdAgDAaxioAYCF2H7bjGinPFavXu32Ojg4WHPmzNGcOXPK2TIAwEp8JdcqAgM1ALASE5eIAAAsyMS5xqqPAAAAAHCBpk+fLpvNplGjRrn2FRQUaPjw4apZs6bCwsKUnJys7Oxsj9ploAYAFnLmeTNGbAAAeJu3c23Tpk166aWX1Lx5c7f9o0eP1vLly7VkyRKtWbNGBw4cUJ8+fTxqm4EaAFiJiZ83AwCwIINzLS8vz20rLCw850fn5+erf//+mjdvnmrUqOHan5ubqwULFuiZZ55Rly5d1KpVK6WmpmrdunXasGFDmb8aAzUAAAAAkBQXFye73e7aUlJSznnu8OHD1bNnT3Xt2tVt/5YtW1RcXOy2v3HjxoqPj9f69evL3BcWEwEAq2E2DABgJgbmWmZmpiIiIlyvg4KCznre4sWL9fXXX2vTpk2ljmVlZSkwMFCRkZFu+6Ojo5WVlVXmvjBQAwALMer+Mu5RAwD4AqNzLSIiwm2gdjaZmZm6//77tXLlSgUHB5f/w8+B0kcAAAAAKKMtW7bo0KFDatmypQICAhQQEKA1a9Zo9uzZCggIUHR0tIqKipSTk+P2vuzsbMXExJT5c5hRAwArMfHzZgAAFuSFXLv++uv13Xffue2766671LhxY40bN05xcXGqVq2a0tLSlJycLElKT09XRkaGEhMTy/w5DNRQ4a5ok69bhx3Wpc1OqmbMKU0eVE/rP7K7nRPXsECDHzuo5m3z5R8g/fJjkKbeXU+H9wd6qdeoDLe23K6/tvxesZHHJUm7D0fp5bWt9OWuBElSzdCTGnX9erWtn6nQwGLtPRqpBWtbKi39Em92G4DF9b3jB7XrcEAXxx9XUaG/dn4fpVdebqb9meGuc0aM+VpXtTykqIt+VcGvAdrxfU2lvnSF9mWev6QKVdst1+1Qr+t2KqZmviRp74EaenX5Vfpqe5wk6aaOP6hrm591afwRhYYU66aRf1P+r2e/Bwq+Kzw8XFdccYXbvtDQUNWsWdO1f/DgwRozZoyioqIUERGhkSNHKjExUW3bti3z51D6KMlms2np0qVlOnfy5Mlq0aJFhfbHbIKrO7T7+2A9/8jFZz1eJ6FQzyz9WZk/B+nBv16ie6+/TItmRauowFbJPUVlyz4epuc+a6v+C/6q/q/8VV/9Ulczb/1IDS46Kkmaekua6tXM0aglPXTrvNv06Q8N9FSflWoUfdjLPa+6vP28GVQOcq1iXXHlf7ViaQONGd5Zjz7YXv4BTj05Y62Cgk+5zvn5x0jNnNFK9wzorsceai+bpCf+sVZ+fvzjMbPDx0L18jvX6O9Te+ueJ3rr6x/q6MkRK1Uv9pgkKTjwlL7aHqc3/tPCux01EV/NtZkzZ+qmm25ScnKyOnbsqJiYGL377rsetWGJGbXDhw9r4sSJ+uCDD5Sdna0aNWroyiuv1MSJE3Xttdfq4MGDbs8+gLE2fxahzZ+d+wriwIez9NWnEVrwRKxr38FfuLpkBZ//VM/t9ZzVbXRry+/VvG62dv83SldenKVpH3bU9weiJUnzv2yl/tdsU9M6h5WeXcsLPTYBSh9NgVzzronj2ru9fmZ6ay1eukKXXnZM2789/bvpoxUNXMcPZYfqtVcu1wsLVql2zAllHQir1P6i8qzfluD2esF7V6vXdT+oaYND2nughv696vRsS4tGB7zRPXPykVxbvXq12+vg4GDNmTNHc+bMueA2LTFQS05OVlFRkV599VU1aNBA2dnZSktL05EjRyTJo5v6YCybzalrrs/Tkhdq68lFu9TwigJlZQRq8fO1S5VHwtz8bA51a7JLIdWK9e3+0wOzbfti1L3pLn3xc4KOFwSpe9OfFRRQos2/1PVybwHvItd8S2hosSTpeN7Zy/WDgk+p2w17dfBAdf33UPXK7Bq8yM/m0HWt9yg4sFjf76rt7e6gCjL9QC0nJ0dffPGFVq9erU6dOkmSEhISdM0117jOsdlseu+999S7d29J0r59+/Tggw/q448/VmFhoZo0aaI5c+aoTZs2pdrftWuXunXrphtvvFHPPfecbDb3cr3CwkK3J5rn5eVVwLesuiIvOqXqYQ7dNuKQFj4VowVPxqp15zxNnL9XD/31En23gauOZtew1hG9OvBdBQaU6Neianrg3zdo93+jJEkPvdtdT/1lpdY8kKriEj8VFAdozL9vUOYxBvEXiuX5qz5yzbfYbE7dM2Kbvv+upn7Z6/67qWevXRp0z3cKCSlRZkaYHn2wg06d4q4Ts6tf96heGL9MgdVK9GthNU14oZt+OcgMd0Uxc66Z/rdFWFiYwsLCtHTpUrdgOZf8/Hx16tRJ+/fv17Jly7Rt2zY99NBDcjgcpc799ttv1b59e91xxx16/vnnS4WZJKWkpLg93TwuLs6Q72UWtt9+Atd/HKH35tXS7u9D9Pbz0dq4KkI9/++IdzuHSrH3SKRun99X/5earCVbLteUmz913aM2vNNXCg8u1D1v3Kw7X0nW618114w+n6hhLX42LpjTwA1eQa75lmH3f6OE+nmaPuWaUsc+WxWvkXdfr4fu76j9meEaP2mjqlUr8UIvUZkys+waMuUvGjqtl95f3UTjB61RQp1j3u6WeZk410w/UAsICNDChQv16quvKjIyUtdee60eeeQRffvtt2c9f9GiRTp8+LCWLl2q9u3bq2HDhurbt2+ppTTXrVun6667TmPHjtUTTzxxzs8fP368cnNzXVtmZqah36+qyzvqr1PF0i8/uj8sMPOnINWuW+SlXqEynXL4K/OYXTuzaum51W3146Ga6nf1d7o4Mle3X71dk1d01ld7L9aPhy7Sy19crR0Ha+m21tu93W3Aa8g13zH0vm90TWKWHh7dUUf+W7qk8eSJajqwP1zbv62laZPbKi7uuNp14N4ksztV4q/9h+z68ZeLNO/dq7UrM0rJXb/3drdQBZl+oCadruU/cOCAli1bphtuuEGrV69Wy5YttXDhwlLnbt26VVdddZWioqLO2V5GRoa6deumiRMn6oEHHjjvZwcFBbmecF6WJ51bzaliP/24rbouvsT9qnDdBoU6tI+l+a3IZnMq0L9EwdVOr57mdLpf0S9x+Mnmi/UJVYWJrzxaCbnmbU4Nve8bJbY/oPFjOig7K/TP32JzSjYxo2ZBNptTgQH8/15hTJxrlhioSadXXunWrZsmTJigdevWaeDAgZo0aVKp80JCQv60rVq1aumaa67Rm2++afna/LIIrl6iBpf/qgaX/ypJiokrUoPLf1Wt32bMlrxQW51uyVGPO44otl6hbrnrv2rbLU/LX63pzW6jEoy8boNaxh1QHXueGtY6opHXbVDrhAP6z/eXau+RSGUcteuxG9fo8thsXRyZq7+12aq2DTK1Or2+t7teZfnqMsbwHLnmPcNGbVXnbpma8eQ1+vVkNdWoUaAaNQoUGHj6P8Zj6uSr7x0/qOFlx1Sr9kk1ufyIHpm0UUWF/tq0kYVezOzuPpvU/NKDiql5XPXrHtXdfTapRaODWrnx9PM/oyJOqmHcEdWtffrfWf2Lj6lh3BGFhxZ4s9tVmplzzfSLiZxL06ZNz/qMmebNm2v+/Pk6evToOa8+hoSEaMWKFbrxxhuVlJSkTz75ROHh4Wc9F9JlV/6qf7yzy/X63sdPl3188lYNPT06Xus+smv2w3V1+4hDGjp1v/btPv2w6++/YiERs4sK/VVTb/lUF4WdUH5hoH46VFPD3rxJG/ecvudl5OIbdV+XDXr21g9VPbBYmcfsmrisi9b+9kBsAL8j1yrPTb12S5JmzPrcbf8z01tp1cf1VFTkr8ub/Ve9kn9WWHiRco4Fa/u3F+mBkdcpNyf4bE3CJCLDf9Ujg9coyn5SJ34N1O59UXpw1g3asuP0s2RvuW6nBt7yjev858atkCRNf6WjPlp3mVf6DN9l+oHakSNHdOutt2rQoEFq3ry5wsPDtXnzZs2YMUO9evUqdX6/fv00bdo09e7dWykpKapTp46++eYbxcbGutXzh4aG6oMPPlCPHj3Uo0cPffTRRwoLY2BxNt+uD1NS7JXnPeeTxTX1yWJm0Kzm8Q86n/d4xrFIjX3nhkrqjUUYVd7hg1cerYJc874bOyef9/jRIyGaNL79ec+BOf3j1Y7nPb5wWSstXNaqknpjESbONdOXPoaFhalNmzaaOXOmOnbsqCuuuEITJkzQ3Xffreeff77U+YGBgfrkk09Uu3Zt3XjjjWrWrJmmT58uf3//s7b94Ycfyul0qmfPnjpx4kRlfCUAuGA2p9OwDd5BrgHA78ycazan0wd7ZWJ5eXmy2+26Tr0UYKvm7e7Axxwa3s7bXYAPKikq0PcvP6Lc3NwLXrjhzO+eFn97Uv6B5S+9Kikq0NZ/PVquPsEczvxsXX/paAX4B3m7O/AxJy/h+WEo7VRxgdZ/Molc+xOmL30EAPyBiUtEAAAWZOJcM33pIwAAAABUNcyoAYCFGLUEsS8uYwwAsB4z5xoDNQCwEhOXiAAALMjEuUbpIwAAAAD4GGbUAMBCzFwiAgCwHjPnGgM1ALASE5eIAAAsyMS5RukjAAAAAPgYZtQAwELMXCICALAeM+caAzUAsBITl4gAACzIxLlG6SMAAAAA+Bhm1ADAYnyxvAMAgAtl1lxjRg0AAAAAfAwzagBgJU7n6c2IdgAA8DYT5xoDNQCwEDOvjgUAsB4z5xqljwAAAADgY5hRAwArMfEyxgAACzJxrjFQAwALsTlOb0a0AwCAt5k51yh9BAAAAAAfw4waAFiJiUtEAAAWZOJcY6AGABZi5tWxAADWY+Zco/QRAAAAAHwMM2oAYCUmfjAoAMCCTJxrDNQAwELMXCICALAeM+capY8AAAAA4GOYUQMAKzHx6lgAAAsyca4xowYAAAAAPoYZNQCwEDPX8gMArMfMucZADQCsxMSrYwEALMjEuUbpIwAAAAD4GGbUAMBCzFwiAgCwHjPnGjNqAGAlTgO3Mpo7d66aN2+uiIgIRUREKDExUR9++KHreEFBgYYPH66aNWsqLCxMycnJys7OLvdXBQBYgBdyrbIwUAMAVKiLL75Y06dP15YtW7R582Z16dJFvXr10vfffy9JGj16tJYvX64lS5ZozZo1OnDggPr06ePlXgMA4F2UPgKAhXijROTmm292e/3kk09q7ty52rBhgy6++GItWLBAixYtUpcuXSRJqampatKkiTZs2KC2bduWv7MAANMyc+kjAzUAsBKH8/RmRDuS8vLy3HYHBQUpKCjonG8rKSnRkiVLdOLECSUmJmrLli0qLi5W165dXec0btxY8fHxWr9+PQM1AMD5GZxrvoTSRwDABYuLi5PdbndtKSkpZz3vu+++U1hYmIKCgnTvvffqvffeU9OmTZWVlaXAwEBFRka6nR8dHa2srKxK+AYAAPgmBmoAYCUG33SdmZmp3Nxc1zZ+/PizfmyjRo20detWbdy4UUOHDtWAAQO0Y8eOCvuaAACLMPEiWQzUAAAX7ExIndnOVfYYGBiohg0bqlWrVkpJSdGVV16pZ599VjExMSoqKlJOTo7b+dnZ2YqJiamEbwAAgGcqa5Es7lEDAAuxyaCbrsv5fofDocLCQrVq1UrVqlVTWlqakpOTJUnp6enKyMhQYmJi+TsKADA1o3OtLPdeV9YiWQzUAMBKnM7TmxHtlNH48ePVo0cPxcfH6/jx41q0aJFWr16tjz/+WHa7XYMHD9aYMWMUFRWliIgIjRw5UomJiSwkAgD4cwbnWlxcnNvuSZMmafLkyed8W0UuksVADQBQoQ4dOqT/+7//08GDB2W329W8eXN9/PHH6tatmyRp5syZ8vPzU3JysgoLC5WUlKQXXnjBy70GAFhRZmamIiIiXK/PVdL/3XffKTExUQUFBQoLC3MtkrV161bDFslioAYAFuKN580sWLDgvMeDg4M1Z84czZkzp5y9AgBYjdG5duae6z9zZpGs3Nxc/fvf/9aAAQO0Zs2a8nfkDxioAYCVeLiy1XnbAQDA27yUa2cWyZKkVq1aadOmTXr22Wd12223uRbJ+uOs2oUsksWqjwAAAABQDmdbJOuMC10kixk1ALAQm9MpmwE3XRvRBgAA5eWNXKusRbIYqAGAlTh+24xoBwAAb/NCrlXWIlkM1AAAAACgjCprkSwGagBgIZQ+AgDMxMy5xkANAKyEVR8BAGZi4lxj1UcAAAAA8DHMqAGAlTidpzcj2gEAwNtMnGvMqAEAAACAj2FGDQAsxOY8vRnRDgAA3mbmXGOgBgBWYuISEQCABZk41yh9BAAAAAAfw4waAFiIzXF6M6IdAAC8zcy5xkANAKzExCUiAAALMnGuUfoIAAAAAD6GGTUAsBLnb5sR7QAA4G0mzjUGapXM+du06ikV++QPBLyrpKjA212ADzrzc+E0oCzD5nTKZlA7gPSHXCsp9HJP4ItOFZNrKO3UKXKtLBioVbLjx49LktbqP17uCXzSy+97uwfwYcePH5fdbvd2NwA3Z3Jtze4XvNwT+KSfvN0B+DJy7fwYqFWy2NhYZWZmKjw8XDabzdvd8bq8vDzFxcUpMzNTERER3u4OfAQ/F+6cTqeOHz+u2NhYIxoz7U3X8A5yzR2/v3A2/Fy4I9fKhoFaJfPz89PFF1/s7W74nIiICH5xoRR+Ln7HFUf4KnLt7Pj9hbPh5+J35NqfY6AGAFbilGTEs2J878IjAMCKTJxrDNQAwELMfNM1AMB6zJxrPEcNXhUUFKRJkyYpKCjI212BD+HnAkBVxe8vnA0/F7gQNqcR62ICAHxaXl6e7Ha7urR4WAH+5f8PhVMlhfp063Tl5uZyvwUAoNJZIdcofQQAKzHx6lgAAAsyca5R+ggAAAAAPoYZNQCwEockIx51ZcQKWwAAlJeJc42BGgBYiJlXxwIAWI+Zc43SR3jd6tWrZbPZlJOTc97z6tWrp1mzZlVKn1D5bDabli5dWqZzJ0+erBYtWlRofwDgQpFrkMg1lB8DNZzTwIED1bt371L7yxpAF2rhwoWKjIyskLbhPYcPH9bQoUMVHx+voKAgxcTEKCkpSV9++aUk6eDBg+rRo4eXe2kBZ266NmIDqhhyDUYi13yEiXON0kcAlSI5OVlFRUV69dVX1aBBA2VnZystLU1HjhyRJMXExHi5hxZh4tWxAKAykWs+wsS5xowaym3t2rXq0KGDQkJCFBcXp/vuu08nTpxwHf/Xv/6l1q1bKzw8XDExMbrjjjt06NChs7a1evVq3XXXXcrNzZXNZpPNZtPkyZNdx0+ePKlBgwYpPDxc8fHxevnll13HunTpohEjRri1d/jwYQUGBiotLc3YLw2P5OTk6IsvvtBTTz2lzp07KyEhQddcc43Gjx+vW265RVLpEpF9+/apX79+ioqKUmhoqFq3bq2NGzeetf1du3apQYMGGjFihHg0JIDyItfwZ8g1VAYGaiiXXbt26YYbblBycrK+/fZbvfXWW1q7dq1bsBQXF2vq1Knatm2bli5dqr1792rgwIFnba9du3aaNWuWIiIidPDgQR08eFBjx451HX/66afVunVrffPNNxo2bJiGDh2q9PR0SdKQIUO0aNEiFRYWus5//fXXVbduXXXp0qVi/gJQJmFhYQoLC9PSpUvd/v85l/z8fHXq1En79+/XsmXLtG3bNj300ENyOEovyfTtt9+qffv2uuOOO/T888/LZjNi6ScTM3GJCGAEcg1lQa75EBPnGqWPOK8VK1YoLCzMbV9JSYnrzykpKerfv79GjRolSbr00ks1e/ZsderUSXPnzlVwcLAGDRrkOr9BgwaaPXu2rr76auXn55dqOzAwUHa7XTab7awlAzfeeKOGDRsmSRo3bpxmzpypzz77TI0aNVKfPn00YsQIvf/+++rbt6+k0/cFDBw4kF9yXhYQEKCFCxfq7rvv1osvvqiWLVuqU6dOuv3229W8efNS5y9atEiHDx/Wpk2bFBUVJUlq2LBhqfPWrVunm266SY8++qgeeOCBCv8eAKo+cg1GINdQGZhRw3l17txZW7duddvmz5/vOr5t2zYtXLjQdWUpLCxMSUlJcjgc2rNnjyRpy5YtuvnmmxUfH6/w8HB16tRJkpSRkeFxf/74y+9M6J0pNwkODtbf/vY3vfLKK5Kkr7/+Wtu3bz/nVU5UruTkZB04cEDLli3TDTfcoNWrV6tly5ZauHBhqXO3bt2qq666yhVmZ5ORkaFu3bpp4sSJhJknHAZuQBVErsEo5JqPMHGuMaOG8woNDS11xWffvn2uP+fn5+uee+7RfffdV+q98fHxOnHihJKSkpSUlKQ33nhDtWrVUkZGhpKSklRUVORxf6pVq+b22mazuZUNDBkyRC1atNC+ffuUmpqqLl26KCEhwePPQcUIDg5Wt27d1K1bN02YMEFDhgzRpEmTSv1HR0hIyJ+2VatWLcXGxurNN9/UoEGDFBERUUG9NhczP28GKAtyDUYi17zPzLnGjBrKpWXLltqxY4caNmxYagsMDNQPP/ygI0eOaPr06erQoYMaN258zhuuzwgMDHQrQ/FEs2bN1Lp1a82bN0+LFi1yK0+B72natKnbDfpnNG/eXFu3btXRo0fP+d6QkBCtWLFCwcHBSkpK0vHjxyuyqwAsglxDeZBrMBIDNZTLuHHjtG7dOo0YMUJbt27VTz/9pPfff99103V8fLwCAwP13HPPaffu3Vq2bJmmTp163jbr1aun/Px8paWl6b///a9OnjzpUZ+GDBmi6dOny+l06i9/+csFfzcY58iRI+rSpYtef/11ffvtt9qzZ4+WLFmiGTNmqFevXqXO79evn2JiYtS7d299+eWX2r17t9555x2tX7/e7bzQ0FB98MEHCggIUI8ePZSfn19ZX6nqMvFN14ARyDWUBbnmQ0ycawzUUC7NmzfXmjVr9OOPP6pDhw666qqrNHHiRMXGxko6PY2/cOFCLVmyRE2bNtX06dP1z3/+87xttmvXTvfee69uu+021apVSzNmzPCoT/369VNAQID69eun4ODgC/5uME5YWJjatGmjmTNnqmPHjrriiis0YcIE3X333Xr++edLnR8YGKhPPvlEtWvX1o033qhmzZpp+vTp8vf3P2vbH374oZxOp3r27HnWK5n4A4fTuA0wIXINZUGu+RAT55rNycMZYDJ79+7VJZdcok2bNqlly5be7g7gE/Ly8mS329X1klEK8A8qd3unSgq1atcs5ebmch8FUMHINaA0K+Qai4nANIqLi3XkyBE99thjatu2LWEGnI1R5R1c4wMqHLkGlIGJc42BGkzjyy+/VOfOnXXZZZfp3//+t7e7A/goo+rwfS/QALMh14CyMG+uMVCDaVx33XWikhcAYBbkGmBtDNQAwEpMXCICALAgE+caqz4CAAAAgI9hRg0ArMThlCF1+D64jDEAwIJMnGsM1ADASpyO05sR7QAA4G0mzjVKHwEPDBw4UL1793a9vu666zRq1KhK78fq1atls9mUk5NzznNsNpuWLl1a5jYnT56sFi1alKtfe/fulc1m09atW8vVDgCgcpBr50euwZsYqKHKGzhwoGw2m2w2mwIDA9WwYUNNmTJFp06dqvDPfvfddzV16tQynVuWEAIq3Jmbro3YAFQIcg3wgIlzjdJHmMINN9yg1NRUFRYW6j//+Y+GDx+uatWqafz48aXOLSoqUmBgoCGfGxUVZUg7QKUxcS0/YCbkGlBGJs41ZtRgCkFBQYqJiVFCQoKGDh2qrl27atmyZZJ+L+t48sknFRsbq0aNGkmSMjMz1bdvX0VGRioqKkq9evXS3r17XW2WlJRozJgxioyMVM2aNfXQQw+Vep7N/5aIFBYWaty4cYqLi1NQUJAaNmyoBQsWaO/evercubMkqUaNGrLZbBo4cKAkyeFwKCUlRfXr11dISIiuvPLKUg82/c9//qPLLrtMISEh6ty5s1s/y2rcuHG67LLLVL16dTVo0EATJkxQcXFxqfNeeuklxcXFqXr16urbt69yc3Pdjs+fP19NmjRRcHCwGjdurBdeeMHjvgAAzo9c+3PkGrwlJSVFV199tcLDw1W7dm317t1b6enpbucUFBRo+PDhqlmzpsLCwpScnKzs7GyPPoeBGkwpJCRERUVFrtdpaWlKT0/XypUrtWLFChUXFyspKUnh4eH64osv9OWXXyosLEw33HCD631PP/20Fi5cqFdeeUVr167V0aNH9d577533c//v//5Pb775pmbPnq2dO3fqpZdeUlhYmOLi4vTOO+9IktLT03Xw4EE9++yzkk7/Y3/ttdf04osv6vvvv9fo0aN15513as2aNZJOB2+fPn108803a+vWrRoyZIgefvhhj/9OwsPDtXDhQu3YsUPPPvus5s2bp5kzZ7qd8/PPP+vtt9/W8uXL9dFHH+mbb77RsGHDXMffeOMNTZw4UU8++aR27typadOmacKECXr11Vc97g+8xMQlIoCZkWulkWuQ5JVcW7NmjYYPH64NGzZo5cqVKi4uVvfu3XXixAnXOaNHj9by5cu1ZMkSrVmzRgcOHFCfPn08+mqUPsJUnE6n0tLS9PHHH2vkyJGu/aGhoZo/f76rNOT111+Xw+HQ/PnzZbPZJEmpqamKjIzU6tWr1b17d82aNUvjx493/aN68cUX9fHHH5/zs3/88Ue9/fbbWrlypbp27SpJatCggev4mXKS2rVrKzIyUtLpK5XTpk3TqlWrlJiY6HrP2rVr9dJLL6lTp06aO3euLrnkEj399NOSpEaNGum7777TU0895dHfzWOPPeb6c7169TR27FgtXrxYDz30kGt/QUGBXnvtNdWtW1eS9Nxzz6lnz556+umnFRMTo0mTJunpp592/Z3Ur19fO3bs0EsvvaQBAwZ41B94iVMGPRi0/E0A+HPk2rmRa5BkeK7l5eW57Q4KClJQUJDbvo8++sjt9cKFC1W7dm1t2bJFHTt2VG5urhYsWKBFixapS5cukk7/e2zSpIk2bNigtm3blqlLDNRgCitWrFBYWJiKi4vlcDh0xx13aPLkya7jzZo1c6vf37Ztm37++WeFh4e7tVNQUKBdu3YpNzdXBw8eVJs2bVzHAgIC1Lp161JlImds3bpV/v7+6tSpU5n7/fPPP+vkyZPq1q2b2/6ioiJdddVVkqSdO3e69UOSK/w88dZbb2n27NnatWuX8vPzderUKUVERLidEx8f7wqzM5/jcDiUnp6u8PBw7dq1S4MHD9bdd9/tOufUqVOy2+0e9wfWkZKSonfffVc//PCDQkJC1K5dOz311FOuci3p9L+9Bx54QIsXL1ZhYaGSkpL0wgsvKDo62os9B7yHXPtz5BoqQlxcnNvrSZMmuf3bO5sz5bRnLl5s2bJFxcXFrgscktS4cWPFx8dr/fr1DNRgLZ07d9bcuXMVGBio2NhYBQS4/2iHhoa6vc7Pz1erVq30xhtvlGqrVq1aF9SHkJAQj9+Tn58vSfrggw/cgkRSqas35bF+/Xr1799fjz/+uJKSkmS327V48WLX1UxP+jpv3rxSAevv729YX1HBjCpbvIASkauvvlqnTp3SI488ou7du2vHjh2uf5ujR4/WBx98oCVLlshut2vEiBHq06ePvvzyy/L3FaiCyLXzI9fgYnCuZWZmug34/+zn1uFwaNSoUbr22mt1xRVXSJKysrIUGBjommk+Izo6WllZWWXuEgM1mEJoaKgaNmxY5vNbtmypt956S7Vr1y519e2MOnXqaOPGjerYsaOk01fYtmzZopYtW571/GbNmsnhcGjNmjVuV1DOOHPls6SkxLWvadOmCgoKUkZGxjmvWDZp0sR1A/kZGzZs+PMv+Qfr1q1TQkKCHn30Ude+X375pdR5GRkZOnDggGJjY12f4+fnp0aNGik6OlqxsbHavXu3+vfv79Hnw4c4HJIMeKin43QbvlQiApgJuXZ+5BpcDM61iIiIc/4bOpvhw4dr+/btWrt2bfn78D9YTASW1L9/f1100UXq1auXvvjiC+3Zs0erV6/Wfffdp3379kmS7r//fk2fPl1Lly7VDz/8oGHDhp33WTH16tXTgAEDNGjQIC1dutTV5ttvvy1JSkhIkM1m04oVK3T48GHl5+crPDxcY8eO1ejRo/Xqq69q165d+vrrr/Xcc8+5bmS+99579dNPP+nBBx9Uenq6Fi1apIULF3r0fS+99FJlZGRo8eLF2rVrl2bPnn3WG8iDg4M1YMAAbdu2TV988YXuu+8+9e3bVzExMZKkxx9/XCkpKZo9e7Z+/PFHfffdd0pNTdUzzzzjUX9gHnFxcbLb7a4tJSXlT9/jaYkIgD9HrpFrqHwjRozQihUr9Nlnn+niiy927Y+JiVFRUVGpf1/Z2dmun72yYKAGS6pevbo+//xzxcfHq0+fPmrSpIkGDx6sgoIC11WUBx54QH/72980YMAAJSYmKjw8XH/5y1/O2+7cuXP117/+VcOGDVPjxo119913u1YAqlu3rh5//HE9/PDDio6O1ogRIyRJU6dO1YQJE5SSkqImTZrohhtu0AcffKD69etLOl1f/84772jp0qW68sor9eKLL2ratGkefd9bbrlFo0eP1ogRI9SiRQutW7dOEyZMKHVew4YN1adPH914443q3r27mjdv7rZM8ZAhQzR//nylpqaqWbNm6tSpkxYuXOjqK6oAg1fHyszMVG5urms72zOe/qgiS0QAKyPXyDXL8sKqj06nUyNGjNB7772nTz/9tNTPS6tWrVStWjWlpaW59qWnpysjI8Oj+zFtznPdQQoAMI28vDzZ7XZ1rTVYAX7lfzDuKUeRVh1eoNzcXI9KRIYOHaoPP/xQa9eudV19XLRoke666y4VFha6nXvNNdeoc+fOHq8EBwAwP2/m2rBhw7Ro0SK9//77bgtj2e12172dQ4cO1X/+8x8tXLhQERERrlVb161bV+Y+cY8aAFiJFxYTOeNMicjnn39+zhKRP86qeVoiAgCwIC/k2ty5cyWdfkD8H6Wmproe/D5z5kz5+fkpOTnZbTVjTzBQAwArcThlyEPQHJ6ViIwcOVLvvfeeVq9efd4SkeTkZEkXViICALAgL+XanwkODtacOXM0Z86cC+4SAzUAQIUaPny4q0QkPDzcdd/ZmRIRu92uwYMHa8yYMYqKinKViCQmJrLiIwDAshioAYCFOJ0OOZ3lX8bYkzYqq0QEAGA93si1ysJADQCsxOn0qLzjvO2U+dTKKREBAFiQF3KtsrA8PwAAAAD4GGbUAMBKnAbddO2DVx4BABZk4lxjoAYAVuJwSDYD6vB9sJYfAGBBJs41Sh8BAAAAwMcwowYAVmLiEhEAgAWZONeYUQMAAAAAH8OMGgBYiNPhkNOAWn5ffN4MAMB6zJxrDNQAwEpMXCICALAgE+capY8AAAAA4GOYUQMAK3E4JZs5rzwCACzIxLnGQA0ArMTplGTE82Z8L9AAABZk4lyj9BEAAAAAfAwzagBgIU6HU04DSkScPnjlEQBgPWbONQZqAGAlToeMKRHxvWWMAQAWZOJco/QRAAAAAHwMM2oAYCFmLhEBAFiPmXONgRoAWImJS0QAABZk4lxjoAYAFnJKxZIBFw1Pqbj8jQAAUE5mzjUGagBgAYGBgYqJidHarP8Y1mZMTIwCAwMNaw8AgLKyQq7ZnL5YkAkAMFxBQYGKiooMay8wMFDBwcGGtQcAgCfMnmsM1AAAAADAx7A8PwAAAAD4GAZqAAAAAOBjGKgBAAAAgI9hoAYAAAAAPoaBGgAAAAD4GAZqAAAAAOBjGKgBAAAAgI/5f9YeTRYCVhCDAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
+ ")\n",
+ "\n",
+ "for index in range(0, len(optimized_metrics)):\n",
+ " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"Healthy\", \"Sick\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Регрессионная модель"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 304,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
+ " 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 148 | \n",
+ " 72 | \n",
+ " 35 | \n",
+ " 0 | \n",
+ " 33.6 | \n",
+ " 0.627 | \n",
+ " 50 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 85 | \n",
+ " 66 | \n",
+ " 29 | \n",
+ " 0 | \n",
+ " 26.6 | \n",
+ " 0.351 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 8 | \n",
+ " 183 | \n",
+ " 64 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 23.3 | \n",
+ " 0.672 | \n",
+ " 32 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 89 | \n",
+ " 66 | \n",
+ " 23 | \n",
+ " 94 | \n",
+ " 28.1 | \n",
+ " 0.167 | \n",
+ " 21 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 137 | \n",
+ " 40 | \n",
+ " 35 | \n",
+ " 168 | \n",
+ " 43.1 | \n",
+ " 2.288 | \n",
+ " 33 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 763 | \n",
+ " 10 | \n",
+ " 101 | \n",
+ " 76 | \n",
+ " 48 | \n",
+ " 180 | \n",
+ " 32.9 | \n",
+ " 0.171 | \n",
+ " 63 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 764 | \n",
+ " 2 | \n",
+ " 122 | \n",
+ " 70 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ " 36.8 | \n",
+ " 0.340 | \n",
+ " 27 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 765 | \n",
+ " 5 | \n",
+ " 121 | \n",
+ " 72 | \n",
+ " 23 | \n",
+ " 112 | \n",
+ " 26.2 | \n",
+ " 0.245 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 766 | \n",
+ " 1 | \n",
+ " 126 | \n",
+ " 60 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30.1 | \n",
+ " 0.349 | \n",
+ " 47 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 767 | \n",
+ " 1 | \n",
+ " 93 | \n",
+ " 70 | \n",
+ " 31 | \n",
+ " 0 | \n",
+ " 30.4 | \n",
+ " 0.315 | \n",
+ " 23 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
768 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "0 6 148 72 35 0 33.6 \n",
+ "1 1 85 66 29 0 26.6 \n",
+ "2 8 183 64 0 0 23.3 \n",
+ "3 1 89 66 23 94 28.1 \n",
+ "4 0 137 40 35 168 43.1 \n",
+ ".. ... ... ... ... ... ... \n",
+ "763 10 101 76 48 180 32.9 \n",
+ "764 2 122 70 27 0 36.8 \n",
+ "765 5 121 72 23 112 26.2 \n",
+ "766 1 126 60 0 0 30.1 \n",
+ "767 1 93 70 31 0 30.4 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "0 0.627 50 1 \n",
+ "1 0.351 31 0 \n",
+ "2 0.672 32 1 \n",
+ "3 0.167 21 0 \n",
+ "4 2.288 33 1 \n",
+ ".. ... ... ... \n",
+ "763 0.171 63 0 \n",
+ "764 0.340 27 0 \n",
+ "765 0.245 30 0 \n",
+ "766 0.349 47 1 \n",
+ "767 0.315 23 0 \n",
+ "\n",
+ "[768 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 304,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "random_state=9\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "df = pd.read_csv(\".//scv//diabetes.csv\")\n",
+ "print(df.columns)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Разделение набора данных на обучающую и тестовые выборки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 305,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 60 | \n",
+ " 2 | \n",
+ " 84 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0.0 | \n",
+ " 0.304 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 618 | \n",
+ " 9 | \n",
+ " 112 | \n",
+ " 82 | \n",
+ " 24 | \n",
+ " 0 | \n",
+ " 28.2 | \n",
+ " 1.282 | \n",
+ " 50 | \n",
+ "
\n",
+ " \n",
+ " 346 | \n",
+ " 1 | \n",
+ " 139 | \n",
+ " 46 | \n",
+ " 19 | \n",
+ " 83 | \n",
+ " 28.7 | \n",
+ " 0.654 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " 294 | \n",
+ " 0 | \n",
+ " 161 | \n",
+ " 50 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 21.9 | \n",
+ " 0.254 | \n",
+ " 65 | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " 6 | \n",
+ " 134 | \n",
+ " 80 | \n",
+ " 37 | \n",
+ " 370 | \n",
+ " 46.2 | \n",
+ " 0.238 | \n",
+ " 46 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 71 | \n",
+ " 5 | \n",
+ " 139 | \n",
+ " 64 | \n",
+ " 35 | \n",
+ " 140 | \n",
+ " 28.6 | \n",
+ " 0.411 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " 106 | \n",
+ " 1 | \n",
+ " 96 | \n",
+ " 122 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 22.4 | \n",
+ " 0.207 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " 270 | \n",
+ " 10 | \n",
+ " 101 | \n",
+ " 86 | \n",
+ " 37 | \n",
+ " 0 | \n",
+ " 45.6 | \n",
+ " 1.136 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " 435 | \n",
+ " 0 | \n",
+ " 141 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 42.4 | \n",
+ " 0.205 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 102 | \n",
+ " 0 | \n",
+ " 125 | \n",
+ " 96 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 22.5 | \n",
+ " 0.262 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "60 2 84 0 0 0 0.0 \n",
+ "618 9 112 82 24 0 28.2 \n",
+ "346 1 139 46 19 83 28.7 \n",
+ "294 0 161 50 0 0 21.9 \n",
+ "231 6 134 80 37 370 46.2 \n",
+ ".. ... ... ... ... ... ... \n",
+ "71 5 139 64 35 140 28.6 \n",
+ "106 1 96 122 0 0 22.4 \n",
+ "270 10 101 86 37 0 45.6 \n",
+ "435 0 141 0 0 0 42.4 \n",
+ "102 0 125 96 0 0 22.5 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age \n",
+ "60 0.304 21 \n",
+ "618 1.282 50 \n",
+ "346 0.654 22 \n",
+ "294 0.254 65 \n",
+ "231 0.238 46 \n",
+ ".. ... ... \n",
+ "71 0.411 26 \n",
+ "106 0.207 27 \n",
+ "270 1.136 38 \n",
+ "435 0.205 29 \n",
+ "102 0.262 21 \n",
+ "\n",
+ "[614 rows x 8 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 60 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 618 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 346 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 294 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 231 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 71 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 106 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 270 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 435 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 102 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
614 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "60 0\n",
+ "618 1\n",
+ "346 0\n",
+ "294 0\n",
+ "231 1\n",
+ ".. ...\n",
+ "71 0\n",
+ "106 0\n",
+ "270 1\n",
+ "435 1\n",
+ "102 0\n",
+ "\n",
+ "[614 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Pregnancies | \n",
+ " Glucose | \n",
+ " BloodPressure | \n",
+ " SkinThickness | \n",
+ " Insulin | \n",
+ " BMI | \n",
+ " DiabetesPedigreeFunction | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 668 | \n",
+ " 6 | \n",
+ " 98 | \n",
+ " 58 | \n",
+ " 33 | \n",
+ " 190 | \n",
+ " 34.0 | \n",
+ " 0.430 | \n",
+ " 43 | \n",
+ "
\n",
+ " \n",
+ " 324 | \n",
+ " 2 | \n",
+ " 112 | \n",
+ " 75 | \n",
+ " 32 | \n",
+ " 0 | \n",
+ " 35.7 | \n",
+ " 0.148 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 624 | \n",
+ " 2 | \n",
+ " 108 | \n",
+ " 64 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30.8 | \n",
+ " 0.158 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " 690 | \n",
+ " 8 | \n",
+ " 107 | \n",
+ " 80 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 24.6 | \n",
+ " 0.856 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " 473 | \n",
+ " 7 | \n",
+ " 136 | \n",
+ " 90 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 29.9 | \n",
+ " 0.210 | \n",
+ " 50 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 355 | \n",
+ " 9 | \n",
+ " 165 | \n",
+ " 88 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 30.4 | \n",
+ " 0.302 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ " 534 | \n",
+ " 1 | \n",
+ " 77 | \n",
+ " 56 | \n",
+ " 30 | \n",
+ " 56 | \n",
+ " 33.3 | \n",
+ " 1.251 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " 344 | \n",
+ " 8 | \n",
+ " 95 | \n",
+ " 72 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 36.8 | \n",
+ " 0.485 | \n",
+ " 57 | \n",
+ "
\n",
+ " \n",
+ " 296 | \n",
+ " 2 | \n",
+ " 146 | \n",
+ " 70 | \n",
+ " 38 | \n",
+ " 360 | \n",
+ " 28.0 | \n",
+ " 0.337 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 462 | \n",
+ " 8 | \n",
+ " 74 | \n",
+ " 70 | \n",
+ " 40 | \n",
+ " 49 | \n",
+ " 35.3 | \n",
+ " 0.705 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "668 6 98 58 33 190 34.0 \n",
+ "324 2 112 75 32 0 35.7 \n",
+ "624 2 108 64 0 0 30.8 \n",
+ "690 8 107 80 0 0 24.6 \n",
+ "473 7 136 90 0 0 29.9 \n",
+ ".. ... ... ... ... ... ... \n",
+ "355 9 165 88 0 0 30.4 \n",
+ "534 1 77 56 30 56 33.3 \n",
+ "344 8 95 72 0 0 36.8 \n",
+ "296 2 146 70 38 360 28.0 \n",
+ "462 8 74 70 40 49 35.3 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age \n",
+ "668 0.430 43 \n",
+ "324 0.148 21 \n",
+ "624 0.158 21 \n",
+ "690 0.856 34 \n",
+ "473 0.210 50 \n",
+ ".. ... ... \n",
+ "355 0.302 49 \n",
+ "534 1.251 24 \n",
+ "344 0.485 57 \n",
+ "296 0.337 29 \n",
+ "462 0.705 39 \n",
+ "\n",
+ "[154 rows x 8 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Outcome | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 668 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 324 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 624 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 690 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 473 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 355 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 534 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 344 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 296 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 462 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
154 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "668 0\n",
+ "324 0\n",
+ "624 0\n",
+ "690 0\n",
+ "473 0\n",
+ ".. ...\n",
+ "355 1\n",
+ "534 0\n",
+ "344 0\n",
+ "296 1\n",
+ "462 0\n",
+ "\n",
+ "[154 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from typing import Tuple\n",
+ "import pandas as pd\n",
+ "from pandas import DataFrame\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "def split_into_train_test(\n",
+ " df_input: DataFrame,\n",
+ " target_colname: str = \"Outcome\",\n",
+ " frac_train: float = 0.8,\n",
+ " random_state: int = None,\n",
+ ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+ " \n",
+ " if not (0 < frac_train < 1):\n",
+ " raise ValueError(\"Fraction must be between 0 and 1.\")\n",
+ " \n",
+ " # Проверка наличия целевого признака\n",
+ " if target_colname not in df_input.columns:\n",
+ " raise ValueError(f\"{target_colname} is not a column in the DataFrame.\")\n",
+ " \n",
+ " # Разделяем данные на признаки и целевую переменную\n",
+ " X = df_input.drop(columns=[target_colname]) # Признаки\n",
+ " y = df_input[[target_colname]] # Целевая переменная\n",
+ "\n",
+ " # Разделяем данные на обучающую и тестовую выборки\n",
+ " X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y,\n",
+ " test_size=(1.0 - frac_train),\n",
+ " random_state=random_state\n",
+ " )\n",
+ " \n",
+ " return X_train, X_test, y_train, y_test\n",
+ "\n",
+ "# Применение функции для разделения данных\n",
+ "X_train, X_test, y_train, y_test = split_into_train_test(\n",
+ " df, \n",
+ " target_colname=\"Outcome\", \n",
+ " frac_train=0.8, \n",
+ " random_state=42 # Убедитесь, что вы задали нужное значение random_state\n",
+ ")\n",
+ "\n",
+ "# Для отображения результатов\n",
+ "display(\"X_train\", X_train)\n",
+ "display(\"y_train\", y_train)\n",
+ "\n",
+ "display(\"X_test\", X_test)\n",
+ "display(\"y_test\", y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Определение перечня алгоритмов решения задачи аппроксимации (регрессии)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 306,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "from sklearn.preprocessing import PolynomialFeatures\n",
+ "from sklearn import linear_model, tree, neighbors, ensemble, neural_network\n",
+ "\n",
+ "random_state = 9\n",
+ "\n",
+ "models = {\n",
+ " \"linear\": {\"model\": linear_model.LinearRegression(n_jobs=-1)},\n",
+ " \"linear_poly\": {\n",
+ " \"model\": make_pipeline(\n",
+ " PolynomialFeatures(degree=2),\n",
+ " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
+ " )\n",
+ " },\n",
+ " \"linear_interact\": {\n",
+ " \"model\": make_pipeline(\n",
+ " PolynomialFeatures(interaction_only=True),\n",
+ " linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),\n",
+ " )\n",
+ " },\n",
+ " \"ridge\": {\"model\": linear_model.RidgeCV()},\n",
+ " \"decision_tree\": {\n",
+ " \"model\": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)\n",
+ " },\n",
+ " \"knn\": {\"model\": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},\n",
+ " \"random_forest\": {\n",
+ " \"model\": ensemble.RandomForestRegressor(\n",
+ " max_depth=7, random_state=random_state, n_jobs=-1\n",
+ " )\n",
+ " },\n",
+ " \"mlp\": {\n",
+ " \"model\": neural_network.MLPRegressor(\n",
+ " activation=\"tanh\",\n",
+ " hidden_layer_sizes=(3,),\n",
+ " max_iter=500,\n",
+ " early_stopping=True,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " },\n",
+ "}\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 307,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: linear\n",
+ "Model: linear_poly\n",
+ "Model: linear_interact\n",
+ "Model: ridge\n",
+ "Model: decision_tree\n",
+ "Model: knn\n",
+ "Model: random_forest\n",
+ "Model: mlp\n"
+ ]
+ }
+ ],
+ "source": [
+ "import math\n",
+ "from pandas import DataFrame\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "for model_name in models.keys():\n",
+ " print(f\"Model: {model_name}\")\n",
+ "\n",
+ " fitted_model = models[model_name][\"model\"].fit(\n",
+ " X_train.values, y_train.values.ravel()\n",
+ " )\n",
+ " y_train_pred = fitted_model.predict(X_train.values)\n",
+ " y_test_pred = fitted_model.predict(X_test.values)\n",
+ " models[model_name][\"fitted\"] = fitted_model\n",
+ " models[model_name][\"train_preds\"] = y_train_pred\n",
+ " models[model_name][\"preds\"] = y_test_pred\n",
+ " models[model_name][\"RMSE_train\"] = math.sqrt(\n",
+ " metrics.mean_squared_error(y_train, y_train_pred)\n",
+ " )\n",
+ " models[model_name][\"RMSE_test\"] = math.sqrt(\n",
+ " metrics.mean_squared_error(y_test, y_test_pred)\n",
+ " )\n",
+ " models[model_name][\"RMAE_test\"] = math.sqrt(\n",
+ " metrics.mean_absolute_error(y_test, y_test_pred)\n",
+ " )\n",
+ " models[model_name][\"R2_test\"] = metrics.r2_score(y_test, y_test_pred)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Вывод результатов оценки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 308,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " RMSE_train | \n",
+ " RMSE_test | \n",
+ " RMAE_test | \n",
+ " R2_test | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " random_forest | \n",
+ " 0.240052 | \n",
+ " 0.405871 | \n",
+ " 0.559210 | \n",
+ " 0.282505 | \n",
+ "
\n",
+ " \n",
+ " linear | \n",
+ " 0.396793 | \n",
+ " 0.413576 | \n",
+ " 0.590024 | \n",
+ " 0.255003 | \n",
+ "
\n",
+ " \n",
+ " ridge | \n",
+ " 0.396822 | \n",
+ " 0.414236 | \n",
+ " 0.590431 | \n",
+ " 0.252623 | \n",
+ "
\n",
+ " \n",
+ " linear_poly | \n",
+ " 0.370076 | \n",
+ " 0.422852 | \n",
+ " 0.584147 | \n",
+ " 0.221209 | \n",
+ "
\n",
+ " \n",
+ " linear_interact | \n",
+ " 0.380128 | \n",
+ " 0.426815 | \n",
+ " 0.593532 | \n",
+ " 0.206543 | \n",
+ "
\n",
+ " \n",
+ " decision_tree | \n",
+ " 0.249880 | \n",
+ " 0.445708 | \n",
+ " 0.520376 | \n",
+ " 0.134743 | \n",
+ "
\n",
+ " \n",
+ " knn | \n",
+ " 0.373319 | \n",
+ " 0.450285 | \n",
+ " 0.592157 | \n",
+ " 0.116883 | \n",
+ "
\n",
+ " \n",
+ " mlp | \n",
+ " 0.623529 | \n",
+ " 0.544323 | \n",
+ " 0.658689 | \n",
+ " -0.290498 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 308,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "reg_metrics = pd.DataFrame.from_dict(models, \"index\")[\n",
+ " [\"RMSE_train\", \"RMSE_test\", \"RMAE_test\", \"R2_test\"]\n",
+ "]\n",
+ "reg_metrics.sort_values(by=\"RMSE_test\").style.background_gradient(\n",
+ " cmap=\"viridis\", low=1, high=0.3, subset=[\"RMSE_train\", \"RMSE_test\"]\n",
+ ").background_gradient(cmap=\"plasma\", low=0.3, high=1, subset=[\"RMAE_test\", \"R2_test\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "Вывод реального и \"спрогнозированного\" результата для обучающей и тестовой выборок\n",
+ "\n",
+ "Получение лучшей модели\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 309,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'random_forest'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "best_model = str(reg_metrics.sort_values(by=\"RMSE_test\").iloc[0].name)\n",
+ "\n",
+ "display(best_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Подбор гиперпараметров методом поиска по сетке"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 310,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Лучшие параметры: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}\n",
+ "Лучший результат (MSE): 0.15427721639903466\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+ "from sklearn.ensemble import RandomForestRegressor # Используем регрессор\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "\n",
+ "df.dropna(inplace=True) \n",
+ "# Предикторы и целевая переменная\n",
+ "X = df[[\"Glucose\", \"Age\", \"BloodPressure\", \"DiabetesPedigreeFunction\"]]\n",
+ "y = df['Outcome'] # Целевая переменная для регрессии\n",
+ "\n",
+ "\n",
+ "model = RandomForestRegressor() \n",
+ "\n",
+ "param_grid = {\n",
+ " 'n_estimators': [50, 100, 200], \n",
+ " 'max_depth': [None, 10, 20, 30], \n",
+ " 'min_samples_split': [2, 5, 10] \n",
+ "}\n",
+ "\n",
+ "# 3. Подбор гиперпараметров с помощью Grid Search\n",
+ "grid_search = GridSearchCV(estimator=model, param_grid=param_grid,\n",
+ " scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n",
+ "\n",
+ "# Обучение модели на тренировочных данных\n",
+ "grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "# 4. Результаты подбора гиперпараметров\n",
+ "print(\"Лучшие параметры:\", grid_search.best_params_)\n",
+ "print(\"Лучший результат (MSE):\", -grid_search.best_score_) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Обучение модели с новыми гиперпараметрами и сравнение новых и старых данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 319,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n",
+ "d:\\5_semester\\AIM\\rep\\AIM-PIbd-31-Razubaev-S-M\\.venv\\Lib\\site-packages\\sklearn\\base.py:1473: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
+ " return fit_method(estimator, *args, **kwargs)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Старые параметры: {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 50}\n",
+ "Лучший результат (MSE) на старых параметрах: 0.1543002886456971\n",
+ "\n",
+ "Новые параметры: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}\n",
+ "Лучший результат (MSE) на новых параметрах: 0.15791709286040012\n",
+ "Среднеквадратическая ошибка (MSE) на тестовых данных: 0.16712438177283198\n",
+ "Корень среднеквадратичной ошибки (RMSE) на тестовых данных: 0.408808490338486\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.model_selection import train_test_split, GridSearchCV\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "\n",
+ "old_param_grid = {\n",
+ " 'n_estimators': [50, 100, 200], # Количество деревьев\n",
+ " 'max_depth': [None, 10, 20, 30], # Максимальная глубина дерева\n",
+ " 'min_samples_split': [2, 5, 10] # Минимальное количество образцов для разбиения узла\n",
+ "}\n",
+ "\n",
+ "old_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
+ " param_grid=old_param_grid,\n",
+ " scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=2)\n",
+ "\n",
+ "old_grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "old_best_params = old_grid_search.best_params_\n",
+ "old_best_mse = -old_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
+ "\n",
+ "new_param_grid = {\n",
+ " 'n_estimators': [200],\n",
+ " 'max_depth': [20],\n",
+ " 'min_samples_split': [10]\n",
+ "}\n",
+ "\n",
+ "new_grid_search = GridSearchCV(estimator=RandomForestRegressor(), \n",
+ " param_grid=new_param_grid,\n",
+ " scoring='neg_mean_squared_error', cv=2)\n",
+ "\n",
+ "new_grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "new_best_params = new_grid_search.best_params_\n",
+ "new_best_mse = -new_grid_search.best_score_ # Меняем знак, так как берем отрицательное значение MSE\n",
+ "\n",
+ "model_best = RandomForestRegressor(**new_best_params)\n",
+ "model_best.fit(X_train, y_train)\n",
+ "\n",
+ "model_oldbest = RandomForestRegressor(**old_best_params)\n",
+ "model_oldbest.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = model_best.predict(X_test)\n",
+ "y_oldpred = model_oldbest.predict(X_test)\n",
+ "\n",
+ "mse = metrics.mean_squared_error(y_test, y_pred)\n",
+ "rmse = np.sqrt(mse)\n",
+ "\n",
+ "print(\"Старые параметры:\", old_best_params)\n",
+ "print(\"Лучший результат (MSE) на старых параметрах:\", old_best_mse)\n",
+ "print(\"\\nНовые параметры:\", new_best_params)\n",
+ "print(\"Лучший результат (MSE) на новых параметрах:\", new_best_mse)\n",
+ "print(\"Среднеквадратическая ошибка (MSE) на тестовых данных:\", mse)\n",
+ "print(\"Корень среднеквадратичной ошибки (RMSE) на тестовых данных:\", rmse)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Визуализация"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 329,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "