diff --git a/lab_4/lab4.ipynb b/lab_4/lab4.ipynb
new file mode 100644
index 0000000..a222bf1
--- /dev/null
+++ b/lab_4/lab4.ipynb
@@ -0,0 +1,2893 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Начало лабораторной работы"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "*Вариант 3:* Диабет у индейцев Пима "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',\n",
+ " 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 6 \n",
+ " 148 \n",
+ " 72 \n",
+ " 35 \n",
+ " 0 \n",
+ " 33.6 \n",
+ " 0.627 \n",
+ " 50 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " 85 \n",
+ " 66 \n",
+ " 29 \n",
+ " 0 \n",
+ " 26.6 \n",
+ " 0.351 \n",
+ " 31 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 8 \n",
+ " 183 \n",
+ " 64 \n",
+ " 0 \n",
+ " 0 \n",
+ " 23.3 \n",
+ " 0.672 \n",
+ " 32 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1 \n",
+ " 89 \n",
+ " 66 \n",
+ " 23 \n",
+ " 94 \n",
+ " 28.1 \n",
+ " 0.167 \n",
+ " 21 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 0 \n",
+ " 137 \n",
+ " 40 \n",
+ " 35 \n",
+ " 168 \n",
+ " 43.1 \n",
+ " 2.288 \n",
+ " 33 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 763 \n",
+ " 10 \n",
+ " 101 \n",
+ " 76 \n",
+ " 48 \n",
+ " 180 \n",
+ " 32.9 \n",
+ " 0.171 \n",
+ " 63 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 764 \n",
+ " 2 \n",
+ " 122 \n",
+ " 70 \n",
+ " 27 \n",
+ " 0 \n",
+ " 36.8 \n",
+ " 0.340 \n",
+ " 27 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 765 \n",
+ " 5 \n",
+ " 121 \n",
+ " 72 \n",
+ " 23 \n",
+ " 112 \n",
+ " 26.2 \n",
+ " 0.245 \n",
+ " 30 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 766 \n",
+ " 1 \n",
+ " 126 \n",
+ " 60 \n",
+ " 0 \n",
+ " 0 \n",
+ " 30.1 \n",
+ " 0.349 \n",
+ " 47 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 767 \n",
+ " 1 \n",
+ " 93 \n",
+ " 70 \n",
+ " 31 \n",
+ " 0 \n",
+ " 30.4 \n",
+ " 0.315 \n",
+ " 23 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
768 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "0 6 148 72 35 0 33.6 \n",
+ "1 1 85 66 29 0 26.6 \n",
+ "2 8 183 64 0 0 23.3 \n",
+ "3 1 89 66 23 94 28.1 \n",
+ "4 0 137 40 35 168 43.1 \n",
+ ".. ... ... ... ... ... ... \n",
+ "763 10 101 76 48 180 32.9 \n",
+ "764 2 122 70 27 0 36.8 \n",
+ "765 5 121 72 23 112 26.2 \n",
+ "766 1 126 60 0 0 30.1 \n",
+ "767 1 93 70 31 0 30.4 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "0 0.627 50 1 \n",
+ "1 0.351 31 0 \n",
+ "2 0.672 32 1 \n",
+ "3 0.167 21 0 \n",
+ "4 2.288 33 1 \n",
+ ".. ... ... ... \n",
+ "763 0.171 63 0 \n",
+ "764 0.340 27 0 \n",
+ "765 0.245 30 0 \n",
+ "766 0.349 47 1 \n",
+ "767 0.315 23 0 \n",
+ "\n",
+ "[768 rows x 9 columns]"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "# Установим параметры для вывода\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "\n",
+ "random_state = 42\n",
+ "\n",
+ "# Подключим датафрейм и выгрузим данные\n",
+ "df = pd.read_csv(\"C:/Users/TIGR228/Desktop/МИИ/Lab1/AIM-PIbd-31-Afanasev-S-S/static/csv/diabetes.csv\")\n",
+ "print(df.columns)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Бизнес-цели:\n",
+ "\n",
+ "1. Прогнозирование риска развития диабета\n",
+ "\n",
+ "Описание: Классифицировать пациентов на основе их медицинских данных для определения риска развития диабета (используя целевой признак \"Outcome\"). Эта задача актуальна для раннего выявления диабета и разработки профилактических мер, направленных на улучшение здоровья населения.\n",
+ "\n",
+ "2. Оценка факторов, влияющих на развитие диабета\n",
+ "\n",
+ "Описание: Предсказать вероятность развития диабета у новых пациентов на основе их медицинских характеристик (таких как уровень глюкозы, артериальное давление, индекс массы тела и другие параметры). Это позволит медицинским специалистам планировать лечение и наблюдение в зависимости от индивидуальных рисков пациентов. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Определение достижимого уровня качества модели для первой задачи "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи классификации\n",
+ "Целевой признак -- Outcome"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'X_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 353 \n",
+ " 1 \n",
+ " 90 \n",
+ " 62 \n",
+ " 12 \n",
+ " 43 \n",
+ " 27.2 \n",
+ " 0.580 \n",
+ " 24 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 711 \n",
+ " 5 \n",
+ " 126 \n",
+ " 78 \n",
+ " 27 \n",
+ " 22 \n",
+ " 29.6 \n",
+ " 0.439 \n",
+ " 40 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 373 \n",
+ " 2 \n",
+ " 105 \n",
+ " 58 \n",
+ " 40 \n",
+ " 94 \n",
+ " 34.9 \n",
+ " 0.225 \n",
+ " 25 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 46 \n",
+ " 1 \n",
+ " 146 \n",
+ " 56 \n",
+ " 0 \n",
+ " 0 \n",
+ " 29.7 \n",
+ " 0.564 \n",
+ " 29 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 682 \n",
+ " 0 \n",
+ " 95 \n",
+ " 64 \n",
+ " 39 \n",
+ " 105 \n",
+ " 44.6 \n",
+ " 0.366 \n",
+ " 22 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 451 \n",
+ " 2 \n",
+ " 134 \n",
+ " 70 \n",
+ " 0 \n",
+ " 0 \n",
+ " 28.9 \n",
+ " 0.542 \n",
+ " 23 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 113 \n",
+ " 4 \n",
+ " 76 \n",
+ " 62 \n",
+ " 0 \n",
+ " 0 \n",
+ " 34.0 \n",
+ " 0.391 \n",
+ " 25 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 556 \n",
+ " 1 \n",
+ " 97 \n",
+ " 70 \n",
+ " 40 \n",
+ " 0 \n",
+ " 38.1 \n",
+ " 0.218 \n",
+ " 30 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 667 \n",
+ " 10 \n",
+ " 111 \n",
+ " 70 \n",
+ " 27 \n",
+ " 0 \n",
+ " 27.5 \n",
+ " 0.141 \n",
+ " 40 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 107 \n",
+ " 4 \n",
+ " 144 \n",
+ " 58 \n",
+ " 28 \n",
+ " 140 \n",
+ " 29.5 \n",
+ " 0.287 \n",
+ " 37 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
614 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "353 1 90 62 12 43 27.2 \n",
+ "711 5 126 78 27 22 29.6 \n",
+ "373 2 105 58 40 94 34.9 \n",
+ "46 1 146 56 0 0 29.7 \n",
+ "682 0 95 64 39 105 44.6 \n",
+ ".. ... ... ... ... ... ... \n",
+ "451 2 134 70 0 0 28.9 \n",
+ "113 4 76 62 0 0 34.0 \n",
+ "556 1 97 70 40 0 38.1 \n",
+ "667 10 111 70 27 0 27.5 \n",
+ "107 4 144 58 28 140 29.5 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "353 0.580 24 0 \n",
+ "711 0.439 40 0 \n",
+ "373 0.225 25 0 \n",
+ "46 0.564 29 0 \n",
+ "682 0.366 22 0 \n",
+ ".. ... ... ... \n",
+ "451 0.542 23 1 \n",
+ "113 0.391 25 0 \n",
+ "556 0.218 30 0 \n",
+ "667 0.141 40 1 \n",
+ "107 0.287 37 0 \n",
+ "\n",
+ "[614 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_train'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 353 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 711 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 373 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 46 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 682 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 451 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 113 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 556 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 667 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 107 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
614 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "353 0\n",
+ "711 0\n",
+ "373 0\n",
+ "46 0\n",
+ "682 0\n",
+ ".. ...\n",
+ "451 1\n",
+ "113 0\n",
+ "556 0\n",
+ "667 1\n",
+ "107 0\n",
+ "\n",
+ "[614 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'X_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 44 \n",
+ " 7 \n",
+ " 159 \n",
+ " 64 \n",
+ " 0 \n",
+ " 0 \n",
+ " 27.4 \n",
+ " 0.294 \n",
+ " 40 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 672 \n",
+ " 10 \n",
+ " 68 \n",
+ " 106 \n",
+ " 23 \n",
+ " 49 \n",
+ " 35.5 \n",
+ " 0.285 \n",
+ " 47 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 700 \n",
+ " 2 \n",
+ " 122 \n",
+ " 76 \n",
+ " 27 \n",
+ " 200 \n",
+ " 35.9 \n",
+ " 0.483 \n",
+ " 26 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 630 \n",
+ " 7 \n",
+ " 114 \n",
+ " 64 \n",
+ " 0 \n",
+ " 0 \n",
+ " 27.4 \n",
+ " 0.732 \n",
+ " 34 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 81 \n",
+ " 2 \n",
+ " 74 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0.0 \n",
+ " 0.102 \n",
+ " 22 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 32 \n",
+ " 3 \n",
+ " 88 \n",
+ " 58 \n",
+ " 11 \n",
+ " 54 \n",
+ " 24.8 \n",
+ " 0.267 \n",
+ " 22 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 637 \n",
+ " 2 \n",
+ " 94 \n",
+ " 76 \n",
+ " 18 \n",
+ " 66 \n",
+ " 31.6 \n",
+ " 0.649 \n",
+ " 23 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 593 \n",
+ " 2 \n",
+ " 82 \n",
+ " 52 \n",
+ " 22 \n",
+ " 115 \n",
+ " 28.5 \n",
+ " 1.699 \n",
+ " 25 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 425 \n",
+ " 4 \n",
+ " 184 \n",
+ " 78 \n",
+ " 39 \n",
+ " 277 \n",
+ " 37.0 \n",
+ " 0.264 \n",
+ " 31 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 273 \n",
+ " 1 \n",
+ " 71 \n",
+ " 78 \n",
+ " 50 \n",
+ " 45 \n",
+ " 33.2 \n",
+ " 0.422 \n",
+ " 21 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
154 rows × 9 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "44 7 159 64 0 0 27.4 \n",
+ "672 10 68 106 23 49 35.5 \n",
+ "700 2 122 76 27 200 35.9 \n",
+ "630 7 114 64 0 0 27.4 \n",
+ "81 2 74 0 0 0 0.0 \n",
+ ".. ... ... ... ... ... ... \n",
+ "32 3 88 58 11 54 24.8 \n",
+ "637 2 94 76 18 66 31.6 \n",
+ "593 2 82 52 22 115 28.5 \n",
+ "425 4 184 78 39 277 37.0 \n",
+ "273 1 71 78 50 45 33.2 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "44 0.294 40 0 \n",
+ "672 0.285 47 0 \n",
+ "700 0.483 26 0 \n",
+ "630 0.732 34 1 \n",
+ "81 0.102 22 0 \n",
+ ".. ... ... ... \n",
+ "32 0.267 22 0 \n",
+ "637 0.649 23 0 \n",
+ "593 1.699 25 0 \n",
+ "425 0.264 31 1 \n",
+ "273 0.422 21 0 \n",
+ "\n",
+ "[154 rows x 9 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'y_test'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 44 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 672 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 700 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 630 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 81 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 32 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 637 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 593 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " 425 \n",
+ " 1 \n",
+ " \n",
+ " \n",
+ " 273 \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
154 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Outcome\n",
+ "44 0\n",
+ "672 0\n",
+ "700 0\n",
+ "630 1\n",
+ "81 0\n",
+ ".. ...\n",
+ "32 0\n",
+ "637 0\n",
+ "593 0\n",
+ "425 1\n",
+ "273 0\n",
+ "\n",
+ "[154 rows x 1 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from typing import Tuple\n",
+ "import pandas as pd\n",
+ "from pandas import DataFrame\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Устанавливаем случайное состояние\n",
+ "random_state = 42\n",
+ "\n",
+ "def split_stratified_into_train_val_test(\n",
+ " df_input,\n",
+ " stratify_colname=\"y\",\n",
+ " frac_train=0.6,\n",
+ " frac_val=0.15,\n",
+ " frac_test=0.25,\n",
+ " random_state=None,\n",
+ ") -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:\n",
+ " \n",
+ " if frac_train + frac_val + frac_test != 1.0:\n",
+ " raise ValueError(\n",
+ " \"fractions %f, %f, %f do not add up to 1.0\"\n",
+ " % (frac_train, frac_val, frac_test)\n",
+ " )\n",
+ " if stratify_colname not in df_input.columns:\n",
+ " raise ValueError(\"%s is not a column in the dataframe\" % (stratify_colname))\n",
+ " X = df_input # Contains all columns.\n",
+ " y = df_input[\n",
+ " [stratify_colname]\n",
+ " ] # Dataframe of just the column on which to stratify.\n",
+ " # Split original dataframe into train and temp dataframes.\n",
+ " df_train, df_temp, y_train, y_temp = train_test_split(\n",
+ " X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state\n",
+ " )\n",
+ " if frac_val <= 0:\n",
+ " assert len(df_input) == len(df_train) + len(df_temp)\n",
+ " return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp\n",
+ " # Split the temp dataframe into val and test dataframes.\n",
+ " relative_frac_test = frac_test / (frac_val + frac_test)\n",
+ " df_val, df_test, y_val, y_test = train_test_split(\n",
+ " df_temp,\n",
+ " y_temp,\n",
+ " stratify=y_temp,\n",
+ " test_size=relative_frac_test,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " assert len(df_input) == len(df_train) + len(df_val) + len(df_test)\n",
+ " return df_train, df_val, df_test, y_train, y_val, y_test\n",
+ "\n",
+ "X_train, X_val, X_test, y_train, y_val, y_test = split_stratified_into_train_val_test(\n",
+ " df, stratify_colname=\"Outcome\", frac_train=0.80, frac_val=0, frac_test=0.20, random_state=random_state\n",
+ ")\n",
+ "\n",
+ "display(\"X_train\", X_train)\n",
+ "display(\"y_train\", y_train)\n",
+ "\n",
+ "display(\"X_test\", X_test)\n",
+ "display(\"y_test\", y_test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование конвейера для классификации данных\n",
+ "preprocessing_num -- конвейер для обработки числовых данных: заполнение пропущенных значений и стандартизация\n",
+ "\n",
+ "preprocessing_cat -- конвейер для обработки категориальных данных: заполнение пропущенных данных и унитарное кодирование\n",
+ "\n",
+ "features_preprocessing -- трансформер для предобработки признаков\n",
+ "\n",
+ "features_engineering -- трансформер для конструирования признаков\n",
+ "\n",
+ "drop_columns -- трансформер для удаления колонок\n",
+ "\n",
+ "pipeline_end -- основной конвейер предобработки данных и конструирования признаков"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.base import BaseEstimator, TransformerMixin\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.impute import SimpleImputer\n",
+ "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "# Построение конвейеров предобработки\n",
+ "\n",
+ "class DiabetesFeatures(BaseEstimator, TransformerMixin):\n",
+ " def __init__(self):\n",
+ " pass\n",
+ "\n",
+ " def fit(self, X, y=None):\n",
+ " return self\n",
+ "\n",
+ " def transform(self, X, y=None):\n",
+ " # Создание новых признаков\n",
+ " X = X.copy()\n",
+ " X[\"BMI_to_Age_ratio\"] = X[\"BMI\"] / X[\"Age\"]\n",
+ " return X\n",
+ "\n",
+ " def get_feature_names_out(self, features_in):\n",
+ " # Добавление имен новых признаков\n",
+ " new_features = [\"BMI_to_Age_ratio\"]\n",
+ " return np.append(features_in, new_features, axis=0)\n",
+ "\n",
+ "# Обработка числовых данных. Числовой конвейр: заполнение пропущенных значений медианой и стандартизация\n",
+ "preprocessing_num_class = Pipeline(steps=[\n",
+ " ('imputer', SimpleImputer(strategy='median')),\n",
+ " ('scaler', StandardScaler())\n",
+ "])\n",
+ "\n",
+ "preprocessing_cat_class = Pipeline(steps=[\n",
+ " ('imputer', SimpleImputer(strategy='most_frequent')),\n",
+ " ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))\n",
+ "])\n",
+ "\n",
+ "columns_to_drop = []\n",
+ "numeric_columns = [\"Pregnancies\", \"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\",\n",
+ " \"BMI\", \"DiabetesPedigreeFunction\", \"Age\"]\n",
+ "cat_columns = [\"Outcome\"]\n",
+ "\n",
+ "features_preprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"preprocessing_num\", preprocessing_num_class, numeric_columns),\n",
+ " (\"preprocessing_cat\", preprocessing_cat_class, cat_columns),\n",
+ " ],\n",
+ " remainder=\"passthrough\"\n",
+ ")\n",
+ "\n",
+ "drop_columns = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " (\"drop_columns\", \"drop\", columns_to_drop),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "features_postprocessing = ColumnTransformer(\n",
+ " verbose_feature_names_out=False,\n",
+ " transformers=[\n",
+ " ('preprocessing_cat', preprocessing_cat_class, [\"Outcome\"]),\n",
+ " ],\n",
+ " remainder=\"passthrough\",\n",
+ ")\n",
+ "\n",
+ "pipeline_end = Pipeline(\n",
+ " [\n",
+ " (\"features_preprocessing\", features_preprocessing),\n",
+ " (\"custom_features\", DiabetesFeatures()),\n",
+ " (\"drop_columns\", drop_columns),\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Демонстрация работы конвейера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome_1 \n",
+ " BMI_to_Age_ratio \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 353 \n",
+ " -0.851355 \n",
+ " -0.980131 \n",
+ " -0.404784 \n",
+ " -0.553973 \n",
+ " -0.331319 \n",
+ " -0.607678 \n",
+ " 0.310794 \n",
+ " -0.792169 \n",
+ " 0.0 \n",
+ " 0.767107 \n",
+ " \n",
+ " \n",
+ " 711 \n",
+ " 0.356576 \n",
+ " 0.161444 \n",
+ " 0.465368 \n",
+ " 0.392787 \n",
+ " -0.526398 \n",
+ " -0.302139 \n",
+ " -0.116439 \n",
+ " 0.561034 \n",
+ " 0.0 \n",
+ " -0.538540 \n",
+ " \n",
+ " \n",
+ " 373 \n",
+ " -0.549372 \n",
+ " -0.504474 \n",
+ " -0.622322 \n",
+ " 1.213312 \n",
+ " 0.142444 \n",
+ " 0.372594 \n",
+ " -0.764862 \n",
+ " -0.707594 \n",
+ " 0.0 \n",
+ " -0.526564 \n",
+ " \n",
+ " \n",
+ " 46 \n",
+ " -0.851355 \n",
+ " 0.795653 \n",
+ " -0.731091 \n",
+ " -1.311380 \n",
+ " -0.730766 \n",
+ " -0.289408 \n",
+ " 0.262314 \n",
+ " -0.369293 \n",
+ " 0.0 \n",
+ " 0.783681 \n",
+ " \n",
+ " \n",
+ " 682 \n",
+ " -1.153338 \n",
+ " -0.821579 \n",
+ " -0.296015 \n",
+ " 1.150195 \n",
+ " 0.244628 \n",
+ " 1.607482 \n",
+ " -0.337630 \n",
+ " -0.961320 \n",
+ " 0.0 \n",
+ " -1.672162 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 451 \n",
+ " -0.549372 \n",
+ " 0.415128 \n",
+ " 0.030292 \n",
+ " -1.311380 \n",
+ " -0.730766 \n",
+ " -0.391255 \n",
+ " 0.195653 \n",
+ " -0.876744 \n",
+ " 1.0 \n",
+ " 0.446259 \n",
+ " \n",
+ " \n",
+ " 113 \n",
+ " 0.054593 \n",
+ " -1.424076 \n",
+ " -0.404784 \n",
+ " -1.311380 \n",
+ " -0.730766 \n",
+ " 0.258017 \n",
+ " -0.261879 \n",
+ " -0.707594 \n",
+ " 0.0 \n",
+ " -0.364639 \n",
+ " \n",
+ " \n",
+ " 556 \n",
+ " -0.851355 \n",
+ " -0.758158 \n",
+ " 0.030292 \n",
+ " 1.213312 \n",
+ " -0.730766 \n",
+ " 0.779980 \n",
+ " -0.786072 \n",
+ " -0.284718 \n",
+ " 0.0 \n",
+ " -2.739481 \n",
+ " \n",
+ " \n",
+ " 667 \n",
+ " 1.866489 \n",
+ " -0.314212 \n",
+ " 0.030292 \n",
+ " 0.392787 \n",
+ " -0.730766 \n",
+ " -0.569486 \n",
+ " -1.019383 \n",
+ " 0.561034 \n",
+ " 1.0 \n",
+ " -1.015065 \n",
+ " \n",
+ " \n",
+ " 107 \n",
+ " 0.054593 \n",
+ " 0.732232 \n",
+ " -0.622322 \n",
+ " 0.455904 \n",
+ " 0.569759 \n",
+ " -0.314870 \n",
+ " -0.577001 \n",
+ " 0.307308 \n",
+ " 0.0 \n",
+ " -1.024606 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
614 rows × 10 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "353 -0.851355 -0.980131 -0.404784 -0.553973 -0.331319 -0.607678 \n",
+ "711 0.356576 0.161444 0.465368 0.392787 -0.526398 -0.302139 \n",
+ "373 -0.549372 -0.504474 -0.622322 1.213312 0.142444 0.372594 \n",
+ "46 -0.851355 0.795653 -0.731091 -1.311380 -0.730766 -0.289408 \n",
+ "682 -1.153338 -0.821579 -0.296015 1.150195 0.244628 1.607482 \n",
+ ".. ... ... ... ... ... ... \n",
+ "451 -0.549372 0.415128 0.030292 -1.311380 -0.730766 -0.391255 \n",
+ "113 0.054593 -1.424076 -0.404784 -1.311380 -0.730766 0.258017 \n",
+ "556 -0.851355 -0.758158 0.030292 1.213312 -0.730766 0.779980 \n",
+ "667 1.866489 -0.314212 0.030292 0.392787 -0.730766 -0.569486 \n",
+ "107 0.054593 0.732232 -0.622322 0.455904 0.569759 -0.314870 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome_1 BMI_to_Age_ratio \n",
+ "353 0.310794 -0.792169 0.0 0.767107 \n",
+ "711 -0.116439 0.561034 0.0 -0.538540 \n",
+ "373 -0.764862 -0.707594 0.0 -0.526564 \n",
+ "46 0.262314 -0.369293 0.0 0.783681 \n",
+ "682 -0.337630 -0.961320 0.0 -1.672162 \n",
+ ".. ... ... ... ... \n",
+ "451 0.195653 -0.876744 1.0 0.446259 \n",
+ "113 -0.261879 -0.707594 0.0 -0.364639 \n",
+ "556 -0.786072 -0.284718 0.0 -2.739481 \n",
+ "667 -1.019383 0.561034 1.0 -1.015065 \n",
+ "107 -0.577001 0.307308 0.0 -1.024606 \n",
+ "\n",
+ "[614 rows x 10 columns]"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.fit_transform(X_train)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "preprocessed_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Формирование набора моделей для классификации\n",
+ " logistic -- логистическая регрессия\n",
+ "\n",
+ "ridge -- гребневая регрессия\n",
+ "\n",
+ "decision_tree -- дерево решений\n",
+ "\n",
+ "knn -- k-ближайших соседей\n",
+ "\n",
+ "naive_bayes -- наивный Байесовский классификатор\n",
+ "\n",
+ "gradient_boosting -- метод градиентного бустинга (набор деревьев решений)\n",
+ "\n",
+ "random_forest -- метод случайного леса (набор деревьев решений)\n",
+ "\n",
+ "mlp -- многослойный персептрон (нейронная сеть)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn import ensemble, linear_model, naive_bayes, neighbors, neural_network, tree\n",
+ "\n",
+ "# Определите random_state для воспроизводимости результатов\n",
+ "random_state = 42\n",
+ "\n",
+ "# Определите модели машинного обучения для классификации\n",
+ "class_models = {\n",
+ " \"logistic\": {\"model\": linear_model.LogisticRegression(random_state=random_state)},\n",
+ " \"ridge\": {\"model\": linear_model.LogisticRegression(penalty=\"l2\", class_weight=\"balanced\", random_state=random_state)},\n",
+ " \"decision_tree\": {\n",
+ " \"model\": tree.DecisionTreeClassifier(max_depth=7, random_state=random_state)\n",
+ " },\n",
+ " \"knn\": {\"model\": neighbors.KNeighborsClassifier(n_neighbors=7)},\n",
+ " \"naive_bayes\": {\"model\": naive_bayes.GaussianNB()},\n",
+ " \"gradient_boosting\": {\n",
+ " \"model\": ensemble.GradientBoostingClassifier(n_estimators=210, random_state=random_state)\n",
+ " },\n",
+ " \"random_forest\": {\n",
+ " \"model\": ensemble.RandomForestClassifier(\n",
+ " max_depth=11, class_weight=\"balanced\", random_state=random_state\n",
+ " )\n",
+ " },\n",
+ " \"mlp\": {\n",
+ " \"model\": neural_network.MLPClassifier(\n",
+ " hidden_layer_sizes=(7,),\n",
+ " max_iter=500,\n",
+ " early_stopping=True,\n",
+ " random_state=random_state,\n",
+ " )\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Обучение моделей на обучающем наборе данных и оценка на тестовом"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: logistic\n",
+ "Model: ridge\n",
+ "Model: decision_tree\n",
+ "Model: knn\n",
+ "Model: naive_bayes\n",
+ "Model: gradient_boosting\n",
+ "Model: random_forest\n",
+ "Model: mlp\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "\n",
+ "for model_name in class_models.keys():\n",
+ " print(f\"Model: {model_name}\")\n",
+ " model = class_models[model_name][\"model\"]\n",
+ "\n",
+ " model_pipeline = Pipeline([(\"pipeline\", pipeline_end), (\"model\", model)])\n",
+ " model_pipeline = model_pipeline.fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ " y_train_predict = model_pipeline.predict(X_train)\n",
+ " y_test_probs = model_pipeline.predict_proba(X_test)[:, 1]\n",
+ " y_test_predict = np.where(y_test_probs > 0.5, 1, 0)\n",
+ "\n",
+ " class_models[model_name][\"pipeline\"] = model_pipeline\n",
+ " class_models[model_name][\"probs\"] = y_test_probs\n",
+ " class_models[model_name][\"preds\"] = y_test_predict\n",
+ "\n",
+ " class_models[model_name][\"Precision_train\"] = metrics.precision_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Precision_test\"] = metrics.precision_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_train\"] = metrics.recall_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Recall_test\"] = metrics.recall_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_train\"] = metrics.accuracy_score(\n",
+ " y_train, y_train_predict\n",
+ " )\n",
+ " class_models[model_name][\"Accuracy_test\"] = metrics.accuracy_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"ROC_AUC_test\"] = metrics.roc_auc_score(\n",
+ " y_test, y_test_probs\n",
+ " )\n",
+ " class_models[model_name][\"F1_train\"] = metrics.f1_score(y_train, y_train_predict)\n",
+ " class_models[model_name][\"F1_test\"] = metrics.f1_score(y_test, y_test_predict)\n",
+ " class_models[model_name][\"MCC_test\"] = metrics.matthews_corrcoef(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(\n",
+ " y_test, y_test_predict\n",
+ " )\n",
+ " class_models[model_name][\"Confusion_matrix\"] = metrics.confusion_matrix(\n",
+ " y_test, y_test_predict\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Сводная таблица оценок качества для использованных моделей классификации"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sklearn.metrics import ConfusionMatrixDisplay\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Определите количество строк и столбцов для subplots\n",
+ "n_rows = int(len(class_models) / 2)\n",
+ "n_cols = 2\n",
+ "\n",
+ "fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 10), sharex=False, sharey=False)\n",
+ "\n",
+ "for index, key in enumerate(class_models.keys()):\n",
+ " c_matrix = class_models[key][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"No Diabetes\", \"Diabetes\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ " disp.ax_.set_title(key)\n",
+ "\n",
+ "# Настройте расположение subplots\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.1)\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "100 - количество истинных положительных диагнозов (True Positives), где модель правильно определила объекты как \"No Diabetes\".\n",
+ "\n",
+ "54 в некоторых моделях - количество ложных отрицательных диагнозов (False Negatives), где модель неправильно определила объекты, которые на самом деле принадлежат к классу \"No Diabetes\", но были отнесены к классу \"Diabetes\". \n",
+ "\n",
+ "Исходя из значений True Positives и False Negatives, можно сказать, что модель имеет высокую точность при предсказании класса \"No Diabetes\". В принципе, уровень ложных отрицательных результатов в некоторых моделях (54) говорит нам о том, что существует некотрое небольшое количество примеров, которые модель пропускает.\n",
+ "\n",
+ "Точность, полнота, верность (аккуратность), F-мера"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " Precision_train \n",
+ " Precision_test \n",
+ " Recall_train \n",
+ " Recall_test \n",
+ " Accuracy_train \n",
+ " Accuracy_test \n",
+ " F1_train \n",
+ " F1_test \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " logistic \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " ridge \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " decision_tree \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " naive_bayes \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " random_forest \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " gradient_boosting \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " knn \n",
+ " 0.918367 \n",
+ " 0.777778 \n",
+ " 0.841121 \n",
+ " 0.777778 \n",
+ " 0.918567 \n",
+ " 0.844156 \n",
+ " 0.878049 \n",
+ " 0.777778 \n",
+ " \n",
+ " \n",
+ " mlp \n",
+ " 0.254237 \n",
+ " 0.238095 \n",
+ " 0.070093 \n",
+ " 0.092593 \n",
+ " 0.604235 \n",
+ " 0.577922 \n",
+ " 0.109890 \n",
+ " 0.133333 \n",
+ " \n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(\n",
+ " by=\"Accuracy_test\", ascending=False\n",
+ ").style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Все модели в данной выборке, а именно логистическая регрессия, ридж-регрессия, дерево решений, KNN, наивный байесовский классификатор, градиентный бустинг, случайный лес и многослойный перцептрон (MLP) демонстрируют неплохие значения по всем метрикам на обучающих и тестовых наборах данных.\n",
+ "\n",
+ "Модели MLP не так эффективна по сравнению с другими, но в некоторых метриках показывают высокие результаты. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " Accuracy_test \n",
+ " F1_test \n",
+ " ROC_AUC_test \n",
+ " Cohen_kappa_test \n",
+ " MCC_test \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " logistic \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " ridge \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " decision_tree \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " naive_bayes \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " random_forest \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " gradient_boosting \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " knn \n",
+ " 0.844156 \n",
+ " 0.777778 \n",
+ " 0.908056 \n",
+ " 0.657778 \n",
+ " 0.657778 \n",
+ " \n",
+ " \n",
+ " mlp \n",
+ " 0.577922 \n",
+ " 0.133333 \n",
+ " 0.488148 \n",
+ " -0.078431 \n",
+ " -0.093728 \n",
+ " \n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "class_metrics = pd.DataFrame.from_dict(class_models, \"index\")[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "]\n",
+ "class_metrics.sort_values(by=\"ROC_AUC_test\", ascending=False).style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Схожий вывод можно сделать и для следующих метрик: Accuracy, F1, ROC AUC, Cohen's Kappa и MCC. Все модели, кроме KNN и MLP, указывают на хорошо-развитую способность к выделению классов"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'logistic'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "best_model = str(class_metrics.sort_values(by=\"MCC_test\", ascending=False).iloc[0].name)\n",
+ "\n",
+ "display(best_model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Вывод данных с ошибкой предсказания для оценки"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Error items count: 0'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Predicted \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [Pregnancies, Predicted, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "preprocessing_result = pipeline_end.transform(X_test)\n",
+ "preprocessed_df = pd.DataFrame(\n",
+ " preprocessing_result,\n",
+ " columns=pipeline_end.get_feature_names_out(),\n",
+ ")\n",
+ "\n",
+ "y_pred = class_models[best_model][\"preds\"]\n",
+ "\n",
+ "error_index = y_test[y_test[\"Outcome\"] != y_pred].index.tolist()\n",
+ "display(f\"Error items count: {len(error_index)}\")\n",
+ "\n",
+ "error_predicted = pd.Series(y_pred, index=y_test.index).loc[error_index]\n",
+ "error_df = X_test.loc[error_index].copy()\n",
+ "error_df.insert(loc=1, column=\"Predicted\", value=error_predicted)\n",
+ "error_df.sort_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Пример использования обученной модели (конвейера) для предсказания"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 163 \n",
+ " 2.0 \n",
+ " 100.0 \n",
+ " 64.0 \n",
+ " 23.0 \n",
+ " 0.0 \n",
+ " 29.7 \n",
+ " 0.368 \n",
+ " 21.0 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "163 2.0 100.0 64.0 23.0 0.0 29.7 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome \n",
+ "163 0.368 21.0 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " Outcome_1 \n",
+ " BMI_to_Age_ratio \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 163 \n",
+ " -0.549372 \n",
+ " -0.663027 \n",
+ " -0.296015 \n",
+ " 0.140318 \n",
+ " -0.730766 \n",
+ " -0.289408 \n",
+ " -0.33157 \n",
+ " -1.045895 \n",
+ " 0.0 \n",
+ " 0.276709 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "163 -0.549372 -0.663027 -0.296015 0.140318 -0.730766 -0.289408 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age Outcome_1 BMI_to_Age_ratio \n",
+ "163 -0.33157 -1.045895 0.0 0.276709 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'predicted: 0 (proba: [0.98965692 0.01034308])'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'real: 0'"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = class_models[best_model][\"pipeline\"]\n",
+ "\n",
+ "example_id = 163\n",
+ "test = pd.DataFrame(X_test.loc[example_id, :]).T\n",
+ "test_preprocessed = pd.DataFrame(preprocessed_df.loc[example_id, :]).T\n",
+ "display(test)\n",
+ "display(test_preprocessed)\n",
+ "result_proba = model.predict_proba(test)[0]\n",
+ "result = model.predict(test)[0]\n",
+ "real = int(y_test.loc[example_id].values[0])\n",
+ "display(f\"predicted: {result} (proba: {result_proba})\")\n",
+ "display(f\"real: {real}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Подбор гиперпараметров методом поиска по сетке "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\TIGR228\\Desktop\\МИИ\\Lab1\\AIM-PIbd-31-Afanasev-S-S\\aimenv\\Lib\\site-packages\\numpy\\ma\\core.py:2881: RuntimeWarning: invalid value encountered in cast\n",
+ " _data = np.array(data, dtype=dtype, copy=copy,\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'model__criterion': 'gini',\n",
+ " 'model__max_depth': 5,\n",
+ " 'model__max_features': 'sqrt',\n",
+ " 'model__n_estimators': 10}"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "optimized_model_type = \"random_forest\"\n",
+ "\n",
+ "random_forest_model = class_models[optimized_model_type][\"pipeline\"]\n",
+ "\n",
+ "param_grid = {\n",
+ " \"model__n_estimators\": [10, 50, 100],\n",
+ " \"model__max_features\": [\"sqrt\", \"log2\"],\n",
+ " \"model__max_depth\": [5, 7, 10],\n",
+ " \"model__criterion\": [\"gini\", \"entropy\"],\n",
+ "}\n",
+ "\n",
+ "gs_optomizer = GridSearchCV(\n",
+ " estimator=random_forest_model, param_grid=param_grid, n_jobs=-1\n",
+ ")\n",
+ "gs_optomizer.fit(X_train, y_train.values.ravel())\n",
+ "gs_optomizer.best_params_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Обучение модели с новыми гиперпараметрами"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.pipeline import Pipeline\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.compose import ColumnTransformer\n",
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "import numpy as np\n",
+ "from sklearn import metrics\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "# Определяем числовые признаки\n",
+ "numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()\n",
+ "\n",
+ "# Установка random_state\n",
+ "random_state = 42\n",
+ "\n",
+ "# Определение трансформера\n",
+ "pipeline_end = ColumnTransformer([\n",
+ " ('numeric', StandardScaler(), numeric_features),\n",
+ " # Добавьте другие трансформеры, если требуется\n",
+ "])\n",
+ "\n",
+ "# Объявление модели\n",
+ "optimized_model = RandomForestClassifier(\n",
+ " random_state=random_state,\n",
+ " criterion=\"gini\",\n",
+ " max_depth=5,\n",
+ " max_features=\"sqrt\",\n",
+ " n_estimators=50,\n",
+ ")\n",
+ "\n",
+ "# Создание пайплайна с корректными шагами\n",
+ "result = {}\n",
+ "\n",
+ "# Обучение модели\n",
+ "result[\"pipeline\"] = Pipeline([\n",
+ " (\"pipeline\", pipeline_end),\n",
+ " (\"model\", optimized_model)\n",
+ "]).fit(X_train, y_train.values.ravel())\n",
+ "\n",
+ "# Прогнозирование и расчет метрик\n",
+ "result[\"train_preds\"] = result[\"pipeline\"].predict(X_train)\n",
+ "result[\"probs\"] = result[\"pipeline\"].predict_proba(X_test)[:, 1]\n",
+ "result[\"preds\"] = np.where(result[\"probs\"] > 0.5, 1, 0)\n",
+ "\n",
+ "# Метрики для оценки модели\n",
+ "result[\"Precision_train\"] = metrics.precision_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Precision_test\"] = metrics.precision_score(y_test, result[\"preds\"])\n",
+ "result[\"Recall_train\"] = metrics.recall_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Recall_test\"] = metrics.recall_score(y_test, result[\"preds\"])\n",
+ "result[\"Accuracy_train\"] = metrics.accuracy_score(y_train, result[\"train_preds\"])\n",
+ "result[\"Accuracy_test\"] = metrics.accuracy_score(y_test, result[\"preds\"])\n",
+ "result[\"ROC_AUC_test\"] = metrics.roc_auc_score(y_test, result[\"probs\"])\n",
+ "result[\"F1_train\"] = metrics.f1_score(y_train, result[\"train_preds\"])\n",
+ "result[\"F1_test\"] = metrics.f1_score(y_test, result[\"preds\"])\n",
+ "result[\"MCC_test\"] = metrics.matthews_corrcoef(y_test, result[\"preds\"])\n",
+ "result[\"Cohen_kappa_test\"] = metrics.cohen_kappa_score(y_test, result[\"preds\"])\n",
+ "result[\"Confusion_matrix\"] = metrics.confusion_matrix(y_test, result[\"preds\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Формирование данных для оценки старой и новой версии модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimized_metrics = pd.DataFrame(columns=list(result.keys()))\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=class_models[optimized_model_type]\n",
+ ")\n",
+ "optimized_metrics.loc[len(optimized_metrics)] = pd.Series(\n",
+ " data=result\n",
+ ")\n",
+ "optimized_metrics.insert(loc=0, column=\"Name\", value=[\"Old\", \"New\"])\n",
+ "optimized_metrics = optimized_metrics.set_index(\"Name\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Оценка параметров старой и новой модели"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " Precision_train \n",
+ " Precision_test \n",
+ " Recall_train \n",
+ " Recall_test \n",
+ " Accuracy_train \n",
+ " Accuracy_test \n",
+ " F1_train \n",
+ " F1_test \n",
+ " \n",
+ " \n",
+ " Name \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Old \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " New \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " \"Accuracy_train\",\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_train\",\n",
+ " \"F1_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\"Accuracy_train\", \"Accuracy_test\", \"F1_train\", \"F1_test\"],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Precision_train\",\n",
+ " \"Precision_test\",\n",
+ " \"Recall_train\",\n",
+ " \"Recall_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " \n",
+ " Accuracy_test \n",
+ " F1_test \n",
+ " ROC_AUC_test \n",
+ " Cohen_kappa_test \n",
+ " MCC_test \n",
+ " \n",
+ " \n",
+ " Name \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " Old \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ " New \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " 1.000000 \n",
+ " \n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "optimized_metrics[\n",
+ " [\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " \"ROC_AUC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " \"MCC_test\",\n",
+ " ]\n",
+ "].style.background_gradient(\n",
+ " cmap=\"plasma\",\n",
+ " low=0.3,\n",
+ " high=1,\n",
+ " subset=[\n",
+ " \"ROC_AUC_test\",\n",
+ " \"MCC_test\",\n",
+ " \"Cohen_kappa_test\",\n",
+ " ],\n",
+ ").background_gradient(\n",
+ " cmap=\"viridis\",\n",
+ " low=1,\n",
+ " high=0.3,\n",
+ " subset=[\n",
+ " \"Accuracy_test\",\n",
+ " \"F1_test\",\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "_, ax = plt.subplots(1, 2, figsize=(10, 4), sharex=False, sharey=False\n",
+ ")\n",
+ "\n",
+ "for index in range(0, len(optimized_metrics)):\n",
+ " c_matrix = optimized_metrics.iloc[index][\"Confusion_matrix\"]\n",
+ " disp = ConfusionMatrixDisplay(\n",
+ " confusion_matrix=c_matrix, display_labels=[\"No Diabetes\", \"Diabetes\"]\n",
+ " ).plot(ax=ax.flat[index])\n",
+ "\n",
+ "plt.subplots_adjust(top=1, bottom=0, hspace=0.4, wspace=0.3)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "В желтых квадрате мы наблюдаем значение 100, что обозначает количество правильно классифицированных объектов, отнесенных к классу \"No Diabetes\". Это свидетельствует о том, что модель успешно идентифицирует объекты этого класса, минимизируя количество ложных положительных срабатываний.\n",
+ "\n",
+ "В бирюзовом квадрате значение 0 указывает на количество правильно классифицированных объектов, отнесенных к классу \"Diabetes\". Это является показателем не такой высокой точности модели в определении объектов данного класса."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Определение достижимого уровня качества модели для второй задачи (задача регрессии)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(700, 8)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Pregnancies \n",
+ " Glucose \n",
+ " BloodPressure \n",
+ " SkinThickness \n",
+ " Insulin \n",
+ " BMI \n",
+ " DiabetesPedigreeFunction \n",
+ " Age \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 6 \n",
+ " 98 \n",
+ " 58 \n",
+ " 33 \n",
+ " 190 \n",
+ " 34.0 \n",
+ " 0.430 \n",
+ " 43 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 2 \n",
+ " 112 \n",
+ " 75 \n",
+ " 32 \n",
+ " 0 \n",
+ " 35.7 \n",
+ " 0.148 \n",
+ " 21 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 2 \n",
+ " 108 \n",
+ " 64 \n",
+ " 0 \n",
+ " 0 \n",
+ " 30.8 \n",
+ " 0.158 \n",
+ " 21 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 8 \n",
+ " 107 \n",
+ " 80 \n",
+ " 0 \n",
+ " 0 \n",
+ " 24.6 \n",
+ " 0.856 \n",
+ " 34 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 7 \n",
+ " 136 \n",
+ " 90 \n",
+ " 0 \n",
+ " 0 \n",
+ " 29.9 \n",
+ " 0.210 \n",
+ " 50 \n",
+ " \n",
+ " \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " ... \n",
+ " \n",
+ " \n",
+ " 695 \n",
+ " 2 \n",
+ " 105 \n",
+ " 80 \n",
+ " 45 \n",
+ " 191 \n",
+ " 33.7 \n",
+ " 0.711 \n",
+ " 29 \n",
+ " \n",
+ " \n",
+ " 696 \n",
+ " 1 \n",
+ " 126 \n",
+ " 56 \n",
+ " 29 \n",
+ " 152 \n",
+ " 28.7 \n",
+ " 0.801 \n",
+ " 21 \n",
+ " \n",
+ " \n",
+ " 697 \n",
+ " 2 \n",
+ " 95 \n",
+ " 54 \n",
+ " 14 \n",
+ " 88 \n",
+ " 26.1 \n",
+ " 0.748 \n",
+ " 22 \n",
+ " \n",
+ " \n",
+ " 698 \n",
+ " 3 \n",
+ " 100 \n",
+ " 68 \n",
+ " 23 \n",
+ " 81 \n",
+ " 31.6 \n",
+ " 0.949 \n",
+ " 28 \n",
+ " \n",
+ " \n",
+ " 699 \n",
+ " 1 \n",
+ " 85 \n",
+ " 66 \n",
+ " 29 \n",
+ " 0 \n",
+ " 26.6 \n",
+ " 0.351 \n",
+ " 31 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
700 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "0 6 98 58 33 190 34.0 \n",
+ "1 2 112 75 32 0 35.7 \n",
+ "2 2 108 64 0 0 30.8 \n",
+ "3 8 107 80 0 0 24.6 \n",
+ "4 7 136 90 0 0 29.9 \n",
+ ".. ... ... ... ... ... ... \n",
+ "695 2 105 80 45 191 33.7 \n",
+ "696 1 126 56 29 152 28.7 \n",
+ "697 2 95 54 14 88 26.1 \n",
+ "698 3 100 68 23 81 31.6 \n",
+ "699 1 85 66 29 0 26.6 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age \n",
+ "0 0.430 43 \n",
+ "1 0.148 21 \n",
+ "2 0.158 21 \n",
+ "3 0.856 34 \n",
+ "4 0.210 50 \n",
+ ".. ... ... \n",
+ "695 0.711 29 \n",
+ "696 0.801 21 \n",
+ "697 0.748 22 \n",
+ "698 0.949 28 \n",
+ "699 0.351 31 \n",
+ "\n",
+ "[700 rows x 8 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn import set_config\n",
+ "\n",
+ "# Установите random_state для воспроизводимости результатов\n",
+ "random_state = 42\n",
+ "set_config(transform_output=\"pandas\")\n",
+ "\n",
+ "df = pd.read_csv(\"C:/Users/TIGR228/Desktop/МИИ/Lab1/AIM-PIbd-31-Afanasev-S-S/static/csv/diabetes.csv\")\n",
+ "\n",
+ "# Удалите столбцы, которые не нужны для анализа\n",
+ "df = df.drop(columns=[\"Outcome\"])\n",
+ "\n",
+ "df = df.sample(n=700, random_state=random_state).reset_index(drop=True)\n",
+ "\n",
+ "print(df.shape) \n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
+ "0 6 148 72 35 0 33.6 \n",
+ "1 1 85 66 29 0 26.6 \n",
+ "2 8 183 64 0 0 23.3 \n",
+ "3 1 89 66 23 94 28.1 \n",
+ "4 0 137 40 35 168 43.1 \n",
+ "\n",
+ " DiabetesPedigreeFunction Age diabetes_risk_index \n",
+ "0 0.627 50 71.68 \n",
+ "1 0.351 31 46.28 \n",
+ "2 0.672 32 74.69 \n",
+ "3 0.167 21 55.33 \n",
+ "4 2.288 33 81.43 \n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv(\"C:/Users/TIGR228/Desktop/МИИ/Lab1/AIM-PIbd-31-Afanasev-S-S/static/csv/diabetes.csv\")\n",
+ "\n",
+ "required_columns = [\"Pregnancies\", \"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\", \"DiabetesPedigreeFunction\", \"Age\"]\n",
+ "missing_columns = [col for col in required_columns if col not in df.columns]\n",
+ "if missing_columns:\n",
+ " raise ValueError(f\"Отсутствуют столбцы: {missing_columns}\")\n",
+ "\n",
+ "df[\"diabetes_risk_index\"] = (\n",
+ " df[\"Glucose\"] * 0.3 \n",
+ " + df[\"BMI\"] * 0.3 \n",
+ " + df[\"Age\"] * 0.2 \n",
+ " + df[\"BloodPressure\"] * 0.1 \n",
+ " + df[\"Insulin\"] * 0.1 \n",
+ ")\n",
+ "\n",
+ "# Проверка новых данных\n",
+ "print(df[[\"Pregnancies\", \"Glucose\", \"BloodPressure\", \"SkinThickness\", \"Insulin\", \"BMI\", \"DiabetesPedigreeFunction\", \"Age\", \"diabetes_risk_index\"]].head())\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Разделение набора данных на обучающую и тестовые выборки (80/20) для задачи регрессии "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "aimenv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}