{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Данные по инсультам\n",
    "\n",
    "Выведем информацию о столбцах датасета:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 441,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
      "       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
      "       'smoking_status', 'stroke'],\n",
      "      dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>hypertension</th>\n",
       "      <th>heart_disease</th>\n",
       "      <th>ever_married</th>\n",
       "      <th>work_type</th>\n",
       "      <th>Residence_type</th>\n",
       "      <th>avg_glucose_level</th>\n",
       "      <th>bmi</th>\n",
       "      <th>smoking_status</th>\n",
       "      <th>stroke</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9046</td>\n",
       "      <td>Male</td>\n",
       "      <td>67.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>228.69</td>\n",
       "      <td>36.6</td>\n",
       "      <td>formerly smoked</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>51676</td>\n",
       "      <td>Female</td>\n",
       "      <td>61.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Self-employed</td>\n",
       "      <td>Rural</td>\n",
       "      <td>202.21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>31112</td>\n",
       "      <td>Male</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Rural</td>\n",
       "      <td>105.92</td>\n",
       "      <td>32.5</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60182</td>\n",
       "      <td>Female</td>\n",
       "      <td>49.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Private</td>\n",
       "      <td>Urban</td>\n",
       "      <td>171.23</td>\n",
       "      <td>34.4</td>\n",
       "      <td>smokes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1665</td>\n",
       "      <td>Female</td>\n",
       "      <td>79.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Yes</td>\n",
       "      <td>Self-employed</td>\n",
       "      <td>Rural</td>\n",
       "      <td>174.12</td>\n",
       "      <td>24.0</td>\n",
       "      <td>never smoked</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      id  gender   age  hypertension  heart_disease ever_married  \\\n",
       "0   9046    Male  67.0             0              1          Yes   \n",
       "1  51676  Female  61.0             0              0          Yes   \n",
       "2  31112    Male  80.0             0              1          Yes   \n",
       "3  60182  Female  49.0             0              0          Yes   \n",
       "4   1665  Female  79.0             1              0          Yes   \n",
       "\n",
       "       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \\\n",
       "0        Private          Urban             228.69  36.6  formerly smoked   \n",
       "1  Self-employed          Rural             202.21   NaN     never smoked   \n",
       "2        Private          Rural             105.92  32.5     never smoked   \n",
       "3        Private          Urban             171.23  34.4           smokes   \n",
       "4  Self-employed          Rural             174.12  24.0     never smoked   \n",
       "\n",
       "   stroke  \n",
       "0       1  \n",
       "1       1  \n",
       "2       1  \n",
       "3       1  \n",
       "4       1  "
      ]
     },
     "execution_count": 441,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import featuretools as ft\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "import time\n",
    "from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "df = pd.read_csv(\"..//..//static//csv//healthcare-dataset-stroke-data.csv\")\n",
    "\n",
    "print(df.columns)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Определим бизнес цели и цели технического проекта.\n",
    "\n",
    "1. Улучшение диагностики и профилактики инсульта.\n",
    "    * Бизнес-цель: повышение точности прогнозирования риска инсульта среди пациентов для более раннего лечебного вмешательства. Определение основных факторов риска для более целенаправленного подхода в медицинском обслуживании.\n",
    "    * Цель технического проекта: разработка статистической модели, которая решает задачу классификации и предсказывает возможность возникновения  инсульта у пациентов на основе имеющихся данных (возраст, гипертония, заболевания сердца и пр.), с целью выявления групп риска. Внедрение этой модели в систему поддержки принятия медицинских решений для врачей.\n",
    "2. Снижение расходов на лечение инсультов.\n",
    "    * Бизнес-цель: снижение затрат на лечение инсульта путем более эффективного распределения медицинских ресурсов и направленных профилактических мер.\n",
    "    * Цель технического проекта: создание системы оценки индивидуального риска инсульта для пациентов, что позволит медучреждениям проводить профилактические меры среди целевых групп, сокращая расходы на лечение.\n",
    "\n",
    "### И теперь проверим датасет на пустые значения:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 442,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "id                     0\n",
      "gender                 0\n",
      "age                    0\n",
      "hypertension           0\n",
      "heart_disease          0\n",
      "ever_married           0\n",
      "work_type              0\n",
      "Residence_type         0\n",
      "avg_glucose_level      0\n",
      "bmi                  201\n",
      "smoking_status         0\n",
      "stroke                 0\n",
      "dtype: int64\n",
      "\n",
      "id                   False\n",
      "gender               False\n",
      "age                  False\n",
      "hypertension         False\n",
      "heart_disease        False\n",
      "ever_married         False\n",
      "work_type            False\n",
      "Residence_type       False\n",
      "avg_glucose_level    False\n",
      "bmi                   True\n",
      "smoking_status       False\n",
      "stroke               False\n",
      "dtype: bool\n",
      "\n",
      "bmi процент пустых значений: %3.93\n"
     ]
    }
   ],
   "source": [
    "# Количество пустых значений признаков\n",
    "print(df.isnull().sum())\n",
    "\n",
    "print()\n",
    "\n",
    "# Есть ли пустые значения признаков\n",
    "print(df.isnull().any())\n",
    "\n",
    "print()\n",
    "\n",
    "# Процент пустых значений признаков\n",
    "for i in df.columns:\n",
    "    null_rate = df[i].isnull().sum() / len(df) * 100\n",
    "    if null_rate > 0:\n",
    "        print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "В столбце bmi можно заметить пустые значение. Заменим их на медиану:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 443,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Количество пустых значений в каждом столбце после замены:\n",
      "id                   0\n",
      "gender               0\n",
      "age                  0\n",
      "hypertension         0\n",
      "heart_disease        0\n",
      "ever_married         0\n",
      "work_type            0\n",
      "Residence_type       0\n",
      "avg_glucose_level    0\n",
      "bmi                  0\n",
      "smoking_status       0\n",
      "stroke               0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Замена значений\n",
    "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
    "\n",
    "# Проверка на пропущенные значения после замены\n",
    "missing_values_after_drop = df.isnull().sum()\n",
    "\n",
    "# Вывод результатов после замены\n",
    "print(\"\\nКоличество пустых значений в каждом столбце после замены:\")\n",
    "print(missing_values_after_drop)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Можно перейти к созданию выборок"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 444,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Размер обучающей выборки: (2503, 11)\n",
      "Размер контрольной выборки: (1074, 11)\n",
      "Размер тестовой выборки: (1533, 11)\n"
     ]
    }
   ],
   "source": [
    "# Разделение данных на признаки (X) и целевую переменную (y)\n",
    "# В данном случае мы хотим предсказать 'stroke'\n",
    "X = df.drop(columns=['stroke'])\n",
    "y = df['stroke']\n",
    "\n",
    "# Разбиение данных на обучающую и тестовую выборки\n",
    "# Сначала разделим на обучающую и тестовую\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
    "\n",
    "# Затем разделим обучающую выборку на обучающую и контрольную\n",
    "X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
    "\n",
    "# Проверка размеров выборок\n",
    "print(\"Размер обучающей выборки:\", X_train.shape)\n",
    "print(\"Размер контрольной выборки:\", X_val.shape)\n",
    "print(\"Размер тестовой выборки:\", X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Оценим сбалансированность выборок:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 445,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Распределение классов в обучающей выборке:\n",
      "stroke\n",
      "0    0.955653\n",
      "1    0.044347\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Распределение классов в контрольной выборке:\n",
      "stroke\n",
      "0    0.954376\n",
      "1    0.045624\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Распределение классов в тестовой выборке:\n",
      "stroke\n",
      "0    0.941944\n",
      "1    0.058056\n",
      "Name: proportion, dtype: float64\n"
     ]
    },
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 1800x500 with 3 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Функция для анализа сбалансированности\n",
    "def analyze_balance(y_train, y_val, y_test, y_name):\n",
    "    # Распределение классов\n",
    "    print(\"Распределение классов в обучающей выборке:\")\n",
    "    print(y_train.value_counts(normalize=True))\n",
    "    \n",
    "    print(\"\\nРаспределение классов в контрольной выборке:\")\n",
    "    print(y_val.value_counts(normalize=True))\n",
    "    \n",
    "    print(\"\\nРаспределение классов в тестовой выборке:\")\n",
    "    print(y_test.value_counts(normalize=True))\n",
    "\n",
    "    # Создание фигуры и осей для трех столбчатых диаграмм\n",
    "    fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
    "    fig.suptitle('Распределение в различных выборках')\n",
    "\n",
    "    # Обучающая выборка\n",
    "    sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
    "    axes[0].set_title('Обучающая выборка')\n",
    "    axes[0].set_xlabel(y_name)\n",
    "    axes[0].set_ylabel('Доля')\n",
    "\n",
    "    # Контрольная выборка\n",
    "    sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
    "    axes[1].set_title('Контрольная выборка')\n",
    "    axes[1].set_xlabel(y_name)\n",
    "\n",
    "    # Тестовая выборка\n",
    "    sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
    "    axes[2].set_title('Тестовая выборка')\n",
    "    axes[2].set_xlabel(y_name)\n",
    "\n",
    "    plt.show()\n",
    "\n",
    "analyze_balance(y_train, y_val, y_test, 'stroke')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Легко заметить, что выборки несбалансированны. Необходимо сбалансировать обучающую и контрольную выборки, чтобы получить лучшие результаты при обучении модели. Для балансировки применим RandomOverSampler:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 446,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Распределение классов в обучающей выборке:\n",
      "stroke\n",
      "0    0.5\n",
      "1    0.5\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Распределение классов в контрольной выборке:\n",
      "stroke\n",
      "0    0.5\n",
      "1    0.5\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "Распределение классов в тестовой выборке:\n",
      "stroke\n",
      "0    0.941944\n",
      "1    0.058056\n",
      "Name: proportion, dtype: float64\n"
     ]
    },
    {
     "data": {
      "image/png": "",
      "text/plain": [
       "<Figure size 1800x500 with 3 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ros = RandomOverSampler(random_state=42)\n",
    "\n",
    "# Применение RandomOverSampler для балансировки выборок\n",
    "X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
    "X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
    "\n",
    "# Проверка сбалансированности после RandomOverSampler\n",
    "analyze_balance(y_train_resampled, y_val_resampled, y_test, 'stroke')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Выборки сбалансированы.\n",
    "\n",
    "### Перейдем к конструированию признаков\n",
    "\n",
    "Для начала применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные вектора:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 447,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      id   age  hypertension  heart_disease  avg_glucose_level   bmi  \\\n",
      "0  16605  57.0             0              0             106.24  32.3   \n",
      "1  12015  14.0             0              0              99.87  25.2   \n",
      "2  26474  44.0             0              0              97.16  33.1   \n",
      "3  31143  22.0             0              0             107.52  41.6   \n",
      "4   2447  63.0             0              0              85.04  29.7   \n",
      "\n",
      "   gender_Male  gender_Other  ever_married_Yes  work_type_Never_worked  \\\n",
      "0         True         False              True                   False   \n",
      "1         True         False             False                   False   \n",
      "2        False         False              True                   False   \n",
      "3        False         False             False                   False   \n",
      "4        False         False              True                   False   \n",
      "\n",
      "   work_type_Private  work_type_Self-employed  work_type_children  \\\n",
      "0               True                    False               False   \n",
      "1              False                    False                True   \n",
      "2              False                    False               False   \n",
      "3               True                    False               False   \n",
      "4               True                    False               False   \n",
      "\n",
      "   Residence_type_Urban  smoking_status_formerly smoked  \\\n",
      "0                  True                           False   \n",
      "1                  True                           False   \n",
      "2                  True                           False   \n",
      "3                 False                           False   \n",
      "4                  True                            True   \n",
      "\n",
      "   smoking_status_never smoked  smoking_status_smokes  \n",
      "0                         True                  False  \n",
      "1                        False                  False  \n",
      "2                        False                  False  \n",
      "3                        False                  False  \n",
      "4                        False                  False  \n"
     ]
    }
   ],
   "source": [
    "# Определение категориальных признаков\n",
    "categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
    "\n",
    "# Применение one-hot encoding к обучающей выборке\n",
    "X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
    "\n",
    "# Применение one-hot encoding к контрольной выборке\n",
    "X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
    "\n",
    "# Применение one-hot encoding к тестовой выборке\n",
    "X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
    "\n",
    "print(X_train_encoded.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Далее к числовым признакам, а именно к колонке age, применим дискретизацию (позволяет преобразовать данные из числового представления в категориальное):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 448,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      id  hypertension  heart_disease  avg_glucose_level   bmi  gender_Male  \\\n",
      "0  16605             0              0             106.24  32.3         True   \n",
      "1  12015             0              0              99.87  25.2         True   \n",
      "2  26474             0              0              97.16  33.1        False   \n",
      "3  31143             0              0             107.52  41.6        False   \n",
      "4   2447             0              0              85.04  29.7        False   \n",
      "\n",
      "   gender_Other  ever_married_Yes  work_type_Never_worked  work_type_Private  \\\n",
      "0         False              True                   False               True   \n",
      "1         False             False                   False              False   \n",
      "2         False              True                   False              False   \n",
      "3         False             False                   False               True   \n",
      "4         False              True                   False               True   \n",
      "\n",
      "   work_type_Self-employed  work_type_children  Residence_type_Urban  \\\n",
      "0                    False               False                  True   \n",
      "1                    False                True                  True   \n",
      "2                    False               False                  True   \n",
      "3                    False               False                 False   \n",
      "4                    False               False                  True   \n",
      "\n",
      "   smoking_status_formerly smoked  smoking_status_never smoked  \\\n",
      "0                           False                         True   \n",
      "1                           False                        False   \n",
      "2                           False                        False   \n",
      "3                           False                        False   \n",
      "4                            True                        False   \n",
      "\n",
      "   smoking_status_smokes      age_bin  \n",
      "0                  False          old  \n",
      "1                  False        young  \n",
      "2                  False  middle-aged  \n",
      "3                  False        young  \n",
      "4                  False          old  \n"
     ]
    }
   ],
   "source": [
    "# Определение числовых признаков для дискретизации\n",
    "numerical_features = ['age']\n",
    "\n",
    "# Функция для дискретизации числовых признаков\n",
    "def discretize_features(df, features, bins, labels):\n",
    "    for feature in features:\n",
    "        df[f'{feature}_bin'] = pd.cut(df[feature], bins=bins, labels=labels)\n",
    "        df.drop(columns=[feature], inplace=True)\n",
    "    return df\n",
    "\n",
    "# Заданные интервалы и метки\n",
    "age_bins = [0, 25, 55, 100]\n",
    "age_labels = [\"young\", \"middle-aged\", \"old\"]\n",
    "\n",
    "# Применение дискретизации к обучающей, контрольной и тестовой выборкам\n",
    "X_train_encoded = discretize_features(X_train_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
    "X_val_encoded = discretize_features(X_val_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
    "X_test_encoded = discretize_features(X_test_encoded, numerical_features, bins=age_bins, labels=age_labels)\n",
    "\n",
    "print(X_train_encoded.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Применим ручной синтез признаков. Это создание новых признаков на основе существующих, учитывая экспертные знания и логику предметной области. К примеру, в этом случае можно создать признак, в котором вычисляется насколько уровень глюкозы отклоняется от среднего для возрастной группы пациента. Такой признак может быть полезен для выделения пациентов с нетипичными данными."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 449,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      id  hypertension  heart_disease  avg_glucose_level   bmi  gender_Male  \\\n",
      "0  16605             0              0             106.24  32.3         True   \n",
      "1  12015             0              0              99.87  25.2         True   \n",
      "2  26474             0              0              97.16  33.1        False   \n",
      "3  31143             0              0             107.52  41.6        False   \n",
      "4   2447             0              0              85.04  29.7        False   \n",
      "\n",
      "   gender_Other  ever_married_Yes  work_type_Never_worked  work_type_Private  \\\n",
      "0         False              True                   False               True   \n",
      "1         False             False                   False              False   \n",
      "2         False              True                   False              False   \n",
      "3         False             False                   False               True   \n",
      "4         False              True                   False               True   \n",
      "\n",
      "   work_type_Self-employed  work_type_children  Residence_type_Urban  \\\n",
      "0                    False               False                  True   \n",
      "1                    False                True                  True   \n",
      "2                    False               False                  True   \n",
      "3                    False               False                 False   \n",
      "4                    False               False                  True   \n",
      "\n",
      "   smoking_status_formerly smoked  smoking_status_never smoked  \\\n",
      "0                           False                         True   \n",
      "1                           False                        False   \n",
      "2                           False                        False   \n",
      "3                           False                        False   \n",
      "4                            True                        False   \n",
      "\n",
      "   smoking_status_smokes      age_bin  glucose_age_deviation  \n",
      "0                  False          old             -27.642870  \n",
      "1                  False        young               6.088032  \n",
      "2                  False  middle-aged              -6.217053  \n",
      "3                  False        young              13.738032  \n",
      "4                  False          old             -48.842870  \n"
     ]
    }
   ],
   "source": [
    "age_glucose_mean = X_train_encoded.groupby('age_bin', observed=False)['avg_glucose_level'].transform('mean')\n",
    "X_train_encoded['glucose_age_deviation'] = X_train_encoded['avg_glucose_level'] - age_glucose_mean\n",
    "\n",
    "age_glucose_mean = X_val_encoded.groupby('age_bin', observed=False)['avg_glucose_level'].transform('mean')\n",
    "X_val_encoded['glucose_age_deviation'] = X_val_encoded['avg_glucose_level'] - age_glucose_mean\n",
    "\n",
    "age_glucose_mean = X_test_encoded.groupby('age_bin', observed=False)['avg_glucose_level'].transform('mean')\n",
    "X_test_encoded['glucose_age_deviation'] = X_test_encoded['avg_glucose_level'] - age_glucose_mean\n",
    "\n",
    "print(X_train_encoded.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Теперь используем масштабирование признаков, что позволяет привести все числовые признаки к одинаковым или очень похожим диапазонам значений либо распределениям. По результатам многочисленных исследований масштабирование признаков позволяет получить более качественную модель за счет снижения доминирования одних признаков над другими."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 450,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      id  hypertension  heart_disease  avg_glucose_level       bmi  \\\n",
      "0  16605             0              0          -0.244097  0.426328   \n",
      "1  12015             0              0          -0.360110 -0.596170   \n",
      "2  26474             0              0          -0.409465  0.541539   \n",
      "3  31143             0              0          -0.220785  1.765656   \n",
      "4   2447             0              0          -0.630199  0.051892   \n",
      "\n",
      "   gender_Male  gender_Other  ever_married_Yes  work_type_Never_worked  \\\n",
      "0         True         False              True                   False   \n",
      "1         True         False             False                   False   \n",
      "2        False         False              True                   False   \n",
      "3        False         False             False                   False   \n",
      "4        False         False              True                   False   \n",
      "\n",
      "   work_type_Private  work_type_Self-employed  work_type_children  \\\n",
      "0               True                    False               False   \n",
      "1              False                    False                True   \n",
      "2              False                    False               False   \n",
      "3               True                    False               False   \n",
      "4               True                    False               False   \n",
      "\n",
      "   Residence_type_Urban  smoking_status_formerly smoked  \\\n",
      "0                  True                           False   \n",
      "1                  True                           False   \n",
      "2                  True                           False   \n",
      "3                 False                           False   \n",
      "4                  True                            True   \n",
      "\n",
      "   smoking_status_never smoked  smoking_status_smokes      age_bin  \\\n",
      "0                         True                  False          old   \n",
      "1                        False                  False        young   \n",
      "2                        False                  False  middle-aged   \n",
      "3                        False                  False        young   \n",
      "4                        False                  False          old   \n",
      "\n",
      "   glucose_age_deviation  \n",
      "0              -0.528807  \n",
      "1               0.116464  \n",
      "2              -0.118932  \n",
      "3               0.262808  \n",
      "4              -0.934362  \n"
     ]
    }
   ],
   "source": [
    "# Пример масштабирования числовых признаков\n",
    "numerical_features = ['avg_glucose_level', 'bmi', 'glucose_age_deviation']\n",
    "\n",
    "scaler = StandardScaler()\n",
    "X_train_encoded[numerical_features] = scaler.fit_transform(X_train_encoded[numerical_features])\n",
    "X_val_encoded[numerical_features] = scaler.transform(X_val_encoded[numerical_features])\n",
    "X_test_encoded[numerical_features] = scaler.transform(X_test_encoded[numerical_features])\n",
    "\n",
    "print(X_train_encoded.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "И также попробуем сконструировать признаки, используя фреймворк Featuretools:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 451,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          id  hypertension  heart_disease  avg_glucose_level       bmi  \\\n",
      "index                                                                    \n",
      "0      16605             0              0          -0.244097  0.426328   \n",
      "1      12015             0              0          -0.360110 -0.596170   \n",
      "2      26474             0              0          -0.409465  0.541539   \n",
      "3      31143             0              0          -0.220785  1.765656   \n",
      "4       2447             0              0          -0.630199  0.051892   \n",
      "\n",
      "       gender_Male  gender_Other  ever_married_Yes  work_type_Never_worked  \\\n",
      "index                                                                        \n",
      "0             True         False              True                   False   \n",
      "1             True         False             False                   False   \n",
      "2            False         False              True                   False   \n",
      "3            False         False             False                   False   \n",
      "4            False         False              True                   False   \n",
      "\n",
      "       work_type_Private  work_type_Self-employed  work_type_children  \\\n",
      "index                                                                   \n",
      "0                   True                    False               False   \n",
      "1                  False                    False                True   \n",
      "2                  False                    False               False   \n",
      "3                   True                    False               False   \n",
      "4                   True                    False               False   \n",
      "\n",
      "       Residence_type_Urban  smoking_status_formerly smoked  \\\n",
      "index                                                         \n",
      "0                      True                           False   \n",
      "1                      True                           False   \n",
      "2                      True                           False   \n",
      "3                     False                           False   \n",
      "4                      True                            True   \n",
      "\n",
      "       smoking_status_never smoked  smoking_status_smokes      age_bin  \\\n",
      "index                                                                    \n",
      "0                             True                  False          old   \n",
      "1                            False                  False        young   \n",
      "2                            False                  False  middle-aged   \n",
      "3                            False                  False        young   \n",
      "4                            False                  False          old   \n",
      "\n",
      "       glucose_age_deviation  \n",
      "index                         \n",
      "0                  -0.528807  \n",
      "1                   0.116464  \n",
      "2                  -0.118932  \n",
      "3                   0.262808  \n",
      "4                  -0.934362  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\Ilya\\Desktop\\AIM\\aimenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
      "  pd.to_datetime(\n"
     ]
    }
   ],
   "source": [
    "data = X_train_encoded.copy()  # Используем предобработанные данные\n",
    "\n",
    "es = ft.EntitySet(id=\"patients\")\n",
    "\n",
    "es = es.add_dataframe(dataframe_name=\"strokes_data\", dataframe=data, index=\"index\", make_index=True)\n",
    "\n",
    "feature_matrix, feature_defs = ft.dfs(\n",
    "    entityset=es, \n",
    "    target_dataframe_name=\"strokes_data\",\n",
    "    max_depth=1\n",
    ")\n",
    "\n",
    "print(feature_matrix.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Оценим качество набора признаков.\n",
    "\n",
    "Представим основные оценки качества наборов признаков: \n",
    "\n",
    "* Предсказательная способность \n",
    "\n",
    "  Метрики: RMSE, MAE, R².\n",
    "\n",
    "  Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
    "\n",
    "* Скорость вычисления \n",
    "\n",
    "  Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
    "\n",
    "* Надежность \n",
    "\n",
    "  Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
    "\n",
    "* Корреляция \n",
    "\n",
    "  Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
    "\n",
    "* Цельность \n",
    "\n",
    "  Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 452,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Время обучения модели: 0.01 секунд\n",
      "Среднеквадратичная ошибка: 0.41\n"
     ]
    }
   ],
   "source": [
    "X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
    "X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
    "X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
    "\n",
    "all_columns = X_train_encoded.columns\n",
    "X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)\n",
    "X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
    "X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
    "\n",
    "# Обучение модели\n",
    "model = LinearRegression()\n",
    "\n",
    "# Начинаем отсчет времени\n",
    "start_time = time.time()\n",
    "model.fit(X_train_encoded, y_train_resampled)\n",
    "\n",
    "# Время обучения модели\n",
    "train_time = time.time() - start_time\n",
    "\n",
    "# Предсказания и оценка модели и вычисляем среднеквадратичную ошибку\n",
    "predictions = model.predict(X_val_encoded)\n",
    "mse = root_mean_squared_error(y_val_resampled, predictions)\n",
    "\n",
    "print(f'Время обучения модели: {train_time:.2f} секунд')\n",
    "print(f'Среднеквадратичная ошибка: {mse:.2f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 453,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "RMSE: 0.24109840514907446\n",
      "R²: -0.06295721700021817\n",
      "MAE: 0.10402478799739073 \n",
      "\n",
      "Кросс-валидация RMSE: 0.1197518340742331 \n",
      "\n",
      "Train RMSE: 0.037396456827854585\n",
      "Train R²: 0.9944060200668896\n",
      "Train MAE: 0.010727424749163881\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Выбор модели\n",
    "model = RandomForestRegressor(random_state=42)\n",
    "\n",
    "# Обучение модели\n",
    "model.fit(X_train_encoded, y_train_resampled)\n",
    "\n",
    "# Предсказание и оценка\n",
    "y_pred = model.predict(X_test_encoded)\n",
    "\n",
    "rmse = root_mean_squared_error(y_test, y_pred)\n",
    "r2 = r2_score(y_test, y_pred)\n",
    "mae = mean_absolute_error(y_test, y_pred)\n",
    "\n",
    "print()\n",
    "print(f\"RMSE: {rmse}\")\n",
    "print(f\"R²: {r2}\")\n",
    "print(f\"MAE: {mae} \\n\")\n",
    "\n",
    "# Кросс-валидация\n",
    "scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='neg_mean_squared_error')\n",
    "rmse_cv = (-scores.mean())**0.5\n",
    "print(f\"Кросс-валидация RMSE: {rmse_cv} \\n\")\n",
    "\n",
    "# Проверка на переобучение\n",
    "y_train_pred = model.predict(X_train_encoded)\n",
    "\n",
    "rmse_train = root_mean_squared_error(y_train_resampled, y_train_pred)\n",
    "r2_train = r2_score(y_train_resampled, y_train_pred)\n",
    "mae_train = mean_absolute_error(y_train_resampled, y_train_pred)\n",
    "\n",
    "print(f\"Train RMSE: {rmse_train}\")\n",
    "print(f\"Train R²: {r2_train}\")\n",
    "print(f\"Train MAE: {mae_train}\")\n",
    "print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Можно заметить, что модель хорошо подстроилась под тренировочные данные (низкий Train RMSE и высокое значение Train R²). Однако более высокий RMSE и отрицательный R² на тестовом наборе свидетельствуют о том, что модель переобучилась: она не обобщила зависимости и плохо предсказывает новые данные. Поэтому можно сделать вывод, что получившийся набор признаков, к сожалению, далёк от идеала."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "aimenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}