AIM-PIbd-31-Anisin-R-S/lab_3/lab3.ipynb

1265 lines
272 KiB
Plaintext
Raw Normal View History

2024-11-09 00:27:45 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Лабораторная 3\n",
"Датасет: Набор данных для анализа и прогнозирования сердечного приступа"
]
},
{
"cell_type": "code",
"execution_count": 345,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
" dtype='object')\n",
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 319795 entries, 0 to 319794\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 HeartDisease 319795 non-null object \n",
" 1 BMI 319795 non-null float64\n",
" 2 Smoking 319795 non-null object \n",
" 3 AlcoholDrinking 319795 non-null object \n",
" 4 Stroke 319795 non-null object \n",
" 5 PhysicalHealth 319795 non-null float64\n",
" 6 MentalHealth 319795 non-null float64\n",
" 7 DiffWalking 319795 non-null object \n",
" 8 Sex 319795 non-null object \n",
" 9 AgeCategory 319795 non-null object \n",
" 10 Race 319795 non-null object \n",
" 11 Diabetic 319795 non-null object \n",
" 12 PhysicalActivity 319795 non-null object \n",
" 13 GenHealth 319795 non-null object \n",
" 14 SleepTime 319795 non-null float64\n",
" 15 Asthma 319795 non-null object \n",
" 16 KidneyDisease 319795 non-null object \n",
" 17 SkinCancer 319795 non-null object \n",
"dtypes: float64(4), object(14)\n",
"memory usage: 43.9+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>HeartDisease</th>\n",
" <th>BMI</th>\n",
" <th>Smoking</th>\n",
" <th>AlcoholDrinking</th>\n",
" <th>Stroke</th>\n",
" <th>PhysicalHealth</th>\n",
" <th>MentalHealth</th>\n",
" <th>DiffWalking</th>\n",
" <th>Sex</th>\n",
" <th>AgeCategory</th>\n",
" <th>Race</th>\n",
" <th>Diabetic</th>\n",
" <th>PhysicalActivity</th>\n",
" <th>GenHealth</th>\n",
" <th>SleepTime</th>\n",
" <th>Asthma</th>\n",
" <th>KidneyDisease</th>\n",
" <th>SkinCancer</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>No</td>\n",
" <td>16.60</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>3.0</td>\n",
" <td>30.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>55-59</td>\n",
" <td>White</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>5.0</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>No</td>\n",
" <td>20.34</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>80 or older</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>7.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>No</td>\n",
" <td>26.58</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>20.0</td>\n",
" <td>30.0</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>65-69</td>\n",
" <td>White</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>Fair</td>\n",
" <td>8.0</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No</td>\n",
" <td>24.21</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>75-79</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Good</td>\n",
" <td>6.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>No</td>\n",
" <td>23.71</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>28.0</td>\n",
" <td>0.0</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>40-44</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>8.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"0 No 16.60 Yes No No 3.0 \n",
"1 No 20.34 No No Yes 0.0 \n",
"2 No 26.58 Yes No No 20.0 \n",
"3 No 24.21 No No No 0.0 \n",
"4 No 23.71 No No No 28.0 \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"0 30.0 No Female 55-59 White Yes \n",
"1 0.0 No Female 80 or older White No \n",
"2 30.0 No Male 65-69 White Yes \n",
"3 0.0 No Female 75-79 White No \n",
"4 0.0 Yes Female 40-44 White No \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
"0 Yes Very good 5.0 Yes No Yes \n",
"1 Yes Very good 7.0 No No No \n",
"2 Yes Fair 8.0 Yes No No \n",
"3 No Good 6.0 No No Yes \n",
"4 Yes Very good 8.0 No No No "
]
},
"execution_count": 345,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.preprocessing import StandardScaler\n",
"import featuretools as ft\n",
"import time\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"df = pd.read_csv(\".//static//csv//heart_2020_cleaned.csv\")\n",
"\n",
"print(df.columns)\n",
"df.info()\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Бизнес цели и цели технического проекта\n",
"1. Улучшение профилактики сердечно-сосудистых заболеваний\n",
"\n",
" - Бизнес-цель: Повышение точности прогнозирования риска сердечно-сосудистых заболеваний среди пациентов для более раннего вмешательства и снижения частоты обострений. Определение основных факторов риска, чтобы медперсонал мог предоставлять более целенаправленные рекомендации по улучшению здоровья.\n",
"\n",
" - Цель технического проекта: Разработка классификационной модели для предсказания вероятности сердечно-сосудистых заболеваний на основе данных (возраст, индекс массы тела, физическая активность, курение и т. д.), что поможет выделить группы высокого риска. Интеграция этой модели в систему поддержки принятия решений для врачей, чтобы улучшить качество и своевременность рекомендаций.\n",
"\n",
"2. Снижение расходов на лечение сердечно-сосудистых заболеваний\n",
"\n",
" - Бизнес-цель: Оптимизация затрат на лечение сердечно-сосудистых заболеваний путем эффективного распределения ресурсов и проведения профилактических мер среди целевых групп.\n",
"\n",
" - Цель технического проекта: Создание системы оценки индивидуального риска сердечно-сосудистых заболеваний для пациентов, которая позволит медицинским учреждениям и страховым компаниям выделять целевые группы для проведения превентивных мероприятий, тем самым сокращая затраты на лечение."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на пустые значения и дубликаты"
]
},
{
"cell_type": "code",
"execution_count": 346,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пустые значения по столбцам:\n",
"HeartDisease 0\n",
"BMI 0\n",
"Smoking 0\n",
"AlcoholDrinking 0\n",
"Stroke 0\n",
"PhysicalHealth 0\n",
"MentalHealth 0\n",
"DiffWalking 0\n",
"Sex 0\n",
"AgeCategory 0\n",
"Race 0\n",
"Diabetic 0\n",
"PhysicalActivity 0\n",
"GenHealth 0\n",
"SleepTime 0\n",
"Asthma 0\n",
"KidneyDisease 0\n",
"SkinCancer 0\n",
"dtype: int64\n",
"\n",
"Количество дубликатов: 18078\n",
"\n",
"Статистический обзор данных:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>BMI</th>\n",
" <th>PhysicalHealth</th>\n",
" <th>MentalHealth</th>\n",
" <th>SleepTime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>319795.000000</td>\n",
" <td>319795.00000</td>\n",
" <td>319795.000000</td>\n",
" <td>319795.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>28.325399</td>\n",
" <td>3.37171</td>\n",
" <td>3.898366</td>\n",
" <td>7.097075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>6.356100</td>\n",
" <td>7.95085</td>\n",
" <td>7.955235</td>\n",
" <td>1.436007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>12.020000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>24.030000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>27.340000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>31.420000</td>\n",
" <td>2.00000</td>\n",
" <td>3.000000</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>94.850000</td>\n",
" <td>30.00000</td>\n",
" <td>30.000000</td>\n",
" <td>24.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" BMI PhysicalHealth MentalHealth SleepTime\n",
"count 319795.000000 319795.00000 319795.000000 319795.000000\n",
"mean 28.325399 3.37171 3.898366 7.097075\n",
"std 6.356100 7.95085 7.955235 1.436007\n",
"min 12.020000 0.00000 0.000000 1.000000\n",
"25% 24.030000 0.00000 0.000000 6.000000\n",
"50% 27.340000 0.00000 0.000000 7.000000\n",
"75% 31.420000 2.00000 3.000000 8.000000\n",
"max 94.850000 30.00000 30.000000 24.000000"
]
},
"execution_count": 346,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"null_values = df.isnull().sum()\n",
"print(\"Пустые значения по столбцам:\")\n",
"print(null_values)\n",
"\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
"\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Пустых значений нет, но есть дубликаты, удаляем их"
]
},
{
"cell_type": "code",
"execution_count": 347,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Количество дубликатов: 0\n"
]
}
],
"source": [
"df = df.drop_duplicates()\n",
"duplicates = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicates}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Преобразуем строковые значение в столбце 'Сердечный приступ' в числовые значения. Это понадобится для расчёта качества набора признаков."
]
},
{
"cell_type": "code",
"execution_count": 348,
"metadata": {},
"outputs": [],
"source": [
"map_stroke_to_int = {'No': 0, 'Yes': 1}\n",
"\n",
"df['Stroke'] = df['Stroke'].map(map_stroke_to_int).astype('int32')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Создание выборок"
]
},
{
"cell_type": "code",
"execution_count": 349,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: (147840, 17)\n",
"Размер контрольной выборки: (63361, 17)\n",
"Размер тестовой выборки: (90516, 17)\n"
]
}
],
"source": [
"# Разделение данных на признаки (X) и целевую переменную (y)\n",
"# В данном случае мы хотим предсказать 'stroke'\n",
"X = df.drop(columns=['Stroke'])\n",
"y = df['Stroke']\n",
"\n",
"# Разбиение данных на обучающую и тестовую выборки\n",
"# Сначала разделим на обучающую и тестовую\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
"\n",
"# Затем разделим обучающую выборку на обучающую и контрольную\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
"\n",
"# Проверка размеров выборок\n",
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
"print(\"Размер тестовой выборки:\", X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценим сбалансированность выборок"
]
},
{
"cell_type": "code",
"execution_count": 350,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Stroke\n",
"0 0.960045\n",
"1 0.039955\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Stroke\n",
"0 0.95977\n",
"1 0.04023\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Stroke\n",
"0 0.96014\n",
"1 0.03986\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABp3ElEQVR4nO3deXxM1//H8XcS2SO2RBJEYl9raSxF7SE0ii7U8q2gRYsu9KstbUV1SVVrKVqlpQstpaXfVquW0qqmFFXUUkvUnohdkJA5vz88Mj9jJsRSk9u+no/HPB6ZM+fe+7kzmTl33nPnjIcxxggAAAAAAAAAAIvydHcBAAAAAAAAAADcCIJuAAAAAAAAAIClEXQDAAAAAAAAACyNoBsAAAAAAAAAYGkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAAAAAMDSCLoBAAAAAAAAAJZG0A0AAAAA18Fmsyk9PV27du1ydykAAAD/egTdAAAAAJBHhw4d0pNPPqmoqCj5+PgoNDRUVatW1cmTJ91dGgAAwL9aAXcXAAAAcLN98MEH6tWrl/26r6+vSpcurdatW+uFF15QWFiYG6sDYFU7duxQ8+bNdf78eT3++OO6/fbbVaBAAfn7+yswMNDd5QEAAPyrEXQDAIB/rJEjR6pMmTI6d+6cfvrpJ73zzjv65ptvtGnTJgUEBLi7PAAW069fP/n4+OiXX35RyZIl3V0OAAAALkHQDQAA/rHatm2rOnXqSJIefvhhFStWTGPGjNGXX36prl27urk6AFaydu1aff/991q0aBEhNwAAQD7EHN0AAOBfo0WLFpKklJQUSdLRo0f13//+V7fddpuCgoIUHBystm3b6vfff3da9ty5cxoxYoQqVqwoPz8/RURE6N5779XOnTslSbt375aHh0eul2bNmtnXtXz5cnl4eGj27NkaNmyYwsPDFRgYqPbt22vv3r1O2161apXatGmjQoUKKSAgQE2bNtXKlStd7mOzZs1cbn/EiBFOfWfMmKGYmBj5+/uraNGi6tKli8vtX2nfLmWz2TRu3DhVq1ZNfn5+CgsLU79+/XTs2DGHftHR0WrXrp3TdgYOHOi0Tle1jx492uk+laTMzEwlJiaqfPny8vX1VWRkpJ5++mllZma6vK8udfn9FhISovj4eG3atClPy1avXl1r165Vw4YN5e/vrzJlymjy5MkO/bKysjR8+HDFxMSoUKFCCgwMVOPGjbVs2TKHftu2bVOLFi0UHh5u349HHnlER48eddp2z549r/p49+zZU9HR0Q7L7d27V/7+/vLw8NDu3bsl/f/j/MEHHzj0HTFihMvHZeDAgU71tGvXzmFbOet84403crn3nNc/ffp0eXh4aNq0aQ79Xn31VXl4eOibb77JdV3Sxf+vnPvB09NT4eHheuCBB7Rnz54bquuXX36Rn5+fdu7cqWrVqsnX11fh4eHq16+fy8dmzpw59udXSEiI/vOf/2j//v0OfXr27KmgoCDt2rVLcXFxCgwMVIkSJTRy5EgZY5zqvfSxOXXqlGJiYlSmTBkdPHjQ3v7GG2+oYcOGKlasmPz9/RUTE6O5c+c6bPdG72MAAID8iDO6AQDAv0ZOKF2sWDFJ0q5duzR//nx16tRJZcqUUWpqqt599101bdpUmzdvVokSJSRJ2dnZateunZYuXaouXbroiSee0KlTp7R48WJt2rRJ5cqVs2+ja9euuuuuuxy2O3ToUJf1vPLKK/Lw8NAzzzyjtLQ0jRs3TrGxsVq/fr38/f0lSd9//73atm2rmJgYJSYmytPTU9OnT1eLFi20YsUK1atXz2m9pUqVUlJSkiTp9OnTevTRR11u+4UXXlDnzp318MMP6/Dhw5owYYKaNGmi3377TYULF3Zapm/fvmrcuLEk6YsvvtC8efMcbu/Xr599fvTHH39cKSkpmjhxon777TetXLlS3t7eLu+Ha3H8+HH7vl3KZrOpffv2+u
mnn9S3b19VqVJFGzdu1NixY/Xnn39q/vz5V1135cqV9dxzz8kYo507d2rMmDG66667HALS3Bw7dkx33XWXOnfurK5du+qzzz7To48+Kh8fH/Xu3VuSdPLkSb333nvq2rWr+vTpo1OnTun9999XXFycVq9erVq1akmSMjIyVKpUKd19990KDg7Wpk2bNGnSJO3fv19fffWV07ZDQkI0duxY+/UHH3zwqvUOHz5c586du2o/d+jVq5e++OILDR48WK1atVJkZKQ2btyoF198UQ899JDT88uVxo0bq2/fvrLZbNq0aZPGjRunAwcOaMWKFddd15EjR3Tu3Dk9+uijatGihR555BHt3LlTkyZN0qpVq7Rq1Sr5+vpK+v/fCahbt66SkpKUmpqq8ePHa+XKlU7Pr+zsbLVp00Z33HGHXn/9dS1cuFCJiYm6cOGCRo4c6bKW8+fP67777tOePXu0cuVKRURE2G8bP3682rdvr+7duysrK0uzZs1Sp06d9PXXXys+Pv6m3ccAAAD5jgEAAPiHmT59upFklixZYg4fPmz27t1rZs2aZYoVK2b8/f3Nvn37jDHGnDt3zmRnZzssm5KSYnx9fc3IkSPtbdOmTTOSzJgxY5y2ZbPZ7MtJMqNHj3bqU61aNdO0aVP79WXLlhlJpmTJkubkyZP29s8++8xIMuPHj7evu0KFCiYuLs6+HWOMOXPmjClTpoxp1aqV07YaNmxoqlevbr9++PBhI8kkJiba23bv3m28vLzMK6+84rDsxo0bTYECBZzat2/fbiSZDz/80N6WmJhoLj2UXLFihZFkZs6c6bDswoULndqjoqJMfHy8U+0DBgwwlx+eXl77008/bYoXL25iYmIc7tOPP/7YeHp6mhUrVjgsP3nyZCPJrFy50ml7l2ratKnD+owxZtiwYUaSSUtLu+qyksybb75pb8vMzDS1atUyxYsXN1lZWcYYYy5cuGAyMzMdlj127JgJCwszvXv3vuI2+vfvb4KCgpzau3fvbsqUKePQdvl9lpCQYKKiouzXN23aZDw9PU3btm2NJJOSkmKMMeavv/4yksy0adMc1nf5Y52zjQEDBjjVEx8f77CtKz0vrrT+gwcPmqJFi5pWrVqZzMxMU7t2bVO6dGlz4sSJXNeTIyoqyiQkJDi0devWzQQEBNxQXTnXW7ZsaS5cuGBvz3m9mTBhgjHGmKysLFO8eHFTvXp1c/bsWXu/r7/+2kgyw4cPt7clJCQYSeaxxx6zt9lsNhMfH298fHzM4cOHHeqdPn26sdlspnv37iYgIMCsWrXKqe4zZ844XM/KyjLVq1c3LVq0cGi/kfsYAAAgP2LqEgAA8I8VGxur0NBQRUZGqkuXLgoKCtK8efPs8+v6+vrK0/Pi4VB2draOHDmioKAgVapUSevWrbOv5/PPP1dISIgee+wxp21cPqXDtejRo4cKFixov37//fcrIiLCPm3A+vXrtX37dnXr1k1HjhxRenq60tPTlZGRoZYtW+rHH3+UzWZzWOe5c+fk5+d3xe1+8cUXstls6ty5s32d6enpCg8PV4UKFZym0sjKypIk+9mqrsyZM0eFChVSq1atHNYZExOjoKAgp3WeP3/eoV96evpVzzDev3+/JkyYoBdeeEFBQUFO269SpYoqV67ssM6c6Wou374rOTUdPnxYycnJmjdvnmrUqKGQkJCrLlugQAH169fPft3Hx0f9+vVTWlqa1q5dK0ny8vKSj4+PpItnoB89elQXLlxQnTp1HP7fcpw4cUKpqalaunSpFixYoCZNmjj1ycrKuuLj4srQoUN1++23q1OnTg7toaGhkqR9+/blaT3nzp1zegzPnz/vsu+ZM2eUnp6uY8eOOUzJkZvw8HBNmjRJixcvVuPGjbV+/XpNmzZNwcHBeaotMzNT6enpSktL0+LFi/X999+rZcuWN1yXJA0ePFheXl726w8++KDCwsK0YMECSdKaNWuUlpam/v37OzwX4+PjVblyZX
u/S106DUzOtDBZWVlasmSJU98hQ4Zo5syZ+uyzz1x+oyPn2yDSxW8anDhxQo0bN3b6H7vR+xgAACC/YeoSAADwjzV
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Функция для анализа сбалансированности\n",
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
" # Распределение классов\n",
" print(\"Распределение классов в обучающей выборке:\")\n",
" print(y_train.value_counts(normalize=True))\n",
" \n",
" print(\"\\nРаспределение классов в контрольной выборке:\")\n",
" print(y_val.value_counts(normalize=True))\n",
" \n",
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
" print(y_test.value_counts(normalize=True))\n",
"\n",
" # Создание фигуры и осей для трех столбчатых диаграмм\n",
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
" fig.suptitle('Распределение в различных выборках')\n",
"\n",
" # Обучающая выборка\n",
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
" axes[0].set_title('Обучающая выборка')\n",
" axes[0].set_xlabel(y_name)\n",
" axes[0].set_ylabel('Доля')\n",
"\n",
" # Контрольная выборка\n",
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
" axes[1].set_title('Контрольная выборка')\n",
" axes[1].set_xlabel(y_name)\n",
"\n",
" # Тестовая выборка\n",
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
" axes[2].set_title('Тестовая выборка')\n",
" axes[2].set_xlabel(y_name)\n",
"\n",
" plt.show()\n",
"\n",
"analyze_balance(y_train, y_val, y_test, 'Stroke')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выборки несбалансированны. Необходимо сбалансировать обучающую и контрольную выборки, чтобы получить лучшие результаты при обучении модели. Для балансировки применим RandomOverSampler:"
]
},
{
"cell_type": "code",
"execution_count": 351,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов в обучающей выборке:\n",
"Stroke\n",
"0 0.5\n",
"1 0.5\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в контрольной выборке:\n",
"Stroke\n",
"0 0.5\n",
"1 0.5\n",
"Name: proportion, dtype: float64\n",
"\n",
"Распределение классов в тестовой выборке:\n",
"Stroke\n",
"0 0.96014\n",
"1 0.03986\n",
"Name: proportion, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABp/ElEQVR4nO3deXxM9/7H8XcS2SO2RBJEYl9raSxF7SE0ii7Ucito0aILvdrSVlRvm6oWLVqlpQu9lJbeVquW0qqmFFXUUkvUnohdkJD5/v7wyPyMmRCkJqd9PR+PeTwy3/mecz5nJjPfM+858x0PY4wRAAAAAAAAAAAW5enuAgAAAAAAAAAAuBkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAAAAAMDSCLoBAAAAAAAAAJZG0A0AAAAAAAAAsDSCbgAAAAAAAACApRF0AwAAAAAAAAAsjaAbAAAAAG6AzWZTenq6du/e7e5SAAAA/vEIugEAAAAgjw4fPqwnn3xSUVFR8vHxUWhoqKpXr65Tp065uzQAAIB/tELuLgAAACC/ffDBB+rTp4/9uq+vr8qWLau2bdvqhRdeUFhYmBurA2BVO3fuVMuWLXXhwgU9/vjjuv3221WoUCH5+/srMDDQ3eUBAAD8oxF0AwCAv63Ro0erXLlyOn/+vH788Ue98847+vrrr7V582YFBAS4uzwAFjNgwAD5+Pjo559/VunSpd1dDgAAAC5D0A0AAP622rdvr3r16kmSHn74YZUoUULjxo3TF198oe7du7u5OgBWsm7dOn333XdavHgxITcAAEABxBzdAADgH6NVq1aSpJSUFEnSsWPH9O9//1u33XabgoKCFBwcrPbt2+u3335zWvb8+fMaNWqUKleuLD8/P0VEROjee+/Vrl27JEl79uyRh4dHrpcWLVrY17VixQp5eHhozpw5GjFihMLDwxUYGKiOHTtq3759TttevXq12rVrpyJFiiggIEDNmzfXqlWrXO5jixYtXG5/1KhRTn1nzpypmJgY+fv7q3jx4urWrZvL7V9t3y5ns9k0YcIE1ahRQ35+fgoLC9OAAQN0/Phxh37R0dHq0KGD03YGDx7stE5XtY8dO9bpPpWkzMxMJSYmqmLFivL19VVkZKSefvppZWZmuryvLnfl/RYSEqL4+Hht3rw5T8vWrFlT69atU+PGjeXv769y5cppypQpDv2ysrI0cuRIxcTEqEiRIgoMDFTTpk21fPlyh37bt29Xq1atFB4ebt+PRx55RMeOHXPadu/eva/5ePfu3VvR0dEOy+3bt0/+/v7y8PDQnj17JP3/4/zBBx849B01apTLx2Xw4MFO9XTo0MFhWznrfP3113O595zXP2PGDHl4eGj69OkO/V555RV5eHjo66+/znVd0qX/r5z7wdPTU+Hh4XrggQe0d+/em6rr559/lp+fn3bt2qUaNWrI19dX4eHhGjBggMvHZu7cufbnV0hIiP71r3/pwIEDDn169+6toKAg7d69W3FxcQoMDFSpUqU0evRoGWOc6r38sTl9+rRiYmJUrlw5HTp0yN7++uuvq3HjxipRooT8/f0VExOjefPmOWz3Zu9jAACAgogzugEAwD9GTihdokQJSdLu3bu1YMECdenSReXKlVNqaqreffddNW/eXFu2bFGpUqUkSdnZ2erQoYOWLVumbt266YknntDp06e1ZMkSbd68WRUqVLBvo3v37rrrrrsctjt8+HCX9bz88svy8PDQM888o7S0NE2YMEGxsbHasGGD/P39JUnfffed2rdvr5iYGCUmJsrT01MzZsxQq1attHLlSjVo0MBpvWXKlFFSUpIk6cyZM3r00UddbvuFF15Q165d9fDDD+vIkSOaOHGimjVrpl9//VVFixZ1WqZ///5q2rSpJOnzzz/X/PnzHW4fMGCAfX70xx9/XCkpKZo0aZJ+/fVXrVq1St7e3i7vh+tx4sQJ+75dzmazqWPHjv
rxxx/Vv39/VatWTZs2bdL48eP1xx9/aMGCBddcd9WqVfXcc8/JGKNdu3Zp3LhxuuuuuxwC0twcP35cd911l7p27aru3bvr008/1aOPPiofHx/17dtXknTq1Cm999576t69u/r166fTp0/r/fffV1xcnNasWaM6depIkjIyMlSmTBndfffdCg4O1ubNmzV58mQdOHBAX375pdO2Q0JCNH78ePv1Bx988Jr1jhw5UufPn79mP3fo06ePPv/8cw0dOlRt2rRRZGSkNm3apBdffFEPPfSQ0/PLlaZNm6p///6y2WzavHmzJkyYoIMHD2rlypU3XNfRo0d1/vx5Pfroo2rVqpUeeeQR7dq1S5MnT9bq1au1evVq+fr6Svr/3wmoX7++kpKSlJqaqjfffFOrVq1yen5lZ2erXbt2uuOOO/Taa69p0aJFSkxM1MWLFzV69GiXtVy4cEH33Xef9u7dq1WrVikiIsJ+25tvvqmOHTuqZ8+eysrK0uzZs9WlSxd99dVXio+Pz7f7GAAAoMAxAAAAfzMzZswwkszSpUvNkSNHzL59+8zs2bNNiRIljL+/v9m/f78xxpjz58+b7Oxsh2VTUlKMr6+vGT16tL1t+vTpRpIZN26c07ZsNpt9OUlm7NixTn1q1Khhmjdvbr++fPlyI8mULl3anDp1yt7+6aefGknmzTfftK+7UqVKJi4uzr4dY4w5e/asKVeunGnTpo3Ttho3bmxq1qxpv37kyBEjySQmJtrb9uzZY7y8vMzLL7/ssOymTZtMoUKFnNp37NhhJJkPP/zQ3paYmGguP5RcuXKlkWRmzZrlsOyiRYuc2qOiokx8fLxT7YMGDTJXHp5eWfvTTz9tSpYsaWJiYhzu048//th4enqalStXOiw/ZcoUI8msWrXKaXuXa968ucP6jDFmxIgRRpJJS0u75rKSzBtvvGFvy8zMNHXq1DElS5Y0WVlZxhhjLl68aDIzMx2WPX78uAkLCzN9+/a96jYGDhxogoKCnNp79uxpypUr59B25X2WkJBgoqKi7Nc3b95sPD09Tfv27Y0kk5KSYowx5s8//zSSzPTp0x3Wd+VjnbONQYMGOdUTHx/vsK2rPS+utv5Dhw6Z4sWLmzZt2pjMzExTt25dU7ZsWXPy5Mlc15MjKirKJCQkOLT16NHDBAQE3FRdOddbt25tLl68aG/Peb2ZOHGiMcaYrKwsU7JkSVOzZk1z7tw5e7+vvvrKSDIjR460tyUkJBhJ5rHHHrO32Ww2Ex8fb3x8fMyRI0cc6p0xY4ax2WymZ8+eJiAgwKxevdqp7rNnzzpcz8rKMjVr1jStWrVyaL+Z+xgAAKAgYuoSAADwtxUbG6vQ0FBFRkaqW7duCgoK0vz58+3z6/r6+srT89LhUHZ2to4ePaqgoCBVqVJF69evt6/ns88+U0hIiB577DGnbVw5pcP16NWrlwoXLmy/fv/99ysiIsI+bcCGDRu0Y8cO9ejRQ0ePHlV6errS09OVkZGh1q1b64cffpDNZnNY5/nz5+Xn53fV7X7++eey2Wzq2rWrfZ3p6ekKDw9XpUqVnKbSyMrKkiT72aquzJ07V0WKFFGbNm0c1hkTE6OgoCCndV64cMGhX3p6+jXPMD5w4IAmTpyoF154QUFBQU7br1atmqpWreqwzpzpaq7cvis5NR05ckTJycmaP3++atWqpZCQkGsuW6hQIQ0YMMB+3cfHRwMGDFBaWprWrVsnSfLy8pKPj4+kS2egHzt2TBcvXlS9evUc/t9ynDx5UqmpqVq2bJkWLlyoZs2aOfXJysq66uPiyvDhw3X77berS5cuDu2hoaGSpP379+dpPefPn3d6DC9cuOCy79mzZ5Wenq7jx487TMmRm/DwcE2ePFlLlixR06ZNtWHDBk2fPl3BwcF5qi0zM1Pp6elKS0vTkiVL9N1336l169Y3XZckDR06VF5eXvbrDz74oMLCwrRw4UJJ0tq1a5WWlqaBAwc6PBfj4+NVtW
pVe7/LXT4NTM60MFlZWVq6dKlT32HDhmnWrFn69NNPXX6jI+fbINKlbxqcPHlSTZs2dfofu9n7GAAAoKBh6hIAAPC
"text/plain": [
"<Figure size 1800x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"ros = RandomOverSampler(random_state=42)\n",
"\n",
"# Применение RandomOverSampler для балансировки выборок\n",
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
"X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
"\n",
"# Проверка сбалансированности после RandomOverSampler\n",
"analyze_balance(y_train_resampled, y_val_resampled, y_test, 'Stroke')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Перейдем к конструированию признаков.\n",
"Применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные вектора:"
]
},
{
"cell_type": "code",
"execution_count": 352,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
"0 26.50 5.0 0.0 7.0 False \n",
"1 33.91 0.0 0.0 7.0 False \n",
"2 42.57 4.0 5.0 6.0 False \n",
"3 32.08 0.0 0.0 6.0 False \n",
"4 15.78 1.0 3.0 6.0 False \n",
"\n",
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
"0 False False False True \n",
"1 False False False True \n",
"2 False False False True \n",
"3 False False False True \n",
"4 False False False True \n",
"\n",
" AgeCategory_25-29 ... Diabetic_Yes Diabetic_Yes (during pregnancy) \\\n",
"0 False ... False False \n",
"1 False ... False False \n",
"2 False ... False False \n",
"3 False ... False False \n",
"4 False ... False False \n",
"\n",
" PhysicalActivity_Yes GenHealth_Fair GenHealth_Good GenHealth_Poor \\\n",
"0 True False False False \n",
"1 True False False False \n",
"2 True False True False \n",
"3 True False False False \n",
"4 True False True False \n",
"\n",
" GenHealth_Very good Asthma_Yes KidneyDisease_Yes SkinCancer_Yes \n",
"0 True False False False \n",
"1 True False False False \n",
"2 False False False False \n",
"3 True False False False \n",
"4 False False False False \n",
"\n",
"[5 rows x 37 columns]\n"
]
}
],
"source": [
"# Определение категориальных признаков\n",
"categorical_features = ['HeartDisease', 'Smoking', 'AlcoholDrinking',\n",
" 'DiffWalking', 'Sex', 'AgeCategory',\n",
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth',\n",
" 'Asthma', 'KidneyDisease', 'SkinCancer']\n",
"\n",
"# Применение one-hot encoding к обучающей выборке\n",
"X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
"\n",
"# Применение one-hot encoding к контрольной выборке\n",
"X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
"\n",
"# Применение one-hot encoding к тестовой выборке\n",
"X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
"\n",
"print(X_train_encoded.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Далее применим дискретизацию к числовым признакам "
]
},
{
"cell_type": "code",
"execution_count": 353,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
"0 26.50 5.0 0.0 7.0 False \n",
"1 33.91 0.0 0.0 7.0 False \n",
"2 42.57 4.0 5.0 6.0 False \n",
"3 32.08 0.0 0.0 6.0 False \n",
"4 15.78 1.0 3.0 6.0 False \n",
"\n",
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
"0 False False False True \n",
"1 False False False True \n",
"2 False False False True \n",
"3 False False False True \n",
"4 False False False True \n",
"\n",
" AgeCategory_25-29 ... Diabetic_Yes (during pregnancy) \\\n",
"0 False ... False \n",
"1 False ... False \n",
"2 False ... False \n",
"3 False ... False \n",
"4 False ... False \n",
"\n",
" PhysicalActivity_Yes GenHealth_Fair GenHealth_Good GenHealth_Poor \\\n",
"0 True False False False \n",
"1 True False False False \n",
"2 True False True False \n",
"3 True False False False \n",
"4 True False True False \n",
"\n",
" GenHealth_Very good Asthma_Yes KidneyDisease_Yes SkinCancer_Yes \\\n",
"0 True False False False \n",
"1 True False False False \n",
"2 False False False False \n",
"3 True False False False \n",
"4 False False False False \n",
"\n",
" BMI_binned \n",
"0 Overweight \n",
"1 Obese \n",
"2 Severely Obese \n",
"3 Obese \n",
"4 Underweight \n",
"\n",
"[5 rows x 38 columns]\n"
]
}
],
"source": [
"bmi_bins = [0, 18.5, 25, 30, 40, 60]\n",
"bmi_labels = [\"Underweight\", \"Normal\", \"Overweight\", \"Obese\", \"Severely Obese\"]\n",
"\n",
"\n",
"X_train_encoded['BMI_binned'] = pd.cut(X_train_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
"X_val_encoded['BMI_binned'] = pd.cut(X_val_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
"\n",
"X_test_encoded['BMI_binned'] = pd.cut(X_test_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
"\n",
"print(X_train_encoded.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Применим ручной синтез признаков. К примеру, можно создать фактор риска для сердечных заболеваний: комбинированный признак на основе факторов риска, таких как курение, диабет, употребление алкоголя и наличие болезней."
]
},
{
"cell_type": "code",
"execution_count": 354,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
"0 26.50 5.0 0.0 7.0 False \n",
"1 33.91 0.0 0.0 7.0 False \n",
"2 42.57 4.0 5.0 6.0 False \n",
"3 32.08 0.0 0.0 6.0 False \n",
"4 15.78 1.0 3.0 6.0 False \n",
"\n",
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
"0 False False False True \n",
"1 False False False True \n",
"2 False False False True \n",
"3 False False False True \n",
"4 False False False True \n",
"\n",
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
"0 False ... True False \n",
"1 False ... True False \n",
"2 False ... True False \n",
"3 False ... True False \n",
"4 False ... True False \n",
"\n",
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
"0 False False True False \n",
"1 False False True False \n",
"2 True False False False \n",
"3 False False True False \n",
"4 True False False False \n",
"\n",
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
"0 False False Overweight 0 \n",
"1 False False Obese 0 \n",
"2 False False Severely Obese 0 \n",
"3 False False Obese 0 \n",
"4 False False Underweight 0 \n",
"\n",
"[5 rows x 39 columns]\n"
]
}
],
"source": [
"X_train_encoded['RiskFactor'] = ((X_train_encoded['Smoking_Yes'] == True) | \n",
" (X_train_encoded['Diabetic_Yes'] == True) | \n",
" (X_train_encoded['AlcoholDrinking_Yes'] == True) | \n",
" (X_train_encoded['KidneyDisease_Yes'] == True) | \n",
" (X_train_encoded['SkinCancer_Yes'] == True)).astype(int)\n",
"\n",
"print(X_train_encoded.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Используем масштабирование признаков"
]
},
{
"cell_type": "code",
"execution_count": 355,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
"0 26.50 -0.099915 -0.540452 -0.070949 False \n",
"1 33.91 -0.581538 -0.540452 -0.070949 False \n",
"2 42.57 -0.196239 0.006442 -0.646839 False \n",
"3 32.08 -0.581538 -0.540452 -0.646839 False \n",
"4 15.78 -0.485213 -0.212315 -0.646839 False \n",
"\n",
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
"0 False False False True \n",
"1 False False False True \n",
"2 False False False True \n",
"3 False False False True \n",
"4 False False False True \n",
"\n",
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
"0 False ... True False \n",
"1 False ... True False \n",
"2 False ... True False \n",
"3 False ... True False \n",
"4 False ... True False \n",
"\n",
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
"0 False False True False \n",
"1 False False True False \n",
"2 True False False False \n",
"3 False False True False \n",
"4 True False False False \n",
"\n",
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
"0 False False Overweight 0 \n",
"1 False False Obese 0 \n",
"2 False False Severely Obese 0 \n",
"3 False False Obese 0 \n",
"4 False False Underweight 0 \n",
"\n",
"[5 rows x 39 columns]\n"
]
}
],
"source": [
"numerical_features = ['PhysicalHealth', 'MentalHealth', 'SleepTime']\n",
"\n",
"# Learn mean/std from the training split only, then apply the same transform to every split.\n",
"scaler = StandardScaler()\n",
"scaler.fit(X_train_encoded[numerical_features])\n",
"\n",
"for split_frame in (X_train_encoded, X_val_encoded, X_test_encoded):\n",
"    split_frame[numerical_features] = scaler.transform(split_frame[numerical_features])\n",
"\n",
"print(X_train_encoded.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"И также попробуем сконструировать признаки, используя фреймворк Featuretools:"
]
},
{
"cell_type": "code",
"execution_count": 356,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\User\\Desktop\\aim\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
"index \n",
"0 26.50 -0.099915 -0.540452 -0.070949 False \n",
"1 33.91 -0.581538 -0.540452 -0.070949 False \n",
"2 42.57 -0.196239 0.006442 -0.646839 False \n",
"3 32.08 -0.581538 -0.540452 -0.646839 False \n",
"4 15.78 -0.485213 -0.212315 -0.646839 False \n",
"\n",
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
"index \n",
"0 False False False True \n",
"1 False False False True \n",
"2 False False False True \n",
"3 False False False True \n",
"4 False False False True \n",
"\n",
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
"index ... \n",
"0 False ... True False \n",
"1 False ... True False \n",
"2 False ... True False \n",
"3 False ... True False \n",
"4 False ... True False \n",
"\n",
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
"index \n",
"0 False False True False \n",
"1 False False True False \n",
"2 True False False False \n",
"3 False False True False \n",
"4 True False False False \n",
"\n",
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
"index \n",
"0 False False Overweight 0 \n",
"1 False False Obese 0 \n",
"2 False False Severely Obese 0 \n",
"3 False False Obese 0 \n",
"4 False False Underweight 0 \n",
"\n",
"[5 rows x 39 columns]\n"
]
}
],
"source": [
"# Work on a copy so that registering the frame in Featuretools leaves X_train_encoded untouched.\n",
"data = X_train_encoded.copy()\n",
"\n",
"# Single-table EntitySet; make_index=True lets Featuretools create the \"index\" key column.\n",
"es = ft.EntitySet(id=\"patients\").add_dataframe(\n",
"    dataframe_name=\"strokes_data\",\n",
"    dataframe=data,\n",
"    index=\"index\",\n",
"    make_index=True,\n",
")\n",
"\n",
"# Deep Feature Synthesis over the single dataframe, limited to depth 1.\n",
"dfs_options = dict(entityset=es, target_dataframe_name=\"strokes_data\", max_depth=1)\n",
"feature_matrix, feature_defs = ft.dfs(**dfs_options)\n",
"\n",
"print(feature_matrix.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Оценка качества набора признаков\n",
"Представим основные оценки качества наборов признаков:\n",
"\n",
"- Предсказательная способность (для задачи классификации)\n",
"\n",
" Метрики: Accuracy, Precision, Recall, F1-Score, ROC AUC.\n",
"\n",
" Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"\n",
"- Скорость вычисления\n",
"\n",
" Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"\n",
"- Надежность\n",
"\n",
" Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"\n",
"- Корреляция\n",
"\n",
" Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
"\n",
"- Цельность\n",
"\n",
" Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 36.52 секунд\n"
]
}
],
"source": [
"# One-hot encode the categorical columns that are still left (e.g. BMI_binned) in each split.\n",
"X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
"X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
"X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
"\n",
"# The training columns define the feature space; val/test are aligned to them below.\n",
"all_columns = X_train_encoded.columns\n",
"\n",
"# reindex(fill_value=0) hides features that were never built for a split, so report them first.\n",
"for split_name, split_frame in (('val', X_val_encoded), ('test', X_test_encoded)):\n",
"    missing_columns = all_columns.difference(split_frame.columns)\n",
"    if len(missing_columns) > 0:\n",
"        print(f'Внимание: в выборке {split_name} отсутствуют признаки (заполняются нулями): {list(missing_columns)}')\n",
"\n",
"X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
"X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
"\n",
"# Model choice\n",
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"# perf_counter is a monotonic clock, so the measured duration is immune to system clock changes.\n",
"start_time = time.perf_counter()\n",
"model.fit(X_train_encoded, y_train_resampled)\n",
"\n",
"# Training time\n",
"train_time = time.perf_counter() - start_time\n",
"\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')\n"
]
{
"cell_type": "code",
"execution_count": 358,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature Importance:\n",
" feature importance\n",
"0 BMI 0.194905\n",
"3 SleepTime 0.086714\n",
"1 PhysicalHealth 0.074056\n",
"4 HeartDisease_Yes 0.065638\n",
"7 DiffWalking_Yes 0.057683\n",
"2 MentalHealth 0.057339\n",
"8 Sex_Male 0.028562\n",
"20 AgeCategory_80 or older 0.025721\n",
"29 PhysicalActivity_Yes 0.023747\n",
"27 Diabetic_Yes 0.023346\n",
"30 GenHealth_Fair 0.022295\n",
"34 Asthma_Yes 0.019722\n",
"19 AgeCategory_75-79 0.017912\n",
"5 Smoking_Yes 0.017702\n",
"37 RiskFactor 0.017532\n",
"31 GenHealth_Good 0.016946\n",
"18 AgeCategory_70-74 0.015593\n",
"33 GenHealth_Very good 0.015544\n",
"25 Race_White 0.014721\n",
"39 BMI_binned_Overweight 0.014350\n",
"17 AgeCategory_65-69 0.014142\n",
"36 SkinCancer_Yes 0.014002\n",
"32 GenHealth_Poor 0.013788\n",
"40 BMI_binned_Obese 0.012988\n",
"38 BMI_binned_Normal 0.012010\n",
"16 AgeCategory_60-64 0.011894\n",
"35 KidneyDisease_Yes 0.011588\n",
"15 AgeCategory_55-59 0.010550\n",
"22 Race_Black 0.009165\n",
"23 Race_Hispanic 0.008975\n",
"6 AlcoholDrinking_Yes 0.008943\n",
"14 AgeCategory_50-54 0.008495\n",
"13 AgeCategory_45-49 0.006740\n",
"11 AgeCategory_35-39 0.006491\n",
"26 Diabetic_No, borderline diabetes 0.006442\n",
"12 AgeCategory_40-44 0.006333\n",
"9 AgeCategory_25-29 0.006128\n",
"24 Race_Other 0.005832\n",
"10 AgeCategory_30-34 0.005631\n",
"41 BMI_binned_Severely Obese 0.004984\n",
"21 Race_Asian 0.002756\n",
"28 Diabetic_Yes (during pregnancy) 0.002092\n"
]
}
],
"source": [
"# Pair every training column with the importance the fitted forest assigned to it.\n",
"feature_names = X_train_encoded.columns\n",
"importances = model.feature_importances_\n",
"\n",
"# Most important features first.\n",
"feature_importance = (\n",
"    pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
"    .sort_values(by='importance', ascending=False)\n",
")\n",
"\n",
"print(\"Feature Importance:\")\n",
"print(feature_importance)"
]
{
"cell_type": "code",
"execution_count": 359,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9562618763533519\n",
"Precision: 0.10204081632653061\n",
"Recall: 0.012472283813747228\n",
"F1 Score: 0.02222771054581378\n",
"ROC AUC: 0.5039578706315019\n",
"Cross-validated Accuracy: 0.9940253495420259\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABB0AAANXCAYAAAB5YScaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxN1/7/8ddJIoOMhpAgxJAQGkPQXvMsUWNRQ01BFDXU1JKaxxhbVI0lUVVjFTUPlSraGm4TWmqIRPRKaQ2JUEFyfn/4OV+nSSQpEer9fDz2456z19prfdbOvR53f85aaxuMRqMREREREREREZGnzCKnAxARERERERGRfyclHUREREREREQkWyjpICIiIiIiIiLZQkkHEREREREREckWSjqIiIiIiIiISLZQ0kFEREREREREsoWSDiIiIiIiIiKSLZR0EBEREREREZFsoaSDiIiIiIiIiGQLJR1EREREREREJFso6SAiIiIvjbCwMAwGQ5rHiBEjsqXPQ4cOMW7cOG7cuJEt7T+Jh/fj6NGjOR3KPzZ//nzCwsJyOgwREUmHVU4HICIiIvKsTZgwgeLFi5ude+WVV7Klr0OHDjF+/HgCAwNxcXHJlj5eZvPnzyd//vwEBgbmdCgiIpIGJR1ERETkpdOkSROqVKmS02E8kVu3bmFvb5/TYeSY27dvkzt37pwOQ0REMqDlFSIiIiJ/s337dmrVqoW9vT2Ojo40bdqUX375xazO8ePHCQwMpESJEtja2uLm5kaPHj24evWqqc64ceN47733AChevLhpKUdMTAwxMTEYDIY0lwYYDAbGjRtn1o7BYODkyZO89dZb5MmTh5o1a5rKP//8cypXroydnR158+alQ4cOXLx48R+NPTAwEAcHB2JjY2nWrBkODg4ULlyYTz75BIATJ05Qv3597O3tKVasGF988YXZ9Q+XbOzfv5/evXuTL18+nJyc6Nq1K9evX0/V3/z58ylXrhw2NjYUKlSIfv36pVqKUrduXV555RWOHTtG7dq1yZ07Nx988AGenp788ssvfPvtt6Z7W7duXQCuXbvGsGHD8PX1xcHBAScnJ5o0aUJkZKRZ2+Hh4RgMBtauXcvkyZMpUqQItra2NGjQgHPnzqWK98cff+T1118nT5482NvbU758eebMmWNW59dff6Vt27bkzZsXW1tbqlSpwubNm7P6pxAR+VfQTAcRERF56cTHx/Pnn3+ancufPz8AK1asoFu3bvj7+zNt2jRu377NggULqFmzJj/99BOenp4A7N69m/Pnz9O9e3fc3Nz45ZdfWLx4Mb/88gs//PADBoOB1q1bc+bMGVatWsVHH31k6sPV1ZU//vgjy3G/+eabeHl5MWXKFIxGIwCTJ09m9OjRtGvXjqCgIP744w8+/vhjateuzU8//fSPlnQkJyfTpEkTateuzfTp01m5ciX9+/fH3t6ekSNH0qlTJ1q3bs3ChQvp2rUr1apVS7VcpX///ri4uDBu3DhOnz7NggULuHDhgukhHx4kU8aPH0/Dhg3p27evqd6RI0c4ePAguXLlMrV39epVmjRpQocOHejcuTMFCxakbt26DBgwAAcHB0aOHAlAwYIFATh//jwbN27kzTffpHjx4ly+fJlFixZRp04dTp48SaFChczinTp1KhYWFgwbNoz4+HimT59Op06d+PHHH011du/eTbNmzXB3d+fdd9/Fzc2NU6dOsWXLFt59910AfvnlF2rUqEHhwoUZMWIE9vb2rF27llatWvHll1/yxhtvZPnvISLyQjOKiIiIvCRCQ0ONQJqH0Wg03rx50+ji4mLs1auX2XW///670dnZ2ez87du3U7W/atUqI2Dcv3+/6dyMGTOMgDE6OtqsbnR0tBEwhoaGpmoHMI4dO9b0fezYsUbA2LFjR7N6MTExRktLS+PkyZPNzp84ccJoZWWV6nx69+PIkSOmc926dTMCxilTppjOXb9+3WhnZ2c0GA
zG1atXm87/+uuvqWJ92GblypWNd+/eNZ2fPn26ETBu2rTJaDQajVeuXDFaW1sbGzdubExOTjbVmzdvnhEwLlu2zHSuTp06RsC4cOHCVGMoV66csU6dOqnO37lzx6xdo/HBPbexsTFOmDDBdG7fvn1GwOjj42NMSkoynZ8zZ44RMJ44ccJoNBqN9+/fNxYvXtxYrFgx4/Xr183aTUlJMX1u0KCB0dfX13jnzh2z8urVqxu9vLxSxSki8m+n5RUiIiLy0vnkk0/YvXu32QEPfsm+ceMGHTt25M8//zQdlpaWvPbaa+zbt8/Uhp2dnenznTt3+PPPP/nPf/4DwH//+99sibtPnz5m3zds2EBKSgrt2rUzi9fNzQ0vLy+zeLMqKCjI9NnFxYXSpUtjb29Pu3btTOdLly6Ni4sL58+fT3X922+/bTZToW/fvlhZWbFt2zYA9uzZw927dxk0aBAWFv/3f0l79eqFk5MTW7duNWvPxsaG7t27Zzp+GxsbU7vJyclcvXoVBwcHSpcunebfp3v37lhbW5u+16pVC8A0tp9++ono6GgGDRqUavbIw5kb165d45tvvqFdu3bcvHnT9Pe4evUq/v7+nD17lv/973+ZHoOIyL+BlleIiIjIS+fVV19NcyPJs2fPAlC/fv00r3NycjJ9vnbtGuPHj2f16tVcuXLFrF58fPxTjPb//H0Jw9mzZzEajXh5eaVZ/9GH/qywtbXF1dXV7JyzszNFihQxPWA/ej6tvRr+HpODgwPu7u7ExMQAcOHCBeBB4uJR1tbWlChRwlT+UOHChc2SAhlJSUlhzpw5zJ8/n+joaJKTk01l+fLlS1W/aNGiZt/z5MkDYBpbVFQU8Pi3nJw7dw6j0cjo0aMZPXp0mnWuXLlC4cKFMz0OEZEXnZIOIiIiIv9fSkoK8GBfBzc3t1TlVlb/93+d2rVrx6FDh3jvvfeoWLEiDg4OpKSkEBAQYGrncf7+8P7Qow/Hf/fo7IqH8RoMBrZv346lpWWq+g4ODhnGkZa02nrceeP/318iO/197BmZMmUKo0ePpkePHkycOJG8efNiYWHBoEGD0vz7PI2xPWx32LBh+Pv7p1mnVKlSmW5PROTfQEkHERERkf+vZMmSABQoUICGDRumW+/69evs3buX8ePHM2bMGNP5hzMlHpVecuHhL+l/f1PD33/hzyheo9FI8eLF8fb2zvR1z8LZs2epV6+e6XtiYiJxcXG8/vrrABQrVgyA06dPU6JECVO9u3fvEh0d/dj7/6j07u/69eupV68eS5cuNTt/48YN04aeWfHwvxs///xzurE9HEeuXLkyHb+IyL+d9nQQERER+f/8/f1xcnJiypQp3Lt3L1X5wzdOPPxV/O+/gs+ePTvVNfb29kDq5IKTkxP58+dn//79Zufnz5+f6Xhbt26NpaUl48ePTxWL0Wg0e33ns7Z48WKze7hgwQLu379PkyZNAGjYsCHW1tbMnTvXLPalS5cSHx9P06ZNM9WPvb19qnsLD/5Gf78n69at+8d7Kvj5+VG8eHFmz56dqr+H/RQoUIC6deuyaNEi4uLiUrXxT95YIiLyotNMBxEREZH/z8nJiQULFtClSxf8/Pzo0KEDrq6uxMbGsnXrVmrUqMG8efNwcnIyvU7y3r17FC5cmF27dhEdHZ2qzcqVKwMwcuRIOnToQK5cuWjevDn29vYEBQUxdepUgoKCqFKlCvv37+fMmTOZjrdkyZJMmjSJ4OBgYmJiaNWqFY6OjkRHR/PVV1/x9ttvM2zYsKd2f7Li7t27NGjQgHbt2nH69Gnmz59PzZo1adGiBfDgtaHBwcGMHz+egIAAWrRoYapXtWpVOnfunKl+KleuzIIFC5g0aRKlSpWiQIEC1K9fn2bNmjFhwgS6d+9O9erVOXHiBCtXrjSbVZEVFhYWLFiwgObNm1OxYkW6d++Ou7s7v/76K7/88gs7d+4EHmxSWrNmTXx9fenVqxclSpTg8uXLfP/99/
z2229ERkb+o/5FRF5USjqIiIiIPOKtt96iUKFCTJ06lRkzZpCUlEThwoWpVauW2dsTvvjiCwYMGMAnn3yC0WikceP
"text/plain": [
"<Figure size 1000x1000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train Accuracy: 0.9997146540973558\n",
"Train Precision: 0.9994296336980861\n",
"Train Recall: 1.0\n",
"Train F1 Score: 0.9997147354964131\n",
"Train ROC AUC: 0.9997146540973557\n"
]
}
],
"source": [
"def report_metrics(estimator, features, target, prefix=''):\n",
"    \"\"\"Print and return binary-classification metrics of `estimator` on one data split.\n",
"\n",
"    Accuracy, precision, recall and F1 are computed from the hard predictions.\n",
"    ROC AUC is computed from the predicted probability of the positive class:\n",
"    feeding it 0/1 labels collapses the ROC curve to a single threshold and\n",
"    misreports the ranking quality of the model.\n",
"\n",
"    Parameters: `estimator` is a fitted classifier with predict/predict_proba,\n",
"    `features` and `target` are the split to score, `prefix` is prepended to\n",
"    every printed metric name.\n",
"    Returns: dict mapping metric name to value.\n",
"    \"\"\"\n",
"    predicted = estimator.predict(features)\n",
"    # classes_ is sorted, so column 1 holds the probability of the greater (positive) label.\n",
"    positive_proba = estimator.predict_proba(features)[:, 1]\n",
"\n",
"    metrics = {\n",
"        'Accuracy': accuracy_score(target, predicted),\n",
"        'Precision': precision_score(target, predicted),\n",
"        'Recall': recall_score(target, predicted),\n",
"        'F1 Score': f1_score(target, predicted),\n",
"        'ROC AUC': roc_auc_score(target, positive_proba),\n",
"    }\n",
"    for metric_name, metric_value in metrics.items():\n",
"        print(f\"{prefix}{metric_name}: {metric_value}\")\n",
"    return metrics\n",
"\n",
"\n",
"# Prediction and evaluation on the held-out test split\n",
"test_metrics = report_metrics(model, X_test_encoded, y_test)\n",
"\n",
"# Cross-validation\n",
"# NOTE(review): the folds are cut from the resampled training data. If the resampling was an\n",
"# oversampling step done before the split into folds, copies of the same minority rows can land\n",
"# in both the train and validation folds and this score will be optimistic - confirm.\n",
"scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='accuracy')\n",
"accuracy_cv = scores.mean()\n",
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
"\n",
"# Feature importance analysis\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train_encoded.columns\n",
"\n",
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
"\n",
"plt.figure(figsize=(10, 10))\n",
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
"plt.title('Feature Importance')\n",
"plt.show()\n",
"\n",
"# Overfitting check: the same metrics on the data the model was fitted on\n",
"train_metrics = report_metrics(model, X_train_encoded, y_train_resampled, prefix='Train ')"
]
],
"metadata": {
"kernelspec": {
"display_name": "aimvenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}