1265 lines
272 KiB
Plaintext
1265 lines
272 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Лабораторная 3\n",
|
|||
|
"Датасет: Набор данных для анализа и прогнозирования сердечного приступа"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 345,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
|
|||
|
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
|
|||
|
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
|
|||
|
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 319795 entries, 0 to 319794\n",
|
|||
|
"Data columns (total 18 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 HeartDisease 319795 non-null object \n",
|
|||
|
" 1 BMI 319795 non-null float64\n",
|
|||
|
" 2 Smoking 319795 non-null object \n",
|
|||
|
" 3 AlcoholDrinking 319795 non-null object \n",
|
|||
|
" 4 Stroke 319795 non-null object \n",
|
|||
|
" 5 PhysicalHealth 319795 non-null float64\n",
|
|||
|
" 6 MentalHealth 319795 non-null float64\n",
|
|||
|
" 7 DiffWalking 319795 non-null object \n",
|
|||
|
" 8 Sex 319795 non-null object \n",
|
|||
|
" 9 AgeCategory 319795 non-null object \n",
|
|||
|
" 10 Race 319795 non-null object \n",
|
|||
|
" 11 Diabetic 319795 non-null object \n",
|
|||
|
" 12 PhysicalActivity 319795 non-null object \n",
|
|||
|
" 13 GenHealth 319795 non-null object \n",
|
|||
|
" 14 SleepTime 319795 non-null float64\n",
|
|||
|
" 15 Asthma 319795 non-null object \n",
|
|||
|
" 16 KidneyDisease 319795 non-null object \n",
|
|||
|
" 17 SkinCancer 319795 non-null object \n",
|
|||
|
"dtypes: float64(4), object(14)\n",
|
|||
|
"memory usage: 43.9+ MB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>HeartDisease</th>\n",
|
|||
|
" <th>BMI</th>\n",
|
|||
|
" <th>Smoking</th>\n",
|
|||
|
" <th>AlcoholDrinking</th>\n",
|
|||
|
" <th>Stroke</th>\n",
|
|||
|
" <th>PhysicalHealth</th>\n",
|
|||
|
" <th>MentalHealth</th>\n",
|
|||
|
" <th>DiffWalking</th>\n",
|
|||
|
" <th>Sex</th>\n",
|
|||
|
" <th>AgeCategory</th>\n",
|
|||
|
" <th>Race</th>\n",
|
|||
|
" <th>Diabetic</th>\n",
|
|||
|
" <th>PhysicalActivity</th>\n",
|
|||
|
" <th>GenHealth</th>\n",
|
|||
|
" <th>SleepTime</th>\n",
|
|||
|
" <th>Asthma</th>\n",
|
|||
|
" <th>KidneyDisease</th>\n",
|
|||
|
" <th>SkinCancer</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>16.60</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>30.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>55-59</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>5.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>20.34</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>80 or older</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>7.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>26.58</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>20.0</td>\n",
|
|||
|
" <td>30.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>65-69</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Fair</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>24.21</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>75-79</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Good</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>23.71</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>28.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>40-44</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"0 No 16.60 Yes No No 3.0 \n",
|
|||
|
"1 No 20.34 No No Yes 0.0 \n",
|
|||
|
"2 No 26.58 Yes No No 20.0 \n",
|
|||
|
"3 No 24.21 No No No 0.0 \n",
|
|||
|
"4 No 23.71 No No No 28.0 \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"0 30.0 No Female 55-59 White Yes \n",
|
|||
|
"1 0.0 No Female 80 or older White No \n",
|
|||
|
"2 30.0 No Male 65-69 White Yes \n",
|
|||
|
"3 0.0 No Female 75-79 White No \n",
|
|||
|
"4 0.0 Yes Female 40-44 White No \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
|
|||
|
"0 Yes Very good 5.0 Yes No Yes \n",
|
|||
|
"1 Yes Very good 7.0 No No No \n",
|
|||
|
"2 Yes Fair 8.0 Yes No No \n",
|
|||
|
"3 No Good 6.0 No No Yes \n",
|
|||
|
"4 Yes Very good 8.0 No No No "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 345,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"import featuretools as ft\n",
|
|||
|
"import time\n",
|
|||
|
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.model_selection import cross_val_score\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\".//static//csv//heart_2020_cleaned.csv\")\n",
|
|||
|
"\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Бизнес-цели и цели технического проекта\n",
|
|||
|
"1. Улучшение профилактики сердечно-сосудистых заболеваний\n",
|
|||
|
"\n",
|
|||
|
" - Бизнес-цель: Повышение точности прогнозирования риска сердечно-сосудистых заболеваний среди пациентов для более раннего вмешательства и снижения частоты обострений. Определение основных факторов риска, чтобы медперсонал мог предоставлять более целенаправленные рекомендации по улучшению здоровья.\n",
|
|||
|
"\n",
|
|||
|
" - Цель технического проекта: Разработка классификационной модели для предсказания вероятности сердечно-сосудистых заболеваний на основе данных (возраст, индекс массы тела, физическая активность, курение и т. д.), что поможет выделить группы высокого риска. Интеграция этой модели в систему поддержки принятия решений для врачей, чтобы улучшить качество и своевременность рекомендаций.\n",
|
|||
|
"\n",
|
|||
|
"2. Снижение расходов на лечение сердечно-сосудистых заболеваний\n",
|
|||
|
"\n",
|
|||
|
" - Бизнес-цель: Оптимизация затрат на лечение сердечно-сосудистых заболеваний путем эффективного распределения ресурсов и проведения профилактических мер среди целевых групп.\n",
|
|||
|
"\n",
|
|||
|
" - Цель технического проекта: Создание системы оценки индивидуального риска сердечно-сосудистых заболеваний для пациентов, которая позволит медицинским учреждениям и страховым компаниям выделять целевые группы для проведения превентивных мероприятий, тем самым сокращая затраты на лечение."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пустые значения и дубликаты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 346,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"HeartDisease 0\n",
|
|||
|
"BMI 0\n",
|
|||
|
"Smoking 0\n",
|
|||
|
"AlcoholDrinking 0\n",
|
|||
|
"Stroke 0\n",
|
|||
|
"PhysicalHealth 0\n",
|
|||
|
"MentalHealth 0\n",
|
|||
|
"DiffWalking 0\n",
|
|||
|
"Sex 0\n",
|
|||
|
"AgeCategory 0\n",
|
|||
|
"Race 0\n",
|
|||
|
"Diabetic 0\n",
|
|||
|
"PhysicalActivity 0\n",
|
|||
|
"GenHealth 0\n",
|
|||
|
"SleepTime 0\n",
|
|||
|
"Asthma 0\n",
|
|||
|
"KidneyDisease 0\n",
|
|||
|
"SkinCancer 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 18078\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>BMI</th>\n",
|
|||
|
" <th>PhysicalHealth</th>\n",
|
|||
|
" <th>MentalHealth</th>\n",
|
|||
|
" <th>SleepTime</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" <td>319795.00000</td>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>28.325399</td>\n",
|
|||
|
" <td>3.37171</td>\n",
|
|||
|
" <td>3.898366</td>\n",
|
|||
|
" <td>7.097075</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>6.356100</td>\n",
|
|||
|
" <td>7.95085</td>\n",
|
|||
|
" <td>7.955235</td>\n",
|
|||
|
" <td>1.436007</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>12.020000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>24.030000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>6.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>27.340000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>7.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>31.420000</td>\n",
|
|||
|
" <td>2.00000</td>\n",
|
|||
|
" <td>3.000000</td>\n",
|
|||
|
" <td>8.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>94.850000</td>\n",
|
|||
|
" <td>30.00000</td>\n",
|
|||
|
" <td>30.000000</td>\n",
|
|||
|
" <td>24.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime\n",
|
|||
|
"count 319795.000000 319795.00000 319795.000000 319795.000000\n",
|
|||
|
"mean 28.325399 3.37171 3.898366 7.097075\n",
|
|||
|
"std 6.356100 7.95085 7.955235 1.436007\n",
|
|||
|
"min 12.020000 0.00000 0.000000 1.000000\n",
|
|||
|
"25% 24.030000 0.00000 0.000000 6.000000\n",
|
|||
|
"50% 27.340000 0.00000 0.000000 7.000000\n",
|
|||
|
"75% 31.420000 2.00000 3.000000 8.000000\n",
|
|||
|
"max 94.850000 30.00000 30.000000 24.000000"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 346,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"null_values = df.isnull().sum()\n",
|
|||
|
"print(\"Пустые значения по столбцам:\")\n",
|
|||
|
"print(null_values)\n",
|
|||
|
"\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"df.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Пустых значений нет, но есть дубликаты, удаляем их"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 347,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = df.drop_duplicates()\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Преобразуем строковые значения в столбце 'Stroke' в числовые значения. Это понадобится для расчёта качества набора признаков."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 348,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"map_stroke_to_int = {'No': 0, 'Yes': 1}\n",
|
|||
|
"\n",
|
|||
|
"df['Stroke'] = df['Stroke'].map(map_stroke_to_int).astype('int32')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Создание выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 349,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: (147840, 17)\n",
|
|||
|
"Размер контрольной выборки: (63361, 17)\n",
|
|||
|
"Размер тестовой выборки: (90516, 17)\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение данных на признаки (X) и целевую переменную (y)\n",
|
|||
|
"# В данном случае мы хотим предсказать 'stroke'\n",
|
|||
|
"X = df.drop(columns=['Stroke'])\n",
|
|||
|
"y = df['Stroke']\n",
|
|||
|
"\n",
|
|||
|
"# Разбиение данных на обучающую и тестовую выборки\n",
|
|||
|
"# Сначала разделим на обучающую и тестовую\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
|
|||
|
"\n",
|
|||
|
"# Затем разделим обучающую выборку на обучающую и контрольную\n",
|
|||
|
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка размеров выборок\n",
|
|||
|
"print(\"Размер обучающей выборки:\", X_train.shape)\n",
|
|||
|
"print(\"Размер контрольной выборки:\", X_val.shape)\n",
|
|||
|
"print(\"Размер тестовой выборки:\", X_test.shape)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценим сбалансированность выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 350,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в обучающей выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.960045\n",
|
|||
|
"1 0.039955\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в контрольной выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.95977\n",
|
|||
|
"1 0.04023\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в тестовой выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.96014\n",
|
|||
|
"1 0.03986\n",
|
|||
|
"Name: proportion, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABp3ElEQVR4nO3deXxM1//H8XcS2SO2RBJEYl9raSxF7SE0ii7U8q2gRYsu9KstbUV1SVVrKVqlpQstpaXfVquW0qqmFFXUUkvUnohdkJA5vz88Mj9jJsRSk9u+no/HPB6ZM+fe+7kzmTl33nPnjIcxxggAAAAAAAAAAIvydHcBAAAAAAAAAADcCIJuAAAAAAAAAIClEXQDAAAAAAAAACyNoBsAAAAAAAAAYGkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAAAAAMDSCLoBAAAAAAAAAJZG0A0AAAAA18Fmsyk9PV27du1ydykAAAD/egTdAAAAAJBHhw4d0pNPPqmoqCj5+PgoNDRUVatW1cmTJ91dGgAAwL9aAXcXAAAAcLN98MEH6tWrl/26r6+vSpcurdatW+uFF15QWFiYG6sDYFU7duxQ8+bNdf78eT3++OO6/fbbVaBAAfn7+yswMNDd5QEAAPyrEXQDAIB/rJEjR6pMmTI6d+6cfvrpJ73zzjv65ptvtGnTJgUEBLi7PAAW069fP/n4+OiXX35RyZIl3V0OAAAALkHQDQAA/rHatm2rOnXqSJIefvhhFStWTGPGjNGXX36prl27urk6AFaydu1aff/991q0aBEhNwAAQD7EHN0AAOBfo0WLFpKklJQUSdLRo0f13//+V7fddpuCgoIUHBystm3b6vfff3da9ty5cxoxYoQqVqwoPz8/RURE6N5779XOnTslSbt375aHh0eul2bNmtnXtXz5cnl4eGj27NkaNmyYwsPDFRgYqPbt22vv3r1O2161apXatGmjQoUKKSAgQE2bNtXKlStd7mOzZs1cbn/EiBFOfWfMmKGYmBj5+/uraNGi6tKli8vtX2nfLmWz2TRu3DhVq1ZNfn5+CgsLU79+/XTs2DGHftHR0WrXrp3TdgYOHOi0Tle1jx492uk+laTMzEwlJiaqfPny8vX1VWRkpJ5++mllZma6vK8udfn9FhISovj4eG3atClPy1avXl1r165Vw4YN5e/vrzJlymjy5MkO/bKysjR8+HDFxMSoUKFCCgwMVOPGjbVs2TKHftu2bVOLFi0UHh5u349HHnlER48eddp2z549r/p49+zZU9HR0Q7L7d27V/7+/vLw8NDu3bsl/f/j/MEHHzj0HTFihMvHZeDAgU71tGvXzmFbOet84403crn3nNc/ffp0eXh4aNq0aQ79Xn31VXl4eOibb77JdV3Sxf+vnPvB09NT4eHheuCBB7Rnz54bquuXX36Rn5+fdu7cqWrVqsnX11fh4eHq16+fy8dmzpw59udXSEiI/vOf/2j//v0OfXr27KmgoCDt2rVLcXFxCgwMVIkSJTRy5EgZY5zqvfSxOXXqlGJiYlSmTBkdPHjQ3v7GG2+oYcOGKlasmPz9/RUTE6O5c+c6bPdG72MAAID8iDO6AQDAv0ZOKF2sWDFJ0q5duzR//nx16tRJZcqUUWpqqt599101bdpUmzdvVokSJSRJ2dnZateunZYuXaouXbroiSee0KlTp7R48WJt2rRJ5cqVs2+ja9euuuuuuxy2O3ToUJf1vPLKK/Lw8NAzzzyjtLQ0jRs3TrGxsVq/fr38/f0lSd9//73atm2rmJgYJSYmytPTU9OnT1eLFi20YsUK1atXz2m9pUqVUlJSkiTp9OnTevTRR11u+4UXXlDnzp318MMP6/Dhw5owYYKaNGmi3377TYULF3Zapm/fvmrcuLEk6YsvvtC8efMcbu/Xr599fvTHH39cKSkpmjhxon777TetXLlS3t7eLu+Ha3H8+HH7vl3KZrOpffv2+u
mnn9S3b19VqVJFGzdu1NixY/Xnn39q/vz5V1135cqV9dxzz8kYo507d2rMmDG66667HALS3Bw7dkx33XWXOnfurK5du+qzzz7To48+Kh8fH/Xu3VuSdPLkSb333nvq2rWr+vTpo1OnTun9999XXFycVq9erVq1akmSMjIyVKpUKd19990KDg7Wpk2bNGnSJO3fv19fffWV07ZDQkI0duxY+/UHH3zwqvUOHz5c586du2o/d+jVq5e++OILDR48WK1atVJkZKQ2btyoF198UQ899JDT88uVxo0bq2/fvrLZbNq0aZPGjRunAwcOaMWKFddd15EjR3Tu3Dk9+uijatGihR555BHt3LlTkyZN0qpVq7Rq1Sr5+vpK+v/fCahbt66SkpKUmpqq8ePHa+XKlU7Pr+zsbLVp00Z33HGHXn/9dS1cuFCJiYm6cOGCRo4c6bKW8+fP67777tOePXu0cuVKRURE2G8bP3682rdvr+7duysrK0uzZs1Sp06d9PXXXys+Pv6m3ccAAAD5jgEAAPiHmT59upFklixZYg4fPmz27t1rZs2aZYoVK2b8/f3Nvn37jDHGnDt3zmRnZzssm5KSYnx9fc3IkSPtbdOmTTOSzJgxY5y2ZbPZ7MtJMqNHj3bqU61aNdO0aVP79WXLlhlJpmTJkubkyZP29s8++8xIMuPHj7evu0KFCiYuLs6+HWOMOXPmjClTpoxp1aqV07YaNmxoqlevbr9++PBhI8kkJiba23bv3m28vLzMK6+84rDsxo0bTYECBZzat2/fbiSZDz/80N6WmJhoLj2UXLFihZFkZs6c6bDswoULndqjoqJMfHy8U+0DBgwwlx+eXl77008/bYoXL25iYmIc7tOPP/7YeHp6mhUrVjgsP3nyZCPJrFy50ml7l2ratKnD+owxZtiwYUaSSUtLu+qyksybb75pb8vMzDS1atUyxYsXN1lZWcYYYy5cuGAyMzMdlj127JgJCwszvXv3vuI2+vfvb4KCgpzau3fvbsqUKePQdvl9lpCQYKKiouzXN23aZDw9PU3btm2NJJOSkmKMMeavv/4yksy0adMc1nf5Y52zjQEDBjjVEx8f77CtKz0vrrT+gwcPmqJFi5pWrVqZzMxMU7t2bVO6dGlz4sSJXNeTIyoqyiQkJDi0devWzQQEBNxQXTnXW7ZsaS5cuGBvz3m9mTBhgjHGmKysLFO8eHFTvXp1c/bsWXu/r7/+2kgyw4cPt7clJCQYSeaxxx6zt9lsNhMfH298fHzM4cOHHeqdPn26sdlspnv37iYgIMCsWrXKqe4zZ844XM/KyjLVq1c3LVq0cGi/kfsYAAAgP2LqEgAA8I8VGxur0NBQRUZGqkuXLgoKCtK8efPs8+v6+vrK0/Pi4VB2draOHDmioKAgVapUSevWrbOv5/PPP1dISIgee+wxp21cPqXDtejRo4cKFixov37//fcrIiLCPm3A+vXrtX37dnXr1k1HjhxRenq60tPTlZGRoZYtW+rHH3+UzWZzWOe5c+fk5+d3xe1+8cUXstls6ty5s32d6enpCg8PV4UKFZym0sjKypIk+9mqrsyZM0eFChVSq1atHNYZExOjoKAgp3WeP3/eoV96evpVzzDev3+/JkyYoBdeeEFBQUFO269SpYoqV67ssM6c6Wou374rOTUdPnxYycnJmjdvnmrUqKGQkJCrLlugQAH169fPft3Hx0f9+vVTWlqa1q5dK0ny8vKSj4+PpItnoB89elQXLlxQnTp1HP7fcpw4cUKpqalaunSpFixYoCZNmjj1ycrKuuLj4srQoUN1++23q1OnTg7toaGhkqR9+/blaT3nzp1zegzPnz/vsu+ZM2eUnp6uY8eOOUzJkZvw8HBNmjRJixcvVuPGjbV+/XpNmzZNwcHBeaotMzNT6enpSktL0+LFi/X999+rZcuWN1yXJA0ePFheXl726w8++KDCwsK0YMECSdKaNWuUlpam/v37OzwX4+PjVblyZX
u/S106DUzOtDBZWVlasmSJU98hQ4Zo5syZ+uyzz1x+oyPn2yDSxW8anDhxQo0bN3b6H7vR+xgAACC/YeoSAADwjzV
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1800x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Функция для анализа сбалансированности\n",
|
|||
|
"def analyze_balance(y_train, y_val, y_test, y_name):\n",
|
|||
|
" # Распределение классов\n",
|
|||
|
" print(\"Распределение классов в обучающей выборке:\")\n",
|
|||
|
" print(y_train.value_counts(normalize=True))\n",
|
|||
|
" \n",
|
|||
|
" print(\"\\nРаспределение классов в контрольной выборке:\")\n",
|
|||
|
" print(y_val.value_counts(normalize=True))\n",
|
|||
|
" \n",
|
|||
|
" print(\"\\nРаспределение классов в тестовой выборке:\")\n",
|
|||
|
" print(y_test.value_counts(normalize=True))\n",
|
|||
|
"\n",
|
|||
|
" # Создание фигуры и осей для трех столбчатых диаграмм\n",
|
|||
|
" fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)\n",
|
|||
|
" fig.suptitle('Распределение в различных выборках')\n",
|
|||
|
"\n",
|
|||
|
" # Обучающая выборка\n",
|
|||
|
" sns.barplot(x=y_train.value_counts().index, y=y_train.value_counts(normalize=True), ax=axes[0])\n",
|
|||
|
" axes[0].set_title('Обучающая выборка')\n",
|
|||
|
" axes[0].set_xlabel(y_name)\n",
|
|||
|
" axes[0].set_ylabel('Доля')\n",
|
|||
|
"\n",
|
|||
|
" # Контрольная выборка\n",
|
|||
|
" sns.barplot(x=y_val.value_counts().index, y=y_val.value_counts(normalize=True), ax=axes[1])\n",
|
|||
|
" axes[1].set_title('Контрольная выборка')\n",
|
|||
|
" axes[1].set_xlabel(y_name)\n",
|
|||
|
"\n",
|
|||
|
" # Тестовая выборка\n",
|
|||
|
" sns.barplot(x=y_test.value_counts().index, y=y_test.value_counts(normalize=True), ax=axes[2])\n",
|
|||
|
" axes[2].set_title('Тестовая выборка')\n",
|
|||
|
" axes[2].set_xlabel(y_name)\n",
|
|||
|
"\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"analyze_balance(y_train, y_val, y_test, 'Stroke')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выборки несбалансированы. Необходимо сбалансировать обучающую и контрольную выборки, чтобы получить лучшие результаты при обучении модели. Для балансировки применим RandomOverSampler:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 351,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение классов в обучающей выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.5\n",
|
|||
|
"1 0.5\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в контрольной выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.5\n",
|
|||
|
"1 0.5\n",
|
|||
|
"Name: proportion, dtype: float64\n",
|
|||
|
"\n",
|
|||
|
"Распределение классов в тестовой выборке:\n",
|
|||
|
"Stroke\n",
|
|||
|
"0 0.96014\n",
|
|||
|
"1 0.03986\n",
|
|||
|
"Name: proportion, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAHyCAYAAAAtJXgGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABp/ElEQVR4nO3deXxM9/7H8XcS2SO2RBJEYl9raSxF7SE0ii7Ucito0aILvdrSVlRvm6oWLVqlpQu9lJbeVquW0qqmFFXUUkvUnohdkJD5/v7wyPyMmRCkJqd9PR+PeTwy3/mecz5nJjPfM+858x0PY4wRAAAAAAAAAAAW5enuAgAAAAAAAAAAuBkE3QAAAAAAAAAASyPoBgAAAAAAAABYGkE3AAAAAAAAAMDSCLoBAAAAAAAAAJZG0A0AAAAAAAAAsDSCbgAAAAAAAACApRF0AwAAAAAAAAAsjaAbAAAAAG6AzWZTenq6du/e7e5SAAAA/vEIugEAAAAgjw4fPqwnn3xSUVFR8vHxUWhoqKpXr65Tp065uzQAAIB/tELuLgAAACC/ffDBB+rTp4/9uq+vr8qWLau2bdvqhRdeUFhYmBurA2BVO3fuVMuWLXXhwgU9/vjjuv3221WoUCH5+/srMDDQ3eUBAAD8oxF0AwCAv63Ro0erXLlyOn/+vH788Ue98847+vrrr7V582YFBAS4uzwAFjNgwAD5+Pjo559/VunSpd1dDgAAAC5D0A0AAP622rdvr3r16kmSHn74YZUoUULjxo3TF198oe7du7u5OgBWsm7dOn333XdavHgxITcAAEABxBzdAADgH6NVq1aSpJSUFEnSsWPH9O9//1u33XabgoKCFBwcrPbt2+u3335zWvb8+fMaNWqUKleuLD8/P0VEROjee+/Vrl27JEl79uyRh4dHrpcWLVrY17VixQp5eHhozpw5GjFihMLDwxUYGKiOHTtq3759TttevXq12rVrpyJFiiggIEDNmzfXqlWrXO5jixYtXG5/1KhRTn1nzpypmJgY+fv7q3jx4urWrZvL7V9t3y5ns9k0YcIE1ahRQ35+fgoLC9OAAQN0/Phxh37R0dHq0KGD03YGDx7stE5XtY8dO9bpPpWkzMxMJSYmqmLFivL19VVkZKSefvppZWZmuryvLnfl/RYSEqL4+Hht3rw5T8vWrFlT69atU+PGjeXv769y5cppypQpDv2ysrI0cuRIxcTEqEiRIgoMDFTTpk21fPlyh37bt29Xq1atFB4ebt+PRx55RMeOHXPadu/eva/5ePfu3VvR0dEOy+3bt0/+/v7y8PDQnj17JP3/4/zBBx849B01apTLx2Xw4MFO9XTo0MFhWznrfP3113O595zXP2PGDHl4eGj69OkO/V555RV5eHjo66+/znVd0qX/r5z7wdPTU+Hh4XrggQe0d+/em6rr559/lp+fn3bt2qUaNWrI19dX4eHhGjBggMvHZu7cufbnV0hIiP71r3/pwIEDDn169+6toKAg7d69W3FxcQoMDFSpUqU0evRoGWOc6r38sTl9+rRiYmJUrlw5HTp0yN7++uuvq3HjxipRooT8/f0VExOjefPmOWz3Zu9jAACAgogzugEAwD9GTihdokQJSdLu3bu1YMECdenSReXKlVNqaqreffddNW/eXFu2bFGpUqUkSdnZ2erQoYOWLVumbt266YknntDp06e1ZMkSbd68WRUqVLBvo3v37rrrrrsctjt8+HCX9bz88svy8PDQM888o7S0NE2YMEGxsbHasGGD/P39JUnfffed2rdvr5iYGCUmJsrT01MzZsxQq1attHLlSjVo0MBpvWXKlFFSUpIk6cyZM3r00UddbvuFF15Q165d9fDDD+vIkSOaOHGimjVrpl9//VVFixZ1WqZ///5q2rSpJOnzzz/X/PnzHW4fMGCAfX70xx9/XCkpKZo0aZJ+/fVXrVq1St7e3i7vh+tx4sQJ+75dzmazqWPHjv
rxxx/Vv39/VatWTZs2bdL48eP1xx9/aMGCBddcd9WqVfXcc8/JGKNdu3Zp3LhxuuuuuxwC0twcP35cd911l7p27aru3bvr008/1aOPPiofHx/17dtXknTq1Cm999576t69u/r166fTp0/r/fffV1xcnNasWaM6depIkjIyMlSmTBndfffdCg4O1ubNmzV58mQdOHBAX375pdO2Q0JCNH78ePv1Bx988Jr1jhw5UufPn79mP3fo06ePPv/8cw0dOlRt2rRRZGSkNm3apBdffFEPPfSQ0/PLlaZNm6p///6y2WzavHmzJkyYoIMHD2rlypU3XNfRo0d1/vx5Pfroo2rVqpUeeeQR7dq1S5MnT9bq1au1evVq+fr6Svr/3wmoX7++kpKSlJqaqjfffFOrVq1yen5lZ2erXbt2uuOOO/Taa69p0aJFSkxM1MWLFzV69GiXtVy4cEH33Xef9u7dq1WrVikiIsJ+25tvvqmOHTuqZ8+eysrK0uzZs9WlSxd99dVXio+Pz7f7GAAAoMAxAAAAfzMzZswwkszSpUvNkSNHzL59+8zs2bNNiRIljL+/v9m/f78xxpjz58+b7Oxsh2VTUlKMr6+vGT16tL1t+vTpRpIZN26c07ZsNpt9OUlm7NixTn1q1Khhmjdvbr++fPlyI8mULl3anDp1yt7+6aefGknmzTfftK+7UqVKJi4uzr4dY4w5e/asKVeunGnTpo3Ttho3bmxq1qxpv37kyBEjySQmJtrb9uzZY7y8vMzLL7/ssOymTZtMoUKFnNp37NhhJJkPP/zQ3paYmGguP5RcuXKlkWRmzZrlsOyiRYuc2qOiokx8fLxT7YMGDTJXHp5eWfvTTz9tSpYsaWJiYhzu048//th4enqalStXOiw/ZcoUI8msWrXKaXuXa968ucP6jDFmxIgRRpJJS0u75rKSzBtvvGFvy8zMNHXq1DElS5Y0WVlZxhhjLl68aDIzMx2WPX78uAkLCzN9+/a96jYGDhxogoKCnNp79uxpypUr59B25X2WkJBgoqKi7Nc3b95sPD09Tfv27Y0kk5KSYowx5s8//zSSzPTp0x3Wd+VjnbONQYMGOdUTHx/vsK2rPS+utv5Dhw6Z4sWLmzZt2pjMzExTt25dU7ZsWXPy5Mlc15MjKirKJCQkOLT16NHDBAQE3FRdOddbt25tLl68aG/Peb2ZOHGiMcaYrKwsU7JkSVOzZk1z7tw5e7+vvvrKSDIjR460tyUkJBhJ5rHHHrO32Ww2Ex8fb3x8fMyRI0cc6p0xY4ax2WymZ8+eJiAgwKxevdqp7rNnzzpcz8rKMjVr1jStWrVyaL+Z+xgAAKAgYuoSAADwtxUbG6vQ0FBFRkaqW7duCgoK0vz58+3z6/r6+srT89LhUHZ2to4ePaqgoCBVqVJF69evt6/ns88+U0hIiB577DGnbVw5pcP16NWrlwoXLmy/fv/99ysiIsI+bcCGDRu0Y8cO9ejRQ0ePHlV6errS09OVkZGh1q1b64cffpDNZnNY5/nz5+Xn53fV7X7++eey2Wzq2rWrfZ3p6ekKDw9XpUqVnKbSyMrKkiT72aquzJ07V0WKFFGbNm0c1hkTE6OgoCCndV64cMGhX3p6+jXPMD5w4IAmTpyoF154QUFBQU7br1atmqpWreqwzpzpaq7cvis5NR05ckTJycmaP3++atWqpZCQkGsuW6hQIQ0YMMB+3cfHRwMGDFBaWprWrVsnSfLy8pKPj4+kS2egHzt2TBcvXlS9evUc/t9ynDx5UqmpqVq2bJkWLlyoZs2aOfXJysq66uPiyvDhw3X77berS5cuDu2hoaGSpP379+dpPefPn3d6DC9cuOCy79mzZ5Wenq7jx487TMmRm/DwcE2ePFlLlixR06ZNtWHDBk2fPl3BwcF5qi0zM1Pp6elKS0vTkiVL9N1336l169Y3XZckDR06VF5eXvbrDz74oMLCwrRw4UJJ0tq1a5WWlqaBAwc6PBfj4+NVtW
pVe7/LXT4NTM60MFlZWVq6dKlT32HDhmnWrFn69NNPXX6jI+fbINKlbxqcPHlSTZs2dfofu9n7GAAAoKBh6hIAAPC
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1800x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Применение RandomOverSampler для балансировки выборок\n",
|
|||
|
"X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)\n",
|
|||
|
"X_val_resampled, y_val_resampled = ros.fit_resample(X_val, y_val)\n",
|
|||
|
"\n",
|
|||
|
"# Проверка сбалансированности после RandomOverSampler\n",
|
|||
|
"analyze_balance(y_train_resampled, y_val_resampled, y_test, 'Stroke')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Перейдем к конструированию признаков.\n",
|
|||
|
"Применим унитарное кодирование категориальных признаков (one-hot encoding), переведя их в бинарные вектора:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 352,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
|
|||
|
"0 26.50 5.0 0.0 7.0 False \n",
|
|||
|
"1 33.91 0.0 0.0 7.0 False \n",
|
|||
|
"2 42.57 4.0 5.0 6.0 False \n",
|
|||
|
"3 32.08 0.0 0.0 6.0 False \n",
|
|||
|
"4 15.78 1.0 3.0 6.0 False \n",
|
|||
|
"\n",
|
|||
|
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
|
|||
|
"0 False False False True \n",
|
|||
|
"1 False False False True \n",
|
|||
|
"2 False False False True \n",
|
|||
|
"3 False False False True \n",
|
|||
|
"4 False False False True \n",
|
|||
|
"\n",
|
|||
|
" AgeCategory_25-29 ... Diabetic_Yes Diabetic_Yes (during pregnancy) \\\n",
|
|||
|
"0 False ... False False \n",
|
|||
|
"1 False ... False False \n",
|
|||
|
"2 False ... False False \n",
|
|||
|
"3 False ... False False \n",
|
|||
|
"4 False ... False False \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity_Yes GenHealth_Fair GenHealth_Good GenHealth_Poor \\\n",
|
|||
|
"0 True False False False \n",
|
|||
|
"1 True False False False \n",
|
|||
|
"2 True False True False \n",
|
|||
|
"3 True False False False \n",
|
|||
|
"4 True False True False \n",
|
|||
|
"\n",
|
|||
|
" GenHealth_Very good Asthma_Yes KidneyDisease_Yes SkinCancer_Yes \n",
|
|||
|
"0 True False False False \n",
|
|||
|
"1 True False False False \n",
|
|||
|
"2 False False False False \n",
|
|||
|
"3 True False False False \n",
|
|||
|
"4 False False False False \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 37 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Определение категориальных признаков\n",
|
|||
|
"categorical_features = ['HeartDisease', 'Smoking', 'AlcoholDrinking',\n",
|
|||
|
" 'DiffWalking', 'Sex', 'AgeCategory',\n",
|
|||
|
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth',\n",
|
|||
|
" 'Asthma', 'KidneyDisease', 'SkinCancer']\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к обучающей выборке\n",
|
|||
|
"X_train_encoded = pd.get_dummies(X_train_resampled, columns=categorical_features, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к контрольной выборке\n",
|
|||
|
"X_val_encoded = pd.get_dummies(X_val_resampled, columns=categorical_features, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# Применение one-hot encoding к тестовой выборке\n",
|
|||
|
"X_test_encoded = pd.get_dummies(X_test, columns=categorical_features, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"print(X_train_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Далее применим дискретизацию к числовому признаку BMI: разобьём его значения на категории по диапазонам."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 353,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
|
|||
|
"0 26.50 5.0 0.0 7.0 False \n",
|
|||
|
"1 33.91 0.0 0.0 7.0 False \n",
|
|||
|
"2 42.57 4.0 5.0 6.0 False \n",
|
|||
|
"3 32.08 0.0 0.0 6.0 False \n",
|
|||
|
"4 15.78 1.0 3.0 6.0 False \n",
|
|||
|
"\n",
|
|||
|
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
|
|||
|
"0 False False False True \n",
|
|||
|
"1 False False False True \n",
|
|||
|
"2 False False False True \n",
|
|||
|
"3 False False False True \n",
|
|||
|
"4 False False False True \n",
|
|||
|
"\n",
|
|||
|
" AgeCategory_25-29 ... Diabetic_Yes (during pregnancy) \\\n",
|
|||
|
"0 False ... False \n",
|
|||
|
"1 False ... False \n",
|
|||
|
"2 False ... False \n",
|
|||
|
"3 False ... False \n",
|
|||
|
"4 False ... False \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity_Yes GenHealth_Fair GenHealth_Good GenHealth_Poor \\\n",
|
|||
|
"0 True False False False \n",
|
|||
|
"1 True False False False \n",
|
|||
|
"2 True False True False \n",
|
|||
|
"3 True False False False \n",
|
|||
|
"4 True False True False \n",
|
|||
|
"\n",
|
|||
|
" GenHealth_Very good Asthma_Yes KidneyDisease_Yes SkinCancer_Yes \\\n",
|
|||
|
"0 True False False False \n",
|
|||
|
"1 True False False False \n",
|
|||
|
"2 False False False False \n",
|
|||
|
"3 True False False False \n",
|
|||
|
"4 False False False False \n",
|
|||
|
"\n",
|
|||
|
" BMI_binned \n",
|
|||
|
"0 Overweight \n",
|
|||
|
"1 Obese \n",
|
|||
|
"2 Severely Obese \n",
|
|||
|
"3 Obese \n",
|
|||
|
"4 Underweight \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 38 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"bmi_bins = [0, 18.5, 25, 30, 40, 60]\n",
|
|||
|
"bmi_labels = [\"Underweight\", \"Normal\", \"Overweight\", \"Obese\", \"Severely Obese\"]\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"X_train_encoded['BMI_binned'] = pd.cut(X_train_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
|
|||
|
"X_val_encoded['BMI_binned'] = pd.cut(X_val_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
|
|||
|
"\n",
|
|||
|
"X_test_encoded['BMI_binned'] = pd.cut(X_test_encoded['BMI'], bins=bmi_bins, labels=bmi_labels)\n",
|
|||
|
"\n",
|
|||
|
"print(X_train_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Применим ручной синтез признаков. К примеру, можно создать фактор риска для сердечных заболеваний: комбинированный бинарный признак RiskFactor, равный 1, если присутствует хотя бы один из факторов риска — курение, диабет, употребление алкоголя, болезнь почек или рак кожи."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 354,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
|
|||
|
"0 26.50 5.0 0.0 7.0 False \n",
|
|||
|
"1 33.91 0.0 0.0 7.0 False \n",
|
|||
|
"2 42.57 4.0 5.0 6.0 False \n",
|
|||
|
"3 32.08 0.0 0.0 6.0 False \n",
|
|||
|
"4 15.78 1.0 3.0 6.0 False \n",
|
|||
|
"\n",
|
|||
|
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
|
|||
|
"0 False False False True \n",
|
|||
|
"1 False False False True \n",
|
|||
|
"2 False False False True \n",
|
|||
|
"3 False False False True \n",
|
|||
|
"4 False False False True \n",
|
|||
|
"\n",
|
|||
|
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
|
|||
|
"0 False ... True False \n",
|
|||
|
"1 False ... True False \n",
|
|||
|
"2 False ... True False \n",
|
|||
|
"3 False ... True False \n",
|
|||
|
"4 False ... True False \n",
|
|||
|
"\n",
|
|||
|
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
|
|||
|
"0 False False True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 True False False False \n",
|
|||
|
"3 False False True False \n",
|
|||
|
"4 True False False False \n",
|
|||
|
"\n",
|
|||
|
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
|
|||
|
"0 False False Overweight 0 \n",
|
|||
|
"1 False False Obese 0 \n",
|
|||
|
"2 False False Severely Obese 0 \n",
|
|||
|
"3 False False Obese 0 \n",
|
|||
|
"4 False False Underweight 0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 39 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_train_encoded['RiskFactor'] = ((X_train_encoded['Smoking_Yes'] == True) | \n",
|
|||
|
" (X_train_encoded['Diabetic_Yes'] == True) | \n",
|
|||
|
" (X_train_encoded['AlcoholDrinking_Yes'] == True) | \n",
|
|||
|
" (X_train_encoded['KidneyDisease_Yes'] == True) | \n",
|
|||
|
" (X_train_encoded['SkinCancer_Yes'] == True)).astype(int)\n",
|
|||
|
"\n",
|
|||
|
"print(X_train_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Используем масштабирование признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 355,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
|
|||
|
"0 26.50 -0.099915 -0.540452 -0.070949 False \n",
|
|||
|
"1 33.91 -0.581538 -0.540452 -0.070949 False \n",
|
|||
|
"2 42.57 -0.196239 0.006442 -0.646839 False \n",
|
|||
|
"3 32.08 -0.581538 -0.540452 -0.646839 False \n",
|
|||
|
"4 15.78 -0.485213 -0.212315 -0.646839 False \n",
|
|||
|
"\n",
|
|||
|
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
|
|||
|
"0 False False False True \n",
|
|||
|
"1 False False False True \n",
|
|||
|
"2 False False False True \n",
|
|||
|
"3 False False False True \n",
|
|||
|
"4 False False False True \n",
|
|||
|
"\n",
|
|||
|
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
|
|||
|
"0 False ... True False \n",
|
|||
|
"1 False ... True False \n",
|
|||
|
"2 False ... True False \n",
|
|||
|
"3 False ... True False \n",
|
|||
|
"4 False ... True False \n",
|
|||
|
"\n",
|
|||
|
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
|
|||
|
"0 False False True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 True False False False \n",
|
|||
|
"3 False False True False \n",
|
|||
|
"4 True False False False \n",
|
|||
|
"\n",
|
|||
|
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
|
|||
|
"0 False False Overweight 0 \n",
|
|||
|
"1 False False Obese 0 \n",
|
|||
|
"2 False False Severely Obese 0 \n",
|
|||
|
"3 False False Obese 0 \n",
|
|||
|
"4 False False Underweight 0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 39 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"numerical_features = ['PhysicalHealth', 'MentalHealth', 'SleepTime']\n",
|
|||
|
"\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"X_train_encoded[numerical_features] = scaler.fit_transform(X_train_encoded[numerical_features])\n",
|
|||
|
"X_val_encoded[numerical_features] = scaler.transform(X_val_encoded[numerical_features])\n",
|
|||
|
"X_test_encoded[numerical_features] = scaler.transform(X_test_encoded[numerical_features])\n",
|
|||
|
"\n",
|
|||
|
"print(X_train_encoded.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"И также попробуем сконструировать признаки, используя фреймворк Featuretools:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 356,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"c:\\Users\\User\\Desktop\\aim\\aimvenv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime HeartDisease_Yes \\\n",
|
|||
|
"index \n",
|
|||
|
"0 26.50 -0.099915 -0.540452 -0.070949 False \n",
|
|||
|
"1 33.91 -0.581538 -0.540452 -0.070949 False \n",
|
|||
|
"2 42.57 -0.196239 0.006442 -0.646839 False \n",
|
|||
|
"3 32.08 -0.581538 -0.540452 -0.646839 False \n",
|
|||
|
"4 15.78 -0.485213 -0.212315 -0.646839 False \n",
|
|||
|
"\n",
|
|||
|
" Smoking_Yes AlcoholDrinking_Yes DiffWalking_Yes Sex_Male \\\n",
|
|||
|
"index \n",
|
|||
|
"0 False False False True \n",
|
|||
|
"1 False False False True \n",
|
|||
|
"2 False False False True \n",
|
|||
|
"3 False False False True \n",
|
|||
|
"4 False False False True \n",
|
|||
|
"\n",
|
|||
|
" AgeCategory_25-29 ... PhysicalActivity_Yes GenHealth_Fair \\\n",
|
|||
|
"index ... \n",
|
|||
|
"0 False ... True False \n",
|
|||
|
"1 False ... True False \n",
|
|||
|
"2 False ... True False \n",
|
|||
|
"3 False ... True False \n",
|
|||
|
"4 False ... True False \n",
|
|||
|
"\n",
|
|||
|
" GenHealth_Good GenHealth_Poor GenHealth_Very good Asthma_Yes \\\n",
|
|||
|
"index \n",
|
|||
|
"0 False False True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 True False False False \n",
|
|||
|
"3 False False True False \n",
|
|||
|
"4 True False False False \n",
|
|||
|
"\n",
|
|||
|
" KidneyDisease_Yes SkinCancer_Yes BMI_binned RiskFactor \n",
|
|||
|
"index \n",
|
|||
|
"0 False False Overweight 0 \n",
|
|||
|
"1 False False Obese 0 \n",
|
|||
|
"2 False False Severely Obese 0 \n",
|
|||
|
"3 False False Obese 0 \n",
|
|||
|
"4 False False Underweight 0 \n",
|
|||
|
"\n",
|
|||
|
"[5 rows x 39 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data = X_train_encoded.copy()\n",
|
|||
|
"\n",
|
|||
|
"es = ft.EntitySet(id=\"patients\")\n",
|
|||
|
"\n",
|
|||
|
"es = es.add_dataframe(dataframe_name=\"strokes_data\", dataframe=data, index=\"index\", make_index=True)\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=es, \n",
|
|||
|
" target_dataframe_name=\"strokes_data\",\n",
|
|||
|
" max_depth=1\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(feature_matrix.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### Оценка качества набора признаков\n",
|
|||
|
"Представим основные оценки качества наборов признаков:\n",
|
|||
|
"\n",
|
|||
|
"- Предсказательная способность (для задачи классификации). Метрики: Accuracy, Precision, Recall, F1-Score, ROC AUC.\n",
|
|||
|
"\n",
|
|||
|
" Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
|
|||
|
"\n",
|
|||
|
"- Скорость вычисления\n",
|
|||
|
"\n",
|
|||
|
" Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
|
|||
|
"\n",
|
|||
|
"- Надежность\n",
|
|||
|
"\n",
|
|||
|
" Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
|
|||
|
"\n",
|
|||
|
"- Корреляция\n",
|
|||
|
"\n",
|
|||
|
" Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
|
|||
|
"\n",
|
|||
|
"- Цельность\n",
|
|||
|
"\n",
|
|||
|
" Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 357,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Время обучения модели: 36.52 секунд\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_train_encoded = pd.get_dummies(X_train_encoded, drop_first=True)\n",
|
|||
|
"X_val_encoded = pd.get_dummies(X_val_encoded, drop_first=True)\n",
|
|||
|
"X_test_encoded = pd.get_dummies(X_test_encoded, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"all_columns = X_train_encoded.columns\n",
|
|||
|
"X_train_encoded = X_train_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
|||
|
"X_val_encoded = X_val_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
|||
|
"X_test_encoded = X_test_encoded.reindex(columns=all_columns, fill_value=0)\n",
|
|||
|
"\n",
|
|||
|
"# Выбор модели\n",
|
|||
|
"model = RandomForestClassifier(n_estimators=100, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Начинаем отсчет времени\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"model.fit(X_train_encoded, y_train_resampled)\n",
|
|||
|
"\n",
|
|||
|
"# Время обучения модели\n",
|
|||
|
"train_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"print(f'Время обучения модели: {train_time:.2f} секунд')\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 358,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Feature Importance:\n",
|
|||
|
" feature importance\n",
|
|||
|
"0 BMI 0.194905\n",
|
|||
|
"3 SleepTime 0.086714\n",
|
|||
|
"1 PhysicalHealth 0.074056\n",
|
|||
|
"4 HeartDisease_Yes 0.065638\n",
|
|||
|
"7 DiffWalking_Yes 0.057683\n",
|
|||
|
"2 MentalHealth 0.057339\n",
|
|||
|
"8 Sex_Male 0.028562\n",
|
|||
|
"20 AgeCategory_80 or older 0.025721\n",
|
|||
|
"29 PhysicalActivity_Yes 0.023747\n",
|
|||
|
"27 Diabetic_Yes 0.023346\n",
|
|||
|
"30 GenHealth_Fair 0.022295\n",
|
|||
|
"34 Asthma_Yes 0.019722\n",
|
|||
|
"19 AgeCategory_75-79 0.017912\n",
|
|||
|
"5 Smoking_Yes 0.017702\n",
|
|||
|
"37 RiskFactor 0.017532\n",
|
|||
|
"31 GenHealth_Good 0.016946\n",
|
|||
|
"18 AgeCategory_70-74 0.015593\n",
|
|||
|
"33 GenHealth_Very good 0.015544\n",
|
|||
|
"25 Race_White 0.014721\n",
|
|||
|
"39 BMI_binned_Overweight 0.014350\n",
|
|||
|
"17 AgeCategory_65-69 0.014142\n",
|
|||
|
"36 SkinCancer_Yes 0.014002\n",
|
|||
|
"32 GenHealth_Poor 0.013788\n",
|
|||
|
"40 BMI_binned_Obese 0.012988\n",
|
|||
|
"38 BMI_binned_Normal 0.012010\n",
|
|||
|
"16 AgeCategory_60-64 0.011894\n",
|
|||
|
"35 KidneyDisease_Yes 0.011588\n",
|
|||
|
"15 AgeCategory_55-59 0.010550\n",
|
|||
|
"22 Race_Black 0.009165\n",
|
|||
|
"23 Race_Hispanic 0.008975\n",
|
|||
|
"6 AlcoholDrinking_Yes 0.008943\n",
|
|||
|
"14 AgeCategory_50-54 0.008495\n",
|
|||
|
"13 AgeCategory_45-49 0.006740\n",
|
|||
|
"11 AgeCategory_35-39 0.006491\n",
|
|||
|
"26 Diabetic_No, borderline diabetes 0.006442\n",
|
|||
|
"12 AgeCategory_40-44 0.006333\n",
|
|||
|
"9 AgeCategory_25-29 0.006128\n",
|
|||
|
"24 Race_Other 0.005832\n",
|
|||
|
"10 AgeCategory_30-34 0.005631\n",
|
|||
|
"41 BMI_binned_Severely Obese 0.004984\n",
|
|||
|
"21 Race_Asian 0.002756\n",
|
|||
|
"28 Diabetic_Yes (during pregnancy) 0.002092\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Получение важности признаков\n",
|
|||
|
"importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train_encoded.columns\n",
|
|||
|
"\n",
|
|||
|
"# Сортировка признаков по важности\n",
|
|||
|
"feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})\n",
|
|||
|
"feature_importance = feature_importance.sort_values(by='importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Feature Importance:\")\n",
|
|||
|
"print(feature_importance)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 359,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 0.9562618763533519\n",
|
|||
|
"Precision: 0.10204081632653061\n",
|
|||
|
"Recall: 0.012472283813747228\n",
|
|||
|
"F1 Score: 0.02222771054581378\n",
|
|||
|
"ROC AUC: 0.5039578706315019\n",
|
|||
|
"Cross-validated Accuracy: 0.9940253495420259\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABB0AAANXCAYAAAB5YScaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxN1/7/8ddJIoOMhpAgxJAQGkPQXvMsUWNRQ01BFDXU1JKaxxhbVI0lUVVjFTUPlSraGm4TWmqIRPRKaQ2JUEFyfn/4OV+nSSQpEer9fDz2456z19prfdbOvR53f85aaxuMRqMREREREREREZGnzCKnAxARERERERGRfyclHUREREREREQkWyjpICIiIiIiIiLZQkkHEREREREREckWSjqIiIiIiIiISLZQ0kFEREREREREsoWSDiIiIiIiIiKSLZR0EBEREREREZFsoaSDiIiIiIiIiGQLJR1EREREREREJFso6SAiIiIvjbCwMAwGQ5rHiBEjsqXPQ4cOMW7cOG7cuJEt7T+Jh/fj6NGjOR3KPzZ//nzCwsJyOgwREUmHVU4HICIiIvKsTZgwgeLFi5ude+WVV7Klr0OHDjF+/HgCAwNxcXHJlj5eZvPnzyd//vwEBgbmdCgiIpIGJR1ERETkpdOkSROqVKmS02E8kVu3bmFvb5/TYeSY27dvkzt37pwOQ0REMqDlFSIiIiJ/s337dmrVqoW9vT2Ojo40bdqUX375xazO8ePHCQwMpESJEtja2uLm5kaPHj24evWqqc64ceN47733AChevLhpKUdMTAwxMTEYDIY0lwYYDAbGjRtn1o7BYODkyZO89dZb5MmTh5o1a5rKP//8cypXroydnR158+alQ4cOXLx48R+NPTAwEAcHB2JjY2nWrBkODg4ULlyYTz75BIATJ05Qv3597O3tKVasGF988YXZ9Q+XbOzfv5/evXuTL18+nJyc6Nq1K9evX0/V3/z58ylXrhw2NjYUKlSIfv36pVqKUrduXV555RWOHTtG7dq1yZ07Nx988AGenp788ssvfPvtt6Z7W7duXQCuXbvGsGHD8PX1xcHBAScnJ5o0aUJkZKRZ2+Hh4RgMBtauXcvkyZMpUqQItra2NGjQgHPnzqWK98cff+T1118nT5482NvbU758eebMmWNW59dff6Vt27bkzZsXW1tbqlSpwubNm7P6pxAR+VfQTAcRERF56cTHx/Pnn3+ancufPz8AK1asoFu3bvj7+zNt2jRu377NggULqFmzJj/99BOenp4A7N69m/Pnz9O9e3fc3Nz45ZdfWLx4Mb/88gs//PADBoOB1q1bc+bMGVatWsVHH31k6sPV1ZU//vgjy3G/+eabeHl5MWXKFIxGIwCTJ09m9OjRtGvXjqCgIP744w8+/vhjateuzU8//fSPlnQkJyfTpEkTateuzfTp01m5ciX9+/fH3t6ekSNH0qlTJ1q3bs3ChQvp2rUr1apVS7VcpX///ri4uDBu3DhOnz7NggULuHDhgukhHx4kU8aPH0/Dhg3p27evqd6RI0c4ePAguXLlMrV39epVmjRpQocOHejcuTMFCxakbt26DBgwAAcHB0aOHAlAwYIFATh//jwbN27kzTffpHjx4ly+fJlFixZRp04dTp48SaFChczinTp1KhYWFgwbNoz4+HimT59Op06d+PHHH011du/eTbNmzXB3d+fdd9/Fzc2NU6dOsWXLFt59910AfvnlF2rUqEHhwoUZMWIE9vb2rF27llatWvHll1/yxhtvZPnvISLyQjOKiIiIvCRCQ0ONQJqH0Wg03rx50+ji4mLs1auX2XW///670dnZ2ez87du3U7W/atUqI2Dcv3+/6dyMGTOMgDE6OtqsbnR0tBEwhoaGpmoHMI4dO9b0fezYsUbA2LFjR7N6MTExRktLS+PkyZPNzp84ccJoZWWV6nx69+PIkSOmc926dTMCxilTppjOXb9+3WhnZ2c0GA
zG1atXm87/+uuvqWJ92GblypWNd+/eNZ2fPn26ETBu2rTJaDQajVeuXDFaW1sbGzdubExOTjbVmzdvnhEwLlu2zHSuTp06RsC4cOHCVGMoV66csU6dOqnO37lzx6xdo/HBPbexsTFOmDDBdG7fvn1GwOjj42NMSkoynZ8zZ44RMJ44ccJoNBqN9+/fNxYvXtxYrFgx4/Xr183aTUlJMX1u0KCB0dfX13jnzh2z8urVqxu9vLxSxSki8m+n5RUiIiLy0vnkk0/YvXu32QEPfsm+ceMGHTt25M8//zQdlpaWvPbaa+zbt8/Uhp2dnenznTt3+PPPP/nPf/4DwH//+99sibtPnz5m3zds2EBKSgrt2rUzi9fNzQ0vLy+zeLMqKCjI9NnFxYXSpUtjb29Pu3btTOdLly6Ni4sL58+fT3X922+/bTZToW/fvlhZWbFt2zYA9uzZw927dxk0aBAWFv/3f0l79eqFk5MTW7duNWvPxsaG7t27Zzp+GxsbU7vJyclcvXoVBwcHSpcunebfp3v37lhbW5u+16pVC8A0tp9++ono6GgGDRqUavbIw5kb165d45tvvqFdu3bcvHnT9Pe4evUq/v7+nD17lv/973+ZHoOIyL+BlleIiIjIS+fVV19NcyPJs2fPAlC/fv00r3NycjJ9vnbtGuPHj2f16tVcuXLFrF58fPxTjPb//H0Jw9mzZzEajXh5eaVZ/9GH/qywtbXF1dXV7JyzszNFihQxPWA/ej6tvRr+HpODgwPu7u7ExMQAcOHCBeBB4uJR1tbWlChRwlT+UOHChc2SAhlJSUlhzpw5zJ8/n+joaJKTk01l+fLlS1W/aNGiZt/z5MkDYBpbVFQU8Pi3nJw7dw6j0cjo0aMZPXp0mnWuXLlC4cKFMz0OEZEXnZIOIiIiIv9fSkoK8GBfBzc3t1TlVlb/93+d2rVrx6FDh3jvvfeoWLEiDg4OpKSkEBAQYGrncf7+8P7Qow/Hf/fo7IqH8RoMBrZv346lpWWq+g4ODhnGkZa02nrceeP/318iO/197BmZMmUKo0ePpkePHkycOJG8efNiYWHBoEGD0vz7PI2xPWx32LBh+Pv7p1mnVKlSmW5PROTfQEkHERERkf+vZMmSABQoUICGDRumW+/69evs3buX8ePHM2bMGNP5hzMlHpVecuHhL+l/f1PD33/hzyheo9FI8eLF8fb2zvR1z8LZs2epV6+e6XtiYiJxcXG8/vrrABQrVgyA06dPU6JECVO9u3fvEh0d/dj7/6j07u/69eupV68eS5cuNTt/48YN04aeWfHwvxs///xzurE9HEeuXLkyHb+IyL+d9nQQERER+f/8/f1xcnJiypQp3Lt3L1X5wzdOPPxV/O+/gs+ePTvVNfb29kDq5IKTkxP58+dn//79Zufnz5+f6Xhbt26NpaUl48ePTxWL0Wg0e33ns7Z48WKze7hgwQLu379PkyZNAGjYsCHW1tbMnTvXLPalS5cSHx9P06ZNM9WPvb19qnsLD/5Gf78n69at+8d7Kvj5+VG8eHFmz56dqr+H/RQoUIC6deuyaNEi4uLiUrXxT95YIiLyotNMBxEREZH/z8nJiQULFtClSxf8/Pzo0KEDrq6uxMbGsnXrVmrUqMG8efNwcnIyvU7y3r17FC5cmF27dhEdHZ2qzcqVKwMwcuRIOnToQK5cuWjevDn29vYEBQUxdepUgoKCqFKlCvv37+fMmTOZjrdkyZJMmjSJ4OBgYmJiaNWqFY6OjkRHR/PVV1/x9ttvM2zYsKd2f7Li7t27NGjQgHbt2nH69Gnmz59PzZo1adGiBfDgtaHBwcGMHz+egIAAWrRoYapXtWpVOnfunKl+KleuzIIFC5g0aRKlSpWiQIEC1K9fn2bNmjFhwgS6d+9O9erVOXHiBCtXrjSbVZEVFhYWLFiwgObNm1OxYkW6d++Ou7s7v/76K7/88gs7d+4EHmxSWrNmTXx9fenVqxclSpTg8uXLfP/99/
z2229ERkb+o/5FRF5USjqIiIiIPOKtt96iUKFCTJ06lRkzZpCUlEThwoWpVauW2dsTvvjiCwYMGMAnn3yC0WikceP
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x1000 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Train Accuracy: 0.9997146540973558\n",
|
|||
|
"Train Precision: 0.9994296336980861\n",
|
|||
|
"Train Recall: 1.0\n",
|
|||
|
"Train F1 Score: 0.9997147354964131\n",
|
|||
|
"Train ROC AUC: 0.9997146540973557\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Предсказание и оценка\n",
|
|||
|
"y_pred = model.predict(X_test_encoded)\n",
|
|||
|
"\n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"precision = precision_score(y_test, y_pred)\n",
|
|||
|
"recall = recall_score(y_test, y_pred)\n",
|
|||
|
"f1 = f1_score(y_test, y_pred)\n",
|
|||
|
"roc_auc = roc_auc_score(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Accuracy: {accuracy}\")\n",
|
|||
|
"print(f\"Precision: {precision}\")\n",
|
|||
|
"print(f\"Recall: {recall}\")\n",
|
|||
|
"print(f\"F1 Score: {f1}\")\n",
|
|||
|
"print(f\"ROC AUC: {roc_auc}\")\n",
|
|||
|
"\n",
|
|||
|
"# Кросс-валидация\n",
|
|||
|
"scores = cross_val_score(model, X_train_encoded, y_train_resampled, cv=5, scoring='accuracy')\n",
|
|||
|
"accuracy_cv = scores.mean()\n",
|
|||
|
"print(f\"Cross-validated Accuracy: {accuracy_cv}\")\n",
|
|||
|
"\n",
|
|||
|
"# Анализ важности признаков\n",
|
|||
|
"feature_importances = model.feature_importances_\n",
|
|||
|
"feature_names = X_train_encoded.columns\n",
|
|||
|
"\n",
|
|||
|
"importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})\n",
|
|||
|
"importance_df = importance_df.sort_values(by='Importance', ascending=False)\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 10))\n",
|
|||
|
"sns.barplot(x='Importance', y='Feature', data=importance_df)\n",
|
|||
|
"plt.title('Feature Importance')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Проверка на переобучение\n",
|
|||
|
"y_train_pred = model.predict(X_train_encoded)\n",
|
|||
|
"\n",
|
|||
|
"accuracy_train = accuracy_score(y_train_resampled, y_train_pred)\n",
|
|||
|
"precision_train = precision_score(y_train_resampled, y_train_pred)\n",
|
|||
|
"recall_train = recall_score(y_train_resampled, y_train_pred)\n",
|
|||
|
"f1_train = f1_score(y_train_resampled, y_train_pred)\n",
|
|||
|
"roc_auc_train = roc_auc_score(y_train_resampled, y_train_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Train Accuracy: {accuracy_train}\")\n",
|
|||
|
"print(f\"Train Precision: {precision_train}\")\n",
|
|||
|
"print(f\"Train Recall: {recall_train}\")\n",
|
|||
|
"print(f\"Train F1 Score: {f1_train}\")\n",
|
|||
|
"print(f\"Train ROC AUC: {roc_auc_train}\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimvenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.5"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|