2655 lines
297 KiB
Plaintext
Raw Permalink Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4 style=\"margin: 30px;\">бизнес-цели и 2 задачи, которые нужно решить:<br/>\n",
"Снижение вероятности инсульта у пациентов с высоким риском путем раннего выявления предрасположенности.<br/>\n",
"Оптимизация медицинских услуг, предоставляемых пациентам, с учетом их риска инсульта.<br/><br/><br/>\n",
"Разработать модель, которая прогнозирует вероятность инсульта у пациента.<br/>\n",
"Определить значимые признаки для анализа риска инсульта, чтобы направить усилия медицинских работников на важные факторы.</h4>"
]
},
{
"cell_type": "code",
"execution_count": 330,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество колонок: 12\n",
"Колонки: Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
" 'smoking_status', 'stroke'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Загрузка данных\n",
"data = pd.read_csv('./csv/option4.csv')\n",
"\n",
"# Обзор данных\n",
"print(\"Количество колонок:\", data.columns.size)\n",
"print(\"Колонки:\", data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 331,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 201\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n",
"\n",
"\n",
"\n",
"<bound method NDFrame.describe of id gender age hypertension heart_disease ever_married \\\n",
"0 9046 Male 67.0 0 1 Yes \n",
"1 51676 Female 61.0 0 0 Yes \n",
"2 31112 Male 80.0 0 1 Yes \n",
"3 60182 Female 49.0 0 0 Yes \n",
"4 1665 Female 79.0 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"5105 18234 Female 80.0 1 0 Yes \n",
"5106 44873 Female 81.0 0 0 Yes \n",
"5107 19723 Female 35.0 0 0 Yes \n",
"5108 37544 Male 51.0 0 0 Yes \n",
"5109 44679 Female 44.0 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 Private Urban 228.69 36.6 formerly smoked \n",
"1 Self-employed Rural 202.21 NaN never smoked \n",
"2 Private Rural 105.92 32.5 never smoked \n",
"3 Private Urban 171.23 34.4 smokes \n",
"4 Self-employed Rural 174.12 24.0 never smoked \n",
"... ... ... ... ... ... \n",
"5105 Private Urban 83.75 NaN never smoked \n",
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
"5108 Private Rural 166.29 25.6 formerly smoked \n",
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
"\n",
" stroke \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"5105 0 \n",
"5106 0 \n",
"5107 0 \n",
"5108 0 \n",
"5109 0 \n",
"\n",
"[5110 rows x 12 columns]>\n"
]
}
],
"source": [
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())\n",
"\n",
"print(\"\\n\\n\")\n",
"\n",
"print(data.describe)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Возьмем и заменим нулевые значения в столбце bmi на средние значения по столбцу </p>"
]
},
{
"cell_type": "code",
"execution_count": 332,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 0\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n"
]
}
],
"source": [
"data['bmi'] = data['bmi'].fillna(data['bmi'].median())\n",
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Взглянем на выбросы: </p>"
]
},
{
"cell_type": "code",
"execution_count": 333,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABscUlEQVR4nO3de1yUdf7//+cwHD0wiAcGChDPirIeKiOM3GTDU6vlZhaltX7081G0zG0rNw9hGFvblj9bsGz7mBau1ZZmZVpiylZoahoeWtMicdPBCgEPiQjX74++XB9HmRQdGAYe99ttbs28r9dc87rwlm988uZ9WQzDMAQAAAAAAAAAAM7j4+kGAAAAAAAAAABoqAjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAAAAAAAAAHCBEB0AAAAAAAAAABcI0QEAAAAAAAAAcIEQHQAAAAAAAAAAFwjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAACAemCxWPTYY495uo16tWHDBlksFm3YsMHTrTSoXgAAjddjjz0mi8WiH374oU4/55577lH79u3r9DMA/B9CdAAAAAAAAAAAXPD1dAMAAAAAAAAALt6LL76oqqoqT7cBNBmE6AAAAAAAAIAX8fPz83QLQJPCdi5AI3TgwAFNnjxZXbt2VVBQkFq3bq3bbrtN33777Xm1+fn5uuGGGxQUFKQrr7xS6enpWrx4sSwWy3n177//vq6//no1b95cLVu21LBhw7R79+76uSgAQJN1MfPa1q1bZbFYtGTJkvPev3btWlksFr377rvm2IYNG3TVVVcpMDBQHTt21AsvvGDuYVpbb7zxhnr06KHAwED17NlTK1asuKh9Sl3VuOrj1Vdf1TXXXKNmzZqpVatWSkxM1AcffOBUk5WVpdjYWAUEBCgiIkKpqakqKSlxqtm3b59GjRolu92uwMBAXXnllRozZoxKS0vP+7x+/fopKChIoaGhGjNmjA4ePHhRX5ML2bx5swYPHiybzaZmzZrphhtu0CeffGIe/+c//ymLxaKNGzee994XXnhBFotFu3btMsf+/e9/63e/+51CQ0MVGBioq666SqtWrXJLrwAAXIoffvhBo0ePVnBwsFq3bq37779fp06dMo9bLBZNmTLF/D4iKChI8fHx2rlzp6Sf57tOnTopMDBQAwcOPO/f5+yJDtQvVqIDjdCWLVv06aefasyYMbryyiv17bffauHChRo4cKD27NmjZs2aSZK+++47/frXv5bFYtGMGTPUvHlz/f3vf1dAQMB553zllVc0btw4JScn68knn9TJkye1cOFCDRgwQNu3b2fyBgDUmYuZ16666ip16NBBr7/+usaNG+f0/tdee02tWrVScnKyJGn79u0aPHiwwsPDlZaWpsrKSs2dO1dt27atdW/vvfeebr/9dvXq1UsZGRk6evSoxo8fryuuuMIt114tLS1Njz32mK677jrNnTtX/v7+2rx5s9avX6+bbrpJ0s/he1pampKSkjRp0iTt3btXCxcu1JYtW/TJJ5/Iz89Pp0+fVnJyssrLyzV16lTZ7XZ99913evfdd1VSUiKbzSZJmjdvnmbNmqXRo0frv/7rv/T999/rueeeU2JiorZv366QkJBLvpb169dryJAh6tevn+bMmSMfHx8tXrxYN954o/71r3/pmmuu0bBhw9SiRQu9/vrruuGGG5ze/9prryk2NlY9e/aUJO3evVsJCQm64oor9Mgjj6h58+Z6/fXXNXLkSL355pu65ZZbLrlXAAAu1ejRo9W+fXtlZGRo06ZNWrBggY4ePaqlS5eaNf/617+0atUqpaamSpIyMjI0fPhwPfTQQ8rKytLkyZN19OhRPfXUU/r973+v9evXe+pyABgAGp2TJ0+eN5aXl2dIMpYuXWqOTZ061bBYLMb27dvNsR9//NEIDQ01JBkFBQWGYRjGsWPHjJCQEGPChAlO53Q4HIbNZjtvHAAAd7
rYeW3GjBmGn5+fUVxcbI6Vl5cbISEhxu9//3tz7OabbzaaNWtmfPfdd+bYvn37DF9fX6O23x736tXLuPLKK41jx46ZYxs2bDAkGdHR0U61kow5c+aYr8eNG3dejWEYxpw5c5z62Ldvn+Hj42PccsstRmVlpVNtVVWVYRiGceTIEcPf39+46aabnGr+9re/GZKM//3f/zUMwzC2b99uSDLeeOMNl9f07bffGlar1Zg3b57T+M6dOw1fX9/zxn/JRx99ZEgyPvroI7Pfzp07G8nJyWbvhvHzn3FMTIzxm9/8xhy74447jHbt2hlnzpwxxw4fPmz4+PgYc+fONccGDRpk9OrVyzh16pTT1+W6664zOnfu7LIXAADqQvU8/tvf/tZpfPLkyYYk44svvjAM4+fvCwICAsx/dxuGYbzwwguGJMNutxtlZWXm+IwZM5z+jW4Yrr+PAFA32M4FaISCgoLM5xUVFfrxxx/VqVMnhYSE6PPPPzePrVmzRvHx8erdu7c5FhoaqpSUFKfzffjhhyopKdEdd9yhH374wXxYrVb1799fH330UZ1fEwCg6brYee32229XRUWF3nrrLXPsgw8+UElJiW6//XZJUmVlpdatW6eRI0cqIiLCrOvUqZOGDBlSq74OHTqknTt3auzYsWrRooU5fsMNN6hXr161vk5XVq5cqaqqKs2ePVs+Ps7fvldv+7Ju3TqdPn1a06ZNc6qZMGGCgoOD9d5770mSudJ87dq1OnnyZI2f99Zbb6mqqkqjR492mvftdrs6d+58WfP+jh07tG/fPt1555368ccfzXOfOHFCgwYNUm5urnmTtNtvv11HjhzRhg0bzPf/85//VFVVlfnnWVxcrPXr12v06NE6duyYeb4ff/xRycnJ2rdvn7777rtL7hcAgEtVvbq82tSpUyVJq1evNscGDRrk9Fvd/fv3lySNGjVKLVu2PG/8m2++qat2AVwA27kAjdBPP/2kjIwMLV68WN99950MwzCPnb3f6YEDBxQfH3/e+zt16uT0et++fZKkG2+8scbPCw4OdkfbAADU6GLntV/96lfq1q2bXnvtNY0fP17Sz1t/tGnTxpzDjhw5op9++um8uU46f/67kAMHDrh8X6dOnZwC/svx9ddfy8fHRz169LhgL127dnUa9/f3V4cOHczjMTExmj59up555hllZ2fr+uuv129/+1vdddddZsC+b98+GYahzp071/hZl3Mjs+rvKc7dcudspaWlatWqlbln+muvvaZBgwZJ+vnPs3fv3urSpYskaf/+/TIMQ7NmzdKsWbNqPN+RI0fcvr0OAAAXcu482rFjR/n4+DjtbR4VFeVUUz0XR0ZG1jh+9OjROugUwMUgRAcaoalTp2rx4sWaNm2a4uPjZbPZZLFYNGbMGHN1V21Uv+eVV16R3W4/77ivL3+VAADqTm3mtdtvv13z5s3TDz/8oJYtW2rVqlW64447GuRc5eomppWVlXX6uX/96191zz336O2339YHH3yg++67z9yv9corr1RVVZUsFovef/99Wa3W895/9qr72qr+8/rLX/7i9JtwNZ0/ICBAI0eO1IoVK5SVlaWioiJ98skneuKJJ84734MPPmjueX+u2v5wBACAulDTvF/TPPtL42cvJABQvxrevyYAXLZ//vOfGjdunP7617+aY6dOnVJJSYlTXXR0tPbv33/e+88d69ixoySpXbt2SkpKcn/DAAD8goud16SfQ/S0tDS9+eabCgsLU1lZmcaMGWMeb9eunQIDAy9q/ruQ6Ohol++7mHO1atWqxmuoXjVerWPHjqqqqtKePXtcBs/Vvezdu1cdOnQwx0+fPq2CgoLz5u9evXqpV69emjlzpj799FMlJCTo+eefV3p6ujp27CjDMBQTE2Ou+HaX6u8pgoODL+p7ittvv11LlixRTk6OvvzySxmGYW7lIsm8Vj8/P75HAQA0KPv27VNMTIz5ev/+/aqqqnLavgWA92BPdKARsl
qt5/2E+rnnnjtvZVtycrLy8vK0Y8cOc6y4uFjZ2dnn1QUHB+uJJ55QRUXFeZ/3/fffu695AADOcbHzmiR1795dvXr
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def plot_numeric_boxplots(dataframe):\n",
" # Фильтрация числовых столбцов\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" \n",
" # Построение графиков\n",
" if numeric_columns:\n",
" plt.figure(figsize=(15, 5))\n",
" \n",
" for i, col in enumerate(numeric_columns):\n",
" if col != 'id':\n",
" plt.subplot(1, len(numeric_columns), i + 1)\n",
" sns.boxplot(y=dataframe[col])\n",
" plt.title(f'{col}')\n",
" plt.ylabel('')\n",
" plt.xlabel(col)\n",
" \n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"Нет подходящих числовых столбцов для построения графиков.\")\n",
"\n",
"plot_numeric_boxplots(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Видим выбросы в столбцах со средним уровнем глюкозы и в столбце bmi (индекс массы тела). устраним выбросы - поставим верхние и нижние границы</p>"
]
},
{
"cell_type": "code",
"execution_count": 334,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcoElEQVR4nO3dfZiVVb0//vcgMIPADIIxAwmI+AAGPhuSZqQk4kOgnJSiNPPIqfCRThpH0SSJ9Fh5NMQ0j2lBmqUezcQUBcpQEUVNjdBQSAVKZUZQRmT2749+7m8j7BIFBobX67ruS/Za6177cw9erJn33HvdZYVCoRAAAAAAAGAtLZq6AAAAAAAA2FwJ0QEAAAAAoAQhOgAAAAAAlCBEBwAAAACAEoToAAAAAABQghAdAAAAAABKEKIDAAAAAEAJQnQAAAAAAChBiA4AAAAAACUI0QEAYBMoKyvLN7/5zaYuY5OaMWNGysrKMmPGjKYuZbOqBYDm65vf/GbKysryt7/9baO+zxe/+MXsuOOOG/U9gP9HiA4AAAAAACW0bOoCAAAAAID37pprrklDQ0NTlwFbDSE6AAAAAGxBWrVq1dQlwFbFdi7QDL3wwgv56le/mt122y1t2rRJp06d8pnPfCbPP//8WmOfeOKJfOITn0ibNm2yww475KKLLsp1112XsrKytcbfdddd+fjHP562bdumffv2OfLII/PUU09tmosCYKv1Xta1Rx55JGVlZbn++uvXOv/uu+9OWVlZfvWrXxXbZsyYkf322y8VFRXp1atXfvjDHxb3MF1fN998c3bfffdUVFSkb9++ufXWW9/TPqWlxpSq46c//Wk++tGPZtttt812222Xgw8+OL/5zW8ajbnyyivzkY98JOXl5enatWtGjx6d5cuXNxqzYMGCDB8+PDU1NamoqMgOO+yQESNGpLa2dq3323fffdOmTZt07NgxI0aMyOLFi9/T1+Rfeeihh3L44Yenqqoq2267bT7xiU/kgQceKPb/4he/SFlZWWbOnLnWuT/84Q9TVlaWP/zhD8W2P/7xj/m3f/u3dOzYMRUVFdlvv/1y++23b5BaAeD9+Nvf/pbjjjsulZWV6dSpU84444ysWrWq2F9WVpZTTz21+H1EmzZtMmDAgDz55JNJ/r7e7bzzzqmoqMjAgQPX+vncnuiwabkTHZqhOXPm5Pe//31GjBiRHXbYIc8//3wmT56cgQMH5umnn862226bJHnxxRfzyU9+MmVlZRk7dmzatm2bH/3oRykvL19rzp/85Cc58cQTM3jw4Fx88cV54403Mnny5Bx00EF57LHHLN4AbDTvZV3bb7/9stNOO+XnP/95TjzxxEbn33TTTdluu+0yePDgJMljjz2Www8/PF26dMmFF16YNWvWZPz48fnQhz603rXdeeedOf7449OvX79MnDgxr732Wk4++eR8+MMf3iDX/o4LL7ww3/zmN/Oxj30s48ePT+vWrfPQQw/lvvvuy2GHHZbk7+H7hRdemEGDBuUrX/lK5s+fn8mTJ2fOnDl54IEH0qpVq7z11lsZPHhw6uvrc9ppp6WmpiYvvvhifvWrX2X58uWpqqpKkkyYMCHjxo3Lcccdl3//93/PX//611xxxRU5+OCD89hjj6VDhw7v+1ruu+++DBkyJPvuu28uuOCCtGjRItddd10OOeSQ/Pa3v81HP/rRHHnkkWnXrl1+/vOf5xOf+ESj82+66aZ85CMfSd++fZMkTz31VA488MB8+MMfzje+8Y20bds2P//5zzNs2LD88pe/zDHHHPO+awWA9+u4447LjjvumIkTJ+bBBx/M5Zdfntdeey033HBDccxvf/vb3H777Rk9enSSZOLEiTnqqKNy9tln58orr8xXv/rVvPbaa7nkkkvypS99Kffdd19TXQ5QAJqdN954Y6222bNnF5IUbrjhhmLbaaedVigrKys89thjxbZXXnml0LFjx0KSwsKFCwuFQq
Hw+uuvFzp06FA45ZRTGs25ZMmSQlVV1VrtALAhvdd1bezYsYVWrVoVXn311WJbfX19oUOHDoUvfelLxbajjz66sO222xZefPHFYtuCBQsKLVu2LKzvt8f9+vUr7LDDDoXXX3+92DZjxoxCkkKPHj0ajU1SuOCCC4qvTzzxxLXGFAqFwgUXXNCojgULFhRatGhROOaYYwpr1qxpNLahoaFQKBQKy5YtK7Ru3bpw2GGHNRrzgx/8oJCk8L//+7+FQqFQeOyxxwpJCjfffHPJa3r++ecL22yzTWHChAmN2p988slCy5Yt12r/Z+6///5CksL9999frHeXXXYpDB48uFh7ofD3v+OePXsWPvWpTxXbPvvZzxY6d+5cePvtt4ttL7/8cqFFixaF8ePHF9sOPfTQQr9+/QqrVq1q9HX52Mc+Vthll11K1gIAG8M76/inP/3pRu1f/epXC0kKjz/+eKFQ+Pv3BeXl5cWfuwuFQuGHP/xhIUmhpqamUFdXV2wfO3Zso5/RC4XS30cAG4ftXKAZatOmTfHPq1evziuvvJKdd945HTp0yKOPPlrsmzZtWgYMGJC99tqr2NaxY8eMHDmy0Xz33HNPli9fns9+9rP529/+Vjy22Wab9O/fP/fff/9GvyYAtl7vdV07/vjjs3r16txyyy3Ftt/85jdZvnx5jj/++CTJmjVrcu+992bYsGHp2rVrcdzOO++cIUOGrFddL730Up588smccMIJadeuXbH9E5/4RPr167fe11nKbbfdloaGhpx//vlp0aLxt+/vbPty77335q233sqZZ57ZaMwpp5ySysrK3HnnnUlSvNP87rvvzhtvvLHO97vlllvS0NCQ4447rtG6X1NTk1122eUDrfvz5s3LggUL8rnPfS6vvPJKce6VK1fm0EMPzaxZs4oPSTv++OOzbNmyzJgxo3j+L37xizQ0NBT/Pl999dXcd999Oe644/L6668X53vllVcyePDgLFiwIC+++OL7rhcA3q937i5/x2mnnZYk+fWvf11sO/TQQxt9qrt///5JkuHDh6d9+/Zrtf/5z3/eWOUC/4LtXKAZevPNNzNx4sRcd911efHFF1MoFIp9/7jf6QsvvJABAwasdf7OO+/c6PWCBQuSJIcccsg636+ysnJDlA0A6/Re17U999wzvXv3zk033ZSTTz45yd+3/th+++2La9iyZcvy5ptvrrXWJWuvf//KCy+8UPK8nXfeuVHA/0E899xzadGiRXbfffd/Wctuu+3WqL1169bZaaediv09e/bMmDFj8r3vfS9TpkzJxz/+8Xz605/O5z//+WLAvmDBghQKheyyyy7rfK8P8iCzd76nePeWO/+otrY22223XXHP9JtuuimHHnpokr//fe61117ZddddkyTPPvtsCoVCxo0bl3Hjxq1zvmXLlm3w7XUA4F959zraq1evtGjRotHe5t27d2805p21uFu3butsf+211zZCpcB7IUSHZui0007LddddlzPPPDMDBgxIVVVVysrKMmLEiOLdXevjnXN+8pOfpKamZq3+li39UwLAxrM+69rxxx+fCRMm5G9/+1vat2+f22+/PZ/97Gc3y7Wq1ENM16xZs1Hf97vf/W6++MUv5v/+7//ym9/8Jqeffnpxv9YddtghDQ0NKSsry1133ZVtttlmrfP/8a779fXO39d///d/N/ok3LrmLy8vz7Bhw3LrrbfmyiuvzNKlS/PAAw/k29/+9lrz/ed//mdxz/t3W99fjgDAxrCudX9d6+w/a//HGwmATWvz+2kC+MB+8Ytf5MQTT8x3v/vdYtuqVauyfPnyRuN69OiRZ599dq3z393Wq1evJEnnzp0zaNCgDV8wAPwT73VdS/4eol944YX55S9/merq6tTV1WXEiBHF/s6dO6eiouI9rX//So8ePUqe917m2m677dZ5De/cNf6OXr16paGhIU8//XTJ4PmdWubPn5+ddtqp2P7WW29l4cKFa63f/fr1S7
9+/XLeeefl97//fQ488MBcddVVueiii9KrV68UCoX07NmzeMf3hvLO9xSVlZXv6XuK448/Ptdff32mT5+eZ555JoV
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def remove_outliers(df):\n",
"\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" for column in numeric_columns:\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
" return df\n",
" \n",
"data = remove_outliers(data)\n",
"plot_numeric_boxplots(data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Теперь можно и к конструированию признаков приступить) данные ведь сбалансированы (в выборках)</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Унитарное кодирование категориальных признаков <br/> <br/>Применяем к категориальным (НЕ числовым) признакам: 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'</p>"
]
},
{
"cell_type": "code",
"execution_count": 335,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные после унитарного кодирования:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>stroke</th>\n",
" <th>gender_Male</th>\n",
" <th>gender_Other</th>\n",
" <th>ever_married_Yes</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>Residence_type_Urban</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>67.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>169.3575</td>\n",
" <td>36.6</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>61.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>169.3575</td>\n",
" <td>28.1</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>80.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>105.9200</td>\n",
" <td>32.5</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>60182</td>\n",
" <td>49.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>169.3575</td>\n",
" <td>34.4</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1665</td>\n",
" <td>79.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>169.3575</td>\n",
" <td>24.0</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>56669</td>\n",
" <td>81.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>169.3575</td>\n",
" <td>29.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>53882</td>\n",
" <td>74.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>70.0900</td>\n",
" <td>27.4</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>10434</td>\n",
" <td>69.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>94.3900</td>\n",
" <td>22.8</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>27419</td>\n",
" <td>59.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>76.1500</td>\n",
" <td>28.1</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>60491</td>\n",
" <td>78.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>58.5700</td>\n",
" <td>24.2</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id age hypertension heart_disease avg_glucose_level bmi stroke \\\n",
"0 9046 67.0 0 1 169.3575 36.6 1 \n",
"1 51676 61.0 0 0 169.3575 28.1 1 \n",
"2 31112 80.0 0 1 105.9200 32.5 1 \n",
"3 60182 49.0 0 0 169.3575 34.4 1 \n",
"4 1665 79.0 1 0 169.3575 24.0 1 \n",
"5 56669 81.0 0 0 169.3575 29.0 1 \n",
"6 53882 74.0 1 1 70.0900 27.4 1 \n",
"7 10434 69.0 0 0 94.3900 22.8 1 \n",
"8 27419 59.0 0 0 76.1500 28.1 1 \n",
"9 60491 78.0 0 0 58.5700 24.2 1 \n",
"\n",
" gender_Male gender_Other ever_married_Yes work_type_Never_worked \\\n",
"0 True False True False \n",
"1 False False True False \n",
"2 True False True False \n",
"3 False False True False \n",
"4 False False True False \n",
"5 True False True False \n",
"6 True False True False \n",
"7 False False False False \n",
"8 False False True False \n",
"9 False False True False \n",
"\n",
" work_type_Private work_type_Self-employed work_type_children \\\n",
"0 True False False \n",
"1 False True False \n",
"2 True False False \n",
"3 True False False \n",
"4 False True False \n",
"5 True False False \n",
"6 True False False \n",
"7 True False False \n",
"8 True False False \n",
"9 True False False \n",
"\n",
" Residence_type_Urban smoking_status_formerly smoked \\\n",
"0 True True \n",
"1 False False \n",
"2 False False \n",
"3 True False \n",
"4 False False \n",
"5 True True \n",
"6 False False \n",
"7 True False \n",
"8 False False \n",
"9 True False \n",
"\n",
" smoking_status_never smoked smoking_status_smokes \n",
"0 False False \n",
"1 True False \n",
"2 True False \n",
"3 False True \n",
"4 True False \n",
"5 False False \n",
"6 True False \n",
"7 True False \n",
"8 False False \n",
"9 False False "
]
},
"execution_count": 335,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-Hot Encoding\n",
"categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
"data_edit_categories = pd.get_dummies(data, columns=categorical_columns, drop_first=True)\n",
"\n",
"print(\"Данные после унитарного кодирования:\")\n",
"data_edit_categories.head(10)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Дискретизация числовых признаков<br/><br/>Числовые признаки, такие как 'age', 'avg_glucose_level', 'bmi', можно разделить на категории (биннинг).</p>\n"
]
},
{
"cell_type": "code",
"execution_count": 336,
"metadata": {},
"outputs": [],
"source": [
"# data_edit_categories['age_bins'] = pd.cut(data_edit_categories['age'], bins=[0, 18, 30, 50, 100], labels=['ребенок', 'молодой', 'средний', 'пожилой'])\n",
"# data_edit_categories['bmi_bins'] = pd.cut(data_edit_categories['bmi'], bins=[0, 18.5, 25, 30, 50], labels=['низкий', 'норма', 'избыток', 'ожирение'])\n",
"\n",
"# print(\"Данные после дискретизации:\")\n",
"# data_edit_categories[['age_bins', 'bmi_bins']].head(10)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Ручной синтез новых признаков <br/><br/>\n",
"<li>Возрастной индекс глюкозы: age * avg_glucose_level\n",
"<li>Индекс массы тела с поправкой на глюкозу: bmi / avg_glucose_level </p>\n"
]
},
{
"cell_type": "code",
"execution_count": 337,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные после синтеза новых признаков:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age_glucose_index</th>\n",
" <th>bmi_glucose_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11346.9525</td>\n",
" <td>0.216111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>10330.8075</td>\n",
" <td>0.165921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8473.6000</td>\n",
" <td>0.306835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>8298.5175</td>\n",
" <td>0.203121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13379.2425</td>\n",
" <td>0.141712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>13717.9575</td>\n",
" <td>0.171235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5186.6600</td>\n",
" <td>0.390926</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>6512.9100</td>\n",
" <td>0.241551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>4492.8500</td>\n",
" <td>0.369009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>4568.4600</td>\n",
" <td>0.413181</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age_glucose_index bmi_glucose_ratio\n",
"0 11346.9525 0.216111\n",
"1 10330.8075 0.165921\n",
"2 8473.6000 0.306835\n",
"3 8298.5175 0.203121\n",
"4 13379.2425 0.141712\n",
"5 13717.9575 0.171235\n",
"6 5186.6600 0.390926\n",
"7 6512.9100 0.241551\n",
"8 4492.8500 0.369009\n",
"9 4568.4600 0.413181"
]
},
"execution_count": 337,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_edit_categories['age_glucose_index'] = data_edit_categories['age'] * data_edit_categories['avg_glucose_level']\n",
"data_edit_categories['bmi_glucose_ratio'] = data_edit_categories['bmi'] / data_edit_categories['avg_glucose_level']\n",
"\n",
"print(\"Данные после синтеза новых признаков:\")\n",
"data_edit_categories[['age_glucose_index', 'bmi_glucose_ratio']].head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Масштабирование признаков<br/><br/>Применяем нормализацию (для сжатия в диапазон [0, 1]) и стандартизацию (для приведения к среднему 0 и стандартному отклонению 1)</p>"
]
},
{
"cell_type": "code",
"execution_count": 338,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные после нормализации:\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>stroke</th>\n",
" <th>gender_Male</th>\n",
" <th>gender_Other</th>\n",
" <th>ever_married_Yes</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>Residence_type_Urban</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" <th>age_glucose_index</th>\n",
" <th>bmi_glucose_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>0.816895</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>0.730556</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>11346.9525</td>\n",
" <td>0.216111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>0.743652</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.494444</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10330.8075</td>\n",
" <td>0.165921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>0.975586</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.444688</td>\n",
" <td>0.616667</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>8473.6000</td>\n",
" <td>0.306835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>60182</td>\n",
" <td>0.597168</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.669444</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>8298.5175</td>\n",
" <td>0.203121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1665</td>\n",
" <td>0.963379</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.380556</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>13379.2425</td>\n",
" <td>0.141712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5105</th>\n",
" <td>18234</td>\n",
" <td>0.975586</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.250618</td>\n",
" <td>0.494444</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>6700.0000</td>\n",
" <td>0.335522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5106</th>\n",
" <td>44873</td>\n",
" <td>0.987793</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.613459</td>\n",
" <td>0.825000</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10141.2000</td>\n",
" <td>0.319489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5107</th>\n",
" <td>19723</td>\n",
" <td>0.426270</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.243965</td>\n",
" <td>0.563889</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>2904.6500</td>\n",
" <td>0.368719</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5108</th>\n",
" <td>37544</td>\n",
" <td>0.621582</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.973148</td>\n",
" <td>0.425000</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>8480.7900</td>\n",
" <td>0.153948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5109</th>\n",
" <td>44679</td>\n",
" <td>0.536133</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.264011</td>\n",
" <td>0.441667</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>3752.3200</td>\n",
" <td>0.307223</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5110 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" id age hypertension heart_disease avg_glucose_level \\\n",
"0 9046 0.816895 0 1 1.000000 \n",
"1 51676 0.743652 0 0 1.000000 \n",
"2 31112 0.975586 0 1 0.444688 \n",
"3 60182 0.597168 0 0 1.000000 \n",
"4 1665 0.963379 1 0 1.000000 \n",
"... ... ... ... ... ... \n",
"5105 18234 0.975586 1 0 0.250618 \n",
"5106 44873 0.987793 0 0 0.613459 \n",
"5107 19723 0.426270 0 0 0.243965 \n",
"5108 37544 0.621582 0 0 0.973148 \n",
"5109 44679 0.536133 0 0 0.264011 \n",
"\n",
" bmi stroke gender_Male gender_Other ever_married_Yes \\\n",
"0 0.730556 1 True False True \n",
"1 0.494444 1 False False True \n",
"2 0.616667 1 True False True \n",
"3 0.669444 1 False False True \n",
"4 0.380556 1 False False True \n",
"... ... ... ... ... ... \n",
"5105 0.494444 0 False False True \n",
"5106 0.825000 0 False False True \n",
"5107 0.563889 0 False False True \n",
"5108 0.425000 0 True False True \n",
"5109 0.441667 0 False False True \n",
"\n",
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
"0 False True False \n",
"1 False False True \n",
"2 False True False \n",
"3 False True False \n",
"4 False False True \n",
"... ... ... ... \n",
"5105 False True False \n",
"5106 False False True \n",
"5107 False False True \n",
"5108 False True False \n",
"5109 False False False \n",
"\n",
" work_type_children Residence_type_Urban \\\n",
"0 False True \n",
"1 False False \n",
"2 False False \n",
"3 False True \n",
"4 False False \n",
"... ... ... \n",
"5105 False True \n",
"5106 False True \n",
"5107 False False \n",
"5108 False False \n",
"5109 False True \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
"0 True False \n",
"1 False True \n",
"2 False True \n",
"3 False False \n",
"4 False True \n",
"... ... ... \n",
"5105 False True \n",
"5106 False True \n",
"5107 False True \n",
"5108 True False \n",
"5109 False False \n",
"\n",
" smoking_status_smokes age_glucose_index bmi_glucose_ratio \n",
"0 False 11346.9525 0.216111 \n",
"1 False 10330.8075 0.165921 \n",
"2 False 8473.6000 0.306835 \n",
"3 True 8298.5175 0.203121 \n",
"4 False 13379.2425 0.141712 \n",
"... ... ... ... \n",
"5105 False 6700.0000 0.335522 \n",
"5106 False 10141.2000 0.319489 \n",
"5107 False 2904.6500 0.368719 \n",
"5108 False 8480.7900 0.153948 \n",
"5109 False 3752.3200 0.307223 \n",
"\n",
"[5110 rows x 20 columns]"
]
},
"execution_count": 338,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n",
"# Min-max normalization of the continuous features (MinMaxScaler with default settings).\n",
"# NOTE(review): the scaler is fitted on the full dataset, before the train/test split\n",
"# performed in a later cell, so held-out rows influence the fitted min/max.\n",
"# Consider fitting on the training split only.\n",
"scaler = MinMaxScaler()\n",
"# Instantiated but not applied in this cell; standardization is the unused alternative.\n",
"standardizer = StandardScaler()\n",
"\n",
"numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
"scaled_values = scaler.fit_transform(data_edit_categories[numeric_columns])\n",
"data_edit_categories[numeric_columns] = scaled_values\n",
"\n",
"print(\"Данные после нормализации:\\n\")\n",
"data_edit_categories"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Конструирование признаков с применением фреймворка Featuretools</p>"
]
},
{
"cell_type": "code",
"execution_count": 339,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Столбцы в data: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 0\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Сгенерированные признаки:\n",
" gender age hypertension heart_disease ever_married work_type \\\n",
"id \n",
"9046 Male 67.0 0 1 True Private \n",
"51676 Female 61.0 0 0 True Self-employed \n",
"31112 Male 80.0 0 1 True Private \n",
"60182 Female 49.0 0 0 True Private \n",
"1665 Female 79.0 1 0 True Self-employed \n",
"\n",
" Residence_type avg_glucose_level bmi smoking_status stroke \n",
"id \n",
"9046 Urban 169.3575 36.6 formerly smoked 1 \n",
"51676 Rural 169.3575 28.1 never smoked 1 \n",
"31112 Rural 105.9200 32.5 never smoked 1 \n",
"60182 Urban 169.3575 34.4 smokes 1 \n",
"1665 Rural 169.3575 24.0 never smoked 1 \n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gender</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>ever_married</th>\n",
" <th>work_type</th>\n",
" <th>Residence_type</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>smoking_status</th>\n",
" <th>stroke</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9046</th>\n",
" <td>Male</td>\n",
" <td>67.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>169.3575</td>\n",
" <td>36.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51676</th>\n",
" <td>Female</td>\n",
" <td>61.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Self-employed</td>\n",
" <td>Rural</td>\n",
" <td>169.3575</td>\n",
" <td>28.1</td>\n",
" <td>never smoked</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31112</th>\n",
" <td>Male</td>\n",
" <td>80.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>Private</td>\n",
" <td>Rural</td>\n",
" <td>105.9200</td>\n",
" <td>32.5</td>\n",
" <td>never smoked</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60182</th>\n",
" <td>Female</td>\n",
" <td>49.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>169.3575</td>\n",
" <td>34.4</td>\n",
" <td>smokes</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1665</th>\n",
" <td>Female</td>\n",
" <td>79.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Self-employed</td>\n",
" <td>Rural</td>\n",
" <td>169.3575</td>\n",
" <td>24.0</td>\n",
" <td>never smoked</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18234</th>\n",
" <td>Female</td>\n",
" <td>80.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Private</td>\n",
" <td>Urban</td>\n",
" <td>83.7500</td>\n",
" <td>28.1</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44873</th>\n",
" <td>Female</td>\n",
" <td>81.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Self-employed</td>\n",
" <td>Urban</td>\n",
" <td>125.2000</td>\n",
" <td>40.0</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19723</th>\n",
" <td>Female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Self-employed</td>\n",
" <td>Rural</td>\n",
" <td>82.9900</td>\n",
" <td>30.6</td>\n",
" <td>never smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37544</th>\n",
" <td>Male</td>\n",
" <td>51.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Private</td>\n",
" <td>Rural</td>\n",
" <td>166.2900</td>\n",
" <td>25.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44679</th>\n",
" <td>Female</td>\n",
" <td>44.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>Govt_job</td>\n",
" <td>Urban</td>\n",
" <td>85.2800</td>\n",
" <td>26.2</td>\n",
" <td>Unknown</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5110 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" gender age hypertension heart_disease ever_married work_type \\\n",
"id \n",
"9046 Male 67.0 0 1 True Private \n",
"51676 Female 61.0 0 0 True Self-employed \n",
"31112 Male 80.0 0 1 True Private \n",
"60182 Female 49.0 0 0 True Private \n",
"1665 Female 79.0 1 0 True Self-employed \n",
"... ... ... ... ... ... ... \n",
"18234 Female 80.0 1 0 True Private \n",
"44873 Female 81.0 0 0 True Self-employed \n",
"19723 Female 35.0 0 0 True Self-employed \n",
"37544 Male 51.0 0 0 True Private \n",
"44679 Female 44.0 0 0 True Govt_job \n",
"\n",
" Residence_type avg_glucose_level bmi smoking_status stroke \n",
"id \n",
"9046 Urban 169.3575 36.6 formerly smoked 1 \n",
"51676 Rural 169.3575 28.1 never smoked 1 \n",
"31112 Rural 105.9200 32.5 never smoked 1 \n",
"60182 Urban 169.3575 34.4 smokes 1 \n",
"1665 Rural 169.3575 24.0 never smoked 1 \n",
"... ... ... ... ... ... \n",
"18234 Urban 83.7500 28.1 never smoked 0 \n",
"44873 Urban 125.2000 40.0 never smoked 0 \n",
"19723 Rural 82.9900 30.6 never smoked 0 \n",
"37544 Rural 166.2900 25.6 formerly smoked 0 \n",
"44679 Urban 85.2800 26.2 Unknown 0 \n",
"\n",
"[5110 rows x 11 columns]"
]
},
"execution_count": 339,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import featuretools as ft\n",
"\n",
"# Sanity check before feature synthesis: list the columns and the per-column missing-value counts.\n",
"print(\"Столбцы в data:\", data.columns.tolist())\n",
"print(data.isnull().sum())\n",
"\n",
"# Build the EntitySet (the main Featuretools structure) holding a single dataframe indexed by `id`.\n",
"entity = ft.EntitySet(id=\"stroke_prediction\")\n",
"entity = entity.add_dataframe(\n",
"    dataframe_name=\"data\",\n",
"    dataframe=data,\n",
"    index=\"id\",\n",
")\n",
"\n",
"# Generate features with deep feature synthesis.\n",
"# With only one dataframe in the EntitySet, Featuretools caps the depth at 1 and warned\n",
"# when max_depth=2 was requested, so depth 1 is requested explicitly; the result is the same.\n",
"feature_matrix, feature_defs = ft.dfs(\n",
"    entityset=entity,\n",
"    target_dataframe_name=\"data\",  # dataframe to build features for\n",
"    max_depth=1,  # deeper features require related dataframes\n",
")\n",
"\n",
"print(\"Сгенерированные признаки:\")\n",
"print(feature_matrix.head())\n",
"\n",
"# Save the results.\n",
"# NOTE(review): index=False omits the `id` index from the CSV - confirm this is intended,\n",
"# since rows in the saved file can then no longer be matched back to patients.\n",
"feature_matrix.to_csv(\"./csv/generated_features_copy.csv\", index=False)\n",
"feature_matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Теперь разобьём данные на обучающую (80%), тестовую (10%) и контрольную (10%) выборки</p>"
]
},
{
"cell_type": "code",
"execution_count": 340,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Размеры выборок:\n",
"Обучающая выборка: (4088, 18)\n",
"Тестовая выборка: (511, 18)\n",
"Контрольная выборка: (511, 18)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>stroke</th>\n",
" <th>gender_Male</th>\n",
" <th>gender_Other</th>\n",
" <th>ever_married_Yes</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>Residence_type_Urban</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" <th>age_glucose_index</th>\n",
" <th>bmi_glucose_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9046</td>\n",
" <td>0.816895</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" <td>0.730556</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>11346.9525</td>\n",
" <td>0.216111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>51676</td>\n",
" <td>0.743652</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.494444</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10330.8075</td>\n",
" <td>0.165921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>31112</td>\n",
" <td>0.975586</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.444688</td>\n",
" <td>0.616667</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>8473.6000</td>\n",
" <td>0.306835</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>60182</td>\n",
" <td>0.597168</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.669444</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>8298.5175</td>\n",
" <td>0.203121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1665</td>\n",
" <td>0.963379</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.380556</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>13379.2425</td>\n",
" <td>0.141712</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5105</th>\n",
" <td>18234</td>\n",
" <td>0.975586</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.250618</td>\n",
" <td>0.494444</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>6700.0000</td>\n",
" <td>0.335522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5106</th>\n",
" <td>44873</td>\n",
" <td>0.987793</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.613459</td>\n",
" <td>0.825000</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>10141.2000</td>\n",
" <td>0.319489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5107</th>\n",
" <td>19723</td>\n",
" <td>0.426270</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.243965</td>\n",
" <td>0.563889</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>2904.6500</td>\n",
" <td>0.368719</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5108</th>\n",
" <td>37544</td>\n",
" <td>0.621582</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.973148</td>\n",
" <td>0.425000</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>8480.7900</td>\n",
" <td>0.153948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5109</th>\n",
" <td>44679</td>\n",
" <td>0.536133</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.264011</td>\n",
" <td>0.441667</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>3752.3200</td>\n",
" <td>0.307223</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5110 rows × 20 columns</p>\n",
"</div>"
],
"text/plain": [
" id age hypertension heart_disease avg_glucose_level \\\n",
"0 9046 0.816895 0 1 1.000000 \n",
"1 51676 0.743652 0 0 1.000000 \n",
"2 31112 0.975586 0 1 0.444688 \n",
"3 60182 0.597168 0 0 1.000000 \n",
"4 1665 0.963379 1 0 1.000000 \n",
"... ... ... ... ... ... \n",
"5105 18234 0.975586 1 0 0.250618 \n",
"5106 44873 0.987793 0 0 0.613459 \n",
"5107 19723 0.426270 0 0 0.243965 \n",
"5108 37544 0.621582 0 0 0.973148 \n",
"5109 44679 0.536133 0 0 0.264011 \n",
"\n",
" bmi stroke gender_Male gender_Other ever_married_Yes \\\n",
"0 0.730556 1 True False True \n",
"1 0.494444 1 False False True \n",
"2 0.616667 1 True False True \n",
"3 0.669444 1 False False True \n",
"4 0.380556 1 False False True \n",
"... ... ... ... ... ... \n",
"5105 0.494444 0 False False True \n",
"5106 0.825000 0 False False True \n",
"5107 0.563889 0 False False True \n",
"5108 0.425000 0 True False True \n",
"5109 0.441667 0 False False True \n",
"\n",
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
"0 False True False \n",
"1 False False True \n",
"2 False True False \n",
"3 False True False \n",
"4 False False True \n",
"... ... ... ... \n",
"5105 False True False \n",
"5106 False False True \n",
"5107 False False True \n",
"5108 False True False \n",
"5109 False False False \n",
"\n",
" work_type_children Residence_type_Urban \\\n",
"0 False True \n",
"1 False False \n",
"2 False False \n",
"3 False True \n",
"4 False False \n",
"... ... ... \n",
"5105 False True \n",
"5106 False True \n",
"5107 False False \n",
"5108 False False \n",
"5109 False True \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
"0 True False \n",
"1 False True \n",
"2 False True \n",
"3 False False \n",
"4 False True \n",
"... ... ... \n",
"5105 False True \n",
"5106 False True \n",
"5107 False True \n",
"5108 True False \n",
"5109 False False \n",
"\n",
" smoking_status_smokes age_glucose_index bmi_glucose_ratio \n",
"0 False 11346.9525 0.216111 \n",
"1 False 10330.8075 0.165921 \n",
"2 False 8473.6000 0.306835 \n",
"3 True 8298.5175 0.203121 \n",
"4 False 13379.2425 0.141712 \n",
"... ... ... ... \n",
"5105 False 6700.0000 0.335522 \n",
"5106 False 10141.2000 0.319489 \n",
"5107 False 2904.6500 0.368719 \n",
"5108 False 8480.7900 0.153948 \n",
"5109 False 3752.3200 0.307223 \n",
"\n",
"[5110 rows x 20 columns]"
]
},
"execution_count": 340,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Fixed seed so the split (and every result derived from it) is reproducible on re-run.\n",
"SPLIT_RANDOM_STATE = 42\n",
"\n",
"# Features and target: `id` and the label `stroke` are excluded from the feature matrix.\n",
"X = data_edit_categories.drop(columns=['id', 'stroke'])\n",
"y = data_edit_categories['stroke']\n",
"\n",
"# Training set: keep 80% of the rows, hold out 20%, stratified by the target.\n",
"X_train, X_temp, y_train, y_temp = train_test_split(\n",
"    X, y, test_size=0.2, random_state=SPLIT_RANDOM_STATE, stratify=y\n",
")\n",
"\n",
"# Test and control sets: split the hold-out in half (10% of the data each), again stratified.\n",
"X_test, X_control, y_test, y_control = train_test_split(\n",
"    X_temp, y_temp, test_size=0.5, random_state=SPLIT_RANDOM_STATE, stratify=y_temp\n",
")\n",
"\n",
"print(\"\\nРазмеры выборок:\")\n",
"print(f\"Обучающая выборка: {X_train.shape}\")\n",
"print(f\"Тестовая выборка: {X_test.shape}\")\n",
"print(f\"Контрольная выборка: {X_control.shape}\")\n",
"\n",
"data_edit_categories"
]
},
{
"cell_type": "code",
"execution_count": 341,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stroke\n",
"0 4861\n",
"1 249\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9kUlEQVR4nO3deXyM9/7//2d2IZlEkIRSQmqJpa0oglpDqlFVnFYpqpyWhhYtTs6ptfXVcmorqqeLpaUt1XLKsUQUVSkaja042sahjSRCk0ErieT6/dFP5mckthGZxPW4325zu5n39b7e1+uaTDJP1/W+rnExDMMQAACAibk6uwAAAABnIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABABz2/PPPq3Pnzs4uw2bx4sVycXHRd999d8tj5ebmqkaNGlqwYEExVIbSjkCEO1rBH8eCR7ly5VS3bl0NHz5caWlpzi4PKNOSk5P13nvv6e9///tNrbdz505NmjRJmZmZt6ewYuLh4aHRo0dr6tSpunjxorPLwW1GIIIpTJkyRR9++KHmzZunVq1a6e2331ZERIR+//13Z5cGlFlz5sxRSEiIOnTocFPr7dy5U5MnTy71gUiSBg0apIyMDC1fvtzZpeA2IxDBFLp27aqnnnpKQ4YM0eLFizVy5EglJydrzZo1zi4NKJNyc3O1bNkyPf7447d1O/n5+U49OuPv768uXbpo8eLFTqsBJYNABFPq2LGjpD8P+UvS2bNn9fLLL6tx48by8fGRxWJR165dtW/fvkLrXrx4UZMmTVLdunVVrlw5Va1aVT179tRPP/0kSTp+/LjdaborH+3bt7eNtXXrVrm4uOjTTz/V3//+dwUHB6tChQrq3r27Tp48WWjbu3bt0kMPPSQ/Pz+VL19e7dq10zfffFPkPrZv377I7U+aNKlQ348++kjh4eHy9vZWQECA+vTpU+T2r7Vvl8vPz9fs2bPVsGFDlStXTkFBQXruuef022+/2fWrVauWunXrVmg7w4cPLzRmUbXPmDGj0GsqSdnZ2Zo4caJCQ0Pl5eWlGjVqaOzYscrOzi7ytbpc+/btC403depUubq6FjpKcKOvxz//+U+1atVKlSpVkre3t8LDw/XZZ58Vuf2PPvpIzZs3V/ny5VWxYkW1bdtWmzZtsuuzfv16tWvXTr6+vrJYLHrggQcK1bZy5Urbz7Ry5cp66qmn9Ouvv9r1efrpp+1qrlixotq3b6+vv/76uq/Tjh07lJGRocjIyELL3nrrLTVs2NC2D82aNbPVN2nSJI0ZM0aSFBISYtv28ePHJf35cx4+fLiWLVumhg0bysvLSxs2bJAkff/99+ratassFot8fHzUqVMnffvtt9et9bffflPz5s1VvXp1HT16VNLNvUc6d+6sHTt26OzZs9fdFsoud2cXADhDQXipVKmSJOnnn3/W6tWr9Ze//EUhISFKS0vTO++8o3bt2umHH35QtWrVJEl5eXnq1q2b4uPj1adPH7344os6d+6c4uLidPDgQdWpU8e2jSeffFIPP/yw3XZjY2OLrGfq1KlycXHRuHHjlJ6ertmzZysyMlJJSUny9vaWJG3ZskVdu3ZVeHi4Jk6cKFdXVy1atEgdO3bU119/rebNmxcat3r16po2bZok6fz58xo2bFiR2x4/frwef/xxDRkyRKdPn9Zbb72ltm3b6vvvv5e/v3+hdZ599lk9+OCDkqTPP/9cX3zxhd3y5557TosXL9agQYP0wgsvKDk5WfPmzdP333+vb775Rh4eHkW+DjcjMzPTtm+Xy8/PV/fu3bVjxw49++yzatCggQ4cOKBZs2bpv//9r1avXn1T21m0aJFeeeUVvfnmm+rbt2+Rfa73esyZM0fdu3dXv379lJOTo08++UR/+ctftHbtWkVHR9v6TZ48WZMmTVKrVq00ZcoUeXp6ateuXdqyZY
u6dOki6c95cc8884waNmyo2NhY+fv76/vvv9eGDRts9RW89g888ICmTZumtLQ0zZkzR998802hn2nlypU1a9YsSdIvv/yiOXPm6OGHH9bJkyeL/NkX2Llzp1xcXHT//ffbtb/77rt64YUX1Lt3b7344ou6ePGi9u/fr127dqlv377q2bOn/vvf/+rjjz/WrFmzVLlyZUlSlSpVbGNs2bJFK1as0PDhw1W5cmXVqlVLhw4d0oMPPiiLxaKxY8fKw8ND77zzjtq3b69t27apRYsWRdaZkZGhzp076+zZs9q2bZvq1Klz0++R8PBwGYahnTt3FhngcYcwgDvYokWLDEnG5s2bjdOnTxsnT540PvnkE6NSpUqGt7e38csvvxiGYRgXL1408vLy7NZNTk42vLy8jClTptjaPvjgA0OSMXPmzELbys/Pt60nyZgxY0ahPg0bNjTatWtne/7VV18Zkoy77rrLsFqttvYVK1YYkow5c+bYxr7nnnuMqKgo23YMwzB+//13IyQkxOjcuXOhbbVq1cpo1KiR7fnp06cNScbEiRNtbcePHzfc3NyMqVOn2q174MABw93dvVD7sWPHDEnGkiVLbG0TJ040Lv9T8vXXXxuSjGXLltmtu2HDhkLtNWvWNKKjowvVHhMTY1z55+nK2seOHWsEBgYa4eHhdq/phx9+aLi6uhpff/213foLFy40JBnffPNNoe1drl27drbx1q1bZ7i7uxsvvfRSkX1v5PUwjD9/TpfLyckxGjVqZHTs2NFuLFdXV+Oxxx4r9F4s+JlnZmYavr6+RosWLYw//vijyD45OTlGYGCg0ahRI7s+a9euNSQZEyZMsLUNHDjQqFmzpt04//rXvwxJxu7du4vc5wJPPfWUUalSpULtjz76qNGwYcNrrjtjxgxDkpGcnFxomSTD1dXVOHTokF17jx49DE9PT+Onn36ytaWkpBi+vr5G27ZtbW0Fv/N79uwxTp06ZTRs2NCoXbu2cfz4cVufm32PpKSkGJKMN95445r7hbKNU2YwhcjISFWpUkU1atRQnz595OPjoy+++EJ33XWXJMnLy0uurn/+OuTl5enMmTPy8fFRvXr1tHfvXts4q1atUuXKlTVixIhC27jyNMnNGDBggHx9fW3Pe/furapVq+o///mPJCkpKUnHjh1T3759debMGWVkZCgjI0MXLlxQp06dtH37duXn59uNefHiRZUrV+6a2/3888+Vn5+vxx9/3DZmRkaGgoODdc899+irr76y65+TkyPpz9fralauXCk/Pz917tzZbszw8HD5+PgUGjM3N9euX0ZGxnXnjPz666966623NH78ePn4+BTafoMGDVS/fn27MQtOk165/avZvXu3Hn/8cfXq1UszZswoss+NvB6SbEf5pD9P32RlZenBBx+0e2+tXr1a+fn5mjBhgu29WKDgvRUXF6dz587pb3/7W6GfbUGf7777Tunp6Xr++eft+kRHR6t+/fpat26d3Xr5+fm21ygpKUlLly5V1apV1aBBg2vu05kzZ1SxYsVC7f7+/vrll1+0Z8+ea65/Le3atVNYWJjteV5enjZt2qQePXqodu3atvaqVauqb9++2rFjh6xWq90Yv/zyi9q1a6fc3Fxt375dNWvWtC272fdIwX5mZGQ4vE8o/ThlBlOYP3++6tatK3d3dwUFBalevXp2Hzr5+fmaM2eOFixYoOTkZOXl5dmWFZxWk/481VavXj25uxfvr84999xj99zFxUWhoaG2eRXHjh2TJA0cOPCqY2RlZdl9QGVkZBQa90rHjh2TYRhX7Xflqa2Cq4KuDCFXjpmVlaXAwMAil6enp9s937Rpk93pkhsxceJEVatWTc8991yhuTjHjh3T4cOHrzrmldsvyq+//qro6GhduHBBZ86cuWrYvZHXQ5LWrl2r1157TUlJSXZzVC4f96effpKrq6tdELhSwaneRo0aXbXP//73P0lSvXr1Ci2rX7++duzYYdd28u
RJu9eqatWqWrVq1XX3SZIMwyjUNm7cOG3evFnNmzdXaGiounTpor59+6p169bXHa9ASEiI3fPTp0/r999/L3KfGjR
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Number of samples per target class, computed on the full target `y`.\n",
"class_counts = y.value_counts()\n",
"print(class_counts)\n",
"\n",
"# Bar chart of the class distribution; labels are set on the Axes returned by seaborn.\n",
"ax = sns.barplot(x=class_counts.index, y=class_counts.values)\n",
"ax.set_title(\"Распределение классов (stroke)\")\n",
"ax.set_xlabel(\"Класс\")\n",
"ax.set_ylabel(\"Количество\")\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Напишем функцию и сделаем аугментацию данных</p>"
]
},
{
"cell_type": "code",
"execution_count": 342,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\n",
"\n",
"stroke\n",
"0 3889\n",
"1 199\n",
"Name: count, dtype: int64\n",
"\n",
"После оверсемплинга\n",
"\n",
"stroke\n",
"0 3889\n",
"1 777\n",
"Name: count, dtype: int64\n",
"\n",
"После балансировки данных (андерсемплинга)\n",
"\n",
"stroke\n",
"0 777\n",
"1 777\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGbCAYAAAAr/4yjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8YklEQVR4nO3deXhTVcIG8PfeJE3apmVpaUsBoew7aNlBdlAEFRFBEUX8UGdcZ3R0hnHcdRwHR0Vxm3EEQcYFERxEFkGQVUD2nQItlK2lBUq3NNv5/ii9Q2iBLklO7s378+mDTXPvfZOmfXvOuUkUIYQAERERAFV2ACIiCh0sBSIi0rAUiIhIw1IgIiINS4GIiDQsBSIi0rAUiIhIw1IgIiINS4GIiDQsBSKqkYcffhhDhgyRHUMzY8YMKIqCX3/9tcb7crlcaNSoET744AM/JNMHw5dC2QOk7MNms6Fly5Z49NFHkZWVJTseka6lp6fjk08+wZ///Ocqbbdu3Tq8+OKLOHfuXGCC+YnFYsGTTz6J1157DQ6HQ3acoDB8KZR5+eWXMWvWLEybNg29evXChx9+iJ49e6KoqEh2NCLdmjp1KlJSUjBgwIAqbbdu3Tq89NJLIV8KADBx4kTk5OTgP//5j+woQRE2pTBs2DCMHz8ekyZNwowZM/C73/0O6enp+O6772RHI9Ill8uF2bNnY8yYMQE9jtfrlfpXeu3atTF06FDMmDFDWoZgCptSuNTAgQMBlA5/AeDMmTP4wx/+gA4dOsButyM2NhbDhg3D9u3by23rcDjw4osvomXLlrDZbKhfvz5GjRqFQ4cOAQAyMjJ8pqwu/ejfv7+2r5UrV0JRFHz11Vf485//jKSkJERHR+OWW25BZmZmuWNv2LABN954I2rVqoWoqCj069cPa9eurfA29u/fv8Ljv/jii+Wu+/nnnyM1NRWRkZGoW7cu7rzzzgqPf6XbdjGv14t33nkH7dq1g81mQ2JiIh566CGcPXvW53pNmjTBiBEjyh3n0UcfLbfPirJPmTKl3H0KACUlJXjhhRfQvHlzWK1WNGrUCM888wxKSkoqvK8u1r9//3L7e+2116Cqarm/Fit7f7z55pvo1asX4uLiEBkZidTUVHzzzTcVHv/zzz9Ht27dEBUVhTp16qBv375YunSpz3UWLVqEfv36ISYmBrGxsejatWu5bHPmzNG+p/Hx8Rg/fjyOHz/uc5377rvPJ3OdOnXQv39/rF69+qr305o1a5CTk4PBgweX+9p7772Hdu3aabehS5cuWr4XX3wRTz/9NAAgJSVFO3ZGRgaA0u/zo48+itmzZ6Ndu3awWq1YvHgxAGDr1q0YNmwYYmNjYbfbMWjQIPzyyy9XzXr27Fl069YNDRs2xP79+wFU7TEyZMgQrFmzBmfOnLnqsfTOLDuALGW/wOPi4gAAhw8fxvz583HHHXcgJSUFWVlZ+Pjjj9GvXz/s2bMHycnJAACPx4MRI0Zg+fLluPPOO/HEE08gPz8fP/74I3bt2oVmzZppx7jrrrtw0003+Rx38uTJFeZ57bXXoCgK/vjHPyI7OxvvvPMOBg8ejG3btiEyMhIA8NNPP2HYsGFITU3FCy+8AFVVMX36dAwcOBCrV69Gt27dyu23YcOGeP311wEABQUF+O1vf1vhsZ977jmMGTMGkyZNwunTp/Hee++hb9++2Lp1K2rXrl1umwcffBDXX389AODbb7/FvHnzfL7+0EMPYcaMGZg4cSIef/xxpKenY9q0adi6dSvWrl0Li8VS4f1QFefOndNu28W8Xi9uueUWrFmzBg8++CDatGmDnTt34u2338aBAwcwf/78Kh1n+vTp+Mtf/oJ//OMfGDduXIXXudr9MXXqVNxyyy24++674XQ68eWXX+KOO+7A999/j+HDh2vXe+mll/Diiy+iV69eePnllxEREYENGzbgp59+wtChQwGUrpPdf//9aNeuHSZPnozatWtj69
atWLx4sZav7L7v2rUrXn/9dWRlZWHq1KlYu3Ztue9pfHw83n77bQDAsWPHMHXqVNx0003IzMys8HtfZt26dVAUBddee63P5f/617/w+OOPY/To0XjiiSfgcDiwY8cObNiwAePGjcOoUaNw4MABfPHFF3j77bcRHx8PAKhXr562j59++glff/01Hn30UcTHx6NJkybYvXs3rr/+esTGxuKZZ56BxWLBxx9/jP79++Pnn39G9+7dK8yZk5ODIUOG4MyZM/j555/RrFmzKj9GUlNTIYTAunXrKvwjxlCEwU2fPl0AEMuWLROnT58WmZmZ4ssvvxRxcXEiMjJSHDt2TAghhMPhEB6Px2fb9PR0YbVaxcsvv6xd9umnnwoA4q233ip3LK/Xq20HQEyZMqXcddq1ayf69eunfb5ixQoBQDRo0ECcP39eu/zrr78WAMTUqVO1fbdo0ULccMMN2nGEEKKoqEikpKSIIUOGlDtWr169RPv27bXPT58+LQCIF154QbssIyNDmEwm8dprr/lsu3PnTmE2m8tdnpaWJgCIzz77TLvshRdeEBc/lFavXi0AiNmzZ/tsu3jx4nKXN27cWAwfPrxc9kceeURc+vC8NPszzzwjEhISRGpqqs99OmvWLKGqqli9erXP9h999JEAINauXVvueBfr16+ftr+FCxcKs9ksnnrqqQqvW5n7Q4jS79PFnE6naN++vRg4cKDPvlRVFbfddlu5x2LZ9/zcuXMiJiZGdO/eXRQXF1d4HafTKRISEkT79u19rvP9998LAOL555/XLpswYYJo3Lixz37++c9/CgBi48aNFd7mMuPHjxdxcXHlLr/11ltFu3btrrjtlClTBACRnp5e7msAhKqqYvfu3T6Xjxw5UkRERIhDhw5pl504cULExMSIvn37apeV/cxv2rRJnDx5UrRr1040bdpUZGRkaNep6mPkxIkTAoB44403rni7jCBspo8GDx6MevXqoVGjRrjzzjtht9sxb948NGjQAABgtVqhqqV3h8fjQW5uLux2O1q1aoUtW7Zo+5k7dy7i4+Px2GOPlTvGpVMGVXHvvfciJiZG+3z06NGoX78+fvjhBwDAtm3bkJaWhnHjxiE3Nxc5OTnIyclBYWEhBg0ahFWrVsHr9frs0+FwwGazXfG43377LbxeL8aMGaPtMycnB0lJSWjRogVWrFjhc32n0wmg9P66nDlz5qBWrVoYMmSIzz5TU1Nht9vL7dPlcvlcLycn56pzyMePH8d7772H5557Dna7vdzx27Rpg9atW/vss2zK8NLjX87GjRsxZswY3H777ZgyZUqF16nM/QFAG+0BpVMZeXl5uP76630eW/Pnz4fX68Xzzz+vPRbLlD22fvzxR+Tn5+NPf/pTue9t2XV+/fVXZGdn4+GHH/a5zvDhw9G6dWssXLjQZzuv16vdR9u2bcPMmTNRv359tGnT5oq3KTc3F3Xq1Cl3ee3atXHs2DFs2rTpittfSb9+/dC2bVvtc4/Hg6VLl2LkyJFo2rSpdnn9+vUxbtw4rFmzBufPn/fZx7Fjx9CvXz+4XC6sWrUKjRs31r5W1cdI2e3Mycmp9m3Si7CZPnr//ffRsmVLmM1mJCYmolWrVj4/eF6vF1OnTsUHH3yA9PR0eDwe7WtlU0xA6bRTq1atYDb7965r0aKFz+eKoqB58+baPGtaWhoAYMKECZfdR15ens8PaU5OTrn9XiotLQ1CiMte79JpnrKzRS79RXzpPvPy8pCQkFDh17Ozs30+X7p0qc/UQWW88MILSE5OxkMPPVRubj4tLQ179+697D4vPX5Fjh8/juHDh6OwsBC5ubmXLfzK3B8A8P333+PVV1/Ftm3bfOasL97voUOHoKqqzy/DS5VNe7Zv3/6y1zly5AgAoFWrVuW+1rp1a6xZs8bnsszMTJ/7qn79+pg7d+5VbxMAiAreuPGPf/wjli1bhm7duqF58+YYOnQoxo0bh969e191f2VSUlJ8Pj
99+jSKiooqvE1t2rSB1+tFZmYm2rVrp11+zz33wGw2Y+/evUhKSvLZpqqPkbLbWZM//PQibEqhW7du6NKly2W//te
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>hypertension</th>\n",
" <th>heart_disease</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>gender_Male</th>\n",
" <th>gender_Other</th>\n",
" <th>ever_married_Yes</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>Residence_type_Urban</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" <th>age_glucose_index</th>\n",
" <th>bmi_glucose_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2508</th>\n",
" <td>0.316406</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.176562</td>\n",
" <td>0.341667</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>1957.540</td>\n",
" <td>0.300173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2435</th>\n",
" <td>0.768066</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.351636</td>\n",
" <td>0.591667</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>6003.270</td>\n",
" <td>0.331619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2547</th>\n",
" <td>0.060059</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.250618</td>\n",
" <td>0.216667</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>418.750</td>\n",
" <td>0.216119</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3885</th>\n",
" <td>0.914551</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.342882</td>\n",
" <td>0.691667</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>7071.750</td>\n",
" <td>0.373316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>335</th>\n",
" <td>0.426270</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.500974</td>\n",
" <td>0.544444</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>3932.250</td>\n",
" <td>0.266133</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4661</th>\n",
" <td>0.853516</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.977778</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>11855.025</td>\n",
" <td>0.268662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4662</th>\n",
" <td>0.926758</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.024510</td>\n",
" <td>0.494444</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>4401.920</td>\n",
" <td>0.485152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4663</th>\n",
" <td>0.682617</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.000000</td>\n",
" <td>0.836111</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>9484.020</td>\n",
" <td>0.238549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4664</th>\n",
" <td>0.768066</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.313207</td>\n",
" <td>0.494444</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>5726.700</td>\n",
" <td>0.309131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4665</th>\n",
" <td>0.902344</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.156166</td>\n",
" <td>0.583333</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>5399.040</td>\n",
" <td>0.429002</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1554 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" age hypertension heart_disease avg_glucose_level bmi \\\n",
"2508 0.316406 0 0 0.176562 0.341667 \n",
"2435 0.768066 0 0 0.351636 0.591667 \n",
"2547 0.060059 0 0 0.250618 0.216667 \n",
"3885 0.914551 0 0 0.342882 0.691667 \n",
"335 0.426270 0 0 0.500974 0.544444 \n",
"... ... ... ... ... ... \n",
"4661 0.853516 1 0 1.000000 0.977778 \n",
"4662 0.926758 0 0 0.024510 0.494444 \n",
"4663 0.682617 0 0 1.000000 0.836111 \n",
"4664 0.768066 0 0 0.313207 0.494444 \n",
"4665 0.902344 0 0 0.156166 0.583333 \n",
"\n",
" gender_Male gender_Other ever_married_Yes work_type_Never_worked \\\n",
"2508 False False True False \n",
"2435 True False True False \n",
"2547 True False False False \n",
"3885 True False True False \n",
"335 False False True False \n",
"... ... ... ... ... \n",
"4661 True False True False \n",
"4662 False False True False \n",
"4663 False False True False \n",
"4664 False False True False \n",
"4665 True False True False \n",
"\n",
" work_type_Private work_type_Self-employed work_type_children \\\n",
"2508 True False False \n",
"2435 True False False \n",
"2547 False False True \n",
"3885 False False False \n",
"335 True False False \n",
"... ... ... ... \n",
"4661 True False False \n",
"4662 True False False \n",
"4663 True False False \n",
"4664 True False False \n",
"4665 True False False \n",
"\n",
" Residence_type_Urban smoking_status_formerly smoked \\\n",
"2508 False False \n",
"2435 True False \n",
"2547 True False \n",
"3885 True False \n",
"335 True False \n",
"... ... ... \n",
"4661 False True \n",
"4662 True True \n",
"4663 True True \n",
"4664 True True \n",
"4665 True False \n",
"\n",
" smoking_status_never smoked smoking_status_smokes age_glucose_index \\\n",
"2508 False True 1957.540 \n",
"2435 False True 6003.270 \n",
"2547 False False 418.750 \n",
"3885 False False 7071.750 \n",
"335 False False 3932.250 \n",
"... ... ... ... \n",
"4661 False False 11855.025 \n",
"4662 False False 4401.920 \n",
"4663 False False 9484.020 \n",
"4664 False False 5726.700 \n",
"4665 False True 5399.040 \n",
"\n",
" bmi_glucose_ratio \n",
"2508 0.300173 \n",
"2435 0.331619 \n",
"2547 0.216119 \n",
"3885 0.373316 \n",
"335 0.266133 \n",
"... ... \n",
"4661 0.268662 \n",
"4662 0.485152 \n",
"4663 0.238549 \n",
"4664 0.309131 \n",
"4665 0.429002 \n",
"\n",
"[1554 rows x 18 columns]"
]
},
"execution_count": 342,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def over_under_sampling(x_selection, y_selection):\n",
"    \"\"\"Balance a labelled sample by random oversampling, then undersampling.\n",
"\n",
"    The sample is first passed through RandomOverSampler(sampling_strategy=0.2)\n",
"    and the result through RandomUnderSampler(sampling_strategy=1.0), both with\n",
"    random_state=42. Class counts are printed after each step and the final\n",
"    class distribution is drawn as a pie chart.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    x_selection : features to resample (handed to ``fit_resample``).\n",
"    y_selection : labels aligned with ``x_selection``. Presumably a pandas\n",
"        Series with two classes: ``value_counts()`` is called on the resampled\n",
"        labels and the pie chart is styled for exactly two wedges.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(x_balanced, y_balanced)`` - the resampled features and labels.\n",
"    \"\"\"\n",
"    # Step 1: grow the minority class.\n",
"    oversampler = RandomOverSampler(sampling_strategy=0.2, random_state=42)\n",
"    x_over, y_over = oversampler.fit_resample(x_selection, y_selection)\n",
"\n",
"    print(\"\\nПосле оверсемплинга\\n\")\n",
"    print(y_over.value_counts())\n",
"\n",
"    # Step 2: shrink the majority class.\n",
"    undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)\n",
"    x_balanced, y_balanced = undersampler.fit_resample(x_over, y_over)\n",
"\n",
"    # Computed once so the printed counts, the wedges and the labels all come\n",
"    # from the same object.\n",
"    balanced_counts = y_balanced.value_counts()\n",
"\n",
"    print(\"\\nПосле балансировки данных (андерсемплинга)\\n\")\n",
"    print(balanced_counts)\n",
"\n",
"    plt.pie(\n",
"        balanced_counts,\n",
"        # Labels are taken from the plotted counts themselves rather than from a\n",
"        # notebook-global defined in another cell, so each wedge is guaranteed\n",
"        # to carry its own class label and the function works on a fresh kernel.\n",
"        labels=balanced_counts.index,\n",
"        autopct='%1.1f%%',                    # show percentages on the wedges\n",
"        colors=['lightgreen', 'lightcoral'],  # one colour per class\n",
"        startangle=45,                        # rotate the chart\n",
"        explode=(0, 0.05)                     # offset the second wedge; needs exactly two classes\n",
"    )\n",
"    plt.title(\"Распределение классов (stroke)\")\n",
"    plt.show()\n",
"    return x_balanced, y_balanced\n",
"\n",
"# Show the class counts of the training labels, then rebalance the training\n",
"# split with over_under_sampling and display the resampled feature table.\n",
"# NOTE(review): X_train / y_train are overwritten with the resampled data, so\n",
"# re-running this cell feeds already-balanced data back into the samplers;\n",
"# restart the kernel and run all cells to reproduce the output.\n",
"print(\"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\\n\")\n",
"print(y_train.value_counts())\n",
"X_train, y_train = over_under_sampling(X_train, y_train)\n",
"\n",
"X_train\n",
"\n",
"# print(\"Данные ДО аугментации в ТЕСТОВОЙ ВЫБОРКЕ (10-20% данных)\\n\")\n",
"# print(y_test.value_counts())\n",
"# over_under_sampling(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Самое время оценить качество работы модели</p>"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 0.25 секунд\n",
"ROC-AUC: 0.84\n",
"F1-Score: 0.29\n",
"Матрица ошибок:\n",
"[[434 52]\n",
" [ 12 13]]\n",
"Отчет по классификации:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.89 0.93 486\n",
" 1 0.20 0.52 0.29 25\n",
"\n",
" accuracy 0.87 511\n",
" macro avg 0.59 0.71 0.61 511\n",
"weighted avg 0.94 0.87 0.90 511\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAJwCAYAAACZACVsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABdI0lEQVR4nO3df3zN9f//8fvZbDObbYZt5vevCFukYsX8SOZH3kQ/pLJJiUb5GSs/kyYpFP2QGiqJ3kr0A/mZ35JFSIiW2OZNNpP9Pt8/fJ1Pp23n7DDnnO3cru/L6/Leeb6er9frcc46PDyez9fzZTAajUYBAACgUG6ODgAAAMCZkSwBAABYQLIEAABgAckSAACABSRLAAAAFpAsAQAAWECyBAAAYAHJEgAAgAXlHB0AADhKdna2zp8/r/z8fIWGhjo6HABOisoSAJfyww8/qF+/fqpSpYq8vLxUrVo19enTx9FhAXBiJEvAvyxcuFAGg0EGg0Fbt24tsN9oNKpmzZoyGAy69957HRAhrtXKlSvVpk0bHTp0SNOmTdO6deu0bt06vfvuu44ODYATYxgOKEL58uW1ZMkStWnTxqx98+bNOnXqlLy8vBwUGa7F+fPn9cQTTygqKkrLly+Xp6eno0MCUEpQWQKK0K1bNy1fvly5ublm7UuWLFHLli0VEhLioMhwLRISEpSZmamFCxeSKAGwCckSUISHH35Y586d07p160xt2dnZ+uyzz9SvX79Cj5k5c6buvPNOVa5cWd7e3mrZsqU+++wzsz5Xh/iK2tq3by9J2rRpkwwGgz799FM9//zzCgkJkY+Pj/7zn//ojz/+MDtn+/btTcddtWfPHtM5/339oUOHFoj93nvvVZ06dcza9u/fr5iYGNWrV0/ly5dXSEiIHn/8cZ07d87SR2eSmpqqgQMHKjg4WOXLl9ctt9yiRYsWmfU5efKkDAaDZs6cadberFmzAu9p/PjxMhgMysjIMHs/kydPNuv36quvmn2WkrRz5041b95cL7/8smrWrCkvLy81bNhQ06dPV35+vtnxubm5mjp1qurXry8vLy/VqVNHzz//vLKyssz61alTRzExMWZtgwYNUvny5bVp0ybrHxCAUoFhOKAIderUUUREhD755BN17dpVkvTNN98oLS1Nffv21RtvvFHgmDlz5ug///mPHnnkEWVnZ2vp0qV64IEHtHr1anXv3l2S9OGHH5r6f//995o/f75mzZqlKlWqSJKCg4PNzjlt2jQZDAaNHTtWqampmj17tjp16qTExER5e3sXGf/YsWOv+zNYt26dfvvtNw0YMEAhISE6ePCg5s+fr4MHD2rnzp0FErF/unz5stq3b69jx45p6NChqlu3rpYvX66YmBhduHBBzz777HXHV5gLFy4oPj6+QPu5c+e0detWbd26VY8//rhatmyp9evXKy4uTidPntQ777xj6vvEE09o0aJFuv/++zVq1Cjt2rVL8fHxOnz4sD7//PMirz1p0iS9//77+vTTTwskegBKMSMAMwkJCUZJxj179hjnzp1rrFixovHvv/82Go1G4wMPPGDs0KGD0Wg0GmvXrm3s3r272bFX+12VnZ1tbNasmbFjx44Wr3XixIkC+zZu3GiUZKxevboxPT3d1L5s2TKjJOOcOXNMbe3atTO2a9fO9Prrr782SjJ26dLF+O+vuSRjbGxsget1797dWLt2bYvvx2g0Gj/55BOjJOOWLVsKfU9XzZ492yjJ+NFHH5nasrOzjREREUZfX1/Tezpx4oRRkvHVV181O75p06Zm78loNBpfeOEFoyTjxYsXzd7PpEmTTK+fe+45Y1BQkLFly5Zmx7dr184oyTh58mSzc8bExBglGQ8cOGA0Go3GxMREoyTjE088YdZv9OjRRknGDRs2mNpq165tjI6ONhqNRuO7775rlGR88803LX4uAEofhuEACx588EFdvnxZq1ev1sWLF7V69eoih+AkmVV6/v
rrL6Wlpalt27b68ccfrzmG/v37q2LFiqbX999/v6pVq6avv/660P5Go1FxcXHq06ePWrVqdc3XlczfT2Zmpv73v/+pdevWkmT1PX399dcKCQnRww8/bGrz8PDQM888o4yMDG3evPm6YivMn3/+qTfffFMTJkyQr69vgf3u7u4aMWKEWduoUaMkSV999ZUpbkkaOXKkxX7/tHLlSj399NMaM2ZMoUOcAEo3kiXAgqpVq6pTp05asmSJVqxYoby8PN1///1F9l+9erVat26t8uXLKzAwUFWrVtXbb7+ttLS0a46hYcOGZq8NBoMaNGigkydPFtr/448/1sGDB/Xyyy9f8zWvOn/+vJ599lkFBwfL29tbVatWVd26dSXJ6nv6/fff1bBhQ7m5mf8xc/PNN5v2l7RJkyYpNDRUTz31VIF9BoNBoaGh8vPzM2tv1KiR3NzcTJ/n77//Ljc3NzVo0MCsX0hIiAICAgrEnZiYqIcfflh5eXk6f/58yb4hAE6BOUuAFf369dOTTz6p5ORkde3aVQEBAYX2+/777/Wf//xHkZGReuutt1StWjV5eHgoISFBS5YssUus2dnZmjBhggYOHKibbrrpus/34IMPavv27RozZoyaN28uX19f5efnq0uXLgUmRTva4cOHtXDhQn300Ufy8PAosN/S/K7CWJqP9U8//fSTunbtqrvvvltjxozRo48+ynwloIwhWQKsuO+++/TUU09p586d+vTTT4vs99///lfly5fXmjVrzNZgSkhIuK7rHz161Oy10WjUsWPHFB4eXqDvW2+9pdTU1AJ3h12Lv/76S+vXr9eUKVM0ceLEIuMpSu3atbV//37l5+ebVZd++eUX0/6SFBcXp+bNm+uhhx4qdH/dunW1du1aXbx40WxY89dff1V+fr7pTsDatWsrPz9fR48eNVXBJCklJUUXLlwoEHdYWJiWL18ub29vLV++XIMGDdL+/ftVvnz5En1/AByHYTjACl9fX7399tuaPHmyevToUWQ/d3d3GQwG5eXlmdpOnjypL7744rquv3jxYl28eNH0+rPPPtOZM2dMd+hddfHiRU2bNk0jRowokTWg3N3dJV1Jzv5p9uzZxTq+W7duSk5ONkswc3Nz9eabb8rX11ft2rW77hiv2rFjh1auXKnp06cXWRHq1q2b8vLyNHfuXLP2119/XZJMdyt269ZNUsH3+e9+V916663y8fGRm5ubFixYoJMnT+rFF1+87vcEwHlQWQKKITo62mqf7t276/XXX1eXLl3Ur18/paamat68eWrQoIH2799/zdcODAxUmzZtNGDAAKWkpGj27Nlq0KCBnnzySbN+P/74o6pUqaLnnnvO6jmTkpL07bffmrWdPXtWly9f1rfffqt27drJz89PkZGRmjFjhnJyclS9enWtXbtWJ06cKFbcgwYN0rvvvquYmBjt3btXderU0WeffaZt27Zp9uzZZtUdSTpy5IhZTBkZGXJzczNr++233wq91tq1a3XPPfeoU6dORcbTrVs3derUSS+88IJOnDih5s2ba8OGDfrvf/+rwYMHq1mzZpKkW265RdHR0Zo/f74uXLigdu3aaffu3Vq0aJF69eqlDh06FHmNZs2aaezYsZo+fbr69u1baPUPQCnk4LvxAKfzz6UDLCls6YD333/f2LBhQ6OXl5excePGxoSEBOOkSZMK3L7/72tZWjrgk08+McbFxRmDgoKM3t7exu7duxt///13s75Xb4ufNWuWWXth15Zkdbsaz6lTp4z33XefMSAgwOjv72984IEHjKdPny5wu35RUlJSjAMGDDBWqVLF6OnpaQwLCzMmJCSY9bm6dIAt27+XDjAYDMa9e/cW+Ez+vfRARkaGccSIEcbQ0FCjh4eHsUGDBsbp06cb8/LyzPrl5OQYp0yZYqxbt67Rw8PDWLNmTWNcXJwxMzPTrN8/lw64KjMz09i4cWPj7bffbszNzbX6GQFwfgaj8V81dgBOYdOmTe
rQoYOWL19u8Q68knTy5EnVrVtXJ06cKLCaNwC4KuYsAQAAWECyBMDE29tbUVFRNt9mDwBlGRO8AZgEBwcXmPgNAK6
"text/plain": [
"<Figure size 700x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACkwklEQVR4nOzddVhU6dsH8O/QDaIIIiiIoBiIioVirK65tmsLmLsqoNiF3aKCYgeu3brWmmutjYoFYqGIXYQgOef9wx/ndRxqEBzi+7muuXTuU/c5DMPc8zzneSSCIAggIiIiIiKiDKkoOwEiIiIiIqL8joUTERERERFRFlg4ERERERERZYGFExERERERURZYOBEREREREWWBhRMREREREVEWWDgRERERERFlgYUTERERERFRFlg4ERERERERZYGFExHlis+fP8PPz098HhUVhWXLlikvISIiIqJcxMKJCiR3d3fo6ekpOw36hra2NiZNmoQtW7bg+fPnmDp1Kg4ePKjstIiIiIhyhZqyEyDKrg8fPmDLli04f/48zp07hy9fvqBly5aoXr06unbtiurVqys7xSJNVVUV06ZNg6urK6RSKQwMDHD48GFlp0VERESUKySCIAjKToIoK9u3b8fAgQPx+fNnWFlZITk5Ga9fv0b16tVx69YtJCcnw83NDatXr4aGhoay0y3SIiMj8fz5c9jb28PIyEjZ6RARERHlCnbVo3zvwoUL6N27N8zMzHDhwgWEh4ejWbNm0NLSwrVr1/Dy5Uv06NEDf/31F7y9vWW29fX1hbOzM4oXLw5tbW3UrFkTu3fvljuGRCLB1KlTxecpKSlo3bo1jI2NERISIq6T2aNx48YAgDNnzkAikeDMmTMyx2jTpo3ccRo3bixul+bp06eQSCTYsGGDTPz+/fvo0qULjI2NoaWlBScnJxw4cEDuXKKiouDt7Q0rKytoamrCwsICrq6ueP/+fYb5vXz5ElZWVnBycsLnz58VPo+pU6dCIpEAACwsLFCvXj2oqanBzMws3X2k58WLF+jfvz/Mzc2hqakJa2trDB48GElJSdiwYUOW1z/tet2+fRvu7u4oV64ctLS0YGZmhn79+uHDhw9y+Wb2OHPmDKZMmQJ1dXW8e/dOLt9BgwbByMgICQkJYuyff/5Bo0aNoK+vDwMDA9SqVQtbt27N9Ly/vXZpPn/+nO61a9y4MapUqSK3D19fX0gkEjx9+lQmnlk+ip5b2uvh+4eVlZXcOun9jn1/vlldewA4f/48fv/9d5QpUwaampqwtLSEt7c3vnz5kuH+02T1mvn29QsAN2/eRKtWrWBgYAA9PT00bdoUly9fzvI4ACCVSuHv74+qVatCS0sLJiYmaNmyJYKCgsR1JBIJPDw8sGXLFlSoUAFaWlqoWbMmzp07J7OvZ8+eYciQIahQoQK0tbVRvHhx/P7773I/2+/PT0dHB1WrVsXatWtl1suoW/Pu3bvT/d28cuUKWrZsCUNDQ+jo6KBRo0a4cOGCzDppP8O095Q0QUFBcu9d7u7uMq8RAHj+/Dm0tbXlXrPfvx8mJyfDx8cH1tbW0NDQQJkyZTBmzJhs/fyBr++ZXbt2hYmJCbS1tVGhQgVMnDgx020yep2nPdzd3cV1034G586dwx9//IHixYvDwMAArq6u+PTpk9y+ly9fjsqVK0NTUxPm5uYYOnQooqKiZNZp3Lhxusdt1qyZuE7aa+l7v/32m9y1jouLw8iRI2FpaQlNTU1UqFABvr6++PY78w8fPqBVq1awsLCApqYmSpUqhV69euHZs2fiOhn9XRo6dGiOr4ubmxtKlCiB5ORkuXNp3rw5KlSoIBPbvHkzatasCW1tbRgbG6N79+54/vx5utevQ4cOcvv8448/IJFIZN5D087L19dXbv006b1Hp/1cvr2vN03FihUz/BlRwceuepTvzZ07F1KpFNu3b0fNmjXllpcoUQIbN25ESEgIVq1ahSlTpq
BkyZIAAH9/f7Rr1w69evVCUlIStm/fjt9//x2HDh1CmzZtMjzmgAEDcObMGZw4cQKVKlUCAGzatElcfv78eaxevRqLFy9GiRIlAACmpqYZ7u/cuXM4cuRIjs4fAO7du4f69eujdOnSGDduHHR1dbFz50506NABe/bsQceOHQF8/cDt4uKC0NBQ9OvXDzVq1MD79+9x4MABREZGirl+Kzo6Gq1atYK6ujqOHDmS6b1jipzHwoUL8ebNm2yt+/LlS9SuXRtRUVEYNGgQKlasiBcvXmD37t2Ij49Hw4YNZa7/rFmzAEDmQ5CzszMA4MSJE3jy5An69u0LMzMz3Lt3D6tXr8a9e/dw+fJlSCQSdOrUCeXLlxe39fb2hr29PQYNGiTG7O3tYWFhgenTp2PHjh0yfwSTkpKwe/dudO7cGVpaWgC+fljo168fKleujPHjx8PIyAg3b97E0aNH0bNnz2xdh5xcu4xklU+fPn2yfW7fmjBhAuzt7QEAq1evRkREhEJ5ZffaA8CuXbsQHx+PwYMHo3jx4rh69SqWLl2KyMhI7Nq1K1vHmz59OqytrcXnnz9/xuDBg2XWuXfvHlxcXGBgYIAxY8ZAXV0dq1atQuPGjXH27FnUqVMn02P0798fGzZsQKtWrTBgwACkpKTg/PnzuHz5MpycnMT1zp49ix07dsDLywuamppYvnw5WrZsiatXr4of5q5du4aLFy+ie/fusLCwwNOnT7FixQo0btwYISEh0NHRkTl22ntQTEwM1q9fj4EDB8LKykrmg3Z2/fvvv2jVqhVq1qyJKVOmQEVFBYGBgfjll19w/vx51K5dW+F9pmfy5MkyRXlGhg4dijVr1qBdu3YYNWoUbt68iQULFuDu3bs4fPiw3IfZb92+fRsuLi5QV1fHoEGDYGVlhcePH+PgwYPi+0dmvLy8UKtWLZnYgAED0l3Xw8MDRkZGmDp1KsLCwrBixQo8e/ZMLMKArx++p02bhmbNmmHw4MHieteuXcOFCxegrq4u7s/CwgJz5syROUapUqWyzPl7giCgXbt2OH36NPr37w9HR0ccO3YMo0ePxosXL7B48WIAX3/n9fX1MWzYMBQvXhyPHz/G0qVLcfv2bdy5cyfD/T969Ahr1qzJcHlW16VPnz7YuHEjjh07ht9++03c7vXr1/j3338xZcoUMTZr1iz4+Piga9euGDBgAN69e4elS5eiYcOGuHnzpkzvBi0tLRw+fBhv374VPwt8+fIFO3bsSPc9Lae0tLQQGBiI4cOHi7GLFy/KFJxUCAlE+ZyxsbFQtmxZmZibm5ugq6srE/Px8REACAcPHhRj8fHxMuskJSUJVapUEX755ReZOABhypQpgiAIwvjx4wVVVVVh//79GeYUGBgoABDCw8Pllp0+fVoAIJw+fVqM1alTR2jVqpXMcQRBEJo0aSI0bNhQZvvw8HABgBAYGCjGmjZtKlStWlVISEgQY1KpVHB2dhZsbW3F2OTJkwUAwt69e+XykkqlcvklJCQIjRs3FkqWLCk8evQox+cxZcoU4du3k7dv3wr6+vriut/uIz2urq6CioqKcO3atQzz/lajRo2ERo0apbuv73/mgiAI27ZtEwAI586dS3ebsmXLCm5ubukuq1evnlCnTh2Z2N69e2XOKyoqStDX1xfq1KkjfPnyJcv8v6XItWvUqJFQuXJluX0sWLBA5vWY3Xyyc25pTpw4IQAQzp49K8bc3NxkfjfTXjO7du3K9Jy/ldm1T+9nOWfOHEEikQjPnj3LdL9pv6Pfv6bevXsn9/rt0KGDoKGhITx+/FiMvXz5UtDX15f7/fzev//+KwAQvLy85JZ9e60BCACEoKAgMfbs2TNBS0tL6NixoxhL75wvXbokABA2btwod37fvgc9ePBAACDMnz9fjKX3XikIgrBr1y6Zn7NUKhVsbW2FFi1ayOQdHx8vWFtbC7/++qsYS3vNvnv3Tmaf165dk3vv+v41cvfuXUFFRUV8fX
+b/7e/17dv3xYkEonQvXt3mWNMnTpV7n0+PQ0bNhT09fXlXidZ/T5m9hrW1dWVea2m/Qxq1qwpJCUlifH58+cLAIS
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import time\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report\n",
"\n",
"# Разделение данных на обучающую и тестовую выборки\n",
"\n",
"# X = data.drop(columns=['id', 'stroke']) # Признаки\n",
"# y = data['stroke'] # Целевая переменная\n",
"\n",
"# # Преобразование категориальных признаков с помощью One-Hot Encoding\n",
"# categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
"# X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)\n",
"\n",
"# # Заполнение пропущенных значений (например, медианой для числовых данных)\n",
"# X.fillna(X.median(), inplace=True)\n",
"\n",
"# # Разделение данных на обучающую и тестовую выборки\n",
"# # Обучающая выборка\n",
"# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
"\n",
"# # Тестовая и контрольная выборки\n",
"# X_test, X_control, y_test, y_control = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n",
"\n",
"\n",
"# Train a random forest on the training split and evaluate it on the test split.\n",
"# X_train, y_train, X_test, y_test and sns are expected to exist from earlier cells.\n",
"model = RandomForestClassifier(random_state=42)\n",
"\n",
"# Time only the fit call.\n",
"start_time = time.time()\n",
"model.fit(X_train, y_train)\n",
"train_time = time.time() - start_time\n",
"\n",
"# Positive-class probabilities feed ROC-AUC; hard labels feed the other metrics.\n",
"y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
"y_pred = model.predict(X_test)\n",
"\n",
"# Metrics\n",
"roc_auc = roc_auc_score(y_test, y_pred_proba)\n",
"f1 = f1_score(y_test, y_pred)\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"class_report = classification_report(y_test, y_pred)\n",
"\n",
"# Report: one item per line, in the same order as before.\n",
"print(\n",
"    f'Время обучения модели: {train_time:.2f} секунд',\n",
"    f'ROC-AUC: {roc_auc:.2f}',\n",
"    f'F1-Score: {f1:.2f}',\n",
"    'Матрица ошибок:',\n",
"    conf_matrix,\n",
"    'Отчет по классификации:',\n",
"    class_report,\n",
"    sep='\\n',\n",
")\n",
"\n",
"# Confusion-matrix heatmap\n",
"class_names = ['Нет инсульта', 'Инсульт']\n",
"fig_cm, ax_cm = plt.subplots(figsize=(7, 7))\n",
"sns.heatmap(\n",
"    conf_matrix,\n",
"    annot=True,\n",
"    fmt='d',\n",
"    cmap='Blues',\n",
"    xticklabels=class_names,\n",
"    yticklabels=class_names,\n",
"    ax=ax_cm,\n",
")\n",
"ax_cm.set_title('Матрица ошибок')\n",
"ax_cm.set_xlabel('Предсказанный класс')\n",
"ax_cm.set_ylabel('Истинный класс')\n",
"plt.show()\n",
"\n",
"\n",
"# Actual vs. predicted labels, with the diagonal drawn as the perfect-match reference\n",
"fig_cmp, ax_cmp = plt.subplots(figsize=(10, 6))\n",
"ax_cmp.scatter(y_test, y_pred, alpha=0.5, color='blue', label='Прогнозы модели')\n",
"ax_cmp.plot([0, 1], [0, 1], 'k--', lw=2, label='Идеальное совпадение')\n",
"ax_cmp.set_xlabel('Фактический статус инсульта')\n",
"ax_cmp.set_ylabel('Прогнозируемый статус инсульта')\n",
"ax_cmp.set_title('Фактический статус инсульта по сравнению с прогнозируемым')\n",
"ax_cmp.legend()\n",
"plt.show()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">После исправления ошибок модель стала выявлять случаи инсульта (ROC-AUC 0.84, recall для класса «инсульт» 0.52). Однако из-за сильного дисбаланса классов в данных precision для этого класса составляет всего 0.20 (F1 = 0.29), поэтому качество выявления инсульта всё ещё остаётся слабым.</p>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}