2655 lines
297 KiB
Plaintext
2655 lines
297 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<h4 style=\"margin: 30px;\">бизнес-цели и 2 задачи, которые нужно решить:<br/>\n",
|
|||
|
"Снижение вероятности инсульта у пациентов с высоким риском путем раннего выявления предрасположенности.<br/>\n",
|
|||
|
"Оптимизация медицинских услуг, предоставляемых пациентам, с учетом их риска инсульта.<br/><br/><br/>\n",
|
|||
|
"Разработать модель, которая прогнозирует вероятность инсульта у пациента.<br/>\n",
|
|||
|
"Определить значимые признаки для анализа риска инсульта, чтобы направить усилия медицинских работников на важные факторы.</h4>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 330,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Количество колонок: 12\n",
|
|||
|
"Колонки: Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
|||
|
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
|||
|
" 'smoking_status', 'stroke'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"# Загрузка данных\n",
|
|||
|
"data = pd.read_csv('./csv/option4.csv')\n",
|
|||
|
"\n",
|
|||
|
"# Обзор данных\n",
|
|||
|
"print(\"Количество колонок:\", data.columns.size)\n",
|
|||
|
"print(\"Колонки:\", data.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 331,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Наличие пропущенных значений:\n",
|
|||
|
"id 0\n",
|
|||
|
"gender 0\n",
|
|||
|
"age 0\n",
|
|||
|
"hypertension 0\n",
|
|||
|
"heart_disease 0\n",
|
|||
|
"ever_married 0\n",
|
|||
|
"work_type 0\n",
|
|||
|
"Residence_type 0\n",
|
|||
|
"avg_glucose_level 0\n",
|
|||
|
"bmi 201\n",
|
|||
|
"smoking_status 0\n",
|
|||
|
"stroke 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"<bound method NDFrame.describe of id gender age hypertension heart_disease ever_married \\\n",
|
|||
|
"0 9046 Male 67.0 0 1 Yes \n",
|
|||
|
"1 51676 Female 61.0 0 0 Yes \n",
|
|||
|
"2 31112 Male 80.0 0 1 Yes \n",
|
|||
|
"3 60182 Female 49.0 0 0 Yes \n",
|
|||
|
"4 1665 Female 79.0 1 0 Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5105 18234 Female 80.0 1 0 Yes \n",
|
|||
|
"5106 44873 Female 81.0 0 0 Yes \n",
|
|||
|
"5107 19723 Female 35.0 0 0 Yes \n",
|
|||
|
"5108 37544 Male 51.0 0 0 Yes \n",
|
|||
|
"5109 44679 Female 44.0 0 0 Yes \n",
|
|||
|
"\n",
|
|||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
|||
|
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
|||
|
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
|||
|
"2 Private Rural 105.92 32.5 never smoked \n",
|
|||
|
"3 Private Urban 171.23 34.4 smokes \n",
|
|||
|
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 Private Urban 83.75 NaN never smoked \n",
|
|||
|
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
|||
|
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
|||
|
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
|||
|
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
|||
|
"\n",
|
|||
|
" stroke \n",
|
|||
|
"0 1 \n",
|
|||
|
"1 1 \n",
|
|||
|
"2 1 \n",
|
|||
|
"3 1 \n",
|
|||
|
"4 1 \n",
|
|||
|
"... ... \n",
|
|||
|
"5105 0 \n",
|
|||
|
"5106 0 \n",
|
|||
|
"5107 0 \n",
|
|||
|
"5108 0 \n",
|
|||
|
"5109 0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 12 columns]>\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(\"\\nНаличие пропущенных значений:\")\n",
|
|||
|
"print(data.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\n\\n\")\n",
|
|||
|
"\n",
|
|||
|
"print(data.describe)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Возьмем и заменим нулевые значения в столбце bmi на средние значения по столбцу </p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 332,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Наличие пропущенных значений:\n",
|
|||
|
"id 0\n",
|
|||
|
"gender 0\n",
|
|||
|
"age 0\n",
|
|||
|
"hypertension 0\n",
|
|||
|
"heart_disease 0\n",
|
|||
|
"ever_married 0\n",
|
|||
|
"work_type 0\n",
|
|||
|
"Residence_type 0\n",
|
|||
|
"avg_glucose_level 0\n",
|
|||
|
"bmi 0\n",
|
|||
|
"smoking_status 0\n",
|
|||
|
"stroke 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data['bmi'] = data['bmi'].fillna(data['bmi'].median())\n",
|
|||
|
"print(\"\\nНаличие пропущенных значений:\")\n",
|
|||
|
"print(data.isnull().sum())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Взглянем на выбросы: </p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 333,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABscUlEQVR4nO3de1yUdf7//+cwHD0wiAcGChDPirIeKiOM3GTDU6vlZhaltX7081G0zG0rNw9hGFvblj9bsGz7mBau1ZZmZVpiylZoahoeWtMicdPBCgEPiQjX74++XB9HmRQdGAYe99ttbs28r9dc87rwlm988uZ9WQzDMAQAAAAAAAAAAM7j4+kGAAAAAAAAAABoqAjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAAAAAAAAAHCBEB0AAAAAAAAAABcI0QEAAAAAAAAAcIEQHQAAAAAAAAAAFwjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAACAemCxWPTYY495uo16tWHDBlksFm3YsMHTrTSoXgAAjddjjz0mi8WiH374oU4/55577lH79u3r9DMA/B9CdAAAAAAAAAAAXPD1dAMAAAAAAAAALt6LL76oqqoqT7cBNBmE6AAAAAAAAIAX8fPz83QLQJPCdi5AI3TgwAFNnjxZXbt2VVBQkFq3bq3bbrtN33777Xm1+fn5uuGGGxQUFKQrr7xS6enpWrx4sSwWy3n177//vq6//no1b95cLVu21LBhw7R79+76uSgAQJN1MfPa1q1bZbFYtGTJkvPev3btWlksFr377rvm2IYNG3TVVVcpMDBQHTt21AsvvGDuYVpbb7zxhnr06KHAwED17NlTK1asuKh9Sl3VuOrj1Vdf1TXXXKNmzZqpVatWSkxM1AcffOBUk5WVpdjYWAUEBCgiIkKpqakqKSlxqtm3b59GjRolu92uwMBAXXnllRozZoxKS0vP+7x+/fopKChIoaGhGjNmjA4ePHhRX5ML2bx5swYPHiybzaZmzZrphhtu0CeffGIe/+c//ymLxaKNGzee994XXnhBFotFu3btMsf+/e9/63e/+51CQ0MVGBioq666SqtWrXJLrwAAXIoffvhBo0ePVnBwsFq3bq37779fp06dMo9bLBZNmTLF/D4iKChI8fHx2rlzp6Sf57tOnTopMDBQAwcOPO/f5+yJDtQvVqIDjdCWLVv06aefasyYMbryyiv17bffauHChRo4cKD27NmjZs2aSZK+++47/frXv5bFYtGMGTPUvHlz/f3vf1dAQMB553zllVc0btw4JScn68knn9TJkye1cOFCDRgwQNu3b2fyBgDUmYuZ16666ip16NBBr7/+usaNG+f0/tdee02tWrVScnKyJGn79u0aPHiwwsPDlZaWpsrKSs2dO1dt27atdW/vvfeebr/9dvXq1UsZGRk6evSoxo8fryuuuMIt114tLS1Njz32mK677jrNnTtX/v7+2rx5s9avX6+bbrpJ0s/he1pampKSkjRp0iTt3btXCxcu1JYtW/TJJ5/Iz89Pp0+fVnJyssrLyzV16lTZ7XZ99913evfdd1VSUiKbzSZJmjdvnmbNmqXRo0frv/7rv/T999/rueeeU2JiorZv366QkJBLvpb169dryJAh6tevn+bMmSMfHx8tXrxYN954o/71r3/pmmuu0bBhw9SiRQu9/vrruuGGG5ze/9prryk2NlY9e/aUJO3evVsJCQm64oor9Mgjj6h58+Z6/fXXNXLkSL355pu65ZZbLrlXAAAu1ejRo9W+fXtlZGRo06ZNWrBggY4ePaqlS5eaNf/617+0atUqpaamSpIyMjI0fPhwPfTQQ8rKytLkyZN19OhRPfXUU/r973+v9evXe+pyABgAGp2TJ0+eN5aXl2dIMpYuXWqOTZ061bBYLMb27dvNsR9//NEIDQ01JBkFBQWGYRjGsWPHjJCQEGPChAlO53Q4HIbNZjtvHAAAd7rYeW3GjBmGn5+fUVxcbI6Vl5cbISEhxu9//3tz7OabbzaaNWtmfPfdd+bYvn37DF9fX6O23x736tXLuPLKK41jx46ZYxs2bDAkGdHR0U61kow5c+aYr8eNG3dejWEYxpw5c5z62Ldvn+Hj42PccsstRmVlpVNtVVWVYRiGceTIEcPf39+46aabnGr+9re/GZKM//3f/zUMwzC2b99uSDLeeOMNl9f07bffGlar1Zg3b57T+M6dOw1fX9/zxn/JRx99ZEgyPvroI7Pfzp07G8nJyWbvhvHzn3FMTIzxm9/8xhy74447jHbt2hlnzpwxxw4fPmz4+PgYc+fONccGDRpk9OrVyzh16pTT1+W6664zOnfu7LIXAADqQvU8/tvf/tZpfPLkyYYk44svvjAM4+fvCwICAsx/dxuGYbzwwguGJMNutxtlZWXm+IwZM5z+jW4Yrr+PAFA32M4FaISCgoLM5xUVFfrxxx/VqVMnhYSE6PPPPzePrVmzRvHx8erdu7c5FhoaqpSUFKfzffjhhyopKdEdd9yhH374wXxYrVb1799fH330UZ1fEwCg6brYee32229XRUWF3nrrLXPsgw8+UElJiW6//XZJUmVlpdatW6eRI0cqIiLCrOvUqZOGDBlSq74OHTqknTt3auzYsWrRooU5fsMNN6hXr161vk5XVq5cqaqqKs2ePVs+Ps7fvldv+7Ju3TqdPn1a06ZNc6qZMGGCgoOD9d5770mSudJ87dq1OnnyZI2f99Zbb6mqqkqjR492mvftdrs6d+58WfP+jh07tG/fPt1555368ccfzXOfOHFCgwYNUm5urnmTtNtvv11HjhzRhg0bzPf/85//VFVVlfnnWVxcrPXr12v06NE6duyYeb4ff/xRycnJ2rdvn7777rtL7hcAgEtVvbq82tSpUyVJq1evNscGDRrk9Fvd/fv3lySNGjVKLVu2PG/8m2++qat2AVwA27kAjdBPP/2kjIwMLV68WN99950MwzCPnb3f6YEDBxQfH3/e+zt16uT0et++fZKkG2+8scbPCw4OdkfbAADU6GLntV/96lfq1q2bXnvtNY0fP17Sz1t/tGnTxpzDjhw5op9++um8uU46f/67kAMHDrh8X6dOnZwC/svx9ddfy8fHRz169LhgL127dnUa9/f3V4cOHczjMTExmj59up555hllZ2fr+uuv129/+1vdddddZsC+b98+GYahzp071/hZl3Mjs+rvKc7dcudspaWlatWqlbln+muvvaZBgwZJ+vnPs3fv3urSpYskaf/+/TIMQ7NmzdKsWbNqPN+RI0fcvr0OAAAXcu482rFjR/n4+DjtbR4VFeVUUz0XR0ZG1jh+9OjROugUwMUgRAcaoalTp2rx4sWaNm2a4uPjZbPZZLFYNGbMGHN1V21Uv+eVV16R3W4/77ivL3+VAADqTm3mtdtvv13z5s3TDz/8oJYtW2rVqlW64447GuRc5eomppWVlXX6uX/96191zz336O2339YHH3yg++67z9yv9corr1RVVZUsFovef/99Wa3W895/9qr72qr+8/rLX/7i9JtwNZ0/ICBAI0eO1IoVK5SVlaWioiJ98skneuKJJ84734MPPmjueX+u2v5wBACAulDTvF/TPPtL42cvJABQvxrevyYAXLZ//vOfGjdunP7617+aY6dOnVJJSYlTXXR0tPbv33/e+88d69ixoySpXbt2SkpKcn/DAAD8goud16SfQ/S0tDS9+eabCgsLU1lZmcaMGWMeb9eunQIDAy9q/ruQ6Ohol++7mHO1atWqxmuoXjVerWPHjqqqqtKePXtcBs/Vvezdu1cdOnQwx0+fPq2CgoLz5u9evXqpV69emjlzpj799FMlJCTo+eefV3p6ujp27CjDMBQTE2Ou+HaX6u8pgoODL+p7ittvv11LlixRTk6OvvzySxmGYW7lIsm8Vj8/P75HAQA0KPv27VNMTIz5ev/+/aqqqnLavgWA92BPdKARslqt5/2E+rnnnjtvZVtycrLy8vK0Y8cOc6y4uFjZ2dnn1QUHB+uJJ55QRUXFeZ/3/fffu695AADOcbHzmiR1795dvXr
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def plot_numeric_boxplots(dataframe):\n",
|
|||
|
" # Фильтрация числовых столбцов\n",
|
|||
|
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
|
|||
|
" \n",
|
|||
|
" # Построение графиков\n",
|
|||
|
" if numeric_columns:\n",
|
|||
|
" plt.figure(figsize=(15, 5))\n",
|
|||
|
" \n",
|
|||
|
" for i, col in enumerate(numeric_columns):\n",
|
|||
|
" if col != 'id':\n",
|
|||
|
" plt.subplot(1, len(numeric_columns), i + 1)\n",
|
|||
|
" sns.boxplot(y=dataframe[col])\n",
|
|||
|
" plt.title(f'{col}')\n",
|
|||
|
" plt.ylabel('')\n",
|
|||
|
" plt.xlabel(col)\n",
|
|||
|
" \n",
|
|||
|
" plt.tight_layout()\n",
|
|||
|
" plt.show()\n",
|
|||
|
" else:\n",
|
|||
|
" print(\"Нет подходящих числовых столбцов для построения графиков.\")\n",
|
|||
|
"\n",
|
|||
|
"plot_numeric_boxplots(data)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Видим выбросы в столбцах со средним уровнем глюкозы и в столбце bmi (индекс массы тела). устраним выбросы - поставим верхние и нижние границы</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 334,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcoElEQVR4nO3dfZiVVb0//vcgMIPADIIxAwmI+AAGPhuSZqQk4kOgnJSiNPPIqfCRThpH0SSJ9Fh5NMQ0j2lBmqUezcQUBcpQEUVNjdBQSAVKZUZQRmT2749+7m8j7BIFBobX67ruS/Za6177cw9erJn33HvdZYVCoRAAAAAAAGAtLZq6AAAAAAAA2FwJ0QEAAAAAoAQhOgAAAAAAlCBEBwAAAACAEoToAAAAAABQghAdAAAAAABKEKIDAAAAAEAJQnQAAAAAAChBiA4AAAAAACUI0QEAYBMoKyvLN7/5zaYuY5OaMWNGysrKMmPGjKYuZbOqBYDm65vf/GbKysryt7/9baO+zxe/+MXsuOOOG/U9gP9HiA4AAAAAACW0bOoCAAAAAID37pprrklDQ0NTlwFbDSE6AAAAAGxBWrVq1dQlwFbFdi7QDL3wwgv56le/mt122y1t2rRJp06d8pnPfCbPP//8WmOfeOKJfOITn0ibNm2yww475KKLLsp1112XsrKytcbfdddd+fjHP562bdumffv2OfLII/PUU09tmosCYKv1Xta1Rx55JGVlZbn++uvXOv/uu+9OWVlZfvWrXxXbZsyYkf322y8VFRXp1atXfvjDHxb3MF1fN998c3bfffdUVFSkb9++ufXWW9/TPqWlxpSq46c//Wk++tGPZtttt812222Xgw8+OL/5zW8ajbnyyivzkY98JOXl5enatWtGjx6d5cuXNxqzYMGCDB8+PDU1NamoqMgOO+yQESNGpLa2dq3323fffdOmTZt07NgxI0aMyOLFi9/T1+Rfeeihh3L44Yenqqoq2267bT7xiU/kgQceKPb/4he/SFlZWWbOnLnWuT/84Q9TVlaWP/zhD8W2P/7xj/m3f/u3dOzYMRUVFdlvv/1y++23b5BaAeD9+Nvf/pbjjjsulZWV6dSpU84444ysWrWq2F9WVpZTTz21+H1EmzZtMmDAgDz55JNJ/r7e7bzzzqmoqMjAgQPX+vncnuiwabkTHZqhOXPm5Pe//31GjBiRHXbYIc8//3wmT56cgQMH5umnn862226bJHnxxRfzyU9+MmVlZRk7dmzatm2bH/3oRykvL19rzp/85Cc58cQTM3jw4Fx88cV54403Mnny5Bx00EF57LHHLN4AbDTvZV3bb7/9stNOO+XnP/95TjzxxEbn33TTTdluu+0yePDgJMljjz2Www8/PF26dMmFF16YNWvWZPz48fnQhz603rXdeeedOf7449OvX79MnDgxr732Wk4++eR8+MMf3iDX/o4LL7ww3/zmN/Oxj30s48ePT+vWrfPQQw/lvvvuy2GHHZbk7+H7hRdemEGDBuUrX/lK5s+fn8mTJ2fOnDl54IEH0qpVq7z11lsZPHhw6uvrc9ppp6WmpiYvvvhifvWrX2X58uWpqqpKkkyYMCHjxo3Lcccdl3//93/PX//611xxxRU5+OCD89hjj6VDhw7v+1ruu+++DBkyJPvuu28uuOCCtGjRItddd10OOeSQ/Pa3v81HP/rRHHnkkWnXrl1+/vOf5xOf+ESj82+66aZ85CMfSd++fZMkTz31VA488MB8+MMfzje+8Y20bds2P//5zzNs2LD88pe/zDHHHPO+awWA9+u4447LjjvumIkTJ+bBBx/M5Zdfntdeey033HBDccxvf/vb3H777Rk9enSSZOLEiTnqqKNy9tln58orr8xXv/rVvPbaa7nkkkvypS99Kffdd19TXQ5QAJqdN954Y6222bNnF5IUbrjhhmLbaaedVigrKys89thjxbZXXnml0LFjx0KSwsKFCwuFQqHw+uuvFzp06FA45ZRTGs25ZMmSQlVV1VrtALAhvdd1bezYsYVWrVoVXn311WJbfX19oUOHDoUvfelLxbajjz66sO222xZefPHFYtuCBQsKLVu2LKzvt8f9+vUr7LDDDoXXX3+92DZjxoxCkkKPHj0ajU1SuOCCC4qvTzzxxLXGFAqFwgUXXNCojgULFhRatGhROOaYYwpr1qxpNLahoaFQKBQKy5YtK7Ru3bpw2GGHNRrzgx/8oJCk8L//+7+FQqFQeOyxxwpJCjfffHPJa3r++ecL22yzTWHChAmN2p988slCy5Yt12r/Z+6///5CksL9999frHeXXXYpDB48uFh7ofD3v+OePXsWPvWpTxXbPvvZzxY6d+5cePvtt4ttL7/8cqFFixaF8ePHF9sOPfTQQr9+/QqrVq1q9HX52Mc+Vthll11K1gIAG8M76/inP/3pRu1f/epXC0kKjz/+eKFQ+Pv3BeXl5cWfuwuFQuGHP/xhIUmhpqamUFdXV2wfO3Zso5/RC4XS30cAG4ftXKAZatOmTfHPq1evziuvvJKdd945HTp0yKOPPlrsmzZtWgYMGJC99tqr2NaxY8eMHDmy0Xz33HNPli9fns9+9rP529/+Vjy22Wab9O/fP/fff/9GvyYAtl7vdV07/vjjs3r16txyyy3Ftt/85jdZvnx5jj/++CTJmjVrcu+992bYsGHp2rVrcdzOO++cIUOGrFddL730Up588smccMIJadeuXbH9E5/4RPr167fe11nKbbfdloaGhpx//vlp0aLxt+/vbPty77335q233sqZZ57ZaMwpp5ySysrK3HnnnUlSvNP87rvvzhtvvLHO97vlllvS0NCQ4447rtG6X1NTk1122eUDrfvz5s3LggUL8rnPfS6vvPJKce6VK1fm0EMPzaxZs4oPSTv++OOzbNmyzJgxo3j+L37xizQ0NBT/Pl999dXcd999Oe644/L6668X53vllVcyePDgLFiwIC+++OL7rhcA3q937i5/x2mnnZYk+fWvf11sO/TQQxt9qrt///5JkuHDh6d9+/Zrtf/5z3/eWOUC/4LtXKAZevPNNzNx4sRcd911efHFF1MoFIp9/7jf6QsvvJABAwasdf7OO+/c6PWCBQuSJIcccsg636+ysnJDlA0A6/Re17U999wzvXv3zk033ZSTTz45yd+3/th+++2La9iyZcvy5ptvrrXWJWuvf//KCy+8UPK8nXfeuVHA/0E899xzadGiRXbfffd/Wctuu+3WqL1169bZaaediv09e/bMmDFj8r3vfS9TpkzJxz/+8Xz605/O5z//+WLAvmDBghQKheyyyy7rfK8P8iCzd76nePeWO/+otrY22223XXHP9JtuuimHHnpokr//fe61117ZddddkyTPPvtsCoVCxo0bl3Hjxq1zvmXLlm3w7XUA4F959zraq1evtGjRotHe5t27d2805p21uFu3butsf+211zZCpcB7IUSHZui0007LddddlzPPPDMDBgxIVVVVysrKMmLEiOLdXevjnXN+8pOfpKamZq3+li39UwLAxrM+69rxxx+fCRMm5G9/+1vat2+f22+/PZ/97Gc3y7Wq1ENM16xZs1Hf97vf/W6++MUv5v/+7//ym9/8Jqeffnpxv9YddtghDQ0NKSsry1133ZVtttlmrfP/8a779fXO39d///d/N/ok3LrmLy8vz7Bhw3LrrbfmyiuvzNKlS/PAAw/k29/+9lrz/ed//mdxz/t3W99fjgDAxrCudX9d6+w/a//HGwmATWvz+2kC+MB+8Ytf5MQTT8x3v/vdYtuqVauyfPnyRuN69OiRZ599dq3z393Wq1evJEnnzp0zaNCgDV8wAPwT73VdS/4eol944YX55S9/merq6tTV1WXEiBHF/s6dO6eiouI9rX//So8ePUqe917m2m677dZ5De/cNf6OXr16paGhIU8//XTJ4PmdWubPn5+ddtqp2P7WW29l4cKFa63f/fr1S79+/XLeeefl97//fQ488MBcddVVueiii9KrV68UCoX07NmzeMf3hvLO9xSVlZXv6XuK448/Ptdff32mT5+eZ555JoV
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x500 with 3 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def remove_outliers(df):\n",
|
|||
|
"\n",
|
|||
|
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
|
|||
|
" for column in numeric_columns:\n",
|
|||
|
" Q1 = df[column].quantile(0.25)\n",
|
|||
|
" Q3 = df[column].quantile(0.75)\n",
|
|||
|
" IQR = Q3 - Q1\n",
|
|||
|
" lower_bound = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound = Q3 + 1.5 * IQR\n",
|
|||
|
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
|
|||
|
" return df\n",
|
|||
|
" \n",
|
|||
|
"data = remove_outliers(data)\n",
|
|||
|
"plot_numeric_boxplots(data)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Теперь можно и к конструированию признаков приступить) данные ведь сбалансированы (в выборках)</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Унитарное кодирование категориальных признаков <br/> <br/>Применяем к категориальным (НЕ числовым) признакам: 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 335,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Данные после унитарного кодирования:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>heart_disease</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>stroke</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>gender_Other</th>\n",
|
|||
|
" <th>ever_married_Yes</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>Residence_type_Urban</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>9046</td>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>36.6</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>51676</td>\n",
|
|||
|
" <td>61.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>31112</td>\n",
|
|||
|
" <td>80.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>105.9200</td>\n",
|
|||
|
" <td>32.5</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>60182</td>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>34.4</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1665</td>\n",
|
|||
|
" <td>79.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>56669</td>\n",
|
|||
|
" <td>81.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>29.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>53882</td>\n",
|
|||
|
" <td>74.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>70.0900</td>\n",
|
|||
|
" <td>27.4</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>10434</td>\n",
|
|||
|
" <td>69.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>94.3900</td>\n",
|
|||
|
" <td>22.8</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>27419</td>\n",
|
|||
|
" <td>59.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>76.1500</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>60491</td>\n",
|
|||
|
" <td>78.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>58.5700</td>\n",
|
|||
|
" <td>24.2</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id age hypertension heart_disease avg_glucose_level bmi stroke \\\n",
|
|||
|
"0 9046 67.0 0 1 169.3575 36.6 1 \n",
|
|||
|
"1 51676 61.0 0 0 169.3575 28.1 1 \n",
|
|||
|
"2 31112 80.0 0 1 105.9200 32.5 1 \n",
|
|||
|
"3 60182 49.0 0 0 169.3575 34.4 1 \n",
|
|||
|
"4 1665 79.0 1 0 169.3575 24.0 1 \n",
|
|||
|
"5 56669 81.0 0 0 169.3575 29.0 1 \n",
|
|||
|
"6 53882 74.0 1 1 70.0900 27.4 1 \n",
|
|||
|
"7 10434 69.0 0 0 94.3900 22.8 1 \n",
|
|||
|
"8 27419 59.0 0 0 76.1500 28.1 1 \n",
|
|||
|
"9 60491 78.0 0 0 58.5700 24.2 1 \n",
|
|||
|
"\n",
|
|||
|
" gender_Male gender_Other ever_married_Yes work_type_Never_worked \\\n",
|
|||
|
"0 True False True False \n",
|
|||
|
"1 False False True False \n",
|
|||
|
"2 True False True False \n",
|
|||
|
"3 False False True False \n",
|
|||
|
"4 False False True False \n",
|
|||
|
"5 True False True False \n",
|
|||
|
"6 True False True False \n",
|
|||
|
"7 False False False False \n",
|
|||
|
"8 False False True False \n",
|
|||
|
"9 False False True False \n",
|
|||
|
"\n",
|
|||
|
" work_type_Private work_type_Self-employed work_type_children \\\n",
|
|||
|
"0 True False False \n",
|
|||
|
"1 False True False \n",
|
|||
|
"2 True False False \n",
|
|||
|
"3 True False False \n",
|
|||
|
"4 False True False \n",
|
|||
|
"5 True False False \n",
|
|||
|
"6 True False False \n",
|
|||
|
"7 True False False \n",
|
|||
|
"8 True False False \n",
|
|||
|
"9 True False False \n",
|
|||
|
"\n",
|
|||
|
" Residence_type_Urban smoking_status_formerly smoked \\\n",
|
|||
|
"0 True True \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False False \n",
|
|||
|
"3 True False \n",
|
|||
|
"4 False False \n",
|
|||
|
"5 True True \n",
|
|||
|
"6 False False \n",
|
|||
|
"7 True False \n",
|
|||
|
"8 False False \n",
|
|||
|
"9 True False \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_never smoked smoking_status_smokes \n",
|
|||
|
"0 False False \n",
|
|||
|
"1 True False \n",
|
|||
|
"2 True False \n",
|
|||
|
"3 False True \n",
|
|||
|
"4 True False \n",
|
|||
|
"5 False False \n",
|
|||
|
"6 True False \n",
|
|||
|
"7 True False \n",
|
|||
|
"8 False False \n",
|
|||
|
"9 False False "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 335,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# One-Hot Encoding\n",
|
|||
|
"categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
|
|||
|
"data_edit_categories = pd.get_dummies(data, columns=categorical_columns, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Данные после унитарного кодирования:\")\n",
|
|||
|
"data_edit_categories.head(10)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Дискретизация числовых признаков<br/><br/>Числовые признаки, такие как 'age', 'avg_glucose_level', 'bmi', можно разделить на категории (биннинг).</p>\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 336,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# data_edit_categories['age_bins'] = pd.cut(data_edit_categories['age'], bins=[0, 18, 30, 50, 100], labels=['ребенок', 'молодой', 'средний', 'пожилой'])\n",
|
|||
|
"# data_edit_categories['bmi_bins'] = pd.cut(data_edit_categories['bmi'], bins=[0, 18.5, 25, 30, 50], labels=['низкий', 'норма', 'избыток', 'ожирение'])\n",
|
|||
|
"\n",
|
|||
|
"# print(\"Данные после дискретизации:\")\n",
|
|||
|
"# data_edit_categories[['age_bins', 'bmi_bins']].head(10)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Ручной синтез новых признаков <br/><br/>\n",
|
|||
|
"<li>Возрастной индекс глюкозы: age * avg_glucose_level\n",
|
|||
|
"<li>Индекс массы тела с поправкой на глюкозу: bmi / avg_glucose_level </p>\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 337,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Данные после синтеза новых признаков:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>age_glucose_index</th>\n",
|
|||
|
" <th>bmi_glucose_ratio</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>11346.9525</td>\n",
|
|||
|
" <td>0.216111</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>10330.8075</td>\n",
|
|||
|
" <td>0.165921</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>8473.6000</td>\n",
|
|||
|
" <td>0.306835</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>8298.5175</td>\n",
|
|||
|
" <td>0.203121</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>13379.2425</td>\n",
|
|||
|
" <td>0.141712</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>13717.9575</td>\n",
|
|||
|
" <td>0.171235</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>5186.6600</td>\n",
|
|||
|
" <td>0.390926</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>6512.9100</td>\n",
|
|||
|
" <td>0.241551</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>4492.8500</td>\n",
|
|||
|
" <td>0.369009</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>4568.4600</td>\n",
|
|||
|
" <td>0.413181</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" age_glucose_index bmi_glucose_ratio\n",
|
|||
|
"0 11346.9525 0.216111\n",
|
|||
|
"1 10330.8075 0.165921\n",
|
|||
|
"2 8473.6000 0.306835\n",
|
|||
|
"3 8298.5175 0.203121\n",
|
|||
|
"4 13379.2425 0.141712\n",
|
|||
|
"5 13717.9575 0.171235\n",
|
|||
|
"6 5186.6600 0.390926\n",
|
|||
|
"7 6512.9100 0.241551\n",
|
|||
|
"8 4492.8500 0.369009\n",
|
|||
|
"9 4568.4600 0.413181"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 337,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data_edit_categories['age_glucose_index'] = data_edit_categories['age'] * data_edit_categories['avg_glucose_level']\n",
|
|||
|
"data_edit_categories['bmi_glucose_ratio'] = data_edit_categories['bmi'] / data_edit_categories['avg_glucose_level']\n",
|
|||
|
"\n",
|
|||
|
"print(\"Данные после синтеза новых признаков:\")\n",
|
|||
|
"data_edit_categories[['age_glucose_index', 'bmi_glucose_ratio']].head(10)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Масштабирование признаков<br/><br/>Применяем нормализацию (для сжатия в диапазон [0, 1]) и стандартизацию (для приведения к среднему 0 и стандартному отклонению 1)</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 338,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Данные после нормализации:\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>heart_disease</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>stroke</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>gender_Other</th>\n",
|
|||
|
" <th>ever_married_Yes</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>Residence_type_Urban</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" <th>age_glucose_index</th>\n",
|
|||
|
" <th>bmi_glucose_ratio</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>9046</td>\n",
|
|||
|
" <td>0.816895</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.730556</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>11346.9525</td>\n",
|
|||
|
" <td>0.216111</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>51676</td>\n",
|
|||
|
" <td>0.743652</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>10330.8075</td>\n",
|
|||
|
" <td>0.165921</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>31112</td>\n",
|
|||
|
" <td>0.975586</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.444688</td>\n",
|
|||
|
" <td>0.616667</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>8473.6000</td>\n",
|
|||
|
" <td>0.306835</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>60182</td>\n",
|
|||
|
" <td>0.597168</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.669444</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>8298.5175</td>\n",
|
|||
|
" <td>0.203121</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1665</td>\n",
|
|||
|
" <td>0.963379</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.380556</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>13379.2425</td>\n",
|
|||
|
" <td>0.141712</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5105</th>\n",
|
|||
|
" <td>18234</td>\n",
|
|||
|
" <td>0.975586</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.250618</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>6700.0000</td>\n",
|
|||
|
" <td>0.335522</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5106</th>\n",
|
|||
|
" <td>44873</td>\n",
|
|||
|
" <td>0.987793</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.613459</td>\n",
|
|||
|
" <td>0.825000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>10141.2000</td>\n",
|
|||
|
" <td>0.319489</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5107</th>\n",
|
|||
|
" <td>19723</td>\n",
|
|||
|
" <td>0.426270</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.243965</td>\n",
|
|||
|
" <td>0.563889</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>2904.6500</td>\n",
|
|||
|
" <td>0.368719</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5108</th>\n",
|
|||
|
" <td>37544</td>\n",
|
|||
|
" <td>0.621582</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.973148</td>\n",
|
|||
|
" <td>0.425000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>8480.7900</td>\n",
|
|||
|
" <td>0.153948</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5109</th>\n",
|
|||
|
" <td>44679</td>\n",
|
|||
|
" <td>0.536133</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.264011</td>\n",
|
|||
|
" <td>0.441667</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>3752.3200</td>\n",
|
|||
|
" <td>0.307223</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5110 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id age hypertension heart_disease avg_glucose_level \\\n",
|
|||
|
"0 9046 0.816895 0 1 1.000000 \n",
|
|||
|
"1 51676 0.743652 0 0 1.000000 \n",
|
|||
|
"2 31112 0.975586 0 1 0.444688 \n",
|
|||
|
"3 60182 0.597168 0 0 1.000000 \n",
|
|||
|
"4 1665 0.963379 1 0 1.000000 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 18234 0.975586 1 0 0.250618 \n",
|
|||
|
"5106 44873 0.987793 0 0 0.613459 \n",
|
|||
|
"5107 19723 0.426270 0 0 0.243965 \n",
|
|||
|
"5108 37544 0.621582 0 0 0.973148 \n",
|
|||
|
"5109 44679 0.536133 0 0 0.264011 \n",
|
|||
|
"\n",
|
|||
|
" bmi stroke gender_Male gender_Other ever_married_Yes \\\n",
|
|||
|
"0 0.730556 1 True False True \n",
|
|||
|
"1 0.494444 1 False False True \n",
|
|||
|
"2 0.616667 1 True False True \n",
|
|||
|
"3 0.669444 1 False False True \n",
|
|||
|
"4 0.380556 1 False False True \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 0.494444 0 False False True \n",
|
|||
|
"5106 0.825000 0 False False True \n",
|
|||
|
"5107 0.563889 0 False False True \n",
|
|||
|
"5108 0.425000 0 True False True \n",
|
|||
|
"5109 0.441667 0 False False True \n",
|
|||
|
"\n",
|
|||
|
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False False True \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False True \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5105 False True False \n",
|
|||
|
"5106 False False True \n",
|
|||
|
"5107 False False True \n",
|
|||
|
"5108 False True False \n",
|
|||
|
"5109 False False False \n",
|
|||
|
"\n",
|
|||
|
" work_type_children Residence_type_Urban \\\n",
|
|||
|
"0 False True \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False False \n",
|
|||
|
"3 False True \n",
|
|||
|
"4 False False \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5105 False True \n",
|
|||
|
"5106 False True \n",
|
|||
|
"5107 False False \n",
|
|||
|
"5108 False False \n",
|
|||
|
"5109 False True \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
|||
|
"0 True False \n",
|
|||
|
"1 False True \n",
|
|||
|
"2 False True \n",
|
|||
|
"3 False False \n",
|
|||
|
"4 False True \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5105 False True \n",
|
|||
|
"5106 False True \n",
|
|||
|
"5107 False True \n",
|
|||
|
"5108 True False \n",
|
|||
|
"5109 False False \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_smokes age_glucose_index bmi_glucose_ratio \n",
|
|||
|
"0 False 11346.9525 0.216111 \n",
|
|||
|
"1 False 10330.8075 0.165921 \n",
|
|||
|
"2 False 8473.6000 0.306835 \n",
|
|||
|
"3 True 8298.5175 0.203121 \n",
|
|||
|
"4 False 13379.2425 0.141712 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5105 False 6700.0000 0.335522 \n",
|
|||
|
"5106 False 10141.2000 0.319489 \n",
|
|||
|
"5107 False 2904.6500 0.368719 \n",
|
|||
|
"5108 False 8480.7900 0.153948 \n",
|
|||
|
"5109 False 3752.3200 0.307223 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 338,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
|
|||
|
"\n",
|
|||
|
"scaler = MinMaxScaler()\n",
|
|||
|
"standardizer = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Нормализация\n",
|
|||
|
"data_edit_categories[['age', 'avg_glucose_level', 'bmi']] = scaler.fit_transform(data_edit_categories[['age', 'avg_glucose_level', 'bmi']])\n",
|
|||
|
"print(\"Данные после нормализации:\\n\")\n",
|
|||
|
"data_edit_categories\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# # Стандартизация\n",
|
|||
|
"# X_encoded[['age', 'avg_glucose_level', 'bmi']] = standardizer.fit_transform(X_encoded[['age', 'avg_glucose_level', 'bmi']])\n",
|
|||
|
"# print(\"Данные после стандартизации:\\n\", X_encoded.head(10))\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Конструирование признаков с применением фреймворка Featuretools</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 339,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Столбцы в data: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']\n",
|
|||
|
"id 0\n",
|
|||
|
"gender 0\n",
|
|||
|
"age 0\n",
|
|||
|
"hypertension 0\n",
|
|||
|
"heart_disease 0\n",
|
|||
|
"ever_married 0\n",
|
|||
|
"work_type 0\n",
|
|||
|
"Residence_type 0\n",
|
|||
|
"avg_glucose_level 0\n",
|
|||
|
"bmi 0\n",
|
|||
|
"smoking_status 0\n",
|
|||
|
"stroke 0\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\mai\\labs\\AIM-PIbd-31-Bakalskaya-E-D\\lab_3\\venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Сгенерированные признаки:\n",
|
|||
|
" gender age hypertension heart_disease ever_married work_type \\\n",
|
|||
|
"id \n",
|
|||
|
"9046 Male 67.0 0 1 True Private \n",
|
|||
|
"51676 Female 61.0 0 0 True Self-employed \n",
|
|||
|
"31112 Male 80.0 0 1 True Private \n",
|
|||
|
"60182 Female 49.0 0 0 True Private \n",
|
|||
|
"1665 Female 79.0 1 0 True Self-employed \n",
|
|||
|
"\n",
|
|||
|
" Residence_type avg_glucose_level bmi smoking_status stroke \n",
|
|||
|
"id \n",
|
|||
|
"9046 Urban 169.3575 36.6 formerly smoked 1 \n",
|
|||
|
"51676 Rural 169.3575 28.1 never smoked 1 \n",
|
|||
|
"31112 Rural 105.9200 32.5 never smoked 1 \n",
|
|||
|
"60182 Urban 169.3575 34.4 smokes 1 \n",
|
|||
|
"1665 Rural 169.3575 24.0 never smoked 1 \n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>gender</th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>heart_disease</th>\n",
|
|||
|
" <th>ever_married</th>\n",
|
|||
|
" <th>work_type</th>\n",
|
|||
|
" <th>Residence_type</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>smoking_status</th>\n",
|
|||
|
" <th>stroke</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th></th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9046</th>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Urban</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>36.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>51676</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>61.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>Rural</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>31112</th>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>80.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Rural</td>\n",
|
|||
|
" <td>105.9200</td>\n",
|
|||
|
" <td>32.5</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>60182</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Urban</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>34.4</td>\n",
|
|||
|
" <td>smokes</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1665</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>79.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>Rural</td>\n",
|
|||
|
" <td>169.3575</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18234</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>80.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Urban</td>\n",
|
|||
|
" <td>83.7500</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>44873</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>81.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>Urban</td>\n",
|
|||
|
" <td>125.2000</td>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19723</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>Rural</td>\n",
|
|||
|
" <td>82.9900</td>\n",
|
|||
|
" <td>30.6</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>37544</th>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>51.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>Rural</td>\n",
|
|||
|
" <td>166.2900</td>\n",
|
|||
|
" <td>25.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>44679</th>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>44.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Govt_job</td>\n",
|
|||
|
" <td>Urban</td>\n",
|
|||
|
" <td>85.2800</td>\n",
|
|||
|
" <td>26.2</td>\n",
|
|||
|
" <td>Unknown</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5110 rows × 11 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" gender age hypertension heart_disease ever_married work_type \\\n",
|
|||
|
"id \n",
|
|||
|
"9046 Male 67.0 0 1 True Private \n",
|
|||
|
"51676 Female 61.0 0 0 True Self-employed \n",
|
|||
|
"31112 Male 80.0 0 1 True Private \n",
|
|||
|
"60182 Female 49.0 0 0 True Private \n",
|
|||
|
"1665 Female 79.0 1 0 True Self-employed \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"18234 Female 80.0 1 0 True Private \n",
|
|||
|
"44873 Female 81.0 0 0 True Self-employed \n",
|
|||
|
"19723 Female 35.0 0 0 True Self-employed \n",
|
|||
|
"37544 Male 51.0 0 0 True Private \n",
|
|||
|
"44679 Female 44.0 0 0 True Govt_job \n",
|
|||
|
"\n",
|
|||
|
" Residence_type avg_glucose_level bmi smoking_status stroke \n",
|
|||
|
"id \n",
|
|||
|
"9046 Urban 169.3575 36.6 formerly smoked 1 \n",
|
|||
|
"51676 Rural 169.3575 28.1 never smoked 1 \n",
|
|||
|
"31112 Rural 105.9200 32.5 never smoked 1 \n",
|
|||
|
"60182 Urban 169.3575 34.4 smokes 1 \n",
|
|||
|
"1665 Rural 169.3575 24.0 never smoked 1 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"18234 Urban 83.7500 28.1 never smoked 0 \n",
|
|||
|
"44873 Urban 125.2000 40.0 never smoked 0 \n",
|
|||
|
"19723 Rural 82.9900 30.6 never smoked 0 \n",
|
|||
|
"37544 Rural 166.2900 25.6 formerly smoked 0 \n",
|
|||
|
"44679 Urban 85.2800 26.2 Unknown 0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 11 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 339,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft\n",
|
|||
|
"\n",
|
|||
|
"print(\"Столбцы в data:\", data.columns.tolist())\n",
|
|||
|
"print(data.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"# Создание EntitySet (основная структура для Featuretools)\n",
|
|||
|
"entity = ft.EntitySet(id=\"stroke_prediction\")\n",
|
|||
|
"\n",
|
|||
|
"entity = entity.add_dataframe(\n",
|
|||
|
" dataframe_name=\"data\", \n",
|
|||
|
" dataframe=data, \n",
|
|||
|
" index=\"id\",\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"# Генерация новых признаков\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=entity,\n",
|
|||
|
" target_dataframe_name=\"data\", # Основная таблица\n",
|
|||
|
" max_depth=2 # Уровень вложенности\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Сгенерированные признаки:\")\n",
|
|||
|
"print(feature_matrix.head())\n",
|
|||
|
"\n",
|
|||
|
"# Сохранение результатов\n",
|
|||
|
"feature_matrix.to_csv(\"./csv/generated_features_copy.csv\", index=False)\n",
|
|||
|
"feature_matrix"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Так, теперь разобьем на выборки</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 340,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Размеры выборок:\n",
|
|||
|
"Обучающая выборка: (4088, 18)\n",
|
|||
|
"Тестовая выборка: (511, 18)\n",
|
|||
|
"Контрольная выборка: (511, 18)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>id</th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>heart_disease</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>stroke</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>gender_Other</th>\n",
|
|||
|
" <th>ever_married_Yes</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>Residence_type_Urban</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" <th>age_glucose_index</th>\n",
|
|||
|
" <th>bmi_glucose_ratio</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>9046</td>\n",
|
|||
|
" <td>0.816895</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.730556</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>11346.9525</td>\n",
|
|||
|
" <td>0.216111</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>51676</td>\n",
|
|||
|
" <td>0.743652</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>10330.8075</td>\n",
|
|||
|
" <td>0.165921</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>31112</td>\n",
|
|||
|
" <td>0.975586</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.444688</td>\n",
|
|||
|
" <td>0.616667</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>8473.6000</td>\n",
|
|||
|
" <td>0.306835</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>60182</td>\n",
|
|||
|
" <td>0.597168</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.669444</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>8298.5175</td>\n",
|
|||
|
" <td>0.203121</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>1665</td>\n",
|
|||
|
" <td>0.963379</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.380556</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>13379.2425</td>\n",
|
|||
|
" <td>0.141712</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5105</th>\n",
|
|||
|
" <td>18234</td>\n",
|
|||
|
" <td>0.975586</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.250618</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>6700.0000</td>\n",
|
|||
|
" <td>0.335522</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5106</th>\n",
|
|||
|
" <td>44873</td>\n",
|
|||
|
" <td>0.987793</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.613459</td>\n",
|
|||
|
" <td>0.825000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>10141.2000</td>\n",
|
|||
|
" <td>0.319489</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5107</th>\n",
|
|||
|
" <td>19723</td>\n",
|
|||
|
" <td>0.426270</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.243965</td>\n",
|
|||
|
" <td>0.563889</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>2904.6500</td>\n",
|
|||
|
" <td>0.368719</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5108</th>\n",
|
|||
|
" <td>37544</td>\n",
|
|||
|
" <td>0.621582</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.973148</td>\n",
|
|||
|
" <td>0.425000</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>8480.7900</td>\n",
|
|||
|
" <td>0.153948</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5109</th>\n",
|
|||
|
" <td>44679</td>\n",
|
|||
|
" <td>0.536133</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.264011</td>\n",
|
|||
|
" <td>0.441667</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>3752.3200</td>\n",
|
|||
|
" <td>0.307223</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5110 rows × 20 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" id age hypertension heart_disease avg_glucose_level \\\n",
|
|||
|
"0 9046 0.816895 0 1 1.000000 \n",
|
|||
|
"1 51676 0.743652 0 0 1.000000 \n",
|
|||
|
"2 31112 0.975586 0 1 0.444688 \n",
|
|||
|
"3 60182 0.597168 0 0 1.000000 \n",
|
|||
|
"4 1665 0.963379 1 0 1.000000 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 18234 0.975586 1 0 0.250618 \n",
|
|||
|
"5106 44873 0.987793 0 0 0.613459 \n",
|
|||
|
"5107 19723 0.426270 0 0 0.243965 \n",
|
|||
|
"5108 37544 0.621582 0 0 0.973148 \n",
|
|||
|
"5109 44679 0.536133 0 0 0.264011 \n",
|
|||
|
"\n",
|
|||
|
" bmi stroke gender_Male gender_Other ever_married_Yes \\\n",
|
|||
|
"0 0.730556 1 True False True \n",
|
|||
|
"1 0.494444 1 False False True \n",
|
|||
|
"2 0.616667 1 True False True \n",
|
|||
|
"3 0.669444 1 False False True \n",
|
|||
|
"4 0.380556 1 False False True \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 0.494444 0 False False True \n",
|
|||
|
"5106 0.825000 0 False False True \n",
|
|||
|
"5107 0.563889 0 False False True \n",
|
|||
|
"5108 0.425000 0 True False True \n",
|
|||
|
"5109 0.441667 0 False False True \n",
|
|||
|
"\n",
|
|||
|
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
|
|||
|
"0 False True False \n",
|
|||
|
"1 False False True \n",
|
|||
|
"2 False True False \n",
|
|||
|
"3 False True False \n",
|
|||
|
"4 False False True \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5105 False True False \n",
|
|||
|
"5106 False False True \n",
|
|||
|
"5107 False False True \n",
|
|||
|
"5108 False True False \n",
|
|||
|
"5109 False False False \n",
|
|||
|
"\n",
|
|||
|
" work_type_children Residence_type_Urban \\\n",
|
|||
|
"0 False True \n",
|
|||
|
"1 False False \n",
|
|||
|
"2 False False \n",
|
|||
|
"3 False True \n",
|
|||
|
"4 False False \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5105 False True \n",
|
|||
|
"5106 False True \n",
|
|||
|
"5107 False False \n",
|
|||
|
"5108 False False \n",
|
|||
|
"5109 False True \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
|||
|
"0 True False \n",
|
|||
|
"1 False True \n",
|
|||
|
"2 False True \n",
|
|||
|
"3 False False \n",
|
|||
|
"4 False True \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5105 False True \n",
|
|||
|
"5106 False True \n",
|
|||
|
"5107 False True \n",
|
|||
|
"5108 True False \n",
|
|||
|
"5109 False False \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_smokes age_glucose_index bmi_glucose_ratio \n",
|
|||
|
"0 False 11346.9525 0.216111 \n",
|
|||
|
"1 False 10330.8075 0.165921 \n",
|
|||
|
"2 False 8473.6000 0.306835 \n",
|
|||
|
"3 True 8298.5175 0.203121 \n",
|
|||
|
"4 False 13379.2425 0.141712 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"5105 False 6700.0000 0.335522 \n",
|
|||
|
"5106 False 10141.2000 0.319489 \n",
|
|||
|
"5107 False 2904.6500 0.368719 \n",
|
|||
|
"5108 False 8480.7900 0.153948 \n",
|
|||
|
"5109 False 3752.3200 0.307223 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 20 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 340,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Определение признаков и целевой переменной\n",
|
|||
|
"\n",
|
|||
|
"# data_edit_categories = pd.read_csv('./csv/generated_features_copy.csv')\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"X = data_edit_categories.drop(columns=['id', 'stroke']) \n",
|
|||
|
"y = data_edit_categories['stroke'] \n",
|
|||
|
"\n",
|
|||
|
"# Обучающая выборка\n",
|
|||
|
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y)\n",
|
|||
|
"\n",
|
|||
|
"# Тестовая и контрольная выборки\n",
|
|||
|
"X_test, X_control, y_test, y_control = train_test_split(X_temp, y_temp, test_size=0.5, random_state=None, stratify=y_temp)\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nРазмеры выборок:\")\n",
|
|||
|
"print(f\"Обучающая выборка: {X_train.shape}\")\n",
|
|||
|
"print(f\"Тестовая выборка: {X_test.shape}\")\n",
|
|||
|
"print(f\"Контрольная выборка: {X_control.shape}\")\n",
|
|||
|
"\n",
|
|||
|
"data_edit_categories\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 341,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"stroke\n",
|
|||
|
"0 4861\n",
|
|||
|
"1 249\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9kUlEQVR4nO3deXyM9/7//2d2IZlEkIRSQmqJpa0oglpDqlFVnFYpqpyWhhYtTs6ptfXVcmorqqeLpaUt1XLKsUQUVSkaja042sahjSRCk0ErieT6/dFP5mckthGZxPW4325zu5n39b7e1+uaTDJP1/W+rnExDMMQAACAibk6uwAAAABnIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABABz2/PPPq3Pnzs4uw2bx4sVycXHRd999d8tj5ebmqkaNGlqwYEExVIbSjkCEO1rBH8eCR7ly5VS3bl0NHz5caWlpzi4PKNOSk5P13nvv6e9///tNrbdz505NmjRJmZmZt6ewYuLh4aHRo0dr6tSpunjxorPLwW1GIIIpTJkyRR9++KHmzZunVq1a6e2331ZERIR+//13Z5cGlFlz5sxRSEiIOnTocFPr7dy5U5MnTy71gUiSBg0apIyMDC1fvtzZpeA2IxDBFLp27aqnnnpKQ4YM0eLFizVy5EglJydrzZo1zi4NKJNyc3O1bNkyPf7447d1O/n5+U49OuPv768uXbpo8eLFTqsBJYNABFPq2LGjpD8P+UvS2bNn9fLLL6tx48by8fGRxWJR165dtW/fvkLrXrx4UZMmTVLdunVVrlw5Va1aVT179tRPP/0kSTp+/LjdaborH+3bt7eNtXXrVrm4uOjTTz/V3//+dwUHB6tChQrq3r27Tp48WWjbu3bt0kMPPSQ/Pz+VL19e7dq10zfffFPkPrZv377I7U+aNKlQ348++kjh4eHy9vZWQECA+vTpU+T2r7Vvl8vPz9fs2bPVsGFDlStXTkFBQXruuef022+/2fWrVauWunXrVmg7w4cPLzRmUbXPmDGj0GsqSdnZ2Zo4caJCQ0Pl5eWlGjVqaOzYscrOzi7ytbpc+/btC403depUubq6FjpKcKOvxz//+U+1atVKlSpVkre3t8LDw/XZZ58Vuf2PPvpIzZs3V/ny5VWxYkW1bdtWmzZtsuuzfv16tWvXTr6+vrJYLHrggQcK1bZy5Urbz7Ry5cp66qmn9Ouvv9r1efrpp+1qrlixotq3b6+vv/76uq/Tjh07lJGRocjIyELL3nrrLTVs2NC2D82aNbPVN2nSJI0ZM0aSFBISYtv28ePHJf35cx4+fLiWLVumhg0bysvLSxs2bJAkff/99+ratassFot8fHzUqVMnffvtt9et9bffflPz5s1VvXp1HT16VNLNvUc6d+6sHTt26OzZs9fdFsoud2cXADhDQXipVKmSJOnnn3/W6tWr9Ze//EUhISFKS0vTO++8o3bt2umHH35QtWrVJEl5eXnq1q2b4uPj1adPH7344os6d+6c4uLidPDgQdWpU8e2jSeffFIPP/yw3XZjY2OLrGfq1KlycXHRuHHjlJ6ertmzZysyMlJJSUny9vaWJG3ZskVdu3ZVeHi4Jk6cKFdXVy1atEgdO3bU119/rebNmxcat3r16po2bZok6fz58xo2bFiR2x4/frwef/xxDRkyRKdPn9Zbb72ltm3b6vvvv5e/v3+hdZ599lk9+OCDkqTPP/9cX3zxhd3y5557TosXL9agQYP0wgsvKDk5WfPmzdP333+vb775Rh4eHkW+DjcjMzPTtm+Xy8/PV/fu3bVjxw49++yzatCggQ4cOKBZs2bpv//9r1avXn1T21m0aJFeeeUVvfnmm+rbt2+Rfa73esyZM0fdu3dXv379lJOTo08++UR/+ctftHbtWkVHR9v6TZ48WZMmTVKrVq00ZcoUeXp6ateuXdqyZYu6dOki6c95cc8884waNmyo2NhY+fv76/vvv9eGDRts9RW89g888ICmTZumtLQ0zZkzR998802hn2nlypU1a9YsSdIvv/yiOXPm6OGHH9bJkyeL/NkX2Llzp1xcXHT//ffbtb/77rt64YUX1Lt3b7344ou6ePGi9u/fr127dqlv377q2bOn/vvf/+rjjz/WrFmzVLlyZUlSlSpVbGNs2bJFK1as0PDhw1W5cmXVqlVLhw4d0oMPPiiLxaKxY8fKw8ND77zzjtq3b69t27apRYsWRdaZkZGhzp076+zZs9q2bZvq1Klz0++R8PBwGYahnTt3FhngcYcwgDvYokWLDEnG5s2bjdOnTxsnT540PvnkE6NSpUqGt7e38csvvxiGYRgXL1408vLy7NZNTk42vLy8jClTptjaPvjgA0OSMXPmzELbys/Pt60nyZgxY0ahPg0bNjTatWtne/7VV18Zkoy77rrLsFqttvYVK1YYkow5c+bYxr7nnnuMqKgo23YMwzB+//13IyQkxOjcuXOhbbVq1cpo1KiR7fnp06cNScbEiRNtbcePHzfc3NyMqVOn2q174MABw93dvVD7sWPHDEnGkiVLbG0TJ040Lv9T8vXXXxuSjGXLltmtu2HDhkLtNWvWNKKjowvVHhMTY1z55+nK2seOHWsEBgYa4eHhdq/phx9+aLi6uhpff/213foLFy40JBnffPNNoe1drl27drbx1q1bZ7i7uxsvvfRSkX1v5PUwjD9/TpfLyckxGjVqZHTs2NFuLFdXV+Oxxx4r9F4s+JlnZmYavr6+RosWLYw//vijyD45OTlGYGCg0ahRI7s+a9euNSQZEyZMsLUNHDjQqFmzpt04//rXvwxJxu7du4vc5wJPPfWUUalSpULtjz76qNGwYcNrrjtjxgxDkpGcnFxomSTD1dXVOHTokF17jx49DE9PT+Onn36ytaWkpBi+vr5G27ZtbW0Fv/N79uwxTp06ZTRs2NCoXbu2cfz4cVufm32PpKSkGJKMN95445r7hbKNU2YwhcjISFWpUkU1atRQnz595OPjoy+++EJ33XWXJMnLy0uurn/+OuTl5enMmTPy8fFRvXr1tHfvXts4q1atUuXKlTVixIhC27jyNMnNGDBggHx9fW3Pe/furapVq+o///mPJCkpKUnHjh1T3759debMGWVkZCgjI0MXLlxQp06dtH37duXn59uNefHiRZUrV+6a2/3888+Vn5+vxx9/3DZmRkaGgoODdc899+irr76y65+TkyPpz9fralauXCk/Pz917tzZbszw8HD5+PgUGjM3N9euX0ZGxnXnjPz666966623NH78ePn4+BTafoMGDVS/fn27MQtOk165/avZvXu3Hn/8cfXq1UszZswoss+NvB6SbEf5pD9P32RlZenBBx+0e2+tXr1a+fn5mjBhgu29WKDgvRUXF6dz587pb3/7W6GfbUGf7777Tunp6Xr++eft+kRHR6t+/fpat26d3Xr5+fm21ygpKUlLly5V1apV1aBBg2vu05kzZ1SxYsVC7f7+/vrll1+0Z8+ea65/Le3atVNYWJjteV5enjZt2qQePXqodu3atvaqVauqb9++2rFjh6xWq90Yv/zyi9q1a6fc3Fxt375dNWvWtC272fdIwX5mZGQ4vE8o/ThlBlOYP3++6tatK3d3dwUFBalevXp2Hzr5+fmaM2eOFixYoOTkZOXl5dmWFZxWk/481VavXj25uxfvr84999xj99zFxUWhoaG2eRXHjh2TJA0cOPCqY2RlZdl9QGVkZBQa90rHjh2TYRhX7Xflqa2Cq4KuDCFXjpmVlaXAwMAil6enp9s937Rpk93pkhsxceJEVatWTc8991yhuTjHjh3T4cOHrzrmldsvyq+//qro6GhduHBBZ86cuWrYvZHXQ5LWrl2r1157TUlJSXZzVC4f96effpKrq6tdELhSwaneRo0aXbXP//73P0lSvXr1Ci2rX7++duzYYdd28uRJu9eqatWqWrVq1XX3SZIMwyjUNm7cOG3evFnNmzdXaGiounTpor59+6p169bXHa9ASEiI3fPTp0/r999/L3KfGjR
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"\n",
|
|||
|
"# Подсчет количества объектов каждого класса\n",
|
|||
|
"class_counts = y.value_counts()\n",
|
|||
|
"print(class_counts)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация\n",
|
|||
|
"sns.barplot(x=class_counts.index, y=class_counts.values)\n",
|
|||
|
"plt.title(\"Распределение классов (stroke)\")\n",
|
|||
|
"plt.xlabel(\"Класс\")\n",
|
|||
|
"plt.ylabel(\"Количество\")\n",
|
|||
|
"plt.show()\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Напишем функцию и сделаем аугментацию данных</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 342,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\n",
|
|||
|
"\n",
|
|||
|
"stroke\n",
|
|||
|
"0 3889\n",
|
|||
|
"1 199\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"После оверсемплинга\n",
|
|||
|
"\n",
|
|||
|
"stroke\n",
|
|||
|
"0 3889\n",
|
|||
|
"1 777\n",
|
|||
|
"Name: count, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"После балансировки данных (андерсемплинга)\n",
|
|||
|
"\n",
|
|||
|
"stroke\n",
|
|||
|
"0 777\n",
|
|||
|
"1 777\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGbCAYAAAAr/4yjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8YklEQVR4nO3deXhTVcIG8PfeJE3apmVpaUsBoew7aNlBdlAEFRFBEUX8UGdcZ3R0hnHcdRwHR0Vxm3EEQcYFERxEFkGQVUD2nQItlK2lBUq3NNv5/ii9Q2iBLklO7s378+mDTXPvfZOmfXvOuUkUIYQAERERAFV2ACIiCh0sBSIi0rAUiIhIw1IgIiINS4GIiDQsBSIi0rAUiIhIw1IgIiINS4GIiDQsBSKqkYcffhhDhgyRHUMzY8YMKIqCX3/9tcb7crlcaNSoET744AM/JNMHw5dC2QOk7MNms6Fly5Z49NFHkZWVJTseka6lp6fjk08+wZ///Ocqbbdu3Tq8+OKLOHfuXGCC+YnFYsGTTz6J1157DQ6HQ3acoDB8KZR5+eWXMWvWLEybNg29evXChx9+iJ49e6KoqEh2NCLdmjp1KlJSUjBgwIAqbbdu3Tq89NJLIV8KADBx4kTk5OTgP//5j+woQRE2pTBs2DCMHz8ekyZNwowZM/C73/0O6enp+O6772RHI9Ill8uF2bNnY8yYMQE9jtfrlfpXeu3atTF06FDMmDFDWoZgCptSuNTAgQMBlA5/AeDMmTP4wx/+gA4dOsButyM2NhbDhg3D9u3by23rcDjw4osvomXLlrDZbKhfvz5GjRqFQ4cOAQAyMjJ8pqwu/ejfv7+2r5UrV0JRFHz11Vf485//jKSkJERHR+OWW25BZmZmuWNv2LABN954I2rVqoWoqCj069cPa9eurfA29u/fv8Ljv/jii+Wu+/nnnyM1NRWRkZGoW7cu7rzzzgqPf6XbdjGv14t33nkH7dq1g81mQ2JiIh566CGcPXvW53pNmjTBiBEjyh3n0UcfLbfPirJPmTKl3H0KACUlJXjhhRfQvHlzWK1WNGrUCM888wxKSkoqvK8u1r9//3L7e+2116Cqarm/Fit7f7z55pvo1asX4uLiEBkZidTUVHzzzTcVHv/zzz9Ht27dEBUVhTp16qBv375YunSpz3UWLVqEfv36ISYmBrGxsejatWu5bHPmzNG+p/Hx8Rg/fjyOHz/uc5377rvPJ3OdOnXQv39/rF69+qr305o1a5CTk4PBgweX+9p7772Hdu3aabehS5cuWr4XX3wRTz/9NAAgJSVFO3ZGRgaA0u/zo48+itmzZ6Ndu3awWq1YvHgxAGDr1q0YNmwYYmNjYbfbMWjQIPzyyy9XzXr27Fl069YNDRs2xP79+wFU7TEyZMgQrFmzBmfOnLnqsfTOLDuALGW/wOPi4gAAhw8fxvz583HHHXcgJSUFWVlZ+Pjjj9GvXz/s2bMHycnJAACPx4MRI0Zg+fLluPPOO/HEE08gPz8fP/74I3bt2oVmzZppx7jrrrtw0003+Rx38uTJFeZ57bXXoCgK/vjHPyI7OxvvvPMOBg8ejG3btiEyMhIA8NNPP2HYsGFITU3FCy+8AFVVMX36dAwcOBCrV69Gt27dyu23YcOGeP311wEABQUF+O1vf1vhsZ977jmMGTMGkyZNwunTp/Hee++hb9++2Lp1K2rXrl1umwcffBDXX389AODbb7/FvHnzfL7+0EMPYcaMGZg4cSIef/xxpKenY9q0adi6dSvWrl0Li8VS4f1QFefOndNu28W8Xi9uueUWrFmzBg8++CDatGmDnTt34u2338aBAwcwf/78Kh1n+vTp+Mtf/oJ//OMfGDduXIXXudr9MXXqVNxyyy24++674XQ68eWXX+KOO+7A999/j+HDh2vXe+mll/Diiy+iV69eePnllxEREYENGzbgp59+wtChQwGUrpPdf//9aNeuHSZPnozatWtj69atWLx4sZav7L7v2rUrXn/9dWRlZWHq1KlYu3Ztue9pfHw83n77bQDAsWPHMHXqVNx0003IzMys8HtfZt26dVAUBddee63P5f/617/w+OOPY/To0XjiiSfgcDiwY8cObNiwAePGjcOoUaNw4MABfPHFF3j77bcRHx8PAKhXr562j59++glff/01Hn30UcTHx6NJkybYvXs3rr/+esTGxuKZZ56BxWLBxx9/jP79++Pnn39G9+7dK8yZk5ODIUOG4MyZM/j555/RrFmzKj9GUlNTIYTAunXrKvwjxlCEwU2fPl0AEMuWLROnT58WmZmZ4ssvvxRxcXEiMjJSHDt2TAghhMPhEB6Px2fb9PR0YbVaxcsvv6xd9umnnwoA4q233ip3LK/Xq20HQEyZMqXcddq1ayf69eunfb5ixQoBQDRo0ECcP39eu/zrr78WAMTUqVO1fbdo0ULccMMN2nGEEKKoqEikpKSIIUOGlDtWr169RPv27bXPT58+LQCIF154QbssIyNDmEwm8dprr/lsu3PnTmE2m8tdnpaWJgCIzz77TLvshRdeEBc/lFavXi0AiNmzZ/tsu3jx4nKXN27cWAwfPrxc9kceeURc+vC8NPszzzwjEhISRGpqqs99OmvWLKGqqli9erXP9h999JEAINauXVvueBfr16+ftr+FCxcKs9ksnnrqqQqvW5n7Q4jS79PFnE6naN++vRg4cKDPvlRVFbfddlu5x2LZ9/zcuXMiJiZGdO/eXRQXF1d4HafTKRISEkT79u19rvP9998LAOL555/XLpswYYJo3Lixz37++c9/CgBi48aNFd7mMuPHjxdxcXHlLr/11ltFu3btrrjtlClTBACRnp5e7msAhKqqYvfu3T6Xjxw5UkRERIhDhw5pl504cULExMSIvn37apeV/cxv2rRJnDx5UrRr1040bdpUZGRkaNep6mPkxIkTAoB44403rni7jCBspo8GDx6MevXqoVGjRrjzzjtht9sxb948NGjQAABgtVqhqqV3h8fjQW5uLux2O1q1aoUtW7Zo+5k7dy7i4+Px2GOPlTvGpVMGVXHvvfciJiZG+3z06NGoX78+fvjhBwDAtm3bkJaWhnHjxiE3Nxc5OTnIyclBYWEhBg0ahFWrVsHr9frs0+FwwGazXfG43377LbxeL8aMGaPtMycnB0lJSWjRogVWrFjhc32n0wmg9P66nDlz5qBWrVoYMmSIzz5TU1Nht9vL7dPlcvlcLycn56pzyMePH8d7772H5557Dna7vdzx27Rpg9atW/vss2zK8NLjX87GjRsxZswY3H777ZgyZUqF16nM/QFAG+0BpVMZeXl5uP76630eW/Pnz4fX68Xzzz+vPRbLlD22fvzxR+Tn5+NPf/pTue9t2XV+/fVXZGdn4+GHH/a5zvDhw9G6dWssXLjQZzuv16vdR9u2bcPMmTNRv359tGnT5oq3KTc3F3Xq1Cl3ee3atXHs2DFs2rTpittfSb9+/dC2bVvtc4/Hg6VLl2LkyJFo2rSpdnn9+vUxbtw4rFmzBufPn/fZx7Fjx9CvXz+4XC6sWrUKjRs31r5W1cdI2e3Mycmp9m3Si7CZPnr//ffRsmVLmM1mJCYmolWrVj4/eF6vF1OnTsUHH3yA9PR0eDwe7WtlU0xA6bRTq1atYDb7965r0aKFz+eKoqB58+baPGtaWhoAYMKECZfdR15ens8PaU5OTrn9XiotLQ1CiMte79JpnrKzRS79RXzpPvPy8pCQkFDh17Ozs30+X7p0qc/UQWW88MILSE5OxkMPPVRubj4tLQ179+697D4vPX5Fjh8/juHDh6OwsBC5ubmXLfzK3B8A8P333+PVV1/Ftm3bfOasL97voUOHoKqqzy/DS5VNe7Zv3/6y1zly5AgAoFWrVuW+1rp1a6xZs8bnsszMTJ/7qn79+pg7d+5VbxMAiAreuPGPf/wjli1bhm7duqF58+YYOnQoxo0bh969e191f2VSUlJ8Pj99+jSKiooqvE1t2rSB1+tFZmYm2rVrp11+zz33wGw2Y+/evUhKSvLZpqqPkbLbWZM//PQibEqhW7du6NKly2W//te
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>heart_disease</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>gender_Other</th>\n",
|
|||
|
" <th>ever_married_Yes</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>Residence_type_Urban</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" <th>age_glucose_index</th>\n",
|
|||
|
" <th>bmi_glucose_ratio</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2508</th>\n",
|
|||
|
" <td>0.316406</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.176562</td>\n",
|
|||
|
" <td>0.341667</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>1957.540</td>\n",
|
|||
|
" <td>0.300173</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2435</th>\n",
|
|||
|
" <td>0.768066</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.351636</td>\n",
|
|||
|
" <td>0.591667</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>6003.270</td>\n",
|
|||
|
" <td>0.331619</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2547</th>\n",
|
|||
|
" <td>0.060059</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.250618</td>\n",
|
|||
|
" <td>0.216667</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>418.750</td>\n",
|
|||
|
" <td>0.216119</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3885</th>\n",
|
|||
|
" <td>0.914551</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.342882</td>\n",
|
|||
|
" <td>0.691667</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>7071.750</td>\n",
|
|||
|
" <td>0.373316</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>335</th>\n",
|
|||
|
" <td>0.426270</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.500974</td>\n",
|
|||
|
" <td>0.544444</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>3932.250</td>\n",
|
|||
|
" <td>0.266133</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4661</th>\n",
|
|||
|
" <td>0.853516</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.977778</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>11855.025</td>\n",
|
|||
|
" <td>0.268662</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4662</th>\n",
|
|||
|
" <td>0.926758</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.024510</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>4401.920</td>\n",
|
|||
|
" <td>0.485152</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4663</th>\n",
|
|||
|
" <td>0.682617</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" <td>0.836111</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>9484.020</td>\n",
|
|||
|
" <td>0.238549</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4664</th>\n",
|
|||
|
" <td>0.768066</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.313207</td>\n",
|
|||
|
" <td>0.494444</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>5726.700</td>\n",
|
|||
|
" <td>0.309131</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4665</th>\n",
|
|||
|
" <td>0.902344</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.156166</td>\n",
|
|||
|
" <td>0.583333</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>5399.040</td>\n",
|
|||
|
" <td>0.429002</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>1554 rows × 18 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" age hypertension heart_disease avg_glucose_level bmi \\\n",
|
|||
|
"2508 0.316406 0 0 0.176562 0.341667 \n",
|
|||
|
"2435 0.768066 0 0 0.351636 0.591667 \n",
|
|||
|
"2547 0.060059 0 0 0.250618 0.216667 \n",
|
|||
|
"3885 0.914551 0 0 0.342882 0.691667 \n",
|
|||
|
"335 0.426270 0 0 0.500974 0.544444 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"4661 0.853516 1 0 1.000000 0.977778 \n",
|
|||
|
"4662 0.926758 0 0 0.024510 0.494444 \n",
|
|||
|
"4663 0.682617 0 0 1.000000 0.836111 \n",
|
|||
|
"4664 0.768066 0 0 0.313207 0.494444 \n",
|
|||
|
"4665 0.902344 0 0 0.156166 0.583333 \n",
|
|||
|
"\n",
|
|||
|
" gender_Male gender_Other ever_married_Yes work_type_Never_worked \\\n",
|
|||
|
"2508 False False True False \n",
|
|||
|
"2435 True False True False \n",
|
|||
|
"2547 True False False False \n",
|
|||
|
"3885 True False True False \n",
|
|||
|
"335 False False True False \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"4661 True False True False \n",
|
|||
|
"4662 False False True False \n",
|
|||
|
"4663 False False True False \n",
|
|||
|
"4664 False False True False \n",
|
|||
|
"4665 True False True False \n",
|
|||
|
"\n",
|
|||
|
" work_type_Private work_type_Self-employed work_type_children \\\n",
|
|||
|
"2508 True False False \n",
|
|||
|
"2435 True False False \n",
|
|||
|
"2547 False False True \n",
|
|||
|
"3885 False False False \n",
|
|||
|
"335 True False False \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"4661 True False False \n",
|
|||
|
"4662 True False False \n",
|
|||
|
"4663 True False False \n",
|
|||
|
"4664 True False False \n",
|
|||
|
"4665 True False False \n",
|
|||
|
"\n",
|
|||
|
" Residence_type_Urban smoking_status_formerly smoked \\\n",
|
|||
|
"2508 False False \n",
|
|||
|
"2435 True False \n",
|
|||
|
"2547 True False \n",
|
|||
|
"3885 True False \n",
|
|||
|
"335 True False \n",
|
|||
|
"... ... ... \n",
|
|||
|
"4661 False True \n",
|
|||
|
"4662 True True \n",
|
|||
|
"4663 True True \n",
|
|||
|
"4664 True True \n",
|
|||
|
"4665 True False \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_never smoked smoking_status_smokes age_glucose_index \\\n",
|
|||
|
"2508 False True 1957.540 \n",
|
|||
|
"2435 False True 6003.270 \n",
|
|||
|
"2547 False False 418.750 \n",
|
|||
|
"3885 False False 7071.750 \n",
|
|||
|
"335 False False 3932.250 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"4661 False False 11855.025 \n",
|
|||
|
"4662 False False 4401.920 \n",
|
|||
|
"4663 False False 9484.020 \n",
|
|||
|
"4664 False False 5726.700 \n",
|
|||
|
"4665 False True 5399.040 \n",
|
|||
|
"\n",
|
|||
|
" bmi_glucose_ratio \n",
|
|||
|
"2508 0.300173 \n",
|
|||
|
"2435 0.331619 \n",
|
|||
|
"2547 0.216119 \n",
|
|||
|
"3885 0.373316 \n",
|
|||
|
"335 0.266133 \n",
|
|||
|
"... ... \n",
|
|||
|
"4661 0.268662 \n",
|
|||
|
"4662 0.485152 \n",
|
|||
|
"4663 0.238549 \n",
|
|||
|
"4664 0.309131 \n",
|
|||
|
"4665 0.429002 \n",
|
|||
|
"\n",
|
|||
|
"[1554 rows x 18 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 342,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"def over_under_sampling(x_selection, y_selection):\n",
|
|||
|
"\n",
|
|||
|
" # сначала увеличение меньшинства\n",
|
|||
|
"\n",
|
|||
|
" oversampler = RandomOverSampler(sampling_strategy=0.2, random_state=42) \n",
|
|||
|
" x_over, y_over = oversampler.fit_resample(x_selection, y_selection) \n",
|
|||
|
"\n",
|
|||
|
" print(\"\\nПосле оверсемплинга\\n\")\n",
|
|||
|
" print(y_over.value_counts())\n",
|
|||
|
"\n",
|
|||
|
" # потом уменьшение большинства\n",
|
|||
|
"\n",
|
|||
|
" undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)\n",
|
|||
|
" x_balanced, y_balanced = undersampler.fit_resample(x_over, y_over)\n",
|
|||
|
"\n",
|
|||
|
" print(\"\\nПосле балансировки данных (андерсемплинга)\\n\")\n",
|
|||
|
" print(y_balanced.value_counts())\n",
|
|||
|
"\n",
|
|||
|
" plt.pie(\n",
|
|||
|
" y_balanced.value_counts(), \n",
|
|||
|
" labels=class_counts.index, # Метки классов (0 и 1)\n",
|
|||
|
" autopct='%1.1f%%', # Отображение процентов\n",
|
|||
|
" colors=['lightgreen', 'lightcoral'], # Цвета для классов\n",
|
|||
|
" startangle=45, # Поворот диаграммы\n",
|
|||
|
" explode=(0, 0.05) # Небольшое смещение для класса 1\n",
|
|||
|
" )\n",
|
|||
|
" plt.title(\"Распределение классов (stroke)\")\n",
|
|||
|
" plt.show()\n",
|
|||
|
" return x_balanced, y_balanced \n",
|
|||
|
"\n",
|
|||
|
"print(\"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\\n\")\n",
|
|||
|
"print(y_train.value_counts())\n",
|
|||
|
"X_train, y_train = over_under_sampling(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"X_train\n",
|
|||
|
"\n",
|
|||
|
"# print(\"Данные ДО аугментации в ТЕСТОВОЙ ВЫБОРКЕ (10-20% данных)\\n\")\n",
|
|||
|
"# print(y_test.value_counts())\n",
|
|||
|
"# over_under_sampling(X_test, y_test)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">Самое время оценить качество работы модели</p>"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 343,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Время обучения модели: 0.25 секунд\n",
|
|||
|
"ROC-AUC: 0.84\n",
|
|||
|
"F1-Score: 0.29\n",
|
|||
|
"Матрица ошибок:\n",
|
|||
|
"[[434 52]\n",
|
|||
|
" [ 12 13]]\n",
|
|||
|
"Отчет по классификации:\n",
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0 0.97 0.89 0.93 486\n",
|
|||
|
" 1 0.20 0.52 0.29 25\n",
|
|||
|
"\n",
|
|||
|
" accuracy 0.87 511\n",
|
|||
|
" macro avg 0.59 0.71 0.61 511\n",
|
|||
|
"weighted avg 0.94 0.87 0.90 511\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAJwCAYAAACZACVsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABdI0lEQVR4nO3df3zN9f//8fvZbDObbYZt5vevCFukYsX8SOZH3kQ/pLJJiUb5GSs/kyYpFP2QGiqJ3kr0A/mZ35JFSIiW2OZNNpP9Pt8/fJ1Pp23n7DDnnO3cru/L6/Leeb6er9frcc46PDyez9fzZTAajUYBAACgUG6ODgAAAMCZkSwBAABYQLIEAABgAckSAACABSRLAAAAFpAsAQAAWECyBAAAYAHJEgAAgAXlHB0AADhKdna2zp8/r/z8fIWGhjo6HABOisoSAJfyww8/qF+/fqpSpYq8vLxUrVo19enTx9FhAXBiJEvAvyxcuFAGg0EGg0Fbt24tsN9oNKpmzZoyGAy69957HRAhrtXKlSvVpk0bHTp0SNOmTdO6deu0bt06vfvuu44ODYATYxgOKEL58uW1ZMkStWnTxqx98+bNOnXqlLy8vBwUGa7F+fPn9cQTTygqKkrLly+Xp6eno0MCUEpQWQKK0K1bNy1fvly5ublm7UuWLFHLli0VEhLioMhwLRISEpSZmamFCxeSKAGwCckSUISHH35Y586d07p160xt2dnZ+uyzz9SvX79Cj5k5c6buvPNOVa5cWd7e3mrZsqU+++wzsz5Xh/iK2tq3by9J2rRpkwwGgz799FM9//zzCgkJkY+Pj/7zn//ojz/+MDtn+/btTcddtWfPHtM5/339oUOHFoj93nvvVZ06dcza9u/fr5iYGNWrV0/ly5dXSEiIHn/8cZ07d87SR2eSmpqqgQMHKjg4WOXLl9ctt9yiRYsWmfU5efKkDAaDZs6cadberFmzAu9p/PjxMhgMysjIMHs/kydPNuv36quvmn2WkrRz5041b95cL7/8smrWrCkvLy81bNhQ06dPV35+vtnxubm5mjp1qurXry8vLy/VqVNHzz//vLKyssz61alTRzExMWZtgwYNUvny5bVp0ybrHxCAUoFhOKAIderUUUREhD755BN17dpVkvTNN98oLS1Nffv21RtvvFHgmDlz5ug///mPHnnkEWVnZ2vp0qV64IEHtHr1anXv3l2S9OGHH5r6f//995o/f75mzZqlKlWqSJKCg4PNzjlt2jQZDAaNHTtWqampmj17tjp16qTExER5e3sXGf/YsWOv+zNYt26dfvvtNw0YMEAhISE6ePCg5s+fr4MHD2rnzp0FErF/unz5stq3b69jx45p6NChqlu3rpYvX66YmBhduHBBzz777HXHV5gLFy4oPj6+QPu5c+e0detWbd26VY8//rhatmyp9evXKy4uTidPntQ777xj6vvEE09o0aJFuv/++zVq1Cjt2rVL8fHxOnz4sD7//PMirz1p0iS9//77+vTTTwskegBKMSMAMwkJCUZJxj179hjnzp1rrFixovHvv/82Go1G4wMPPGDs0KGD0Wg0GmvXrm3s3r272bFX+12VnZ1tbNasmbFjx44Wr3XixIkC+zZu3GiUZKxevboxPT3d1L5s2TKjJOOcOXNMbe3atTO2a9fO9Prrr782SjJ26dLF+O+vuSRjbGxsget1797dWLt2bYvvx2g0Gj/55BOjJOOWLVsKfU9XzZ492yjJ+NFHH5nasrOzjREREUZfX1/Tezpx4oRRkvHVV181O75p06Zm78loNBpfeOEFoyTjxYsXzd7PpEmTTK+fe+45Y1BQkLFly5Zmx7dr184oyTh58mSzc8bExBglGQ8cOGA0Go3GxMREoyTjE088YdZv9OjRRknGDRs2mNpq165tjI6ONhqNRuO7775rlGR88803LX4uAEofhuEACx588EFdvnxZq1ev1sWLF7V69eoih+AkmVV6/vrrL6Wlpalt27b68ccfrzmG/v37q2LFiqbX999/v6pVq6avv/660P5Go1FxcXHq06ePWrVqdc3XlczfT2Zmpv73v/+pdevWkmT1PX399dcKCQnRww8/bGrz8PDQM888o4yMDG3evPm6YivMn3/+qTfffFMTJkyQr69vgf3u7u4aMWKEWduoUaMkSV999ZUpbkkaOXKkxX7/tHLlSj399NMaM2ZMoUOcAEo3kiXAgqpVq6pTp05asmSJVqxYoby8PN1///1F9l+9erVat26t8uXLKzAwUFWrVtXbb7+ttLS0a46hYcOGZq8NBoMaNGigkydPFtr/448/1sGDB/Xyyy9f8zWvOn/+vJ599lkFBwfL29tbVatWVd26dSXJ6nv6/fff1bBhQ7m5mf8xc/PNN5v2l7RJkyYpNDRUTz31VIF9BoNBoaGh8vPzM2tv1KiR3NzcTJ/n77//Ljc3NzVo0MCsX0hIiAICAgrEnZiYqIcfflh5eXk6f/58yb4hAE6BOUuAFf369dOTTz6p5ORkde3aVQEBAYX2+/777/Wf//xHkZGReuutt1StWjV5eHgoISFBS5YssUus2dnZmjBhggYOHKibbrrpus/34IMPavv27RozZoyaN28uX19f5efnq0uXLgUmRTva4cOHtXDhQn300Ufy8PAosN/S/K7CWJqP9U8//fSTunbtqrvvvltjxozRo48+ynwloIwhWQKsuO+++/TUU09p586d+vTTT4vs99///lfly5fXmjVrzNZgSkhIuK7rHz161Oy10WjUsWPHFB4eXqDvW2+9pdTU1AJ3h12Lv/76S+vXr9eUKVM0ceLEIuMpSu3atbV//37l5+ebVZd++eUX0/6SFBcXp+bNm+uhhx4qdH/dunW1du1aXbx40WxY89dff1V+fr7pTsDatWsrPz9fR48eNVXBJCklJUUXLlwoEHdYWJiWL18ub29vLV++XIMGDdL+/ftVvnz5En1/AByHYTjACl9fX7399tuaPHmyevToUWQ/d3d3GQwG5eXlmdpOnjypL7744rquv3jxYl28eNH0+rPPPtOZM2dMd+hddfHiRU2bNk0jRowokTWg3N3dJV1Jzv5p9uzZxTq+W7duSk5ONkswc3Nz9eabb8rX11ft2rW77hiv2rFjh1auXKnp06cXWRHq1q2b8vLyNHfuXLP2119/XZJMdyt269ZNUsH3+e9+V916663y8fGRm5ubFixYoJMnT+rFF1+87vcEwHlQWQKKITo62mqf7t276/XXX1eXLl3Ur18/paamat68eWrQoIH2799/zdcODAxUmzZtNGDAAKWkpGj27Nlq0KCBnnzySbN+P/74o6pUqaLnnnvO6jmTkpL07bffmrWdPXtWly9f1rfffqt27drJz89PkZGRmjFjhnJyclS9enWtXbtWJ06cKFbcgwYN0rvvvquYmBjt3btXderU0WeffaZt27Zp9uzZZtUdSTpy5IhZTBkZGXJzczNr++233wq91tq1a3XPPfeoU6dORcbTrVs3derUSS+88IJOnDih5s2ba8OGDfrvf/+rwYMHq1mzZpKkW265RdHR0Zo/f74uXLigdu3aaffu3Vq0aJF69eqlDh06FHmNZs2aaezYsZo+fbr69u1baPUPQCnk4LvxAKfzz6UDLCls6YD333/f2LBhQ6OXl5excePGxoSEBOOkSZMK3L7/72tZWjrgk08+McbFxRmDgoKM3t7exu7duxt///13s75Xb4ufNWuWWXth15Zkdbsaz6lTp4z33XefMSAgwOjv72984IEHjKdPny5wu35RUlJSjAMGDDBWqVLF6OnpaQwLCzMmJCSY9bm6dIAt27+XDjAYDMa9e/cW+Ez+vfRARkaGccSIEcbQ0FCjh4eHsUGDBsbp06cb8/LyzPrl5OQYp0yZYqxbt67Rw8PDWLNmTWNcXJwxMzPTrN8/lw64KjMz09i4cWPj7bffbszNzbX6GQFwfgaj8V81dgBOYdOmTerQoYOWL19u8Q68knTy5EnVrVtXJ06cKLCaNwC4KuYsAQAAWECyBMDE29tbUVFRNt9mDwBlGRO8AZgEBwcXmPgNAK6
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 700x700 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIjCAYAAAA0vUuxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACkwklEQVR4nOzddVhU6dsH8O/QDaIIIiiIoBiIioVirK65tmsLmLsqoNiF3aKCYgeu3brWmmutjYoFYqGIXYQgOef9wx/ndRxqEBzi+7muuXTuU/c5DMPc8zzneSSCIAggIiIiIiKiDKkoOwEiIiIiIqL8joUTERERERFRFlg4ERERERERZYGFExERERERURZYOBEREREREWWBhRMREREREVEWWDgRERERERFlgYUTERERERFRFlg4ERERERERZYGFExHlis+fP8PPz098HhUVhWXLlikvISIiIqJcxMKJCiR3d3fo6ekpOw36hra2NiZNmoQtW7bg+fPnmDp1Kg4ePKjstIiIiIhyhZqyEyDKrg8fPmDLli04f/48zp07hy9fvqBly5aoXr06unbtiurVqys7xSJNVVUV06ZNg6urK6RSKQwMDHD48GFlp0VERESUKySCIAjKToIoK9u3b8fAgQPx+fNnWFlZITk5Ga9fv0b16tVx69YtJCcnw83NDatXr4aGhoay0y3SIiMj8fz5c9jb28PIyEjZ6RARERHlCnbVo3zvwoUL6N27N8zMzHDhwgWEh4ejWbNm0NLSwrVr1/Dy5Uv06NEDf/31F7y9vWW29fX1hbOzM4oXLw5tbW3UrFkTu3fvljuGRCLB1KlTxecpKSlo3bo1jI2NERISIq6T2aNx48YAgDNnzkAikeDMmTMyx2jTpo3ccRo3bixul+bp06eQSCTYsGGDTPz+/fvo0qULjI2NoaWlBScnJxw4cEDuXKKiouDt7Q0rKytoamrCwsICrq6ueP/+fYb5vXz5ElZWVnBycsLnz58VPo+pU6dCIpEAACwsLFCvXj2oqanBzMws3X2k58WLF+jfvz/Mzc2hqakJa2trDB48GElJSdiwYUOW1z/tet2+fRvu7u4oV64ctLS0YGZmhn79+uHDhw9y+Wb2OHPmDKZMmQJ1dXW8e/dOLt9BgwbByMgICQkJYuyff/5Bo0aNoK+vDwMDA9SqVQtbt27N9Ly/vXZpPn/+nO61a9y4MapUqSK3D19fX0gkEjx9+lQmnlk+ip5b2uvh+4eVlZXcOun9jn1/vlldewA4f/48fv/9d5QpUwaampqwtLSEt7c3vnz5kuH+02T1mvn29QsAN2/eRKtWrWBgYAA9PT00bdoUly9fzvI4ACCVSuHv74+qVatCS0sLJiYmaNmyJYKCgsR1JBIJPDw8sGXLFlSoUAFaWlqoWbMmzp07J7OvZ8+eYciQIahQoQK0tbVRvHhx/P7773I/2+/PT0dHB1WrVsXatWtl1suoW/Pu3bvT/d28cuUKWrZsCUNDQ+jo6KBRo0a4cOGCzDppP8O095Q0QUFBcu9d7u7uMq8RAHj+/Dm0tbXlXrPfvx8mJyfDx8cH1tbW0NDQQJkyZTBmzJhs/fyBr++ZXbt2hYmJCbS1tVGhQgVMnDgx020yep2nPdzd3cV1034G586dwx9//IHixYvDwMAArq6u+PTpk9y+ly9fjsqVK0NTUxPm5uYYOnQooqKiZNZp3Lhxusdt1qyZuE7aa+l7v/32m9y1jouLw8iRI2FpaQlNTU1UqFABvr6++PY78w8fPqBVq1awsLCApqYmSpUqhV69euHZs2fiOhn9XRo6dGiOr4ubmxtKlCiB5ORkuXNp3rw5KlSoIBPbvHkzatasCW1tbRgbG6N79+54/vx5utevQ4cOcvv8448/IJFIZN5D087L19dXbv006b1Hp/1cvr2vN03FihUz/BlRwceuepTvzZ07F1KpFNu3b0fNmjXllpcoUQIbN25ESEgIVq1ahSlTpqBkyZIAAH9/f7Rr1w69evVCUlIStm/fjt9//x2HDh1CmzZtMjzmgAEDcObMGZw4cQKVKlUCAGzatElcfv78eaxevRqLFy9GiRIlAACmpqYZ7u/cuXM4cuRIjs4fAO7du4f69eujdOnSGDduHHR1dbFz50506NABe/bsQceOHQF8/cDt4uKC0NBQ9OvXDzVq1MD79+9x4MABREZGirl+Kzo6Gq1atYK6ujqOHDmS6b1jipzHwoUL8ebNm2yt+/LlS9SuXRtRUVEYNGgQKlasiBcvXmD37t2Ij49Hw4YNZa7/rFmzAEDmQ5CzszMA4MSJE3jy5An69u0LMzMz3Lt3D6tXr8a9e/dw+fJlSCQSdOrUCeXLlxe39fb2hr29PQYNGiTG7O3tYWFhgenTp2PHjh0yfwSTkpKwe/dudO7cGVpaWgC+fljo168fKleujPHjx8PIyAg3b97E0aNH0bNnz2xdh5xcu4xklU+fPn2yfW7fmjBhAuzt7QEAq1evRkREhEJ5ZffaA8CuXbsQHx+PwYMHo3jx4rh69SqWLl2KyMhI7Nq1K1vHmz59OqytrcXnnz9/xuDBg2XWuXfvHlxcXGBgYIAxY8ZAXV0dq1atQuPGjXH27FnUqVMn02P0798fGzZsQKtWrTBgwACkpKTg/PnzuHz5MpycnMT1zp49ix07dsDLywuamppYvnw5WrZsiatXr4of5q5du4aLFy+ie/fusLCwwNOnT7FixQo0btwYISEh0NHRkTl22ntQTEwM1q9fj4EDB8LKykrmg3Z2/fvvv2jVqhVq1qyJKVOmQEVFBYGBgfjll19w/vx51K5dW+F9pmfy5MkyRXlGhg4dijVr1qBdu3YYNWoUbt68iQULFuDu3bs4fPiw3IfZb92+fRsuLi5QV1fHoEGDYGVlhcePH+PgwYPi+0dmvLy8UKtWLZnYgAED0l3Xw8MDRkZGmDp1KsLCwrBixQo8e/ZMLMKArx++p02bhmbNmmHw4MHieteuXcOFCxegrq4u7s/CwgJz5syROUapUqWyzPl7giCgXbt2OH36NPr37w9HR0ccO3YMo0ePxosXL7B48WIAX3/n9fX1MWzYMBQvXhyPHz/G0qVLcfv2bdy5cyfD/T969Ahr1qzJcHlW16VPnz7YuHEjjh07ht9++03c7vXr1/j3338xZcoUMTZr1iz4+Piga9euGDBgAN69e4elS5eiYcOGuHnzpkzvBi0tLRw+fBhv374VPwt8+fIFO3bsSPc9Lae0tLQQGBiI4cOHi7GLFy/KFJxUCAlE+ZyxsbFQtmxZmZibm5ugq6srE/Px8REACAcPHhRj8fHxMuskJSUJVapUEX755ReZOABhypQpgiAIwvjx4wVVVVVh//79GeYUGBgoABDCw8Pllp0+fVoAIJw+fVqM1alTR2jVqpXMcQRBEJo0aSI0bNhQZvvw8HABgBAYGCjGmjZtKlStWlVISEgQY1KpVHB2dhZsbW3F2OTJkwUAwt69e+XykkqlcvklJCQIjRs3FkqWLCk8evQox+cxZcoU4du3k7dv3wr6+vriut/uIz2urq6CioqKcO3atQzz/lajRo2ERo0apbuv73/mgiAI27ZtEwAI586dS3ebsmXLCm5ubukuq1evnlCnTh2Z2N69e2XOKyoqStDX1xfq1KkjfPnyJcv8v6XItWvUqJFQuXJluX0sWLBA5vWY3Xyyc25pTpw4IQAQzp49K8bc3NxkfjfTXjO7du3K9Jy/ldm1T+9nOWfOHEEikQjPnj3LdL9pv6Pfv6bevXsn9/rt0KGDoKGhITx+/FiMvXz5UtDX15f7/fzev//+KwAQvLy85JZ9e60BCACEoKAgMfbs2TNBS0tL6NixoxhL75wvXbokABA2btwod37fvgc9ePBAACDMnz9fjKX3XikIgrBr1y6Zn7NUKhVsbW2FFi1ayOQdHx8vWFtbC7/++qsYS3vNvnv3Tmaf165dk3vv+v41cvfuXUFFRUV8fX+b/7e/17dv3xYkEonQvXt3mWNMnTpV7n0+PQ0bNhT09fXlXidZ/T5m9hrW1dWVea2m/Qxq1qwpJCUlifH58+cLAIS
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import time\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report\n",
|
|||
|
"\n",
|
|||
|
"# Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"\n",
|
|||
|
"# X = data.drop(columns=['id', 'stroke']) # Признаки\n",
|
|||
|
"# y = data['stroke'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# # Преобразование категориальных признаков с помощью One-Hot Encoding\n",
|
|||
|
"# categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
|
|||
|
"# X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)\n",
|
|||
|
"\n",
|
|||
|
"# # Заполнение пропущенных значений (например, медианой для числовых данных)\n",
|
|||
|
"# X.fillna(X.median(), inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"# # Разделение данных на обучающую и тестовую выборки\n",
|
|||
|
"# # Обучающая выборка\n",
|
|||
|
"# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
|
|||
|
"\n",
|
|||
|
"# # Тестовая и контрольная выборки\n",
|
|||
|
"# X_test, X_control, y_test, y_control = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Обучение модели\n",
|
|||
|
"model = RandomForestClassifier(random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Начинаем отсчет времени\n",
|
|||
|
"start_time = time.time()\n",
|
|||
|
"model.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# Время обучения модели\n",
|
|||
|
"train_time = time.time() - start_time\n",
|
|||
|
"\n",
|
|||
|
"# Предсказания и оценка модели\n",
|
|||
|
"y_pred = model.predict(X_test)\n",
|
|||
|
"y_pred_proba = model.predict_proba(X_test)[:, 1] # Вероятности для ROC-AUC\n",
|
|||
|
"\n",
|
|||
|
"# Метрики\n",
|
|||
|
"roc_auc = roc_auc_score(y_test, y_pred_proba)\n",
|
|||
|
"f1 = f1_score(y_test, y_pred)\n",
|
|||
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
|||
|
"class_report = classification_report(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"# Вывод результатов\n",
|
|||
|
"print(f'Время обучения модели: {train_time:.2f} секунд')\n",
|
|||
|
"print(f'ROC-AUC: {roc_auc:.2f}')\n",
|
|||
|
"print(f'F1-Score: {f1:.2f}')\n",
|
|||
|
"print('Матрица ошибок:')\n",
|
|||
|
"print(conf_matrix)\n",
|
|||
|
"print('Отчет по классификации:')\n",
|
|||
|
"print(class_report)\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация матрицы ошибок\n",
|
|||
|
"plt.figure(figsize=(7, 7))\n",
|
|||
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Нет инсульта', 'Инсульт'], yticklabels=['Нет инсульта', 'Инсульт'])\n",
|
|||
|
"plt.title('Матрица ошибок')\n",
|
|||
|
"plt.xlabel('Предсказанный класс')\n",
|
|||
|
"plt.ylabel('Истинный класс')\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(y_test, y_pred, alpha=0.5, color='blue', label='Прогнозы модели')\n",
|
|||
|
"plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Идеальное совпадение')\n",
|
|||
|
"plt.xlabel('Фактический статус инсульта')\n",
|
|||
|
"plt.ylabel('Прогнозируемый статус инсульта')\n",
|
|||
|
"plt.title('Фактический статус инсульта по сравнению с прогнозируемым')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.show()\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"<p style=\"margin: 30px;\">А ВОТ ТЕПЕЕЕЕЕЕЕЕЕЕЕРЬ я поправила недоразумения и вроде как модель проперло на выявление инсульта. Но, так как в данных ЛЮТЫЙ дисбаланс, то модель слаба на выявление инсульта все еще.</p>"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|