505 lines
145 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество колонок: 12\n",
"Колонки: Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
" 'smoking_status', 'stroke'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Загрузка данных\n",
"data = pd.read_csv('./csv/option4.csv')\n",
"\n",
"# Обзор данных\n",
"print(\"Количество колонок:\", data.columns.size)\n",
"print(\"Колонки:\", data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 201\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n",
"\n",
"\n",
"\n",
"<bound method NDFrame.describe of id gender age hypertension heart_disease ever_married \\\n",
"0 9046 Male 67.0 0 1 Yes \n",
"1 51676 Female 61.0 0 0 Yes \n",
"2 31112 Male 80.0 0 1 Yes \n",
"3 60182 Female 49.0 0 0 Yes \n",
"4 1665 Female 79.0 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"5105 18234 Female 80.0 1 0 Yes \n",
"5106 44873 Female 81.0 0 0 Yes \n",
"5107 19723 Female 35.0 0 0 Yes \n",
"5108 37544 Male 51.0 0 0 Yes \n",
"5109 44679 Female 44.0 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 Private Urban 228.69 36.6 formerly smoked \n",
"1 Self-employed Rural 202.21 NaN never smoked \n",
"2 Private Rural 105.92 32.5 never smoked \n",
"3 Private Urban 171.23 34.4 smokes \n",
"4 Self-employed Rural 174.12 24.0 never smoked \n",
"... ... ... ... ... ... \n",
"5105 Private Urban 83.75 NaN never smoked \n",
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
"5108 Private Rural 166.29 25.6 formerly smoked \n",
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
"\n",
" stroke \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"5105 0 \n",
"5106 0 \n",
"5107 0 \n",
"5108 0 \n",
"5109 0 \n",
"\n",
"[5110 rows x 12 columns]>\n"
]
}
],
"source": [
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())\n",
"\n",
"print(\"\\n\\n\")\n",
"\n",
"print(data.describe)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Возьмем и заменим нулевые значения в столбце bmi на средние значения по столбцу </p>"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 0\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n"
]
}
],
"source": [
"data['bmi'] = data['bmi'].fillna(data['bmi'].median())\n",
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Взглянем на выбросы: </p>"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABscUlEQVR4nO3de1yUdf7//+cwHD0wiAcGChDPirIeKiOM3GTDU6vlZhaltX7081G0zG0rNw9hGFvblj9bsGz7mBau1ZZmZVpiylZoahoeWtMicdPBCgEPiQjX74++XB9HmRQdGAYe99ttbs28r9dc87rwlm988uZ9WQzDMAQAAAAAAAAAAM7j4+kGAAAAAAAAAABoqAjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAAAAAAAAAHCBEB0AAAAAAAAAABcI0QEAAAAAAAAAcIEQHQAAAAAAAAAAFwjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAACAemCxWPTYY495uo16tWHDBlksFm3YsMHTrTSoXgAAjddjjz0mi8WiH374oU4/55577lH79u3r9DMA/B9CdAAAAAAAAAAAXPD1dAMAAAAAAAAALt6LL76oqqoqT7cBNBmE6AAAAAAAAIAX8fPz83QLQJPCdi5AI3TgwAFNnjxZXbt2VVBQkFq3bq3bbrtN33777Xm1+fn5uuGGGxQUFKQrr7xS6enpWrx4sSwWy3n177//vq6//no1b95cLVu21LBhw7R79+76uSgAQJN1MfPa1q1bZbFYtGTJkvPev3btWlksFr377rvm2IYNG3TVVVcpMDBQHTt21AsvvGDuYVpbb7zxhnr06KHAwED17NlTK1asuKh9Sl3VuOrj1Vdf1TXXXKNmzZqpVatWSkxM1AcffOBUk5WVpdjYWAUEBCgiIkKpqakqKSlxqtm3b59GjRolu92uwMBAXXnllRozZoxKS0vP+7x+/fopKChIoaGhGjNmjA4ePHhRX5ML2bx5swYPHiybzaZmzZrphhtu0CeffGIe/+c//ymLxaKNGzee994XXnhBFotFu3btMsf+/e9/63e/+51CQ0MVGBioq666SqtWrXJLrwAAXIoffvhBo0ePVnBwsFq3bq37779fp06dMo9bLBZNmTLF/D4iKChI8fHx2rlzp6Sf57tOnTopMDBQAwcOPO/f5+yJDtQvVqIDjdCWLVv06aefasyYMbryyiv17bffauHChRo4cKD27NmjZs2aSZK+++47/frXv5bFYtGMGTPUvHlz/f3vf1dAQMB553zllVc0btw4JScn68knn9TJkye1cOFCDRgwQNu3b2fyBgDUmYuZ16666ip16NBBr7/+usaNG+f0/tdee02tWrVScnKyJGn79u0aPHiwwsPDlZaWpsrKSs2dO1dt27atdW/vvfeebr/9dvXq1UsZGRk6evSoxo8fryuuuMIt114tLS1Njz32mK677jrNnTtX/v7+2rx5s9avX6+bbrpJ0s/he1pampKSkjRp0iTt3btXCxcu1JYtW/TJJ5/Iz89Pp0+fVnJyssrLyzV16lTZ7XZ99913evfdd1VSUiKbzSZJmjdvnmbNmqXRo0frv/7rv/T999/rueeeU2JiorZv366QkJBLvpb169dryJAh6tevn+bMmSMfHx8tXrxYN954o/71r3/pmmuu0bBhw9SiRQu9/vrruuGGG5ze/9prryk2NlY9e/aUJO3evVsJCQm64oor9Mgjj6h58+Z6/fXXNXLkSL355pu65ZZbLrlXAAAu1ejRo9W+fXtlZGRo06ZNWrBggY4ePaqlS5eaNf/617+0atUqpaamSpIyMjI0fPhwPfTQQ8rKytLkyZN19OhRPfXUU/r973+v9evXe+pyABgAGp2TJ0+eN5aXl2dIMpYuXWqOTZ061bBYLMb27dvNsR9//NEIDQ01JBkFBQWGYRjGsWPHjJCQEGPChAlO53Q4HIbNZjtvHAAAd7rYeW3GjBmGn5+fUVxcbI6Vl5cbISEhxu9//3tz7OabbzaaNWtmfPfdd+bYvn37DF9fX6O23x736tXLuPLKK41jx46ZYxs2bDAkGdHR0U61kow5c+aYr8eNG3dejWEYxpw5c5z62Ldvn+Hj42PccsstRmVlpVNtVVWVYRiGceTIEcPf39+46aabnGr+9re/GZKM//3f/zUMwzC2b99uSDLeeOMNl9f07bffGlar1Zg3b57T+M6dOw1fX9/zxn/JRx99ZEgyPvroI7Pfzp07G8nJyWbvhvHzn3FMTIzxm9/8xhy74447jHbt2hlnzpwxxw4fPmz4+PgYc+fONccGDRpk9OrVyzh16pTT1+W6664zOnfu7LIXAADqQvU8/tvf/tZpfPLkyYYk44svvjAM4+fvCwICAsx/dxuGYbzwwguGJMNutxtlZWXm+IwZM5z+jW4Yrr+PAFA32M4FaISCgoLM5xUVFfrxxx/VqVMnhYSE6PPPPzePrVmzRvHx8erdu7c5FhoaqpSUFKfzffjhhyopKdEdd9yhH374wXxYrVb1799fH330UZ1fEwCg6brYee32229XRUWF3nrrLXPsgw8+UElJiW6//XZJUmVlpdatW6eRI0cqIiLCrOvUqZOGDBlSq74OHTqknTt3auzYsWrRooU5fsMNN6hXr161vk5XVq5cqaqqKs2ePVs+Ps7fvldv+7Ju3TqdPn1a06ZNc6qZMGGCgoOD9d5770mSudJ87dq1OnnyZI2f99Zbb6mqqkqjR492mvftdrs6d+58WfP+jh07tG/fPt1555368ccfzXOfOHFCgwYNUm5urnmTtNtvv11HjhzRhg0bzPf/85//VFVVlfnnWVxcrPXr12v06NE6duyYeb4ff/xRycnJ2rdvn7777rtL7hcAgEtVvbq82tSpUyVJq1evNscGDRrk9Fvd/fv3lySNGjVKLVu2PG/8m2++qat2AVwA27kAjdBPP/2kjIwMLV68WN99950MwzCPnb3f6YEDBxQfH3/e+zt16uT0et++fZKkG2+8scbPCw4OdkfbAADU6GLntV/96lfq1q2bXnvtNY0fP17Sz1t/tGnTxpzDjhw5op9++um8uU46f/67kAMHDrh8X6dOnZwC/svx9ddfy8fHRz169LhgL127dnUa9/f3V4cOHczjMTExmj59up555hllZ2fr+uuv129/+1vdddddZsC+b98+GYahzp071/hZl3Mjs+rvKc7dcudspaWlatWqlbln+muvvaZBgwZJ+vnPs3fv3urSpYskaf/+/TIMQ7NmzdKsWbNqPN+RI0fcvr0OAAAXcu482rFjR/n4+DjtbR4VFeVUUz0XR0ZG1jh+9OjROugUwMUgRAcaoalTp2rx4sWaNm2a4uPjZbPZZLFYNGbMGHN1V21Uv+eVV16R3W4/77ivL3+VAADqTm3mtdtvv13z5s3TDz/8oJYtW2rVqlW64447GuRc5eomppWVlXX6uX/96191zz336O2339YHH3yg++67z9yv9corr1RVVZUsFovef/99Wa3W895/9qr72qr+8/rLX/7i9JtwNZ0/ICBAI0eO1IoVK5SVlaWioiJ98skneuKJJ84734MPPmjueX+u2v5wBACAulDTvF/TPPtL42cvJABQvxrevyYAXLZ//vOfGjdunP7617+aY6dOnVJJSYlTXXR0tPbv33/e+88d69ixoySpXbt2SkpKcn/DAAD8goud16SfQ/S0tDS9+eabCgsLU1lZmcaMGWMeb9eunQIDAy9q/ruQ6Ohol++7mHO1atWqxmuoXjVerWPHjqqqqtKePXtcBs/Vvezdu1cdOnQwx0+fPq2CgoLz5u9evXqpV69emjlzpj799FMlJCTo+eefV3p6ujp27CjDMBQTE2Ou+HaX6u8pgoODL+p7ittvv11LlixRTk6OvvzySxmGYW7lIsm8Vj8/P75HAQA0KPv27VNMTIz5ev/+/aqqqnLavgWA92BPdKARslqt5/2E+rnnnjtvZVtycrLy8vK0Y8cOc6y4uFjZ2dnn1QUHB+uJJ55QRUXFeZ/3/fffu695AADOcbHzmiR1795dvXr
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def plot_numeric_boxplots(dataframe, columns):\n",
" # Фильтрация числовых столбцов\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" \n",
" # Построение графиков\n",
" if numeric_columns:\n",
" plt.figure(figsize=(15, 5))\n",
" \n",
" for i, col in enumerate(numeric_columns):\n",
" if col != 'id':\n",
" plt.subplot(1, len(numeric_columns), i + 1)\n",
" sns.boxplot(y=dataframe[col])\n",
" plt.title(f'{col}')\n",
" plt.ylabel('')\n",
" plt.xlabel(col)\n",
" \n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"Нет подходящих числовых столбцов для построения графиков.\")\n",
"\n",
"plot_numeric_boxplots(data, data.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Видим выбросы в столбцах со средним уровнем глюкозы и в столбце bmi (индекс массы тела). устраним выбросы - поставим верхние и нижние границы</p>"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcoElEQVR4nO3dfZiVVb0//vcgMIPADIIxAwmI+AAGPhuSZqQk4kOgnJSiNPPIqfCRThpH0SSJ9Fh5NMQ0j2lBmqUezcQUBcpQEUVNjdBQSAVKZUZQRmT2749+7m8j7BIFBobX67ruS/Za6177cw9erJn33HvdZYVCoRAAAAAAAGAtLZq6AAAAAAAA2FwJ0QEAAAAAoAQhOgAAAAAAlCBEBwAAAACAEoToAAAAAABQghAdAAAAAABKEKIDAAAAAEAJQnQAAAAAAChBiA4AAAAAACUI0QEAYBMoKyvLN7/5zaYuY5OaMWNGysrKMmPGjKYuZbOqBYDm65vf/GbKysryt7/9baO+zxe/+MXsuOOOG/U9gP9HiA4AAAAAACW0bOoCAAAAAID37pprrklDQ0NTlwFbDSE6AAAAAGxBWrVq1dQlwFbFdi7QDL3wwgv56le/mt122y1t2rRJp06d8pnPfCbPP//8WmOfeOKJfOITn0ibNm2yww475KKLLsp1112XsrKytcbfdddd+fjHP562bdumffv2OfLII/PUU09tmosCYKv1Xta1Rx55JGVlZbn++uvXOv/uu+9OWVlZfvWrXxXbZsyYkf322y8VFRXp1atXfvjDHxb3MF1fN998c3bfffdUVFSkb9++ufXWW9/TPqWlxpSq46c//Wk++tGPZtttt812222Xgw8+OL/5zW8ajbnyyivzkY98JOXl5enatWtGjx6d5cuXNxqzYMGCDB8+PDU1NamoqMgOO+yQESNGpLa2dq3323fffdOmTZt07NgxI0aMyOLFi9/T1+Rfeeihh3L44Yenqqoq2267bT7xiU/kgQceKPb/4he/SFlZWWbOnLnWuT/84Q9TVlaWP/zhD8W2P/7xj/m3f/u3dOzYMRUVFdlvv/1y++23b5BaAeD9+Nvf/pbjjjsulZWV6dSpU84444ysWrWq2F9WVpZTTz21+H1EmzZtMmDAgDz55JNJ/r7e7bzzzqmoqMjAgQPX+vncnuiwabkTHZqhOXPm5Pe//31GjBiRHXbYIc8//3wmT56cgQMH5umnn862226bJHnxxRfzyU9+MmVlZRk7dmzatm2bH/3oRykvL19rzp/85Cc58cQTM3jw4Fx88cV54403Mnny5Bx00EF57LHHLN4AbDTvZV3bb7/9stNOO+XnP/95TjzxxEbn33TTTdluu+0yePDgJMljjz2Www8/PF26dMmFF16YNWvWZPz48fnQhz603rXdeeedOf7449OvX79MnDgxr732Wk4++eR8+MMf3iDX/o4LL7ww3/zmN/Oxj30s48ePT+vWrfPQQw/lvvvuy2GHHZbk7+H7hRdemEGDBuUrX/lK5s+fn8mTJ2fOnDl54IEH0qpVq7z11lsZPHhw6uvrc9ppp6WmpiYvvvhifvWrX2X58uWpqqpKkkyYMCHjxo3Lcccdl3//93/PX//611xxxRU5+OCD89hjj6VDhw7v+1ruu+++DBkyJPvuu28uuOCCtGjRItddd10OOeSQ/Pa3v81HP/rRHHnkkWnXrl1+/vOf5xOf+ESj82+66aZ85CMfSd++fZMkTz31VA488MB8+MMfzje+8Y20bds2P//5zzNs2LD88pe/zDHHHPO+awWA9+u4447LjjvumIkTJ+bBBx/M5Zdfntdeey033HBDccxvf/vb3H777Rk9enSSZOLEiTnqqKNy9tln58orr8xXv/rVvPbaa7nkkkvypS99Kffdd19TXQ5QAJqdN954Y6222bNnF5IUbrjhhmLbaaedVigrKys89thjxbZXXnml0LFjx0KSwsKFCwuFQqHw+uuvFzp06FA45ZRTGs25ZMmSQlVV1VrtALAhvdd1bezYsYVWrVoVXn311WJbfX19oUOHDoUvfelLxbajjz66sO222xZefPHFYtuCBQsKLVu2LKzvt8f9+vUr7LDDDoXXX3+92DZjxoxCkkKPHj0ajU1SuOCCC4qvTzzxxLXGFAqFwgUXXNCojgULFhRatGhROOaYYwpr1qxpNLahoaFQKBQKy5YtK7Ru3bpw2GGHNRrzgx/8oJCk8L//+7+FQqFQeOyxxwpJCjfffHPJa3r++ecL22yzTWHChAmN2p988slCy5Yt12r/Z+6///5CksL9999frHeXXXYpDB48uFh7ofD3v+OePXsWPvWpTxXbPvvZzxY6d+5cePvtt4ttL7/8cqFFixaF8ePHF9sOPfTQQr9+/QqrVq1q9HX52Mc+Vthll11K1gIAG8M76/inP/3pRu1f/epXC0kKjz/+eKFQ+Pv3BeXl5cWfuwuFQuGHP/xhIUmhpqamUFdXV2wfO3Zso5/RC4XS30cAG4ftXKAZatOmTfHPq1evziuvvJKdd945HTp0yKOPPlrsmzZtWgYMGJC99tqr2NaxY8eMHDmy0Xz33HNPli9fns9+9rP529/+Vjy22Wab9O/fP/fff/9GvyYAtl7vdV07/vjjs3r16txyyy3Ftt/85jdZvnx5jj/++CTJmjVrcu+992bYsGHp2rVrcdzOO++cIUOGrFddL730Up588smccMIJadeuXbH9E5/4RPr167fe11nKbbfdloaGhpx//vlp0aLxt+/vbPty77335q233sqZZ57ZaMwpp5ySysrK3HnnnUlSvNP87rvvzhtvvLHO97vlllvS0NCQ4447rtG6X1NTk1122eUDrfvz5s3LggUL8rnPfS6vvPJKce6VK1fm0EMPzaxZs4oPSTv++OOzbNmyzJgxo3j+L37xizQ0NBT/Pl999dXcd999Oe644/L6668X53vllVcyePDgLFiwIC+++OL7rhcA3q937i5/x2mnnZYk+fWvf11sO/TQQxt9qrt///5JkuHDh6d9+/Zrtf/5z3/eWOUC/4LtXKAZevPNNzNx4sRcd911efHFF1MoFIp9/7jf6QsvvJABAwasdf7OO+/c6PWCBQuSJIcccsg636+ysnJDlA0A6/Re17U999wzvXv3zk033ZSTTz45yd+3/th+++2La9iyZcvy5ptvrrXWJWuvf//KCy+8UPK8nXfeuVHA/0E899xzadGiRXbfffd/Wctuu+3WqL1169bZaaediv09e/bMmDFj8r3vfS9TpkzJxz/+8Xz605/O5z//+WLAvmDBghQKheyyyy7rfK8P8iCzd76nePeWO/+otrY22223XXHP9JtuuimHHnpokr//fe61117ZddddkyTPPvtsCoVCxo0bl3Hjxq1zvmXLlm3w7XUA4F959zraq1evtGjRotHe5t27d2805p21uFu3butsf+211zZCpcB7IUSHZui0007LddddlzPPPDMDBgxIVVVVysrKMmLEiOLdXevjnXN+8pOfpKamZq3+li39UwLAxrM+69rxxx+fCRMm5G9/+1vat2+f22+/PZ/97Gc3y7Wq1ENM16xZs1Hf97vf/W6++MUv5v/+7//ym9/8Jqeffnpxv9YddtghDQ0NKSsry1133ZVtttlmrfP/8a779fXO39d///d/N/ok3LrmLy8vz7Bhw3LrrbfmyiuvzNKlS/PAAw/k29/+9lrz/ed//mdxz/t3W99fjgDAxrCudX9d6+w/a//HGwmATWvz+2kC+MB+8Ytf5MQTT8x3v/vdYtuqVauyfPnyRuN69OiRZ599dq3z393Wq1evJEnnzp0zaNCgDV8wAPwT73VdS/4eol944YX55S9/merq6tTV1WXEiBHF/s6dO6eiouI9rX//So8ePUqe917m2m677dZ5De/cNf6OXr16paGhIU8//XTJ4PmdWubPn5+ddtqp2P7WW29l4cKFa63f/fr1S79+/XLeeefl97//fQ488MBcddVVueiii9KrV68UCoX07NmzeMf3hvLO9xSVlZXv6XuK448/Ptdff32mT5+eZ555JoV
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def remove_outliers(df):\n",
"\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" for column in numeric_columns:\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
" return df\n",
" \n",
"data = remove_outliers(data)\n",
"plot_numeric_boxplots(data, data.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Так, от выбросов избавились, теперь разобьем на выборки</p>"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Размеры выборок:\n",
"Обучающая выборка: (4088, 10)\n",
"Тестовая выборка: (511, 10)\n",
"Контрольная выборка: (511, 10)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Определение признаков и целевой переменной\n",
"X = data.drop(columns=['id', 'stroke']) \n",
"y = data['stroke'] \n",
"\n",
"# Обучающая выборка\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
"\n",
"# Тестовая и контрольная выборки\n",
"X_test, X_control, y_test, y_control = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n",
"\n",
"print(\"\\nРазмеры выборок:\")\n",
"print(f\"Обучающая выборка: {X_train.shape}\")\n",
"print(f\"Тестовая выборка: {X_test.shape}\")\n",
"print(f\"Контрольная выборка: {X_control.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stroke\n",
"0 4861\n",
"1 249\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9kUlEQVR4nO3deXyM9/7//2d2IZlEkIRSQmqJpa0oglpDqlFVnFYpqpyWhhYtTs6ptfXVcmorqqeLpaUt1XLKsUQUVSkaja042sahjSRCk0ErieT6/dFP5mckthGZxPW4325zu5n39b7e1+uaTDJP1/W+rnExDMMQAACAibk6uwAAAABnIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABABz2/PPPq3Pnzs4uw2bx4sVycXHRd999d8tj5ebmqkaNGlqwYEExVIbSjkCEO1rBH8eCR7ly5VS3bl0NHz5caWlpzi4PKNOSk5P13nvv6e9///tNrbdz505NmjRJmZmZt6ewYuLh4aHRo0dr6tSpunjxorPLwW1GIIIpTJkyRR9++KHmzZunVq1a6e2331ZERIR+//13Z5cGlFlz5sxRSEiIOnTocFPr7dy5U5MnTy71gUiSBg0apIyMDC1fvtzZpeA2IxDBFLp27aqnnnpKQ4YM0eLFizVy5EglJydrzZo1zi4NKJNyc3O1bNkyPf7447d1O/n5+U49OuPv768uXbpo8eLFTqsBJYNABFPq2LGjpD8P+UvS2bNn9fLLL6tx48by8fGRxWJR165dtW/fvkLrXrx4UZMmTVLdunVVrlw5Va1aVT179tRPP/0kSTp+/LjdaborH+3bt7eNtXXrVrm4uOjTTz/V3//+dwUHB6tChQrq3r27Tp48WWjbu3bt0kMPPSQ/Pz+VL19e7dq10zfffFPkPrZv377I7U+aNKlQ348++kjh4eHy9vZWQECA+vTpU+T2r7Vvl8vPz9fs2bPVsGFDlStXTkFBQXruuef022+/2fWrVauWunXrVmg7w4cPLzRmUbXPmDGj0GsqSdnZ2Zo4caJCQ0Pl5eWlGjVqaOzYscrOzi7ytbpc+/btC403depUubq6FjpKcKOvxz//+U+1atVKlSpVkre3t8LDw/XZZ58Vuf2PPvpIzZs3V/ny5VWxYkW1bdtWmzZtsuuzfv16tWvXTr6+vrJYLHrggQcK1bZy5Urbz7Ry5cp66qmn9Ouvv9r1efrpp+1qrlixotq3b6+vv/76uq/Tjh07lJGRocjIyELL3nrrLTVs2NC2D82aNbPVN2nSJI0ZM0aSFBISYtv28ePHJf35cx4+fLiWLVumhg0bysvLSxs2bJAkff/99+ratassFot8fHzUqVMnffvtt9et9bffflPz5s1VvXp1HT16VNLNvUc6d+6sHTt26OzZs9fdFsoud2cXADhDQXipVKmSJOnnn3/W6tWr9Ze//EUhISFKS0vTO++8o3bt2umHH35QtWrVJEl5eXnq1q2b4uPj1adPH7344os6d+6c4uLidPDgQdWpU8e2jSeffFIPP/yw3XZjY2OLrGfq1KlycXHRuHHjlJ6ertmzZysyMlJJSUny9vaWJG3ZskVdu3ZVeHi4Jk6cKFdXVy1atEgdO3bU119/rebNmxcat3r16po2bZok6fz58xo2bFiR2x4/frwef/xxDRkyRKdPn9Zbb72ltm3b6vvvv5e/v3+hdZ599lk9+OCDkqTPP/9cX3zxhd3y5557TosXL9agQYP0wgsvKDk5WfPmzdP333+vb775Rh4eHkW+DjcjMzPTtm+Xy8/PV/fu3bVjxw49++yzatCggQ4cOKBZs2bpv//9r1avXn1T21m0aJFeeeUVvfnmm+rbt2+Rfa73esyZM0fdu3dXv379lJOTo08++UR/+ctftHbtWkVHR9v6TZ48WZMmTVKrVq00ZcoUeXp6ateuXdqyZYu6dOki6c95cc8884waNmyo2NhY+fv76/vvv9eGDRts9RW89g888ICmTZumtLQ0zZkzR998802hn2nlypU1a9YsSdIvv/yiOXPm6OGHH9bJkyeL/NkX2Llzp1xcXHT//ffbtb/77rt64YUX1Lt3b7344ou6ePGi9u/fr127dqlv377q2bOn/vvf/+rjjz/WrFmzVLlyZUlSlSpVbGNs2bJFK1as0PDhw1W5cmXVqlVLhw4d0oMPPiiLxaKxY8fKw8ND77zzjtq3b69t27apRYsWRdaZkZGhzp076+zZs9q2bZvq1Klz0++R8PBwGYahnTt3FhngcYcwgDvYokWLDEnG5s2bjdOnTxsnT540PvnkE6NSpUqGt7e38csvvxiGYRgXL1408vLy7NZNTk42vLy8jClTptjaPvjgA0OSMXPmzELbys/Pt60nyZgxY0ahPg0bNjTatWtne/7VV18Zkoy77rrLsFqttvYVK1YYkow5c+bYxr7nnnuMqKgo23YMwzB+//13IyQkxOjcuXOhbbVq1cpo1KiR7fnp06cNScbEiRNtbcePHzfc3NyMqVOn2q174MABw93dvVD7sWPHDEnGkiVLbG0TJ040Lv9T8vXXXxuSjGXLltmtu2HDhkLtNWvWNKKjowvVHhMTY1z55+nK2seOHWsEBgYa4eHhdq/phx9+aLi6uhpff/213foLFy40JBnffPNNoe1drl27drbx1q1bZ7i7uxsvvfRSkX1v5PUwjD9/TpfLyckxGjVqZHTs2NFuLFdXV+Oxxx4r9F4s+JlnZmYavr6+RosWLYw//vijyD45OTlGYGCg0ahRI7s+a9euNSQZEyZMsLUNHDjQqFmzpt04//rXvwxJxu7du4vc5wJPPfWUUalSpULtjz76qNGwYcNrrjtjxgxDkpGcnFxomSTD1dXVOHTokF17jx49DE9PT+Onn36ytaWkpBi+vr5G27ZtbW0Fv/N79uwxTp06ZTRs2NCoXbu2cfz4cVufm32PpKSkGJKMN95445r7hbKNU2YwhcjISFWpUkU1atRQnz595OPjoy+++EJ33XWXJMnLy0uurn/+OuTl5enMmTPy8fFRvXr1tHfvXts4q1atUuXKlTVixIhC27jyNMnNGDBggHx9fW3Pe/furapVq+o///mPJCkpKUnHjh1T3759debMGWVkZCgjI0MXLlxQp06dtH37duXn59uNefHiRZUrV+6a2/3888+Vn5+vxx9/3DZmRkaGgoODdc899+irr76y65+TkyPpz9fralauXCk/Pz917tzZbszw8HD5+PgUGjM3N9euX0ZGxnXnjPz666966623NH78ePn4+BTafoMGDVS/fn27MQtOk165/avZvXu3Hn/8cfXq1UszZswoss+NvB6SbEf5pD9P32RlZenBBx+0e2+tXr1a+fn5mjBhgu29WKDgvRUXF6dz587pb3/7W6GfbUGf7777Tunp6Xr++eft+kRHR6t+/fpat26d3Xr5+fm21ygpKUlLly5V1apV1aBBg2vu05kzZ1SxYsVC7f7+/vrll1+0Z8+ea65/Le3atVNYWJjteV5enjZt2qQePXqodu3atvaqVauqb9++2rFjh6xWq90Yv/zyi9q1a6fc3Fxt375dNWvWtC272fdIwX5mZGQ4vE8o/ThlBlOYP3++6tatK3d3dwUFBalevXp2Hzr5+fmaM2eOFixYoOTkZOXl5dmWFZxWk/481VavXj25uxfvr84999xj99zFxUWhoaG2eRXHjh2TJA0cOPCqY2RlZdl9QGVkZBQa90rHjh2TYRhX7Xflqa2Cq4KuDCFXjpmVlaXAwMAil6enp9s937Rpk93pkhsxceJEVatWTc8991yhuTjHjh3T4cOHrzrmldsvyq+//qro6GhduHBBZ86cuWrYvZHXQ5LWrl2r1157TUlJSXZzVC4f96effpKrq6tdELhSwaneRo0aXbXP//73P0lSvXr1Ci2rX7++duzYYdd28uRJu9eqatWqWrVq1XX3SZIMwyjUNm7cOG3evFnNmzdXaGiounTpor59+6p169bXHa9ASEiI3fPTp0/r999/L3KfGjR
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Подсчет количества объектов каждого класса\n",
"class_counts = y.value_counts()\n",
"print(class_counts)\n",
"\n",
"# Визуализация\n",
"sns.barplot(x=class_counts.index, y=class_counts.values)\n",
"plt.title(\"Распределение классов (stroke)\")\n",
"plt.xlabel(\"Класс\")\n",
"plt.ylabel(\"Количество\")\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Напишем функцию и сделаем аугментацию данных</p>"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\n",
"\n",
"stroke\n",
"0 3889\n",
"1 199\n",
"Name: count, dtype: int64\n",
"\n",
"После оверсемплинга\n",
"\n",
"stroke\n",
"0 3889\n",
"1 1944\n",
"Name: count, dtype: int64\n",
"\n",
"После балансировки данных (андерсемплинга)\n",
"\n",
"stroke\n",
"0 1944\n",
"1 1944\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGbCAYAAAAr/4yjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8YklEQVR4nO3deXhTVcIG8PfeJE3apmVpaUsBoew7aNlBdlAEFRFBEUX8UGdcZ3R0hnHcdRwHR0Vxm3EEQcYFERxEFkGQVUD2nQItlK2lBUq3NNv5/ii9Q2iBLklO7s378+mDTXPvfZOmfXvOuUkUIYQAERERAFV2ACIiCh0sBSIi0rAUiIhIw1IgIiINS4GIiDQsBSIi0rAUiIhIw1IgIiINS4GIiDQsBSKqkYcffhhDhgyRHUMzY8YMKIqCX3/9tcb7crlcaNSoET744AM/JNMHw5dC2QOk7MNms6Fly5Z49NFHkZWVJTseka6lp6fjk08+wZ///Ocqbbdu3Tq8+OKLOHfuXGCC+YnFYsGTTz6J1157DQ6HQ3acoDB8KZR5+eWXMWvWLEybNg29evXChx9+iJ49e6KoqEh2NCLdmjp1KlJSUjBgwIAqbbdu3Tq89NJLIV8KADBx4kTk5OTgP//5j+woQRE2pTBs2DCMHz8ekyZNwowZM/C73/0O6enp+O6772RHI9Ill8uF2bNnY8yYMQE9jtfrlfpXeu3atTF06FDMmDFDWoZgCptSuNTAgQMBlA5/AeDMmTP4wx/+gA4dOsButyM2NhbDhg3D9u3by23rcDjw4osvomXLlrDZbKhfvz5GjRqFQ4cOAQAyMjJ8pqwu/ejfv7+2r5UrV0JRFHz11Vf485//jKSkJERHR+OWW25BZmZmuWNv2LABN954I2rVqoWoqCj069cPa9eurfA29u/fv8Ljv/jii+Wu+/nnnyM1NRWRkZGoW7cu7rzzzgqPf6XbdjGv14t33nkH7dq1g81mQ2JiIh566CGcPXvW53pNmjTBiBEjyh3n0UcfLbfPirJPmTKl3H0KACUlJXjhhRfQvHlzWK1WNGrUCM888wxKSkoqvK8u1r9//3L7e+2116Cqarm/Fit7f7z55pvo1asX4uLiEBkZidTUVHzzzTcVHv/zzz9Ht27dEBUVhTp16qBv375YunSpz3UWLVqEfv36ISYmBrGxsejatWu5bHPmzNG+p/Hx8Rg/fjyOHz/uc5377rvPJ3OdOnXQv39/rF69+qr305o1a5CTk4PBgweX+9p7772Hdu3aabehS5cuWr4XX3wRTz/9NAAgJSVFO3ZGRgaA0u/zo48+itmzZ6Ndu3awWq1YvHgxAGDr1q0YNmwYYmNjYbfbMWjQIPzyyy9XzXr27Fl069YNDRs2xP79+wFU7TEyZMgQrFmzBmfOnLnqsfTOLDuALGW/wOPi4gAAhw8fxvz583HHHXcgJSUFWVlZ+Pjjj9GvXz/s2bMHycnJAACPx4MRI0Zg+fLluPPOO/HEE08gPz8fP/74I3bt2oVmzZppx7jrrrtw0003+Rx38uTJFeZ57bXXoCgK/vjHPyI7OxvvvPMOBg8ejG3btiEyMhIA8NNPP2HYsGFITU3FCy+8AFVVMX36dAwcOBCrV69Gt27dyu23YcOGeP311wEABQUF+O1vf1vhsZ977jmMGTMGkyZNwunTp/Hee++hb9++2Lp1K2rXrl1umwcffBDXX389AODbb7/FvHnzfL7+0EMPYcaMGZg4cSIef/xxpKenY9q0adi6dSvWrl0Li8VS4f1QFefOndNu28W8Xi9uueUWrFmzBg8++CDatGmDnTt34u2338aBAwcwf/78Kh1n+vTp+Mtf/oJ//OMfGDduXIXXudr9MXXqVNxyyy24++674XQ68eWXX+KOO+7A999/j+HDh2vXe+mll/Diiy+iV69eePnllxEREYENGzbgp59+wtChQwGUrpPdf//9aNeuHSZPnozatWtj69atWLx4sZav7L7v2rUrXn/9dWRlZWHq1KlYu3Ztue9pfHw83n77bQDAsWPHMHXqVNx0003IzMys8HtfZt26dVAUBddee63P5f/617/w+OOPY/To0XjiiSfgcDiwY8cObNiwAePGjcOoUaNw4MABfPHFF3j77bcRHx8PAKhXr562j59++glff/01Hn30UcTHx6NJkybYvXs3rr/+esTGxuKZZ56BxWLBxx9/jP79++Pnn39G9+7dK8yZk5ODIUOG4MyZM/j555/RrFmzKj9GUlNTIYTAunXrKvwjxlCEwU2fPl0AEMuWLROnT58WmZmZ4ssvvxRxcXEiMjJSHDt2TAghhMPhEB6Px2fb9PR0YbVaxcsvv6xd9umnnwoA4q233ip3LK/Xq20HQEyZMqXcddq1ayf69eunfb5ixQoBQDRo0ECcP39eu/zrr78WAMTUqVO1fbdo0ULccMMN2nGEEKKoqEikpKSIIUOGlDtWr169RPv27bXPT58+LQCIF154QbssIyNDmEwm8dprr/lsu3PnTmE2m8tdnpaWJgCIzz77TLvshRdeEBc/lFavXi0AiNmzZ/tsu3jx4nKXN27cWAwfPrxc9kceeURc+vC8NPszzzwjEhISRGpqqs99OmvWLKGqqli9erXP9h999JEAINauXVvueBfr16+ftr+FCxcKs9ksnnrqqQqvW5n7Q4jS79PFnE6naN++vRg4cKDPvlRVFbfddlu5x2LZ9/zcuXMiJiZGdO/eXRQXF1d4HafTKRISEkT79u19rvP9998LAOL555/XLpswYYJo3Lixz37++c9/CgBi48aNFd7mMuPHjxdxcXHlLr/11ltFu3btrrjtlClTBACRnp5e7msAhKqqYvfu3T6Xjxw5UkRERIhDhw5pl504cULExMSIvn37apeV/cxv2rRJnDx5UrRr1040bdpUZGRkaNep6mPkxIkTAoB44403rni7jCBspo8GDx6MevXqoVGjRrjzzjtht9sxb948NGjQAABgtVqhqqV3h8fjQW5uLux2O1q1aoUtW7Zo+5k7dy7i4+Px2GOPlTvGpVMGVXHvvfciJiZG+3z06NGoX78+fvjhBwDAtm3bkJaWhnHjxiE3Nxc5OTnIyclBYWEhBg0ahFWrVsHr9frs0+FwwGazXfG43377LbxeL8aMGaPtMycnB0lJSWjRogVWrFjhc32n0wmg9P66nDlz5qBWrVoYMmSIzz5TU1Nht9vL7dPlcvlcLycn56pzyMePH8d7772H5557Dna7vdzx27Rpg9atW/vss2zK8NLjX87GjRsxZswY3H777ZgyZUqF16nM/QFAG+0BpVMZeXl5uP76630eW/Pnz4fX68Xzzz+vPRbLlD22fvzxR+Tn5+NPf/pTue9t2XV+/fVXZGdn4+GHH/a5zvDhw9G6dWssXLjQZzuv16vdR9u2bcPMmTNRv359tGnT5oq3KTc3F3Xq1Cl3ee3atXHs2DFs2rTpittfSb9+/dC2bVvtc4/Hg6VLl2LkyJFo2rSpdnn9+vUxbtw4rFmzBufPn/fZx7Fjx9CvXz+4XC6sWrUKjRs31r5W1cdI2e3Mycmp9m3Si7CZPnr//ffRsmVLmM1mJCYmolWrVj4/eF6vF1OnTsUHH3yA9PR0eDwe7WtlU0xA6bRTq1atYDb7965r0aKFz+eKoqB58+baPGtaWhoAYMKECZfdR15ens8PaU5OTrn9XiotLQ1CiMte79JpnrKzRS79RXzpPvPy8pCQkFDh17Ozs30+X7p0qc/UQWW88MILSE5OxkMPPVRubj4tLQ179+697D4vPX5Fjh8/juHDh6OwsBC5ubmXLfzK3B8A8P333+PVV1/Ftm3bfOasL97voUOHoKqqzy/DS5VNe7Zv3/6y1zly5AgAoFWrVuW+1rp1a6xZs8bnsszMTJ/7qn79+pg7d+5VbxMAiAreuPGPf/wjli1bhm7duqF58+YYOnQoxo0bh969e191f2VSUlJ8Pj99+jSKiooqvE1t2rSB1+tFZmYm2rVrp11+zz33wGw2Y+/evUhKSvLZpqqPkbLbWZM//PQibEqhW7du6NKly2W//te
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные ДО аугментации в ТЕСТОВОЙ ВЫБОРКЕ (10-20% данных)\n",
"\n",
"stroke\n",
"0 486\n",
"1 25\n",
"Name: count, dtype: int64\n",
"\n",
"После оверсемплинга\n",
"\n",
"stroke\n",
"0 486\n",
"1 243\n",
"Name: count, dtype: int64\n",
"\n",
"После балансировки данных (андерсемплинга)\n",
"\n",
"stroke\n",
"0 243\n",
"1 243\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGbCAYAAAAr/4yjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8YklEQVR4nO3deXhTVcIG8PfeJE3apmVpaUsBoew7aNlBdlAEFRFBEUX8UGdcZ3R0hnHcdRwHR0Vxm3EEQcYFERxEFkGQVUD2nQItlK2lBUq3NNv5/ii9Q2iBLklO7s378+mDTXPvfZOmfXvOuUkUIYQAERERAFV2ACIiCh0sBSIi0rAUiIhIw1IgIiINS4GIiDQsBSIi0rAUiIhIw1IgIiINS4GIiDQsBSKqkYcffhhDhgyRHUMzY8YMKIqCX3/9tcb7crlcaNSoET744AM/JNMHw5dC2QOk7MNms6Fly5Z49NFHkZWVJTseka6lp6fjk08+wZ///Ocqbbdu3Tq8+OKLOHfuXGCC+YnFYsGTTz6J1157DQ6HQ3acoDB8KZR5+eWXMWvWLEybNg29evXChx9+iJ49e6KoqEh2NCLdmjp1KlJSUjBgwIAqbbdu3Tq89NJLIV8KADBx4kTk5OTgP//5j+woQRE2pTBs2DCMHz8ekyZNwowZM/C73/0O6enp+O6772RHI9Ill8uF2bNnY8yYMQE9jtfrlfpXeu3atTF06FDMmDFDWoZgCptSuNTAgQMBlA5/AeDMmTP4wx/+gA4dOsButyM2NhbDhg3D9u3by23rcDjw4osvomXLlrDZbKhfvz5GjRqFQ4cOAQAyMjJ8pqwu/ejfv7+2r5UrV0JRFHz11Vf485//jKSkJERHR+OWW25BZmZmuWNv2LABN954I2rVqoWoqCj069cPa9eurfA29u/fv8Ljv/jii+Wu+/nnnyM1NRWRkZGoW7cu7rzzzgqPf6XbdjGv14t33nkH7dq1g81mQ2JiIh566CGcPXvW53pNmjTBiBEjyh3n0UcfLbfPirJPmTKl3H0KACUlJXjhhRfQvHlzWK1WNGrUCM888wxKSkoqvK8u1r9//3L7e+2116Cqarm/Fit7f7z55pvo1asX4uLiEBkZidTUVHzzzTcVHv/zzz9Ht27dEBUVhTp16qBv375YunSpz3UWLVqEfv36ISYmBrGxsejatWu5bHPmzNG+p/Hx8Rg/fjyOHz/uc5377rvPJ3OdOnXQv39/rF69+qr305o1a5CTk4PBgweX+9p7772Hdu3aabehS5cuWr4XX3wRTz/9NAAgJSVFO3ZGRgaA0u/zo48+itmzZ6Ndu3awWq1YvHgxAGDr1q0YNmwYYmNjYbfbMWjQIPzyyy9XzXr27Fl069YNDRs2xP79+wFU7TEyZMgQrFmzBmfOnLnqsfTOLDuALGW/wOPi4gAAhw8fxvz583HHHXcgJSUFWVlZ+Pjjj9GvXz/s2bMHycnJAACPx4MRI0Zg+fLluPPOO/HEE08gPz8fP/74I3bt2oVmzZppx7jrrrtw0003+Rx38uTJFeZ57bXXoCgK/vjHPyI7OxvvvPMOBg8ejG3btiEyMhIA8NNPP2HYsGFITU3FCy+8AFVVMX36dAwcOBCrV69Gt27dyu23YcOGeP311wEABQUF+O1vf1vhsZ977jmMGTMGkyZNwunTp/Hee++hb9++2Lp1K2rXrl1umwcffBDXX389AODbb7/FvHnzfL7+0EMPYcaMGZg4cSIef/xxpKenY9q0adi6dSvWrl0Li8VS4f1QFefOndNu28W8Xi9uueUWrFmzBg8++CDatGmDnTt34u2338aBAwcwf/78Kh1n+vTp+Mtf/oJ//OMfGDduXIXXudr9MXXqVNxyyy24++674XQ68eWXX+KOO+7A999/j+HDh2vXe+mll/Diiy+iV69eePnllxEREYENGzbgp59+wtChQwGUrpPdf//9aNeuHSZPnozatWtj69atWLx4sZav7L7v2rUrXn/9dWRlZWHq1KlYu3Ztue9pfHw83n77bQDAsWPHMHXqVNx0003IzMys8HtfZt26dVAUBddee63P5f/617/w+OOPY/To0XjiiSfgcDiwY8cObNiwAePGjcOoUaNw4MABfPHFF3j77bcRHx8PAKhXr562j59++glff/01Hn30UcTHx6NJkybYvXs3rr/+esTGxuKZZ56BxWLBxx9/jP79++Pnn39G9+7dK8yZk5ODIUOG4MyZM/j555/RrFmzKj9GUlNTIYTAunXrKvwjxlCEwU2fPl0AEMuWLROnT58WmZmZ4ssvvxRxcXEiMjJSHDt2TAghhMPhEB6Px2fb9PR0YbVaxcsvv6xd9umnnwoA4q233ip3LK/Xq20HQEyZMqXcddq1ayf69eunfb5ixQoBQDRo0ECcP39eu/zrr78WAMTUqVO1fbdo0ULccMMN2nGEEKKoqEikpKSIIUOGlDtWr169RPv27bXPT58+LQCIF154QbssIyNDmEwm8dprr/lsu3PnTmE2m8tdnpaWJgCIzz77TLvshRdeEBc/lFavXi0AiNmzZ/tsu3jx4nKXN27cWAwfPrxc9kceeURc+vC8NPszzzwjEhISRGpqqs99OmvWLKGqqli9erXP9h999JEAINauXVvueBfr16+ftr+FCxcKs9ksnnrqqQqvW5n7Q4jS79PFnE6naN++vRg4cKDPvlRVFbfddlu5x2LZ9/zcuXMiJiZGdO/eXRQXF1d4HafTKRISEkT79u19rvP9998LAOL555/XLpswYYJo3Lixz37++c9/CgBi48aNFd7mMuPHjxdxcXHlLr/11ltFu3btrrjtlClTBACRnp5e7msAhKqqYvfu3T6Xjxw5UkRERIhDhw5pl504cULExMSIvn37apeV/cxv2rRJnDx5UrRr1040bdpUZGRkaNep6mPkxIkTAoB44403rni7jCBspo8GDx6MevXqoVGjRrjzzjtht9sxb948NGjQAABgtVqhqqV3h8fjQW5uLux2O1q1aoUtW7Zo+5k7dy7i4+Px2GOPlTvGpVMGVXHvvfciJiZG+3z06NGoX78+fvjhBwDAtm3bkJaWhnHjxiE3Nxc5OTnIyclBYWEhBg0ahFWrVsHr9frs0+FwwGazXfG43377LbxeL8aMGaPtMycnB0lJSWjRogVWrFjhc32n0wmg9P66nDlz5qBWrVoYMmSIzz5TU1Nht9vL7dPlcvlcLycn56pzyMePH8d7772H5557Dna7vdzx27Rpg9atW/vss2zK8NLjX87GjRsxZswY3H777ZgyZUqF16nM/QFAG+0BpVMZeXl5uP76630eW/Pnz4fX68Xzzz+vPRbLlD22fvzxR+Tn5+NPf/pTue9t2XV+/fVXZGdn4+GHH/a5zvDhw9G6dWssXLjQZzuv16vdR9u2bcPMmTNRv359tGnT5oq3KTc3F3Xq1Cl3ee3atXHs2DFs2rTpittfSb9+/dC2bVvtc4/Hg6VLl2LkyJFo2rSpdnn9+vUxbtw4rFmzBufPn/fZx7Fjx9CvXz+4XC6sWrUKjRs31r5W1cdI2e3Mycmp9m3Si7CZPnr//ffRsmVLmM1mJCYmolWrVj4/eF6vF1OnTsUHH3yA9PR0eDwe7WtlU0xA6bRTq1atYDb7965r0aKFz+eKoqB58+baPGtaWhoAYMKECZfdR15ens8PaU5OTrn9XiotLQ1CiMte79JpnrKzRS79RXzpPvPy8pCQkFDh17Ozs30+X7p0qc/UQWW88MILSE5OxkMPPVRubj4tLQ179+697D4vPX5Fjh8/juHDh6OwsBC5ubmXLfzK3B8A8P333+PVV1/Ftm3bfOasL97voUOHoKqqzy/DS5VNe7Zv3/6y1zly5AgAoFWrVuW+1rp1a6xZs8bnsszMTJ/7qn79+pg7d+5VbxMAiAreuPGPf/wjli1bhm7duqF58+YYOnQoxo0bh969e191f2VSUlJ8Pj99+jSKiooqvE1t2rSB1+tFZmYm2rVrp11+zz33wGw2Y+/evUhKSvLZpqqPkbLbWZM//PQibEqhW7du6NKly2W//te
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def over_under_sampling(x_selection, y_selection):\n",
"\n",
" # сначала увеличение меньшинства\n",
"\n",
" oversampler = RandomOverSampler(sampling_strategy=0.5, random_state=42) \n",
" x_over, y_over = oversampler.fit_resample(x_selection, y_selection) \n",
"\n",
" print(\"\\nПосле оверсемплинга\\n\")\n",
" print(y_over.value_counts())\n",
"\n",
" # потом уменьшение большинства\n",
"\n",
" undersampler = RandomUnderSampler(sampling_strategy=1.0, random_state=42)\n",
" x_balanced, y_balanced = undersampler.fit_resample(x_over, y_over)\n",
"\n",
" print(\"\\nПосле балансировки данных (андерсемплинга)\\n\")\n",
" print(y_balanced.value_counts())\n",
"\n",
" plt.pie(\n",
" y_balanced.value_counts(), \n",
" labels=class_counts.index, # Метки классов (0 и 1)\n",
" autopct='%1.1f%%', # Отображение процентов\n",
" colors=['lightgreen', 'lightcoral'], # Цвета для классов\n",
" startangle=45, # Поворот диаграммы\n",
" explode=(0, 0.05) # Небольшое смещение для класса 1\n",
" )\n",
" plt.title(\"Распределение классов (stroke)\")\n",
" plt.show()\n",
"\n",
"print(\"Данные ДО аугментации в ОБУЧАЮЩЕЙ ВЫБОРКЕ (60-80% данных)\\n\")\n",
"print(y_train.value_counts())\n",
"over_under_sampling(X_train, y_train)\n",
"\n",
"print(\"Данные ДО аугментации в ТЕСТОВОЙ ВЫБОРКЕ (10-20% данных)\\n\")\n",
"print(y_test.value_counts())\n",
"over_under_sampling(X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Теперь можно и к конструированию признаков приступить) данные ведь сбалансированы (в выборках)</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Унитарное кодирование категориальных признаков <br/> <br/>Применяем к категориальным (НЕ числовым) признакам: 'gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'</p>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-Hot Encoding\n",
"categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']\n",
"X_encoded = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)\n",
"\n",
"print(\"Данные после унитарного кодирования:\")\n",
"print(X_encoded.head())\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}