416 lines
117 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество колонок: 12\n",
"Колонки: Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
" 'smoking_status', 'stroke'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"# Загрузка данных\n",
"data = pd.read_csv('./csv/option4.csv')\n",
"\n",
"# Обзор данных\n",
"print(\"Количество колонок:\", data.columns.size)\n",
"print(\"Колонки:\", data.columns)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 201\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n"
]
}
],
"source": [
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.describe of id gender age hypertension heart_disease ever_married \\\n",
"0 9046 Male 67.0 0 1 Yes \n",
"1 51676 Female 61.0 0 0 Yes \n",
"2 31112 Male 80.0 0 1 Yes \n",
"3 60182 Female 49.0 0 0 Yes \n",
"4 1665 Female 79.0 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"5105 18234 Female 80.0 1 0 Yes \n",
"5106 44873 Female 81.0 0 0 Yes \n",
"5107 19723 Female 35.0 0 0 Yes \n",
"5108 37544 Male 51.0 0 0 Yes \n",
"5109 44679 Female 44.0 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 Private Urban 228.69 36.6 formerly smoked \n",
"1 Self-employed Rural 202.21 NaN never smoked \n",
"2 Private Rural 105.92 32.5 never smoked \n",
"3 Private Urban 171.23 34.4 smokes \n",
"4 Self-employed Rural 174.12 24.0 never smoked \n",
"... ... ... ... ... ... \n",
"5105 Private Urban 83.75 NaN never smoked \n",
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
"5108 Private Rural 166.29 25.6 formerly smoked \n",
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
"\n",
" stroke \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"5105 0 \n",
"5106 0 \n",
"5107 0 \n",
"5108 0 \n",
"5109 0 \n",
"\n",
"[5110 rows x 12 columns]>\n"
]
}
],
"source": [
"print(data.describe)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Возьмем и заменим нулевые значения в столбце bmi на средние значения по столбцу </p>"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Наличие пропущенных значений:\n",
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 0\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n"
]
}
],
"source": [
"data['bmi'] = data['bmi'].fillna(data['bmi'].median())\n",
"print(\"\\nНаличие пропущенных значений:\")\n",
"print(data.isnull().sum())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Взглянем на выбросы: </p>"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABscUlEQVR4nO3de1yUdf7//+cwHD0wiAcGChDPirIeKiOM3GTDU6vlZhaltX7081G0zG0rNw9hGFvblj9bsGz7mBau1ZZmZVpiylZoahoeWtMicdPBCgEPiQjX74++XB9HmRQdGAYe99ttbs28r9dc87rwlm988uZ9WQzDMAQAAAAAAAAAAM7j4+kGAAAAAAAAAABoqAjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAAAAAAAAAHCBEB0AAAAAAAAAABcI0QEAAAAAAAAAcIEQHQAAAAAAAAAAFwjRAQAAAAAAAABwgRAdAAAAAAAAAAAXCNEBAACAemCxWPTYY495uo16tWHDBlksFm3YsMHTrTSoXgAAjddjjz0mi8WiH374oU4/55577lH79u3r9DMA/B9CdAAAAAAAAAAAXPD1dAMAAAAAAAAALt6LL76oqqoqT7cBNBmE6AAAAAAAAIAX8fPz83QLQJPCdi5AI3TgwAFNnjxZXbt2VVBQkFq3bq3bbrtN33777Xm1+fn5uuGGGxQUFKQrr7xS6enpWrx4sSwWy3n177//vq6//no1b95cLVu21LBhw7R79+76uSgAQJN1MfPa1q1bZbFYtGTJkvPev3btWlksFr377rvm2IYNG3TVVVcpMDBQHTt21AsvvGDuYVpbb7zxhnr06KHAwED17NlTK1asuKh9Sl3VuOrj1Vdf1TXXXKNmzZqpVatWSkxM1AcffOBUk5WVpdjYWAUEBCgiIkKpqakqKSlxqtm3b59GjRolu92uwMBAXXnllRozZoxKS0vP+7x+/fopKChIoaGhGjNmjA4ePHhRX5ML2bx5swYPHiybzaZmzZrphhtu0CeffGIe/+c//ymLxaKNGzee994XXnhBFotFu3btMsf+/e9/63e/+51CQ0MVGBioq666SqtWrXJLrwAAXIoffvhBo0ePVnBwsFq3bq37779fp06dMo9bLBZNmTLF/D4iKChI8fHx2rlzp6Sf57tOnTopMDBQAwcOPO/f5+yJDtQvVqIDjdCWLVv06aefasyYMbryyiv17bffauHChRo4cKD27NmjZs2aSZK+++47/frXv5bFYtGMGTPUvHlz/f3vf1dAQMB553zllVc0btw4JScn68knn9TJkye1cOFCDRgwQNu3b2fyBgDUmYuZ16666ip16NBBr7/+usaNG+f0/tdee02tWrVScnKyJGn79u0aPHiwwsPDlZaWpsrKSs2dO1dt27atdW/vvfeebr/9dvXq1UsZGRk6evSoxo8fryuuuMIt114tLS1Njz32mK677jrNnTtX/v7+2rx5s9avX6+bbrpJ0s/he1pampKSkjRp0iTt3btXCxcu1JYtW/TJJ5/Iz89Pp0+fVnJyssrLyzV16lTZ7XZ99913evfdd1VSUiKbzSZJmjdvnmbNmqXRo0frv/7rv/T999/rueeeU2JiorZv366QkJBLvpb169dryJAh6tevn+bMmSMfHx8tXrxYN954o/71r3/pmmuu0bBhw9SiRQu9/vrruuGGG5ze/9prryk2NlY9e/aUJO3evVsJCQm64oor9Mgjj6h58+Z6/fXXNXLkSL355pu65ZZbLrlXAAAu1ejRo9W+fXtlZGRo06ZNWrBggY4ePaqlS5eaNf/617+0atUqpaamSpIyMjI0fPhwPfTQQ8rKytLkyZN19OhRPfXUU/r973+v9evXe+pyABgAGp2TJ0+eN5aXl2dIMpYuXWqOTZ061bBYLMb27dvNsR9//NEIDQ01JBkFBQWGYRjGsWPHjJCQEGPChAlO53Q4HIbNZjtvHAAAd7rYeW3GjBmGn5+fUVxcbI6Vl5cbISEhxu9//3tz7OabbzaaNWtmfPfdd+bYvn37DF9fX6O23x736tXLuPLKK41jx46ZYxs2bDAkGdHR0U61kow5c+aYr8eNG3dejWEYxpw5c5z62Ldvn+Hj42PccsstRmVlpVNtVVWVYRiGceTIEcPf39+46aabnGr+9re/GZKM//3f/zUMwzC2b99uSDLeeOMNl9f07bffGlar1Zg3b57T+M6dOw1fX9/zxn/JRx99ZEgyPvroI7Pfzp07G8nJyWbvhvHzn3FMTIzxm9/8xhy74447jHbt2hlnzpwxxw4fPmz4+PgYc+fONccGDRpk9OrVyzh16pTT1+W6664zOnfu7LIXAADqQvU8/tvf/tZpfPLkyYYk44svvjAM4+fvCwICAsx/dxuGYbzwwguGJMNutxtlZWXm+IwZM5z+jW4Yrr+PAFA32M4FaISCgoLM5xUVFfrxxx/VqVMnhYSE6PPPPzePrVmzRvHx8erdu7c5FhoaqpSUFKfzffjhhyopKdEdd9yhH374wXxYrVb1799fH330UZ1fEwCg6brYee32229XRUWF3nrrLXPsgw8+UElJiW6//XZJUmVlpdatW6eRI0cqIiLCrOvUqZOGDBlSq74OHTqknTt3auzYsWrRooU5fsMNN6hXr161vk5XVq5cqaqqKs2ePVs+Ps7fvldv+7Ju3TqdPn1a06ZNc6qZMGGCgoOD9d5770mSudJ87dq1OnnyZI2f99Zbb6mqqkqjR492mvftdrs6d+58WfP+jh07tG/fPt1555368ccfzXOfOHFCgwYNUm5urnmTtNtvv11HjhzRhg0bzPf/85//VFVVlfnnWVxcrPXr12v06NE6duyYeb4ff/xRycnJ2rdvn7777rtL7hcAgEtVvbq82tSpUyVJq1evNscGDRrk9Fvd/fv3lySNGjVKLVu2PG/8m2++qat2AVwA27kAjdBPP/2kjIwMLV68WN99950MwzCPnb3f6YEDBxQfH3/e+zt16uT0et++fZKkG2+8scbPCw4OdkfbAADU6GLntV/96lfq1q2bXnvtNY0fP17Sz1t/tGnTxpzDjhw5op9++um8uU46f/67kAMHDrh8X6dOnZwC/svx9ddfy8fHRz169LhgL127dnUa9/f3V4cOHczjMTExmj59up555hllZ2fr+uuv129/+1vdddddZsC+b98+GYahzp071/hZl3Mjs+rvKc7dcudspaWlatWqlbln+muvvaZBgwZJ+vnPs3fv3urSpYskaf/+/TIMQ7NmzdKsWbNqPN+RI0fcvr0OAAAXcu482rFjR/n4+DjtbR4VFeVUUz0XR0ZG1jh+9OjROugUwMUgRAcaoalTp2rx4sWaNm2a4uPjZbPZZLFYNGbMGHN1V21Uv+eVV16R3W4/77ivL3+VAADqTm3mtdtvv13z5s3TDz/8oJYtW2rVqlW64447GuRc5eomppWVlXX6uX/96191zz336O2339YHH3yg++67z9yv9corr1RVVZUsFovef/99Wa3W895/9qr72qr+8/rLX/7i9JtwNZ0/ICBAI0eO1IoVK5SVlaWioiJ98skneuKJJ84734MPPmjueX+u2v5wBACAulDTvF/TPPtL42cvJABQvxrevyYAXLZ//vOfGjdunP7617+aY6dOnVJJSYlTXXR0tPbv33/e+88d69ixoySpXbt2SkpKcn/DAAD8goud16SfQ/S0tDS9+eabCgsLU1lZmcaMGWMeb9eunQIDAy9q/ruQ6Ohol++7mHO1atWqxmuoXjVerWPHjqqqqtKePXtcBs/Vvezdu1cdOnQwx0+fPq2CgoLz5u9evXqpV69emjlzpj799FMlJCTo+eefV3p6ujp27CjDMBQTE2Ou+HaX6u8pgoODL+p7ittvv11LlixRTk6OvvzySxmGYW7lIsm8Vj8/P75HAQA0KPv27VNMTIz5ev/+/aqqqnLavgWA92BPdKARslqt5/2E+rnnnjtvZVtycrLy8vK0Y8cOc6y4uFjZ2dnn1QUHB+uJJ55QRUXFeZ/3/fffu695AADOcbHzmiR1795dvXr
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def plot_numeric_boxplots(dataframe, columns):\n",
" # Фильтрация числовых столбцов\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" \n",
" # Построение графиков\n",
" if numeric_columns:\n",
" plt.figure(figsize=(15, 5))\n",
" \n",
" for i, col in enumerate(numeric_columns):\n",
" if col != 'id':\n",
" plt.subplot(1, len(numeric_columns), i + 1)\n",
" sns.boxplot(y=dataframe[col])\n",
" plt.title(f'{col}')\n",
" plt.ylabel('')\n",
" plt.xlabel(col)\n",
" \n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"Нет подходящих числовых столбцов для построения графиков.\")\n",
"\n",
"plot_numeric_boxplots(data, data.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Видим выбросы в столбцах со средним уровнем глюкозы и в столбце bmi (индекс массы тела). устраним выбросы - поставим верхние и нижние границы</p>"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAHqCAYAAADrpwd3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABcoElEQVR4nO3dfZiVVb0//vcgMIPADIIxAwmI+AAGPhuSZqQk4kOgnJSiNPPIqfCRThpH0SSJ9Fh5NMQ0j2lBmqUezcQUBcpQEUVNjdBQSAVKZUZQRmT2749+7m8j7BIFBobX67ruS/Za6177cw9erJn33HvdZYVCoRAAAAAAAGAtLZq6AAAAAAAA2FwJ0QEAAAAAoAQhOgAAAAAAlCBEBwAAAACAEoToAAAAAABQghAdAAAAAABKEKIDAAAAAEAJQnQAAAAAAChBiA4AAAAAACUI0QEAYBMoKyvLN7/5zaYuY5OaMWNGysrKMmPGjKYuZbOqBYDm65vf/GbKysryt7/9baO+zxe/+MXsuOOOG/U9gP9HiA4AAAAAACW0bOoCAAAAAID37pprrklDQ0NTlwFbDSE6AAAAAGxBWrVq1dQlwFbFdi7QDL3wwgv56le/mt122y1t2rRJp06d8pnPfCbPP//8WmOfeOKJfOITn0ibNm2yww475KKLLsp1112XsrKytcbfdddd+fjHP562bdumffv2OfLII/PUU09tmosCYKv1Xta1Rx55JGVlZbn++uvXOv/uu+9OWVlZfvWrXxXbZsyYkf322y8VFRXp1atXfvjDHxb3MF1fN998c3bfffdUVFSkb9++ufXWW9/TPqWlxpSq46c//Wk++tGPZtttt812222Xgw8+OL/5zW8ajbnyyivzkY98JOXl5enatWtGjx6d5cuXNxqzYMGCDB8+PDU1NamoqMgOO+yQESNGpLa2dq3323fffdOmTZt07NgxI0aMyOLFi9/T1+Rfeeihh3L44Yenqqoq2267bT7xiU/kgQceKPb/4he/SFlZWWbOnLnWuT/84Q9TVlaWP/zhD8W2P/7xj/m3f/u3dOzYMRUVFdlvv/1y++23b5BaAeD9+Nvf/pbjjjsulZWV6dSpU84444ysWrWq2F9WVpZTTz21+H1EmzZtMmDAgDz55JNJ/r7e7bzzzqmoqMjAgQPX+vncnuiwabkTHZqhOXPm5Pe//31GjBiRHXbYIc8//3wmT56cgQMH5umnn862226bJHnxxRfzyU9+MmVlZRk7dmzatm2bH/3oRykvL19rzp/85Cc58cQTM3jw4Fx88cV54403Mnny5Bx00EF57LHHLN4AbDTvZV3bb7/9stNOO+XnP/95TjzxxEbn33TTTdluu+0yePDgJMljjz2Www8/PF26dMmFF16YNWvWZPz48fnQhz603rXdeeedOf7449OvX79MnDgxr732Wk4++eR8+MMf3iDX/o4LL7ww3/zmN/Oxj30s48ePT+vWrfPQQw/lvvvuy2GHHZbk7+H7hRdemEGDBuUrX/lK5s+fn8mTJ2fOnDl54IEH0qpVq7z11lsZPHhw6uvrc9ppp6WmpiYvvvhifvWrX2X58uWpqqpKkkyYMCHjxo3Lcccdl3//93/PX//611xxxRU5+OCD89hjj6VDhw7v+1ruu+++DBkyJPvuu28uuOCCtGjRItddd10OOeSQ/Pa3v81HP/rRHHnkkWnXrl1+/vOf5xOf+ESj82+66aZ85CMfSd++fZMkTz31VA488MB8+MMfzje+8Y20bds2P//5zzNs2LD88pe/zDHHHPO+awWA9+u4447LjjvumIkTJ+bBBx/M5Zdfntdeey033HBDccxvf/vb3H777Rk9enSSZOLEiTnqqKNy9tln58orr8xXv/rVvPbaa7nkkkvypS99Kffdd19TXQ5QAJqdN954Y6222bNnF5IUbrjhhmLbaaedVigrKys89thjxbZXXnml0LFjx0KSwsKFCwuFQqHw+uuvFzp06FA45ZRTGs25ZMmSQlVV1VrtALAhvdd1bezYsYVWrVoVXn311WJbfX19oUOHDoUvfelLxbajjz66sO222xZefPHFYtuCBQsKLVu2LKzvt8f9+vUr7LDDDoXXX3+92DZjxoxCkkKPHj0ajU1SuOCCC4qvTzzxxLXGFAqFwgUXXNCojgULFhRatGhROOaYYwpr1qxpNLahoaFQKBQKy5YtK7Ru3bpw2GGHNRrzgx/8oJCk8L//+7+FQqFQeOyxxwpJCjfffHPJa3r++ecL22yzTWHChAmN2p988slCy5Yt12r/Z+6///5CksL9999frHeXXXYpDB48uFh7ofD3v+OePXsWPvWpTxXbPvvZzxY6d+5cePvtt4ttL7/8cqFFixaF8ePHF9sOPfTQQr9+/QqrVq1q9HX52Mc+Vthll11K1gIAG8M76/inP/3pRu1f/epXC0kKjz/+eKFQ+Pv3BeXl5cWfuwuFQuGHP/xhIUmhpqamUFdXV2wfO3Zso5/RC4XS30cAG4ftXKAZatOmTfHPq1evziuvvJKdd945HTp0yKOPPlrsmzZtWgYMGJC99tqr2NaxY8eMHDmy0Xz33HNPli9fns9+9rP529/+Vjy22Wab9O/fP/fff/9GvyYAtl7vdV07/vjjs3r16txyyy3Ftt/85jdZvnx5jj/++CTJmjVrcu+992bYsGHp2rVrcdzOO++cIUOGrFddL730Up588smccMIJadeuXbH9E5/4RPr167fe11nKbbfdloaGhpx//vlp0aLxt+/vbPty77335q233sqZZ57ZaMwpp5ySysrK3HnnnUlSvNP87rvvzhtvvLHO97vlllvS0NCQ4447rtG6X1NTk1122eUDrfvz5s3LggUL8rnPfS6vvPJKce6VK1fm0EMPzaxZs4oPSTv++OOzbNmyzJgxo3j+L37xizQ0NBT/Pl999dXcd999Oe644/L6668X53vllVcyePDgLFiwIC+++OL7rhcA3q937i5/x2mnnZYk+fWvf11sO/TQQxt9qrt///5JkuHDh6d9+/Zrtf/5z3/eWOUC/4LtXKAZevPNNzNx4sRcd911efHFF1MoFIp9/7jf6QsvvJABAwasdf7OO+/c6PWCBQuSJIcccsg636+ysnJDlA0A6/Re17U999wzvXv3zk033ZSTTz45yd+3/th+++2La9iyZcvy5ptvrrXWJWuvf//KCy+8UPK8nXfeuVHA/0E899xzadGiRXbfffd/Wctuu+3WqL1169bZaaediv09e/bMmDFj8r3vfS9TpkzJxz/+8Xz605/O5z//+WLAvmDBghQKheyyyy7rfK8P8iCzd76nePeWO/+otrY22223XXHP9JtuuimHHnpokr//fe61117ZddddkyTPPvtsCoVCxo0bl3Hjxq1zvmXLlm3w7XUA4F959zraq1evtGjRotHe5t27d2805p21uFu3butsf+211zZCpcB7IUSHZui0007LddddlzPPPDMDBgxIVVVVysrKMmLEiOLdXevjnXN+8pOfpKamZq3+li39UwLAxrM+69rxxx+fCRMm5G9/+1vat2+f22+/PZ/97Gc3y7Wq1ENM16xZs1Hf97vf/W6++MUv5v/+7//ym9/8Jqeffnpxv9YddtghDQ0NKSsry1133ZVtttlmrfP/8a779fXO39d///d/N/ok3LrmLy8vz7Bhw3LrrbfmyiuvzNKlS/PAAw/k29/+9lrz/ed//mdxz/t3W99fjgDAxrCudX9d6+w/a//HGwmATWvz+2kC+MB+8Ytf5MQTT8x3v/vdYtuqVauyfPnyRuN69OiRZ599dq3z393Wq1evJEnnzp0zaNCgDV8wAPwT73VdS/4eol944YX55S9/merq6tTV1WXEiBHF/s6dO6eiouI9rX//So8ePUqe917m2m677dZ5De/cNf6OXr16paGhIU8//XTJ4PmdWubPn5+ddtqp2P7WW29l4cKFa63f/fr1S79+/XLeeefl97//fQ488MBcddVVueiii9KrV68UCoX07NmzeMf3hvLO9xSVlZXv6XuK448/Ptdff32mT5+eZ555JoV
"text/plain": [
"<Figure size 1500x500 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def remove_outliers(df):\n",
"\n",
" numeric_columns = ['age', 'avg_glucose_level', 'bmi']\n",
" for column in numeric_columns:\n",
" Q1 = df[column].quantile(0.25)\n",
" Q3 = df[column].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
" return df\n",
" \n",
"data = remove_outliers(data)\n",
"plot_numeric_boxplots(data, data.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"margin: 30px;\">Так, от выбросов избавились, теперь разобьем на выборки</p>"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Размеры выборок:\n",
"Обучающая выборка: (3577, 10)\n",
"Тестовая выборка: (766, 10)\n",
"Контрольная выборка: (767, 10)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Определение признаков и целевой переменной\n",
"X = data.drop(columns=['id', 'stroke']) \n",
"y = data['stroke'] \n",
"\n",
"# Обучающая выборка\n",
"X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)\n",
"\n",
"# Тестовая и контрольная выборки\n",
"X_test, X_control, y_test, y_control = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)\n",
"\n",
"print(\"\\nРазмеры выборок:\")\n",
"print(f\"Обучающая выборка: {X_train.shape}\")\n",
"print(f\"Тестовая выборка: {X_test.shape}\")\n",
"print(f\"Контрольная выборка: {X_control.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"stroke\n",
"0 4861\n",
"1 249\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9kUlEQVR4nO3deXyM9/7//2d2IZlEkIRSQmqJpa0oglpDqlFVnFYpqpyWhhYtTs6ptfXVcmorqqeLpaUt1XLKsUQUVSkaja042sahjSRCk0ErieT6/dFP5mckthGZxPW4325zu5n39b7e1+uaTDJP1/W+rnExDMMQAACAibk6uwAAAABnIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABAADTIxABABz2/PPPq3Pnzs4uw2bx4sVycXHRd999d8tj5ebmqkaNGlqwYEExVIbSjkCEO1rBH8eCR7ly5VS3bl0NHz5caWlpzi4PKNOSk5P13nvv6e9///tNrbdz505NmjRJmZmZt6ewYuLh4aHRo0dr6tSpunjxorPLwW1GIIIpTJkyRR9++KHmzZunVq1a6e2331ZERIR+//13Z5cGlFlz5sxRSEiIOnTocFPr7dy5U5MnTy71gUiSBg0apIyMDC1fvtzZpeA2IxDBFLp27aqnnnpKQ4YM0eLFizVy5EglJydrzZo1zi4NKJNyc3O1bNkyPf7447d1O/n5+U49OuPv768uXbpo8eLFTqsBJYNABFPq2LGjpD8P+UvS2bNn9fLLL6tx48by8fGRxWJR165dtW/fvkLrXrx4UZMmTVLdunVVrlw5Va1aVT179tRPP/0kSTp+/LjdaborH+3bt7eNtXXrVrm4uOjTTz/V3//+dwUHB6tChQrq3r27Tp48WWjbu3bt0kMPPSQ/Pz+VL19e7dq10zfffFPkPrZv377I7U+aNKlQ348++kjh4eHy9vZWQECA+vTpU+T2r7Vvl8vPz9fs2bPVsGFDlStXTkFBQXruuef022+/2fWrVauWunXrVmg7w4cPLzRmUbXPmDGj0GsqSdnZ2Zo4caJCQ0Pl5eWlGjVqaOzYscrOzi7ytbpc+/btC403depUubq6FjpKcKOvxz//+U+1atVKlSpVkre3t8LDw/XZZ58Vuf2PPvpIzZs3V/ny5VWxYkW1bdtWmzZtsuuzfv16tWvXTr6+vrJYLHrggQcK1bZy5Urbz7Ry5cp66qmn9Ouvv9r1efrpp+1qrlixotq3b6+vv/76uq/Tjh07lJGRocjIyELL3nrrLTVs2NC2D82aNbPVN2nSJI0ZM0aSFBISYtv28ePHJf35cx4+fLiWLVumhg0bysvLSxs2bJAkff/99+ratassFot8fHzUqVMnffvtt9et9bffflPz5s1VvXp1HT16VNLNvUc6d+6sHTt26OzZs9fdFsoud2cXADhDQXipVKmSJOnnn3/W6tWr9Ze//EUhISFKS0vTO++8o3bt2umHH35QtWrVJEl5eXnq1q2b4uPj1adPH7344os6d+6c4uLidPDgQdWpU8e2jSeffFIPP/yw3XZjY2OLrGfq1KlycXHRuHHjlJ6ertmzZysyMlJJSUny9vaWJG3ZskVdu3ZVeHi4Jk6cKFdXVy1atEgdO3bU119/rebNmxcat3r16po2bZok6fz58xo2bFiR2x4/frwef/xxDRkyRKdPn9Zbb72ltm3b6vvvv5e/v3+hdZ599lk9+OCDkqTPP/9cX3zxhd3y5557TosXL9agQYP0wgsvKDk5WfPmzdP333+vb775Rh4eHkW+DjcjMzPTtm+Xy8/PV/fu3bVjxw49++yzatCggQ4cOKBZs2bpv//9r1avXn1T21m0aJFeeeUVvfnmm+rbt2+Rfa73esyZM0fdu3dXv379lJOTo08++UR/+ctftHbtWkVHR9v6TZ48WZMmTVKrVq00ZcoUeXp6ateuXdqyZYu6dOki6c95cc8884waNmyo2NhY+fv76/vvv9eGDRts9RW89g888ICmTZumtLQ0zZkzR998802hn2nlypU1a9YsSdIvv/yiOXPm6OGHH9bJkyeL/NkX2Llzp1xcXHT//ffbtb/77rt64YUX1Lt3b7344ou6ePGi9u/fr127dqlv377q2bOn/vvf/+rjjz/WrFmzVLlyZUlSlSpVbGNs2bJFK1as0PDhw1W5cmXVqlVLhw4d0oMPPiiLxaKxY8fKw8ND77zzjtq3b69t27apRYsWRdaZkZGhzp076+zZs9q2bZvq1Klz0++R8PBwGYahnTt3FhngcYcwgDvYokWLDEnG5s2bjdOnTxsnT540PvnkE6NSpUqGt7e38csvvxiGYRgXL1408vLy7NZNTk42vLy8jClTptjaPvjgA0OSMXPmzELbys/Pt60nyZgxY0ahPg0bNjTatWtne/7VV18Zkoy77rrLsFqttvYVK1YYkow5c+bYxr7nnnuMqKgo23YMwzB+//13IyQkxOjcuXOhbbVq1cpo1KiR7fnp06cNScbEiRNtbcePHzfc3NyMqVOn2q174MABw93dvVD7sWPHDEnGkiVLbG0TJ040Lv9T8vXXXxuSjGXLltmtu2HDhkLtNWvWNKKjowvVHhMTY1z55+nK2seOHWsEBgYa4eHhdq/phx9+aLi6uhpff/213foLFy40JBnffPNNoe1drl27drbx1q1bZ7i7uxsvvfRSkX1v5PUwjD9/TpfLyckxGjVqZHTs2NFuLFdXV+Oxxx4r9F4s+JlnZmYavr6+RosWLYw//vijyD45OTlGYGCg0ahRI7s+a9euNSQZEyZMsLUNHDjQqFmzpt04//rXvwxJxu7du4vc5wJPPfWUUalSpULtjz76qNGwYcNrrjtjxgxDkpGcnFxomSTD1dXVOHTokF17jx49DE9PT+Onn36ytaWkpBi+vr5G27ZtbW0Fv/N79uwxTp06ZTRs2NCoXbu2cfz4cVufm32PpKSkGJKMN95445r7hbKNU2YwhcjISFWpUkU1atRQnz595OPjoy+++EJ33XWXJMnLy0uurn/+OuTl5enMmTPy8fFRvXr1tHfvXts4q1atUuXKlTVixIhC27jyNMnNGDBggHx9fW3Pe/furapVq+o///mPJCkpKUnHjh1T3759debMGWVkZCgjI0MXLlxQp06dtH37duXn59uNefHiRZUrV+6a2/3888+Vn5+vxx9/3DZmRkaGgoODdc899+irr76y65+TkyPpz9fralauXCk/Pz917tzZbszw8HD5+PgUGjM3N9euX0ZGxnXnjPz666966623NH78ePn4+BTafoMGDVS/fn27MQtOk165/avZvXu3Hn/8cfXq1UszZswoss+NvB6SbEf5pD9P32RlZenBBx+0e2+tXr1a+fn5mjBhgu29WKDgvRUXF6dz587pb3/7W6GfbUGf7777Tunp6Xr++eft+kRHR6t+/fpat26d3Xr5+fm21ygpKUlLly5V1apV1aBBg2vu05kzZ1SxYsVC7f7+/vrll1+0Z8+ea65/Le3atVNYWJjteV5enjZt2qQePXqodu3atvaqVauqb9++2rFjh6xWq90Yv/zyi9q1a6fc3Fxt375dNWvWtC272fdIwX5mZGQ4vE8o/ThlBlOYP3++6tatK3d3dwUFBalevXp2Hzr5+fmaM2eOFixYoOTkZOXl5dmWFZxWk/481VavXj25uxfvr84999xj99zFxUWhoaG2eRXHjh2TJA0cOPCqY2RlZdl9QGVkZBQa90rHjh2TYRhX7Xflqa2Cq4KuDCFXjpmVlaXAwMAil6enp9s937Rpk93pkhsxceJEVatWTc8991yhuTjHjh3T4cOHrzrmldsvyq+//qro6GhduHBBZ86cuWrYvZHXQ5LWrl2r1157TUlJSXZzVC4f96effpKrq6tdELhSwaneRo0aXbXP//73P0lSvXr1Ci2rX7++duzYYdd28uRJu9eqatWqWrVq1XX3SZIMwyjUNm7cOG3evFnNmzdXaGiounTpor59+6p169bXHa9ASEiI3fPTp0/r999/L3KfGjR
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Подсчет количества объектов каждого класса\n",
"class_counts = y.value_counts()\n",
"print(class_counts)\n",
"\n",
"# Визуализация\n",
"sns.barplot(x=class_counts.index, y=class_counts.values)\n",
"plt.title(\"Распределение классов (stroke)\")\n",
"plt.xlabel(\"Класс\")\n",
"plt.ylabel(\"Количество\")\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение классов до OverSampling:\n",
"stroke\n",
"0 3403\n",
"1 174\n",
"Name: count, dtype: int64\n",
"\n",
"Распределение классов после OverSampling:\n",
"stroke\n",
"0 3403\n",
"1 3403\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGbCAYAAAAr/4yjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAwHElEQVR4nO3deVxU5eIG8GcGEFAWFWRxSRFEFDULw+uKe6ldMzJzzSxv/tK2W2ZZueu1rl2VMsuyNJcszdJrpSlqueaWayqiggEqm4IIss28vz9GznUckEXgnXPO8/18/BTDzJlnzgznmfOed84YhBACREREAIyyAxARkf1gKRARkYKlQERECpYCEREpWApERKRgKRARkYKlQERECpYCEREpWApERKRgKRDRPRk3bhx69+4tO4Zi2bJlMBgMOHTo0D0vq6CgAI0aNcKiRYsqIZk6aL4Uil4gRf9cXFwQHByMF198EcnJybLjEalaXFwclixZgrfffrtct9u7dy+mTZuGjIyMqglWSZycnPDaa69h9uzZyM3NlR2nWmi+FIrMmDEDK1aswMKFC9GxY0d88skn6NChA3JycmRHI1KtqKgoBAQEoHv37uW63d69ezF9+nS7LwUAGD16NNLS0vD111/LjlItdFMKffv2xYgRIzBmzBgsW7YMr776KuLi4rBhwwbZ0YhUqaCgAKtWrcLgwYOr9H7MZrPUd+m1a9dGnz59sGzZMmkZqpNuSuFOPXr0AGDZ/QWAq1evYsKECWjdujXc3Nzg4eGBvn374tixYza3zc3NxbRp0xAcHAwXFxf4+/sjMjIS58+fBwDEx8dbDVnd+a9bt27Ksn799VcYDAZ8++23ePvtt+Hn54datWphwIABSEhIsLnv/fv345FHHoGnpydq1qyJiIgI7Nmzp9jH2K1bt2Lvf9q0aTbXXblyJcLCwuDq6oq6detiyJAhxd7/3R7b7cxmMxYsWIDQ0FC4uLjA19cXY8eOxbVr16yu16RJEzz66KM29/Piiy/aLLO47HPnzrVZpwCQl5eHqVOnIigoCM7OzmjUqBEmTpyIvLy8YtfV7bp162azvNmzZ8NoNNq8Wyzr+vjggw/QsWNHeHl5wdXVFWFhYfjuu++Kvf+VK1ciPDwcNWvWRJ06ddC1a1ds2bLF6jqbNm1CREQE3N3d4eHhgYceesgm29q1a5Xn1NvbGyNGjEBSUpLVdZ555hmrzHXq1EG3bt2wa9euUtfT7t27kZaWhl69etn87qOPPkJoaKjyGNq1a6fkmzZtGt544w0AQEBAgHLf8fHxACzP84svvohVq1YhNDQUzs7O2Lx5MwDgyJEj6Nu3Lzw8PODm5oaePXvi999/LzXrtWvXEB4ejoYNGyImJgZA+V4jvXv3xu7du3H16tVS70vtHGUHkKVoA+7l5QUAuHDhAtavX48nn3wSAQEBSE5OxuLFixEREYFTp06hfv36AACTyYRHH30U27Ztw5AhQ/DKK68gKysLW7duxcmTJxEYGKjcx9ChQ9GvXz+r+500aVKxeWbPng2DwYA333wTKSkpWLBgAXr16oWjR4/C1dUVALB9+3b07dsXYWFhmDp1KoxGI5YuXYoePXpg165dCA8Pt1luw4YNMWfOHADAjRs38MILLxR735MnT8bgwYMxZswYpKam4qOPPkLXrl1x5MgR1K5d2+Y2zz//PLp06QIA+P777/HDDz9Y/X7s2LFYtmwZRo8ejZdffhlxcXFYuHAhjhw5gj179sDJyanY9VAeGRkZymO7ndlsxoABA7B79248//zzaNGiBU6cOIH58+fj7NmzWL9+fbnuZ+nSpXj33Xfxn//8B8OGDSv2OqWtj6ioKAwYMADDhw9Hfn4+vvnmGzz55JP48ccf0b9/f+V606dPx7Rp09CxY0fMmDEDNWrUwP79+7F9+3b06dMHgOU42bPPPovQ0FBMmjQJtWvXxpEjR7B582YlX9G6f+ihhzBnzhwkJycjKioKe/bssXlOvb29MX/+fABAYmIioqKi0K9fPyQkJBT73BfZu3cvDAYDHnjgAavLP//8c7z88ssYNGgQXnnlFeTm5uL48ePYv38/hg0bhsjISJw9exarV6/G/Pnz4e3tDQCoV6+esozt27djzZo1ePHFF+Ht7Y0mTZrgzz//RJcuXeDh4YGJEyfCyckJixcvRrdu3fDbb7+hffv2xeZMS0tD7969cfXqVfz2228IDAws92skLCwMQgjs3bu32DcxmiI0bunSpQKAiI6OFqmpqSIhIUF88803wsvLS7i6uorExEQhhBC5ubnCZDJZ3TYuLk44OzuLGTNmKJd9+eWXAoCYN2+ezX2ZzWbldgDE3Llzba4TGhoqIiIilJ937NghAIgGDRqI69evK5evWbNGABBRUVHKsps1ayYefvhh5X6EECInJ0cEBASI3r1729xXx44dRatWrZSfU1NTBQAxdepU5bL4+Hjh4OAgZs+ebXXbEydOCEdHR5vLY2NjBQDx1VdfKZdNnTpV3P5S2rVrlwAgVq1aZXXbzZs321zeuHFj0b9/f5vs48ePF3e+PO/MPnHiROHj4yPCwsKs1umKFSuE0WgUu3btsrr9p59+KgCIPXv22Nzf7SIiIpTl/fTTT8LR0VG8/vrrxV63LOtDCMvzdLv8/HzRqlUr0aNHD6tlGY1G8fjjj9u8Foue84yMDOHu7i7at28vbt68Wex18vPzhY+Pj2jVqpXVdX788UcBQEyZMkW5bNSoUaJx48ZWy/nss88EAHHgwIFiH3ORESNGCC8vL5vLH3vsMREaGnrX286dO1cAEHFxcTa/AyCMRqP4888/rS4fOHCgqFGjhjh//rxy2aVLl4S7u7vo2rWrclnR3/zBgwfF5cuXRWhoqGjatKmIj49XrlPe18ilS5cEAPH+++/f9XFpgW6Gj3r16oV69eqhUaNGGDJkCNzc3PDDDz+gQYMGAABnZ2cYjZbVYTKZkJ6eDjc3NzRv3hx//PGHspx169bB29sbL730ks193DlkUB5PP/003N3dlZ8HDRoEf39//PzzzwCAo0ePIjY2FsOGDUN6ejrS0tKQlpaG7Oxs9OzZEzt37oTZbLZaZm5uLlxcXO56v99//z3MZjMGDx6sLDMtLQ1+fn5o1qwZduzYYXX9/Px8AJb1VZK1a9fC09MTvXv3tlpmWFgY3NzcbJZZUFBgdb20tLRSx5CTkpLw0UcfYfLkyXBzc7O5/xYtWiAkJMRqmUVDhnfef0kOHDiAwYMH44knnsDcuXOLvU5Z1gcAZW8PsAxlZGZmokuXLlavrfXr18NsNmPKlCnKa7FI0Wtr69atyMrKwltvvWXz3BZd59ChQ0hJScG4ceOsrtO/f3+EhITgp59+srqd2WxW1tHRo0exfPly+Pv7o0WLFnd9TOnp6ahTp47N5bVr10ZiYiIOHjx419vfTUREBFq2bKn8bDKZsGXLFgwcOBBNmzZVLvf398ewYcOwe/duXL9+3WoZiYmJiIiIQEFBAXbu3InGjRsrvyvva6TocaalpVX4MamFboaPPv74YwQHB8PR0RG+vr5o3ry51R+e2WxGVFQUFi1ahLi4OJhMJuV3RUNMgGXYqXnz5nB0rNxV16xZM6ufDQYDgoKClHHW2NhYAMCoUaNKXEZmZqbVH2laWprNcu8UGxsLIUSJ17tzmKdotsidG+I7l5mZmQkfH59if5+SkmL185YtW6yGDspi6tSpqF+/PsaOHWszNh8bG4vTp0+XuMw77784SUlJ6N+/P7Kzs5Genl5i4ZdlfQDAjz/+iFmzZuHo0aNWY9a3L/f8+fMwGo1WG8M7FQ17tmrVqsTrXLx4EQDQvHlzm9+FhIRg9+7dVpclJCRYrSt/f3+sW7eu1McEAKKYL2588803ER0djfDwcAQFBaFPnz4YNmwYOnXqVOryigQEBFj9nJqaipycnGIfU4sWLWA2m5GQkIDQ0FDl8pEjR8LR0RGnT5+Gn5+f1W3K+xopepz38sZPLXRTCuHh4WjXrl2Jv//Xv/6FyZMn49lnn8XMmTNRt25dGI1GvPrqqzbvwGUoyjB37ly
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"from imblearn.over_sampling import RandomOverSampler \n",
"\n",
"oversampler = RandomOverSampler(random_state=42)\n",
"X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)\n",
"\n",
"print(\"Распределение классов до OverSampling:\")\n",
"print(y_train.value_counts())\n",
"print(\"\\nРаспределение классов после OverSampling:\")\n",
"print(y_train_over.value_counts())\n",
"\n",
"# plt.pie(y_train.value_counts())\n",
"\n",
"plt.pie(y_train_over.value_counts())\n",
"# sns.barplot(x=class_counts.index, y=class_counts.values)\n",
"plt.title(\"Распределение классов (stroke)\")\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}