diff --git a/Lab_3/lab3.ipynb b/Lab_3/lab3.ipynb new file mode 100644 index 0000000..54231c1 --- /dev/null +++ b/Lab_3/lab3.ipynb @@ -0,0 +1,1066 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "1) Бизнес цели - определение наличия заболевания у человека", + "id": "54c08440669b8de7" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "2) Подготовка данных", + "id": "5d090ddc69b152cf" + }, + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.107138Z", + "start_time": "2024-12-07T07:33:51.094517Z" + } + }, + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "\n", + "# Load the raw dataset, then print its column names and contents\n", + "df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n", + "print(df.columns)\n", + "print(df)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n", + " 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n", + " 'smoking_status', 'stroke'],\n", + " dtype='object')\n", + " id gender age hypertension heart_disease ever_married \\\n", + "0 9046 Male 67.0 0 1 Yes \n", + "1 51676 Female 61.0 0 0 Yes \n", + "2 31112 Male 80.0 0 1 Yes \n", + "3 60182 Female 49.0 0 0 Yes \n", + "4 1665 Female 79.0 1 0 Yes \n", + "... ... ... ... ... ... ... 
\n", + "5105 18234 Female 80.0 1 0 Yes \n", + "5106 44873 Female 81.0 0 0 Yes \n", + "5107 19723 Female 35.0 0 0 Yes \n", + "5108 37544 Male 51.0 0 0 Yes \n", + "5109 44679 Female 44.0 0 0 Yes \n", + "\n", + " work_type Residence_type avg_glucose_level bmi smoking_status \\\n", + "0 Private Urban 228.69 36.6 formerly smoked \n", + "1 Self-employed Rural 202.21 NaN never smoked \n", + "2 Private Rural 105.92 32.5 never smoked \n", + "3 Private Urban 171.23 34.4 smokes \n", + "4 Self-employed Rural 174.12 24.0 never smoked \n", + "... ... ... ... ... ... \n", + "5105 Private Urban 83.75 NaN never smoked \n", + "5106 Self-employed Urban 125.20 40.0 never smoked \n", + "5107 Self-employed Rural 82.99 30.6 never smoked \n", + "5108 Private Rural 166.29 25.6 formerly smoked \n", + "5109 Govt_job Urban 85.28 26.2 Unknown \n", + "\n", + " stroke \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "... ... \n", + "5105 0 \n", + "5106 0 \n", + "5107 0 \n", + "5108 0 \n", + "5109 0 \n", + "\n", + "[5110 rows x 12 columns]\n" + ] + } + ], + "execution_count": 171 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.208159Z", + "start_time": "2024-12-07T07:33:51.118240Z" + } + }, + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "# Box plot of the age column, drawn on an explicit 10x6 figure\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "sns.boxplot(x=df[\"age\"], ax=ax)\n", + "ax.set_title(\"Box Plot для age\")\n", + "ax.set_xlabel(\"age\")\n", + "plt.show()" + ], + "id": "ea1a7ed0e6d7d189", + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ], + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikElEQVR4nO3df5SWdZ3/8dcA8ltEBc1QWdQ0MBwEZVf8kYI/W10zfx1lRc3CMuWr1Srarobk0QIkEwTJDDV/pRJnrXZXPW6USrriIsdIEERFVIQEAUEI5/7+4fGuSRH0fJZ7Rh6Pc+bIfd3X3PO+8XNm7ifXdd1TV6lUKgEAACioRa0HAAAAPnmEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAD4UH6vKwAfh9AA+BjOOOOM7LXXXo0+9ttvvwwZMiRPPPHEZplhypQp75uhZ8+e2X///fPlL385M2bMqO57/fXXZ6+99vpIj//aa69l6NChWbRoUenRAdgCtKr1AADNVa9evXLFFVckSd55550sW7Ysd955Z84555xMmTIln/nMZzbLHOPGjUvXrl2TJA0NDVm6dGnGjx+fM888M/fee28++9nPfqzHfeyxxzJt2rSSowKwBREaAB9Tx44d06dPn0bbBgwYkAMOOCBTpkzJJZdcslnm6NmzZ3beeedG23r16pUjjjgid9xxR6688srNMgcA/DWhAVBQu3bt0qZNm9TV1TXa/utf/zo33XRTFixYkPbt22fQoEH51re+lW222SarVq3KcccdlzZt2uTf//3f07p161QqlZx55pmZP39+7r///my33XYfaY6dd9452267bV555ZUN7vNhM02ZMiWXXnppkmTQoEE54YQTcs0113zg47zwwgs56qij3re9W7duefjhh6u3Bw4c+IGnYf31fjNmzMjo0aPz7LPPZvXq1Rt8rL/17LPPZty4cXnyySezcuXKbLfddjnqqKPy7W9/O23btk2SrFq1Kj/4wQ/y4IMP5u23386hhx6a+vr6XH311ZkzZ071sR566KHccMMNee6559KpU6ccc8wx+eY3v5n27dtv8OsD8H5CA+BjqlQqWb9+ffXPy5cvzy233JJ169blxBNPrO53ww035Ec/+lFOP/30XHTRRVm4cGGuu+66zJw5Mz//+c/TsWPHXHXVVfnyl7+ciRMnZtiwYbn11lvz+OOPZ9KkSR85MpJk2bJlWbZsWXbdddcPvH9jMx166KH5+te/ngkTJmTcuHEfen3H22+/nZYtW+aOO+5o9Pjz5s17376f//znc955533gfqtWrcrXvva17LHHHhkzZky23Xbb1NXVbfCx3vP6669n8ODB6dOnT6655pq0bt06v/3tb/PTn/40O+ywQ4YOHZokOe+88/LHP/4xF110UT796U/njjvuyJgxYxo91v33359vf/vbOe6443LhhRdm0aJFGTt2bObNm5ef/vSn7wtIADZMaAB8TP/zP/+Tvffe+33bv/nNb2b33XdPkrz55puZMGFCTjnllFx++eXVffbcc88MHjw49913XwYPHpwBAwbk1FNPzaRJk1JfX59rr702gwcPzuc///mNztHQ0FANnrVr1+aFF17I6NGj06JFi5x66qnv239TZ3ovUj7o1Ky/tmbNmrRp06bRaWQbiqPttttug/stWLAgK1asyNChQ3PYYYdt9LHeM3fu3PTs2TPXXXddOnbsmOTdU9geffTRPP744xk6dGimT5+exx9/PNdff32OPPLIJMkhhxySY489NvPnz0/ybiyOHj06Bx98cEaPHl19/L/7u7/LWWedlWnTpuXQQw/90FkA+AuhAfAx7b333hkxYkSSd1+krlixIr/97W8zduzYrF69OhdddFFmzpyZdevW5dhjj230ufvtt1+6deuWJ554IoMHD06SXHzxxXnkkUfyta99LT169MjFF1+8SXMcc
cQR79vWrVu3jBo16gOPRHyUmTbFq6++mk6dOm3y/hvSo0ePdOrUKffcc0969OiRT33qU2nVqtVG3173oIMOykEHHZQ///nPmTdvXl588cXMnTs3b7zxRjp37pwk+f3vf5+tttoqhx9+ePXzWrRokS984Qu5/vrrkyTPP/98XnvttZx77rnVcEuS/fffPx07dsyjjz4qNAA+AqEB8DF16NAhvXv3brTtoIMOyurVq3PTTTdlyJAhefPNN5MkXbp0ed/nd+nSJStXrmz0eEceeWRuvvnmHHDAAdVrCzZmwoQJ1Xed2mqrrbLttttmxx133OD+H2WmTbFo0aJ069btI33OB+nYsWPGjRuX733ve++75uPDHr+hoSHXXnttbr/99qxevTo77bRT9tlnn7Rp06a6z7Jly9K5c+e0aNH4Xd2333776p+XL1+eJBkxYkQ1IP/a66+//nGeFsAWS2gAFPa5z30u99xzT15++eVss802SZKlS5dmt912a7TfkiVLsssuu1Rvz507N7fddlt69uyZO++8M//0T/+U+vr6jX69Pffc80NPbfpbH2WmTfHUU09t8lvobuwah7//+7/PkUcemQULFuRb3/pW+vbtmwkTJmTu3Lkb/JxJkyZl8uTJGTFiRI488shsvfXWSZKTTjqpus+OO+6YZcuWpaGhoVFs/OlPf6r++b2jMhdffHH69+//vq/z3t8bAJvGL+wDKGzWrFlp2bJldtlll9TX16d169b55S9/2WifJ598Mq+88kr69u2bJFm/fn2GDx+eXXfdNXfddVc++9nP5pJLLsnatWuLz7epM/3tv/5/kDVr1uSJJ57IgQceuNF9//ZF/geZNm1axo0blyFDhuTss89OfX199fSnDZkxY0b22GOPnHjiidXIWLx4cebOnZuGhoYkSf/+/bN+/fpG71xVqVTy0EMPVW/vtttu2X777fPyyy+nd+/e1Y8dd9wxY8aMyezZszf6HAH4C0c0AD6mVatWZebMmdXb69aty8MPP5z77rsvp556avUi5qFDh2b8+PHZaqutcthhh+Xll1/Oddddlz322CMnnHBCkmTixImZPXt27rjjjrRt2zYjR47MySefnLFjx2b48OFF5+7cufMmzfTev/A/+OCDOeSQQ6oXuL/njTfeyOTJk1NXV5fOnTs3+rt44403sm7dusyePTvbb7995syZkzfeeONDr+VYu3ZtRo4cmW7duuWCCy7Y5Oezzz775IYbbsikSZPSp0+fvPjii7nxxhuzbt26rFmzJsm711kceOCB+c53vpOlS5fm05/+dO69997MmTOnepSlZcuWueiii3L55ZenZcuWOeyww7JixYrccMMNWbx48Qde+A/AhgkNgI9p9uzZjd7VqU2bNtl1111z0UUX5Zxzzqluv+CCC9KlS5f87Gc/y913353OnTvn6KOPzoUXXpj27dvn2WefzcSJE3PaaadVjybsvffeGTJkSG655ZYcccQR6devX9HZNzZT8u5pTAMGDMiYMWMyffr0TJo0qdFj/OY3v8mNN96YJDn99NM/8Oucf/75OeWUU3LdddelR48eOfnkkzc4049//OMsXLgw48aNS7t27Tb5uZx77rlZtmxZbr311owfPz477bRTjj/++NTV1eXGG2/MihUr0qlTp4wdOzbXXHNNxowZk/Xr12fQoEE57bTTMnXq1OpjnXzyyenQoUNuuumm3H333Wnfvn369u2b0aNHf+RTygC2dHWVjb2dBwB8gClTpmTcuHEb/EV6G7t/c1q0aFFmzpyZQYMGNbrIftiwYVm4cGF+8Ytf1HA6gE8mRzQA+MRr0aJFhg8fnkGDBuWkk05Ky5Yt87vf/S4PPPBArr766lqPB/CJJDQA+Fi222679OzZ82PfvznttNNO+fGPf5zx48fnwgsvzPr167P77rtn9OjR7/t9IgCU4dQpAACgOG9vCwAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxH+ntb
f/0p5Wp9XtU1dUl22+/dZOYBTbGeqU5sV5pTqxXmpNP2np97/lszEcKjUolTeYvpynNAhtjvdKcWK80J9YrzcmWtl6dOgUAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOJa1XoAoOlYsuT1rFy5otZjUCN1dckbb3TI8uVvpVKp9TTw4TZlvW69dad07brD5h0MqBIaQJJ3I+P/Xfj1/HndulqPAlDEVq1b57ofThAbUCNCA0iSrFy5In9ety5rdvt8GtpuU+tx4EO1WLM87Rb8Nmt6HJKGdp1rPQ5NUIu330yen5aVK1cIDagRoQE00tB2mzR06FLrMWCTNLTrbL0CNFEuBgcAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFBcswyN5cuX13oEAADYrFaseLPWI3wkzS40Fi9+LV/60peyePFrtR4FAAA2i8WLX8tXvjKkWb0Gbnah8dZbq9LQ0JC33lpV61EAAGCzeOutValUmtdr4GYXGgAAQNMnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4o
QEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOJa1XqAj+vllxemUqn1FPDh6uqSN97okOXL32ry63XRooW1HgGgON/baApKvB5ojmu52YbGj350ba1HAACaOK8XoHaabWgMG/bNdOu2S63HgA9VV5d07tx8jmj4gQx80ni9QFNQ4vVAc/w53WxDY+edd0mPHnvUegz4UHV1SZcuW2fp0pVNPjQAPom6ddslu+3m9QK1taW+HnAxOAAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUFyzC40OHTqmRYsW6dChY61HAQCAzaJDh46pq2ter4Fb1XqAj2rHHT+VKVOmZP36lqlUaj0NAAD839txx0/lpptuTadO29R6lE3W7I5oJEnnzp1rPQIAAGxWzSkykmYaGgAAQNMmNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABTXqtYDAE1Li7ffrPUIsFEt1ixv9F/4W76XQe0JDSBJsvXWnbJV69bJ89NqPQpssnYLflvrEWjCtmrdOltv3anWY8AWS2gASZKuXXfIdT+ckJUrV9R6FGqkri7p3LlDli9/K5VKraeBD7cp63XrrTula9cdNu9gQJXQAKq6dt3BD+UtWF1d0qXL1lm6dKXQoMmzXqHpczE4AABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGA
ABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKa/VRdq6r+78a46PP0BRmgY2xXmlOrFeaE+uV5uSTtl439XnUVSqVyv/tKAAAwJbGqVMAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxzSY01q5dm8suuyz77bdfDjrooNx88821HgmqFi9enGHDhqV///45+OCDc/XVV2ft2rVJkoULF+ass85Knz598oUvfCGPPPJIjaeFvxg6dGiGDx9evT179uycfPLJqa+vz4knnphnnnmmhtNBsm7duowYMSL7779/BgwYkGuvvTaVSiWJ9UrT8+qrr+bcc89N3759M3DgwEyePLl635a4XptNaPzgBz/IM888k1tuuSVXXHFFxo0bl//8z/+s9ViQSqWSYcOGZc2aNbn99tszduzY/Pd//3d++MMfplKp5Bvf+Ea6dOmS++67L8cff3zOP//8vPLKK7UeG/KrX/0q06ZNq95evXp1hg4dmv322y9TpkzJvvvum3PPPTerV6+u4ZRs6b73ve/lsccey09+8pOMGTMmP//5z3P33XdbrzRJF154Ydq3b58pU6bksssuyw9/+MM8+OCDW+56rTQDb731VqV3796V3//+99Vt48ePr/zzP/9zDaeCd82bN6+y5557VpYsWVLddv/991cOOuigymOPPVbp06dP5a233qred+aZZ1Z+9KMf1WJUqFq2bFnlkEMOqZx44omVSy65pFKpVCr33HNPZeDAgZWGhoZKpVKpNDQ0VI444ojKfffdV8tR2YItW7as0qtXr8rjjz9e3XbjjTdWhg8fbr3S5Cxfvryy5557VubMmVPddv7551dGjBixxa7XZnFE49lnn8369euz7777Vrf169cvTz/9dBoaGmo4GSRdu3bNTTfdlC5dujTavmrVqjz99NPp1atX2rdvX93er1+/zJw5czNPCY19//vfz/HHH5899tijuu3pp59Ov379UldXlySpq6tL3759rVdqZsaMGenYsWP69+9f3TZ06NBcffXV1itNTtu2bdOuXbtMmTIlf/7zn/P888/nqaeeSs+ePbfY9dosQmPJkiXZdttt07p16+q2Ll26ZO3atVm+fHntBoMknTp1ysEHH1y93dDQkJ/97Gf5h3/4hyxZsiQ77LBDo/233377vPbaa5t7TKiaPn16nnzyyZx33nmNtluvNDULFy5Mt27dMnXq1Bx99NEZNGhQxo8fn4aGBuuVJqdNmza5/PLLc/fdd6e+vj7HHHNMDjnkkJx88slb7HptVesBNsWaNWsaRUaS6u1169bVYiTYoFGjRmX27Nm59957M3ny5A9cu9YttbJ27dpcccUVufzyy9O2bdtG923oe631Sq2sXr06L774Yu66665cffXVWbJkSS6//PK0a9fOeqVJmj9/fg477LCcffbZee655zJy5MgccMABW+x6bRah0aZNm/f9j3jv9t/+oIRaGjVqVG655ZaMHTs2e+65Z9q0afO+o27r1q2zbqmZcePG5XOf+1yjo3Dv2dD3WuuVWmnVqlVWrVqVMWPGpFu3bkmSV155JXfeeWe6d+9uvdKkTJ8+Pffee2+mTZuWtm3bpnfv3lm8eHEmTJiQXXbZZYtcr80iNHbccccsW7Ys69evT6tW7468ZMmStG3bNp06darxdPCukSNH5s4778yoUaNy1FFHJXl37c6bN6/RfkuXLn3f4VPYXH71q19l6dKl1Wve3vvB91//9V859thjs3Tp0
kb7W6/UUteuXdOmTZtqZCRJjx498uqrr6Z///7WK03KM888k+7duzeKh169emXixInZb7/9tsj12iyu0ejZs2datWrV6IKZGTNmpHfv3mnRolk8BT7hxo0bl7vuuivXXntt/vEf/7G6vb6+Pn/4wx/y9ttvV7fNmDEj9fX1tRgTctttt+X+++/P1KlTM3Xq1AwcODADBw7M1KlTU19fn//93/+t/o6CSqWSp556ynqlZurr67N27dosWLCguu35559Pt27drFeanB122CEvvvhioyMXzz//fHbeeectdr02i1fp7dq1yxe/+MV897vfzaxZs/LQQw/l5ptvzpAhQ2o9GmT+/Pm54YYb8tWvfjX9+vXLkiVLqh/9+/fPTjvtlEsvvTTPPfdcJk2alFmzZuWkk06q9dhsobp165bu3btXPzp06JAOHTqke/fuOfroo7NixYpcddVVmTdvXq666qqsWbMmxxxzTK3HZgu122675dBDD82ll16aZ599Nr/73e8yadKknHbaadYrTc7AgQOz1VZb5V//9V+zYMGCPPzww5k4cWLOOOOMLXa91lXeS6smbs2aNfnud7+bBx54IB07dsw555yTs846q9ZjQSZNmpQxY8Z84H1z5szJiy++mO985zt5+umn071791x22WUZMGDAZp4SPth7vxX8mmuuSZLMmjUrV1xxRebPn5+99torI0aMSK9evWo5Ilu4lStXZuTIkXnwwQfTrl27nH766fnGN76Ruro665Um572ImDVrVrbbbrsMHjw4Z5555ha7XptNaAAAAM1Hszh1CgAAaF6EBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAGTGjBk57bTTUl9fnz59+uSrX/1qXn/99STJI488kuOOOy777LNPvvKVr2TkyJEZPnx49XPvuuuuDBw4MPvuu2/OOOOMzJkzp1ZPA4AmRGgAbOFWrlyZc889NwceeGB++ctf5ic/+UleeumlTJo0KQsXLszXv/71HHPMMZk6dWp69+6d22+/vfq5Dz/8cMaNG5d/+7d/yy9+8Yv069cvQ4YMyZtvvlnDZwRAU1BXqVQqtR4CgNpZsmRJ7r///px99tmpq6tLkowZMyazZs1KfX19ZsyY0SguTj311PTo0SPXXHNNTj/99BxzzDE544wzqvd/6UtfygknnNBoGwBbnla1HgCA2uratWu++MUvZvLkyfnjH/+YefPmZc6cOenbt2/mzJmT3r17N9q/T58+1SMW8+fPz6hRo3LttddW71+7dm1eeOGFzfkUAGiChAbAFm7x4sU58cQTs/fee2fAgAE55ZRT8pvf/CZPP/10WrZsmb898P3Xt995551cdtllOeCAAxrt07Fjx80yOwBNl9AA2MI9+OCD2WabbXLjjTdWt912222pVCr5zGc+kxkzZjTa/w9/+EN22WWXJEmPHj3y2muvpXv37tX7L7300hx++OEZNGjQ5nkCADRJLgYH2MJ17tw5r7zySqZPn56FCxdm0qRJeeCBB7Ju3bqccsopmTlzZiZNmpQFCxZk4sSJefLJJ6vXcpx99tm55ZZbMnXq1Lz00ksZNWpU/uM//iO77757jZ8VALXmYnCALdw777yTK6+8Mr/+9a9TV1eX3r175+CDD87111+f6dOn55FHHsn3v//9LFq0KAceeGBatmyZLl265Morr0yS3HrrrZk8eXKWLl2aPfbYI//yL//yvlOpANjyCA0ANmju3LlZv359evXqVd02dOjQ9O7dOxdccEENJwOgqXPqFAAb9NJLL+Xss8/Oo48+mkWLFuWee+7J9OnTc8QRR9R6NACaOEc0APhQEyZMyN13350//elP6dGjR4YNG5bDDz+81mMB0MQJDQAAoDinTgEAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAACju/wM4J82jKM7PH
wAAAABJRU5ErkJggg==" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "execution_count": 172 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.245693Z", + "start_time": "2024-12-07T07:33:51.236859Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# Number of missing values per column\n", + "print(df.isnull().sum())\n", + "\n", + "print()\n", + "\n", + "# Whether each column contains any missing value\n", + "print(df.isnull().any())\n", + "\n", + "print()\n", + "\n", + "# Percentage of missing values, printed only for columns that have some\n", + "for column in df.columns:\n", + "    null_rate = df[column].isnull().sum() / len(df) * 100\n", + "    if null_rate > 0:\n", + "        print(f\"{column} процент пустых значений: {null_rate:.2f}%\")" + ], + "id": "84cf47a513b9f258", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "gender 0\n", + "age 0\n", + "hypertension 0\n", + "heart_disease 0\n", + "ever_married 0\n", + "work_type 0\n", + "Residence_type 0\n", + "avg_glucose_level 0\n", + "bmi 201\n", + "smoking_status 0\n", + "stroke 0\n", + "dtype: int64\n", + "\n", + "id False\n", + "gender False\n", + "age False\n", + "hypertension False\n", + "heart_disease False\n", + "ever_married False\n", + "work_type False\n", + "Residence_type False\n", + "avg_glucose_level False\n", + "bmi True\n", + "smoking_status False\n", + "stroke False\n", + "dtype: bool\n", + "\n", + "bmi процент пустых значений: 3.93%\n" + ] + } + ], + "execution_count": 173 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.304644Z", + "start_time": "2024-12-07T07:33:51.298975Z" + } + }, + "cell_type": "code", + "source": [ + "# Replace missing bmi values with the column median\n", + "df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n", + "\n", + "# Percentage of missing values after imputation (prints nothing when none remain)\n", + "for i in df.columns:\n", + "    null_rate = df[i].isnull().sum() / len(df) * 100\n", + "    if null_rate > 0:\n", + "        print(f\"{i} процент пустых 
значений: %{null_rate:.2f}\")" + ], + "id": "ba00afd3f040bc81", + "outputs": [], + "execution_count": 174 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "3) Унитарное кодирование", + "id": "858e690bed6f98dd" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.334790Z", + "start_time": "2024-12-07T07:33:51.311404Z" + } + }, + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "import numpy as np\n", + "\n", + "encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n", + "\n", + "# NOTE(review): this selection drops the `stroke` column; if it is the prediction target it must be kept - confirm.\n", + "df = df[[\"age\", \"gender\", \"hypertension\", \"work_type\", \"avg_glucose_level\", \"bmi\", \"smoking_status\"]]\n", + "# Keep only Male/Female rows; removing rows leaves gaps in the index labels\n", + "df = df.query('gender == \"Male\" or gender == \"Female\"')\n", + "encoded_values = encoder.fit_transform(df[[\"gender\", \"work_type\", \"smoking_status\"]])\n", + "\n", + "encoded_columns = encoder.get_feature_names_out([\"gender\", \"work_type\", \"smoking_status\"])\n", + "\n", + "# Reuse df's index: concat(axis=1) aligns on index labels, so a fresh 0..n-1 index\n", + "# would attach encodings to the wrong rows after the gap and add all-NaN rows\n", + "encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=df.index)\n", + "\n", + "df = pd.concat([df, encoded_values_df], axis=1)\n", + "\n", + "df" + ], + "id": "ec19b16d7410cbff", + "outputs": [ + { + "data": { + "text/plain": [ + " age gender hypertension work_type avg_glucose_level bmi \\\n", + "0 67.0 Male 0.0 Private 228.69 36.6 \n", + "1 61.0 Female 0.0 Self-employed 202.21 28.1 \n", + "2 80.0 Male 0.0 Private 105.92 32.5 \n", + "3 49.0 Female 0.0 Private 171.23 34.4 \n", + "4 79.0 Female 1.0 Self-employed 174.12 24.0 \n", + "... ... ... ... ... ... ... 
\n", + "5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n", + "5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n", + "5108 51.0 Male 0.0 Private 166.29 25.6 \n", + "5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n", + "3116 NaN NaN NaN NaN NaN NaN \n", + "\n", + " smoking_status gender_Male work_type_Never_worked work_type_Private \\\n", + "0 formerly smoked 1.0 0.0 1.0 \n", + "1 never smoked 0.0 0.0 0.0 \n", + "2 never smoked 1.0 0.0 1.0 \n", + "3 smokes 0.0 0.0 1.0 \n", + "4 never smoked 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "5106 never smoked 0.0 0.0 0.0 \n", + "5107 never smoked 1.0 0.0 1.0 \n", + "5108 formerly smoked 0.0 0.0 0.0 \n", + "5109 Unknown NaN NaN NaN \n", + "3116 NaN 0.0 0.0 0.0 \n", + "\n", + " work_type_Self-employed work_type_children \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "... ... ... \n", + "5106 1.0 0.0 \n", + "5107 0.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 1.0 \n", + "\n", + " smoking_status_formerly smoked smoking_status_never smoked \\\n", + "0 1.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 0.0 \n", + "4 0.0 1.0 \n", + "... ... ... \n", + "5106 0.0 1.0 \n", + "5107 1.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 0.0 \n", + "\n", + " smoking_status_smokes \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 \n", + "... ... \n", + "5106 0.0 \n", + "5107 0.0 \n", + "5108 0.0 \n", + "5109 NaN \n", + "3116 0.0 \n", + "\n", + "[5110 rows x 15 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
agegenderhypertensionwork_typeavg_glucose_levelbmismoking_statusgender_Malework_type_Never_workedwork_type_Privatework_type_Self-employedwork_type_childrensmoking_status_formerly smokedsmoking_status_never smokedsmoking_status_smokes
067.0Male0.0Private228.6936.6formerly smoked1.00.01.00.00.01.00.00.0
161.0Female0.0Self-employed202.2128.1never smoked0.00.00.01.00.00.01.00.0
280.0Male0.0Private105.9232.5never smoked1.00.01.00.00.00.01.00.0
349.0Female0.0Private171.2334.4smokes0.00.01.00.00.00.00.01.0
479.0Female1.0Self-employed174.1224.0never smoked0.00.00.01.00.00.01.00.0
................................................
510681.0Female0.0Self-employed125.2040.0never smoked0.00.00.01.00.00.01.00.0
510735.0Female0.0Self-employed82.9930.6never smoked1.00.01.00.00.01.00.00.0
510851.0Male0.0Private166.2925.6formerly smoked0.00.00.00.00.00.00.00.0
510944.0Female0.0Govt_job85.2826.2UnknownNaNNaNNaNNaNNaNNaNNaNNaN
3116NaNNaNNaNNaNNaNNaNNaN0.00.00.00.01.00.00.00.0
\n", + "

5110 rows × 15 columns

\n", + "
" + ] + }, + "execution_count": 175, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 175 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "4) Дискретизация признаков", + "id": "cc934d2268784440" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.401444Z", + "start_time": "2024-12-07T07:33:51.386817Z" + } + }, + "cell_type": "code", + "source": [ + "# Replace age with the index of its quantile bin (5 bins)\n", + "age_bins = pd.qcut(df[\"age\"], q=5, labels=False)\n", + "df[\"age\"] = age_bins\n", + "df" + ], + "id": "b9a70c0c56176f98", + "outputs": [ + { + "data": { + "text/plain": [ + " age gender hypertension work_type avg_glucose_level bmi \\\n", + "0 4.0 Male 0.0 Private 228.69 36.6 \n", + "1 3.0 Female 0.0 Self-employed 202.21 28.1 \n", + "2 4.0 Male 0.0 Private 105.92 32.5 \n", + "3 2.0 Female 0.0 Private 171.23 34.4 \n", + "4 4.0 Female 1.0 Self-employed 174.12 24.0 \n", + "... ... ... ... ... ... ... \n", + "5106 4.0 Female 0.0 Self-employed 125.20 40.0 \n", + "5107 1.0 Female 0.0 Self-employed 82.99 30.6 \n", + "5108 2.0 Male 0.0 Private 166.29 25.6 \n", + "5109 2.0 Female 0.0 Govt_job 85.28 26.2 \n", + "3116 NaN NaN NaN NaN NaN NaN \n", + "\n", + " smoking_status gender_Male work_type_Never_worked work_type_Private \\\n", + "0 formerly smoked 1.0 0.0 1.0 \n", + "1 never smoked 0.0 0.0 0.0 \n", + "2 never smoked 1.0 0.0 1.0 \n", + "3 smokes 0.0 0.0 1.0 \n", + "4 never smoked 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "5106 never smoked 0.0 0.0 0.0 \n", + "5107 never smoked 1.0 0.0 1.0 \n", + "5108 formerly smoked 0.0 0.0 0.0 \n", + "5109 Unknown NaN NaN NaN \n", + "3116 NaN 0.0 0.0 0.0 \n", + "\n", + " work_type_Self-employed work_type_children \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "... ... ... 
\n", + "5106 1.0 0.0 \n", + "5107 0.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 1.0 \n", + "\n", + " smoking_status_formerly smoked smoking_status_never smoked \\\n", + "0 1.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 0.0 \n", + "4 0.0 1.0 \n", + "... ... ... \n", + "5106 0.0 1.0 \n", + "5107 1.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 0.0 \n", + "\n", + " smoking_status_smokes \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 \n", + "... ... \n", + "5106 0.0 \n", + "5107 0.0 \n", + "5108 0.0 \n", + "5109 NaN \n", + "3116 0.0 \n", + "\n", + "[5110 rows x 15 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
agegenderhypertensionwork_typeavg_glucose_levelbmismoking_statusgender_Malework_type_Never_workedwork_type_Privatework_type_Self-employedwork_type_childrensmoking_status_formerly smokedsmoking_status_never smokedsmoking_status_smokes
04.0Male0.0Private228.6936.6formerly smoked1.00.01.00.00.01.00.00.0
13.0Female0.0Self-employed202.2128.1never smoked0.00.00.01.00.00.01.00.0
24.0Male0.0Private105.9232.5never smoked1.00.01.00.00.00.01.00.0
32.0Female0.0Private171.2334.4smokes0.00.01.00.00.00.00.01.0
44.0Female1.0Self-employed174.1224.0never smoked0.00.00.01.00.00.01.00.0
................................................
51064.0Female0.0Self-employed125.2040.0never smoked0.00.00.01.00.00.01.00.0
51071.0Female0.0Self-employed82.9930.6never smoked1.00.01.00.00.01.00.00.0
51082.0Male0.0Private166.2925.6formerly smoked0.00.00.00.00.00.00.00.0
51092.0Female0.0Govt_job85.2826.2UnknownNaNNaNNaNNaNNaNNaNNaNNaN
3116NaNNaNNaNNaNNaNNaNNaN0.00.00.00.01.00.00.00.0
\n", + "

5110 rows × 15 columns

\n", + "
" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 176 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "5) Разбиение данных", + "id": "7c5387ab7d3b9349" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.496592Z", + "start_time": "2024-12-07T07:33:51.456961Z" + } + }, + "cell_type": "code", + "source": [ + "# Build the train / validation / test splits\n", + "import os\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Rows that still contain NaN are excluded from the splits\n", + "dropna_df = df.dropna()\n", + "\n", + "df_input = dropna_df[[\n", + "    \"age\",\n", + "    \"hypertension\",\n", + "    \"avg_glucose_level\",\n", + "    \"bmi\",\n", + "    \"gender_Male\",\n", + "    \"work_type_Never_worked\",\n", + "    \"work_type_Private\",\n", + "    \"work_type_Self-employed\",\n", + "    \"work_type_children\",\n", + "    \"smoking_status_formerly smoked\",\n", + "    \"smoking_status_never smoked\",\n", + "    \"smoking_status_smokes\",\n", + "]]\n", + "\n", + "print(df_input.head())\n", + "\n", + "# Hold out 40% of the rows\n", + "train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n", + "\n", + "# Split the held-out rows evenly into validation and test sets\n", + "val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n", + "\n", + "# Check the split sizes\n", + "print(\"Размер обучающей выборки:\", len(train_df))\n", + "print(\"Размер контрольной выборки:\", len(val_df))\n", + "print(\"Размер тестовой выборки:\", len(test_df))\n", + "\n", + "# Save the splits; create the target directory first because to_csv does not create missing directories\n", + "os.makedirs(\"static/csv\", exist_ok=True)\n", + "train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n", + "val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n", + "test_df.to_csv(\".//static//csv//test_data.csv\", index=False)" + ], + "id": "8c9949a919295290", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age hypertension avg_glucose_level bmi gender_Male \\\n", + "0 4.0 0.0 228.69 36.6 1.0 \n", + "1 3.0 0.0 202.21 28.1 0.0 \n", + "2 4.0 0.0 105.92 32.5 1.0 
\n", + "3 2.0 0.0 171.23 34.4 0.0 \n", + "4 4.0 1.0 174.12 24.0 0.0 \n", + "\n", + " work_type_Never_worked work_type_Private work_type_Self-employed \\\n", + "0 0.0 1.0 0.0 \n", + "1 0.0 0.0 1.0 \n", + "2 0.0 1.0 0.0 \n", + "3 0.0 1.0 0.0 \n", + "4 0.0 0.0 1.0 \n", + "\n", + " work_type_children smoking_status_formerly smoked \\\n", + "0 0.0 1.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " smoking_status_never smoked smoking_status_smokes \n", + "0 0.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 0.0 1.0 \n", + "4 1.0 0.0 \n", + "Размер обучающей выборки: 3064\n", + "Размер контрольной выборки: 1022\n", + "Размер тестовой выборки: 1022\n" + ] + } + ], + "execution_count": 177 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-07T07:33:51.637107Z", + "start_time": "2024-12-07T07:33:51.601412Z" + } + }, + "cell_type": "code", + "source": [ + "train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n", + "val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n", + "test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n", + "\n", + "def check_balance(df, name):\n", + "    \"\"\"Print the percentage of male and female rows in one split.\n", + "\n", + "    df: frame with a gender_Male column; name: split label inserted into the header line.\n", + "    Every row whose gender_Male is not 1.0 is counted as female.\n", + "    \"\"\"\n", + "    # NOTE(review): this reports the gender_Male feature; the saved splits contain no target column - confirm this is the intended check.\n", + "    # Counting matches of 1.0 (instead of value_counts()[1.0]) cannot raise KeyError when a split has no such rows.\n", + "    male_count = (df[\"gender_Male\"] == 1.0).sum()\n", + "    # Shares are computed outside the f-strings: reusing double quotes inside a\n", + "    # double-quoted f-string is a SyntaxError before Python 3.12.\n", + "    male_share = male_count / len(df) * 100\n", + "    female_share = (len(df) - male_count) / len(df) * 100\n", + "    print(f\"Распределение gender в {name}:\")\n", + "    print(f\"Процент gender_Male: {male_share:.2f}%\")\n", + "    print(f\"Процент gender_Female: {female_share:.2f}%\")\n", + "    print()\n", + "\n", + "check_balance(train_df, \"обучающей выборке\")\n", + "check_balance(val_df, \"контрольной выборке\")\n", + "check_balance(test_df, \"тестовой выборке\")" + ], + "id": "79b2248eb6438fa5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Распределение gender в обучающей выборке:\n", + "Процент gender_Male: 41.87%\n", + "Процент gender_Female: 58.13%\n", + "\n", + "Распределение gender в контрольной выборке:\n", + "Процент gender_Male: 40.41%\n", + "Процент gender_Female: 59.59%\n", + "\n", + "Распределение 
gender в тестовой выборке:\n", + "Процент gender_Male: 41.00%\n", + "Процент gender_Female: 59.00%\n", + "\n" + ] + } + ], + "execution_count": 178 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Выборка сбалансирована", + "id": "a6436f11045161c4" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}