1067 lines
50 KiB
Plaintext
1067 lines
50 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "1) Бизнес-цель — определение наличия заболевания у человека",
|
||
"id": "54c08440669b8de7"
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "2) Подготовка данных",
|
||
"id": "5d090ddc69b152cf"
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "initial_id",
|
||
"metadata": {
|
||
"collapsed": true,
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.107138Z",
|
||
"start_time": "2024-12-07T07:33:51.094517Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import pandas as pd\n",
|
||
"# NOTE(review): Percentage is not referenced in the visible cells; this looks like an accidental IDE auto-import - confirm before removing.\n",
|
||
"from prompt_toolkit.shortcuts.progress_bar import Percentage\n",
|
||
"\n",
|
||
"# Load the stroke dataset, then print its column index followed by the frame itself.\n",
|
||
"df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
|
||
"print(df.columns, df, sep=\"\\n\")"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
||
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
||
" 'smoking_status', 'stroke'],\n",
|
||
" dtype='object')\n",
|
||
" id gender age hypertension heart_disease ever_married \\\n",
|
||
"0 9046 Male 67.0 0 1 Yes \n",
|
||
"1 51676 Female 61.0 0 0 Yes \n",
|
||
"2 31112 Male 80.0 0 1 Yes \n",
|
||
"3 60182 Female 49.0 0 0 Yes \n",
|
||
"4 1665 Female 79.0 1 0 Yes \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5105 18234 Female 80.0 1 0 Yes \n",
|
||
"5106 44873 Female 81.0 0 0 Yes \n",
|
||
"5107 19723 Female 35.0 0 0 Yes \n",
|
||
"5108 37544 Male 51.0 0 0 Yes \n",
|
||
"5109 44679 Female 44.0 0 0 Yes \n",
|
||
"\n",
|
||
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
||
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
||
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
||
"2 Private Rural 105.92 32.5 never smoked \n",
|
||
"3 Private Urban 171.23 34.4 smokes \n",
|
||
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5105 Private Urban 83.75 NaN never smoked \n",
|
||
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
||
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
||
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
||
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
||
"\n",
|
||
" stroke \n",
|
||
"0 1 \n",
|
||
"1 1 \n",
|
||
"2 1 \n",
|
||
"3 1 \n",
|
||
"4 1 \n",
|
||
"... ... \n",
|
||
"5105 0 \n",
|
||
"5106 0 \n",
|
||
"5107 0 \n",
|
||
"5108 0 \n",
|
||
"5109 0 \n",
|
||
"\n",
|
||
"[5110 rows x 12 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 171
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.208159Z",
|
||
"start_time": "2024-12-07T07:33:51.118240Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"\n",
|
||
"# Box plot of the age column, drawn on an explicit 10x6 inch Axes.\n",
|
||
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
||
"sns.boxplot(x=df['age'], ax=ax)\n",
|
||
"ax.set_title('Box Plot для age')\n",
|
||
"ax.set_xlabel('age')\n",
|
||
"plt.show()"
|
||
],
|
||
"id": "ea1a7ed0e6d7d189",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 1000x600 with 1 Axes>"
|
||
],
|
||
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikElEQVR4nO3df5SWdZ3/8dcA8ltEBc1QWdQ0MBwEZVf8kYI/W10zfx1lRc3CMuWr1Srarobk0QIkEwTJDDV/pRJnrXZXPW6USrriIsdIEERFVIQEAUEI5/7+4fGuSRH0fJZ7Rh6Pc+bIfd3X3PO+8XNm7ifXdd1TV6lUKgEAACioRa0HAAAAPnmEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAD4UH6vKwAfh9AA+BjOOOOM7LXXXo0+9ttvvwwZMiRPPPHEZplhypQp75uhZ8+e2X///fPlL385M2bMqO57/fXXZ6+99vpIj//aa69l6NChWbRoUenRAdgCtKr1AADNVa9evXLFFVckSd55550sW7Ysd955Z84555xMmTIln/nMZzbLHOPGjUvXrl2TJA0NDVm6dGnGjx+fM888M/fee28++9nPfqzHfeyxxzJt2rSSowKwBREaAB9Tx44d06dPn0bbBgwYkAMOOCBTpkzJJZdcslnm6NmzZ3beeedG23r16pUjjjgid9xxR6688srNMgcA/DWhAVBQu3bt0qZNm9TV1TXa/utf/zo33XRTFixYkPbt22fQoEH51re+lW222SarVq3KcccdlzZt2uTf//3f07p161QqlZx55pmZP39+7r///my33XYfaY6dd9452267bV555ZUN7vNhM02ZMiWXXnppkmTQoEE54YQTcs0113zg47zwwgs56qij3re9W7duefjhh6u3Bw4c+IGnYf31fjNmzMjo0aPz7LPPZvXq1Rt8rL/17LPPZty4cXnyySezcuXKbLfddjnqqKPy7W9/O23btk2SrFq1Kj/4wQ/y4IMP5u23386hhx6a+vr6XH311ZkzZ071sR566KHccMMNee6559KpU6ccc8wx+eY3v5n27dtv8OsD8H5CA+BjqlQqWb9+ffXPy5cvzy233JJ169blxBNPrO53ww035Ec/+lFOP/30XHTRRVm4cGGuu+66zJw5Mz//+c/TsWPHXHXVVfnyl7+ciRMnZtiwYbn11lvz+OOPZ9KkSR85MpJk2bJlWbZsWXbdddcPvH9jMx166KH5+te/ngkTJmTcuHEfen3H22+/nZYtW+aOO+5o9Pjz5s17376f//znc955533gfqtWrcrXvva17LHHHhkzZky23Xbb1NXVbfCx3vP6669n8ODB6dOnT6655pq0bt06v/3tb/PTn/40O+ywQ4YOHZokOe+88/LHP/4xF110UT796U/njjvuyJgxYxo91v33359vf/vbOe6443LhhRdm0aJFGTt2bObNm5ef/vSn7wtIADZMaAB8TP/zP/+Tvffe+33bv/nNb2b33XdPkrz55puZMGFCTjnllFx++eXVffbcc88MHjw49913XwYPHpwBAwbk1FNPzaRJk1JfX59rr702gwcPzuc///mNztHQ0FANnrVr1+aFF17I6NGj06JFi5x66qnv239TZ3ovUj7o1Ky/tmbNmrRp06bRaWQbiqPttttug/stWLAgK1asyNChQ3PYYYdt9LHeM3fu3PTs2TPXXXddOnbsmOTdU9geffTRPP744xk6dGimT5+exx9/PNdff32OPPLIJMkhhxySY489NvPnz0/ybiyOHj06Bx98cEaPHl19/L/7u7/LWWedlWnTpuXQQw/90FkA+AuhAfAx7b333hkxYkSSd1+krlixIr/97W8zduzYrF69OhdddFFmzpyZdevW5dhjj230ufvtt1+6deuWJ554IoMHD06SXHzxxXnkkUfyta99LT169MjFF1+8SXMcccQR79vWrV
u3jBo16gOPRHyUmTbFq6++mk6dOm3y/hvSo0ePdOrUKffcc0969OiRT33qU2nVqtVG3173oIMOykEHHZQ///nPmTdvXl588cXMnTs3b7zxRjp37pwk+f3vf5+tttoqhx9+ePXzWrRokS984Qu5/vrrkyTPP/98XnvttZx77rnVcEuS/fffPx07dsyjjz4qNAA+AqEB8DF16NAhvXv3brTtoIMOyurVq3PTTTdlyJAhefPNN5MkXbp0ed/nd+nSJStXrmz0eEceeWRuvvnmHHDAAdVrCzZmwoQJ1Xed2mqrrbLttttmxx133OD+H2WmTbFo0aJ069btI33OB+nYsWPGjRuX733ve++75uPDHr+hoSHXXnttbr/99qxevTo77bRT9tlnn7Rp06a6z7Jly9K5c+e0aNH4Xd2333776p+XL1+eJBkxYkQ1IP/a66+//nGeFsAWS2gAFPa5z30u99xzT15++eVss802SZKlS5dmt912a7TfkiVLsssuu1Rvz507N7fddlt69uyZO++8M//0T/+U+vr6jX69Pffc80NPbfpbH2WmTfHUU09t8lvobuwah7//+7/PkUcemQULFuRb3/pW+vbtmwkTJmTu3Lkb/JxJkyZl8uTJGTFiRI488shsvfXWSZKTTjqpus+OO+6YZcuWpaGhoVFs/OlPf6r++b2jMhdffHH69+//vq/z3t8bAJvGL+wDKGzWrFlp2bJldtlll9TX16d169b55S9/2WifJ598Mq+88kr69u2bJFm/fn2GDx+eXXfdNXfddVc++9nP5pJLLsnatWuLz7epM/3tv/5/kDVr1uSJJ57IgQceuNF9//ZF/geZNm1axo0blyFDhuTss89OfX199fSnDZkxY0b22GOPnHjiidXIWLx4cebOnZuGhoYkSf/+/bN+/fpG71xVqVTy0EMPVW/vtttu2X777fPyyy+nd+/e1Y8dd9wxY8aMyezZszf6HAH4C0c0AD6mVatWZebMmdXb69aty8MPP5z77rsvp556avUi5qFDh2b8+PHZaqutcthhh+Xll1/Oddddlz322CMnnHBCkmTixImZPXt27rjjjrRt2zYjR47MySefnLFjx2b48OFF5+7cufMmzfTev/A/+OCDOeSQQ6oXuL/njTfeyOTJk1NXV5fOnTs3+rt44403sm7dusyePTvbb7995syZkzfeeONDr+VYu3ZtRo4cmW7duuWCCy7Y5Oezzz775IYbbsikSZPSp0+fvPjii7nxxhuzbt26rFmzJsm711kceOCB+c53vpOlS5fm05/+dO69997MmTOnepSlZcuWueiii3L55ZenZcuWOeyww7JixYrccMMNWbx48Qde+A/AhgkNgI9p9uzZjd7VqU2bNtl1111z0UUX5Zxzzqluv+CCC9KlS5f87Gc/y913353OnTvn6KOPzoUXXpj27dvn2WefzcSJE3PaaadVjybsvffeGTJkSG655ZYcccQR6devX9HZNzZT8u5pTAMGDMiYMWMyffr0TJo0qdFj/OY3v8mNN96YJDn99NM/8Oucf/75OeWUU3LdddelR48eOfnkkzc4049//OMsXLgw48aNS7t27Tb5uZx77rlZtmxZbr311owfPz477bRTjj/++NTV1eXGG2/MihUr0qlTp4wdOzbXXHNNxowZk/Xr12fQoEE57bTTMnXq1OpjnXzyyenQoUNuuumm3H333Wnfvn369u2b0aNHf+RTygC2dHWVjb2dBwB8gClTpmTcuHEb/EV6G7t/c1q0aFFmzpyZQYMGNbrIftiwYVm4cGF+8Ytf1HA6gE8mRzQA+MRr0aJFhg8fnkGDBuWkk05Ky5Yt87vf/S4PPPBArr766lqPB/CJJDQA+Fi222679OzZ82PfvznttNNO+fGPf5zx48fnwgsvzPr167P77rtn9OjR7/t9IgCU4dQpAACgOG9vCwAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxH+ntbf/0p5Wp9X
tU1dUl22+/dZOYBTbGeqU5sV5pTqxXmpNP2np97/lszEcKjUolTeYvpynNAhtjvdKcWK80J9YrzcmWtl6dOgUAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOJa1XoAoOlYsuT1rFy5otZjUCN1dckbb3TI8uVvpVKp9TTw4TZlvW69dad07brD5h0MqBIaQJJ3I+P/Xfj1/HndulqPAlDEVq1b57ofThAbUCNCA0iSrFy5In9ety5rdvt8GtpuU+tx4EO1WLM87Rb8Nmt6HJKGdp1rPQ5NUIu330yen5aVK1cIDagRoQE00tB2mzR06FLrMWCTNLTrbL0CNFEuBgcAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFCc0AAAAIoTGgAAQHFCAwAAKE5oAAAAxQkNAACgOKEBAAAUJzQAAIDihAYAAFBcswyN5cuX13oEAADYrFaseLPWI3wkzS40Fi9+LV/60peyePFrtR4FAAA2i8WLX8tXvjKkWb0Gbnah8dZbq9LQ0JC33lpV61EAAGCzeOutValUmtdr4GYXGgAAQNMnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNA
AAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOJa1XqAj+vllxemUqn1FPDh6uqSN97okOXL32ry63XRooW1HgGgON/baApKvB5ojmu52YbGj350ba1HAACaOK8XoHaabWgMG/bNdOu2S63HgA9VV5d07tx8jmj4gQx80ni9QFNQ4vVAc/w53WxDY+edd0mPHnvUegz4UHV1SZcuW2fp0pVNPjQAPom6ddslu+3m9QK1taW+HnAxOAAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUFyzC40OHTqmRYsW6dChY61HAQCAzaJDh46pq2ter4Fb1XqAj2rHHT+VKVOmZP36lqlUaj0NAAD839txx0/lpptuTadO29R6lE3W7I5oJEnnzp1rPQIAAGxWzSkykmYaGgAAQNMmNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABTXqtYDAE1Li7ffrPUIsFEt1ixv9F/4W76XQe0JDSBJsvXWnbJV69bJ89NqPQpssnYLflvrEWjCtmrdOltv3anWY8AWS2gASZKuXXfIdT+ckJUrV9R6FGqkri7p3LlDli9/K5VKraeBD7cp63XrrTula9cdNu9gQJXQAKq6dt3BD+UtWF1d0qXL1lm6dKXQoMmzXqHpczE4AABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAA
CKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKa/VRdq6r+78a46PP0BRmgY2xXmlOrFeaE+uV5uSTtl439XnUVSqVyv/tKAAAwJbGqVMAAEBxQgMAAChOaAAAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxzSY01q5dm8suuyz77bdfDjrooNx88821HgmqFi9enGHDhqV///45+OCDc/XVV2ft2rVJkoULF+ass85Knz598oUvfCGPPPJIjaeFvxg6dGiGDx9evT179uycfPLJqa+vz4knnphnnnmmhtNBsm7duowYMSL7779/BgwYkGuvvTaVSiWJ9UrT8+qrr+bcc89N3759M3DgwEyePLl635a4XptNaPzgBz/IM888k1tuuSVXXHFFxo0bl//8z/+s9ViQSqWSYcOGZc2aNbn99tszduzY/Pd//3d++MMfplKp5Bvf+Ea6dOmS++67L8cff3zOP//8vPLKK7UeG/KrX/0q06ZNq95evXp1hg4dmv322y9TpkzJvvvum3PPPTerV6+u4ZRs6b73ve/lsccey09+8pOMGTMmP//5z3P33XdbrzRJF154Ydq3b58pU6bksssuyw9/+MM8+OCDW+56rTQDb731VqV3796V3//+99Vt48ePr/zzP/9zDaeCd82bN6+y5557VpYsWVLddv/991cOOuigymOPPVbp06dP5a233qred+aZZ1Z+9KMf1WJUqFq2bFnlkEMOqZx44omVSy65pFKpVCr33HNPZeDAgZWGhoZKpVKpNDQ0VI444ojKfffdV8tR2YItW7as0qtXr8rjjz9e3XbjjTdWhg8fbr3S5Cxfvryy5557VubMmVPddv7551dGjBixxa7XZnFE49lnn8369euz7777Vrf169cvTz/9dBoaGmo4GSRdu3bNTTfdlC5dujTavmrVqjz99NPp1atX2rdvX93er1+/zJw5czNPCY19//vfz/HHH5899tijuu3pp59Ov379UldXlySpq6tL3759rVdqZsaMGenYsWP69+9f3TZ06NBcffXV1itNTtu2bdOuXbtMmTIlf/7zn/P888/nqaeeSs+ePbfY9dosQmPJkiXZdttt07p16+q2Ll26ZO3atVm+fHntBoMknTp1ysEHH1y93dDQkJ/97Gf5h3/4hyxZsiQ77LBDo/233377vPbaa5t7TKiaPn16nnzyyZx33nmNtluvNDULFy5Mt27dMnXq1Bx99NEZNGhQxo8fn4aGBuuVJqdNmza5/PLLc/fdd6e+vj7HHHNMDjnkkJx88slb7HptVesBNsWaNWsaRUaS6u1169bVYiTYoFGjRmX27Nm59957M3ny5A9cu9YttbJ27dpcccUVufzyy9O2bdtG923oe631Sq2sXr06L774Yu66665cffXVWbJkSS6//PK0a9fOeqVJmj9/fg477LCcffbZee655zJy5MgccMABW+x6bRah0aZNm/f9j3jv9t/+oIRaGjVqVG655ZaMHTs2e+65Z9q0afO+o27r1q2zbqmZcePG5XOf+1yjo3Dv2dD3WuuVWmnVqlVWrVqVMWPGpFu3bkmSV155JXfeeWe6d+9uvdKkTJ8+Pffee2+mTZuWtm3bpnfv3lm8eHEmTJiQXXbZZYtcr80iNHbccccsW7Ys69evT6tW7468ZMmStG3bNp06darxdPCukSNH5s4778yoUaNy1FFHJXl37c6bN6/RfkuXLn3f4VPYXH71q19l6dKl1Wve3vvB91//9V859thjs3Tp0kb7W6/UUt
euXdOmTZtqZCRJjx498uqrr6Z///7WK03KM888k+7duzeKh169emXixInZb7/9tsj12iyu0ejZs2datWrV6IKZGTNmpHfv3mnRolk8BT7hxo0bl7vuuivXXntt/vEf/7G6vb6+Pn/4wx/y9ttvV7fNmDEj9fX1tRgTctttt+X+++/P1KlTM3Xq1AwcODADBw7M1KlTU19fn//93/+t/o6CSqWSp556ynqlZurr67N27dosWLCguu35559Pt27drFeanB122CEvvvhioyMXzz//fHbeeectdr02i1fp7dq1yxe/+MV897vfzaxZs/LQQw/l5ptvzpAhQ2o9GmT+/Pm54YYb8tWvfjX9+vXLkiVLqh/9+/fPTjvtlEsvvTTPPfdcJk2alFmzZuWkk06q9dhsobp165bu3btXPzp06JAOHTqke/fuOfroo7NixYpcddVVmTdvXq666qqsWbMmxxxzTK3HZgu122675dBDD82ll16aZ599Nr/73e8yadKknHbaadYrTc7AgQOz1VZb5V//9V+zYMGCPPzww5k4cWLOOOOMLXa91lXeS6smbs2aNfnud7+bBx54IB07dsw555yTs846q9ZjQSZNmpQxY8Z84H1z5szJiy++mO985zt5+umn071791x22WUZMGDAZp4SPth7vxX8mmuuSZLMmjUrV1xxRebPn5+99torI0aMSK9evWo5Ilu4lStXZuTIkXnwwQfTrl27nH766fnGN76Ruro665Um572ImDVrVrbbbrsMHjw4Z5555ha7XptNaAAAAM1Hszh1CgAAaF6EBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAAAgOKEBgAAUJzQAAAAihMaAGTGjBk57bTTUl9fnz59+uSrX/1qXn/99STJI488kuOOOy777LNPvvKVr2TkyJEZPnx49XPvuuuuDBw4MPvuu2/OOOOMzJkzp1ZPA4AmRGgAbOFWrlyZc889NwceeGB++ctf5ic/+UleeumlTJo0KQsXLszXv/71HHPMMZk6dWp69+6d22+/vfq5Dz/8cMaNG5d/+7d/yy9+8Yv069cvQ4YMyZtvvlnDZwRAU1BXqVQqtR4CgNpZsmRJ7r///px99tmpq6tLkowZMyazZs1KfX19ZsyY0SguTj311PTo0SPXXHNNTj/99BxzzDE544wzqvd/6UtfygknnNBoGwBbnla1HgCA2uratWu++MUvZvLkyfnjH/+YefPmZc6cOenbt2/mzJmT3r17N9q/T58+1SMW8+fPz6hRo3LttddW71+7dm1eeOGFzfkUAGiChAbAFm7x4sU58cQTs/fee2fAgAE55ZRT8pvf/CZPP/10WrZsmb898P3Xt995551cdtllOeCAAxrt07Fjx80yOwBNl9AA2MI9+OCD2WabbXLjjTdWt912222pVCr5zGc+kxkzZjTa/w9/+EN22WWXJEmPHj3y2muvpXv37tX7L7300hx++OEZNGjQ5nkCADRJLgYH2MJ17tw5r7zySqZPn56FCxdm0qRJeeCBB7Ju3bqccsopmTlzZiZNmpQFCxZk4sSJefLJJ6vXcpx99tm55ZZbMnXq1Lz00ksZNWpU/uM//iO77757jZ8VALXmYnCALdw777yTK6+8Mr/+9a9TV1eX3r175+CDD87111+f6dOn55FHHsn3v//9LFq0KAceeGBatmyZLl265Morr0yS3HrrrZk8eXKWLl2aPfbYI//yL//yvlOpANjyCA0ANmju3LlZv359evXqVd02dOjQ9O7dOxdccEENJwOgqXPqFAAb9NJLL+Xss8/Oo48+mkWLFuWee+7J9OnTc8QRR9R6NACaOEc0APhQEyZMyN13350//elP6dGjR4YNG5bDDz+81mMB0MQJDQAAoDinTgEAAMUJDQAAoDihAQAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxQgMAACju/wM4J82jKM7PHwAAAABJRU
5ErkJggg=="
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"execution_count": 172
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.245693Z",
|
||
"start_time": "2024-12-07T07:33:51.236859Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Number of missing values per column\n",
|
||
"missing_counts = df.isnull().sum()\n",
|
||
"print(missing_counts)\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Whether each column contains any missing value\n",
|
||
"print(df.isnull().any())\n",
|
||
"\n",
|
||
"print()\n",
|
||
"\n",
|
||
"# Percentage of missing values, reported only for columns that have at least one\n",
|
||
"missing_pct = missing_counts / len(df) * 100\n",
|
||
"for column, pct in missing_pct[missing_pct > 0].items():\n",
|
||
"    print(f\"{column} процент пустых значений: %{pct:.2f}\")"
|
||
],
|
||
"id": "84cf47a513b9f258",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"id 0\n",
|
||
"gender 0\n",
|
||
"age 0\n",
|
||
"hypertension 0\n",
|
||
"heart_disease 0\n",
|
||
"ever_married 0\n",
|
||
"work_type 0\n",
|
||
"Residence_type 0\n",
|
||
"avg_glucose_level 0\n",
|
||
"bmi 201\n",
|
||
"smoking_status 0\n",
|
||
"stroke 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"id False\n",
|
||
"gender False\n",
|
||
"age False\n",
|
||
"hypertension False\n",
|
||
"heart_disease False\n",
|
||
"ever_married False\n",
|
||
"work_type False\n",
|
||
"Residence_type False\n",
|
||
"avg_glucose_level False\n",
|
||
"bmi True\n",
|
||
"smoking_status False\n",
|
||
"stroke False\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"bmi процент пустых значений: %3.93\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 173
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.304644Z",
|
||
"start_time": "2024-12-07T07:33:51.298975Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Fill missing BMI values with the column median\n",
|
||
"bmi_median = df[\"bmi\"].median()\n",
|
||
"df[\"bmi\"] = df[\"bmi\"].fillna(bmi_median)\n",
|
||
"\n",
|
||
"# Re-check: print the percentage of missing values for any column that still has some\n",
|
||
"missing_pct = df.isnull().sum() / len(df) * 100\n",
|
||
"for column, pct in missing_pct[missing_pct > 0].items():\n",
|
||
"    print(f\"{column} процент пустых значений: %{pct:.2f}\")"
|
||
],
|
||
"id": "ba00afd3f040bc81",
|
||
"outputs": [],
|
||
"execution_count": 174
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "3) Унитарное кодирование",
|
||
"id": "858e690bed6f98dd"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.334790Z",
|
||
"start_time": "2024-12-07T07:33:51.311404Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"# One-hot encode the categorical features; drop=\"first\" omits the first category of each feature.\n",
|
||
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
||
"categorical_columns = [\"gender\", \"work_type\", \"smoking_status\"]\n",
|
||
"\n",
|
||
"df = df[[\"age\", \"gender\", \"hypertension\", \"work_type\", \"avg_glucose_level\", \"bmi\", \"smoking_status\"]]\n",
|
||
"# Keep only rows whose gender is Male or Female.\n",
|
||
"df = df.query('gender == \"Male\" or gender == \"Female\"')\n",
|
||
"encoded_values = encoder.fit_transform(df[categorical_columns])\n",
|
||
"\n",
|
||
"encoded_columns = encoder.get_feature_names_out(categorical_columns)\n",
|
||
"\n",
|
||
"# Reuse df's index: the filter above can leave gaps in the row labels, and pd.concat(axis=1)\n",
|
||
"# aligns on labels, so a fresh RangeIndex would shift the encoded rows and add all-NaN rows.\n",
|
||
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=df.index)\n",
|
||
"\n",
|
||
"df = pd.concat([df, encoded_values_df], axis=1)\n",
|
||
"\n",
|
||
"df"
|
||
],
|
||
"id": "ec19b16d7410cbff",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
||
"0 67.0 Male 0.0 Private 228.69 36.6 \n",
|
||
"1 61.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
||
"2 80.0 Male 0.0 Private 105.92 32.5 \n",
|
||
"3 49.0 Female 0.0 Private 171.23 34.4 \n",
|
||
"4 79.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
||
"5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
||
"5108 51.0 Male 0.0 Private 166.29 25.6 \n",
|
||
"5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
||
"3116 NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
||
"0 formerly smoked 1.0 0.0 1.0 \n",
|
||
"1 never smoked 0.0 0.0 0.0 \n",
|
||
"2 never smoked 1.0 0.0 1.0 \n",
|
||
"3 smokes 0.0 0.0 1.0 \n",
|
||
"4 never smoked 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"5106 never smoked 0.0 0.0 0.0 \n",
|
||
"5107 never smoked 1.0 0.0 1.0 \n",
|
||
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
||
"5109 Unknown NaN NaN NaN \n",
|
||
"3116 NaN 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Self-employed work_type_children \\\n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"... ... ... \n",
|
||
"5106 1.0 0.0 \n",
|
||
"5107 0.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 1.0 \n",
|
||
"\n",
|
||
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
||
"0 1.0 0.0 \n",
|
||
"1 0.0 1.0 \n",
|
||
"2 0.0 1.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 1.0 \n",
|
||
"... ... ... \n",
|
||
"5106 0.0 1.0 \n",
|
||
"5107 1.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_smokes \n",
|
||
"0 0.0 \n",
|
||
"1 0.0 \n",
|
||
"2 0.0 \n",
|
||
"3 1.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"5106 0.0 \n",
|
||
"5107 0.0 \n",
|
||
"5108 0.0 \n",
|
||
"5109 NaN \n",
|
||
"3116 0.0 \n",
|
||
"\n",
|
||
"[5110 rows x 15 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>hypertension</th>\n",
|
||
" <th>work_type</th>\n",
|
||
" <th>avg_glucose_level</th>\n",
|
||
" <th>bmi</th>\n",
|
||
" <th>smoking_status</th>\n",
|
||
" <th>gender_Male</th>\n",
|
||
" <th>work_type_Never_worked</th>\n",
|
||
" <th>work_type_Private</th>\n",
|
||
" <th>work_type_Self-employed</th>\n",
|
||
" <th>work_type_children</th>\n",
|
||
" <th>smoking_status_formerly smoked</th>\n",
|
||
" <th>smoking_status_never smoked</th>\n",
|
||
" <th>smoking_status_smokes</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>67.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>228.69</td>\n",
|
||
" <td>36.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>61.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>202.21</td>\n",
|
||
" <td>28.1</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>105.92</td>\n",
|
||
" <td>32.5</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>49.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>171.23</td>\n",
|
||
" <td>34.4</td>\n",
|
||
" <td>smokes</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>174.12</td>\n",
|
||
" <td>24.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5106</th>\n",
|
||
" <td>81.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>125.20</td>\n",
|
||
" <td>40.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5107</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>82.99</td>\n",
|
||
" <td>30.6</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5108</th>\n",
|
||
" <td>51.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>166.29</td>\n",
|
||
" <td>25.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5109</th>\n",
|
||
" <td>44.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Govt_job</td>\n",
|
||
" <td>85.28</td>\n",
|
||
" <td>26.2</td>\n",
|
||
" <td>Unknown</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3116</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5110 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 175,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 175
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "4) Дискретизация признаков",
|
||
"id": "cc934d2268784440"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.401444Z",
|
||
"start_time": "2024-12-07T07:33:51.386817Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Replace age with its quantile bin number: five equal-frequency bins, integer codes instead of interval labels.\n",
|
||
"age_quintiles = pd.qcut(df[\"age\"], q=5, labels=False)\n",
|
||
"df[\"age\"] = age_quintiles\n",
|
||
"df"
|
||
],
|
||
"id": "b9a70c0c56176f98",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
||
"0 4.0 Male 0.0 Private 228.69 36.6 \n",
|
||
"1 3.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
||
"2 4.0 Male 0.0 Private 105.92 32.5 \n",
|
||
"3 2.0 Female 0.0 Private 171.23 34.4 \n",
|
||
"4 4.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5106 4.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
||
"5107 1.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
||
"5108 2.0 Male 0.0 Private 166.29 25.6 \n",
|
||
"5109 2.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
||
"3116 NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
||
"0 formerly smoked 1.0 0.0 1.0 \n",
|
||
"1 never smoked 0.0 0.0 0.0 \n",
|
||
"2 never smoked 1.0 0.0 1.0 \n",
|
||
"3 smokes 0.0 0.0 1.0 \n",
|
||
"4 never smoked 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"5106 never smoked 0.0 0.0 0.0 \n",
|
||
"5107 never smoked 1.0 0.0 1.0 \n",
|
||
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
||
"5109 Unknown NaN NaN NaN \n",
|
||
"3116 NaN 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Self-employed work_type_children \\\n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"... ... ... \n",
|
||
"5106 1.0 0.0 \n",
|
||
"5107 0.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 1.0 \n",
|
||
"\n",
|
||
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
||
"0 1.0 0.0 \n",
|
||
"1 0.0 1.0 \n",
|
||
"2 0.0 1.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 1.0 \n",
|
||
"... ... ... \n",
|
||
"5106 0.0 1.0 \n",
|
||
"5107 1.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_smokes \n",
|
||
"0 0.0 \n",
|
||
"1 0.0 \n",
|
||
"2 0.0 \n",
|
||
"3 1.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"5106 0.0 \n",
|
||
"5107 0.0 \n",
|
||
"5108 0.0 \n",
|
||
"5109 NaN \n",
|
||
"3116 0.0 \n",
|
||
"\n",
|
||
"[5110 rows x 15 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>hypertension</th>\n",
|
||
" <th>work_type</th>\n",
|
||
" <th>avg_glucose_level</th>\n",
|
||
" <th>bmi</th>\n",
|
||
" <th>smoking_status</th>\n",
|
||
" <th>gender_Male</th>\n",
|
||
" <th>work_type_Never_worked</th>\n",
|
||
" <th>work_type_Private</th>\n",
|
||
" <th>work_type_Self-employed</th>\n",
|
||
" <th>work_type_children</th>\n",
|
||
" <th>smoking_status_formerly smoked</th>\n",
|
||
" <th>smoking_status_never smoked</th>\n",
|
||
" <th>smoking_status_smokes</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>228.69</td>\n",
|
||
" <td>36.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>202.21</td>\n",
|
||
" <td>28.1</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>105.92</td>\n",
|
||
" <td>32.5</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>171.23</td>\n",
|
||
" <td>34.4</td>\n",
|
||
" <td>smokes</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>174.12</td>\n",
|
||
" <td>24.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5106</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>125.20</td>\n",
|
||
" <td>40.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5107</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>82.99</td>\n",
|
||
" <td>30.6</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5108</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>166.29</td>\n",
|
||
" <td>25.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5109</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Govt_job</td>\n",
|
||
" <td>85.28</td>\n",
|
||
" <td>26.2</td>\n",
|
||
" <td>Unknown</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3116</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5110 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 176,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 176
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "3) Разбиение данных",
|
||
"id": "7c5387ab7d3b9349"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.496592Z",
|
||
"start_time": "2024-12-07T07:33:51.456961Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Функция для создания выборок\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"dropna_df = df.dropna()\n",
|
||
"\n",
|
||
"df_input = dropna_df[[\n",
|
||
" \"age\",\n",
|
||
" \"hypertension\", \n",
|
||
" \"avg_glucose_level\", \n",
|
||
" \"bmi\", \n",
|
||
" \"gender_Male\",\n",
|
||
" \"work_type_Never_worked\",\n",
|
||
" \"work_type_Private\",\n",
|
||
" \"work_type_Self-employed\",\n",
|
||
" \"work_type_children\",\n",
|
||
" \"smoking_status_formerly smoked\",\n",
|
||
" \"smoking_status_never smoked\",\n",
|
||
" \"smoking_status_smokes\",\n",
|
||
"]]\n",
|
||
"\n",
|
||
"print(df_input.head())\n",
|
||
"\n",
|
||
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
||
"\n",
|
||
"# Разделение остатка на контрольную и тестовую выборки\n",
|
||
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
||
"\n",
|
||
"# Проверка размеров выборок\n",
|
||
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
||
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
||
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
||
"\n",
|
||
"# Сохранение выборок в файлы\n",
|
||
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
||
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
||
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
||
],
|
||
"id": "8c9949a919295290",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" age hypertension avg_glucose_level bmi gender_Male \\\n",
|
||
"0 4.0 0.0 228.69 36.6 1.0 \n",
|
||
"1 3.0 0.0 202.21 28.1 0.0 \n",
|
||
"2 4.0 0.0 105.92 32.5 1.0 \n",
|
||
"3 2.0 0.0 171.23 34.4 0.0 \n",
|
||
"4 4.0 1.0 174.12 24.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
|
||
"0 0.0 1.0 0.0 \n",
|
||
"1 0.0 0.0 1.0 \n",
|
||
"2 0.0 1.0 0.0 \n",
|
||
"3 0.0 1.0 0.0 \n",
|
||
"4 0.0 0.0 1.0 \n",
|
||
"\n",
|
||
" work_type_children smoking_status_formerly smoked \\\n",
|
||
"0 0.0 1.0 \n",
|
||
"1 0.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_never smoked smoking_status_smokes \n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 1.0 0.0 \n",
|
||
"3 0.0 1.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"Размер обучающей выборки: 3064\n",
|
||
"Размер контрольной выборки: 1022\n",
|
||
"Размер тестовой выборки: 1022\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 177
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.637107Z",
|
||
"start_time": "2024-12-07T07:33:51.601412Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
|
||
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
|
||
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
|
||
"\n",
|
||
"def check_balance(df, name):\n",
|
||
" print(f\"Распределение gender в {name}:\")\n",
|
||
" print(f\"Процент gender_Male: {df[\"gender_Male\"].value_counts()[1.0] / len(df) * 100:.2f}%\")\n",
|
||
" print(f\"Процент gender_Female: {(len(df) - df[\"gender_Male\"].value_counts()[1.0]) / len(df) * 100:.2f}%\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"check_balance(train_df, \"обучающей выборке\")\n",
|
||
"check_balance(val_df, \"контрольной выборке\")\n",
|
||
"check_balance(test_df, \"тестовой выборке\")"
|
||
],
|
||
"id": "79b2248eb6438fa5",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение gender в обучающей выборке:\n",
|
||
"Процент gender_Male: 41.87%\n",
|
||
"Процент gender_Female: 58.13%\n",
|
||
"\n",
|
||
"Распределение gender в контрольной выборке:\n",
|
||
"Процент gender_Male: 40.41%\n",
|
||
"Процент gender_Female: 59.59%\n",
|
||
"\n",
|
||
"Распределение gender в тестовой выборке:\n",
|
||
"Процент gender_Male: 41.00%\n",
|
||
"Процент gender_Female: 59.00%\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 178
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
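||
"source": "Распределение по признаку gender практически одинаково во всех трёх выборках (доля мужчин около 40–42 %), то есть разбиение сбалансировано по этому признаку.",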
|
||
"id": "a6436f11045161c4"
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|