1067 lines
50 KiB
Plaintext
1067 lines
50 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "1) Бизнес-цели - определение наличия заболевания у человека",
|
|||
|
"id": "54c08440669b8de7"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "2) Подготовка данных",
|
|||
|
"id": "5d090ddc69b152cf"
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"id": "initial_id",
|
|||
|
"metadata": {
|
|||
|
"collapsed": true,
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.107138Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.094517Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"from prompt_toolkit.shortcuts.progress_bar import Percentage\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
|
|||
|
"print(df.columns)\n",
|
|||
|
"print(df)"
|
|||
|
],
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
|||
|
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
|||
|
" 'smoking_status', 'stroke'],\n",
|
|||
|
" dtype='object')\n",
|
|||
|
" id gender age hypertension heart_disease ever_married \\\n",
|
|||
|
"0 9046 Male 67.0 0 1 Yes \n",
|
|||
|
"1 51676 Female 61.0 0 0 Yes \n",
|
|||
|
"2 31112 Male 80.0 0 1 Yes \n",
|
|||
|
"3 60182 Female 49.0 0 0 Yes \n",
|
|||
|
"4 1665 Female 79.0 1 0 Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5105 18234 Female 80.0 1 0 Yes \n",
|
|||
|
"5106 44873 Female 81.0 0 0 Yes \n",
|
|||
|
"5107 19723 Female 35.0 0 0 Yes \n",
|
|||
|
"5108 37544 Male 51.0 0 0 Yes \n",
|
|||
|
"5109 44679 Female 44.0 0 0 Yes \n",
|
|||
|
"\n",
|
|||
|
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
|||
|
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
|||
|
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
|||
|
"2 Private Rural 105.92 32.5 never smoked \n",
|
|||
|
"3 Private Urban 171.23 34.4 smokes \n",
|
|||
|
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"5105 Private Urban 83.75 NaN never smoked \n",
|
|||
|
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
|||
|
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
|||
|
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
|||
|
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
|||
|
"\n",
|
|||
|
" stroke \n",
|
|||
|
"0 1 \n",
|
|||
|
"1 1 \n",
|
|||
|
"2 1 \n",
|
|||
|
"3 1 \n",
|
|||
|
"4 1 \n",
|
|||
|
"... ... \n",
|
|||
|
"5105 0 \n",
|
|||
|
"5106 0 \n",
|
|||
|
"5107 0 \n",
|
|||
|
"5108 0 \n",
|
|||
|
"5109 0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 12 columns]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 171
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.208159Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.118240Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['age'])\n",
|
|||
|
"plt.title('Box Plot для age')\n",
|
|||
|
"plt.xlabel('age')\n",
|
|||
|
"plt.show()"
|
|||
|
],
|
|||
|
"id": "ea1a7ed0e6d7d189",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
],
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikElEQVR4nO3df5SWdZ3/8dcA8ltEBc1QWdQ0MBwEZVf8kYI/W10zfx1lRc3CMuWr1Srarobk0QIkEwTJDDV/pRJnrXZXPW6USrriIsdIEERFVIQEAUEI5/7+4fGuSRH0fJZ7Rh6Pc+bIfd3X3PO+8XNm7ifXdd1TV6lUKgEAACioRa0HAAAAPnmEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAD4UH6vKwAfh9AA+BjOOOOM7LXXXo0+9ttvvwwZMiRPPPHEZplhypQp75uhZ8+e2X///fPlL385M2bMqO57/fXXZ6+99vpIj//aa69l6NChWbRoUenRAdgCtKr1AADNVa9evXLFFVckSd55550sW7Ysd955Z84555xMmTIln/nMZzbLHOPGjUvXrl2TJA0NDVm6dGnGjx+fM888M/fee28++9nPfqzHfeyxxzJt2rSSowKwBREaAB9Tx44d06dPn0bbBgwYkAMOOCBTpkzJJZdcslnm6NmzZ3beeedG23r16pUjjjgid9xxR6688srNMgcA/DWhAVBQu3bt0qZNm9TV1TXa/utf/zo33XRTFixYkPbt22fQoEH51re+lW222SarVq3KcccdlzZt2uTf//3f07p161QqlZx55pmZP39+7r///my33XYfaY6dd9452267bV555ZUN7vNhM02ZMiWXXnppkmTQoEE54YQTcs0113zg47zwwgs56qij3re9W7duefjhh6u3Bw4c+IGnYf31fjNmzMjo0aPz7LPPZvXq1Rt8rL/17LPPZty4cXnyySezcuXKbLfddjnqqKPy7W9/O23btk2SrFq1Kj/4wQ/y4IMP5u23386hhx6a+vr6XH311ZkzZ071sR566KHccMMNee6559KpU6ccc8wx+eY3v5n27dtv8OsD8H5CA+BjqlQqWb9+ffXPy5cvzy233JJ169blxBNPrO53ww035Ec/+lFOP/30XHTRRVm4cGGuu+66zJw5Mz//+c/TsWPHXHXVVfnyl7+ciRMnZtiwYbn11lvz+OOPZ9KkSR85MpJk2bJlWbZsWXbdddcPvH9jMx166KH5+te/ngkTJmTcuHEfen3H22+/nZYtW+aOO+5o9Pjz5s17376f//znc955533gfqtWrcrXvva17LHHHhkzZky23Xbb1NXVbfCx3vP6669n8ODB6dOnT6655pq0bt06v/3tb/PTn/40O+ywQ4YOHZokOe+88/LHP/4xF110UT796U/njjvuyJgxYxo91v33359vf/vbOe6443LhhRdm0aJFGTt2bObNm5ef/vSn7wtIADZMaAB8TP/zP/+Tvffe+33bv/nNb2b33XdPkrz55puZMGFCTjnllFx++eXVffbcc88MHjw49913XwYPHpwBAwbk1FNPzaRJk1JfX59rr702gwcPzuc///mNztHQ0FANnrVr1+aFF17I6NGj06JFi5x66qnv239TZ3ovUj7o1Ky/tmbNmrRp06bRaWQbiqPttttug/stWLAgK1asyNChQ3PYYYdt9LHeM3fu3PTs2TPXXXddOnbsmOTdU9geffTRPP744xk6dGimT5+exx9/PNdff32OPPLIJMkhhxySY489NvPnz0/ybiyOHj06Bx98cEaPHl19/L/7u7/LWWedlWnTpuXQQw/90FkA+AuhAfAx7b333hkxYkSSd1+krlixIr/97W8zduzYrF69OhdddFFmzpyZdevW5dhjj230ufvtt1+6deuWJ554IoMHD06SXHzxxXnkkUfyta99LT169MjFF1+8SXMcccQR79vWrV
u3jBo16gOPRHyUmTbFq6++mk6dOm3y/hvSo0ePdOrUKffcc0969OiRT33qU2nVqtVG3173oIMOykEHHZQ///nPmTdvXl588cXMnTs3b7zxRjp37pwk+f3vf5+tttoqhx9+ePXzWrRokS984Qu5/vrrkyTPP/98XnvttZx77rnVcEuS/fffPx07dsyjjz4qNAA+AqEB8DF16NAhvXv3brTtoIMOyurVq3PTTTdlyJAhefPNN5MkXbp0ed/nd+nSJStXrmz0eEceeWRuvvnmHHDAAdVrCzZmwoQJ1Xed2mqrrbLttttmxx133OD+H2WmTbFo0aJ069btI33OB+nYsWPGjRuX733ve++75uPDHr+hoSHXXnttbr/99qxevTo77bRT9tlnn7Rp06a6z7Jly9K5c+e0aNH4Xd2333776p+XL1+eJBkxYkQ1IP/a66+//nGeFsAWS2gAFPa5z30u99xzT15++eVss802SZKlS5dmt912a7TfkiVLsssuu1Rvz507N7fddlt69uyZO++8M//0T/+U+vr6jX69Pffc80NPbfpbH2WmTfHUU09t8lvobuwah7//+7/PkUcemQULFuRb3/pW+vbtmwkTJmTu3Lkb/JxJkyZl8uTJGTFiRI488shsvfXWSZKTTjqpus+OO+6YZcuWpaGhoVFs/OlPf6r++b2jMhdffHH69+//vq/z3t8bAJvGL+wDKGzWrFlp2bJldtlll9TX16d169b55S9/2WifJ598Mq+88kr69u2bJFm/fn2GDx+eXXfdNXfddVc++9nP5pJLLsnatWuLz7epM/3tv/5/kDVr1uSJJ57IgQceuNF9//ZF/geZNm1axo0blyFDhuTss89OfX199fSnDZkxY0b22GOPnHjiidXIWLx4cebOnZuGhoYkSf/+/bN+/fpG71xVqVTy0EMPVW/vtttu2X777fPyyy+nd+/e1Y8dd9wxY8aMyezZszf6HAH4C0c0AD6mVatWZebMmdXb69aty8MPP5z77rsvp556avUi5qFDh2b8+PHZaqutcthhh+Xll1/Oddddlz322CMnnHBCkmTixImZPXt27rjjjrRt2zYjR47MySefnLFjx2b48OFF5+7cufMmzfTev/A/+OCDOeSQQ6oXuL/njTfeyOTJk1NXV5fOnTs3+rt44403sm7dusyePTvbb7995syZkzfeeONDr+VYu3ZtRo4cmW7duuWCCy7Y5Oezzz775IYbbsikSZPSp0+fvPjii7nxxhuzbt26rFmzJsm711kceOCB+c53vpOlS5fm05/+dO69997MmTOnepSlZcuWueiii3L55ZenZcuWOeyww7JixYrccMMNWbx48Qde+A/AhgkNgI9p9uzZjd7VqU2bNtl1111z0UUX5Zxzzqluv+CCC9KlS5f87Gc/y913353OnTvn6KOPzoUXXpj27dvn2WefzcSJE3PaaadVjybsvffeGTJkSG655ZYcccQR6devX9HZNzZT8u5pTAMGDMiYMWMyffr0TJo0qdFj/OY3v8mNN96YJDn99NM/8Oucf/75OeWUU3LdddelR48eOfnkkzc4049//OMsXLgw48aNS7t27Tb5uZx77rlZtmxZbr311owfPz477bRTjj/++NTV1eXGG2/MihUr0qlTp4wdOzbXXHNNxowZk/Xr12fQoEE57bTTMnXq1OpjnXzyyenQoUNuuumm3H333Wnfvn369u2b0aNHf+RTygC2dHWVjb2dBwB8gClTpmTcuHEb/EV6G7t/c1q0aFFmzpyZQYMGNbrIftiwYVm4cGF+8Ytf1HA6gE8mRzQA+MRr0aJFhg8fnkGDBuWkk05Ky5Yt87vf/S4PPPBArr766lqPB/CJJDQA+Fi222679OzZ82PfvznttNNO+fGPf5zx48fnwgsvzPr167P77rtn9OjR7/t9IgCU4dQpAACgOG9vCwAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxH+ntbf/0p5Wp9X
tU1dUl22+/dZOYBTbGeqU5sV5pTqxXmpNP2np97/lszEcKjUolTeYvpynNAhtjvdKcWK80J9YrzcmWtl6dOgUAABQ
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 172
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.245693Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.236859Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Количество пустых значений признаков\n",
|
|||
|
"print(df.isnull().sum())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Есть ли пустые значения признаков\n",
|
|||
|
"print(df.isnull().any())\n",
|
|||
|
"\n",
|
|||
|
"print()\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "84cf47a513b9f258",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"id 0\n",
|
|||
|
"gender 0\n",
|
|||
|
"age 0\n",
|
|||
|
"hypertension 0\n",
|
|||
|
"heart_disease 0\n",
|
|||
|
"ever_married 0\n",
|
|||
|
"work_type 0\n",
|
|||
|
"Residence_type 0\n",
|
|||
|
"avg_glucose_level 0\n",
|
|||
|
"bmi 201\n",
|
|||
|
"smoking_status 0\n",
|
|||
|
"stroke 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"id False\n",
|
|||
|
"gender False\n",
|
|||
|
"age False\n",
|
|||
|
"hypertension False\n",
|
|||
|
"heart_disease False\n",
|
|||
|
"ever_married False\n",
|
|||
|
"work_type False\n",
|
|||
|
"Residence_type False\n",
|
|||
|
"avg_glucose_level False\n",
|
|||
|
"bmi True\n",
|
|||
|
"smoking_status False\n",
|
|||
|
"stroke False\n",
|
|||
|
"dtype: bool\n",
|
|||
|
"\n",
|
|||
|
"bmi процент пустых значений: %3.93\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 173
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.304644Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.298975Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Замена пустых данных на медиану\n",
|
|||
|
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
|
|||
|
"\n",
|
|||
|
"# Процент пустых значений признаков\n",
|
|||
|
"for i in df.columns:\n",
|
|||
|
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
|
|||
|
" if null_rate > 0:\n",
|
|||
|
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
|
|||
|
],
|
|||
|
"id": "ba00afd3f040bc81",
|
|||
|
"outputs": [],
|
|||
|
"execution_count": 174
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "3) Унитарное кодирование",
|
|||
|
"id": "858e690bed6f98dd"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.334790Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.311404Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
|
|||
|
"\n",
|
|||
|
"df = df[[\"age\", \"gender\", \"hypertension\", \"work_type\", \"avg_glucose_level\", \"bmi\", \"smoking_status\"]]\n",
|
|||
|
"df = df.query('gender == \"Male\" or gender == \"Female\"')\n",
|
|||
|
"encoded_values = encoder.fit_transform(df[[\"gender\", \"work_type\", \"smoking_status\"]])\n",
|
|||
|
"\n",
|
|||
|
"encoded_columns = encoder.get_feature_names_out([\"gender\", \"work_type\", \"smoking_status\"])\n",
|
|||
|
"\n",
|
|||
|
"# Reuse df's index: the gender filter above leaves a gap at the dropped row, so a default RangeIndex would shift every later row by one when pd.concat(axis=1) aligns on labels.\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=df.index)\n",
|
|||
|
"\n",
|
|||
|
"df = pd.concat([df, encoded_values_df], axis=1)\n",
|
|||
|
"\n",
|
|||
|
"df"
|
|||
|
],
|
|||
|
"id": "ec19b16d7410cbff",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
|||
|
"0 67.0 Male 0.0 Private 228.69 36.6 \n",
|
|||
|
"1 61.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
|||
|
"2 80.0 Male 0.0 Private 105.92 32.5 \n",
|
|||
|
"3 49.0 Female 0.0 Private 171.23 34.4 \n",
|
|||
|
"4 79.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
|||
|
"5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
|||
|
"5108 51.0 Male 0.0 Private 166.29 25.6 \n",
|
|||
|
"5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
|||
|
"3116 NaN NaN NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
|||
|
"0 formerly smoked 1.0 0.0 1.0 \n",
|
|||
|
"1 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"2 never smoked 1.0 0.0 1.0 \n",
|
|||
|
"3 smokes 0.0 0.0 1.0 \n",
|
|||
|
"4 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"5106 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"5107 never smoked 1.0 0.0 1.0 \n",
|
|||
|
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
|||
|
"5109 Unknown NaN NaN NaN \n",
|
|||
|
"3116 NaN 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" work_type_Self-employed work_type_children \\\n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 1.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 1.0 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5106 1.0 0.0 \n",
|
|||
|
"5107 0.0 0.0 \n",
|
|||
|
"5108 0.0 0.0 \n",
|
|||
|
"5109 NaN NaN \n",
|
|||
|
"3116 0.0 1.0 \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
|||
|
"0 1.0 0.0 \n",
|
|||
|
"1 0.0 1.0 \n",
|
|||
|
"2 0.0 1.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 1.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5106 0.0 1.0 \n",
|
|||
|
"5107 1.0 0.0 \n",
|
|||
|
"5108 0.0 0.0 \n",
|
|||
|
"5109 NaN NaN \n",
|
|||
|
"3116 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_smokes \n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 1.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"... ... \n",
|
|||
|
"5106 0.0 \n",
|
|||
|
"5107 0.0 \n",
|
|||
|
"5108 0.0 \n",
|
|||
|
"5109 NaN \n",
|
|||
|
"3116 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 15 columns]"
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>gender</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>work_type</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>smoking_status</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>67.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>228.69</td>\n",
|
|||
|
" <td>36.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>61.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>202.21</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>80.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>105.92</td>\n",
|
|||
|
" <td>32.5</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>49.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>171.23</td>\n",
|
|||
|
" <td>34.4</td>\n",
|
|||
|
" <td>smokes</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>79.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>174.12</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5106</th>\n",
|
|||
|
" <td>81.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>125.20</td>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5107</th>\n",
|
|||
|
" <td>35.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>82.99</td>\n",
|
|||
|
" <td>30.6</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5108</th>\n",
|
|||
|
" <td>51.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>166.29</td>\n",
|
|||
|
" <td>25.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5109</th>\n",
|
|||
|
" <td>44.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Govt_job</td>\n",
|
|||
|
" <td>85.28</td>\n",
|
|||
|
" <td>26.2</td>\n",
|
|||
|
" <td>Unknown</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3116</th>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5110 rows × 15 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 175,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 175
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "4) Дискретизация признаков",
|
|||
|
"id": "cc934d2268784440"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.401444Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.386817Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"df[\"age\"] = pd.qcut(df[\"age\"], q=5, labels=False)\n",
|
|||
|
"df"
|
|||
|
],
|
|||
|
"id": "b9a70c0c56176f98",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
|||
|
"0 4.0 Male 0.0 Private 228.69 36.6 \n",
|
|||
|
"1 3.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
|||
|
"2 4.0 Male 0.0 Private 105.92 32.5 \n",
|
|||
|
"3 2.0 Female 0.0 Private 171.23 34.4 \n",
|
|||
|
"4 4.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"5106 4.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
|||
|
"5107 1.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
|||
|
"5108 2.0 Male 0.0 Private 166.29 25.6 \n",
|
|||
|
"5109 2.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
|||
|
"3116 NaN NaN NaN NaN NaN NaN \n",
|
|||
|
"\n",
|
|||
|
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
|||
|
"0 formerly smoked 1.0 0.0 1.0 \n",
|
|||
|
"1 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"2 never smoked 1.0 0.0 1.0 \n",
|
|||
|
"3 smokes 0.0 0.0 1.0 \n",
|
|||
|
"4 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"5106 never smoked 0.0 0.0 0.0 \n",
|
|||
|
"5107 never smoked 1.0 0.0 1.0 \n",
|
|||
|
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
|||
|
"5109 Unknown NaN NaN NaN \n",
|
|||
|
"3116 NaN 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" work_type_Self-employed work_type_children \\\n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 1.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 1.0 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5106 1.0 0.0 \n",
|
|||
|
"5107 0.0 0.0 \n",
|
|||
|
"5108 0.0 0.0 \n",
|
|||
|
"5109 NaN NaN \n",
|
|||
|
"3116 0.0 1.0 \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
|||
|
"0 1.0 0.0 \n",
|
|||
|
"1 0.0 1.0 \n",
|
|||
|
"2 0.0 1.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 1.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"5106 0.0 1.0 \n",
|
|||
|
"5107 1.0 0.0 \n",
|
|||
|
"5108 0.0 0.0 \n",
|
|||
|
"5109 NaN NaN \n",
|
|||
|
"3116 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_smokes \n",
|
|||
|
"0 0.0 \n",
|
|||
|
"1 0.0 \n",
|
|||
|
"2 0.0 \n",
|
|||
|
"3 1.0 \n",
|
|||
|
"4 0.0 \n",
|
|||
|
"... ... \n",
|
|||
|
"5106 0.0 \n",
|
|||
|
"5107 0.0 \n",
|
|||
|
"5108 0.0 \n",
|
|||
|
"5109 NaN \n",
|
|||
|
"3116 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[5110 rows x 15 columns]"
|
|||
|
],
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>age</th>\n",
|
|||
|
" <th>gender</th>\n",
|
|||
|
" <th>hypertension</th>\n",
|
|||
|
" <th>work_type</th>\n",
|
|||
|
" <th>avg_glucose_level</th>\n",
|
|||
|
" <th>bmi</th>\n",
|
|||
|
" <th>smoking_status</th>\n",
|
|||
|
" <th>gender_Male</th>\n",
|
|||
|
" <th>work_type_Never_worked</th>\n",
|
|||
|
" <th>work_type_Private</th>\n",
|
|||
|
" <th>work_type_Self-employed</th>\n",
|
|||
|
" <th>work_type_children</th>\n",
|
|||
|
" <th>smoking_status_formerly smoked</th>\n",
|
|||
|
" <th>smoking_status_never smoked</th>\n",
|
|||
|
" <th>smoking_status_smokes</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>228.69</td>\n",
|
|||
|
" <td>36.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>202.21</td>\n",
|
|||
|
" <td>28.1</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>105.92</td>\n",
|
|||
|
" <td>32.5</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>171.23</td>\n",
|
|||
|
" <td>34.4</td>\n",
|
|||
|
" <td>smokes</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>174.12</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5106</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>125.20</td>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5107</th>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Self-employed</td>\n",
|
|||
|
" <td>82.99</td>\n",
|
|||
|
" <td>30.6</td>\n",
|
|||
|
" <td>never smoked</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5108</th>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Private</td>\n",
|
|||
|
" <td>166.29</td>\n",
|
|||
|
" <td>25.6</td>\n",
|
|||
|
" <td>formerly smoked</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5109</th>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Govt_job</td>\n",
|
|||
|
" <td>85.28</td>\n",
|
|||
|
" <td>26.2</td>\n",
|
|||
|
" <td>Unknown</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3116</th>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>5110 rows × 15 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 176,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 176
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
|
"source": "5) Разбиение данных",
|
|||
|
"id": "7c5387ab7d3b9349"
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.496592Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.456961Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Split the prepared feature table into training, validation and test sets.\n",
|
|||
|
"from pathlib import Path\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"# Rows containing any missing value are discarded before splitting.\n",
|
|||
|
"dropna_df = df.dropna()\n",
|
|||
|
"\n",
|
|||
|
"df_input = dropna_df[[\n",
|
|||
|
"    \"age\",\n",
|
|||
|
"    \"hypertension\",\n",
|
|||
|
"    \"avg_glucose_level\",\n",
|
|||
|
"    \"bmi\",\n",
|
|||
|
"    \"gender_Male\",\n",
|
|||
|
"    \"work_type_Never_worked\",\n",
|
|||
|
"    \"work_type_Private\",\n",
|
|||
|
"    \"work_type_Self-employed\",\n",
|
|||
|
"    \"work_type_children\",\n",
|
|||
|
"    \"smoking_status_formerly smoked\",\n",
|
|||
|
"    \"smoking_status_never smoked\",\n",
|
|||
|
"    \"smoking_status_smokes\",\n",
|
|||
|
"]]\n",
|
|||
|
"\n",
|
|||
|
"print(df_input.head())\n",
|
|||
|
"\n",
|
|||
|
"# Hold out 40% of the rows; the fixed seed keeps the split reproducible.\n",
|
|||
|
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Halve the held-out part into validation and test sets.\n",
|
|||
|
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Every row must land in exactly one of the three sets.\n",
|
|||
|
"assert len(train_df) + len(val_df) + len(test_df) == len(df_input)\n",
|
|||
|
"\n",
|
|||
|
"# Report the size of each set.\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))\n",
|
|||
|
"\n",
|
|||
|
"# Save the sets to CSV files. to_csv does not create missing directories,\n",
|
|||
|
"# so make sure the target folder exists first.\n",
|
|||
|
"Path(\".//static//csv\").mkdir(parents=True, exist_ok=True)\n",
|
|||
|
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
|
|||
|
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
|
|||
|
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
|
|||
|
],
|
|||
|
"id": "8c9949a919295290",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" age hypertension avg_glucose_level bmi gender_Male \\\n",
|
|||
|
"0 4.0 0.0 228.69 36.6 1.0 \n",
|
|||
|
"1 3.0 0.0 202.21 28.1 0.0 \n",
|
|||
|
"2 4.0 0.0 105.92 32.5 1.0 \n",
|
|||
|
"3 2.0 0.0 171.23 34.4 0.0 \n",
|
|||
|
"4 4.0 1.0 174.12 24.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
|
|||
|
"0 0.0 1.0 0.0 \n",
|
|||
|
"1 0.0 0.0 1.0 \n",
|
|||
|
"2 0.0 1.0 0.0 \n",
|
|||
|
"3 0.0 1.0 0.0 \n",
|
|||
|
"4 0.0 0.0 1.0 \n",
|
|||
|
"\n",
|
|||
|
" work_type_children smoking_status_formerly smoked \\\n",
|
|||
|
"0 0.0 1.0 \n",
|
|||
|
"1 0.0 0.0 \n",
|
|||
|
"2 0.0 0.0 \n",
|
|||
|
"3 0.0 0.0 \n",
|
|||
|
"4 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" smoking_status_never smoked smoking_status_smokes \n",
|
|||
|
"0 0.0 0.0 \n",
|
|||
|
"1 1.0 0.0 \n",
|
|||
|
"2 1.0 0.0 \n",
|
|||
|
"3 0.0 1.0 \n",
|
|||
|
"4 1.0 0.0 \n",
|
|||
|
"Размер обучающей выборки: 3064\n",
|
|||
|
"Размер контрольной выборки: 1022\n",
|
|||
|
"Размер тестовой выборки: 1022\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 177
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-12-07T07:33:51.637107Z",
|
|||
|
"start_time": "2024-12-07T07:33:51.601412Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"cell_type": "code",
|
|||
|
"source": [
|
|||
|
"# Reload the three splits from their CSV files in a single unpacking step.\n",
|
|||
|
"train_df, val_df, test_df = (\n",
|
|||
|
"    pd.read_csv(csv_path)\n",
|
|||
|
"    for csv_path in (\n",
|
|||
|
"        \".//static//csv//train_data.csv\",\n",
|
|||
|
"        \".//static//csv//val_data.csv\",\n",
|
|||
|
"        \".//static//csv//test_data.csv\",\n",
|
|||
|
"    )\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
"    \"\"\"Print the share of male and female rows in one data split.\n",
|
|||
|
"\n",
|
|||
|
"    Args:\n",
|
|||
|
"        df: DataFrame with a ``gender_Male`` column; rows equal to 1 count as male.\n",
|
|||
|
"        name: Label of the split, inserted into the printed heading.\n",
|
|||
|
"\n",
|
|||
|
"    Every row that is not counted as male is reported as female.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    total = len(df)\n",
|
|||
|
"    # Count matches directly: value_counts()[1.0] raises KeyError when the\n",
|
|||
|
"    # split contains no male rows.\n",
|
|||
|
"    male_count = int((df[\"gender_Male\"] == 1.0).sum())\n",
|
|||
|
"    # An empty split is reported as 0% instead of dividing by zero.\n",
|
|||
|
"    male_pct = male_count / total * 100 if total else 0.0\n",
|
|||
|
"    female_pct = (total - male_count) / total * 100 if total else 0.0\n",
|
|||
|
"    print(f\"Распределение gender в {name}:\")\n",
|
|||
|
"    # Percentages are computed outside the f-strings: reusing double quotes\n",
|
|||
|
"    # inside a replacement field is a syntax error before Python 3.12.\n",
|
|||
|
"    print(f\"Процент gender_Male: {male_pct:.2f}%\")\n",
|
|||
|
"    print(f\"Процент gender_Female: {female_pct:.2f}%\")\n",
|
|||
|
"    print()\n",
|
|||
|
"\n",
|
|||
|
"# Report the gender distribution of each split, in train / validation / test order.\n",
|
|||
|
"for split_df, split_label in (\n",
|
|||
|
"    (train_df, \"обучающей выборке\"),\n",
|
|||
|
"    (val_df, \"контрольной выборке\"),\n",
|
|||
|
"    (test_df, \"тестовой выборке\"),\n",
|
|||
|
"):\n",
|
|||
|
"    check_balance(split_df, split_label)"
|
|||
|
],
|
|||
|
"id": "79b2248eb6438fa5",
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение gender в обучающей выборке:\n",
|
|||
|
"Процент gender_Male: 41.87%\n",
|
|||
|
"Процент gender_Female: 58.13%\n",
|
|||
|
"\n",
|
|||
|
"Распределение gender в контрольной выборке:\n",
|
|||
|
"Процент gender_Male: 40.41%\n",
|
|||
|
"Процент gender_Female: 59.59%\n",
|
|||
|
"\n",
|
|||
|
"Распределение gender в тестовой выборке:\n",
|
|||
|
"Процент gender_Male: 41.00%\n",
|
|||
|
"Процент gender_Female: 59.00%\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"execution_count": 178
|
|||
|
},
|
|||
|
{
|
|||
|
"metadata": {},
|
|||
|
"cell_type": "markdown",
|
|||
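|
"source": "Распределение по полу в обучающей, контрольной и тестовой выборках практически одинаково (около 40–42% мужчин и 58–60% женщин), то есть выборки сбалансированы относительно друг друга.",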
|
|||
|
"id": "a6436f11045161c4"
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 2
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython2",
|
|||
|
"version": "2.7.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|