AIM-PIbd-32-Borovkov-M-V/Lab_3/lab3.ipynb

1067 lines
50 KiB
Plaintext
Raw Permalink Normal View History

2024-12-07 11:38:30 +04:00
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "1) Бизнес цели - опредение наличия заболивания у человека",
"id": "54c08440669b8de7"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "2) Подготовка данных",
"id": "5d090ddc69b152cf"
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.107138Z",
"start_time": "2024-12-07T07:33:51.094517Z"
}
},
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from prompt_toolkit.shortcuts.progress_bar import Percentage\n",
"\n",
"df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
"print(df.columns)\n",
"print(df)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
" 'smoking_status', 'stroke'],\n",
" dtype='object')\n",
" id gender age hypertension heart_disease ever_married \\\n",
"0 9046 Male 67.0 0 1 Yes \n",
"1 51676 Female 61.0 0 0 Yes \n",
"2 31112 Male 80.0 0 1 Yes \n",
"3 60182 Female 49.0 0 0 Yes \n",
"4 1665 Female 79.0 1 0 Yes \n",
"... ... ... ... ... ... ... \n",
"5105 18234 Female 80.0 1 0 Yes \n",
"5106 44873 Female 81.0 0 0 Yes \n",
"5107 19723 Female 35.0 0 0 Yes \n",
"5108 37544 Male 51.0 0 0 Yes \n",
"5109 44679 Female 44.0 0 0 Yes \n",
"\n",
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
"0 Private Urban 228.69 36.6 formerly smoked \n",
"1 Self-employed Rural 202.21 NaN never smoked \n",
"2 Private Rural 105.92 32.5 never smoked \n",
"3 Private Urban 171.23 34.4 smokes \n",
"4 Self-employed Rural 174.12 24.0 never smoked \n",
"... ... ... ... ... ... \n",
"5105 Private Urban 83.75 NaN never smoked \n",
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
"5108 Private Rural 166.29 25.6 formerly smoked \n",
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
"\n",
" stroke \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"5105 0 \n",
"5106 0 \n",
"5107 0 \n",
"5108 0 \n",
"5109 0 \n",
"\n",
"[5110 rows x 12 columns]\n"
]
}
],
"execution_count": 171
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.208159Z",
"start_time": "2024-12-07T07:33:51.118240Z"
}
},
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df['age'])\n",
"plt.title('Box Plot для age')\n",
"plt.xlabel('age')\n",
"plt.show()"
],
"id": "ea1a7ed0e6d7d189",
"outputs": [
{
"data": {
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
],
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAIhCAYAAADJisyIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAikElEQVR4nO3df5SWdZ3/8dcA8ltEBc1QWdQ0MBwEZVf8kYI/W10zfx1lRc3CMuWr1Srarobk0QIkEwTJDDV/pRJnrXZXPW6USrriIsdIEERFVIQEAUEI5/7+4fGuSRH0fJZ7Rh6Pc+bIfd3X3PO+8XNm7ifXdd1TV6lUKgEAACioRa0HAAAAPnmEBgAAUJzQAAAAihMaAABAcUIDAAAoTmgAAADFCQ0AAKA4oQEAABQnNAD4UH6vKwAfh9AA+BjOOOOM7LXXXo0+9ttvvwwZMiRPPPHEZplhypQp75uhZ8+e2X///fPlL385M2bMqO57/fXXZ6+99vpIj//aa69l6NChWbRoUenRAdgCtKr1AADNVa9evXLFFVckSd55550sW7Ysd955Z84555xMmTIln/nMZzbLHOPGjUvXrl2TJA0NDVm6dGnGjx+fM888M/fee28++9nPfqzHfeyxxzJt2rSSowKwBREaAB9Tx44d06dPn0bbBgwYkAMOOCBTpkzJJZdcslnm6NmzZ3beeedG23r16pUjjjgid9xxR6688srNMgcA/DWhAVBQu3bt0qZNm9TV1TXa/utf/zo33XRTFixYkPbt22fQoEH51re+lW222SarVq3KcccdlzZt2uTf//3f07p161QqlZx55pmZP39+7r///my33XYfaY6dd9452267bV555ZUN7vNhM02ZMiWXXnppkmTQoEE54YQTcs0113zg47zwwgs56qij3re9W7duefjhh6u3Bw4c+IGnYf31fjNmzMjo0aPz7LPPZvXq1Rt8rL/17LPPZty4cXnyySezcuXKbLfddjnqqKPy7W9/O23btk2SrFq1Kj/4wQ/y4IMP5u23386hhx6a+vr6XH311ZkzZ071sR566KHccMMNee6559KpU6ccc8wx+eY3v5n27dtv8OsD8H5CA+BjqlQqWb9+ffXPy5cvzy233JJ169blxBNPrO53ww035Ec/+lFOP/30XHTRRVm4cGGuu+66zJw5Mz//+c/TsWPHXHXVVfnyl7+ciRMnZtiwYbn11lvz+OOPZ9KkSR85MpJk2bJlWbZsWXbdddcPvH9jMx166KH5+te/ngkTJmTcuHEfen3H22+/nZYtW+aOO+5o9Pjz5s17376f//znc955533gfqtWrcrXvva17LHHHhkzZky23Xbb1NXVbfCx3vP6669n8ODB6dOnT6655pq0bt06v/3tb/PTn/40O+ywQ4YOHZokOe+88/LHP/4xF110UT796U/njjvuyJgxYxo91v33359vf/vbOe6443LhhRdm0aJFGTt2bObNm5ef/vSn7wtIADZMaAB8TP/zP/+Tvffe+33bv/nNb2b33XdPkrz55puZMGFCTjnllFx++eXVffbcc88MHjw49913XwYPHpwBAwbk1FNPzaRJk1JfX59rr702gwcPzuc///mNztHQ0FANnrVr1+aFF17I6NGj06JFi5x66qnv239TZ3ovUj7o1Ky/tmbNmrRp06bRaWQbiqPttttug/stWLAgK1asyNChQ3PYYYdt9LHeM3fu3PTs2TPXXXddOnbsmOTdU9geffTRPP744xk6dGimT5+exx9/PNdff32OPPLIJMkhhxySY489NvPnz0/ybiyOHj06Bx98cEaPHl19/L/7u7/LWWedlWnTpuXQQw/90FkA+AuhAfAx7b333hkxYkSSd1+krlixIr/97W8zduzYrF69OhdddFFmzpyZdevW5dhjj230ufvtt1+6deuWJ554IoMHD06SXHzxxXnkkUfyta99LT169MjFF1+8SXMcccQR79vWrVu3jBo16gOPRHyUmTbFq6++mk6dOm3y/hvSo0ePdOrUKffcc0969OiRT33qU2nVqtVG3173oIMOykEHHZQ///nPmTdvXl588cXMnTs3b7zxRjp37pwk+f3vf5+tttoqhx9+ePXzWrRokS984Qu5/vrrkyTPP/98XnvttZx77rnVcEuS/fffPx07dsyjjz4qNAA+AqEB8DF16NAhvXv3brTtoIMOyurVq3PTTTdlyJAhefPNN5MkXbp0ed/nd+nSJStXrmz0eEceeWRuvvnmHHDAAdVrCzZmwoQJ1Xed2mqrrbLttttmxx133OD+H2WmTbFo0aJ069btI33OB+nYsWPGjRuX733ve++75uPDHr+hoSHXXnttbr/99qxevTo77bRT9tlnn7Rp06a6z7Jly9K5c+e0aNH4Xd2333776p+XL1+eJBkxYkQ1IP/a66+//nGeFsAWS2gAFPa5z30u99xzT15++eVss802SZKlS5dmt912a7TfkiVLsssuu1Rvz507N7fddlt69uyZO++8M//0T/+U+vr6jX69Pffc80NPbfpbH2WmTfHUU09t8lvobuwah7//+7/PkUcemQULFuRb3/pW+vbtmwkTJmTu3Lkb/JxJkyZl8uTJGTFiRI488shsvfXWSZKTTjqpus+OO+6YZcuWpaGhoVFs/OlPf6r++b2jMhdffHH69+//vq/z3t8bAJvGL+wDKGzWrFlp2bJldtlll9TX16d169b55S9/2WifJ598Mq+88kr69u2bJFm/fn2GDx+eXXfdNXfddVc++9nP5pJLLsnatWuLz7epM/3tv/5/kDVr1uSJJ57IgQceuNF9//ZF/geZNm1axo0blyFDhuTss89OfX199fSnDZkxY0b22GOPnHjiidXIWLx4cebOnZuGhoYkSf/+/bN+/fpG71xVqVTy0EMPVW/vtttu2X777fPyyy+nd+/e1Y8dd9wxY8aMyezZszf6HAH4C0c0AD6mVatWZebMmdXb69aty8MPP5z77rsvp556avUi5qFDh2b8+PHZaqutcthhh+Xll1/Oddddlz322CMnnHBCkmTixImZPXt27rjjjrRt2zYjR47MySefnLFjx2b48OFF5+7cufMmzfTev/A/+OCDOeSQQ6oXuL/njTfeyOTJk1NXV5fOnTs3+rt44403sm7dusyePTvbb7995syZkzfeeONDr+VYu3ZtRo4cmW7duuWCCy7Y5Oezzz775IYbbsikSZPSp0+fvPjii7nxxhuzbt26rFmzJsm711kceOCB+c53vpOlS5fm05/+dO69997MmTOnepSlZcuWueiii3L55ZenZcuWOeyww7JixYrccMMNWbx48Qde+A/AhgkNgI9p9uzZjd7VqU2bNtl1111z0UUX5Zxzzqluv+CCC9KlS5f87Gc/y913353OnTvn6KOPzoUXXpj27dvn2WefzcSJE3PaaadVjybsvffeGTJkSG655ZYcccQR6devX9HZNzZT8u5pTAMGDMiYMWMyffr0TJo0qdFj/OY3v8mNN96YJDn99NM/8Oucf/75OeWUU3LdddelR48eOfnkkzc4049//OMsXLgw48aNS7t27Tb5uZx77rlZtmxZbr311owfPz477bRTjj/++NTV1eXGG2/MihUr0qlTp4wdOzbXXHNNxowZk/Xr12fQoEE57bTTMnXq1OpjnXzyyenQoUNuuumm3H333Wnfvn369u2b0aNHf+RTygC2dHWVjb2dBwB8gClTpmTcuHEb/EV6G7t/c1q0aFFmzpyZQYMGNbrIftiwYVm4cGF+8Ytf1HA6gE8mRzQA+MRr0aJFhg8fnkGDBuWkk05Ky5Yt87vf/S4PPPBArr766lqPB/CJJDQA+Fi222679OzZ82PfvznttNNO+fGPf5zx48fnwgsvzPr167P77rtn9OjR7/t9IgCU4dQpAACgOG9vCwAAFCc0AACA4oQGAABQnNAAAACKExoAAEBxH+ntbf/0p5Wp9XtU1dUl22+/dZOYBTbGeqU5sV5pTqxXmpNP2np97/lszEcKjUolTeYvpynNAhtjvdKcWK80J9YrzcmWtl6dOgUAABQ
},
"metadata": {},
"output_type": "display_data"
}
],
"execution_count": 172
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.245693Z",
"start_time": "2024-12-07T07:33:51.236859Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# Количество пустых значений признаков\n",
"print(df.isnull().sum())\n",
"\n",
"print()\n",
"\n",
"# Есть ли пустые значения признаков\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
],
"id": "84cf47a513b9f258",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0\n",
"gender 0\n",
"age 0\n",
"hypertension 0\n",
"heart_disease 0\n",
"ever_married 0\n",
"work_type 0\n",
"Residence_type 0\n",
"avg_glucose_level 0\n",
"bmi 201\n",
"smoking_status 0\n",
"stroke 0\n",
"dtype: int64\n",
"\n",
"id False\n",
"gender False\n",
"age False\n",
"hypertension False\n",
"heart_disease False\n",
"ever_married False\n",
"work_type False\n",
"Residence_type False\n",
"avg_glucose_level False\n",
"bmi True\n",
"smoking_status False\n",
"stroke False\n",
"dtype: bool\n",
"\n",
"bmi процент пустых значений: %3.93\n"
]
}
],
"execution_count": 173
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.304644Z",
"start_time": "2024-12-07T07:33:51.298975Z"
}
},
"cell_type": "code",
"source": [
"# Замена пустых данных на медиану\n",
"df[\"bmi\"] = df[\"bmi\"].fillna(df[\"bmi\"].median())\n",
"\n",
"# Процент пустых значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f\"{i} процент пустых значений: %{null_rate:.2f}\")"
],
"id": "ba00afd3f040bc81",
"outputs": [],
"execution_count": 174
},
{
"metadata": {},
"cell_type": "markdown",
"source": "3) Унитарное кодирование",
"id": "858e690bed6f98dd"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.334790Z",
"start_time": "2024-12-07T07:33:51.311404Z"
}
},
"cell_type": "code",
"source": [
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
"\n",
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"\n",
"df = df[[\"age\", \"gender\", \"hypertension\", \"work_type\", \"avg_glucose_level\", \"bmi\", \"smoking_status\"]]\n",
"df = df.query('gender == \"Male\" or gender == \"Female\"')\n",
"encoded_values = encoder.fit_transform(df[[\"gender\", \"work_type\", \"smoking_status\"]])\n",
"\n",
"encoded_columns = encoder.get_feature_names_out([\"gender\", \"work_type\", \"smoking_status\"])\n",
"\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns)\n",
"\n",
"df = pd.concat([df, encoded_values_df], axis=1)\n",
"\n",
"df"
],
"id": "ec19b16d7410cbff",
"outputs": [
{
"data": {
"text/plain": [
" age gender hypertension work_type avg_glucose_level bmi \\\n",
"0 67.0 Male 0.0 Private 228.69 36.6 \n",
"1 61.0 Female 0.0 Self-employed 202.21 28.1 \n",
"2 80.0 Male 0.0 Private 105.92 32.5 \n",
"3 49.0 Female 0.0 Private 171.23 34.4 \n",
"4 79.0 Female 1.0 Self-employed 174.12 24.0 \n",
"... ... ... ... ... ... ... \n",
"5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n",
"5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n",
"5108 51.0 Male 0.0 Private 166.29 25.6 \n",
"5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n",
"3116 NaN NaN NaN NaN NaN NaN \n",
"\n",
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
"0 formerly smoked 1.0 0.0 1.0 \n",
"1 never smoked 0.0 0.0 0.0 \n",
"2 never smoked 1.0 0.0 1.0 \n",
"3 smokes 0.0 0.0 1.0 \n",
"4 never smoked 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"5106 never smoked 0.0 0.0 0.0 \n",
"5107 never smoked 1.0 0.0 1.0 \n",
"5108 formerly smoked 0.0 0.0 0.0 \n",
"5109 Unknown NaN NaN NaN \n",
"3116 NaN 0.0 0.0 0.0 \n",
"\n",
" work_type_Self-employed work_type_children \\\n",
"0 0.0 0.0 \n",
"1 1.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 1.0 0.0 \n",
"... ... ... \n",
"5106 1.0 0.0 \n",
"5107 0.0 0.0 \n",
"5108 0.0 0.0 \n",
"5109 NaN NaN \n",
"3116 0.0 1.0 \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
"0 1.0 0.0 \n",
"1 0.0 1.0 \n",
"2 0.0 1.0 \n",
"3 0.0 0.0 \n",
"4 0.0 1.0 \n",
"... ... ... \n",
"5106 0.0 1.0 \n",
"5107 1.0 0.0 \n",
"5108 0.0 0.0 \n",
"5109 NaN NaN \n",
"3116 0.0 0.0 \n",
"\n",
" smoking_status_smokes \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 1.0 \n",
"4 0.0 \n",
"... ... \n",
"5106 0.0 \n",
"5107 0.0 \n",
"5108 0.0 \n",
"5109 NaN \n",
"3116 0.0 \n",
"\n",
"[5110 rows x 15 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>hypertension</th>\n",
" <th>work_type</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>smoking_status</th>\n",
" <th>gender_Male</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>67.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>228.69</td>\n",
" <td>36.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>61.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>202.21</td>\n",
" <td>28.1</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>80.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>105.92</td>\n",
" <td>32.5</td>\n",
" <td>never smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>49.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>171.23</td>\n",
" <td>34.4</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>79.0</td>\n",
" <td>Female</td>\n",
" <td>1.0</td>\n",
" <td>Self-employed</td>\n",
" <td>174.12</td>\n",
" <td>24.0</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5106</th>\n",
" <td>81.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>125.20</td>\n",
" <td>40.0</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5107</th>\n",
" <td>35.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>82.99</td>\n",
" <td>30.6</td>\n",
" <td>never smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5108</th>\n",
" <td>51.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>166.29</td>\n",
" <td>25.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5109</th>\n",
" <td>44.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Govt_job</td>\n",
" <td>85.28</td>\n",
" <td>26.2</td>\n",
" <td>Unknown</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3116</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5110 rows × 15 columns</p>\n",
"</div>"
]
},
"execution_count": 175,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 175
},
{
"metadata": {},
"cell_type": "markdown",
"source": "4) Дискретизация признаков",
"id": "cc934d2268784440"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.401444Z",
"start_time": "2024-12-07T07:33:51.386817Z"
}
},
"cell_type": "code",
"source": [
"df[\"age\"] = pd.qcut(df[\"age\"], q=5, labels=False)\n",
"df"
],
"id": "b9a70c0c56176f98",
"outputs": [
{
"data": {
"text/plain": [
" age gender hypertension work_type avg_glucose_level bmi \\\n",
"0 4.0 Male 0.0 Private 228.69 36.6 \n",
"1 3.0 Female 0.0 Self-employed 202.21 28.1 \n",
"2 4.0 Male 0.0 Private 105.92 32.5 \n",
"3 2.0 Female 0.0 Private 171.23 34.4 \n",
"4 4.0 Female 1.0 Self-employed 174.12 24.0 \n",
"... ... ... ... ... ... ... \n",
"5106 4.0 Female 0.0 Self-employed 125.20 40.0 \n",
"5107 1.0 Female 0.0 Self-employed 82.99 30.6 \n",
"5108 2.0 Male 0.0 Private 166.29 25.6 \n",
"5109 2.0 Female 0.0 Govt_job 85.28 26.2 \n",
"3116 NaN NaN NaN NaN NaN NaN \n",
"\n",
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
"0 formerly smoked 1.0 0.0 1.0 \n",
"1 never smoked 0.0 0.0 0.0 \n",
"2 never smoked 1.0 0.0 1.0 \n",
"3 smokes 0.0 0.0 1.0 \n",
"4 never smoked 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"5106 never smoked 0.0 0.0 0.0 \n",
"5107 never smoked 1.0 0.0 1.0 \n",
"5108 formerly smoked 0.0 0.0 0.0 \n",
"5109 Unknown NaN NaN NaN \n",
"3116 NaN 0.0 0.0 0.0 \n",
"\n",
" work_type_Self-employed work_type_children \\\n",
"0 0.0 0.0 \n",
"1 1.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 1.0 0.0 \n",
"... ... ... \n",
"5106 1.0 0.0 \n",
"5107 0.0 0.0 \n",
"5108 0.0 0.0 \n",
"5109 NaN NaN \n",
"3116 0.0 1.0 \n",
"\n",
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
"0 1.0 0.0 \n",
"1 0.0 1.0 \n",
"2 0.0 1.0 \n",
"3 0.0 0.0 \n",
"4 0.0 1.0 \n",
"... ... ... \n",
"5106 0.0 1.0 \n",
"5107 1.0 0.0 \n",
"5108 0.0 0.0 \n",
"5109 NaN NaN \n",
"3116 0.0 0.0 \n",
"\n",
" smoking_status_smokes \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 1.0 \n",
"4 0.0 \n",
"... ... \n",
"5106 0.0 \n",
"5107 0.0 \n",
"5108 0.0 \n",
"5109 NaN \n",
"3116 0.0 \n",
"\n",
"[5110 rows x 15 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>gender</th>\n",
" <th>hypertension</th>\n",
" <th>work_type</th>\n",
" <th>avg_glucose_level</th>\n",
" <th>bmi</th>\n",
" <th>smoking_status</th>\n",
" <th>gender_Male</th>\n",
" <th>work_type_Never_worked</th>\n",
" <th>work_type_Private</th>\n",
" <th>work_type_Self-employed</th>\n",
" <th>work_type_children</th>\n",
" <th>smoking_status_formerly smoked</th>\n",
" <th>smoking_status_never smoked</th>\n",
" <th>smoking_status_smokes</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>228.69</td>\n",
" <td>36.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>202.21</td>\n",
" <td>28.1</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>105.92</td>\n",
" <td>32.5</td>\n",
" <td>never smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>171.23</td>\n",
" <td>34.4</td>\n",
" <td>smokes</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.0</td>\n",
" <td>Female</td>\n",
" <td>1.0</td>\n",
" <td>Self-employed</td>\n",
" <td>174.12</td>\n",
" <td>24.0</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5106</th>\n",
" <td>4.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>125.20</td>\n",
" <td>40.0</td>\n",
" <td>never smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5107</th>\n",
" <td>1.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Self-employed</td>\n",
" <td>82.99</td>\n",
" <td>30.6</td>\n",
" <td>never smoked</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5108</th>\n",
" <td>2.0</td>\n",
" <td>Male</td>\n",
" <td>0.0</td>\n",
" <td>Private</td>\n",
" <td>166.29</td>\n",
" <td>25.6</td>\n",
" <td>formerly smoked</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5109</th>\n",
" <td>2.0</td>\n",
" <td>Female</td>\n",
" <td>0.0</td>\n",
" <td>Govt_job</td>\n",
" <td>85.28</td>\n",
" <td>26.2</td>\n",
" <td>Unknown</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3116</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5110 rows × 15 columns</p>\n",
"</div>"
]
},
"execution_count": 176,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 176
},
{
"metadata": {},
"cell_type": "markdown",
"source": "3) Разбиение данных",
"id": "7c5387ab7d3b9349"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.496592Z",
"start_time": "2024-12-07T07:33:51.456961Z"
}
},
"cell_type": "code",
"source": [
"# Функция для создания выборок\n",
"from sklearn.model_selection import train_test_split\n",
"dropna_df = df.dropna()\n",
"\n",
"df_input = dropna_df[[\n",
" \"age\",\n",
" \"hypertension\", \n",
" \"avg_glucose_level\", \n",
" \"bmi\", \n",
" \"gender_Male\",\n",
" \"work_type_Never_worked\",\n",
" \"work_type_Private\",\n",
" \"work_type_Self-employed\",\n",
" \"work_type_children\",\n",
" \"smoking_status_formerly smoked\",\n",
" \"smoking_status_never smoked\",\n",
" \"smoking_status_smokes\",\n",
"]]\n",
"\n",
"print(df_input.head())\n",
"\n",
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
"\n",
"# Разделение остатка на контрольную и тестовую выборки\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"# Проверка размеров выборок\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))\n",
"\n",
"# Сохранение выборок в файлы\n",
"train_df.to_csv(\".//static//csv//train_data.csv\", index=False)\n",
"val_df.to_csv(\".//static//csv//val_data.csv\", index=False)\n",
"test_df.to_csv(\".//static//csv//test_data.csv\", index=False)"
],
"id": "8c9949a919295290",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" age hypertension avg_glucose_level bmi gender_Male \\\n",
"0 4.0 0.0 228.69 36.6 1.0 \n",
"1 3.0 0.0 202.21 28.1 0.0 \n",
"2 4.0 0.0 105.92 32.5 1.0 \n",
"3 2.0 0.0 171.23 34.4 0.0 \n",
"4 4.0 1.0 174.12 24.0 0.0 \n",
"\n",
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
"0 0.0 1.0 0.0 \n",
"1 0.0 0.0 1.0 \n",
"2 0.0 1.0 0.0 \n",
"3 0.0 1.0 0.0 \n",
"4 0.0 0.0 1.0 \n",
"\n",
" work_type_children smoking_status_formerly smoked \\\n",
"0 0.0 1.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"\n",
" smoking_status_never smoked smoking_status_smokes \n",
"0 0.0 0.0 \n",
"1 1.0 0.0 \n",
"2 1.0 0.0 \n",
"3 0.0 1.0 \n",
"4 1.0 0.0 \n",
"Размер обучающей выборки: 3064\n",
"Размер контрольной выборки: 1022\n",
"Размер тестовой выборки: 1022\n"
]
}
],
"execution_count": 177
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-12-07T07:33:51.637107Z",
"start_time": "2024-12-07T07:33:51.601412Z"
}
},
"cell_type": "code",
"source": [
"train_df = pd.read_csv(\".//static//csv//train_data.csv\")\n",
"val_df = pd.read_csv(\".//static//csv//val_data.csv\")\n",
"test_df = pd.read_csv(\".//static//csv//test_data.csv\")\n",
"\n",
"def check_balance(df, name):\n",
" print(f\"Распределение gender в {name}:\")\n",
" print(f\"Процент gender_Male: {df[\"gender_Male\"].value_counts()[1.0] / len(df) * 100:.2f}%\")\n",
" print(f\"Процент gender_Female: {(len(df) - df[\"gender_Male\"].value_counts()[1.0]) / len(df) * 100:.2f}%\")\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
],
"id": "79b2248eb6438fa5",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение gender в обучающей выборке:\n",
"Процент gender_Male: 41.87%\n",
"Процент gender_Female: 58.13%\n",
"\n",
"Распределение gender в контрольной выборке:\n",
"Процент gender_Male: 40.41%\n",
"Процент gender_Female: 59.59%\n",
"\n",
"Распределение gender в тестовой выборке:\n",
"Процент gender_Male: 41.00%\n",
"Процент gender_Female: 59.00%\n",
"\n"
]
}
],
"execution_count": 178
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Выборка сбалансирована",
"id": "a6436f11045161c4"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}