import matplotlib.pyplot as plt
import pandas as pd
from prompt_toolkit.shortcuts.progress_bar import Percentage

# Step 2 (data preparation): read the stroke dataset from the working directory.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

# Print the column index first, then the frame itself.
for preview in (df.columns, df):
    print(preview)
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the `age` column, drawn on an explicitly created 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['age'], ax=ax)
ax.set_title('Box Plot для age')
ax.set_xlabel('age')
plt.show()
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np


def _report_null_rates(frame):
    """Print the percentage of missing values for each column of ``frame`` that has any.

    Columns without missing values are skipped, so nothing is printed for a
    fully populated frame.
    """
    for column in frame.columns:
        null_rate = frame[column].isnull().sum() / len(frame) * 100
        if null_rate > 0:
            print(f"{column} процент пустых значений: %{null_rate:.2f}")


# --- Missing-value report ---------------------------------------------------
# Number of missing values per feature.
print(df.isnull().sum())

print()

# Whether each feature has any missing value.
print(df.isnull().any())

print()

# Percentage of missing values per feature.
_report_null_rates(df)

# --- Imputation -------------------------------------------------------------
# Replace missing `bmi` values with the column median.
df["bmi"] = df["bmi"].fillna(df["bmi"].median())

# Report again; prints nothing once no column has missing values.
_report_null_rates(df)

# --- Step 3: one-hot encoding -----------------------------------------------
encoder = OneHotEncoder(sparse_output=False, drop="first")

categorical_columns = ["gender", "work_type", "smoking_status"]

# NOTE(review): `stroke` is not part of this selection, so the label column is
# dropped from `df` here — confirm this is intended.
df = df[["age", "gender", "hypertension", "work_type", "avg_glucose_level", "bmi", "smoking_status"]]

# Keep only rows whose gender is "Male" or "Female". The filter keeps the
# original index labels, so the index is no longer a contiguous 0..n-1 range.
df = df.query('gender == "Male" or gender == "Female"')

encoded_values = encoder.fit_transform(df[categorical_columns])
encoded_columns = encoder.get_feature_names_out(categorical_columns)

# The encoded array is positional. Reuse the filtered frame's index so that
# `pd.concat(axis=1)` (which aligns on index labels) pairs every row with its
# own encoding. With a default RangeIndex the rows after the removed label are
# shifted by one and two half-empty NaN rows appear.
encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=df.index)

df = pd.concat([df, encoded_values_df], axis=1)

df
\n", + "5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n", + "5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n", + "5108 51.0 Male 0.0 Private 166.29 25.6 \n", + "5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n", + "3116 NaN NaN NaN NaN NaN NaN \n", + "\n", + " smoking_status gender_Male work_type_Never_worked work_type_Private \\\n", + "0 formerly smoked 1.0 0.0 1.0 \n", + "1 never smoked 0.0 0.0 0.0 \n", + "2 never smoked 1.0 0.0 1.0 \n", + "3 smokes 0.0 0.0 1.0 \n", + "4 never smoked 0.0 0.0 0.0 \n", + "... ... ... ... ... \n", + "5106 never smoked 0.0 0.0 0.0 \n", + "5107 never smoked 1.0 0.0 1.0 \n", + "5108 formerly smoked 0.0 0.0 0.0 \n", + "5109 Unknown NaN NaN NaN \n", + "3116 NaN 0.0 0.0 0.0 \n", + "\n", + " work_type_Self-employed work_type_children \\\n", + "0 0.0 0.0 \n", + "1 1.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "... ... ... \n", + "5106 1.0 0.0 \n", + "5107 0.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 1.0 \n", + "\n", + " smoking_status_formerly smoked smoking_status_never smoked \\\n", + "0 1.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 0.0 \n", + "4 0.0 1.0 \n", + "... ... ... \n", + "5106 0.0 1.0 \n", + "5107 1.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 0.0 \n", + "\n", + " smoking_status_smokes \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 \n", + "... ... \n", + "5106 0.0 \n", + "5107 0.0 \n", + "5108 0.0 \n", + "5109 NaN \n", + "3116 0.0 \n", + "\n", + "[5110 rows x 15 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
agegenderhypertensionwork_typeavg_glucose_levelbmismoking_statusgender_Malework_type_Never_workedwork_type_Privatework_type_Self-employedwork_type_childrensmoking_status_formerly smokedsmoking_status_never smokedsmoking_status_smokes
067.0Male0.0Private228.6936.6formerly smoked1.00.01.00.00.01.00.00.0
161.0Female0.0Self-employed202.2128.1never smoked0.00.00.01.00.00.01.00.0
280.0Male0.0Private105.9232.5never smoked1.00.01.00.00.00.01.00.0
349.0Female0.0Private171.2334.4smokes0.00.01.00.00.00.00.01.0
479.0Female1.0Self-employed174.1224.0never smoked0.00.00.01.00.00.01.00.0
................................................
510681.0Female0.0Self-employed125.2040.0never smoked0.00.00.01.00.00.01.00.0
510735.0Female0.0Self-employed82.9930.6never smoked1.00.01.00.00.01.00.00.0
510851.0Male0.0Private166.2925.6formerly smoked0.00.00.00.00.00.00.00.0
510944.0Female0.0Govt_job85.2826.2UnknownNaNNaNNaNNaNNaNNaNNaNNaN
3116NaNNaNNaNNaNNaNNaNNaN0.00.00.00.01.00.00.00.0
\n", + "

5110 rows × 15 columns

\n", + "
# Step 4 (feature discretisation): bin `age` into five quantile-based buckets.
# `labels=False` yields the integer bucket number instead of an interval label.
age_bins = pd.qcut(df["age"], q=5, labels=False)
df["age"] = age_bins
df
\n", + "5106 1.0 0.0 \n", + "5107 0.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 1.0 \n", + "\n", + " smoking_status_formerly smoked smoking_status_never smoked \\\n", + "0 1.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 0.0 \n", + "4 0.0 1.0 \n", + "... ... ... \n", + "5106 0.0 1.0 \n", + "5107 1.0 0.0 \n", + "5108 0.0 0.0 \n", + "5109 NaN NaN \n", + "3116 0.0 0.0 \n", + "\n", + " smoking_status_smokes \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 1.0 \n", + "4 0.0 \n", + "... ... \n", + "5106 0.0 \n", + "5107 0.0 \n", + "5108 0.0 \n", + "5109 NaN \n", + "3116 0.0 \n", + "\n", + "[5110 rows x 15 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
agegenderhypertensionwork_typeavg_glucose_levelbmismoking_statusgender_Malework_type_Never_workedwork_type_Privatework_type_Self-employedwork_type_childrensmoking_status_formerly smokedsmoking_status_never smokedsmoking_status_smokes
04.0Male0.0Private228.6936.6formerly smoked1.00.01.00.00.01.00.00.0
13.0Female0.0Self-employed202.2128.1never smoked0.00.00.01.00.00.01.00.0
24.0Male0.0Private105.9232.5never smoked1.00.01.00.00.00.01.00.0
32.0Female0.0Private171.2334.4smokes0.00.01.00.00.00.00.01.0
44.0Female1.0Self-employed174.1224.0never smoked0.00.00.01.00.00.01.00.0
................................................
51064.0Female0.0Self-employed125.2040.0never smoked0.00.00.01.00.00.01.00.0
51071.0Female0.0Self-employed82.9930.6never smoked1.00.01.00.00.01.00.00.0
51082.0Male0.0Private166.2925.6formerly smoked0.00.00.00.00.00.00.00.0
51092.0Female0.0Govt_job85.2826.2UnknownNaNNaNNaNNaNNaNNaNNaNNaN
3116NaNNaNNaNNaNNaNNaNNaN0.00.00.00.01.00.00.00.0
\n", + "

5110 rows × 15 columns

\n", + "
import os

from sklearn.model_selection import train_test_split

# Directory that receives the three split files. Spelled exactly like the
# prefix of the original CSV paths so the files are written to the same place.
OUTPUT_DIR = ".//static//csv"

# Drop every row that still contains a missing value before selecting features.
dropna_df = df.dropna()

# Numeric / one-hot columns only; the raw categorical columns are left out.
df_input = dropna_df[[
    "age",
    "hypertension",
    "avg_glucose_level",
    "bmi",
    "gender_Male",
    "work_type_Never_worked",
    "work_type_Private",
    "work_type_Self-employed",
    "work_type_children",
    "smoking_status_formerly smoked",
    "smoking_status_never smoked",
    "smoking_status_smokes",
]]

print(df_input.head())

# 60 % of the rows go to the training set ...
train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)

# ... and the remaining 40 % is halved into validation and test sets.
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report the size of each split.
print("Размер обучающей выборки:", len(train_df))
print("Размер контрольной выборки:", len(val_df))
print("Размер тестовой выборки:", len(test_df))

# Persist the splits. The target directory is created first so that a fresh
# checkout (without ./static/csv) does not make `to_csv` fail with OSError.
os.makedirs(OUTPUT_DIR, exist_ok=True)
train_df.to_csv(f"{OUTPUT_DIR}//train_data.csv", index=False)
val_df.to_csv(f"{OUTPUT_DIR}//val_data.csv", index=False)
test_df.to_csv(f"{OUTPUT_DIR}//test_data.csv", index=False)
# Reload the three splits from the files written by the split step.
train_df = pd.read_csv(".//static//csv//train_data.csv")
val_df = pd.read_csv(".//static//csv//val_data.csv")
test_df = pd.read_csv(".//static//csv//test_data.csv")


def check_balance(df, name):
    """Print the male / female percentage split of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``gender_Male`` column; rows equal to ``1.0`` are counted
        as male, every other row as female.
    name : str
        Label of the split, interpolated into the heading line.

    Notes
    -----
    The count is taken with a boolean sum instead of ``value_counts()[1.0]``,
    so a split without any male rows reports 0.00 % rather than raising
    ``KeyError``. An empty frame reports ``nan``. The percentages are computed
    before formatting, which keeps the f-strings free of nested double quotes
    (those only parse on Python 3.12+).
    """
    total = len(df)
    male_count = int((df["gender_Male"] == 1.0).sum())

    if total:
        male_pct = male_count / total * 100
        female_pct = (total - male_count) / total * 100
    else:
        # No rows: the shares are undefined.
        male_pct = female_pct = float("nan")

    print(f"Распределение gender в {name}:")
    print(f"Процент gender_Male: {male_pct:.2f}%")
    print(f"Процент gender_Female: {female_pct:.2f}%")
    print()


check_balance(train_df, "обучающей выборке")
check_balance(val_df, "контрольной выборке")
check_balance(test_df, "тестовой выборке")
gender в тестовой выборке:\n", + "Процент gender_Male: 41.00%\n", + "Процент gender_Female: 59.00%\n", + "\n" + ] + } + ], + "execution_count": 178 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Выборка сбалансирована", + "id": "a6436f11045161c4" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}