1067 lines
50 KiB
Plaintext
1067 lines
50 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "1) Бизнес-цель — определение наличия заболевания у человека",
|
||
"id": "54c08440669b8de7"
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "2) Подготовка данных",
|
||
"id": "5d090ddc69b152cf"
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"id": "initial_id",
|
||
"metadata": {
|
||
"collapsed": true,
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.107138Z",
|
||
"start_time": "2024-12-07T07:33:51.094517Z"
|
||
}
|
||
},
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import pandas as pd\n",
|
||
"from prompt_toolkit.shortcuts.progress_bar import Percentage\n",
|
||
"\n",
|
||
"# Load the stroke dataset, then print its column index followed by the frame itself.\n",
"df = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
"print(df.columns, df, sep=\"\\n\")"
|
||
],
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',\n",
|
||
" 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',\n",
|
||
" 'smoking_status', 'stroke'],\n",
|
||
" dtype='object')\n",
|
||
" id gender age hypertension heart_disease ever_married \\\n",
|
||
"0 9046 Male 67.0 0 1 Yes \n",
|
||
"1 51676 Female 61.0 0 0 Yes \n",
|
||
"2 31112 Male 80.0 0 1 Yes \n",
|
||
"3 60182 Female 49.0 0 0 Yes \n",
|
||
"4 1665 Female 79.0 1 0 Yes \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5105 18234 Female 80.0 1 0 Yes \n",
|
||
"5106 44873 Female 81.0 0 0 Yes \n",
|
||
"5107 19723 Female 35.0 0 0 Yes \n",
|
||
"5108 37544 Male 51.0 0 0 Yes \n",
|
||
"5109 44679 Female 44.0 0 0 Yes \n",
|
||
"\n",
|
||
" work_type Residence_type avg_glucose_level bmi smoking_status \\\n",
|
||
"0 Private Urban 228.69 36.6 formerly smoked \n",
|
||
"1 Self-employed Rural 202.21 NaN never smoked \n",
|
||
"2 Private Rural 105.92 32.5 never smoked \n",
|
||
"3 Private Urban 171.23 34.4 smokes \n",
|
||
"4 Self-employed Rural 174.12 24.0 never smoked \n",
|
||
"... ... ... ... ... ... \n",
|
||
"5105 Private Urban 83.75 NaN never smoked \n",
|
||
"5106 Self-employed Urban 125.20 40.0 never smoked \n",
|
||
"5107 Self-employed Rural 82.99 30.6 never smoked \n",
|
||
"5108 Private Rural 166.29 25.6 formerly smoked \n",
|
||
"5109 Govt_job Urban 85.28 26.2 Unknown \n",
|
||
"\n",
|
||
" stroke \n",
|
||
"0 1 \n",
|
||
"1 1 \n",
|
||
"2 1 \n",
|
||
"3 1 \n",
|
||
"4 1 \n",
|
||
"... ... \n",
|
||
"5105 0 \n",
|
||
"5106 0 \n",
|
||
"5107 0 \n",
|
||
"5108 0 \n",
|
||
"5109 0 \n",
|
||
"\n",
|
||
"[5110 rows x 12 columns]\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 171
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.208159Z",
|
||
"start_time": "2024-12-07T07:33:51.118240Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import seaborn as sns\n",
|
||
"\n",
|
||
"# Box plot of the age column, drawn on an explicitly created 10x6 inch figure.\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"sns.boxplot(x=df['age'], ax=ax)\n",
"ax.set_title('Box Plot для age')\n",
"ax.set_xlabel('age')\n",
"plt.show()"
|
||
],
|
||
"id": "ea1a7ed0e6d7d189",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"<Figure size 1000x600 with 1 Axes>"
|
||
],
|
||
"image/png": ""
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"execution_count": 172
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.245693Z",
|
||
"start_time": "2024-12-07T07:33:51.236859Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"\n",
|
||
"# Number of missing values per feature\n",
"missing_counts = df.isnull().sum()\n",
"print(missing_counts)\n",
"\n",
"print()\n",
"\n",
"# Whether each feature contains any missing value\n",
"print(df.isnull().any())\n",
"\n",
"print()\n",
"\n",
"# Percentage of missing values, printed only for features that have some\n",
"missing_pct = missing_counts / len(df) * 100\n",
"for column, pct in missing_pct[missing_pct > 0].items():\n",
"    print(f\"{column} процент пустых значений: %{pct:.2f}\")"
|
||
],
|
||
"id": "84cf47a513b9f258",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"id 0\n",
|
||
"gender 0\n",
|
||
"age 0\n",
|
||
"hypertension 0\n",
|
||
"heart_disease 0\n",
|
||
"ever_married 0\n",
|
||
"work_type 0\n",
|
||
"Residence_type 0\n",
|
||
"avg_glucose_level 0\n",
|
||
"bmi 201\n",
|
||
"smoking_status 0\n",
|
||
"stroke 0\n",
|
||
"dtype: int64\n",
|
||
"\n",
|
||
"id False\n",
|
||
"gender False\n",
|
||
"age False\n",
|
||
"hypertension False\n",
|
||
"heart_disease False\n",
|
||
"ever_married False\n",
|
||
"work_type False\n",
|
||
"Residence_type False\n",
|
||
"avg_glucose_level False\n",
|
||
"bmi True\n",
|
||
"smoking_status False\n",
|
||
"stroke False\n",
|
||
"dtype: bool\n",
|
||
"\n",
|
||
"bmi процент пустых значений: %3.93\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 173
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.304644Z",
|
||
"start_time": "2024-12-07T07:33:51.298975Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Replace missing BMI values with the column median\n",
"bmi_median = df[\"bmi\"].median()\n",
"df[\"bmi\"] = df[\"bmi\"].fillna(bmi_median)\n",
"\n",
"# Percentage of missing values per feature; prints nothing when no feature has any\n",
"remaining_pct = df.isnull().sum() / len(df) * 100\n",
"for column, pct in remaining_pct[remaining_pct > 0].items():\n",
"    print(f\"{column} процент пустых значений: %{pct:.2f}\")"
|
||
],
|
||
"id": "ba00afd3f040bc81",
|
||
"outputs": [],
|
||
"execution_count": 174
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "3) Унитарное кодирование",
|
||
"id": "858e690bed6f98dd"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.334790Z",
|
||
"start_time": "2024-12-07T07:33:51.311404Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"encoder = OneHotEncoder(sparse_output=False, drop=\"first\")\n",
"\n",
"# Keep only the features used in the rest of the notebook.\n",
"df = df[[\"age\", \"gender\", \"hypertension\", \"work_type\", \"avg_glucose_level\", \"bmi\", \"smoking_status\"]]\n",
"# Keep only rows whose gender is \"Male\" or \"Female\"; filtering can leave gaps in the row index.\n",
"df = df.query('gender == \"Male\" or gender == \"Female\"')\n",
"\n",
"categorical_columns = [\"gender\", \"work_type\", \"smoking_status\"]\n",
"encoded_values = encoder.fit_transform(df[categorical_columns])\n",
"encoded_columns = encoder.get_feature_names_out(categorical_columns)\n",
"\n",
"# Reuse df's index for the encoded frame: with a default 0..n-1 index, concat(axis=1)\n",
"# aligns on labels, which shifts the encoded rows after a gap and adds NaN-padded rows.\n",
"encoded_values_df = pd.DataFrame(encoded_values, columns=encoded_columns, index=df.index)\n",
"\n",
"df = pd.concat([df, encoded_values_df], axis=1)\n",
"\n",
"# Sanity check: alignment must neither add rows nor leave gaps in the encoded columns.\n",
"assert len(df) == len(encoded_values_df)\n",
"assert df[list(encoded_columns)].notna().all().all()\n",
"\n",
"df"
|
||
],
|
||
"id": "ec19b16d7410cbff",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
||
"0 67.0 Male 0.0 Private 228.69 36.6 \n",
|
||
"1 61.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
||
"2 80.0 Male 0.0 Private 105.92 32.5 \n",
|
||
"3 49.0 Female 0.0 Private 171.23 34.4 \n",
|
||
"4 79.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5106 81.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
||
"5107 35.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
||
"5108 51.0 Male 0.0 Private 166.29 25.6 \n",
|
||
"5109 44.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
||
"3116 NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
||
"0 formerly smoked 1.0 0.0 1.0 \n",
|
||
"1 never smoked 0.0 0.0 0.0 \n",
|
||
"2 never smoked 1.0 0.0 1.0 \n",
|
||
"3 smokes 0.0 0.0 1.0 \n",
|
||
"4 never smoked 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"5106 never smoked 0.0 0.0 0.0 \n",
|
||
"5107 never smoked 1.0 0.0 1.0 \n",
|
||
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
||
"5109 Unknown NaN NaN NaN \n",
|
||
"3116 NaN 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Self-employed work_type_children \\\n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"... ... ... \n",
|
||
"5106 1.0 0.0 \n",
|
||
"5107 0.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 1.0 \n",
|
||
"\n",
|
||
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
||
"0 1.0 0.0 \n",
|
||
"1 0.0 1.0 \n",
|
||
"2 0.0 1.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 1.0 \n",
|
||
"... ... ... \n",
|
||
"5106 0.0 1.0 \n",
|
||
"5107 1.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_smokes \n",
|
||
"0 0.0 \n",
|
||
"1 0.0 \n",
|
||
"2 0.0 \n",
|
||
"3 1.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"5106 0.0 \n",
|
||
"5107 0.0 \n",
|
||
"5108 0.0 \n",
|
||
"5109 NaN \n",
|
||
"3116 0.0 \n",
|
||
"\n",
|
||
"[5110 rows x 15 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>hypertension</th>\n",
|
||
" <th>work_type</th>\n",
|
||
" <th>avg_glucose_level</th>\n",
|
||
" <th>bmi</th>\n",
|
||
" <th>smoking_status</th>\n",
|
||
" <th>gender_Male</th>\n",
|
||
" <th>work_type_Never_worked</th>\n",
|
||
" <th>work_type_Private</th>\n",
|
||
" <th>work_type_Self-employed</th>\n",
|
||
" <th>work_type_children</th>\n",
|
||
" <th>smoking_status_formerly smoked</th>\n",
|
||
" <th>smoking_status_never smoked</th>\n",
|
||
" <th>smoking_status_smokes</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>67.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>228.69</td>\n",
|
||
" <td>36.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>61.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>202.21</td>\n",
|
||
" <td>28.1</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>105.92</td>\n",
|
||
" <td>32.5</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>49.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>171.23</td>\n",
|
||
" <td>34.4</td>\n",
|
||
" <td>smokes</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>79.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>174.12</td>\n",
|
||
" <td>24.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5106</th>\n",
|
||
" <td>81.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>125.20</td>\n",
|
||
" <td>40.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5107</th>\n",
|
||
" <td>35.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>82.99</td>\n",
|
||
" <td>30.6</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5108</th>\n",
|
||
" <td>51.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>166.29</td>\n",
|
||
" <td>25.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5109</th>\n",
|
||
" <td>44.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Govt_job</td>\n",
|
||
" <td>85.28</td>\n",
|
||
" <td>26.2</td>\n",
|
||
" <td>Unknown</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3116</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5110 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 175,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 175
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "4) Дискретизация признаков",
|
||
"id": "cc934d2268784440"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.401444Z",
|
||
"start_time": "2024-12-07T07:33:51.386817Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Discretize age into 5 quantile-based bins; labels=False keeps bin indices instead of interval labels.\n",
"age_bins = pd.qcut(df[\"age\"], q=5, labels=False)\n",
"df[\"age\"] = age_bins\n",
"df"
|
||
],
|
||
"id": "b9a70c0c56176f98",
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
" age gender hypertension work_type avg_glucose_level bmi \\\n",
|
||
"0 4.0 Male 0.0 Private 228.69 36.6 \n",
|
||
"1 3.0 Female 0.0 Self-employed 202.21 28.1 \n",
|
||
"2 4.0 Male 0.0 Private 105.92 32.5 \n",
|
||
"3 2.0 Female 0.0 Private 171.23 34.4 \n",
|
||
"4 4.0 Female 1.0 Self-employed 174.12 24.0 \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"5106 4.0 Female 0.0 Self-employed 125.20 40.0 \n",
|
||
"5107 1.0 Female 0.0 Self-employed 82.99 30.6 \n",
|
||
"5108 2.0 Male 0.0 Private 166.29 25.6 \n",
|
||
"5109 2.0 Female 0.0 Govt_job 85.28 26.2 \n",
|
||
"3116 NaN NaN NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" smoking_status gender_Male work_type_Never_worked work_type_Private \\\n",
|
||
"0 formerly smoked 1.0 0.0 1.0 \n",
|
||
"1 never smoked 0.0 0.0 0.0 \n",
|
||
"2 never smoked 1.0 0.0 1.0 \n",
|
||
"3 smokes 0.0 0.0 1.0 \n",
|
||
"4 never smoked 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"5106 never smoked 0.0 0.0 0.0 \n",
|
||
"5107 never smoked 1.0 0.0 1.0 \n",
|
||
"5108 formerly smoked 0.0 0.0 0.0 \n",
|
||
"5109 Unknown NaN NaN NaN \n",
|
||
"3116 NaN 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Self-employed work_type_children \\\n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"... ... ... \n",
|
||
"5106 1.0 0.0 \n",
|
||
"5107 0.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 1.0 \n",
|
||
"\n",
|
||
" smoking_status_formerly smoked smoking_status_never smoked \\\n",
|
||
"0 1.0 0.0 \n",
|
||
"1 0.0 1.0 \n",
|
||
"2 0.0 1.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 1.0 \n",
|
||
"... ... ... \n",
|
||
"5106 0.0 1.0 \n",
|
||
"5107 1.0 0.0 \n",
|
||
"5108 0.0 0.0 \n",
|
||
"5109 NaN NaN \n",
|
||
"3116 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_smokes \n",
|
||
"0 0.0 \n",
|
||
"1 0.0 \n",
|
||
"2 0.0 \n",
|
||
"3 1.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"5106 0.0 \n",
|
||
"5107 0.0 \n",
|
||
"5108 0.0 \n",
|
||
"5109 NaN \n",
|
||
"3116 0.0 \n",
|
||
"\n",
|
||
"[5110 rows x 15 columns]"
|
||
],
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>age</th>\n",
|
||
" <th>gender</th>\n",
|
||
" <th>hypertension</th>\n",
|
||
" <th>work_type</th>\n",
|
||
" <th>avg_glucose_level</th>\n",
|
||
" <th>bmi</th>\n",
|
||
" <th>smoking_status</th>\n",
|
||
" <th>gender_Male</th>\n",
|
||
" <th>work_type_Never_worked</th>\n",
|
||
" <th>work_type_Private</th>\n",
|
||
" <th>work_type_Self-employed</th>\n",
|
||
" <th>work_type_children</th>\n",
|
||
" <th>smoking_status_formerly smoked</th>\n",
|
||
" <th>smoking_status_never smoked</th>\n",
|
||
" <th>smoking_status_smokes</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>228.69</td>\n",
|
||
" <td>36.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>202.21</td>\n",
|
||
" <td>28.1</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>105.92</td>\n",
|
||
" <td>32.5</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>171.23</td>\n",
|
||
" <td>34.4</td>\n",
|
||
" <td>smokes</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>174.12</td>\n",
|
||
" <td>24.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5106</th>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>125.20</td>\n",
|
||
" <td>40.0</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5107</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Self-employed</td>\n",
|
||
" <td>82.99</td>\n",
|
||
" <td>30.6</td>\n",
|
||
" <td>never smoked</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5108</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Male</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Private</td>\n",
|
||
" <td>166.29</td>\n",
|
||
" <td>25.6</td>\n",
|
||
" <td>formerly smoked</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5109</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>Female</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>Govt_job</td>\n",
|
||
" <td>85.28</td>\n",
|
||
" <td>26.2</td>\n",
|
||
" <td>Unknown</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3116</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5110 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
]
|
||
},
|
||
"execution_count": 176,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"execution_count": 176
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "5) Разбиение данных",
|
||
"id": "7c5387ab7d3b9349"
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.496592Z",
|
||
"start_time": "2024-12-07T07:33:51.456961Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Split the data into training, validation and test sets\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"# Drop every row that still contains a missing value.\n",
"dropna_df = df.dropna()\n",
"\n",
"# Numeric and one-hot encoded columns kept for the split.\n",
"feature_columns = [\n",
"    \"age\",\n",
"    \"hypertension\",\n",
"    \"avg_glucose_level\",\n",
"    \"bmi\",\n",
"    \"gender_Male\",\n",
"    \"work_type_Never_worked\",\n",
"    \"work_type_Private\",\n",
"    \"work_type_Self-employed\",\n",
"    \"work_type_children\",\n",
"    \"smoking_status_formerly smoked\",\n",
"    \"smoking_status_never smoked\",\n",
"    \"smoking_status_smokes\",\n",
"]\n",
"df_input = dropna_df[feature_columns]\n",
"\n",
"print(df_input.head())\n",
"\n",
"# 60% of the rows go to training; the remaining 40% are halved into validation and test sets.\n",
"train_df, temp_df = train_test_split(df_input, test_size=0.4, random_state=42)\n",
"val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)\n",
"\n",
"# Report the size of each split.\n",
"for caption, split_df in (\n",
"    (\"Размер обучающей выборки:\", train_df),\n",
"    (\"Размер контрольной выборки:\", val_df),\n",
"    (\"Размер тестовой выборки:\", test_df),\n",
"):\n",
"    print(caption, len(split_df))\n",
"\n",
"# Write each split to its CSV file without the index column.\n",
"for split_df, csv_path in (\n",
"    (train_df, \".//static//csv//train_data.csv\"),\n",
"    (val_df, \".//static//csv//val_data.csv\"),\n",
"    (test_df, \".//static//csv//test_data.csv\"),\n",
"):\n",
"    split_df.to_csv(csv_path, index=False)"
|
||
],
|
||
"id": "8c9949a919295290",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" age hypertension avg_glucose_level bmi gender_Male \\\n",
|
||
"0 4.0 0.0 228.69 36.6 1.0 \n",
|
||
"1 3.0 0.0 202.21 28.1 0.0 \n",
|
||
"2 4.0 0.0 105.92 32.5 1.0 \n",
|
||
"3 2.0 0.0 171.23 34.4 0.0 \n",
|
||
"4 4.0 1.0 174.12 24.0 0.0 \n",
|
||
"\n",
|
||
" work_type_Never_worked work_type_Private work_type_Self-employed \\\n",
|
||
"0 0.0 1.0 0.0 \n",
|
||
"1 0.0 0.0 1.0 \n",
|
||
"2 0.0 1.0 0.0 \n",
|
||
"3 0.0 1.0 0.0 \n",
|
||
"4 0.0 0.0 1.0 \n",
|
||
"\n",
|
||
" work_type_children smoking_status_formerly smoked \\\n",
|
||
"0 0.0 1.0 \n",
|
||
"1 0.0 0.0 \n",
|
||
"2 0.0 0.0 \n",
|
||
"3 0.0 0.0 \n",
|
||
"4 0.0 0.0 \n",
|
||
"\n",
|
||
" smoking_status_never smoked smoking_status_smokes \n",
|
||
"0 0.0 0.0 \n",
|
||
"1 1.0 0.0 \n",
|
||
"2 1.0 0.0 \n",
|
||
"3 0.0 1.0 \n",
|
||
"4 1.0 0.0 \n",
|
||
"Размер обучающей выборки: 3064\n",
|
||
"Размер контрольной выборки: 1022\n",
|
||
"Размер тестовой выборки: 1022\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 177
|
||
},
|
||
{
|
||
"metadata": {
|
||
"ExecuteTime": {
|
||
"end_time": "2024-12-07T07:33:51.637107Z",
|
||
"start_time": "2024-12-07T07:33:51.601412Z"
|
||
}
|
||
},
|
||
"cell_type": "code",
|
||
"source": [
|
||
"# Reload the three splits from their CSV files.\n",
"train_df, val_df, test_df = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        \".//static//csv//train_data.csv\",\n",
"        \".//static//csv//val_data.csv\",\n",
"        \".//static//csv//test_data.csv\",\n",
"    )\n",
")\n",
|
||
"\n",
|
||
"def check_balance(df, name):\n",
|
||
" print(f\"Распределение gender в {name}:\")\n",
|
||
" print(f\"Процент gender_Male: {df[\"gender_Male\"].value_counts()[1.0] / len(df) * 100:.2f}%\")\n",
|
||
" print(f\"Процент gender_Female: {(len(df) - df[\"gender_Male\"].value_counts()[1.0]) / len(df) * 100:.2f}%\")\n",
|
||
" print()\n",
|
||
"\n",
|
||
"# Print the gender shares for each split.\n",
"for split_df, split_label in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(split_df, split_label)"
|
||
],
|
||
"id": "79b2248eb6438fa5",
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Распределение gender в обучающей выборке:\n",
|
||
"Процент gender_Male: 41.87%\n",
|
||
"Процент gender_Female: 58.13%\n",
|
||
"\n",
|
||
"Распределение gender в контрольной выборке:\n",
|
||
"Процент gender_Male: 40.41%\n",
|
||
"Процент gender_Female: 59.59%\n",
|
||
"\n",
|
||
"Распределение gender в тестовой выборке:\n",
|
||
"Процент gender_Male: 41.00%\n",
|
||
"Процент gender_Female: 59.00%\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"execution_count": 178
|
||
},
|
||
{
|
||
"metadata": {},
|
||
"cell_type": "markdown",
|
||
"source": "Выборки сбалансированы по полу: доли gender_Male и gender_Female в обучающей, контрольной и тестовой выборках близки.",
|
||
"id": "a6436f11045161c4"
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 2
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython2",
|
||
"version": "2.7.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|