AIM-PIbd-31-Potapov-N-S/lab_3/lab3.ipynb

1858 lines
336 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Вариант 2. Показатели сердечных заболеваний"
]
},
{
"cell_type": "code",
"execution_count": 248,
"metadata": {},
"outputs": [],
"source": [
"from typing import Any\n",
"from math import ceil\n",
"\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import ADASYN, SMOTE\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Загрузим данные из датасета"
]
},
{
"cell_type": "code",
"execution_count": 249,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('csv\\\\heart_2022_no_nans.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Посмотрим общие сведения о датасете"
]
},
{
"cell_type": "code",
"execution_count": 250,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 246022 entries, 0 to 246021\n",
"Data columns (total 40 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 State 246022 non-null object \n",
" 1 Sex 246022 non-null object \n",
" 2 GeneralHealth 246022 non-null object \n",
" 3 PhysicalHealthDays 246022 non-null float64\n",
" 4 MentalHealthDays 246022 non-null float64\n",
" 5 LastCheckupTime 246022 non-null object \n",
" 6 PhysicalActivities 246022 non-null object \n",
" 7 SleepHours 246022 non-null float64\n",
" 8 RemovedTeeth 246022 non-null object \n",
" 9 HadHeartAttack 246022 non-null object \n",
" 10 HadAngina 246022 non-null object \n",
" 11 HadStroke 246022 non-null object \n",
" 12 HadAsthma 246022 non-null object \n",
" 13 HadSkinCancer 246022 non-null object \n",
" 14 HadCOPD 246022 non-null object \n",
" 15 HadDepressiveDisorder 246022 non-null object \n",
" 16 HadKidneyDisease 246022 non-null object \n",
" 17 HadArthritis 246022 non-null object \n",
" 18 HadDiabetes 246022 non-null object \n",
" 19 DeafOrHardOfHearing 246022 non-null object \n",
" 20 BlindOrVisionDifficulty 246022 non-null object \n",
" 21 DifficultyConcentrating 246022 non-null object \n",
" 22 DifficultyWalking 246022 non-null object \n",
" 23 DifficultyDressingBathing 246022 non-null object \n",
" 24 DifficultyErrands 246022 non-null object \n",
" 25 SmokerStatus 246022 non-null object \n",
" 26 ECigaretteUsage 246022 non-null object \n",
" 27 ChestScan 246022 non-null object \n",
" 28 RaceEthnicityCategory 246022 non-null object \n",
" 29 AgeCategory 246022 non-null object \n",
" 30 HeightInMeters 246022 non-null float64\n",
" 31 WeightInKilograms 246022 non-null float64\n",
" 32 BMI 246022 non-null float64\n",
" 33 AlcoholDrinkers 246022 non-null object \n",
" 34 HIVTesting 246022 non-null object \n",
" 35 FluVaxLast12 246022 non-null object \n",
" 36 PneumoVaxEver 246022 non-null object \n",
" 37 TetanusLast10Tdap 246022 non-null object \n",
" 38 HighRiskLastYear 246022 non-null object \n",
" 39 CovidPos 246022 non-null object \n",
"dtypes: float64(6), object(34)\n",
"memory usage: 75.1+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>PhysicalHealthDays</th>\n",
" <td>246022.0</td>\n",
" <td>4.119026</td>\n",
" <td>8.405844</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>3.00</td>\n",
" <td>30.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MentalHealthDays</th>\n",
" <td>246022.0</td>\n",
" <td>4.167140</td>\n",
" <td>8.102687</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>4.00</td>\n",
" <td>30.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SleepHours</th>\n",
" <td>246022.0</td>\n",
" <td>7.021331</td>\n",
" <td>1.440681</td>\n",
" <td>1.00</td>\n",
" <td>6.00</td>\n",
" <td>7.00</td>\n",
" <td>8.00</td>\n",
" <td>24.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HeightInMeters</th>\n",
" <td>246022.0</td>\n",
" <td>1.705150</td>\n",
" <td>0.106654</td>\n",
" <td>0.91</td>\n",
" <td>1.63</td>\n",
" <td>1.70</td>\n",
" <td>1.78</td>\n",
" <td>2.41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>WeightInKilograms</th>\n",
" <td>246022.0</td>\n",
" <td>83.615179</td>\n",
" <td>21.323156</td>\n",
" <td>28.12</td>\n",
" <td>68.04</td>\n",
" <td>81.65</td>\n",
" <td>95.25</td>\n",
" <td>292.57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BMI</th>\n",
" <td>246022.0</td>\n",
" <td>28.668136</td>\n",
" <td>6.513973</td>\n",
" <td>12.02</td>\n",
" <td>24.27</td>\n",
" <td>27.46</td>\n",
" <td>31.89</td>\n",
" <td>97.65</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"PhysicalHealthDays 246022.0 4.119026 8.405844 0.00 0.00 0.00 \n",
"MentalHealthDays 246022.0 4.167140 8.102687 0.00 0.00 0.00 \n",
"SleepHours 246022.0 7.021331 1.440681 1.00 6.00 7.00 \n",
"HeightInMeters 246022.0 1.705150 0.106654 0.91 1.63 1.70 \n",
"WeightInKilograms 246022.0 83.615179 21.323156 28.12 68.04 81.65 \n",
"BMI 246022.0 28.668136 6.513973 12.02 24.27 27.46 \n",
"\n",
" 75% max \n",
"PhysicalHealthDays 3.00 30.00 \n",
"MentalHealthDays 4.00 30.00 \n",
"SleepHours 8.00 24.00 \n",
"HeightInMeters 1.78 2.41 \n",
"WeightInKilograms 95.25 292.57 \n",
"BMI 31.89 97.65 "
]
},
"execution_count": 250,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.info()\n",
"df.describe().transpose()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Получим информацию о пустых значениях в колонках датасета"
]
},
{
"cell_type": "code",
"execution_count": 251,
"metadata": {},
"outputs": [],
"source": [
"def get_null_columns_info(df: DataFrame) -> DataFrame:\n",
" \"\"\"\n",
" Возвращает информацию о пропущенных значениях в колонках датасета\n",
" \"\"\"\n",
" w = []\n",
" df_len = len(df)\n",
"\n",
" for column in df.columns:\n",
" column_nulls = df[column].isnull()\n",
" w.append([column, column_nulls.any(), column_nulls.sum() / df_len])\n",
"\n",
" null_df = DataFrame(w).rename(columns={0: \"Column\", 1: \"Has Null\", 2: \"Null Percent\"})\n",
"\n",
" return null_df"
]
},
{
"cell_type": "code",
"execution_count": 252,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column</th>\n",
" <th>Has Null</th>\n",
" <th>Null Percent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>State</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Sex</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>GeneralHealth</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PhysicalHealthDays</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>MentalHealthDays</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>LastCheckupTime</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>PhysicalActivities</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>SleepHours</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>RemovedTeeth</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>HadHeartAttack</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>HadAngina</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>HadStroke</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>HadAsthma</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>HadSkinCancer</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>HadCOPD</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>HadDepressiveDisorder</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>HadKidneyDisease</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>HadArthritis</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>HadDiabetes</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>DeafOrHardOfHearing</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>BlindOrVisionDifficulty</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>DifficultyConcentrating</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>DifficultyWalking</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>DifficultyDressingBathing</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>DifficultyErrands</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>SmokerStatus</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>ECigaretteUsage</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>ChestScan</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>RaceEthnicityCategory</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>AgeCategory</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>HeightInMeters</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>WeightInKilograms</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>BMI</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>AlcoholDrinkers</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>HIVTesting</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>FluVaxLast12</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>PneumoVaxEver</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>TetanusLast10Tdap</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>HighRiskLastYear</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>CovidPos</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column Has Null Null Percent\n",
"0 State False 0.0\n",
"1 Sex False 0.0\n",
"2 GeneralHealth False 0.0\n",
"3 PhysicalHealthDays False 0.0\n",
"4 MentalHealthDays False 0.0\n",
"5 LastCheckupTime False 0.0\n",
"6 PhysicalActivities False 0.0\n",
"7 SleepHours False 0.0\n",
"8 RemovedTeeth False 0.0\n",
"9 HadHeartAttack False 0.0\n",
"10 HadAngina False 0.0\n",
"11 HadStroke False 0.0\n",
"12 HadAsthma False 0.0\n",
"13 HadSkinCancer False 0.0\n",
"14 HadCOPD False 0.0\n",
"15 HadDepressiveDisorder False 0.0\n",
"16 HadKidneyDisease False 0.0\n",
"17 HadArthritis False 0.0\n",
"18 HadDiabetes False 0.0\n",
"19 DeafOrHardOfHearing False 0.0\n",
"20 BlindOrVisionDifficulty False 0.0\n",
"21 DifficultyConcentrating False 0.0\n",
"22 DifficultyWalking False 0.0\n",
"23 DifficultyDressingBathing False 0.0\n",
"24 DifficultyErrands False 0.0\n",
"25 SmokerStatus False 0.0\n",
"26 ECigaretteUsage False 0.0\n",
"27 ChestScan False 0.0\n",
"28 RaceEthnicityCategory False 0.0\n",
"29 AgeCategory False 0.0\n",
"30 HeightInMeters False 0.0\n",
"31 WeightInKilograms False 0.0\n",
"32 BMI False 0.0\n",
"33 AlcoholDrinkers False 0.0\n",
"34 HIVTesting False 0.0\n",
"35 FluVaxLast12 False 0.0\n",
"36 PneumoVaxEver False 0.0\n",
"37 TetanusLast10Tdap False 0.0\n",
"38 HighRiskLastYear False 0.0\n",
"39 CovidPos False 0.0"
]
},
"execution_count": 252,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_null_columns_info(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Получим информацию о выбросах"
]
},
{
"cell_type": "code",
"execution_count": 253,
"metadata": {},
"outputs": [],
"source": [
"def get_numeric_columns(df: DataFrame) -> list[str]:\n",
" \"\"\"\n",
" Возвращает список числовых колонок\n",
" \"\"\"\n",
" return list(filter(lambda column: pd.api.types.is_numeric_dtype(df[column]), df.columns))"
]
},
{
"cell_type": "code",
"execution_count": 254,
"metadata": {},
"outputs": [],
"source": [
"def get_outliers_info(df: DataFrame) -> DataFrame:\n",
" \"\"\"\n",
" Возаращает информацию о выбросах в числовых колонках датасета\n",
" \"\"\"\n",
" data = {\n",
" \"Column\": [],\n",
" \"Has Outliers\": [],\n",
" \"Outliers Count\": [],\n",
" \"Min Value\": [],\n",
" \"Max Value\": [],\n",
" \"Q1\": [],\n",
" \"Q3\": []\n",
" }\n",
"\n",
" info = DataFrame(data)\n",
"\n",
" for column in get_numeric_columns(df):\n",
" Q1: float = df[column].quantile(0.25)\n",
" Q3: float = df[column].quantile(0.75)\n",
" IQR: float = Q3 - Q1\n",
"\n",
" lower_bound: float = Q1 - 1.5 * IQR\n",
" upper_bound: float = Q3 + 1.5 * IQR\n",
"\n",
" outliers: DataFrame = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
" outlier_count: int = outliers.shape[0]\n",
"\n",
" info.loc[len(info)] = [column, outlier_count > 0, outlier_count, df[column].min(), df[column].max(), Q1, Q3]\n",
"\n",
" return info"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Посмотрим данные по выбросам"
]
},
{
"cell_type": "code",
"execution_count": 255,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column</th>\n",
" <th>Has Outliers</th>\n",
" <th>Outliers Count</th>\n",
" <th>Min Value</th>\n",
" <th>Max Value</th>\n",
" <th>Q1</th>\n",
" <th>Q3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PhysicalHealthDays</td>\n",
" <td>True</td>\n",
" <td>38810</td>\n",
" <td>0.00</td>\n",
" <td>30.00</td>\n",
" <td>0.00</td>\n",
" <td>3.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MentalHealthDays</td>\n",
" <td>True</td>\n",
" <td>32714</td>\n",
" <td>0.00</td>\n",
" <td>30.00</td>\n",
" <td>0.00</td>\n",
" <td>4.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SleepHours</td>\n",
" <td>True</td>\n",
" <td>3488</td>\n",
" <td>1.00</td>\n",
" <td>24.00</td>\n",
" <td>6.00</td>\n",
" <td>8.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HeightInMeters</td>\n",
" <td>True</td>\n",
" <td>830</td>\n",
" <td>0.91</td>\n",
" <td>2.41</td>\n",
" <td>1.63</td>\n",
" <td>1.78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>WeightInKilograms</td>\n",
" <td>True</td>\n",
" <td>5940</td>\n",
" <td>28.12</td>\n",
" <td>292.57</td>\n",
" <td>68.04</td>\n",
" <td>95.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>BMI</td>\n",
" <td>True</td>\n",
" <td>7563</td>\n",
" <td>12.02</td>\n",
" <td>97.65</td>\n",
" <td>24.27</td>\n",
" <td>31.89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column Has Outliers Outliers Count Min Value Max Value \\\n",
"0 PhysicalHealthDays True 38810 0.00 30.00 \n",
"1 MentalHealthDays True 32714 0.00 30.00 \n",
"2 SleepHours True 3488 1.00 24.00 \n",
"3 HeightInMeters True 830 0.91 2.41 \n",
"4 WeightInKilograms True 5940 28.12 292.57 \n",
"5 BMI True 7563 12.02 97.65 \n",
"\n",
" Q1 Q3 \n",
"0 0.00 3.00 \n",
"1 0.00 4.00 \n",
"2 6.00 8.00 \n",
"3 1.63 1.78 \n",
"4 68.04 95.25 \n",
"5 24.27 31.89 "
]
},
"execution_count": 255,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outliers_info = get_outliers_info(df)\n",
"outliers_info"
]
},
{
"cell_type": "code",
"execution_count": 256,
"metadata": {},
"outputs": [],
"source": [
"def visualize_outliers(df: DataFrame) -> None:\n",
" \"\"\"\n",
" Генерирует диаграммы BoxPlot для числовых колонок датасета\n",
" \"\"\"\n",
" columns = get_numeric_columns(df)\n",
" plt.figure(figsize=(15, 10))\n",
" rows: int = ceil(len(columns) / 3)\n",
" for index, column in enumerate(columns, 1):\n",
" plt.subplot(rows, 3, index)\n",
" plt.boxplot(df[column], vert=True, patch_artist=True)\n",
" plt.title(f\"Диаграмма размахов\\n\\\"{column}\\\"\")\n",
" plt.xlabel(column)\n",
" \n",
" plt.tight_layout()\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Визуализируем выбросы с помощью диаграмм"
]
},
{
"cell_type": "code",
"execution_count": 257,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdIAAAPeCAYAAAAI5OjmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wU1f7/8fcmgTSS0BMiCVVpoiAqvShIiBRRkKrSrgUpAraLStUrYgMrlnsFpIgNUVApIsWCKCiiCEgVlN4SWkLJ+f3BL/PNJJtNQsrsLq/n47EPMzNnZ86smnfms2fOuIwxRgAAAAAAAAAAwK0ApzsAAAAAAAAAAIA3o5AOAAAAAAAAAIAHFNIBAAAAAAAAAPCAQjoAAAAAAAAAAB5QSAcAAAAAAAAAwAMK6QAAAAAAAAAAeEAhHQAAAAAAAAAADyikAwAAAAAAAADgAYV0AAAAAAAAAAA8oJAOAAAAAAAAAIAHFNLhsz766CO5XC63ryuvvNLp7gEAADfIbwAAfA/5DQBSkNMdAPLrscceU61atazl//znPw72BgAA5Ab5DQCA7yG/AVzKKKTD5910001q1aqVtfzf//5Xhw4dcq5DAAAgR+Q3AAC+h/wGcCljahf4rDNnzkiSAgJy/s942rRpcrlc2rlzp7UuLS1NV111lVwul6ZNm2atX79+vfr27auqVasqJCREMTEx6t+/vw4fPmzb59ixY93e1hYU9H/fT7Vq1UpXXnml1q5dqyZNmig0NFRVqlTRG2+8keVcRo8erQYNGigqKkrh4eFq3ry5li1bZmu3c+dO6zjz5s2zbUtJSVGpUqXkcrn0/PPPZ+ln+fLldfbsWdt73nvvPWt/Gf/4+fTTT9W+fXvFxsYqODhY1apV05NPPqnz58/n+FmnH2/Tpk3q1q2bIiMjVaZMGT3wwANKSUmxtZ06dapuvPFGlS9fXsHBwapdu7amTJmSZZ+33HKLKleurJCQEJUvX16dOnXSb7/9ZmuTfh6TJ0/O8v6aNWvK5XJp8ODB1rojR47ooYceUt26dVWiRAlFRkYqMTFRv/76q+29ffr0UUhIiDZu3Ghbn5CQoFKlSmnPnj3Wuu3bt+v2229X6dKlFRYWpkaNGunzzz+3vW/58uW2/16Cg4N1xRVXaMKECTLGeP5wAcAPkN/zbNvIb/IbAHwB+T3Pto38Jr9xaWJEOnxWepAHBwdf1PtnzJiRJQwkacmSJdq+fbv69eunmJgYbdiwQW+99ZY2bNigH374QS6Xy9Z+ypQpKlGihLWc+Q+Lo0eP6uabb1a3bt3Us2dPffDBBxo4cKCKFy+u/v37S5KSk5P13//+Vz179tTdd9+t48eP63//+58SEhL0448/ql69erZ9hoSEaOrUqercubO1bu7cuVmCMqPjx49rwYIFuvXWW611U6dOVUhISJb3TZs2TSVKlNCIESNUokQJff311xo9erSSk5P13HPPZXuMjLp166bKlStrwoQJ+uGHH/Tyyy/r6NGjevfdd22fXZ06ddSpUycFBQVp/vz5uv/++5WWlqZBgwbZ9nfPPfcoJiZGe/bs0auvvqo2bdpox44dCgsLy/K5DBs2zFr3/fff66+//srSv+3bt2vevHm6/fbbVaVKFe3fv19vvvmmWrZsqT/++EOxsbGSpJdeeklff/21+vTpo1WrVikwMFBvvvmmFi9erBkzZljt9u/fryZNmujUqVMaOnSoypQpo+nTp6tTp0766KOPbJ+79H+3RJ4+fVrvv/++HnvsMZUvX14DBgzI1ecLAL6K/Ca/yW8A8D3kN/lNfgOSDOCjJk+ebCSZX3/91ba+ZcuWpk6dOrZ1U6dONZLMjh07jDHGpKSkmPj4eJOYmGgkmalTp1ptT506leVY7733npFkVq5caa0bM2aMkWQOHjyYbR9btmxpJJkXXnjBWpeammrq1atnypcvb86cOWOMMebcuXMmNTXV9t6jR4+a6Oho079/f2vdjh07jCTTs2dPExQUZPbt22dta926tenVq5eRZJ577rks/ezZs6fp0KGDtf6vv/4yAQEBpmfPnlnOw91ncO+995qwsDCTkpKS7flmPF6nTp1s6++///4s/77cHSchIcFUrVrV4zE++OADI8msWbPGWifJdO3a1QQFBdnWDxgwwPpcBg0aZK1PSUkx58+ft+13x44dJjg42IwfP962ftGiRUaSeeqpp8z27dtNiRIlTOfOnW1thg0bZiSZb775xlp3/PhxU6VKFVO5cmXrWMuWLTOSzLJly2x9CQgIMPfff7/H8wYAf0B+k9/kNwD4HvKb/Ca/AWOY2gU+K/1Wr3LlyuX5va+99poOHz6sMWPGZNkWGhpq/ZySkqJDhw6pUaNGkqSff/45z8cKCgrSvffeay0XL15c9957rw4cOKC1a9dKkgIDA1W8eHFJF255O3LkiM6dO6drr73W7TGvueYa1alTRzNmzJAk/fXXX1q2bJn69u2bbT/69++vhQsXat++fZKk6dOnq3HjxrriiiuytM34GRw/flyHDh1S8+bNderUKW3atClX5535G+0hQ4ZIkr744gu3x0lKStKhQ4fUsmVLbd++XUlJSbb3nzp1SocOHdK6dev09ttvKzo6Okvfo6Oj1b59e02dOtV6zwcffKB+/fpl6V9wcLA1euH8+fM6fPiwSpQooRo1amT5zNu2bat7771X48eP12233aaQkBC9+eabtjZffPGFrr/+ejVr1sxaV6JECd1zzz3auXOn/vjjD1v79PPdtWuXnn32WaWlpenGG29080kCgH8hv8lv8hsAfA/5TX6T3wBzpMOH/fXXXwoKCspzkCclJenpp5/WiBEjFB0dnWX7kSNH9MADDyg6OlqhoaEqV66cqlSpYr03r2JjYxUeHm5blx5AGeeMmz59uq666iqFhISoTJkyKleunD7//PNsj9mvXz8rsKZNm6YmTZro8ssvz7Yf9erV05VXXql3331XxhhNmzbNbcBJ0oYNG3TrrbcqKipKkZGRKleunO644w5Juf8MMvelWrVqCggIsJ3zd999pzZt2ig8PFwlS5ZUuXLl9Nhjj7k9zvjx41WuXDnVr19fO3fu1PLlyxUREZHluP369dPs2bOVmpqqDz/8UKVKlXIbkGlpaZo0aZIuv/xyBQcHq2zZsipXrpzWr1/v9hyff/55lS5dWuvWrdPLL7+s8uXL27b/9ddfqlGjRpb3pT/RPvPtbZ07d1a5cuVUqVIljR07Vk888YS6dOmS5f0A4G/Ib/Kb/AYA30N+k9/kN0AhHT5s8+bNqlq1qu3hIrkxceJEBQQE6OGHH3a7vVu3bnr77bd13333ae7cuVq8eLEWLlwo6cIv/8Iwc+ZM9e3bV9WqVdP//vc/LVy4UEuWLNGNN96Y7THvuOMObd26VT/88IOmT5+ebShn1L9/f02dOlUrVqzQvn371K1btyxtjh07ppYtW+rXX3/V+PHjNX/+fC1ZskQTJ06UdPGfQea57bZt26bWrVvr0KFDevHFF/X5559ryZIlGj58uNvj/Otf/9LixYv1zjvvKCQkRF26dHEbuO3bt1fx4sU1b948TZ06VX369HH7QJz0P+ZatGihmTNnatGiRVqyZInq1Knj9hx/+eUXHThwQJLczu2XV88//7yWLFmiL774QmPGjNHEiRM1bty4fO8XALwd+U1+k98A4HvIb/Kb/AZ42Ch8VGpqqtatW2d72Edu7NmzRy+99JImTJigiIiILE8CP3r0qJYuXapx48Zp9OjR1votW7ZcdF/37NmjkydP2r4V//PPPyVJlStXliR99NFHqlq1qubOnWsLPHe3vqUrU6aMOnXqZN2m1q1bN9uTv93p3bu3Hn74YT3wwAPq2rWr22+Uly9frsOHD2vu3Llq0aKFtX7Hjh25Ot90W7ZssUYSSNLWrVuVlpZmnfP8+fOVmpqqzz77TPHx8Va7zE9KT1e9enVVr15dktSmTRvFx8dr9uzZGjhwoK1dUFCQ7rzzTv3nP//Rhg0b9M4777jd30cffaQbbrhB//vf/2zrjx07prJly9rWnTx5Uv3
"text/plain": [
"<Figure size 1500x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"visualize_outliers(df)"
]
},
{
"cell_type": "code",
"execution_count": 258,
"metadata": {},
"outputs": [],
"source": [
"def remove_outliers(df: DataFrame, columns: list[str]) -> DataFrame:\n",
" \"\"\"\n",
" Устраняет выбросы в заданных колонках:\n",
" задает значениям выше максимального значение максимума, ниже минимального - значение минимума\n",
" \"\"\"\n",
" for column in columns:\n",
" Q1: float = df[column].quantile(0.25)\n",
" Q3: float = df[column].quantile(0.75)\n",
" IQR: float = Q3 - Q1\n",
"\n",
" lower_bound: float = Q1 - 1.5 * IQR\n",
" upper_bound: float = Q3 + 1.5 * IQR\n",
"\n",
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
" \n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаляем выбросы"
]
},
{
"cell_type": "code",
"execution_count": 259,
"metadata": {},
"outputs": [],
"source": [
"outliers_columns = list(outliers_info[outliers_info[\"Has Outliers\"] == True][\"Column\"])\n",
"df = remove_outliers(df, outliers_columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Снова получим данные о выбросах"
]
},
{
"cell_type": "code",
"execution_count": 260,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column</th>\n",
" <th>Has Outliers</th>\n",
" <th>Outliers Count</th>\n",
" <th>Min Value</th>\n",
" <th>Max Value</th>\n",
" <th>Q1</th>\n",
" <th>Q3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PhysicalHealthDays</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" <td>7.500</td>\n",
" <td>0.00</td>\n",
" <td>3.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MentalHealthDays</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0.000</td>\n",
" <td>10.000</td>\n",
" <td>0.00</td>\n",
" <td>4.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SleepHours</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>3.000</td>\n",
" <td>11.000</td>\n",
" <td>6.00</td>\n",
" <td>8.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>HeightInMeters</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1.405</td>\n",
" <td>2.005</td>\n",
" <td>1.63</td>\n",
" <td>1.78</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>WeightInKilograms</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>28.120</td>\n",
" <td>136.065</td>\n",
" <td>68.04</td>\n",
" <td>95.25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>BMI</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>12.840</td>\n",
" <td>43.320</td>\n",
" <td>24.27</td>\n",
" <td>31.89</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column Has Outliers Outliers Count Min Value Max Value \\\n",
"0 PhysicalHealthDays False 0 0.000 7.500 \n",
"1 MentalHealthDays False 0 0.000 10.000 \n",
"2 SleepHours False 0 3.000 11.000 \n",
"3 HeightInMeters False 0 1.405 2.005 \n",
"4 WeightInKilograms False 0 28.120 136.065 \n",
"5 BMI False 0 12.840 43.320 \n",
"\n",
" Q1 Q3 \n",
"0 0.00 3.00 \n",
"1 0.00 4.00 \n",
"2 6.00 8.00 \n",
"3 1.63 1.78 \n",
"4 68.04 95.25 \n",
"5 24.27 31.89 "
]
},
"execution_count": 260,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_outliers_info(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Видим, что выбросов не осталось - проверим через диаграммы"
]
},
{
"cell_type": "code",
"execution_count": 261,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPeCAYAAADj01PlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADoXklEQVR4nOzdd3gUVdvH8d8mIZuQRggJEEihd6SKdJASYwhgoQkSQFERRUBQo9KV0AQsgMijCVKkg4ACglJUioAiRaRIlR5KQg2QzPsHb1aWZOjJkvD9XNdcj3PmzJx7Fh7unXtnzlgMwzAEAAAAAAAAAADScHJ0AAAAAAAAAAAAPKgoogMAAAAAAAAAYIIiOgAAAAAAAAAAJiiiAwAAAAAAAABggiI6AAAAAAAAAAAmKKIDAAAAAAAAAGCCIjoAAAAAAAAAACYoogMAAAAAAAAAYIIiOgAAAAAAAAAAJiiiAwAAAAAAAABggiI6sqxZs2bJYrGku5QtW9bR4QEAgHSQvwEAyHrI3wAedi6ODgC4V++++65KlSplW//www8dGA0AALgd5G8AALIe8jeAhxVFdGR5jRo1Ur169Wzr//vf/xQfH++4gAAAwC2RvwEAyHrI3wAeVkzngizr8uXLkiQnp1v/NY6Li5PFYtG+fftsbSkpKSpfvrwsFovi4uJs7Zs3b1aHDh1UuHBhubm5KV++fOrUqZNOnjxpd8z+/fun+yibi8t/v03Vq1dPZcuW1caNG1WjRg25u7urUKFC+vzzz9OcS9++fVW5cmX5+PjIw8NDtWvX1vLly+367du3zzbOvHnz7LZdunRJvr6+slgsGjFiRJo4AwICdOXKFbt9vvnmG9vxrv/i8+233yoiIkKBgYGyWq0qUqSIBg0apOTk5Ft+1qnj/f3332rZsqW8vb3l5+enN954Q5cuXbLrGxsbq8cff1wBAQGyWq0qXbq0xo0bl+aYzZo1U2hoqNzc3BQQEKCmTZtqy5Ytdn1Sz2P06NFp9i9ZsqQsFotee+01W9upU6fUq1cvlStXTp6envL29lZ4eLj+/PNPu32joqLk5uam7du327WHhYXJ19dXhw8ftrXt2bNHLVq0UO7cuZUzZ0499thj+u677+z2W7Fihd3fF6vVquLFiysmJkaGYdz8wwWAbID8Pc9uG/mb/A0AWQH5e57dNvI3+RsPH+5ER5aVmsStVutd7T9p0qQ0iUCSli5dqj179qhjx47Kly+ftm3bpi+++ELbtm3T2rVrZbFY7PqPGzdOnp6etvUbv1ScPn1aTz75pFq2bKk2bdpoxowZ6tKli1xdXdWpUydJUmJiov73v/+pTZs26ty5s86ePasvv/xSYWFh+u2331ShQgW7Y7q5uSk2NlbNmze3tc2ZMydNkrze2bNntXDhQj311FO2ttjYWLm5uaXZLy4uTp6enurZs6c8PT31008/qW/fvkpMTNTw4cNNx7hey5YtFRoaqpiYGK1du1affPKJTp8+ra+//trusytTpoyaNm0qFxcXLViwQK+++qpSUlLUtWtXu+O99NJLypcvnw4fPqzPPvtMDRs21N69e5UzZ840n0v37t1tbatXr9b+/fvTxLdnzx7NmzdPLVq0UKFChXTs2DGNHz9edevW1V9//aXAwEBJ0scff6yffvpJUVFRWrNmjZydnTV+/Hj98MMPmjRpkq3fsWPHVKNGDV24cEHdunWTn5+fJk6cqKZNm2rWrFl2n7v032OQFy9e1PTp0/Xuu+8qICBAL7zwwm19vgCQVZG/yd/kbwDIesjf5G/yNx56BpBFjR492pBk/Pnnn3btdevWNcqUKWPXFhsba0gy9u7daxiGYVy6dMkIDg42wsPDDUlGbGysre+FCxfSjPXNN98YkoxVq1bZ2vr162dIMk6cOGEaY926dQ1JxkcffWRrS0pKMipUqGAEBAQYly9fNgzDMK5evWokJSXZ7Xv69Gkjb968RqdOnWxte/fuNSQZbdq0MVxcXIyjR4/atjVo0MB47rnnDEnG8OHD08TZpk0bo0mTJrb2/fv3G05OTkabNm3SnEd6n8HLL79s5MyZ07h06ZLp+V4/XtOmTe3aX3311TR/XumNExYWZhQuXPimY8yYMcOQZGzYsMHWJsl49tlnDRcXF7v2F154wfa5dO3a1dZ+6dIlIzk52e64e/fuNaxWqzFw4EC79iVLlhiSjA8++MDYs2eP4enpaTRv3tyuT/fu3Q1Jxs8//2xrO3v2rFGoUCEjNDTUNtby5csNScby5cvtYnFycjJeffXVm543AGQH5G/yN/kbALIe8jf5m/yNhx3TuSDLSn28y9/f/473HTNmjE6ePKl+/fql2ebu7m7770uXLik+Pl6PPfaYJOn333+/47FcXFz08ssv29ZdXV318ssv6/jx49q4caMkydnZWa6urpKuPeZ26tQpXb16VVWqVEl3zEqVKqlMmTKaNGmSJGn//v1avny5OnToYBpHp06dtHjxYh09elSSNHHiRFWvXl3FixdP0/f6z+Ds2bOKj49X7dq1deHCBf3999+3dd43/pL9+uuvS5K+//77dMdJSEhQfHy86tatqz179ighIcFu/wsXLig+Pl6bNm3ShAkTlDdv3jSx582bVxEREYqNjbXtM2PGDHXs2DFNfFar1XbXQnJysk6ePClPT0+VKFEizWfeuHFjvfzyyxo4cKCefvppubm5afz48XZ9vv/+ez366KOqVauWrc3T01MvvfSS9u3bp7/++suuf+r5HjhwQMOGDVNKSooef/zxdD5JAMheyN/kb/I3AGQ95G/yN/kbDzuK6Miy9u/fLxcXlztO4gkJCRo8eLB69uypvHnzptl+6tQpvfHGG8qbN6/c3d3l7++vQoUK2fa9U4GBgfLw8LBrS00+188RN3HiRJUvX15ubm7y8/OTv7+/vvvuO9MxO3bsaEtWcXFxqlGjhooVK2YaR4UKFVS2bFl9/fXXMgxDcXFx6SY3Sdq2bZueeuop+fj4yNvbW/7+/mrXrp2k2/8MboylSJEicnJysjvnX3/9VQ0bNpSHh4dy5colf39/vfvuu+mOM3DgQPn7+6tixYrat2+fVqxYIS8vrzTjduzYUVOnTlVSUpJmzpwpX1/fdJNjSkqKRo0apWLFislqtSpPnjzy9/fX5s2b0z3HESNGKHfu3Nq0aZM++eQTBQQE2G3fv3+/SpQokWa/1DfX3/hIW/PmzeXv76+QkBD1799f77//vp555pk0+wNAdkP+Jn+TvwEg6yF/k7/J33jYUURHlrVjxw4VLlzY7kUit2Po0KFycnJS7969093esmVLTZgwQa+88ormzJmjH374QYsXL5Z07R/+jDB58mR16NBBRYoU0ZdffqnFixdr6dKlevzxx03HbNeunXbv3q21a9dq4sSJpgn5ep06dVJsbKxWrlypo0ePqmXLlmn6nDlzRnXr1tWff/6pgQMHasGCBVq6dKmGDh0q6e4/gxvnsvvnn3/UoEEDxcfHa+TIkfruu++0dOlS9ejRI91xXnzxRf3www/66quv5ObmpmeeeSbdZBsRESFXV1fNmzdPsbGxioqKSvflN6lf5OrUqaPJkydryZIlWrp0qcqUKZPuOf7xxx86fvy4JKU7l9+dGjFihJYuXarvv/9e/fr109ChQzVgwIB7Pi4APOjI3+Rv8jcAZD3kb/I3+RsPO14siiwpKSlJmzZtsnuxx+04fPiwPv74Y8XExMjLyyvNG79Pnz6tH3/8UQMGDFDfvn1t7bt27brrWA8fPqzz58/b/Rq+c+dOSVJoaKgkadasWSpcuLDmzJljl+zSe9wtlZ+fn5o2bWp7NK1ly5Z2b/hOT9u2bdW7d2+98cYbevbZZ9P9JXnFihU6efKk5syZozp16tja9+7de1vnm2rXrl22Owgkaffu3UpJSbGd84IFC5SUlKT58+crODjY1u/GN6KnKlq0qIoWLSpJatiwoYKDgzV16lR16dLFrp+Li4uef/55ffjhh9q2bZu++uqrdI83a9Ys1a9fX19++aVd+5kzZ5QnTx67tvPnz6tjx44qXbq0atSooWH
"text/plain": [
"<Figure size 1500x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"visualize_outliers(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Нормализация числовых признаков"
]
},
{
"cell_type": "code",
"execution_count": 262,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 263,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>PhysicalHealthDaysNorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.253306</td>\n",
" <td>0.385378</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.400000</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>MentalHealthDaysNorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.244973</td>\n",
" <td>0.378598</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.400000</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SleepHoursNorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.501124</td>\n",
" <td>0.165569</td>\n",
" <td>0.0</td>\n",
" <td>0.375000</td>\n",
" <td>0.500000</td>\n",
" <td>0.625000</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HeightInMetersNorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.500401</td>\n",
" <td>0.176240</td>\n",
" <td>0.0</td>\n",
" <td>0.375000</td>\n",
" <td>0.491667</td>\n",
" <td>0.625000</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>WeightInKilogramsNorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.510963</td>\n",
" <td>0.186742</td>\n",
" <td>0.0</td>\n",
" <td>0.369818</td>\n",
" <td>0.495901</td>\n",
" <td>0.621891</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>BMINorm</th>\n",
" <td>246022.0</td>\n",
" <td>0.513599</td>\n",
" <td>0.194556</td>\n",
" <td>0.0</td>\n",
" <td>0.375000</td>\n",
" <td>0.479659</td>\n",
" <td>0.625000</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"PhysicalHealthDaysNorm 246022.0 0.253306 0.385378 0.0 0.000000 0.000000 \n",
"MentalHealthDaysNorm 246022.0 0.244973 0.378598 0.0 0.000000 0.000000 \n",
"SleepHoursNorm 246022.0 0.501124 0.165569 0.0 0.375000 0.500000 \n",
"HeightInMetersNorm 246022.0 0.500401 0.176240 0.0 0.375000 0.491667 \n",
"WeightInKilogramsNorm 246022.0 0.510963 0.186742 0.0 0.369818 0.495901 \n",
"BMINorm 246022.0 0.513599 0.194556 0.0 0.375000 0.479659 \n",
"\n",
" 75% max \n",
"PhysicalHealthDaysNorm 0.400000 1.0 \n",
"MentalHealthDaysNorm 0.400000 1.0 \n",
"SleepHoursNorm 0.625000 1.0 \n",
"HeightInMetersNorm 0.625000 1.0 \n",
"WeightInKilogramsNorm 0.621891 1.0 \n",
"BMINorm 0.625000 1.0 "
]
},
"execution_count": 263,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min_max_scaler = preprocessing.MinMaxScaler()\n",
"\n",
"df_norm = df.copy()\n",
"\n",
"numeric_columns = get_numeric_columns(df)\n",
"\n",
"for column in numeric_columns:\n",
" norm_column = column + \"Norm\"\n",
" df_norm[norm_column] = min_max_scaler.fit_transform(\n",
" df_norm[column].to_numpy().reshape(-1, 1)\n",
" ).reshape(df_norm[column].shape)\n",
"\n",
"df_norm = df_norm.drop(columns=numeric_columns)\n",
"\n",
"\n",
"df_norm.describe().transpose()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Конструирование признаков"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Автоматическое конструирование признаков с помощью фреймворка FeatureTools"
]
},
{
"cell_type": "code",
"execution_count": 264,
"metadata": {},
"outputs": [],
"source": [
"import featuretools as ft"
]
},
{
"cell_type": "code",
"execution_count": 266,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Преобразуем датасет с помощью фремйворка\n",
"# https://featuretools.alteryx.com/en/stable/getting_started/afe.html\n",
"\n",
"entity_set = ft.EntitySet().add_dataframe(df_norm, \"df\", make_index=True, index=\"id\")\n",
"\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=entity_set,\n",
" target_dataframe_name=\"df\",\n",
" max_depth=2\n",
")\n",
"\n",
"feature_matrix: DataFrame\n",
"feature_defs: list[ft.Feature]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполняем категориальное и унитарное кодирование признаков с помощью FeatureTools"
]
},
{
"cell_type": "code",
"execution_count": 267,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Было признаков: 40\n",
"Стало признаков: 99\n",
"<Feature: State = Washington>\n",
"<Feature: State = Maryland>\n",
"<Feature: State = Minnesota>\n",
"<Feature: State = Ohio>\n",
"<Feature: State = New York>\n",
"<Feature: State = Texas>\n",
"<Feature: State = Florida>\n",
"<Feature: State = Kansas>\n",
"<Feature: State = Wisconsin>\n",
"<Feature: State = Maine>\n",
"<Feature: State is unknown>\n",
"<Feature: Sex = Female>\n",
"<Feature: Sex = Male>\n",
"<Feature: Sex is unknown>\n",
"<Feature: GeneralHealth = Very good>\n",
"<Feature: GeneralHealth = Good>\n",
"<Feature: GeneralHealth = Excellent>\n",
"<Feature: GeneralHealth = Fair>\n",
"<Feature: GeneralHealth = Poor>\n",
"<Feature: GeneralHealth is unknown>\n",
"<Feature: LastCheckupTime = Within past year (anytime less than 12 months ago)>\n",
"<Feature: LastCheckupTime = Within past 2 years (1 year but less than 2 years ago)>\n",
"<Feature: LastCheckupTime = Within past 5 years (2 years but less than 5 years ago)>\n",
"<Feature: LastCheckupTime = 5 or more years ago>\n",
"<Feature: LastCheckupTime is unknown>\n",
"<Feature: PhysicalActivities>\n",
"<Feature: RemovedTeeth = None of them>\n",
"<Feature: RemovedTeeth = 1 to 5>\n",
"<Feature: RemovedTeeth = 6 or more, but not all>\n",
"<Feature: RemovedTeeth = All>\n",
"<Feature: RemovedTeeth is unknown>\n",
"<Feature: HadHeartAttack>\n",
"<Feature: HadAngina>\n",
"<Feature: HadStroke>\n",
"<Feature: HadAsthma>\n",
"<Feature: HadSkinCancer>\n",
"<Feature: HadCOPD>\n",
"<Feature: HadDepressiveDisorder>\n",
"<Feature: HadKidneyDisease>\n",
"<Feature: HadArthritis>\n",
"<Feature: HadDiabetes = No>\n",
"<Feature: HadDiabetes = Yes>\n",
"<Feature: HadDiabetes = No, pre-diabetes or borderline diabetes>\n",
"<Feature: HadDiabetes = Yes, but only during pregnancy (female)>\n",
"<Feature: HadDiabetes is unknown>\n",
"<Feature: DeafOrHardOfHearing>\n",
"<Feature: BlindOrVisionDifficulty>\n",
"<Feature: DifficultyConcentrating>\n",
"<Feature: DifficultyWalking>\n",
"<Feature: DifficultyDressingBathing>\n",
"<Feature: DifficultyErrands>\n",
"<Feature: SmokerStatus = Never smoked>\n",
"<Feature: SmokerStatus = Former smoker>\n",
"<Feature: SmokerStatus = Current smoker - now smokes every day>\n",
"<Feature: SmokerStatus = Current smoker - now smokes some days>\n",
"<Feature: SmokerStatus is unknown>\n",
"<Feature: ECigaretteUsage = Never used e-cigarettes in my entire life>\n",
"<Feature: ECigaretteUsage = Not at all (right now)>\n",
"<Feature: ECigaretteUsage = Use them some days>\n",
"<Feature: ECigaretteUsage = Use them every day>\n",
"<Feature: ECigaretteUsage is unknown>\n",
"<Feature: ChestScan>\n",
"<Feature: RaceEthnicityCategory = White only, Non-Hispanic>\n",
"<Feature: RaceEthnicityCategory = Hispanic>\n",
"<Feature: RaceEthnicityCategory = Black only, Non-Hispanic>\n",
"<Feature: RaceEthnicityCategory = Other race only, Non-Hispanic>\n",
"<Feature: RaceEthnicityCategory = Multiracial, Non-Hispanic>\n",
"<Feature: RaceEthnicityCategory is unknown>\n",
"<Feature: AgeCategory = Age 65 to 69>\n",
"<Feature: AgeCategory = Age 60 to 64>\n",
"<Feature: AgeCategory = Age 70 to 74>\n",
"<Feature: AgeCategory = Age 55 to 59>\n",
"<Feature: AgeCategory = Age 50 to 54>\n",
"<Feature: AgeCategory = Age 75 to 79>\n",
"<Feature: AgeCategory = Age 80 or older>\n",
"<Feature: AgeCategory = Age 40 to 44>\n",
"<Feature: AgeCategory = Age 45 to 49>\n",
"<Feature: AgeCategory = Age 35 to 39>\n",
"<Feature: AgeCategory is unknown>\n",
"<Feature: AlcoholDrinkers>\n",
"<Feature: HIVTesting>\n",
"<Feature: FluVaxLast12>\n",
"<Feature: PneumoVaxEver>\n",
"<Feature: TetanusLast10Tdap = No, did not receive any tetanus shot in the past 10 years>\n",
"<Feature: TetanusLast10Tdap = Yes, received tetanus shot but not sure what type>\n",
"<Feature: TetanusLast10Tdap = Yes, received Tdap>\n",
"<Feature: TetanusLast10Tdap = Yes, received tetanus shot, but not Tdap>\n",
"<Feature: TetanusLast10Tdap is unknown>\n",
"<Feature: HighRiskLastYear>\n",
"<Feature: CovidPos = No>\n",
"<Feature: CovidPos = Yes>\n",
"<Feature: CovidPos = Tested positive using home test without a health professional>\n",
"<Feature: CovidPos is unknown>\n",
"<Feature: PhysicalHealthDaysNorm>\n",
"<Feature: MentalHealthDaysNorm>\n",
"<Feature: SleepHoursNorm>\n",
"<Feature: HeightInMetersNorm>\n",
"<Feature: WeightInKilogramsNorm>\n",
"<Feature: BMINorm>\n"
]
}
],
"source": [
"# Сгенерируем новые признаки\n",
"# https://featuretools.alteryx.com/en/stable/guides/tuning_dfs.html\n",
"\n",
"feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)\n",
"feature_matrix_enc.to_csv(\"./csv/generated_features.csv\", index=False)\n",
"\n",
"print(\"Было признаков:\", len(feature_defs))\n",
"print(\"Стало признаков:\", len(features_enc))\n",
"print(*features_enc, sep='\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Разобьем данные на выборки"
]
},
{
"cell_type": "code",
"execution_count": 277,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры выборок:\n",
"Обучающая выборка: (196817, 98)\n",
"Тестовая выборка: (24602, 98)\n",
"Контрольная выборка: (24603, 98)\n"
]
}
],
"source": [
"prepared_dataset = feature_matrix_enc\n",
"\n",
"target_column = \"HadHeartAttack\"\n",
"\n",
"X = prepared_dataset.drop(columns=[target_column]) \n",
"Y = prepared_dataset[target_column] \n",
"\n",
"# Обучающая выборка\n",
"X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=None, stratify=y)\n",
"\n",
"# Тестовая и контрольная выборки\n",
"X_test, X_control, Y_test, Y_control = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=None, stratify=Y_temp)\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {X_train.shape}\")\n",
"print(f\"Тестовая выборка: {X_test.shape}\")\n",
"print(f\"Контрольная выборка: {X_control.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 317,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"HadHeartAttack\n",
"False 232587\n",
"True 13435\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAHcCAYAAAD/UV8/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABNM0lEQVR4nO3deVhUdf//8deACsjqBoiSkLhrariE5pYoKlrmkluFWy63S6blcpd73abe5pJb3aW0qJV7aVmKmkumZqJp6k3ekJbiDggpKHN+f/Rjvo4DingK0efjuua6nM95n895z0GHl+ecOWMxDMMQAAAA7opTfjcAAABwPyBUAQAAmIBQBQAAYAJCFQAAgAkIVQAAACYgVAEAAJiAUAUAAGACQhUAAIAJCFUAAAAmIFQBAACYgFAFFCDR0dGyWCy2h6urqypWrKjBgwfrzJkz+d0eADzQCuV3AwDu3KRJkxQcHKyrV69qx44dWrBggb788ksdOnRIRYsWze/2AOCBRKgCCqDWrVurTp06kqS+ffuqRIkSeuutt7R27Vp169Ytn7sDgAcTp/+A+8ATTzwhSYqPj5ckXbx4US+//LJq1KghDw8PeXl5qXXr1jpw4IDDulevXtWECRNUsWJFubq6qnTp0urQoYOOHz8uSUpISLA75Xjzo2nTpra5tm7dKovFok8//VT//Oc/5e/vL3d3dz355JM6efKkw7Z3796tVq1aydvbW0WLFlWTJk20c+fObF9j06ZNs93+hAkTHGo//vhjhYaGys3NTcWLF1fXrl2z3f6tXtuNrFarZs2apWrVqsnV1VV+fn7q37+/Ll26ZFcXFBSktm3bOmxn8ODBDnNm1/v06dMd9qkkpaena/z48QoJCZGLi4sCAwM1cuRIpaenZ7uvbtS0aVOH+d544w05OTlp6dKledof//73v9WgQQOVKFFCbm5uCg0N1YoVK7Ld/scff6x69eqpaNGiKlasmBo3bqxvvvnGruarr75SkyZN5OnpKS8vL9WtW9eht+XLl9t+piVLltSzzz6r33//3a6mZ8+edj0XK1ZMTZs21fbt22+7nwAzcKQKuA9kBaASJUpIkv73v/9pzZo16ty5s4KDg3XmzBm98847atKkiX7++WcFBARIkjIzM9W2bVvFxMSoa9euevHFF3X58mVt3LhRhw4dUvny5W3b6Natm9q0aWO33TFjxmTbzxtvvCGLxaJRo0bp7NmzmjVrlsLDwxUbGys3NzdJ0ubNm9W6dWuFhoZq/PjxcnJy0uLFi/XEE09o+/btqlevnsO8ZcuW1ZQpUyRJqampGjhwYLbbHjt2rJ555hn17dtX586d09tvv63GjRtr//798vHxcVinX79+atSokSRp1apVWr16td3y/v37Kzo6Wr169dLQoUMVHx+vuXPnav/+/dq5c6cKFy6c7X64E0lJSbbXdiOr1aonn3xSO3bsUL9+/VSlShX99NNPmjlzpv773/9qzZo1d7SdxYsX67XXXtOMGTPUvXv3bGtutz9mz56tJ598Uj169FBGRoY++eQTde7cWevWrVNkZKStbuLEiZowYYIaNGigSZMmqUiRItq9e7c2b96sli1bSvrzOsHevXurWrVqGjNmjHx8fLR//35t2LDB1l/Wvq9bt66mTJmiM2fOaPbs2dq5c6fDz7RkyZKaOXOmJOm3337T7Nmz1aZNG508eTLbnz1gKgNAgbF48WJDkrFp0ybj3LlzxsmTJ41PPvnEKFGihOHm5mb89ttvhmEYxtWrV43MzEy7dePj4w0XFxdj0qRJtrFFixYZkoy33nrLYVtWq9W2niRj+vTpDjXVqlUzmjRpYnu+ZcsWQ5JRpkwZIyUlxTb+2WefGZKM2bNn2+auUKGCERERYduOYRjGH3/8YQQHBxstWrRw2FaDBg2M6tWr256fO3fOkGSMHz/eNpaQkGA4Ozsbb7zxht26P/30k1GoUCGH8bi4OEOS8cEHH9jGxo8fb9z41rh9+3ZDkrFkyRK7dTds2OAwXq5cOSMyMtKh90GDBhk3v93e3PvIkSMNX19fIzQ01G6ffvTRR4aTk5Oxfft2u/UXLlxoSDJ27tzpsL0bNWnSxDbf+vXrjUKFChkjRozItjY3+8Mw/vw53SgjI8OoXr268cQTT9jN5eTkZDz99NMOfxezfuZJSUmGp6enUb9+fePKlSvZ1mRkZBi+vr5G9erV7WrWrVtnSDLGjRtnG4uKijLKlStnN8+7775rSDL27NmT7WsGzMTpP6AACg8PV6lSpRQYGKiuXbvKw8NDq1evVpkyZSRJLi4ucnL68593ZmamLly4IA8PD1WqVEk//vijbZ6VK1eqZMmSGjJkiMM2bj7lcyeef/55eXp62p536tRJpUuX1pdffilJio2NVVxcnLp3764LFy7o/PnzOn/+vNLS0tS8eXNt27ZNVqvVbs6rV6/K1dX1lttdtWqVrFarnnnmGduc58+fl7+/vypUqKAtW7bY1WdkZEj6c3/lZPny5fL29laLFi3s5gwNDZWHh4fDnNeuXbOrO3/+vK5evXrLvn///Xe9/fbbGjt2rDw8PBy2X6VKFVWuXNluzqxTvjdvPyd79uzRM888o44dO2r69OnZ1uRmf0iyHW2UpEuXLik5OVmNGjWy+7u1Zs0aWa1WjRs3zvZ3MUvW362NGzfq8uXLGj16tMPPNqvmhx9+0NmzZ/WPf/zDriYyMlKVK1fW+vXr7dazWq22fRQbG6sPP/xQpUuXVpUqVW75mgAzcPoPKIDmzZunihUrqlChQvLz81OlSpXsfnFZrVbNnj1b8+fPV3x8vDIzM23Lsk4RSn+eNqxUqZIKFTL3raBChQp2zy0Wi0JCQpSQkCBJiouLkyRFRUXlOEdycrKKFStme37+/HmHeW8WFxcnwzByrLv5NF1SUpIkOQSZm+dMTk6Wr69vtsvPnj1r9/ybb75RqVKlbtnnzcaPH6+AgAD179/f4dqkuLg4HTlyJMc5b95+dn7//XdFRkYqLS1NFy5cyDEw52Z/SNK6dev0+uuvKzY21u66rhvnPX78uJycnFS1atUc58k6bV29evUca3799VdJUqVKlRyWVa5cWTt27LAbO3nypN2+Kl26tFauXHnb1wSYgVAFFED16tWzffovO//61780duxY9e7dW5MnT1bx4sXl5OSkYcOGORwByg9ZPUyfPl21atXKtubGX4IZGRk6ffq0WrRocdt5LRaLvvrqKzk7O99yTklKTEyUJPn7+99yTl9fXy1ZsiTb5TeHnfr16+v111+3G5s7d67Wrl2b7fpHjhxRdHS0Pv7442yvzbJarapRo4beeuutbNcPDAzMsfcsv/zyix599FHNnDlTzz33nD744INsA21u9sf27dv15JNPqnHjxpo/f75Kly6twoULa/HixQ4Xl+cHPz8/ffzxx5L+DOaLFi1Sq1attGPHDtWoUSOfu8P9jlAF3IdWrFihZs2a6f3337cbT0pKUsmSJW3Py5cvr927d+vatWumXGydJetIVBbDMPTLL7/okUcesW1Xkry8vBQeHn7b+Q4cOKBr167dMkhmzWsYhoKDg1WxYsXbzvvzzz/LYrFkexTkxjk3bdqkhg0b2p32yknJkiUdXtOtLiYfM2aMatWqpS5duuS4/QMHDqh58+Z5PiWbderVz89Pa9eu1YgRI9SmTRuHQJib/bFy5Uq5urrq66+/tjtNuHjxYoe+rVarfv755xyDc9bfg0OHDikkJCTbmnLlykmSjh07ZjvlmeXYsWO25VlcXV3t9v+TTz6p4sWLa+7cuXrnnXdyfF2AGbimCrgPOTs7yzAMu7Hly5c7fAS9Y8eOOn/+vObOneswx83r34kPP/xQly9ftj1fsWKFTp8+rdatW0uSQkNDVb58ef373/9Wamqqw/rnzp1z6N3Z2Tnb2xXcqEOHDnJ2dtbEiRMd+jcMQxcuXLA9v379ulauXKl69erd8tTQM888o8zMTE2ePNlh2fXr122nzPJi165dWrt2rd58880cA9Mzzzyj33//Xf/5z38cll25ckVpaWm33U7FihXl5+c
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Подсчет количества объектов каждого класса\n",
"class_counts = y.value_counts()\n",
"print(class_counts)\n",
"\n",
"\n",
"class_counts_dict = class_counts.to_dict()\n",
"\n",
"keys = list(class_counts_dict.keys())\n",
"vals = list(class_counts_dict.values())\n",
"\n",
"keys[keys.index(True)] = \"Был приступ\"\n",
"keys[keys.index(False)] = \"Не было приступа\"\n",
"\n",
"# Визуализация\n",
"plt.bar(keys, vals)\n",
"plt.title(f\"Распределение классов\\n\\\"{target_column}\\\"\")\n",
"plt.xlabel(\"Класс\")\n",
"plt.ylabel(\"Количество\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 325,
"metadata": {},
"outputs": [],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def oversample(X: DataFrame, Y: Series, sampling_strategy=0.5) -> tuple[DataFrame, Series]:\n",
" sampler = RandomOverSampler(sampling_strategy=sampling_strategy)\n",
" x_over, y_over = sampler.fit_resample(X, Y)\n",
" return x_over, y_over \n",
"\n",
"def undersample(X: DataFrame, Y: Series, sampling_strategy=1) -> tuple[DataFrame, Series]:\n",
" sampler = RandomUnderSampler(sampling_strategy=sampling_strategy)\n",
" x_over, y_over = sampler.fit_resample(X, Y)\n",
" return x_over, y_over "
]
},
{
"cell_type": "code",
"execution_count": 327,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Данные до аугментации в обучающей выборке\n",
"HadHeartAttack\n",
"False 186069\n",
"True 10748\n",
"Name: count, dtype: int64\n",
"\n",
"Данные после аугментации в обучающей выборке\n",
"HadHeartAttack\n",
"False 10748\n",
"True 10748\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"print(\"Данные до аугментации в обучающей выборке\")\n",
"print(Y_train.value_counts())\n",
"\n",
"X_train_samplied, Y_train_samplied = X_train, Y_train\n",
"\n",
"# X_train_samplied, Y_train_samplied = oversample(X_train_samplied, Y_train_samplied)\n",
"X_train_samplied, Y_train_samplied = undersample(X_train_samplied, Y_train_samplied)\n",
"print()\n",
"print(\"Данные после аугментации в обучающей выборке\")\n",
"print(Y_train_samplied.value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 349,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAGwCAYAAABGlHlWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABA4ElEQVR4nO3dd3xT5eIG8CdJ2zRN924BGS1lo1BBBUqB62C4QQQUAS9XvApe18XJRnsVB4goXAcI1MX+uRUBLyAqImWPAmWP0tK90iTv74+YI+miI8nJOXm+n08+bZKTc96clDy847yvRgghQEREBEArdwGIiMhzMBSIiEjCUCAiIglDgYiIJAwFIiKSMBSIiEjCUCAiIglDgYiIJAwFIiKSMBSIiEjCUHCjJUuWQKPRSDd/f38kJSVh4sSJuHDhgtzFIyKCj9wF8EYzZ85E69atUV5eji1btuDdd9/F119/jb179yIgIEDu4hGRF2MoyGDQoEG49tprAQDjx49HREQE3njjDaxbtw4jR46UuXRE5M3YfOQBBgwYAADIysoCAFy6dAlPP/00unTpgsDAQAQHB2PQoEHYtWtXtdeWl5dj+vTpSEpKgr+/P+Li4nD33Xfj6NGjAIDjx487NFlVvfXr10/a16ZNm6DRaPDZZ5/h+eefR2xsLIxGI26//XacOnWq2rF//fVXDBw4ECEhIQgICEBqaiq2bt1a43vs169fjcefPn16tW2XL1+O5ORkGAwGhIeHY8SIETUev673djmr1Yq5c+eiU6dO8Pf3R0xMDCZMmIC8vDyH7Vq1aoVbb7212nEmTpxYbZ81lX3OnDnVzikAVFRUYNq0aUhMTIRer0eLFi0wefJkVFRU1HiuLtevX79q+3vppZeg1Wrx8ccfN+p8vPbaa+jVqxciIiJgMBiQnJyMlStX1nj85cuXo2fPnggICEBYWBj69u2L77//3mGbb775BqmpqQgKCkJwcDB69OhRrWwrVqyQPtPIyEjcf//9OHPmjMM2Y8eOdShzWFgY+vXrh82bN1/xPJHzsKbgAexf4BEREQCAY8eOYe3atbjnnnvQunVrXLhwAYsWLUJqair279+P+Ph4AIDFYsGtt96KH3/8ESNGjMC//vUvFBUV4YcffsDevXuRkJAgHWPkyJEYPHiww3Gfe+65Gsvz0ksvQaPR4JlnnkF2djbmzp2LG2+8ERkZGTAYDACADRs2YNCgQUhOTsa0adOg1WqxePFiDBgwAJs3b0bPnj2r7bd58+ZIS0sDABQXF+Of//xnjceeMmUKhg8fjvHjx+PixYuYP38++vbti507dyI0NLTaax566CGkpKQAAFavXo01a9Y4PD9hwgQsWbIE48aNw2OPPYasrCy8/fbb2LlzJ7Zu3QpfX98az0ND5OfnS+/tclarFbfffju2bNmChx56CB06dMCePXvw5ptv4vDhw1i7dm2DjrN48WK8+OKLeP311zFq1Kgat7nS+Zg3bx5uv/123HfffTCZTPj0009xzz334Msvv8SQIUOk7WbMmIHp06ejV69emDlzJvz8/PDrr79iw4YNuPnmmwHY+skefPBBdOrUCc899xxCQ0Oxc+dOfPvtt1L57Oe+R48eSEtLw4ULFzBv3jxs3bq12mcaGRmJN998EwBw+vRpzJs3D4MHD8apU6dq/OzJBQS5zeLFiwUAsX79enHx4kVx6tQp8emnn4qIiAhhMBjE6dOnhRBClJeXC4vF4vDarKwsodfrxcyZM6XHPvzwQwFAvPHGG9WOZbVapdcBEHPmzKm2TadOnURqaqp0f+PGjQKAaNasmSgsLJQe//zzzwUAMW/ePGnfbdu2Fbfccot0HCGEKC0tFa1btxY33XRTtWP16tVLdO7cWbp/8eJFAUBMmzZNeuz48eNCp9OJl156yeG1e/bsET4+PtUez8zMFADERx99JD02bdo0cfmf9ebNmwUAkZ6e7vDab7/9ttrjLVu2FEOGDKlW9kcffVRU/adSteyTJ08W0dHRIjk52eGcLlu2TGi1WrF582aH1y9cuFAAEFu3bq12vMulpqZK+/vqq6+Ej4+PeOqpp2rctj7nQwjb53Q5k8kkOnfuLAYMGOCwL61WK+66665qf4v2zzw/P18EBQWJ6667TpSVldW4jclkEtHR0aJz584O23z55ZcCgJg6dar02JgxY0TLli0d9vPf//5XABC//fZbje+ZnI/NRzK48cYbERUVhRYtWmDEiBEIDAzEmjVr0KxZMwCAXq+HVmv7aCwWC3JzcxEYGIh27drhjz/+kPazatUqREZGYtKkSdWOUbXJoCEeeOABBAUFSfeHDRuGuLg4fP311wCAjIwMZGZmYtSoUcjNzUVOTg5ycnJQUlKCv/3tb/jf//4Hq9XqsM/y8nL4+/vXedzVq1fDarVi+PDh0j5zcnIQGxuLtm3bYuPGjQ7bm0wmALbzVZsVK1YgJCQEN910k8M+k5OTERgYWG2flZWVDtvl5OSgvLy8znKfOXMG8+fPx5QpUxAYGFjt+B06dED79u0d9mlvMqx6/Nr89ttvGD58OIYOHYo5c+bUuE19zgcAqbYHAHl5eSgoKEBKSorD39batWthtVoxdepU6W/Rzv639cMPP6CoqAjPPvtstc/Wvs3vv/+O7OxsPPLIIw7bDBkyBO3bt8dXX33l8Dqr1Sqdo4yMDCxduhRxcXHo0KFDne+JnIfNRzJYsGABkpKS4OPjg5iYGLRr187hH57VasW8efPwzjvvICsrCxaLRXrO3sQE2Jqd2rVrBx8f536Mbdu2dbiv0WiQmJiI48ePAwAyMzMBAGPGjKl1HwUFBQgLC5Pu5+TkVNtvVZmZmRBC1Lpd1Wae/Px8AKj2RVx1nwUFBYiOjq7x+ezsbIf733//PaKiouosZ1XTpk1DfHw8JkyYUK1tPjMzEwcOHKh1n1WPX5MzZ85gyJAhKCkpQW5ubq2BX5/zAQBffvklZs+ejYyMDId+jcv3e/ToUWi1WnTs2LHW/dibPTt37lzrNidOnAAAtGvXrtpz7du3x5YtWxweO3XqlMO5iouLw6pVq674nsh5GAoy6NmzpzT6qCYvv/wypkyZggcffBCzZs1CeHg4tFotHn/88Wr/A5eDvQxz5szBNddcU+M2l/8jNplMOHfuHG666aYr7lej0eCbb76BTqerc58AcP78eQBAbGxsnfuMjo5Genp6jc9X/bK+7rrrMHv2bIfH3n77baxbt67G1x84cABLlizB8uXLa+ybsFqt6NKlC954440aX9+iRYtay2535MgRdO/eHW+++SZGjx6Njz76qMZArs/52Lx5M26//Xb07dsX77zzDuLi4uDr64vFixdX6xyWQ0xMDJYvXw7A9h+LDz/8EAMHDsSWLVvQpUsXmUvnHRgKHmjlypXo378/PvjgA4fH8/PzERkZKd1PSEjAr7/+isrKSqd0ltrZawJ2QggcOXIEXbt2lY4LAMHBwbjxxhuvuL9du3ahsrKyziC071cIgdatWyMpKemK+92/fz80Gk2N/wu9fJ/r169H7969HZpNahMZGVntPdXVGfzcc8/hmmuuwb333lvr8Xft2oW//e1vjW7SszfdxcTEYN26dXjqqacwePDgaoFWn/OxatUq+Pv747vvvnNoZlq8eHG1clutVuzfv7/W4Lf/HezduxeJiYk1btOyZUsAwKFDh6QmM7tDhw5Jz9v5+/s7nP/bb78d4eHhePvtt7Fo0aJa3xc5D/sUPJBOp4MQwuGxFStWVBvCN3ToUOTk5ODtt9+uto+qr2+IpUuXoqioSLq/cuVKnDt3DoMGDQIAJCcnIyEhAa+99hqKi4urvf7ixYvVyq7T6Woc7nm5u+++GzqdDjNmzKhWfiEEcnNzpftmsxmrVq1Cz54962xaGD58OCwWC2bNmlXtObPZLDW5NMa2bduwbt06/Oc//6n1C3/48OE4c+YM3nvvvWrPlZWVoaSk5IrHSUpKQkxMDABg/vz5sFqt+Ne//uWwTX3Ph06ng0ajcWiSPH78eLXgu/POO6HVajFz5sxqtVP7Z3PzzTcjKCgIaWlp1fp
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def show_distribution(df: Series, column_name=\"\") -> None:\n",
" plt.pie(\n",
" df.value_counts(),\n",
" labels=class_counts.index,\n",
" autopct='%1.1f%%',\n",
" colors=['lightblue', 'pink'],\n",
" startangle=45,\n",
" explode=(0, 0.05)\n",
" )\n",
" plt.title(\"Распределение классов\" + (f\"\\n\\\"{column_name}\\\"\" if column_name else \"\"))\n",
" plt.show()\n",
"\n",
"show_distribution(Y_train_samplied, column_name=target_column)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Обучение модели"
]
},
{
"cell_type": "code",
"execution_count": 356,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 352,
"metadata": {},
"outputs": [],
"source": [
"model = RandomForestClassifier()\n",
"\n",
"start_time = time.time()\n",
"\n",
"model.fit(X_train, Y_train)\n",
"\n",
"train_time = time.time() - start_time"
]
},
{
"cell_type": "code",
"execution_count": 353,
"metadata": {},
"outputs": [],
"source": [
"Y_pred = model.predict(X_test)\n",
"Y_pred_proba = model.predict_proba(X_test)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 360,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Время обучения модели: 51.06 секунд\n",
"ROC-AUC: 0.87\n",
"F1-Score: 0.23\n",
"Матрица ошибок:\n",
"[[23151 108]\n",
" [ 1155 188]]\n",
"Отчет по классификации:\n",
" precision recall f1-score support\n",
"\n",
" False 0.95 1.00 0.97 23259\n",
" True 0.64 0.14 0.23 1343\n",
"\n",
" accuracy 0.95 24602\n",
" macro avg 0.79 0.57 0.60 24602\n",
"weighted avg 0.94 0.95 0.93 24602\n",
"\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlwAAAJwCAYAAABccr/9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABieElEQVR4nO3dd3wU1f7/8fcGSIGQhBZC6ErvTZpU4RqKBUGqV7oIAkoRAekioIBSRFEuQkBFmggKinCpV4j0UAMigkgJPUAgPfv7g1/2y5oACWRmQvb19LEPs2fOzHxmQ8iHzzlzxma32+0CAACAYdysDgAAACCzI+ECAAAwGAkXAACAwUi4AAAADEbCBQAAYDASLgAAAIORcAEAABiMhAsAAMBgWa0OAACsEhsbq6tXryoxMVGBgYFWhwMgE6PCBcCl7N69W506dVLevHnl4eGhAgUKqE2bNlaHBSCTI+EC/iE4OFg2m002m02//vprsu12u12FCxeWzWbTc889Z0GEeFirVq1SvXr1dOTIEU2YMEHr16/X+vXr9cUXX1gdGoBMjiFF4B48PT21aNEi1atXz6l9y5YtOnPmjDw8PCyKDA/j6tWr6tmzp4KCgrRs2TK5u7tbHRIAF0KFC7iHFi1aaNmyZYqPj3dqX7RokapXr66AgACLIsPDmD9/vqKjoxUcHEyyBcB0JFzAPXTs2FFXrlzR+vXrHW2xsbFavny5OnXqlOI+U6dOVd26dZUnTx55eXmpevXqWr58uVOfpOHKe70aNWokSdq8ebNsNpuWLFmid999VwEBAcqRI4deeOEF/f33307HbNSokWO/JLt27XIc85/n79evX7LYn3vuORUrVsyp7cCBA+rataueeOIJeXp6KiAgQN27d9eVK1fu99E5XLx4UT169FD+/Pnl6empypUra8GCBU59Tp06JZvNpqlTpzq1V6hQIdk1jRw5UjabTZGRkU7XM3bsWKd+U6ZMcfosJem3335TlSpVNHHiRBUuXFgeHh4qWbKkPvjgAyUmJjrtHx8fr/Hjx+vJJ5+Uh4eHihUrpnfffVcxMTFO/YoVK6auXbs6tfXq1Uuenp7avHnzgz8gAC6DIUXgHooVK6Y6dero22+/VfPmzSVJP//8s65fv64OHTpo5syZyfaZMWOGXnjhBb3yyiuKjY3V4sWL1bZtW61evVotW7aUJH311VeO/v/73/80Z84cTZs2TXnz5pUk5c+f3+mYEyZMkM1m09ChQ3Xx4kVNnz5dTZs2VWhoqLy8vO4Z/9ChQx/5M1i/fr3+/PNPdevWTQEBATp8+LDmzJmjw4cP67fffkuWzN0tKipKjRo10h9//KF+/fqpePHiWrZsmbp27aqIiAi99dZbjxxfSiIiIjRp0qRk7VeuXNGvv/6qX3/9Vd27d1f16tW1YcMGDR8+XKdOndLnn3/u6NuzZ08tWLBAL7/8sgYPHqwdO3Zo0qRJCgsL0/fff3/Pc48ZM0ZffvmllixZkixZBODi7ACczJ8/3y7JvmvXLvusWbPsOXPmtN++fdtut9vtbdu2tTdu3Nhut9vtRYsWtbds2dJp36R+SWJjY+0VKlSwP/PMM/c918mTJ5Nt27Rpk12SvWDBgvYbN2442pcuXWqXZJ8xY4ajrWHDhvaGDRs63v/00092SfZmzZrZ//ljLsnet2/fZOdr2bKlvWjRove9Hrvdbv/222/tkuxbt25N8ZqSTJ8+3S7J/vXXXzvaYmNj7XXq1LF7e3s7runkyZN2SfYpU6Y47V++fHmna7Lb7fYRI0bYJdlv3rzpdD1jxoxxvH/nnXfs/v7+9urVqzvt37BhQ7sk+9ixY52O2bVrV7sk+8GDB+12u90eGhpql2Tv2bOnU7+3337bLsm+ceNGR1vRokXtXbp0sdvtdvsXX3xhl2T/5JNP7vu5AHBNDCkC99GuXTtFRUVp9erVunnzplavXn3P4URJThWna9eu6fr166pfv7727t370DF07txZOXPmdLx/+eWXVaBAAf30008p9rfb7Ro+fLjatGmjWrVqPfR5JefriY6O1uXLl1W7dm1JeuA1/fTTTwoICFDHjh0dbdmyZdObb76pyMhIbdmy5ZFiS8nZs2f1ySefaNSoUfL29k62PUuWLBo4cKBT2+DBgyVJa9asccQtSYMGDbpvv7utWrVKb7zxhoYMGZLicC0AkHAB95EvXz41bdpUixYt0ooVK5SQkKCXX375nv1Xr16t2rVry9PTU7lz51a+fPk0e/ZsXb9+/aFjKFmypNN7m82mEiVK6NSpUyn2/+abb3T48GFNnDjxoc+Z5OrVq3rrrbeUP39+eXl5KV++fCpevLgkPfCa/vrrL5UsWVJubs5/zZQtW9axPb2NGTNGgYGBev3115Nts9lsCgwMlI+Pj1N76dKl5ebm5vg8//rrL7m5ualEiRJO/QICAuTn55cs7tDQUHXs2FEJCQm6evVq+l4QgEyDOVzAA3Tq1EmvvfaawsPD1bx5c/n5+aXY73//+59eeOEFNWjQQJ999pkKFCigbNmyaf78+Vq0aJEpscbGxmrUqFHq0aOHSpUq9cjHa9eunbZv364hQ4aoSpUq8vb2VmJiopo1a5ZsornVwsLCFBwcrK+//lrZsmVLtv1+891Scr/5aXfbv3+/mjdvriZNmmjIkCH697//zfwtAMmQcAEP8NJLL+n111/Xb7/9piVLltyz33fffSdPT0/98ssvTmt0zZ8//5HOf/z4caf3drtdf/zxhypVqpSs72effaaLFy8mu2vvYVy7dk0bNmzQuHHjNHr06HvGcy9FixbVgQMHlJiY6FTlOnr0qGN7eho+fLiqVKmi9u3bp7i9ePHiWrdunW7evOk0RPv7778rMTHRcYdm0aJFlZiYqOPHjzuqcZJ04cIFRUREJIu7YsWKWrZsmby8vLRs2TL16tVLBw4ckKenZ7peH4DHG0OKwAN4e3tr9uzZGjt2rJ5//vl79suSJYtsNpsSEhIcbadOndLKlSsf6fwLFy7UzZs3He+XL1+u8+fPO+6cTHLz5k1NmDBBAwcOTJc1wrJkySLpToJ3t+nTp6dq/xYtWig8PNwpSY2Pj9cnn3wib29vNWzY8JFjTBISEqJVq1bpgw8+uGdlqkWLFkpISNCsWbOc2j/++GNJctxF2qJFC0nJr/Of/ZJUq1ZNOXLkkJubm+bOnatTp07pvffee+RrApC5UOECUqFLly4P7NOyZUt9/PHHatasmTp16qSLFy/q008/VYkSJXTgwIGHPnfu3LlVr149devWTRcuXND06dNVokQJvfbaa0799u7dq7x58+qdd9554DFPnz6ttWvXOrVdunRJUVFRWrt2rRo2bCgfHx81aNBAkydPVlxcnAoWLKh169bp5MmTqYq7V69e+uKLL9S1a1ft2bNHxYoV0/Lly7Vt2zZNnz7dqcokSceOHXOKKTIyUm5ubk5tf/75Z4rnWrdunf71r3+padOm94ynRYsWatq0qUaMGKGTJ0+qSpUq2rhxo7777jv17t1bFSpUkCRVrlxZXbp00Zw5cxQREaGGDRtq586dWrBggVq1aqXGjRvf8xwVKlTQ0KFD9cEHH6hDhw4pViEBuCiL75IEMpy7l4W4n5SWhfjyyy/tJUuWtHt4eNjLlCljnz9/vn3MmDHJlmb457nutyzEt99+ax8+fLjd39/f7uXlZW/ZsqX9r7/+cuqbtOTBtGnTnNpTOrekB76S4jlz5oz9pZdesvv5+dl9fX3tbdu2tZ87dy7ZUgz3cuHCBXu3bt3sefPmtbu7u9srVqxonz9/vlOfpGUh0vL657IQNpvNvmfPnmSfyT+XlYiMjLQPHDjQHhgYaM+WLZu9RIkS9g8++MCekJDg1C8uLs4+btw4e/Hixe3ZsmWzFy5c2D58+HB7dHS0U7+7l4VIEh0dbS9Tpoz9qaeessfHxz/wMwLgGmx2+z/GCwBkCJs3b1bjxo21bNmy+94ZmZ5OnTql4sWL6+TJk8lWnQcAPDzmcAEAABiMhAuAg5eXl4KCgtK8hAIA4P6YNA/AIX/+/Mkm0wMAHh1zuAA
"text/plain": [
"<Figure size 700x700 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Метрики\n",
"roc_auc = roc_auc_score(Y_test, Y_pred_proba)\n",
"f1 = f1_score(Y_test, Y_pred)\n",
"\n",
"conf_matrix = confusion_matrix(Y_test, Y_pred)\n",
"class_report = classification_report(Y_test, Y_pred)\n",
"\n",
"# Вывод результатов\n",
"print(f'Время обучения модели: {train_time:.2f} секунд')\n",
"print(f'ROC-AUC: {roc_auc:.2f}')\n",
"print(f'F1-Score: {f1:.2f}')\n",
"print('Матрица ошибок:')\n",
"print(conf_matrix)\n",
"print('Отчет по классификации:')\n",
"print(class_report)\n",
"\n",
"# Визуализация матрицы ошибок\n",
"plt.figure(figsize=(7, 7))\n",
"sns.heatmap(\n",
" conf_matrix,\n",
" annot=True,\n",
" fmt='d',\n",
" cmap='Blues',\n",
" xticklabels=['Нет приступа', 'Был приступ'],\n",
" yticklabels=['Нет приступа', 'Был приступ']\n",
")\n",
"plt.title('Матрица ошибок')\n",
"plt.xlabel('Предсказанный класс')\n",
"plt.ylabel('Истинный класс')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}