1550 lines
246 KiB
Plaintext
1550 lines
246 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Вариант 2. Показатели сердечных заболеваний"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 228,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from typing import Any\n",
|
|||
|
"from math import ceil\n",
|
|||
|
"\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"from pandas import DataFrame, Series\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import ADASYN, SMOTE\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"import matplotlib.pyplot as plt"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Загрузим данные из датасета"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 229,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv('csv\\\\heart_2022_no_nans.csv')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Посмотрим общие сведения о датасете"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 230,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 246022 entries, 0 to 246021\n",
|
|||
|
"Data columns (total 40 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 State 246022 non-null object \n",
|
|||
|
" 1 Sex 246022 non-null object \n",
|
|||
|
" 2 GeneralHealth 246022 non-null object \n",
|
|||
|
" 3 PhysicalHealthDays 246022 non-null float64\n",
|
|||
|
" 4 MentalHealthDays 246022 non-null float64\n",
|
|||
|
" 5 LastCheckupTime 246022 non-null object \n",
|
|||
|
" 6 PhysicalActivities 246022 non-null object \n",
|
|||
|
" 7 SleepHours 246022 non-null float64\n",
|
|||
|
" 8 RemovedTeeth 246022 non-null object \n",
|
|||
|
" 9 HadHeartAttack 246022 non-null object \n",
|
|||
|
" 10 HadAngina 246022 non-null object \n",
|
|||
|
" 11 HadStroke 246022 non-null object \n",
|
|||
|
" 12 HadAsthma 246022 non-null object \n",
|
|||
|
" 13 HadSkinCancer 246022 non-null object \n",
|
|||
|
" 14 HadCOPD 246022 non-null object \n",
|
|||
|
" 15 HadDepressiveDisorder 246022 non-null object \n",
|
|||
|
" 16 HadKidneyDisease 246022 non-null object \n",
|
|||
|
" 17 HadArthritis 246022 non-null object \n",
|
|||
|
" 18 HadDiabetes 246022 non-null object \n",
|
|||
|
" 19 DeafOrHardOfHearing 246022 non-null object \n",
|
|||
|
" 20 BlindOrVisionDifficulty 246022 non-null object \n",
|
|||
|
" 21 DifficultyConcentrating 246022 non-null object \n",
|
|||
|
" 22 DifficultyWalking 246022 non-null object \n",
|
|||
|
" 23 DifficultyDressingBathing 246022 non-null object \n",
|
|||
|
" 24 DifficultyErrands 246022 non-null object \n",
|
|||
|
" 25 SmokerStatus 246022 non-null object \n",
|
|||
|
" 26 ECigaretteUsage 246022 non-null object \n",
|
|||
|
" 27 ChestScan 246022 non-null object \n",
|
|||
|
" 28 RaceEthnicityCategory 246022 non-null object \n",
|
|||
|
" 29 AgeCategory 246022 non-null object \n",
|
|||
|
" 30 HeightInMeters 246022 non-null float64\n",
|
|||
|
" 31 WeightInKilograms 246022 non-null float64\n",
|
|||
|
" 32 BMI 246022 non-null float64\n",
|
|||
|
" 33 AlcoholDrinkers 246022 non-null object \n",
|
|||
|
" 34 HIVTesting 246022 non-null object \n",
|
|||
|
" 35 FluVaxLast12 246022 non-null object \n",
|
|||
|
" 36 PneumoVaxEver 246022 non-null object \n",
|
|||
|
" 37 TetanusLast10Tdap 246022 non-null object \n",
|
|||
|
" 38 HighRiskLastYear 246022 non-null object \n",
|
|||
|
" 39 CovidPos 246022 non-null object \n",
|
|||
|
"dtypes: float64(6), object(34)\n",
|
|||
|
"memory usage: 75.1+ MB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>PhysicalHealthDays</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>4.119026</td>\n",
|
|||
|
" <td>8.405844</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" <td>30.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>MentalHealthDays</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>4.167140</td>\n",
|
|||
|
" <td>8.102687</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>4.00</td>\n",
|
|||
|
" <td>30.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>SleepHours</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>7.021331</td>\n",
|
|||
|
" <td>1.440681</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>6.00</td>\n",
|
|||
|
" <td>7.00</td>\n",
|
|||
|
" <td>8.00</td>\n",
|
|||
|
" <td>24.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>HeightInMeters</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>1.705150</td>\n",
|
|||
|
" <td>0.106654</td>\n",
|
|||
|
" <td>0.91</td>\n",
|
|||
|
" <td>1.63</td>\n",
|
|||
|
" <td>1.70</td>\n",
|
|||
|
" <td>1.78</td>\n",
|
|||
|
" <td>2.41</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>WeightInKilograms</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>83.615179</td>\n",
|
|||
|
" <td>21.323156</td>\n",
|
|||
|
" <td>28.12</td>\n",
|
|||
|
" <td>68.04</td>\n",
|
|||
|
" <td>81.65</td>\n",
|
|||
|
" <td>95.25</td>\n",
|
|||
|
" <td>292.57</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>BMI</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>28.668136</td>\n",
|
|||
|
" <td>6.513973</td>\n",
|
|||
|
" <td>12.02</td>\n",
|
|||
|
" <td>24.27</td>\n",
|
|||
|
" <td>27.46</td>\n",
|
|||
|
" <td>31.89</td>\n",
|
|||
|
" <td>97.65</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" count mean std min 25% 50% \\\n",
|
|||
|
"PhysicalHealthDays 246022.0 4.119026 8.405844 0.00 0.00 0.00 \n",
|
|||
|
"MentalHealthDays 246022.0 4.167140 8.102687 0.00 0.00 0.00 \n",
|
|||
|
"SleepHours 246022.0 7.021331 1.440681 1.00 6.00 7.00 \n",
|
|||
|
"HeightInMeters 246022.0 1.705150 0.106654 0.91 1.63 1.70 \n",
|
|||
|
"WeightInKilograms 246022.0 83.615179 21.323156 28.12 68.04 81.65 \n",
|
|||
|
"BMI 246022.0 28.668136 6.513973 12.02 24.27 27.46 \n",
|
|||
|
"\n",
|
|||
|
" 75% max \n",
|
|||
|
"PhysicalHealthDays 3.00 30.00 \n",
|
|||
|
"MentalHealthDays 4.00 30.00 \n",
|
|||
|
"SleepHours 8.00 24.00 \n",
|
|||
|
"HeightInMeters 1.78 2.41 \n",
|
|||
|
"WeightInKilograms 95.25 292.57 \n",
|
|||
|
"BMI 31.89 97.65 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 230,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.describe().transpose()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Получим информацию о пустых значениях в колонках датасета"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 231,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def get_null_columns_info(df: DataFrame) -> DataFrame:\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Возвращает информацию о пропущенных значениях в колонках датасета\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" w = []\n",
|
|||
|
" df_len = len(df)\n",
|
|||
|
"\n",
|
|||
|
" for column in df.columns:\n",
|
|||
|
" column_nulls = df[column].isnull()\n",
|
|||
|
" w.append([column, column_nulls.any(), column_nulls.sum() / df_len])\n",
|
|||
|
"\n",
|
|||
|
" null_df = DataFrame(w).rename(columns={0: \"Column\", 1: \"Has Null\", 2: \"Null Percent\"})\n",
|
|||
|
"\n",
|
|||
|
" return null_df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 232,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Column</th>\n",
|
|||
|
" <th>Has Null</th>\n",
|
|||
|
" <th>Null Percent</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>State</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>Sex</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>GeneralHealth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>PhysicalHealthDays</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>MentalHealthDays</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>LastCheckupTime</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>PhysicalActivities</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>SleepHours</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>RemovedTeeth</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>HadHeartAttack</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>HadAngina</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>11</th>\n",
|
|||
|
" <td>HadStroke</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12</th>\n",
|
|||
|
" <td>HadAsthma</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>13</th>\n",
|
|||
|
" <td>HadSkinCancer</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>HadCOPD</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>HadDepressiveDisorder</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16</th>\n",
|
|||
|
" <td>HadKidneyDisease</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17</th>\n",
|
|||
|
" <td>HadArthritis</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18</th>\n",
|
|||
|
" <td>HadDiabetes</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>19</th>\n",
|
|||
|
" <td>DeafOrHardOfHearing</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>20</th>\n",
|
|||
|
" <td>BlindOrVisionDifficulty</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>21</th>\n",
|
|||
|
" <td>DifficultyConcentrating</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>22</th>\n",
|
|||
|
" <td>DifficultyWalking</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>23</th>\n",
|
|||
|
" <td>DifficultyDressingBathing</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>24</th>\n",
|
|||
|
" <td>DifficultyErrands</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25</th>\n",
|
|||
|
" <td>SmokerStatus</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>26</th>\n",
|
|||
|
" <td>ECigaretteUsage</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>27</th>\n",
|
|||
|
" <td>ChestScan</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28</th>\n",
|
|||
|
" <td>RaceEthnicityCategory</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>29</th>\n",
|
|||
|
" <td>AgeCategory</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>30</th>\n",
|
|||
|
" <td>HeightInMeters</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>31</th>\n",
|
|||
|
" <td>WeightInKilograms</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>32</th>\n",
|
|||
|
" <td>BMI</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>33</th>\n",
|
|||
|
" <td>AlcoholDrinkers</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>34</th>\n",
|
|||
|
" <td>HIVTesting</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>35</th>\n",
|
|||
|
" <td>FluVaxLast12</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>36</th>\n",
|
|||
|
" <td>PneumoVaxEver</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>37</th>\n",
|
|||
|
" <td>TetanusLast10Tdap</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>38</th>\n",
|
|||
|
" <td>HighRiskLastYear</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>39</th>\n",
|
|||
|
" <td>CovidPos</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Column Has Null Null Percent\n",
|
|||
|
"0 State False 0.0\n",
|
|||
|
"1 Sex False 0.0\n",
|
|||
|
"2 GeneralHealth False 0.0\n",
|
|||
|
"3 PhysicalHealthDays False 0.0\n",
|
|||
|
"4 MentalHealthDays False 0.0\n",
|
|||
|
"5 LastCheckupTime False 0.0\n",
|
|||
|
"6 PhysicalActivities False 0.0\n",
|
|||
|
"7 SleepHours False 0.0\n",
|
|||
|
"8 RemovedTeeth False 0.0\n",
|
|||
|
"9 HadHeartAttack False 0.0\n",
|
|||
|
"10 HadAngina False 0.0\n",
|
|||
|
"11 HadStroke False 0.0\n",
|
|||
|
"12 HadAsthma False 0.0\n",
|
|||
|
"13 HadSkinCancer False 0.0\n",
|
|||
|
"14 HadCOPD False 0.0\n",
|
|||
|
"15 HadDepressiveDisorder False 0.0\n",
|
|||
|
"16 HadKidneyDisease False 0.0\n",
|
|||
|
"17 HadArthritis False 0.0\n",
|
|||
|
"18 HadDiabetes False 0.0\n",
|
|||
|
"19 DeafOrHardOfHearing False 0.0\n",
|
|||
|
"20 BlindOrVisionDifficulty False 0.0\n",
|
|||
|
"21 DifficultyConcentrating False 0.0\n",
|
|||
|
"22 DifficultyWalking False 0.0\n",
|
|||
|
"23 DifficultyDressingBathing False 0.0\n",
|
|||
|
"24 DifficultyErrands False 0.0\n",
|
|||
|
"25 SmokerStatus False 0.0\n",
|
|||
|
"26 ECigaretteUsage False 0.0\n",
|
|||
|
"27 ChestScan False 0.0\n",
|
|||
|
"28 RaceEthnicityCategory False 0.0\n",
|
|||
|
"29 AgeCategory False 0.0\n",
|
|||
|
"30 HeightInMeters False 0.0\n",
|
|||
|
"31 WeightInKilograms False 0.0\n",
|
|||
|
"32 BMI False 0.0\n",
|
|||
|
"33 AlcoholDrinkers False 0.0\n",
|
|||
|
"34 HIVTesting False 0.0\n",
|
|||
|
"35 FluVaxLast12 False 0.0\n",
|
|||
|
"36 PneumoVaxEver False 0.0\n",
|
|||
|
"37 TetanusLast10Tdap False 0.0\n",
|
|||
|
"38 HighRiskLastYear False 0.0\n",
|
|||
|
"39 CovidPos False 0.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 232,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"get_null_columns_info(df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Получим информацию о выбросах"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 233,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def get_numeric_columns(df: DataFrame) -> list[str]:\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Возвращает список числовых колонок\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" return list(filter(lambda column: pd.api.types.is_numeric_dtype(df[column]), df.columns))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 234,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def get_outliers_info(df: DataFrame) -> DataFrame:\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Возаращает информацию о выбросах в числовых колонках датасета\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" data = {\n",
|
|||
|
" \"Column\": [],\n",
|
|||
|
" \"Has Outliers\": [],\n",
|
|||
|
" \"Outliers Count\": [],\n",
|
|||
|
" \"Min Value\": [],\n",
|
|||
|
" \"Max Value\": [],\n",
|
|||
|
" \"Q1\": [],\n",
|
|||
|
" \"Q3\": []\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" info = DataFrame(data)\n",
|
|||
|
"\n",
|
|||
|
" for column in get_numeric_columns(df):\n",
|
|||
|
" Q1: float = df[column].quantile(0.25)\n",
|
|||
|
" Q3: float = df[column].quantile(0.75)\n",
|
|||
|
" IQR: float = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
" lower_bound: float = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound: float = Q3 + 1.5 * IQR\n",
|
|||
|
"\n",
|
|||
|
" outliers: DataFrame = df[(df[column] < lower_bound) | (df[column] > upper_bound)]\n",
|
|||
|
" outlier_count: int = outliers.shape[0]\n",
|
|||
|
"\n",
|
|||
|
" info.loc[len(info)] = [column, outlier_count > 0, outlier_count, df[column].min(), df[column].max(), Q1, Q3]\n",
|
|||
|
"\n",
|
|||
|
" return info"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Посмотрим данные по выбросам"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 235,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Column</th>\n",
|
|||
|
" <th>Has Outliers</th>\n",
|
|||
|
" <th>Outliers Count</th>\n",
|
|||
|
" <th>Min Value</th>\n",
|
|||
|
" <th>Max Value</th>\n",
|
|||
|
" <th>Q1</th>\n",
|
|||
|
" <th>Q3</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>PhysicalHealthDays</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>38810</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>30.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>MentalHealthDays</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>32714</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>30.00</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>4.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>SleepHours</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>3488</td>\n",
|
|||
|
" <td>1.00</td>\n",
|
|||
|
" <td>24.00</td>\n",
|
|||
|
" <td>6.00</td>\n",
|
|||
|
" <td>8.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>HeightInMeters</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>830</td>\n",
|
|||
|
" <td>0.91</td>\n",
|
|||
|
" <td>2.41</td>\n",
|
|||
|
" <td>1.63</td>\n",
|
|||
|
" <td>1.78</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>WeightInKilograms</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>5940</td>\n",
|
|||
|
" <td>28.12</td>\n",
|
|||
|
" <td>292.57</td>\n",
|
|||
|
" <td>68.04</td>\n",
|
|||
|
" <td>95.25</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>BMI</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>7563</td>\n",
|
|||
|
" <td>12.02</td>\n",
|
|||
|
" <td>97.65</td>\n",
|
|||
|
" <td>24.27</td>\n",
|
|||
|
" <td>31.89</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Column Has Outliers Outliers Count Min Value Max Value \\\n",
|
|||
|
"0 PhysicalHealthDays True 38810 0.00 30.00 \n",
|
|||
|
"1 MentalHealthDays True 32714 0.00 30.00 \n",
|
|||
|
"2 SleepHours True 3488 1.00 24.00 \n",
|
|||
|
"3 HeightInMeters True 830 0.91 2.41 \n",
|
|||
|
"4 WeightInKilograms True 5940 28.12 292.57 \n",
|
|||
|
"5 BMI True 7563 12.02 97.65 \n",
|
|||
|
"\n",
|
|||
|
" Q1 Q3 \n",
|
|||
|
"0 0.00 3.00 \n",
|
|||
|
"1 0.00 4.00 \n",
|
|||
|
"2 6.00 8.00 \n",
|
|||
|
"3 1.63 1.78 \n",
|
|||
|
"4 68.04 95.25 \n",
|
|||
|
"5 24.27 31.89 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 235,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"outliers_info = get_outliers_info(df)\n",
|
|||
|
"outliers_info"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 236,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def visualize_outliers(df: DataFrame) -> None:\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Генерирует диаграммы BoxPlot для числовых колонок датасета\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" columns = get_numeric_columns(df)\n",
|
|||
|
" plt.figure(figsize=(15, 10))\n",
|
|||
|
" rows: int = ceil(len(columns) / 3)\n",
|
|||
|
" for index, column in enumerate(columns, 1):\n",
|
|||
|
" plt.subplot(rows, 3, index)\n",
|
|||
|
" plt.boxplot(df[column], vert=True, patch_artist=True)\n",
|
|||
|
" plt.title(f\"Диаграмма размахов\\n\\\"{column}\\\"\")\n",
|
|||
|
" plt.xlabel(column)\n",
|
|||
|
" \n",
|
|||
|
" plt.tight_layout()\n",
|
|||
|
" plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Визуализируем выбросы с помощью диаграмм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 237,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdIAAAPeCAYAAAAI5OjmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wU1f7/8fcmgTSS0BMiCVVpoiAqvShIiBRRkKrSrgUpAraLStUrYgMrlnsFpIgNUVApIsWCKCiiCEgVlN4SWkLJ+f3BL/PNJJtNQsrsLq/n47EPMzNnZ86smnfms2fOuIwxRgAAAAAAAAAAwK0ApzsAAAAAAAAAAIA3o5AOAAAAAAAAAIAHFNIBAAAAAAAAAPCAQjoAAAAAAAAAAB5QSAcAAAAAAAAAwAMK6QAAAAAAAAAAeEAhHQAAAAAAAAAADyikAwAAAAAAAADgAYV0AAAAAAAAAAA8oJAOAAAAAAAAAIAHFNLhsz766CO5XC63ryuvvNLp7gEAADfIbwAAfA/5DQBSkNMdAPLrscceU61atazl//znPw72BgAA5Ab5DQCA7yG/AVzKKKTD5910001q1aqVtfzf//5Xhw4dcq5DAAAgR+Q3AAC+h/wGcCljahf4rDNnzkiSAgJy/s942rRpcrlc2rlzp7UuLS1NV111lVwul6ZNm2atX79+vfr27auqVasqJCREMTEx6t+/vw4fPmzb59ixY93e1hYU9H/fT7Vq1UpXXnml1q5dqyZNmig0NFRVqlTRG2+8keVcRo8erQYNGigqKkrh4eFq3ry5li1bZmu3c+dO6zjz5s2zbUtJSVGpUqXkcrn0/PPPZ+ln+fLldfbsWdt73nvvPWt/Gf/4+fTTT9W+fXvFxsYqODhY1apV05NPPqnz58/n+FmnH2/Tpk3q1q2bIiMjVaZMGT3wwANKSUmxtZ06dapuvPFGlS9fXsHBwapdu7amTJmSZZ+33HKLKleurJCQEJUvX16dOnXSb7/9ZmuTfh6TJ0/O8v6aNWvK5XJp8ODB1rojR47ooYceUt26dVWiRAlFRkYqMTFRv/76q+29ffr0UUhIiDZu3Ghbn5CQoFKlSmnPnj3Wuu3bt+v2229X6dKlFRYWpkaNGunzzz+3vW/58uW2/16Cg4N1xRVXaMKECTLGeP5wAcAPkN/zbNvIb/IbAHwB+T3Pto38Jr9xaWJEOnxWepAHBwdf1PtnzJiRJQwkacmSJdq+fbv69eunmJgYbdiwQW+99ZY2bNigH374QS6Xy9Z+ypQpKlGihLWc+Q+Lo0eP6uabb1a3bt3Us2dPffDBBxo4cKCKFy+u/v37S5KSk5P13//+Vz179tTdd9+t48eP63//+58SEhL0448/ql69erZ9hoSEaOrUqercubO1bu7cuVmCMqPjx49rwYIFuvXWW611U6dOVUhISJb3TZs2TSVKlNCIESNUokQJff311xo9erSSk5P13HPPZXuMjLp166bKlStrwoQJ+uGHH/Tyyy/r6NGjevfdd22fXZ06ddSpUycFBQVp/vz5uv/++5WWlqZBgwbZ9nfPPfcoJiZGe/bs0auvvqo2bdpox44dCgsLy/K5DBs2zFr3/fff66+//srSv+3bt2vevHm6/fbbVaVKFe3fv19vvvmmWrZsqT/++EOxsbGSpJdeeklff/21+vTpo1WrVikwMFBvvvmmFi9erBkzZljt9u/fryZNmujUqVMaOnSoypQpo+nTp6tTp0766KOPbJ+79H+3RJ4+fVrvv/++HnvsMZUvX14DBgzI1ecLAL6K/Ca/yW8A8D3kN/lNfgOSDOCjJk+ebCSZX3/91ba+ZcuWpk6dOrZ1U6dONZLMjh07jDHGpKSkmPj4eJOYmGgkmalTp1ptT506leVY7733npFkVq5caa0bM2aMkWQOHjyYbR9btmxpJJkXXnjBWpeammrq1atnypcvb86cOWOMMebcuXMmNTXV9t6jR4+a6O
ho079/f2vdjh07jCTTs2dPExQUZPbt22dta926tenVq5eRZJ577rks/ezZs6fp0KGDtf6vv/4yAQEBpmfPnlnOw91ncO+995qwsDCTkpKS7flmPF6nTp1s6++///4s/77cHSchIcFUrVrV4zE++OADI8msWbPGWifJdO3a1QQFBdnWDxgwwPpcBg0aZK1PSUkx58+ft+13x44dJjg42IwfP962ftGiRUaSeeqpp8z27dtNiRIlTOfOnW1thg0bZiSZb775xlp3/PhxU6VKFVO5cmXrWMuWLTOSzLJly2x9CQgIMPfff7/H8wYAf0B+k9/kNwD4HvKb/Ca/AWOY2gU+K/1Wr3LlyuX5va+99poOHz6sMWPGZNkWGhpq/ZySkqJDhw6pUaNGkqSff/45z8cKCgrSvffeay0XL15c9957rw4cOKC1a9dKkgIDA1W8eHFJF255O3LkiM6dO6drr73W7TGvueYa1alTRzNmzJAk/fXXX1q2bJn69u2bbT/69++vhQsXat++fZKk6dOnq3HjxrriiiuytM34GRw/flyHDh1S8+bNderUKW3atClX5535G+0hQ4ZIkr744gu3x0lKStKhQ4fUsmVLbd++XUlJSbb3nzp1SocOHdK6dev09ttvKzo6Okvfo6Oj1b59e02dOtV6zwcffKB+/fpl6V9wcLA1euH8+fM6fPiwSpQooRo1amT5zNu2bat7771X48eP12233aaQkBC9+eabtjZffPGFrr/+ejVr1sxaV6JECd1zzz3auXOn/vjjD1v79PPdtWuXnn32WaWlpenGG29080kCgH8hv8lv8hsAfA/5TX6T3wBzpMOH/fXXXwoKCspzkCclJenpp5/WiBEjFB0dnWX7kSNH9MADDyg6OlqhoaEqV66cqlSpYr03r2JjYxUeHm5blx5AGeeMmz59uq666iqFhISoTJkyKleunD7//PNsj9mvXz8rsKZNm6YmTZro8ssvz7Yf9erV05VXXql3331XxhhNmzbNbcBJ0oYNG3TrrbcqKipKkZGRKleunO644w5Juf8MMvelWrVqCggIsJ3zd999pzZt2ig8PFwlS5ZUuXLl9Nhjj7k9zvjx41WuXDnVr19fO3fu1PLlyxUREZHluP369dPs2bOVmpqqDz/8UKVKlXIbkGlpaZo0aZIuv/xyBQcHq2zZsipXrpzWr1/v9hyff/55lS5dWuvWrdPLL7+s8uXL27b/9ddfqlGjRpb3pT/RPvPtbZ07d1a5cuVUqVIljR07Vk888YS6dOmS5f0A4G/Ib/Kb/AYA30N+k9/kN0AhHT5s8+bNqlq1qu3hIrkxceJEBQQE6OGHH3a7vVu3bnr77bd13333ae7cuVq8eLEWLlwo6cIv/8Iwc+ZM9e3bV9WqVdP//vc/LVy4UEuWLNGNN96Y7THvuOMObd26VT/88IOmT5+ebShn1L9/f02dOlUrVqzQvn371K1btyxtjh07ppYtW+rXX3/V+PHjNX/+fC1ZskQTJ06UdPGfQea57bZt26bWrVvr0KFDevHFF/X5559ryZIlGj58uNvj/Otf/9LixYv1zjvvKCQkRF26dHEbuO3bt1fx4sU1b948TZ06VX369HH7QJz0P+ZatGihmTNnatGiRVqyZInq1Knj9hx/+eUXHThwQJLczu2XV88//7yWLFmiL774QmPGjNHEiRM1bty4fO8XALwd+U1+k98A4HvIb/Kb/AZ42Ch8VGpqqtatW2d72Edu7NmzRy+99JImTJigiIiILE8CP3r0qJYuXapx48Zp9OjR1votW7ZcdF/37NmjkydP2r4V//PPPyVJlStXliR99NFHqlq1qubOnWsLPHe3vqUrU6aMOnXqZN2m1q1bN9uTv93p3bu3Hn74YT3wwAPq2rWr22+Uly9frsOHD2vu3Llq0aKFtX7Hjh25Ot90W7ZssUYSSNLWrVuVlpZmnfP8+fOVmpqqzz77TPHx8Va7zE9KT1e9enVVr15dkt
SmTRvFx8dr9uzZGjhwoK1dUFCQ7rzzTv3nP//Rhg0b9M4777jd30cffaQbbrhB//vf/2zrjx07prJly9rWnTx5Uv3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 6 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"visualize_outliers(df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 238,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def remove_outliers(df: DataFrame, columns: list[str]) -> DataFrame:\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" Устраняет выбросы в заданных колонках:\n",
|
|||
|
" задает значениям выше максимального значение максимума, ниже минимального - значение минимума\n",
|
|||
|
" \"\"\"\n",
|
|||
|
" for column in columns:\n",
|
|||
|
" Q1: float = df[column].quantile(0.25)\n",
|
|||
|
" Q3: float = df[column].quantile(0.75)\n",
|
|||
|
" IQR: float = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
" lower_bound: float = Q1 - 1.5 * IQR\n",
|
|||
|
" upper_bound: float = Q3 + 1.5 * IQR\n",
|
|||
|
"\n",
|
|||
|
" df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else upper_bound if x > upper_bound else x)\n",
|
|||
|
" \n",
|
|||
|
" return df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаляем выбросы"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 239,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"outliers_columns = list(outliers_info[outliers_info[\"Has Outliers\"] == True][\"Column\"])\n",
|
|||
|
"df = remove_outliers(df, outliers_columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Снова получим данные о выбросах"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 240,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Column</th>\n",
|
|||
|
" <th>Has Outliers</th>\n",
|
|||
|
" <th>Outliers Count</th>\n",
|
|||
|
" <th>Min Value</th>\n",
|
|||
|
" <th>Max Value</th>\n",
|
|||
|
" <th>Q1</th>\n",
|
|||
|
" <th>Q3</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>PhysicalHealthDays</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.000</td>\n",
|
|||
|
" <td>7.500</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>3.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>MentalHealthDays</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0.000</td>\n",
|
|||
|
" <td>10.000</td>\n",
|
|||
|
" <td>0.00</td>\n",
|
|||
|
" <td>4.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>SleepHours</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>3.000</td>\n",
|
|||
|
" <td>11.000</td>\n",
|
|||
|
" <td>6.00</td>\n",
|
|||
|
" <td>8.00</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>HeightInMeters</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.405</td>\n",
|
|||
|
" <td>2.005</td>\n",
|
|||
|
" <td>1.63</td>\n",
|
|||
|
" <td>1.78</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>WeightInKilograms</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>28.120</td>\n",
|
|||
|
" <td>136.065</td>\n",
|
|||
|
" <td>68.04</td>\n",
|
|||
|
" <td>95.25</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>5</th>\n",
|
|||
|
" <td>BMI</td>\n",
|
|||
|
" <td>False</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>12.840</td>\n",
|
|||
|
" <td>43.320</td>\n",
|
|||
|
" <td>24.27</td>\n",
|
|||
|
" <td>31.89</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Column Has Outliers Outliers Count Min Value Max Value \\\n",
|
|||
|
"0 PhysicalHealthDays False 0 0.000 7.500 \n",
|
|||
|
"1 MentalHealthDays False 0 0.000 10.000 \n",
|
|||
|
"2 SleepHours False 0 3.000 11.000 \n",
|
|||
|
"3 HeightInMeters False 0 1.405 2.005 \n",
|
|||
|
"4 WeightInKilograms False 0 28.120 136.065 \n",
|
|||
|
"5 BMI False 0 12.840 43.320 \n",
|
|||
|
"\n",
|
|||
|
" Q1 Q3 \n",
|
|||
|
"0 0.00 3.00 \n",
|
|||
|
"1 0.00 4.00 \n",
|
|||
|
"2 6.00 8.00 \n",
|
|||
|
"3 1.63 1.78 \n",
|
|||
|
"4 68.04 95.25 \n",
|
|||
|
"5 24.27 31.89 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 240,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"get_outliers_info(df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Видим, что выбросов не осталось — проверим это с помощью диаграмм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 241,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPeCAYAAADj01PlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADoXklEQVR4nOzdd3gUVdvH8d8mIZuQRggJEEihd6SKdJASYwhgoQkSQFERRUBQo9KV0AQsgMijCVKkg4ACglJUioAiRaRIlR5KQg2QzPsHb1aWZOjJkvD9XNdcj3PmzJx7Fh7unXtnzlgMwzAEAAAAAAAAAADScHJ0AAAAAAAAAAAAPKgoogMAAAAAAAAAYIIiOgAAAAAAAAAAJiiiAwAAAAAAAABggiI6AAAAAAAAAAAmKKIDAAAAAAAAAGCCIjoAAAAAAAAAACYoogMAAAAAAAAAYIIiOgAAAAAAAAAAJiiiAwAAAAAAAABggiI6sqxZs2bJYrGku5QtW9bR4QEAgHSQvwEAyHrI3wAedi6ODgC4V++++65KlSplW//www8dGA0AALgd5G8AALIe8jeAhxVFdGR5jRo1Ur169Wzr//vf/xQfH++4gAAAwC2RvwEAyHrI3wAeVkzngizr8uXLkiQnp1v/NY6Li5PFYtG+fftsbSkpKSpfvrwsFovi4uJs7Zs3b1aHDh1UuHBhubm5KV++fOrUqZNOnjxpd8z+/fun+yibi8t/v03Vq1dPZcuW1caNG1WjRg25u7urUKFC+vzzz9OcS9++fVW5cmX5+PjIw8NDtWvX1vLly+367du3zzbOvHnz7LZdunRJvr6+slgsGjFiRJo4AwICdOXKFbt9vvnmG9vxrv/i8+233yoiIkKBgYGyWq0qUqSIBg0apOTk5Ft+1qnj/f3332rZsqW8vb3l5+enN954Q5cuXbLrGxsbq8cff1wBAQGyWq0qXbq0xo0bl+aYzZo1U2hoqNzc3BQQEKCmTZtqy5Ytdn1Sz2P06NFp9i9ZsqQsFotee+01W9upU6fUq1cvlStXTp6envL29lZ4eLj+/PNPu32joqLk5uam7du327WHhYXJ19dXhw8ftrXt2bNHLVq0UO7cuZUzZ0499thj+u677+z2W7Fihd3fF6vVquLFiysmJkaGYdz8wwWAbID8Pc9uG/mb/A0AWQH5e57dNvI3+RsPH+5ER5aVmsStVutd7T9p0qQ0iUCSli5dqj179qhjx47Kly+ftm3bpi+++ELbtm3T2rVrZbFY7PqPGzdOnp6etvUbv1ScPn1aTz75pFq2bKk2bdpoxowZ6tKli1xdXdWpUydJUmJiov73v/+pTZs26ty5s86ePasvv/xSYWFh+u2331ShQgW7Y7q5uSk2NlbNmze3tc2ZMydNkrze2bNntXDhQj311FO2ttjYWLm5uaXZLy4uTp6enurZs6c8PT31008/qW/fvkpMTNTw4cNNx7hey5YtFRoaqpiYGK1du1affPKJTp8+ra+//trusytTpoyaNm0qFxcXLViwQK+++qpSUlLUtWtXu+O99NJLypcvnw4fPqzPPvtMDRs21N69e5UzZ840n0v37t1tbatXr9b+/fvTxLdnzx7NmzdPLVq0UKFChXTs2DGNHz9edevW1V9//aXAwEBJ0scff6yffvpJUVFRWrNmjZydnTV+/Hj98MMPmjRpkq3fsWPHVKNGDV24cEHdunWTn5+fJk6cqKZNm2rWrFl2n7v032OQFy9e1PTp0/Xuu+8qICBAL7zwwm19vgCQVZG/yd/kbwDIesjf5G/yNx56BpBFjR492pBk/Pnnn3btdevWNcqUKWPXFhsba0gy9u7daxiGYVy6dMkIDg42wsPDDUlGbGysre+FCxfSjPXNN98YkoxVq1bZ2vr162dIMk6cOGEaY926dQ1JxkcffWRrS0pKMipUqGAEBAQYly9fNgzDMK5evWokJSXZ7Xv69Gkjb968RqdOnWxte/
fuNSQZbdq0MVxcXIyjR4/atjVo0MB47rnnDEnG8OHD08TZpk0bo0mTJrb2/fv3G05OTkabNm3SnEd6n8HLL79s5MyZ07h06ZLp+V4/XtOmTe3aX3311TR/XumNExYWZhQuXPimY8yYMcOQZGzYsMHWJsl49tlnDRcXF7v2F154wfa5dO3a1dZ+6dIlIzk52e64e/fuNaxWqzFw4EC79iVLlhiSjA8++MDYs2eP4enpaTRv3tyuT/fu3Q1Jxs8//2xrO3v2rFGoUCEjNDTUNtby5csNScby5cvtYnFycjJeffXVm543AGQH5G/yN/kbALIe8jf5m/yNhx3TuSDLSn28y9/f/473HTNmjE6ePKl+/fql2ebu7m7770uXLik+Pl6PPfaYJOn333+/47FcXFz08ssv29ZdXV318ssv6/jx49q4caMkydnZWa6urpKuPeZ26tQpXb16VVWqVEl3zEqVKqlMmTKaNGmSJGn//v1avny5OnToYBpHp06dtHjxYh09elSSNHHiRFWvXl3FixdP0/f6z+Ds2bOKj49X7dq1deHCBf3999+3dd43/pL9+uuvS5K+//77dMdJSEhQfHy86tatqz179ighIcFu/wsXLig+Pl6bNm3ShAkTlDdv3jSx582bVxEREYqNjbXtM2PGDHXs2DFNfFar1XbXQnJysk6ePClPT0+VKFEizWfeuHFjvfzyyxo4cKCefvppubm5afz48XZ9vv/+ez366KOqVauWrc3T01MvvfSS9u3bp7/++suuf+r5HjhwQMOGDVNKSooef/zxdD5JAMheyN/kb/I3AGQ95G/yN/kbDzuK6Miy9u/fLxcXlztO4gkJCRo8eLB69uypvHnzptl+6tQpvfHGG8qbN6/c3d3l7++vQoUK2fa9U4GBgfLw8LBrS00+188RN3HiRJUvX15ubm7y8/OTv7+/vvvuO9MxO3bsaEtWcXFxqlGjhooVK2YaR4UKFVS2bFl9/fXXMgxDcXFx6SY3Sdq2bZueeuop+fj4yNvbW/7+/mrXrp2k2/8MboylSJEicnJysjvnX3/9VQ0bNpSHh4dy5colf39/vfvuu+mOM3DgQPn7+6tixYrat2+fVqxYIS8vrzTjduzYUVOnTlVSUpJmzpwpX1/fdJNjSkqKRo0apWLFislqtSpPnjzy9/fX5s2b0z3HESNGKHfu3Nq0aZM++eQTBQQE2G3fv3+/SpQokWa/1DfX3/hIW/PmzeXv76+QkBD1799f77//vp555pk0+wNAdkP+Jn+TvwEg6yF/k7/J33jYUURHlrVjxw4VLlzY7kUit2Po0KFycnJS7969093esmVLTZgwQa+88ormzJmjH374QYsXL5Z07R/+jDB58mR16NBBRYoU0ZdffqnFixdr6dKlevzxx03HbNeunXbv3q21a9dq4sSJpgn5ep06dVJsbKxWrlypo0ePqmXLlmn6nDlzRnXr1tWff/6pgQMHasGCBVq6dKmGDh0q6e4/gxvnsvvnn3/UoEEDxcfHa+TIkfruu++0dOlS9ejRI91xXnzxRf3www/66quv5ObmpmeeeSbdZBsRESFXV1fNmzdPsbGxioqKSvflN6lf5OrUqaPJkydryZIlWrp0qcqUKZPuOf7xxx86fvy4JKU7l9+dGjFihJYuXarvv/9e/fr109ChQzVgwIB7Pi4APOjI3+Rv8jcAZD3kb/I3+RsPO14siiwpKSlJmzZtsnuxx+04fPiwPv74Y8XExMjLyyvNG79Pnz6tH3/8UQMGDFDfvn1t7bt27brrWA8fPqzz58/b/Rq+c+dOSVJoaKgkadasWSpcuLDmzJljl+zSe9wtlZ+fn5o2bWp7NK1ly5Z2b/hOT9u2bdW7d2+98cYbevbZZ9P9JXnFihU6efKk5syZozp16tja9+7de1vnm2rXrl22Owgkaffu3UpJSbGd84IFC5SUlKT58+crODjY1u/GN6KnKlq0qIoWLSpJatiwoYKDgzV16l
R16dLFrp+Li4uef/55ffjhh9q2bZu++uqrdI83a9Ys1a9fX19++aVd+5kzZ5QnTx67tvPnz6tjx44qXbq0atSooWH
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1500x1000 with 6 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"visualize_outliers(df)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Нормализация числовых признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 242,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn import preprocessing"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 247,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>PhysicalHealthDaysNorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.253306</td>\n",
|
|||
|
" <td>0.385378</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>0.400000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>MentalHealthDaysNorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.244973</td>\n",
|
|||
|
" <td>0.378598</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>0.400000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>SleepHoursNorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.501124</td>\n",
|
|||
|
" <td>0.165569</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.375000</td>\n",
|
|||
|
" <td>0.500000</td>\n",
|
|||
|
" <td>0.625000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>HeightInMetersNorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.500401</td>\n",
|
|||
|
" <td>0.176240</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.375000</td>\n",
|
|||
|
" <td>0.491667</td>\n",
|
|||
|
" <td>0.625000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>WeightInKilogramsNorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.510963</td>\n",
|
|||
|
" <td>0.186742</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.369818</td>\n",
|
|||
|
" <td>0.495901</td>\n",
|
|||
|
" <td>0.621891</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>BMINorm</th>\n",
|
|||
|
" <td>246022.0</td>\n",
|
|||
|
" <td>0.513599</td>\n",
|
|||
|
" <td>0.194556</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.375000</td>\n",
|
|||
|
" <td>0.479659</td>\n",
|
|||
|
" <td>0.625000</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" count mean std min 25% 50% \\\n",
|
|||
|
"PhysicalHealthDaysNorm 246022.0 0.253306 0.385378 0.0 0.000000 0.000000 \n",
|
|||
|
"MentalHealthDaysNorm 246022.0 0.244973 0.378598 0.0 0.000000 0.000000 \n",
|
|||
|
"SleepHoursNorm 246022.0 0.501124 0.165569 0.0 0.375000 0.500000 \n",
|
|||
|
"HeightInMetersNorm 246022.0 0.500401 0.176240 0.0 0.375000 0.491667 \n",
|
|||
|
"WeightInKilogramsNorm 246022.0 0.510963 0.186742 0.0 0.369818 0.495901 \n",
|
|||
|
"BMINorm 246022.0 0.513599 0.194556 0.0 0.375000 0.479659 \n",
|
|||
|
"\n",
|
|||
|
" 75% max \n",
|
|||
|
"PhysicalHealthDaysNorm 0.400000 1.0 \n",
|
|||
|
"MentalHealthDaysNorm 0.400000 1.0 \n",
|
|||
|
"SleepHoursNorm 0.625000 1.0 \n",
|
|||
|
"HeightInMetersNorm 0.625000 1.0 \n",
|
|||
|
"WeightInKilogramsNorm 0.621891 1.0 \n",
|
|||
|
"BMINorm 0.625000 1.0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 247,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Min-max scale every numeric feature; the scaler is refit for each column separately.\n",
|
|||
|
"min_max_scaler = preprocessing.MinMaxScaler()\n",
|
|||
|
"df_norm = df.copy()\n",
|
|||
|
"numeric_columns = get_numeric_columns(df)\n",
|
|||
|
"\n",
|
|||
|
"for column in numeric_columns:\n",
|
|||
|
"    norm_column = f\"{column}Norm\"\n",
|
|||
|
"    # fit_transform needs a 2-D (n_samples, 1) array; ravel() flattens the result back to 1-D.\n",
|
|||
|
"    column_values = df_norm[[column]].to_numpy()\n",
|
|||
|
"    df_norm[norm_column] = min_max_scaler.fit_transform(column_values).ravel()\n",
|
|||
|
"\n",
|
|||
|
"# Drop the unscaled originals so that only the *Norm versions remain.\n",
|
|||
|
"df_norm = df_norm.drop(columns=numeric_columns)\n",
|
|||
|
"\n",
|
|||
|
"# Summary statistics, one row per column.\n",
|
|||
|
"df_norm.describe().T"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Конструирование признаков"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Автоматическое конструирование признаков с помощью фреймворка FeatureTools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 221,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import featuretools as ft"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 222,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
|
|||
|
" pd.to_datetime(\n",
|
|||
|
"d:\\code\\AIM-PIbd-31-Potapov-N-S\\lab_3\\.venv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
|
|||
|
" warnings.warn(\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Преобразуем датасет с помощью фреймворка\n",
|
|||
|
"# https://featuretools.alteryx.com/en/stable/getting_started/afe.html\n",
|
|||
|
"\n",
|
|||
|
"entity_set = ft.EntitySet().add_dataframe(df, \"df\", make_index=True, index=\"id\")\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix, feature_defs = ft.dfs(\n",
|
|||
|
" entityset=entity_set,\n",
|
|||
|
" target_dataframe_name=\"df\",\n",
|
|||
|
" max_depth=2\n",
|
|||
|
")\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix: DataFrame\n",
|
|||
|
"feature_defs: list[ft.Feature]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполняем категориальное и унитарное кодирование признаков с помощью FeatureTools"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 224,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Было признаков: 40\n",
|
|||
|
"Стало признаков: 99\n",
|
|||
|
"<Feature: State = Washington>\n",
|
|||
|
"<Feature: State = Maryland>\n",
|
|||
|
"<Feature: State = Minnesota>\n",
|
|||
|
"<Feature: State = Ohio>\n",
|
|||
|
"<Feature: State = New York>\n",
|
|||
|
"<Feature: State = Texas>\n",
|
|||
|
"<Feature: State = Florida>\n",
|
|||
|
"<Feature: State = Kansas>\n",
|
|||
|
"<Feature: State = Wisconsin>\n",
|
|||
|
"<Feature: State = Maine>\n",
|
|||
|
"<Feature: State is unknown>\n",
|
|||
|
"<Feature: Sex = Female>\n",
|
|||
|
"<Feature: Sex = Male>\n",
|
|||
|
"<Feature: Sex is unknown>\n",
|
|||
|
"<Feature: GeneralHealth = Very good>\n",
|
|||
|
"<Feature: GeneralHealth = Good>\n",
|
|||
|
"<Feature: GeneralHealth = Excellent>\n",
|
|||
|
"<Feature: GeneralHealth = Fair>\n",
|
|||
|
"<Feature: GeneralHealth = Poor>\n",
|
|||
|
"<Feature: GeneralHealth is unknown>\n",
|
|||
|
"<Feature: PhysicalHealthDays>\n",
|
|||
|
"<Feature: MentalHealthDays>\n",
|
|||
|
"<Feature: LastCheckupTime = Within past year (anytime less than 12 months ago)>\n",
|
|||
|
"<Feature: LastCheckupTime = Within past 2 years (1 year but less than 2 years ago)>\n",
|
|||
|
"<Feature: LastCheckupTime = Within past 5 years (2 years but less than 5 years ago)>\n",
|
|||
|
"<Feature: LastCheckupTime = 5 or more years ago>\n",
|
|||
|
"<Feature: LastCheckupTime is unknown>\n",
|
|||
|
"<Feature: PhysicalActivities>\n",
|
|||
|
"<Feature: SleepHours>\n",
|
|||
|
"<Feature: RemovedTeeth = None of them>\n",
|
|||
|
"<Feature: RemovedTeeth = 1 to 5>\n",
|
|||
|
"<Feature: RemovedTeeth = 6 or more, but not all>\n",
|
|||
|
"<Feature: RemovedTeeth = All>\n",
|
|||
|
"<Feature: RemovedTeeth is unknown>\n",
|
|||
|
"<Feature: HadHeartAttack>\n",
|
|||
|
"<Feature: HadAngina>\n",
|
|||
|
"<Feature: HadStroke>\n",
|
|||
|
"<Feature: HadAsthma>\n",
|
|||
|
"<Feature: HadSkinCancer>\n",
|
|||
|
"<Feature: HadCOPD>\n",
|
|||
|
"<Feature: HadDepressiveDisorder>\n",
|
|||
|
"<Feature: HadKidneyDisease>\n",
|
|||
|
"<Feature: HadArthritis>\n",
|
|||
|
"<Feature: HadDiabetes = No>\n",
|
|||
|
"<Feature: HadDiabetes = Yes>\n",
|
|||
|
"<Feature: HadDiabetes = No, pre-diabetes or borderline diabetes>\n",
|
|||
|
"<Feature: HadDiabetes = Yes, but only during pregnancy (female)>\n",
|
|||
|
"<Feature: HadDiabetes is unknown>\n",
|
|||
|
"<Feature: DeafOrHardOfHearing>\n",
|
|||
|
"<Feature: BlindOrVisionDifficulty>\n",
|
|||
|
"<Feature: DifficultyConcentrating>\n",
|
|||
|
"<Feature: DifficultyWalking>\n",
|
|||
|
"<Feature: DifficultyDressingBathing>\n",
|
|||
|
"<Feature: DifficultyErrands>\n",
|
|||
|
"<Feature: SmokerStatus = Never smoked>\n",
|
|||
|
"<Feature: SmokerStatus = Former smoker>\n",
|
|||
|
"<Feature: SmokerStatus = Current smoker - now smokes every day>\n",
|
|||
|
"<Feature: SmokerStatus = Current smoker - now smokes some days>\n",
|
|||
|
"<Feature: SmokerStatus is unknown>\n",
|
|||
|
"<Feature: ECigaretteUsage = Never used e-cigarettes in my entire life>\n",
|
|||
|
"<Feature: ECigaretteUsage = Not at all (right now)>\n",
|
|||
|
"<Feature: ECigaretteUsage = Use them some days>\n",
|
|||
|
"<Feature: ECigaretteUsage = Use them every day>\n",
|
|||
|
"<Feature: ECigaretteUsage is unknown>\n",
|
|||
|
"<Feature: ChestScan>\n",
|
|||
|
"<Feature: RaceEthnicityCategory = White only, Non-Hispanic>\n",
|
|||
|
"<Feature: RaceEthnicityCategory = Hispanic>\n",
|
|||
|
"<Feature: RaceEthnicityCategory = Black only, Non-Hispanic>\n",
|
|||
|
"<Feature: RaceEthnicityCategory = Other race only, Non-Hispanic>\n",
|
|||
|
"<Feature: RaceEthnicityCategory = Multiracial, Non-Hispanic>\n",
|
|||
|
"<Feature: RaceEthnicityCategory is unknown>\n",
|
|||
|
"<Feature: AgeCategory = Age 65 to 69>\n",
|
|||
|
"<Feature: AgeCategory = Age 60 to 64>\n",
|
|||
|
"<Feature: AgeCategory = Age 70 to 74>\n",
|
|||
|
"<Feature: AgeCategory = Age 55 to 59>\n",
|
|||
|
"<Feature: AgeCategory = Age 50 to 54>\n",
|
|||
|
"<Feature: AgeCategory = Age 75 to 79>\n",
|
|||
|
"<Feature: AgeCategory = Age 80 or older>\n",
|
|||
|
"<Feature: AgeCategory = Age 40 to 44>\n",
|
|||
|
"<Feature: AgeCategory = Age 45 to 49>\n",
|
|||
|
"<Feature: AgeCategory = Age 35 to 39>\n",
|
|||
|
"<Feature: AgeCategory is unknown>\n",
|
|||
|
"<Feature: HeightInMeters>\n",
|
|||
|
"<Feature: WeightInKilograms>\n",
|
|||
|
"<Feature: BMI>\n",
|
|||
|
"<Feature: AlcoholDrinkers>\n",
|
|||
|
"<Feature: HIVTesting>\n",
|
|||
|
"<Feature: FluVaxLast12>\n",
|
|||
|
"<Feature: PneumoVaxEver>\n",
|
|||
|
"<Feature: TetanusLast10Tdap = No, did not receive any tetanus shot in the past 10 years>\n",
|
|||
|
"<Feature: TetanusLast10Tdap = Yes, received tetanus shot but not sure what type>\n",
|
|||
|
"<Feature: TetanusLast10Tdap = Yes, received Tdap>\n",
|
|||
|
"<Feature: TetanusLast10Tdap = Yes, received tetanus shot, but not Tdap>\n",
|
|||
|
"<Feature: TetanusLast10Tdap is unknown>\n",
|
|||
|
"<Feature: HighRiskLastYear>\n",
|
|||
|
"<Feature: CovidPos = No>\n",
|
|||
|
"<Feature: CovidPos = Yes>\n",
|
|||
|
"<Feature: CovidPos = Tested positive using home test without a health professional>\n",
|
|||
|
"<Feature: CovidPos is unknown>\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Сгенерируем новые признаки: закодируем категориальные признаки (унитарное кодирование)\n",
|
|||
|
"# https://featuretools.alteryx.com/en/stable/guides/tuning_dfs.html\n",
|
|||
|
"\n",
|
|||
|
"feature_matrix_enc, features_enc = ft.encode_features(feature_matrix, feature_defs)\n",
|
|||
|
"feature_matrix_enc.to_csv(\"./csv/generated_features.csv\", index=False)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Было признаков:\", len(feature_defs))\n",
|
|||
|
"print(\"Стало признаков:\", len(features_enc))\n",
|
|||
|
"print(*features_enc, sep='\\n')"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|