953 lines
188 KiB
Plaintext
Raw Normal View History

2024-10-19 00:25:57 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Лабораторная 2\n",
"\n",
"Первый датасет: информация о состоянии людей"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"# Forward slashes resolve on Windows, Linux and macOS alike; the previous\n",
"# backslash-separated path only worked on Windows.\n",
"df = pd.read_csv(\"../static/csv/heart_2020_cleaned.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Столбцы на русском:\n",
"\n",
"HeartDisease - наличие заболевания сердца \\\n",
"BMI - ИМТ \\\n",
"Smoking - курящий ли человек \\\n",
"AlcoholDrinking - выпивающий ли человек\\\n",
"Stroke - был ли инсульт\\\n",
"PhysicalHealth - физическое здоровье\\\n",
"MentalHealth - ментальное здоровье\\\n",
"DiffWalking - проблемы с ходьбой\\\n",
"Sex - пол\\\n",
"AgeCategory - возрастная категория\\\n",
"Race - раса\\\n",
"Diabetic - диабетик ли человек\\\n",
"PhysicalActivity - физическая активность\\\n",
"GenHealth - общее здоровье\\\n",
"SleepTime - время сна\\\n",
"Asthma - астматик ли человек\\\n",
"KidneyDisease - нефропатия\\\n",
"SkinCancer - рак кожи"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 319795 entries, 0 to 319794\n",
"Data columns (total 18 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 HeartDisease 319795 non-null object \n",
" 1 BMI 319795 non-null float64\n",
" 2 Smoking 319795 non-null object \n",
" 3 AlcoholDrinking 319795 non-null object \n",
" 4 Stroke 319795 non-null object \n",
" 5 PhysicalHealth 319795 non-null float64\n",
" 6 MentalHealth 319795 non-null float64\n",
" 7 DiffWalking 319795 non-null object \n",
" 8 Sex 319795 non-null object \n",
" 9 AgeCategory 319795 non-null object \n",
" 10 Race 319795 non-null object \n",
" 11 Diabetic 319795 non-null object \n",
" 12 PhysicalActivity 319795 non-null object \n",
" 13 GenHealth 319795 non-null object \n",
" 14 SleepTime 319795 non-null float64\n",
" 15 Asthma 319795 non-null object \n",
" 16 KidneyDisease 319795 non-null object \n",
" 17 SkinCancer 319795 non-null object \n",
"dtypes: float64(4), object(14)\n",
"memory usage: 43.9+ MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>HeartDisease</th>\n",
" <th>BMI</th>\n",
" <th>Smoking</th>\n",
" <th>AlcoholDrinking</th>\n",
" <th>Stroke</th>\n",
" <th>PhysicalHealth</th>\n",
" <th>MentalHealth</th>\n",
" <th>DiffWalking</th>\n",
" <th>Sex</th>\n",
" <th>AgeCategory</th>\n",
" <th>Race</th>\n",
" <th>Diabetic</th>\n",
" <th>PhysicalActivity</th>\n",
" <th>GenHealth</th>\n",
" <th>SleepTime</th>\n",
" <th>Asthma</th>\n",
" <th>KidneyDisease</th>\n",
" <th>SkinCancer</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>No</td>\n",
" <td>16.60</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>3.0</td>\n",
" <td>30.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>55-59</td>\n",
" <td>White</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>5.0</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>No</td>\n",
" <td>20.34</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>80 or older</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>7.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>No</td>\n",
" <td>26.58</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>20.0</td>\n",
" <td>30.0</td>\n",
" <td>No</td>\n",
" <td>Male</td>\n",
" <td>65-69</td>\n",
" <td>White</td>\n",
" <td>Yes</td>\n",
" <td>Yes</td>\n",
" <td>Fair</td>\n",
" <td>8.0</td>\n",
" <td>Yes</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No</td>\n",
" <td>24.21</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>No</td>\n",
" <td>Female</td>\n",
" <td>75-79</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Good</td>\n",
" <td>6.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>No</td>\n",
" <td>23.71</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>28.0</td>\n",
" <td>0.0</td>\n",
" <td>Yes</td>\n",
" <td>Female</td>\n",
" <td>40-44</td>\n",
" <td>White</td>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>Very good</td>\n",
" <td>8.0</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"0 No 16.60 Yes No No 3.0 \n",
"1 No 20.34 No No Yes 0.0 \n",
"2 No 26.58 Yes No No 20.0 \n",
"3 No 24.21 No No No 0.0 \n",
"4 No 23.71 No No No 28.0 \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"0 30.0 No Female 55-59 White Yes \n",
"1 0.0 No Female 80 or older White No \n",
"2 30.0 No Male 65-69 White Yes \n",
"3 0.0 No Female 75-79 White No \n",
"4 0.0 Yes Female 40-44 White No \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
"0 Yes Very good 5.0 Yes No Yes \n",
"1 Yes Very good 7.0 No No No \n",
"2 Yes Fair 8.0 Yes No No \n",
"3 No Good 6.0 No No Yes \n",
"4 Yes Very good 8.0 No No No "
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print the schema (dtypes and non-null counts), then render the first five rows.\n",
"df.info()\n",
"df.head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Объект наблюдения: состояние человека\\\n",
"Атрибуты объектов: наличие заболевания сердца, ИМТ, курит ли человек, употребляет ли алкоголь, был ли у него инсульт и т.д."
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAIjCAYAAAByG8BaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACSK0lEQVR4nOzdd3xT9eLG8SfpnqHQDYWWJXsjsgQHLpwoLjbITxS314F6HThA1HvdoLIURBStqDhBUVSQvfcqhVJGaZvulZzfH0AvpS3Q0va0zef9euUlSc5Jnx5DaZ9+h8UwDEMAAAAAAABwGVazAwAAAAAAAKBqUQgBAAAAAAC4GAohAAAAAAAAF0MhBAAAAAAA4GIohAAAAAAAAFwMhRAAAAAAAICLoRACAAAAAABwMRRCAAAAAAAALoZCCAAAAAAAwMVQCAEAAAAAALgYCiEAAFBtfPnll7JYLCXe2rRpY3Y8l5WRkaHnnntOV111lerWrSuLxaKZM2eaHQsAAJwHd7MDAAAAnO6pp55Sy5YtC++//PLLJqZBUlKSxo8fr4YNG6p9+/b6/fffzY4EAADOE4UQAACodvr166e+ffsW3p86daqSkpLMC+TiIiIilJiYqPDwcK1atUpdu3Y1OxIAADhPTBkDAADVRl5eniTJaj37tygzZ86UxWJRXFxc4WNOp1Pt2rUrNqVpw4YNGj58uBo3bixvb2+Fh4dr5MiROnbsWJHXfP7550ucrubu/r/fofXt21dt2rTR6tWr1aNHD/n4+CgmJkZTpkwp9rk8++yz6ty5s2w2m/z8/NS7d28tXry4yHFxcXGFH2f+/PlFnsvJyVFQUJAsFotef/31YjlDQ0OVn59f5JzPPvus8PVOLdG++eYb9e/fX5GRkfLy8lKTJk304osvyuFwnPVae3l5KTw8/KzHAQCAmoMRQgAAoNo4WQh5eXmV6/xZs2Zp48aNxR5fuHCh9uzZoxEjRig8PFybN2/Whx9+qM2bN+uff/6RxWIpcvzkyZPl7+9feP/0giolJUXXXHONbr31Vt1xxx364osvdM8998jT01MjR46UJKWlpWnq1Km64447NHr0aKWnp2vatGm68sortWLFCnXo0KHIa3p7e2vGjBm68cYbCx+LjY1VTk5OqZ9venq6FixYoJtuuqnwsRkzZsjb27vYeTNnzpS/v78eeeQR+fv767ffftOzzz6rtLQ0vfbaa6V+DAAAUDtRCAEAgGrDbrdLknx8fMp8bm5urp599lldffXV+vHHH4s8d++99+rRRx8t8thFF12kO+64Q3/99Zd69+5d5LlbbrlFwcHBpX6sgwcP6o033tAjjzwiSbr77rvVrVs3jRs3TkOGDJGHh4eCgoIUFxcnT0/PwvNGjx6tFi1a6J133tG0adOKvOZNN92kefPm6fDhwwoLC5MkTZ8+XQMGDNCcOXNKzHHTTTdp+vTphYVQfHy8fv31V91222367LPPihw7Z86cItd1zJgxGjNmjN5//3299NJL5S7hAABAzcSUMQAAUG2cnMIVEhJS5nPfe+89HTt2TM8991yx504tQnJycpSUlKSLLrpIkrRmzZoyfyx3d3fdfffdhfc9PT11991368iRI1q9erUkyc3NrbAMcjqdSk5OVkFBgbp06VLix+zUqZNat26tWbNmSZL27dunxYsXa/jw4aXmGDlypH766ScdOnRIkvTxxx+re/fuat68ebFjT70G6enpSkpKUu/evZWVlaVt27aV+RoAAICajUIIAABUG/v27ZO7u3uZCyG73a5XXnlFjzzySOHomlMlJyfrwQcfVFhYmHx8fBQSEqKYmJjCc8sqMjJSfn5+RR47WcKcuqbRxx9/rHbt2snb21v16tVTSEiIvv/++1I/5ogRIzRjxgxJx6d49ejRQ82aNSs1R4cOHdSmTRt98sknMgxDM2fO1IgRI0o8dvPmzbrppptks9kUGBiokJAQDR48WFL5rg
EAAKjZKIQAAEC1sX37djVu3LjIIs7n4tVXX5XVatVjjz1W4vO33nqrPvroI40ZM0axsbH65Zdf9NNPP0k6PnqnMsyePVvDhw9XkyZNNG3aNP30009auHChLr300lI/5uDBg7Vr1y79888/+vjjj0std041cuRIzZgxQ3/88YcOHTqkW2+9tdgxqamp6tOnj9avX6/x48fru+++08KFC/Xqq69KqrxrAAAAqi/WEAIAANVCbm6u1q1bV2RR5XNx8OBBvfXWW5owYYICAgKK7RyWkpKiX3/9VS+88IKeffbZwsd37txZ7qwHDx5UZmZmkVFCO3bskCRFR0dLkr788ks1btxYsbGxRRatLmlK20n16tXT9ddfXzj97NZbby2yU1hJBg0apMcee0wPPvigbrnlFgUEBBQ75vfff9exY8cUGxuriy++uPDxvXv3ntPnCwAAah9GCAEAgGphzpw5ys3N1WWXXVam81544QWFhYVpzJgxJT7v5uYmSTIMo8jjb775ZrlySlJBQYE++OCDwvt5eXn64IMPFBISos6dO5f6cZcvX65ly5ad8bVHjhypDRs2aODAgUV2OitN3bp1dcMNN2jDhg2FO5ydrqQseXl5ev/998/6+gAAoHZihBAAADBVZmam3nnnHY0fP15ubm4yDEOzZ88ucszhw4eVkZGh2bNnq1+/fkXWCfrll1/06aefFtnN61SBgYG6+OKLNWnSJOXn56t+/fr65Zdfzmt0TGRkpF599VXFxcWpefPm+vzzz7Vu3Tp9+OGH8vDwkCRde+21io2N1U033aT+/ftr7969mjJlilq1aqWMjIxSX/uqq67S0aNHz6kMOmnmzJl67733St0ZrUePHgoKCtKwYcP0wAMPyGKxaNasWcVKsjN59913lZqaqoMHD0qSvvvuOx04cECSdP/998tms53zawEAAPNRCAEAAFMdPXpU48aNK7x/6u5dpxsyZIgWL15cpBDq0KGD7rjjjjN+jDlz5uj+++/Xe++9J8MwdMUVV+jHH39UZGRkuTIHBQXp448/1v3336+PPvpIYWFhevfddzV69OjCY4YPH65Dhw7pgw8+0M8//6xWrVpp9uzZmjdvnn7//fdSX9tisZxxy/uS+Pj4FNlF7HT16tXTggUL9Oijj+qZZ55RUFCQBg8erMsuu0xXXnnlOX2M119/Xfv27Su8Hxsbq9jYWEnH1z6iEAIAoGaxGGX51RAAAEAFi4uLU0xMjBYvXqy+ffue93GVrW/fvkpKStKmTZtMywAAAHC+WEMIAAAAAADAxVAIAQAAU/n7+2vQoEFFpoGdz3EAAAA4O6aMAQAAlAFTxgAAQG1AIQQAAAAAAOBimDIGAAAAAADgYiiEAAAAAAAAXIy72QGqmtPp1MGDBxUQECCLxWJ2HAAAAAAAgAphGIbS09MVGRkpq/XMY4BcrhA6ePCgoqKizI4BAAAAAABQKfbv368GDRqc8RiXK4QCAgIkHb84gYGBJqcBAAAAAACoGGlpaYqKiirsPs7E5Qqhk9PEAgMDKYQAAAAAAECtcy5L5LCoNAAAAAAAgIuhEAIAAAAAAHAxFEIAAAAAAAAuhkIIAAAAAADAxVAIAQAAAAAAuBgKIQAAAAAAABdDIQQAAAAAAOBiKIQAAAAAAABcDIUQAAAAAACAi6EQAgAAAAAAcDEUQgAAAAAAAC6GQggAAAAAAMDFUAgBAAAAAAC4GAohAAAAAAAAF0MhBAAAAAAA4GIohAAAAAAAgMtLtGdr6e4kJdqzzY5SJdzNDgAAAAAAAGCmz1fG68nYjTIMyWqRJgxoq9u6NjQ7VqVihBAAAAAAAHBZifbswjJIkpyG9FTsplo/UohCCAAAAAAAuKxv1h0sLINOchiG4pKyzAlURSiEAAAAAACAS/pl8yG98fP2Yo+7WSyKDvY1IVHVoRACAAAAAAAu59v1B3XPp2uU7zTUtn6grJbjj7tZLHplQBtF2HzMDVjJWFQaAAAAAA
C4lC9W7tcTsRtkGNKAjvU16ZZ2OpqRq7ikLEUH+9b6MkiiEAIAAAAAAC7k46Vxeu7bzZKkO7s11Es3tJHValGEzcc
"text/plain": [
"<Figure size 1400x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Average sleep time per age category. The variable used to be called\n",
"# `mean_menthalhealth`, although it holds the mean of SleepTime.\n",
"mean_sleep_by_age = df.groupby('AgeCategory')['SleepTime'].mean().reset_index()\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 6))\n",
"\n",
"ax.plot(mean_sleep_by_age['AgeCategory'], mean_sleep_by_age['SleepTime'], marker='.')\n",
"\n",
"ax.set_title(\"Диаграмма 1\")\n",
"ax.set_xlabel(\"Возрастная группа\")\n",
"ax.set_ylabel(\"Время сна\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Между атрибутами присутствует связь. Например, диаграмма 1 показывает связь между возрастной группой и временем сна.\\\n",
"Примеры бизнес-целей:\\\n",
" 1. Прогнозирование инсульта на основе ИМТ.\\\n",
" 2. Наблюдение за изменением времени сна в зависимости от возраста.\\\n",
"\\\n",
"Эффект для бизнеса: влияние количества сна на здоровье, влияние ИМТ на здоровье, влияние возраста на инсульты\\\n",
"\\\n",
"\\\n",
"Цели технического проекта:\\\n",
" 1. Первая бизнес-цель: вход - ИМТ, целевой признак - инсульт.\\\n",
" 2. Вторая бизнес-цель: вход - возрастная группа, целевой признак - время сна."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверка на пропущенные значения и статистический обзор данных"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пустые значения по столбцам:\n",
"HeartDisease 0\n",
"BMI 0\n",
"Smoking 0\n",
"AlcoholDrinking 0\n",
"Stroke 0\n",
"PhysicalHealth 0\n",
"MentalHealth 0\n",
"DiffWalking 0\n",
"Sex 0\n",
"AgeCategory 0\n",
"Race 0\n",
"Diabetic 0\n",
"PhysicalActivity 0\n",
"GenHealth 0\n",
"SleepTime 0\n",
"Asthma 0\n",
"KidneyDisease 0\n",
"SkinCancer 0\n",
"dtype: int64\n",
"\n",
"Статистический обзор данных:\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>BMI</th>\n",
" <th>PhysicalHealth</th>\n",
" <th>MentalHealth</th>\n",
" <th>SleepTime</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>319795.000000</td>\n",
" <td>319795.00000</td>\n",
" <td>319795.000000</td>\n",
" <td>319795.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>28.325399</td>\n",
" <td>3.37171</td>\n",
" <td>3.898366</td>\n",
" <td>7.097075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>6.356100</td>\n",
" <td>7.95085</td>\n",
" <td>7.955235</td>\n",
" <td>1.436007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>12.020000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>24.030000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>27.340000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>31.420000</td>\n",
" <td>2.00000</td>\n",
" <td>3.000000</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>94.850000</td>\n",
" <td>30.00000</td>\n",
" <td>30.000000</td>\n",
" <td>24.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" BMI PhysicalHealth MentalHealth SleepTime\n",
"count 319795.000000 319795.00000 319795.000000 319795.000000\n",
"mean 28.325399 3.37171 3.898366 7.097075\n",
"std 6.356100 7.95085 7.955235 1.436007\n",
"min 12.020000 0.00000 0.000000 1.000000\n",
"25% 24.030000 0.00000 0.000000 6.000000\n",
"50% 27.340000 0.00000 0.000000 7.000000\n",
"75% 31.420000 2.00000 3.000000 8.000000\n",
"max 94.850000 30.00000 30.000000 24.000000"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Missing values per column, followed by summary statistics of the numeric columns.\n",
"missing_per_column = df.isna().sum()\n",
"print(\"Пустые значения по столбцам:\", missing_per_column, sep=\"\\n\")\n",
"\n",
"print(\"\\nСтатистический обзор данных:\")\n",
"df.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"На основе полученной информации видно, что пустых данных нет. Проверим данные на выбросы и дубликаты:"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Коэффициент асимметрии для столбца 'BMI': 1.3324306428979513\n",
"\n",
"Коэффициент асимметрии для столбца 'PhysicalHealth': 2.6039732622480822\n",
"\n",
"Коэффициент асимметрии для столбца 'MentalHealth': 2.331111549136165\n",
"\n",
"Коэффициент асимметрии для столбца 'SleepTime': 0.6790346208011537\n",
"\n",
"Количество дубликатов: 18078\n"
]
}
],
"source": [
"# Skewness of every numeric column, then the number of fully duplicated rows.\n",
"numeric_columns = df.select_dtypes(include=[np.number]).columns\n",
"for column in numeric_columns:\n",
"    print(f\"\\nКоэффициент асимметрии для столбца '{column}': {df[column].skew()}\")\n",
"\n",
"duplicate_count = df.duplicated().sum()\n",
"print(f\"\\nКоличество дубликатов: {duplicate_count}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
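"Коэффициенты асимметрии PhysicalHealth (≈2.60) и MentalHealth (≈2.33) говорят о сильной правосторонней асимметрии распределений; у BMI (≈1.33) и SleepTime (≈0.68) она слабее. Сами выбросы проверим отдельно ниже. Найдено 18078 дубликатов — удаляем их."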
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"# Take an explicit copy: the de-duplicated frame is modified in place later on,\n",
"# and writing into a filtered result can raise SettingWithCopyWarning.\n",
"cleaned_df = df.drop_duplicates().copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Очищаем данные от выбросов: значения PhysicalHealth за пределами 1.5·IQR заменяем медианой"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADbUElEQVR4nOzdeXhU5dk/8O9M9oTsEBK2LLhACIIgSlDQWrXUBa1tba1Yqr6g1ra29tVKfypiF+rbt61dXWhfF2it2mrF1tJatXUhEQVBEXGBhDUhkBUSkkDm/P6IE8+cZGa+c+Y8Z845uT/XxXVp+GaYTGY5z/Pcz/34NE3TIIQQQgghhBACAOBP9B0QQgghhBBCCCeRQZIQQgghhBBC6MggSQghhBBCCCF0ZJAkhBBCCCGEEDoySBJCCCGEEEIIHRkkCSGEEEIIIYSODJKEEEIIIYQQQkcGSUIIIYQQQgihI4MkIYQQQgghhNCRQZIQQgghRALt2bMHDz300MD/19fX4/e//33i7pAQQgZJQoj4/OlPf4LP5xvyT1VVVaLvnhBCOJ7P58MNN9yAf/zjH6ivr8ctt9yCl19+OdF3S4hhLTnRd0AI4Q3f/e53MXny5IH//8EPfpDAeyOEEO4xduxYLF68GPPnzwcAlJSU4N///ndi75QQw5xP0zQt0XdCCOFef/rTn/D5z38eL774Is4666yBr5911lk4ePAgtmzZkrg7J4QQLrJ9+3YcPHgQVVVVyMrKSvTdEWJYk3I7IURcent7AQB+f/S3k4ceegg+nw/19fUDXwsEAjjppJPg8/lCavLfeustfOUrX0FFRQXS09NRXFyMq6++Gs3NzSG3eeeddw5Z6pec/PFC+VlnnYWqqips2LABc+bMQUZGBsrLy3HfffcN+lnuuOMOzJw5E7m5ucjKysLcuXPx4osvhuTq6+sH/p2//OUvIX/X3d2N/Px8+Hw+/O///u+g+1lUVISjR4+GfM+jjz46cHsHDx4c+PrTTz+NCy64AGPGjEFaWhomTpyI733ve+jr64v6WAf/vW3btuGyyy5DTk4OCgsLceONN6K7uzsk++CDD+Lss89GUVER0tLSUFlZiXvvvXfI2/373/+OM888E9nZ2cjJycGsWbPwhz/8ISTz2muv4fzzz0d+fj6ysrJw0kkn4ec//3lIZtu2bfjc5z6HgoICpKen45RTTsGaNWtCMrE8X77yla+E/P7z8/Nx1llnDSpZYh/T4HPG6H//938H3aeysjJ85StfCck98cQT8Pl8KCsrC/l6U1MTrrnmGkyYMAFJSUkD93fEiBGD/i2jsrKysKWtPp9vUH716tWYOXMmMjIyUFBQgC9+8YvYvXv3kD9ntNcGAPT09GDZsmU47rjjkJaWhvHjx+OWW25BT0/PoOy///1v+n4aBZ+7Q/38+sc5lucHgIHXwqhRo5CRkYETTzwR/+///b+QfzPSn+DKzllnnRUyIQT0r5z7/f5Br4Unnnhi4HcwcuRILFy4EHv37g3JfOUrXxl4nkycOBGnnXYaWlpakJGRMejnE0LYR8rthBBxCQ6S0tLSTH3/qlWr8Pbbbw/6+nPPPYcdO3bgqquuQnFxMd555x088MADeOedd1BbWzvoIuree+8NudA0DtpaW1tx/vnn47LLLsPll1+Oxx9/HNdffz1SU1Nx9dVXAwA6Ojrw29/+FpdffjkWL16MQ4cO4Xe/+x0+9alPYf369Zg+fXrIbaanp+PBBx/EJZdcMvC1J598ctAgRO/QoUP461//is985jMDX3vwwQeRnp4+6PseeughjBgxAjfddBNGjBiBF154AXfccQc6Ojrw4x//OOy/oXfZZZehrKwMK1asQG1tLX7xi1+gtbUVjzzySMhjN2XKFCxYsADJycl45pln8NWvfhWBQAA33HBDyP25+uqrMWXKFCxduhR5eXl48803sXbtWnzpS18C0P97u/DCC1FSUoIbb7wRxc
XFePfdd/HXv/4VN954IwDgnXfewemnn46xY8fi1ltvRVZWFh5//HFccskl+POf/xzy2BiFe74AwMiRI/Gzn/0MQP9G+J///Oc4//zzsXv3buTl5Vn2mEZz7NixgYtvo0WLFuFf//oXvv71r2PatGlISkrCAw88gI0bN1K3PX36dHz7298O+dojjzyC5557LuRrP/jBD3D77bfjsssuw3/913/hwIED+OUvf4l58+bhzTffHHg8AO61EQgEsGDBArzyyitYsmQJJk+ejLfffhs/+9nP8P777w+aLAj6xje+gVmzZoW9n1YL9/x46623MHfuXKSkpGDJkiUoKyvD9u3b8cwzz+AHP/gBLr30Uhx33HED+W9961uYPHkylixZMvA1fTmx3oMPPojbbrsNP/nJTwZeB0D/c+2qq67CrFmzsGLFCuzfvx8///nP8eqrrw76HRjdcccdEd9HhBA20IQQIg733HOPBkDbvHlzyNfPPPNMbcqUKSFfe/DBBzUAWl1dnaZpmtbd3a1NmDBB+/SnP60B0B588MGBbFdX16B/69FHH9UAaC+99NLA15YtW6YB0A4cOBD2Pp555pkaAO0nP/nJwNd6enq06dOna0VFRVpvb6+maZp27NgxraenJ+R7W1tbtdGjR2tXX331wNfq6uo0ANrll1+uJScna42NjQN/98lPflL70pe+pAHQfvzjHw+6n5dffrl24YUXDnx9586dmt/v1y6//PJBP8dQj8G1116rZWZmat3d3WF/Xv2/t2DBgpCvf/WrXx30+xrq3/nUpz6lVVRUDPx/W1ublp2drZ122mnakSNHQrKBQEDTtP7Hr7y8XCstLdVaW1uHzGha/2M0derUkJ8hEAhoc+bM0Y4//viBr8XyfFm0aJFWWloa8m8+8MADGgBt/fr1EX/WoR7ToZ6/mqZpP/7xj0Puk6ZpWmlpqbZo0aKB///Nb36jpaWlaZ/4xCdC7tORI0c0v9+vXXvttSG3uWjRIi0rK2vQv2VUWlqqXXDBBYO+fsMNN2j6j/P6+notKSlJ+8EPfhCSe/vtt7Xk5OSQr7OvjVWrVml+v197+eWXQ27zvvvu0wBor776asjX//nPf2oAtD/96U9h72c4y5cv1wCEPGeCP7/+cY7l+TFv3jwtOztb27lzZ8htGv+NcP+W3plnnqmdeeaZmqZp2t/+9jctOTlZ+/a3vx2S6e3t1YqKirSqqqqQ18tf//pXDYB2xx13DHzN+NzdsmWL5vf7B34O/XNNCGEfKbcTQsQlWP42atSomL/317/+NZqbm7Fs2bJBf5eRkTHw393d3Th48CBmz54NAPSsu15ycjKuvfbagf9PTU3Ftddei6amJmzYsAEAkJSUhNTUVAD9M+ctLS04duwYTjnllCH/zRkzZmDKlClYtWoVAGDnzp148cUXB5Ve6V199dVYu3YtGhsbAQAPP/wwqqurccIJJwzK6h+DQ4cO4eDBg5g7dy66urqwbds26ufWrwQBwNe//nUAwLPPPjvkv9Pe3o6DBw/izDPPxI4dO9De3g6gf4Xo0KFDuPXWW5Genh5ym8FVvTfffBN1dXX45je/OWiWPJhpaWnBCy+8gMsuu2zgZzp48CCam5vxqU99Ch988MGgcqSgSM8XoP93Fry9TZs24ZFHHkFJSUnICkAsj2lfX9/A7QX/dHV1DflvB3V1deGuu+7C1772NUyYMCHk7zo7OxEIBFBYWBjxNuL15JNPIhAI4LLLLgu578XFxTj++OMHlY8yr40nnngCkydPxqRJk0Ju8+yzzwaAQbcZXAUxPlcYRUVFAPpXA2MR7vlx4MABvPTSS7j66qsH/U6Y8r9w1q9fj8suuwyf/exnB61CvvHGG2hqasJXv/rVkMfgggsuwKRJk/C3v/0t7O0uXboUM2bMwOc//3nT900IET8ptxNCxGXnzp1ITk6OeZDU3t6OH/7wh7jpppswevToQX/f0tKC5cuX449//COampoGfW
+sxowZM2gjdHBgUl9fPzAAe/jhh/GTn/wE27ZtC9k7VF5ePuTtXnXVVXjggQfw3//933jooYcwZ84cHH/88WHvx/T
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Выбросы в датасете:\n",
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
"2 No 26.58 Yes No No 20.0 \n",
"4 No 23.71 No No No 28.0 \n",
"5 Yes 28.87 Yes No No 6.0 \n",
"6 No 21.63 No No No 15.0 \n",
"10 Yes 34.30 Yes No No 30.0 \n",
"... ... ... ... ... ... ... \n",
"319774 No 20.36 No No No 30.0 \n",
"319779 No 23.38 Yes No No 30.0 \n",
"319782 No 31.89 Yes No No 30.0 \n",
"319787 No 36.54 No No No 7.0 \n",
"319790 Yes 27.41 Yes No No 7.0 \n",
"\n",
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
"2 30.0 No Male 65-69 White Yes \n",
"4 0.0 Yes Female 40-44 White No \n",
"5 0.0 Yes Female 75-79 Black No \n",
"6 0.0 No Female 70-74 White No \n",
"10 0.0 Yes Male 60-64 White Yes \n",
"... ... ... ... ... ... ... \n",
"319774 0.0 Yes Female 55-59 Hispanic Yes \n",
"319779 0.0 Yes Female 70-74 Hispanic No \n",
"319782 30.0 Yes Female 55-59 Hispanic No \n",
"319787 0.0 No Male 30-34 Hispanic No \n",
"319790 0.0 Yes Male 60-64 Hispanic Yes \n",
"\n",
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
"2 Yes Fair 8.0 Yes No No \n",
"4 Yes Very good 8.0 No No No \n",
"5 No Fair 12.0 No No No \n",
"6 Yes Fair 4.0 Yes No Yes \n",
"10 No Poor 15.0 Yes No No \n",
"... ... ... ... ... ... ... \n",
"319774 Yes Fair 8.0 No No No \n",
"319779 Yes Fair 5.0 No No No \n",
"319782 No Fair 4.0 No No No \n",
"319787 No Good 9.0 No No No \n",
"319790 No Fair 6.0 Yes No No \n",
"\n",
"[47136 rows x 18 columns]\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABihklEQVR4nO3deXhTZd7G8TvdS2lLC3RBtoKgFgQE2VQWRxBcQFxe1EEHxVHGQUdFUZkREVzqLo4LgjosgjpuqLigiAouIAiClqIiVgRpQSiU0tKF5rx/MI1J1yeQNOnp93NdvS45uZs8SY9JfufZHJZlWQIAAAAASJJCAt0AAAAAAAgmFEkAAAAA4IYiCQAAAADcUCQBAAAAgBuKJAAAAABwQ5EEAAAAAG4okgAAAADADUUSAAAAALihSAIAAAAANxRJAAAAAOCGIgmAX7z22mtyOBzV/nTt2jXQzQMAAKhRWKAbAMDe/vnPf+qEE05w/fvee+8NYGsAAADqRpEEwK+GDh2qwYMHu/793HPPaffu3YFrEAAAQB0YbgfAL0pLSyVJISF1v83MnTtXDodDv/zyi+uY0+lUt27d5HA4NHfuXNfxb7/9VldccYU6dOigqKgopaSkaNy4cdqzZ4/Hfd51113VDvULC/vj2tDgwYPVtWtXrV27Vqeccoqio6OVlpamZ555pspzufPOO9WrVy/Fx8crJiZGAwYM0CeffOKR++WXX1yP8+abb3rcVlxcrISEBDkcDj388MNV2pmUlKSysjKP33nppZdc9+deWL711ls655xz1KpVK0VGRqpjx466++67VV5eXudrXfF433//vUaPHq24uDg1b95cN9xwg4qLiz2yc+bM0Z/+9CclJSUpMjJS6enpmjlzZrX3+/7772vQoEGKjY1VXFycevfurRdffNEj89VXX+nss89WQkKCYmJi1K1bNz3++OMeme+//14XXXSREhMTFRUVpZNPPllvv/22R8ab8+WKK67w+PsnJCRo8ODB+uyzzzzu0/Q1rThnKnv44YertKl9+/a64oorPHKvvvqqHA6H2rdv73F8165duuqqq9S2bVuFhoa62tu0adMqj1VZ+/btaxza6nA4PLKHDh3S3XffrY4dOyoyMlLt27fXP//5T5WUlFS5X5O/qfs5X9vjOp1OzZgxQ126dFFUVJSSk5M1fvx47d271+j5VX4dP/30UzkcDn366aeuY4MHD/a4ICNJa9asqbY9krRgwQL16dNHTZo0UUJCggYOHKgPP/zQ9Zi1vaYVf7+K5+9+zhUUFKhXr15KS0tTTk5OjTlJmjBhghwOR5XnByDw6EkC4BcVRVJkZOQR/f4LL7yg7777rsrxpUuX6ueff9aVV16plJQUbdy4UbNnz9bGjRu1atWqKl+GZs6c6fFFs3LRtnfvXp199tkaPXq0Lr30Ur3yyiu69tprFRERoXHjxkmS9u/fr+eee06XXnqprr76ahUUFOj555/XsGHDtHr1avXo0cPjPqOiojRnzhyNGjXKdeyNN96oUoS4Kygo0DvvvKPzzz/fdWzOnDmKioqq8ntz585V06ZNNXHiRDVt2lQff/yx7rzzTu3fv18PPfRQjY/hbvTo0Wrfvr0yMjK0atUq/fvf/9bevXs1f/58j9euS5cuGjlypMLCwrR48WL9/e9/l9Pp1IQJEzzaM27cOHXp0kWTJ09Ws2bN9M0332jJkiX685//LOnw3+3cc89VamqqbrjhBqWkpGjTpk165513dMMNN0iSNm7cqFNPPVXHHHOMbr/9dsXExOiVV17RqFGj9Prrr3u8NpXVdL5IUosWLfTYY49JkrZv367HH39cZ599trZt26ZmzZr57DWty6FDh/Svf/2r2tvGjh2rjz76SNdff726d++u0NBQzZ49W+vWrTO67x49eujmm2/2ODZ//nwtXbrU49hf//pXzZs3TxdddJFuvvlmffXVV8rIyNCmTZu0aNEiV8
7kb+rummuu0YABAyQdPtfd70uSxo8fr7lz5+rKK6/UP/7xD2VnZ+vJJ5/UN998oy+++ELh4eFGz9Nbt912W7XHp02bprvuukunnHKKpk+froiICH311Vf6+OOPdeaZZ2rGjBk6cOCAJGnTpk267777PIYO11S8lpWV6cILL9Svv/6qL774QqmpqTW27aefftKzzz57lM8QgN9YAOAHM2bMsCRZGzZs8Dg+aNAgq0uXLh7H5syZY0mysrOzLcuyrOLiYqtt27bWWWedZUmy5syZ48oWFRVVeayXXnrJkmStWLHCdWzq1KmWJOv333+vsY2DBg2yJFmPPPKI61hJSYnVo0cPKykpySotLbUsy7IOHTpklZSUePzu3r17reTkZGvcuHGuY9nZ2ZYk69JLL7XCwsKs3Nxc121nnHGG9ec//9mSZD300ENV2nnppZda5557ruv41q1brZCQEOvSSy+t8jyqew3Gjx9vNWnSxCouLq7x+bo/3siRIz2O//3vf6/y96rucYYNG2Z16NDB9e99+/ZZsbGxVt++fa2DBw96ZJ1Op2VZh1+/tLQ0q127dtbevXurzVjW4dfoxBNP9HgOTqfTOuWUU6xOnTq5jnlzvowdO9Zq166dx2POnj3bkmStXr261uda3Wta3flrWZb10EMPebTJsiyrXbt21tixY13/fvrpp63IyEjr9NNP92jTwYMHrZCQEGv8+PEe9zl27FgrJiamymNV1q5dO+ucc86pcnzChAmW+8f8+vXrLUnWX//6V4/cLbfcYkmyPv74Y8uyzP6mFTZv3mxJsubNm+c6VnGOVfjss88sSdbChQs9fnfJkiXVHq8sLS3N+stf/uJx7JNPPrEkWZ988onr2KBBg6xBgwa5/v3ee+9Zkqzhw4d7tGfz5s1WSEiIdf7551vl5eW1Pr+aHqtCxf/zc+bMsZxOpzVmzBirSZMm1ldffVVjrsLo0aOtrl27Wm3atPE4TwAEB4bbAfCLiuFvLVu29Pp3n3rqKe3Zs0dTp06tclt0dLTrv4uLi7V7927169dPkoyvursLCwvT+PHjXf+OiIjQ+PHjtWvXLq1du1aSFBoaqoiICEmHhw3l5eXp0KFDOvnkk6t9zJ49e6pLly564YUXJElbt27VJ598UuuQmnHjxmnJkiXKzc2VJM2bN0/9+/dX586dq2TdX4OCggLt3r1bAwYMUFFRkb7//nuj5+3eEyRJ119/vSTpvffeq/Zx8vPztXv3bg0aNEg///yz8vPzJR3uISooKNDtt9+uqKgoj/us6NX75ptvlJ2drRtvvNHVc1M5k5eXp48//lijR492Pafdu3drz549GjZsmDZv3qzffvut2udS2/kiHf6bVdzf+vXrNX/+fKWmpnosKOLNa1peXu66v4qfoqKiah+7QlFRkaZPn67rrrtObdu29bitsLBQTqdTzZs3r/U+jlbF33bixIkexyt6oN59911JZn/TCiY9xq+++qri4+M1dOhQj9esV69eatq0aZVhq5UlJSVp+/btBs/wD5ZlafLkybrwwgvVt29fj9vefPNNOZ1O3XnnnVV6lqsblmdq0qRJWrhwoV555RX16dOn1uzatWv16quvKiMjw2hIMoD6x/+ZAPxi69atCgsL87pIys/P13333aeJEycqOTm5yu15eXm64YYblJycrOjoaLVs2VJpaWmu3/VWq1atFBMT43GsojBxn18yb948devWTVFRUWrevLlatmypd999t8bHvPLKKzVnzhxJh4cunXLKKerUqVON7ejRo4e6du2q+fPny7Is19Ck6mzcuFHnn3++4uPjFRcXp5YtW+qyyy6TZP4aVG5Lx44dFRIS4vGcv/jiCw0ZMkQxMTFq1qyZWrZsqX/+858ej7NlyxZJqnVZd5PMTz/9JMuyNGXKFLVs2dLjp6L42bVrV5Xfq+t8kaRt27a57uukk07Sli1b9Prrr3sMmfLmNf3+++9rbGNNHn30URUXF7teP3
fNmzdXp06d9Nxzz+nDDz/Url27tHv37mrnCR2NrVu3KiQkRMcee6zH8ZSUFDVr1kxbt26VZPb3qrBv3z5JNQ8/k6T
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def plot_physical_health_vs_bmi(frame, title):\n",
"    \"\"\"Scatter-plot BMI against PhysicalHealth for `frame` under the given title.\"\"\"\n",
"    fig, ax = plt.subplots(figsize=(10, 6))\n",
"    ax.scatter(frame['PhysicalHealth'], frame['BMI'])\n",
"    ax.set_xlabel('Физическое здоровье')\n",
"    ax.set_ylabel('ИМТ')\n",
"    ax.set_title(title)\n",
"    plt.show()\n",
"\n",
"\n",
"plot_physical_health_vs_bmi(cleaned_df, 'Диаграмма рассеивания перед чисткой')\n",
"\n",
"# Tukey fences: values further than 1.5 * IQR from the quartiles count as outliers.\n",
"# NOTE(review): the saved output flags 47136 rows with this rule, which is a large\n",
"# share of the data - confirm that median replacement is really intended here.\n",
"Q1 = cleaned_df[\"PhysicalHealth\"].quantile(0.25)\n",
"Q3 = cleaned_df[\"PhysicalHealth\"].quantile(0.75)\n",
"\n",
"IQR = Q3 - Q1\n",
"\n",
"threshold = 1.5 * IQR\n",
"lower_bound = Q1 - threshold\n",
"upper_bound = Q3 + threshold\n",
"\n",
"outliers = (cleaned_df[\"PhysicalHealth\"] < lower_bound) | (cleaned_df[\"PhysicalHealth\"] > upper_bound)\n",
"\n",
"print(\"Выбросы в датасете:\")\n",
"print(cleaned_df[outliers])\n",
"\n",
"# The median is taken before the replacement, so it reflects the unmodified column.\n",
"median_score = cleaned_df[\"PhysicalHealth\"].median()\n",
"cleaned_df.loc[outliers, \"PhysicalHealth\"] = median_score\n",
"\n",
"plot_physical_health_vs_bmi(cleaned_df, 'Диаграмма рассеивания после чистки')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 181029\n",
"Размер контрольной выборки: 60344\n",
"Размер тестовой выборки: 60344\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# 60/20/20 split: hold out 20% as the test set, then take 25% of the\n",
"# remaining 80% as the validation set.\n",
"train_val_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)\n",
"train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)\n",
"\n",
"split_sizes = (\n",
"    (\"Размер обучающей выборки:\", train_df),\n",
"    (\"Размер контрольной выборки:\", val_df),\n",
"    (\"Размер тестовой выборки:\", test_df),\n",
")\n",
"for caption, part in split_sizes:\n",
"    print(caption, len(part))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Данные недостаточно сбалансированы"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение ИМТ в обучающей выборке:\n",
"BMI\n",
"26.63 1941\n",
"27.46 1456\n",
"27.44 1416\n",
"27.12 1258\n",
"24.41 1247\n",
" ... \n",
"55.95 1\n",
"54.56 1\n",
"53.72 1\n",
"32.29 1\n",
"69.88 1\n",
"Name: count, Length: 3243, dtype: int64\n",
"\n",
"Распределение ИМТ в контрольной выборке:\n",
"BMI\n",
"26.63 657\n",
"27.46 494\n",
"24.41 474\n",
"27.44 463\n",
"25.10 379\n",
" ... \n",
"43.03 1\n",
"55.56 1\n",
"44.14 1\n",
"16.97 1\n",
"39.19 1\n",
"Name: count, Length: 2483, dtype: int64\n",
"\n",
"Распределение ИМТ в тестовой выборке:\n",
"BMI\n",
"26.63 646\n",
"27.44 506\n",
"27.46 475\n",
"24.41 452\n",
"27.12 426\n",
" ... \n",
"16.53 1\n",
"13.54 1\n",
"41.06 1\n",
"54.28 1\n",
"39.91 1\n",
"Name: count, Length: 2539, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name, column='BMI', label='ИМТ'):\n",
"    \"\"\"Print how often each value of `column` occurs in `df`.\n",
"\n",
"    Args:\n",
"        df: DataFrame to inspect.\n",
"        name: Split name inserted into the printed heading.\n",
"        column: Column whose value counts are printed. Defaults to 'BMI',\n",
"            which keeps the original behaviour.\n",
"        label: Human-readable column name used in the heading.\n",
"    \"\"\"\n",
"    counts = df[column].value_counts()\n",
"    print(f\"Распределение {label} в {name}:\")\n",
"    print(counts)\n",
"    print()\n",
"\n",
"# NOTE(review): BMI is continuous, so this lists thousands of distinct values\n",
"# (Length: 3243 in the saved output) rather than a class balance.\n",
"for part, split_name in (\n",
"    (train_df, \"обучающей выборке\"),\n",
"    (val_df, \"контрольной выборке\"),\n",
"    (test_df, \"тестовой выборке\"),\n",
"):\n",
"    check_balance(part, split_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
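"source": [
"Балансировка выборок: разбиваем ИМТ на два квантильных класса (BMI_binned) и применяем оверсэмплинг и андерсэмплинг"
]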
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Оверсэмплинг:\n",
"Распределение ИМТ в обучающей выборке:\n",
"BMI\n",
"26.63 1941\n",
"27.46 1472\n",
"27.44 1432\n",
"27.12 1258\n",
"24.41 1247\n",
" ... \n",
"27.13 1\n",
"29.59 1\n",
"24.76 1\n",
"53.72 1\n",
"31.03 1\n",
"Name: count, Length: 3243, dtype: int64\n",
"\n",
"Распределение ИМТ в контрольной выборке:\n",
"BMI\n",
"26.63 657\n",
"27.46 496\n",
"24.41 474\n",
"27.44 465\n",
"25.10 379\n",
" ... \n",
"46.66 1\n",
"46.76 1\n",
"68.59 1\n",
"73.39 1\n",
"54.57 1\n",
"Name: count, Length: 2483, dtype: int64\n",
"\n",
"Распределение ИМТ в тестовой выборке:\n",
"BMI\n",
"26.63 646\n",
"27.44 510\n",
"27.46 479\n",
"24.41 452\n",
"27.12 426\n",
" ... \n",
"46.72 1\n",
"60.69 1\n",
"47.44 1\n",
"53.48 1\n",
"58.16 1\n",
"Name: count, Length: 2539, dtype: int64\n",
"\n",
"Андерсэмплинг:\n",
"Распределение ИМТ в обучающей выборке:\n",
"BMI\n",
"26.63 1929\n",
"27.46 1456\n",
"27.44 1416\n",
"27.12 1238\n",
"24.41 1238\n",
" ... \n",
"56.30 1\n",
"41.59 1\n",
"59.08 1\n",
"61.66 1\n",
"65.19 1\n",
"Name: count, Length: 3241, dtype: int64\n",
"\n",
"Распределение ИМТ в контрольной выборке:\n",
"BMI\n",
"26.63 657\n",
"27.46 494\n",
"24.41 473\n",
"27.44 463\n",
"27.12 376\n",
" ... \n",
"48.40 1\n",
"63.67 1\n",
"48.68 1\n",
"39.19 1\n",
"30.76 1\n",
"Name: count, Length: 2483, dtype: int64\n",
"\n",
"Распределение ИМТ в тестовой выборке:\n",
"BMI\n",
"26.63 639\n",
"27.44 506\n",
"27.46 475\n",
"24.41 444\n",
"27.12 423\n",
" ... \n",
"34.89 1\n",
"30.75 1\n",
"41.06 1\n",
"54.28 1\n",
"39.91 1\n",
"Name: count, Length: 2539, dtype: int64\n",
"\n"
]
}
],
"source": [
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def binning(target, bins):\n",
"    \"\"\"Assign every value of `target` to one of `bins` quantile bins.\n",
"\n",
"    Returns the integer bin codes produced by `pd.qcut(..., labels=False)`.\n",
"    \"\"\"\n",
"    bin_codes = pd.qcut(target, q=bins, labels=False)\n",
"    return bin_codes\n",
"\n",
"# Learn the quantile edges on the training split only and reuse them for the\n",
"# validation and test splits, so that one BMI value maps to the same class in\n",
"# every split. The outer edges are opened to +/-inf so that values outside the\n",
"# training range still fall into a bin.\n",
"_, train_bmi_edges = pd.qcut(train_df['BMI'], q=2, retbins=True)\n",
"bmi_bin_edges = [-np.inf, *train_bmi_edges[1:-1], np.inf]\n",
"\n",
"# assign() builds new frames instead of writing a column into the split results in place.\n",
"train_df = train_df.assign(BMI_binned=binning(train_df['BMI'], bins=2))\n",
"val_df = val_df.assign(BMI_binned=pd.cut(val_df['BMI'], bins=bmi_bin_edges, labels=False))\n",
"test_df = test_df.assign(BMI_binned=pd.cut(test_df['BMI'], bins=bmi_bin_edges, labels=False))\n",
"\n",
"def _resample_by_target(df, target_column, sampler):\n",
"    \"\"\"Run `sampler.fit_resample` on `df` and re-join the features with the target.\"\"\"\n",
"    features = df.drop(target_column, axis=1)\n",
"    target = df[target_column]\n",
"\n",
"    features_resampled, target_resampled = sampler.fit_resample(features, target)\n",
"\n",
"    return pd.concat([features_resampled, target_resampled], axis=1)\n",
"\n",
"\n",
"def oversample(df, target_column, random_state=42):\n",
"    \"\"\"Resample `df` with RandomOverSampler, using `target_column` as the class label.\n",
"\n",
"    Args:\n",
"        df: DataFrame that contains both the features and `target_column`.\n",
"        target_column: Name of the class-label column to balance on.\n",
"        random_state: Seed passed to the sampler; 42 matches the original behaviour.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the resampled features followed by the target column.\n",
"    \"\"\"\n",
"    return _resample_by_target(df, target_column, RandomOverSampler(random_state=random_state))\n",
"\n",
"\n",
"def undersample(df, target_column, random_state=42):\n",
"    \"\"\"Resample `df` with RandomUnderSampler, using `target_column` as the class label.\n",
"\n",
"    Args:\n",
"        df: DataFrame that contains both the features and `target_column`.\n",
"        target_column: Name of the class-label column to balance on.\n",
"        random_state: Seed passed to the sampler; 42 matches the original behaviour.\n",
"\n",
"    Returns:\n",
"        A new DataFrame with the resampled features followed by the target column.\n",
"    \"\"\"\n",
"    return _resample_by_target(df, target_column, RandomUnderSampler(random_state=random_state))\n",
"\n",
"SPLIT_NAMES = (\"обучающей выборке\", \"контрольной выборке\", \"тестовой выборке\")\n",
"\n",
"# NOTE(review): the validation and test splits are resampled here as well; evaluation\n",
"# data is normally left at its natural distribution - confirm this is intended.\n",
"train_df_oversampled = oversample(train_df, 'BMI_binned')\n",
"val_df_oversampled = oversample(val_df, 'BMI_binned')\n",
"test_df_oversampled = oversample(test_df, 'BMI_binned')\n",
"\n",
"train_df_undersampled = undersample(train_df, 'BMI_binned')\n",
"val_df_undersampled = undersample(val_df, 'BMI_binned')\n",
"test_df_undersampled = undersample(test_df, 'BMI_binned')\n",
"\n",
"resampled_reports = (\n",
"    (\"Оверсэмплинг:\", (train_df_oversampled, val_df_oversampled, test_df_oversampled)),\n",
"    (\"Андерсэмплинг:\", (train_df_undersampled, val_df_undersampled, test_df_undersampled)),\n",
")\n",
"\n",
"for heading, splits in resampled_reports:\n",
"    print(heading)\n",
"    for part, split_name in zip(splits, SPLIT_NAMES):\n",
"        check_balance(part, split_name)\n",
"        # The samplers balance 'BMI_binned', so the class counts of that column are what\n",
"        # shows whether the resampling worked; raw BMI counts alone do not.\n",
"        print(f\"Распределение классов BMI_binned в {split_name}:\")\n",
"        print(part['BMI_binned'].value_counts())\n",
"        print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}