953 lines
188 KiB
Plaintext
953 lines
188 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Лабораторная 2\n",
|
|||
|
"\n",
|
|||
|
"Первый датасет: информация о состоянии людей"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 121,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',\n",
|
|||
|
" 'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',\n",
|
|||
|
" 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',\n",
|
|||
|
" 'Asthma', 'KidneyDisease', 'SkinCancer'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from pathlib import Path\n",
|
|||
|
"\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"\n",
|
|||
|
"# Build the path from its parts so the notebook is not tied to Windows-style backslashes\n",
|
|||
|
"DATA_PATH = Path(\"..\") / \"static\" / \"csv\" / \"heart_2020_cleaned.csv\"\n",
|
|||
|
"\n",
|
|||
|
"df = pd.read_csv(DATA_PATH)\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Столбцы на русском:\n",
|
|||
|
"\n",
|
|||
|
"HeartDisease - наличие болезни сердца \\\n",
|
|||
|
"BMI - ИМТ \\\n",
|
|||
|
"Smoking - курящий ли человек \\\n",
|
|||
|
"AlcoholDrinking - выпивающий ли человек\\\n",
|
|||
|
"Stroke - был ли инсульт\\\n",
|
|||
|
"PhysicalHealth - физическое здоровье\\\n",
|
|||
|
"MentalHealth - ментальное здоровье\\\n",
|
|||
|
"DiffWalking - проблемы с ходьбой\\\n",
|
|||
|
"Sex - пол\\\n",
|
|||
|
"AgeCategory - возрастная категория\\\n",
|
|||
|
"Race - раса\\\n",
|
|||
|
"Diabetic - диабетик ли человек\\\n",
|
|||
|
"PhysicalActivity - физическая активность\\\n",
|
|||
|
"GenHealth - общее здоровье\\\n",
|
|||
|
"SleepTime - время сна\\\n",
|
|||
|
"Asthma - астматик ли человек\\\n",
|
|||
|
"KidneyDisease - нефропатия\\\n",
|
|||
|
"SkinCancer - рак кожи"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 122,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 319795 entries, 0 to 319794\n",
|
|||
|
"Data columns (total 18 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 HeartDisease 319795 non-null object \n",
|
|||
|
" 1 BMI 319795 non-null float64\n",
|
|||
|
" 2 Smoking 319795 non-null object \n",
|
|||
|
" 3 AlcoholDrinking 319795 non-null object \n",
|
|||
|
" 4 Stroke 319795 non-null object \n",
|
|||
|
" 5 PhysicalHealth 319795 non-null float64\n",
|
|||
|
" 6 MentalHealth 319795 non-null float64\n",
|
|||
|
" 7 DiffWalking 319795 non-null object \n",
|
|||
|
" 8 Sex 319795 non-null object \n",
|
|||
|
" 9 AgeCategory 319795 non-null object \n",
|
|||
|
" 10 Race 319795 non-null object \n",
|
|||
|
" 11 Diabetic 319795 non-null object \n",
|
|||
|
" 12 PhysicalActivity 319795 non-null object \n",
|
|||
|
" 13 GenHealth 319795 non-null object \n",
|
|||
|
" 14 SleepTime 319795 non-null float64\n",
|
|||
|
" 15 Asthma 319795 non-null object \n",
|
|||
|
" 16 KidneyDisease 319795 non-null object \n",
|
|||
|
" 17 SkinCancer 319795 non-null object \n",
|
|||
|
"dtypes: float64(4), object(14)\n",
|
|||
|
"memory usage: 43.9+ MB\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>HeartDisease</th>\n",
|
|||
|
" <th>BMI</th>\n",
|
|||
|
" <th>Smoking</th>\n",
|
|||
|
" <th>AlcoholDrinking</th>\n",
|
|||
|
" <th>Stroke</th>\n",
|
|||
|
" <th>PhysicalHealth</th>\n",
|
|||
|
" <th>MentalHealth</th>\n",
|
|||
|
" <th>DiffWalking</th>\n",
|
|||
|
" <th>Sex</th>\n",
|
|||
|
" <th>AgeCategory</th>\n",
|
|||
|
" <th>Race</th>\n",
|
|||
|
" <th>Diabetic</th>\n",
|
|||
|
" <th>PhysicalActivity</th>\n",
|
|||
|
" <th>GenHealth</th>\n",
|
|||
|
" <th>SleepTime</th>\n",
|
|||
|
" <th>Asthma</th>\n",
|
|||
|
" <th>KidneyDisease</th>\n",
|
|||
|
" <th>SkinCancer</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>16.60</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>30.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>55-59</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>5.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>20.34</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>80 or older</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>7.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>26.58</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>20.0</td>\n",
|
|||
|
" <td>30.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Male</td>\n",
|
|||
|
" <td>65-69</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Fair</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>24.21</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>75-79</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Good</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>23.71</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>28.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Female</td>\n",
|
|||
|
" <td>40-44</td>\n",
|
|||
|
" <td>White</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>Yes</td>\n",
|
|||
|
" <td>Very good</td>\n",
|
|||
|
" <td>8.0</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" <td>No</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"0 No 16.60 Yes No No 3.0 \n",
|
|||
|
"1 No 20.34 No No Yes 0.0 \n",
|
|||
|
"2 No 26.58 Yes No No 20.0 \n",
|
|||
|
"3 No 24.21 No No No 0.0 \n",
|
|||
|
"4 No 23.71 No No No 28.0 \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"0 30.0 No Female 55-59 White Yes \n",
|
|||
|
"1 0.0 No Female 80 or older White No \n",
|
|||
|
"2 30.0 No Male 65-69 White Yes \n",
|
|||
|
"3 0.0 No Female 75-79 White No \n",
|
|||
|
"4 0.0 Yes Female 40-44 White No \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
|
|||
|
"0 Yes Very good 5.0 Yes No Yes \n",
|
|||
|
"1 Yes Very good 7.0 No No No \n",
|
|||
|
"2 Yes Fair 8.0 Yes No No \n",
|
|||
|
"3 No Good 6.0 No No Yes \n",
|
|||
|
"4 Yes Very good 8.0 No No No "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 122,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df.info()\n",
|
|||
|
"df.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Объект наблюдения: состояние человека\\\n",
|
|||
|
"Атрибуты объектов: наличие болезни сердца, ИМТ, курящий человек или нет, выпивающий человек или нет, был ли инсульт у человека и т.д."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 123,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAIjCAYAAAByG8BaAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACSK0lEQVR4nOzdd3xT9eLG8SfpnqHQDYWWJXsjsgQHLpwoLjbITxS314F6HThA1HvdoLIURBStqDhBUVSQvfcqhVJGaZvulZzfH0AvpS3Q0va0zef9euUlSc5Jnx5DaZ9+h8UwDEMAAAAAAABwGVazAwAAAAAAAKBqUQgBAAAAAAC4GAohAAAAAAAAF0MhBAAAAAAA4GIohAAAAAAAAFwMhRAAAAAAAICLoRACAAAAAABwMRRCAAAAAAAALoZCCAAAAAAAwMVQCAEAAAAAALgYCiEAAFBtfPnll7JYLCXe2rRpY3Y8l5WRkaHnnntOV111lerWrSuLxaKZM2eaHQsAAJwHd7MDAAAAnO6pp55Sy5YtC++//PLLJqZBUlKSxo8fr4YNG6p9+/b6/fffzY4EAADOE4UQAACodvr166e+ffsW3p86daqSkpLMC+TiIiIilJiYqPDwcK1atUpdu3Y1OxIAADhPTBkDAADVRl5eniTJaj37tygzZ86UxWJRXFxc4WNOp1Pt2rUrNqVpw4YNGj58uBo3bixvb2+Fh4dr5MiROnbsWJHXfP7550ucrubu/r/fofXt21dt2rTR6tWr1aNHD/n4+CgmJkZTpkwp9rk8++yz6ty5s2w2m/z8/NS7d28tXry4yHFxcXGFH2f+/PlFnsvJyVFQUJAsFotef/31YjlDQ0OVn59f5JzPPvus8PVOLdG++eYb9e/fX5GRkfLy8lKTJk304osvyuFwnPVae3l5KTw8/KzHAQCAmoMRQgAAoNo4WQh5eXmV6/xZs2Zp48aNxR5fuHCh9uzZoxEjRig8PFybN2/Whx9+qM2bN+uff/6RxWIpcvzkyZPl7+9feP/0giolJUXXXHONbr31Vt1xxx364osvdM8998jT01MjR46UJKWlpWnq1Km64447NHr0aKWnp2vatGm68sortWLFCnXo0KHIa3p7e2vGjBm68cYbCx+LjY1VTk5OqZ9venq6FixYoJtuuqnwsRkzZsjb27vYeTNnzpS/v78eeeQR+fv767ffftOzzz6rtLQ0vfbaa6V+DAAAUDtRCAEAgGrDbrdLknx8fMp8bm5urp599lldffXV+vHHH4s8d++99+rRRx8t8thFF12kO+64Q3/99Zd69+5d5LlbbrlFwcHBpX6sgwcP6o033tAjjzwiSbr77rvVrVs3jRs3TkOGDJGHh4eCgoIUFxcnT0/PwvNGjx6tFi1a6J133tG0adOKvOZNN92kefPm6fDhwwoLC5MkTZ8+XQMGDNCcOXNKzHHTTTdp+vTphYVQfHy8fv31V91222367LPPihw7Z86cItd1zJgxGjNmjN5//3299NJL5S7hAABAzcSUMQAAUG2cnMIVEhJS5nPfe+89HTt2TM8991yx504tQnJycpSUlKSLLrpIkrRmzZoyfyx3d3fdfffdhfc9PT11991368iRI1q9erUkyc3NrbAMcjqdSk5OVkFBgbp06VLix+zUqZNat26tWbNmSZL27dunxYsXa/jw4aXmGDlypH766ScdOnRIkvTxxx+re/fuat68ebFjT70G6enpSkpKUu/evZWVlaVt27aV+RoAAICajUIIAABUG/v27ZO7u3uZCyG73a5XXnlFjzzySOHomlMlJyfrwQcfVFhYmHx8fBQSEqKYmJjCc8sqMjJSfn5+RR47WcKcuqbRxx9/rHbt2snb21v16tVTSEiIvv/++1I/5ogRIzRjxgxJx6d49ejRQ82aNSs1R4cOHdSmTRt98sknMgxDM2fO1IgRI0o8dvPmzbrppptks9kUGBiokJAQDR48WFL5rg
EAAKjZKIQAAEC1sX37djVu3LjIIs7n4tVXX5XVatVjjz1W4vO33nqrPvroI40ZM0axsbH65Zdf9NNPP0k6PnqnMsyePVvDhw9XkyZNNG3aNP30009auHChLr300lI/5uDBg7Vr1y79888/+vjjj0std041cuRIzZgxQ3/88YcOHTqkW2+9tdgxqamp6tOnj9avX6/x48fru+++08KFC/Xqq69KqrxrAAAAqi/WEAIAANVCbm6u1q1bV2RR5XNx8OBBvfXWW5owYYICAgKK7RyWkpKiX3/9VS+88IKeffbZwsd37txZ7qwHDx5UZmZmkVFCO3bskCRFR0dLkr788ks1btxYsbGxRRatLmlK20n16tXT9ddfXzj97NZbby2yU1hJBg0apMcee0wPPvigbrnlFgUEBBQ75vfff9exY8cUGxuriy++uPDxvXv3ntPnCwAAah9GCAEAgGphzpw5ys3N1WWXXVam81544QWFhYVpzJgxJT7v5uYmSTIMo8jjb775ZrlySlJBQYE++OCDwvt5eXn64IMPFBISos6dO5f6cZcvX65ly5ad8bVHjhypDRs2aODAgUV2OitN3bp1dcMNN2jDhg2FO5ydrqQseXl5ev/998/6+gAAoHZihBAAADBVZmam3nnnHY0fP15ubm4yDEOzZ88ucszhw4eVkZGh2bNnq1+/fkXWCfrll1/06aefFtnN61SBgYG6+OKLNWnSJOXn56t+/fr65Zdfzmt0TGRkpF599VXFxcWpefPm+vzzz7Vu3Tp9+OGH8vDwkCRde+21io2N1U033aT+/ftr7969mjJlilq1aqWMjIxSX/uqq67S0aNHz6kMOmnmzJl67733St0ZrUePHgoKCtKwYcP0wAMPyGKxaNasWcVKsjN59913lZqaqoMHD0qSvvvuOx04cECSdP/998tms53zawEAAPNRCAEAAFMdPXpU48aNK7x/6u5dpxsyZIgWL15cpBDq0KGD7rjjjjN+jDlz5uj+++/Xe++9J8MwdMUVV+jHH39UZGRkuTIHBQXp448/1v3336+PPvpIYWFhevfddzV69OjCY4YPH65Dhw7pgw8+0M8//6xWrVpp9uzZmjdvnn7//fdSX9tisZxxy/uS+Pj4FNlF7HT16tXTggUL9Oijj+qZZ55RUFCQBg8erMsuu0xXXnnlOX2M119/Xfv27Su8Hxsbq9jYWEnH1z6iEAIAoGaxGGX51RAAAEAFi4uLU0xMjBYvXqy+ffue93GVrW/fvkpKStKmTZtMywAAAHC+WEMIAAAAAADAxVAIAQAAU/n7+2vQoEFFpoGdz3EAAAA4O6aMAQAAlAFTxgAAQG1AIQQAAAAAAOBimDIGAAAAAADgYiiEAAAAAAAAXIy72QGqmtPp1MGDBxUQECCLxWJ2HAAAAAAAgAphGIbS09MVGRkpq/XMY4BcrhA6ePCgoqKizI4BAAAAAABQKfbv368GDRqc8RiXK4QCAgIkHb84gYGBJqcBAAAAAACoGGlpaYqKiirsPs7E5Qqhk9PEAgMDKYQAAAAAAECtcy5L5LCoNAAAAAAAgIuhEAIAAAAAAHAxFEIAAAAAAAAuhkIIAAAAAADAxVAIAQAAAAAAuBgKIQAAAAAAABdDIQQAAAAAAOBiKIQAAAAAAABcDIUQAAAAAACAi6EQAgAAAAAAcDEUQgAAAAAAAC6GQggAAAAAAMDFUAgBAAAAAAC4GAohAAAAAAAAF0MhBAAAAAAA4GIohAAAAAAAgMtLtGdr6e4kJdqzzY5SJdzNDgAAAAAAAGCmz1fG68nYjTIMyWqRJgxoq9u6NjQ7VqVihBAAAAAAAHBZifbswjJIkpyG9FTsplo/UohCCAAAAAAAuKxv1h0sLINOchiG4pKyzAlURSiEAAAAAACAS/pl8yG98fP2Yo+7WSyKDvY1IVHVoRACAAAAAAAu59v1B3XPp2uU7zTUtn6grJbjj7tZLHplQBtF2HzMDVjJWFQaAAAAAA
C4lC9W7tcTsRtkGNKAjvU16ZZ2OpqRq7ikLEUH+9b6MkiiEAIAAAAAAC7k46Vxeu7bzZKkO7s11Es3tJHValGEzcc
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1400x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# NOTE: despite its name, this frame holds the mean SleepTime for every AgeCategory\n",
|
|||
|
"mean_menthalhealth = df.groupby('AgeCategory', as_index=False)['SleepTime'].mean()\n",
|
|||
|
"\n",
|
|||
|
"# Explicit figure/axes objects instead of the implicit pyplot state machine\n",
|
|||
|
"fig, ax = plt.subplots(figsize=(14, 6))\n",
|
|||
|
"ax.plot(mean_menthalhealth['AgeCategory'], mean_menthalhealth['SleepTime'], marker='.')\n",
|
|||
|
"\n",
|
|||
|
"ax.set_title(\"Диаграмма 1\")\n",
|
|||
|
"ax.set_xlabel(\"Возрастная группа\")\n",
|
|||
|
"ax.set_ylabel(\"Время сна\")\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Между атрибутами присутствует связь. Например, на диаграмме 1 показана связь между возрастной группой и средним временем сна.\\\n",
|
|||
|
"Примеры бизнес-целей:\\\n",
|
|||
|
" 1. Прогнозирование инсульта на основе ИМТ.\\\n",
|
|||
|
" 2. Наблюдение за изменением времени сна в зависимости от возраста.\\\n",
|
|||
|
"\\\n",
|
|||
|
"Эффект для бизнеса: влияние количества сна на здоровье, влияние ИМТ на здоровье, влияние возраста на инсульты\\\n",
|
|||
|
"\\\n",
|
|||
|
"\\\n",
|
|||
|
"Цели технического проекта:\\\n",
|
|||
|
" 1. Первая бизнес-цель: вход - ИМТ, целевой признак - инсульт.\\\n",
|
|||
|
" 2. Вторая бизнес-цель: вход - возрастная группа, целевой признак - время сна."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверка на пропущенные значения, выбросы и дубликаты"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 149,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пустые значения по столбцам:\n",
|
|||
|
"HeartDisease 0\n",
|
|||
|
"BMI 0\n",
|
|||
|
"Smoking 0\n",
|
|||
|
"AlcoholDrinking 0\n",
|
|||
|
"Stroke 0\n",
|
|||
|
"PhysicalHealth 0\n",
|
|||
|
"MentalHealth 0\n",
|
|||
|
"DiffWalking 0\n",
|
|||
|
"Sex 0\n",
|
|||
|
"AgeCategory 0\n",
|
|||
|
"Race 0\n",
|
|||
|
"Diabetic 0\n",
|
|||
|
"PhysicalActivity 0\n",
|
|||
|
"GenHealth 0\n",
|
|||
|
"SleepTime 0\n",
|
|||
|
"Asthma 0\n",
|
|||
|
"KidneyDisease 0\n",
|
|||
|
"SkinCancer 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Статистический обзор данных:\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>BMI</th>\n",
|
|||
|
" <th>PhysicalHealth</th>\n",
|
|||
|
" <th>MentalHealth</th>\n",
|
|||
|
" <th>SleepTime</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>count</th>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" <td>319795.00000</td>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" <td>319795.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>mean</th>\n",
|
|||
|
" <td>28.325399</td>\n",
|
|||
|
" <td>3.37171</td>\n",
|
|||
|
" <td>3.898366</td>\n",
|
|||
|
" <td>7.097075</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>std</th>\n",
|
|||
|
" <td>6.356100</td>\n",
|
|||
|
" <td>7.95085</td>\n",
|
|||
|
" <td>7.955235</td>\n",
|
|||
|
" <td>1.436007</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>min</th>\n",
|
|||
|
" <td>12.020000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>1.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>25%</th>\n",
|
|||
|
" <td>24.030000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>6.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>50%</th>\n",
|
|||
|
" <td>27.340000</td>\n",
|
|||
|
" <td>0.00000</td>\n",
|
|||
|
" <td>0.000000</td>\n",
|
|||
|
" <td>7.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>75%</th>\n",
|
|||
|
" <td>31.420000</td>\n",
|
|||
|
" <td>2.00000</td>\n",
|
|||
|
" <td>3.000000</td>\n",
|
|||
|
" <td>8.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>max</th>\n",
|
|||
|
" <td>94.850000</td>\n",
|
|||
|
" <td>30.00000</td>\n",
|
|||
|
" <td>30.000000</td>\n",
|
|||
|
" <td>24.000000</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" BMI PhysicalHealth MentalHealth SleepTime\n",
|
|||
|
"count 319795.000000 319795.00000 319795.000000 319795.000000\n",
|
|||
|
"mean 28.325399 3.37171 3.898366 7.097075\n",
|
|||
|
"std 6.356100 7.95085 7.955235 1.436007\n",
|
|||
|
"min 12.020000 0.00000 0.000000 1.000000\n",
|
|||
|
"25% 24.030000 0.00000 0.000000 6.000000\n",
|
|||
|
"50% 27.340000 0.00000 0.000000 7.000000\n",
|
|||
|
"75% 31.420000 2.00000 3.000000 8.000000\n",
|
|||
|
"max 94.850000 30.00000 30.000000 24.000000"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 149,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Number of missing values in every column\n",
|
|||
|
"null_values = df.isna().sum()\n",
|
|||
|
"print(f\"Пустые значения по столбцам:\\n{null_values}\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nСтатистический обзор данных:\")\n",
|
|||
|
"# describe() is the last expression of the cell, so the notebook renders it as a table\n",
|
|||
|
"df.describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"На основе полученной информации видно, что пустых данных нет. Проверим данные на выбросы и дубликаты:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 150,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'BMI': 1.3324306428979513\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'PhysicalHealth': 2.6039732622480822\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'MentalHealth': 2.331111549136165\n",
|
|||
|
"\n",
|
|||
|
"Коэффициент асимметрии для столбца 'SleepTime': 0.6790346208011537\n",
|
|||
|
"\n",
|
|||
|
"Количество дубликатов: 18078\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Skewness of every numeric column\n",
|
|||
|
"numeric_columns = df.select_dtypes(include=[np.number]).columns\n",
|
|||
|
"for column in numeric_columns:\n",
|
|||
|
"    skewness = df[column].skew()\n",
|
|||
|
"    print(f\"\\nКоэффициент асимметрии для столбца '{column}': {skewness}\")\n",
|
|||
|
"\n",
|
|||
|
"# Rows that exactly repeat an earlier row\n",
|
|||
|
"duplicates = df.duplicated().sum()\n",
|
|||
|
"print(f\"\\nКоличество дубликатов: {duplicates}\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"На основе данных выше можно сказать, что выбросы незначительны. Удаляем все дубликаты."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 151,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# cleaned_df is modified in place by a later cell; .copy() makes it an explicitly\n",
|
|||
|
"# independent frame so that assignment does not trigger pandas' SettingWithCopyWarning\n",
|
|||
|
"cleaned_df = df.drop_duplicates().copy()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Очищаем данные от шумов: выбросы в столбце PhysicalHealth (по правилу 1.5 * IQR) заменяем медианой"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 152,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADbUElEQVR4nOzdeXhU5dk/8O9M9oTsEBK2LLhACIIgSlDQWrXUBa1tba1Yqr6g1ra29tVKfypiF+rbt61dXWhfF2it2mrF1tJatXUhEQVBEXGBhDUhkBUSkkDm/P6IE8+cZGa+c+Y8Z845uT/XxXVp+GaYTGY5z/Pcz/34NE3TIIQQQgghhBACAOBP9B0QQgghhBBCCCeRQZIQQgghhBBC6MggSQghhBBCCCF0ZJAkhBBCCCGEEDoySBJCCCGEEEIIHRkkCSGEEEIIIYSODJKEEEIIIYQQQkcGSUIIIYQQQgihI4MkIYQQQgghhNCRQZIQQgghRALt2bMHDz300MD/19fX4/e//33i7pAQQgZJQoj4/OlPf4LP5xvyT1VVVaLvnhBCOJ7P58MNN9yAf/zjH6ivr8ctt9yCl19+OdF3S4hhLTnRd0AI4Q3f/e53MXny5IH//8EPfpDAeyOEEO4xduxYLF68GPPnzwcAlJSU4N///ndi75QQw5xP0zQt0XdCCOFef/rTn/D5z38eL774Is4666yBr5911lk4ePAgtmzZkrg7J4QQLrJ9+3YcPHgQVVVVyMrKSvTdEWJYk3I7IURcent7AQB+f/S3k4ceegg+nw/19fUDXwsEAjjppJPg8/lCavLfeustfOUrX0FFRQXS09NRXFyMq6++Gs3NzSG3eeeddw5Z6pec/PFC+VlnnYWqqips2LABc+bMQUZGBsrLy3HfffcN+lnuuOMOzJw5E7m5ucjKysLcuXPx4osvhuTq6+sH/p2//OUvIX/X3d2N/Px8+Hw+/O///u+g+1lUVISjR4+GfM+jjz46cHsHDx4c+PrTTz+NCy64AGPGjEFaWhomTpyI733ve+jr64v6WAf/vW3btuGyyy5DTk4OCgsLceONN6K7uzsk++CDD+Lss89GUVER0tLSUFlZiXvvvXfI2/373/+OM888E9nZ2cjJycGsWbPwhz/8ISTz2muv4fzzz0d+fj6ysrJw0kkn4ec//3lIZtu2bfjc5z6HgoICpKen45RTTsGaNWtCMrE8X77yla+E/P7z8/Nx1llnDSpZYh/T4HPG6H//938H3aeysjJ85StfCck98cQT8Pl8KCsrC/l6U1MTrrnmGkyYMAFJSUkD93fEiBGD/i2jsrKysKWtPp9vUH716tWYOXMmMjIyUFBQgC9+8YvYvXv3kD9ntNcGAPT09GDZsmU47rjjkJaWhvHjx+OWW25BT0/PoOy///1v+n4aBZ+7Q/38+sc5lucHgIHXwqhRo5CRkYETTzwR/+///b+QfzPSn+DKzllnnRUyIQT0r5z7/f5Br4Unnnhi4HcwcuRILFy4EHv37g3JfOUrXxl4nkycOBGnnXYaWlpakJGRMejnE0LYR8rthBBxCQ6S0tLSTH3/qlWr8Pbbbw/6+nPPPYcdO3bgqquuQnFxMd555x088MADeOedd1BbWzvoIuree+8NudA0DtpaW1tx/vnn47LLLsPll1+Oxx9/HNdffz1SU1Nx9dVXAwA6Ojrw29/+FpdffjkWL16MQ4cO4Xe/+x0+9alPYf369Zg+fXrIbaanp+PBBx/EJZdcMvC1J598ctAgRO/QoUP461//is985jMDX3vwwQeRnp4+6PseeughjBgxAjfddBNGjBiBF154AXfccQc6Ojrw4x//OOy/oXfZZZehrKwMK1asQG1tLX7xi1+gtbUVjzzySMhjN2XKFCxYsADJycl45pln8NWvfhWBQAA33HBDyP25+uqrMWXKFCxduhR5eXl48803sXbtWnzpS18C0P97u/DCC1FSUoIbb7wRxc
XFePfdd/HXv/4VN954IwDgnXfewemnn46xY8fi1ltvRVZWFh5//HFccskl+POf/xzy2BiFe74AwMiRI/Gzn/0MQP9G+J///Oc4//zzsXv3buTl5Vn2mEZz7NixgYtvo0WLFuFf//oXvv71r2PatGlISkrCAw88gI0bN1K3PX36dHz7298O+dojjzyC5557LuRrP/jBD3D77bfjsssuw3/913/hwIED+OUvf4l58+bhzTffHHg8AO61EQgEsGDBArzyyitYsmQJJk+ejLfffhs/+9nP8P777w+aLAj6xje+gVmzZoW9n1YL9/x46623MHfuXKSkpGDJkiUoKyvD9u3b8cwzz+AHP/gBLr30Uhx33HED+W9961uYPHkylixZMvA1fTmx3oMPPojbbrsNP/nJTwZeB0D/c+2qq67CrFmzsGLFCuzfvx8///nP8eqrrw76HRjdcccdEd9HhBA20IQQIg733HOPBkDbvHlzyNfPPPNMbcqUKSFfe/DBBzUAWl1dnaZpmtbd3a1NmDBB+/SnP60B0B588MGBbFdX16B/69FHH9UAaC+99NLA15YtW6YB0A4cOBD2Pp555pkaAO0nP/nJwNd6enq06dOna0VFRVpvb6+maZp27NgxraenJ+R7W1tbtdGjR2tXX331wNfq6uo0ANrll1+uJScna42NjQN/98lPflL70pe+pAHQfvzjHw+6n5dffrl24YUXDnx9586dmt/v1y6//PJBP8dQj8G1116rZWZmat3d3WF/Xv2/t2DBgpCvf/WrXx30+xrq3/nUpz6lVVRUDPx/W1ublp2drZ122mnakSNHQrKBQEDTtP7Hr7y8XCstLdVaW1uHzGha/2M0derUkJ8hEAhoc+bM0Y4//viBr8XyfFm0aJFWWloa8m8+8MADGgBt/fr1EX/WoR7ToZ6/mqZpP/7xj0Puk6ZpWmlpqbZo0aKB///Nb36jpaWlaZ/4xCdC7tORI0c0v9+vXXvttSG3uWjRIi0rK2vQv2VUWlqqXXDBBYO+fsMNN2j6j/P6+notKSlJ+8EPfhCSe/vtt7Xk5OSQr7OvjVWrVml+v197+eWXQ27zvvvu0wBor776asjX//nPf2oAtD/96U9h72c4y5cv1wCEPGeCP7/+cY7l+TFv3jwtOztb27lzZ8htGv+NcP+W3plnnqmdeeaZmqZp2t/+9jctOTlZ+/a3vx2S6e3t1YqKirSqqqqQ18tf//pXDYB2xx13DHzN+NzdsmWL5vf7B34O/XNNCGEfKbcTQsQlWP42atSomL/317/+NZqbm7Fs2bJBf5eRkTHw393d3Th48CBmz54NAPSsu15ycjKuvfbagf9PTU3Ftddei6amJmzYsAEAkJSUhNTUVAD9M+ctLS04duwYTjnllCH/zRkzZmDKlClYtWoVAGDnzp148cUXB5Ve6V199dVYu3YtGhsbAQAPP/wwqqurccIJJwzK6h+DQ4cO4eDBg5g7dy66urqwbds26ufWrwQBwNe//nUAwLPPPjvkv9Pe3o6DBw/izDPPxI4dO9De3g6gf4Xo0KFDuPXWW5Genh5ym8FVvTfffBN1dXX45je/OWiWPJhpaWnBCy+8gMsuu2zgZzp48CCam5vxqU99Ch988MGgcqSgSM8XoP93Fry9TZs24ZFHHkFJSUnICkAsj2lfX9/A7QX/dHV1DflvB3V1deGuu+7C1772NUyYMCHk7zo7OxEIBFBYWBjxNuL15JNPIhAI4LLLLgu578XFxTj++OMHlY8yr40nnngCkydPxqRJk0Ju8+yzzwaAQbcZXAUxPlcYRUVFAPpXA2MR7vlx4MABvPTSS7j66qsH/U6Y8r9w1q9fj8suuwyf/exnB61CvvHGG2hqasJXv/rVkMfgggsuwKRJk/C3v/0t7O0uXboUM2bMwOc//3nT900IET8ptxNCxGXnzp1ITk6OeZDU3t6OH/7wh7jpppswevToQX/f0tKC5cuX449//COampoGfW
+sxowZM2gjdHBgUl9fPzAAe/jhh/GTn/wE27ZtC9k7VF5ePuTtXnXVVXjggQfw3//933jooYcwZ84cHH/88WHvx/T
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Выбросы в датасете:\n",
|
|||
|
" HeartDisease BMI Smoking AlcoholDrinking Stroke PhysicalHealth \\\n",
|
|||
|
"2 No 26.58 Yes No No 20.0 \n",
|
|||
|
"4 No 23.71 No No No 28.0 \n",
|
|||
|
"5 Yes 28.87 Yes No No 6.0 \n",
|
|||
|
"6 No 21.63 No No No 15.0 \n",
|
|||
|
"10 Yes 34.30 Yes No No 30.0 \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"319774 No 20.36 No No No 30.0 \n",
|
|||
|
"319779 No 23.38 Yes No No 30.0 \n",
|
|||
|
"319782 No 31.89 Yes No No 30.0 \n",
|
|||
|
"319787 No 36.54 No No No 7.0 \n",
|
|||
|
"319790 Yes 27.41 Yes No No 7.0 \n",
|
|||
|
"\n",
|
|||
|
" MentalHealth DiffWalking Sex AgeCategory Race Diabetic \\\n",
|
|||
|
"2 30.0 No Male 65-69 White Yes \n",
|
|||
|
"4 0.0 Yes Female 40-44 White No \n",
|
|||
|
"5 0.0 Yes Female 75-79 Black No \n",
|
|||
|
"6 0.0 No Female 70-74 White No \n",
|
|||
|
"10 0.0 Yes Male 60-64 White Yes \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"319774 0.0 Yes Female 55-59 Hispanic Yes \n",
|
|||
|
"319779 0.0 Yes Female 70-74 Hispanic No \n",
|
|||
|
"319782 30.0 Yes Female 55-59 Hispanic No \n",
|
|||
|
"319787 0.0 No Male 30-34 Hispanic No \n",
|
|||
|
"319790 0.0 Yes Male 60-64 Hispanic Yes \n",
|
|||
|
"\n",
|
|||
|
" PhysicalActivity GenHealth SleepTime Asthma KidneyDisease SkinCancer \n",
|
|||
|
"2 Yes Fair 8.0 Yes No No \n",
|
|||
|
"4 Yes Very good 8.0 No No No \n",
|
|||
|
"5 No Fair 12.0 No No No \n",
|
|||
|
"6 Yes Fair 4.0 Yes No Yes \n",
|
|||
|
"10 No Poor 15.0 Yes No No \n",
|
|||
|
"... ... ... ... ... ... ... \n",
|
|||
|
"319774 Yes Fair 8.0 No No No \n",
|
|||
|
"319779 Yes Fair 5.0 No No No \n",
|
|||
|
"319782 No Fair 4.0 No No No \n",
|
|||
|
"319787 No Good 9.0 No No No \n",
|
|||
|
"319790 No Fair 6.0 Yes No No \n",
|
|||
|
"\n",
|
|||
|
"[47136 rows x 18 columns]\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0kAAAIjCAYAAADWYVDIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABihklEQVR4nO3deXhTZd7G8TvdS2lLC3RBtoKgFgQE2VQWRxBcQFxe1EEHxVHGQUdFUZkREVzqLo4LgjosgjpuqLigiAouIAiClqIiVgRpQSiU0tKF5rx/MI1J1yeQNOnp93NdvS45uZs8SY9JfufZHJZlWQIAAAAASJJCAt0AAAAAAAgmFEkAAAAA4IYiCQAAAADcUCQBAAAAgBuKJAAAAABwQ5EEAAAAAG4okgAAAADADUUSAAAAALihSAIAAAAANxRJAAAAAOCGIgmAX7z22mtyOBzV/nTt2jXQzQMAAKhRWKAbAMDe/vnPf+qEE05w/fvee+8NYGsAAADqRpEEwK+GDh2qwYMHu/793HPPaffu3YFrEAAAQB0YbgfAL0pLSyVJISF1v83MnTtXDodDv/zyi+uY0+lUt27d5HA4NHfuXNfxb7/9VldccYU6dOigqKgopaSkaNy4cdqzZ4/Hfd51113VDvULC/vj2tDgwYPVtWtXrV27Vqeccoqio6OVlpamZ555pspzufPOO9WrVy/Fx8crJiZGAwYM0CeffOKR++WXX1yP8+abb3rcVlxcrISEBDkcDj388MNV2pmUlKSysjKP33nppZdc9+deWL711ls655xz1KpVK0VGRqpjx466++67VV5eXudrXfF433//vUaPHq24uDg1b95cN9xwg4qLiz2yc+bM0Z/+9CclJSUpMjJS6enpmjlzZrX3+/7772vQoEGKjY1VXFycevfurRdffNEj89VXX+nss89WQkKCYmJi1K1bNz3++OMeme+//14XXXSREhMTFRUVpZNPPllvv/22R8ab8+WKK67w+PsnJCRo8ODB+uyzzzzu0/Q1rThnKnv44YertKl9+/a64oorPHKvvvqqHA6H2rdv73F8165duuqqq9S2bVuFhoa62tu0adMqj1VZ+/btaxza6nA4PLKHDh3S3XffrY4dOyoyMlLt27fXP//5T5WUlFS5X5O/qfs5X9vjOp1OzZgxQ126dFFUVJSSk5M1fvx47d271+j5VX4dP/30UzkcDn366aeuY4MHD/a4ICNJa9asqbY9krRgwQL16dNHTZo0UUJCggYOHKgPP/zQ9Zi1vaYVf7+K5+9+zhUUFKhXr15KS0tTTk5OjTlJmjBhghwOR5XnByDw6EkC4BcVRVJkZOQR/f4LL7yg7777rsrxpUuX6ueff9aVV16plJQUbdy4UbNnz9bGjRu1atWqKl+GZs6c6fFFs3LRtnfvXp199tkaPXq0Lr30Ur3yyiu69tprFRERoXHjxkmS9u/fr+eee06XXnqprr76ahUUFOj555/XsGHDtHr1avXo0cPjPqOiojRnzhyNGjXKdeyNN96oUoS4Kygo0DvvvKPzzz/fdWzOnDmKioqq8ntz585V06ZNNXHiRDVt2lQff/yx7rzzTu3fv18PPfRQjY/hbvTo0Wrfvr0yMjK0atUq/fvf/9bevXs1f/58j9euS5cuGjlypMLCwrR48WL9/e9/l9Pp1IQJEzzaM27cOHXp0kWTJ09Ws2bN9M0332jJkiX685//LOnw3+3cc89VamqqbrjhBqWkpGjTpk165513dMMNN0iSNm7cqFNPPVXHHHOMbr/9dsXExOiVV17RqFGj9Prrr3u8NpXVdL5IUosWLfTYY49JkrZv367HH39cZ599trZt26ZmzZr57DWty6FDh/Svf/2r2tvGjh2rjz76SNdff726d++u0NBQzZ49W+vWrTO67x49eujmm2/2ODZ//nwtXbrU49hf//pXzZs3TxdddJFuvvlmffXVV8rIyNCmTZu0aNEiV8
7kb+rummuu0YABAyQdPtfd70uSxo8fr7lz5+rKK6/UP/7xD2VnZ+vJJ5/UN998oy+++ELh4eFGz9Nbt912W7XHp02bprvuukunnHKKpk+froiICH311Vf6+OOPdeaZZ2rGjBk6cOCAJGnTpk267777PIYO11S8lpWV6cILL9Svv/6qL774QqmpqTW27aefftKzzz57lM8QgN9YAOAHM2bMsCRZGzZs8Dg+aNAgq0uXLh7H5syZY0mysrOzLcuyrOLiYqtt27bWWWedZUmy5syZ48oWFRVVeayXXnrJkmStWLHCdWzq1KmWJOv333+vsY2DBg2yJFmPPPKI61hJSYnVo0cPKykpySotLbUsy7IOHTpklZSUePzu3r17reTkZGvcuHGuY9nZ2ZYk69JLL7XCwsKs3Nxc121nnHGG9ec//9mSZD300ENV2nnppZda5557ruv41q1brZCQEOvSSy+t8jyqew3Gjx9vNWnSxCouLq7x+bo/3siRIz2O//3vf6/y96rucYYNG2Z16NDB9e99+/ZZsbGxVt++fa2DBw96ZJ1Op2VZh1+/tLQ0q127dtbevXurzVjW4dfoxBNP9HgOTqfTOuWUU6xOnTq5jnlzvowdO9Zq166dx2POnj3bkmStXr261uda3Wta3flrWZb10EMPebTJsiyrXbt21tixY13/fvrpp63IyEjr9NNP92jTwYMHrZCQEGv8+PEe9zl27FgrJiamymNV1q5dO+ucc86pcnzChAmW+8f8+vXrLUnWX//6V4/cLbfcYkmyPv74Y8uyzP6mFTZv3mxJsubNm+c6VnGOVfjss88sSdbChQs9fnfJkiXVHq8sLS3N+stf/uJx7JNPPrEkWZ988onr2KBBg6xBgwa5/v3ee+9Zkqzhw4d7tGfz5s1WSEiIdf7551vl5eW1Pr+aHqtCxf/zc+bMsZxOpzVmzBirSZMm1ldffVVjrsLo0aOtrl27Wm3atPE4TwAEB4bbAfCLiuFvLVu29Pp3n3rqKe3Zs0dTp06tclt0dLTrv4uLi7V7927169dPkoyvursLCwvT+PHjXf+OiIjQ+PHjtWvXLq1du1aSFBoaqoiICEmHhw3l5eXp0KFDOvnkk6t9zJ49e6pLly564YUXJElbt27VJ598UuuQmnHjxmnJkiXKzc2VJM2bN0/9+/dX586dq2TdX4OCggLt3r1bAwYMUFFRkb7//nuj5+3eEyRJ119/vSTpvffeq/Zx8vPztXv3bg0aNEg///yz8vPzJR3uISooKNDtt9+uqKgoj/us6NX75ptvlJ2drRtvvNHVc1M5k5eXp48//lijR492Pafdu3drz549GjZsmDZv3qzffvut2udS2/kiHf6bVdzf+vXrNX/+fKWmpnosKOLNa1peXu66v4qfoqKiah+7QlFRkaZPn67rrrtObdu29bitsLBQTqdTzZs3r/U+jlbF33bixIkexyt6oN59911JZn/TCiY9xq+++qri4+M1dOhQj9esV69eatq0aZVhq5UlJSVp+/btBs/wD5ZlafLkybrwwgvVt29fj9vefPNNOZ1O3XnnnVV6lqsblmdq0qRJWrhwoV555RX16dOn1uzatWv16quvKiMjw2hIMoD6x/+ZAPxi69atCgsL87pIys/P13333aeJEycqOTm5yu15eXm64YYblJycrOjoaLVs2VJpaWmu3/VWq1atFBMT43GsojBxn18yb948devWTVFRUWrevLlatmypd999t8bHvPLKKzVnzhxJh4cunXLKKerUqVON7ejRo4e6du2q+fPny7Is19Ck6mzcuFHnn3++4uPjFRcXp5YtW+qyyy6TZP4aVG5Lx44dFRIS4vGcv/jiCw0ZMkQxMTFq1qyZWrZsqX/+858ej7NlyxZJqnVZd5PMTz/9JMuyNGXKFLVs2dLjp6L42bVrV5Xfq+t8kaRt27a57uukk07Sli1b9Prrr3sMmfLmNf3+++9rbGNNHn30URUXF7teP3
fNmzdXp06d9Nxzz+nDDz/Url27tHv37mrnCR2NrVu3KiQkRMcee6zH8ZSUFDVr1kxbt26VZPb3qrBv3z5JNQ8/k6T
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"IQR_MULTIPLIER = 1.5  # Tukey's rule: values further than 1.5 * IQR from the quartiles count as outliers\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"def plot_physical_health_vs_bmi(physical_health, bmi, title):\n",
|
|||
|
"    \"\"\"Show a scatter plot of BMI (y axis) against PhysicalHealth (x axis) with the given title.\"\"\"\n",
|
|||
|
"    fig, ax = plt.subplots(figsize=(10, 6))\n",
|
|||
|
"    ax.scatter(physical_health, bmi)\n",
|
|||
|
"    ax.set_xlabel('Физическое здоровье')\n",
|
|||
|
"    ax.set_ylabel('ИМТ')\n",
|
|||
|
"    ax.set_title(title)\n",
|
|||
|
"    plt.show()\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# Read the original values from df (the cells above never modify it) rather than from cleaned_df,\n",
|
|||
|
"# so re-running this cell gives the same bounds and the same \"before\" plot every time.\n",
|
|||
|
"physical_health_raw = df.loc[cleaned_df.index, \"PhysicalHealth\"]\n",
|
|||
|
"\n",
|
|||
|
"plot_physical_health_vs_bmi(physical_health_raw, cleaned_df['BMI'], 'Диаграмма рассеивания перед чисткой')\n",
|
|||
|
"\n",
|
|||
|
"Q1 = physical_health_raw.quantile(0.25)\n",
|
|||
|
"Q3 = physical_health_raw.quantile(0.75)\n",
|
|||
|
"\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"\n",
|
|||
|
"threshold = IQR_MULTIPLIER * IQR\n",
|
|||
|
"lower_bound = Q1 - threshold\n",
|
|||
|
"upper_bound = Q3 + threshold\n",
|
|||
|
"\n",
|
|||
|
"# Boolean mask (indexed like cleaned_df) marking the rows outside the fences\n",
|
|||
|
"outliers = (physical_health_raw < lower_bound) | (physical_health_raw > upper_bound)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Выбросы в датасете:\")\n",
|
|||
|
"# Take the rows from df so they are printed with their original PhysicalHealth values\n",
|
|||
|
"print(df.loc[outliers[outliers].index])\n",
|
|||
|
"\n",
|
|||
|
"# Replace every outlier with the median of the original column\n",
|
|||
|
"median_score = physical_health_raw.median()\n",
|
|||
|
"cleaned_df.loc[outliers, \"PhysicalHealth\"] = median_score\n",
|
|||
|
"\n",
|
|||
|
"plot_physical_health_vs_bmi(cleaned_df['PhysicalHealth'], cleaned_df['BMI'], 'Диаграмма рассеивания после чистки')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 153,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 181029\n",
|
|||
|
"Размер контрольной выборки: 60344\n",
|
|||
|
"Размер тестовой выборки: 60344\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"\n",
|
|||
|
"train_df, test_df = train_test_split(cleaned_df, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данные недостаточно сбалансированы"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 154,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение ИМТ в обучающей выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 1941\n",
|
|||
|
"27.46 1456\n",
|
|||
|
"27.44 1416\n",
|
|||
|
"27.12 1258\n",
|
|||
|
"24.41 1247\n",
|
|||
|
" ... \n",
|
|||
|
"55.95 1\n",
|
|||
|
"54.56 1\n",
|
|||
|
"53.72 1\n",
|
|||
|
"32.29 1\n",
|
|||
|
"69.88 1\n",
|
|||
|
"Name: count, Length: 3243, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в контрольной выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 657\n",
|
|||
|
"27.46 494\n",
|
|||
|
"24.41 474\n",
|
|||
|
"27.44 463\n",
|
|||
|
"25.10 379\n",
|
|||
|
" ... \n",
|
|||
|
"43.03 1\n",
|
|||
|
"55.56 1\n",
|
|||
|
"44.14 1\n",
|
|||
|
"16.97 1\n",
|
|||
|
"39.19 1\n",
|
|||
|
"Name: count, Length: 2483, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в тестовой выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 646\n",
|
|||
|
"27.44 506\n",
|
|||
|
"27.46 475\n",
|
|||
|
"24.41 452\n",
|
|||
|
"27.12 426\n",
|
|||
|
" ... \n",
|
|||
|
"16.53 1\n",
|
|||
|
"13.54 1\n",
|
|||
|
"41.06 1\n",
|
|||
|
"54.28 1\n",
|
|||
|
"39.91 1\n",
|
|||
|
"Name: count, Length: 2539, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['BMI'].value_counts()\n",
|
|||
|
" print(f\"Распределение ИМТ в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": ["Балансировка выборок: оверсэмплинг и андерсэмплинг по бинам ИМТ"]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 155,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Оверсэмплинг:\n",
|
|||
|
"Распределение ИМТ в обучающей выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 1941\n",
|
|||
|
"27.46 1472\n",
|
|||
|
"27.44 1432\n",
|
|||
|
"27.12 1258\n",
|
|||
|
"24.41 1247\n",
|
|||
|
" ... \n",
|
|||
|
"27.13 1\n",
|
|||
|
"29.59 1\n",
|
|||
|
"24.76 1\n",
|
|||
|
"53.72 1\n",
|
|||
|
"31.03 1\n",
|
|||
|
"Name: count, Length: 3243, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в контрольной выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 657\n",
|
|||
|
"27.46 496\n",
|
|||
|
"24.41 474\n",
|
|||
|
"27.44 465\n",
|
|||
|
"25.10 379\n",
|
|||
|
" ... \n",
|
|||
|
"46.66 1\n",
|
|||
|
"46.76 1\n",
|
|||
|
"68.59 1\n",
|
|||
|
"73.39 1\n",
|
|||
|
"54.57 1\n",
|
|||
|
"Name: count, Length: 2483, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в тестовой выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 646\n",
|
|||
|
"27.44 510\n",
|
|||
|
"27.46 479\n",
|
|||
|
"24.41 452\n",
|
|||
|
"27.12 426\n",
|
|||
|
" ... \n",
|
|||
|
"46.72 1\n",
|
|||
|
"60.69 1\n",
|
|||
|
"47.44 1\n",
|
|||
|
"53.48 1\n",
|
|||
|
"58.16 1\n",
|
|||
|
"Name: count, Length: 2539, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Андерсэмплинг:\n",
|
|||
|
"Распределение ИМТ в обучающей выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 1929\n",
|
|||
|
"27.46 1456\n",
|
|||
|
"27.44 1416\n",
|
|||
|
"27.12 1238\n",
|
|||
|
"24.41 1238\n",
|
|||
|
" ... \n",
|
|||
|
"56.30 1\n",
|
|||
|
"41.59 1\n",
|
|||
|
"59.08 1\n",
|
|||
|
"61.66 1\n",
|
|||
|
"65.19 1\n",
|
|||
|
"Name: count, Length: 3241, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в контрольной выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 657\n",
|
|||
|
"27.46 494\n",
|
|||
|
"24.41 473\n",
|
|||
|
"27.44 463\n",
|
|||
|
"27.12 376\n",
|
|||
|
" ... \n",
|
|||
|
"48.40 1\n",
|
|||
|
"63.67 1\n",
|
|||
|
"48.68 1\n",
|
|||
|
"39.19 1\n",
|
|||
|
"30.76 1\n",
|
|||
|
"Name: count, Length: 2483, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение ИМТ в тестовой выборке:\n",
|
|||
|
"BMI\n",
|
|||
|
"26.63 639\n",
|
|||
|
"27.44 506\n",
|
|||
|
"27.46 475\n",
|
|||
|
"24.41 444\n",
|
|||
|
"27.12 423\n",
|
|||
|
" ... \n",
|
|||
|
"34.89 1\n",
|
|||
|
"30.75 1\n",
|
|||
|
"41.06 1\n",
|
|||
|
"54.28 1\n",
|
|||
|
"39.91 1\n",
|
|||
|
"Name: count, Length: 2539, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler\n",
|
|||
|
"\n",
|
|||
|
"def binning(target, bins):\n",
|
|||
|
" return pd.qcut(target, q=bins, labels=False)\n",
|
|||
|
"\n",
|
|||
|
"train_df['BMI_binned'] = binning(train_df['BMI'], bins=2)\n",
|
|||
|
"val_df['BMI_binned'] = binning(val_df['BMI'], bins=2)\n",
|
|||
|
"test_df['BMI_binned'] = binning(test_df['BMI'], bins=2)\n",
|
|||
|
"\n",
|
|||
|
"def oversample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"def undersample(df, target_column):\n",
|
|||
|
" X = df.drop(target_column, axis=1)\n",
|
|||
|
" y = df[target_column]\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" x_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df, 'BMI_binned')\n",
|
|||
|
"val_df_oversampled = oversample(val_df, 'BMI_binned')\n",
|
|||
|
"test_df_oversampled = oversample(test_df, 'BMI_binned')\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df, 'BMI_binned')\n",
|
|||
|
"val_df_undersampled = undersample(val_df, 'BMI_binned')\n",
|
|||
|
"test_df_undersampled = undersample(test_df, 'BMI_binned')\n",
|
|||
|
"\n",
|
|||
|
"print(\"Оверсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке\")\n",
|
|||
|
"\n",
|
|||
|
"print(\"Андерсэмплинг:\")\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": ".venv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.13.0"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|