2024-12-05 20:21:14 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Н а б о р данных \"Ближайшие к Земле объекты\" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Н а б о р е данных \"Ближайшие к Земле объекты\" \n",
"# \n",
"# 1. Анализ сведений о наборе данных \n",
"# Н а б о р данных о \"ближайших к Земле объектах\" относится к области астрофизики и исследования космических угроз. Проблемная область — мониторинг околоземных объектов (Near-Earth Objects, NEO), которые могут представлять потенциальную опасность для Земли. Цель анализа — оценить характеристики объектов, их орбиты, скорость, и расстояние до Земли, чтобы предсказать возможность столкновения. \n",
"# \n",
"# 2. Анализ содержимого набора данных \n",
"# Объекты наблюдения: \n",
"# Космические тела, которые классифицируются как \"околоземные объекты\". \n",
"# Атрибуты объектов: \n",
"# id — идентификатор объекта. \n",
"# name — название объекта. \n",
"# est_diameter_min и est_diameter_max — оценочные минимальный и максимальный диаметры. \n",
"# relative_velocity — относительная скорость объекта. \n",
"# miss_distance — минимальное расстояние до Земли. \n",
"# orbiting_body — небесное тело, вокруг которого объект совершает движение (обычно Солнце). \n",
"# sentry_object — булевый признак, указывающий, находится ли объект в базе наблюдения системы Sentry. \n",
"# absolute_magnitude — абсолютная звёздная величина объекта. \n",
"# hazardous — булевый признак опасности объекта. \n",
"# Связи между объектами: \n",
"# Объекты могут быть связаны через общее орбитальное тело или группироваться по характеристикам орбит. \n",
"# \n",
"# 3. Примеры бизнес-целей \n",
"# Уведомление о космических угрозах: \n",
"# Построение системы прогнозирования опасных объектов для оперативного предупреждения. \n",
"# Эффект: повышение безопасности через своевременные предупреждения. \n",
"# \n",
"# Оптимизация миссий для изучения астероидов: \n",
"# Идентификация объектов, подходящих для изучения (на основе характеристик орбиты и размера). \n",
"# Эффект: снижение затрат и повышение научной ценности миссий. \n",
"# \n",
"# 4. Примеры целей технического проекта \n",
"# Бизнес-цель: Предсказание опасности объекта. \n",
"# Входные данные: атрибуты объекта (диаметр, скорость, расстояние). \n",
"# Целевой признак: hazardous. \n",
"# \n",
"# Бизнес-цель: Классификация объектов по характеристикам. \n",
"# Входные данные: абсолютная звёздная величина, диаметр, скорость. \n",
"# Целевой признак: принадлежность к определённой группе объектов. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>est_diameter_min</th>\n",
" <th>est_diameter_max</th>\n",
" <th>relative_velocity</th>\n",
" <th>miss_distance</th>\n",
" <th>orbiting_body</th>\n",
" <th>sentry_object</th>\n",
" <th>absolute_magnitude</th>\n",
" <th>hazardous</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2162635</td>\n",
" <td>162635 (2000 SS164)</td>\n",
" <td>1.198271</td>\n",
" <td>2.679415</td>\n",
" <td>13569.249224</td>\n",
" <td>5.483974e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>16.73</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2277475</td>\n",
" <td>277475 (2005 WK4)</td>\n",
" <td>0.265800</td>\n",
" <td>0.594347</td>\n",
" <td>73588.726663</td>\n",
" <td>6.143813e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>20.00</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2512244</td>\n",
" <td>512244 (2015 YE18)</td>\n",
" <td>0.722030</td>\n",
" <td>1.614507</td>\n",
" <td>114258.692129</td>\n",
" <td>4.979872e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>17.83</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3596030</td>\n",
" <td>(2012 BV13)</td>\n",
" <td>0.096506</td>\n",
" <td>0.215794</td>\n",
" <td>24764.303138</td>\n",
" <td>2.543497e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>22.20</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3667127</td>\n",
" <td>(2014 GE35)</td>\n",
" <td>0.255009</td>\n",
" <td>0.570217</td>\n",
" <td>42737.733765</td>\n",
" <td>4.627557e+07</td>\n",
" <td>Earth</td>\n",
" <td>False</td>\n",
" <td>20.09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name est_diameter_min est_diameter_max \\\n",
"0 2162635 162635 (2000 SS164) 1.198271 2.679415 \n",
"1 2277475 277475 (2005 WK4) 0.265800 0.594347 \n",
"2 2512244 512244 (2015 YE18) 0.722030 1.614507 \n",
"3 3596030 (2012 BV13) 0.096506 0.215794 \n",
"4 3667127 (2014 GE35) 0.255009 0.570217 \n",
"\n",
" relative_velocity miss_distance orbiting_body sentry_object \\\n",
"0 13569.249224 5.483974e+07 Earth False \n",
"1 73588.726663 6.143813e+07 Earth False \n",
"2 114258.692129 4.979872e+07 Earth False \n",
"3 24764.303138 2.543497e+07 Earth False \n",
"4 42737.733765 4.627557e+07 Earth False \n",
"\n",
" absolute_magnitude hazardous \n",
"0 16.73 False \n",
"1 20.00 True \n",
"2 17.83 False \n",
"3 22.20 False \n",
"4 20.09 True "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/neo.csv')\n",
2024-12-05 20:21:14 +04:00
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество пропущенных значений в каждом столбце:\n",
"id 0\n",
"est_diameter_min 0\n",
"est_diameter_max 0\n",
"relative_velocity 0\n",
"miss_distance 0\n",
"absolute_magnitude 0\n",
"hazardous 0\n",
"dtype: int64\n",
"Количество выбросов в столбце 'est_diameter_min': 8306\n",
"Количество выбросов в столбце 'est_diameter_max': 8306\n",
"Количество выбросов в столбце 'relative_velocity': 1574\n",
"Количество выбросов в столбце 'miss_distance': 0\n",
"Количество выбросов в столбце 'absolute_magnitude': 101\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPeCAYAAADj01PlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXwU9f3H8fdujs2dkEASIhEDWg5FqOhP4oGgSMRgPVBLRUVFqTbYAhUtrSKHSkURL5RaLdAWqtB6ggLhFgmIUQRBEDUYICSB3Pe18/sj7MqSgyQkmWzyej4e+yCZ+e7MZyZhP5vPfuY7FsMwDAEAAAAAAAAAgBqsZgcAAAAAAAAAAEBbRREdAAAAAAAAAIA6UEQHAAAAAAAAAKAOFNEBAAAAAAAAAKgDRXQAAAAAAAAAAOpAER0AAAAAAAAAgDpQRAcAAAAAAAAAoA4U0QEAAAAAAAAAqANFdAAAAAAAAAAA6kARHW3OOeeco3vuucfsMNq95557Tj169JCHh4cGDBhgdjhNdurvy8aNG2WxWLRx40bTYuoIhgwZoiFDhpgdBgCTkbNbBzkbANCayO+tozXze0v8TO+55x6dc845zbpNsy1atEgWi0UHDx5ske1Pnz5dFoulRbaNlkcRHS3K8QL0xRdf1Lp+yJAhuuCCC854Px9//LGmT59+xtvpKNasWaNHH31Ul19+uRYuXKhnnnmmVfa7dOlSvfjii62yr7asuLhY06dPp2gAoE0hZ7dN5GxzkbMBuDvye9tkVn5vrLS0NE2fPl07d+40O5R265lnntH7779vdhhoAE+zAwBOtX//flmtjft85+OPP9b8+fNJ2g20fv16Wa1WvfXWW/L29m61/S5dulTffPONJk6c2GL7GDx4sEpKSlr1uBqruLhYM2bMkCS37eZes2aN2SEAaAPI2S2PnG2u9pCzAaCxyO8tz6z83lhpaWmaMWOGzjnnnBrd8n//+99lt9vNCcxNPf744/rTn/7ksuyZZ57RrbfeqptuusmcoNBgdKKjzbHZbPLy8jI7jEYpKioyO4RGyczMlK+vb5tO1k1ltVrl4+PT6Dd97UFr/h56e3u3y98fAI1Dzm555Oz2yd1+DwF0LOT3lnem+b24uLiZI2o8Ly8v2Ww2s8NwK56envLx8TE7DDRRx3vHijbv1Lm6KioqNGPGDJ133nny8fFRWFiYrrjiCiUmJkqqnodr/vz5kiSLxeJ8OBQVFemPf/yjoqOjZbPZ1KtXLz3//PMyDMNlvyUlJfr973+vzp07KzAwUL/61a905MgRWSwWl0/THXNY7d27V3fccYc6deqkK664QpK0a9cu3XPPPerRo4d8fHwUGRmp++67T1lZWS77cmzju+++05133qng4GB16dJFTzzxhAzD0KFDh3TjjTcqKChIkZGRmjt3boPOXWVlpWbNmqWePXvKZrPpnHPO0Z///GeVlZU5x1gsFi1cuFBFRUXOc7Vo0aJ6t7t9+3Zdd911Cg4Olp+fn6666ip99tlnLmMKCgo0ceJEnXPOObLZbAoPD9e1116rL7/8UlJ199bKlSv1008/OffbmPnTDMPQU089pW7dusnPz09Dhw7Vnj17aoyrbX7VTz/9VLfddpvOPvts2Ww2RUdHa9KkSSopKXF57j333KOAgAClpqZq5MiRCggI0FlnneX8/dq9e7euvvpq+fv7q3v37lq6dGmN/efm5mrixInO37dzzz1Xzz77rPMT+oMHD6pLly6SpBkzZjjPxcm/Y/v27dOtt96q0NBQ+fj46OKLL9aHH37osh/HZZmbNm3S7373O4WHh6tbt24NOpcHDx6UxWLR888/r/nz56tHjx7y8/PT8OHDdejQIRmGoVmzZqlbt27y9fXVjTfeqOzsbJdtnDonuuO8L1u2TE8//bS6desmHx8fXXPNNfr+++8bFBcA90POJmfXhpzdtnL2Bx98oPj4eEVFRclms6lnz56aNWuWqqqqnGO+/fZb+fr66u6773Z57pYtW+Th4aHHHnusQfECaB/I720rvzum3ElOTtbgwYPl5+enP//5z5KksrIyPfnkkzr33HOdefPRRx912V9tsrOz9cgjj6hfv34KCAhQUFCQRowYoa+//to5ZuPGjbrkkkskSffee2+NWE+eE72iokKhoaG69957a+wrPz9fPj4+euSRR5zLmhr3yZ5//nlZLBb99NNPNdZNnTpV3t7eysnJcS5ryHukurz22ms6//zzZbPZFBUVpYSEBOXm5tYYt337dl1//fXq1KmT/P39deGFF+qll15yrj91TnSLxaKioiItXrzYeX7vuecebdiwQRaLRe+9916NfSxdulQWi0VJSUkNih3Nh+lc0Cry8vJ0/PjxGssrKipO+9zp06dr9uzZuv/++/V///d/ys/P1xdffKEvv/xS1157rX77298qLS1NiYmJ+te//uXyXMMw9Ktf/UobNmzQuHHjNGDAAK1evVpTpkzRkSNHNG/ePOfYe+65R8uWLdNdd92lQYMGadOmTYqPj68zrttuu03nnXeennnmGWfyT0xM1I8//qh7771XkZGR2rNnj9544w3t2bNH27Ztq3EDiV//+tfq06eP/vrXv2rlypV66qmnFBoaqr/97W+6+uqr9eyzz2rJkiV65JFHdMkll2jw4MH1nqv7779fixcv1q233qo//vGP2r59u2bPnq1vv/3W+eL7r3/9S2+88YY+//xzvfnmm5Kkyy67rM5trl+/XiNGjNDAgQP15JNPymq1auHChbr66qv16aef6v/+7/8kSQ8++KD++9//asKECerbt6+ysrK0ZcsWffvtt7rooov0l7/8RXl5eTp8+LDzvAcEBNR7PCebNm2annrqKV1//fW6/vrr9eWXX2r48OEqLy8/7XOXL1+u4uJiPfTQQwoLC9Pnn3+uV155RYcPH9by5ctdxlZVVWnEiBEaPHiw5syZoyVLlmjChAny9/fXX/7yF40ZM0a33HKLFixYoLvvvluxsbGKiYmRVN0NcNVVV+nIkSP67W9/q7PPPltbt27V1KlTdfToUb344ovq0qWLXn/9dT300EO6+eabdcstt0iSLrzwQknSnj17dPnll+uss87Sn/70J/n7+2vZsmW66aab9L///U8333yzS7y/+93v1KVLF02bNq3R3RdLlixReXm5Hn74YWVnZ2vOnDm6/fbbdfXVV2vjxo167LHH9P333+uVV17RI488on/84x+n3eZf//pXWa1WPfLII8rLy9OcOXM0ZswYbd++vVGxATAPOZucTc5uXzl70aJFCggI0OTJkxUQEKD169dr2rRpys/P13PPPSdJ6tOnj2bNmqUpU6bo1ltv1a9+9SsVFRXpnnvuUe/evTVz5sxGxQug7SG/u29+l6SsrCyNGDFCo0eP1p133qmIiAjZ7Xb96le/0pYtWzR+/Hj16dNHu3fv1rx58/Tdd9/VO8/2jz/+qPfff1+33XabYmJilJGRob/97W+66qqrtHfvXkVFRalPnz6aOXOmpk2bpvHjx+vKK6+sM1YvLy/dfPPNevfdd/W3v/3NpcP+/fffV1lZmUaPHi1JZxT3yW6//XY9+uijWrZsmaZMmeKybtmyZRo+fLg6deokqeHvkWozffp0zZgxQ8OGDdNDDz2k/fv36/XXX9eOHTv02WefOa/aSExM1MiRI9W1a1f94Q9/UGRkpL799lutWLFCf/jDH2rd9r/+9S/n/6vx48dLknr27KlBgwYpOjpaS5YsqfF+YsmSJerZs6diY2MbdJ7QjAygBS1cuNCQVO/j/PPPd3lO9+7djbFjxzq/79+/vxEfH1/vfhISEozafp3ff/99Q5Lx1FNPuSy/9dZbDYvFYnz//feGYRhGcnKyIcmYOHGiy7h77rnHkGQ8+eSTzmVPPvmkIcn4zW9+U2N/xcXFNZb95z//MSQZmzdvrrGN8ePHO5dVVlYa3bp1MywWi/HXv/7VuTwnJ8fw9fV1OSe12blzpyHJuP/++12WP/LII4YkY/369c5lY8eONfz9/evdnmEYht1uN8477zwjLi7OsNvtLscZExN
"text/plain": [
"<Figure size 1500x1000 with 5 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество выбросов в столбце 'est_diameter_min': 8306\n",
"Количество выбросов в столбце 'est_diameter_max': 8306\n",
"Количество выбросов в столбце 'relative_velocity': 1574\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAISCAYAAAAjjoaeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADAK0lEQVR4nOzde1xUdf7H8fcwwHAHQQFJM9LyUqZlbdLFtExSa7fNav1laWW5tdiuWtq6W+alsiyztiy3rbTd1S3dLSutFDU185KR5jWzolARVG4j98uc3x/jTI4CIg4cBl7Px2MeyTnfOedzBpoPfOZ7Pl+LYRiGAAAAAAAAAADASfzMDgAAAAAAAAAAgKaKIjoAAAAAAAAAADWgiA4AAAAAAAAAQA0oogMAAAAAAAAAUAOK6AAAAAAAAAAA1IAiOgAAAAAAAAAANaCIDgAAAAAAAABADSiiAwAAAAAAAABQA4roAAAAAAAAAADUgCI6mpxzzjlHd999t9lhNHvPPfeczj33XFmtVvXs2dPscOrtxJ+X1atXy2KxaPXq1abF1BL07dtXffv2NTsMACYjZzcOcjYAoDGR3xtHY+b3hvie3n333TrnnHO8ekyzzZs3TxaLRT/99FODHH/y5MmyWCwNcmw0PIroaFCuN6Cvvvqq2v19+/bVhRdeeMbn+fjjjzV58uQzPk5LsXz5ck2YMEFXXnml5s6dq6effrpRzrtgwQK9+OKLjXKupqy4uFiTJ0+maACgSSFnN03kbHORswH4OvJ702RWfj9dmZmZmjx5srZu3Wp2KM3W008/rcWLF5sdBurA3+wAgBPt2bNHfn6n9/nOxx9/rNmzZ5O062jVqlXy8/PTm2++qcDAwEY774IFC7Rjxw6NGTOmwc7Rp08flZSUNOp1na7i4mJNmTJFknx2Nvfy5cvNDgFAE0DObnjkbHM1h5wNAKeL/N7wzMrvpyszM1NTpkzROeecc9Js+X/84x9yOBzmBOajHnvsMf35z3/22Pb000/r1ltv1c0332xOUKgzZqKjybHZbAoICDA7jNNSVFRkdgin5dChQwoODm7Sybq+/Pz8FBQUdNq/9DUHjflzGBgY2Cx/fgCcHnJ2wyNnN0++9nMIoGUhvze8M83vxcXFXo7o9AUEBMhms5kdhk/x9/dXUFCQ2WGgnlreb6xo8k7s1VVRUaEpU6bovPPOU1BQkGJiYnTVVVcpNTVVkrMP1+zZsyVJFovF/XApKirSww8/rPbt28tms6lz5856/vnnZRiGx3lLSkr0xz/+Ua1bt1Z4eLh+/etf68CBA7JYLB6fprt6WO3atUt33HGHWrVqpauuukqStG3bNt19990699xzFRQUpPj4eN17773KycnxOJfrGN99953uvPNORUZGqk2bNnr88cdlGIb27dun3/zmN4qIiFB8fLxmzpxZp9eusrJS06ZNU8eOHWWz2XTOOefoL3/5i8rKytxjLBaL5s6dq6KiIvdrNW/evFqPu2nTJt1www2KjIxUSEiIrrnmGn3xxRceY44ePaoxY8bonHPOkc1mU2xsrK6//np9/fXXkpyzt5YuXaqff/7Zfd7T6Z9mGIaefPJJtWvXTiEhIerXr5927tx50rjq+qt+/vnnuu2223T22WfLZrOpffv2Gjt2rEpKSjyee/fddyssLEwZGRm68cYbFRYWprPOOsv987V9+3Zde+21Cg0NVYcOHbRgwYKTzp+fn68xY8a4f946deqkZ5991v0J/U8//aQ2bdpIkqZMmeJ+LY7/Gfv222916623Kjo6WkFBQbr00kv14YcfepzHdVvmmjVr9Ic//EGxsbFq165dnV7Ln376SRaLRc8//7xmz56tc889VyEhIRowYID27dsnwzA0bdo0tWvXTsHBwfrNb36j3Nxcj2Oc2BPd9bovXLhQTz31lNq1a6egoCBdd911+v777+sUFwDfQ84mZ1eHnN20cvYHH3ygwYMHKyEhQTabTR07dtS0adNUVVXlHrN7924FBwdr+PDhHs9dt26drFarHn300TrFC6B5IL83rfzuarmTlpamPn36KCQkRH/5y18kSWVlZXriiSfUqVMnd96cMGGCx/mqk5ubq0ceeUTdu3dXWFiYIiIiNHDgQH3zzTfuMatXr9Zll10mSbrnnntOivX4nugVFRWKjo7WPffcc9K57Ha7goKC9Mgjj7i31Tfu4z3//POyWCz6+eefT9o3ceJEBQYGKi8vz72tLr8j1eTVV1/VBRdcIJvNpoSEBKWkpCg/P/+kcZs2bdKgQYPUqlUrhYaG6qKLLtJLL73k3n9iT3SLxaKioiK9/fbb7tf37rvv1meffSaLxaL333//pHMsWLBAFotFGzZsqFPs8B7auaBRFBQU6MiRIydtr6ioOOVzJ0+erOnTp+u+++7Tr371K9ntdn311Vf6+uuvdf311+v3v/+9MjMzlZqaqn/9618ezzUMQ7/+9a/12WefaeTIkerZs6eWLVum8ePH68CBA5o1a5Z77N13362FCxfqrrvuUu/evbVmzRoNHjy4xrhuu+02nXfeeXr66afdyT81NVU//vij7rnnHsXHx2vnzp16/fXXtXPnTm3cuPGkBSR+97vfqWvXrnrmmWe0dOlSPfnkk4qOjtbf//53XXvttXr22Wc1f/58PfLII7rsssvUp0+fWl+r++67T2+//bZuvfVWPfzww9q0aZOmT5+u3bt3u998//Wvf+n111/Xl19+qTfeeEOSdMUVV9R4zFWrVmngwIHq1auXnnjiCfn5+Wnu3Lm69tpr9fnnn+tXv/qVJOmBBx7Qf//7X40ePVrdunVTTk6O1q1bp927d+uSSy7RX//6VxUUFGj//v3u1z0sLKzW6znepEmT9OSTT2rQoEEaNGiQvv76aw0YMEDl5eWnfO6iRYtUXFysBx98UDExMfryyy/18ssva//+/Vq0aJHH2KqqKg0cOFB9+vTRjBkzNH/+fI0ePVqhoaH661//qmHDhumWW27RnDlzNHz4cCUlJSkxMVGSczbANddcowMHDuj3v/+9zj77bK1fv14TJ07UwYMH9eKLL6pNmzZ67bXX9OCDD+q3v/2tbrnlFknSRRddJEnauXOnrrzySp111ln685//rNDQUC1cuFA333yz/ve//+m3v/2tR7x/+MMf1KZNG02aNOm0Z1/Mnz9f5eXleuihh5Sbm6sZM2bo9ttv17XXXqvVq1fr0Ucf1ffff6+XX35ZjzzyiN56661THvOZZ56Rn5+fHnnkERUUFGjGjBkaNmyYNm3adFqxATAPOZucTc5uXjl73rx5CgsL07hx4xQWFqZVq1Zp0qRJstvteu655yRJXbt21bRp0zR+/Hjdeuut+vWvf62ioiLdfffd6tKli6ZOnXpa8QJoesjvvpvfJSknJ0cDBw7U0KFDdeeddyouLk4Oh0O//vWvtW7dOo0aNUpdu3bV9u3bNWvWLH333Xe19tn+8ccftXjxYt12221KTExUdna2/v73v+uaa67Rrl27lJCQoK5du2rq1KmaNGmSRo0apauvvrrGWAMCAvTb3/5W7733nv7+9797zLBfvHixysrKNHToUEk6o7iPd/vtt2vChAlauHChxo8f77Fv4cKFGjBggFq1aiWp7r8jVWfy5MmaMmWK+vfvrwcffFB79uzRa6+9ps2bN+uLL75w37WRmpqqG2+8UW3bttWf/vQnxcfHa/fu3VqyZIn+9Kc/VXvsf/3rX+7/r0aNGiVJ6tixo3r37q327dtr/vz5J/0+MX/+fHXs2FFJSUl1ep3gRQbQgObOnWtIqvVxwQUXeDynQ4cOxogRI9xf9+jRwxg8eHCt50lJSTGq+3FevHixIcl48sknPbbfeuuthsViMb7//nvDMAwjLS3NkGSMGTPGY9zdd99tSDKeeOIJ97YnnnjCkGT83//930nnKy4uPmnbf/7zH0OSsXbt2pOOMWrUKPe2yspKo127dobFYjGeeeYZ9/a8vDwjODjY4zWpztatWw1Jxn333eex/ZFHHjEkGatWrXJvGzFihBEaGlrr8QzDMBwOh3HeeecZycnJhsP
"text/plain": [
"<Figure size 1500x1000 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка: (63585, 7)\n",
"hazardous\n",
"False 57399\n",
"True 6186\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgkAAADECAYAAAAVi7K7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA9RUlEQVR4nO3dd1gUV9sH4N/uArv03kU6WLAglsQo2LHF2GJiF4OaqPHVaEw0sWvQWIndWDDia4xiJBordoxRsWBXlGKnd1hgd8/3By/zsewuAoID8tzXtZfucObMM2dmZ5+dc2ZGwBhjIIQQQggpQ8h3AIQQQgipnShJIIQQQohalCQQQgghRC1KEgghhBCiFiUJhBBCCFGLkgRCCCGEqEVJAiGEEELUoiSBEEIIIWpp8R0AIYTUB4WFhUhLS4NCoYCdnR3f4ZBqJJVKkZaWBi0tLVhZWfEdTrWiMwmE1AJjxoyBgYEB32FUm/nz50MgEPAdBu+ioqIwbNgwWFhYQCwWw9bWFoMGDeI7rDpj3bp1yMjI4N6vWbMGubm5/AVUSkREBPr16wcTExPo6urC3t4e//nPf/gOq9pV6kxCSEgIAgICuPdisRgNGzZEjx49MGfOHFhbW1d7gIQQUheFh4fjs88+Q6NGjbBkyRK4uroCwHv3S7MmHTp0CI8fP8b06dNx/vx5zJkzB1OmTOE7LGzYsAFff/01OnTogODgYNjb2wMAHB0deY6s+lWpu2HhwoVwdnaGVCpFZGQkNm7ciCNHjuDOnTvQ09Or7hgJIaROSUtLQ2BgIPz9/bFv3z7o6OjwHVKdNHv2bPTr1w/BwcEQCoVYuXIlhEJ+T4DHxMTgm2++wfjx47Fhw4b3/oxZlZKEXr16oXXr1gCAwMBAmJubY9WqVQgPD8fQoUOrNUBCSO0jk8mgUCjoy0+DHTt2QCqVIiQkhNroLfj5+SEhIQH379+Hg4MDGjRowHdI+OWXX2BjY4NffvnlvU8QgGoak9ClSxcAQFxcHIDiLHrGjBlo1qwZDAwMYGRkhF69eiE6OlplXqlUivnz58PDwwMSiQS2trYYOHAgnjx5AgCIj4+HQCDQ+OrUqRNX19mzZyEQCLB3717Mnj0bNjY20NfXR79+/fDs2TOVZV++fBk9e/aEsbEx9PT04Ofnh4sXL6pdx06dOqld/vz581XKhoaGwsfHB7q6ujAzM8Pnn3+udvnlrVtpCoUCa9asQdOmTSGRSGBtbY0JEyYgPT1dqZyTkxP69u2rspzJkyer1Kku9uXLl6u0KQAUFBRg3rx5cHNzg1gshoODA2bOnImCggK1bVVap06dVOpbsmQJhEIh/vvf/1apPVasWIH27dvD3Nwcurq68PHxwf79+9UuPzQ0FG3btoWenh5MTU3h6+uLEydOKJU5evQo/Pz8YGhoCCMjI7Rp00Yltn379nHb1MLCAiNGjMCLFy+UyowZM0YpZlNTU3Tq1AkXLlx4YzuVePHiBfr37w8DAwNYWlpixowZkMvllV7/srGo22cLCwsxd+5c+Pj4wNjYGPr6+ujYsSPOnDmjVFfJdlmxYgXWrFkDV1dXiMVi3Lt3DwAQGRmJNm3aQCKRwNXVFZs3b1a7bjKZDIsWLeLmd3JywuzZs1X2I02fKycnJ4wZM4Z7X1RUhAULFsDd3R0SiQTm5ubo0KEDTp48WW4bh4SEKLWHnp4emjVrhq1bt5Y7X4nY2Fh8+umnMDMzg56eHj744AP8/fffSmX+/fdftGzZEj/99BMcHBwgFovh7u6OpUuXQqFQcOX8/PzQokULtcvx9PSEv7+/Uszx8fFKZcp+viq6TQHVdn79+jVGjRoFS0tLiMVieHl54ddff1Wap/S+UJqXl5fK53zFihVqY37x4gXGjh0La2triMViNG3aFNu3b1cqU3IsP3v2LExMTPDhhx+iQYMG6NOnj8b9Q938JS+xWAwPDw8EBQWh9IOPS8bOpKSkaKyr7H7377//wsfHBxMnTuTWQV1bAUBubi6mT5/O7QOenp5YsWIFyj58WSAQYPLkydi9ezc8PT0hkUjg4+OD8+fPK5VTN9bnzJkzEIvF+PLLL5WmV6SdK6Jarm4o+UI3NzcHUPwhOnjwID799FM4OzsjMTERmzdvhp+fH+7du8eN7JXL5ejbty9OnTqFzz//HP/5z3+QnZ2NkydP4s6dO1wfHgAMHToUvXv3VlrurFmz1MazZMkSCAQCfPfdd0hKSsKaNWvQrVs33Lx5E7q6ugCA06dPo1evXvDx8cG8efMgFAqxY8cOdOnSBRcuXEDbtm1V6m3QoAGCgoIAADk5Ofjqq6/ULnvOnDkYMmQIAgMDkZycjLVr18LX1xc3btyAiYmJyjzjx49Hx44dAQAHDhzAn3/+qfT3CRMmcONBpkyZgri4OKxbtw43btzAxYsXoa2trbYdKiMjI4Nbt9IUCgX69euHyMhIjB8/Ho0bN8bt27exevVqPHr0CAcPHqzUcnbs2IEff/wRK1euxLBhw9SWeVN7BAcHo1+/fhg+fDgKCwvx+++/49NPP8Xhw4fRp08frtyCBQswf/58tG/fHgsXLoSOjg4uX76M06dPo0ePHgCKD75jx45F06ZNMWvWLJiYmODGjRs4duwYF19J27dp0wZBQUFITExEcHAwLl68qLJNLSwssHr1agDA8+fPERwcjN69e+PZs2dqt31pcrkc/v7+aNeuHVasWIGIiAisXLkSrq6uSvtaRdZ/woQJ6Natm1L9x44dw+7du7k+8aysLGzduhVDhw7FuHHjkJ2djW3btsHf3x9XrlxBy5YtVbadVCrF+PHjIRaLYWZmhtu3b6NHjx6wtLTE/PnzIZPJMG/ePLXjkwIDA7Fz504MHjwY06dPx+XLlxEUFIT79++rbOOKmD9/PoKCghAYGIi2bdsiKysLUVFRuH79Orp37/7G+VevXg0LCwtkZWVh+/btGDduHJycnFTarbTExES0b98eeXl5mDJlCszNzbFz507069cP+/fvx4ABAwAAqampiIyMRGRkJMaOHQsfHx+cOnUKs2bNQnx8PDZt2gQAGDlyJMaNG4c7d+7Ay8uLW87Vq1fx6NEj/Pjjj5Vqk8pu0xKFhYXo1q0bHjx4gK+++gqenp44ePAgxo8fj9TUVHz//feVikOTxMREfPDBB9yXoqWlJY4ePYovvvgCWVlZmDp1qsZ5z58/jyNHjlRqebNnz0bjxo2Rn5/P/Xi0srLCF198UeV1SE1NRVRUFLS0tDBp0iS4urqqbSvGGPr164czZ87giy++QMuWLXH8+HF8++23ePHiBXecKHHu3Dns3bsXU6ZMgVgsxoYNG9CzZ09cuXJFad8oLTo6Gv3790fv3r2xfv16bvrbtLMKVgk7duxgAFhERARLTk5mz549Y7///jszNzdnurq67Pnz54wxxqRSKZPL5UrzxsXFMbFYzBYuXMhN2759OwPAVq1apbIshULBzQeALV++XKVM06ZNmZ+fH/f+zJkzDACzt7dnWVlZ3PQ//viDAWDBwcFc3e7u7szf359bDmOM5eXlMWdnZ9a9e3eVZbVv3555eXlx75OTkxkANm/ePG5afHw8E4lEbMmSJUrz3r59m2lpaalMj4mJYQDYzp07uWnz5s1jpTfLhQsXGAC2e/dupXmPHTumMt3R0ZH16dNHJfZJkyaxspu6bOwzZ85kVlZWzMfHR6lNd+3axYRCIbtw4YLS/Js2bWIA2MWLF1WWV5qfnx9X399//820tLTY9OnT1ZatSHswVrydSissLGReXl6sS5cuSnUJhUI2YMAAlX2xZJtnZGQwQ0ND1q5dO5afn6+2TGFhIbOysmJeXl5KZQ4fPswAsLlz53LTRo8ezRwdHZXq2bJlCwPArly5onadS88LQOnzwRhj3t7ezMfHp9LrX1ZMTAwzNjZm3bt3ZzKZjDHGmEwmYwUFBUrl0tPTmbW1NRs7diw3reQzaGRkxJKSkpTK9+/fn0kkEpaQkMBNu3fvHhOJRErb7ebNmwwACwwMVJp/xowZDAA7ffo0N63svlnC0dGRjR49mnvfokULtfv7m5Q
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Контрольная выборка: (13625, 7)\n",
"hazardous\n",
"False 12315\n",
"True 1310\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhUAAADECAYAAAAoGdPdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8vElEQVR4nO3dd1hTZ/sH8G8SQgJh76EFZIi7iqNO3IhY6n7dii9qq2htHa32ddviqqPuVmup+toKttJqxb1Q66qgOJHlRkDZBEjy/P7gl7yEBCQQOKD357q4NCfnPOc+M3eeccJjjDEQQgghhFQTn+sACCGEEPJ2oKSCEEIIIXpBSQUhhBBC9IKSCkIIIYToBSUVhBBCCNELSioIIYQQoheUVBBCCCFELyipIIQQQoheUFJBCCGE1LLMzEw8fPgQMpmM61D0ipIKQuqACRMmwMTEhOsw9Gbx4sXg8Xhch0HeMU+ePMFPP/2kep2cnIy9e/dyF1ApxcXFWLVqFVq1agWRSARLS0t4enri5MmTXIemVzolFT/99BN4PJ7qTywWw8vLCyEhIUhNTa2pGAkhhJA34vF4mDZtGo4ePYrk5GTMnTsX58+f5zosFBYWonfv3liwYAG6d++O8PBwHD9+HKdOnULHjh25Dk+vDKqy0NKlS+Hm5gapVIro6Ghs3boVf/31F+Li4mBsbKzvGAkhhJA3cnZ2xqRJk9CvXz8AgKOjI86cOcNtUABWrlyJy5cv4+jRo+jevTvX4dSoKiUV/v7+aNu2LQAgODgY1tbWWLt2LSIjIzFy5Ei9BkgIqXtkMhkUCgUMDQ25DoUQNevXr8f06dORnp6O5s2bQyKRcBqPTCbD+vXrMWvWrLc+oQD01KeiZ8+eAICkpCQAwKtXrzB79my0aNECJiYmMDMzg7+/P2JjYzWWlUqlWLx4Mby8vCAWi+Ho6IjBgwcjISEBQEmbWOkml7J/pQ/SmTNnwOPx8Ouvv2L+/PlwcHCARCJBYGAgHj9+rLHuy5cvo1+/fjA3N4exsTF8fX1x4cIFrdvYvXt3retfvHixxrx79uyBj48PjIyMYGVlhREjRmhdf0XbVppCocD69evRrFkziMVi2NvbY8qUKXj9+rXafK6urhgwYIDGekJCQjTK1Bb76tWrNfYpUFJ1t2jRInh4eEAkEqFhw4aYO3cuCgsLte6r0rp3765R3tdffw0+n4///ve/Vdofa9asQadOnWBtbQ0jIyP4+PggIiJC6/r37NmD9u3bw9jYGJaWlujWrRuOHTumNs+RI0fg6+sLU1NTmJmZoV27dhqxhYeHq46pjY0NxowZg6dPn6rNM2HCBLWYLS0t0b17d52qX58+fYqBAwfCxMQEtra2mD17NuRyuc7bXzYWbedsUVERFi5cCB8fH5ibm0MikaBr1644ffq0WlnK47JmzRqsX78e7u7uEIlEuHPnDgAgOjoa7dq1g1gshru7O7Zv365122QyGZYtW6Za3tXVFfPnz9c4j8q7rlxdXTFhwgTV6+LiYixZsgSenp4Qi8WwtrZGly5dcPz48Qr3cdlmXGNjY7Ro0QI7duyocLnSyyYnJ6um3b59G5aWlhgwYIBap7vExEQMGzYMVlZWMDY2xgcffIDDhw+rlae8Z2k7f01MTFTbWzZmbX/KvgTK/jmJiYnw8/ODRCKBk5MTli5dirI/Sp2Xl4dZs2ahYcOGEIlEaNy4MdasWaMxX0UxlL6+lfNcu3atwv1YXh+iiIgI8Hg8jdqFyl5/rq6uAAB3d3d06NABr169gpGRkcYxKy+myly/5d1nlZTHVLkN9+/fx+vXr2FqagpfX18YGxvD3NwcAwYMQFxcnMbyN27cgL+/P8zMzGBiYoJevXrh77//VptHuZ/PnTuHKVOmwNraGmZmZhg3bpzWz4XS1w0ATJ48GWKxWGM/HzlyBF27doVEIoGpqSkCAgJw+/btCvdbWVWqqShLmQBYW1sDKLmYDh48iGHDhsHNzQ2pqanYvn07fH19cefOHTg5OQEA5HI5BgwYgJMnT2LEiBH49NNPkZOTg+PHjyMuLg7u7u6qdYwcORL9+/dXW++8efO0xvP111+Dx+Phiy++wMuXL7F+/Xr07t0bMTExMDIyAgCcOnUK/v7+8PHxwaJFi8Dn87Fr1y707NkT58+fR/v27TXKbdCgAUJDQwEAubm5+OSTT7Sue8GCBRg+fDiCg4ORlpaGjRs3olu3brhx4wYsLCw0lpk8eTK6du0KAPjtt9/w+++/q70/ZcoU/PTTTwgKCsKMGTOQlJSETZs24caNG7hw4QKEQqHW/aCLzMxM1baVplAoEBgYiOjoaEyePBlNmjTBrVu3sG7dOjx48AAHDx7UaT27du3Cf/7zH3z77bcYNWqU1nnetD82bNiAwMBAjB49GkVFRfjll18wbNgwHDp0CAEBAar5lixZgsWLF6NTp05YunQpDA0NcfnyZZw6dQp9+/YFUHJxTpw4Ec2aNcO8efNgYWGBGzduICoqShWfct+3a9cOoaGhSE1NxYYNG3DhwgWNY2pjY4N169YBKOk0tmHDBvTv3x+PHz/WeuxLk8vl8PPzQ4cOHbBmzRqcOHEC3377Ldzd3dXOtcps/5QpU9C7d2+18qOiorB3717Y2dkBALKzs7Fjxw6MHDkSkyZNQk5ODnbu3Ak/Pz9cuXIF77//vsaxk0qlmDx5MkQiEaysrHDr1i307dsXtra2WLx4MWQyGRYtWgR7e3uN7QsODkZYWBiGDh2KWbNm4fLlywgNDcXdu3c1jnFlLF68GKGhoQgODkb79u2RnZ2Na9eu4Z9//kGfPn3euPy6detgY2OD7Oxs/Pjjj5g0aRJcXV019ltFHj9+jH79+sHb2xv79++HgUHJLTU1NRWdOnVCfn4+ZsyYAWtra4SFhSEwMBAREREYNGiQTtvarVs37N69W/X666+/BgB89dVXqmmdOnVS/V8ul6Nfv3744IMPsGrVKkRFRWHRokWQyWRYunQpAIAxhsDAQJw+fRr//ve/8f777+Po0aOYM2cOnj59qjqPy1Lut9Jx1CRdrr+yFi5cCKlUWul1Vef6LU9GRgaAks8rT09PLFmyBFKpFJs3b0bnzp1x9epVeHl5AShJULt27QozMzPMnTsXQqEQ27dvR/fu3XH27Fl06NBBreyQkBBYWFhg8eLFuH//PrZu3YqUlBRVYqPNokWLsHPnTvz6669qCeHu3bsxfvx4+Pn5YeXKlcjPz8fWrVvRpUsX3LhxQ5WwvRHTwa5duxgAduLECZaWlsYeP37MfvnlF2Ztbc2MjIzYkydPGGOMSaVSJpfL1ZZNSkpiIpGILV26VDXtxx9/ZADY2rVrNdalUChUywFgq1ev1pinWbNmzNfXV/X69OnTDABzdnZm2dnZqun79+9nANiGDRtUZXt6ejI/Pz/VehhjLD8/n7m5ubE+ffporKtTp06sefPmqtdpaWkMAFu0aJFqWnJyMhMIBOzrr79WW/bWrVvMwMBAY3p8fDwDwMLCwlTTFi1axEoflvPnzzMAbO/evWrLRkVFaUx3cXFhAQEBGrFPmzaNlT3UZWOfO3cus7OzYz4+Pmr7dPfu3YzP57Pz58+rLb9t2zYGgF24cEFjfaX5+vqqyjt8+DAzMDBgs2bN0jpvZfYHYyXHqbSioiLWvHlz1rNnT7Wy+Hw+GzRokMa5qDzmmZmZzNTUlHXo0IEVFBRonaeoqIjZ2dmx5s2bq81z6NAhBoAtXLhQNW38+PHMxcVFrZzvv/+eAWBXrlzRus2llwWgdn0wxljr1q2Zj4+PzttfVnx8PDM3N2d9+vRhMpmMMcaYTCZjhYWFavO9fv2a2dvbs4kTJ6qmKa9BMzMz9vLlS7X5Bw4cyMRiMUtJSVFNu3PnDhMIBGrHLSYmhgFgwcHBasvPnj2bAWCnTp1STSt7biq5uLiw8ePHq163atVK6/n+Jsr7WFJSkmragwcPGAC2atWqSi/76tUr1rRpU9a4cWOWnp6uNt/MmTMZALXrJicnh7m5uTFXV1fVOam8Z4WHh2usSyKRqG1vaaWvq7KU59L06dN
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Тестовая выборка: (13626, 7)\n",
"hazardous\n",
"False 12282\n",
"True 1344\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAfQAAADECAYAAABp29OTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5YUlEQVR4nO3dd1hTZ/sH8G8WCSvsJVpAEEUcVRTrQNS6UKu4aNU6i+On1tpqrdaquMrbumrdtlVRbK2CVWvromoVa92oKCqyXMiSDSGQPL8/eJOXkDAFgsn9ua5cmsNznnOfkec+6zmHwxhjIIQQQsgbjavtAAghhBDy+iihE0IIITqAEjohhBCiAyihE0IIITqAEjohhBCiAyihE0IIITqAEjohhBCiAyihE0IIITqAEjohhBC9kZubi8TEROTn52s7lDpHCZ2QRmDSpEkwMTHRdhh1JigoCBwOR9thkAaSl5eH7777Tvk9KysLW7Zs0V5AZTDGsHPnTrzzzjswMjKCWCyGi4sLQkNDtR1anatRQt+zZw84HI7yIxKJ4O7ujtmzZyMlJaW+YiSEENKIGRoa4quvvsL+/fvx9OlTBAUF4ffff9d2WACAsWPHYsaMGfDw8MC+fftw5swZREREYMSIEdoOrc7xazPSihUr4OLiAolEgsjISGzbtg1//vknoqOjYWRkVNcxEkIIacR4PB6WL1+OCRMmQC6XQywW448//tB2WNi7dy9+/fVXhIaGYuzYsdoOp97VKqH7+fmhU6dOAIDAwEBYWVlh/fr1OHr0KMaMGVOnARJCGp+SkhLI5XIYGBhoOxTSSMybNw/vv/8+nj59Cg8PD5ibm2s7JKxZswZjxozRi2QO1NE19D59+gAAEhISAACvXr3C/Pnz0bZtW5iYmEAsFsPPzw+3b99WG1cikSAoKAju7u4QiURwcHDAiBEjEBcXBwBITExUOc1f/tOrVy9lXefPnweHw8Gvv/6KL7/8Evb29jA2NsbQoUPx9OlTtWlfuXIFAwcOhJmZGYyMjODr64tLly5pnMdevXppnH5QUJBa2dDQUHh5ecHQ0BCWlpb44IMPNE6/snkrSy6X47vvvoOnpydEIhHs7Owwffp0ZGZmqpRzdnbGkCFD1KYze/ZstTo1xb5mzRq1ZQoARUVFWLZsGdzc3CAUCtGsWTMsWLAARUVFGpdVWb169VKrb/Xq1eByufj5559rtTzWrl2Lbt26wcrKCoaGhvDy8kJYWJjG6YeGhsLb2xtGRkawsLBAz549cfr0aZUyJ06cgK+vL0xNTSEWi9G5c2e12A4dOqRcp9bW1vjwww/x/PlzlTKTJk1SidnCwgK9evXCxYsXq1xOCs+fP4e/vz9MTExgY2OD+fPnQyaT1Xj+y8eiaZuVSqVYunQpvLy8YGZmBmNjY/j4+ODcuXMqdSnWy9q1a/Hdd9/B1dUVQqEQ9+/fBwBERkaic+fOEIlEcHV1xY4dOzTOW0lJCVauXKkc39nZGV9++aXadlTR78rZ2RmTJk1Sfi8uLsby5cvRokULiEQiWFlZoUePHjhz5kyly7j8pUMjIyO0bdsWP/74Y43G0/TZs2ePsvyDBw8watQoWFpaQiQSoVOnTjh27JhavVlZWfj000/h7OwMoVCIpk2bYsKECUhPT1e2aZV9yi6rW7duwc/PD2KxGCYmJnj33Xfx77//1nr+z549Cx8fHxgbG8Pc3BzDhg1DTEyMSpmy90s0bdoUXbt2BZ/Ph729PTgcDs6fP1/pclWMr/iYmprC29sbR44cUSnXq1cvtGnTpsJ6FNupYh3k5+cjOjoazZo1w+DBgyEWi2FsbFzhbzI+Ph6jR4+GpaUljIyM8M4776idZahJjqlJ21eTXFSZWh2hl6dIvlZWVgBKF8yRI0cwevRouLi4ICUlBTt27ICvry/u37+PJk2aAABkMhmGDBmCv/76Cx988AE++eQT5Obm4syZM4iOjoarq6tyGmPGjMGgQYNUprto0SKN8axevRocDgdffPEFUlNT8d1336Fv376IioqCoaEhgNIN1c/PD15eXli2bBm4XC52796NPn364OLFi/D29lart2nTpggODgZQehPI//3f/2mc9pIlSxAQEIDAwECkpaVh06ZN6NmzJ27duqVxr3XatGnw8fEBABw+fBi//fabyt+nT5+OPXv2YPLkyZgzZw4SEhKwefNm3Lp1C5cuXYJAINC4HGoiKytLOW9lyeVyDB06FJGRkZg2bRo8PDxw9+5dbNiwAY8ePVL70VVl9+7d+Oqrr7Bu3boK95qrWh4bN27E0KFDMW7cOEilUhw4cACjR4/G8ePHMXjwYGW55cuXIygoCN26dcOKFStgYGCAK1eu4OzZs+jfvz+A0sZtypQp8PT0xKJFi2Bubo5bt27h5MmTyvgUy75z584IDg5GSkoKNm7ciEuXLqmtU2tra2zYsAEA8OzZM2zcuBGDBg3C06dPqzxikclkGDBgALp06YK1a9ciIiIC69atg6urq8q2Vp35nz59Ovr27atS/8mTJ7F//37Y2toCAHJycvDjjz9izJgxmDp1KnJzc/HTTz9hwIABuHr1Kt5++221dSeRSDBt2jQIhUJYWlri7t276N+/P2xsbBAUFISSkhIsW7YMdnZ2avMXGBiIkJAQjBo1CvPmzcOVK1cQHByMmJgYtXVcHUFBQQgODkZgYCC8vb2Rk5OD69ev4+bNm+jXr1+V42/YsAHW1tbIycnBrl27MHXqVDg7O6stN4WePXti3759yu+rV68GACxevFg5rFu3bgCAe/fuoXv37nB0dMTChQthbGyMgwcPwt/fH+Hh4Rg+fDiA0nbEx8cHMTExmDJlCjp27Ij09HQcO3YMz549U173Vdi5cydiYmKU2xgAtGvXTjlNHx8fiMViLFiwAAKBADt27ECvXr3w999/o0uXLjWa/4iICPj5+aF58+YICgpCYWEhNm3ahO7du+PmzZtwdnaucNmuW7euxvdVKeYzPT0dW7duxejRoxEdHY2WLVvWqB6FjIwMAMA333wDe3t7fP755xCJRPjhhx/Qt29fnDlzBj179gQApKSkoFu3bigoKMCcOXNgZWWFkJAQDB06FGFhYcr1pVCdHFNeRW1fbXJRhVgN7N69mwFgERERLC0tjT19+pQdOHCAWVlZMUNDQ/bs2TPGGGMSiYTJZDKVcRMSEphQKGQrVqxQDtu1axcDwNavX682LblcrhwPAFuzZo1aGU9PT+br66v8fu7cOQaAOTo6spycHOXwgwcPMgBs48aNyrpbtGjBBgwYoJwOY4wVFBQwFxcX1q9fP7VpdevWjbVp00b5PS0tjQFgy5YtUw5LTExkPB6PrV69WmXcu3fvMj6frzY8NjaWAWAhISHKYcuWLWNlV8vFixcZALZ//36VcU+ePKk23MnJiQ0ePFgt9lmzZrHyq7p87AsWLGC2trbMy8tLZZnu27ePcblcdvHiRZXxt2/fzgCwS5cuqU2vLF9fX2V9f/zxB+Pz+WzevHkay1ZneTBWup7KkkqlrE2bNqxPnz4qdXG5XDZ8+HC1bVGxzrOyspipqSnr0qULKyws1FhGKpUyW1tb1qZNG5Uyx48fZwDY0qVLlcMmTpzInJycVOrZuXMnA8CuXr2qcZ7LjgtA5ffBGGMdOnRgXl5eNZ7/8mJjY5mZmRnr168fKykpYYwxVlJSwoqKilTKZWZmMjs7OzZlyhTlMMVvUCwWs9TUVJXy/v7+TCQSsaSkJOWw+/fvMx6Pp7LeoqKiGAAWGBioMv78+fMZAHb27FnlsPLbpoKTkxObOHGi8nv79u01bu9VUbRjCQkJymGPHj1iANi3335b7XrKbtvlvfvuu6xt27ZMIpEoh8nlctatWzfWokUL5bClS5cyAOzw4cNqdZRtmxQ0bWMK/v7+zMDAgMXFxSmHvXjxgpmamrKePXsqh1V3/t9++21ma2vLMjIylMNu377NuFwumzBhgnJY+d9oamoqMzU1ZX5+fgwAO3funMZ4KxqfMcZOnz7NALCDBw8qh/n6+jJPT88K61Fsp7t371b5bmBgwB49eqQ
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после оверсемплинга: (115214, 7)\n",
"hazardous\n",
"True 57815\n",
"False 57399\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAs0AAADECAYAAABk3xxgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABDTElEQVR4nO3dd3hT1f8H8HeStulu6S67tLTsVQVlld2WJQiigOwCCoggKIIiQ6AiKFNAFAGBHyKgoCCrDKWoDGXPUnaB0ha6Z5Lz+6PfxKZJm7QUbsf79Tx5ILd3fO65N+d+cu65JzIhhAARERERERVILnUARERERESlHZNmIiIiIiITmDQTEREREZnApJmIiIiIyAQmzUREREREJjBpJiIiIiIygUkzEREREZEJTJqJiIiIiEywkDoAIqKKIDs7G48fP4ZGo0HlypWlDodKUGZmJh4/fgwLCwt4eHhIHQ4RPSNsaSYqBYYOHQp7e3upwygxM2fOhEwmkzoMyZ06dQoDBgyAm5sblEolvL290adPH6nDKjOWL1+OxMRE3fvFixcjLS1NuoDyiIiIQM+ePeHs7AwbGxtUqVIF7777rtRhEdEzVKSW5nXr1mHYsGG690qlEtWrV0eXLl0wffp0eHp6lniARERl0c6dO/H666+jTp06mDt3Lnx9fQGALZFF8Ouvv+L69euYNGkS/vjjD0yfPh3jx4+XOiysWLEC77zzDlq3bo0lS5agSpUqAIAaNWpIHBkRPUvF6p4xe/Zs+Pj4IDMzE5GRkVi5ciV+++03XLhwAba2tiUdIxFRmfL48WOEhYUhODgYW7duhZWVldQhlUnTpk1Dz549sWTJEsjlcnzxxReQy6W9QRoVFYX33nsPo0aNwooVK3hHhagCKVbSHBoaihdeeAEAEBYWBldXV3z55ZfYuXMn+vfvX6IBElHpo1KpoNFomAwWYO3atcjMzMS6detYRk8hKCgIt2/fxuXLl1GtWjVUrVpV6pCwdOlSeHl5YenSpUyYiSqYEvnK3qFDBwDAzZs3AeS2skyePBkNGzaEvb09HB0dERoairNnzxosm5mZiZkzZ8Lf3x/W1tbw9vbGq6++iujoaADArVu3IJPJCny1a9dOt64jR45AJpNhy5YtmDZtGry8vGBnZ4eePXvi7t27Bts+fvw4QkJC4OTkBFtbWwQFBeHYsWNG97Fdu3ZGtz9z5kyDeTdu3IjAwEDY2NjAxcUFb7zxhtHtF7ZveWk0GixevBj169eHtbU1PD09MXr0aDx58kRvvpo1a6J79+4G2xk3bpzBOo3FvmDBAoMyBYCsrCzMmDEDfn5+UCqVqFatGj744ANkZWUZLau82rVrZ7C+uXPnQi6X4//+7/+KVR4LFy5Ey5Yt4erqChsbGwQGBmLbtm1Gt79x40Y0b94ctra2qFSpEtq2bYv9+/frzbNnzx4EBQXBwcEBjo6OePHFFw1i27p1q+6Yurm54c0330RMTIzePEOHDtWLuVKlSmjXrh2OHj1qspy0YmJi0KtXL9jb28Pd3R2TJ0+GWq0u8v7nj8XYOZudnY1PPvkEgYGBcHJygp2dHdq0aYPDhw/rrUt7XBYuXIjFixfD19cXSqUSly5dAgBERkbixRdfhLW1NXx9ffH1118b3TeVSoVPP/1Ut3zNmjUxbdo0g/OooM9VzZo1MXToUN37nJwczJo1C7Vr14a1tTVcXV3RunVrHDhwoNAyXrdunV552NraomHDhvj2228LXU7rxo0beO211+Di4gJbW1u89NJL2L17t948f//9N5o0aYJ58+ahWrVqUCqVqF27Nj777DNoNBrdfEFBQWjcuLHR7QQEBCA4OFgv5lu3bunNk//zZe4xBQzL+eHDhxg8eDDc3d2hVCrRoEEDfPPNN3rL5D0X8mrQoIHB53zhwoVGY46JicHw4cPh6ekJpVKJ+vXr47vvvtObR1uXHzlyBM7Oznj55ZdRtWpVdOvWrcDzw9jy2pdSqYS/vz/Cw8MhhNDNp+17Hx8fX+C68p93f//9NwIDAzFmzBjdPhgrKwBIS0vDpEmTdOdAQEAAFi5cqBcDkHssxo0bh02bNiEgIADW1tYIDAzEH3/8oTefsWcFDh8+DKVSibfeektvujnlXJDCrrk1a9Ys1j4C5tXH+Y9dQdt9musSUPJ1+p49e9CmTRvY2dnBwcEB3bp1w8WLFw3WZ29vjxs3biA4OBh2dnaoXLkyZs+ebVBeGo0GS5YsQcOGDWFtbQ13d3eEhITg1KlTBmVqKt/Q5i+9evUyiHv06NGQyWRo0KCBblpR8q6CyspYjjR06FCD43j37l3Y2NgY1BVFzWeMvebMmQOgaPWiKSUyeoY2wXV1dQWQe1HZsWMHXnvtNfj4+CA2NhZff/01goKCcOnSJd2T42q1Gt27d8fBgwfxxhtv4N1330VKSgoOHDiACxcu6PoAAkD//v3RtWtXve1OnTrVaDxz586FTCbDlClT8OjRIyxevBidOnXCmTNnYGNjAwA4dOgQQkNDERgYiBkzZkAul2Pt2rXo0KEDjh49iubNmxust2rVqggPDwcApKam4u233za67enTp6Nfv34ICwtDXFwcli1bhrZt2+L06dNwdnY2WGbUqFFo06YNAOCnn37Czz//rPf30aNH6/qTjx8/Hjdv3sTy5ctx+vRpHDt2DJaWlkbLoSgSExN1+5aXRqNBz549ERkZiVGjRqFu3bo4f/48Fi1ahGvXrmHHjh1F2s7atWvx8ccf44svvsCAAQOMzmOqPJYsWYKePXti4MCByM7Oxg8//IDXXnsNu3btQrdu3XTzzZo1CzNnzkTLli0xe/ZsWFlZ4fjx4zh06BC6dOkCIDcZGT58OOrXr4+pU6fC2dkZp0+fxt69e3Xxacv+xRdfRHh4OGJjY7FkyRIcO3bM4Ji6ublh0aJFAIB79+5hyZIl6Nq1K+7evWv02OelVqsRHByMFi1aYOHChYiIiMAXX3wBX19fvXPNnP0fPXo0OnXqpLf+vXv3YtOmTbo+tcnJyfj222/Rv39/jBw5EikpKVizZg2Cg4Nx4sQJNGnSxODYZWZmYtSoUVAqlXBxccH58+fRpUsXuLu7Y+bMmVCpVJgxY4bR5xvCwsKwfv169O3bF5MmTcLx48cRHh6Oy5cvGxxjc8ycORPh4eEICwtD8+bNkZycjFOnTuHff/9F586dTS6/aNEiuLm5ITk5Gd999x1GjhyJmjVrGpRbXrGxsWjZsiXS09Mxfvx4uLq6Yv369ejZsye2bduG3r17AwASEhIQGRmJyMhIDB8+HIGBgTh48CCmTp2KW7duYdWqVQCAQYMGYeTIkbhw4YLeRevkyZO4du0aPv744yKVSVGPqVZ2djY6deqEK1eu4O2330ZAQAB27NiBUaNGISEhAR9++GGR4ihIbGwsXnrpJV2S6O7ujj179mDEiBFITk7GhAkTClz2jz/+wG+//Vak7U2bNg1169ZFRkaGrjHFw8MDI0aMKPY+JCQk4NSpU7CwsMDYsWPh6+trtKyEEOjZsycOHz6MESNGoEmTJti3bx/ef/99xMTE6OoJrd9//x1btmzB+PHjoVQqsWLFCoSEhODEiRN650ZeZ8+eRa9evdC1a1d89dVXuulPU85anTt3xuDBg/WmffHFF3qNNUXZR3Pq47y0xw4AVq9ejTt37uj+9rTXpZKu0zds2IAhQ4YgODgY8+fPR3p6OlauXInWrVvj9OnTeomiWq1GSEgIXnrpJXz++efYu3cvZsyYAZVKhdmzZ+vmGzFiBNatW4fQ0FCEhYVBpVLh6NGj+Pvvv3V3+ouSb1hbW2P37t149OiR7hqg/VxYW1sbLaei5F35ywrIrd9M+eSTT5CZmWlyPlOMna/a+q649aJRogjWrl0rAIiIiAgRFxcn7t69K3744Qfh6uoqbGxsxL1794QQQmRmZgq1Wq237M2bN4VSqRSzZ8/WTfvuu+8EAPHll18abEuj0eiWAyAWLFhgME/9+vVFUFCQ7v3hw4cFAFGlShWRnJysm/7jjz8KAGLJkiW6ddeuXVsEBwfrtiOEEOnp6cL
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после андерсемплинга: (12372, 7)\n",
"hazardous\n",
"False 6186\n",
"True 6186\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAtoAAADECAYAAAChrYbxAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABC5ElEQVR4nO3deXwM5x8H8M/uRjZ3InK6I0HqJkWpiNYRV1NHaZ11BC2qlKqjjrhCaVGK6kGKn2pQV+sK6j6r7itIHEEkInc2ye4+vz/S3Wazm2RzWYnP+/XaFzuZ4zvPzDzz3ZlnnpEIIQSIiIiIiKhYSU0dABERERFRWcREm4iIiIioBDDRJiIiIiIqAUy0iYiIiIhKABNtIiIiIqISwESbiIiIiKgEMNEmIiIiIioBTLSJiIiIiEqAmakDICJ6FWRkZCAuLg5qtRoVK1Y0dThUjBQKBeLi4mBmZgYXFxdTh0NELxFe0SZ6CQwaNAg2NjamDqPYzJw5ExKJxNRhmNy5c+fQt29fODk5QS6Xw93dHT179jR1WKXG8uXLER8fr/2+ZMkSpKSkmC6gbMLCwhAQEAAHBwdYWlqiUqVK+PTTT00dFhG9ZAp0RXvt2rUYPHiw9rtcLkfVqlXRoUMHTJs2Da6ursUeIBFRabR9+3a8//778Pb2xty5c+Hp6QkAvOJZADt37sTt27cxfvx4HDlyBNOmTcOYMWNMHRZWrFiBTz75BK1atcLSpUtRqVIlAEC1atVMHBkRvWwK1XRk1qxZ8PDwgEKhwLFjx7By5Ur8+eefuHLlCqysrIo7RiKiUiUuLg6BgYHw9/dHaGgozM3NTR1SqTRlyhQEBARg6dKlkEql+PrrryGVmvZGbHh4OD777DMMHz4cK1as4J0bIspToRLtTp064fXXXwcABAYGokKFCvjmm2+wfft29OnTp1gDJKKXj1KphFqtZgKZizVr1kChUGDt2rUsoyLw8/PDvXv3cP36dVSpUgWVK1c2dUj49ttv4ebmhm+//ZZJNhHlq1guDbz99tsAgIiICABZV3MmTJiA+vXrw8bGBnZ2dujUqRMuXryoN61CocDMmTNRq1YtWFhYwN3dHT169MCdO3cAAJGRkZBIJLl+2rRpo53XX3/9BYlEgk2bNmHKlClwc3ODtbU1AgIC8ODBA71lnz59Gh07doS9vT2srKzg5+eH48ePG1zHNm3aGFz+zJkz9cZdv349fHx8YGlpCUdHR3zwwQcGl5/XumWnVquxZMkS1K1bFxYWFnB1dcWIESPw/PlznfGqV6+Orl276i1n9OjRevM0FPvChQv1yhQA0tPTMWPGDHh5eUEul6NKlSqYOHEi0tPTDZZVdm3atNGb39y5cyGVSvG///2vUOWxaNEitGzZEhUqVIClpSV8fHywefNmg8tfv349mjVrBisrK5QvXx6tW7fGvn37dMbZvXs3/Pz8YGtrCzs7OzRt2lQvttDQUO02dXJyQv/+/REVFaUzzqBBg3RiLl++PNq0aYOjR4/mW04aUVFR6NatG2xsbODs7IwJEyZApVIVeP1zxmJon83IyMD06dPh4+MDe3t7WFtbw9fXF4cOHdKZl2a7LFq0CEuWLIGnpyfkcjmuXbsGADh27BiaNm0KCwsLeHp64vvvvze4bkqlErNnz9ZOX716dUyZMkVvP8rtuKpevToGDRqk/Z6ZmYmgoCDUrFkTFhYWqFChAlq1aoX9+/fnWcZr167VKQ8rKyvUr18fP/74Y57Tady9exe9evWCo6MjrKys8MYbb+CPP/7QGefUqVNo1KgR5s2bhypVqkAul6NmzZqYP38+1Gq1djw/Pz80bNjQ4HJq164Nf39/nZgjIyN1xsl5fBm7TQH9cn7y5AkGDhwIZ2dnyOVy1KtXDz/88IPONNn3hezq1aund5wvWrTIYMxRUVEYMmQIXF1dIZfLUbduXfz8888642jq8r/++gsODg5o0aIFKleujC5duuS6fxiaXvORy+WoVasWgoODIYTQjqd5liA2NjbXeeXc706dOgUfHx+MHDlSuw6GygoAUlJSMH78eO0+ULt2bSxatEgnBiBrW4wePRobNmxA7dq1YWFhAR8fHxw5ckRnPEPPPhw6dAhyuRwfffSRznBjyjk3eZ1zq1evXqh1BIyrj3Nuu9yWW5Tz0tGjR9GrVy9UrVpVO+24ceOQlpamM15uz85s3rxZu38aW3Y5xzU2fmP3DSBrmw8dOhQVK1aEXC6Hh4cHPv74Y2RkZGjHiY+Px9ixY7Xby8vLCwsWLNCpl7Kfi7dt26azDIVCgfLly+vVA5p9M7fP2rVrjS6r7HWGsbmKZr8xlAvY2NjoHMM5zwHZPw8fPgQAXLp0CYMGDUKNGjVgYWEBNzc3DBkyBM+ePdObf36KpdcRTVJcoUIFAFknom3btqFXr17w8PBAdHQ0vv/+e/j5+eHatWvaJ+5VKhW6du2KAwcO4IMPPsCnn36KpKQk7N+/H1euXNG2aQSAPn36oHPnzjrLnTx5ssF45s6dC4lEgi+++AJPnz7FkiVL0K5dO1y4cAGWlpYAgIMHD6JTp07w8fHBjBkzIJVKsWbNGrz99ts4evQomjVrpjffypUrIzg4GACQnJyMjz/+2OCyp02bht69eyMwMBAxMTFYtmwZWrdujX/++QcODg560wwfPhy+vr4AgK1bt+L333/X+fuIESO07ePHjBmDiIgILF++HP/88w+OHz+OcuXKGSyHgoiPj9euW3ZqtRoBAQE4duwYhg8fjtdeew2XL1/G4sWLcevWLb2DMD9r1qzBl19+ia+//hp9+/Y1OE5+5bF06VIEBASgX79+yMjIwK+//opevXph165d6NKli3a8oKAgzJw5Ey1btsSsWbNgbm6O06dP4+DBg+jQoQOArANuyJAhqFu3LiZPngwHBwf8888/2LNnjzY+Tdk3bdoUwcHBiI6OxtKlS3H8+HG9berk5ITFixcDAB4+fIilS5eic+fOePDggcFtn51KpYK/vz+aN2+ORYsWISwsDF9//TU8PT119jVj1n/EiBFo166dzvz37NmDDRs2aNsIJyYm4scff0SfPn0wbNgwJCUl4aeffoK/vz/OnDmDRo0a6W07hUKB4cOHQy6Xw9HREZcvX0aHDh3g7OyMmTNnQqlUYsaMGQaf1wgMDERISAjee+89jB8/HqdPn0ZwcDCuX7+ut42NMXPmTAQHByMwMBDNmjVDYmIizp07h/Pnz6N9+/b5Tr948WI4OTkhMTERP//8M4YNG4bq1avrlVt20dHRaNmyJVJTUzFmzBhUqFABISEhCAgIwObNm9G9e3cAwLNnz3Ds2DEcO3YMQ4YMgY+PDw4cOIDJkycjMjISq1atAgAMGDAAw4YNw5UrV1CvXj3tcs6ePYtbt27hyy+/LFCZFHSbamRkZKBdu3a4ceMGPv74Y9SuXRvbtm3D8OHD8ezZM0yaNKlAceQmOjoab7zxhjZ5cHZ2xu7duzF06FAkJiZi7NixuU575MgR/PnnnwVa3pQpU/Daa68hLS1NewHGxcUFQ4cOLfQ6PHv2DOfOnYOZmRlGjRoFT09Pg2UlhEBAQAAOHTqEoUOHolGjRti7dy8+//xzREVFaesJjcOHD2PTpk0YM2YM5HI5VqxYgY4dO+LMmTM6+0Z2Fy9eRLdu3dC5c2d899132uFFKWeN9u3bY+DAgTrDvv76a50LPAVZR2Pq4+w02w4AVq9ejfv372v/VtTzUmhoKFJTU/Hxxx+jQoUKOHPmDJYtW4aHDx8iNDQ037LJT/ayO3v2LL799ludvxc0fmP2jUePHqFZs2aIj4/H8OHD4e3tjaioKGzevBmpqakwNzdHamoq/Pz8EBUVhREjRqBq1ao4ceIEJk+ejMePH2PJkiU6y7WwsMCaNWvQrVs37bCtW7dCoVDkuu4rV67U+XESERGB6dOn5zp+9+7d0aNHDwBZP4BWr16d67hA7rlKYWiaQWfn6OgIANi/fz/u3r2LwYMHw83NDVevXsXq1atx9epVnDp1qmB3s0QBrFmzRgAQYWFhIiYmRjx48ED8+uuvokKFCsL
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/neo.csv')\n",
2024-12-05 20:21:14 +04:00
"df = df.drop(columns=['name', 'orbiting_body', 'sentry_object'])\n",
"\n",
"\n",
"#5. Устранение пропущенных данных\n",
" \n",
"#Сведения о пропущенных данных\n",
"print(\"Количество пропущенных значений в каждом столбце:\")\n",
"print(df.isnull().sum())\n",
"\n",
"# Процент пропущенных значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"\n",
"#Пропущенных данных в датасете нет\n",
"\n",
"\n",
"\n",
"\n",
"#6. Проблемы набора данных\n",
" #5.1Выбросы: Возможны аномалии в значениях скорости или расстояния.\n",
" #Смещение: Данные могут быть смещены в сторону объектов, которые легче обнаружить (крупные, близкие).\n",
"\n",
"#7. Решения для обнаруженных проблем\n",
" #Выбросы: Идентификация и обработка выбросов через методы (например, IQR или Z-оценка).\n",
" #Смещение: Использование методов балансировки данных, таких как oversampling.\n",
"\n",
"#7.1 Проверка набора данных на выбросы\n",
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude']\n",
"def Emissions(columns_to_check):\n",
"\n",
" # Функция для подсчета выбросов\n",
" def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
" Q3 = df[col].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
" # Подсчитываем выбросы\n",
" outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
" # Выводим количество выбросов для каждого столбца\n",
" for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
" \n",
" # Создаем гистограммы\n",
" plt.figure(figsize=(15, 10))\n",
" for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 3, i)\n",
" sns.histplot(df[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.tight_layout()\n",
" plt.show()\n",
"Emissions(columns_to_check)\n",
"\n",
"#Признак miss_distance не имеет выбросов, \n",
"#признак absolute_magnitude имеет количество выбросов в приемлемом диапазоне\n",
"#для признаков est_diameter_min, est_diameter_max и relative_velocity необходимо использовать метод решения проблемы выбросов. \n",
"#Воспользуемся методом усреднения значений:\n",
"columns_to_fix = ['est_diameter_min', 'est_diameter_max', 'relative_velocity']\n",
"# Замена нулей и пропущенных значений средним\n",
"for column in columns_to_fix:\n",
" # Расчет среднего, исключая 0 и NaN\n",
" mean_value = df[df[column] > 0][column].mean()\n",
" # Замена NaN и нулей на среднее\n",
" df[column] = df[column].replace(0, np.nan).fillna(mean_value)\n",
"\n",
"#Оценим выбросы в выборке после усреднения:\n",
"Emissions(columns_to_fix)\n",
"\n",
"#Удалось избавиться от выбросов в соответствующих признаках как видно на диаграммах.\n",
"\n",
"\n",
"\n",
"#8. Разбиение данных на выборки\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", train_data.shape)\n",
"print(train_data.hazardous.value_counts())\n",
"hazardous_counts = train_data['hazardous'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов hazardous в обучающей выборке')\n",
"plt.show()\n",
"\n",
"print(\"Контрольная выборка: \", val_data.shape)\n",
"print(val_data.hazardous.value_counts())\n",
"hazardous_counts = val_data['hazardous'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов hazardous в контрольной выборке')\n",
"plt.show()\n",
"\n",
"print(\"Тестовая выборка: \", test_data.shape)\n",
"print(test_data.hazardous.value_counts())\n",
"hazardous_counts = test_data['hazardous'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов hazardous в тестовой выборке')\n",
"plt.show()\n",
"\n",
"\n",
"#9. Оценить сбалансированность выборок для каждого набора данных. Оценить необходимость использования методов приращения (аугментации) данных. \n",
"#Выводы по сбалансированности\n",
"#Если распределение классов примерно равно (например, 50%/50%), выборка считается сбалансированной, и аугментация данных не требуется.\n",
"#Если один из классов сильно доминирует (например, 90%/10%), выборка несбалансированная, и может потребоваться аугментация данных.\n",
"\n",
"#Данная сборка несбалансированная, и требуется аугментация данных.\n",
"\n",
"\n",
"#10. Выполнить приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling). Должны быть представлены примеры реализации обоих методов для выборок набора данных. \n",
"\n",
"#10.1\n",
"#Аугментация данных методом оверсемплинга¶\n",
"#Этот метод увеличивает количество примеров меньшинства.\n",
"\n",
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Создание экземпляра ADASYN\n",
"ada = ADASYN()\n",
"\n",
"# Применение ADASYN\n",
"X_resampled, y_resampled = ada.fit_resample(train_data.drop(columns=['hazardous']), train_data['hazardous'])\n",
"\n",
"# Создание нового DataFrame\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"df_train_adasyn['hazardous'] = y_resampled # Добавление целевой переменной\n",
"\n",
"# Вывод информации о новой выборке\n",
"print(\"Обучающая выборка после оверсемплинга: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn['hazardous'].value_counts())\n",
"hazardous_counts = df_train_adasyn['hazardous'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов hazardous в обучающей выборке после оверсемплинга')\n",
"plt.show()\n",
"\n",
"\n",
"#10.2\n",
"#Аугментация данных методом андерсемплинга\n",
"#Этот метод помогает сбалансировать выборку, уменьшая количество экземпляров класса большинства, чтобы привести е г о в соответствие с классом меньшинства\n",
"\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"rus = RandomUnderSampler()\n",
"\n",
"# Применение RandomUnderSampler\n",
"X_resampled, y_resampled = rus.fit_resample(train_data.drop(columns=['hazardous']), train_data['hazardous'])\n",
"\n",
"# Создание нового DataFrame\n",
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
"df_train_undersampled['hazardous'] = y_resampled # Добавление целевой переменной\n",
"\n",
"# Вывод информации о новой выборке\n",
"print(\"Обучающая выборка после андерсемплинга: \", df_train_undersampled.shape)\n",
"print(df_train_undersampled['hazardous'].value_counts())\n",
"\n",
"# Визуализация распределения классов\n",
"hazardous_counts = df_train_undersampled['hazardous'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(hazardous_counts, labels=hazardous_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов hazardous в обучающей выборке после андерсемплинга')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Н а б о р данных \"Продажа домов в округе Кинг, США\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# 1. Анализ сведений о наборе данных \"Продажа домов в округе Кинг, США\"\n",
"# Данный набор данных содержит информацию о работе филиалов супермаркетов. Проблемная область связана с анализом производительности магазинов, оптимизацией их # работы и улучшением ключевых показателей, таких как продажи и привлечение клиентов. Основные вопросы, которые могут возникнуть:\n",
"# \n",
"# 2. Анализ содержимого набора данных\n",
"# Объекты наблюдения\n",
"# Каждая строка представляет собой данные о конкретном магазине.\n",
"# \n",
"# Атрибуты объектов\n",
"# Store ID (идентификатор магазина): уникальный номер, однозначно идентифицирующий магазин.\n",
"# Store_Area (площадь магазина): общая площадь магазина в квадратных метрах.\n",
"# Items_Available (количество доступных товаров): общее количество товаров в магазине.\n",
"# Daily_Customer_Count (ежедневное число покупателей): среднее число покупателей в день.\n",
"# Store_Sales (продажи магазина): общая выручка магазина.\n",
"# \n",
"# Связи между объектами\n",
"# Объекты (магазины) связаны общей бизнес-целью (рост прибыли) и схожими условиями работы (товары, покупатели,площади). Возможные связи:\n",
"# Прямая связь между числом покупателей и выручкой.\n",
"# Влияние площади и ассортимента на посещаемость и продажи.\n",
"# \n",
"# 3. Примеры бизнес-целей и эффекты для бизнеса\n",
"# Бизнес-цели\n",
"# Оптимизация ассортимента: определить, как ассортимент товаров влияет на продажи.\n",
"# Увеличение посещаемости: найти факторы, повышающие интерес покупателей к магазину.\n",
"# Оптимизация площадей: выявить оптимальные площади магазинов для максимальной выручки.\n",
"# Анализ производительности филиалов: сравнить эффективность магазинов и выявить лучшие практики.\n",
"# Эффекты для бизнеса\n",
"# Повышение выручки за счет оптимизации ассортимента и улучшения опыта покупателей.\n",
"# Сокращение затрат на неэффективные площади.\n",
"# Более точное планирование при открытии новых магазинов.\n",
"# \n",
"# 4. Примеры целей технического проекта\n",
"# Цель 1: Оптимизация ассортимента\n",
"# Входные данные: Store_Area, Items_Available, Store_Sales.\n",
"# Целевой признак: Store_Sales.\n",
"# Результат: рекомендации по увеличению или уменьшению ассортимента.\n",
"# Цель 2: Увеличение посещаемости\n",
"# Входные данные: Store_Area, Items_Available, Store_Sales.\n",
"# Целевой признак: Daily_Customer_Count.\n",
"# Результат: прогноз посещаемости на основе характеристик магазина.\n",
"# Цель 3: Оптимизация площадей\n",
"# Входные данные: Store_Area, Store_Sales.\n",
"# Целевой признак: Store_Sales.\n",
"# Результат: определение оптимальной площади для магазина.\n",
"# Цель 4: Анализ производительности филиалов\n",
"# Входные данные: В с е атрибуты.\n",
"# Целевой признак: Store_Sales.\n",
"# Результат: классификация магазинов по эффективности и выявление отстающих филиалов."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>price</th>\n",
" <th>bedrooms</th>\n",
" <th>bathrooms</th>\n",
" <th>sqft_living</th>\n",
" <th>sqft_lot</th>\n",
" <th>floors</th>\n",
" <th>waterfront</th>\n",
" <th>view</th>\n",
" <th>...</th>\n",
" <th>grade</th>\n",
" <th>sqft_above</th>\n",
" <th>sqft_basement</th>\n",
" <th>yr_built</th>\n",
" <th>yr_renovated</th>\n",
" <th>zipcode</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>sqft_living15</th>\n",
" <th>sqft_lot15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7129300520</td>\n",
" <td>20141013T000000</td>\n",
" <td>221900.0</td>\n",
" <td>3</td>\n",
" <td>1.00</td>\n",
" <td>1180</td>\n",
" <td>5650</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1180</td>\n",
" <td>0</td>\n",
" <td>1955</td>\n",
" <td>0</td>\n",
" <td>98178</td>\n",
" <td>47.5112</td>\n",
" <td>-122.257</td>\n",
" <td>1340</td>\n",
" <td>5650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6414100192</td>\n",
" <td>20141209T000000</td>\n",
" <td>538000.0</td>\n",
" <td>3</td>\n",
" <td>2.25</td>\n",
" <td>2570</td>\n",
" <td>7242</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>2170</td>\n",
" <td>400</td>\n",
" <td>1951</td>\n",
" <td>1991</td>\n",
" <td>98125</td>\n",
" <td>47.7210</td>\n",
" <td>-122.319</td>\n",
" <td>1690</td>\n",
" <td>7639</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5631500400</td>\n",
" <td>20150225T000000</td>\n",
" <td>180000.0</td>\n",
" <td>2</td>\n",
" <td>1.00</td>\n",
" <td>770</td>\n",
" <td>10000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>770</td>\n",
" <td>0</td>\n",
" <td>1933</td>\n",
" <td>0</td>\n",
" <td>98028</td>\n",
" <td>47.7379</td>\n",
" <td>-122.233</td>\n",
" <td>2720</td>\n",
" <td>8062</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2487200875</td>\n",
" <td>20141209T000000</td>\n",
" <td>604000.0</td>\n",
" <td>4</td>\n",
" <td>3.00</td>\n",
" <td>1960</td>\n",
" <td>5000</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>7</td>\n",
" <td>1050</td>\n",
" <td>910</td>\n",
" <td>1965</td>\n",
" <td>0</td>\n",
" <td>98136</td>\n",
" <td>47.5208</td>\n",
" <td>-122.393</td>\n",
" <td>1360</td>\n",
" <td>5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1954400510</td>\n",
" <td>20150218T000000</td>\n",
" <td>510000.0</td>\n",
" <td>3</td>\n",
" <td>2.00</td>\n",
" <td>1680</td>\n",
" <td>8080</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>8</td>\n",
" <td>1680</td>\n",
" <td>0</td>\n",
" <td>1987</td>\n",
" <td>0</td>\n",
" <td>98074</td>\n",
" <td>47.6168</td>\n",
" <td>-122.045</td>\n",
" <td>1800</td>\n",
" <td>7503</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" id date price bedrooms bathrooms sqft_living \\\n",
"0 7129300520 20141013T000000 221900.0 3 1.00 1180 \n",
"1 6414100192 20141209T000000 538000.0 3 2.25 2570 \n",
"2 5631500400 20150225T000000 180000.0 2 1.00 770 \n",
"3 2487200875 20141209T000000 604000.0 4 3.00 1960 \n",
"4 1954400510 20150218T000000 510000.0 3 2.00 1680 \n",
"\n",
" sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n",
"0 5650 1.0 0 0 ... 7 1180 0 \n",
"1 7242 2.0 0 0 ... 7 2170 400 \n",
"2 10000 1.0 0 0 ... 6 770 0 \n",
"3 5000 1.0 0 0 ... 7 1050 910 \n",
"4 8080 1.0 0 0 ... 8 1680 0 \n",
"\n",
" yr_built yr_renovated zipcode lat long sqft_living15 \\\n",
"0 1955 0 98178 47.5112 -122.257 1340 \n",
"1 1951 1991 98125 47.7210 -122.319 1690 \n",
"2 1933 0 98028 47.7379 -122.233 2720 \n",
"3 1965 0 98136 47.5208 -122.393 1360 \n",
"4 1987 0 98074 47.6168 -122.045 1800 \n",
"\n",
" sqft_lot15 \n",
"0 5650 \n",
"1 7639 \n",
"2 8062 \n",
"3 5000 \n",
"4 7503 \n",
"\n",
"[5 rows x 21 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/kc_house_data.csv')\n",
2024-12-05 20:21:14 +04:00
"df.head()\n",
"#df.select_dtypes(include=np.number).columns.tolist()[1:]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество пропущенных значений в каждом столбце:\n",
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n",
"Количество выбросов в столбце 'price': 1146\n",
"Количество выбросов в столбце 'sqft_living': 572\n",
"Количество выбросов в столбце 'bathrooms': 571\n",
"Количество выбросов в столбце 'yr_built': 0\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdIAAAPdCAYAAACOcJpIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXhU5fn/8c/MJDMJWQmShChg3EGxKFqJKyolRfQnlWq1VFFxLahgqy3fuiBaUSpKRRS3ArbytdqvtRUVQVCsZVFRXEBxQ9myCFmGbLNkzu+PyRwYCSHLzJyZzPt1XXOFnPPknPskts+ce+5zPzbDMAwBAAAAAAAAAIBW2a0OAAAAAAAAAACAeEYiHQAAAAAAAACANpBIBwAAAAAAAACgDSTSAQAAAAAAAABoA4l0AAAAAAAAAADaQCIdAAAAAAAAAIA2kEgHAAAAAAAAAKANJNIBAAAAAAAAAGgDiXQAAAAAAAAAANpAIh2IgoMPPliXX3651WF0e3/60590yCGHyOFwaPDgwVE911tvvSWbzaa33norqucBAHQOc29sxHLu3dN7772nk08+WRkZGbLZbFq3bl2XjvfD/166Ms8PGzZMw4YN61I8AJDsmMdjo73z+LBhw3TMMcfEJKbLL79cmZmZMTkX0FUk0oH9mD9/vmw2m95///1W90dqgnn11Vc1derULh8nWSxZskS33nqrTjnlFM2bN0/33nuv1SEBACKEuTc+WTX3+nw+XXjhhaqqqtJDDz2kv/71r+rfv78effRRzZ8/PyYxAADaj3k8Pll5D93Q0KCpU6dSmIaEl2J1AEB3tHHjRtntHfuc6tVXX9WcOXN4I9BOy5cvl91u19NPPy2n0xn1851++ulqbGyMybkAAB3H3Bt9sZ57Q77++mt99913evLJJ3XVVVeZ2x999FEdcMABEalg7Mo8v2TJki6fHwCSHfN49Fk1j0vBRPpdd90lSTzFhYRGRToQBS6XS6mpqVaH0SH19fVWh9AhlZWVSk9Pj/obgKamJgUCAdntdqWlpXX4zR0AIDaYe6MvVnNva+eVpNzc3KidoyvzvNPp5IN2AOgi5vHos2oejya/3y+v12t1GEgiZISAKPhhfzefz6e77rpLhx9+uNLS0tSrVy+deuqpWrp0qaRgT7A5c+ZIkmw2m/kKqa+v129+8xv17dtXLpdLRx55pB544AEZhhF23sbGRt1444064IADlJWVpf/3//6ftm3bJpvNFvYp/dSpU2Wz2bRhwwb98pe/VM+ePXXqqadKkj7++GNdfvnlOuSQQ5SWlqbCwkJdeeWV2rlzZ9i5Qsf44osv9Ktf/Uo5OTnq3bu3br/9dhmGoS1btuj8889Xdna2CgsLNXPmzHb97vx+v+6++24deuihcrlcOvjgg/U///M/8ng85hibzaZ58+apvr7e/F219Wh36NHBtWvX6uSTT1Z6erqKi4s1d+7csHGh/qjPPfecbrvtNh144IHq0aOH3G73PnunrlmzRuecc4569uypjIwMHXvssfrzn/8cNubzzz/Xz3/+c+Xl5SktLU0nnHCC/v3vf7fr9wEAaB/m3viae7/88kuNGTNGhYWFSktL00EHHaSLL75YtbW15hiPx6PJkyerd+/e5u9u69atYb+7yy+/XGeccYYk6cILL5TNZtOwYcN08MEHa/369VqxYoUZT1cq3H44z0+cOFGZmZlqaGjYa+wll1yiwsJCNTc3S9q7R3roWM8//7z++Mc/6qCDDlJaWprOPvtsffXVV3sdb86cOTrkkEOUnp6uH//4x/rPf/5D33UASYd5PL7m8ZD93UN7vV7dcccdGjJkiHJycpSRkaHTTjtNb775pjnm22+/Ve/evSVJd911l3n+Hz5JsG3bNo0ePVqZmZnq3bu3fvvb35pzbeg4NptNDzzwgGbNmmVe74YNGyQFK+5PO+00ZWRkKDc3V+eff74+++yzva7pww8/1MiRI5Wdna3MzEydffbZWr16ddiYUHuid955RzfeeKN69+6t3NxcXXvttfJ6vaqpqdFll12mnj17qmfPnrr11lv3+m/rueee05AhQ5SVlaXs7GwNGjRor1wBEg+tXYB2qq2t1Y4dO/ba7vP59vuzU6dO1fTp03XVVVfpxz/+sdxut95//3198MEH+slPfqJrr71W27dv19KlS/XXv/417GcNw9D/+3//T2+++abGjx+vwYMH6/XXX9ctt9yibdu26aGHHjLHXn755Xr++ed16aWXaujQoVqxYoVGjRq1z7guvPBCHX744br33nvN/9NfunSpvvnmG11xxRUqLCzU+vXr9cQTT2j9+vVavXp12JsTSfrFL36hAQMG6L777tMrr7yie+65R3l5eXr88cd11lln6f7779ezzz6r3/72tzrxxBN1+umnt/m7uuqqq7RgwQL9/Oc/129+8xutWbNG06dP12effaZ//vOfkqS//vWveuKJJ/Tuu+/qqaeekiSdfPLJbR63urpa55xzji666CJdcsklev7553X99dfL6XTqyiuvDBt79913y+l06re//a08Hs8+P7FfunSpzj33XPXp00c33XSTCgsL9dlnn2nRokW66aabJEnr16/XKaecogMPPFC///3vlZGRoeeff16jR4/W//3f/+lnP/tZm3EDQDJj7k3Mudfr9aq0tFQej0c33HCDCgsLtW3bNi1atEg1NTXKyckxz/u3v/1Nv/zlL3XyySdr+fLle/3urr32Wh144IG69957deONN+rEE09UQUGB6uvrdcMNNygzM1N/+MMfJEkFBQVtXmdH/OIXv9CcOXP0yiuv6MILLzS3NzQ06OWXX9bll18uh8PR5jHuu+8+2e12/fa3v1Vtba1mzJihsWPHas2aNeaYxx57TBMnTtRpp52myZMn69tvv9Xo0aPVs2dPHXTQQRG7HgCwAvN4Ys7jIe25h3a73Xrqqad0ySWX6Oqrr9auXbv09NNPq7S0VO+++64GDx6s3r1767HHHtP111+vn/3sZ7rgggskSccee6x5rubmZpWWluqkk07SAw88oDfeeEMzZ87UoYcequuvvz4srnnz5qmpqUnXXHONXC6X8vLy9MYbb2jkyJE65JBDNHXqVDU2Nmr27Nk65ZRT9MEHH+jggw+WFLw/P+2005Sdna1bb71VqampevzxxzVs2DCtWLFCJ510Uti5Qu9j7rrrLq1evVpPPPGEcnNztXLlSvXr10/33nuvXn31Vf3pT3/SMccco8suu8z8b+KSSy7R2Wefrfvvv1+S9Nlnn+m///2vmStAgjIAtGnevHmGpDZfRx99dNjP9O/f3xg3bpz5/Y9+9CNj1KhRbZ5nwoQJRmv/k3zppZcMScY999wTtv3nP/+5YbPZjK+++sowDMNYu3atIcmYNGlS2LjLL7/ckGTceeed5rY777zTkGRccskle52voaFhr23/+7//a0gy3n777b2Occ0115jb/H6/cdBBBxk2m8247777zO3V1dVGenp62O+kNevWrTMkGVdddVXY9t/+9reGJGP58uXmtnHjxhkZGRltHi/kjDPOMCQZM2fONLd5PB5j8ODBRn5+vuH1eg3DMIw333zTkGQccsghe/0eQvvefPNN81qLi4uN/v37G9XV1WFjA4GA+e+zzz7bGDRokNHU1BS2/+STTzYOP/zwdsUPAMmGuTex594PP/zQkGS88MIL+z3vr3/967Dtv/zlL/f63YXm4B8e7+ijjzbOOOOM/cbTmh/+9/LDeT4QCBgHHnigMWbMmLCfe/755/f6u5xxxhlhcYSONWDAAMPj8Zjb//znPxuSjE8++cQwjOB7kV69ehknnnii4fP5zHHz5883JHX62gDAaszjiT2PG0b776H9fn/YXBeKvaCgwLjyyivNbd9///1ev9M945JkTJs2LWz7cccdZwwZMsT8ftOmTYYkIzs726isrAwbG4pr586d5raPPvrIsNvtxmWXXWZuGz16tOF0Oo2vv/7a3LZ9+3YjKyvLOP30081tof+GS0tLw+7vS0pKDJvNZlx33XXmttDfcM95+6abbjKys7MNv9+
"text/plain": [
"<Figure size 1500x1000 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество удаленных строк: 1520\n",
"Количество выбросов в столбце 'price': 211\n",
"Количество выбросов в столбце 'sqft_living': 34\n",
"Количество выбросов в столбце 'bathrooms': 0\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAISCAYAAAAjjoaeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hUZfrG8XsmvU4KqRAg9C4ICEFUSiQqFpS1LSpYKAoouOrqbxUFu64rFhTbiu6Kru6urorSUVQ6SOi9E9LLpE7KnN8fIaORBCEkOSnfz3Xl2s05Z865z5Ddd+aZd57XYhiGIQAAAAAAAAAAcAqr2QEAAAAAAAAAAGioKKIDAAAAAAAAAFANiugAAAAAAAAAAFSDIjoAAAAAAAAAANWgiA4AAAAAAAAAQDUoogMAAAAAAAAAUA2K6AAAAAAAAAAAVIMiOgAAAAAAAAAA1aCIDgAAAAAAAABANSiiA/Wgbdu2GjdunNkxmrwXX3xR7dq1k5ubm3r37l2n1/ruu+9ksVj03Xff1el1AABnhrG2ftTnWPtr69ev16BBg+Tn5yeLxaLNmzef0/l++/dyLuP6kCFDNGTIkHPKAwDNDeN2/TjTcXvIkCHq0aNHvWQaN26c/P396+VaQG2iiA6cpXnz5slisWjDhg1V7q+tweebb77RE088cc7naS4WL16shx56SBdeeKHef/99PfPMM2ZHAgDUEGNtw2TWWFtSUqLrr79emZmZevnll/WPf/xDbdq00RtvvKF58+bVSwYAQPUYtxsmM98jFxQU6IknnmDSGZoUd7MDAM3B7t27ZbWe3WdW33zzjebMmcOLhDO0fPlyWa1Wvffee/L09Kzz61188cUqLCysl2sBAH4fY23dq++xtsL+/ft1+PBhvfPOO7rrrrtc29944w21aNGiVmYynsu4vnjx4nO+PgA0N4zbdc+scVsqL6LPnDlTkvi2FpoMZqID9cDLy0seHh5mxzgr+fn5Zkc4K6mpqfLx8anzFwdFRUVyOp2yWq3y9vY+6xd+AIC6wVhb9+prrK3qupIUFBRUZ9c4l3Hd09OTD9UB4Cwxbtc9s8btulRaWqri4mKzY6CZovoD1IPf9nsrKSnRzJkz1bFjR3l7eys0NFSDBw/WkiVLJJX3CJszZ44kyWKxuH4q5Ofn609/+pNiYmLk5eWlzp07669//asMw6h03cLCQt17771q0aKFAgICdPXVV+v48eOyWCyVPr1/4oknZLFYtGPHDv3xj39UcHCwBg8eLEnasmWLxo0bp3bt2snb21uRkZG64447lJGRUelaFefYs2ePbrnlFtlsNoWFhemxxx6TYRg6evSorrnmGgUGBioyMlIvvfTSGT13paWlevLJJ9W+fXt5eXmpbdu2+r//+z85HA7XMRaLRe+//77y8/Ndz9Xpvt5d8XXCjRs3atCgQfLx8VFsbKzmzp1b6biK/qiffPKJHn30UbVs2VK+vr6y2+3V9k5du3atrrjiCgUHB8vPz0+9evXSK6+8UumYXbt26Q9/+INCQkLk7e2tfv366csvvzyj5wMAUDXG2oY11u7du1ejR49WZGSkvL291apVK910003KyclxHeNwODR9+nSFhYW5nrtjx45Veu7GjRunSy65RJJ0/fXXy2KxaMiQIWrbtq22b9+u77//3pXnXGa6/XZcnzJlivz9/VVQUHDKsTfffLMiIyNVVlYm6dSe6BXn+vTTT/X000+rVatW8vb21vDhw7Vv375Tzjdnzhy1a9dOPj4+uuCCC/TDDz/QZx1Ak8e43bDG7Qq/9x65uLhYM2bMUN++fWWz2eTn56eLLrpIK1ascB1z6NAhhYWFSZJmzpzpuv5vv0Fw/PhxjRo1Sv7+/goLC9MDDzzgGlsrzmOxWPTXv/5Vs2fPdt3vjh07JJXPtL/ooovk5+enoKAgXXPNNdq5c+cp9/Tzzz/r8ssvV2BgoPz9/TV8+HCtWbOm0jEVLYl+/PFH3XvvvQoLC1NQUJAmTpyo4uJiZWdn67bbblNwcLCCg4P10EMPnfK39cknn6hv374KCAhQYGCgevbseUotAI0b7VyAGsrJyVF6evop20tKSn73sU888YSeffZZ3XXXXbrgggtkt9u1YcMGbdq0SZdeeqkmTpyopKQkLVmyRP/4xz8qPdYwDF199dVasWKF7rzzTvXu3VuLFi3Sgw8+qOPHj+vll192HTtu3Dh9+umnuvXWWzVw4EB9//33GjlyZLW5rr/+enXs2FHPPPOMa0BYsmSJDhw4oNtvv12RkZHavn273n77bW3fvl1r1qyp9MJFkm688UZ17dpVzz33nBYsWKCnnnpKISEheuuttzRs2DA9//zz+uijj/TAAw+of//+uvjii0/7XN1111364IMP9Ic//EF/+tOftHbtWj377LPauXOnPv/8c0nSP/7xD7399ttat26d3n33XUnSoEGDTnverKwsXXHFFbrhhht0880369NPP9Xdd98tT09P3XHHHZWOffLJJ+Xp6akHHnhADoej2k/ylyxZoiuvvFJRUVG67777FBkZqZ07d+rrr7/WfffdJ0navn27LrzwQrVs2VIPP/yw/Pz89Omnn2rUqFH6z3/+o2uvvfa0uQGgOWGsbZxjbXFxsRISEuRwODR16lRFRkbq+PHj+vrrr5WdnS2bzea67j//+U/98Y9/1KBBg7R8+fJTnruJEyeqZcuWeuaZZ3Tvvfeqf//+ioiIUH5+vqZOnSp/f3/95S9/kSRFRESc9j7Pxo033qg5c+ZowYIFuv76613bCwoK9NVXX2ncuHFyc3M77Tmee+45Wa1WPfDAA8rJydELL7ygMWPGaO3ata5j3nzzTU2ZMkUXXXSRpk+frkOHDmnUqFEKDg5Wq1atau1+AKA+MG43znG7wpm8R7bb7Xr33Xd18803a/z48crNzdV7772nhIQErVu3Tr1791ZYWJjefPNN3X333br22mt13XXXSZJ69erlulZZWZkSEhI0YMAA/fWvf9XSpUv10ksvqX379rr77rsr5Xr//fdVVFSkCRMmyMvLSyEhIVq6dKkuv/xytWvXTk888YQKCwv12muv6cILL9SmTZvUtm1bSeXvvy+66CIFBgbqoYcekoeHh9566y0NGTJE33//vQYMGFDpWhWvW2bOnKk1a9bo7bffVlBQkFatWqXWrVvrmWee0TfffKMXX3xRPXr00G233eb6m7j55ps1fPhwPf/885KknTt36qeffnLVAtAEGADOyvvvv29IOu1P9+7dKz2mTZs2xtixY12/n3feecbIkSNPe53JkycbVf1P9IsvvjAkGU899VSl7X/4wx8Mi8Vi7Nu3zzAMw9i4caMhyZg2bVql48aNG2dIMh5//HHXtscff9yQZNx8882nXK+goOCUbR9//LEhyVi5cuUp55gwYYJrW2lpqdGqVSvDYrEYzz33nGt7VlaW4ePjU+k5qcrmzZsNScZdd91VafsDDzxgSDKWL1/u2jZ27FjDz8/vtOercMkllxiSjJdeesm1zeFwGL179zbCw8ON4uJiwzAMY8WKFYYko127dqc8DxX7VqxY4brX2NhYo02bNkZWVlalY51Op+u/Dx8+3OjZs6dRVFRUaf+gQYOMjh07nlF+AGjqGGsb91j7888/G5KMzz777Heve88991Ta/sc//vGU565izP3t+bp3725ccsklv5unKr/9e/ntuO50Oo2WLVsao0ePrvS4Tz/99JR/l0suuaRSjopzde3a1XA4HK7tr7zyiiHJ2Lp1q2EY5a89QkNDjf79+xslJSWu4+bNm2dIqvG9AUB9Y9xu3OO2YZz5e+TS0tJKY1tF9oiICOOOO+5wbUtLSzvlOf11LknGrFmzKm3v06eP0bdvX9fvBw8eNCQZgYGBRmpqaqVjK3JlZGS4tiUmJhpWq9W47bbbXNtGjRpleHp6Gvv373dtS0pKMgICAoyLL77Yta3ibzghIaHS+/e4uDjDYrEYkyZNcm2r+Df89Th93333GYGBgUZpaekp94umg3YuQA3NmTNHS5YsOeXn15+uVicoKEjbt2/X3r17z/q
"text/plain": [
"<Figure size 1500x1000 with 3 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Средняя цена в обучающей выборке: 470646.91546391754\n",
"Средняя цена в контрольной выборке: 468579.52654280025\n",
"Средняя цена в тестовой выборке: 471598.4402786994\n",
"\n",
"Стандартное отклонение цены в обучающей выборке: 202445.70321089853\n",
"Стандартное отклонение цены в контрольной выборке: 202183.1175619316\n",
"Стандартное отклонение цены в тестовой выборке: 206393.61053704965\n",
"\n",
"Распределение по квартилам (обучающая):\n",
"0.25 313500.0\n",
"0.50 435000.0\n",
"0.75 594000.0\n",
"Name: price, dtype: float64\n",
"\n",
"Распределение по квартилам (контрольная):\n",
"0.25 313612.5\n",
"0.50 429925.0\n",
"0.75 595000.0\n",
"Name: price, dtype: float64\n",
"\n",
"Распределение по квартилам (тестовая):\n",
"0.25 312500.0\n",
"0.50 428500.0\n",
"0.75 595375.0\n",
"Name: price, dtype: float64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+0AAAIjCAYAAAB20vpjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3iN9//H8efJXhIjW0mUELNGtUIVrT2LDi016lutokOntnRXpy6qGy2qVdWpFNVWNWbNilFFjCDHishO7t8f9y+njgRJJO6M1+O67ivn3ONzv+8z8z6fZTMMw0BERERERERESh0XqwMQERERERERkfwpaRcREREREREppZS0i4iIiIiIiJRSStpFRERERERESikl7SIiIiIiIiKllJJ2ERERERERkVJKSbuIiIiIiIhIKaWkXURERERERKSUUtIuIiLl3okTJ/jnn3/IysqyOhQRkXLDMAyOHTvGzp07rQ5FpFxT0i4iIuVOZmYmr7zyCldccQWenp5UqVKFqKgoli5danVoZcKWLVv45ptvHPc3bNjAjz/+aF1AUqL279/P9OnTHff37NnDrFmzrAtISvV78NSpUzz55JPUq1cPDw8PqlWrRt26ddm+fbvVoYmUW25WByBS0UyfPp1hw4Y57nt6elKzZk06d+7M+PHjCQkJsTA6kbIvPT2dzp07s3LlSu6++26ee+45fHx8cHV1pUWLFlaHVyacOnWKu+66i9DQUKpVq8Z9991Ht27d6NGjh9WhSQmw2WyMGjWKsLAw6tWrxyOPPELVqlUZOHCg1aFVWKX1PXj06FHatWtHfHw8Y8aMoU2bNnh4eODu7k5kZKSlsYmUZ0raRSzy7LPPUqtWLdLS0vjjjz+YOnUqCxYsYMuWLfj4+FgdnkiZ9fLLL7Nq1SoWLVpE+/btrQ6nTIqJiXEsAHXr1uXOO++0OCopKdWrV+fOO++ka9euAISFhfHrr79aG1QFV1rfgw8//DAJCQnExsbSsGFDq8MRqTBshmEYVgchUpHk1rSvWbOGK6+80rH+wQcfZNKkScyePZtbb73VwghFyq6srCyCg4MZOXIkL7zwgtXhlHlbt24lNTWVxo0b4+HhYXU4UsJ27dqF3W6nUaNG+Pr6Wh2OULreg0eOHCEsLIz33nuvVPyAIFKRqE+7SClx3XXXAbB7924Ajh07xkMPPUTjxo3x8/PD39+fbt26sXHjxjzHpqWl8fTTT1O3bl28vLwICwujX79+7Nq1CzD7J9pstnMuZ9ZG/vrrr9hsNr744gsef/xxQkND8fX1pXfv3uzbty/PuVetWkXXrl0JCAjAx8eHdu3asWLFinyvsX379vme/+mnn86z78yZM2nRogXe3t5UrVqVAQMG5Hv+813bmXJycnjzzTdp2LAhXl5ehISEcNddd3H8+HGn/SIjI+nZs2ee84wePTpPmfnF/uqrr+Z5TMFssv3UU09Rp04dPD09qVGjBo888gjp6en5PlZnat++PY0aNcqz/rXXXsNms7Fnzx6n9SdOnOD++++nRo0aeHp6UqdOHV5++WVycnIc++Q+bq+99lqechs1apTva+Krr746Z4xDhw4tUNPIyMhIx/Pj4uJCaGgot9xyC/Hx8Rc8FuDdd9+lYcOGeHp6Eh4ezqhRozhx4oRj+/bt2zl+/DiVKlWiXbt2+Pj4EBAQQM+ePdmyZYtjv2XLlmGz2Zg/f36ec8yePRubzUZsbKwj5qFDhzrtk/uYnFkbuXz5cm666SZq1qzpeI4feOABUlNTnY59+umn87yWZs2aRdOmTfHy8qJatWrceuuteR6ToUOH4ufn57Tuq6++yhMHgJ+fX56YoWDvq/bt2zue/wYNGtCiRQs2btyY7/uqoKZPn57ntfr3339TpUoVevbs6TRA4L///stNN91E1apV8fHxoVWrVnn68p7vNXnmteee93xLbl/u3Mf333//pUuXLvj6+hIeHs6zzz7L2fUbp0+f5sEHH3S8x+rVq8drr72WZ7/zxXDmeyx3n7Vr1573cczvNQDnfh3MnTvX8XwHBgYyaNAgDhw4kKfM3Pdu7dq1ufrqqzl27Bje3t75fr7kF9PZ7/19+/YV6PihQ4de8Pk58/iffvqJtm3b4uvrS6VKlejRowd///13nnK3bdvGzTffTFBQEN7e3tSrV48nnngC+O/9d77lzMexoI/hmcdXqVKF9u3bs3z58jyxXegzDC7+PXj2d21gYCA9evRw+gwE8zts9OjR5yzn7PftmjVryMnJISMjgyuvvPK8n1cAv/zyi+P5qly5Mn369CEuLs5pn9znI/c58/f3d3QHSEtLyxPvmd+5WVlZdO/enapVq7J161bH+mnTpnHdddcRHByMp6cnDRo0YOrUqXlic3FxYcKECU7rcz//z95fxGpqHi9SSuQm2NWqVQPMf1y/+eYbbrrpJmrVqsXhw4d5//33adeuHVu3biU8PByA7OxsevbsydKlSxkwYAD33Xcfp06dYvHixWzZsoXatWs7znHrrbfSvXt3p/OOGzcu33heeOEFbDYbjz76KEeOHOHNN9+kY8eObNiwAW9vb8D80uvWrRstWrTgqaeewsXFxfFluXz5cq666qo85V522WVMnDgRgOTkZEaOHJnvucePH8/NN9/M//73PxITE3nnnXe49tprWb9+PZUrV85zzIgRI2jbti0AX3/9dZ5k7K677nK0crj33nvZvXs3kydPZv369axYsQJ3d/d8H4fCOHHihOPazpSTk0Pv3r35448/GDFiBPXr12fz5s288cYb7Nixw2mwoYuVkpJCu3btOHDgAHfddRc1a9bkzz//ZNy4cSQkJPDmm28W27mKqm3btowYMYKcnBy2bNnCm2++ycGDB/P9B/dMTz/9NM888wwdO3Zk5MiRbN++nalTp7JmzRrHc3j06FHAfF1HRUXxzDPPkJaWxpQpU2jTpg1r1qyhbt26tG/fnho1ajBr1iz69u3rdJ5Zs2ZRu3ZtR7PUgpo7dy4pKSmMHDmSatWqsXr1at555x3279/P3Llzz3nc7NmzGTRoEFdccQUTJ07k6NGjvP322/zxxx+sX7+ewMDAQsVxLkV5X+V69NFHiyWGXPv27aNr165ER0fz5Zdf4uZm/jty+PBhWrduTUpKCvfeey/VqlVjxowZ9O7dm6+++irPc3Uh1157LZ999pnjfm7ri9wEDqB169aO29nZ2XTt2pVWrVrxyiuvsHDhQp566imysrJ49tlnAXO07N69e7Ns2TKGDx9O06ZNWbRoEQ8//DAHDhzgjTfeyDeWN954w/FcXopWILmfdy1btmTixIkcPnyYt956ixUrVlzw+Z4wYUKehKkwCnr8XXfdRceOHR33b7/9dvr27Uu/fv0c64KCggD47LPPGDJkCF26dOHll18mJSWFqVOncs0117B+/XrHDwebNm2ibdu2uLu7M2LECCIjI9m1axfff/89L7zwAv369aNOnTqO8h944AHq16/PiBEjHOvq168PFO4xDAwMdDz3+/fv56233qJ79+7s27fPsV9BPsPOpbDvwejoaJ544gkMw2DXrl1MmjSJ7t27F/gH0vzkfr6OHj2aFi1a8NJLL5GYmJjv59WSJUvo1q0bl19+OU8//TSpqam88847tGnThr/++ivPDz0333wzkZGRTJw4kZUrV/L2229z/PhxPv3003PG87///Y9ff/2VxYsX06BBA8f6qVOn0rBhQ3r37o2bmxvff/8999xzDzk5OYwaNQowK0ruueceJk6cyA033EDz5s1JSEhgzJgxdOzYkbvvvrvIj5NIiTBE5JKaNm2aARhLliwxEhMTjX379hlz5swxqlWrZnh7exv79+83DMMw0tLSjOzsbKdjd+/ebXh6ehrPPvusY90nn3xiAMakSZPynCsnJ8dxHGC8+uqrefZp2LCh0a5dO8f9ZcuWGYBRvXp1IykpybH+yy+/NADjrbfecpQdFRVldOnSxXEewzCMlJQUo1atWkanTp3ynKt169ZGo0aNHPcTExMNwHjqqacc6/bs2WO4uroaL7zwgtOxmzdvNtzc3PKs37l
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA+0AAAIjCAYAAAB20vpjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3yNd//H8dfJ3iEyzRCxqaJa1Ko9W10URavlbmmr+9ZF2/tXnbp03neLFtVqUR0oiqoapbbYIkZIIoTsdf3+uJpTR4IkElcS7+fjcR7nOte5xueM6zrnc32XzTAMAxEREREREREpc5ysDkBERERERERECqakXURERERERKSMUtIuIiIiIiIiUkYpaRcREREREREpo5S0i4iIiIiIiJRRStpFREREREREyigl7SIiIiIiIiJllJJ2ERERERERkTJKSbuISDly+vRp9u3bR3Z2ttWhiIhUGIZhkJiYyN69e60ORUQkHyXtIiJlWFZWFq+//jrXXHMN7u7uVK5cmcjISJYtW2Z1aOXC9u3bmT9/vv3x5s2b+emnn6wLSErVkSNHmDZtmv1xdHQ0M2fOtC4gKdPH4NmzZ3nuueeoX78+bm5uVKlShXr16rF7926rQ7vievfuzf333291GFKO7Ny5ExcXF7Zv3251KFcFF6sDkPJr2rRp3HPPPfbH7u7u1KxZk+7du/P8888TEhJiYXQi5V9GRgbdu3dn7dq1/Otf/+Lll1/Gy8sLZ2dnWrZsaXV45cLZs2cZPXo0oaGhVKlShUceeYRevXrRp08fq0OTUmCz2RgzZgxhYWHUr1+fp556ioCAAIYMGWJ1aFetsnoMnjx5ko4dOxITE8NDDz1Eu3btcHNzw9XVlfDwcEtju9JWr17NL7/8wq5du6wORcqRRo0a0adPH1544QXmzp1rdTgVnpJ2uWwvvfQStWvXJj09nd9//52PPvqIn3/+me3bt+Pl5WV1eCLl1muvvca6detYvHgxnTp1sjqccqlNmzb2G0C9evVUmlSBVatWjfvvv5+ePXsCEBYWxooVK6wN6ipXVo/BJ598ktjYWNasWUPjxo2tDsdSb7zxBl26dKFu3bpWhyLlzL/+9S969+7N/v37iYiIsDqcCs1mGIZhdRBSPuWVtP/555+0atXKPv/xxx9n8uTJzJo1i7vuusvCCEXKr+zsbIKDg3nggQf4v//7P6vDKfd27txJWloaTZs2xc3NzepwpJTt37+fhIQEmjRpgre3t9XhCGXrGIyLiyMsLIyPP/64TFxAsFJcXBzVqlXj448/ZuTIkVaHI+VMVlYWISEhjB07lpdeesnqcCo0tWmXEnfTTTcBcPDgQQASExN54oknaNq0KT4+Pvj5+dGrVy+2bNmSb9309HQmTpxIvXr18PDwICwsjFtvvZX9+/cDZvtEm812wdu5pZErVqzAZrPx9ddf88wzzxAaGoq3tzf9+/fn8OHD+fa9bt06evbsib+/P15eXnTs2JHVq1cX+Bo7depU4P4nTpyYb9kZM2bQsmVLPD09CQgIYNCgQQXu/2Kv7Vy5ubm88847NG7cGA8PD0JCQhg9ejSnTp1yWC48PJy+ffvm28/YsWPzbbOg2N9444187ymYVbYnTJhA3bp1cXd3p0aNGjz11FNkZGQU+F6dq1OnTjRp0iTf/DfffBObzUZ0dLTD/NOnTzNu3Dhq1KiBu7s7devW5bXXXiM3N9e+TN779uabb+bbbpMmTQr8Tnz77bcXjHHEiBGFqhoZHh5u/3ycnJwIDQ1l4MCBxMTEXHJdgA8//JDGjRvj7u5O1apVGTNmDKdPn7Y/v3v3bk6dOoWvry8dO3bEy8sLf39/+vbt69B+bPny5dhsNubNm5dvH7NmzcJms7FmzRp7zCNGjHBYJu89Obc0ctWqVdxxxx3UrFnT/hk/+uijpKWlOaw7ceLEfN+lmTNn0rx5czw8PKhSpQp33XVXvvdkxIgR+Pj4OMz79ttv88UB4OPjky9mKNxx1alTJ/vn36hRI1q2bMmWLVsKPK4Ka9q0afm+qzt27KBy5cr07dvXoYPAAwcOcMcddxAQEICXlxc33HBDvra8F/tOnvva8/Z7sVteW+689/fAgQP06NEDb29vqlatyksvvcT51+lTUlJ4/PHH7cdY/fr1efPNN/Mtd7EYzj3G8pbZsGHDRd/Hgr4DcOHvwZw5c+yfd2BgIEOHDuXo0aP5tpl37EZERHD99deTmJiIp6dngeeXgmI6/9g/fPhwodYfMWLEJT+fc9dfuHAh7du3x9vbG19fX/r06cOOHTvybXfXrl3ceeedBAUF4enpSf369Xn22WeBf46/i93OfR8L+x6eu37lypXp1KkTq1atyhfbpc5hcPnH4Pm/tYGBgfTp0ydfG1qbzcbYsWMvuJ3zj9s///yT3NxcMjMzadWq1UXPVwC//vqr/fOqVKkSN998M1FRUQ7L5H0eeZ+Zn5+fvTlAenp6vnjP/c3Nzs6md+/eBAQEsHPnTvv8qVOnctNNNxEcHIy7uzuNGjXio48+yhebk5MTL7zwgsP8vPP/+cuf76effiI7O5uuXbs6zC/sf7fCnsPOdaHjpaBlC3OsFOWcl5uby7vvvkvTpk3x8PAgKCiInj175jtnFfY3xmazccstt+SLe/To0dhsNof/PEX5D3up9+rc709Rzl1F/W9Y0O0///mPfRlXV1c6derE999/n2+bUrJUPV5KXF6CXaVKFcD84zp//nzuuOMOateuzYkTJ/jkk0/o2LEjO3fupGrVqgDk5OTQt29fli1bxqBBg3jkkUc4e/YsS5YsYfv27Q7Vbu666y569+7tsN/x48cXGM///d//YbPZePrpp4mLi+Odd96ha9eubN68GU9PT8D80evVqxctW7ZkwoQJODk52X8sV61aRevWrfNtt3r16kyaNAmA5ORkHnjggQL3/fzzz3PnnXdy3333ER8fz/vvv0+HDh3YtGkTlSpVyrfOqFGjaN++PQBz587Nl4yNHj3aXsvh4Ycf5uDBg0yZMoVNmzaxevVqXF1dC3wfiuL06dP213au3Nxc+vfvz++//86oUaNo2LAh27Zt4+2332bPnj0OnQ1drtTUVDp27MjRo0cZPXo0NWvW5I8//mD8+PHExsbyzjvvlNi+iqt9+/aMGjWK3Nxctm/fzjvvvMOxY8cK/IN7rokTJ/Liiy/StWtXHnjgAXbv3s1HH33En3/+af8MT548CZjf68jISF588UXS09P54IMPaNeuHX/++Sf16tWjU6dO1KhRg5kzZzJgwACH/cycOZOIiAh7tdTCmjNnDqmpqTzwwANUqVKF9evX8/7773PkyBHmzJlzwfVmzZrF0KFDueaaa5g0aRInT57kvffe4/fff2fTpk0EBgYWKY4LKc5xlefpp58ukRjyHD58mJ49e9KgQQO++eYbXFzMn9UTJ07Qtm1bUlNTefjhh6lSpQrTp0+nf//+fPvtt/k+q0vp0KEDX375pf1xXu2LvAQOoG3btvbpnJwcevbsyQ033MDrr7/OokWLmDBhAtnZ2fbSEMMw6N+/P8uXL2fkyJE0b96cxYsX8+STT3L06FHefvvtAmN5++237Z/llagFkne+u+6665g0aRInTpzg3XffZfXq1Zf8vF944YV8CVNRFHb90aNHOyQ9d999NwMGDODWW2+1zwsKCgLgyy+/ZPjw4fTo0YPXXnuN1NRUPvroI2688UY2bdpk//O9detW2rdvj6urK6NGjSI8PJz9+/fzww8/8H//93/ceuutDtWZH330URo2bMioUaPs8xo2bAgU7T0MDAy0f/ZHjhzh3XffpXfv3hw+fNi+XGHOYRdS1GOwQYMGPPvssxiGwf79+5k8eTK9e/cu9AXSguSdX8eOHUvLli159dVXiY+PL/B8tXTpUnr16kWdOnWYOHEiaWlpvP/++7Rr146//vorX7J05513Eh4ezqRJk1i7di3vvfcep06d4osvvrhgPPfddx8rVqxgyZIlNGrUyD7/o48+onHjxvTv3x8XFxd++OEHHnzwQXJzcxkzZgxgFpQ8+OCDTJo0iVt
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA/YAAAIjCAYAAACpnIB8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3QU5dvG8e+mNxJKKj30IoiA0ovSCaCAhaKAoFgABaWIiqAICCoIgmChqRRBhJ+IIB0FoxTpXQQDSAKRElogZd4/5s3KkkISkkwC1+ecOZmdeWbm3s3u7N4zT7EZhmEgIiIiIiIiInmSk9UBiIiIiIiIiEjmKbEXERERERERycOU2IuIiIiIiIjkYUrsRURERERERPIwJfYiIiIiIiIieZgSexEREREREZE8TIm9iIiIiIiISB6mxF5EREREREQkD1NiLyJylzh//jx//vkn8fHxVociInLHMAyDs2fPcvjwYatDEZG7mBJ7EZE7VFxcHOPGjePee+/F3d2dAgUKULZsWdasWWN1aHnCnj17WLJkif3xjh07WLZsmXUBSbY6ceIEs2bNsj8+duwYc+bMsS4gydWfwYsXL/Lmm29Svnx53NzcKFSoEOXKlePgwYNWh5bjWrduzbPPPmt1GGKh2rVrM3jwYKvDuOspsRdLzJo1C5vNZp88PDwoV64cffv2JSoqyurwRPK8a9eu0bRpU4YNG0bjxo1ZuHAhq1atYu3atdSpU8fq8PKEixcv8txzz/Hbb79x+PBhXn75ZXbv3m11WJJNbDYbffr04aeffuLYsWMMHjyYX375xeqw7mq59TP477//UqdOHSZNmsSjjz7K//73P1atWsX69espWbKk1eHlqE2bNrFy5UqGDBlidShioSFDhjBlyhQiIyOtDuWu5mJ1AHJ3e+eddwgNDSU2NpaNGzcydepUfvzxR/bs2YOXl5fV4YnkWWPHjuX333/np59+onHjxlaHkyfVqVPHPgGUK1dOd6XuYEWKFOHZZ5+lZcuWAISEhLB+/Xprg7rL5dbP4KBBgzh16hTh4eFUrlzZ6nAs9f7779OkSRPKlCljdShioYcffhhfX18++eQT3nnnHavDuWvZDMMwrA5C7j6zZs3i6aefZsuWLdSsWdO+/NVXX2X8+PHMnTuXzp07WxihSN4VHx9PYGAgL7zwAqNGjbI6nDxv3759XL16lSpVquDm5mZ1OJLNjhw5QnR0NPfccw/e3t5WhyPkrs/g6dOnCQkJYdq0abniIoOVTp8+TZEiRZg2bRq9evWyOhyxWL9+/Vi6dClHjx7FZrNZHc5dSVXxJVd56KGHADh69CgAZ8+eZeDAgVSpUgUfHx98fX1p1aoVO3fuTLZtbGwsI0aMoFy5cnh4eBASEkKHDh04cuQIYLaXvLH6/83TjXc1169fj81m45tvvuH1118nODgYb29v2rVrx/Hjx5Md+/fff6dly5b4+fnh5eVFo0aN2LRpU4rPsXHjxikef8SIEcnKfv3119SoUQNPT08KFixIp06dUjx+Ws/tRomJiXz00UdUrlwZDw8PgoKCeO655zh37pxDuZIlS9KmTZtkx+nbt2+yfaYU+/vvv5/sNQWzevjw4cMpU6YM7u7uFCtWjMGDB3Pt2rUUX6sbNW7cmHvuuSfZ8g8++ACbzcaxY8cclp8/f57+/ftTrFgx3N3dKVOmDGPHjiUxMdFeJul1++CDD5Lt95577knxPfHtt9+mGmOPHj3SVQ2zZMmS9v+Pk5MTwcHBPPHEE0RERNxyW4BPPvmEypUr4+7uTuHChenTpw/nz5+3rz948CDnzp0jX758NGrUCC8vL/z8/GjTpg179uyxl1u3bh02m43FixcnO8bcuXOx2WyEh4fbY+7Ro4dDmaTX5Ma7mr/88guPPfYYxYsXt/+PBwwYwNWrVx22HTFiRLL30pw5c6hWrRoeHh4UKlSIzp07J3tNevTogY+Pj8Oyb7/9NlkcAD4+PslihvR9rho3bmz//1eqVIkaNWqwc+fOFD9X6ZXUBOnG9+revXspUKAAbdq0cejU8K+//uKxxx6jYMGCeHl5Ubt27WRti9N6T9743G9u+pTSlNS2POn1/euvv2jRogXe3t4ULlyYd955h5vvA1y+fJlXX33V/hkrX748H3zwQbJyacVw42csqczWrVvTfB1Teg9A6u+DhQsX2v/f/v7+PPnkk5w8eTLZPpM+u6VLl6ZWrVqcPXsWT0/PFM8vKcV082f/+PHj6dq+R48et/z/3Lj98uXLadCgAd7e3uTLl4+wsDD27t2bbL8HDhzg8ccfJyAgAE9PT8qXL88bb7wB/Pf5S2u68XVM72t44/YFChSgcePGKTZnuNU5DG7/M3jzd62/vz9hYWEO50Awv8P69u2b6n5u/txu2bKFxMRErl+/Ts2aNdM8XwGsXbvW/v/Knz8/Dz/8MPv373cok/T/SPqf+fr6UqhQIV5++WViY2OTxXvjd258fDytW7emYMGC7Nu3z7585syZPPTQQwQGBuLu7k6lSpWYOnVqsticnJx46623HJYnnf9vLn+zZcuWER8fT9OmTZOty8g5LGlyd3enXLlyjBkzJtl55OTJk/Ts2ZOgoCDc3d2pXLkyM2bMSHF/Gfnt1rp1awoUKIC3tzdVq1Zl4sSJDmUOHDjAo48+SsGCBfHw8KBmzZp8//33DmWS3iNubm6cOXPGYV14eLj9+d14bkvtt2BKn/mUXquk6cbzTkrfyQBhYWHJ3jdJ77no6GiHslu3bnX4ToDUz1E319Jo1qwZf//9Nzt27Ej2WkvOUFV8yVWSkvBChQoB5hfDkiVLeOyxxwgNDSUqKopPP/2URo0asW/fPgoXLgxAQkICbdq0Yc2aNXTq1ImXX36ZixcvsmrVKvbs2UPp0qXtx+jcuTOtW7d2OO7QoUNTjGfUqFHYbDaGDBnC6dOn+eijj2jatCk7duzA09MTML8YW7VqRY0aNRg+fDhOTk72L9RffvmFBx54INl+ixYtypgxYwC4dOkSL7zwQorHHjZsGI8//jjPPPMMZ86c4eOPP6Zhw4Zs376d/PnzJ9umd+/eNGjQAIDvvvsuWcL23HPP2WtLvPTSSxw9epTJkyezfft2Nm3ahKura4qvQ0acP3/e/txulJiYSLt27di4cSO9e/emYsWK7N69mwkTJnDo0CGHDpJu15UrV2jUqBEnT57kueeeo3jx4vz6668MHTqUU6dO8dFHH2XZsTKrQYMG9O7dm8TERPbs2cNHH33EP//8c8s2vSNGjODtt9+madOmvPDCCxw8eJCpU6eyZcsW+//w33//Bcz3ddmyZXn77beJjY1lypQp1KtXjy1btlCuXDkaN25MsWLFmDNnDu3bt3c4zpw5cyhdunSG2+MvXLiQK1eu8MILL1CoUCE2b97Mxx9/zIkTJ1i4cGGq282dO5cnn3ySe++9lzFjxvDvv/8yadIkNm7cyPbt2/H3989QHKnJzOcqSVa3IT1+/DgtW7akQoUKLFiwABcX8ys5KiqKunXrcuXKFV566SUKFSrE7NmzadeuHd9++22y/9WtNGzYkK+++sr+OKkWR1KSB1C3bl37fEJCAi1btqR27dqMGzeOFStWMHz4cOLj4+1VLA3DoF27dqxbt45evXpRrVo1fvrpJwYNGsTJkyeZMGFCirFMmDDB/r/MidokSee7+++/nzFjxhAVFcXEiRPZtGnTLf/fb731VrKkKiPSu/1zzz3nkBg99dRTtG/fng4dOtiXBQQEAPDVV1/RvXt3WrRowdixY7ly5QpTp06lfv36bN++3f4jf9euXTRo0ABXV1d69+5NyZIlOXLkCEuXLmXUqFF06NDB4Uf5gAEDqFixIr1797Yvq1ixIpCx19Df39/+vz9x4gQTJ06kdevWHD9+3F4uPeew1GT0M1ihQgXeeOMNDMPgyJEjjB8/ntatW6f7ImpKks6vffv2pUaNGrz33nucOXMmxfPV6tWradWqFaVKlWLEiBFcvXqVjz/+mHr16vHHH38kuxj0+OOPU7JkScaMGcNvv/3GpEmTOHfuHF9++WWq8Tz
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/kc_house_data.csv')\n",
2024-12-05 20:21:14 +04:00
"df = df.drop(columns=['id'])\n",
"\n",
"\n",
"#5. Устранение пропущенных данных\n",
" \n",
"#Сведения о пропущенных данных\n",
"print(\"Количество пропущенных значений в каждом столбце:\")\n",
"print(df.isnull().sum())\n",
"\n",
"# Процент пропущенных значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"\n",
"#Пропущенных данных в датасете нет\n",
"\n",
"\n",
"\n",
"\n",
"#6. Проблемы набора данных\n",
" #5.1Выбросы: Возможны аномалии в значениях скорости или расстояния.\n",
" #Смещение: Данные могут быть смещены в сторону объектов, которые легче обнаружить (крупные, близкие).\n",
"\n",
"#7. Решения для обнаруженных проблем\n",
" #Выбросы: Идентификация и обработка выбросов через методы (например, IQR или Z-оценка).\n",
" #Смещение: Использование методов балансировки данных, таких как oversampling.\n",
"\n",
"#7.1 Проверка набора данных на выбросы\n",
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['price', 'sqft_living', 'bathrooms', 'yr_built']\n",
"def Emissions(columns_to_check):\n",
"\n",
" # Функция для подсчета выбросов\n",
" def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
" Q3 = df[col].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
" # Подсчитываем выбросы\n",
" outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
" # Выводим количество выбросов для каждого столбца\n",
" for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
" \n",
" # Создаем гистограммы\n",
" plt.figure(figsize=(15, 10))\n",
" for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 3, i)\n",
" sns.histplot(df[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.tight_layout()\n",
" plt.show()\n",
"Emissions(columns_to_check)\n",
"\n",
"#Признак yr_built не имеет выбросов, \n",
"#для признаков 'price', 'sqft_living', 'bathrooms' необходимо использовать метод решения проблемы выбросов. \n",
"#Воспользуемся методом удаления наблюдений с такими выбросами:\n",
"# Выбираем столбцы для очистки\n",
"columns_to_fix = ['price', 'sqft_living', 'bathrooms']\n",
"\n",
"# Функция для удаления выбросов\n",
"def remove_outliers(df, columns):\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
" Q3 = df[col].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Удаляем строки, содержащие выбросы\n",
" df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n",
" \n",
" return df\n",
"\n",
"# Удаляем выбросы\n",
"df_cleaned = remove_outliers(df, columns_to_fix)\n",
"\n",
"# Выводим количество удаленных строк\n",
"print(f\"Количество удаленных строк: {len(df) - len(df_cleaned)}\")\n",
"\n",
"df = df_cleaned\n",
"\n",
"#Оценим выбросы в выборке после усреднения:\n",
"Emissions(columns_to_fix)\n",
"\n",
"#Удалось избавиться от выбросов в соответствующих признаках как видно на диаграммах.\n",
"\n",
"\n",
"\n",
"#8. Разбиение данных на выборки\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)\n",
"\n",
"# Средние значения цены\n",
"print(\"Средняя цена в обучающей выборке:\", train_data['price'].mean())\n",
"print(\"Средняя цена в контрольной выборке:\", val_data['price'].mean())\n",
"print(\"Средняя цена в тестовой выборке:\", test_data['price'].mean())\n",
"print()\n",
"\n",
"# Стандартное отклонение цены\n",
"print(\"Стандартное отклонение цены в обучающей выборке:\", train_data['price'].std())\n",
"print(\"Стандартное отклонение цены в контрольной выборке:\", val_data['price'].std())\n",
"print(\"Стандартное отклонение цены в тестовой выборке:\", test_data['price'].std())\n",
"print()\n",
"\n",
"# Проверка распределений по количеству объектов в диапазонах\n",
"print(\"Распределение по квартилам (обучающая):\")\n",
"print(train_data['price'].quantile([0.25, 0.5, 0.75]))\n",
"print()\n",
"print(\"Распределение по квартилам (контрольная):\")\n",
"print(val_data['price'].quantile([0.25, 0.5, 0.75]))\n",
"print()\n",
"print(\"Распределение по квартилам (тестовая):\")\n",
"print(test_data['price'].quantile([0.25, 0.5, 0.75]))\n",
"\n",
"# Построение гистограмм для каждой выборки\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"sns.histplot(train_data['price'], color='blue', label='Train', kde=True)\n",
"sns.histplot(val_data['price'], color='green', label='Validation', kde=True)\n",
"sns.histplot(test_data['price'], color='red', label='Test', kde=True)\n",
"\n",
"plt.legend()\n",
"plt.xlabel('price')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Распределение цены в обучающей, контрольной и тестовой выборках')\n",
"plt.show()\n",
"\n",
"\n",
"#9. Оценить сбалансированность выборок для каждого набора данных. Оценить необходимость использования методов приращения (аугментации) данных. \n",
"#Выводы по сбалансированности\n",
"#Если распределение классов примерно равно (например, 50%/50%), выборка считается сбалансированной, и аугментация данных не требуется.\n",
"#Если один из классов сильно доминирует (например, 90%/10%), выборка несбалансированная, и может потребоваться аугментация данных.\n",
"\n",
"#Выборки оказались недостаточно сбалансированными. Используем методы приращения данных с избытком и с недостатком:\n",
"\n",
"\n",
"#10. Выполнить приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling). Должны быть представлены примеры реализации обоих методов для выборок набора данных. \n",
"\n",
"#10.1\n",
"#Аугментация данных методом оверсемплинга¶\n",
"#Этот метод увеличивает количество примеров меньшинства.\n",
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"def oversample(df, target_column):\n",
" X = df.drop(target_column, axis=1)\n",
" y = df[target_column]\n",
" \n",
" oversampler = RandomOverSampler(random_state=42)\n",
" x_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1) \n",
" return resampled_df\n",
"\n",
"def undersample(df, target_column):\n",
" X = df.drop(target_column, axis=1)\n",
" y = df[target_column]\n",
" \n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" x_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([x_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_oversampled = oversample(train_data, 'price')\n",
"val_df_oversampled = oversample(val_data, 'price')\n",
"test_df_oversampled = oversample(test_data, 'price')\n",
"\n",
"train_df_undersampled = undersample(train_data, 'price')\n",
"val_df_undersampled = undersample(val_data, 'price')\n",
"test_df_undersampled = undersample(test_data, 'price')\n",
"\n",
"# Построение гистограмм для каждой выборки\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"sns.histplot(train_df_undersampled['price'], color='blue', label='Train', kde=True)\n",
"sns.histplot(val_df_undersampled['price'], color='green', label='Validation', kde=True)\n",
"sns.histplot(test_df_undersampled['price'], color='red', label='Test', kde=True)\n",
"\n",
"plt.legend()\n",
"plt.xlabel('price')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Распределение цены в обучающей, контрольной и тестовой выборках (андерсемплинг)')\n",
"plt.show()\n",
"\n",
"# Построение гистограмм для каждой выборки\n",
"plt.figure(figsize=(12, 6))\n",
"\n",
"sns.histplot(train_df_oversampled['price'], color='blue', label='Train', kde=True)\n",
"sns.histplot(val_df_oversampled['price'], color='green', label='Validation', kde=True)\n",
"sns.histplot(test_df_oversampled['price'], color='red', label='Test', kde=True)\n",
"\n",
"plt.legend()\n",
"plt.xlabel('price')\n",
"plt.ylabel('Frequency')\n",
"plt.title('Распределение цены в обучающей, контрольной и тестовой выборках (оверсемплинг)')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Н а б о р данных \"Показатели сердечно-сосудистых заболеваний (за 2022 год)\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# 1. Анализ сведений о наборе данных\n",
"# Этот набор данных из Kaggle, вероятно, относится к исследованию диагностики диабета у женщин индийского происхождения. Н а б о р данных изначально был взят из Национального института диабета и болезней пищеварения и почек (Pima Indians Diabetes Database).\n",
"# \n",
"# Проблемная область\n",
"# Основная проблемная область — медицинская диагностика. Цель данных — предсказать наличие диабета у пациента на основе медицинских показателей (например, уровня глюкозы, давления, ИМТ) и других факторов.\n",
"# \n",
"# 2. Анализ содержимого набора данных\n",
"# Объекты наблюдения\n",
"# Объектами наблюдения являются медицинские записи пациентов (768 строк). Каждая строка представляет собой данные о состоянии конкретного пациента.\n",
"# \n",
"# Атрибуты объектов\n",
"# Pregnancies (количество беременностей) — числовой признак.\n",
"# Glucose (уровень глюкозы в крови) — числовой признак.\n",
"# BloodPressure (артериальное давление) — числовой признак.\n",
"# SkinThickness (толщина кожной складки) — числовой признак.\n",
"# Insulin (уровень инсулина) — числовой признак.\n",
"# BMI (индекс массы тела) — вещественный признак.\n",
"# DiabetesPedigreeFunction (генетическая предрасположенность к диабету) — вещественный признак.\n",
"# Age (возраст) — числовой признак.\n",
"# Outcome (наличие диабета) — целевой бинарный признак (1 — диабет, 0 — нет).\n",
"# \n",
"# Связи между объектами\n",
"# Объекты наблюдения независимы друг от друга, так как каждый пациент представлен в одном экземпляре.\n",
"# \n",
"# 3. Примеры бизнес-целей\n",
"# Улучшение диагностики диабета\n",
"# Создание инструмента для раннего выявления диабета на основе медицинских данных.\n",
"# Эффект: Снижение затрат на лечение поздних стадий диабета, повышение уровня здоровья пациентов.\n",
"# \n",
"# Оптимизация медицинских услуг\n",
"# Предсказание риска диабета для отдельных групп населения.\n",
"# Эффект: Перераспределение ресурсов на профилактические меры и профилактические программы.\n",
"# \n",
"# Поддержка медицинских решений\n",
"# Автоматизированная рекомендация дополнительных обследований или консультаций для пациентов с высоким риском.\n",
"# Эффект: Ускорение процесса диагностики, уменьшение нагрузки на врачей.\n",
"# \n",
"# 4. Примеры целей технического проекта\n",
"# Цель 1: Улучшение диагностики диабета\n",
"# Вход: В с е признаки, кроме Outcome.\n",
"# Целевой признак: Outcome (наличие диабета).\n",
"# Результат: Модель классификации, которая определяет, есть ли у пациента диабет.\n",
"# Цель 2: Оптимизация медицинских услуг\n",
"# Вход: Социально-демографические данные (возраст, количество беременностей) и факторы здоровья (ИМТ, глюкоза).\n",
"# Целевой признак: Вероятность (риск) наличия диабета.\n",
"# Результат: Отчет о вероятности диабета для различных групп пациентов.\n",
"# Цель 3: Поддержка медицинских решений\n",
"# Вход: В с е признаки набора данных.\n",
"# Целевой признак: Рекомендации для дальнейших действий (например, \"провести анализ инсулина\", \"назначить консультацию эндокринолога\").\n",
"# Результат: Система поддержки принятия решений для врачей.\n",
"# "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Pregnancies</th>\n",
" <th>Glucose</th>\n",
" <th>BloodPressure</th>\n",
" <th>SkinThickness</th>\n",
" <th>Insulin</th>\n",
" <th>BMI</th>\n",
" <th>DiabetesPedigreeFunction</th>\n",
" <th>Age</th>\n",
" <th>Outcome</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>148</td>\n",
" <td>72</td>\n",
" <td>35</td>\n",
" <td>0</td>\n",
" <td>33.6</td>\n",
" <td>0.627</td>\n",
" <td>50</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>85</td>\n",
" <td>66</td>\n",
" <td>29</td>\n",
" <td>0</td>\n",
" <td>26.6</td>\n",
" <td>0.351</td>\n",
" <td>31</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8</td>\n",
" <td>183</td>\n",
" <td>64</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23.3</td>\n",
" <td>0.672</td>\n",
" <td>32</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>89</td>\n",
" <td>66</td>\n",
" <td>23</td>\n",
" <td>94</td>\n",
" <td>28.1</td>\n",
" <td>0.167</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>137</td>\n",
" <td>40</td>\n",
" <td>35</td>\n",
" <td>168</td>\n",
" <td>43.1</td>\n",
" <td>2.288</td>\n",
" <td>33</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \\\n",
"0 6 148 72 35 0 33.6 \n",
"1 1 85 66 29 0 26.6 \n",
"2 8 183 64 0 0 23.3 \n",
"3 1 89 66 23 94 28.1 \n",
"4 0 137 40 35 168 43.1 \n",
"\n",
" DiabetesPedigreeFunction Age Outcome \n",
"0 0.627 50 1 \n",
"1 0.351 31 0 \n",
"2 0.672 32 1 \n",
"3 0.167 21 0 \n",
"4 2.288 33 1 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/diabetes.csv')\n",
2024-12-05 20:21:14 +04:00
"df.head()\n",
"#df.select_dtypes(include=np.number).columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество пропущенных значений в каждом столбце:\n",
"Pregnancies 0\n",
"Glucose 0\n",
"BloodPressure 0\n",
"SkinThickness 0\n",
"Insulin 0\n",
"BMI 0\n",
"DiabetesPedigreeFunction 0\n",
"Age 0\n",
"Outcome 0\n",
"dtype: int64\n",
"Количество выбросов в столбце 'Pregnancies': 4\n",
"Количество выбросов в столбце 'Glucose': 5\n",
"Количество выбросов в столбце 'BloodPressure': 45\n",
"Количество выбросов в столбце 'SkinThickness': 1\n",
"Количество выбросов в столбце 'Insulin': 34\n",
"Количество выбросов в столбце 'BMI': 19\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABdEAAAPeCAYAAADj01PlAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXhU5f3+8XtmksxkD9kTCBAW2TcBEXEXRRSVqvVHRUVrpVXUulQrrbigFXeplLq0/aK2WrspVVQQBLUKIqDs+x5C9pB9nzm/P8KMBAgkYSZnJvN+XddcmjNnztyThefMZ57zeSyGYRgCAAAAAAAAAADHsJodAAAAAAAAAAAAf0URHQAAAAAAAACAZlBEBwAAAAAAAACgGRTRAQAAAAAAAABoBkV0AAAAAAAAAACaQREdAAAAAAAAAIBmUEQHAAAAAAAAAKAZFNEBAAAAAAAAAGgGRXQAAAAAAAAAAJpBER0dTvfu3XXzzTebHaPDe+6559SjRw/ZbDYNHTrU7Dgdyueffy6LxaLPP//c7CgAEFA4B2gfvjwHOP/883X++ed79ZgAgLZhXG0fvhhXzXxP+dhjj8lisbT78wK+RhEdfu2NN96QxWLR6tWrj3v/+eefr4EDB57y83z88cd67LHHTvk4weLTTz/Vgw8+qDFjxmjevHl66qmnmt335ptvlsVi8dxiYmI0ZMgQvfDCC6qtrW3H1ACAQMI5gH9qzTmA2//+9z9dd9116ty5s8LCwhQbG6tRo0Zp5syZysvLa4fUAADGVf90Ku+tQ0JClJGRoUmTJmnz5s3tmLr1unfv3iR7cnKyzjnnHL3//vtmRwNaLMTsAIC3bdu2TVZr6z4f+vjjjzV37lwG+xZaunSprFar/vKXvygsLOyk+9vtdv35z3+WJJWUlOg///mPfvWrX2nVqlV69913fR034Jx77rmqrq5u0fcWAPADzgF8r7XnAI888oieeOIJ9ejRQzfffLN69OihmpoarVmzRi+88ILefPNN7dq1qx2SAwBai3HV907lvXVDQ4N27dqlV199VQsXLtTmzZuVnp7u68htNnToUN1///2SpIMHD+q1117T1VdfrVdeeUW/+MUvTE4HnBxFdHQ4drvd7AitVllZqcjISLNjtFh+fr7Cw8NbXOQNCQnRDTfc4Pn6jjvu0KhRo/SPf/xDL7744nEHesMwVFNTo/DwcK/lDhRWq1UOh8PsGAAQcDgH8L3WnAP84x//0BNPPKHrrrtOf/3rX495zEsvvaSXXnrJV1EBAKeIcdX3TvW9tSSdeeaZmjBhgj766CPddtttvojpFZ07d26S/aabblKvXr300ksvNVtEb2hokMvlCqgJZoH2O4iWo50LOpyj+7bV19fr8ccfV+/eveVwOJSQkKCzzz5bixcvltR4SdTcuXMlqcnlRW6VlZW6//77lZGRIbvdrj59+uj555+XYRhNnre6ulp33323EhMTFR0drSuvvFLZ2dmyWCxNPoV39wfbvHmzrr/+enXq1Elnn322JGn9+vWeWVoOh0Opqan66U9/qqKioibP5T7G9u3bdcMNNyg2NlZJSUmaMWOGDMNQVlaWrrrqKsXExCg1NVUvvPBCi753DQ0NeuKJJ9SzZ0/Z7XZ1795dv/nNb5q0XbFYLJo3b54qKys936s33nijRcd3s1qtnn6ne/fuldT4c5swYYIWLVqkESNGKDw8XK+99pqkxtnr99xzj+dn0KtXLz3zzDNyuVxNjltUVKQbb7xRMTExiouL05QpU7Ru3bpjMt58882KiopSdna2Jk6cqKioKCUlJelXv/qVnE5nk2M+//zzOuuss5SQkKDw8HANHz5c//73v495TRaLRXfeeafmz5+vgQMHym63a8CAAVq4cOEx+2ZnZ+vWW29Venq67Ha7MjMzdfvtt6uurk5S8/3rVq5cqUsvvVSxsbGKiIjQeeedp6+//rrJPuXl5brnnnvUvXt32e12JScn6+KLL9Z333130p8LAAQ6zgH86xzgkUceUWJiYrOz62JjY086U9HdfsB9vuB2orHysssuU6dOnRQZGanBgwfr97//fZN9li5dqnPOOUeRkZGKi4vTVVddpS1btjTZp6XjaUvGZgAIVIyr/jWuNic1NVVSY4H9ZP71r39p+PDhCg8PV2Jiom644QZlZ2cfs19LxkpJ+uqrrzRy5Eg5HA717NnT8x6+pbn79eunPXv2SGqsDVgsFj3//POaPXu253vnblWzdetWXXvttYqPj5fD4dCIESP0wQcfNDnmyX5HJSk3N1e33HKLunTpIrvdrrS0NF111VVNzjWO/l1zO/pvwn2e8sUXX+iOO+5QcnKyunTp4rn/k08+8Xwfo6Ojdfnll2vTpk0t/h7BvzATHQGhtLRUhYWFx2yvr68/6WMfe+wxzZo1Sz/72c90xhlnqKysTKtXr9Z3332niy++WD//+c918OBBLV68WH/961+bPNYwDF155ZVatmyZbr31Vg0dOlSLFi3SAw88oOzs7Cazp26++Wb985//1I033qgzzzxTX3zxhS6//PJmc/34xz9W79699dRTT3lOGhYvXqzdu3frlltuUWpqqjZt2qTXX39dmzZt0jfffHPM4hz/7//9P/Xr109PP/20PvroIz355JOKj4/Xa6+9pgsvvFDPPPOM3n77bf3qV7/SyJEjde65557we/Wzn/1Mb775pq699lrdf//9WrlypWbNmqUtW7Z4epX99a9/1euvv65vv/3WcxnZWWedddKfw9Hcl24nJCR4tm3btk0/+clP9POf/1y33Xab+vTpo6qqKp133nnKzs7Wz3/+c3Xt2lXLly/X9OnTlZOTo9mzZ0uSXC6XrrjiCn377be6/fbb1bdvX/33v//VlClTjvv8TqdT48aN06hRo/T8889ryZIleuGFF9SzZ0/dfvvtnv1+//vf68orr9TkyZNVV1end999Vz/+8Y+1YMGCY36+X331ld577z3dcccdio6O1ssvv6xrrrlG+/fv97zOgwcP6owzzlBJSYmmTp2qvn37Kjs7W//+979VVVXV7CfsS5cu1fjx4zV8+HA9+uijslqtmjdvni688EL973//0xlnnCFJ+sUvfqF///vfuvPOO9W/f38VFRXpq6++0pYtW3T66ae3+ucEAGbjHCAwzwG2b9+u7du362c/+5mioqJO+NzesnjxYk2YMEFpaWn65S9/qdTUVG3ZskULFizQL3/5S0nSkiVLNH78ePXo0UOPPfaYqqurNWfOHI0ZM0bfffedunfvLqll42lLx2YA8CeMq4E5rh7J/fNzOp3avXu3fv3rXyshIUETJkw44ePeeOMN3XLLLRo5cqRmzZqlvLw8/f73v9fXX3+t77//XnFxcZJaPlZu2LBBl1xyiZKSkvTYY4+poaFBjz76qFJSUk76GqTG37msrKwmNQFJmjdvnmpqajR16lTZ7XbFx8dr06ZNGjNmjDp37qyHHnpIkZGR+uc//6mJEyfqP//5j370ox9JOvnvqCRdc8012rRpk+666y51795d+fn5Wrx4sfbv3+95ba11xx13KCkpSY888ogqKyslNf58p0yZonHjxumZZ55RVVWVXnnlFZ199tn6/vvv2/xcMJEB+LF58+YZkk54GzBgQJPHdOvWzZgyZYrn6yFDhhiXX375CZ9n2rRpxvH+HObPn29IMp588skm26+99lrDYrEYO3fuNAzDMNasWWNIMu65554m+918882GJOPRRx/1bHv00UcNScZPfvKTY56vqqrqmG1///vfDUnGl19+ecwxpk6d6tnW0NBgdOnSxbBYLMbTTz/t2X7o0CEjPDy8yffkeNauXWtIMn72s5812f6rX/3KkGQsXbrUs23KlClGZGTkCY939L4FBQVGQUGBsXPnTuOpp54yLBaLMXjwYM9+3bp1MyQZCxcubPL4J554woiMjDS2b9/eZPtDDz1k2Gw2Y//+/YZhGMZ//vMfQ5Ixe/Zszz5Op9O48MILDUnGvHnzmmSSZMycObPJMYcNG2YMHz68ybajfyZ1dXXGwIEDjQsvvLDJdklGWFi
"text/plain": [
"<Figure size 1500x1000 with 6 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"date 0\n",
"price 0\n",
"bedrooms 0\n",
"bathrooms 0\n",
"sqft_living 0\n",
"sqft_lot 0\n",
"floors 0\n",
"waterfront 0\n",
"view 0\n",
"condition 0\n",
"grade 0\n",
"sqft_above 0\n",
"sqft_basement 0\n",
"yr_built 0\n",
"yr_renovated 0\n",
"zipcode 0\n",
"lat 0\n",
"long 0\n",
"sqft_living15 0\n",
"sqft_lot15 0\n",
"dtype: int64\n",
"Обучающая выборка: (537, 9)\n",
"Outcome\n",
"0 349\n",
"1 188\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAf8AAADECAYAAACROyhkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA3nUlEQVR4nO3dd1xT1/sH8E8SAmSwtwqCggu0Ko46ALci1tVK3Yq22trWWvutrd/Wbeu31bqtq9WiUvf6aR0odYtbUKooKCAiskEII5Cc3x80KSFhCl4gz/v1yku5OffcJ+fenOfOEx5jjIEQQggheoPPdQCEEEIIebMo+RNCCCF6hpI/IYQQomco+RNCCCF6hpI/IYQQomco+RNCCCF6hpI/IYQQomco+RNCCCF6hpI/IYS8BplMhvj4eGRkZHAdCqlh2dnZiI2NhUwm4zqUGkfJnxBCqmj//v3o27cvTExMIJVK4eTkhJ9++onrsOqFnJwcrF69Wv13ZmYmNmzYwF1AJTDGsGXLFrz99tsQi8UwNTWFi4sLdu3axXVoNa5Kyf/3338Hj8dTv4yNjdGiRQt8+umnSEpKqq0YCdEbV65cwYgRI2BnZwcjIyM4Oztj+vTpePbsWbXrzM3NxcKFC3H+/PmaC1SPffPNN/D394eJiQm2bt2KM2fO4OzZs5gxYwbXodULIpEI3333HYKCghAfH4+FCxfi2LFjXIcFABg7diw++ugjtG7dGjt37lSv25EjR3IdWs1jVbB9+3YGgC1evJjt3LmTbd26lU2aNInx+Xzm4uLCZDJZVaojhJSwdu1axuPxWPPmzdmSJUvYr7/+yr788ktmZmbGzMzM2JUrV6pVb0pKCgPAFixYULMB66Hz588zAGzZsmVch1KvrVixgvH5fAaAmZqaskuXLnEdEgsMDGQ8Ho8FBQVxHcobUa3kf/PmTY3ps2fPZgDYH3/8UaPBEaIvLl++zPh8PvPy8tLaiY6OjmZ2dnbMwcGBpaenV7luSv41Z8iQIax79+5ch9EgxMfHs6tXr7KMjAyuQ2GMMebh4cHGjh3LdRhvTI1c8+/Tpw8AICYmBgCQnp6O//znP2jbti2kUilMTU3h6+uL8PBwrXnz8/OxcOFCtGjRAsbGxnBwcMDIkSPx5MkTAEBsbKzGpYbSr169eqnrOn/+PHg8Hvbu3Yv//ve/sLe3h0QiwdChQxEfH6+17OvXr2PQoEEwMzODWCyGj48Prly5ovMz9urVS+fyFy5cqFV2165d8PT0hEgkgqWlJUaPHq1z+eV9tpKUSiVWr14Nd3d3GBsbw87ODtOnT9e6wcjZ2RlDhgzRWs6nn36qVaeu2JcvX67VpgBQUFCABQsWwNXVFUZGRnB0dMScOXNQUFCgs61K6tWrl1Z933//Pfh8Pv74449qtceKFSvQvXt3WFlZQSQSwdPTEwcOHNC5/F27dqFLly4Qi8WwsLCAt7c3goODNcqcPHkSPj4+MDExgampKTp37qwV2/79+9Xr1NraGuPHj0dCQoJGmcmTJ2vEbGFhgV69euHSpUsVttOSJUvA4/EQGBgIsVis8V7z5s3x008/ITExEZs3b1ZP19W2qjicnZ0BFLepjY0NAGDRokU6t9vIyEj4+/vDxsYGIpEILVu2xLfffqtR5927d+Hr6wtTU1NIpVL07dsX165d0yijuix4+fJlzJw5EzY2NjA3N8f06dMhl8uRmZmJiRMnwsLCAhYWFpgzZw5YqR8Vrey2rsvrtD8A/PXXX/Dy8oJEIoG5uTmGDRuGhw8fapS5du0aPDw8MHr0aFhaWkIkEqFz5844cuSIukxOTg4kEgk+//xzrWU8f/4cAoEAy5YtU8esWlcllV5HcXFxmDFjBlq2bAmRSAQrKyuMGjUKsbGxGvOp+sCSl3hu3ryJ/v37w8TEBBKJRGebqNbdrVu31NNSU1N19hNDhgzRGXNl+tOFCxeqv89NmjRBt27dYGBgAHt7e624dVHNr3qZmJigS5cuGu0PFH83PDw8yqxH1df8/vvvAIpv2oyIiICjoyP8/PxgampaZlsBwNOnTzFq1ChYWlpCLBbj7bffxp9//qlRpir5qCr9ZFXyVnkMqjyHDqpEbWVlBaC4YY4cOYJRo0bBxcUFSUlJ2Lx5M3x8fPDgwQM0atQIAKBQKDBkyBCEhIRg9OjR+Pzzz5GdnY0zZ84gIiICzZs3Vy9jzJgxGDx4sMZy586dqzOe77//HjweD19//TWSk5OxevVq9OvXD2FhYRCJRACKv+i+vr7w9PTEggULwOfzsX37dvTp0weXLl1Cly5dtOpt0qSJ+kubk5ODjz/+WOey582bB39/f3zwwQdISUnBunXr4O3tjbt378Lc3FxrnmnTpsHLywsAcOjQIRw+fFjj/enTp+P3339HQEAAZs6ciZiYGKxfvx53797FlStXIBQKdbZDVWRmZqo/W0lKpRJDhw7F5cuXMW3aNLRu3Rr379/HqlWr8PjxY60vXUW2b9+O7777Dj///DPGjh2rs0xF7bFmzRoMHToU48aNg1wux549ezBq1CgcP34cfn5+6nKLFi3CwoUL0b17dyxevBiGhoa4fv06/vrrLwwYMABAcac3ZcoUuLu7Y+7cuTA3N8fdu3dx6tQpdXyqtu/cuTOWLVuGpKQkrFmzBleuXNFap9bW1li1ahWA4o5+zZo1GDx4MOLj43Wue6D4mnxISAi8vLzg4uKis8z777+PadOm4fjx4/jmm28qbuh/2NjYYOPGjfj4448xYsQI9bXLdu3aAQDu3bsHLy8vCIVCTJs2Dc7Oznjy5AmOHTuG77//HgDw999/w8vLC6amppgzZw6EQiE2b96MXr164cKFC+jatavGMj/77DPY29tj0aJFuHbtGrZs2QJzc3NcvXoVTk5O+OGHH3DixAksX74cHh4emDhxonre193Wq9P+AHD27Fn4+vqiWbNmWLhwIfLy8rBu3Tr06NEDd+7cUSe7tLQ0bNmyBVKpVL2Ds2vXLowcORJBQUEYM2YMpFIpRowYgb1792LlypUQCATq5ezevRuMMYwbN65yK/AfN2/exNWrVzF69Gg0adIEsbGx2LhxI3r16oUHDx5o7TCqREdHo1evXhCLxfjqq68gFouxdetW9OvXD2fOnIG3t3eV4ihLdfpTlZ9//rnK94zt3LkTQPEOyi+//IJRo0YhIiICLVu2rFb8aWlpAIAff/wR9vb2+Oqrr2BsbKyzrZKSktC9e3fk5uZi5syZsLKyQmBgIIYOHYoDBw5gxIgRGnVXJh+VVlY/+TrtrKUqpwlUp/3Pnj3LUlJSWHx8PNuzZw+zsrJiIpGIPX/+nDHGWH5+PlMoFBrzxsTEMCMjI7Z48WL1tG3btjEAbOXKlVrLUiqV6vkAsOXLl2uVcXd3Zz4+Puq/z507xwCwxo0bs1evXqmn79u3jwFga9asUdft5ubGBg4cqF4OY4zl5uYyFxcX1r9/f61lde/enXl4eKj/1nUqNTY2lgkEAvb9999rzHv//n1mYGCgNT0qKooBYIGBgeppCxYsYCVXy6VLlxgAretQp06d0pretGlT5ufnpxX7J598wkqv6tKxz5kzh9na2jJPT0+NNt25cyfj8/la1+Q2bdrEAFR4HdrHx0dd359//skMDAzYl19+qbNsZdqDseL1VJJcLmceHh6sT58+GnXx+Xw2YsQIrW1Rtc4zMzOZiYkJ69q1K8vLy9NZRi6XM1tbW+bh4aFR5vjx4wwAmz9/vnrapEmTWNOmTTXq2bJlCwPAbty4ofMzM8ZYWFgYA8A+//zzMsswxli7du2YpaWl+u+SbVtS6TjKO+3v7e3NTExMWFxcnMb0kt+L4cOHM0NDQ/bkyRP1tBcvXjATExPm7e2tnqbqH0p/r7p168Z4PB776KOP1NOKiopYkyZNNOKvyrauS3XbnzHG2rdvz2xtbVlaWpp6Wnh4OOPz+WzixInqaQAYAHb+/Hn1tNzcXNa6dWtmb2/P5HI5Y4yx06dPMwDs5MmTGstp166dxmcOCAhgTk5OWvGUXl+lt3nGGAsNDWUA2I4dO9TTVH3
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Контрольная выборка: (115, 9)\n",
"Outcome\n",
"0 78\n",
"1 37\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAgsAAADECAYAAAARfmKGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA4d0lEQVR4nO3dZ3xTZd8H8F+StummLZ1AoaXssisgewqUKrJRkI0UEBBBeQTFMkRUvBEERETZKHvcIpaN7E0ZUqCUllXopKV0psn1vOid2JA0HbScjt/388mLnFznOv9zcs7JP9dIZEIIASIiIqJcyKUOgIiIiEo2JgtERERkEpMFIiIiMonJAhEREZnEZIGIiIhMYrJAREREJjFZICIiIpOYLBAREZFJTBaIiIiKgEajQVxcHO7evSt1KEWOyQIREZVoJ06cwNGjR3XPjx49ipMnT0oXUA5PnjzB5MmTUa1aNVhYWMDFxQX16tXDs2fPpA6tSBUoWVizZg1kMpnuYWlpiVq1amHChAmIjo4urhiJyo2TJ0+id+/ecHNzg1KphJeXFwIDA3H//v1C15mamopZs2bp3WyJSpMHDx5g/PjxuHbtGq5du4bx48fjwYMHUoeFO3fuoFmzZti0aRMCAwOxZ88eHDhwAIcOHYKNjY3U4RUps8KsNGfOHHh7eyM9PR0nTpzA8uXLsXfvXly/fh3W1tZFHSNRubBkyRJ8+OGHqF69OiZOnAgPDw+Ehobil19+webNm7F37160atWqwPWmpqZi9uzZAIAOHToUcdRExa9Pnz5YtGgRGjZsCABo2bIl+vTpI3FUQGBgICwsLHDmzBlUrlxZ6nCKVaGSBX9/f7z22msAgNGjR6NixYpYuHAhdu/ejXfffbdIAyQqD06ePInJkyejTZs2CA4O1ku6x40bh9atW6Nfv374559/4OjoKGGkRK+eUqnEqVOncP36dQBA/fr1oVAoJI3p4sWLOHz4MPbv31/mEwWgiMYsdOrUCQAQEREBAEhISMDHH3+MBg0awNbWFvb29vD398eVK1cM1k1PT8esWbNQq1YtWFpawsPDA3369EF4eDgAIDIyUq/r48VHzm9KR48ehUwmw+bNmzFjxgy4u7vDxsYGPXv2NNpkdfbsWXTv3h0VKlSAtbU12rdvn2s/WIcOHYxuf9asWQZlN2zYAD8/P1hZWcHJyQnvvPOO0e2b2recNBoNFi1aBF9fX1haWsLNzQ2BgYF4+vSpXjkvLy+8+eabBtuZMGGCQZ3GYl+wYIHBMQWAjIwMBAUFoUaNGlAqlfD09MS0adOQkZFh9Fjl1KFDB4P65s2bB7lcjt9++61Qx+O7775Dq1atULFiRVhZWcHPzw/btm0zuv0NGzagefPmsLa2hqOjI9q1a4f9+/frlfnrr7/Qvn172NnZwd7eHs2aNTOIbevWrbr31NnZGe+99x4ePXqkV2b48OF6MTs6OqJDhw44fvx4nsdp7ty5kMlkWLt2rUHrnI+PD7799ls8fvwYK1as0C03dmy1cXh5eQHIPqYuLi4AgNmzZxs9b2/evIkBAwbAxcUFVlZWqF27Nj777DO9Oi9fvgx/f3/Y29vD1tYWnTt3xpkzZ/TKaLspT5w4gUmTJsHFxQUODg4IDAxEZmYmEhMTMXToUDg6OsLR0RHTpk3Di396m99z3ZiXOf45j5nWhg0bIJfL8fXXX+stP3z4MNq2bQsbGxs4ODjg7bffRmhoqF6ZWbNmQSaTIS4uTm/5hQsXIJPJsGbNGqMxG3tERkYC+Pf63r9/Pxo3bgxLS0vUq1cPO3bsMNifu3fvon///nBycoK1tTVef/11/Pnnn/k6bsbOkeHDh8PW1jbP41iQe1BWVhbmzp0LHx8fXZfbjBkzDO4rXl5eGD58OBQKBRo1aoRGjRphx44dkMlkBu9ZbjFp90kul8Pd3R0DBw7U69rT3nu+++67XOvRvqdaZ86cgaWlJcLDw+Hr6wulUgl3d3cEBgYiISHBYP383kNsbW1x9+5ddOvWDTY2NqhUqRLmzJmjd61o49WeRwCQnJwMPz8/eHt74/Hjx7rlL3NN5VSoloUXaT/YK1asCCD7RN21axf69+8Pb29vREdHY8WKFWjfvj1u3LiBSpUqAQDUajXefPNNHDp0CO+88w4+/PBDJCcn48CBA7h+/Tp8fHx023j33XfRo0cPve1Onz7daDzz5s2DTCbD//3f/yEmJgaLFi1Cly5dEBISAisrKwDZF7y/vz/8/PwQFBQEuVyO1atXo1OnTjh+/DiaN29uUG+VKlUwf/58AMDz588xbtw4o9ueOXMmBgwYgNGjRyM2NhZLlixBu3btcPnyZTg4OBisM2bMGLRt2xYAsGPHDuzcuVPv9cDAQKxZswYjRozApEmTEBERgaVLl+Ly5cs4efIkzM3NjR6HgkhMTNTtW04ajQY9e/bEiRMnMGbMGNStWxfXrl3D999/j9u3b2PXrl0F2s7q1avx+eef4z//+Q8GDRpktExex2Px4sXo2bMnBg8ejMzMTGzatAn9+/fHnj17EBAQoCs3e/ZszJo1C61atcKcOXNgYWGBs2fP4vDhw+jatSuA7A+4kSNHwtfXF9OnT4eDgwMuX76M4OBgXXzaY9+sWTPMnz8f0dHRWLx4MU6ePGnwnjo7O+P7778HADx8+BCLFy9Gjx498ODBA6PvPZDdTXDo0CG0bdsW3t7eRssMHDgQY8aMwZ49e/Dpp5/mfaD/x8XFBcuXL8e4cePQu3dvXdOttjn36tWraNu2LczNzTFmzBh4eXkhPDwcf/zxB+bNmwcA+Oeff9C2bVvY29tj2rRpMDc3x4oVK9ChQwf8/fffaNGihd42J06cCHd3d8yePRtnzpzBzz//DAcHB5w6dQpVq1bFV199hb1792LBggWoX78+hg4dqlv3Zc/1whx/Y/bv34+RI0diwoQJesf74MGD8Pf3R/Xq1TFr1iykpaVhyZIlaN26NS5dupSvD6+cAgMD0aVLF93zIUOG6L1PAHTJHgCEhYVh4MCBGDt2LIYNG4bVq1ejf//+CA4OxhtvvAEAiI6ORqtWrZCamopJkyahYsWKWLt2LXr27Ilt27ahd+/eBnHkPG7aOIrb6NGjsXbtWvTr1w9Tp07F2bNnMX/+fISGhhpc8zllZWUZJLN5adu2LcaMGQONRoPr169j0aJFiIqKylcimZv4+Hikp6dj3Lhx6NSpE8aOHYvw8HAsW7YMZ8+exdmzZ6FUKgEU7B6iVqvRvXt3vP766/j2228RHByMoKAgZGVlYc6cOUZjUalU6Nu3L+7fv4+TJ0/Cw8ND91qRfX6IAli9erUAIA4ePChiY2PFgwcPxKZNm0TFihWFlZWVePjwoRBCiPT0dKFWq/XWjYiIEEqlUsyZM0e3bNWqVQKAWLhwocG2NBqNbj0AYsGCBQZlfH19Rfv27XXPjxw5IgCIypUri2fPnumWb9myRQAQixcv1tVds2ZN0a1bN912hBAiNTVVeHt7izfeeMNgW61atRL169fXPY+NjRUARFBQkG5ZZGSkUCgUYt68eXrrXrt2TZiZmRksDwsLEwDE2rVrdcuCgoJEzrfl+PHjAoDYuHGj3rrBwcEGy6tVqyYCAgIMYv/ggw/Ei2/1i7FPmzZNuLq6Cj8/P71jun79eiGXy8Xx48f11v/pp58EAHHy5EmD7eXUvn17XX1//vmnMDMzE1OnTjVaNj/HQ4js9ymnzMxMUb9+fdGpUye9uuRyuejdu7fBuah9zxMTE4WdnZ1o0aKFSEtLM1omMzNTuLq6ivr16+uV2bNnjwAgvvjiC92yYcOGiWrVqunV8/PPPwsA4ty5c0b3WQghQkJCBADx4Ycf5lpGCCEaNmwonJycdM9zHtucXozD2Lmq1a5dO2FnZyfu3buntzznddGrVy9hYWEhwsPDdcuioqKEnZ2daNeunW6Z9v7w4nXVsmVLIZPJxNixY3XLsrKyRJUqVfTiL8i5bkxhj/+L6164cEHY2tqK/v37G5w7jRs3Fq6uriI+Pl637MqVK0Iul4uhQ4fqlmnP29jYWL31z58/LwC
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Тестовая выборка: (116, 9)\n",
"Outcome\n",
"0 73\n",
"1 43\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAADECAYAAAC7i9nLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA0/klEQVR4nO3dd1xT1/sH8E8SEvbeKgqCgIo4cPwUGVoVcVWtpe6taNXaar+0trXu2tZW69a2rmqHq2KtuEfdk+FCRAUHsgVkJyTn9wdNyiUBAYGE5Hm/Xr5abk7OeXJyc5+cc8+94THGGAghhBCicfjqDoAQQgghqlGSJoQQQjQUJWlCCCFEQ1GSJoQQQjQUJWlCCCFEQ1GSJoQQQjQUJWlCCCFEQ1GSJoQQQjQUJWlCCCENGmMML1++RHx8vLpDqXWUpAkhhKh0584dhIeHK/6Ojo7G4cOH1RdQGbm5ufjiiy/g4eEBkUgEa2truLu7Iy4uTt2h1apqJent27eDx+Mp/hkYGMDd3R0zZ85EampqXcVIiM64ePEihgwZAnt7e+jr68PZ2RmhoaF4+vRpjessKCjAwoULcfbs2doLlOiE3NxchIaG4sqVK4iPj8fs2bNx+/ZtdYeFzMxMdO3aFWvWrMGwYcNw8OBBnDhxAmfPnoWzs7O6w6tVejV50uLFi+Hi4oKioiJcuHABGzduREREBO7cuQMjI6PajpEQnbB27VrMnj0bzZs3x6xZs+Do6IjY2Fj8/PPP2L17NyIiItCtW7dq11tQUIBFixYBAAIDA2s5aqLNunbtqvgHAO7u7pgyZYqaowL+97//ITk5GZcvX0br1q3VHU6dqlGSDg4ORseOHQEAkydPhrW1NVauXImDBw9ixIgRtRogIbrg4sWL+PDDD9G9e3ccPXqU82V3+vTp8PX1xbBhw3D37l1YWlqqMVKia8LDw3Hv3j0UFhaiTZs2EIlEao0nLS0NO3bswKZNm7Q+QQO1dE66Z8+eAICEhAQAwMuXL/Hxxx+jTZs2MDExgZmZGYKDgxETE6P03KKiIixcuBDu7u4wMDCAo6Mjhg4dikePHgEAEhMTOVPs5f+VHRmcPXsWPB4Pu3fvxmeffQYHBwcYGxtj0KBBePbsmVLbV69eRd++fWFubg4jIyMEBATg4sWLKl9jYGCgyvYXLlyoVHbXrl3w8fGBoaEhrKysMHz4cJXtV/baypLJZPjhhx/QunVrGBgYwN7eHqGhocjKyuKUc3Z2xoABA5TamTlzplKdqmJfsWKFUp8CQHFxMRYsWAA3Nzfo6+vDyckJYWFhKC4uVtlXZQUGBirVt2zZMvD5fPz222816o/vvvsO3bp1g7W1NQwNDeHj44N9+/apbH/Xrl3o3LkzjIyMYGlpCX9/fxw/fpxT5siRIwgICICpqSnMzMzQqVMnpdj27t2reE9tbGwwevRoJCUlccqMHz+eE7OlpSUCAwNx/vz51/bTkiVLwOPxsGPHDqXZKFdXV3z77bdITk7G5s2bFdtV9a08DvmUX2JiImxtbQEAixYtUrnf3r9/HyEhIbC1tYWhoSE8PDzw+eefc+qMiopCcHAwzMzMYGJigrfeegtXrlzhlJGfDrtw4QI++OAD2NrawsLCAqGhoRCLxcjOzsbYsWNhaWkJS0tLhIWFofyP8FV1X1elpv1f/nmq/iUmJirKHzlyBH5+fjA2NoapqSn69++Pu3fvKtVbWb8uXLjwtW2WPT1R2/vfhg0b0Lp1a+jr66NRo0aYMWMGsrOzOWXK7l+tWrWCj48PYmJiVH4mVSl/zLSxsUH//v1x584dTjkej4eZM2dWWI98v5K/B9evX4dMJoNYLEbHjh1hYGAAa2trjBgxQuVpodOnTyveLwsLC7z99tuIjY3llJG/H/L3zMzMDNbW1pg9ezaKioqU4i37+SkpKUG/fv1gZWWFe/fuccpWNRdUpkYj6fLkCdXa2hoA8PjxY4SHh+Pdd9+Fi4sLUlNTsXnzZgQEBODevXto1KgRAEAqlWLAgAE4deoUhg8fjtmzZyM3NxcnTpzAnTt34OrqqmhjxIgR6NevH6fdefPmqYxn2bJl4PF4+OSTT5CWloYffvgBvXr1QnR0NAwNDQGUvnHBwcHw8fHBggULwOfzsW3bNvTs2RPnz59H586dlept0qQJli9fDgDIy8vD9OnTVbY9f/58hISEYPLkyUhPT8fatWvh7++PqKgoWFhYKD1n6tSp8PPzAwD8+eefOHDgAOfx0NBQbN++HRMmTMAHH3yAhIQErFu3DlFRUbh48SKEQqHKfqiO7OxsxWsrSyaTYdCgQbhw4QKmTp2Kli1b4vbt21i1ahUePHjAWVRSFdu2bcMXX3yB77//HiNHjlRZ5nX9sXr1agwaNAijRo2CWCzGH3/8gXfffRd///03+vfvryi3aNEiLFy4EN26dcPixYshEolw9epVnD59Gn369AFQegCYOHEiWrdujXnz5sHCwgJRUVE4evSoIj5533fq1AnLly9HamoqVq9ejYsXLyq9pzY2Nli1ahUA4Pnz51i9ejX69euHZ8+eqXzvgdLp6FOnTsHPzw8uLi4qy7z33nuYOnUq/v77b3z66aev7+h/2draYuPGjZg+fTqGDBmCoUOHAgC8vb0BALdu3YKfnx+EQiGmTp0KZ2dnPHr0CIcOHcKyZcsAAHfv3oWfnx/MzMwQFhYGoVCIzZs3IzAwEP/88w+6dOnCaXPWrFlwcHDAokWLcOXKFfz444+wsLDApUuX0LRpU3z11VeIiIjAihUr4OXlhbFjxyqe+6b7ek36PzQ0FL169VL8PWbMGE5fyfsRAHbu3Ilx48YhKCgI33zzDQoKCrBx40Z0794dUVFRii9Hr+vXoUOHws3NTVH/Rx99hJYtW2Lq1KmKbS1btgRQ+/vfwoULsWjRIvTq1QvTp09HXFwcNm7ciOvXr7+2jz/55JNK+788T09PfP7552CM4dGjR1i5ciX69ev3RmssMjMzAZQOPnx8fPD1118jPT0da9aswYULFxAVFQUbGxsAwMmTJxEcHIzmzZtj4cKFKCwsxNq1a+Hr64vIyEil89chISFwdnbG8uXLceXKFaxZswZZWVn45ZdfKoxn8uTJOHv2LE6cOIFWrVopttckF6jEqmHbtm0MADt58iRLT09nz549Y3/88QeztrZmhoaG7Pnz54wxxoqKiphUKuU8NyEhgenr67PFixcrtm3dupUBYCtXrlRqSyaTKZ4HgK1YsUKpTOvWrVlAQIDi7zNnzjAArHHjxuzVq1eK7Xv27GEA2OrVqxV1t2jRggUFBSnaYYyxgoIC5uLiwnr37q3UVrdu3ZiXl5fi7/T0dAaALViwQLEtMTGRCQQCtmzZMs5zb9++zfT09JS2x8fHMwBsx44dim0LFixgZd+W8+fPMwDs119/5Tz36NGjStubNWvG+vfvrxT7jBkzWPm3unzsYWFhzM7Ojvn4+HD6dOfOnYzP57Pz589znr9p0yYGgF28eFGpvbICAgIU9R0+fJjp6emxuXPnqixblf5grPR9KkssFjMvLy/Ws2dPTl18Pp8NGTJEaV+Uv+fZ2dnM1NSUdenShRUWFqosIxaLmZ2dHfPy8uKU+fvvvxkA9uWXXyq2jRs3jjVr1oxTz48//sgAsGvXrql8zYwxFh0dzQCw2bNnV1iGMca8vb2ZlZWV4u+yfVtW+ThU7aty/v7+zNTUlD158oSzveznYvDgwUwkErFHjx4ptr148YKZmpoyf39/xTb58aH856pr166Mx+OxadOmKbaVlJSwJk2acOKvzr6uSk37v7yK+io3N5dZWFiwKVOmcLanpKQwc3Nzzvaq9GtZzZo1Y+PGjVPaXtv7X1paGhOJRKxPnz6cz8W6desYALZ161bFtvL7V0REBAPA+vbtq/SZVEXV/vnZZ58xACwtLU2xDQCbMWNGhfXI96uEhATO361ateIcC+TH/7LHl3bt2jE7OzuWmZmp2BYTE8P4fD4bO3asYpv8ODNo0CBO2++//z4DwGJiYjjxyvePefPmMYFAwMLDwznPq24uqEyNprt79eo
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после оверсемплинга: (677, 9)\n",
"Outcome\n",
"0 349\n",
"1 328\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAsMAAADECAYAAAB6FizTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+w0lEQVR4nO3dd3gUVdsG8Ht303sv1ITQCUUiICAJvUsHEYUQqgXRV5QPLFSVF0EEEWkKRHqJgALSEWnSW4QghARDCymkZ9P2fH/k3TWb3ZQNCbPJ3r/LXLizs2eeOTN75tmZM2dkQggBIiIiIiITJJc6ACIiIiIiqTAZJiIiIiKTxWSYiIiIiEwWk2EiIiIiMllMhomIiIjIZDEZJiIiIiKTxWSYiIiIiEwWk2EiIiIiMllMhomInkF6ejpiYmLw9OlTqUOhcpaamoro6Gikp6dLHQoRVSAmw0REBtq+fTu6dOkCe3t72NnZoVatWvjqq6+kDqtSSEtLw+LFizWvk5KSsGzZMukCKkAIgVWrVuGll16CjY0NHBwc4Ovriw0bNkgdGhFVIIOS4XXr1kEmk2n+rKysUL9+fUyaNAmxsbEVFSORyTh16hQGDhwIT09PWFpawsfHBxMnTsQ///xT5jIzMjIwa9Ys/P777+UXqAmbNm0ahg0bBnt7e6xevRqHDh3C4cOH8fbbb0sdWqVgbW2NTz/9FBs3bkRMTAxmzZqFX3/9VeqwAAAjRozAm2++iUaNGmH9+vWabTto0CCpQyOiCmRWlg/NmTMHvr6+UCqVOHnyJJYvX459+/YhPDwcNjY25R0jkUlYunQp3nvvPdSpUwfvvvsuvL29cfPmTfzwww/YunUr9u3bh3bt2hlcbkZGBmbPng0A6NixYzlHbVqOHz+O+fPnY968eZg2bZrU4VRKCoUCs2fPxqhRo6BSqeDg4IC9e/dKHRZ++uknbN26FRs2bMCIESOkDoeIniOZEEKUduZ169YhJCQE58+fx4svvqiZPmXKFCxatAibNm3Ca6+9ViGBElVlp06dQmBgINq3b4/9+/dr/aiMjIxE+/btIZfL8ddff8HZ2dmgsuPj4+Hu7o6ZM2di1qxZ5Ry5aXnllVeQmJiIU6dOSR1KpXf//n3ExMSgUaNGcHJykjocNG3aFM2aNcPGjRulDoWInrNy6TPcuXNnAEBUVBQAIDExER9++CGaNm0KOzs7ODg4oFevXrh69arOZ5VKJWbNmoX69evDysoK3t7eGDRoECIjIwEA0dHRWl0zCv8VPNP1+++/QyaTYevWrfj444/h5eUFW1tb9OvXDzExMTrLPnv2LHr27AlHR0fY2NggKCioyINcx44d9S5fX3KxYcMGBAQEwNraGi4uLhg+fLje5Re3bgWpVCosXrwYTZo0gZWVFTw9PTFx4kSdG3Z8fHzQt29fneVMmjRJp0x9sS9YsECnTgEgKysLM2fORN26dWFpaYmaNWti6tSpyMrK0ltXBXXs2FGnvC+++AJyuRybNm0qU30sXLgQ7dq1g6urK6ytrREQEIAdO3boXf6GDRvQunVr2NjYwNnZGYGBgTh48KDWPL/99huCgoJgb28PBwcHtGrVSie27du3a7apm5sb3njjDTx48EBrntGjR2vF7OzsjI4dO+LEiRMl1tPcuXMhk8kQGhqqc3XFz88PX331FR49eoSVK1dqpuurW3UcPj4+APLr1N3dHQAwe/ZsvfttREQEhg0bBnd3d1hbW6NBgwb45JNPtMq8fPkyevXqBQcHB9jZ2aFLly74888/teZRd6M6efIkJk+eDHd3dzg5OWHixInIzs5GUlISRo0aBWdnZzg7O2Pq1Kko/Fu8tPu6Ps9S/wBw9OhRdOjQAba2tnByckL//v1x8+ZNrXn+/PNP+Pv7Y/jw4XBxcYG1tTVatWqFXbt2aeZJS0uDra0t3nvvPZ1l3L9/HwqFAvPmzdPErN5WBRXeRvfu3cPbb7+NBg0awNraGq6urhg6dCiio6O1PqduAwt2iTl//jy6desGe3t72Nra6q0T9ba7cOGCZlp8fLzedqJv3756Yy5Nezpr1izN97lGjRpo27YtzMzM4OXlpRO3PurPq//s7e3RunVrrfoH8r8b/v7+RZajbmvWrVsHIP8myPDwcNSsWRN9+vSBg4NDkXUFAHfv3sXQoUPh4uICGxsbvPTSSzpntw05HhnSThpy3NIXT1F/o0ePNngdgZKP4WqFt11Ry33w4AHGjBmj6SrWpEkTrFmzpsT1A4Dc3FzMnTsXfn5+mm5mH3/8sc6xysfHR7N8uVwOLy8vvPrqqzrd0Qw99h48eBAtWrSAlZUVGjdujJ9//lknxqSkJPznP/+Bj48PLC0tUaNGDYwaNQrx8fGaeUp7zFWvQ8E++GoNGzaETCbDpEmTNNMKd3UtTT5TsK4K/hX8rvr4+Ohsx+3bt0Mmk2m1Ferv3cKFC3WW4+/vrzef0/d38uRJAKVvF0ujTN0kClPv9K6urgDyv0S7du3C0KFD4evri9jYWKxcuRJBQUG4ceMGqlWrBgDIy8tD3759ceTIEQwfPhzvvfceUlNTcejQIYSHh8PPz0+zjNdeew29e/fWWu706dP1xvPFF19AJpPh//7v//DkyRMsXrwYXbt2xZUrV2BtbQ0g/8DXq1cvBAQEYObMmZDL5Vi7di06d+6MEydOoHXr1jrl1qhRQ3MQS0tLw1tvvaV32Z999hmGDRuGcePGIS4uDkuXLkVgYCAuX76s9wzIhAkT0KFDBwDAzz//jJ07d2q9P3HiRM1Z+cmTJyMqKgrfffcdLl++jFOnTsHc3FxvPRgiKSlJs24FqVQq9OvXDydPnsSECRPQqFEjXL9+Hd988w3+/vtvnYNQSdauXYtPP/0UX3/9dZGXIkuqjyVLlqBfv354/fXXkZ2djS1btmDo0KHYs2cP+vTpo5lv9uzZmDVrFtq1a4c5c+bAwsICZ8+exdGjR9G9e3cA+Y3DmDFj0KRJE0yfPh1OTk64fPky9u/fr4lPXfetWrXCvHnzEBsbiyVLluDUqVM629TNzQ3ffPMNgPzEZ8mSJejduzdiYmKKPPuVkZGBI0eOoEOHDvD19dU7z6uvvooJEyZgz549Bl2ed3d3x/Lly/HWW29h4MCBmr6PzZo1AwBcu3YNHTp0gLm5OSZMmAAfHx9ERkbi119/xRdffAEA+Ouvv9ChQwc4ODhg6tSpMDc3x8qVK9GxY0ccP34cbdq00Vrmu+++Cy8vL8yePRt//vknVq1aBScnJ5w+fRq1atXCl19+iX379mHBggXw9/fHqFGjNJ991n29LPUPAIcPH0avXr1Qp04dzJo1C5mZmVi6dCnat2+PS5cuaRr0hIQErFq1CnZ2dpqEf8OGDRg0aBA2btyI1157DXZ2dhg4cCC2bt2KRYsWQaFQaJazefNmCCHw+uuvl24D/s/58+dx+vRpDB8+HDVq1EB0dDSWL1+Ojh074saNG0V2T7tz5w46duwIGxsbfPTRR7CxscHq1avRtWtXHDp0CIGBgQbFUZSytKdqX3/9tcH3nKxfvx5AfsL+/fffY+jQoQgPD0eDBg3KFH9CQgIAYP78+fDy8sJHH30EKysrvXUVGxuLdu3aISMjA5MnT4arqytCQ0PRr18/7NixAwMHDtQquzTHo8KKaiefpZ7VJk+ejFatWmlNGzdunNbr0q6jIcdwNfW2A4D//Oc/Ost96aWXNEmcu7s7fvvtN4wdOxYpKSl4//33i123cePGITQ0FEOGDMGUKVNw9uxZzJs3Dzdv3tQ5jnTo0AETJkyASqVCeHg4Fi9ejIcPH2r9+DGkPbp9+zZeffVVvPnmmwgODsbatWsxdOhQ7N+/H926dQOQnzd06NABN2/exJgxY9CyZUvEx8fjl19+wf379+Hm5mbwMdfKygpr167VqpvTp0/j3r17RdaTuqurWlH5TOG6AoCbN2/iyy+/LHojIP9HSeETKmWlb39Vf8/L2i7qJQywdu1aAUAcPnxYxMXFiZiYGLFlyxbh6uoqrK2txf3794UQQiiVSpGXl6f12aioKGFpaSnmzJm
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Обучающая выборка после андерсемплинга: (376, 9)\n",
"Outcome\n",
"0 188\n",
"1 188\n",
"Name: count, dtype: int64\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAADECAYAAABgKsxdAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+kUlEQVR4nO3deVwU9f8H8Nfucu8CcoM34n1l4o2C5kGoeZXmkQdaWprWz8rqW+ZVmVEemZpHHimZGqVpXnjlReatqHgF5skpyH3t5/cH7cawCywIDuDr+XjsQ5n9zMx7PjPzmffOfuazCiGEABERERERFUspdwBERERERJUFk2ciIiIiIhMxeSYiIiIiMhGTZyIiIiIiEzF5JiIiIiIyEZNnIiIiIiITMXkmIiIiIjIRk2ciIiIiIhMxeSYiegypqam4ffs2Hj58KHcoVMaSk5MRFRWF1NRUuUMhogqEyTMRUQlt2bIF3bt3h62tLTQaDWrXro0vv/xS7rAqhZSUFCxcuFD/d2JiIpYsWSJfQPkIIbBixQp06NABNjY2sLOzg6enJzZs2CB3aERUgZQoeV67di0UCoX+ZWVlhYYNG+LNN99EdHR0ecVI9NQ4duwYBg4cCDc3N1haWqJu3bqYMGEC/vnnn1IvMy0tDTNnzsShQ4fKLtCn2AcffIAhQ4bA1tYWK1euRGhoKPbt24eJEyfKHVqlYG1tjY8//hjBwcG4ffs2Zs6cie3bt8sdFgBg+PDheP3119GkSROsX79ev28HDRokd2hEVIGYlWam2bNnw9PTExkZGTh69CiWLVuGnTt3Ijw8HDY2NmUdI9FTYfHixXjrrbdQr149TJ48GR4eHrhy5QpWrVqFTZs2YefOnejUqVOJl5uWloZZs2YBALp27VrGUT9d/vjjD8ybNw9z587FBx98IHc4lZJKpcKsWbMwatQoaLVa2NnZ4ffff5c7LPzwww/YtGkTNmzYgOHDh8sdDhFVYAohhDC18Nq1axEYGIiTJ0+iTZs2+unvvPMO5s+fjx9//BHDhg0rl0CJqrJjx47B19cXPj4+2L17t+RD6M2bN+Hj4wOlUolLly7BwcGhRMuOi4uDi4sLZsyYgZkzZ5Zx5E+XF154AQkJCTh27JjcoVR6d+7cwe3bt9GkSRNUq1ZN7nDQokULtGzZEsHBwXKHQkQVXJn0eX7uuecAAJGRkQCAhIQEvPvuu2jRogU0Gg3s7OwQEBCA8+fPG8ybkZGBmTNnomHDhrCysoKHhwcGDRqEmzdvAgCioqIkXUUKvvLfSTt06BAUCgU2bdqE//3vf3B3d4darUa/fv1w+/Ztg3WfOHECzz//POzt7WFjYwM/P79CL4pdu3Y1un5jyciGDRvg7e0Na2trODo6YujQoUbXX9S25afVarFw4UI0a9YMVlZWcHNzw4QJEwweUKpbty769u1rsJ4333zTYJnGYg8KCjKoUwDIzMzEjBkzUL9+fVhaWqJWrVqYNm0aMjMzjdZVfl27djVY3meffQalUokff/yxVPXx1VdfoVOnTnBycoK1tTW8vb3x888/G13/hg0b0K5dO9jY2MDBwQG+vr7Yu3evpMyuXbvg5+cHW1tb2NnZoW3btgaxbdmyRb9PnZ2d8corr+Du3buSMmPGjJHE7ODggK5du+LIkSPF1tOcOXOgUCiwbt06g29vvLy88OWXX+L+/ftYvny5frqxutXFUbduXQB5deri4gIAmDVrltHjNiIiAkOGDIGLiwusra3RqFEjfPTRR5Jlnj17FgEBAbCzs4NGo0H37t3x559/SsrounUdPXoUU6ZMgYuLC6pVq4YJEyYgKysLiYmJGDVqFBwcHODg4IBp06ah4Gd3U491Yx6n/gHgwIED6NKlC9RqNapVq4b+/fvjypUrkjJ//vknmjdvjqFDh8LR0RHW1tZo27Yttm7dqi+TkpICtVqNt956y2Add+7cgUqlwty5c/Ux6/ZVfgX30a1btzBx4kQ0atQI1tbWcHJywuDBgxEVFSWZT9cG5u+ic/LkSfTs2RO2trZQq9VG60S3706dOqWfFhcXZ7Sd6Nu3r9GYTWlPZ86cqT+fa9asiY4dO8LMzAzu7u4GcRujm1/3srW1Rbt27ST1D+SdG82bNy90Obq2Zu3atQDyHvoMDw9HrVq10KdPH9jZ2RVaVwDw999/Y/DgwXB0dISNjQ06dOhgcPe8JNejkrSTJbluGYunsNeYMWNKvI1A8ddwnYL7rrD13r17F2PHjtV3XWvWrBlWr15d7PYBwJo1a/Dcc8/B1dUVlpaWaNq0KZYtW2ZQriTXSqDouivIlPhLk6v07t0bDg4OUKvVaNmyJRYtWiQpExERgZdeegmOjo6wsrJCmzZt8Ntvv0nK6M5zCwsLxMbGSt4LCwvTb1P+dqCw3Ef3KqwNKvjK32YYa6cAoE+fPgZtju64iYuLk5Q9deqU5BwGDK8Bulf9+vX1ZbZt24Y+ffqgevXqsLS0hJeXF+bMmYPc3FyDei9KqbptFKQ7SZycnADknXRbt27F4MGD4enpiejoaCxfvhx+fn64fPkyqlevDgDIzc1F3759sX//fgwdOhRvvfUWkpOTERoaivDwcHh5eenXMWzYMPTu3Vuy3g8//NBoPJ999hkUCgXef/99xMTEYOHChejRowfOnTsHa2trAHkXyoCAAHh7e2PGjBlQKpX6E+/IkSNo166dwXJr1qypv+ilpKTgjTfeMLru6dOnY8iQIXj11VcRGxuLxYsXw9fXF2fPnjV6h2X8+PHo0qULAOCXX37Br7/+Knl/woQJ+rv+U6ZMQWRkJL799lucPXsWx44dg7m5udF6KInExET9tuWn1WrRr18/HD16FOPHj0eTJk1w8eJFLFiwANeuXTO4aBVnzZo1+Pjjj/H1118X+tVocfWxaNEi9OvXDyNGjEBWVhZ++uknDB48GDt27ECfPn305WbNmoWZM2eiU6dOmD17NiwsLHDixAkcOHAAvXr1ApDXmIwdOxbNmjXDhx9+iGrVquHs2bPYvXu3Pj5d3bdt2xZz585FdHQ0Fi1ahGPHjhnsU2dnZyxYsABAXqK0aNEi9O7dG7dv3y707lpaWhr279+PLl26wNPT02iZl19+GePHj8eOHTtK1F3AxcUFy5YtwxtvvIGBAwfq+262bNkSAHDhwgV06dIF5ubmGD9+POrWrYubN29i+/bt+OyzzwAAly5dQpcuXWBnZ4dp06bB3Nwcy5cvR9euXfHHH3+gffv2knVOnjwZ7u7umDVrFv7880+sWLEC1apVw/Hjx1G7dm18/vnn2LlzJ4KCgtC8eXOMGjVKP+/jHuulqX8A2LdvHwICAlCvXj3MnDkT6enpWLx4MXx8fHDmzBl9wx8fH48VK1ZAo9HoPyBs2LABgwYNQnBwMIYNGwaNRoOBAwdi06ZNmD9/PlQqlX49GzduhBACI0aMMG0H/uvkyZM4fvw4hg4dipo1ayIqKgrLli1D165dcfny5UK7y924cQNdu3aFjY0N3nvvPdjY2GDlypXo0aMHQkND4evrW6I4ClOa9lTn66+/LvEzM+vXrweQl+AvXboUgwcPRnh4OBo1alSq+OPj4wEA8+bNg7u7O9577z1YWVkZravo6Gh06tQJaWlpmDJlCpycnLBu3Tr069cPP//8MwYOHChZtinXo4IKaycfp551pkyZgrZt20qmvfrqq5K/Td3GklzDdXT7DgD+7//+z2C9HTp0gEKhwJtvvgkXFxfs2rUL48aNw6NHj/D2228XuW3Lli1Ds2bN0K9fP5iZmWH79u2YOHEitFotJk2aVGzdFCd/3f3www8IDQ19rPhNOTZCQ0PRt29feHh44K233oK7uzuuXLmCHTt26D+gX7p0CT4+PqhRowY++OADqNVqbN68GQMGDEBISIjBMalSqbBhwwZJ/a9ZswZWVlbIyMgw2O78uY/Ozp07sXHjxkLr6n//+x+aNGkCAFixYkWxz+0cPnwYO3fuLLKMKSwtLbFq1SrJNFtbW/3/165dC41Gg6lTp0Kj0eDAgQP45JNP8OjRIwQFBZm+IlECa9asEQD
"text/plain": [
"<Figure size 200x200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"\n",
2024-12-07 12:08:20 +04:00
"df = pd.read_csv('../../datasets/diabetes.csv')\n",
2024-12-05 20:21:14 +04:00
"#df = df.drop(columns=['name', 'orbiting_body', 'sentry_object'])\n",
"\n",
"\n",
"#5. Устранение пропущенных данных\n",
" \n",
"#Сведения о пропущенных данных\n",
"print(\"Количество пропущенных значений в каждом столбце:\")\n",
"print(df.isnull().sum())\n",
"\n",
"# Процент пропущенных значений признаков\n",
"for i in df.columns:\n",
" null_rate = df[i].isnull().sum() / len(df) * 100\n",
" if null_rate > 0:\n",
" print(f'{i} Процент пустых значений: %{null_rate:.2f}')\n",
"\n",
"#Пропущенных данных в датасете нет\n",
"\n",
"\n",
"\n",
"#6. Проблемы набора данных\n",
" #5.1Выбросы: Возможны аномалии в значениях скорости или расстояния.\n",
" #Смещение: Данные могут быть смещены в сторону объектов, которые легче обнаружить (крупные, близкие).\n",
"\n",
"#7. Решения для обнаруженных проблем\n",
" #Выбросы: Идентификация и обработка выбросов через методы (например, IQR или Z-оценка).\n",
" #Смещение: Использование методов балансировки данных, таких как oversampling.\n",
"\n",
"#7.1 Проверка набора данных на выбросы\n",
"# Выбираем столбцы для анализа\n",
"columns_to_check = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI']\n",
"def Emissions(columns_to_check):\n",
"\n",
" # Функция для подсчета выбросов\n",
" def count_outliers(df, columns):\n",
" outliers_count = {}\n",
" for col in columns:\n",
" Q1 = df[col].quantile(0.25)\n",
" Q3 = df[col].quantile(0.75)\n",
" IQR = Q3 - Q1\n",
" lower_bound = Q1 - 1.5 * IQR\n",
" upper_bound = Q3 + 1.5 * IQR\n",
" \n",
" # Считаем количество выбросов\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" outliers_count[col] = len(outliers)\n",
" \n",
" return outliers_count\n",
"\n",
" # Подсчитываем выбросы\n",
" outliers_count = count_outliers(df, columns_to_check)\n",
"\n",
" # Выводим количество выбросов для каждого столбца\n",
" for col, count in outliers_count.items():\n",
" print(f\"Количество выбросов в столбце '{col}': {count}\")\n",
" \n",
" # Создаем гистограммы\n",
" plt.figure(figsize=(15, 10))\n",
" for i, col in enumerate(columns_to_check, 1):\n",
" plt.subplot(2, 3, i)\n",
" sns.histplot(df[col], kde=True)\n",
" plt.title(f'Histogram of {col}')\n",
" plt.tight_layout()\n",
" plt.show()\n",
"Emissions(columns_to_check)\n",
"\n",
"#Пр изна кb 'Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI' имеют количество выбросов в приемлемом диапазоне\n",
"#Н е нужно проводить устранять проблему пропущенных данных.\n",
"\n",
"#Подстановка константного значения для пропущенных данных. (пример так как кол-во выбросов в пределах нормы)\n",
"constant_value = 0 # Например, подставим 0\n",
"df.fillna(constant_value, inplace=True)\n",
"print(df_cleaned.isna().sum())\n",
"\n",
"\n",
"#8. Разбиение данных на выборки\n",
"\n",
"train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)\n",
"val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)\n",
"\n",
"print(\"Обучающая выборка: \", train_data.shape)\n",
"print(train_data.Outcome.value_counts())\n",
"Outcome_counts = train_data['Outcome'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(Outcome_counts, labels=Outcome_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов Outcome в обучающей выборке')\n",
"plt.show()\n",
"\n",
"print(\"Контрольная выборка: \", val_data.shape)\n",
"print(val_data.Outcome.value_counts())\n",
"Outcome_counts = val_data['Outcome'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(Outcome_counts, labels=Outcome_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов Outcome в контрольной выборке')\n",
"plt.show()\n",
"\n",
"print(\"Тестовая выборка: \", test_data.shape)\n",
"print(test_data.Outcome.value_counts())\n",
"Outcome_counts = test_data['Outcome'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(Outcome_counts, labels=Outcome_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов Outcome в тестовой выборке')\n",
"plt.show()\n",
"\n",
"\n",
"#9. Оценить сбалансированность выборок для каждого набора данных. Оценить необходимость использования методов приращения (аугментации) данных. \n",
"#Выводы по сбалансированности\n",
"#Если распределение классов примерно равно (например, 50%/50%), выборка считается сбалансированной, и аугментация данных не требуется.\n",
"#Если один из классов сильно доминирует (например, 90%/10%), выборка несбалансированная, и может потребоваться аугментация данных.\n",
"\n",
"#Данная сборка несбалансированная, и требуется аугментация данных.\n",
"\n",
"\n",
"#10. Выполнить приращение данных методами выборки с избытком (oversampling) и выборки с недостатком (undersampling). Должны быть представлены примеры реализации обоих методов для выборок набора данных. \n",
"\n",
"#10.1\n",
"#Аугментация данных методом оверсемплинга¶\n",
"#Этот метод увеличивает количество примеров меньшинства.\n",
"\n",
"from imblearn.over_sampling import ADASYN\n",
"\n",
"# Создание экземпляра ADASYN\n",
"ada = ADASYN()\n",
"\n",
"# Применение ADASYN\n",
"X_resampled, y_resampled = ada.fit_resample(train_data.drop(columns=['Outcome']), train_data['Outcome'])\n",
"\n",
"# Создание нового DataFrame\n",
"df_train_adasyn = pd.DataFrame(X_resampled)\n",
"df_train_adasyn['Outcome'] = y_resampled # Добавление целевой переменной\n",
"\n",
"# Вывод информации о новой выборке\n",
"print(\"Обучающая выборка после оверсемплинга: \", df_train_adasyn.shape)\n",
"print(df_train_adasyn['Outcome'].value_counts())\n",
"Outcome_counts = df_train_adasyn['Outcome'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(Outcome_counts, labels=Outcome_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов Outcome в обучающей выборке после оверсемплинга')\n",
"plt.show()\n",
"\n",
"\n",
"#10.2\n",
"#Аугментация данных методом андерсемплинга\n",
"#Этот метод помогает сбалансировать выборку, уменьшая количество экземпляров класса большинства, чтобы привести е г о в соответствие с классом меньшинства\n",
"\n",
"from imblearn.under_sampling import RandomUnderSampler\n",
"\n",
"rus = RandomUnderSampler()\n",
"\n",
"# Применение RandomUnderSampler\n",
"X_resampled, y_resampled = rus.fit_resample(train_data.drop(columns=['Outcome']), train_data['Outcome'])\n",
"\n",
"# Создание нового DataFrame\n",
"df_train_undersampled = pd.DataFrame(X_resampled)\n",
"df_train_undersampled['Outcome'] = y_resampled # Добавление целевой переменной\n",
"\n",
"# Вывод информации о новой выборке\n",
"print(\"Обучающая выборка после андерсемплинга: \", df_train_undersampled.shape)\n",
"print(df_train_undersampled['Outcome'].value_counts())\n",
"\n",
"# Визуализация распределения классов\n",
"Outcome_counts = df_train_undersampled['Outcome'].value_counts()\n",
"plt.figure(figsize=(2, 2))\n",
"plt.pie(Outcome_counts, labels=Outcome_counts.index, autopct='%1.1f%%', startangle=90)\n",
"plt.title('Распределение классов Outcome в обучающей выборке после андерсемплинга')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-12-07 12:08:20 +04:00
"version": "3.12.8"
2024-12-05 20:21:14 +04:00
}
},
"nbformat": 4,
"nbformat_minor": 2
}