AIM-PIbd-32-Fedorenko-G-Y/lab_2/lab2.ipynb

1278 lines
565 KiB
Plaintext
Raw Normal View History

2024-10-26 11:45:35 +04:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from sklearn.model_selection import train_test_split\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from imblearn.under_sampling import RandomUnderSampler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Diamonds Prices 2022**\n",
"В данном наборе данных представлена цена на алмазы. \n",
"Проблемная область: ценообразование бриллиантов\n",
"Объект наблюдения: бриллиант\n",
"Атрибуты: идентификатор, вес, качество огранки, цвет, чистота, общая глубина, ширина верхней грани, цена, длина, ширина, высота.\n",
"Бизнес-цели:\n",
"Повышение эффективности маркетинговых кампаний: Цель: Использование данных о бриллиантах для разработки целевых маркетинговых кампаний, направленных на конкретные сегменты рынка."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
" 'price', 'x', 'y', 'z'],\n",
" dtype='object')\n"
]
}
],
"source": [
"df = pd.read_csv(\"..//static//csv//DiamondsPrices2022.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsLUlEQVR4nO3deZTXdd3//8fMIIs6DBeC4CQiuG8jKegxtUtLyzWzNDVKze1kmmtaHBWwy6wrvC6t3FvEvlZ2hVu573K8wD0SLddQKxBFZVUWmc/vD38zFyPbgONrGLjdzpkj8/m8l+f7M5/jmft83u/Pp6pSqVQCAADwMatu7wEAAIA1g/gAAACKEB8AAEAR4gMAAChCfAAAAEWIDwAAoAjxAQAAFCE+AACAIsQHAABQhPgAAACKEB8Aq5FXXnklVVVVGT16dHuPAgCLER8ASzB69OhUVVW1+Fp//fWz55575o477ig+z4MPPthilrXWWisDBw7MkUcemb///e9tso9x48Zl5MiRmT59eptsj+Tdd9/NyJEj8+CDD7b3KACrhE7tPQDAquz73/9+BgwYkEqlkqlTp2b06NHZb7/98qc//SkHHHBA8XlOOeWUDBkyJAsWLMhTTz2Vq6++OrfddlsmTpyY+vr6j7TtcePG5fzzz8/RRx+dHj16tM3Aa7h33303559/fpJkjz32aN9hAFYB4gNgGfbdd98MHjy4+ftjjz02ffr0ye9+97t2iY/dd989hxxySJLkG9/4RjbffPOccsopufbaazNs2LDi86xp3n///TQ2NqZz587tPQpAh+S0K4AV0KNHj3Tr1i2dOrX8282cOXNy5plnpl+/funSpUu22GKLXHTRRalUKkmS9957L1tuuWW23HLLvPfee83rvf3229lggw3yqU99KgsXLlzheT7zmc8kSSZNmrTM5e6///7svvvuWWedddKjR48cdNBB+dvf/tZ8/8iRI3PWWWclSQYMGNB8etcrr7yy3Bk+fHpa09eSTjU6+uijl7jsyJEjWyw3ZsyYDB48OLW1tS2Wu+iii5Y7z/Tp03P66adn4403TpcuXbLhhhvmyCOPzLRp05Ik8+fPz/Dhw7Pjjjumrq4u66yzTnbfffc88MADLbbTdP3MRRddlEsuuSSbbLJJunTpkr/+9a+t2sYrr7yS3r17J0nOP//8pR4rwJrEKx8AyzBjxoxMmzYtlUolb7zxRn72s59l9uzZ+drXvta8TKVSyRe+8IU88MADOfbYYzNo0KDcddddOeuss/Kvf/0rF198cbp165Zrr702u+66a84555z893//d5LkpJNOyowZMzJ69OjU1NSs8Hwvv/xykmS99dZb6jL33ntv9t133wwcODAjR47Me++9l5/97GfZdddd89RTT2XjjTfOl770pbzwwgv53e9+l4svvji9evVKkuZfnpdn7733zpFHHpkkefzxx/PTn/50qcv26tUrF198cfP3X//611vcP378+HzlK1/J9ttvnx/96Eepq6vLtGnTcvrppy93jtmzZ2f33XfP3/72txxzzDHZYYcdMm3atPzxj3/MP//5z/Tq1SszZ87ML37xixxxxBE5/vjjM2vWrPzyl7/M5z//+Tz22GMZNGhQi21ec801mTt3bk444YR06dIlPXv2bNU2evfunSuuuCInnnhiDj744HzpS19KkjQ0NLTqMQVYLVUAWMw111xTSbLYV5cuXSqjR49usezNN99cSVK54IILWtx+yCGHVKqqqiovvfRS823Dhg2rVFdXV8aOHVv5wx/+UElSueSSS5Y7zwMPPFBJUvnVr35VefPNNyuTJ0+u3HbbbZWNN964UlVVVXn88ccrlUqlMmnSpEqSyjXXXNO87qBBgyrrr79+5a233mq+7S9/+Uulurq6cuSRRzbfNmrUqEqSyqRJk1r9OM2fP7+SpHLyySc339Z0XA888MBiyw8dOrQyYMCAFrclqYwYMaL5+2
HDhlWSVKZMmdJ8W9NxjRo1apnzDB8+vJKkcuONNy52X2NjY6VSqVTef//9yrx581rc984771T69OlTOeaYYxbbZ/fu3StvvPFGi+Vbu40333xzseMDWJN55QNgGS677LJsvvnmSZKpU6fmuuuuy3HHHZfa2trmv2TffvvtqampySmnnNJi3TPPPDNjxozJHXfckZNPPjnJB6c33XrrrTnqqKMye/bs/Pu///ti6y3LMccc0+L73r1759prr21xXcqipkyZkgkTJuTss89Oz549m29vaGjI3nvvndtvv73V+16SuXPnJkm6du3aquXnz5+fLl26LHOZWbNmpbq6eqUuer/hhhuy/fbb5+CDD17svqqqqiRJTU1N86tMjY2NmT59ehobGzN48OA89dRTi6335S9/ebFXgFZ0GwB8wDUfAMuw0047Za+99spee+2VoUOH5rbbbsvWW2+dk08+OfPnz0+SvPrqq6mvr09tbW2Ldbfaaqvm+5t07tw5v/rVrzJp0qTMmjUr11xzTfMvxa0xfPjw3HPPPbn//vvz9NNPZ/LkyYudtrSopn1vscUWi9231VZbZdq0aZkzZ06r9/9hTddR1NXVtWr56dOnZ911113mMrvssksaGxtz6qmn5uWXX860adPyzjvvtGr7L7/8crbddtvlLnfttdemoaEhXbt2zXrrrZfevXvntttuy4wZMxZbdsCAAR95GwB8QHwArIDq6ursueeemTJlSl588cWV2sZdd92V5INXDVZ0G9ttt1322muv7Lnnntluu+0Wu/C9tKYL0jfeeONWLf/666+nb9++y1zm8MMPz5lnnpnRo0dn0003Te/evbPDDjt8xEn/z3XXXZejjz46m2yySX75y1/mzjvvzD333JPPfOYzaWxsXGz5bt26feRtAPABp10BrKD3338/yQcXNydJ//79c++992bWrFktXv147rnnmu9v8vTTT+f73/9+vvGNb2TChAk57rjjMnHixFa/crCimvb9/PPPL3bfc889l169emWdddZJkhV6BabJE088kSRLPe1rUQsWLMhLL72UffbZZ5nLVVdX56KLLsrEiRMzadKkXH755Zk6dWqLi/yXZpNNNskzzzyzzGXGjBmTgQMH5sYbb2xxzCNGjFju9ld0GyvzmAKszrzyAbACFixYkLvvvjudO3duPq1qv/32y8KFC3PppZe2WPbiiy9OVVVV9t133+Z1jz766NTX1+cnP/lJRo8enalTp7bqXZxW1gYbbJBBgwbl2muvbfHJ5c8880zuvvvu7Lfffs23NUXIinzC+ZgxY7LFFltkyy23XO6yt9xyS957773mtwdelp/97Ge5//7785vf/CZ77bVXdt1111bN8+Uvfzl/+ctfctNNNy12X+X/f9vjpms1mr5PkkcffTTjx49v1T5WZBtrr712khV7TAFWZ175AFiGO+64o/kVjDfeeCO//e1v8+KLL+Z73/teunfvniQ58MADs+eee+acc87JK6+8ku233z533313brnllpx22mnZZJNNkiQXXHBBJkyYkPvuuy+1tbVpaGjI8OHDc+655+aQQw5pEQJtadSoUdl3332zyy675Nhjj21+q926uroWnzmx4447JknOOeecHH744VlrrbVy4IEHNkfJov7+97/nxz/+cR577LF86UtfynXXXdd83+OPP54kueeee7LRRhulb9++GTFiRC6//PJ86lOfyuc+97llzvvss8/m7LPPzsiRIzNkyJAVOtazzjorY8aMyaGHHppjjjkmO+64Y95+++388Y9/zJVXXpntt98+BxxwQG688cYcfPDB2X///TNp0qRceeWV2XrrrZtfzVqe1m6jW7du2XrrrfP73/8+m2++eXr27Jltt922VdelAKyW2vndtgBWSUt6q92uXbtWBg0aVLniiiua37a1yaxZsyqnn356pb6+vrLWWmtVNttss8qoUaOal3vyyScrnTp1qnz7299usd77779fGT
JkSKW+vr7yzjvvLHWeprfa/cMf/rDMuZf0VruVSqVy7733VnbddddKt27dKt27d68ceOCBlb/+9a+Lrf8f//EflU9
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df['carat'])\n",
"plt.title('Box Plot для carat')\n",
"plt.xlabel('carat')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACr0klEQVR4nOzdeXwU9f0/8Nfs5tgkJJuLsAlCEg6FGAFBEOSwIrQc4lWrolCvoiJYr/araBEoVaS2VX/VokVBC6K29UAEY7mUw2CUcMWgQkwAIQGSwCbkTnZ+f8RZ95jdndmdvbKv5+Ph4yGb2ZnPzs7uft7z/nzeH0EURRFEREREREQRQhfsBhAREREREQUSgyAiIiIiIoooDIKIiIiIiCiiMAgiIiIiIqKIwiCIiIiIiIgiCoMgIiIiIiKKKAyCiIiIiIgoojAIIiIiIiKiiMIgiIiIiIiIIgqDICIicqmiogKCIOD1118PdlPsFBQUYMiQITAYDBAEAWfPnvXbsW6//Xbk5OT4bf9ERBR4DIKIKCIdOHAAN9xwA7Kzs2EwGNCzZ09MnDgRf//73/12zDVr1uD55593evzEiRNYuHAh9u7d67djO/r0008hCIL1v+joaPTp0we//vWv8f3332tyjM8//xwLFy7UPECpqanBjTfeiLi4OLz00ktYtWoVEhISZLd9/fXX7V6nwWDA+eefj7lz5+LkyZOatisc7d27FzNmzECvXr0QGxuL1NRUTJgwAStXrkRHR0dQ2vT000/jgw8+CMqxiShyRAW7AUREgfb555/jiiuuQO/evTFr1iyYTCYcO3YMu3btwgsvvID777/fL8dds2YNSkpK8OCDD9o9fuLECSxatAg5OTkYMmSIX47tym9/+1sMHz4cbW1tKC4uxj//+U+sX78eBw4cQFZWlk/7/vzzz7Fo0SLcfvvtSE5O1qbBAL788kvU19dj8eLFmDBhgqLn/PGPf0Rubi6am5uxY8cOLFu2DBs2bEBJSQni4+PdPnf58uWwWCxaND2kvPrqq7j33nvRo0cPzJw5E/3790d9fT02b96Mu+66C5WVlXj88ccD3q6nn34aN9xwA6699tqAH5uIIgeDICKKOE899RSMRiO+/PJLp875qVOngtMoP2hoaHCZIZGMHTsWN9xwAwDgjjvuwPnnn4/f/va3eOONNzBv3rxANFM16T1SE1hNnjwZl1xyCQDgN7/5DdLS0vC3v/0Na9euxfTp02WfI52/6Ohon9scanbt2oV7770Xo0aNwoYNG5CYmGj924MPPoivvvoKJSUlPh/HYrGgtbUVBoPB530REWmJw+GIKOKUlZXhwgsvlO1EZ2RkOD22evVqjBgxAvHx8UhJScG4cePwv//9z/r3tWvXYurUqcjKykJsbCz69u2LxYsX2w0n+tnPfob169fjyJEj1qFZOTk5+PTTTzF8+HAAnUGI9DfbOThffPEFJk2aBKPRiPj4eFx++eXYuXOnXRsXLlwIQRBQWlqKW265BSkpKRgzZozqczN+/HgAQHl5udvttmzZgrFjxyIhIQHJycm45pprcPDgQbv2/P73vwcA5ObmWl9XRUWF2/3+5z//wbBhwxAXF4f09HTMmDEDx48ft/79Zz/7GW677TYAwPDhwyEIAm6//XafX+ftt9+Obt26oaysDFOmTEFiYiJuvfVW698c5wRZLBa88MILuOiii2AwGNC9e3dMmjQJX331ld12q1evtr6e1NRU3HzzzTh27Jjbtv33v/+FIAj47LPPnP72yiuvQBAEa4BSVVWFO+64A+eddx5iY2ORmZmJa665xuN5XrRoEQRBwJtvvmkXAEkuueQSu/P6l7/8BZdddhnS0tIQFxeHYcOG4b///a/T8wRBwNy5c/Hmm2/iwgsvRGxsLAoKChTvQxAENDQ04I033rBeM968v0REnjATREQRJzs7G4WFhSgpKUF+fr7bbRctWoSFCxfisssuwx//+E
fExMTgiy++wJYtW/Dzn/8cQOe8k27duuHhhx9Gt27dsGXLFjz55JOoq6vDs88+CwB44oknYDab8cMPP+C5554DAHTr1g0DBw7EH//4Rzz55JO4++67MXbsWADAZZddBqAz2Jg8eTKGDRuGBQsWQKfTYeXKlRg/fjy2b9+OESNG2LX3V7/6Ffr374+nn34aoiiqPjdlZWUAgLS0NJfbbNq0CZMnT0afPn2wcOFCNDU14e9//ztGjx6N4uJi5OTk4Prrr8d3332Ht956C8899xzS09MBAN27d3e539dffx133HEHhg8fjiVLluDkyZN44YUXsHPnTuzZswfJycl44okncMEFF+Cf//yndYhb3759NXmd7e3t+MUvfoExY8bgL3/5i9thcnfddRdef/11TJ48Gb/5zW/Q3t6O7du3Y9euXdaM01NPPYX58+fjxhtvxG9+8xucPn0af//73zFu3Djr65EzdepUdOvWDf/+979x+eWX2/3tnXfewYUXXmi9bn/5y1/i66+/xv3334+cnBycOnUKGzduxNGjR10Wc2hsbMTmzZsxbtw49O7dW9H5euGFF3D11Vfj1ltvRWtrK95++2386le/wkcffYSpU6fabbtlyxb8+9//xty5c5Genm5th5J9rFq1Cr/5zW8wYsQI3H333QDg1ftLROSRSEQUYf73v/+Jer1e1Ov14qhRo8T/+7//Ez/55BOxtbXVbrtDhw6JOp1OvO6668SOjg67v1ksFuv/NzY2Oh3jnnvuEePj48Xm5mbrY1OnThWzs7Odtv3yyy9FAOLKlSudjtG/f3/xF7/4hdPxcnNzxYkTJ1ofW7BggQhAnD59uqJzsHXrVhGAuGLFCvH06dPiiRMnxPXr14s5OTmiIAjil19+KYqiKJaXlzu1bciQIWJGRoZYU1NjfWzfvn2iTqcTf/3rX1sfe/bZZ0UAYnl5ucf2tLa2ihkZGWJ+fr7Y1NRkffyjjz4SAYhPPvmk9bGVK1eKAKxtdEfadtOmTeLp06fFY8eOiW+//baYlpYmxsXFiT/88IMoiqJ42223iQDExx57zGkft912m937tmXLFhGA+Nvf/tZpW+l9qqioEPV6vfjUU0/Z/f3AgQNiVFSU0+OOpk+fLmZkZIjt7e3WxyorK0WdTif+8Y9/FEVRFM+cOSMCEJ999lmP58HWvn37RADiAw88oPg5jtd4a2urmJ+fL44fP97ucQCiTqcTv/76a6/3kZCQIN52222K20ZE5A0OhyOiiDNx4kQUFhbi6quvxr59+/DnP/8Zv/jFL9CzZ098+OGH1u0++OADWCwWPPnkk9Dp7L8uBUGw/n9cXJz1/+vr61FdXY2xY8eisbER33zzjdft3Lt3Lw4dOoRbbrkFNTU1qK6uRnV1NRoaGnDllVdi27ZtThP27733XlXHuPPOO9G9e3dkZWVh6tSp1qFIUjbDUWVlJfbu3Yvbb78dqamp1scHDRqEiRMnYsOGDepfKICvvvoKp06dwn333Wc3f2Tq1KkYMGAA1q9f79V+JRMmTED37t3Rq1cv3HzzzejWrRvef/999OzZ02672bNne9zXu+++C0EQsGDBAqe/SdfFe++9B4vFghtvvNH6vlVXV8NkMqF///7YunWr22PcdNNNOHXqFD799FPrY//9739hsVhw0003Aei87mJiYvDpp5/izJkzHtstqaurAwDZYXCu2F7jZ86cgdlsxtixY1FcXOy07eWXX468vDyf9kFE5G8cDkdEEWn48OF477330Nrain379uH999/Hc889hxtuuAF79+5FXl4eysrKoNPpZDt0tr7++mv84Q9/wJYtW6wdTInZbPa6jYcOHQIA6xwYOWazGSkpKdZ/5+bmqjrGk08+ibFjx0Kv1yM9PR0DBw5EVJTrn4YjR44AAC644AKnvw0cOBCffPKJooIMavY7YMAA7NixQ9X+HL300ks4//zzERUVhR49euCCCy5wCmyjoqJw3nnnedxXWV
kZsrKy7IJAR4cOHYIoiujfv7/s3z0VW5DmgL3zzju48sorAXQOhRsyZAjOP/98AEBsbCyWLl2KRx55BD169MDIkSN
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Визуализация данных после обработки\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(df[\"price\"], df[\"carat\"])\n",
"plt.xlabel(\"price\")\n",
"plt.ylabel(\"carat\")\n",
"plt.title(\"Scatter Plot of Price vs Carat\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Удаление строк с пустыми значениями"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"df_cleaned = df.dropna()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 32365\n",
"Размер контрольной выборки: 10789\n",
"Размер тестовой выборки: 10789\n"
]
}
],
"source": [
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка сбалансированности выборок"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение price в обучающей выборке:\n",
"price\n",
"789 80\n",
"605 79\n",
"544 72\n",
"552 72\n",
"828 71\n",
" ..\n",
"9942 1\n",
"7787 1\n",
"18663 1\n",
"7979 1\n",
"8164 1\n",
"Name: count, Length: 9496, dtype: int64\n",
"\n",
"Распределение price в контрольной выборке:\n",
"price\n",
"625 34\n",
"828 31\n",
"605 30\n",
"789 26\n",
"544 26\n",
" ..\n",
"4188 1\n",
"7541 1\n",
"3498 1\n",
"3314 1\n",
"12196 1\n",
"Name: count, Length: 5383, dtype: int64\n",
"\n",
"Распределение price в тестовой выборке:\n",
"price\n",
"802 33\n",
"844 29\n",
"776 29\n",
"675 26\n",
"645 25\n",
" ..\n",
"1567 1\n",
"5529 1\n",
"2031 1\n",
"417 1\n",
"5431 1\n",
"Name: count, Length: 5338, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['price'].value_counts()\n",
" print(f\"Распределение price в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним овер- и андер- слемпинг."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение price в обучающей выборке после oversampling:\n",
"price\n",
"5076 80\n",
"1789 80\n",
"3931 80\n",
"1263 80\n",
"2026 80\n",
" ..\n",
"3678 80\n",
"4592 80\n",
"516 80\n",
"7152 80\n",
"2353 80\n",
"Name: count, Length: 9496, dtype: int64\n",
"\n",
"Распределение price в контрольной выборке после oversampling:\n",
"price\n",
"966 34\n",
"13638 34\n",
"3669 34\n",
"1052 34\n",
"2818 34\n",
" ..\n",
"4032 34\n",
"544 34\n",
"3362 34\n",
"6559 34\n",
"792 34\n",
"Name: count, Length: 5383, dtype: int64\n",
"\n",
"Распределение price в тестовой выборке после oversampling:\n",
"price\n",
"3742 33\n",
"559 33\n",
"8403 33\n",
"1238 33\n",
"1243 33\n",
" ..\n",
"1149 33\n",
"2401 33\n",
"958 33\n",
"702 33\n",
"14618 33\n",
"Name: count, Length: 5338, dtype: int64\n",
"\n"
]
}
],
"source": [
"def oversample(df):\n",
" X = df.drop(\"price\", axis=1)\n",
" y = df[\"price\"]\n",
"\n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
"\n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение price в обучающей выборке после undersampling:\n",
"price\n",
"18823 1\n",
"326 1\n",
"327 1\n",
"336 1\n",
"337 1\n",
" ..\n",
"353 1\n",
"351 1\n",
"348 1\n",
"345 1\n",
"344 1\n",
"Name: count, Length: 9496, dtype: int64\n",
"\n",
"Распределение price в контрольной выборке после undersampling:\n",
"price\n",
"18804 1\n",
"334 1\n",
"345 1\n",
"351 1\n",
"352 1\n",
" ..\n",
"373 1\n",
"371 1\n",
"369 1\n",
"367 1\n",
"366 1\n",
"Name: count, Length: 5383, dtype: int64\n",
"\n",
"Распределение price в тестовой выборке после undersampling:\n",
"price\n",
"18803 1\n",
"335 1\n",
"336 1\n",
"337 1\n",
"358 1\n",
" ..\n",
"386 1\n",
"384 1\n",
"380 1\n",
"378 1\n",
"375 1\n",
"Name: count, Length: 5338, dtype: int64\n",
"\n"
]
}
],
"source": [
"def undersample(df):\n",
" X = df.drop('price', axis=1)\n",
" y = df['price']\n",
" \n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_undersampled = undersample(train_df)\n",
"val_df_undersampled = undersample(val_df)\n",
"test_df_undersampled = undersample(test_df)\n",
"\n",
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **14,400 Classic Rock Tracks (with Spotify Data)**\n",
"\n",
" Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков. Каждый трек каталогизирован с 18 столбцами данных, такие как название трека, исполнитель, альбом и год выпуска. \n",
" Бизнес-цель: улучшение стратегии маркетинга и продвижения музыкальных треков, создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений. \n",
" Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения. \n",
" Входные данные: \n",
" Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания.\n",
" Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.).\n",
" Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками. \n",
" Целевой признак: Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выгрузка данных из csv файла \"Данные о клиентах\" в датафрейм"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n",
" 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n",
" 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n",
" 'Popularity'],\n",
" dtype='object')\n"
]
}
],
"source": [
"df = pd.read_csv(\"..//static//csv//UltimateClassicRock.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzVUlEQVR4nO3dd5RV5dnw4XsGptBRUYoCgg3ELqhAFFQs2DXqp6KIvUAUjTEaC9iisb72khcBA/ZEUINRFEskRikLSxTUCKJB5FXpAgPM/v5wzQnDDFV8BsbrWosVdjvnmT07zvmxy+RlWZYFAADATyy/qgcAAAD8PIgPAAAgCfEBAAAkIT4AAIAkxAcAAJCE+AAAAJIQHwAAQBLiAwAASEJ8AAAASYgPgCo0ZcqUyMvLi0GDBlX1UFiBXr16xZZbbrnOXzcvLy/69++/zl8XYH0mPoBqYdCgQZGXl1fuz2abbRb77rtvvPDCC8nH89prr5UbS0FBQbRu3Tp69uwZn3322Tp5j3/84x/Rv3//mDVr1jp5vaqQYj9tKKrD9xNgVWpW9QAA1qVrr702WrVqFVmWxddffx2DBg2KQw45JJ577rk47LDDko/nggsuiA4dOsTixYtj/Pjx8dBDD8Vf//rXeP/996NZs2Y/6rX/8Y9/xDXXXBO9evWKhg0brpsBV5Gfcj+trxYsWBA1a/73x3B1+n4CrIj4AKqV7t27R/v27XPTZ5xxRjRu3Dgee+yxKomPvffeO4499tiIiDjttNNi2223jQsuuCAGDx4cl19+efLxrK9+LvuptLQ0SkpKori4OIqLi6t6OADJuewKqNYaNmwYtWrVKvcvzBER8+fPj1//+tfRvHnzKCoqiu222y5uvfXWyLIsIn74V+k2bdpEmzZtYsGCBbntvvvuu2jatGl06tQpli5dusbj2W+//SIiYvLkyStdb9SoUbH33ntHnTp1omHDhnHkkUfGRx99lFvev3//+M1vfhMREa1atcpdtjRlypRVjmH5y9PK/rz22msV1u3Vq1el6y5/r8LTTz8d7du3j3r16pVb79Zbb13leCpT2X667777ol27dlFUVBTNmjWL3r17V7hEqWvXrrHDDjvEuHHjolOnTlGrVq1o1apVPPDAA+XWK7tMb/n9VXYZWGX7Ylm33nprdOrUKTbZZJOoVatW7L777vH0009XWC8vLy/69OkTQ4cOzY39b3/7W25Z2X5c2fezS5cusfPOO1c6ju222y4OOuiglY4VYH3izAdQrcyePTu++eabyLIsZsyYEXfffXfMmzcvTj755Nw6WZbFEUccEa+++mqcccYZscsuu8SLL74Yv/nNb+I///lP3HHHHVGrVq0YPHhwdO7cOa644oq4/fbbIyKid+/eMXv27Bg0aFDUqFFjjcf373//OyIiNtlkkxWu8/LLL0f37t2jdevW0b9//1iwYEHcfffd0blz5xg/fnxsueWWccwxx8THH38cjz32WNxxxx3RqFGjiIjYdNNNV2scBxxwQPTs2TMiIsaMGRN33XXXCtdt1KhR3HHHHbnpU045pdzyt956K44//vjYeeed46abbooGDRrEN998ExdddNFqjaUyy++n/v37xzXXXBPdunWL8847LyZNmhT3339/jBkzJkaPHh0FBQW5bWfOnBmHHHJIHH/88XHiiSfGk08+Geedd14UFhbG6aefvtZjWtadd94ZRxxxRPTo0SNKSkri8ccfj+OOOy6ef/75OPTQQ8utO2rUqHjyySejT58+0ahRo0pvXl/Z9/OUU06Js846Kz744IPYYYcdctuMGTMmPv7447jyyivXydcEkEQGUA0MHDgwi4gKf4qKirJBgwaVW3fYsGFZRGTXX399ufnHHntslpeXl3366ae5eZdffnmWn5+fvfHGG9lTTz2VRUT2P//zP6scz6uvvppFRPbwww9n//d//5dNmzYt++tf/5ptueWWWV5eXjZmzJgsy7
Js8uTJWURkAwcOzG27yy67ZJtttln27bff5ua9++67WX5+ftazZ8/cvFtuuSWLiGzy5MmrvZ9KSkqyiMj69OmTm1f2db366qsV1u/Ro0fWqlWrcvMiIuvXr19u+vLLL88iIvvqq69y88q+rltuuWWl41md/TRjxoyssLAwO/DAA7OlS5fmtr3nnnty25bp0qVLFhHZbbfdlpu3aNGi3D4tKSnJsuy/x8vy+65sPMvui1NPPTVr2bJlufW+//77ctMlJSXZDjvskO23334V9lV+fn72r3/9q8LXvvx+XNH3c9asWVlxcXH229/+ttz8Cy64IKtTp042b968Cq8NsL5y2RVQrdx7770xcuTIGDlyZAwZMiT23XffOPPMM+Mvf/lLbp0RI0ZEjRo14oILLii37a9//evIsqzc07H69+8f7dq1i1NPPTXOP//86NKlS4XtVub000+PTTfdNJo1axaHHnpozJ8/PwYPHlzuvpRlffXVVzFhwoTo1atXbLzxxrn5O+20UxxwwAExYsSI1X7vyixcuDAiYrXvNygpKYmioqKVrjN37tzIz8//UTdJr2w/vfzyy1FSUhJ9+/aN/Pz//tg666yzon79+vHXv/613GvVrFkzzjnnnNx0YWFhnHPOOTFjxowYN27cWo9xWbVq1cr9febMmTF79uzYe++9Y/z48RXW7dKlS2y//fZr/V4NGjSII488Mh577LHcZYFLly6NJ554Io466qioU6fOWr82QGouuwKqlT322KPcB/sTTzwxdt111+jTp08cdthhUVhYGJ9//nk0a9Ys6tWrV27btm3bRkTE559/nptXWFgYDz/8cHTo0CGKi4tj4MCBkZeXt9rjufrqq2PvvfeOGjVqRKNGjaJt27YV7j9ZVtl7b7fddhWWtW3bNl588cWYP3/+Wn/g/OabbyLihw+0q2PWrFlRt27dla7TsWPHuOeee+LCCy+MSy+9NBo0aBAzZ85co3GtbD+taJ8UFhZG69aty32/IiKaNWtWYf9su+22EfHD71XZa6+91mhslXn++efj+uuvjwkTJsSiRYty8ys7Nlq1avWj369nz57xxBNPxN///vfYZ5994uWXX46vv/66wiVwAOs78QFUa/n5+bHvvvvGnXfeGZ988km0a9dujV/jxRdfjIgfzhp88skna/Rhcscdd4xu3bqt8Xv+VMpusF7dX5o3ffr0aNmy5UrXOeGEE2L8+PFx9913x0MPPbRW40q9n1YUkKvzEIG///3vccQRR8Q+++wT9913XzRt2jQKCgpi4MCB8eijj1ZYf9mzJGvroIMOisaNG8eQIUNin332iSFDhkSTJk3Wq2MLYHW47Aqo9pYsWRIREfPmzYuIiJYtW8a0adNi7ty55dabOHFibnmZ9957L6699to47bTTYtddd40zzzwzZs+e/ZONtey9J02aVGHZxIkTo1GjRrl/1V+TMzBlxo4dGxGxwsu+lrV48eL49NNPc2eEViQ/Pz9uvfXW6Nq1a2yzzTa5S97WlRXtk5KSkpg8eXKFOJo2bVrMnz+/3LyPP/44Iv4bXRtttFFERIWnZS1/FqUyf/7zn6O4uDhefPHFOP3006N79+7rJAJW9v2sUaNGnHTSSfH000/HzJkzY9iwYXHiiSeu1UMPAKqS+ACqtcWLF8dLL70UhYWFuQ/RhxxySCxdujTuueeecuvecccdkZeXF927d89t26tXr2jWrFnceeedMWjQoPj6669/1FOcVqVp06axyy67xODBg8t9MP7ggw/ipZdeikMOOSQ3ryxC1uQ3Yj/99NOx3XbbRZs2bVa57vDhw2PBggW5x96uzN133x2jRo2KoUOHRrdu3aJz586rPaZV6datWxQWFsZdd92Vu+chImLAgAExe/bsCk+XWrJkSTz44IO56ZKSknjwwQdj0003jd133z0iIrbaaquIiHjjjTdy6y1dunS1ztzUqFEj8vLyyp0lmTJlSgwbNmytvr4yq/p+nn
LKKTFz5sw455xzKjzBDWBD4bIroFp54YUXcmcwZsyYEY8++mh88skncdlll0X9+vUjIuLwww+PfffdN6644oqYMmV
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Box plot для столбца 'Popularity'\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=df['Popularity'])\n",
"plt.title('Box Plot для Popularity')\n",
"plt.xlabel('Popularity')\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Решим проблему пустых значений при помощи удаления таких строк."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"df_cleaned = df.dropna()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 8650\n",
"Размер контрольной выборки: 2884\n",
"Размер тестовой выборки: 2884\n"
]
}
],
"source": [
"# Разделение на обучающую и тестовую выборки\n",
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
"\n",
"# Разделение обучающей выборки на обучающую и контрольную\n",
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_df))\n",
"print(\"Размер контрольной выборки:\", len(val_df))\n",
"print(\"Размер тестовой выборки:\", len(test_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Оценка сбалансированности выборок"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке:\n",
"Popularity\n",
"23 258\n",
"15 250\n",
"26 246\n",
"21 245\n",
"14 245\n",
" ... \n",
"84 1\n",
"87 1\n",
"91 1\n",
"79 1\n",
"86 1\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке:\n",
"Popularity\n",
"17 90\n",
"26 86\n",
"21 83\n",
"24 83\n",
"28 80\n",
" ..\n",
"85 1\n",
"83 1\n",
"84 1\n",
"80 1\n",
"77 1\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке:\n",
"Popularity\n",
"22 86\n",
"21 85\n",
"12 84\n",
"20 82\n",
"26 81\n",
" ..\n",
"76 2\n",
"71 2\n",
"79 1\n",
"82 1\n",
"80 1\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"def check_balance(df, name):\n",
" counts = df['Popularity'].value_counts()\n",
" print(f\"Распределение Popularity в {name}:\")\n",
" print(counts)\n",
" print()\n",
"\n",
"check_balance(train_df, \"обучающей выборке\")\n",
"check_balance(val_df, \"контрольной выборке\")\n",
"check_balance(test_df, \"тестовой выборке\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним овер- и андер- слемпинг."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке после oversampling:\n",
"Popularity\n",
"44 258\n",
"20 258\n",
"30 258\n",
"27 258\n",
"8 258\n",
" ... \n",
"78 258\n",
"79 258\n",
"74 258\n",
"81 258\n",
"86 258\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке после oversampling:\n",
"Popularity\n",
"21 90\n",
"11 90\n",
"28 90\n",
"23 90\n",
"37 90\n",
" ..\n",
"61 90\n",
"84 90\n",
"80 90\n",
"77 90\n",
"0 90\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке после oversampling:\n",
"Popularity\n",
"14 86\n",
"47 86\n",
"27 86\n",
"13 86\n",
"66 86\n",
" ..\n",
"63 86\n",
"79 86\n",
"71 86\n",
"82 86\n",
"80 86\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"def oversample(df):\n",
" X = df.drop('Popularity', axis=1)\n",
" y = df['Popularity']\n",
" \n",
" oversampler = RandomOverSampler(random_state=42)\n",
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_oversampled = oversample(train_df)\n",
"val_df_oversampled = oversample(val_df)\n",
"test_df_oversampled = oversample(test_df)\n",
"\n",
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Распределение Popularity в обучающей выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"84 1\n",
"85 1\n",
"86 1\n",
"87 1\n",
"91 1\n",
"Name: count, Length: 88, dtype: int64\n",
"\n",
"Распределение Popularity в контрольной выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"82 1\n",
"83 1\n",
"84 1\n",
"85 1\n",
"87 1\n",
"Name: count, Length: 85, dtype: int64\n",
"\n",
"Распределение Popularity в тестовой выборке после undersampling:\n",
"Popularity\n",
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
" ..\n",
"76 1\n",
"77 1\n",
"79 1\n",
"80 1\n",
"82 1\n",
"Name: count, Length: 80, dtype: int64\n",
"\n"
]
}
],
"source": [
"def undersample(df):\n",
" X = df.drop('Popularity', axis=1)\n",
" y = df['Popularity']\n",
" \n",
" undersampler = RandomUnderSampler(random_state=42)\n",
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
" \n",
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
" return resampled_df\n",
"\n",
"train_df_undersampled = undersample(train_df)\n",
"val_df_undersampled = undersample(val_df)\n",
"test_df_undersampled = undersample(test_df)\n",
"\n",
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Car Price Prediction Challenge**\n",
"В данном наборе данных представлена цена на автомобили\n",
"Проблемная область: ценообразование автомобилей\n",
"Объект наблюдения: автомобили\n",
"Атрибуты: идентификатор, цена, сбор, производитель, модель, год выпуска, категория, наличие кожаного салона, тип топлива, объём двигателя, пробег, циллиндры, тип коробки передач, колеса, двери, цвет и подушки безопасности.\n",
"Бизнес-цель: Построение моделей для прогнозирования цен на автомобили на основе их характеристик.\n",
"Эффект для бизнеса: Оптимизация стратегии продаж и увеличение прибыли.\n",
"Цель: Разработка модели для прогнозирования цен.\n",
"Входные данные: Данные о характеристиках автомобилей (год выпуска, тип топлива, пробег и др.).\n",
"Целевой признак: Цена автомобиля."
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
" 'Airbags'],\n",
" dtype='object')\n"
]
}
],
"source": [
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
"print(df.columns)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABAcAAAIlCAYAAACpcJFFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADtPElEQVR4nOzdeXhTdfYG8DdJs7SlCy20TQVLBWQrgoIiigqCFFAEFxRERWVEURQGBxEH2cRBUBQQBHEBVPi5i+JSqcimVPYdQYGyaJsWKN1pkib390dyb5smaZM0bbb38zydoTffJCcxaXtPzvccmSAIAoiIiIiIiIgoZMl9HQARERERERER+RaTA0REREREREQhjskBIiIiIiIiohDH5AARERERERFRiGNygIiIiIiIiCjEMTlAREREREREFOKYHCAiIiIiIiIKcUwOEBEREREREYU4JgeIiIiIiIiIQlyYrwMgIiIi4NSpU/j999+Rn5+PCxcuIC8vD/Pnz0dkZKSvQyMiIqIQIBMEQfB1EERERKFqx44dGD9+PH7//Xeb42FhYVi/fj369Onjo8iIiIgolHBbARFJdu/ejdGjR6Nt27aIjIxEeHg4WrdujYceegiZmZm+Do8o6Pz666+45ZZbcPDgQbz22ms4c+YMBEGAIAgwGo1MDBAREVGjYeUAEcFsNuM///kP3nzzTYSFheHWW29FWloalEolTp48iZ9//hkXL17ErFmz8NJLL/k6XKKgYDAY0KFDB1y4cAEbN27E1Vdf7euQiIiIKISx5wARYerUqXjzzTfRtWtXfPHFF2jdurXN5ZcuXcLixYtx4cIFH0VIFHx++uknnDx5Em+99RYTA0RERORz3FZAFOKOHz+OefPmIT4+HhkZGXaJAQAIDw/HpEmTMHPmTOnYI488AplMhpMnT2LevHlo27YtNBoNUlNTMWvWLBiNRof3t2XLFgwePBjNmjWDWq1G27ZtMXXqVJSXlztcf+rUKchkMqdfNbVq1QqtWrVyeFsrV66ETCbDypUrbY6Xl5fjxRdfxJVXXgm1Wm13H6dOnbJZf+DAAdx1111ISEiAQqGwWdu7d2+H912T+PzVvO3qj/mRRx5xeN1vvvkGffv2RdOmTaHRaJCWlobXX38dJpPJpccrcvRc/fnnn3j++edxzTXXID4+HhqNBldeeSVeeOEFlJaWuvTYACAnJwfTp0/H9ddfj4SEBKjVarRq1QpPPfUU8vPz7daLz4f4FRYWhhYtWuC+++7D4cOHpXWbNm2q9fXg7LVx/vx5TJgwAampqVCr1UhISMB9992HQ4cOOY1FLpcjOzvb7vKtW7dK9zFjxgy7y3/77TfcfvvtiIuLg0ajQfv27TF9+nS71/i2bdsAAElJSbjzzjsRHx8vvScmT56MoqIiu9t29hpbtGiR09crAMyYMcPp8+To9t577z1cd911iIyMtFvv7PXkKNbqX5GRkejQoQNmzJiBiooKl25j3759GDx4MK644gpERkYiNjYW11xzDRYsWGD3M2bjxo147LHH0K5dOzRp0gRNmjRB9+7dsXz58nrHJz5/mzZtsjleXl6Oli1bOn2/5ufn47nnnkO7du0QHh6OuLg49OjRA6+//rq0prb3+8SJEx2+nqv/96wZEwCcPn1a+tnk6HYPHTqE++67T3pvpqamYsKECU4TwHU9Dk/el85+Bubk5CAqKsrp+8uZ06dPY/To0bjsssugUqnQokULjB49GmfOnLFZ16pVK5fidOV1Xtv1p06d6lF81dX8uVj9q+ZzU1RUhLlz5+KWW25BcnIyVCoVkpOT8fDDD+PEiRMuP4+O/luK/70//PBDu/W9e/d2+LvY2WOp/t9b/B1V21f131EPPvggZDIZduzY4fA+pk2bBplMhv/7v/+zOb5//36MHDkSLVq0gFqthlarxYABA7Bu3Tq7WGr+d9++fTuio6ORmpqKs2fPSse99buSyN+wcoAoxK
1cuRImkwlPPPEEEhMTa12rVqvtjk2YMAG//fYb7rvvPjRp0gTr1q3D9OnTceDAAXzxxRc2a5cuXYqnn34asbGxGDx4MBISErBr1y688sor2LhxIzZu3AiVSuXwvrt06YKhQ4faxH369Gn3H7ADo0aNwhdffIE2bdpg7NixiI2NBQCsXbsW+/fvt1mbk5ODm266CSUlJRgwYAC6du0qxVw9edJQpkyZgldffRWXXXYZ7r77bsTExGDr1q2YNGkStm/fjs8//7xet//VV1/h/fffR58+fdC7d2+YzWb8/vvvmDt3LjZv3owtW7ZAqVTWeTtbtmzB/Pnz0bdvX/To0QNKpRJ79+7F0qVL8dNPP2HPnj2IiYmxu9748eMRGxsLo9GIgwcP4osvvsCmTZtw5MgRNGvWDK1atcL06dNtrjNz5kykpKQ4TaacO3cOPXv2xIkTJ9C7d28MHz4c2dnZ+OKLL/D999/jp59+Qq9eveyuJ5fL8c477+DVV1+1Ob506VIoFAq7ZAwAfP755xgxYgTUajXuv/9+JCQkYP369Zg1axZ++uknbNq0CRqNRooLAIYPHw6VSoX77rsPWq0Wmzdvxrx58/Ddd99h27ZtDp+n6s6fP+/ySdSoUaNs/th29JpdsmQJxo0bh9jYWDzwwANITk6GTCbDvn378M0337h0P6Lq/11KSkrw/fffY+bMmTh9+jRWrFhR5/X//vtvnD9/Hn379kXz5s1RVlaGn376Cf/+979x6NAhvPfee9LauXPn4vjx47j++utx1113obCwEBkZGXjiiSdw7NgxzJ8/3+vxzZkzB3///bfDy44dO4Y+ffogNzcXvXr1wtChQ1FWVobDhw/jf//7H/7zn//UettHjx7F4sWLa12jUCiwbNkyuwTPO++84/SE7ddff0V6ejoMBgPuvfdetGrVCllZWVi4cCG+++47/P7772jWrJlbj8OT96UzkydPdvvk6s8//0SvXr1w7tw5DB48GJ06dcKhQ4fwwQcfYN26dfj1119x5ZVXArD8ziosLJSuK/6cF3/2iLp27erSfTt7jDfffLNH8TlSPbZTp05h1apVdmv++OMPTJs2DX369MFdd92FyMhIHD16FGvWrMH333+PPXv2ICUlxaXHBAC33HKL9LrS6XT44osvMGrUKAiCgFGjRrl8O64YMmSIw+d7wYIFNt8/8cQTWL16tZS8rM5kMmHFihWIj4/H3XffLR3/8ssv8cADD0AQBAwePBjt2rVDfn4+tm/fjvfffx+DBw92Gtf+/fsxcOBAREdHY8OGDWjZsqV0mbd+VxL5HYGIQlrv3r0FAMLPP//s1vVGjRolABCaN28unD17Vjqu1+uFm2++WQAgfPHFF9Lxw4cPC2FhYUKXLl2E8+fP29zWnDlzBADC66+/bnc/x48fFwAIjzzyiM3xW265RXD0IywlJUVISUlxGPOKFSsEAMKKFSukY8XFxYJcLheSk5OF0tJSh48xOztbOrZkyRIBgPDss8/a3T4A4ZZbbnF43zU5um1Rdna2AEAYNWqUzfH169cLAIT09HSbWM1ms/Dkk0/aPeeOHm91jp6rv//+W9Dr9XZrZ86cKQAQPv74Y5ceX15enlBSUmJ3fNWqVQIAYfbs2TbHnT0fkyZNEgAIa9eudXpfdT3vjz76qABAmDJlis3x77//XgAgtGnTRjCZTHaxDBkyRGjevLnN85Gfny+oVCph6NChAgBh+vTp0mVFRUVCTEyMoFarhf3790vHTSaTcP/99wsAhFmzZtndj0qlEvbs2WMT2zPPPCMAEMaNG1fnY33yyScFuVwudO3a1elr6r///a8AQNi0aVOdt3fttdcKAOxiquv1VJOj27506ZLQsmVLITY21qXbcMRgMAitW7cWIiMjbY6fPHnSbq3RaBRuu+02QaFQCKdPn/Y4vunTpwsAhI0bN0rHsrOzBY1GI3Tr1s3h+7V79+4CAGH58uV2cVX/mens/T5gwAAhMj
JSaNeund3POjGeIUOGCEqlUtDpdNJler1eSEhIkF6j1W/XZDIJrVu3FgAIGRkZNrcpvtcee+wxjx5HTXW9Lx2957O
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Coerce both columns to numbers; entries that cannot be parsed become NaN\n",
"for numeric_col in ('Price', 'Prod. year'):\n",
"    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')\n",
"\n",
"# Discard rows where the price or the production year is missing\n",
"df.dropna(subset=['Price', 'Prod. year'], inplace=True)\n",
"\n",
"# Mean price for every production year, returned as a two-column frame\n",
"avg_price_per_year = df.groupby('Prod. year', as_index=False)['Price'].mean()\n",
"years = avg_price_per_year['Prod. year']\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 6))\n",
"ax.plot(years, avg_price_per_year['Price'], marker='o')\n",
"ax.set_title('Средняя цена автомобиля в зависимости от года выпуска', fontsize=14)\n",
"ax.set_xlabel('Год выпуска')\n",
"ax.set_ylabel('Средняя цена ($)')\n",
"ax.grid(True)\n",
"\n",
"# Put a tick on every fourth year only, so the x axis stays readable\n",
"ax.set_xticks(years[::4])\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Пропущенные значения в данных:\n",
" ID 0\n",
"Price 0\n",
"Levy 5819\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod. year 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64\n",
"\n",
"Выбросы в числовых столбцах:\n",
" ID 2531\n",
"Price 1073\n",
"Prod. year 982\n",
"Cylinders 4870\n",
"Airbags 0\n",
"dtype: int64\n",
"\n",
"Распределение автомобилей по производителям:\n",
" Manufacturer\n",
"HYUNDAI 3769\n",
"TOYOTA 3662\n",
"MERCEDES-BENZ 2076\n",
"FORD 1111\n",
"CHEVROLET 1069\n",
" ... \n",
"LAMBORGHINI 1\n",
"PONTIAC 1\n",
"SATURN 1\n",
"ASTON MARTIN 1\n",
"GREATWALL 1\n",
"Name: count, Length: 65, dtype: int64\n",
"\n",
"Последний год производства в данных: 2020\n",
"\n",
"начальный год: 1939\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Treat the \"-\" placeholder as a missing value in the numeric-like columns\n",
"numeric_columns = ['Price', 'Levy', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
"df[numeric_columns] = df[numeric_columns].replace(to_replace=\"-\", value=np.nan)\n",
"\n",
"# Count the missing values per column after the replacement\n",
"missing_values = df.isna().sum()\n",
"print(\"Пропущенные значения в данных:\\n\", missing_values)\n",
"\n",
"# Only columns that already have a numeric dtype take part in the outlier check\n",
"numerical_data = df.select_dtypes(include=[np.number])\n",
"\n",
"# Outliers by the 1.5 * IQR rule, counted per column\n",
"Q1 = numerical_data.quantile(0.25)\n",
"Q3 = numerical_data.quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"below_fence = numerical_data < (Q1 - 1.5 * IQR)\n",
"above_fence = numerical_data > (Q3 + 1.5 * IQR)\n",
"outliers = (below_fence | above_fence).sum()\n",
"print(\"\\nВыбросы в числовых столбцах:\\n\", outliers)\n",
"\n",
"# Class balance: number of cars per manufacturer\n",
"category_counts = df['Manufacturer'].value_counts()\n",
"print(\"\\nРаспределение автомобилей по производителям:\\n\", category_counts)\n",
"\n",
"# Data recency: earliest and latest production year present in the data\n",
"production_years = df['Prod. year']\n",
"max_production_year = production_years.max()\n",
"min_year_from_car = production_years.min()\n",
"\n",
"print(\"\\nПоследний год производства в данных:\", max_production_year)\n",
"print(\"\\nначальный год:\", min_year_from_car)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В наборе данных есть пропущенные значения в столбце Levy. Выбросы встречаются в столбцах Price, Prod. year, Cylinders. Набор данных сильно смещён в сторону нескольких производителей, таких как Hyundai, Toyota, и Mercedes-Benz, которые составляют большую часть данных.\n",
"В то же время, такие бренды, как Lamborghini, Pontiac, Saturn, и Aston Martin, представлены всего одной записью. Последний год выпуска в наборе данных — 2020: Данные устарели на несколько лет. Это может означать, что новые модели автомобилей, выпущенные после 2020 года, не учтены, что снижает актуальность данных для анализа современного рынка автомобилей."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### **Примеры решения обнаруженных проблем**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"В столбце Levy имеется 5819 пропущенных значений. Заполним пропуски, подставив 0."
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# The outlier summary above leaves Levy out of the numeric columns, i.e. the\n",
"# column is not stored with a numeric dtype. Convert it to numbers first so the\n",
"# zero-fill produces a numeric column instead of strings mixed with int 0.\n",
"df['Levy'] = pd.to_numeric(df['Levy'], errors='coerce').fillna(0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Заменим выбросы на медиану"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"for col in ('Price', 'Prod. year', 'Cylinders'):\n",
"    column = df[col]\n",
"    q1, q3 = column.quantile(0.25), column.quantile(0.75)\n",
"    iqr = q3 - q1  # interquartile range\n",
"    lower_fence = q1 - 1.5 * iqr\n",
"    upper_fence = q3 + 1.5 * iqr\n",
"\n",
"    # Values outside the 1.5 * IQR fences are treated as outliers\n",
"    is_outlier = (column < lower_fence) | (column > upper_fence)\n",
"\n",
"    # Outliers take the column median; every other value is kept as is\n",
"    df[col] = np.where(is_outlier, column.median(), column)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Если нужно использовать для анализа машины определенных лет выпуска, можем отфильтровать данные следующим образом:"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Keep only cars produced in 2015 or later; df is overwritten, so every cell\n",
"# that runs afterwards sees the filtered rows only\n",
"df = df.loc[df['Prod. year'] >= 2015]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### **Оценка качества данных**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для каждой категориальной переменной, такой как Manufacturer, Model, Category, можно подсчитать количество уникальных значений. Это даст представление о разнообразии и информативности данных.\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ID 18924\n",
"Price 2315\n",
"Levy 559\n",
"Manufacturer 65\n",
"Model 1590\n",
"Prod. year 54\n",
"Category 11\n",
"Leather interior 2\n",
"Fuel type 7\n",
"Engine volume 107\n",
"Mileage 7687\n",
"Cylinders 13\n",
"Gear box type 4\n",
"Drive wheels 3\n",
"Doors 3\n",
"Wheel 2\n",
"Color 16\n",
"Airbags 17\n",
"dtype: int64\n"
]
}
],
"source": [
"# Number of distinct values in every column\n",
"unique_counts = df.nunique(axis=0)\n",
"print(unique_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Мы видим здесь 65 брендов машин и 1590 различных моделей, что говорит о разнообразии данных."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проверим на соответствие реальным данным"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIVCAYAAADmnq8BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACNm0lEQVR4nOzdd1gUV9sG8GcpYkFQVMCCiA1FBVFs2LCBii3WqLF3TexdrFFJokaNJfZeYu+9Y0uMsXeNBRsoqIBUYe/vD76dlxVwxSAL7P27Lq6Emdn12WGn3HPOnFEBgBAREREREVGyjPRdABERERERUXrH4ERERERERKQDgxMREREREZEODE5EREREREQ6MDgRERERERHpwOBERERERESkA4MTERERERGRDgxOREREREREOpjouwAiIiKipMTExMibN29ErVZLgQIF9F0OERk4tjgRERFRunHx4kXp0KGD5M2bV8zMzCR//vzSqlUrfZeVyPTp00WtVouIiFqtFl9fXz1XRERfG4MTEWUIq1atEpVKJRcvXkxyvoeHh5QtWzaNqyKi1LRr1y6pUaOG3Lp1S6ZNmyZHjhyRI0eOyOLFi/VdWiKrV6+WmTNnyrNnz2TWrFmyevVqfZdERF8Zu+oRERGR3r1580Z69uwpXl5esmXLFsmSJYu+S/qkKVOmSOfOnWXUqFFiZmYm69at03dJRPSVMTgRERGR3q1cuVKioqJk1apV6T40iYi0a9dO6tSpIw8ePJASJUpIvnz59F0SEX1l7KpHRJnanTt3pHXr1mJlZSVZs2YVNzc32b17t9YyyXUDDAoKEpVKJZMmTVKmTZo0SVQqlQQFBSX7bxYpUkS6du36yboeP34sKpUq2R8PDw+t5V+9eiU9evQQGxsbyZo1q7i4uHx216Ck6undu7dkzZpVTp48qUzbtWuXeHt7S4ECBcTMzEyKFSsmP/74o8TFxSnLeHh4fLJulUql9e+sW7dOKlasKNmyZRMrKyv59ttv5enTp1rLJPee9evXV5aJjY2VH3/8UYoVKyZmZmZSpEgRGTt2rERHRyf6rE2aNJHDhw9L+fLlJWvWrOLk5CTbt2/XWk7zN3/8+LEyTa1Wi7Ozs6hUKlm1apXW8lu3bhU3NzfJmTOnVo0zZ85UlunatauoVCopX758or+Br6+vqFQqMTc3TzTvY6dPn5Y2bdpI4cKFxczMTOzs7GTIkCESGRmptZzm39P85M6dWzw8POT06dNa6+NTf6siRYooy4aHh8uwYcPEzs5OzMzMxNHRUWbOnCkAtP5dzWvnzJmTqPZSpUqJSqWS77//Xmv6w4cPpU2bNmJlZSXZs2eXqlWryr59+7SW+fPPP6V8+fIyffp0pYYSJUrITz/9pNxLlLCGhNuliMiMGTOS3HaSWlean4/fIyX7i8ePH4u1tbW4u7tLnjx5kv3uEFHmwRYnIsq0bt68KdWrV5eCBQvK6NGjJUeOHLJ582Zp0aKFbNu2Tb755ht9lyjt27eXxo0ba00bM2aM1u+RkZHi4eEhDx48kO+//14cHBxky5Yt0rVrV3n37p0MGjQoRf/mxIkTZfny5bJp0yatk8xVq1aJubm5DB06VMzNzeX48eMyYcIECQ0NlRkzZoiIyLhx46Rnz54iEh8shwwZIr1795aaNWsm+nemTZsm48ePl7Zt20rPnj3l9evXMm/ePKlVq5ZcvnxZcuXKpSxbqFChRDfX58+fX/n/nj17yurVq6V169YybNgw+euvv8TX11du374tO3bs0Hrd/fv3pV27dtK3b1/p0qWLrFy5Utq0aSMHDx6UBg0aJLte1q5dK9evX080/fz589K2bVtxcXGRn376SSwtLZXP/jETExO5efOmXL58WVxdXbXWbdasWZP9txPasmWLRERESL9+/SRPnjxy4cIFmTdvnjx79ky2bNmitWzevHll9uzZIiLy7NkzmTt3rjRu3FiePn0quXLlkjlz5s
j79+9FROT27dsyffp0GTt2rJQuXVpERAlyAKRZs2Zy4sQJ6dGjh5QvX14OHTokI0aMkOfPnyv/hkbWrFll5cqVMnjwYGXauXPn5MmTJ4k+T2BgoLi7u0tERIQMHDhQ8uTJI6tXr5ZmzZrJ1q1ble0wODhYzpw5I2fOnJHu3btLxYoV5dixYzJmzBh5/PixLFq0KNl19u7dO52DMyRcVyIinTp10pr/X/YXyX13iCiTARFRBrBy5UqICP7+++8k59euXRtlypTRmlavXj2UK1cOUVFRyjS1Wg13d3eUKFFC53u/fv0aIoKJEycq0yZOnAgRwevXr5Ot1d7eHl26dPnk53n06BFEBDNmzEg0r0yZMqhdu7by+5w5cyAiWLdunTItJiYG1apVg7m5OUJDQz/5byWsZ/HixRARzJs3L9FyERERiab16dMH2bNn11qHH3+GlStXJpr3+PFjGBsbY9q0aVrTr1+/DhMTE63pSf3tErpy5QpEBD179tSaPnz4cIgIjh8/rvVZRQTbtm1TpoWEhCB//vxwdXVVpmn+5o8ePQIAREVFoXDhwmjUqFGizzRmzBiICF6+fJnosyf8+3Xp0gU5cuRA06ZN8f333yvTT58+jWzZsqFFixbIkSNHsp9TI6m/g6+vL1QqFZ48eaL179nb22stt2TJEogILly4kOg9Tpw4ARHBiRMnEs3buXMnRARTp07Vmt66dWuoVCo8ePBAmSYiaN26NUxMTHDx4kVleo8ePdChQweICAYMGKBMHzx4MEQEp0+fVqaFhYXBwcEBRYoUQVxcHID474GIYNKkSVo1dO3aFSKC69eva9WQcLscOXIkrK2tUbFiRa1tR6Njx45wcHDQmvbxe6R0f/E53x0iylzYVY+IMqU3b97I8ePHpW3bthIWFiZBQUESFBQkwcHB4uXlJffv35fnz59rvSYkJERZLigoSN68efPJ9w8KCpLw8PCv/VFk//79YmtrK+3bt1emmZqaysCBA+X9+/dy6tSpz3qfXbt2Sf/+/WXEiBGJulKJiGTLlk35f806q1mzpkRERMidO3dSVPP27dtFrVZL27Zttdapra2tlChRQk6cOPHZ77V//34RERk6dKjW9GHDhomIJOryVaBAAa3WAQsLC+ncubNcvnxZAgICkvw3FixYIMHBwTJx4sRE88LCwsTIyEirhexTunfvLhs2bFC6Ea5cuVJatmwplpaWn/X6hH+H8PBwCQoKEnd3dwEgly9f1lpWrVYr6/bKlSuyZs0ayZ8/v9Ki9Ln2798vxsbGMnDgQK3pw4YNEwBy4MABrek2Njbi7e0tK1euFBGRiIgI2bx5s3Tr1i3J965cubLUqFFDmWZubi69e/eWx48fy61bt5TpxsbGiVrykvs7azx//lzmzZsn48ePT7YrZExMjJiZmSX38b9of6Hxqe8OEWUuBh2c/Pz8pGnTplKgQAFRqVSyc+fOFL1ec6/Dxz85cuT4OgUT0Wd78OCBAJDx48dLvnz5tH40JzivXr3Sek39+vW1lnN0dEz2/R0dHSVfvnxibm4uNjY24uPjo3UvUGp68uSJlChRQoyMtHfZmpPjpLpHfezKlSvSvn17iYuLSzYQ3rx5U7755huxtLQUCwsLyZcvn3z33XciEh8qU+L+/fsCQLlpPuHP7du3E637T3ny5IkYGRlJ8eLFtabb2tpKrly5En3+4sWLJ7rXqmTJkiIiWvc0aYSEhMj06dNl6NChYmNjk2h+tWrVRK1Wy6BBg+Tff/+VoKAgefv2bbL1ent7i4mJiezatUvCw8OTDRTJ8ff3l65du4qVlZWYm5tLvnz5pHbt2kqtCT19+lRZr66urvLvv//Ktm3bPuteqoSePHkiBQoUkJw5c2pN/9R3rFu3bkpA3LJli+TOnVvq1q2b5HsntS19/N4qlUoKFCggFhYWWss5OjqKkZFRkn87kfiupwUKFJA+ffok+/nevXv3yXXyJfsLEd3fHSLKXAz6Hq
fw8HBxcXGR7t27S8uWLVP8+uHDh0vfvn21ptWrV08qVaqUWiUS0RfS3Ew+fPhw8fLySnKZj0/EFyxYoJxgi4iEhoY
"text/plain": [
"<Figure size 1000x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Price spread per car category, drawn on an explicit figure/axes pair\n",
"fig, ax = plt.subplots(figsize=(10, 5))\n",
"sns.boxplot(data=df, x='Category', y='Price', ax=ax)\n",
"ax.set_title('Цены по категориям автомобилей')\n",
"ax.set_xlabel('Категория')\n",
"ax.set_ylabel('Цена')\n",
"# Tilt the category names so they do not overlap\n",
"plt.xticks(rotation=45)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Данная диаграмма была получена до устранения выбросов."
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIVCAYAAAC+4TwYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACUDUlEQVR4nOzdd1gUV9sG8HtBmiIoFhAVxC4oNixgjaLYosYag723xIhdUaMRNYlGjYKKPZbEGrvYSzQYa+w1ItjAgoBShX2+P/x2XlbQKCJDuX/XxaXMnN15dpjdnXvOzBmNiAiIiIiIiIgowxmoXQAREREREVFOxUBGRERERESkEgYyIiIiIiIilTCQERERERERqYSBjIiIiIiISCUMZERERERERCphICMiIiIiIlIJAxkREREREZFKcqldABEREVFGSkhIQHh4OLRaLWxtbdUuh4hyOPaQERERUbZ35swZfPXVVyhYsCBMTExQpEgRtG/fXu2yUpg+fTq0Wi0AQKvVYsaMGSpXRESfGgMZEeVoK1euhEajwZkzZ1Kd37BhQ1SsWDGDqyKi9LRt2zbUrVsXV69ehY+PD/bv34/9+/dj8eLFapeWwqpVqzBr1izcv38fs2fPxqpVq9QuiYg+MZ6ySERERNlWeHg4+vbtCw8PD2zcuBHGxsZql/ROU6dORffu3TFmzBiYmJhgzZo1apdERJ8YAxkRERFlWytWrEBcXBxWrlyZ6cMYAHTu3BmfffYZbt++jTJlyqBQoUJql0REnxhPWSQiSoPr16+jQ4cOsLKygqmpKVxcXLB9+3a9Nm87HfLp06fQaDT47rvvlGnfffcdNBoNnj59+tZllihRAj179nxnXXfv3oVGo3nrT8OGDfXaP378GH369IG1tTVMTU1RuXLl9z5FKrV6+vfvD1NTUxw5ckSZtm3bNrRs2RK2trYwMTFBqVKl8P333yMpKUlp07Bhw3fWrdFo9JazZs0aVK9eHWZmZrCyssKXX36Je/fu6bV523O6u7srbRITE/H999+jVKlSMDExQYkSJTB+/HjEx8eneK2tWrXCvn37UKVKFZiamsLR0RFbtmzRa6f7m9+9e1eZptVq4ezsDI1Gg5UrV+q137RpE1xcXJA3b169GmfNmqW06dmzJzQaDapUqZLibzBjxgxoNBqYm5unmPemP//8Ex07doSdnR1MTExQvHhxDB8+HLGxsXrtdMvT/eTPnx8NGzbEn3/+qbc+3vW3KlGihNI2OjoaI0aMQPHixWFiYoJy5cph1qxZEBG95eoeO3fu3BS1ly9fHhqNBkOHDtWbfufOHXTs2BFWVlbInTs3ateujV27dum1OXnyJKpUqYLp06crNZQpUwYzZ85UrtVKXkPy9yUA/PTTT6m+d1JbV7qfN5/jQz4v7t69i8KFC8PNzQ0FChR467ZDRNkHe8iIiD7QlStXUKdOHRQtWhRjx45Fnjx5sGHDBrRt2xabN2/GF198oXaJ6NKlC1q0aKE3bdy4cXq/x8bGomHDhrh9+zaGDh0KBwcHbNy4ET179kRERASGDRv2QcucPHkyli1bhvXr1+vtvK5cuRLm5ubw8vKCubk5Dh06hEmTJiEqKgo//fQTAGDChAno27cvgNeBdfjw4ejfvz/q1auXYjk+Pj6YOHEiOnXqhL59++LJkyeYP38+6tevj/PnzyNfvnxK22LFiqUYFKFIkSLK//v27YtVq1ahQ4cOGDFiBP7++2/MmDED165dwx9//KH3uFu3bqFz584YOHAgevTogRUrVqBjx44ICAhAkyZN3rpeVq9ejUuXLqWYHhgYiE6dOqFy5cqYOXMmLC0tldf+ply5cuHKlSs4f/48qlatqrduTU1N37rs5DZu3IiYmBgMGjQIBQoUwKlTpzB//nzcv38fGzdu1GtbsGBBzJkzBwBw//59zJs3Dy1atMC9e/eQL18+zJ07Fy9fvgQAXLt2DdOnT8f48eNRoUIFAF
ACooigdevWOHz4MPr06YMqVapg7969GDVqFB48eKAsQ8fU1BQrVqzAt99+q0z766+/EBwcnOL1hIWFwc3NDTExMfjmm29QoEABrFq1Cq1bt8amTZuU9+GzZ89w/PhxHD9+HL1790b16tVx8OBBjBs3Dnfv3sWiRYveus4iIiL+c1CN5OsKALp166Y3/2M+L9627RBRNiNERDnYihUrBICcPn061fkNGjQQJycnvWmNGzeWSpUqSVxcnDJNq9WKm5ublClT5j+f+8mTJwJAJk+erEybPHmyAJAnT568tVZ7e3vp0aPHO19PUFCQAJCffvopxTwnJydp0KCB8vvcuXMFgKxZs0aZlpCQIK6urmJubi5RUVHvXFbyehYvXiwAZP78+SnaxcTEpJg2YMAAyZ07t946fPM1rFixIsW8u3fviqGhofj4+OhNv3TpkuTKlUtvemp/u+T++ecfASB9+/bVmz5y5EgBIIcOHdJ7rQBk8+bNyrTIyEgpUqSIVK1aVZmm+5sHBQWJiEhcXJzY2dlJ8+bNU7ymcePGCQB59OhRitee/O/Xo0cPyZMnj3z++ecydOhQZfqff/4pZmZm0rZtW8mTJ89bX6dOan+HGTNmiEajkeDgYL3l2dvb67Xz9/cXAHLq1KkUz3H48GEBIIcPH04xb+vWrQJApk2bpje9Q4cOotFo5Pbt28o0ANKhQwfJlSuXnDlzRpnep08f+eqrrwSADBkyRJn+7bffCgD5888/lWkvXrwQBwcHKVGihCQlJYnI6+0AgHz33Xd6NfTs2VMAyKVLl/RqSP6+HD16tBQuXFiqV6+u997R8fT0FAcHB71pbz7Hh35evM+2Q0TZC09ZJCL6AOHh4Th06BA6deqEFy9e4OnTp3j69CmePXsGDw8P3Lp1Cw8ePNB7TGRkpNLu6dOnCA8Pf+fzP336FNHR0Z/6pWD37t2wsbFBly5dlGlGRkb45ptv8PLlSxw9evS9nmfbtm0YPHgwRo0aleKUMgAwMzNT/q9bZ/Xq1UNMTAyuX7/+QTVv2bIFWq0WnTp10lunNjY2KFOmDA4fPvzez7V7924AgJeXl970ESNGAECKU99sbW31ejMsLCzQvXt3nD9/HqGhoakuw9fXF8+ePcPkyZNTzHvx4gUMDAz0evTepXfv3li3bp1yOuWKFSvQrl07WFpavtfjk/8doqOj8fTpU7i5uUFEcP78eb22Wq1WWbf//PMPfv31VxQpUkTpAXtfu3fvhqGhIb755hu96SNGjICIYM+ePXrTra2t0bJlS6xYsQIAEBMTgw0bNqBXr16pPnfNmjVRt25dZZq5uTn69++Pu3fv4urVq8p0Q0PDFD2Pb/s76zx48ADz58/HxIkT33pKaEJCAkxMTN728tP0eaHzrm2HiLIXBjIiog9w+/ZtiAgmTpyIQoUK6f3odpweP36s9xh3d3e9duXKlXvr85crVw6FChWCubk5rK2t4e3trXetVXoKDg5GmTJlYGCg/1Wg2+lO7TSxN/3zzz/o0qULkpKS3ho0r1y5gi+++AKWlpawsLBAoUKF0LVrVwCvw+qHuHXrFkREGewg+c+1a9dSrPt3CQ4OhoGBAUqXLq033cbGBvny5Uvx+kuXLp3iWrayZcsCgN41YzqRkZGYPn06vLy8YG1tnWK+q6srtFothg0bhn///RdPnz7F8+fP31pvy5YtkStXLmzbtg3R0dFvDSpvExISgp49e8LKygrm5uYoVKgQGjRooNSa3L1795T1WrVqVfz777/YvHnze12rllxwcDBsbW2RN29evenv2sZ69eqlBM+NGzcif/78aNSoUarPndp76c3n1mg0sLW1hYWFhV67cuXKwcDAINW/HfD6FFxbW1sMGDDgra8vIiLineskLZ8XwH9vO0SUvfAaMiKiD6AbBGDkyJHw8PBItc2bO/i+vr7KjjsAREVFvfWGtJs3b4aFhQViYmLwxx9/wMfHBxYWFhg9enQ6vYL0deHCBT
Rv3hyNGzfGqFGj0LVrV73rxyIiItCgQQNYWFhg6tSpKFWqFExNTXHu3DmMGTMmxaAK/0Wr1UKj0WDPnj0wNDRMMf9
"text/plain": [
"<Figure size 1000x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same chart as above, drawn from the current state of df\n",
"fig, ax = plt.subplots(figsize=(10, 5))\n",
"sns.boxplot(data=df, x='Category', y='Price', ax=ax)\n",
"ax.set_title('Цены по категориям автомобилей')\n",
"ax.set_xlabel('Категория')\n",
"ax.set_ylabel('Цена')\n",
"# Tilt the category names so they do not overlap\n",
"plt.xticks(rotation=45)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"После устранения выбросов диаграмма выглядит лучше, хотя здесь все равно присутствуют очень маленькие значения, что не соответствует реальным ценам на машины. Из этого можем сделать вывод, что цены в наборе данных не особо соответствуют реальности."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 13465\n",
"Размер контрольной выборки: 2886\n",
"Размер тестовой выборки: 2886\n"
]
}
],
"source": [
"# Загрузка данных\n",
"new1 = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
"\n",
"# Разбиение на обучающую и временную выборки\n",
"train_data, temp_data = train_test_split(new1, test_size=0.3, random_state=42)\n",
"\n",
"# Разбиение временной выборки на контрольную и тестовую\n",
"val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)\n",
"\n",
"print(\"Размер обучающей выборки:\", len(train_data))\n",
"print(\"Размер контрольной выборки:\", len(val_data))\n",
"print(\"Размер тестовой выборки:\", len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAnIAAAIwCAYAAAACvobyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HuXjnQVAUXsXaPBhr2CiL13RY0llthijTUaYm+xxESKsfceFbtR7GLXqMEuWAFFEYHn+8Nv591ldwGNRje5f9e1F+zMmZkzs1OeOXPOGZWICIiIiIjI5Kg/dQaIiIiI6P0wkCMiIiIyUQzkiIiIiEwUAzkiIiIiE8VAjoiIiMhEMZAjIiIiMlEM5IiIiIhMFAM5IiIiIhPFQI6I/raEhATcuXMHz549+9RZoQ/s+fPnuHnzJhISEj51VojIAAZyRPRe1qxZg9q1a8Pe3h52dnbInTs3pkyZ8qmzZRJevHiBWbNmKd9jY2Mxb968T5chLSKCRYsWoWLFirC1tYWDgwPy5s2LpUuXfuqsfTDHjx+HpaUlbt269amzQgQAWLhwIXLnzo3Xr1+/87SqT/GKrtDQUAQGBirfrayskDt3bvj6+mL06NHIkSPHP50lInoHw4cPx+TJk9G4cWO0adMG2bJlg0qlQqFCheDp6fmps/fZS0lJgaOjI37++WdUq1YN06dPx5UrV7Bjx45PnTW0bdsWq1atQufOndGgQQM4OjpCpVKhVKlSyJ49+6fO3gdRt25deHh4ICws7FNnhQgAkJiYiDx58mDkyJHo37//O01r/pHylCkTJkxA3rx5kZiYiD/++AMLFizA9u3bceHCBdja2n7KrBGREQcOHMDkyZMRFBSE4cOHf+rsmCQzMzOMHz8enTp1QmpqKhwcHLBt27ZPnS0sWbIEq1atwtKlS9GuXbtPnZ2PIjIyErt378aRI0c+dVaIFNbW1ujcuTNmzJiBfv36QaVSZXraT1oid+LECZQtW1YZPnjwYMyYMQPLly9H27Zt/+lsEVEmNGzYEE+fPsXhw4c/dVZM3t27d3Hnzh0ULVoUTk5Onzo7KFmyJEqVKoVly5Z96qx8NN988w02btyImzdvvtPFkuhjO3XqFMqWLYs9e/agVq1amZ7us6ojp8l4VFQUAODp06cYMmQISpYsCTs7Ozg4OMDf3x9nz57VmzYxMRHjxo1DoUKFYG1tDXd3dzRr1gw3btwAAOWgNfapUaOGMq/9+/dDpVJh1apVGDlyJNzc3JAlSxY0atQId+7c0Vv2sWPHUK9ePTg6OsLW1hbVq1c3epGrUaOGweWPGzdOL+3SpUvh7e0NGxsbuLi4oE2bNgaXn966aUtNTcWsWbNQvHhxWFtbI0eOHOjZs6deBfU8efKgQYMGesvp27ev3jwN5X3q1Kl62xQAXr9+jbFjx6JAgQKwsrKCp6cnhg4dmqk6ATVq1NCb36RJk6BWq7F8+fL32h7Tpk1DpUqVkDVrVtjY2MDb2xtr1641uPylS5eifPnysLW1hbOzM6pVq4Zdu3bppPn9999RvXp12Nvbw8HBAeXKldPL25o1a5TfNFu2bOjQoQPu3bunk6ZLly46eXZ2dkaNGjVw6NChDLfT35kWAPbu3YuqVasiS5YscHJyQuPGjXH58mWdNEePHkWJEiXQpk0buLi4wMbGBuXKlcPGjRuVNC9evECWLFnwzTff6C3j7t27MDMzQ1BQkJLnPHny6KVLu2/dunULX3/9NQoXLgwbGxtkzZoVLVu2xM2bN3Wm0xy/+/fvV4adOHECdevWhb29PbJkyWJwm4SGhkKlUuHkyZPKsMePHxvcxxs0aGAwz5k5F4wbN07ZF3PlygUfHx+Ym5vDzc1NL9+GaKbXfOzt7VG+fHmd7Q+8PWZKlChhdD6a4yQ0NBTA2wYrFy5cgKenJwICAuDg4GB0WwHAX3/9hZYtW8LFxQW2traoWLGiXqniu5xL3+
UYf5dzblobN25ErVq1DAZx6Z070u5nmVl/Y5KTk/H9998jf/78sLKyUh6ppT0X5smTR1m+Wq2Gm5sbWrdujdu3b+uke9dz+65du1C6dGlYW1ujWLFiWL9+vV4eY2NjMXDgQOTJkwdWVlbIlSsXOnXqhMePHwMAgoKCULx4cdja2sLFxQWNGjXC6dOnjW7PtPtnYmIinJ2doVKpMG3aNGV42v077Uezv2rLzG+W2WuVZp81dC2ws7NDly5dlO+ac4ahz927dwEA586dQ5cuXZAvXz5YW1vDzc0NXbt2xZMnT/Tm7+3tDRcXF2zatElvXHo+6aPVtDRBV9asWQG8PVA2btyIli1bIm/evIiJicHPP/+M6tWr49KlS/Dw8ADwtr5JgwYNsGfPHrRp0wbffPMNnj9/jvDwcFy4cAH58+dXltG2bVvUr19fZ7kjRowwmJ9JkyZBpVJh2LBhePjwIWbNmoU6deogMjISNjY2AN5e+Pz9/eHt7Y2xY8dCrVYjJCQEtWrVwqFDh1C+fHm9+ebKlUu5iL148QK9e/c2uOzRo0ejVatW6N69Ox49eoS5c+eiWrVqOHPmjMG79x49eqBq1aoAgPXr12PDhg0643v27KmUhvbv3x9RUVH46aefcObMGRw+fBgWFhYGt8O7iI2NVdZNW2pqKho1aoQ//vgDPXr0QNGiRXH+/HnMnDkTf/75p95BnpGQkBB89913mD59utFHQBltj9mzZ6NRo0Zo3749kpKSsHLlSrRs2RJbt25FQECAkm78+PEYN24cKlWqhAkTJsDS0hLHjh3D3r174evrC+DtAd21a1cUL14cI0aMgJOTE86cOYMdO3Yo+dNs+3LlyiEoKAgxMTGYPXs2Dh8+rPebZsuWDTNnzgTwNvCZPXs26tevjzt37mRYcvO+0+7evRv+/v7Ily8fxo0bh1evXmHu3LmoXLkyTp8+rQQuT548waJFi2BnZ4f+/fsje/bsWLp0KZo1a4Zly5ahbdu2sLOzQ9OmTbFq1SrMmDEDZmZmynJWrFgBEUH79u3TXY+0Tpw4gSNHjqBNmzbIlSsXbt68iQULFqBGjRq4dOmS0eoY169fR40aNWBra4tvv/0Wtra2+OWXX1CnTh2Eh4ejWrVq75QPY97nXKAxffp0xMTEvNPyfvvtNwBvg8358+ejZcuWuHDhAgoXLvxe+ddcWCZPngw3Nzd8++23sLa2NritYmJiUKlSJbx8+RL9+/dH1qxZERYWhkaNGmHt2rVo2rSpzrwzcy5Ny9gx/ne2871793D79m18+eWX6W4L7evE9u3bsWLFCp3x77r+aXXv3h1hYWFo0aIFBg8ejGPHjiEoKAiXL1/WO09VrVoVPXr0QGpqKi5cuIBZs2bh/v37OsH1u5zbr127htatW6NXr17o3LkzQkJC0LJlS+zYsQN169YF8Pa6VLVqVVy+fBldu3bFl19+icePH2Pz5s24e/cusmXLhp07dyIgIAAFChRATEwMli1bhsqVK2PHjh2oXr26zjpYW1sjJCQETZo0UYatX78eiYmJRrfRggULYGdnp3yPiorCmDFjjKZv2rQpmjVrBgA4dOgQFi1alM4vYPxa9T401cS0ubi4AADCw8Px119/ITAwEG5ubrh48SIWLVqEixcv4ujRo3o3FF9++eW7P+2QTyAkJEQAyO7du+XRo0dy584dWblypWTNmlVsbGzk7t27IiKSmJgoKSkpOtNGRUWJlZWVTJgwQRkWHBwsAGTGjBl6y0pNTVWmAyBTp07VS1O8eHGpXr268n3fvn0CQHLmzCnx8fHK8NWrVwsAmT17tjLvggULip+fn7IcEZGXL19K3rx5pW7dunrLqlSpkpQoUUL5/ujRIwEgY8eOVYbdvHlTzMzMZNKkSTrTnj9/XszNzfWGX7t2TQBIWFiYMmzs2LGi/fMeOnRIAMiyZct0pt2xY4fecC8vLwkICNDLe58+fSTtLpM270OHDhVXV1fx9vbW2a
a//fabqNVqOXTokM70CxcuFABy+PBhveVpq169ujK/bdu2ibm5uQwePNhg2sxsD5G3v5O2pKQkKVGihNSqVUtnXmq
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# The manufacturer label is the target; every remaining column is a feature\n",
"y = train_data['Manufacturer']\n",
"X = train_data.drop(columns='Manufacturer')\n",
"# Функция для визуализации распределения\n",
"def plot_class_distribution(data, title, column='Manufacturer'):\n",
"    \"\"\"Show a bar chart with the number of rows in each class of `column`.\n",
"\n",
"    Args:\n",
"        data: DataFrame that contains `column`.\n",
"        title: Title displayed above the chart.\n",
"        column: Name of the class column to count; defaults to 'Manufacturer'\n",
"            so existing two-argument calls keep working.\n",
"    \"\"\"\n",
"    # value_counts() orders the classes from most to least frequent\n",
"    data[column].value_counts().plot(kind='bar', title=title)\n",
"    plt.xlabel(column)\n",
"    plt.ylabel('Count')\n",
"    plt.show()\n",
"\n",
"# Class distribution of the training split before any resampling\n",
"plot_class_distribution(train_data, 'Распределение классов в обучающей выборке (до оверсэмплинга)')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Выполним оверсэмплинг и андерсэмплинг."
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoEAAAIwCAYAAAD5xbFoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HuX3hGRpqjYezRYgr2g2HvBgr0llqgx1lhjNPYWayLF3ks0iYq9oUYUsUeN2MEKKIIIPN8ffjsvy+4CEo2+796/69pLmTk7c2Z25swzM6eoRERAREREREZF/bEzQERERET/PgaBREREREaIQSARERGREWIQSERERGSEGAQSERERGSEGgURERERGiEEgERERkRFiEEhERERkhBgEEhEAICEhAXfv3sXz588/dlboPXvx4gWioqKQkJDwsbNCRJ8QBoFERmzTpk2oV68e7OzsYGtri/z582PGjBkfO1v/FV6+fIl58+Ypf8fGxmLRokUfL0PpiAiWL1+OL774AtbW1rC3t4eXlxdWr179sbP23pw+fRrm5ua4ffv2x84K0Ufz9OlT2NjY4Pfff8/R91U5HTYuODgYPXr0UP62sLBA/vz50aBBA4wbNw6urq45yhAR/TtGjRqF6dOno0WLFvD394ezszNUKhWKFSsGT0/Pj529T15qaiocHBywbNky1KxZE7Nnz8bVq1exe/fuj501dOzYERs2bEC3bt3QtGlTODg4QKVSoVy5csiTJ8/Hzt57Ub9+fXh4eCAkJORjZ4Xoo/r6669x7NgxhIeHv/N3Tf/pyidPngwvLy8kJSXh2LFjWLJkCX7//XdcvHgR1tbW/3TxRPQBHD58GNOnT8e0adMwatSoj52d/0omJiaYNGkSunbtirS0NNjb2+O333772NnCypUrsWHDBqxevRqdOnX62Nn5ICIiIrBv3z6cOHHiY2eF6KPr378/FixYgAMHDqBu3brv9N1//CTwzz//RMWKFZXp33zzDebMmYO1a9eiY8eOOVk0EX1gzZo1w7Nnz3D8+PGPnZX/evfu3cPdu3dRsmRJODo6fuzsoGzZsihXrhzWrFnzsbPywXz99dfYvn07oqKioFKpPnZ2iD66smXLokKFCli5cuU7fe+91wnURKG3bt0CADx79gzDhw9H2bJlYWtrC3t7ezRq1Ajnz5/X+W5SUhImTpyIYsWKwdLSEu7u7mjdujVu3rwJAMoJb+hTu3ZtZVmHDh2CSqXChg0bMGbMGLi5ucHGxgbNmzfH3bt3ddZ96tQpNGzYEA4ODrC2tkatWrUMXiBr166td/0TJ07USbt69Wp4e3vDysoKTk5O8Pf317v+zLYtvbS0NMybNw+lS5eGpaUlXF1d0a9fP53K/AULFkTTpk111jNw4ECdZerL+8yZM3X2KQC8fv0aEyZMQJEiRWBhYQFPT0+MGDECr1+/1ruv0qtdu7bO8n744Qeo1WqsXbs2R/tj1qxZqFq1KnLnzg0rKyt4e3tj8+bNete/evVqVK5cGdbW1siVKxdq1qyJvXv3aqX5448/UKtWLdjZ2cHe3h6VKlXSydumTZuU39TZ2RldunTB/fv3tdJ0795dK8+5cuVC7dq1cfTo0Sz30z/5LgAcOHAANWrUgI2NDRwdHdGiRQtcuXJFK83JkydRpkwZ+Pv7w8nJCVZWVqhUqRK2b9+upHn58iVsbGzw9ddf66zj3r17MDExwbRp05Q8FyxYUCddxmPr9u3b+Oqrr1C8eHFYWVkhd+7caNeuHaKiorS+pzl/Dx06pEz7888/Ub9+fdjZ2cHGxkbvPgkODoZKpcKZM2eUaU+ePNF7jDdt2lRvnrNTFkycOFE5FvPlywcfHx+YmprCzc1NJ9/6aL6v+djZ2aFy5cpa+x94e86UKVPG4HI050lwcDCAt417Ll68CE9PTzRp0gT29vYG9xUA/P3332jXrh2cnJxgbW2NL774Qudp5ruUpe
9yjr9LmZvR9u3bUbduXZ3yoGDBgpleI9JLSUnB999/j8KFC8PCwgIFCxbEmDFj9JZl2SkX3ncZbkh2zu/sHl/Aux3vV69eRfv27WFvb4/cuXPj66+/RlJSks4yMytrjx49Cl9fXzg7O8PKygoVKlTAkiVLkPF5lOY627JlS53l9+vXDyqVSuvceJf4QCNjWavvWq6vbLt79y6srKygUqm0yq53ve7q+0yZMgUAkJycjPHjx8Pb2xsODg6wsbFBjRo1cPDgQZ3lA2+rR+zcuVNnP2blH78OzkgTsOXOnRvA20Jm+/btaNeuHby8vBATE4Nly5ahVq1auHz5Mjw8PAC8rV/TtGlT7N+/H/7+/vj666/x4sULhIaG4uLFiyhcuLCyjo4dO6Jx48Za6x09erTe/Pzwww9QqVQYOXIkHj16hHnz5sHX1xcRERGwsrIC8PakatSoEby9vTFhwgSo1WoEBQWhbt26OHr0KCpXrqyz3Hz58ikXwJcvX+LLL7/Uu+5x48ahffv26N27Nx4/foyFCxeiZs2aOHfunN6nBn379kWNGjUAAFu3bsW2bdu05vfr1095Cjt48GDcunULP/30E86dO4fjx4/DzMxM7354F7Gxscq2pZeWlobmzZvj2LFj6Nu3L0qWLIkLFy5g7ty5+Ouvv/QWMJkJCgrCd999h9mzZxt8bZXV/pg/fz6aN2+Ozp07Izk5GevXr0e7du2wa9cuNGnSREk3adIkTJw4EVWrVsXkyZNhbm6OU6dO4cCBA2jQoAGAtwFEz549Ubp0aYwePRqOjo44d+4cdu/ereRPs+8rVaqEadOmISYmBvPnz8fx48d1flNnZ2fMnTsXwNugaf78+WjcuDHu3r2b5ROjnH533759aNSoEQoVKoSJEyciMTERCxcuRLVq1XD27FmlMHv69CmWL18OW1tbDB48GHny5MHq1avRunVrrFmzBh07doStrS1atWqFDRs2YM6cOTAxMVHWs27dOogIOnfunOl2ZPTnn3/ixIkT8Pf3R758+RAVFYUlS5agdu3auHz5ssEqJDdu3EDt2rVhbW2Nb7/9FtbW1vj555/h6+uL0NBQ1KxZ853yYUhOygKN2bNnIyYm5p3Wt2rVKgBvA9XFixejXbt2uHjxIooXL56j/D99+hQAMH36dLi5ueHbb7+FpaWl3n0VExODqlWr4tWrVxg8eDBy586NkJAQNG/eHJs3b0arVq20lp2dsjQjQ+f4P9nP9+/fx507d/D555/rnV++fHl88803WtNWrlyJ0NBQrWm9e/dGSEgI2rZti2+++QanTp3CtGnTcOXKFa1yJjvlQnofsgzP7vmtkdXx9a6/Q/v27VGwYEFMmzYNJ0+exIIFC/D8+XOtp09ZlbUnTpyAi4sLvvvuO5iYmODw4cP46quvEBkZiSVLlmitz9LSEr/99hsePXoEFxcXAEBiYiI2bNgAS0tLvfvoXeIDQLusBYCAgACDaTXGjx+vN/h9V/Xr10fXrl21ppUvXx4AEB8fj19++QUdO3ZEnz598OLFC6xYsQJ+fn44ffq0kk7D29sbc+fOxaVLlzK9cdQhORQUFCQAZN++ffL48WO5e/eurF+/XnLnzi1WVlZy7949ERFJSkqS1NRUre/eunVLLCwsZPLkycq0wMBAASBz5szRWVdaWpryPQAyc+ZMnTSlS5eWWrVqKX8fPHhQAEjevHklPj5emb5x40YBIPPnz1eWXbRoUfHz81PWIyLy6tUr8fLykvr16+usq2rVqlKmTBnl78ePHwsAmTBhgjItKipKTExM5IcfftD67oULF8TU1FRn+vXr1wWAhISEKNMmTJgg6X+io0ePCgBZs2aN1nd3796tM71AgQLSpEkTnbwPGDBAMv7sGfM+YsQIcXFxEW9vb619umrVKlGr1XL06FGt7y9dulQAyPHjx3XWl16tWrWU5f32229iamoq33zzjd602dkfIm9/p/SSk5OlTJkyUrduXa1lqd
VqadWqlc6xqPnNY2Njxc7OTqpUqSKJiYl60yQnJ4uLi4uUKVNGK82uXbsEgIwfP16Z1q1bNylQoIDWcpYvXy4A5PT
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Применение оверсэмплинга\n",
"ros = RandomOverSampler(random_state=42)\n",
"X_resampled, y_resampled = ros.fit_resample(X, y)\n",
"\n",
"# Создание нового DataFrame для оверсэмплинга\n",
"train_resampled_over = pd.DataFrame(X_resampled, columns=X.columns)\n",
"train_resampled_over['Manufacturer'] = y_resampled\n",
"\n",
"# Визуализация распределения классов после оверсэмплинга\n",
"plot_class_distribution(train_resampled_over, 'Распределение классов в обучающей выборке (после оверсэмплинга)')"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAooAAAIwCAYAAAABJUqfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD9SElEQVR4nOzdd1gU1/s28HuX3rECVmwoVhRL7A1716hYEXtvMXYlloQYey8JRey9JcYaNRox9l6iUewiioCiiMLz/sG7891lZxFMIuGX+3Nde10w9ezszJlnzpx5RiMiAiIiIiKiVLSZXQAiIiIi+ndioEhEREREqhgoEhEREZEqBopEREREpIqBIhERERGpYqBIRERERKoYKBIRERGRKgaKRERERKSKgSLRf1h8fDzu37+PFy9eZHZR6G/28uVLREREID4+PrOLQkRZGANFov+YTZs2oX79+nBwcIC9vT0KFCiA7777LrOLlSW8evUK8+bNU/6PiYnB4sWLM69AekQEK1aswGeffQZbW1s4OjqiUKFCWL16dWYX7W9z8uRJWFpa4u7du5ldFKIsZdmyZShQoADevn2b4Xk1GXmFX2hoKPz9/ZX/raysUKBAATRs2BCTJk2Ci4tLhgtARJ/O2LFjMWPGDLRq1Qq+vr7ImTMnNBoNPDw8kD9//swu3r9eUlISnJycsHz5ctSqVQuzZ8/G9evXsWfPnswuGjp16oQNGzbAz88PzZs3h5OTEzQaDcqWLYtcuXJldvH+Fg0aNECePHmwcuXKzC4KUZaSkJAAd3d3jB8/HkOHDs3QvOYfs8KpU6eiUKFCSEhIwLFjx7B06VLs3r0bly9fhq2t7ccskoj+YUeOHMGMGTMQGBiIsWPHZnZxsiQzMzNMmTIF3bt3R3JyMhwdHfHTTz9ldrEQFhaGDRs2YPXq1ejcuXNmF+cfcf78eRw4cADHjx/P7KIQZTnW1tbw8/PDnDlzMGTIEGg0mnTP+1EtiqdOnULFihWV4V988QXmzJmDtWvXolOnThkrPRF9Ei1atEB0dDR+++23zC5KlvfgwQPcv38fnp6ecHZ2zuzioEyZMihbtizWrFmT2UX5xwwbNgzbt29HREREhk5yRJTizJkzqFixIg4ePIh69eqle76/pY+iboV37twBAERHR2PUqFEoU6YM7O3t4ejoiCZNmuDChQtG8yYkJOCrr76Ch4cHrK2t4ebmhrZt2+LPP/8EAKVSMPWpU6eOsqzDhw9Do9Fgw4YNGD9+PFxdXWFnZ4eWLVvi/v37Ruv+/fff0bhxYzg5OcHW1ha1a9c2eRKtU6eO6vq/+uoro2lXr14Nb29v2NjYIHv27PD19VVdf1rfTV9ycjLmzZuHUqVKwdraGi4uLujXr5/RAwju7u5o3ry50XoGDx5stEy1ss+cOdNomwLA27dvERAQgKJFi8LKygr58+fH6NGj09XXoU6dOkbL+/rrr6HVarF27dqP2h6zZs1CtWrVkCNHDtjY2MDb2xubN29WXf/q1atRuXJl2NraIlu2bKhVqxb27dtnMM3PP/+M2rVrw8HBAY6OjqhUqZJR2TZt2qT8pjlz5kTXrl3x8OFDg2l69OhhUOZs2bKhTp06OHr06Ae301+ZFwB++eUX1KxZE3Z2dnB2dkarVq1w7do1g2lOnDiB0qVLw9fXF9mzZ4eNjQ0qVaqE7du3K9O8evUKdnZ2GDZsmNE6Hjx4ADMzMwQGBipldnd3N5ou9b519+5dDBw4EMWLF4eNjQ1y5MiB9u3bIyIiwmA+3fF7+PBhZdipU6fQoEEDODg4wM7OTnWbhIaGQqPR4PTp08qwZ8+eqe7jzZs3Vy1zeuqCr776StkX8+XLh6pVq8Lc3Byurq5G5Vajm1/3cXBwQOXKlQ22P5ByzJQuXdrkcnTHSWhoKICUB5IuX76M/Pnzo1mzZnB0dDS5rQDg9u3baN++PbJnzw5bW1t89tlnRq2iGalLM3KMZ6
TOTW379u2oV6+eUX3g7u6e5jlC3/v37zFt2jQUKVIEVlZWyq04tbosPfXC312Hq7l48SJ69OiBwoULw9raGq6urujZsyeeP39uMJ1u/3r27JnB8NOnTxvsL+nZdqmnzeg5aN++ffDy8oK1tTVKliyJrVu3Gq07JiYGI0aMgLu7O6ysrJAvXz50795dKX9gYCBKlSoFW1tbZM+eHS1btsTZs2cNlqG//VMfRwkJCciWLRs0Gg1mzZpltJ1MfdS2k6lp9euw9J5TdceW2jnL3t4ePXr0UP7X1W1qnwcPHgBI//4BAN7e3siePTt27NhhNC4tH3XrOTVdUJcjRw4AKRXR9u3b0b59exQqVAiRkZFYvnw5ateujatXryJPnjwAUvr7NG/eHAcPHoSvry+GDRuGly9fYv/+/bh8+TKKFCmirKNTp05o2rSpwXrHjRunWp6vv/4aGo0GY8aMwdOnTzFv3jz4+Pjg/PnzsLGxAZByYm3SpAm8vb0REBAArVaLkJAQ1KtXD0ePHkXlypWNlpsvXz7lJPnq1SsMGDBAdd2TJk1Chw4d0Lt3b0RFRWHhwoWoVasWzp07p9r60LdvX9SsWRMAsHXrVmzbts1gfL9+/ZTW3KFDh+LOnTtYtGgRzp07h99++w0WFhaq2yEjYmJilO+mLzk5GS1btsSxY8fQt29feHp64tKlS5g7dy7++OMPo4PzQ0JCQjBx4kTMnj3b5C2yD22P+fPno2XLlujSpQsSExOxfv16tG/fHj/++COaNWumTDdlyhR89dVXqFatGqZOnQpLS0v8/vvv+OWXX9CwYUMAKQdiz549UapUKYwbNw7Ozs44d+4c9uzZo5RPt+0rVaqEwMBAREZGYv78+fjtt9+MftOcOXNi7ty5AFICq/nz56Np06a4f//+B1uePnbeAwcOoEmTJihcuDC++uorvHnzBgsXLkT16tVx9uxZJTB6/vw5VqxYAXt7ewwdOhS5cuXC6tWr0bZtW6xZswadOnWCvb092rRpgw0bNmDOnDkwMzNT1rNu3TqICLp06ZLm90jt1KlTOH78OHx9fZEvXz5ERERg6dKlqFOnDq5evWqyu8qtW7dQp04d2Nra4ssvv4StrS2+//57+Pj4YP/+/ahVq1aGymHKx9QFOrNnz0ZkZGSG1rdq1SoAKcHskiVL0L59e1y+fBnFixf/qPLrTggzZsyAq6srvvzyS1hbW6tuq8jISFSrVg2vX7/G0KFDkSNHDqxcuRItW7bE5s2b0aZNG4Nlp6cuTc3UMf5XtvPDhw9x7949VKhQQXW8l5cXvvjiC4NhYWFh2L9/v8Gw3r17Y+XKlfj888/xxRdf4Pfff0dgYCCuXbtmUM+kp17Q90/W4fv378ft27fh7+8PV1dXXLlyBStWrMCVK1dw4sSJv9y6qr/t7ty5g8mTJxtNk5Hy37x5Ex07dkT//v3h5+eHkJAQtG/fHnv27EGDBg0ApJw/a9asiWvXrqFnz56oUKECnj17hp07d+LBgwfImTMn9u7di2bNmqFo0aKIjIzEmjVrUL16dezZswe1a9c2KJ+1tTVCQkLQunVrZdjWrVuRkJBg8nsvXboU9vb2yv+mvrtOmzZt0LZtWwDA0aNHsWLFijS2qulz6sfQdffTlz17dgAZ3z8qVKiQ8btKkgEhISECQA4cOCBRUVFy//59Wb9+veTIkUNsbGzkwYMHIiKSkJAgSUlJBvPeuXNHrKysZOrUqcqw4OBgASBz5swxWldycrIyHwCZOXOm0TSlSpWS2rVrK/8fOnRIAEjevHklLi5OGb5x40YBIPPnz1eWXaxYMWnUqJGyHhGR169fS6FChaRBgwZG66pWrZqULl1a+T8qKkoASEBAgDIsIiJCzMzM5OuvvzaY99KlS2Jubm40/ObNmwJAVq5cqQwLCAgQ/Z/l6NGjAkDWrFljMO+ePXuMhhcsWFCaNWtmVPZBgwZJ6p86ddlHjx4tuXPnFm9vb4NtumrVKt
FqtXL06FGD+ZctWyYA5LfffjNan77atWsry/vpp5/E3NxcvvjiC9Vp07M9RFJ+J32JiYlSunRpqVevnsGytFqttGn
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Применение андерсэмплинга\n",
"rus = RandomUnderSampler(random_state=42)\n",
"X_resampled, y_resampled = rus.fit_resample(X, y)\n",
"\n",
"# Создание нового DataFrame для андерсэмплинга\n",
"train_resampled_under = pd.DataFrame(X_resampled, columns=X.columns)\n",
"train_resampled_under['Manufacturer'] = y_resampled\n",
"\n",
"# Визуализация распределения классов после андерсэмплинга\n",
"plot_class_distribution(train_resampled_under, 'Распределение классов в обучающей выборке (после андерсэмплинга)')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aimenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}