1278 lines
565 KiB
Plaintext
1278 lines
565 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from imblearn.over_sampling import RandomOverSampler\n",
|
|||
|
"from imblearn.under_sampling import RandomUnderSampler"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## **Diamonds Prices 2022**\n",
|
|||
|
"В данном наборе данных представлена цена на алмазы. \n",
|
|||
|
"Проблемная область: ценообразование бриллиантов\n",
|
|||
|
"Объект наблюдения: бриллиант\n",
|
|||
|
"Атрибуты: идентификатор, вес, качество огранки, цвет, чистота, общая глубина, ширина верхней грани, цена, длина, ширина, высота.\n",
|
|||
|
"Бизнес-цели:\n",
|
|||
|
"Повышение эффективности маркетинговых кампаний: Цель: Использование данных о бриллиантах для разработки целевых маркетинговых кампаний, направленных на конкретные сегменты рынка."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',\n",
|
|||
|
" 'price', 'x', 'y', 'z'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"..//static//csv//DiamondsPrices2022.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsLUlEQVR4nO3deZTXdd3//8fMIIs6DBeC4CQiuG8jKegxtUtLyzWzNDVKze1kmmtaHBWwy6wrvC6t3FvEvlZ2hVu573K8wD0SLddQKxBFZVUWmc/vD38zFyPbgONrGLjdzpkj8/m8l+f7M5/jmft83u/Pp6pSqVQCAADwMatu7wEAAIA1g/gAAACKEB8AAEAR4gMAAChCfAAAAEWIDwAAoAjxAQAAFCE+AACAIsQHAABQhPgAAACKEB8Aq5FXXnklVVVVGT16dHuPAgCLER8ASzB69OhUVVW1+Fp//fWz55575o477ig+z4MPPthilrXWWisDBw7MkUcemb///e9tso9x48Zl5MiRmT59eptsj+Tdd9/NyJEj8+CDD7b3KACrhE7tPQDAquz73/9+BgwYkEqlkqlTp2b06NHZb7/98qc//SkHHHBA8XlOOeWUDBkyJAsWLMhTTz2Vq6++OrfddlsmTpyY+vr6j7TtcePG5fzzz8/RRx+dHj16tM3Aa7h33303559/fpJkjz32aN9hAFYB4gNgGfbdd98MHjy4+ftjjz02ffr0ye9+97t2iY/dd989hxxySJLkG9/4RjbffPOccsopufbaazNs2LDi86xp3n///TQ2NqZz587tPQpAh+S0K4AV0KNHj3Tr1i2dOrX8282cOXNy5plnpl+/funSpUu22GKLXHTRRalUKkmS9957L1tuuWW23HLLvPfee83rvf3229lggw3yqU99KgsXLlzheT7zmc8kSSZNmrTM5e6///7svvvuWWedddKjR48cdNBB+dvf/tZ8/8iRI3PWWWclSQYMGNB8etcrr7yy3Bk+fHpa09eSTjU6+uijl7jsyJEjWyw3ZsyYDB48OLW1tS2Wu+iii5Y7z/Tp03P66adn4403TpcuXbLhhhvmyCOPzLRp05Ik8+fPz/Dhw7Pjjjumrq4u66yzTnbfffc88MADLbbTdP3MRRddlEsuuSSbbLJJunTpkr/+9a+t2sYrr7yS3r17J0nOP//8pR4rwJrEKx8AyzBjxoxMmzYtlUolb7zxRn72s59l9uzZ+drXvta8TKVSyRe+8IU88MADOfbYYzNo0KDcddddOeuss/Kvf/0rF198cbp165Zrr702u+66a84555z893//d5LkpJNOyowZMzJ69OjU1NSs8Hwvv/xykmS99dZb6jL33ntv9t133wwcODAjR47Me++9l5/97GfZdddd89RTT2XjjTfOl770pbzwwgv53e9+l4svvji9evVKkuZfnpdn7733zpFHHpkkefzxx/PTn/50qcv26tUrF198cfP3X//611vcP378+HzlK1/J9ttvnx/96Eepq6vLtGnTcvrppy93jtmzZ2f33XfP3/72txxzzDHZYYcdMm3atPzxj3/MP//5z/Tq1SszZ87ML37xixxxxBE5/vjjM2vWrPzyl7/M5z//+Tz22GMZNGhQi21ec801mTt3bk444YR06dIlPXv2bNU2evfunSuuuCInnnhiDj744HzpS19KkjQ0NLTqMQVYLVUAWMw111xTSbLYV5cuXSqjR49usezNN99cSVK54IILWtx+yCGHVKqqqiovvfRS823Dhg2rVFdXV8aOHVv5wx/+UElSueSSS5Y7zwMPPFBJUvnVr35VefPNNyuTJ0+u3HbbbZWNN964UlVVVXn88ccrlUqlMmnSpEqSyjXXXNO87qBBgyrrr79+5a233mq+7S9/+Uulurq6cuSRRzbfNmrUqEqSyqRJk1r9OM2fP7+SpHLyySc339Z0XA888MBiyw8dOrQyYMCAFrclqYwYMaL5+2
HDhlWSVKZMmdJ8W9NxjRo1apnzDB8+vJKkcuONNy52X2NjY6VSqVTef//9yrx581rc984771T69OlTOeaYYxbbZ/fu3StvvPFGi+Vbu40333xzseMDWJN55QNgGS677LJsvvnmSZKpU6fmuuuuy3HHHZfa2trmv2TffvvtqampySmnnNJi3TPPPDNjxozJHXfckZNPPjnJB6c33XrrrTnqqKMye/bs/Pu///ti6y3LMccc0+L73r1759prr21xXcqipkyZkgkTJuTss89Oz549m29vaGjI3nvvndtvv73V+16SuXPnJkm6du3aquXnz5+fLl26LHOZWbNmpbq6eqUuer/hhhuy/fbb5+CDD17svqqqqiRJTU1N86tMjY2NmT59ehobGzN48OA89dRTi6335S9/ebFXgFZ0GwB8wDUfAMuw0047Za+99spee+2VoUOH5rbbbsvWW2+dk08+OfPnz0+SvPrqq6mvr09tbW2Ldbfaaqvm+5t07tw5v/rVrzJp0qTMmjUr11xzTfMvxa0xfPjw3HPPPbn//vvz9NNPZ/LkyYudtrSopn1vscUWi9231VZbZdq0aZkzZ06r9/9hTddR1NXVtWr56dOnZ911113mMrvssksaGxtz6qmn5uWXX860adPyzjvvtGr7L7/8crbddtvlLnfttdemoaEhXbt2zXrrrZfevXvntttuy4wZMxZbdsCAAR95GwB8QHwArIDq6ursueeemTJlSl588cWV2sZdd92V5INXDVZ0G9ttt1322muv7Lnnntluu+0Wu/C9tKYL0jfeeONWLf/666+nb9++y1zm8MMPz5lnnpnRo0dn0003Te/evbPDDjt8xEn/z3XXXZejjz46m2yySX75y1/mzjvvzD333JPPfOYzaWxsXGz5bt26feRtAPABp10BrKD3338/yQcXNydJ//79c++992bWrFktXv147rnnmu9v8vTTT+f73/9+vvGNb2TChAk57rjjMnHixFa/crCimvb9/PPPL3bfc889l169emWdddZJkhV6BabJE088kSRLPe1rUQsWLMhLL72UffbZZ5nLVVdX56KLLsrEiRMzadKkXH755Zk6dWqLi/yXZpNNNskzzzyzzGXGjBmTgQMH5sYbb2xxzCNGjFju9ld0GyvzmAKszrzyAbACFixYkLvvvjudO3duPq1qv/32y8KFC3PppZe2WPbiiy9OVVVV9t133+Z1jz766NTX1+cnP/lJRo8enalTp7bqXZxW1gYbbJBBgwbl2muvbfHJ5c8880zuvvvu7Lfffs23NUXIinzC+ZgxY7LFFltkyy23XO6yt9xyS957773mtwdelp/97Ge5//7785vf/CZ77bVXdt1111bN8+Uvfzl/+ctfctNNNy12X+X/f9vjpms1mr5PkkcffTTjx49v1T5WZBtrr712khV7TAFWZ175AFiGO+64o/kVjDfeeCO//e1v8+KLL+Z73/teunfvniQ58MADs+eee+acc87JK6+8ku233z533313brnllpx22mnZZJNNkiQXXHBBJkyYkPvuuy+1tbVpaGjI8OHDc+655+aQQw5pEQJtadSoUdl3332zyy675Nhjj21+q926uroWnzmx4447JknOOeecHH744VlrrbVy4IEHNkfJov7+97/nxz/+cR577LF86UtfynXXXdd83+OPP54kueeee7LRRhulb9++GTFiRC6//PJ86lOfyuc+97llzvvss8/m7LPPzsiRIzNkyJAVOtazzjorY8aMyaGHHppjjjkmO+64Y95+++388Y9/zJVXXpntt98+BxxwQG688cYcfPDB2X///TNp0qRceeWV2XrrrZtfzVqe1m6jW7du2XrrrfP73/8+m2++eXr27Jltt922VdelAKyW2vndtgBWSUt6q92uXbtWBg0aVLniiiua37a1yaxZsyqnn356pb6+vrLWWmtVNttss8qoUaOal3vyyScrnTp1qnz7299usd77779fGT
JkSKW+vr7yzjvvLHWeprfa/cMf/rDMuZf0VruVSqVy7733VnbddddKt27dKt27d68ceOCBlb/+9a+Lrf8f//EflU9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['carat'])\n",
|
|||
|
"plt.title('Box Plot для carat')\n",
|
|||
|
"plt.xlabel('carat')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA0EAAAIjCAYAAADFthA8AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACr0klEQVR4nOzdeXwU9f0/8Nfs5tgkJJuLsAlCEg6FGAFBEOSwIrQc4lWrolCvoiJYr/araBEoVaS2VX/VokVBC6K29UAEY7mUw2CUcMWgQkwAIQGSwCbkTnZ+f8RZ95jdndmdvbKv5+Ph4yGb2ZnPzs7uft7z/nzeH0EURRFEREREREQRQhfsBhAREREREQUSgyAiIiIiIoooDIKIiIiIiCiiMAgiIiIiIqKIwiCIiIiIiIgiCoMgIiIiIiKKKAyCiIiIiIgoojAIIiIiIiKiiMIgiIiIiIiIIgqDICIicqmiogKCIOD1118PdlPsFBQUYMiQITAYDBAEAWfPnvXbsW6//Xbk5OT4bf9ERBR4DIKIKCIdOHAAN9xwA7Kzs2EwGNCzZ09MnDgRf//73/12zDVr1uD55593evzEiRNYuHAh9u7d67djO/r0008hCIL1v+joaPTp0we//vWv8f3332tyjM8//xwLFy7UPECpqanBjTfeiLi4OLz00ktYtWoVEhISZLd9/fXX7V6nwWDA+eefj7lz5+LkyZOatisc7d27FzNmzECvXr0QGxuL1NRUTJgwAStXrkRHR0dQ2vT000/jgw8+CMqxiShyRAW7AUREgfb555/jiiuuQO/evTFr1iyYTCYcO3YMu3btwgsvvID777/fL8dds2YNSkpK8OCDD9o9fuLECSxatAg5OTkYMmSIX47tym9/+1sMHz4cbW1tKC4uxj//+U+sX78eBw4cQFZWlk/7/vzzz7Fo0SLcfvvtSE5O1qbBAL788kvU19dj8eLFmDBhgqLn/PGPf0Rubi6am5uxY8cOLFu2DBs2bEBJSQni4+PdPnf58uWwWCxaND2kvPrqq7j33nvRo0cPzJw5E/3790d9fT02b96Mu+66C5WVlXj88ccD3q6nn34aN9xwA6699tqAH5uIIgeDICKKOE899RSMRiO+/PJLp875qVOngtMoP2hoaHCZIZGMHTsWN9xwAwDgjjvuwPnnn4/f/va3eOONNzBv3rxANFM16T1SE1hNnjwZl1xyCQDgN7/5DdLS0vC3v/0Na9euxfTp02WfI52/6Ohon9scanbt2oV7770Xo0aNwoYNG5CYmGj924MPPoivvvoKJSUlPh/HYrGgtbUVBoPB530REWmJw+GIKOKUlZXhwgsvlO1EZ2RkOD22evVqjBgxAvHx8UhJScG4cePwv//9z/r3tWvXYurUqcjKykJsbCz69u2LxYsX2w0n+tnPfob169fjyJEj1qFZOTk5+PTTTzF8+HAAnUGI9DfbOThffPEFJk2aBKPRiPj4eFx++eXYuXOnXRsXLlwIQRBQWlqKW265BSkpKRgzZozqczN+/HgAQHl5udvttmzZgrFjxyIhIQHJycm45pprcPDgQbv2/P73vwcA5ObmWl9XRUWF2/3+5z//wbBhwxAXF4f09HTMmDEDx48ft/79Zz/7GW677TYAwPDhwyEIAm6//XafX+ftt9+Obt26oaysDFOmTEFiYiJuvfVW698c5wRZLBa88MILuOiii2AwGNC9e3dMmjQJX331ld12q1evtr6e1NRU3HzzzTh27Jjbtv33v/+FIAj47LPPnP72yiuvQBAEa4BSVVWFO+64A+eddx5iY2ORmZmJa665xuN5XrRoEQRBwJtvvmkXAEkuueQSu/P6l7/8BZdddhnS0tIQFxeHYcOG4b///a/T8wRBwNy5c/Hmm2/iwgsvRGxsLAoKChTvQxAENDQ04I033rBeM968v0REnjATREQRJzs7G4WFhSgpKUF+fr7bbRctWoSFCxfisssuwx//+E
fExMTgiy++wJYtW/Dzn/8cQOe8k27duuHhhx9Gt27dsGXLFjz55JOoq6vDs88+CwB44oknYDab8cMPP+C5554DAHTr1g0DBw7EH//4Rzz55JO4++67MXbsWADAZZddBqAz2Jg8eTKGDRuGBQsWQKfTYeXKlRg/fjy2b9+OESNG2LX3V7/6Ffr374+nn34aoiiqPjdlZWUAgLS0NJfbbNq0CZMnT0afPn2wcOFCNDU14e9//ztGjx6N4uJi5OTk4Prrr8d3332Ht956C8899xzS09MBAN27d3e539dffx133HEHhg8fjiVLluDkyZN44YUXsHPnTuzZswfJycl44okncMEFF+Cf//yndYhb3759NXmd7e3t+MUvfoExY8bgL3/5i9thcnfddRdef/11TJ48Gb/5zW/Q3t6O7du3Y9euXdaM01NPPYX58+fjxhtvxG9+8xucPn0af//73zFu3Djr65EzdepUdOvWDf/+979x+eWX2/3tnXfewYUXXmi9bn/5y1/i66+/xv3334+cnBycOnUKGzduxNGjR10Wc2hsbMTmzZsxbtw49O7dW9H5euGFF3D11Vfj1ltvRWtrK95++2386le/wkcffYSpU6fabbtlyxb8+9//xty5c5Genm5th5J9rFq1Cr/5zW8wYsQI3H333QDg1ftLROSRSEQUYf73v/+Jer1e1Ov14qhRo8T/+7//Ez/55BOxtbXVbrtDhw6JOp1OvO6668SOjg67v1ksFuv/NzY2Oh3jnnvuEePj48Xm5mbrY1OnThWzs7Odtv3yyy9FAOLKlSudjtG/f3/xF7/4hdPxcnNzxYkTJ1ofW7BggQhAnD59uqJzsHXrVhGAuGLFCvH06dPiiRMnxPXr14s5OTmiIAjil19+KYqiKJaXlzu1bciQIWJGRoZYU1NjfWzfvn2iTqcTf/3rX1sfe/bZZ0UAYnl5ucf2tLa2ihkZGWJ+fr7Y1NRkffyjjz4SAYhPPvmk9bGVK1eKAKxtdEfadtOmTeLp06fFY8eOiW+//baYlpYmxsXFiT/88IMoiqJ42223iQDExx57zGkft912m937tmXLFhGA+Nvf/tZpW+l9qqioEPV6vfjUU0/Z/f3AgQNiVFSU0+OOpk+fLmZkZIjt7e3WxyorK0WdTif+8Y9/FEVRFM+cOSMCEJ999lmP58HWvn37RADiAw88oPg5jtd4a2urmJ+fL44fP97ucQCiTqcTv/76a6/3kZCQIN52222K20ZE5A0OhyOiiDNx4kQUFhbi6quvxr59+/DnP/8Zv/jFL9CzZ098+OGH1u0++OADWCwWPPnkk9Dp7L8uBUGw/n9cXJz1/+vr61FdXY2xY8eisbER33zzjdft3Lt3Lw4dOoRbbrkFNTU1qK6uRnV1NRoaGnDllVdi27ZtThP27733XlXHuPPOO9G9e3dkZWVh6tSp1qFIUjbDUWVlJfbu3Yvbb78dqamp1scHDRqEiRMnYsOGDepfKICvvvoKp06dwn333Wc3f2Tq1KkYMGAA1q9f79V+JRMmTED37t3Rq1cv3HzzzejWrRvef/999OzZ02672bNne9zXu+++C0EQsGDBAqe/SdfFe++9B4vFghtvvNH6vlVXV8NkMqF///7YunWr22PcdNNNOHXqFD799FPrY//9739hsVhw0003Aei87mJiYvDpp5/izJkzHtstqaurAwDZYXCu2F7jZ86cgdlsxtixY1FcXOy07eWXX468vDyf9kFE5G8cDkdEEWn48OF477330Nrain379uH999/Hc889hxtuuAF79+5FXl4eysrKoNPpZDt0tr7++mv84Q9/wJYtW6wdTInZbPa6jYcOHQIA6xwYOWazGSkpKdZ/5+bmqjrGk08+ibFjx0Kv1yM9PR0DBw5EVJTrn4YjR44AAC644AKnvw0cOBCffPKJooIMavY7YMAA7NixQ9X+HL300ks4//zzERUVhR49euCCCy5wCmyjoqJw3nnnedxXWV
kZsrKy7IJAR4cOHYIoiujfv7/s3z0VW5DmgL3zzju48sorAXQOhRsyZAjOP/98AEBsbCyWLl2KRx55BD169MDIkSN
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Визуализация данных после обработки\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"plt.scatter(df[\"price\"], df[\"carat\"])\n",
|
|||
|
"plt.xlabel(\"price\")\n",
|
|||
|
"plt.ylabel(\"carat\")\n",
|
|||
|
"plt.title(\"Scatter Plot of Price vs Carat\")\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Удаление строк с пустыми значениями"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.dropna()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 32365\n",
|
|||
|
"Размер контрольной выборки: 10789\n",
|
|||
|
"Размер тестовой выборки: 10789\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка сбалансированности выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение price в обучающей выборке:\n",
|
|||
|
"price\n",
|
|||
|
"789 80\n",
|
|||
|
"605 79\n",
|
|||
|
"544 72\n",
|
|||
|
"552 72\n",
|
|||
|
"828 71\n",
|
|||
|
" ..\n",
|
|||
|
"9942 1\n",
|
|||
|
"7787 1\n",
|
|||
|
"18663 1\n",
|
|||
|
"7979 1\n",
|
|||
|
"8164 1\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке:\n",
|
|||
|
"price\n",
|
|||
|
"625 34\n",
|
|||
|
"828 31\n",
|
|||
|
"605 30\n",
|
|||
|
"789 26\n",
|
|||
|
"544 26\n",
|
|||
|
" ..\n",
|
|||
|
"4188 1\n",
|
|||
|
"7541 1\n",
|
|||
|
"3498 1\n",
|
|||
|
"3314 1\n",
|
|||
|
"12196 1\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке:\n",
|
|||
|
"price\n",
|
|||
|
"802 33\n",
|
|||
|
"844 29\n",
|
|||
|
"776 29\n",
|
|||
|
"675 26\n",
|
|||
|
"645 25\n",
|
|||
|
" ..\n",
|
|||
|
"1567 1\n",
|
|||
|
"5529 1\n",
|
|||
|
"2031 1\n",
|
|||
|
"417 1\n",
|
|||
|
"5431 1\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['price'].value_counts()\n",
|
|||
|
" print(f\"Распределение price в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним овер- и андер- слемпинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение price в обучающей выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"5076 80\n",
|
|||
|
"1789 80\n",
|
|||
|
"3931 80\n",
|
|||
|
"1263 80\n",
|
|||
|
"2026 80\n",
|
|||
|
" ..\n",
|
|||
|
"3678 80\n",
|
|||
|
"4592 80\n",
|
|||
|
"516 80\n",
|
|||
|
"7152 80\n",
|
|||
|
"2353 80\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"966 34\n",
|
|||
|
"13638 34\n",
|
|||
|
"3669 34\n",
|
|||
|
"1052 34\n",
|
|||
|
"2818 34\n",
|
|||
|
" ..\n",
|
|||
|
"4032 34\n",
|
|||
|
"544 34\n",
|
|||
|
"3362 34\n",
|
|||
|
"6559 34\n",
|
|||
|
"792 34\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после oversampling:\n",
|
|||
|
"price\n",
|
|||
|
"3742 33\n",
|
|||
|
"559 33\n",
|
|||
|
"8403 33\n",
|
|||
|
"1238 33\n",
|
|||
|
"1243 33\n",
|
|||
|
" ..\n",
|
|||
|
"1149 33\n",
|
|||
|
"2401 33\n",
|
|||
|
"958 33\n",
|
|||
|
"702 33\n",
|
|||
|
"14618 33\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop(\"price\", axis=1)\n",
|
|||
|
" y = df[\"price\"]\n",
|
|||
|
"\n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y) # type: ignore\n",
|
|||
|
"\n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение price в обучающей выборке после undersampling:\n",
|
|||
|
"price\n",
|
|||
|
"18823 1\n",
|
|||
|
"326 1\n",
|
|||
|
"327 1\n",
|
|||
|
"336 1\n",
|
|||
|
"337 1\n",
|
|||
|
" ..\n",
|
|||
|
"353 1\n",
|
|||
|
"351 1\n",
|
|||
|
"348 1\n",
|
|||
|
"345 1\n",
|
|||
|
"344 1\n",
|
|||
|
"Name: count, Length: 9496, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в контрольной выборке после undersampling:\n",
|
|||
|
"price\n",
|
|||
|
"18804 1\n",
|
|||
|
"334 1\n",
|
|||
|
"345 1\n",
|
|||
|
"351 1\n",
|
|||
|
"352 1\n",
|
|||
|
" ..\n",
|
|||
|
"373 1\n",
|
|||
|
"371 1\n",
|
|||
|
"369 1\n",
|
|||
|
"367 1\n",
|
|||
|
"366 1\n",
|
|||
|
"Name: count, Length: 5383, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение price в тестовой выборке после undersampling:\n",
|
|||
|
"price\n",
|
|||
|
"18803 1\n",
|
|||
|
"335 1\n",
|
|||
|
"336 1\n",
|
|||
|
"337 1\n",
|
|||
|
"358 1\n",
|
|||
|
" ..\n",
|
|||
|
"386 1\n",
|
|||
|
"384 1\n",
|
|||
|
"380 1\n",
|
|||
|
"378 1\n",
|
|||
|
"375 1\n",
|
|||
|
"Name: count, Length: 5338, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def undersample(df):\n",
|
|||
|
" X = df.drop('price', axis=1)\n",
|
|||
|
" y = df['price']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## **14,400 Classic Rock Tracks (with Spotify Data)**\n",
|
|||
|
"\n",
|
|||
|
" Этот набор данных, содержащий 1200 уникальных альбомов и 14 400 треков. Каждый трек каталогизирован с 18 столбцами данных, такие как название трека, исполнитель, альбом и год выпуска. \n",
|
|||
|
" Бизнес-цель: улучшение стратегии маркетинга и продвижения музыкальных треков, создание алгоритмов, которые будут рекомендовать пользователям музыку на основе их предпочтений. \n",
|
|||
|
" Цель технического проекта: Разработать и внедрить систему рекомендаций, которая будет предсказывать и рекомендовать пользователям музыкальные треки на основе их предпочтений и поведения. \n",
|
|||
|
" Входные данные: \n",
|
|||
|
" Данные о пользователях: Идентификатор пользователя, история прослушиваний, оценки треков, время прослушивания, частота прослушивания.\n",
|
|||
|
" Данные о треках: Атрибуты треков (название, исполнитель, альбом, год, длительность, танцевальность, энергичность, акустичность и т.д.).\n",
|
|||
|
" Данные о взаимодействии: Время и частота взаимодействия пользователя с определенными треками. \n",
|
|||
|
" Целевой признак: Рекомендации: Булева переменная, указывающая, должен ли конкретный трек быть рекомендован пользователю (1 - рекомендуется, 0 - не рекомендуется)."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выгрузка данных из csv файла \"Данные о клиентах\" в датафрейм"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',\n",
|
|||
|
" 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',\n",
|
|||
|
" 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',\n",
|
|||
|
" 'Popularity'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"..//static//csv//UltimateClassicRock.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Анализируем датафрейм при помощи \"ящика с усами\". Есть смещение в сторону меньших значений, это можно исправить при помощи oversampling и undersampling."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAx8AAAIjCAYAAABia6bHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAzVUlEQVR4nO3dd5RV5dnw4XsGptBRUYoCgg3ELqhAFFQs2DXqp6KIvUAUjTEaC9iisb72khcBA/ZEUINRFEskRikLSxTUCKJB5FXpAgPM/v5wzQnDDFV8BsbrWosVdjvnmT07zvmxy+RlWZYFAADATyy/qgcAAAD8PIgPAAAgCfEBAAAkIT4AAIAkxAcAAJCE+AAAAJIQHwAAQBLiAwAASEJ8AAAASYgPgCo0ZcqUyMvLi0GDBlX1UFiBXr16xZZbbrnOXzcvLy/69++/zl8XYH0mPoBqYdCgQZGXl1fuz2abbRb77rtvvPDCC8nH89prr5UbS0FBQbRu3Tp69uwZn3322Tp5j3/84x/Rv3//mDVr1jp5vaqQYj9tKKrD9xNgVWpW9QAA1qVrr702WrVqFVmWxddffx2DBg2KQw45JJ577rk47LDDko/nggsuiA4dOsTixYtj/Pjx8dBDD8Vf//rXeP/996NZs2Y/6rX/8Y9/xDXXXBO9evWKhg0brpsBV5Gfcj+trxYsWBA1a/73x3B1+n4CrIj4AKqV7t27R/v27XPTZ5xxRjRu3Dgee+yxKomPvffeO4499tiIiDjttNNi2223jQsuuCAGDx4cl19+efLxrK9+LvuptLQ0SkpKori4OIqLi6t6OADJuewKqNYaNmwYtWrVKvcvzBER8+fPj1//+tfRvHnzKCoqiu222y5uvfXWyLIsIn74V+k2bdpEmzZtYsGCBbntvvvuu2jatGl06tQpli5dusbj2W+//SIiYvLkyStdb9SoUbH33ntHnTp1omHDhnHkkUfGRx99lFvev3//+M1vfhMREa1atcpdtjRlypRVjmH5y9PK/rz22msV1u3Vq1el6y5/r8LTTz8d7du3j3r16pVb79Zbb13leCpT2X667777ol27dlFUVBTNmjWL3r17V7hEqWvXrrHDDjvEuHHjolOnTlGrVq1o1apVPPDAA+XWK7tMb/n9VXYZWGX7Ylm33nprdOrUKTbZZJOoVatW7L777vH0009XWC8vLy/69OkTQ4cOzY39b3/7W25Z2X5c2fezS5cusfPOO1c6ju222y4OOuiglY4VYH3izAdQrcyePTu++eabyLIsZsyYEXfffXfMmzcvTj755Nw6WZbFEUccEa+++mqcccYZscsuu8SLL74Yv/nNb+I///lP3HHHHVGrVq0YPHhwdO7cOa644oq4/fbbIyKid+/eMXv27Bg0aFDUqFFjjcf373//OyIiNtlkkxWu8/LLL0f37t2jdevW0b9//1iwYEHcfffd0blz5xg/fnxsueWWccwxx8THH38cjz32WNxxxx3RqFGjiIjYdNNNV2scBxxwQPTs2TMiIsaMGRN33XXXCtdt1KhR3HHHHbnpU045pdzyt956K44//vjYeeed46abbooGDRrEN998ExdddNFqjaUyy++n/v37xzXXXBPdunWL8847LyZNmhT3339/jBkzJkaPHh0FBQW5bWfOnBmHHHJIHH/88XHiiSfGk08+Geedd14UFhbG6aefvtZjWtadd94ZRxxxRPTo0SNKSkri8ccfj+OOOy6ef/75OPTQQ8utO2rUqHjyySejT58+0ahRo0pvXl/Z9/OUU06Js846Kz744IPYYYcdctuMGTMmPv7447jyyivXydcEkEQGUA0MHDgwi4gKf4qKirJBgwaVW3fYsGFZRGTXX399ufnHHntslpeXl3366ae5eZdffnmWn5+fvfHGG9lTTz2VRUT2P//zP6scz6uvvppFRPbwww9n//d//5dNmzYt++tf/5ptueWWWV5eXjZmzJgsy7
Js8uTJWURkAwcOzG27yy67ZJtttln27bff5ua9++67WX5+ftazZ8/cvFtuuSWLiGzy5MmrvZ9KSkqyiMj69OmTm1f2db366qsV1u/Ro0fWqlWrcvMiIuvXr19u+vLLL88iIvvqq69y88q+rltuuWWl41md/TRjxoyssLAwO/DAA7OlS5fmtr3nnnty25bp0qVLFhHZbbfdlpu3aNGi3D4tKSnJsuy/x8vy+65sPMvui1NPPTVr2bJlufW+//77ctMlJSXZDjvskO23334V9lV+fn72r3/9q8LXvvx+XNH3c9asWVlxcXH229/+ttz8Cy64IKtTp042b968Cq8NsL5y2RVQrdx7770xcuTIGDlyZAwZMiT23XffOPPMM+Mvf/lLbp0RI0ZEjRo14oILLii37a9//evIsqzc07H69+8f7dq1i1NPPTXOP//86NKlS4XtVub000+PTTfdNJo1axaHHnpozJ8/PwYPHlzuvpRlffXVVzFhwoTo1atXbLzxxrn5O+20UxxwwAExYsSI1X7vyixcuDAiYrXvNygpKYmioqKVrjN37tzIz8//UTdJr2w/vfzyy1FSUhJ9+/aN/Pz//tg666yzon79+vHXv/613GvVrFkzzjnnnNx0YWFhnHPOOTFjxowYN27cWo9xWbVq1cr9febMmTF79uzYe++9Y/z48RXW7dKlS2y//fZr/V4NGjSII488Mh577LHcZYFLly6NJ554Io466qioU6fOWr82QGouuwKqlT322KPcB/sTTzwxdt111+jTp08cdthhUVhYGJ9//nk0a9Ys6tWrV27btm3bRkTE559/nptXWFgYDz/8cHTo0CGKi4tj4MCBkZeXt9rjufrqq2PvvfeOGjVqRKNGjaJt27YV7j9ZVtl7b7fddhWWtW3bNl588cWYP3/+Wn/g/OabbyLihw+0q2PWrFlRt27dla7TsWPHuOeee+LCCy+MSy+9NBo0aBAzZ85co3GtbD+taJ8UFhZG69aty32/IiKaNWtWYf9su+22EfHD71XZa6+91mhslXn++efj+uuvjwkTJsSiRYty8ys7Nlq1avWj369nz57xxBNPxN///vfYZ5994uWXX46vv/66wiVwAOs78QFUa/n5+bHvvvvGnXfeGZ988km0a9dujV/jxRdfjIgfzhp88skna/Rhcscdd4xu3bqt8Xv+VMpusF7dX5o3ffr0aNmy5UrXOeGEE2L8+PFx9913x0MPPbRW40q9n1YUkKvzEIG///3vccQRR8Q+++wT9913XzRt2jQKCgpi4MCB8eijj1ZYf9mzJGvroIMOisaNG8eQIUNin332iSFDhkSTJk3Wq2MLYHW47Aqo9pYsWRIREfPmzYuIiJYtW8a0adNi7ty55dabOHFibnmZ9957L6699to47bTTYtddd40zzzwzZs+e/ZONtey9J02aVGHZxIkTo1GjRrl/1V+TMzBlxo4dGxGxwsu+lrV48eL49NNPc2eEViQ/Pz9uvfXW6Nq1a2yzzTa5S97WlRXtk5KSkpg8eXKFOJo2bVrMnz+/3LyPP/44Iv4bXRtttFFERIWnZS1/FqUyf/7zn6O4uDhefPHFOP3006N79+7rJAJW9v2sUaNGnHTSSfH000/HzJkzY9iwYXHiiSeu1UMPAKqS+ACqtcWLF8dLL70UhYWFuQ/RhxxySCxdujTuueeecuvecccdkZeXF927d89t26tXr2jWrFnceeedMWjQoPj6669/1FOcVqVp06axyy67xODBg8t9MP7ggw/ipZdeikMOOSQ3ryxC1uQ3Yj/99NOx3XbbRZs2bVa57vDhw2PBggW5x96uzN133x2jRo2KoUOHRrdu3aJz586rPaZV6datWxQWFsZdd92Vu+chImLAgAExe/bsCk+XWrJkSTz44IO56ZKSknjwwQdj0003jd133z0iIrbaaquIiHjjjTdy6y1dunS1ztzUqFEj8vLyyp0lmTJlSgwbNmytvr4yq/p+nn
LKKTFz5sw455xzKjzBDWBD4bIroFp54YUXcmcwZsyYEY8++mh88skncdlll0X9+vUjIuLwww+PfffdN6644oqYMmV
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Box plot для столбца 'Popularity'\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"sns.boxplot(x=df['Popularity'])\n",
|
|||
|
"plt.title('Box Plot для Popularity')\n",
|
|||
|
"plt.xlabel('Popularity')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Решим проблему пустых значений при помощи удаления таких строк."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df_cleaned = df.dropna()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 8650\n",
|
|||
|
"Размер контрольной выборки: 2884\n",
|
|||
|
"Размер тестовой выборки: 2884\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Разделение на обучающую и тестовую выборки\n",
|
|||
|
"train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разделение обучающей выборки на обучающую и контрольную\n",
|
|||
|
"train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_df))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_df))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_df))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Оценка сбалансированности выборок"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"23 258\n",
|
|||
|
"15 250\n",
|
|||
|
"26 246\n",
|
|||
|
"21 245\n",
|
|||
|
"14 245\n",
|
|||
|
" ... \n",
|
|||
|
"84 1\n",
|
|||
|
"87 1\n",
|
|||
|
"91 1\n",
|
|||
|
"79 1\n",
|
|||
|
"86 1\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"17 90\n",
|
|||
|
"26 86\n",
|
|||
|
"21 83\n",
|
|||
|
"24 83\n",
|
|||
|
"28 80\n",
|
|||
|
" ..\n",
|
|||
|
"85 1\n",
|
|||
|
"83 1\n",
|
|||
|
"84 1\n",
|
|||
|
"80 1\n",
|
|||
|
"77 1\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке:\n",
|
|||
|
"Popularity\n",
|
|||
|
"22 86\n",
|
|||
|
"21 85\n",
|
|||
|
"12 84\n",
|
|||
|
"20 82\n",
|
|||
|
"26 81\n",
|
|||
|
" ..\n",
|
|||
|
"76 2\n",
|
|||
|
"71 2\n",
|
|||
|
"79 1\n",
|
|||
|
"82 1\n",
|
|||
|
"80 1\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def check_balance(df, name):\n",
|
|||
|
" counts = df['Popularity'].value_counts()\n",
|
|||
|
" print(f\"Распределение Popularity в {name}:\")\n",
|
|||
|
" print(counts)\n",
|
|||
|
" print()\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df, \"обучающей выборке\")\n",
|
|||
|
"check_balance(val_df, \"контрольной выборке\")\n",
|
|||
|
"check_balance(test_df, \"тестовой выборке\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним овер- и андер- слемпинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"44 258\n",
|
|||
|
"20 258\n",
|
|||
|
"30 258\n",
|
|||
|
"27 258\n",
|
|||
|
"8 258\n",
|
|||
|
" ... \n",
|
|||
|
"78 258\n",
|
|||
|
"79 258\n",
|
|||
|
"74 258\n",
|
|||
|
"81 258\n",
|
|||
|
"86 258\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"21 90\n",
|
|||
|
"11 90\n",
|
|||
|
"28 90\n",
|
|||
|
"23 90\n",
|
|||
|
"37 90\n",
|
|||
|
" ..\n",
|
|||
|
"61 90\n",
|
|||
|
"84 90\n",
|
|||
|
"80 90\n",
|
|||
|
"77 90\n",
|
|||
|
"0 90\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке после oversampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"14 86\n",
|
|||
|
"47 86\n",
|
|||
|
"27 86\n",
|
|||
|
"13 86\n",
|
|||
|
"66 86\n",
|
|||
|
" ..\n",
|
|||
|
"63 86\n",
|
|||
|
"79 86\n",
|
|||
|
"71 86\n",
|
|||
|
"82 86\n",
|
|||
|
"80 86\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def oversample(df):\n",
|
|||
|
" X = df.drop('Popularity', axis=1)\n",
|
|||
|
" y = df['Popularity']\n",
|
|||
|
" \n",
|
|||
|
" oversampler = RandomOverSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = oversampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_oversampled = oversample(train_df)\n",
|
|||
|
"val_df_oversampled = oversample(val_df)\n",
|
|||
|
"test_df_oversampled = oversample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_oversampled, \"обучающей выборке после oversampling\")\n",
|
|||
|
"check_balance(val_df_oversampled, \"контрольной выборке после oversampling\")\n",
|
|||
|
"check_balance(test_df_oversampled, \"тестовой выборке после oversampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Распределение Popularity в обучающей выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"84 1\n",
|
|||
|
"85 1\n",
|
|||
|
"86 1\n",
|
|||
|
"87 1\n",
|
|||
|
"91 1\n",
|
|||
|
"Name: count, Length: 88, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в контрольной выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"82 1\n",
|
|||
|
"83 1\n",
|
|||
|
"84 1\n",
|
|||
|
"85 1\n",
|
|||
|
"87 1\n",
|
|||
|
"Name: count, Length: 85, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение Popularity в тестовой выборке после undersampling:\n",
|
|||
|
"Popularity\n",
|
|||
|
"0 1\n",
|
|||
|
"1 1\n",
|
|||
|
"2 1\n",
|
|||
|
"3 1\n",
|
|||
|
"4 1\n",
|
|||
|
" ..\n",
|
|||
|
"76 1\n",
|
|||
|
"77 1\n",
|
|||
|
"79 1\n",
|
|||
|
"80 1\n",
|
|||
|
"82 1\n",
|
|||
|
"Name: count, Length: 80, dtype: int64\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"def undersample(df):\n",
|
|||
|
" X = df.drop('Popularity', axis=1)\n",
|
|||
|
" y = df['Popularity']\n",
|
|||
|
" \n",
|
|||
|
" undersampler = RandomUnderSampler(random_state=42)\n",
|
|||
|
" X_resampled, y_resampled = undersampler.fit_resample(X, y)\n",
|
|||
|
" \n",
|
|||
|
" resampled_df = pd.concat([X_resampled, y_resampled], axis=1)\n",
|
|||
|
" return resampled_df\n",
|
|||
|
"\n",
|
|||
|
"train_df_undersampled = undersample(train_df)\n",
|
|||
|
"val_df_undersampled = undersample(val_df)\n",
|
|||
|
"test_df_undersampled = undersample(test_df)\n",
|
|||
|
"\n",
|
|||
|
"check_balance(train_df_undersampled, \"обучающей выборке после undersampling\")\n",
|
|||
|
"check_balance(val_df_undersampled, \"контрольной выборке после undersampling\")\n",
|
|||
|
"check_balance(test_df_undersampled, \"тестовой выборке после undersampling\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## **Car Price Prediction Challenge**\n",
|
|||
|
"В данном наборе данных представлена цена на автомобили\n",
|
|||
|
"Проблемная область: ценообразование автомобилей\n",
|
|||
|
"Объект наблюдения: автомобили\n",
|
|||
|
"Атрибуты: идентификатор, цена, сбор, производитель, модель, год выпуска, категория, наличие кожаного салона, тип топлива, объём двигателя, пробег, цилиндры, тип коробки передач, привод (ведущие колёса), двери, руль, цвет и подушки безопасности.\n",
|
|||
|
"Бизнес-цель: Построение моделей для прогнозирования цен на автомобили на основе их характеристик.\n",
|
|||
|
"Эффект для бизнеса: Оптимизация стратегии продаж и увеличение прибыли.\n",
|
|||
|
"Цель: Разработка модели для прогнозирования цен.\n",
|
|||
|
"Входные данные: Данные о характеристиках автомобилей (год выпуска, тип топлива, пробег и др.).\n",
|
|||
|
"Целевой признак: Цена автомобиля."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 73,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
|
|||
|
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
|
|||
|
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
|
|||
|
" 'Airbags'],\n",
|
|||
|
" dtype='object')\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
|
|||
|
"print(df.columns)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 61,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAABAcAAAIlCAYAAACpcJFFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAADtPElEQVR4nOzdeXhTdfYG8DdJs7SlCy20TQVLBWQrgoIiigqCFFAEFxRERWVEURQGBxEH2cRBUBQQBHEBVPi5i+JSqcimVPYdQYGyaJsWKN1pkib390dyb5smaZM0bbb38zydoTffJCcxaXtPzvccmSAIAoiIiIiIiIgoZMl9HQARERERERER+RaTA0REREREREQhjskBIiIiIiIiohDH5AARERERERFRiGNygIiIiIiIiCjEMTlAREREREREFOKYHCAiIiIiIiIKcUwOEBEREREREYU4JgeIiIiIiIiIQlyYrwMgIiIi4NSpU/j999+Rn5+PCxcuIC8vD/Pnz0dkZKSvQyMiIqIQIBMEQfB1EERERKFqx44dGD9+PH7//Xeb42FhYVi/fj369Onjo8iIiIgolHBbARFJdu/ejdGjR6Nt27aIjIxEeHg4WrdujYceegiZmZm+Do8o6Pz666+45ZZbcPDgQbz22ms4c+YMBEGAIAgwGo1MDBAREVGjYeUAEcFsNuM///kP3nzzTYSFheHWW29FWloalEolTp48iZ9//hkXL17ErFmz8NJLL/k6XKKgYDAY0KFDB1y4cAEbN27E1Vdf7euQiIiIKISx5wARYerUqXjzzTfRtWtXfPHFF2jdurXN5ZcuXcLixYtx4cIFH0VIFHx++uknnDx5Em+99RYTA0RERORz3FZAFOKOHz+OefPmIT4+HhkZGXaJAQAIDw/HpEmTMHPmTOnYI488AplMhpMnT2LevHlo27YtNBoNUlNTMWvWLBiNRof3t2XLFgwePBjNmjWDWq1G27ZtMXXqVJSXlztcf+rUKchkMqdfNbVq1QqtWrVyeFsrV66ETCbDypUrbY6Xl5fjxRdfxJVXXgm1Wm13H6dOnbJZf+DAAdx1111ISEiAQqGwWdu7d2+H912T+PzVvO3qj/mRRx5xeN1vvvkGffv2RdOmTaHRaJCWlobXX38dJpPJpccrcvRc/fnnn3j++edxzTXXID4+HhqNBldeeSVeeOEFlJaWuvTYACAnJwfTp0/H9ddfj4SEBKjVarRq1QpPPfUU8vPz7daLz4f4FRYWhhYtWuC+++7D4cOHpXWbNm2q9fXg7LVx/vx5TJgwAampqVCr1UhISMB9992HQ4cOOY1FLpcjOzvb7vKtW7dK9zFjxgy7y3/77TfcfvvtiIuLg0ajQfv27TF9+nS71/i2bdsAAElJSbjzzjsRHx8vvScmT56MoqIiu9t29hpbtGiR09crAMyYMcPp8+To9t577z1cd911iIyMtFvv7PXkKNbqX5GRkejQoQNmzJiBiooKl25j3759GDx4MK644gpERkYiNjYW11xzDRYsWGD3M2bjxo147LHH0K5dOzRp0gRNmjRB9+7dsXz58nrHJz5/mzZtsjleXl6Oli1bOn2/5ufn47nnnkO7du0QHh6OuLg49OjRA6+//rq0prb3+8SJEx2+nqv/96wZEwCcPn1a+tnk6HYPHTqE++67T3pvpqamYsKECU4TwHU9Dk/el85+Bubk5CAqKsrp+8uZ06dPY/To0bjsssugUqnQokULjB49GmfOnLFZ16pVK5fidOV1Xtv1p06d6lF81dX8uVj9q+ZzU1RUhLlz5+KWW25BcnIyVCoVkpOT8fDDD+PEiRMuP4+O/luK/70//PBDu/W9e/d2+LvY2WOp/t9b/B1V21f131EPPvggZDIZduzY4fA+pk2bBplMhv/7v/+zOb5//36MHDkSLVq0gFqthlarxYABA7Bu3Tq7WGr+d9++fTuio6ORmpqKs2fPSse99buSyN+wcoAoxK
1cuRImkwlPPPEEEhMTa12rVqvtjk2YMAG//fYb7rvvPjRp0gTr1q3D9OnTceDAAXzxxRc2a5cuXYqnn34asbGxGDx4MBISErBr1y688sor2LhxIzZu3AiVSuXwvrt06YKhQ4faxH369Gn3H7ADo0aNwhdffIE2bdpg7NixiI2NBQCsXbsW+/fvt1mbk5ODm266CSUlJRgwYAC6du0qxVw9edJQpkyZgldffRWXXXYZ7r77bsTExGDr1q2YNGkStm/fjs8//7xet//VV1/h/fffR58+fdC7d2+YzWb8/vvvmDt3LjZv3owtW7ZAqVTWeTtbtmzB/Pnz0bdvX/To0QNKpRJ79+7F0qVL8dNPP2HPnj2IiYmxu9748eMRGxsLo9GIgwcP4osvvsCmTZtw5MgRNGvWDK1atcL06dNtrjNz5kykpKQ4TaacO3cOPXv2xIkTJ9C7d28MHz4c2dnZ+OKLL/D999/jp59+Qq9eveyuJ5fL8c477+DVV1+1Ob506VIoFAq7ZAwAfP755xgxYgTUajXuv/9+JCQkYP369Zg1axZ++uknbNq0CRqNRooLAIYPHw6VSoX77rsPWq0Wmzdvxrx58/Ddd99h27ZtDp+n6s6fP+/ySdSoUaNs/th29JpdsmQJxo0bh9jYWDzwwANITk6GTCbDvn378M0337h0P6Lq/11KSkrw/fffY+bMmTh9+jRWrFhR5/X//vtvnD9/Hn379kXz5s1RVlaGn376Cf/+979x6NAhvPfee9LauXPn4vjx47j++utx1113obCwEBkZGXjiiSdw7NgxzJ8/3+vxzZkzB3///bfDy44dO4Y+ffogNzcXvXr1wtChQ1FWVobDhw/jf//7H/7zn//UettHjx7F4sWLa12jUCiwbNkyuwTPO++84/SE7ddff0V6ejoMBgPuvfdetGrVCllZWVi4cCG+++47/P7772jWrJlbj8OT96UzkydPdvvk6s8//0SvXr1w7tw5DB48GJ06dcKhQ4fwwQcfYN26dfj1119x5ZVXArD8ziosLJSuK/6cF3/2iLp27erSfTt7jDfffLNH8TlSPbZTp05h1apVdmv++OMPTJs2DX369MFdd92FyMhIHD16FGvWrMH333+PPXv2ICUlxaXHBAC33HKL9LrS6XT44osvMGrUKAiCgFGjRrl8O64YMmSIw+d7wYIFNt8/8cQTWL16tZS8rM5kMmHFihWIj4/H3XffLR3/8ssv8cADD0AQBAwePBjt2rVDfn4+tm/fjvfffx+DBw92Gtf+/fsxcOBAREdHY8OGDWjZsqV0mbd+VxL5HYGIQlrv3r0FAMLPP//s1vVGjRolABCaN28unD17Vjqu1+uFm2++WQAgfPHFF9Lxw4cPC2FhYUKXLl2E8+fP29zWnDlzBADC66+/bnc/x48fFwAIjzzyiM3xW265RXD0IywlJUVISUlxGPOKFSsEAMKKFSukY8XFxYJcLheSk5OF0tJSh48xOztbOrZkyRIBgPDss8/a3T4A4ZZbbnF43zU5um1Rdna2AEAYNWqUzfH169cLAIT09HSbWM1ms/Dkk0/aPeeOHm91jp6rv//+W9Dr9XZrZ86cKQAQPv74Y5ceX15enlBSUmJ3fNWqVQIAYfbs2TbHnT0fkyZNEgAIa9eudXpfdT3vjz76qABAmDJlis3x77//XgAgtGnTRjCZTHaxDBkyRGjevLnN85Gfny+oVCph6NChAgBh+vTp0mVFRUVCTEyMoFarhf3790vHTSaTcP/99wsAhFmzZtndj0qlEvbs2WMT2zPPPCMAEMaNG1fnY33yyScFuVwudO3a1elr6r///a8AQNi0aVOdt3fttdcKAOxiquv1VJOj27506ZLQsmVLITY21qXbcMRgMAitW7cWIiMjbY6fPHnSbq3RaBRuu+02QaFQCKdPn/Y4vunTpwsAhI0bN0rHsrOzBY1GI3Tr1s3h+7V79+4CAGH58uV2cVX/mens/T5gwAAhMj
JSaNeund3POjGeIUOGCEqlUtDpdNJler1eSEhIkF6j1W/XZDIJrVu3FgAIGRkZNrcpvtcee+wxjx5HTXW9Lx2957O
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1200x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df['Price'] = pd.to_numeric(df['Price'], errors='coerce')\n",
|
|||
|
"df['Prod. year'] = pd.to_numeric(df['Prod. year'], errors='coerce')\n",
|
|||
|
"\n",
|
|||
|
"df.dropna(subset=['Price', 'Prod. year'], inplace=True)\n",
|
|||
|
"\n",
|
|||
|
"avg_price_per_year = df.groupby('Prod. year')['Price'].mean().reset_index()\n",
|
|||
|
"\n",
|
|||
|
"plt.figure(figsize=(12, 6))\n",
|
|||
|
"plt.plot(avg_price_per_year['Prod. year'], avg_price_per_year['Price'], marker='o')\n",
|
|||
|
"plt.title('Средняя цена автомобиля в зависимости от года выпуска', fontsize=14)\n",
|
|||
|
"plt.xlabel('Год выпуска')\n",
|
|||
|
"plt.ylabel('Средняя цена ($)')\n",
|
|||
|
"plt.grid(True)\n",
|
|||
|
"\n",
|
|||
|
"plt.xticks(avg_price_per_year['Prod. year'][::4])\n",
|
|||
|
"\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 64,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Пропущенные значения в данных:\n",
|
|||
|
" ID 0\n",
|
|||
|
"Price 0\n",
|
|||
|
"Levy 5819\n",
|
|||
|
"Manufacturer 0\n",
|
|||
|
"Model 0\n",
|
|||
|
"Prod. year 0\n",
|
|||
|
"Category 0\n",
|
|||
|
"Leather interior 0\n",
|
|||
|
"Fuel type 0\n",
|
|||
|
"Engine volume 0\n",
|
|||
|
"Mileage 0\n",
|
|||
|
"Cylinders 0\n",
|
|||
|
"Gear box type 0\n",
|
|||
|
"Drive wheels 0\n",
|
|||
|
"Doors 0\n",
|
|||
|
"Wheel 0\n",
|
|||
|
"Color 0\n",
|
|||
|
"Airbags 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Выбросы в числовых столбцах:\n",
|
|||
|
" ID 2531\n",
|
|||
|
"Price 1073\n",
|
|||
|
"Prod. year 982\n",
|
|||
|
"Cylinders 4870\n",
|
|||
|
"Airbags 0\n",
|
|||
|
"dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Распределение автомобилей по производителям:\n",
|
|||
|
" Manufacturer\n",
|
|||
|
"HYUNDAI 3769\n",
|
|||
|
"TOYOTA 3662\n",
|
|||
|
"MERCEDES-BENZ 2076\n",
|
|||
|
"FORD 1111\n",
|
|||
|
"CHEVROLET 1069\n",
|
|||
|
" ... \n",
|
|||
|
"LAMBORGHINI 1\n",
|
|||
|
"PONTIAC 1\n",
|
|||
|
"SATURN 1\n",
|
|||
|
"ASTON MARTIN 1\n",
|
|||
|
"GREATWALL 1\n",
|
|||
|
"Name: count, Length: 65, dtype: int64\n",
|
|||
|
"\n",
|
|||
|
"Последний год производства в данных: 2020\n",
|
|||
|
"\n",
|
|||
|
"начальный год: 1939\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import numpy as np\n",
|
|||
|
"\n",
|
|||
|
"# Заменяем \"-\" на NaN только в числовых столбцах\n",
|
|||
|
"numeric_columns = ['Price', 'Levy', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']\n",
|
|||
|
"df[numeric_columns] = df[numeric_columns].replace(\"-\", np.nan)\n",
|
|||
|
"\n",
|
|||
|
"# Проверяем пропущенные значения снова\n",
|
|||
|
"missing_values = df.isnull().sum()\n",
|
|||
|
"print(\"Пропущенные значения в данных:\\n\", missing_values)\n",
|
|||
|
"\n",
|
|||
|
"# Фильтрация только числовых столбцов для анализа выбросов\n",
|
|||
|
"numerical_data = df.select_dtypes(include=[np.number])\n",
|
|||
|
"\n",
|
|||
|
"# 2. Анализ выбросов\n",
|
|||
|
"Q1 = numerical_data.quantile(0.25)\n",
|
|||
|
"Q3 = numerical_data.quantile(0.75)\n",
|
|||
|
"IQR = Q3 - Q1\n",
|
|||
|
"outliers = ((numerical_data < (Q1 - 1.5 * IQR)) | (numerical_data > (Q3 + 1.5 * IQR))).sum()\n",
|
|||
|
"print(\"\\nВыбросы в числовых столбцах:\\n\", outliers)\n",
|
|||
|
"\n",
|
|||
|
"# 3. Анализ смещения данных\n",
|
|||
|
"category_counts = df['Manufacturer'].value_counts()\n",
|
|||
|
"print(\"\\nРаспределение автомобилей по производителям:\\n\", category_counts)\n",
|
|||
|
"\n",
|
|||
|
"# 4. Анализ актуальности данных\n",
|
|||
|
"max_production_year = df['Prod. year'].max()\n",
|
|||
|
"min_year_from_car = df['Prod. year'].min()\n",
|
|||
|
"\n",
|
|||
|
"print(\"\\nПоследний год производства в данных:\", max_production_year)\n",
|
|||
|
"print(\"\\nначальный год:\", min_year_from_car)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В наборе данных есть пропущенные значения в столбце Levy. Выбросы встречаются в столбцах Price, Prod. year, Cylinders. Набор данных сильно смещён в сторону нескольких производителей, таких как Hyundai, Toyota и Mercedes-Benz, которые составляют большую часть данных.\n",
|
|||
|
"В то же время такие бренды, как Lamborghini, Pontiac, Saturn и Aston Martin, представлены всего одной записью каждый. Последний год выпуска в наборе данных — 2020, то есть данные устарели на несколько лет. Это может означать, что новые модели автомобилей, выпущенные после 2020 года, не учтены, что снижает актуальность данных для анализа современного рынка автомобилей."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### **Примеры решения обнаруженных проблем**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"В столбце Levy имеется 5819 пропущенных значений. Заполним пропуски, подставив 0."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 65,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['Levy'] = df['Levy'].fillna(0) "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Заменим выбросы на медиану"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 66,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"for col in ['Price', 'Prod. year', 'Cylinders']:\n",
|
|||
|
" Q1 = df[col].quantile(0.25) # 1-й квартиль\n",
|
|||
|
" Q3 = df[col].quantile(0.75) # 3-й квартиль\n",
|
|||
|
" IQR = Q3 - Q1 # Интерквартильный размах\n",
|
|||
|
" median = df[col].median() # Медиана\n",
|
|||
|
"\n",
|
|||
|
" # Определение выбросов\n",
|
|||
|
" condition = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))\n",
|
|||
|
"\n",
|
|||
|
" # Замена выбросов на медиану\n",
|
|||
|
" df[col] = np.where(condition, median, df[col])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Если нужно использовать для анализа машины определенных лет выпуска, можем отфильтровать данные следующим образом:"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 67,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df = df[df['Prod. year'] >= 2015]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"#### **Оценка качества данных**"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Для каждой категориальной переменной, такой как Manufacturer, Model, Category, можно подсчитать количество уникальных значений. Это даст представление о разнообразии и информативности данных.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"ID 18924\n",
|
|||
|
"Price 2315\n",
|
|||
|
"Levy 559\n",
|
|||
|
"Manufacturer 65\n",
|
|||
|
"Model 1590\n",
|
|||
|
"Prod. year 54\n",
|
|||
|
"Category 11\n",
|
|||
|
"Leather interior 2\n",
|
|||
|
"Fuel type 7\n",
|
|||
|
"Engine volume 107\n",
|
|||
|
"Mileage 7687\n",
|
|||
|
"Cylinders 13\n",
|
|||
|
"Gear box type 4\n",
|
|||
|
"Drive wheels 3\n",
|
|||
|
"Doors 3\n",
|
|||
|
"Wheel 2\n",
|
|||
|
"Color 16\n",
|
|||
|
"Airbags 17\n",
|
|||
|
"dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"unique_counts = df.nunique()\n",
|
|||
|
"print(unique_counts)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Мы видим здесь 65 брендов машин и 1590 различных моделей, что говорит о разнообразии данных."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Проверим на соответствие реальным данным"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 62,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA04AAAIVCAYAAADmnq8BAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACNm0lEQVR4nOzdd1gUV9sG8GcpYkFQVMCCiA1FBVFs2LCBii3WqLF3TexdrFFJokaNJfZeYu+9Y0uMsXeNBRsoqIBUYe/vD76dlxVwxSAL7P27Lq6Emdn12WGn3HPOnFEBgBAREREREVGyjPRdABERERERUXrH4ERERERERKQDgxMREREREZEODE5EREREREQ6MDgRERERERHpwOBERERERESkA4MTERERERGRDgxOREREREREOpjouwAiIiKipMTExMibN29ErVZLgQIF9F0OERk4tjgRERFRunHx4kXp0KGD5M2bV8zMzCR//vzSqlUrfZeVyPTp00WtVouIiFqtFl9fXz1XRERfG4MTEWUIq1atEpVKJRcvXkxyvoeHh5QtWzaNqyKi1LRr1y6pUaOG3Lp1S6ZNmyZHjhyRI0eOyOLFi/VdWiKrV6+WmTNnyrNnz2TWrFmyevVqfZdERF8Zu+oRERGR3r1580Z69uwpXl5esmXLFsmSJYu+S/qkKVOmSOfOnWXUqFFiZmYm69at03dJRPSVMTgRERGR3q1cuVKioqJk1apV6T40iYi0a9dO6tSpIw8ePJASJUpIvnz59F0SEX1l7KpHRJnanTt3pHXr1mJlZSVZs2YVNzc32b17t9YyyXUDDAoKEpVKJZMmTVKmTZo0SVQqlQQFBSX7bxYpUkS6du36yboeP34sKpUq2R8PDw+t5V+9eiU9evQQGxsbyZo1q7i4uHx216Ck6undu7dkzZpVTp48qUzbtWuXeHt7S4ECBcTMzEyKFSsmP/74o8TFxSnLeHh4fLJulUql9e+sW7dOKlasKNmyZRMrKyv59ttv5enTp1rLJPee9evXV5aJjY2VH3/8UYoVKyZmZmZSpEgRGTt2rERHRyf6rE2aNJHDhw9L+fLlJWvWrOLk5CTbt2/XWk7zN3/8+LEyTa1Wi7Ozs6hUKlm1apXW8lu3bhU3NzfJmTOnVo0zZ85UlunatauoVCopX758or+Br6+vqFQqMTc3TzTvY6dPn5Y2bdpI4cKFxczMTOzs7GTIkCESGRmptZzm39P85M6dWzw8POT06dNa6+NTf6siRYooy4aHh8uwYcPEzs5OzMzMxNHRUWbOnCkAtP5dzWvnzJmTqPZSpUqJSqWS77//Xmv6w4cPpU2bNmJlZSXZs2eXqlWryr59+7SW+fPPP6V8+fIyffp0pYYSJUrITz/9pNxLlLCGhNuliMiMGTOS3HaSWlean4/fIyX7i8ePH4u1tbW4u7tLnjx5kv3uEFHmwRYnIsq0bt68KdWrV5eCBQvK6NGjJUeOHLJ582Zp0aKFbNu2Tb755ht9lyjt27eXxo0ba00bM2aM1u+RkZHi4eEhDx48kO+//14cHBxky5Yt0rVrV3n37p0MGjQoRf/mxIkTZfny5bJp0yatk8xVq1aJubm5DB06VMzNzeX48eMyYcIECQ0NlRkzZoiIyLhx46Rnz54iEh8shwwZIr1795aaNWsm+nemTZsm48ePl7Zt20rPnj3l9evXMm/ePKlVq5ZcvnxZcuXKpSxbqFChRDfX58+fX/n/nj17yurVq6V169YybNgw+euvv8TX11du374tO3bs0Hrd/fv3pV27dtK3b1/p0qWLrFy5Utq0aSMHDx6UBg0aJLte1q5dK9evX080/fz589K2bVtxcXGRn376SSwtLZXP/jETExO5efOmXL58WVxdXbXWbdasWZP9txPasmWLRERESL9+/SRPnjxy4cIFmTdvnjx79ky2bNmitWzevHll9uzZIiLy7NkzmTt3rjRu3FiePn0quXLlkjlz5s
j79+9FROT27dsyffp0GTt2rJQuXVpERAlyAKRZs2Zy4sQJ6dGjh5QvX14OHTokI0aMkOfPnyv/hkbWrFll5cqVMnjwYGXauXPn5MmTJ4k+T2BgoLi7u0tERIQMHDhQ8uTJI6tXr5ZmzZrJ1q1ble0wODhYzpw5I2fOnJHu3btLxYoV5dixYzJmzBh5/PixLFq0KNl19u7dO52DMyRcVyIinTp10pr/X/YXyX13iCiTARFRBrBy5UqICP7+++8k59euXRtlypTRmlavXj2UK1cOUVFRyjS1Wg13d3eUKFFC53u/fv0aIoKJEycq0yZOnAgRwevXr5Ot1d7eHl26dPnk53n06BFEBDNmzEg0r0yZMqhdu7by+5w5cyAiWLdunTItJiYG1apVg7m5OUJDQz/5byWsZ/HixRARzJs3L9FyERERiab16dMH2bNn11qHH3+GlStXJpr3+PFjGBsbY9q0aVrTr1+/DhMTE63pSf3tErpy5QpEBD179tSaPnz4cIgIjh8/rvVZRQTbtm1TpoWEhCB//vxwdXVVpmn+5o8ePQIAREVFoXDhwmjUqFGizzRmzBiICF6+fJnosyf8+3Xp0gU5cuRA06ZN8f333yvTT58+jWzZsqFFixbIkSNHsp9TI6m/g6+vL1QqFZ48eaL179nb22stt2TJEogILly4kOg9Tpw4ARHBiRMnEs3buXMnRARTp07Vmt66dWuoVCo8ePBAmSYiaN26NUxMTHDx4kVleo8ePdChQweICAYMGKBMHzx4MEQEp0+fVqaFhYXBwcEBRYoUQVxcHID474GIYNKkSVo1dO3aFSKC69eva9WQcLscOXIkrK2tUbFiRa1tR6Njx45wcHDQmvbxe6R0f/E53x0iylzYVY+IMqU3b97I8ePHpW3bthIWFiZBQUESFBQkwcHB4uXlJffv35fnz59rvSYkJERZLigoSN68efPJ9w8KCpLw8PCv/VFk//79YmtrK+3bt1emmZqaysCBA+X9+/dy6tSpz3qfXbt2Sf/+/WXEiBGJulKJiGTLlk35f806q1mzpkRERMidO3dSVPP27dtFrVZL27Zttdapra2tlChRQk6cOPHZ77V//34RERk6dKjW9GHDhomIJOryVaBAAa3WAQsLC+ncubNcvnxZAgICkvw3FixYIMHBwTJx4sRE88LCwsTIyEirhexTunfvLhs2bFC6Ea5cuVJatmwplpaWn/X6hH+H8PBwCQoKEnd3dwEgly9f1lpWrVYr6/bKlSuyZs0ayZ8/v9Ki9Ln2798vxsbGMnDgQK3pw4YNEwBy4MABrek2Njbi7e0tK1euFBGRiIgI2bx5s3Tr1i3J965cubLUqFFDmWZubi69e/eWx48fy61bt5TpxsbGiVrykvs7azx//lzmzZsn48ePT7YrZExMjJiZmSX38b9of6Hxqe8OEWUuBh2c/Pz8pGnTplKgQAFRqVSyc+fOFL1ec6/Dxz85cuT4OgUT0Wd78OCBAJDx48dLvnz5tH40JzivXr3Sek39+vW1lnN0dEz2/R0dHSVfvnxibm4uNjY24uPjo3UvUGp68uSJlChRQoyMtHfZmpPjpLpHfezKlSvSvn17iYuLSzYQ3rx5U7755huxtLQUCwsLyZcvn3z33XciEh8qU+L+/fsCQLlpPuHP7du3E637T3ny5IkYGRlJ8eLFtabb2tpKrly5En3+4sWLJ7rXqmTJkiIiWvc0aYSEhMj06dNl6NChYmNjk2h+tWrVRK1Wy6BBg+Tff/+VoKAgefv2bbL1ent7i4mJiezatUvCw8OTDRTJ8ff3l65du4qVlZWYm5tLvnz5pHbt2kqtCT19+lRZr66urvLvv//Ktm3bPuteqoSePHkiBQoUkJw5c2pN/9R3rFu3bkpA3LJli+TOnVvq1q2b5HsntS19/N4qlUoKFCggFhYWWss5OjqKkZFRkn87kfiupwUKFJA+ffok+/nevXv3yXXyJfsLEd3fHSLKXAz6Hq
fw8HBxcXGR7t27S8uWLVP8+uHDh0vfvn21ptWrV08qVaqUWiUS0RfS3Ew+fPhw8fLySnKZj0/EFyxYoJxgi4iEhoY
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x500 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 5))\n",
|
|||
|
"sns.boxplot(x='Category', y='Price', data=df)\n",
|
|||
|
"plt.title('Цены по категориям автомобилей')\n",
|
|||
|
"plt.xlabel('Категория')\n",
|
|||
|
"plt.ylabel('Цена')\n",
|
|||
|
"plt.xticks(rotation=45)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Данная диаграмма была получена до устранения выбросов."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 68,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIVCAYAAAC+4TwYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACUDUlEQVR4nOzdd1gUV9sG8HtBmiIoFhAVxC4oNixgjaLYosYag723xIhdUaMRNYlGjYKKPZbEGrvYSzQYa+w1ItjAgoBShX2+P/x2XlbQKCJDuX/XxaXMnN15dpjdnXvOzBmNiAiIiIiIiIgowxmoXQAREREREVFOxUBGRERERESkEgYyIiIiIiIilTCQERERERERqYSBjIiIiIiISCUMZERERERERCphICMiIiIiIlIJAxkREREREZFKcqldABEREVFGSkhIQHh4OLRaLWxtbdUuh4hyOPaQERERUbZ35swZfPXVVyhYsCBMTExQpEgRtG/fXu2yUpg+fTq0Wi0AQKvVYsaMGSpXRESfGgMZEeVoK1euhEajwZkzZ1Kd37BhQ1SsWDGDqyKi9LRt2zbUrVsXV69ehY+PD/bv34/9+/dj8eLFapeWwqpVqzBr1izcv38fs2fPxqpVq9QuiYg+MZ6ySERERNlWeHg4+vbtCw8PD2zcuBHGxsZql/ROU6dORffu3TFmzBiYmJhgzZo1apdERJ8YAxkRERFlWytWrEBcXBxWrlyZ6cMYAHTu3BmfffYZbt++jTJlyqBQoUJql0REnxhPWSQiSoPr16+jQ4cOsLKygqmpKVxcXLB9+3a9Nm87HfLp06fQaDT47rvvlGnfffcdNBoNnj59+tZllihRAj179nxnXXfv3oVGo3nrT8OGDfXaP378GH369IG1tTVMTU1RuXLl9z5FKrV6+vfvD1NTUxw5ckSZtm3bNrRs2RK2trYwMTFBqVKl8P333yMpKUlp07Bhw3fWrdFo9JazZs0aVK9eHWZmZrCyssKXX36Je/fu6bV523O6u7srbRITE/H999+jVKlSMDExQYkSJTB+/HjEx8eneK2tWrXCvn37UKVKFZiamsLR0RFbtmzRa6f7m9+9e1eZptVq4ezsDI1Gg5UrV+q137RpE1xcXJA3b169GmfNmqW06dmzJzQaDapUqZLibzBjxgxoNBqYm5unmPemP//8Ex07doSdnR1MTExQvHhxDB8+HLGxsXrtdMvT/eTPnx8NGzbEn3/+qbc+3vW3KlGihNI2OjoaI0aMQPHixWFiYoJy5cph1qxZEBG95eoeO3fu3BS1ly9fHhqNBkOHDtWbfufOHXTs2BFWVlbInTs3ateujV27dum1OXnyJKpUqYLp06crNZQpUwYzZ85UrtVKXkPy9yUA/PTTT6m+d1JbV7qfN5/jQz4v7t69i8KFC8PNzQ0FChR467ZDRNkHe8iIiD7QlStXUKdOHRQtWhRjx45Fnjx5sGHDBrRt2xabN2/GF198oXaJ6NKlC1q0aKE3bdy4cXq/x8bGomHDhrh9+zaGDh0KBwcHbNy4ET179kRERASGDRv2QcucPHkyli1bhvXr1+vtvK5cuRLm5ubw8vKCubk5Dh06hEmTJiEqKgo//fQTAGDChAno27cvgNeBdfjw4ejfvz/q1auXYjk+Pj6YOHEiOnXqhL59++LJkyeYP38+6tevj/PnzyNfvnxK22LFiqUYFKFIkSLK//v27YtVq1ahQ4cOGDFiBP7++2/MmDED165dwx9//KH3uFu3bqFz584YOHAgevTogRUrVqBjx44ICAhAkyZN3rpeVq9ejUuXLqWYHhgYiE6dOqFy5cqYOXMmLC0tldf+ply5cuHKlSs4f/48qlatqrduTU1N37rs5DZu3IiYmBgMGjQIBQoUwKlTpzB//nzcv38fGzdu1GtbsGBBzJkzBwBw//59zJs3Dy1atMC9e/eQL18+zJ07Fy9fvgQAXLt2DdOnT8f48eNRoUIFAF
ACooigdevWOHz4MPr06YMqVapg7969GDVqFB48eKAsQ8fU1BQrVqzAt99+q0z766+/EBwcnOL1hIWFwc3NDTExMfjmm29QoEABrFq1Cq1bt8amTZuU9+GzZ89w/PhxHD9+HL1790b16tVx8OBBjBs3Dnfv3sWiRYveus4iIiL+c1CN5OsKALp166Y3/2M+L9627RBRNiNERDnYihUrBICcPn061fkNGjQQJycnvWmNGzeWSpUqSVxcnDJNq9WKm5ublClT5j+f+8mTJwJAJk+erEybPHmyAJAnT568tVZ7e3vp0aPHO19PUFCQAJCffvopxTwnJydp0KCB8vvcuXMFgKxZs0aZlpCQIK6urmJubi5RUVHvXFbyehYvXiwAZP78+SnaxcTEpJg2YMAAyZ07t946fPM1rFixIsW8u3fviqGhofj4+OhNv3TpkuTKlUtvemp/u+T++ecfASB9+/bVmz5y5EgBIIcOHdJ7rQBk8+bNyrTIyEgpUqSIVK1aVZmm+5sHBQWJiEhcXJzY2dlJ8+bNU7ymcePGCQB59OhRitee/O/Xo0cPyZMnj3z++ecydOhQZfqff/4pZmZm0rZtW8mTJ89bX6dOan+HGTNmiEajkeDgYL3l2dvb67Xz9/cXAHLq1KkUz3H48GEBIIcPH04xb+vWrQJApk2bpje9Q4cOotFo5Pbt28o0ANKhQwfJlSuXnDlzRpnep08f+eqrrwSADBkyRJn+7bffCgD5888/lWkvXrwQBwcHKVGihCQlJYnI6+0AgHz33Xd6NfTs2VMAyKVLl/RqSP6+HD16tBQuXFiqV6+u997R8fT0FAcHB71pbz7Hh35evM+2Q0TZC09ZJCL6AOHh4Th06BA6deqEFy9e4OnTp3j69CmePXsGDw8P3Lp1Cw8ePNB7TGRkpNLu6dOnCA8Pf+fzP336FNHR0Z/6pWD37t2wsbFBly5dlGlGRkb45ptv8PLlSxw9evS9nmfbtm0YPHgwRo0aleKUMgAwMzNT/q9bZ/Xq1UNMTAyuX7/+QTVv2bIFWq0WnTp10lunNjY2KFOmDA4fPvzez7V7924AgJeXl970ESNGAECKU99sbW31ejMsLCzQvXt3nD9/HqGhoakuw9fXF8+ePcPkyZNTzHvx4gUMDAz0evTepXfv3li3bp1yOuWKFSvQrl07WFpavtfjk/8doqOj8fTpU7i5uUFEcP78eb22Wq1WWbf//PMPfv31VxQpUkTpAXtfu3fvhqGhIb755hu96SNGjICIYM+ePXrTra2t0bJlS6xYsQIAEBMTgw0bNqBXr16pPnfNmjVRt25dZZq5uTn69++Pu3fv4urVq8p0Q0PDFD2Pb/s76zx48ADz58/HxIkT33pKaEJCAkxMTN728tP0eaHzrm2HiLIXBjIiog9w+/ZtiAgmTpyIQoUK6f3odpweP36s9xh3d3e9duXKlXvr85crVw6FChWCubk5rK2t4e3trXetVXoKDg5GmTJlYGCg/1Wg2+lO7TSxN/3zzz/o0qULkpKS3ho0r1y5gi+++AKWlpawsLBAoUKF0LVrVwCvw+qHuHXrFkREGewg+c+1a9dSrPt3CQ4OhoGBAUqXLq033cbGBvny5Uvx+kuXLp3iWrayZcsCgN41YzqRkZGYPn06vLy8YG1tnWK+q6srtFothg0bhn///RdPnz7F8+fP31pvy5YtkStXLmzbtg3R0dFvDSpvExISgp49e8LKygrm5uYoVKgQGjRooNSa3L1795T1WrVqVfz777/YvHnze12rllxwcDBsbW2RN29evenv2sZ69eqlBM+NGzcif/78aNSoUarPndp76c3n1mg0sLW1hYWFhV67cuXKwcDAINW/HfD6FFxbW1sMGDDgra8vIiLineskLZ8XwH9vO0SUvfAaMiKiD6AbBGDkyJHw8PBItc2bO/i+vr7KjjsAREVFvfWGtJs3b4aFhQViYmLwxx9/wMfHBxYWFhg9enQ6vYL0deHCBT
Rv3hyNGzfGqFGj0LVrV73rxyIiItCgQQNYWFhg6tSpKFWqFExNTXHu3DmMGTMmxaAK/0Wr1UKj0WDPnj0wNDRMMf9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x500 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 5))\n",
|
|||
|
"sns.boxplot(x='Category', y='Price', data=df)\n",
|
|||
|
"plt.title('Цены по категориям автомобилей')\n",
|
|||
|
"plt.xlabel('Категория')\n",
|
|||
|
"plt.ylabel('Цена')\n",
|
|||
|
"plt.xticks(rotation=45)\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"После устранения выбросов диаграмма выглядит лучше, хотя здесь все равно присутствуют очень маленькие значения, что не соответствует реальным ценам на машины. Из этого можем сделать вывод, что цены в наборе данных не особо соответствуют реальности. "
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Разбиение набора данных на обучающую, контрольную и тестовую выборки"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 85,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Размер обучающей выборки: 13465\n",
|
|||
|
"Размер контрольной выборки: 2886\n",
|
|||
|
"Размер тестовой выборки: 2886\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Загрузка данных\n",
|
|||
|
"new1 = pd.read_csv(\"..//static//csv//car_price_prediction.csv\")\n",
|
|||
|
"\n",
|
|||
|
"# Разбиение на обучающую и временную выборки\n",
|
|||
|
"train_data, temp_data = train_test_split(new1, test_size=0.3, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"# Разбиение временной выборки на контрольную и тестовую\n",
|
|||
|
"val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Размер обучающей выборки:\", len(train_data))\n",
|
|||
|
"print(\"Размер контрольной выборки:\", len(val_data))\n",
|
|||
|
"print(\"Размер тестовой выборки:\", len(test_data))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 86,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAnIAAAIwCAYAAAACvobyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HuXjnQVAUXsXaPBhr2CiL13RY0llthijTUaYm+xxESKsfceFbtR7GLXqMEuWAFFEYHn+8Nv591ldwGNRje5f9e1F+zMmZkzs1OeOXPOGZWICIiIiIjI5Kg/dQaIiIiI6P0wkCMiIiIyUQzkiIiIiEwUAzkiIiIiE8VAjoiIiMhEMZAjIiIiMlEM5IiIiIhMFAM5IiIiIhPFQI6I/raEhATcuXMHz549+9RZoQ/s+fPnuHnzJhISEj51VojIAAZyRPRe1qxZg9q1a8Pe3h52dnbInTs3pkyZ8qmzZRJevHiBWbNmKd9jY2Mxb968T5chLSKCRYsWoWLFirC1tYWDgwPy5s2LpUuXfuqsfTDHjx+HpaUlbt269amzQgQAWLhwIXLnzo3Xr1+/87SqT/GKrtDQUAQGBirfrayskDt3bvj6+mL06NHIkSPHP50lInoHw4cPx+TJk9G4cWO0adMG2bJlg0qlQqFCheDp6fmps/fZS0lJgaOjI37++WdUq1YN06dPx5UrV7Bjx45PnTW0bdsWq1atQufOndGgQQM4OjpCpVKhVKlSyJ49+6fO3gdRt25deHh4ICws7FNnhQgAkJiYiDx58mDkyJHo37//O01r/pHylCkTJkxA3rx5kZiYiD/++AMLFizA9u3bceHCBdja2n7KrBGREQcOHMDkyZMRFBSE4cOHf+rsmCQzMzOMHz8enTp1QmpqKhwcHLBt27ZPnS0sWbIEq1atwtKlS9GuXbtPnZ2PIjIyErt378aRI0c+dVaIFNbW1ujcuTNmzJiBfv36QaVSZXraT1oid+LECZQtW1YZPnjwYMyYMQPLly9H27Zt/+lsEVEmNGzYEE+fPsXhw4c/dVZM3t27d3Hnzh0ULVoUTk5Onzo7KFmyJEqVKoVly5Z96qx8NN988w02btyImzdvvtPFkuhjO3XqFMqWLYs9e/agVq1amZ7us6ojp8l4VFQUAODp06cYMmQISpYsCTs7Ozg4OMDf3x9nz57VmzYxMRHjxo1DoUKFYG1tDXd3dzRr1gw3btwAAOWgNfapUaOGMq/9+/dDpVJh1apVGDlyJNzc3JAlSxY0atQId+7c0Vv2sWPHUK9ePTg6OsLW1hbVq1c3epGrUaOGweWPGzdOL+3SpUvh7e0NGxsbuLi4oE2bNgaXn966aUtNTcWsWbNQvHhxWFtbI0eOHOjZs6deBfU8efKgQYMGesvp27ev3jwN5X3q1Kl62xQAXr9+jbFjx6JAgQKwsrKCp6cnhg4dmqk6ATVq1NCb36RJk6BWq7F8+fL32h7Tpk1DpUqVkDVrVtjY2MDb2xtr1641uPylS5eifPnysLW1hbOzM6pVq4Zdu3bppPn9999RvXp12Nvbw8HBAeXKldPL25o1a5TfNFu2bOjQoQPu3bunk6ZLly46eXZ2dkaNGjVw6NChDLfT35kWAPbu3YuqVasiS5YscHJyQuPGjXH58mWdNEePHkWJEiXQpk0buLi4wMbGBuXKlcPGjRuVNC9evECWLFnwzTff6C3j7t27MDMzQ1BQkJLnPHny6KVLu2/dunULX3/9NQoXLgwbGxtkzZoVLVu2xM2bN3Wm0xy/+/fvV4adOHECdevWhb29PbJkyWJwm4SGhkKlUuHkyZPKsMePHxvcxxs0aGAwz5k5F4wbN07ZF3PlygUfHx+Ym5vDzc1NL9+GaKbXfOzt7VG+fHmd7Q+8PWZKlChhdD6a4yQ0NBTA2wYrFy5cgKenJwICAuDg4GB0WwHAX3/9hZYtW8LFxQW2traoWLGiXqniu5xL3+
UYf5dzblobN25ErVq1DAZx6Z070u5nmVl/Y5KTk/H9998jf/78sLKyUh6ppT0X5smTR1m+Wq2Gm5sbWrdujdu3b+uke9dz+65du1C6dGlYW1ujWLFiWL9+vV4eY2NjMXDgQOTJkwdWVlbIlSsXOnXqhMePHwMAgoKCULx4cdja2sLFxQWNGjXC6dOnjW7PtPtnYmIinJ2doVKpMG3aNGV42v077Uezv2rLzG+W2WuVZp81dC2ws7NDly5dlO+ac4ahz927dwEA586dQ5cuXZAvXz5YW1vDzc0NXbt2xZMnT/Tm7+3tDRcXF2zatElvXHo+6aPVtDRBV9asWQG8PVA2btyIli1bIm/evIiJicHPP/+M6tWr49KlS/Dw8ADwtr5JgwYNsGfPHrRp0wbffPMNnj9/jvDwcFy4cAH58+dXltG2bVvUr19fZ7kjRowwmJ9JkyZBpVJh2LBhePjwIWbNmoU6deogMjISNjY2AN5e+Pz9/eHt7Y2xY8dCrVYjJCQEtWrVwqFDh1C+fHm9+ebKlUu5iL148QK9e/c2uOzRo0ejVatW6N69Ox49eoS5c+eiWrVqOHPmjMG79x49eqBq1aoAgPXr12PDhg0643v27KmUhvbv3x9RUVH46aefcObMGRw+fBgWFhYGt8O7iI2NVdZNW2pqKho1aoQ//vgDPXr0QNGiRXH+/HnMnDkTf/75p95BnpGQkBB89913mD59utFHQBltj9mzZ6NRo0Zo3749kpKSsHLlSrRs2RJbt25FQECAkm78+PEYN24cKlWqhAkTJsDS0hLHjh3D3r174evrC+DtAd21a1cUL14cI0aMgJOTE86cOYMdO3Yo+dNs+3LlyiEoKAgxMTGYPXs2Dh8+rPebZsuWDTNnzgTwNvCZPXs26tevjzt37mRYcvO+0+7evRv+/v7Ily8fxo0bh1evXmHu3LmoXLkyTp8+rQQuT548waJFi2BnZ4f+/fsje/bsWLp0KZo1a4Zly5ahbdu2sLOzQ9OmTbFq1SrMmDEDZmZmynJWrFgBEUH79u3TXY+0Tpw4gSNHjqBNmzbIlSsXbt68iQULFqBGjRq4dOmS0eoY169fR40aNWBra4tvv/0Wtra2+OWXX1CnTh2Eh4ejWrVq75QPY97nXKAxffp0xMTEvNPyfvvtNwBvg8358+ejZcuWuHDhAgoXLvxe+ddcWCZPngw3Nzd8++23sLa2NritYmJiUKlSJbx8+RL9+/dH1qxZERYWhkaNGmHt2rVo2rSpzrwzcy5Ny9gx/ne2871793D79m18+eWX6W4L7evE9u3bsWLFCp3x77r+aXXv3h1hYWFo0aIFBg8ejGPHjiEoKAiXL1/WO09VrVoVPXr0QGpqKi5cuIBZs2bh/v37OsH1u5zbr127htatW6NXr17o3LkzQkJC0LJlS+zYsQN169YF8Pa6VLVqVVy+fBldu3bFl19+icePH2Pz5s24e/cusmXLhp07dyIgIAAFChRATEwMli1bhsqVK2PHjh2oXr26zjpYW1sjJCQETZo0UYatX78eiYmJRrfRggULYGdnp3yPiorCmDFjjKZv2rQpmjVrBgA4dOgQFi1alM4vYPxa9T401cS0ubi4AADCw8Px119/ITAwEG5ubrh48SIWLVqEixcv4ujRo3o3FF9++eW7P+2QTyAkJEQAyO7du+XRo0dy584dWblypWTNmlVsbGzk7t27IiKSmJgoKSkpOtNGRUWJlZWVTJgwQRkWHBwsAGTGjBl6y0pNTVWmAyBTp07VS1O8eHGpXr268n3fvn0CQHLmzCnx8fHK8NWrVwsAmT17tjLvggULip+fn7IcEZGXL19K3rx5pW7dunrLqlSpkpQoUUL5/ujRIwEgY8eOVYbdvHlTzMzMZNKkSTrTnj9/XszNzfWGX7t2TQBIWFiYMmzs2LGi/fMeOnRIAMiyZct0pt2xY4fecC8vLwkICNDLe58+fSTtLpM270OHDhVXV1fx9vbW2a
a//fabqNVqOXTokM70CxcuFABy+PBhveVpq169ujK/bdu2ibm5uQwePNhg2sxsD5G3v5O2pKQkKVGihNSqVUtnXmq
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Определяем целевую переменную и признаки\n",
|
|||
|
"X = train_data.drop(columns=['Manufacturer']) # Признаки\n",
|
|||
|
"y = train_data['Manufacturer'] # Целевая переменная\n",
|
|||
|
"\n",
|
|||
|
"# Функция для визуализации распределения\n",
|
|||
|
"def plot_class_distribution(data, title):\n",
|
|||
|
" data['Manufacturer'].value_counts().plot(kind='bar', title=title)\n",
|
|||
|
" plt.xlabel('Manufacturer')\n",
|
|||
|
" plt.ylabel('Count')\n",
|
|||
|
" plt.show()\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов до оверсэмплинга\n",
|
|||
|
"plot_class_distribution(train_data, 'Распределение классов в обучающей выборке (до оверсэмплинга)')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"Выполним оверсэмплинг и андерсэмплинг."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 87,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoEAAAIwCAYAAAD5xbFoAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1gUV9sG8HuX3hGRpqjYezRYgr2g2HvBgr0llqgx1lhjNPYWayLF3ks0iYq9oUYUsUeN2MEKKIIIPN8ffjsvy+4CEo2+796/69pLmTk7c2Z25swzM6eoRERAREREREZF/bEzQERERET/PgaBREREREaIQSARERGREWIQSERERGSEGAQSERERGSEGgURERERGiEEgERERkRFiEEhERERkhBgEEhEAICEhAXfv3sXz588/dlboPXvx4gWioqKQkJDwsbNCRJ8QBoFERmzTpk2oV68e7OzsYGtri/z582PGjBkfO1v/FV6+fIl58+Ypf8fGxmLRokUfL0PpiAiWL1+OL774AtbW1rC3t4eXlxdWr179sbP23pw+fRrm5ua4ffv2x84K0Ufz9OlT2NjY4Pfff8/R91U5HTYuODgYPXr0UP62sLBA/vz50aBBA4wbNw6urq45yhAR/TtGjRqF6dOno0WLFvD394ezszNUKhWKFSsGT0/Pj529T15qaiocHBywbNky1KxZE7Nnz8bVq1exe/fuj501dOzYERs2bEC3bt3QtGlTODg4QKVSoVy5csiTJ8/Hzt57Ub9+fXh4eCAkJORjZ4Xoo/r6669x7NgxhIeHv/N3Tf/pyidPngwvLy8kJSXh2LFjWLJkCX7//XdcvHgR1tbW/3TxRPQBHD58GNOnT8e0adMwatSoj52d/0omJiaYNGkSunbtirS0NNjb2+O333772NnCypUrsWHDBqxevRqdOnX62Nn5ICIiIrBv3z6cOHHiY2eF6KPr378/FixYgAMHDqBu3brv9N1//CTwzz//RMWKFZXp33zzDebMmYO1a9eiY8eOOVk0EX1gzZo1w7Nnz3D8+PGPnZX/evfu3cPdu3dRsmRJODo6fuzsoGzZsihXrhzWrFnzsbPywXz99dfYvn07oqKioFKpPnZ2iD66smXLokKFCli5cuU7fe+91wnURKG3bt0CADx79gzDhw9H2bJlYWtrC3t7ezRq1Ajnz5/X+W5SUhImTpyIYsWKwdLSEu7u7mjdujVu3rwJAMoJb+hTu3ZtZVmHDh2CSqXChg0bMGbMGLi5ucHGxgbNmzfH3bt3ddZ96tQpNGzYEA4ODrC2tkatWrUMXiBr166td/0TJ07USbt69Wp4e3vDysoKTk5O8Pf317v+zLYtvbS0NMybNw+lS5eGpaUlXF1d0a9fP53K/AULFkTTpk111jNw4ECdZerL+8yZM3X2KQC8fv0aEyZMQJEiRWBhYQFPT0+MGDECr1+/1ruv0qtdu7bO8n744Qeo1WqsXbs2R/tj1qxZqFq1KnLnzg0rKyt4e3tj8+bNete/evVqVK5cGdbW1siVKxdq1qyJvXv3aqX5448/UKtWLdjZ2cHe3h6VKlXSydumTZuU39TZ2RldunTB/fv3tdJ0795dK8+5cuVC7dq1cfTo0Sz30z/5LgAcOHAANWrUgI2NDRwdHdGiRQtcuXJFK83JkydRpkwZ+Pv7w8nJCVZWVqhUqRK2b9+upHn58iVsbGzw9ddf66zj3r17MDExwbRp05Q8FyxYUCddxmPr9u3b+Oqrr1C8eHFYWVkhd+7caNeuHaKiorS+pzl/Dx06pEz7888/Ub9+fdjZ2cHGxkbvPgkODoZKpcKZM2eUaU+ePNF7jDdt2lRvnrNTFkycOFE5FvPlywcfHx+YmprCzc1NJ9/6aL6v+djZ2aFy5cpa+x94e86UKVPG4HI050lwcDCAt417Ll68CE9PTzRp0gT29vYG9xUA/P3332jXrh2cnJxgbW2NL774Qudp5ruUpe
9yjr9LmZvR9u3bUbduXZ3yoGDBgpleI9JLSUnB999/j8KFC8PCwgIFCxbEmDFj9JZl2SkX3ncZbkh2zu/sHl/Aux3vV69eRfv27WFvb4/cuXPj66+/RlJSks4yMytrjx49Cl9fXzg7O8PKygoVKlTAkiVLkPF5lOY627JlS53l9+vXDyqVSuvceJf4QCNjWavvWq6vbLt79y6srKygUqm0yq53ve7q+0yZMgUAkJycjPHjx8Pb2xsODg6wsbFBjRo1cPDgQZ3lA2+rR+zcuVNnP2blH78OzkgTsOXOnRvA20Jm+/btaNeuHby8vBATE4Nly5ahVq1auHz5Mjw8PAC8rV/TtGlT7N+/H/7+/vj666/x4sULhIaG4uLFiyhcuLCyjo4dO6Jx48Za6x09erTe/Pzwww9QqVQYOXIkHj16hHnz5sHX1xcRERGwsrIC8PakatSoEby9vTFhwgSo1WoEBQWhbt26OHr0KCpXrqyz3Hz58ikXwJcvX+LLL7/Uu+5x48ahffv26N27Nx4/foyFCxeiZs2aOHfunN6nBn379kWNGjUAAFu3bsW2bdu05vfr1095Cjt48GDcunULP/30E86dO4fjx4/DzMxM7354F7Gxscq2pZeWlobmzZvj2LFj6Nu3L0qWLIkLFy5g7ty5+Ouvv/QWMJkJCgrCd999h9mzZxt8bZXV/pg/fz6aN2+Ozp07Izk5GevXr0e7du2wa9cuNGnSREk3adIkTJw4EVWrVsXkyZNhbm6OU6dO4cCBA2jQoAGAtwFEz549Ubp0aYwePRqOjo44d+4cdu/ereRPs+8rVaqEadOmISYmBvPnz8fx48d1flNnZ2fMnTsXwNugaf78+WjcuDHu3r2b5ROjnH533759aNSoEQoVKoSJEyciMTERCxcuRLVq1XD27FmlMHv69CmWL18OW1tbDB48GHny5MHq1avRunVrrFmzBh07doStrS1atWqFDRs2YM6cOTAxMVHWs27dOogIOnfunOl2ZPTnn3/ixIkT8Pf3R758+RAVFYUlS5agdu3auHz5ssEqJDdu3EDt2rVhbW2Nb7/9FtbW1vj555/h6+uL0NBQ1KxZ853yYUhOygKN2bNnIyYm5p3Wt2rVKgBvA9XFixejXbt2uHjxIooXL56j/D99+hQAMH36dLi5ueHbb7+FpaWl3n0VExODqlWr4tWrVxg8eDBy586NkJAQNG/eHJs3b0arVq20lp2dsjQjQ+f4P9nP9+/fx507d/D555/rnV++fHl88803WtNWrlyJ0NBQrWm9e/dGSEgI2rZti2+++QanTp3CtGnTcOXKFa1yJjvlQnofsgzP7vmtkdXx9a6/Q/v27VGwYEFMmzYNJ0+exIIFC/D8+XOtp09ZlbUnTpyAi4sLvvvuO5iYmODw4cP46quvEBkZiSVLlmitz9LSEr/99hsePXoEFxcXAEBiYiI2bNgAS0tLvfvoXeIDQLusBYCAgACDaTXGjx+vN/h9V/Xr10fXrl21ppUvXx4AEB8fj19++QUdO3ZEnz598OLFC6xYsQJ+fn44ffq0kk7D29sbc+fOxaVLlzK9cdQhORQUFCQAZN++ffL48WO5e/eurF+/XnLnzi1WVlZy7949ERFJSkqS1NRUre/eunVLLCwsZPLkycq0wMBAASBz5szRWVdaWpryPQAyc+ZMnTSlS5eWWrVqKX8fPHhQAEjevHklPj5emb5x40YBIPPnz1eWXbRoUfHz81PWIyLy6tUr8fLykvr16+usq2rVqlKmTBnl78ePHwsAmTBhgjItKipKTExM5IcfftD67oULF8TU1FRn+vXr1wWAhISEKNMmTJgg6X+io0ePCgBZs2aN1nd3796tM71AgQLSpEkTnbwPGDBAMv7sGfM+YsQIcXFxEW9vb619umrVKlGr1XL06FGt7y9dulQAyPHjx3XWl16tWrWU5f32229iamoq33zzjd602dkfIm9/p/SSk5OlTJkyUrduXa1lqd
VqadWqlc6xqPnNY2Njxc7OTqpUqSKJiYl60yQnJ4uLi4uUKVNGK82uXbsEgIwfP16Z1q1bNylQoIDWcpYvXy4A5PT
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Применение оверсэмплинга\n",
|
|||
|
"ros = RandomOverSampler(random_state=42)\n",
|
|||
|
"X_resampled, y_resampled = ros.fit_resample(X, y)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame для оверсэмплинга\n",
|
|||
|
"train_resampled_over = pd.DataFrame(X_resampled, columns=X.columns)\n",
|
|||
|
"train_resampled_over['Manufacturer'] = y_resampled\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов после оверсэмплинга\n",
|
|||
|
"plot_class_distribution(train_resampled_over, 'Распределение классов в обучающей выборке (после оверсэмплинга)')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 88,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAooAAAIwCAYAAAABJUqfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD9SElEQVR4nOzdd1gU1/s28HuX3rECVmwoVhRL7A1716hYEXtvMXYlloQYey8JRey9JcYaNRox9l6iUewiioCiiMLz/sG7891lZxFMIuGX+3Nde10w9ezszJlnzpx5RiMiAiIiIiKiVLSZXQAiIiIi+ndioEhEREREqhgoEhEREZEqBopEREREpIqBIhERERGpYqBIRERERKoYKBIRERGRKgaKRERERKSKgSLRf1h8fDzu37+PFy9eZHZR6G/28uVLREREID4+PrOLQkRZGANFov+YTZs2oX79+nBwcIC9vT0KFCiA7777LrOLlSW8evUK8+bNU/6PiYnB4sWLM69AekQEK1aswGeffQZbW1s4OjqiUKFCWL16dWYX7W9z8uRJWFpa4u7du5ldFKIsZdmyZShQoADevn2b4Xk1GXmFX2hoKPz9/ZX/raysUKBAATRs2BCTJk2Ci4tLhgtARJ/O2LFjMWPGDLRq1Qq+vr7ImTMnNBoNPDw8kD9//swu3r9eUlISnJycsHz5ctSqVQuzZ8/G9evXsWfPnswuGjp16oQNGzbAz88PzZs3h5OTEzQaDcqWLYtcuXJldvH+Fg0aNECePHmwcuXKzC4KUZaSkJAAd3d3jB8/HkOHDs3QvOYfs8KpU6eiUKFCSEhIwLFjx7B06VLs3r0bly9fhq2t7ccskoj+YUeOHMGMGTMQGBiIsWPHZnZxsiQzMzNMmTIF3bt3R3JyMhwdHfHTTz9ldrEQFhaGDRs2YPXq1ejcuXNmF+cfcf78eRw4cADHjx/P7KIQZTnW1tbw8/PDnDlzMGTIEGg0mnTP+1EtiqdOnULFihWV4V988QXmzJmDtWvXolOnThkrPRF9Ei1atEB0dDR+++23zC5KlvfgwQPcv38fnp6ecHZ2zuzioEyZMihbtizWrFmT2UX5xwwbNgzbt29HREREhk5yRJTizJkzqFixIg4ePIh69eqle76/pY+iboV37twBAERHR2PUqFEoU6YM7O3t4ejoiCZNmuDChQtG8yYkJOCrr76Ch4cHrK2t4ebmhrZt2+LPP/8EAKVSMPWpU6eOsqzDhw9Do9Fgw4YNGD9+PFxdXWFnZ4eWLVvi/v37Ruv+/fff0bhxYzg5OcHW1ha1a9c2eRKtU6eO6vq/+uoro2lXr14Nb29v2NjYIHv27PD19VVdf1rfTV9ycjLmzZuHUqVKwdraGi4uLujXr5/RAwju7u5o3ry50XoGDx5stEy1ss+cOdNomwLA27dvERAQgKJFi8LKygr58+fH6NGj09XXoU6dOkbL+/rrr6HVarF27dqP2h6zZs1CtWrVkCNHDtjY2MDb2xubN29WXf/q1atRuXJl2NraIlu2bKhVqxb27dtnMM3PP/+M2rVrw8HBAY6OjqhUqZJR2TZt2qT8pjlz5kTXrl3x8OFDg2l69OhhUOZs2bKhTp06OHr06Ae301+ZFwB++eUX1KxZE3Z2dnB2dkarVq1w7do1g2lOnDiB0qVLw9fXF9mzZ4eNjQ0qVaqE7du3K9O8evUKdnZ2GDZsmNE6Hjx4ADMzMwQGBipldnd3N5ou9b519+5dDBw4EMWLF4eNjQ1y5MiB9u3bIyIiwmA+3fF7+PBhZdipU6fQoEEDODg4wM7OTnWbhIaGQqPR4PTp08qwZ8+eqe7jzZs3Vy1zeuqCr776StkX8+XLh6pVq8Lc3Byurq5G5Vajm1/3cXBwQOXKlQ22P5ByzJQuXdrkcnTHSWhoKICUB5IuX76M/Pnzo1mzZnB0dDS5rQDg9u3baN++PbJnzw5bW1t89tlnRq2iGalLM3KMZ6
TOTW379u2oV6+eUX3g7u6e5jlC3/v37zFt2jQUKVIEVlZWyq04tbosPfXC312Hq7l48SJ69OiBwoULw9raGq6urujZsyeeP39uMJ1u/3r27JnB8NOnTxvsL+nZdqmnzeg5aN++ffDy8oK1tTVKliyJrVu3Gq07JiYGI0aMgLu7O6ysrJAvXz50795dKX9gYCBKlSoFW1tbZM+eHS1btsTZs2cNlqG//VMfRwkJCciWLRs0Gg1mzZpltJ1MfdS2k6lp9euw9J5TdceW2jnL3t4ePXr0UP7X1W1qnwcPHgBI//4BAN7e3siePTt27NhhNC4tH3XrOTVdUJcjRw4AKRXR9u3b0b59exQqVAiRkZFYvnw5ateujatXryJPnjwAUvr7NG/eHAcPHoSvry+GDRuGly9fYv/+/bh8+TKKFCmirKNTp05o2rSpwXrHjRunWp6vv/4aGo0GY8aMwdOnTzFv3jz4+Pjg/PnzsLGxAZByYm3SpAm8vb0REBAArVaLkJAQ1KtXD0ePHkXlypWNlpsvXz7lJPnq1SsMGDBAdd2TJk1Chw4d0Lt3b0RFRWHhwoWoVasWzp07p9r60LdvX9SsWRMAsHXrVmzbts1gfL9+/ZTW3KFDh+LOnTtYtGgRzp07h99++w0WFhaq2yEjYmJilO+mLzk5GS1btsSxY8fQt29feHp64tKlS5g7dy7++OMPo4PzQ0JCQjBx4kTMnj3b5C2yD22P+fPno2XLlujSpQsSExOxfv16tG/fHj/++COaNWumTDdlyhR89dVXqFatGqZOnQpLS0v8/vvv+OWXX9CwYUMAKQdiz549UapUKYwbNw7Ozs44d+4c9uzZo5RPt+0rVaqEwMBAREZGYv78+fjtt9+MftOcOXNi7ty5AFICq/nz56Np06a4f//+B1uePnbeAwcOoEmTJihcuDC++uorvHnzBgsXLkT16tVx9uxZJTB6/vw5VqxYAXt7ewwdOhS5cuXC6tWr0bZtW6xZswadOnWCvb092rRpgw0bNmDOnDkwMzNT1rNu3TqICLp06ZLm90jt1KlTOH78OHx9fZEvXz5ERERg6dKlqFOnDq5evWqyu8qtW7dQp04d2Nra4ssvv4StrS2+//57+Pj4YP/+/ahVq1aGymHKx9QFOrNnz0ZkZGSG1rdq1SoAKcHskiVL0L59e1y+fBnFixf/qPLrTggzZsyAq6srvvzyS1hbW6tuq8jISFSrVg2vX7/G0KFDkSNHDqxcuRItW7bE5s2b0aZNG4Nlp6cuTc3UMf5XtvPDhw9x7949VKhQQXW8l5cXvvjiC4NhYWFh2L9/v8Gw3r17Y+XKlfj888/xxRdf4Pfff0dgYCCuXbtmUM+kp17Q90/W4fv378ft27fh7+8PV1dXXLlyBStWrMCVK1dw4sSJv9y6qr/t7ty5g8mTJxtNk5Hy37x5Ex07dkT//v3h5+eHkJAQtG/fHnv27EGDBg0ApJw/a9asiWvXrqFnz56oUKECnj17hp07d+LBgwfImTMn9u7di2bNmqFo0aKIjIzEmjVrUL16dezZswe1a9c2KJ+1tTVCQkLQunVrZdjWrVuRkJBg8nsvXboU9vb2yv+mvrtOmzZt0LZtWwDA0aNHsWLFijS2qulz6sfQdffTlz17dgAZ3z8qVKiQ8btKkgEhISECQA4cOCBRUVFy//59Wb9+veTIkUNsbGzkwYMHIiKSkJAgSUlJBvPeuXNHrKysZOrUqcqw4OBgASBz5swxWldycrIyHwCZOXOm0TSlSpWS2rVrK/8fOnRIAEjevHklLi5OGb5x40YBIPPnz1eWXaxYMWnUqJGyHhGR169fS6FChaRBgwZG66pWrZqULl1a+T8qKkoASEBAgDIsIiJCzMzM5OuvvzaY99KlS2Jubm40/ObNmwJAVq5cqQwLCAgQ/Z/l6NGjAkDWrFljMO+ePXuMhhcsWFCaNWtmVPZBgwZJ6p86ddlHjx4tuXPnFm9vb4NtumrVKt
FqtXL06FGD+ZctWyYA5LfffjNan77atWsry/vpp5/E3NxcvvjiC9Vp07M9RFJ+J32JiYlSunRpqVevnsGytFqttGn
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Применение андерсэмплинга\n",
|
|||
|
"rus = RandomUnderSampler(random_state=42)\n",
|
|||
|
"X_resampled, y_resampled = rus.fit_resample(X, y)\n",
|
|||
|
"\n",
|
|||
|
"# Создание нового DataFrame для андерсэмплинга\n",
|
|||
|
"train_resampled_under = pd.DataFrame(X_resampled, columns=X.columns)\n",
|
|||
|
"train_resampled_under['Manufacturer'] = y_resampled\n",
|
|||
|
"\n",
|
|||
|
"# Визуализация распределения классов после андерсэмплинга\n",
|
|||
|
"plot_class_distribution(train_resampled_under, 'Распределение классов в обучающей выборке (после андерсэмплинга)')"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "aimenv",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.7"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 2
|
|||
|
}
|