AIM-PIbd-32-Kamcharova-K-A/lab_4/lab4.ipynb

831 lines
207 KiB
Plaintext
Raw Normal View History

2024-11-14 11:47:25 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"# Вариант задания: Прогнозирование выпучки в магазинах\n",
"### Бизнес-цели:\n",
"Цель: Разработать модель машинного обучения, которая позволит прогнозировать распродажи магазина в зависимоси от его ассортимента.\n",
"\n",
"### Цели технического проекта:\n",
"\n",
"Сбор и подготовка данных:\n",
"Очистка данных от пропусков, выбросов и дубликатов.\n",
"Преобразование категориальных переменных в числовые.\n",
"Разделение данных на обучающую и тестовую выборки.\n"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 62,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['Store ID ', 'Store_Area', 'Items_Available', 'Daily_Customer_Count',\n",
" 'Store_Sales'],\n",
" dtype='object')\n"
]
}
],
"source": [
2024-11-28 22:00:12 +04:00
"import pandas as pn\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import matplotlib.ticker as ticker\n",
"df = pn.read_csv(\".//static//csv//Stores.csv\")\n",
2024-11-14 11:47:25 +04:00
"print(df.columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"Разделим на 3 выборки\n"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 63,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-28 22:00:12 +04:00
"Размер обучающей выборки: 572\n",
"Размер контрольной выборки: 144\n",
"Размер тестовой выборки: 180\n"
2024-11-14 11:47:25 +04:00
]
}
],
"source": [
2024-11-28 22:00:12 +04:00
"from sklearn.model_selection import train_test_split\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Разделение данных на обучающую и тестовую выборки (80% - обучение, 20% - тест)\n",
"train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Разделение обучающей выборки на обучающую и контрольную (80% - обучение, 20% - контроль)\n",
"train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"print(\"Размер обучающей выборки:\", len(train_data))\n",
"print(\"Размер контрольной выборки:\", len(val_data))\n",
"print(\"Размер тестовой выборки:\", len(test_data))"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 64,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
2024-11-28 22:00:12 +04:00
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB5Z0lEQVR4nO3dd3gU1f4G8HdbdlM3vZFCSCEh9NACIgihCVhAsMAV1IvoBRW44r2oiB27iKJcvQh4BVFUUBRBCEVK6DUQQktISEjZ9LpJds/vj5D9sSRACBtmN3k/z7PPw86cnXmzs5t8OXPOjEwIIUBERERkg+RSByAiIiJqKhYyREREZLNYyBAREZHNYiFDRERENouFDBEREdksFjJERERks1jIEBERkc1iIUNEREQ2i4UMEVETFBYW4uzZs6ipqZE6ClmQEAL5+fk4c+aM1FGokVjIEBE1QnV1Nd577z106dIFarUabm5uCA8PR3x8vNTRbEJiYiLWrl1ren7kyBH8/vvv0gW6QklJCV5++WW0b98ednZ28PDwQEREBJKTk6WORo2glDoA3bply5bhscceMz1Xq9UICgrC0KFDMXfuXPj4+EiYjsj26fV6DB06FHv27MFTTz2FN954Aw4ODlAoFIiJiZE6nk0oKSnB1KlT4evrCw8PDzz33HMYMWIERo4cKWmuvLw8DBgwAGlpaXjmmWfQr18/2NnZQaVSoW3btpJmo8ZhIdOCvP766wgJCUFlZSV27tyJL774AuvXr0diYiIcHBykjkdks959913s3bsXGzduxMCBA6WOY5NiY2NNDwCIiIjAlClTJE4FzJ49G5cuXUJCQgKio6OljkNNwEKmBRkxYgR69OgBAPj73/8ODw8PfPTRR/jll1/w8MMPS5yOyDbV1NRgwYIF+Oc//8ki5hatXbsWJ0+eREVFBTp16gQ7OztJ8+Tk5GD58uVYvHgxixgbxjEyLdigQYMAACkpKQCA/Px8PP/88+jUqROcnJzg4uKCESNG4OjRo/VeW1lZiVdffRURERHQaDTw8/PDmDFjcO7cOQBAamoqZDLZNR9X/sLftm0bZDIZvv/+e7z44ovw9fWFo6Mj7rnnHqSnp9fb9969ezF8+HBotVo4ODhgwIAB2LVrV4M/48CBAxvc/6uvvlqv7bfffouYmBjY29vD3d0dDz30UIP7v97PdiWj0YgFCxYgOjoaGo0GPj4+mDp1KgoKCszatW3bFqNGjaq3n+nTp9fbZkPZ33///XrvKVB7umPevHkICwuDWq1GYGAgXnjhBej1+gbfqysNHDgQHTt2rLf8gw8+gEwmQ2pqqtnywsJCzJgxA4GBgVCr1QgLC8O7774Lo9FoalP3vn3wwQf1ttuxY8cGPxM//vjjNTNOnjy5UV37bdu2NR0fuVwOX19fPPjgg0hLS7vhawHg888/R3R0NNRqNfz9/TFt2jQUFhaa1icnJ6OgoADOzs4YMGAAHBwcoNVqMWrUKCQmJprabd26FTKZDGvWrKm3j5UrV0ImkyEhIcGUefLkyWZt6t6Tbdu2mZbt2LED48aNQ1BQkOkYz5w5ExUVFWavffXVV+t9llasWIGuXbtCo9HAw8MDDz/8cL33ZPLkyXBycjJb9uOPP9bLAQBOTk71MgON+14NHDjQdPw7dOiAmJgYHD16tMHvVUOu/p57enpi5MiRZu8/UPv9mT59+jW3s2zZMrPP9/79+2E0GlFVVYUePXpc970CgC1btqB///5wdHSEq6sr7r33XiQlJZm1qTsWp06dwvjx4+Hi4mI6lVZZWVkv75Xf95qaGtx9991wd3fHyZMnzdo29vdXa8QemRasrujw8PAAAJw/fx5r167FuHHjEBISguzsbPznP//BgAEDcPLkSfj7+wMADAYDRo0ahfj4eDz00EN47rnnUFJSgk2bNiExMRGhoaGmfTz88MO4++67zfY7Z86cBvO89dZbkMlk+Ne//oWcnBwsWLAAcXFxOHLkCOzt7QHU/qIYMWIEYmJiMG/ePMjlcixduhSDBg3Cjh070KtXr3rbDQgIwPz58wEApaWlePrppxvc99y5czF+/Hj8/e9/R25uLj799FPceeedOHz4MFxdXeu95sknn0T//v0BAD///HO9P1BTp041jU969tlnkZKSgs8++wyHDx/Grl27oFKpGnwfbkZhYaHpZ7uS0WjEPffcg507d+LJJ59EVFQUjh8/jo8//hinT582G1R5q8rLyzFgwABkZGRg6tSpCAoKwu7duzFnzhxcunQJCxYssNi+mqp///548sknYTQakZiYiAULFiAzMxM7duy47uteffVVvPbaa4iLi8PTTz+N5ORkfPHFF9i/f7/pGObl5QGo/VyHh4fjtddeQ2VlJRYtWoR+/fph//79iIiIwMCBAxEYGIgVK1bg/vvvN9vPihUrEBoaajqt0lirV69GeXk5nn76aXh4eGDfvn349NNPcfHiRaxevfqar1u5ciUmTpyILl26YP78+cjLy8PChQuxc+dOHD58GJ6enjeV41qa8r2q869//eum9hUZGYmXXnoJQgicO3cOH330Ee6+++5GF6wNqTu206dPR0xMDN555x3k5uY2+F5t3rwZI0aMQLt27fDqq6+ioqICn376Kfr164dDhw7VK7rHjx+Ptm3bYv78+dizZw8WLlyIgoICfPPNN9fM8/e//x3btm3Dpk2b0KFDB9PyW3mfWwVBNm/p0qUCgNi8ebPIzc0V6enpYtWqVcLDw0PY29uLixcvCiGEqKysFAaDwey1KSkpQq1Wi9dff9207OuvvxYAxEcffVRvX0aj0fQ6AOL999+v1yY6OloMGDDA9Hzr1q0CgGjTpo0oLi42Lf/hhx8EAPHJJ5+Yth0eHi6GDRtm2o8QQpSXl4uQkBAxZMiQevvq27ev6Nixo+l5bm6uACDmzZtnWpaamioUCoV46623zF57/PhxoVQq6y0/c+aMACCWL19uWjZv3jxx5ddlx44dAoBYsWKF2Ws3bNhQb3lwcLAYOXJkvezTpk0TV38Fr87+wgsvCG9vbxETE2P2nv7vf/8Tcrlc7Nixw+z1ixcvFgDErl276u3vSgMGDBDR0dH1lr///vsCgEhJSTEte+ONN4Sjo6M4ffq0Wdt///vfQqFQiLS0NCFE0z4Tq1evvmbGSZMmieDg4Ov+HELUvr+TJk0yW/bII48IBweH674uJydH2NnZiaFDh5p9Lz777DMBQHz99ddmWT09PYVOpzO1O336tFCpVGLs2LGmZXPmzBFqtVoUFhaa7UepVJod15CQEPHoo4+a5anbz9atW03LysvL6+WeP3++kMlk4sKFC6ZlV34+a2pqhI+PjwgNDRWlpaWmNtu2bRMAxD//+U/TskmTJglHR0ez7a9evbpeDiGEcHR0NHufb+Z7NWDAALPjv379egFADB8+vN53oCFXv14IIV588UUBQOTk5JiWARDTpk275nbqflfWfb7rnnfo0MHsva47Fle+V127dhXe3t4iLy/PtOzo0aNCLpebHcu6Y3HPPfeY7fsf//iHACCOHj1qlrfuczFnzhyhUCjE2rVrzV53s7+/WiOeWmpB4uLi4OXlhcDAQDz00ENwcnLCmjVr0KZNGwC1s5nk8tpDbjAYkJeXBycnJ7Rv3x6HDh0ybeenn36Cp6cnnnnmmXr7aEw38LU8+uijcHZ2Nj1/4IEH4Ofnh/Xr1wOonY555swZPPLII8jLy4NOp4NOp0NZWRkGDx6Mv/76y+xUBlB7Ckyj0Vx3vz///DOMRiPGjx9v2qZOp4Ovry/Cw8OxdetWs/ZVVVUAat+va1m9ejW0Wi2GDBlits2YmBg4OTnV22Z1dbVZO51OV6+b+WoZGRn49NNPMXfu3Hrd/6tXr0ZUVBQiIyPNtll3OvHq/d+K1atXo3///nBzczPbV1xcHAwGA/766y+z9uXl5fV+VoPB0OC2S0pKoNPpzE7lNIVer4dOp0NOTg42bdqELVu2YPDgwdd9zebNm1FVVYUZM2aYvhcAMGXKFLi4uNSbGvzYY4+ZejcBIDw8HPfccw82bNhg+vk
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHHCAYAAACle7JuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABzkUlEQVR4nO3dd3hUVf4/8PfMZFp67z0hBAKhhCIdpQsCiiKou4AKyqIroqjoIqir2FZxFXX9roIuKoJSFBGki/SWQEgIARIS0nuvM+f3R8j8GBJKQpI75f16nuFh7ty5877T8plzz7lHJoQQICIiIjJDcqkDEBEREbUWCxkiIiIyWyxkiIiIyGyxkCEiIiKzxUKGiIiIzBYLGSIiIjJbLGSIiIjIbLGQISIiIrNlI3UAIiIiU1dbW4vCwkLo9Xr4+vpKHYeuwhYZIiKSzOrVq5Gammq4vmrVKmRkZEgX6CrHjh3DQw89BHd3d6jVavj4+GDKlClSx6JrsJAxE6tWrYJMJjNcNBoNIiIi8NRTTyEnJ0fqeERErbJv3z688MILSE1NxbZt2zBv3jzI5dL/adq0aRMGDx6MhIQEvPnmm9i+fTu2b9+O//znP1JHo2vw0JKZef311xESEoLq6mr8+eef+Oyzz7BlyxbEx8fD1tZW6nhERC3y7LPPYvjw4QgJCQEALFiwAD4+PpJmKiwsxOOPP44xY8Zg3bp1UKlUkuahG2MhY2bGjRuHPn36AAAef/xxuLm54YMPPsCmTZswffp0idMREbVMZGQkLly4gPj4eLi7uyMsLEzqSFi5ciWqq6uxatUqFjFmQPr2O7otd911FwAgJSUFQMMvieeffx7du3eHvb09HB0dMW7cOMTFxTW5b3V1NZYuXYqIiAhoNBr4+Pjgvvvuw4ULFwAAqampRoezrr0MHz7csK09e/ZAJpPhhx9+wMsvvwxvb2/Y2dlh4sSJSE9Pb/LYhw8fxtixY+Hk5ARbW1sMGzYM+/fvb3Yfhw8f3uzjL126tMm6q1evRkxMDLRaLVxdXTFt2rRmH/9G+3Y1vV6P5cuXIyoqChqNBl5eXnjiiSdQVFRktF5wcDAmTJjQ5HGeeuqpJttsLvt7773X5DkFgJqaGixZsgTh4eFQq9UICAjACy+8gJqammafq6sNHz4c3bp1a7L8/fffh0wmM+qXAADFxcWYP38+AgICoFarER4ejnfeeQd6vd6wTuPz9v777zfZbrdu3Zp9T/z444/XzThz5kwEBwffdF+Cg4MNr49cLoe3tzcefPBBpKWl3dJ9Z86cabRszpw50Gg02LNnj9HyTz/9FFFRUVCr1fD19cW8efNQXFxstM6tPq9XZ27u0rjfVz+nH374IYKCgqDVajFs2DDEx8c3eZxdu3ZhyJAhsLOzg7OzMyZNmoTExMSbPm9XX67e7+u9d6/WktcdAHJzc/HYY4/By8sLGo0GPXr0wNdff93sNletWgU7Ozv0798fYWFhmDdvHmQyWZPX7HqZGi9KpRLBwcFYuHAhamtrDes1HpY/duzYdbc1fPhwo304dOgQevbsibfeesvweejUqRPefvtto88DANTX1+ONN95AWFgY1Go1goOD8fLLLzf5jDY+z7///jt69uwJjUaDrl27Yv369UbrNea9+vN55swZuLi4YMKECaivrzcsv5XPrDVgi4yZayw63NzcAAAXL17Exo0b8cADDyAkJAQ5OTn4z3/+g2HDhiEhIcHQ216n02HChAnYuXMnpk2bhmeeeQZlZWXYvn074uPjjX4VTZ8+HXfffbfR4y5atKjZPG+++SZkMhlefPFF5ObmYvny5Rg5ciRiY2Oh1WoBNHwRjxs3DjExMViyZAnkcjlWrlyJu+66C/v27UO/fv2abNff3x/Lli0DAJSXl2Pu3LnNPvbixYsxdepUPP7448jLy8PHH3+MoUOH4uTJk3B2dm5ynzlz5mDIkCEAgPXr12PDhg1Gtz/xxBNYtWoVZs2ahb///e9ISUnBJ598gpMnT2L//v1QKpXNPg8tUVxcbNi3q+n1ekycOBF//vkn5syZgy5duuD06dP48MMPce7cOWzcuPG2H7tRZWUlhg0bhoyMDDzxxBMIDAzEgQMHsGjRImRlZWH58uVt9litNWTIEMyZMwd6vR7x8fFYvnw5MjMzsW/fvhZtZ8mSJfjyyy/xww8/GP3xWrp0KV577TWMHDkSc+fORVJSEj777DMcPXq0Va/18uXLUV5eDgBITEzEW2+9hZdffhldunQBANjb2xut/80336CsrAzz5s1DdXU1PvroI9x11104ffo0vLy8AAA7duzAuHHjEBoaiqVLl6Kqqgoff/wxBg0ahBMnTjRbFDY+b1fnaE9VVVUYPnw4zp8/j6eeegohISFYt24dZs6cieLiYjzzzDPXve/58+fxf//3fy16vMbPcE1NDbZt24b3338fGo0Gb7zxRqv3oaCgAH/++Sf+/PNPPProo4iJicHOnTuxaNEipKam4vPPPzes+/jjj+Prr7/G/fffj+eeew6HDx/GsmXLkJiY2OT7JDk5GQ8++CCefPJJzJgxAytXrsQDDzyArVu3YtSoUc1mSU9Px9ixYxEZGYm1a9fCxqbhz7Y5fGY7jCCzsHLlSgFA7NixQ+Tl5Yn09HSxZs0a4ebmJrRarbh8+bIQQojq6mqh0+mM7puSkiLUarV4/fXXDcu++uorAUB88MEHTR5Lr9cb7gdAvPfee03WiYqKEsOGDTNc3717twAg/Pz8RGlpqWH52rVrBQDx0UcfGbbdqVMnMWbMGMPjCCFEZWWlCAkJEaNGjWryWAMHDhTdunUzXM/LyxMAxJIlSwzLUlNThUKhEG+++abRfU+fPi1sbGyaLE9OThYAxNdff21YtmTJEnH1R2Lfvn0CgPj222+N7rt169Ymy4OCgsT48eObZJ83b5649mN2bfYXXnhBeHp6ipiYGKPn9H//+5+Qy+Vi3759Rvf//PPPBQCxf//+Jo93tWHDhomoqKgmy9977z0BQKSkpBiWvfHGG8LOzk6cO3fOaN2XXnpJKBQKkZaWJoRo3Xti3bp11804Y8YMERQUdMP9EKLh+Z0xY4bRsoceekjY2tq26L7/+c9/BADx8ccfG62Tm5srVCqVGD16tNHn55NPPhEAxFdffWVY1pLntVHjc7F79+4mtzU+p1d/joUQ4vDhwwKAePbZZw3LevbsKTw9PUVBQYFhWVxcnJDL5eKvf/1rk237+fmJWbNm3TDH9d67zWW8ldd9+fLlAoBYvXq1YVltba0YMGCAsLe3N3w/NG5z5cqVhvWmTp0qunXrJgICApq83tfLdPX9hRDC19dX3H333Ybrjd+dR48eve62hg0bZrQPw4YNEwDE0qVLjdabOXOmACBOnz4thBAiNjZWABCPP/640XrPP/+8ACB27dplWBYUFCQAiJ9++smwrKSkRPj4+IhevXo1yZuSkiIKCwtF165dRefOnUV+fr7RY9zqZ9Ya8NCSmRk5ciQ8PDwQEBCAadOmwd7eHhs2bICfnx8AQK1WG3r863Q6FBQUwN7eHp07d8aJEycM2/npp5/g7u6Op59+usljXHsopCX++te/wsHBwXD9/vvvh4+PD7Zs2QIAiI2NRXJyMh566CEUFBQgPz8f+fn5qKiowIgRI/DHH380aRatrq6GRqO54eOuX78eer0eU6dONWwzPz8f3t7e6NSpE3bv3m20fmPTs1qtvu42161bBycnJ4waNcpomzExMbC3t2+yzbq6OqP18vPzUV1dfcPcGRkZ+Pjjj7F48eImv9DXrVuHLl26IDIy0mibjYcTr33827Fu3ToMGTIELi4uRo81cuRI6HQ6/PHHH0brV1ZWNtlXnU7X7LbLysqQn5/f5BBNS9XU1CA/Px+5ubnYvn07du3ahREjRtzy/Tdt2oS//e1vWLhwIZ566imj23bs2IHa2lrMnz/faMTM7Nmz4ejoiF9//dVofZ1O12T/Kysrb2v/Jk+ebPgcA0C/fv3Qv39/w2cnKysLsbGxmDlzJlxdXQ3rRUdHY9SoUYb1rlZbW3vD93ijxvd
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkcAAAHHCAYAAAC1G/yyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABxd0lEQVR4nO3dd3xT9f4/8Ff26N6D7hYKlF2GgAgKggguUBQXLvAiDkRR0au4rqj4U/SKev3eK7hRVFAUQbbIXgUKpRToonvvpGny+f1RGhtaSvdJ09fz8chDc3JyzisnSfPmfMaRCSEEiIiIiAgAIJc6ABEREZE9YXFEREREVA+LIyIiIqJ6WBwRERER1cPiiIiIiKgeFkdERERE9bA4IiIiIqqHxRERERFRPSyOiIjIodXU1CA3NxdpaWlSR6EugsURERG1yq+//oq4uDjr/bVr1+LEiRPSBaonKSkJs2fPRkBAANRqNfz8/DBy5EjwohDUHCyOupGVK1dCJpNZb1qtFr169cKjjz6KnJwcqeMRURdz/PhxPPHEE0hKSsLevXvxj3/8A2VlZVLHwt69ezF8+HBs3boVzz33HDZu3IhNmzZh7dq1kMlkUsejLkDGa6t1HytXrsT999+PV199FeHh4TAYDPjrr7/w5ZdfIjQ0FPHx8dDr9VLHJKIuIi8vD6NGjcKZM2cAANOmTcOPP/4oaabq6moMHDgQrq6u+OOPP+Dm5iZpHuqalFIHoM43efJkDB06FADw0EMPwcvLC++++y5+/vlnzJw5U+J0RNRV+Pj4ID4+3voPqz59+kgdCevWrUNiYiJOnTrFwohajc1qhGuuuQYAkJycDAAoLCzE008/jf79+8PZ2Rmurq6YPHkyjh492uC5BoMBL7/8Mnr16gWtVouAgABMmzYNZ8+eBQCkpKTYNOVdfBs3bpx1W9u3b4dMJsN3332H559/Hv7+/nBycsKNN96I9PT0Bvvet28frrvuOri5uUGv12Ps2LHYtWtXo69x3Lhxje7/5ZdfbrDuV199hdjYWOh0Onh6euKOO+5odP9Nvbb6LBYLli1bhpiYGGi1Wvj5+eHhhx9GUVGRzXphYWGYOnVqg/08+uijDbbZWPalS5c2OKYAYDQasXjxYkRFRUGj0SA4OBjPPPMMjEZjo8eqvnHjxqFfv34Nlr/zzjuQyWRISUmxWV5cXIz58+cjODgYGo0GUVFReOutt2CxWKzr1B23d955p8F2+/Xr1+hn4ocffrhkxvvuuw9hYWGXfS1hYWHW90cul8Pf3x+33377ZTvp1n9eY7f6+27uew0Av//+O8aOHQsXFxe4urpi2LBh+OabbwBc+vPa2GespqYGr732GiIjI6HRaBAWFobnn3++wfvb3NdfUVGBp556yvoeRkdH45133mnQV6fuM6jRaBAbG4s+ffpc8jPYmPqvRaFQoEePHpgzZw6Ki4ut67Tm/d+7dy/Cw8Px448/IjIyEmq1GiEhIXjmmWdQVVXV4PkfffQRYmJioNFoEBgYiHnz5tlkAP7+Hhw6dAijRo2CTqdDeHg4PvnkE5v16vJu377duiwzMxNhYWEYOnQoysvLrcvb8r2kjsczR2QtZLy8vAAA586dw9q1a3HbbbchPDwcOTk5+M9//oOxY8fi5MmTCAwMBACYzWZMnToVW7ZswR133IEnnngCZWVl2LRpE+Lj4xEZGWndx8yZM3H99dfb7HfRokWN5vnXv/4FmUyGZ599Frm5uVi2bBkmTJiAuLg46HQ6AMDWrVsxefJkxMbGYvHixZDL5VixYgWuueYa7Ny5E8OHD2+w3aCgICxZsgQAUF5ejrlz5za67xdffBEzZszAQw89hLy8PPz73//GVVddhSNHjsDd3b3Bc+bMmYMxY8YAAH766SesWbPG5vGHH37Y2qT5+OOPIzk5GR9++CGOHDmCXbt2QaVSNXocWqK4uNj62uqzWCy48cYb8ddff2HOnDno06cPjh8/jvfeew+nT5/G2rVr27zvOpWVlRg7diwyMjLw8MMPIyQkBLt378aiRYuQlZWFZcuWtdu+WmvMmDGYM2cOLBYL4uPjsWzZMmRmZmLnzp2XfM6yZcusP2oJCQl444038Pzzz1vPkjg7O1vXbe57vXLlSjzwwAOIiYnBokWL4O7ujiNHjmDDhg2488478cILL+Chhx4CAOTn5+PJJ5+0+ZzV99BDD+Hzzz/Hrbfeiqeeegr79u3DkiVLkJCQ0OCzeLnXL4TAjTfeiG3btuHBBx/EoEGDsHHjRixcuBAZGRl47733LnmcLvUZbMott9yCadOmoaamBnv27MGnn36KqqoqfPnlly3aTn0FBQU4d+4cnn/+eUybNg1PPfUUDh48iKVLlyI+Ph6//fabtbh8+eWX8corr2DChAmYO3cuEhMT8fHHH+PAgQMNvptFRUW4/vrrMWPGDMycORPff/895s6dC7VajQceeKDRLCUlJZg8eTJUKhXWr19v/ax05veSWklQt7FixQoBQGzevFnk5eWJ9PR0sWrVKuHl5SV0Op04f/68EEIIg8EgzGazzXOTk5OFRqMRr776qnXZZ599JgCId999t8G+LBaL9XkAxNKlSxusExMTI8aOHWu9v23bNgFA9OjRQ5SWllqXf//99wKAeP/9963b7tmzp5g0aZJ1P0IIUVlZKcLDw8W1117bYF+jRo0S/fr1s97Py8sTAMTixYuty1JSUoRCoRD/+te/bJ57/PhxoVQqGyxPSkoSAMTnn39uXbZ48WJR/2u1c+dOAUB8/fXXNs/dsGFDg+WhoaFiypQpDbLPmzdPXPxVvTj7M888I3x9fUVsbKzNMf3yyy+FXC4XO3futHn+J598IgCIXbt2NdhffWPHjhUxMTENli9dulQAEMnJydZlr732mnBychKnT5+2Wfe5554TCoVCpKWlCSFa95lYvXr1JTPOmjVLhIaGNvk6hKg9vrNmzbJZdueddwq9Xn/Z516cZ9u2bQ0ea+57XVxcLFxcXMSIESNEVVWVzbr1P8916o7XihUrGjwWFxcnAIiHHnrIZvnTTz8tAIitW7dalzXn9a9du1YAEK+//rrNerfeequQyWTizJkz1mXN/QxeysXPF6L2e9q3b1/r/da8/7NmzRIAxH333WezXt13c926dUIIIXJzc4VarRYTJ060+Xv34YcfCgDis88+sy4bO3asACD+3//7f9ZlRqNRDBo0SPj6+orq6mqbvNu2bRMGg0GMGzdO+Pr62hw3Idr+vaSOx2a1bmjChAnw8fFBcHAw7rjjDjg7O2PNmjXo0aMHAECj0UAur/1omM1mFBQUwNnZGdHR0Th8+LB1Oz/++CO8vb3x2GOPNdhHW0aE3HvvvXBxcbHev/XWWxEQEID169cDAOLi4pCUlIQ777wTBQUFyM/PR35+PioqKjB+/Hj8+eefNs04QG3zn1arbXK/P/30EywWC2bMmGHdZn5+Pvz9/dGzZ09s27bNZv3q6moAtcfrUlavXg03Nzdce+21NtuMjY2Fs7Nzg22aTCab9fLz82EwGJrMnZGRgX//+9948cUXbc5i1O2/T58+6N27t80265pSL95/W6xevRpjxoyBh4eHzb4mTJgAs9mMP//802b9ysrKBq/VbDY3uu2ysjLk5+c3aO5oKaPRiPz8fOTm5mLTpk3YunUrxo8f36Zt1mnue71p0yaUlZXhueeea/CZbOn3pu47sWDBApvlTz31FADgt99+s1l+ude/fv16KBQKPP744w22J4TA77//3miOpj6DTan7DGRnZ+PHH3/E0aNHG30/WvP+L1y40Ob+k08+CYVCYT0mmzdvRnV1NebPn2/9ewcAs2fPhqura4Njp1Qq8fDDD1vvq9VqPPzww8jNzcWhQ4ds1rVYLLj33nuxd+9erF+/3uYsOtC530tqHTardUPLly9Hr169oFQq4efnh+joaJs/DhaLBe+//z4++ugjJCcn2/xg1TW9AbXNcdHR0VAq2/dj1LNnT5v7MpkMUVFR1v4tSUlJAIBZs2ZdchslJSXw8PCw3s/Pz2+w3YslJSVBCHHJ9S5u/qr
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
2024-11-14 11:47:25 +04:00
}
],
"source": [
"\n",
2024-11-28 22:00:12 +04:00
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Гистограмма распределения цены в обучающей выборке\n",
"sns.histplot(train_data[\"Store_Sales\"], kde=True)\n",
"plt.title('Распределение цены в обучающей выборке')\n",
"plt.show()\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Гистограмма распределения цены в контрольной выборке\n",
"sns.histplot(val_data[\"Store_Sales\"], kde=True)\n",
"plt.title('Распределение цены в контрольной выборке')\n",
"plt.show()\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Гистограмма распределения цены в тестовой выборке\n",
"sns.histplot(test_data[\"Store_Sales\"], kde=True)\n",
"plt.title('Распределение цены в тестовой выборке')\n",
"plt.show()"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"## Процесс конструирования признаков\n",
"\n",
2024-11-14 11:47:25 +04:00
"\n",
"\n",
2024-11-28 22:00:12 +04:00
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"One-hot encoding: Преобразование категориальных признаков в бинарные векторы."
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 65,
2024-11-14 11:47:25 +04:00
"metadata": {},
2024-11-28 22:00:12 +04:00
"outputs": [],
2024-11-14 11:47:25 +04:00
"source": [
"import pandas as pd\n",
"\n",
2024-11-28 22:00:12 +04:00
"# Пример категориальных признаков\n",
"categorical_features = [\n",
" \"Store ID \",\n",
" \"Store_Area\"\n",
2024-11-14 11:47:25 +04:00
"]\n",
"\n",
2024-11-28 22:00:12 +04:00
"# Применение one-hot encoding\n",
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"### Дискретизация числовых признаков "
2024-11-14 11:47:25 +04:00
]
},
{
2024-11-28 22:00:12 +04:00
"cell_type": "code",
"execution_count": 66,
2024-11-14 11:47:25 +04:00
"metadata": {},
2024-11-28 22:00:12 +04:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store_Area</th>\n",
" <th>Store_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1659</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1461</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1340</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1451</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1770</td>\n",
" <td>(1744.333, 2229.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1442</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1542</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1261</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1090</td>\n",
" <td>(775.0, 1259.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1030</td>\n",
" <td>(775.0, 1259.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1187</td>\n",
" <td>(775.0, 1259.667]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1751</td>\n",
" <td>(1744.333, 2229.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1746</td>\n",
" <td>(1744.333, 2229.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>1615</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1469</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1644</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1578</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>1703</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1438</td>\n",
" <td>(1259.667, 1744.333]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1940</td>\n",
" <td>(1744.333, 2229.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store_Area Store_Area\n",
"0 1659 (1259.667, 1744.333]\n",
"1 1461 (1259.667, 1744.333]\n",
"2 1340 (1259.667, 1744.333]\n",
"3 1451 (1259.667, 1744.333]\n",
"4 1770 (1744.333, 2229.0]\n",
"5 1442 (1259.667, 1744.333]\n",
"6 1542 (1259.667, 1744.333]\n",
"7 1261 (1259.667, 1744.333]\n",
"8 1090 (775.0, 1259.667]\n",
"9 1030 (775.0, 1259.667]\n",
"10 1187 (775.0, 1259.667]\n",
"11 1751 (1744.333, 2229.0]\n",
"12 1746 (1744.333, 2229.0]\n",
"13 1615 (1259.667, 1744.333]\n",
"14 1469 (1259.667, 1744.333]\n",
"15 1644 (1259.667, 1744.333]\n",
"16 1578 (1259.667, 1744.333]\n",
"17 1703 (1259.667, 1744.333]\n",
"18 1438 (1259.667, 1744.333]\n",
"19 1940 (1744.333, 2229.0]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
2024-11-14 11:47:25 +04:00
"source": [
2024-11-28 22:00:12 +04:00
"from sklearn.preprocessing import OneHotEncoder\n",
"import numpy as np\n",
2024-11-14 11:47:25 +04:00
"\n",
"\n",
2024-11-28 22:00:12 +04:00
"labels = [\"small\", \"medium\", \"large\"]\n",
"num_bins = 3\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"hist1, bins1 = np.histogram(\n",
" df[\"Store_Area\"].fillna(df[\"Store_Area\"].median()), bins=num_bins\n",
")\n",
"bins1, hist1\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"pd.concat([df[\"Store_Area\"], pd.cut(df[\"Store_Area\"], list(bins1))], axis=1).head(20)\n"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 67,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
2024-11-28 22:00:12 +04:00
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Store_Area</th>\n",
" <th>Store_Area</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1659</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1461</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1340</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1451</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1770</td>\n",
" <td>large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1442</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1542</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1261</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1090</td>\n",
" <td>small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1030</td>\n",
" <td>small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1187</td>\n",
" <td>small</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>1751</td>\n",
" <td>large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1746</td>\n",
" <td>large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>1615</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>1469</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1644</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1578</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>1703</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1438</td>\n",
" <td>medium</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1940</td>\n",
" <td>large</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Store_Area Store_Area\n",
"0 1659 medium\n",
"1 1461 medium\n",
"2 1340 medium\n",
"3 1451 medium\n",
"4 1770 large\n",
"5 1442 medium\n",
"6 1542 medium\n",
"7 1261 medium\n",
"8 1090 small\n",
"9 1030 small\n",
"10 1187 small\n",
"11 1751 large\n",
"12 1746 large\n",
"13 1615 medium\n",
"14 1469 medium\n",
"15 1644 medium\n",
"16 1578 medium\n",
"17 1703 medium\n",
"18 1438 medium\n",
"19 1940 large"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
2024-11-14 11:47:25 +04:00
}
],
"source": [
2024-11-28 22:00:12 +04:00
"pd.concat(\n",
" [df[\"Store_Area\"], pd.cut(df[\"Store_Area\"], list(bins1), labels=labels)], axis=1\n",
").head(20)"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"### Ручной синтез"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 68,
2024-11-14 11:47:25 +04:00
"metadata": {},
2024-11-28 22:00:12 +04:00
"outputs": [],
2024-11-14 11:47:25 +04:00
"source": [
2024-11-28 22:00:12 +04:00
"# Пример синтеза признака коэффициента отношения размера ассортимента к его распродажам\n",
"train_data_encoded[\"koeff\"] = (\n",
" train_data_encoded[\"Items_Available\"] / train_data_encoded[\"Store_Sales\"]\n",
")\n",
"val_data_encoded[\"koeff\"] = (\n",
" val_data_encoded[\"Items_Available\"] / val_data_encoded[\"Store_Sales\"]\n",
")\n",
"test_data_encoded[\"koeff\"] = (\n",
" test_data_encoded[\"Items_Available\"] / test_data_encoded[\"Store_Sales\"]\n",
")"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 69,
2024-11-14 11:47:25 +04:00
"metadata": {},
2024-11-28 22:00:12 +04:00
"outputs": [],
2024-11-14 11:47:25 +04:00
"source": [
2024-11-28 22:00:12 +04:00
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Пример масштабирования числовых признаков\n",
"numerical_features = [\"Daily_Customer_Count\", \"Items_Available\"]\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"scaler = StandardScaler()\n",
"train_data_encoded[numerical_features] = scaler.fit_transform(train_data_encoded[numerical_features])\n",
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"### Конструирование признаков с применением фреймворка Featuretools"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 70,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-11-28 22:00:12 +04:00
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
2024-11-14 11:47:25 +04:00
" warnings.warn(\n",
2024-11-28 22:00:12 +04:00
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
2024-11-14 11:47:25 +04:00
]
}
],
"source": [
2024-11-28 22:00:12 +04:00
"import featuretools as ft\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Определение сущностей\n",
"es = ft.EntitySet(id='shop_data')\n",
"es = es.add_dataframe(dataframe_name='shops', dataframe=train_data_encoded, index='id')\n",
2024-11-14 11:47:25 +04:00
"\n",
"\n",
2024-11-28 22:00:12 +04:00
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='shops', max_depth=2)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)\n"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"### Оценка качества каждого набора признаков\n",
"Предсказательная способность\n",
"Метрики: RMSE, MAE, R²\n",
"\n",
"Методы: Обучение модели на обучающей выборке и оценка на контрольной и тестовой выборках.\n",
"\n",
"Скорость вычисления\n",
"Методы: Измерение времени выполнения генерации признаков и обучения модели.\n",
"\n",
"Надежность\n",
"Методы: Кросс-валидация, анализ чувствительности модели к изменениям в данных.\n",
"\n",
"Корреляция\n",
"Методы: Анализ корреляционной матрицы признаков, удаление мультиколлинеарных признаков.\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"Цельность\n",
"Методы: Проверка логической связи между признаками и целевой переменной, интерпретация результатов модели."
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 71,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
2024-11-28 22:00:12 +04:00
"name": "stderr",
2024-11-14 11:47:25 +04:00
"output_type": "stream",
"text": [
2024-11-28 22:00:12 +04:00
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n",
" warnings.warn(\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
" df = pd.concat([df, default_df], sort=True)\n",
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
" series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n"
2024-11-14 11:47:25 +04:00
]
}
],
"source": [
2024-11-28 22:00:12 +04:00
"import featuretools as ft\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Определение сущностей\n",
"es = ft.EntitySet(id='shop_data')\n",
"es = es.add_dataframe(dataframe_name='shops', dataframe=train_data_encoded, index='id')\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Генерация признаков\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='shops', max_depth=2)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Преобразование признаков для контрольной и тестовой выборок\n",
"val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=val_data_encoded.index)\n",
"test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=es, instance_ids=test_data_encoded.index)\n"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "code",
2024-11-28 22:00:12 +04:00
"execution_count": 72,
2024-11-14 11:47:25 +04:00
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-11-28 22:00:12 +04:00
"RMSE: 935.869473619144\n",
"R²: 0.9976677314259463\n",
"MAE: 563.0765217391303\n",
"Cross-validated RMSE: 2423.8868120485813\n",
"Train RMSE: 871.8955293545159\n",
"Train R²: 0.9975555952641544\n",
"Train MAE: 514.1715034965034\n"
2024-11-14 11:47:25 +04:00
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\3_КУРС_ПИ\\МИИ\\aisenv\\Lib\\site-packages\\sklearn\\metrics\\_regression.py:492: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.\n",
" warnings.warn(\n"
]
},
{
"data": {
2024-11-28 22:00:12 +04:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2wAAAIjCAYAAAB/FZhcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAACu6klEQVR4nOzdeXhU1f3H8fedfTKTTBKyhwBhE0HEBUVlcQEF1x+VKigqohVbxX3XutWFFkqtuFHUulSsda/FBRW1uCCCAhoFIewmkARCtklmvff3B81IBDXBQAJ8Xs+T53HOPffecwEJn5xzv8ewLMtCRERERERE2h1bWw9AREREREREdkyBTUREREREpJ1SYBMREREREWmnFNhERERERETaKQU2ERERERGRdkqBTUREREREpJ1SYBMREREREWmnFNhERERERETaKQU2ERERERGRdkqBTURE2iXDMLjjjjvaehht7phjjuGYY45JfF6zZg2GYfDkk0+22Zh+6Idj3F0++OADDMPggw8+2O33FhHZXRTYRET2AQ8//DCGYTBgwICdvkZpaSl33HEHixcvbr2BtXONgaDxy+l00rVrV8477zxWrVrV1sNrkU8++YQ77riDqqqqNhtDJBLh/vvv5+CDDyYlJYXU1FT69OnDhAkTWLZsWZuNS0SkPXO09QBERGTXmzlzJl26dOGzzz6juLiY7t27t/gapaWl3HnnnXTp0oWDDjqo9QfZjl1++eUcdthhRKNRvvjiC2bMmMHrr7/OV199RV5e3m4dS+fOnWloaMDpdLbovE8++YQ777yT888/n9TU1F0zuJ8xatQo3nzzTc466ywuuugiotEoy5YtY9asWRx11FH06tWrTcYlItKeKbCJiOzlVq9ezSeffMLLL7/MxRdfzMyZM7n99tvbelh7lMGDB/PrX/8agPHjx9OzZ08uv/xynnrqKW666aYdnhMMBvH5fK0+FsMw8Hg8rX7dXW3BggXMmjWLe+65h5tvvrnJsQcffLBNZ/5ERNozLYkUEdnLzZw5k7S0NE4++WR+/etfM3PmzB32q6qq4qqrrqJLly643W46duzIeeedx6ZNm/jggw847LDDgK2BpXGJYON7VF26dOH888/f7po/fLcpEolw2223ceihhxIIBPD5fAwePJj333+/xc9VVlaGw+Hgzjvv3O7Yt99+i2EYPPjggwBEo1HuvPNOevTogcfjoUOHDgwaNIh33nmnxfcFOO6444CtYRjgjjvuwDAMvvnmG84++2zS0tIYNGhQov8zzzzDoYceitfrJT09nTFjxrB+/frtrjtjxgy6deuG1+vl8MMP58MPP9yuz4+9w7Zs2TLOPPNMMjMz8Xq97Lffftxyyy2J8V133XUAFBYWJn7/1qxZs0vGuCMrV64EYODAgdsds9vtdOjQIfF57dq1XHLJJey33354vV46dOjAGWec0WS8P2X+/PmMGDGCQCBAUlISRx99NB9//HGTPrW1tVx55ZWJP+9ZWVkcf/zxfPHFF826h4jI7qIZNhGRvdzMmTM5/fTTcblcnHXWWTzyyCMsWLAgEcAA6urqGDx4MEuXLuWCCy7gkEMOYdOmTbz22mt899137L///vzhD3/gtttuY8KECQwePBiAo446qkVjqamp4bHHHkssiautreXxxx9n+PDhfPbZZy1aapmdnc3RRx/N888/v92M4b/+9S/sdjtnnHEGsDWwTJo0id/85jccfvjh1NTUsHDhQr744guOP/74Fj0DfB8+tg0ZAGeccQY9evTg3nvvxbIsAO655x5uvfVWzjzzTH7zm99QUVHBAw88wJAhQ1i0aFFieeLjjz/OxRdfzFFHHcWVV17JqlWrOO2000hPT6egoOAnx/Pll18yePBgnE4nEyZMoEuXLqxcuZL//Oc/3HPPPZx++uksX76cf/7zn9x3331kZGQAkJmZudvG2LlzZ2Drn8eBAwficPz4P0EWLFjAJ598wpgxY+jYsSNr1qzhkUce4ZhjjuGbb74hKSnpR8997733OPHEEzn00EO5/fbbsdlsPPHEExx33HF8+OGHHH744QD89re/5cUXX2TixIn07t2bzZs389FHH7F06VIOOeSQn3wWEZHdyhIRkb3WwoULLcB65513LMuyLNM0rY4dO1pXXHFFk3633XabBVgvv/zydtcwTdOyLMtasGCBBVhPPPHEdn06d+5sjRs3brv2o48+2jr66KMTn2OxmBUOh5v02bJli5WdnW1dcMEFTdoB6/bbb//J5/vb3/5mAdZXX33VpL13797Wcccdl/jcr18/6+STT/7Ja+3I+++/bwHW3//+d6uiosIqLS21Xn/9datLly6WYRjWggULLMuyrNtvv90CrLPOOqvJ+WvWrLHsdrt1zz33NGn/6quvLIfDkWiPRCJWVlaWddBBBzX59ZkxY4YFNPk1XL169Xa/D0OGDLGSk5OttWvXNrlP4++dZVnWlClTLMBavXr1Lh/jjpimaR199NEWYGVnZ1tnnXWW9dBDD203ZsuyrPr6+u3a5s2bZwHW008/nWhr/P15//33E/fo0aOHNXz48CbPXl9fbxUWFlrHH398oi0QCFiXXnrpT45ZRKQ90JJIEZG92MyZM8nOzubYY48Ftr7/NHr0aJ577jni8Xii30svvUS/fv341a9+td01DMNotfHY7XZcLhcApmlSWVlJLBajf//+O7UU7fTTT8fhcPCvf/0r0VZUVMQ333zD6NGjE22pqal8/fXXrFixYqfGfcEFF5CZmUleXh4nn3wywWCQp556iv79+zfp99vf/rbJ55dffhnTNDnzzDPZtGlT4isnJ4cePXokloIuXLiQ8vJyfvvb3yZ+fQDOP/98AoHAT46toqKCuXPncsEFF9CpU6cmx5rze7c7xtg4ltmzZ3P33XeTlpbGP//5Ty699FI6d+7M6NGjm7zD5vV6E/8djUbZvHkz3bt3JzU19Sf/nCxevJgVK1Zw9tlns3nz5sSzBINBhg4dyty5czFNE9j6Z2L+/PmUlpb+7NhFRNqSlkSKiOyl4vE4zz33HMcee2ziXSuAAQMGMHXqVObMmcMJJ5wAbF3iN2rUqN0yrqeeeoqpU6eybNkyotFoor2wsLDF18rIyGDo0KE8//zz3HXXXcDW5ZAOh4PTTz890e8Pf/gD//d//0fPnj054IADGDFiBOeeey4HHnhgs+5z2223MXjwYOx2OxkZGey///47XNL3w2dYsWIFlmXRo0ePHV63sdLj2rVrAbbr17iNwE9p3F7ggAMOaNaz/NDuGGMjt9vNLbfcwi233MKGDRv473//y/3338/zzz+P0+nkmWeeAaChoYFJkybxxBNPUFJSklheClBdXf2TzwIwbty4H+1TXV1NWloakydPZty4cRQUFHDooYdy0kkncd555zX7WUREdhcFNhGRvdR7773Hhg0beO6553juuee2Oz5z5sxEYPulfmwmJx6PY7fbE5+feeYZzj//fEaOHMl1111HVlYWdrudSZMmJd4La6kxY8Ywfvx4Fi9ezEEHHcTzzz/P0KFDE+9pAQwZMoSVK1fy73//m7fffpvHHnuM++67j+nTp/Ob3/zmZ+/Rt29fhg0b9rP9tp0Zgq2ziIZh8Oabbzb5dWjk9/ub8YS7VluNMTc3lzFjxjBq1Cj69OnD888/z5NPPonD4eCyyy7jiSee4Morr+TII48kEAhgGAZjxoxJzJD92LMATJky5Uffh2x8njPPPJPBgwfzyiuv8PbbbzNlyhT+9Kc/8fLLL3PiiSe2+vOKiOwsBTYRkb3UzJkzycrK4qGHHtru2Msvv8wrr7zC9OnT8Xq9dOvWjaKiop+83k8tr0tLS9thWfa1a9c2mbF48cUX6dq1Ky+//HKT6/2SbQZGjhzJxRdfnFgWuXz58h2W2k9PT2f8+PGMHz+euro6hgwZwh133NGswLazunXrhmVZFBYW0rNnzx/t11iQY8WKFYkKlLB1OeDq1avp16/fj57b+Ou7s79/u2OMP8XpdHLggQeyYsWKxFLMF198kXHjxjF16tREv1Ao9LOl/7t16wZ
2024-11-14 11:47:25 +04:00
"text/plain": [
2024-11-28 22:00:12 +04:00
"<Figure size 1000x600 with 1 Axes>"
2024-11-14 11:47:25 +04:00
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
2024-11-28 22:00:12 +04:00
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"from sklearn.model_selection import cross_val_score\n",
2024-11-14 11:47:25 +04:00
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
2024-11-28 22:00:12 +04:00
"# Удаление строк с NaN\n",
"feature_matrix = feature_matrix.dropna()\n",
"val_feature_matrix = val_feature_matrix.dropna()\n",
"test_feature_matrix = test_feature_matrix.dropna()\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Разделение данных на обучающую и тестовую выборки\n",
"X_train = feature_matrix.drop(\"Store_Sales\", axis=1)\n",
"y_train = feature_matrix[\"Store_Sales\"]\n",
"X_val = val_feature_matrix.drop(\"Store_Sales\", axis=1)\n",
"y_val = val_feature_matrix[\"Store_Sales\"]\n",
"X_test = test_feature_matrix.drop(\"Store_Sales\", axis=1)\n",
"y_test = test_feature_matrix[\"Store_Sales\"]\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Выбор модели\n",
"model = RandomForestRegressor(random_state=42)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Обучение модели\n",
"model.fit(X_train, y_train)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Предсказание и оценка\n",
"y_pred = model.predict(X_test)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
"r2 = r2_score(y_test, y_pred)\n",
"mae = mean_absolute_error(y_test, y_pred)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"print(f\"RMSE: {rmse}\")\n",
"print(f\"R²: {r2}\")\n",
"print(f\"MAE: {mae}\")\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Кросс-валидация\n",
"scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')\n",
"rmse_cv = (-scores.mean())**0.5\n",
"print(f\"Cross-validated RMSE: {rmse_cv}\")\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Анализ важности признаков\n",
"feature_importances = model.feature_importances_\n",
"feature_names = X_train.columns\n",
2024-11-14 11:47:25 +04:00
"\n",
"\n",
2024-11-28 22:00:12 +04:00
"# Проверка на переобучение\n",
"y_train_pred = model.predict(X_train)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)\n",
"r2_train = r2_score(y_train, y_train_pred)\n",
"mae_train = mean_absolute_error(y_train, y_train_pred)\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"print(f\"Train RMSE: {rmse_train}\")\n",
"print(f\"Train R²: {r2_train}\")\n",
"print(f\"Train MAE: {mae_train}\")\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"# Визуализация результатов\n",
"plt.figure(figsize=(10, 6))\n",
"plt.scatter(y_test, y_pred, alpha=0.5)\n",
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)\n",
"plt.xlabel('Actual Sales')\n",
"plt.ylabel('Predicted Sales')\n",
"plt.title(\"Actual vs Predicted Sales\")\n",
"plt.show()"
2024-11-14 11:47:25 +04:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2024-11-28 22:00:12 +04:00
"Точность предсказаний: Модель показывает довольно высокий R² (0.9975), что указывает на хорошее объяснение вариации распродаж. Значения RMSE и MAE довольно низки, что говорит о том, что модель достаточно точно предсказывает цены.\n",
2024-11-14 11:47:25 +04:00
"\n",
2024-11-28 22:00:12 +04:00
"Переобучение: Разница между RMSE на обучающей и тестовой выборках не очень большая, что указывает на то, что переобучение не является критическим. Однако, стоит быть осторожным и продолжать мониторинг этого показателя.\n"
2024-11-14 11:47:25 +04:00
]
}
],
"metadata": {
"kernelspec": {
"display_name": "aisenv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}