2024-10-11 13:42:11 +04:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Вариант задания: Прогнозирование цен на автомобили\n",
"### Бизнес-цели:\n",
"Повышение эффективности ценообразования на вторичном рынке автомобилей:\n",
"Цель: Разработать модель машинного обучения, которая позволит точно прогнозировать рыночную стоимость автомобилей на вторичном рынке.\n",
"Ключевые показатели успеха (KPI):\n",
"Точность прогнозирования цены (например, RMSE, MAE).\n",
"Сокращение времени на оценку стоимости автомобиля.\n",
"Увеличение количества продаж за счет более конкурентоспособных цен.\n",
"Оптимизация рекламных бюджетов для онлайн-площадок по продаже автомобилей:\n",
"Цель: Использовать прогнозы цен на автомобили для оптимизации таргетинга рекламы и повышения конверсии на онлайн-площадках.\n",
"Ключевые показатели успеха (KPI):\n",
"Увеличение CTR (Click-Through Rate) рекламных объявлений.\n",
"Повышение конверсии (процент пользователей, совершивших покупку после клика на рекламу).\n",
"Снижение стоимости привлечения клиента (CPA).\n",
"### Цели технического проекта:\n",
"Для бизнес-цели 1:\n",
"Сбор и подготовка данных:\n",
"Очистка данных от пропусков, выбросов и дубликатов.\n",
"Преобразование категориальных переменных в числовые.\n",
"Разделение данных на обучающую и тестовую выборки.\n",
"Разработка и обучение модели:\n",
"Исследование различных алгоритмов машинного обучения (линейная регрессия, деревья решений, случайный лес и т.д.).\n",
"Обучение моделей на обучающей выборке.\n",
"Оценка качества моделей на тестовой выборке с помощью метрик RMSE, MAE и др.\n",
"Развертывание модели:\n",
"Интеграция модели в существующую систему или разработка нового API для доступа к прогнозам.\n",
"Создание веб-интерфейса или мобильного приложения для удобного использования модели.\n",
"Для бизнес-цели 2:\n",
"Анализ данных о пользователях и поведении:\n",
"Анализ данных о просмотрах, кликах и покупках на онлайн-площадке.\n",
"Определение сегментов пользователей с разным уровнем интереса к покупке автомобилей.\n",
"Разработка рекомендательной системы:\n",
"Создание модели, которая будет рекомендовать пользователям автомобили, соответствующие их предпочтениям и бюджету.\n",
"Интеграция рекомендательной системы в рекламные кампании.\n",
"Оптимизация таргетинга рекламы:\n",
"Использование прогнозов цен на автомобили для более точного таргетинга рекламы на пользователей, готовых к покупке.\n",
"Тестирование различных стратегий таргетинга и оценка их эффективности."
]
},
{
"cell_type": "code",
2024-10-11 23:17:25 +04:00
"execution_count": 3,
2024-10-11 13:42:11 +04:00
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['ID', 'Price', 'Levy', 'Manufacturer', 'Model', 'Prod. year',\n",
" 'Category', 'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',\n",
" 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',\n",
" 'Airbags'],\n",
" dtype='object')\n"
]
}
],
"source": [
"import pandas as pn\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import matplotlib.ticker as ticker\n",
2024-10-11 23:17:25 +04:00
"# Load the car listings CSV; the relative path is resolved against the current working directory.\n",
"df = pn.read_csv(filepath_or_buffer=\".//static//csv//car_price_prediction.csv\")\n",
2024-10-11 13:42:11 +04:00
"print(df.columns)"
]
2024-10-11 23:17:25 +04:00
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Разделим данные на три выборки: обучающую, контрольную и тестовую.\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размер обучающей выборки: 12311\n",
"Размер контрольной выборки: 3078\n",
"Размер тестовой выборки: 3848\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# First cut: hold out 20% of all rows as the final test set.\n",
"train_data, test_data = train_test_split(df, random_state=42, test_size=0.2)\n",
"\n",
"# Second cut: take 20% of the remaining rows as the validation set\n",
"# (0.8 * 0.2 = 16% of the full dataset), leaving 64% for training.\n",
"train_data, val_data = train_test_split(train_data, random_state=42, test_size=0.2)\n",
"\n",
"# DataFrame.shape[0] is the row count, i.e. the size of each split.\n",
"print(\"Размер обучающей выборки:\", train_data.shape[0])\n",
"print(\"Размер контрольной выборки:\", val_data.shape[0])\n",
"print(\"Размер тестовой выборки:\", test_data.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAHHCAYAAABeLEexAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABI9UlEQVR4nO3deVxVdf7H8fdlVxRckK1IERVzNysHHbdCFJdyssylxMbSccAWW4yZTLQF08bMdHSaUmu0sWyxpmks3LKS1FTcM3Uw0wR3rmKiwPf3hz/ueL2gQsIFz+v5eJzHg3PO95zzOd+7vTnLvTZjjBEAAICFebi7AAAAAHcjEAEAAMsjEAEAAMsjEAEAAMsjEAEAAMsjEAEAAMsjEAEAAMsjEAEAAMsjEAGAm5w4cUK7d+9Wfn6+u0vBVWSM0bFjx7Rr1y53l4JSIBABQAU5d+6cJk+erNatW8vX11e1a9dW48aNtWzZMneXViVs3bpVixcvdoxnZGTo3//+t/sKusDJkyf1zDPPKDo6Wj4+Pqpbt66aNGminTt3urs0XCEvdxeAymHevHl64IEHHOO+vr664YYbFBcXp3HjxikkJMSN1QFVX15enuLi4vTtt9/qD3/4g5577jlVr15dnp6eateunbvLqxJOnjypkSNHKjQ0VHXr1tUjjzyi+Ph49e7d2611HT16VF26dNG+ffs0evRodezYUT4+PvL29laDBg3cWhuuHIEITiZOnKjIyEidOXNGX3/9tWbNmqXPPvtMW7duVfXq1d1dHlBlvfTSS1qzZo0+//xzde3a1d3lVEkxMTGOQZKaNGmihx56yM1VSU8++aQOHjyo9PR0NW/e3N3loIwIRHASHx+vm2++WZL04IMPqm7dupo6dao+/vhjDRo0yM3VAVVTfn6+pk2bpscff5ww9CstXrxY27dv1y+//KKWLVvKx8fHrfUcOnRIb731lmbPnk0YquK4hgiXdNttt0mSMjMzJUnHjh3TE088oZYtW6pGjRoKCAhQfHy8Nm3a5LLsmTNnlJKSoiZNmsjPz09hYWG66667tGfPHknS3r17ZbPZShwu/OBYuXKlbDab3n33Xf3pT39SaGio/P39dccdd+inn35y2faaNWvUs2dPBQYGqnr16urSpYu++eabYvexa9euxW4/JSXFpe38+fPVrl07VatWTXXq1NHAgQOL3f6l9u1ChYWFmjZtmpo3by4/Pz+FhIRo5MiROn78uFO7Bg0aqE+fPi7bSUpKcllncbVPmTLFpU+l86dxxo8fr0aNGsnX11cRERF66qmnlJeXV2xfXahr165q0aKFy/SXX35ZNptNe/fudZp+4sQJPfroo4qIiJCvr68aNWqkl156SYWFhY42Rf328ssvu6y3RYsWxT4n3n///RJrHDZs2BWdsmjQoIHj8fHw8FBoaKjuvfde7du377LLStJf//pXNW/eXL6+vgoPD1diYqJOnDjhmL9z504dP35cNWvWVJcuXVS9enUFBgaqT58+2rp1q6PdihUrZLPZ9NFHH7ls45133pHNZlN6erqj5mHDhjm1KeqTlStXOqZ99dVXuueee3TDDTc4HuPHHntMv/zyi9OyKSkpLs+lBQsWqE2bNvLz81PdunU1aNAglz4ZNmyYatSo4TTt/fffd6lDkmrUqOFSs3Rlr6uuXbs6Hv9mzZqpXbt22rRpU7Gvq+Jc/DoPCgpS7969nfpfOv/6SUpKKnE98+bNc3p+r1u3ToWFhTp79qxuvvnmS/aVJC1fvlydOnWSv7+/atWqpTvvvFM7duxwalP0WHz//fcaMGCAAgICHKcIz5w541Lvha/3/Px89erVS3Xq1NH27dud2l7p+5dVcYQIl1QUXurWrStJ+u9//6vFixfrnnvuUWRkpLKzs/W3v/1NXbp00fbt2xUeHi5JKigoUJ8+fbRs2TINHDhQjzzyiE6ePKm0tDRt3bpVUVFRjm0MGjRIvXr1ct
pucnJysfW88MILstlsGjt2rA4dOqRp06YpNjZWGRkZqlatmqTzbzjx8fFq166dxo8fLw8PD82dO1e33XabvvrqK916660u673++uuVmpoqSTp16pRGjRpV7LbHjRunAQMG6MEHH9Thw4f12muvqXPnztq4caNq1arlssyIESPUqVMnSdKHH37o8kE3cuRIx/VbDz/8sDIzMzVjxgxt3LhR33zzjby9vYvth9I4ceKEY98uVFhYqDvuuENff/21RowYoRtvvFFbtmzRK6+8oh9++MHp4tVf6/Tp0+rSpYsOHDigkSNH6oYbbtDq1auVnJysgwcPatq0aVdtW2XVqVMnjRgxQoWFhdq6daumTZumn3/+WV999dUll0tJSdGECRMUGxurUaNGaefOnZo1a5bWrVvneAyPHj0q6fzzunHjxpowYYLOnDmjmTNnqmPHjlq3bp2aNGmirl27KiIiQgsWLNDvfvc7p+0sWLBAUVFRjtNFV2rRokU6ffq0Ro0apbp162rt2rV67bXXtH//fi1atKjE5d555x3dd999at26tVJTU3X06FFNnz5dX3/9tTZu3KigoKBS1VGSsryuiowdO7ZU22ratKn+/Oc/yxijPXv2aOrUqerVq9cVB9/iFD22SUlJateunSZNmqTDhw8X21dLly5VfHy8GjZsqJSUFP3yyy967bXX1LFjR23YsMElvA8YMEANGjRQamqqvv32W02fPl3Hjx/X22+/XWI9Dz74oFauXKm0tDQ1a9bMMf3X9LNlGMAYM3fuXCPJLF261Bw+fNj89NNPZuHChaZu3bqmWrVqZv/+/cYYY86cOWMKCgqcls3MzDS+vr5m4sSJjmlz5swxkszUqVNdtlVYWOhYTpKZMmWKS5vmzZubLl26OMZXrFhhJJnrrrvO2O12x/T33nvPSDKvvvqqY92NGzc2PXr0cGzHGGNOnz5tIiMjTffu3V221aFDB9OiRQvH+OHDh40kM378eMe0vXv3Gk9PT/PCCy84Lbtlyxbj5eXlMn3Xrl1Gknnrrbcc08aPH28ufMl99dVXRpJZsGCB07JLlixxmV6/fn3Tu3dvl9oTExPNxS/ji2t/6qmnTHBwsGnXrp1Tn/7jH/8wHh4e5quvvnJafvbs2UaS+eabb1y2d6EuXbqY5s2bu0yfMmWKkWQyMzMd05577jnj7+9vfvjhB6e2Tz/9tPH09DT79u0zxpTtObFo0aISa0xISDD169e/5H4Yc75/ExISnKYNHjzYVK9e/ZLLHTp0yPj4+Ji4uDin18WMGTOMJDNnzhynWoOCgsyRI0cc7X744Qfj7e1t+vfv75iWnJxsfH19zYkTJ5y24+Xl5fS4RkZGmqFDhzrVU7SdFStWOKadPn3ape7U1FRjs9nMjz/+6Jh24fMzPz/fhISEmKioKHPq1ClHm5UrVxpJ5vHHH3dMS0hIMP7+/k7rX7RokUsdxhjj7+/v1M+leV116dLF6fH/7LPPjCTTs2dPl9dAcS5e3hhj/vSnPxlJ5tChQ45pkkxiYmKJ6yl6ryx6fheNN2vWzKmvix6LC/uqTZs2Jjg42Bw9etQxbdOmTcbDw8PpsSx6LO644w6nbf/xj380ksymTZuc6i16XiQnJxtPT0+zePFip+VK+/5lVZwyg5PY2FjVq1dPERERGjhwoGrUqKGPPvpI1113naTzd595eJx/2hQUFOjo0aOqUaOGoqOjtWHDBsd6PvjgAwUFBWn06NEu27iSw9slGTp0qGrWrOkYv/vuuxUWFqbPPvtM0vnbcHft2qXBgwfr6NGjOnLkiI4cOaLc3FzdfvvtWrVqldMpGun8qT0/P79LbvfDDz9UYWGhBgwY4FjnkSNHFBoaqsaNG2vFihVO7c+ePSvpfH+VZNGiRQoMDFT37t2d1tmuXTvVqFHDZZ3nzp1zanfkyBGXw+cXO3DggF577TWNGzfO5bTGokWLdOONN6pp06ZO6yw6TXrx9n+NRYsWqVOnTq
pdu7bTtmJjY1VQUKBVq1Y5tT99+rTLvhYUFBS77pMnT+rIkSNOp6jKIi8vT0eOHNGhQ4eUlpam5cuX6/bbb7/kMku
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAj4AAAHHCAYAAAC/R1LgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABWY0lEQVR4nO3deVxU5f4H8M/sbA6rMKAIKCIuoIUbpVlJImHL1W6bt7Rral600rKiW7l0u1b2M8u05VZqt8WkzfKapriUiqYkIqKIhqIou6wCAzPP7w+aEyOggMAA5/N+veal55znnPM9z2wfzjYKIYQAERERkQwobV0AERERUXth8CEiIiLZYPAhIiIi2WDwISIiItlg8CEiIiLZYPAhIiIi2WDwISIiItlg8CEiIiLZUNu6ACIioq7AaDSisLAQZrMZPj4+ti6HGsE9PkRE1KF9+umnOH36tDS8Zs0aZGVl2a6gOg4ePIgHH3wQHh4e0Ol08Pb2xqRJk2xdFl0Bg08XsmbNGigUCulhZ2eHoKAgzJ49Gzk5ObYuj4ioRX755Rc888wzOH36NLZs2YKYmBgolbb/+tqwYQNGjRqF1NRUvPLKK9i6dSu2bt2K999/39al0RXwUFcXtHjxYgQEBKCyshK7d+/Gu+++i02bNiElJQUODg62Lo+IqFnmzp2Lm2++GQEBAQCAefPmwdvb26Y1FRYW4tFHH0VkZCTi4uKg1WptWg81HYNPFxQVFYWhQ4cCAB599FG4u7tj2bJl2LBhAx544AEbV0dE1DzBwcE4deoUUlJS4OHhgT59+ti6JKxevRqVlZVYs2YNQ08nY/t9hdTmbr31VgBARkYGgNq/VJ5++mmEhITAyckJer0eUVFROHz4cL15KysrsXDhQgQFBcHOzg7e3t6YOHEiTp06BQA4ffq01eG1yx8333yztKydO3dCoVDgyy+/xPPPPw+DwQBHR0fceeedOHv2bL1179+/H+PHj4ezszMcHBwwZswY7Nmzp8FtvPnmmxtc/8KFC+u1/fTTTxEWFgZ7e3u4ubnh/vvvb3D9V9q2usxmM5YvX46BAwfCzs4OXl5emDlzJi5evGjVzt/fHxMmTKi3ntmzZ9dbZkO1L126tF6fAkBVVRUWLFiAwMBA6HQ6+Pr64plnnkFVVVWDfVXXzTffjEGDBtUb/8Ybb0ChUFidVwEARUVFePLJJ+Hr6wudTofAwEC89tprMJvNUhtLv73xxhv1ljto0KAGXxNfffVVozVOnToV/v7+V90Wf39/6flRKpUwGAy47777kJmZ2aR5p06dajVuxowZsLOzw86dO63Gr1q1CgMHDoROp4OPjw9iYmJQVFRk1aap/Vq35oYelu2u26dvvvkm/Pz8YG9vjzFjxiAlJaXeerZv347Ro0fD0dERLi4uuOuuu3Ds2LGr9lvdR93tbuy1W1dznncAyM3NxbRp0+Dl5QU7OzsMHjwYa9eubXCZa9asgaOjI0aMGIE+ffogJiYGCoWi3nPWWE2Wh0ajgb+/P+bPnw+j0Si1s5wmcPDgwUaXdfPNN1ttw759+zBkyBD8+9//lt4Pffv2xauvvmr1fgCAmpoavPzyy+jTpw90Oh38/f3x/PPP13uPWvr5p59+wpAhQ2BnZ4cBAwbgm2++sWpnqbfu+/Po0aNwdXXFhAkTUFNTI41vyntWbrjHRwYsIcXd3R0A8Pvvv+O7777DX//6VwQEBCAnJwfvv/8+xowZg9TUVOlqBJPJhAkTJiA+Ph73338/nnjiCZSWlmLr1q1ISUmx+qvrgQcewO2332613tjY2AbreeWVV6BQKPDss88iNzcXy5cvR0REBJKSkmBvbw+g9oM7KioKYWFhWLBgAZRKJVavXo1bb70Vv/zyC4YPH15vuT179sSSJUsAAGVlZZg1a1aD637xxRdx77334tFHH0VeXh5WrFiBm266CYcOHYKLi0u9eWbMmIHRo0cDAL755ht8++
23VtNnzpyJNWvW4JFHHsHjjz+OjIwMvPPOOzh06BD27NkDjUbTYD80R1FRkbRtdZnNZtx5553YvXs3ZsyYgf79++PIkSN48803ceLECXz33XfXvG6LS5cuYcyYMcjKysLMmTPRq1cv7N27F7Gxsbhw4QKWL1/eautqqdGjR2PGjBkwm81ISUnB8uXLcf78efzyyy/NWs6CBQvw0Ucf4csvv7T6slu4cCEWLVqEiIgIzJo1C2lpaXj33Xdx4MCBFj3Xy5cvR1lZGQDg2LFj+Pe//43nn38e/fv3BwA4OTlZtf/kk09QWlqKmJgYVFZW4q233sKtt96KI0eOwMvLCwCwbds2REVFoXfv3li4cCEqKiqwYsUK3Hjjjfjtt98aDJGWfqtbR1uqqKjAzTffjJMnT2L27NkICAhAXFwcpk6diqKiIjzxxBONznvy5En85z//adb6LO/hqqoqbNmyBW+88Qbs7Ozw8ssvt3gbCgoKsHv3buzevRt///vfERYWhvj4eMTGxuL06dN47733pLaPPvoo1q5di3vuuQdPPfUU9u/fjyVLluDYsWP1Pk/S09Nx33334bHHHsOUKVOwevVq/PWvf8XmzZtx2223NVjL2bNnMX78eAQHB2P9+vVQq2u/2jvDe9YmBHUZq1evFgDEtm3bRF5enjh79qxYt26dcHd3F/b29uLcuXNCCCEqKyuFyWSymjcjI0PodDqxePFiadzHH38sAIhly5bVW5fZbJbmAyCWLl1ar83AgQPFmDFjpOEdO3YIAKJHjx6ipKREGr9+/XoBQLz11lvSsvv27SsiIyOl9QghxKVLl0RAQIC47bbb6q3rhhtuEIMGDZKG8/LyBACxYMECadzp06eFSqUSr7zyitW8R44cEWq1ut749PR0AUCsXbtWGrdgwQJR923zyy+/CADis88+s5p38+bN9cb7+fmJ6OjoerXHxMSIy9+Kl9f+zDPPCE9PTxEWFmbVp//973+FUqkUv/zyi9X87733ngAg9uzZU299dY0ZM0YMHDiw3vilS5cKACIjI0Ma9/LLLwtHR0dx4sQJq7bPPfecUKlUIjMzUwjRstdEXFxcozVOmTJF+Pn5XXE7hKjt3ylTpliNe/DBB4WDg0Oz5n3//fcFALFixQqrNrm5uUKr1Ypx48ZZvX/eeecdAUB8/PHH0rjm9KuFpS927NhRb5qlT+u+j4UQYv/+/QKAmDt3rjRuyJAhwtPTUxQUFEjjDh8+LJRKpXj44YfrLbtHjx7ikUceuWIdjb12G6qxKc/78uXLBQDx6aefSuOMRqMIDw8XTk5O0ueDZZmrV6+W2t17771i0KBBwtfXt97z3VhNdecXQggfHx9x++23S8OWz84DBw40uqwxY8ZYbcOYMWMEALFw4UKrdlOnThUAxJEjR4QQQiQlJQkA4tFHH7Vq9/TTTwsAYvv27dI4Pz8/AUB8/fXX0rji4mLh7e0trrvuunr1ZmRkiMLCQjFgwADRr18/kZ+fb7WOpr5n5YaHurqgiIgIdO/eHb6+vrj//vvh5OSEb7/9Fj169AAA6HQ66YoIk8mEgoICODk5oV+/fvjtt9+k5Xz99dfw8PDAnDlz6q3j8kMzzfHwww+jW7du0vA999wDb29vbNq0CQCQlJSE9PR0PPjggygoKEB+fj7y8/NRXl6OsWPH4ueff663m7ayshJ2dnZXXO8333wDs9mMe++9V1pmfn4+DAYD+vbtix07dli1t+wK1+l0jS4zLi4Ozs7OuO2226yWGRYWBicnp3rLrK6utmqXn5+PysrKK9adlZWFFStW4MUXX6y3ByAuLg79+/dHcHCw1TIthzcvX/+1iIuLw+jRo+Hq6mq1roiICJhMJvz8889W7S9dulRvW00mU4PLLi0tRX5+fr1DRs1VVVWF/Px85ObmYuvWrdi+fTvGjh3b5Pk3bNiAf/zjH5g/fz5mz55tNW3btm0wGo148sknra4omj59OvR6Pf73v/9ZtT
eZTPW2/9KlS9e0fXfffbf0PgaA4cOHY8SIEdJ758KFC0hKSsLUqVPh5uYmtQsNDcVtt90mtavLaDRe8TVuYXntFhQ
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlEAAAHHCAYAAACfqw0dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABWe0lEQVR4nO3deVxU5f4H8M/srMMiMAMKiAsqroUblqlJLtl21dtmZl1T84eWWlq0uNXN0kozTetWaqs3K7XMJcWtEk1JVMRYFEVRNhEGUJaZeX5/IOc6AgpHYFg+79dr7mXOec453/PMjPPpnOecUQghBIiIiIioRpT2LoCIiIioMWKIIiIiIpKBIYqIiIhIBoYoIiIiIhkYooiIiIhkYIgiIiIikoEhioiIiEgGhigiIiIiGRiiiIio2TObzcjMzERqaqq9S6FGhCGKiIjqzKZNmxAbGys937BhA44fP26/gq6RlJSECRMmwNfXF1qtFgaDAWFhYeAPeVB1MUSRjdWrV0OhUEgPBwcHBAcHY8qUKcjIyLB3eUTUyBw7dgzPP/88kpKSsH//fjz77LPIz8+3d1nYv38/evfujZ07d+Lll1/Gtm3bsH37dmzYsAEKhcLe5VEjoeBv59G1Vq9ejaeffhrz589HUFAQioqK8Pvvv+PLL79EYGAg4uLi4OTkZO8yiaiRyMrKQr9+/ZCcnAwAGDlyJH744Qe71lRSUoLu3btDr9fj119/hZubm13rocZLbe8CqGEaPnw4evbsCQB45pln0KJFC7z//vvYuHEjHnvsMTtXR0SNhbe3N+Li4qT/AOvUqZO9S8LPP/+MhIQE/P333wxQdEt4Oo+q5e677wYApKSkAABycnLw4osvomvXrnBxcYFer8fw4cNx5MiRCssWFRVh7ty5CA4OhoODA3x9fTFy5EicPHkSAHD69GmbU4jXPwYOHCita/fu3VAoFPjvf/+LV155BUajEc7OznjggQdw9uzZCts+cOAAhg0bBjc3Nzg5OWHAgAH4448/Kt3HgQMHVrr9uXPnVmj71VdfITQ0FI6OjvD09MSjjz5a6fZvtG/XslqtWLJkCTp37gwHBwcYDAZMmjQJly5dsmnXunVr3HfffRW2M2XKlArrrKz2RYsWVehTACguLsacOXPQrl076HQ6+Pv7Y9asWSguLq60r641cOBAdOnSpcL0d999FwqFAqdPn7aZnpubi2nTpsHf3x86nQ7t2rXDO++8A6vVKrUp77d33323wnq7dOlS6Xvi+++/r7LGp556Cq1bt77pvrRu3Vp6fZRKJYxGIx555JGbDja+drnKHtduu7qvNQBs2bIFAwYMgKurK/R6PXr16oVvvvkGQNXv18reY2azGW+88Qbatm0LnU6H1q1b45VXXqnw+lZ3/wsLC/HCCy9Ir2GHDh3w7rvvVhhLVP4e1Ol0CA0NRadOnap8D1bm2n1RqVRo2bIlJk6ciNzcXKmNnNd///79CAoKwg8//IC2bdtCq9UiICAAs2bNwpUrVyos/9FHH6Fz587Q6XTw8/NDRESETQ3A/z4HMTEx6NevHxwdHREUFISVK1fatCuvd/fu3dK08+fPo3Xr1ujZsycKCgqk6bfyuaT6wSNRVC3lgadFixYAgFOnTmHDhg345z//iaCgIGRkZODjjz/GgAEDEB8fDz8/PwCAxWLBfffdh6ioKDz66KN4/vnnkZ+fj+3btyMuLg5t27aVtvHYY4/h3nvvtdluZGRkpfX8+9//hkKhwEsvvYTMzEwsWbIE4eHhiI2NhaOjIwBg586dGD58OEJDQzFnzhwolUqsWrUKd999N3777Tf07t27wnpbtWqFBQsWAAAKCgowefLkSrf9+uuv4+GHH8YzzzyDrKwsfPjhh7jrrrtw+PBhuLu7V1hm4sSJ6N+/PwDgxx9/xPr1623mT5o0STqV+txzzyElJQXLli3D4cOH8ccff0Cj0VTaDzWRm5sr7du1rF
YrHnjgAfz++++YOHEiOnXqhGPHjmHx4sVITEzEhg0bbnnb5S5fvowBAwYgLS0NkyZNQkBAAPbt24fIyEhcuHABS5YsqbVtydW/f39MnDgRVqsVcXFxWLJkCc6fP4/ffvutymWWLFkiffmdOHECb731Fl555RXpqIuLi4vUtrqv9erVq/Gvf/0LnTt3RmRkJNzd3XH48GFs3boVjz/+OF599VU888wzAIDs7GxMnz7d5n12rWeeeQZr1qzB6NGj8cILL+DAgQNYsGABTpw4UeG9eLP9F0LggQcewK5duzB+/Hj06NED27Ztw8yZM5GWlobFixdX2U9VvQdv5B//+AdGjhwJs9mM6OhofPLJJ7hy5Qq+/PLLGq3nWhcvXsSpU6fwyiuvYOTIkXjhhRdw6NAhLFq0CHFxcfjll1+kEDp37lzMmzcP4eHhmDx5MhISErBixQocPHiwwmfz0qVLuPfee/Hwww/jsccew3fffYfJkydDq9XiX//6V6W15OXlYfjw4dBoNNi8ebP0XqnPzyXdAkF0jVWrVgkAYseOHSIrK0ucPXtWrF27VrRo0UI4OjqKc+fOCSGEKCoqEhaLxWbZlJQUodPpxPz586Vpn3/+uQAg3n///Qrbslqt0nIAxKJFiyq06dy5sxgwYID0fNeuXQKAaNmypTCZTNL07777TgAQH3zwgbTu9u3bi6FDh0rbEUKIy5cvi6CgIHHPPfdU2Fa/fv1Ely5dpOdZWVkCgJgzZ4407fTp00KlUol///vfNsseO3ZMqNXqCtOTkpIEALFmzRpp2pw5c8S1H73ffvtNABBff/21zbJbt26tMD0wMFCMGDGiQu0RERHi+o/z9bXPmjVL+Pj4iNDQUJs+/fLLL4VSqRS//fabzfIrV64UAMQff/xRYXvXGjBggOjcuXOF6YsWLRIAREpKijTtjTfeEM7OziIxMdGm7csvvyxUKpVITU0VQsh7T6xbt67KGseNGycCAwNvuB9ClPXvuHHjbKY9/vjjwsnJ6abLXl/Prl27Ksyr7mudm5srXF1dRZ8+fcSVK1ds2l77fi5X3l+rVq2qMC82NlYAEM8884zN9BdffFEAEDt37pSmVWf/N2zYIACIN99806bd6NGjhUKhEMnJydK06r4Hq3L98kKUfU5DQkKk53Je/3HjxgkA4qmnnrJpV/7Z/Pnnn4UQQmRmZgqtViuGDBli8+/dsmXLBADx+eefS9MGDBggAIj33ntPmlZcXCx69OghfHx8RElJiU29u3btEkVFRWLgwIHCx8fHpt+EuPXPJdUPns6jSoWHh8Pb2xv+/v549NFH4eLigvXr16Nly5YAAJ1OB6Wy7O1jsVhw8eJFuLi4oEOHDvjrr7+k9fzwww/w8vLC1KlTK2zjVq6AefLJJ+Hq6io9Hz16NHx9fbF582YAQGxsLJKSkvD444/j4sWLyM7ORnZ2NgoLCzF48GDs3bvX5vQRUHba0cHB4Ybb/fHHH2G1WvHwww9L68zOzobRaET79u2xa9cum/YlJSUAyvqrKuvWrYObmxvuuecem3WGhobCxcWlwjpLS0tt2mVnZ6OoqOiGdaelpeHDDz/E66+/bnNUpHz7nTp1QseOHW3WWX4K9/rt34p169ahf//+8PDwsNlWeHg4LBYL9u7da9P+8uXLFfbVYrFUuu78/HxkZ2dXOM1SU8XFxcjOzkZmZia2b9+OnTt3YvDgwbe0znLVfa23b9+O/Px8vPzyyxXekzX93JR/JmbMmGEz/YUXXgAA/PLLLzbTb7b/mzdvhkqlwnPPPVdhfUIIbNmypdI6bvQevJHy90B6ejp++OEHHDlypNLXQ87rP3PmTJvn06dPh0qlkvpkx44dKCkpwbRp06R/7wBgwoQJ0Ov1FfpOrVZj0qRJ0nOtVotJkyYhMzMTMTExNm2tViuefPJJ7N+/H5s3b7Y5Kg/U7+eS5OPpPKrU8uXLERwcDLVaDYPBgA4dOtj8I2K1WvHBBx/go4
8+QkpKis0XW/kpP6DsNGCHDh2gVtfuW619+/Y2zxUKBdq1ayeNv0lKSgIAjBs3rsp15OXlwcPDQ3qenZ1dYb3XS0p
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Compare the distribution of the target variable (car price) across the three splits.\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"def plot_price_distribution(data, title):\n",
"    \"\"\"Draw a histogram with a KDE overlay of the 'Price' column of ``data``.\n",
"\n",
"    Args:\n",
"        data: DataFrame holding one split; must contain a 'Price' column.\n",
"        title: Title shown above the figure.\n",
"    \"\"\"\n",
"    sns.histplot(data['Price'], kde=True)\n",
"    plt.title(title)\n",
"    plt.show()\n",
"\n",
"\n",
"# One figure per split, in the same order as before: train, validation, test.\n",
"plot_price_distribution(train_data, 'Распределение цены в обучающей выборке')\n",
"plot_price_distribution(val_data, 'Распределение цены в контрольной выборке')\n",
"plot_price_distribution(test_data, 'Распределение цены в тестовой выборке')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Процесс конструирования признаков\n",
"Задача 1: Прогнозирование цен на автомобили\n",
"Цель технического проекта: Разработка модели машинного обучения для точного прогнозирования рыночной стоимости автомобилей.\n",
"\n",
"Задача 2: Оптимизация рекламных бюджетов\n",
"Цель технического проекта: Использование прогнозов цен на автомобили для оптимизации таргетинга рекламы и повышения конверсии на онлайн-площадках.\n",
"\n",
"\n",
"### Унитарное кодирование категориальных признаков (one-hot encoding)\n",
"\n",
"One-hot encoding: Преобразование категориальных признаков в бинарные векторы."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ID Price Levy Manufacturer Prod. year Engine volume \\\n",
"3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n",
"3185 45760664 314 781 SUBARU 2012 2.5 \n",
"5529 45777845 5645 5908 BMW 1999 2.5 \n",
"7891 45651201 7997 1850 LEXUS 2008 3.5 \n",
"12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n",
"... ... ... ... ... ... ... \n",
"2750 45656065 941 1055 LEXUS 2013 3.5 \n",
"17390 45785069 12000 - FORD 1998 2.5 \n",
"5563 45815001 941 777 TOYOTA 2014 2.5 \n",
"3813 45809829 54850 831 HONDA 2018 1.5 \n",
"6041 45397141 9095 - FORD 2003 1.7 \n",
"\n",
" Mileage Cylinders Drive wheels Doors ... Fuel type_Hybrid \\\n",
"3438 160000 km 4.0 Rear 04-May ... False \n",
"3185 204579 km 4.0 4x4 04-May ... False \n",
"5529 0 km 6.0 Rear 04-May ... False \n",
"7891 244731 km 6.0 Front 04-May ... True \n",
"12167 103000 km 4.0 Front 04-May ... False \n",
"... ... ... ... ... ... ... \n",
"2750 361603 km 6.0 Front 04-May ... True \n",
"17390 220000 km 4.0 Rear 04-May ... False \n",
"5563 202355 km 4.0 Front 04-May ... False \n",
"3813 13048 km 4.0 Front 04-May ... False \n",
"6041 159000 km 4.0 Front 04-May ... False \n",
"\n",
" Fuel type_LPG Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n",
"3438 False True False \n",
"3185 False True False \n",
"5529 False True False \n",
"7891 False False False \n",
"12167 False True False \n",
"... ... ... ... \n",
"2750 False False False \n",
"17390 False False False \n",
"5563 False True False \n",
"3813 False True False \n",
"6041 False False False \n",
"\n",
" Gear box type_Automatic Gear box type_Manual Gear box type_Tiptronic \\\n",
"3438 True False False \n",
"3185 True False False \n",
"5529 False False True \n",
"7891 True False False \n",
"12167 False False True \n",
"... ... ... ... \n",
"2750 True False False \n",
"17390 False True False \n",
"5563 True False False \n",
"3813 True False False \n",
"6041 False True False \n",
"\n",
" Gear box type_Variator Leather interior_No Leather interior_Yes \n",
"3438 False False True \n",
"3185 False False True \n",
"5529 False True False \n",
"7891 False False True \n",
"12167 False True False \n",
"... ... ... ... \n",
"2750 False False True \n",
"17390 False True False \n",
"5563 False False True \n",
"3813 False False True \n",
"6041 False True False \n",
"\n",
"[12311 rows x 1247 columns]\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Categorical columns that are expanded into one indicator column per category.\n",
"categorical_features = ['Model', 'Category', 'Fuel type', 'Gear box type', 'Leather interior']\n",
"\n",
"# One-hot encode every split.\n",
"train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n",
"val_data_encoded = pd.get_dummies(val_data, columns=categorical_features)\n",
"test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)\n",
"\n",
"# get_dummies only creates columns for the categories present in the frame it receives,\n",
"# so the three splits would otherwise end up with different column sets.\n",
"# Force validation and test into the training layout: indicator columns they lack are\n",
"# added and filled with False, and categories never seen in training are dropped.\n",
"val_data_encoded = val_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=False)\n",
"test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Дискретизация числовых признаков \n",
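"Дискретизация — это процесс преобразования непрерывных числовых значений в дискретные категории или интервалы (бины). Она может быть полезна по нескольким причинам: снижает влияние выбросов и шума, а также позволяет модели учитывать нелинейную зависимость целевой переменной от признака."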
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ID Price Levy Manufacturer Prod. year Engine volume \\\n",
"736 45753963 27284 259 CHEVROLET 2014 1.4 \n",
"8674 45786053 10349 - MERCEDES-BENZ 1997 2.9 Turbo \n",
"5971 45757478 40769 - MERCEDES-BENZ 1996 1.8 \n",
"1957 45732345 38737 639 HYUNDAI 2014 2 \n",
"11075 45729790 42102 831 SSANGYONG 2017 1.6 \n",
"... ... ... ... ... ... ... \n",
"12026 45786994 12231 650 CHEVROLET 2016 1.4 Turbo \n",
"17893 45756187 15681 - FORD 2003 2.4 Turbo \n",
"5339 45769967 314 2410 MERCEDES-BENZ 2010 6.2 \n",
"11859 45801865 14069 687 HYUNDAI 2010 1.6 \n",
"9276 45803366 15681 891 HYUNDAI 2016 2 \n",
"\n",
" Mileage Cylinders Drive wheels Doors ... Fuel type_LPG \\\n",
"736 65000 km 4.0 Front 04-May ... False \n",
"8674 3333 km 6.0 Rear 02-Mar ... False \n",
"5971 212485 km 8.0 Rear 04-May ... False \n",
"1957 132756 km 4.0 Front 04-May ... False \n",
"11075 50750 km 4.0 Front 04-May ... False \n",
"... ... ... ... ... ... ... \n",
"12026 9000 km 4.0 Front 04-May ... False \n",
"17893 250000 km 4.0 Rear 04-May ... False \n",
"5339 274771 km 8.0 Rear 04-May ... False \n",
"11859 100403 km 4.0 Front 04-May ... False \n",
"9276 322292 km 4.0 Front 04-May ... True \n",
"\n",
" Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Automatic \\\n",
"736 False True True \n",
"8674 False False False \n",
"5971 True False False \n",
"1957 False False True \n",
"11075 True False True \n",
"... ... ... ... \n",
"12026 True False False \n",
"17893 False False False \n",
"5339 True False True \n",
"11859 True False True \n",
"9276 False False True \n",
"\n",
" Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n",
"736 False False False \n",
"8674 True False False \n",
"5971 True False False \n",
"1957 False False False \n",
"11075 False False False \n",
"... ... ... ... \n",
"12026 False True False \n",
"17893 True False False \n",
"5339 False False False \n",
"11859 False False False \n",
"9276 False False False \n",
"\n",
" Leather interior_No Leather interior_Yes Year bin \n",
"736 True False 4 \n",
"8674 False True 3 \n",
"5971 True False 3 \n",
"1957 False True 4 \n",
"11075 False True 4 \n",
"... ... ... ... \n",
"12026 True False 4 \n",
"17893 True False 3 \n",
"5339 False True 4 \n",
"11859 False True 4 \n",
"9276 False True 4 \n",
"\n",
"[3848 rows x 658 columns]\n"
]
}
],
"source": [
"# Discretize 'Prod. year' into 5 equal-width bins.\n",
"# The bin edges are derived from the training split only and then reused for the\n",
"# validation and test splits, so the same 'Year bin' value always denotes the same\n",
"# range of years (calling pd.cut(bins=5) per split would give each split its own edges).\n",
"train_data_encoded['Year bin'], year_bin_edges = pd.cut(\n",
"    train_data_encoded['Prod. year'], bins=5, labels=False, retbins=True\n",
")\n",
"\n",
"# Open both outer edges so that years outside the training range land in the first or\n",
"# last bin instead of becoming NaN.\n",
"year_bin_edges[0], year_bin_edges[-1] = float('-inf'), float('inf')\n",
"\n",
"val_data_encoded['Year bin'] = pd.cut(val_data_encoded['Prod. year'], bins=year_bin_edges, labels=False)\n",
"test_data_encoded['Year bin'] = pd.cut(test_data_encoded['Prod. year'], bins=year_bin_edges, labels=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Ручной синтез\n",
"Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ID Price Levy Manufacturer Prod. year Engine volume \\\n",
"3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n",
"3185 45760664 314 781 SUBARU 2012 2.5 \n",
"5529 45777845 5645 5908 BMW 1999 2.5 \n",
"7891 45651201 7997 1850 LEXUS 2008 3.5 \n",
"12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n",
"... ... ... ... ... ... ... \n",
"2750 45656065 941 1055 LEXUS 2013 3.5 \n",
"17390 45785069 12000 - FORD 1998 2.5 \n",
"5563 45815001 941 777 TOYOTA 2014 2.5 \n",
"3813 45809829 54850 831 HONDA 2018 1.5 \n",
"6041 45397141 9095 - FORD 2003 1.7 \n",
"\n",
" Mileage Cylinders Drive wheels Doors ... Fuel type_Petrol \\\n",
"3438 160000 km 4.0 Rear 04-May ... True \n",
"3185 204579 km 4.0 4x4 04-May ... True \n",
"5529 0 km 6.0 Rear 04-May ... True \n",
"7891 244731 km 6.0 Front 04-May ... False \n",
"12167 103000 km 4.0 Front 04-May ... True \n",
"... ... ... ... ... ... ... \n",
"2750 361603 km 6.0 Front 04-May ... False \n",
"17390 220000 km 4.0 Rear 04-May ... False \n",
"5563 202355 km 4.0 Front 04-May ... True \n",
"3813 13048 km 4.0 Front 04-May ... True \n",
"6041 159000 km 4.0 Front 04-May ... False \n",
"\n",
" Fuel type_Plug-in Hybrid Gear box type_Automatic Gear box type_Manual \\\n",
"3438 False True False \n",
"3185 False True False \n",
"5529 False False False \n",
"7891 False True False \n",
"12167 False False False \n",
"... ... ... ... \n",
"2750 False True False \n",
"17390 False False True \n",
"5563 False True False \n",
"3813 False True False \n",
"6041 False False True \n",
"\n",
" Gear box type_Tiptronic Gear box type_Variator Leather interior_No \\\n",
"3438 False False False \n",
"3185 False False False \n",
"5529 True False True \n",
"7891 False False False \n",
"12167 True False True \n",
"... ... ... ... \n",
"2750 False False False \n",
"17390 False False True \n",
"5563 False False False \n",
"3813 False False False \n",
"6041 False False True \n",
"\n",
" Leather interior_Yes Year bin Age \n",
"3438 True 4 12 \n",
"3185 True 4 12 \n",
"5529 False 3 25 \n",
"7891 True 4 16 \n",
"12167 False 4 9 \n",
"... ... ... ... \n",
"2750 True 4 11 \n",
"17390 False 3 26 \n",
"5563 True 4 10 \n",
"3813 True 4 6 \n",
"6041 False 3 21 \n",
"\n",
"[12311 rows x 1249 columns]\n"
]
}
],
"source": [
"# Derive the car's age from its production year.\n",
"# The reference year is a fixed, named constant (not the current date) so that re-running\n",
"# the notebook later reproduces the same feature values.\n",
"REFERENCE_YEAR = 2024\n",
"\n",
"train_data_encoded['Age'] = REFERENCE_YEAR - train_data_encoded['Prod. year']\n",
"val_data_encoded['Age'] = REFERENCE_YEAR - val_data_encoded['Prod. year']\n",
"test_data_encoded['Age'] = REFERENCE_YEAR - test_data_encoded['Prod. year']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
"\n",
"# Numeric columns to standardize.\n",
"numerical_features = ['Airbags', 'Age']\n",
"\n",
"# Learn the scaling parameters from the training split only, so that no information\n",
"# from the validation/test rows leaks into the transformation.\n",
"scaler = StandardScaler()\n",
"scaler.fit(train_data_encoded[numerical_features])\n",
"\n",
"# Apply the same fitted scaler to all three splits.\n",
"train_data_encoded[numerical_features] = scaler.transform(train_data_encoded[numerical_features])\n",
"val_data_encoded[numerical_features] = scaler.transform(val_data_encoded[numerical_features])\n",
"test_data_encoded[numerical_features] = scaler.transform(test_data_encoded[numerical_features])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Конструирование признаков с применением фреймворка Featuretools"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'pkg_resources'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mft\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Определение сущностей\u001b[39;00m\n\u001b[0;32m 4\u001b[0m es \u001b[38;5;241m=\u001b[39m ft\u001b[38;5;241m.\u001b[39mEntitySet(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcar_data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\__init__.py:4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig_init\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m primitives\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynthesis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n",
"File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\api.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_entityset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EntitySet\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n",
"File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\deserialize.py:8\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minspect\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m getfullargspec\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_sys\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_system\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mww_type_system\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_woodwork_table\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n",
"File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'"
]
}
],
"source": [
"# NOTE(review): the stored traceback shows woodwork failing on `import pkg_resources`;\n",
"# that module is provided by setuptools, which presumably has to be installed in this\n",
"# environment before the cell can run - confirm.\n",
"import featuretools as ft\n",
"\n",
"\n",
"def build_car_entityset(frame, columns):\n",
"    \"\"\"Wrap one encoded split in a single-dataframe EntitySet named 'cars'.\n",
"\n",
"    Args:\n",
"        frame: Encoded DataFrame of one split (train, validation or test).\n",
"        columns: Column layout to enforce; columns missing from ``frame`` are added\n",
"            and filled with False, columns not listed are dropped.\n",
"\n",
"    Returns:\n",
"        An EntitySet whose 'cars' dataframe is indexed by 'row_id', a column holding\n",
"        the original row labels of ``frame``.\n",
"    \"\"\"\n",
"    aligned = frame.reindex(columns=columns, fill_value=False)\n",
"    # The data has an 'ID' column but no 'id' column, so the row labels are exposed as an\n",
"    # explicit index column instead (assumes row labels are unique - TODO confirm).\n",
"    indexed = aligned.rename_axis('row_id').reset_index()\n",
"    entityset = ft.EntitySet(id='car_data')\n",
"    # Featuretools 1.x API: add_dataframe replaces the older entity_from_dataframe.\n",
"    entityset.add_dataframe(dataframe_name='cars', dataframe=indexed, index='row_id')\n",
"    return entityset\n",
"\n",
"\n",
"# All splits must share the training column layout so that the feature definitions\n",
"# learned on the training data can be evaluated on validation and test data.\n",
"feature_columns = train_data_encoded.columns\n",
"\n",
"# Relationships between dataframes would be added here (there is only one dataframe).\n",
"es = build_car_entityset(train_data_encoded, feature_columns)\n",
"\n",
"# Generate feature definitions and the training feature matrix.\n",
"feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='cars', max_depth=2)\n",
"\n",
"# Evaluate the same feature definitions on the validation and test rows. Each split gets\n",
"# its own EntitySet because the training EntitySet does not contain those rows.\n",
"val_feature_matrix = ft.calculate_feature_matrix(\n",
"    features=feature_defs, entityset=build_car_entityset(val_data_encoded, feature_columns)\n",
")\n",
"test_feature_matrix = ft.calculate_feature_matrix(\n",
"    features=feature_defs, entityset=build_car_entityset(test_data_encoded, feature_columns)\n",
")"
]
2024-10-11 13:42:11 +04:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}