diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb index 024426d..337db01 100644 --- a/lab_3/lab3.ipynb +++ b/lab_3/lab3.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -65,9 +65,466 @@ "import matplotlib.pyplot as plt\n", "import matplotlib\n", "import matplotlib.ticker as ticker\n", - "df = pn.read_csv(\".//static//csv//car_price_prediction.csv\").head(15000)\n", + "df = pn.read_csv(\".//static//csv//car_price_prediction.csv\")\n", "print(df.columns)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Разделим на 3 выборки\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Размер обучающей выборки: 12311\n", + "Размер контрольной выборки: 3078\n", + "Размер тестовой выборки: 3848\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Hold out 20% of all rows as the test split\n", + "train_val_data, test_data = train_test_split(df, test_size=0.2, random_state=42)\n", + "\n", + "# Carve the validation split out of the remaining 80% (64% / 16% / 20% overall)\n", + "train_data, val_data = train_test_split(train_val_data, test_size=0.2, random_state=42)\n", + "\n", + "# Report the size of every split\n", + "for label, split in ((\"Размер обучающей выборки:\", train_data), (\"Размер контрольной выборки:\", val_data), (\"Размер тестовой выборки:\", test_data)):\n", + "    print(label, len(split))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Пример оценки сбалансированности целевой переменной (цена автомобиля)\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Гистограмма распределения цены в обучающей выборке\n", + "sns.histplot(train_data['Price'], kde=True)\n", + "plt.title('Распределение цены в обучающей выборке')\n", + "plt.show()\n", + "\n", + "# Гистограмма распределения цены в контрольной выборке\n", + "sns.histplot(val_data['Price'], kde=True)\n", + "plt.title('Распределение цены в контрольной выборке')\n", + "plt.show()\n", + "\n", + "# Гистограмма распределения цены в тестовой выборке\n", + "sns.histplot(test_data['Price'], kde=True)\n", + "plt.title('Распределение цены в тестовой выборке')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Процесс конструирования признаков\n", + "Задача 1: Прогнозирование цен на автомобили\n", + "Цель технического проекта: Разработка модели машинного обучения для точного прогнозирования рыночной стоимости автомобилей.\n", + "\n", + "Задача 2: Оптимизация рекламных бюджетов\n", + "Цель технического проекта: Использование прогнозов цен на автомобили для оптимизации таргетинга рекламы и повышения конверсии на онлайн-площадках.\n", + "\n", + "\n", + "### Унитарное кодирование категориальных признаков (one-hot encoding)\n", + "\n", + "One-hot encoding: Преобразование категориальных признаков в бинарные векторы." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", + "3185 45760664 314 781 SUBARU 2012 2.5 \n", + "5529 45777845 5645 5908 BMW 1999 2.5 \n", + "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", + "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", + "... ... ... ... ... ... ... 
\n", + "2750 45656065 941 1055 LEXUS 2013 3.5 \n", + "17390 45785069 12000 - FORD 1998 2.5 \n", + "5563 45815001 941 777 TOYOTA 2014 2.5 \n", + "3813 45809829 54850 831 HONDA 2018 1.5 \n", + "6041 45397141 9095 - FORD 2003 1.7 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_Hybrid \\\n", + "3438 160000 km 4.0 Rear 04-May ... False \n", + "3185 204579 km 4.0 4x4 04-May ... False \n", + "5529 0 km 6.0 Rear 04-May ... False \n", + "7891 244731 km 6.0 Front 04-May ... True \n", + "12167 103000 km 4.0 Front 04-May ... False \n", + "... ... ... ... ... ... ... \n", + "2750 361603 km 6.0 Front 04-May ... True \n", + "17390 220000 km 4.0 Rear 04-May ... False \n", + "5563 202355 km 4.0 Front 04-May ... False \n", + "3813 13048 km 4.0 Front 04-May ... False \n", + "6041 159000 km 4.0 Front 04-May ... False \n", + "\n", + " Fuel type_LPG Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n", + "3438 False True False \n", + "3185 False True False \n", + "5529 False True False \n", + "7891 False False False \n", + "12167 False True False \n", + "... ... ... ... \n", + "2750 False False False \n", + "17390 False False False \n", + "5563 False True False \n", + "3813 False True False \n", + "6041 False False False \n", + "\n", + " Gear box type_Automatic Gear box type_Manual Gear box type_Tiptronic \\\n", + "3438 True False False \n", + "3185 True False False \n", + "5529 False False True \n", + "7891 True False False \n", + "12167 False False True \n", + "... ... ... ... \n", + "2750 True False False \n", + "17390 False True False \n", + "5563 True False False \n", + "3813 True False False \n", + "6041 False True False \n", + "\n", + " Gear box type_Variator Leather interior_No Leather interior_Yes \n", + "3438 False False True \n", + "3185 False False True \n", + "5529 False True False \n", + "7891 False False True \n", + "12167 False True False \n", + "... ... ... ... 
\n", + "2750 False False True \n", + "17390 False True False \n", + "5563 False False True \n", + "3813 False False True \n", + "6041 False True False \n", + "\n", + "[12311 rows x 1247 columns]\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Categorical columns to expand into one-hot indicator columns\n", + "categorical_features = ['Model', 'Category', 'Fuel type', 'Gear box type', 'Leather interior']\n", + "\n", + "# The training split defines the feature space\n", + "train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)\n", + "\n", + "# Encoding each split independently yields a different column set per split\n", + "# (a category absent from a split gets no column), so align validation/test to\n", + "# the training columns: missing indicators become False, and indicators for\n", + "# categories never seen in training are dropped.\n", + "val_data_encoded = pd.get_dummies(val_data, columns=categorical_features).reindex(columns=train_data_encoded.columns, fill_value=False)\n", + "test_data_encoded = pd.get_dummies(test_data, columns=categorical_features).reindex(columns=train_data_encoded.columns, fill_value=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Дискретизация числовых признаков \n", + "это процесс преобразования непрерывных числовых значений в дискретные категории или интервалы (бины). Этот процесс может быть полезен по нескольким причинам" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "736 45753963 27284 259 CHEVROLET 2014 1.4 \n", + "8674 45786053 10349 - MERCEDES-BENZ 1997 2.9 Turbo \n", + "5971 45757478 40769 - MERCEDES-BENZ 1996 1.8 \n", + "1957 45732345 38737 639 HYUNDAI 2014 2 \n", + "11075 45729790 42102 831 SSANGYONG 2017 1.6 \n", + "... ... ... ... ... ... ... \n", + "12026 45786994 12231 650 CHEVROLET 2016 1.4 Turbo \n", + "17893 45756187 15681 - FORD 2003 2.4 Turbo \n", + "5339 45769967 314 2410 MERCEDES-BENZ 2010 6.2 \n", + "11859 45801865 14069 687 HYUNDAI 2010 1.6 \n", + "9276 45803366 15681 891 HYUNDAI 2016 2 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_LPG \\\n", + "736 65000 km 4.0 Front 04-May ... False \n", + "8674 3333 km 6.0 Rear 02-Mar ... False \n", + "5971 212485 km 8.0 Rear 04-May ... 
False \n", + "1957 132756 km 4.0 Front 04-May ... False \n", + "11075 50750 km 4.0 Front 04-May ... False \n", + "... ... ... ... ... ... ... \n", + "12026 9000 km 4.0 Front 04-May ... False \n", + "17893 250000 km 4.0 Rear 04-May ... False \n", + "5339 274771 km 8.0 Rear 04-May ... False \n", + "11859 100403 km 4.0 Front 04-May ... False \n", + "9276 322292 km 4.0 Front 04-May ... True \n", + "\n", + " Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Automatic \\\n", + "736 False True True \n", + "8674 False False False \n", + "5971 True False False \n", + "1957 False False True \n", + "11075 True False True \n", + "... ... ... ... \n", + "12026 True False False \n", + "17893 False False False \n", + "5339 True False True \n", + "11859 True False True \n", + "9276 False False True \n", + "\n", + " Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n", + "736 False False False \n", + "8674 True False False \n", + "5971 True False False \n", + "1957 False False False \n", + "11075 False False False \n", + "... ... ... ... \n", + "12026 False True False \n", + "17893 True False False \n", + "5339 False False False \n", + "11859 False False False \n", + "9276 False False False \n", + "\n", + " Leather interior_No Leather interior_Yes Year bin \n", + "736 True False 4 \n", + "8674 False True 3 \n", + "5971 True False 3 \n", + "1957 False True 4 \n", + "11075 False True 4 \n", + "... ... ... ... \n", + "12026 True False 4 \n", + "17893 True False 3 \n", + "5339 False True 4 \n", + "11859 False True 4 \n", + "9276 False True 4 \n", + "\n", + "[3848 rows x 658 columns]\n" + ] + } + ], + "source": [ + "# Discretize 'Prod. year' into 5 equal-width bins.\n", + "# The bin edges are learned on the training split only and reused for the other\n", + "# splits: calling pd.cut(bins=5) per split derives the edges from that split's own\n", + "# min/max, so the same year could receive a different bin index in each split.\n", + "train_data_encoded['Year bin'], year_bin_edges = pd.cut(train_data_encoded['Prod. year'], bins=5, labels=False, retbins=True)\n", + "\n", + "# Open the outer edges so years outside the training range still get a bin\n", + "year_bin_edges[0], year_bin_edges[-1] = float('-inf'), float('inf')\n", + "\n", + "val_data_encoded['Year bin'] = pd.cut(val_data_encoded['Prod. year'], bins=year_bin_edges, labels=False)\n", + "test_data_encoded['Year bin'] = pd.cut(test_data_encoded['Prod. year'], bins=year_bin_edges, labels=False)" + ] + },
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ручной синтез\n", + "Создание новых признаков на основе экспертных знаний и логики предметной области. Например, для данных о продаже автомобилей можно создать признак \"возраст автомобиля\" как разницу между текущим годом и годом выпуска." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ID Price Levy Manufacturer Prod. year Engine volume \\\n", + "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", + "3185 45760664 314 781 SUBARU 2012 2.5 \n", + "5529 45777845 5645 5908 BMW 1999 2.5 \n", + "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", + "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", + "... ... ... ... ... ... ... \n", + "2750 45656065 941 1055 LEXUS 2013 3.5 \n", + "17390 45785069 12000 - FORD 1998 2.5 \n", + "5563 45815001 941 777 TOYOTA 2014 2.5 \n", + "3813 45809829 54850 831 HONDA 2018 1.5 \n", + "6041 45397141 9095 - FORD 2003 1.7 \n", + "\n", + " Mileage Cylinders Drive wheels Doors ... Fuel type_Petrol \\\n", + "3438 160000 km 4.0 Rear 04-May ... True \n", + "3185 204579 km 4.0 4x4 04-May ... True \n", + "5529 0 km 6.0 Rear 04-May ... True \n", + "7891 244731 km 6.0 Front 04-May ... False \n", + "12167 103000 km 4.0 Front 04-May ... True \n", + "... ... ... ... ... ... ... \n", + "2750 361603 km 6.0 Front 04-May ... False \n", + "17390 220000 km 4.0 Rear 04-May ... False \n", + "5563 202355 km 4.0 Front 04-May ... True \n", + "3813 13048 km 4.0 Front 04-May ... True \n", + "6041 159000 km 4.0 Front 04-May ... False \n", + "\n", + " Fuel type_Plug-in Hybrid Gear box type_Automatic Gear box type_Manual \\\n", + "3438 False True False \n", + "3185 False True False \n", + "5529 False False False \n", + "7891 False True False \n", + "12167 False False False \n", + "... ... ... ... 
\n", + "2750 False True False \n", + "17390 False False True \n", + "5563 False True False \n", + "3813 False True False \n", + "6041 False False True \n", + "\n", + " Gear box type_Tiptronic Gear box type_Variator Leather interior_No \\\n", + "3438 False False False \n", + "3185 False False False \n", + "5529 True False True \n", + "7891 False False False \n", + "12167 True False True \n", + "... ... ... ... \n", + "2750 False False False \n", + "17390 False False True \n", + "5563 False False False \n", + "3813 False False False \n", + "6041 False False True \n", + "\n", + " Leather interior_Yes Year bin Age \n", + "3438 True 4 12 \n", + "3185 True 4 12 \n", + "5529 False 3 25 \n", + "7891 True 4 16 \n", + "12167 False 4 9 \n", + "... ... ... ... \n", + "2750 True 4 11 \n", + "17390 False 3 26 \n", + "5563 True 4 10 \n", + "3813 True 4 6 \n", + "6041 False 3 21 \n", + "\n", + "[12311 rows x 1249 columns]\n" + ] + } + ], + "source": [ + "# Derive the 'Age' feature: years between a fixed reference year and the production year\n", + "REFERENCE_YEAR = 2024\n", + "\n", + "for split in (train_data_encoded, val_data_encoded, test_data_encoded):\n", + "    split['Age'] = REFERENCE_YEAR - split['Prod. year']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Масштабирование признаков - это процесс преобразования числовых признаков таким образом, чтобы они имели одинаковый масштаб. Это важно для многих алгоритмов машинного обучения, которые чувствительны к масштабу признаков, таких как линейная регрессия, метод опорных векторов (SVM) и нейронные сети." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", + "\n", + "# Numeric columns to standardize\n", + "numerical_features = ['Airbags', 'Age']\n", + "\n", + "# Fit the scaler on the training split only, then apply that same\n", + "# transform to every split (train, validation, test)\n", + "scaler = StandardScaler()\n", + "scaler.fit(train_data_encoded[numerical_features])\n", + "for split in (train_data_encoded, val_data_encoded, test_data_encoded):\n", + "    split[numerical_features] = scaler.transform(split[numerical_features])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Конструирование признаков с применением фреймворка Featuretools" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'pkg_resources'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mft\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Определение сущностей\u001b[39;00m\n\u001b[0;32m 4\u001b[0m es \u001b[38;5;241m=\u001b[39m ft\u001b[38;5;241m.\u001b[39mEntitySet(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcar_data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\__init__.py:4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig_init\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m primitives\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynthesis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\api.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_entityset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EntitySet\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\deserialize.py:8\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minspect\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m getfullargspec\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_sys\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_system\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mww_type_system\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_woodwork_table\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", + "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'" + ] + } + ], + "source": [ + "# NOTE: the installed woodwork imports pkg_resources, which is provided by\n", + "# setuptools; setuptools must be installed in the kernel environment.\n", + "import featuretools as ft\n", + "\n", + "\n", + "def build_cars_entityset(split, columns, es_id):\n", + "    \"\"\"Wrap one data split into a single-table EntitySet named 'cars'.\n", + "\n", + "    Args:\n", + "        split: DataFrame holding one data split; assumes its row index is unique.\n", + "        columns: column set to align the split to; columns missing from the\n", + "            split are added and filled with False.\n", + "        es_id: identifier given to the new EntitySet.\n", + "\n", + "    Returns:\n", + "        An EntitySet whose 'cars' dataframe is indexed by 'row_id', a column\n", + "        holding the split's original row index.\n", + "    \"\"\"\n", + "    table = split.reindex(columns=columns, fill_value=False).rename_axis('row_id').reset_index()\n", + "    entityset = ft.EntitySet(id=es_id)\n", + "    entityset.add_dataframe(dataframe_name='cars', dataframe=table, index='row_id')\n", + "    return entityset\n", + "\n", + "\n", + "# Featuretools >= 1.0 API: add_dataframe / target_dataframe_name replace the\n", + "# removed entity_from_dataframe / target_entity.\n", + "feature_columns = train_data_encoded.columns\n", + "es = build_cars_entityset(train_data_encoded, feature_columns, 'car_data')\n", + "\n", + "# Single-table dataset: there are no relationships to register\n", + "\n", + "# Generate features on the training split\n", + "feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name='cars', max_depth=2)\n", + "\n", + "# Validation/test rows are not part of the training EntitySet, so each split gets\n", + "# its own EntitySet and the feature definitions learned on train are re-applied.\n", + "val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=build_cars_entityset(val_data_encoded, feature_columns, 'car_data_val'))\n", + "test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=build_cars_entityset(test_data_encoded, feature_columns, 'car_data_test'))" + ] } ], "metadata": {