From 1f9b7fcbe97e147f4e5adc18db1413bec63375c8 Mon Sep 17 00:00:00 2001 From: GokaPek Date: Fri, 11 Oct 2024 23:18:43 +0400 Subject: [PATCH] =?UTF-8?q?=D0=BF=D0=BE=D1=87=D1=82=D0=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lab_3/lab3.ipynb | 484 +++++++++++++++++++++++------------------------ 1 file changed, 239 insertions(+), 245 deletions(-) diff --git a/lab_3/lab3.ipynb b/lab_3/lab3.ipynb index 337db01..7c9e25e 100644 --- a/lab_3/lab3.ipynb +++ b/lab_3/lab3.ipynb @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -181,82 +181,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. year Engine volume \\\n", - "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", - "3185 45760664 314 781 SUBARU 2012 2.5 \n", - "5529 45777845 5645 5908 BMW 1999 2.5 \n", - "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", - "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", - "... ... ... ... ... ... ... \n", - "2750 45656065 941 1055 LEXUS 2013 3.5 \n", - "17390 45785069 12000 - FORD 1998 2.5 \n", - "5563 45815001 941 777 TOYOTA 2014 2.5 \n", - "3813 45809829 54850 831 HONDA 2018 1.5 \n", - "6041 45397141 9095 - FORD 2003 1.7 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_Hybrid \\\n", - "3438 160000 km 4.0 Rear 04-May ... False \n", - "3185 204579 km 4.0 4x4 04-May ... False \n", - "5529 0 km 6.0 Rear 04-May ... False \n", - "7891 244731 km 6.0 Front 04-May ... True \n", - "12167 103000 km 4.0 Front 04-May ... False \n", - "... ... ... ... ... ... ... \n", - "2750 361603 km 6.0 Front 04-May ... True \n", - "17390 220000 km 4.0 Rear 04-May ... 
False \n", - "5563 202355 km 4.0 Front 04-May ... False \n", - "3813 13048 km 4.0 Front 04-May ... False \n", - "6041 159000 km 4.0 Front 04-May ... False \n", - "\n", - " Fuel type_LPG Fuel type_Petrol Fuel type_Plug-in Hybrid \\\n", - "3438 False True False \n", - "3185 False True False \n", - "5529 False True False \n", - "7891 False False False \n", - "12167 False True False \n", - "... ... ... ... \n", - "2750 False False False \n", - "17390 False False False \n", - "5563 False True False \n", - "3813 False True False \n", - "6041 False False False \n", - "\n", - " Gear box type_Automatic Gear box type_Manual Gear box type_Tiptronic \\\n", - "3438 True False False \n", - "3185 True False False \n", - "5529 False False True \n", - "7891 True False False \n", - "12167 False False True \n", - "... ... ... ... \n", - "2750 True False False \n", - "17390 False True False \n", - "5563 True False False \n", - "3813 True False False \n", - "6041 False True False \n", - "\n", - " Gear box type_Variator Leather interior_No Leather interior_Yes \n", - "3438 False False True \n", - "3185 False False True \n", - "5529 False True False \n", - "7891 False False True \n", - "12167 False True False \n", - "... ... ... ... \n", - "2750 False False True \n", - "17390 False True False \n", - "5563 False False True \n", - "3813 False False True \n", - "6041 False True False \n", - "\n", - "[12311 rows x 1247 columns]\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", @@ -279,82 +206,9 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. 
year Engine volume \\\n", - "736 45753963 27284 259 CHEVROLET 2014 1.4 \n", - "8674 45786053 10349 - MERCEDES-BENZ 1997 2.9 Turbo \n", - "5971 45757478 40769 - MERCEDES-BENZ 1996 1.8 \n", - "1957 45732345 38737 639 HYUNDAI 2014 2 \n", - "11075 45729790 42102 831 SSANGYONG 2017 1.6 \n", - "... ... ... ... ... ... ... \n", - "12026 45786994 12231 650 CHEVROLET 2016 1.4 Turbo \n", - "17893 45756187 15681 - FORD 2003 2.4 Turbo \n", - "5339 45769967 314 2410 MERCEDES-BENZ 2010 6.2 \n", - "11859 45801865 14069 687 HYUNDAI 2010 1.6 \n", - "9276 45803366 15681 891 HYUNDAI 2016 2 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_LPG \\\n", - "736 65000 km 4.0 Front 04-May ... False \n", - "8674 3333 km 6.0 Rear 02-Mar ... False \n", - "5971 212485 km 8.0 Rear 04-May ... False \n", - "1957 132756 km 4.0 Front 04-May ... False \n", - "11075 50750 km 4.0 Front 04-May ... False \n", - "... ... ... ... ... ... ... \n", - "12026 9000 km 4.0 Front 04-May ... False \n", - "17893 250000 km 4.0 Rear 04-May ... False \n", - "5339 274771 km 8.0 Rear 04-May ... False \n", - "11859 100403 km 4.0 Front 04-May ... False \n", - "9276 322292 km 4.0 Front 04-May ... True \n", - "\n", - " Fuel type_Petrol Fuel type_Plug-in Hybrid Gear box type_Automatic \\\n", - "736 False True True \n", - "8674 False False False \n", - "5971 True False False \n", - "1957 False False True \n", - "11075 True False True \n", - "... ... ... ... \n", - "12026 True False False \n", - "17893 False False False \n", - "5339 True False True \n", - "11859 True False True \n", - "9276 False False True \n", - "\n", - " Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator \\\n", - "736 False False False \n", - "8674 True False False \n", - "5971 True False False \n", - "1957 False False False \n", - "11075 False False False \n", - "... ... ... ... 
\n", - "12026 False True False \n", - "17893 True False False \n", - "5339 False False False \n", - "11859 False False False \n", - "9276 False False False \n", - "\n", - " Leather interior_No Leather interior_Yes Year bin \n", - "736 True False 4 \n", - "8674 False True 3 \n", - "5971 True False 3 \n", - "1957 False True 4 \n", - "11075 False True 4 \n", - "... ... ... ... \n", - "12026 True False 4 \n", - "17893 True False 3 \n", - "5339 False True 4 \n", - "11859 False True 4 \n", - "9276 False True 4 \n", - "\n", - "[3848 rows x 658 columns]\n" - ] - } - ], + "outputs": [], "source": [ "# Пример дискретизации признака 'year'\n", "train_data_encoded['Year bin'] = pd.cut(train_data_encoded['Prod. year'], bins=5, labels=False)\n", @@ -372,82 +226,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " ID Price Levy Manufacturer Prod. year Engine volume \\\n", - "3438 45793776 13485 781 VOLKSWAGEN 2012 2.5 \n", - "3185 45760664 314 781 SUBARU 2012 2.5 \n", - "5529 45777845 5645 5908 BMW 1999 2.5 \n", - "7891 45651201 7997 1850 LEXUS 2008 3.5 \n", - "12167 45798755 15681 765 VOLKSWAGEN 2015 2 \n", - "... ... ... ... ... ... ... \n", - "2750 45656065 941 1055 LEXUS 2013 3.5 \n", - "17390 45785069 12000 - FORD 1998 2.5 \n", - "5563 45815001 941 777 TOYOTA 2014 2.5 \n", - "3813 45809829 54850 831 HONDA 2018 1.5 \n", - "6041 45397141 9095 - FORD 2003 1.7 \n", - "\n", - " Mileage Cylinders Drive wheels Doors ... Fuel type_Petrol \\\n", - "3438 160000 km 4.0 Rear 04-May ... True \n", - "3185 204579 km 4.0 4x4 04-May ... True \n", - "5529 0 km 6.0 Rear 04-May ... True \n", - "7891 244731 km 6.0 Front 04-May ... False \n", - "12167 103000 km 4.0 Front 04-May ... True \n", - "... ... ... ... ... ... ... \n", - "2750 361603 km 6.0 Front 04-May ... False \n", - "17390 220000 km 4.0 Rear 04-May ... False \n", - "5563 202355 km 4.0 Front 04-May ... 
True \n", - "3813 13048 km 4.0 Front 04-May ... True \n", - "6041 159000 km 4.0 Front 04-May ... False \n", - "\n", - " Fuel type_Plug-in Hybrid Gear box type_Automatic Gear box type_Manual \\\n", - "3438 False True False \n", - "3185 False True False \n", - "5529 False False False \n", - "7891 False True False \n", - "12167 False False False \n", - "... ... ... ... \n", - "2750 False True False \n", - "17390 False False True \n", - "5563 False True False \n", - "3813 False True False \n", - "6041 False False True \n", - "\n", - " Gear box type_Tiptronic Gear box type_Variator Leather interior_No \\\n", - "3438 False False False \n", - "3185 False False False \n", - "5529 True False True \n", - "7891 False False False \n", - "12167 True False True \n", - "... ... ... ... \n", - "2750 False False False \n", - "17390 False False True \n", - "5563 False False False \n", - "3813 False False False \n", - "6041 False False True \n", - "\n", - " Leather interior_Yes Year bin Age \n", - "3438 True 4 12 \n", - "3185 True 4 12 \n", - "5529 False 3 25 \n", - "7891 True 4 16 \n", - "12167 False 4 9 \n", - "... ... ... ... \n", - "2750 True 4 11 \n", - "17390 False 3 26 \n", - "5563 True 4 10 \n", - "3813 True 4 6 \n", - "6041 False 3 21 \n", - "\n", - "[12311 rows x 1249 columns]\n" - ] - } - ], + "outputs": [], "source": [ "# Пример синтеза признака \"возраст автомобиля\"\n", "train_data_encoded['Age'] = 2024 - train_data_encoded['Prod. 
year']\n", @@ -464,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -488,23 +269,33 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 51, "metadata": {}, "outputs": [ { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'pkg_resources'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[25], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mft\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;66;03m# Определение сущностей\u001b[39;00m\n\u001b[0;32m 4\u001b[0m es \u001b[38;5;241m=\u001b[39m ft\u001b[38;5;241m.\u001b[39mEntitySet(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcar_data\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\__init__.py:4\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig_init\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config\n\u001b[1;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m primitives\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msynthesis\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\api.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_entityset\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m EntitySet\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m 
\u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\deserialize.py:8\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01minspect\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m getfullargspec\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_sys\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtype_system\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mww_type_system\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mwoodwork\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdeserialize\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m read_woodwork_table\n\u001b[0;32m 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfeaturetools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mentityset\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mrelationship\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Relationship\n", - "File \u001b[1;32mc:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m 
\u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n", - "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'pkg_resources'" + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:724: UserWarning: A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: index\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\synthesis\\deep_feature_synthesis.py:169: UserWarning: Only one dataframe in entityset, changing max_depth to 1 since deeper features cannot be created\n", + " warnings.warn(\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
import featuretools as ft

# Name of the explicit index column created from the pandas row labels.
# The original code asked for index='id', but no such column exists (the data
# has 'ID'); it only "worked" because a previously Woodwork-initialized frame
# made featuretools ignore the parameter.
INDEX_COL = 'row_id'
DATAFRAME_NAME = 'cars'


def build_entityset(frame, es_id, columns=None):
    """Wrap one data split into a single-dataframe EntitySet.

    Args:
        frame: pandas DataFrame holding one encoded split (train/val/test).
        es_id: identifier of the EntitySet to create.
        columns: optional list of column names to align ``frame`` to before
            it is added. Columns missing from ``frame`` are created and filled
            with False; columns not listed are dropped.
            NOTE(review): this assumes the only columns that differ between
            splits are one-hot dummies - confirm against the encoding cell.

    Returns:
        ft.EntitySet containing a single dataframe named 'cars', indexed by
        the former pandas row labels (stored in the 'row_id' column).
    """
    if columns is not None:
        frame = frame.reindex(columns=columns, fill_value=False)
    # rename_axis().reset_index() returns a NEW frame, so the cell can be
    # re-run without hitting the "Woodwork-initialized DataFrame" warning.
    # assumes the row labels are unique within a split - TODO confirm
    indexed = frame.rename_axis(INDEX_COL).reset_index()
    entityset = ft.EntitySet(id=es_id)
    entityset.add_dataframe(dataframe_name=DATAFRAME_NAME, dataframe=indexed, index=INDEX_COL)
    return entityset


# Entity definition: the training split defines the feature set.
es = build_entityset(train_data_encoded, 'car_data')

# Relationships between dataframes would be declared here (none exist yet).
# es = es.add_relationship(...)

# Feature generation. With a single dataframe featuretools lowers max_depth
# to 1 anyway (see the recorded UserWarning), so request 1 explicitly.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=DATAFRAME_NAME, max_depth=1)

# Apply the SAME feature definitions to the validation and test splits.
# Each split gets its own EntitySet: looking up val/test row labels inside the
# training EntitySet returns training rows (leakage) or all-NaN default rows.
train_columns = list(train_data_encoded.columns)
val_es = build_entityset(val_data_encoded, 'car_data_val', columns=train_columns)
test_es = build_entityset(test_data_encoded, 'car_data_test', columns=train_columns)

val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=val_es)
test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=test_es)
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\featuretools\\computational_backends\\feature_set_calculator.py:143: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. 
In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " df = pd.concat([df, default_df], sort=True)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " series = series.replace(ww.config.get_option(\"nan_values\"), np.nan)\n", + "c:\\Users\\Egor\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\woodwork\\logical_types.py:841: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. 
import featuretools as ft

# NOTE(review): this cell repeats the feature-generation step of the earlier
# featuretools cell. Keeping a single copy is enough; it is left here, fixed,
# so that the notebook still runs top to bottom.

# Explicit index column built from the pandas row labels. The original
# index='id' referenced a column that does not exist (the data has 'ID').
INDEX_COL = 'row_id'
DATAFRAME_NAME = 'cars'


def build_entityset(frame, es_id, columns=None):
    """Create a single-dataframe EntitySet ('cars') from one data split.

    Args:
        frame: pandas DataFrame with one encoded split (train/val/test).
        es_id: identifier of the EntitySet to create.
        columns: optional column list to align ``frame`` to; missing columns
            are added and filled with False, unlisted columns are dropped.
            NOTE(review): assumes splits differ only in one-hot dummy
            columns - confirm against the encoding cell.

    Returns:
        ft.EntitySet whose 'cars' dataframe is indexed by the 'row_id' column
        (the former pandas row labels).
    """
    if columns is not None:
        frame = frame.reindex(columns=columns, fill_value=False)
    # A fresh frame is created on every call, so re-running the cell does not
    # pass an already Woodwork-initialized DataFrame to featuretools.
    # assumes the row labels are unique within a split - TODO confirm
    indexed = frame.rename_axis(INDEX_COL).reset_index()
    entityset = ft.EntitySet(id=es_id)
    entityset.add_dataframe(dataframe_name=DATAFRAME_NAME, dataframe=indexed, index=INDEX_COL)
    return entityset


# Entity definition (training split only).
es = build_entityset(train_data_encoded, 'car_data')

# Feature generation; max_depth=1 is what featuretools uses for a single
# dataframe regardless of the requested depth (see the recorded UserWarning).
feature_matrix, feature_defs = ft.dfs(entityset=es, target_dataframe_name=DATAFRAME_NAME, max_depth=1)

# Feature computation for the validation and test splits: each split is put
# into its own EntitySet so the rows really come from that split and not from
# the training data.
train_columns = list(train_data_encoded.columns)
val_es = build_entityset(val_data_encoded, 'car_data_val', columns=train_columns)
test_es = build_entityset(test_data_encoded, 'car_data_test', columns=train_columns)

val_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=val_es)
test_feature_matrix = ft.calculate_feature_matrix(features=feature_defs, entityset=test_es)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

TARGET_COL = 'Price'
# The feature matrix is very wide after one-hot encoding, so the importance
# chart is limited to the strongest features to stay readable.
TOP_N_FEATURES = 20


def split_features_target(matrix, split_name):
    """Drop incomplete rows and separate predictors from the target.

    Args:
        matrix: feature matrix (pandas DataFrame) that contains 'Price'.
        split_name: label used in the error message ('train', 'test', ...).

    Returns:
        (X, y): DataFrame without 'Price' and the 'Price' Series.

    Raises:
        ValueError: if no row is left after removing rows with NaN.
    """
    # Work on a copy: the caller's feature matrix is left untouched so the
    # cell gives the same result when it is run again.
    complete = matrix.dropna()
    if complete.empty:
        raise ValueError(f"{split_name} feature matrix has no complete rows after dropna()")
    return complete.drop(TARGET_COL, axis=1), complete[TARGET_COL]


def report_metrics(y_true, y_pred, prefix=''):
    """Print and return RMSE, R² and MAE for one set of predictions.

    Args:
        y_true: actual target values.
        y_pred: predicted target values.
        prefix: text put in front of each metric name (e.g. 'Train ').

    Returns:
        (rmse, r2, mae) tuple.
    """
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated since scikit-learn 1.4 and scheduled
    # for removal in 1.6 (see the recorded FutureWarning).
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{prefix}RMSE: {rmse}")
    print(f"{prefix}R²: {r2}")
    print(f"{prefix}MAE: {mae}")
    return rmse, r2, mae


# Predictors / target for the train, validation and test feature matrices
X_train, y_train = split_features_target(feature_matrix, 'train')
X_val, y_val = split_features_target(val_feature_matrix, 'validation')
X_test, y_test = split_features_target(test_feature_matrix, 'test')

# Model choice and training
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Prediction and evaluation on the test split
y_pred = model.predict(X_test)
rmse, r2, mae = report_metrics(y_test, y_pred)

# Evaluation on the validation split (it was prepared but never scored before)
y_val_pred = model.predict(X_val)
rmse_val, r2_val, mae_val = report_metrics(y_val, y_val_pred, prefix='Validation ')

# Cross-validation on the training split; the scorer returns negative MSE,
# so the sign is flipped before taking the square root.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
rmse_cv = (-scores.mean())**0.5
print(f"Cross-validated RMSE: {rmse_cv}")

# Feature importance analysis (top features only)
feature_importances = model.feature_importances_
feature_names = X_train.columns

importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
    .head(TOP_N_FEATURES)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, ax=ax)
ax.set_title('Feature Importance')
plt.show()

# Overfitting check: the same metrics on the data the model was trained on
y_train_pred = model.predict(X_train)
rmse_train, r2_train, mae_train = report_metrics(y_train, y_train_pred, prefix='Train ')

# Visualisation: predicted vs actual price on the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
# Diagonal = perfect prediction
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Price')
plt.show()