{ "cells": [ { "cell_type": "markdown", "metadata": { "vscode": { "languageId": "plaintext" } }, "source": [ "#### Загрузка набора данных" ] }, { "cell_type": "code", "execution_count": 1058, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PriceLevyManufacturerModelProd. yearCategoryLeather interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbags
0133281399LEXUSRX 4502010JeepYesHybrid3.5186005 km6.0Automatic4x404-MayLeft wheelSilver12
1166211018CHEVROLETEquinox2011JeepNoPetrol3192000 km6.0Tiptronic4x404-MayLeft wheelBlack8
28467-HONDAFIT2006HatchbackNoPetrol1.3200000 km4.0VariatorFront04-MayRight-hand driveBlack2
33607862FORDEscape2011JeepYesHybrid2.5168966 km4.0Automatic4x404-MayLeft wheelWhite0
411726446HONDAFIT2014HatchbackYesPetrol1.391901 km4.0AutomaticFront04-MayLeft wheelSilver4
......................................................
192328467-MERCEDES-BENZCLK 2001999CoupeYesCNG2.0 Turbo300000 km4.0ManualRear02-MarLeft wheelSilver5
1923315681831HYUNDAISonata2011SedanYesPetrol2.4161600 km4.0TiptronicFront04-MayLeft wheelRed8
1923426108836HYUNDAITucson2010JeepYesDiesel2116365 km4.0AutomaticFront04-MayLeft wheelGrey4
1923553311288CHEVROLETCaptiva2007JeepYesDiesel251258 km4.0AutomaticFront04-MayLeft wheelBlack4
19236470753HYUNDAISonata2012SedanYesHybrid2.4186923 km4.0AutomaticFront04-MayLeft wheelWhite12
\n", "

19237 rows × 17 columns

\n", "
" ], "text/plain": [ " Price Levy Manufacturer Model Prod. year Category \\\n", "0 13328 1399 LEXUS RX 450 2010 Jeep \n", "1 16621 1018 CHEVROLET Equinox 2011 Jeep \n", "2 8467 - HONDA FIT 2006 Hatchback \n", "3 3607 862 FORD Escape 2011 Jeep \n", "4 11726 446 HONDA FIT 2014 Hatchback \n", "... ... ... ... ... ... ... \n", "19232 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n", "19233 15681 831 HYUNDAI Sonata 2011 Sedan \n", "19234 26108 836 HYUNDAI Tucson 2010 Jeep \n", "19235 5331 1288 CHEVROLET Captiva 2007 Jeep \n", "19236 470 753 HYUNDAI Sonata 2012 Sedan \n", "\n", " Leather interior Fuel type Engine volume Mileage Cylinders \\\n", "0 Yes Hybrid 3.5 186005 km 6.0 \n", "1 No Petrol 3 192000 km 6.0 \n", "2 No Petrol 1.3 200000 km 4.0 \n", "3 Yes Hybrid 2.5 168966 km 4.0 \n", "4 Yes Petrol 1.3 91901 km 4.0 \n", "... ... ... ... ... ... \n", "19232 Yes CNG 2.0 Turbo 300000 km 4.0 \n", "19233 Yes Petrol 2.4 161600 km 4.0 \n", "19234 Yes Diesel 2 116365 km 4.0 \n", "19235 Yes Diesel 2 51258 km 4.0 \n", "19236 Yes Hybrid 2.4 186923 km 4.0 \n", "\n", " Gear box type Drive wheels Doors Wheel Color Airbags \n", "0 Automatic 4x4 04-May Left wheel Silver 12 \n", "1 Tiptronic 4x4 04-May Left wheel Black 8 \n", "2 Variator Front 04-May Right-hand drive Black 2 \n", "3 Automatic 4x4 04-May Left wheel White 0 \n", "4 Automatic Front 04-May Left wheel Silver 4 \n", "... ... ... ... ... ... ... 
\n", "19232 Manual Rear 02-Mar Left wheel Silver 5 \n", "19233 Tiptronic Front 04-May Left wheel Red 8 \n", "19234 Automatic Front 04-May Left wheel Grey 4 \n", "19235 Automatic Front 04-May Left wheel Black 4 \n", "19236 Automatic Front 04-May Left wheel White 12 \n", "\n", "[19237 rows x 17 columns]" ] }, "execution_count": 1058, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import featuretools as ft\n", "import re\n", "from sklearn.preprocessing import StandardScaler\n", "from imblearn.over_sampling import RandomOverSampler\n", "from sklearn.model_selection import train_test_split\n", "\n", "\n", "# Read the raw listings and drop the ID column in the same expression.\n", "df = pd.read_csv(\"../data/car_price_prediction.csv\").drop(columns=[\"ID\"])\n", "\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Анализ датасета и очистка данных" ] }, { "cell_type": "code", "execution_count": 1059, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Price int64\n", "Levy object\n", "Manufacturer object\n", "Model object\n", "Prod. year int64\n", "Category object\n", "Leather interior object\n", "Fuel type object\n", "Engine volume object\n", "Mileage object\n", "Cylinders float64\n", "Gear box type object\n", "Drive wheels object\n", "Doors object\n", "Wheel object\n", "Color object\n", "Airbags int64\n", "dtype: object" ] }, "execution_count": 1059, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 1060, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3.5, 3. , 1.3, 2.5, 2. , 1.8, 2.4, 4. , 1.6, 3.3, 2.2,\n", " 4.7, 1.5, 4.4, 1.4, 3.6, 2.3, 5.5, 2.8, 3.2, 3.8, 4.6,\n", " 1.2, 5. , 1.7, 2.9, 0.5, 1.9, 2.7, 4.8, 5.3, 0.4, 1.1,\n", " 2.1, 0.7, 5.4, 3.7, 1. , 2.6, 0.8, 0.2, 5.7, 6.7, 6.2,\n", " 3.4, 6.3, 4.3, 4.2, 0. , 20. , 0.3, 5.9, 5.6, 6. 
, 0.6,\n", " 6.8, 4.5, 7.3, 0.1, 3.1, 6.4, 3.9, 0.9, 5.2, 5.8])" ] }, "execution_count": 1060, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Remove the \"Turbo\" marker, then parse what is left as a number.\n", "df[\"Engine volume\"] = pd.to_numeric(df[\"Engine volume\"].str.replace(\"Turbo\", \"\"))\n", "df[\"Engine volume\"].unique()" ] }, { "cell_type": "code", "execution_count": 1061, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([186005, 192000, 200000, ..., 140607, 307325, 186923])" ] }, "execution_count": 1061, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Remove the \"km\" unit, then convert the remaining digits to integers.\n", "df[\"Mileage\"] = df[\"Mileage\"].str.replace(\"km\", \"\").astype(\"int64\")\n", "df[\"Mileage\"].unique()" ] }, { "cell_type": "code", "execution_count": 1062, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1399, 1018, 0, 862, 446, 891, 761, 751, 394,\n", " 1053, 1055, 1079, 810, 2386, 1850, 531, 586, 1249,\n", " 2455, 583, 1537, 1288, 915, 1750, 707, 1077, 1486,\n", " 1091, 650, 382, 1436, 1194, 503, 1017, 1104, 639,\n", " 629, 919, 781, 530, 640, 765, 777, 779, 934,\n", " 769, 645, 1185, 1324, 830, 1187, 1111, 760, 642,\n", " 1604, 1095, 966, 473, 1138, 1811, 988, 917, 1156,\n", " 687, 11714, 836, 1347, 2866, 1646, 259, 609, 697,\n", " 585, 475, 690, 308, 1823, 1361, 1273, 924, 584,\n", " 2078, 831, 1172, 893, 1872, 1885, 1266, 447, 2148,\n", " 1730, 730, 289, 502, 333, 1325, 247, 879, 1342,\n", " 1327, 1598, 1514, 1058, 738, 1935, 481, 1522, 1282,\n", " 456, 880, 900, 798, 1277, 442, 1051, 790, 1292,\n", " 1047, 528, 1211, 1493, 1793, 574, 930, 1998, 271,\n", " 706, 1481, 1677, 1661, 1286, 1408, 1090, 595, 1451,\n", " 1267, 993, 1714, 878, 641, 749, 1511, 603, 353,\n", " 877, 1236, 1141, 397, 784, 1024, 1357, 1301, 770,\n", " 922, 1438, 753, 607, 1363, 638, 490, 431, 565,\n", " 517, 833, 489, 1760, 986, 1841, 1620, 1360, 474,\n", " 1099, 978, 1624, 1946, 1268, 1307, 696, 649, 666,\n", " 2151, 551, 800, 971, 
1323, 2377, 1845, 1083, 694,\n", " 463, 419, 345, 1515, 1505, 2056, 1203, 729, 460,\n", " 1356, 876, 911, 1190, 780, 448, 2410, 1848, 1148,\n", " 834, 1275, 1028, 1197, 724, 890, 1705, 505, 789,\n", " 2959, 518, 461, 1719, 2858, 3156, 2225, 2177, 1968,\n", " 1888, 1308, 2736, 1103, 557, 2195, 843, 1664, 723,\n", " 4508, 562, 501, 2018, 1076, 1202, 3301, 691, 1440,\n", " 1869, 1178, 418, 1820, 1413, 488, 1304, 363, 2108,\n", " 521, 1659, 87, 1411, 1528, 3292, 7058, 1578, 627,\n", " 874, 1996, 1488, 5679, 1234, 5603, 400, 889, 3268,\n", " 875, 949, 2265, 441, 742, 425, 2476, 2971, 614,\n", " 1816, 1375, 1405, 2297, 1062, 1113, 420, 2469, 658,\n", " 1951, 2670, 2578, 1995, 1032, 994, 1011, 2421, 1296,\n", " 155, 494, 426, 1086, 961, 2236, 1829, 764, 1834,\n", " 1054, 617, 1529, 2266, 637, 626, 1832, 1016, 2002,\n", " 1756, 746, 1285, 2690, 1118, 5332, 980, 1807, 970,\n", " 1228, 1195, 1132, 1768, 1384, 1080, 7063, 1817, 1452,\n", " 1975, 1368, 702, 1974, 1781, 1036, 944, 663, 364,\n", " 1539, 1345, 1680, 2209, 741, 1575, 695, 1317, 294,\n", " 1525, 424, 997, 1473, 1552, 2819, 2188, 1668, 3057,\n", " 799, 1502, 2606, 552, 1694, 1759, 1110, 399, 1470,\n", " 1174, 5877, 1474, 1688, 526, 686, 5908, 1107, 2070,\n", " 1468, 1246, 1685, 556, 1533, 1917, 1346, 732, 692,\n", " 579, 421, 362, 3505, 1855, 2711, 1586, 3739, 681,\n", " 1708, 2278, 1701, 722, 1482, 928, 827, 832, 527,\n", " 604, 173, 1341, 3329, 1553, 859, 167, 916, 828,\n", " 2082, 1176, 1108, 975, 3008, 1516, 2269, 1699, 2073,\n", " 1031, 1503, 2364, 1030, 1442, 5666, 2715, 1437, 2067,\n", " 1426, 2908, 1279, 866, 4283, 279, 2658, 3015, 2004,\n", " 1391, 4736, 748, 1466, 644, 683, 2705, 1297, 731,\n", " 1252, 2216, 3141, 3273, 1518, 1723, 1588, 972, 682,\n", " 1094, 668, 175, 967, 402, 3894, 1960, 1599, 2000,\n", " 2084, 1621, 714, 1109, 3989, 873, 1572, 1163, 1991,\n", " 1716, 1673, 2562, 2874, 965, 462, 605, 1948, 1736,\n", " 3518, 2054, 2467, 1681, 1272, 1205, 750, 2156, 2566,\n", " 115, 524, 3184, 676, 1678, 
612, 328, 955, 1441,\n", " 1675, 3965, 2909, 623, 822, 867, 3025, 1993, 792,\n", " 636, 4057, 3743, 2337, 2570, 2418, 2472, 3910, 1662,\n", " 2123, 2628, 3208, 2080, 3699, 2913, 864, 2505, 870,\n", " 7536, 1924, 1671, 1064, 1836, 1866, 4741, 841, 1369,\n", " 5681, 3112, 1366, 2223, 1198, 1039, 3811, 3571, 1387,\n", " 1171, 1365, 1531, 1590, 11706, 2308, 4860, 1641, 1045,\n", " 1901])" ] }, "execution_count": 1062, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A \"-\" placeholder is counted as a levy of 0 before the integer conversion.\n", "df[\"Levy\"] = df[\"Levy\"].replace(\"-\", \"0\").astype(\"int64\")\n", "df[\"Levy\"].unique()" ] }, { "cell_type": "code", "execution_count": 1063, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 6, 4, 8, 1, 12, 3, 2, 16, 5, 7, 9, 10, 14])" ] }, "execution_count": 1063, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The column is read as float64; store the cylinder counts as integers.\n", "df[\"Cylinders\"] = df[\"Cylinders\"].astype(\"int64\")\n", "df[\"Cylinders\"].unique()" ] }, { "cell_type": "code", "execution_count": 1064, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['04-May', '02-Mar', '>5'], dtype=object)" ] }, "execution_count": 1064, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Doors\"].unique()" ] }, { "cell_type": "code", "execution_count": 1065, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)" ] }, "execution_count": 1065, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Raw door codes mapped to readable labels; the codes look like door ranges that were mangled into dates (TODO confirm).\n", "door_labels = {\n", "    \"02-Mar\": \"Двухдверный\",\n", "    \"04-May\": \"Четырехдверный\",\n", "    \">5\": \"Многодверный\",\n", "}\n", "df[\"Doors\"] = df[\"Doors\"].map(door_labels)\n", "df[\"Doors\"].unique()" ] }, { "cell_type": "code", "execution_count": 1066, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 3, 6, ..., 627220, 872946, 26307500])" ] }, "execution_count": 1066, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted_df = df.sort_values(by=\"Price\")\n", 
"sorted_df[\"Price\"].unique()" ] }, { "cell_type": "code", "execution_count": 1067, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Количество строк до удаления некорректных значений: 19237\n", "Количество строк после удаления некорректных значений: 17574\n" ] } ], "source": [ "# Listings priced below this threshold are treated as invalid and removed.\n", "MIN_VALID_PRICE = 500\n", "\n", "print(f\"Количество строк до удаления некорректных значений: {len(df)}\")\n", "# .copy() makes the filtered frame independent, so later cells can add columns to it safely.\n", "df = df[df[\"Price\"] >= MIN_VALID_PRICE].copy()\n", "print(f\"Количество строк после удаления некорректных значений: {len(df)}\")" ] }, { "cell_type": "code", "execution_count": 1068, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 500, 549, 600, ..., 627220, 872946, 26307500])" ] }, "execution_count": 1068, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted_df = df.sort_values(by=\"Price\")\n", "sorted_df[\"Price\"].unique()" ] }, { "cell_type": "code", "execution_count": 1069, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,\n", " 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,\n", " 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,\n", " 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,\n", " 2014, 2015, 2016, 2017, 2018, 2019, 2020])" ] }, "execution_count": 1069, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sorted_df = df.sort_values(by=\"Prod. year\")\n", "sorted_df[\"Prod. year\"].unique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ручной синтез признаков." 
] }, { "cell_type": "code", "execution_count": 1070, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", " 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,\n", " 34, 35, 36, 37, 38, 39, 40, 42, 43, 46, 47, 52, 55, 56, 63, 67, 77])" ] }, "execution_count": 1070, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Age is counted back from this year.\n", "REFERENCE_YEAR = 2020\n", "\n", "# assign + drop build a new frame instead of writing a column into the filtered one.\n", "df = df.assign(Age=REFERENCE_YEAR - df[\"Prod. year\"]).drop(columns=\"Prod. year\")\n", "sorted_df = df.sort_values(by=\"Age\")\n", "sorted_df[\"Age\"].unique()" ] }, { "cell_type": "code", "execution_count": 1071, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PriceLevyManufacturerModelCategoryLeather interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbagsAge
0133281399LEXUSRX 450JeepYesHybrid3.51860056Automatic4x4ЧетырехдверныйLeft wheelSilver1210
1166211018CHEVROLETEquinoxJeepNoPetrol3.01920006Tiptronic4x4ЧетырехдверныйLeft wheelBlack89
284670HONDAFITHatchbackNoPetrol1.32000004VariatorFrontЧетырехдверныйRight-hand driveBlack214
33607862FORDEscapeJeepYesHybrid2.51689664Automatic4x4ЧетырехдверныйLeft wheelWhite09
411726446HONDAFITHatchbackYesPetrol1.3919014AutomaticFrontЧетырехдверныйLeft wheelSilver46
......................................................
1923158021055MERCEDES-BENZE 350SedanYesDiesel3.51078006AutomaticRearЧетырехдверныйLeft wheelGrey127
1923284670MERCEDES-BENZCLK 200CoupeYesCNG2.03000004ManualRearДвухдверныйLeft wheelSilver521
1923315681831HYUNDAISonataSedanYesPetrol2.41616004TiptronicFrontЧетырехдверныйLeft wheelRed89
1923426108836HYUNDAITucsonJeepYesDiesel2.01163654AutomaticFrontЧетырехдверныйLeft wheelGrey410
1923553311288CHEVROLETCaptivaJeepYesDiesel2.0512584AutomaticFrontЧетырехдверныйLeft wheelBlack413
\n", "

17574 rows × 17 columns

\n", "
" ], "text/plain": [ " Price Levy Manufacturer Model Category Leather interior \\\n", "0 13328 1399 LEXUS RX 450 Jeep Yes \n", "1 16621 1018 CHEVROLET Equinox Jeep No \n", "2 8467 0 HONDA FIT Hatchback No \n", "3 3607 862 FORD Escape Jeep Yes \n", "4 11726 446 HONDA FIT Hatchback Yes \n", "... ... ... ... ... ... ... \n", "19231 5802 1055 MERCEDES-BENZ E 350 Sedan Yes \n", "19232 8467 0 MERCEDES-BENZ CLK 200 Coupe Yes \n", "19233 15681 831 HYUNDAI Sonata Sedan Yes \n", "19234 26108 836 HYUNDAI Tucson Jeep Yes \n", "19235 5331 1288 CHEVROLET Captiva Jeep Yes \n", "\n", " Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels \\\n", "0 Hybrid 3.5 186005 6 Automatic 4x4 \n", "1 Petrol 3.0 192000 6 Tiptronic 4x4 \n", "2 Petrol 1.3 200000 4 Variator Front \n", "3 Hybrid 2.5 168966 4 Automatic 4x4 \n", "4 Petrol 1.3 91901 4 Automatic Front \n", "... ... ... ... ... ... ... \n", "19231 Diesel 3.5 107800 6 Automatic Rear \n", "19232 CNG 2.0 300000 4 Manual Rear \n", "19233 Petrol 2.4 161600 4 Tiptronic Front \n", "19234 Diesel 2.0 116365 4 Automatic Front \n", "19235 Diesel 2.0 51258 4 Automatic Front \n", "\n", " Doors Wheel Color Airbags Age \n", "0 Четырехдверный Left wheel Silver 12 10 \n", "1 Четырехдверный Left wheel Black 8 9 \n", "2 Четырехдверный Right-hand drive Black 2 14 \n", "3 Четырехдверный Left wheel White 0 9 \n", "4 Четырехдверный Left wheel Silver 4 6 \n", "... ... ... ... ... ... 
\n", "19231 Четырехдверный Left wheel Grey 12 7 \n", "19232 Двухдверный Left wheel Silver 5 21 \n", "19233 Четырехдверный Left wheel Red 8 9 \n", "19234 Четырехдверный Left wheel Grey 4 10 \n", "19235 Четырехдверный Left wheel Black 4 13 \n", "\n", "[17574 rows x 17 columns]" ] }, "execution_count": 1071, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Очистка дубликатов и пропущенных значений" ] }, { "cell_type": "code", "execution_count": 1072, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.int64(2773)" ] }, "execution_count": 1072, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 1073, "metadata": {}, "outputs": [], "source": [ "# Reassign the de-duplicated frame instead of mutating df in place.\n", "df = df.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 1074, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Price 0\n", "Levy 0\n", "Manufacturer 0\n", "Model 0\n", "Category 0\n", "Leather interior 0\n", "Fuel type 0\n", "Engine volume 0\n", "Mileage 0\n", "Cylinders 0\n", "Gear box type 0\n", "Drive wheels 0\n", "Doors 0\n", "Wheel 0\n", "Color 0\n", "Airbags 0\n", "Age 0\n", "dtype: int64" ] }, "execution_count": 1074, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isna().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Очистка выбросов" ] }, { "cell_type": "code", "execution_count": 1075, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Price int64\n", "Levy int64\n", "Manufacturer object\n", "Model object\n", "Category object\n", "Leather interior object\n", "Fuel type object\n", "Engine volume float64\n", "Mileage int64\n", "Cylinders int64\n", "Gear box type object\n", "Drive wheels object\n", "Doors object\n", "Wheel object\n", "Color object\n", "Airbags int64\n", "Age int64\n", "dtype: object" ] }, "execution_count": 1075, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 1076, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWkAAAGECAYAAAD0odESAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAiy0lEQVR4nO3df1BU973/8deyblESwBgV0WDguiapkfgriWIuoleRAPVm9YuT/pjRJE3yzUy8kwx2eou5Y2L7rdzeik2n8d7UySSZdKpJpSudQUjdoOL2Bm6uRtPQNBa9EtMIqFVZQcTNcr5/KHtDBNk1wH5gn48ZZjyf8/nsee/M2ZefOefsZ22WZVkCABgpJtIFAAB6R0gDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBhlRI79+/X8uWLdPEiRNls9lUVlYW9mv85je/0cyZMxUXF6fbb79dP/3pT/u/UADoJ0MqpNva2jRjxgxt2bLlhsZXVlbqO9/5jp566inV1dXp3//93/Wzn/1ML730Uj9XCgD9wzZUF1iy2WzauXOnXC5XsK2jo0PPPfectm/frvPnz2v69On6yU9+ooULF0qSvv3tb8vv92vHjh3BMb/4xS/0b//2bzpx4oRsNtsgvwsAuL4hNZPuy5o1a1RTU6M333xTf/zjH7Vy5Uo9+OCDqq+vl3QlxEeOHNltzKhRo/TXv/5Vn3zySSRKBoDrGjYhfeLECb322mvasWOHMjMzNWXKFH3ve9/T3//93+u1116TJOXk5MjtdquqqkqdnZ36y1/+opKSEklSY2NjJMsHgB6NiHQB/eXDDz9UIBDQHXfc0a29o6NDt956qyTpiSee0LFjx/SNb3xDfr9fCQkJeuaZZ/TCCy8oJmbY/H8FYBgZNiHd2toqu92ugwcPym63d9t38803S7pyHfsnP/mJNm7cqKamJo0bN05VVVWSpL/7u78b9JoBoC/DJqRnzZqlQCCgU6dOKTMz87p97Xa7Jk2aJEnavn27MjIyNG7cuMEoEwDCMqRCurW1VUePHg1uHz9+XIcPH9aYMWN0xx136Dvf+Y5WrVqlkpISzZo1S6dPn1ZVVZXuuece5efn68yZMyotLdXChQt16dKl4DXs6urqCL4rALgOawjZu3evJemav9WrV1uWZVmXL1+21q9fb6WmploOh8NKTk62li9fbv3xj3+0LMuyTp8+bc2bN8+66aabrLi4OGvx4sVWbW1tBN8RAFzfkH1OGgCiAY80AIDBhsQ16c7OTp08eVLx8fF8KxDAsGBZli5cuKCJEyde9xHgIRHSJ0+eVEpKSqTLAIB+9+mnn+q2227rdf+QCOn4+HhJV95MQkJChKvBcOb3+7V7924tXbpUDocj0uVgGPP5fEpJSQnmW2+GREh3XeJISEggpDGg/H6/4uLilJCQQEhjUPR1CZcbhwBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGGxIPCcNDIazZ88qMzNTn376qVJSUuT1ejVmzJhIl4UoR0gDkiZMmKDm5ubg9kcffaRbb71VSUlJampqimBliHZc7kDU+2JAz507Vxs2bNDcuXMlSc3NzZowYUIky0OUI6QR1c6ePRsM6AsXLsjr9WrGjBnyer26cOGCpCtBffbs2UiWiShGSCOqZWVlSZLmzZsX/MHiLjfffLPuv//+bv2AwUZII6qdPHlSkvTjH/+4x/0//OEPu/UDBhshjag2ceJESdJzzz3X4/7169d36wcMNkIaUa3rl+Jra2vV2trabV9ra6vee++9bv2AwUZII6qNGTNGSUlJkq78uMQDDzyg999/Xw888EBwMfakpCSel0bEDIlfC/f5fEpMTFRLSwuL/mNAfPk56S48J42BEmquMZMGJDU1Ne
lvf/ubpk2bpvj4eE2bNk1/+9vfCGhEHN84BK4aM2aMDh8+rIqKCuXl5fHzWTACM2kAMBghDQAGI6QBwGCENAAYjJAGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABgsrpIuLi3XfffcpPj5e48ePl8vl0pEjR/oct2PHDt11110aOXKk0tPTVVFRccMFA0A0CSukq6ur9fTTT6u2tlYej0d+v19Lly5VW1tbr2Peffddfetb39J3v/tdHTp0SC6XSy6XS3V1dV+5eAAY7r7SUqWnT5/W+PHjVV1drQULFvTY5+GHH1ZbW5vKy8uDbfPmzdPMmTP18ssvh3QclirFYPH7/SywhEERaq59pVXwWlpaJOm6C6LX1NSosLCwW1tOTo7Kysp6HdPR0aGOjo7gts/nk3TlA+T3+79CxcD1dZ1fnGcYaKGeYzcc0p2dnXr22Wf1wAMPaPr06b32a2pqCv7yRZe+FlIvLi7Whg0brmnfvXu34uLibrRkIGQejyfSJWCYu3jxYkj9bjikn376adXV1ekPf/jDjb5Er4qKirrNvn0+n1JSUrR06VIud2BA+f1+eTweZWdnc7kDA6rrCkFfbiik16xZo/Lycu3fv1+33Xbbdfv29LNEzc3NmjBhQq9jYmNjFRsbe027w+Hgg4NBwbmGgRbq+RXW0x2WZWnNmjXauXOn9uzZo7S0tD7HZGRkqKqqqlubx+NRRkZGOIcGgKgU1kz66aef1rZt2/S73/1O8fHxwevKiYmJGjVqlCRp1apVmjRpkoqLiyVJzzzzjLKyslRSUqL8/Hy9+eabOnDggLZu3drPbwUAhp+wZtL/8R//oZaWFi1cuFDJycnBv7feeivY58SJE2psbAxuz58/X9u2bdPWrVs1Y8YMlZaWqqys7Lo3GwEAV4Q1kw7lkep9+/Zd07Zy5UqtXLkynEMBAMTaHQBgNEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBCGngqkAgoOrqau3fv1/V1dUKBAKRLgkgpAFJcrvdcjqdys7O1ubNm5WdnS2n0ym32x3p0hDlCGlEPbfbrYKCAqWnp8vr9Wr79u3yer1KT09XQUEBQY2IslmWZUW6iL74fD4lJiaqpaVFCQkJkS4Hw0ggEJDT6VR6errKysoUCARUUVGhvLw82e12uVwu1dXVqb6+Xna7PdLlYhgJNdeYSSOqeb1eNTQ0aN26dYqJ6f5xiImJUVFRkY4fPy6v1xuhChHtCGlEtcbGRknS9OnTe9zf1d7VDxhshDSiWnJysiSprq6ux/1d7V39gMFGSCOqZWZmKjU1VRs3blRnZ2e3fZ2dnSouLlZaWpoyMzMjVCGiHSGNqGa321VSUqLy8nK5XC7V1taqvb1dtbW1crlcKi8v16ZNm7hpiIgZEekCgEhbsWKFSktLtXbtWi1YsCDYnpaWptLSUq1YsSKC1SHa8QgecFUgENDevXtVWVmp3NxcLVq0iBk0BkyoucZMGrjKbrcrKytLbW1tysrKIqBhBK5JA4DBCGkAMBghDQAGI6QBwGCENAAYjJAGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBCGkAMBghDQAGI6QBwGBhh/T+/fu1bNkyTZw4UTabTWVlZdftv2/fPtlstmv+mpqabrRmAIgaYYd0W1ubZsyYoS1btoQ17siRI2psbAz+jR8/PtxDA0DUGRHugNzcXOXm5oZ9oPHjx2v06NFhjwOAaBZ2SN+omTNnqqOjQ9OnT9cLL7ygBx54oNe+HR0d6ujoCG77fD5Jkt/vl9/vH/BaEb26zi/OMwy0UM+xAQ/p5ORkvfzyy7r33nvV0d
GhV155RQsXLtR//dd/afbs2T2OKS4u1oYNG65p3717t+Li4ga6ZEAejyfSJWCYu3jxYkj9bJZlWTd6EJvNpp07d8rlcoU1LisrS5MnT9avfvWrHvf3NJNOSUnRmTNnlJCQcKPlAn3y+/3yeDzKzs6Ww+GIdDkYxnw+n8aOHauWlpbr5tqgXe74ovvvv19/+MMfet0fGxur2NjYa9odDgcfHAwKzjUMtFDPr4g8J3348GElJydH4tAAMKSEPZNubW3V0aNHg9vHjx/X4cOHNWbMGE2ePFlFRUX67LPP9MYbb0iSXnzxRaWlpenuu+/WpUuX9Morr2jPnj3avXt3/70LABimwg7pAwcOaNGiRcHtwsJCSdLq1av1+uuvq7GxUSdOnAjuv3z5stauXavPPvtMcXFxuueee/TOO+90ew0AQM++0o3DweLz+ZSYmNjnBXbgq/L7/aqoqFBeXh7XpDGgQs011u4AAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBCGkAMBghDQAGI6QBwGCENAAYjJAGAIMR0sBVgUBA1dXV2r9/v6qrqxUIBCJdEkBIA5LkdrvldDqVnZ2tzZs3Kzs7W06nU263O9KlIcoR0oh6brdbBQUFSk9Pl9fr1fbt2+X1epWenq6CggKCGhFlsyzLinQRffH5fEpMTFRLS4sSEhIiXQ6GkUAgIKfTqfT0dJWVlSkQCKiiokJ5eXmy2+1yuVyqq6tTfX297HZ7pMvFMBJqrjGTRlTzer1qaGjQunXrFBPT/eMQExOjoqIiHT9+XF6vN0IVItoR0ohqjY2NkqTp06f3uL+rvasfMNgIaUS15ORkSVJdXV2P+7vau/oBg42QRlTLzMxUamqqNm7cqM7Ozm77Ojs7VVxcrLS0NGVmZkaoQkQ7QhpRzW63q6SkROXl5XK5XKqtrVV7e7tqa2vlcrlUXl6uTZs2cdMQETMi0gUAkbZixQqVlpZq7dq1WrBgQbA9LS1NpaWlWrFiRQSrQ7TjETzgqkAgoL1796qyslK5ublatGgRM2gMmFBzjZk0cJXdbldWVpba2tqUlZVFQMMIXJMGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ1cxaL/MBEhDYhF/2EuQhpRj0X/YTK+cYioxqL/iBQW/QdCwKL/MB0hjajGov8wHSGNqMai/zAdIY2oxqL/MB0hjajGov8wHUuVIuqx6D9MxiN4wFUs+o/BxKL/QJhY9B8m4po0ABiMkAYAgxHSAGAwQhoADMaNQ+CqlpYW5ebmqr6+XlOnTlVlZaUSExMjXRaiXNgz6f3792vZsmWaOHGibDabysrK+hyzb98+zZ49W7GxsXI6nXr99ddvoFRg4DidTo0ePVo1NTU6c+aMampqNHr0aDmdzkiXhigXdki3tbVpxowZ2rJlS0j9jx8/rvz8fC1atEiHDx/Ws88+q8cff1y///3vwy4WGAhOp1PHjh2TJOXk5Ohf//VflZOTI0k6duwYQY2I+kpfZrHZbNq5c6dcLlevff75n/9Zu3bt6raAzTe/+U2dP39eb7/9dkjH4cssGCgtLS0aPXq0pCsTEIfDEVxP2u/366abbpIknT9/nksf6FfGfJmlpqZGS5Ys6daWk5OjZ599ttcxHR0d6ujoCG77fD5Jkt/vl9/vH5A6EZ1yc3MlXTknHQ5H8Pzy+/1yOBzKzs6Wx+NRbm6uqqurI1kqhplQs2zAQ7qpqUlJSUnd2pKSkuTz+dTe3q5Ro0ZdM6a4uFgbNmy4pn337t2Ki4sbsFoRferr6yVJixYtUkVFRbDd4/FIkhYuXCiPx6P6+vpu+4Gv6uLFiyH1M/LpjqKiIhUWFga3fT6fUlJStHTpUi53oF9NnTpVZ86c0d69e1VYWCi/3y+Px6Ps7Gw5HI7gvZepU6
cqLy8vwtViOOm6QtCXAQ/pCRMmqLm5uVtbc3OzEhISepxFS1JsbKxiY2OvaXc4HHI4HANSJ6JTZWWlRo8erd///vfBSxySgpc+umbUlZWVnHvoV6GeTwP+ZZaMjAxVVVV1a/N4PMrIyBjoQwN9SkxM1JQpUyRJN910k/Lz8/WnP/1J+fn5wZuGU6ZM4aYhIibsmXRra6uOHj0a3D5+/LgOHz6sMWPGaPLkySoqKtJnn32mN954Q5L01FNP6aWXXtL3v/99PfbYY9qzZ49+85vfaNeuXf33LoCv4OjRo8HH8DweT3D2LF0J6C+e78BgC3smfeDAAc2aNUuzZs2SJBUWFmrWrFlav369pCs/2HnixIlg/7S0NO3atUsej0czZsxQSUmJXnnlleBzqIAJjh49qvPnzysjI0Njx45VRkaGzp8/T0Aj4lj0H/gCv98ffE6aa9AYSKHmGgssAYDBCGkAMBghDQAGI6QBwGCENAAYjJAGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpIGrAoGAqqurtX//flVXVysQCES6JICQBiTJ7XbL6XQqOztbmzdvVnZ2tpxOp9xud6RLQ5QjpBH13G63CgoKlJ6eLq/Xq+3bt8vr9So9PV0FBQUENSLKZlmWFeki+hLqT58D4QoEAnI6nUpPT1dZWZkCgYAqKiqUl5cnu90ul8uluro61dfXy263R7pcDCOh5hozaUQ1r9erhoYGrVu3TjEx3T8OMTExKioq0vHjx+X1eiNUIaIdIY2o1tjYKEmaPn16j/u72rv6AYONkEZUS05OliTV1dX1uL+rvasfMNgIaUS1zMxMpaamauPGjers7Oy2r7OzU8XFxUpLS1NmZmaEKkS0I6QR1ex2u0pKSlReXi6Xy6Xa2lq1t7ertrZWLpdL5eXl2rRpEzcNETEjIl0AEGkrVqxQaWmp1q5dqwULFgTb09LSVFpaqhUrVkSwOkQ7HsEDrgoEAtq7d68qKyuVm5urRYsWMYPGgAk115hJA1fZ7XZlZWWpra1NWVlZBDSMwDVpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBCGkAMBghDQAGI6QBwGCENAAYjJAGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGOyGQnrLli1KTU3VyJEjNXfuXL333nu99n399ddls9m6/Y0cOfKGCwaAaBJ2SL/11lsqLCzU888/r/fff18zZsxQTk6OTp061euYhIQENTY2Bv8++eSTr1Q0AESLEeEO2Lx5s5544gk9+uijkqSXX35Zu3bt0quvvqof/OAHPY6x2WyaMGFCyMfo6OhQR0dHcNvn80mS/H6//H5/uCUDIes6vzjPMNBCPcfCCunLly/r4MGDKioqCrbFxMRoyZIlqqmp6XVca2urbr/9dnV2dmr27NnauHGj7r777l77FxcXa8OGDde07969W3FxceGUDNwQj8cT6RIwzF28eDGkfmGF9JkzZxQIBJSUlNStPSkpSR9//HGPY+688069+uqruueee9TS0qJNmzZp/vz5+tOf/qTbbrutxzFFRUUqLCwMbvt8PqWkpGjp0qVKSEgIp2QgLH6/Xx6PR9nZ2XI4HJEuB8NY1xWCvoR9uSNcGRkZysjICG7Pnz9fX//61/XLX/5SP/rRj3ocExsbq9jY2GvaHQ4HHxwMCs41DLRQz6+wbhyOHTtWdrtdzc3N3dqbm5tDvubscDg0a9YsHT16NJxDA0BUCiukv/a1r2nOnDmqqqoKtnV2dqqqqqrbbPl6AoGAPvzwQyUnJ4dXKQBEobAvdxQWFmr16tW69957df/99+vFF19UW1tb8GmPVatWadKkSSouLpYk/fCHP9S8efPkdDp1/v
x5/fSnP9Unn3yixx9/vH/fCQAMQ2GH9MMPP6zTp09r/fr1ampq0syZM/X2228HbyaeOHFCMTH/O0E/d+6cnnjiCTU1NemWW27RnDlz9O6772ratGn99y4AYJiyWZZlRbqIvvh8PiUmJqqlpYWnOzCg/H6/KioqlJeXx41DDKhQc421OwDAYIQ0ABiMkAYAgxHSAGAwQhoADEZIA4DBCGkAMBghDQAGI6QBwGCENAAYjJAGAIMR0gBgMEIaAAxGSAOAwQhpADAYIQ0ABiOkAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAYjpAHAYIQ0ABiMkAauCgQCqq6u1v79+1VdXa1AIBDpkgBCGpAkt9stp9Op7Oxsbd68WdnZ2XI6nXK73ZEuDVGOkEbUc7vdKigoUHp6urxer7Zv3y6v16v09HQVFBQQ1Igom2VZVqSL6IvP51NiYqJaWlqUkJAQ6XIwjAQCATmdTqWnp6usrEyBQEAVFRXKy8uT3W6Xy+VSXV2d6uvrZbfbI10uhpFQc42ZNKKa1+tVQ0OD1q1bp5iY7h+HmJgYFRUV6fjx4/J6vRGqENGOkEZUa2xslCRNnz69x/1d7V39gMFGSCOqJScnS5Lq6up63N/V3tUPGGyENKJaZmamUlNTtXHjRnV2dnbb19nZqeLiYqWlpSkzMzNCFSLaEdKIana7XSUlJSovL5fL5VJtba3a29tVW1srl8ul8vJybdq0iZuGiJgRkS4AiLQVK1aotLRUa9eu1YIFC4LtaWlpKi0t1YoVKyJYHaIdj+ABVwUCAe3du1eVlZXKzc3VokWLmEFjwISaa8ykgavsdruysrLU1tamrKwsAhpG4Jo0ABiMkAYAgxHSAGAwQhoADEZIA1exnjRMREgDYj1pmIuQRtRjPWmYjC+zIKqxnjQihfWkgRCwnjRMR0gjqrGeNExHSCOqsZ40TEdII6p9cT1pv9/f7RE8v9/PetKIOBZYQlTrWk+6oKBAiYmJam9vlyRt3rxZo0aN0qVLl1RaWspNQ0QMM2lAkmVZunTpUre2S5cuaQg8/IRhjkfwENUCgYAmTpyoU6dOKS8vT1OmTNGRI0d055136tixY6qoqND48eN18uRJZtPoV6HmGiGNqFZVVaUlS5borrvu0qVLl9TQ0BDcl5qaqpEjR+rjjz/WO++8o8WLF0euUAw7PCcNhGDfvn2SpCNHjmjatGlavny50tPTtXz5ck2bNk1Hjhzp1g8YbNw4RFTr+oXw0aNHq6KiItj+4YcfSpJuueUWnTt37ppfEgcGCzNpRLUxY8ZIks6dO9fj/q72rn7AYCOkEdVCDV9CGpFCSCOq7dy5s1/7Af3thkJ6y5YtwTvfc+fO1XvvvXfd/jt27NBdd92lkSNHKj09vdu1PyCSvvg0R3/0A/pb2CH91ltvqbCwUM8//7zef/99zZgxQzk5OTp16lSP/d99911961vf0ne/+10dOnRILpcruPwjEGldNwi7/PznP9eaNWv085///Lr9gMES9nPSc+fO1X333aeXXnpJ0pW74ykpKfqnf/on/eAHP7im/8MPP6y2tjaVl5cH2+bNm6eZM2fq5Zdf7vEYHR0d6ujoCG77fD6lpKTozJkzPCcd5c62XVbZhx+r9fOeb/R9WduFFh2tO9Tr/h1bS4L/vunmBLW1+v53Oz5RbRdagtsrn1zb42s4p8/STfGJIdUz9dZk5X79jpD6Ynjz+XwaO3Zsn89Jh/UI3uXLl3Xw4EEVFRUF22JiYrRkyRLV1NT0OKampkaFhYXd2nJyclRWVtbrcYqLi7Vhw4Zr2nfv3q24uLhwSsYwU9Nsk9u3R7HjqkIfNKn3Xc4Nzi+1jP/S9rjgvw7pdz2+xqFzv5NC+z9DHQcXq+mjRUoaFVp/DF8XL14MqV9YIX3mzBkFAgElJSV1a09KStLHH3/c45impqYe+zc1NfV6nKKiom7B3jWTXrp0KTPpKDev7bLSPp
yk1s+/EVL/cGbSfemXmfQsZtK4wufz9d1Jhn6ZJTY2VrGxsde0OxwOORyOCFQEUySNduj/Zs4Jb9A//p9ed/3quR+H/DJvrPt/4R0XuI5QsyysG4djx46V3W5Xc3Nzt/bm5mZNmDChxzETJkwIqz8wmEK9JTMElrjBMBVWSH/ta1/TnDlzVFX1v9cDOzs7VVVVpYyMjB7HZGRkdOsvSR6Pp9f+wGDrK4AJaERS2Jc7CgsLtXr1at177726//779eKLL6qtrU2PPvqoJGnVqlWaNGmSiouLJUnPPPOMsrKyVFJSovz8fL355ps6cOCAtm7d2r/vBPgKLMuSzWbrsR2IpLBD+uGHH9bp06e1fv16NTU1aebMmXr77beDNwdPnDjR7VeX58+fr23btulf/uVftG7dOk2dOlVlZWW9/vAnECmWZcnv96uiokJ5eXnc/4ARWE8a+AJCGoOF9aQBYBggpAHAYIQ0ABiMkAYAgxHSAGAwQhoADGbk2h1f1vWUYKgLkgA3yu/36+LFi/L5fDyChwHVlWd9PQU9JEL6woULkqSUlJQIVwIA/evChQtKTOx9FcUh8WWWzs5OnTx5UvHx8T1+dRfoL13L4n766ad8cQoDyrIsXbhwQRMnTuz2Le0vGxIhDQwWvt0K03DjEAAMRkgDgMEIaeALYmNj9fzzz/f4y0BAJHBNGgAMxkwaAAxGSAOAwQhpADAYIQ0ABiOkMSwtXLhQzz77bHA7NTVVL774YsTqAW4UIY0h45FHHpHNZtNTTz11zb6nn35aNptNjzzyiCTJ7XbrRz/60SBXCPQ/QhpDSkpKit588021t7cH2y5duqRt27Zp8uTJwbYxY8YoPj4+EiUC/YqQxpAye/ZspaSkyO12B9vcbrcmT56sWbNmBdu+fLnjy86fP6/HH39c48aNU0JCgv7hH/5BH3zwQXD/sWPH9NBDDykpKUk333yz7rvvPr3zzjvdXqOxsVH5+fkaNWqU0tLStG3btmsuq/R1HKAvhDSGnMcee0yvvfZacPvVV1/Vo48+GtZrrFy5UqdOnVJlZaUOHjyo2bNna/HixTp79qwkqbW1VXl5eaqqqtKhQ4f04IMPatmyZTpx4kTwNVatWqWTJ09q3759+u1vf6utW7fq1KlTYR0H6JMFDBGrV6+2HnroIevUqVNWbGys1dDQYDU0NFgjR460Tp8+bT300EPW6tWrLcuyrKysLOuZZ54Jjr399tutn/3sZ5ZlWZbX67USEhKsS5cudXv9KVOmWL/85S97Pf7dd99t/eIXv7Asy7L+/Oc/W5Ks//7v/w7ur6+vtyR95eMAXzQkFv0HvmjcuHHKz8/X66+/LsuylJ+fr7Fjx4Y8/oMPPlBra6tuvfXWbu3t7e06duyYpCsz6RdeeEG7du1SY2OjPv/8c7W3twdn0keOHNGIESM0e/bs4Hin06lbbrklrOMAfSGkMSQ99thjWrNmjSRpy5YtYY1tbW1VcnKy9u3bd82+0aNHS5K+973vyePxaNOmTXI6nRo1apQKCgp0+fLlfj0O0BdCGkPSgw8+qMuXL8tmsyknJyessbNnz1ZTU5NGjBih1NTUHvv853/+px555BEtX75c0pXAbWhoCO6/88479fnnn+vQoUOaM2eOJOno0aM6d+5cWMcB+sKNQwxJdrtdf/7zn/XRRx/JbreHNXbJkiXKyMiQy+XS7t271dDQoHfffVfPPfecDhw4IEmaOnWq3G63Dh8+rA8++EDf/va31dnZGXyNu+66S0uWLNGTTz6p9957T4cOHdKTTz6pUaNGBX/iLZTjAH0hpDFkJSQk3NBPXNlsNlVUVGjBggV69NFHdccdd+ib3/ymPvnkEyUlJUmSNm/erFtuuUXz58/XsmXLlJOT0+36syS98cYbSkpK0oIFC7R8+XI98cQTio+P18iRI0M+DtAX1pMG+slf//pXpaSk6J133tHixYsjXQ6GCUIauEF79uxRa2ur0tPT1djYqO
9///v67LPP9Je//EUOhyPS5WGY4MYhcIP8fr/WrVun//mf/1F8fLzmz5+vX//61wQ0+hUzaQAwGDcOAcBghDQAGIyQBgCDEdIAYDBCGgAMRkgDgMEIaQAwGCENAAb7/8GR/bp5ZnbuAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "numeric_features_with_outliers = [\n", " \"Price\",\n", " \"Levy\",\n", " \"Mileage\",\n", " \"Age\",\n", "]\n", "\n", "i = 1\n", "for col in numeric_features_with_outliers:\n", " plt.figure(figsize=(4, 30))\n", " plt.subplot(6, 1, i)\n", " df.boxplot(column=col)\n", " i += 1" ] }, { "cell_type": "code", "execution_count": 1077, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Количество строк до удаления выбросов: 14801\n", "Количество строк после удаления выбросов: 12597\n" ] } ], "source": [ "def remove_outliers(df, column):\n", " Q1 = df[column].quantile(0.25)\n", " Q3 = df[column].quantile(0.75)\n", " IQR = Q3 - Q1\n", " lower_bound = Q1 - 1.5 * IQR\n", " upper_bound = Q3 + 1.5 * IQR\n", " return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n", "\n", "print(f\"Количество строк до удаления выбросов: {len(df)}\")\n", "\n", "for column in numeric_features_with_outliers:\n", " df = remove_outliers(df, column)\n", "\n", "print(f\"Количество строк после удаления выбросов: {len(df)}\")" ] }, { "cell_type": "code", "execution_count": 1078, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "i = 1\n", "for col in numeric_features_with_outliers:\n", " plt.figure(figsize=(4, 30))\n", " plt.subplot(6, 1, i)\n", " df.boxplot(column=col)\n", " i += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Разбиение на выборки" ] }, { "cell_type": "code", "execution_count": 1079, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Размеры выборок:\n", "Обучающая выборка: 10077 записей\n", "Тестовая выборка: 2520 записей\n" ] } ], "source": [ "train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)\n", "\n", "print(\"Размеры выборок:\")\n", "print(f\"Обучающая выборка: {train_df.shape[0]} записей\")\n", "print(f\"Тестовая выборка: {test_df.shape[0]} записей\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Дискретизация числовых признаков" ] }, { "cell_type": "code", "execution_count": 1080, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Price int64\n", "Levy int64\n", "Manufacturer object\n", "Model object\n", "Category object\n", "Leather interior object\n", "Fuel type object\n", "Engine volume float64\n", "Mileage int64\n", "Cylinders int64\n", "Gear box type object\n", "Drive wheels object\n", "Doors object\n", "Wheel object\n", "Color object\n", "Airbags int64\n", "Age int64\n", "dtype: object" ] }, "execution_count": 1080, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.dtypes" ] }, { "cell_type": "code", "execution_count": 1081, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PriceLevyManufacturerModelCategoryLeather interiorFuel typeEngine volumeMileageCylindersGear box typeDrive wheelsDoorsWheelColorAirbagsAgeAge_bin
148296743966DAEWOOLacettiSedanYesDiesel2.0622274AutomaticFrontЧетырехдверныйLeft wheelWhite411Старый
363220005583HYUNDAIElantraSedanYesPetrol1.6944794AutomaticFrontЧетырехдверныйLeft wheelRed49Средний
498213172836DODGECaliberHatchbackNoPetrol2.01140004VariatorFrontЧетырехдверныйLeft wheelSilver810Средний
167588781584HYUNDAIElantraSedanYesPetrol1.8600004TiptronicFrontЧетырехдверныйLeft wheelGrey106Средний
6875250860TOYOTAPriusHatchbackNoHybrid1.804AutomaticFrontЧетырехдверныйLeft wheelSilver125Новый
.........................................................
18201103490AUDIA4SedanYesPetrol2.41500006Manual4x4ЧетырехдверныйLeft wheelGrey413Старый
74362038765KIAAvellaSedanYesPetrol2.01256214AutomaticFrontЧетырехдверныйLeft wheelSilver125Новый
772813485843TOYOTAPriusHatchbackNoHybrid1.52120004VariatorFrontЧетырехдверныйLeft wheelSilver812Старый
1136156770FORDFiestaSedanNoPetrol1.6748004AutomaticFrontЧетырехдверныйLeft wheelSilver84Новый
1064016308751KIAOptima EXSedanYesPetrol2.49200012TiptronicFrontЧетырехдверныйLeft wheelSilver87Средний
\n", "

10077 rows × 18 columns

\n", "
" ], "text/plain": [ " Price Levy Manufacturer Model Category Leather interior \\\n", "14829 6743 966 DAEWOO Lacetti Sedan Yes \n", "3632 20005 583 HYUNDAI Elantra Sedan Yes \n", "4982 13172 836 DODGE Caliber Hatchback No \n", "16758 8781 584 HYUNDAI Elantra Sedan Yes \n", "6875 25086 0 TOYOTA Prius Hatchback No \n", "... ... ... ... ... ... ... \n", "18201 10349 0 AUDI A4 Sedan Yes \n", "7436 2038 765 KIA Avella Sedan Yes \n", "7728 13485 843 TOYOTA Prius Hatchback No \n", "1136 15677 0 FORD Fiesta Sedan No \n", "10640 16308 751 KIA Optima EX Sedan Yes \n", "\n", " Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels \\\n", "14829 Diesel 2.0 62227 4 Automatic Front \n", "3632 Petrol 1.6 94479 4 Automatic Front \n", "4982 Petrol 2.0 114000 4 Variator Front \n", "16758 Petrol 1.8 60000 4 Tiptronic Front \n", "6875 Hybrid 1.8 0 4 Automatic Front \n", "... ... ... ... ... ... ... \n", "18201 Petrol 2.4 150000 6 Manual 4x4 \n", "7436 Petrol 2.0 125621 4 Automatic Front \n", "7728 Hybrid 1.5 212000 4 Variator Front \n", "1136 Petrol 1.6 74800 4 Automatic Front \n", "10640 Petrol 2.4 92000 12 Tiptronic Front \n", "\n", " Doors Wheel Color Airbags Age Age_bin \n", "14829 Четырехдверный Left wheel White 4 11 Старый \n", "3632 Четырехдверный Left wheel Red 4 9 Средний \n", "4982 Четырехдверный Left wheel Silver 8 10 Средний \n", "16758 Четырехдверный Left wheel Grey 10 6 Средний \n", "6875 Четырехдверный Left wheel Silver 12 5 Новый \n", "... ... ... ... ... ... ... 
# %% Discretization of numeric features
numeric_features_for_discritization = ["Age"]


def discretize_features(
    df,
    features,
    bins=4,
    labels=("Новый", "Средний", "Старый", "Очень старый"),
    return_bins=False,
):
    """Add a categorical ``<feature>_bin`` column for every listed feature.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is untouched.
        features: Names of the numeric columns to discretize.
        bins: Anything ``pd.cut`` accepts as ``bins`` (an int gives equal-width
            bins over the column's own range), or a dict mapping a feature
            name to its bins. Pass the edges learned on the training set here
            so that other splits are cut at exactly the same thresholds.
        labels: Category labels, one per bin.
        return_bins: When True, also return the edges that were used.

    Returns:
        The DataFrame with the new columns, or ``(DataFrame, edges)`` when
        ``return_bins`` is True. ``edges`` maps each successfully binned
        feature to its bin edges, with the outermost edges widened to
        -inf / +inf so values outside the fitted range still get a bin.

    A feature that cannot be binned is reported with a printed message and
    skipped; the remaining features are still processed.
    """
    result = df.copy()
    label_values = list(labels) if isinstance(labels, (list, tuple)) else labels
    learned_edges = {}

    for feature in features:
        try:
            feature_bins = bins[feature] if isinstance(bins, dict) else bins
            binned, edges = pd.cut(
                result[feature],
                bins=feature_bins,
                labels=label_values,
                retbins=True,
            )
            # Open the outer edges so the same thresholds can be applied to
            # data whose min/max differ from the data they were learned on.
            open_edges = [float(edge) for edge in edges]
            open_edges[0] = float("-inf")
            open_edges[-1] = float("inf")
        except (KeyError, TypeError, ValueError) as e:
            print(f"Ошибка при дискретизации признака {feature}: {e}")
            continue
        result[f"{feature}_bin"] = binned
        learned_edges[feature] = open_edges

    if return_bins:
        return result, learned_edges
    return result


# Learn the bin edges on the training split only and reuse them for the test
# split. Cutting each split with bins=4 independently derives the edges from
# each split's own min/max, so the same Age could get different categories.
train_df, bin_edges_by_feature = discretize_features(
    train_df, numeric_features_for_discritization, return_bins=True
)
test_df = discretize_features(
    test_df, numeric_features_for_discritization, bins=bin_edges_by_feature
)

train_df

# %% Column dtypes before one-hot encoding
train_df.dtypes
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PriceLevyManufacturerModelEngine volumeMileageCylindersColorAirbagsAge...Drive wheels_RearDoors_ДвухдверныйDoors_МногодверныйDoors_ЧетырехдверныйWheel_Left wheelWheel_Right-hand driveAge_bin_НовыйAge_bin_СреднийAge_bin_СтарыйAge_bin_Очень старый
148296743966DAEWOOLacetti2.0622274White411...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
363220005583HYUNDAIElantra1.6944794Red49...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
498213172836DODGECaliber2.01140004Silver810...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
167588781584HYUNDAIElantra1.8600004Grey106...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
6875250860TOYOTAPrius1.804Silver125...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
..................................................................
18201103490AUDIA42.41500006Grey413...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
74362038765KIAAvella2.01256214Silver125...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
772813485843TOYOTAPrius1.52120004Silver812...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
1136156770FORDFiesta1.6748004Silver84...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
1064016308751KIAOptima EX2.49200012Silver87...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
\n", "

10077 rows × 46 columns

\n", "
" ], "text/plain": [ " Price Levy Manufacturer Model Engine volume Mileage Cylinders \\\n", "14829 6743 966 DAEWOO Lacetti 2.0 62227 4 \n", "3632 20005 583 HYUNDAI Elantra 1.6 94479 4 \n", "4982 13172 836 DODGE Caliber 2.0 114000 4 \n", "16758 8781 584 HYUNDAI Elantra 1.8 60000 4 \n", "6875 25086 0 TOYOTA Prius 1.8 0 4 \n", "... ... ... ... ... ... ... ... \n", "18201 10349 0 AUDI A4 2.4 150000 6 \n", "7436 2038 765 KIA Avella 2.0 125621 4 \n", "7728 13485 843 TOYOTA Prius 1.5 212000 4 \n", "1136 15677 0 FORD Fiesta 1.6 74800 4 \n", "10640 16308 751 KIA Optima EX 2.4 92000 12 \n", "\n", " Color Airbags Age ... Drive wheels_Rear Doors_Двухдверный \\\n", "14829 White 4 11 ... False False \n", "3632 Red 4 9 ... False False \n", "4982 Silver 8 10 ... False False \n", "16758 Grey 10 6 ... False False \n", "6875 Silver 12 5 ... False False \n", "... ... ... ... ... ... ... \n", "18201 Grey 4 13 ... False False \n", "7436 Silver 12 5 ... False False \n", "7728 Silver 8 12 ... False False \n", "1136 Silver 8 4 ... False False \n", "10640 Silver 8 7 ... False False \n", "\n", " Doors_Многодверный Doors_Четырехдверный Wheel_Left wheel \\\n", "14829 False True True \n", "3632 False True True \n", "4982 False True True \n", "16758 False True True \n", "6875 False True True \n", "... ... ... ... \n", "18201 False True True \n", "7436 False True True \n", "7728 False True True \n", "1136 False True True \n", "10640 False True True \n", "\n", " Wheel_Right-hand drive Age_bin_Новый Age_bin_Средний Age_bin_Старый \\\n", "14829 False False False True \n", "3632 False False True False \n", "4982 False False True False \n", "16758 False False True False \n", "6875 False True False False \n", "... ... ... ... ... 
# %% One-hot encoding of categorical features
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]

train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)

# get_dummies only creates columns for the categories present in the frame it
# is given, so the two splits can end up with different column sets. Align the
# test split to the training columns: indicators missing from test are added
# as all-False, and indicators for categories never seen in train are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=False)
assert list(test_df.columns) == list(train_df.columns)

train_df

# %% Column dtypes before scaling
train_df.dtypes
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PriceLevyManufacturerModelEngine volumeMileageCylindersColorAirbagsAge...Drive wheels_RearDoors_ДвухдверныйDoors_МногодверныйDoors_ЧетырехдверныйWheel_Left wheelWheel_Right-hand driveAge_bin_НовыйAge_bin_СреднийAge_bin_СтарыйAge_bin_Очень старый
14829-0.9364280.909873DAEWOOLacetti-0.212078-0.855905-0.399820White-0.6814910.446831...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
36320.2881470.076376HYUNDAIElantra-0.757467-0.422001-0.399820Red-0.6814910.013523...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
4982-0.3427930.626963DODGECaliber-0.212078-0.159374-0.399820Silver0.3307630.230177...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
16758-0.7482450.078552HYUNDAIElantra-0.484772-0.885866-0.399820Grey0.836890-0.636438...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
68750.757313-1.192368TOYOTAPrius-0.484772-1.693079-0.399820Silver1.343017-0.853091...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
..................................................................
18201-0.603461-1.192368AUDIA40.3333120.3249541.520116Grey-0.6814910.880138...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
7436-1.3708750.472450KIAAvella-0.212078-0.003030-0.399820Silver1.343017-0.853091...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
7728-0.3138910.642196TOYOTAPrius-0.8938141.159074-0.399820Silver0.3307630.663484...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
1136-0.111488-1.192368FORDFiesta-0.757467-0.686753-0.399820Silver0.330763-1.069745...FalseFalseFalseTrueTrueFalseTrueFalseFalseFalse
10640-0.0532230.441983KIAOptima EX0.333312-0.4553527.279922Silver0.330763-0.419784...FalseFalseFalseTrueTrueFalseFalseTrueFalseFalse
\n", "

10077 rows × 46 columns

\n", "
" ], "text/plain": [ " Price Levy Manufacturer Model Engine volume Mileage \\\n", "14829 -0.936428 0.909873 DAEWOO Lacetti -0.212078 -0.855905 \n", "3632 0.288147 0.076376 HYUNDAI Elantra -0.757467 -0.422001 \n", "4982 -0.342793 0.626963 DODGE Caliber -0.212078 -0.159374 \n", "16758 -0.748245 0.078552 HYUNDAI Elantra -0.484772 -0.885866 \n", "6875 0.757313 -1.192368 TOYOTA Prius -0.484772 -1.693079 \n", "... ... ... ... ... ... ... \n", "18201 -0.603461 -1.192368 AUDI A4 0.333312 0.324954 \n", "7436 -1.370875 0.472450 KIA Avella -0.212078 -0.003030 \n", "7728 -0.313891 0.642196 TOYOTA Prius -0.893814 1.159074 \n", "1136 -0.111488 -1.192368 FORD Fiesta -0.757467 -0.686753 \n", "10640 -0.053223 0.441983 KIA Optima EX 0.333312 -0.455352 \n", "\n", " Cylinders Color Airbags Age ... Drive wheels_Rear \\\n", "14829 -0.399820 White -0.681491 0.446831 ... False \n", "3632 -0.399820 Red -0.681491 0.013523 ... False \n", "4982 -0.399820 Silver 0.330763 0.230177 ... False \n", "16758 -0.399820 Grey 0.836890 -0.636438 ... False \n", "6875 -0.399820 Silver 1.343017 -0.853091 ... False \n", "... ... ... ... ... ... ... \n", "18201 1.520116 Grey -0.681491 0.880138 ... False \n", "7436 -0.399820 Silver 1.343017 -0.853091 ... False \n", "7728 -0.399820 Silver 0.330763 0.663484 ... False \n", "1136 -0.399820 Silver 0.330763 -1.069745 ... False \n", "10640 7.279922 Silver 0.330763 -0.419784 ... False \n", "\n", " Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный \\\n", "14829 False False True \n", "3632 False False True \n", "4982 False False True \n", "16758 False False True \n", "6875 False False True \n", "... ... ... ... 
# %% Feature scaling
# Numeric columns that are standardized.
# NOTE(review): "Price" is scaled together with the other columns; if it is
# the prediction target, model outputs will be in standardized units — confirm
# this is intended.
numeric_features_for_stardartization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

# Estimate the scaling statistics on the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(train_df[numeric_features_for_stardartization])

train_df[numeric_features_for_stardartization] = scaler.transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)

train_df
warnings.warn(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " pd.to_datetime(\n", "c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
# %% Feature construction with Featuretools
es = ft.EntitySet(id="car_data")

# train_df has no "id" column. make_index=True asks Featuretools to create the
# integer index explicitly instead of relying on the fallback that emits
# "index id not found in dataframe, creating new integer column".
# NOTE(review): the woodwork "Could not infer format" warnings presumably come
# from type inference on the remaining object columns (Manufacturer, Model,
# Color); passing explicit logical_types for them should silence these — confirm.
es = es.add_dataframe(
    dataframe_name="train",
    dataframe=train_df,
    index="id",
    make_index=True,
)

# max_depth=1 limits Deep Feature Synthesis to a single level of primitives.
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)

# %% Generated feature definitions
feature_defs