2972 lines
220 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"source": [
"#### Загрузка набора данных"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Prod. year</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005 km</td>\n",
" <td>6.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>3</td>\n",
" <td>192000 km</td>\n",
" <td>6.0</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8467</td>\n",
" <td>-</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000 km</td>\n",
" <td>4.0</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19232</th>\n",
" <td>8467</td>\n",
" <td>-</td>\n",
" <td>MERCEDES-BENZ</td>\n",
" <td>CLK 200</td>\n",
" <td>1999</td>\n",
" <td>Coupe</td>\n",
" <td>Yes</td>\n",
" <td>CNG</td>\n",
" <td>2.0 Turbo</td>\n",
" <td>300000 km</td>\n",
" <td>4.0</td>\n",
" <td>Manual</td>\n",
" <td>Rear</td>\n",
" <td>02-Mar</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19233</th>\n",
" <td>15681</td>\n",
" <td>831</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Sonata</td>\n",
" <td>2011</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.4</td>\n",
" <td>161600 km</td>\n",
" <td>4.0</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Red</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19234</th>\n",
" <td>26108</td>\n",
" <td>836</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Tucson</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2</td>\n",
" <td>116365 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19235</th>\n",
" <td>5331</td>\n",
" <td>1288</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Captiva</td>\n",
" <td>2007</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2</td>\n",
" <td>51258 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19236</th>\n",
" <td>470</td>\n",
" <td>753</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Sonata</td>\n",
" <td>2012</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.4</td>\n",
" <td>186923 km</td>\n",
" <td>4.0</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>04-May</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19237 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Prod. year Category \\\n",
"0 13328 1399 LEXUS RX 450 2010 Jeep \n",
"1 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"2 8467 - HONDA FIT 2006 Hatchback \n",
"3 3607 862 FORD Escape 2011 Jeep \n",
"4 11726 446 HONDA FIT 2014 Hatchback \n",
"... ... ... ... ... ... ... \n",
"19232 8467 - MERCEDES-BENZ CLK 200 1999 Coupe \n",
"19233 15681 831 HYUNDAI Sonata 2011 Sedan \n",
"19234 26108 836 HYUNDAI Tucson 2010 Jeep \n",
"19235 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
"19236 470 753 HYUNDAI Sonata 2012 Sedan \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Yes Hybrid 3.5 186005 km 6.0 \n",
"1 No Petrol 3 192000 km 6.0 \n",
"2 No Petrol 1.3 200000 km 4.0 \n",
"3 Yes Hybrid 2.5 168966 km 4.0 \n",
"4 Yes Petrol 1.3 91901 km 4.0 \n",
"... ... ... ... ... ... \n",
"19232 Yes CNG 2.0 Turbo 300000 km 4.0 \n",
"19233 Yes Petrol 2.4 161600 km 4.0 \n",
"19234 Yes Diesel 2 116365 km 4.0 \n",
"19235 Yes Diesel 2 51258 km 4.0 \n",
"19236 Yes Hybrid 2.4 186923 km 4.0 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color Airbags \n",
"0 Automatic 4x4 04-May Left wheel Silver 12 \n",
"1 Tiptronic 4x4 04-May Left wheel Black 8 \n",
"2 Variator Front 04-May Right-hand drive Black 2 \n",
"3 Automatic 4x4 04-May Left wheel White 0 \n",
"4 Automatic Front 04-May Left wheel Silver 4 \n",
"... ... ... ... ... ... ... \n",
"19232 Manual Rear 02-Mar Left wheel Silver 5 \n",
"19233 Tiptronic Front 04-May Left wheel Red 8 \n",
"19234 Automatic Front 04-May Left wheel Grey 4 \n",
"19235 Automatic Front 04-May Left wheel Black 4 \n",
"19236 Automatic Front 04-May Left wheel White 12 \n",
"\n",
"[19237 rows x 17 columns]"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import featuretools as ft\n",
"import re\n",
"from sklearn.preprocessing import StandardScaler\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"\n",
"df = pd.read_csv(\"../data/car_price_prediction.csv\")\n",
"\n",
"df = df.drop(columns=[\"ID\"])\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Анализ датасета и очистка данных"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy object\n",
"Manufacturer object\n",
"Model object\n",
"Prod. year int64\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume object\n",
"Mileage object\n",
"Cylinders float64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"dtype: object"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 3.5, 3. , 1.3, 2.5, 2. , 1.8, 2.4, 4. , 1.6, 3.3, 2.2,\n",
" 4.7, 1.5, 4.4, 1.4, 3.6, 2.3, 5.5, 2.8, 3.2, 3.8, 4.6,\n",
" 1.2, 5. , 1.7, 2.9, 0.5, 1.9, 2.7, 4.8, 5.3, 0.4, 1.1,\n",
" 2.1, 0.7, 5.4, 3.7, 1. , 2.6, 0.8, 0.2, 5.7, 6.7, 6.2,\n",
" 3.4, 6.3, 4.3, 4.2, 0. , 20. , 0.3, 5.9, 5.6, 6. , 0.6,\n",
" 6.8, 4.5, 7.3, 0.1, 3.1, 6.4, 3.9, 0.9, 5.2, 5.8])"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Strip the optional \"Turbo\" marker and parse the engine volume as a float.\n",
"# astype(str) keeps this cell re-runnable: on a second run the column is already\n",
"# numeric and the .str accessor would otherwise raise AttributeError.\n",
"engine_volume_text = (\n",
"    df[\"Engine volume\"].astype(str).str.replace(\"Turbo\", \"\", regex=False).str.strip()\n",
")\n",
"df[\"Engine volume\"] = pd.to_numeric(engine_volume_text)\n",
"df[\"Engine volume\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([186005, 192000, 200000, ..., 140607, 307325, 186923])"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop the \"km\" unit suffix and store the mileage as int64.\n",
"# astype(str) keeps this cell re-runnable: on a second run the column is already\n",
"# int64 and the .str accessor would otherwise raise AttributeError.\n",
"mileage_text = df[\"Mileage\"].astype(str).str.replace(\"km\", \"\", regex=False).str.strip()\n",
"df[\"Mileage\"] = mileage_text.astype(\"int64\")\n",
"df[\"Mileage\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1399, 1018, 0, 862, 446, 891, 761, 751, 394,\n",
" 1053, 1055, 1079, 810, 2386, 1850, 531, 586, 1249,\n",
" 2455, 583, 1537, 1288, 915, 1750, 707, 1077, 1486,\n",
" 1091, 650, 382, 1436, 1194, 503, 1017, 1104, 639,\n",
" 629, 919, 781, 530, 640, 765, 777, 779, 934,\n",
" 769, 645, 1185, 1324, 830, 1187, 1111, 760, 642,\n",
" 1604, 1095, 966, 473, 1138, 1811, 988, 917, 1156,\n",
" 687, 11714, 836, 1347, 2866, 1646, 259, 609, 697,\n",
" 585, 475, 690, 308, 1823, 1361, 1273, 924, 584,\n",
" 2078, 831, 1172, 893, 1872, 1885, 1266, 447, 2148,\n",
" 1730, 730, 289, 502, 333, 1325, 247, 879, 1342,\n",
" 1327, 1598, 1514, 1058, 738, 1935, 481, 1522, 1282,\n",
" 456, 880, 900, 798, 1277, 442, 1051, 790, 1292,\n",
" 1047, 528, 1211, 1493, 1793, 574, 930, 1998, 271,\n",
" 706, 1481, 1677, 1661, 1286, 1408, 1090, 595, 1451,\n",
" 1267, 993, 1714, 878, 641, 749, 1511, 603, 353,\n",
" 877, 1236, 1141, 397, 784, 1024, 1357, 1301, 770,\n",
" 922, 1438, 753, 607, 1363, 638, 490, 431, 565,\n",
" 517, 833, 489, 1760, 986, 1841, 1620, 1360, 474,\n",
" 1099, 978, 1624, 1946, 1268, 1307, 696, 649, 666,\n",
" 2151, 551, 800, 971, 1323, 2377, 1845, 1083, 694,\n",
" 463, 419, 345, 1515, 1505, 2056, 1203, 729, 460,\n",
" 1356, 876, 911, 1190, 780, 448, 2410, 1848, 1148,\n",
" 834, 1275, 1028, 1197, 724, 890, 1705, 505, 789,\n",
" 2959, 518, 461, 1719, 2858, 3156, 2225, 2177, 1968,\n",
" 1888, 1308, 2736, 1103, 557, 2195, 843, 1664, 723,\n",
" 4508, 562, 501, 2018, 1076, 1202, 3301, 691, 1440,\n",
" 1869, 1178, 418, 1820, 1413, 488, 1304, 363, 2108,\n",
" 521, 1659, 87, 1411, 1528, 3292, 7058, 1578, 627,\n",
" 874, 1996, 1488, 5679, 1234, 5603, 400, 889, 3268,\n",
" 875, 949, 2265, 441, 742, 425, 2476, 2971, 614,\n",
" 1816, 1375, 1405, 2297, 1062, 1113, 420, 2469, 658,\n",
" 1951, 2670, 2578, 1995, 1032, 994, 1011, 2421, 1296,\n",
" 155, 494, 426, 1086, 961, 2236, 1829, 764, 1834,\n",
" 1054, 617, 1529, 2266, 637, 626, 1832, 1016, 2002,\n",
" 1756, 746, 1285, 2690, 1118, 5332, 980, 1807, 970,\n",
" 1228, 1195, 1132, 1768, 1384, 1080, 7063, 1817, 1452,\n",
" 1975, 1368, 702, 1974, 1781, 1036, 944, 663, 364,\n",
" 1539, 1345, 1680, 2209, 741, 1575, 695, 1317, 294,\n",
" 1525, 424, 997, 1473, 1552, 2819, 2188, 1668, 3057,\n",
" 799, 1502, 2606, 552, 1694, 1759, 1110, 399, 1470,\n",
" 1174, 5877, 1474, 1688, 526, 686, 5908, 1107, 2070,\n",
" 1468, 1246, 1685, 556, 1533, 1917, 1346, 732, 692,\n",
" 579, 421, 362, 3505, 1855, 2711, 1586, 3739, 681,\n",
" 1708, 2278, 1701, 722, 1482, 928, 827, 832, 527,\n",
" 604, 173, 1341, 3329, 1553, 859, 167, 916, 828,\n",
" 2082, 1176, 1108, 975, 3008, 1516, 2269, 1699, 2073,\n",
" 1031, 1503, 2364, 1030, 1442, 5666, 2715, 1437, 2067,\n",
" 1426, 2908, 1279, 866, 4283, 279, 2658, 3015, 2004,\n",
" 1391, 4736, 748, 1466, 644, 683, 2705, 1297, 731,\n",
" 1252, 2216, 3141, 3273, 1518, 1723, 1588, 972, 682,\n",
" 1094, 668, 175, 967, 402, 3894, 1960, 1599, 2000,\n",
" 2084, 1621, 714, 1109, 3989, 873, 1572, 1163, 1991,\n",
" 1716, 1673, 2562, 2874, 965, 462, 605, 1948, 1736,\n",
" 3518, 2054, 2467, 1681, 1272, 1205, 750, 2156, 2566,\n",
" 115, 524, 3184, 676, 1678, 612, 328, 955, 1441,\n",
" 1675, 3965, 2909, 623, 822, 867, 3025, 1993, 792,\n",
" 636, 4057, 3743, 2337, 2570, 2418, 2472, 3910, 1662,\n",
" 2123, 2628, 3208, 2080, 3699, 2913, 864, 2505, 870,\n",
" 7536, 1924, 1671, 1064, 1836, 1866, 4741, 841, 1369,\n",
" 5681, 3112, 1366, 2223, 1198, 1039, 3811, 3571, 1387,\n",
" 1171, 1365, 1531, 1590, 11706, 2308, 4860, 1641, 1045,\n",
" 1901])"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A dash in Levy is rewritten to \"0\" before the column is cast to int64.\n",
"# NOTE(review): this makes a missing levy indistinguishable from a zero levy - confirm\n",
"# that is acceptable for the model.\n",
"df[\"Levy\"] = df[\"Levy\"].replace({\"-\": \"0\"}).astype(\"int64\")\n",
"df[\"Levy\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 6, 4, 8, 1, 12, 3, 2, 16, 5, 7, 9, 10, 14])"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Cylinders\"] = df[\"Cylinders\"].astype(\"int64\")\n",
"df[\"Cylinders\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['04-May', '02-Mar', '>5'], dtype=object)"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"Doors\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Translate the raw Doors codes into readable labels.\n",
"# replace() leaves values that are not dictionary keys untouched, so re-running the\n",
"# cell keeps the already-translated labels; map() would turn them into NaN.\n",
"door_labels = {\n",
"    \"02-Mar\": \"Двухдверный\",\n",
"    \"04-May\": \"Четырехдверный\",\n",
"    \">5\": \"Многодверный\",\n",
"}\n",
"df[\"Doors\"] = df[\"Doors\"].replace(door_labels)\n",
"df[\"Doors\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 3, 6, ..., 627220, 872946, 26307500])"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_df = df.sort_values(by=\"Price\")\n",
"sorted_df[\"Price\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество строк до удаления некорректных значений: 19237\n",
"Количество строк после удаления некорректных значений: 17574\n"
]
}
],
"source": [
"# Listings priced below this threshold are treated as invalid and dropped.\n",
"MIN_VALID_PRICE = 500\n",
"\n",
"print(f\"Количество строк до удаления некорректных значений: {len(df)}\")\n",
"# .copy() yields an independent frame, so later in-place operations on df do not\n",
"# raise SettingWithCopyWarning.\n",
"df = df[df[\"Price\"] >= MIN_VALID_PRICE].copy()\n",
"print(f\"Количество строк после удаления некорректных значений: {len(df)}\")"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 500, 549, 600, ..., 627220, 872946, 26307500])"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_df = df.sort_values(by=\"Price\")\n",
"sorted_df[\"Price\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,\n",
" 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,\n",
" 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,\n",
" 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,\n",
" 2014, 2015, 2016, 2017, 2018, 2019, 2020])"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_df = df.sort_values(by=\"Prod. year\")\n",
"sorted_df[\"Prod. year\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Prod. year</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13328</td>\n",
" <td>1399</td>\n",
" <td>LEXUS</td>\n",
" <td>RX 450</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>3.5</td>\n",
" <td>186005</td>\n",
" <td>6</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16621</td>\n",
" <td>1018</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Equinox</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>3.0</td>\n",
" <td>192000</td>\n",
" <td>6</td>\n",
" <td>Tiptronic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>8467</td>\n",
" <td>0</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2006</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>200000</td>\n",
" <td>4</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Black</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3607</td>\n",
" <td>862</td>\n",
" <td>FORD</td>\n",
" <td>Escape</td>\n",
" <td>2011</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Hybrid</td>\n",
" <td>2.5</td>\n",
" <td>168966</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11726</td>\n",
" <td>446</td>\n",
" <td>HONDA</td>\n",
" <td>FIT</td>\n",
" <td>2014</td>\n",
" <td>Hatchback</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>1.3</td>\n",
" <td>91901</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19231</th>\n",
" <td>5802</td>\n",
" <td>1055</td>\n",
" <td>MERCEDES-BENZ</td>\n",
" <td>E 350</td>\n",
" <td>2013</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>3.5</td>\n",
" <td>107800</td>\n",
" <td>6</td>\n",
" <td>Automatic</td>\n",
" <td>Rear</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19232</th>\n",
" <td>8467</td>\n",
" <td>0</td>\n",
" <td>MERCEDES-BENZ</td>\n",
" <td>CLK 200</td>\n",
" <td>1999</td>\n",
" <td>Coupe</td>\n",
" <td>Yes</td>\n",
" <td>CNG</td>\n",
" <td>2.0</td>\n",
" <td>300000</td>\n",
" <td>4</td>\n",
" <td>Manual</td>\n",
" <td>Rear</td>\n",
" <td>Двухдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19233</th>\n",
" <td>15681</td>\n",
" <td>831</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Sonata</td>\n",
" <td>2011</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.4</td>\n",
" <td>161600</td>\n",
" <td>4</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Red</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19234</th>\n",
" <td>26108</td>\n",
" <td>836</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Tucson</td>\n",
" <td>2010</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2.0</td>\n",
" <td>116365</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19235</th>\n",
" <td>5331</td>\n",
" <td>1288</td>\n",
" <td>CHEVROLET</td>\n",
" <td>Captiva</td>\n",
" <td>2007</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2.0</td>\n",
" <td>51258</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Black</td>\n",
" <td>4</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>17574 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Prod. year Category \\\n",
"0 13328 1399 LEXUS RX 450 2010 Jeep \n",
"1 16621 1018 CHEVROLET Equinox 2011 Jeep \n",
"2 8467 0 HONDA FIT 2006 Hatchback \n",
"3 3607 862 FORD Escape 2011 Jeep \n",
"4 11726 446 HONDA FIT 2014 Hatchback \n",
"... ... ... ... ... ... ... \n",
"19231 5802 1055 MERCEDES-BENZ E 350 2013 Sedan \n",
"19232 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe \n",
"19233 15681 831 HYUNDAI Sonata 2011 Sedan \n",
"19234 26108 836 HYUNDAI Tucson 2010 Jeep \n",
"19235 5331 1288 CHEVROLET Captiva 2007 Jeep \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"0 Yes Hybrid 3.5 186005 6 \n",
"1 No Petrol 3.0 192000 6 \n",
"2 No Petrol 1.3 200000 4 \n",
"3 Yes Hybrid 2.5 168966 4 \n",
"4 Yes Petrol 1.3 91901 4 \n",
"... ... ... ... ... ... \n",
"19231 Yes Diesel 3.5 107800 6 \n",
"19232 Yes CNG 2.0 300000 4 \n",
"19233 Yes Petrol 2.4 161600 4 \n",
"19234 Yes Diesel 2.0 116365 4 \n",
"19235 Yes Diesel 2.0 51258 4 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color \\\n",
"0 Automatic 4x4 Четырехдверный Left wheel Silver \n",
"1 Tiptronic 4x4 Четырехдверный Left wheel Black \n",
"2 Variator Front Четырехдверный Right-hand drive Black \n",
"3 Automatic 4x4 Четырехдверный Left wheel White \n",
"4 Automatic Front Четырехдверный Left wheel Silver \n",
"... ... ... ... ... ... \n",
"19231 Automatic Rear Четырехдверный Left wheel Grey \n",
"19232 Manual Rear Двухдверный Left wheel Silver \n",
"19233 Tiptronic Front Четырехдверный Left wheel Red \n",
"19234 Automatic Front Четырехдверный Left wheel Grey \n",
"19235 Automatic Front Четырехдверный Left wheel Black \n",
"\n",
" Airbags \n",
"0 12 \n",
"1 8 \n",
"2 2 \n",
"3 0 \n",
"4 4 \n",
"... ... \n",
"19231 12 \n",
"19232 5 \n",
"19233 8 \n",
"19234 4 \n",
"19235 4 \n",
"\n",
"[17574 rows x 17 columns]"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Очистка дубликатов и пропущенных значений"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.int64(2773)"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# Reassign rather than mutate in place: avoids SettingWithCopyWarning when df is a\n",
"# filtered slice of another frame.\n",
"df = df.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price 0\n",
"Levy 0\n",
"Manufacturer 0\n",
"Model 0\n",
"Prod. year 0\n",
"Category 0\n",
"Leather interior 0\n",
"Fuel type 0\n",
"Engine volume 0\n",
"Mileage 0\n",
"Cylinders 0\n",
"Gear box type 0\n",
"Drive wheels 0\n",
"Doors 0\n",
"Wheel 0\n",
"Color 0\n",
"Airbags 0\n",
"dtype: int64"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isna().sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Очистка выбросов"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy int64\n",
"Manufacturer object\n",
"Model object\n",
"Prod. year int64\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume float64\n",
"Mileage int64\n",
"Cylinders int64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"dtype: object"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"numeric_features_with_outliers = [\n",
"    \"Price\",\n",
"    \"Levy\",\n",
"    \"Mileage\",\n",
"    \"Prod. year\",\n",
"]\n",
"\n",
"# Create the figure once so that every boxplot lands in the same figure; calling\n",
"# plt.figure() inside the loop opened a separate, mostly empty figure per column.\n",
"n_features = len(numeric_features_with_outliers)\n",
"plt.figure(figsize=(4, 5 * n_features))\n",
"i = 1  # 1-based subplot index\n",
"for col in numeric_features_with_outliers:\n",
"    plt.subplot(n_features, 1, i)\n",
"    df.boxplot(column=col)\n",
"    i += 1"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Количество строк до удаления выбросов: 14801\n",
"Количество строк после удаления выбросов: 12597\n"
]
}
],
"source": [
"def remove_outliers(df, column):\n",
"    \"\"\"Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.\n",
"\n",
"    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, both inclusive, where Q1 and Q3\n",
"    are the 25th and 75th percentiles of ``df[column]``. ``df`` itself is not modified.\n",
"    \"\"\"\n",
"    values = df[column]\n",
"    q1, q3 = values.quantile([0.25, 0.75])\n",
"    margin = 1.5 * (q3 - q1)\n",
"    within_fences = (values >= q1 - margin) & (values <= q3 + margin)\n",
"    return df[within_fences]\n",
"\n",
"print(f\"Количество строк до удаления выбросов: {len(df)}\")\n",
"\n",
"for column in numeric_features_with_outliers:\n",
" df = remove_outliers(df, column)\n",
"\n",
"print(f\"Количество строк после удаления выбросов: {len(df)}\")"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create the figure once so that every boxplot lands in the same figure; calling\n",
"# plt.figure() inside the loop opened a separate, mostly empty figure per column.\n",
"n_features = len(numeric_features_with_outliers)\n",
"plt.figure(figsize=(4, 5 * n_features))\n",
"i = 1  # 1-based subplot index\n",
"for col in numeric_features_with_outliers:\n",
"    plt.subplot(n_features, 1, i)\n",
"    df.boxplot(column=col)\n",
"    i += 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Разбиение на выборки"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры выборок:\n",
"Обучающая выборка: 8817 записей\n",
"Category\n",
"Sedan 3954\n",
"Jeep 2263\n",
"Hatchback 1554\n",
"Minivan 312\n",
"Coupe 251\n",
"Universal 180\n",
"Microbus 143\n",
"Goods wagon 120\n",
"Pickup 22\n",
"Cabriolet 16\n",
"Limousine 2\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: 3780 записей\n",
"Category\n",
"Sedan 1692\n",
"Jeep 990\n",
"Hatchback 636\n",
"Minivan 151\n",
"Coupe 117\n",
"Universal 82\n",
"Goods wagon 52\n",
"Microbus 46\n",
"Pickup 8\n",
"Cabriolet 5\n",
"Limousine 1\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"X = df\n",
"y = df[\"Category\"]\n",
"\n",
"# Stratify on the target: Category is heavily imbalanced, and a plain random split\n",
"# can leave the rarest classes out of one of the subsets.\n",
"# Stratification requires at least two rows per class.\n",
"train_df, test_df, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=42, stratify=y\n",
")\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {train_df.shape[0]} записей\")\n",
"print(train_df.Category.value_counts())\n",
"print(f\"Тестовая выборка: {test_df.shape[0]} записей\")\n",
"print(test_df.Category.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Oversampling"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Размеры выборок:\n",
"Обучающая выборка: 43494 записей\n",
"Category\n",
"Sedan 3954\n",
"Jeep 3954\n",
"Universal 3954\n",
"Hatchback 3954\n",
"Coupe 3954\n",
"Goods wagon 3954\n",
"Minivan 3954\n",
"Microbus 3954\n",
"Pickup 3954\n",
"Limousine 3954\n",
"Cabriolet 3954\n",
"Name: count, dtype: int64\n",
"Тестовая выборка: 18612 записей\n",
"Category\n",
"Hatchback 1692\n",
"Sedan 1692\n",
"Universal 1692\n",
"Jeep 1692\n",
"Coupe 1692\n",
"Minivan 1692\n",
"Goods wagon 1692\n",
"Microbus 1692\n",
"Pickup 1692\n",
"Cabriolet 1692\n",
"Limousine 1692\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"def oversample(df, target=\"Category\", random_state=42):\n",
"    \"\"\"Balance the classes of ``target`` with ``RandomOverSampler``.\n",
"\n",
"    Args:\n",
"        df: Frame that contains the ``target`` column.\n",
"        target: Name of the class-label column to balance.\n",
"        random_state: Seed passed to ``RandomOverSampler``.\n",
"\n",
"    Returns:\n",
"        A new frame holding the resampled feature columns followed by ``target``.\n",
"    \"\"\"\n",
"    features = df.drop(columns=target)\n",
"    labels = df[target]\n",
"\n",
"    sampler = RandomOverSampler(random_state=random_state)\n",
"    features_resampled, labels_resampled = sampler.fit_resample(features, labels)  # type: ignore\n",
"\n",
"    return pd.concat([features_resampled, labels_resampled], axis=1)\n",
"\n",
"\n",
"train_df_overs = oversample(train_df)\n",
"# NOTE(review): oversampling the test split duplicates rows, so metrics computed on\n",
"# test_df_overs no longer reflect the real class distribution - confirm this is intended.\n",
"# NOTE(review): these frames are built before the Age feature is derived from\n",
"# \"Prod. year\" further down, so they do not contain it - confirm the cell order.\n",
"test_df_overs = oversample(test_df)\n",
"\n",
"print(\"Размеры выборок:\")\n",
"print(f\"Обучающая выборка: {train_df_overs.shape[0]} записей\")\n",
"print(train_df_overs.Category.value_counts())\n",
"print(f\"Тестовая выборка: {test_df_overs.shape[0]} записей\")\n",
"print(test_df_overs.Category.value_counts())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Ручной синтез признаков"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"def age_create(df, reference_year=2020):\n",
"    \"\"\"Return a copy of ``df`` with \"Prod. year\" replaced by an \"Age\" column.\n",
"\n",
"    Args:\n",
"        df: Frame that contains a numeric \"Prod. year\" column.\n",
"        reference_year: Year the age is measured from.\n",
"\n",
"    Returns:\n",
"        A new frame without \"Prod. year\" and with \"Age\" appended as the last column.\n",
"        The input frame is left untouched, which also avoids SettingWithCopyWarning\n",
"        when ``df`` is a slice of another frame.\n",
"    \"\"\"\n",
"    with_age = df.assign(Age=reference_year - df[\"Prod. year\"])\n",
"    return with_age.drop(columns=\"Prod. year\")\n",
"\n",
"train_df = age_create(train_df)\n",
"test_df = age_create(test_df)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n",
" 17, 18, 19, 20, 21])"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted_df = train_df.sort_values(by=\"Age\")\n",
"sorted_df[\"Age\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: >"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 400x3000 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(4, 30))\n",
"plt.subplot(6, 1, i)\n",
"train_df.boxplot(column=\"Age\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Дискретизация числовых признаков"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy int64\n",
"Manufacturer object\n",
"Model object\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume float64\n",
"Mileage int64\n",
"Cylinders int64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"Age int64\n",
"dtype: object"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Category</th>\n",
" <th>Leather interior</th>\n",
" <th>Fuel type</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Gear box type</th>\n",
" <th>Drive wheels</th>\n",
" <th>Doors</th>\n",
" <th>Wheel</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" <th>Age</th>\n",
" <th>Age_bin</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15146</th>\n",
" <td>18503</td>\n",
" <td>0</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>Sedan</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.8</td>\n",
" <td>13000</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>White</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>Новый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14145</th>\n",
" <td>9722</td>\n",
" <td>0</td>\n",
" <td>TOYOTA</td>\n",
" <td>Ractis</td>\n",
" <td>Sedan</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.5</td>\n",
" <td>116800</td>\n",
" <td>4</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Right-hand drive</td>\n",
" <td>Brown</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" <td>Старый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8943</th>\n",
" <td>15367</td>\n",
" <td>584</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Elantra</td>\n",
" <td>Sedan</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.8</td>\n",
" <td>78222</td>\n",
" <td>4</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Beige</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>Средний</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17889</th>\n",
" <td>11917</td>\n",
" <td>0</td>\n",
" <td>SUBARU</td>\n",
" <td>Forester L.L.BEAN</td>\n",
" <td>Jeep</td>\n",
" <td>Yes</td>\n",
" <td>CNG</td>\n",
" <td>2.5</td>\n",
" <td>220000</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Green</td>\n",
" <td>5</td>\n",
" <td>16</td>\n",
" <td>Очень старый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9515</th>\n",
" <td>46919</td>\n",
" <td>1327</td>\n",
" <td>HYUNDAI</td>\n",
" <td>H1</td>\n",
" <td>Universal</td>\n",
" <td>Yes</td>\n",
" <td>Diesel</td>\n",
" <td>2.5</td>\n",
" <td>71689</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Новый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18201</th>\n",
" <td>10349</td>\n",
" <td>0</td>\n",
" <td>AUDI</td>\n",
" <td>A4</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.4</td>\n",
" <td>150000</td>\n",
" <td>6</td>\n",
" <td>Manual</td>\n",
" <td>4x4</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>Старый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7436</th>\n",
" <td>2038</td>\n",
" <td>765</td>\n",
" <td>KIA</td>\n",
" <td>Avella</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.0</td>\n",
" <td>125621</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>Новый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7728</th>\n",
" <td>13485</td>\n",
" <td>843</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>Hatchback</td>\n",
" <td>No</td>\n",
" <td>Hybrid</td>\n",
" <td>1.5</td>\n",
" <td>212000</td>\n",
" <td>4</td>\n",
" <td>Variator</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>12</td>\n",
" <td>Старый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1136</th>\n",
" <td>15677</td>\n",
" <td>0</td>\n",
" <td>FORD</td>\n",
" <td>Fiesta</td>\n",
" <td>Sedan</td>\n",
" <td>No</td>\n",
" <td>Petrol</td>\n",
" <td>1.6</td>\n",
" <td>74800</td>\n",
" <td>4</td>\n",
" <td>Automatic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>Новый</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10640</th>\n",
" <td>16308</td>\n",
" <td>751</td>\n",
" <td>KIA</td>\n",
" <td>Optima EX</td>\n",
" <td>Sedan</td>\n",
" <td>Yes</td>\n",
" <td>Petrol</td>\n",
" <td>2.4</td>\n",
" <td>92000</td>\n",
" <td>12</td>\n",
" <td>Tiptronic</td>\n",
" <td>Front</td>\n",
" <td>Четырехдверный</td>\n",
" <td>Left wheel</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>Средний</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8817 rows × 18 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Category \\\n",
"15146 18503 0 TOYOTA Prius Sedan \n",
"14145 9722 0 TOYOTA Ractis Sedan \n",
"8943 15367 584 HYUNDAI Elantra Sedan \n",
"17889 11917 0 SUBARU Forester L.L.BEAN Jeep \n",
"9515 46919 1327 HYUNDAI H1 Universal \n",
"... ... ... ... ... ... \n",
"18201 10349 0 AUDI A4 Sedan \n",
"7436 2038 765 KIA Avella Sedan \n",
"7728 13485 843 TOYOTA Prius Hatchback \n",
"1136 15677 0 FORD Fiesta Sedan \n",
"10640 16308 751 KIA Optima EX Sedan \n",
"\n",
" Leather interior Fuel type Engine volume Mileage Cylinders \\\n",
"15146 No Petrol 1.8 13000 4 \n",
"14145 No Petrol 1.5 116800 4 \n",
"8943 No Petrol 1.8 78222 4 \n",
"17889 Yes CNG 2.5 220000 4 \n",
"9515 Yes Diesel 2.5 71689 4 \n",
"... ... ... ... ... ... \n",
"18201 Yes Petrol 2.4 150000 6 \n",
"7436 Yes Petrol 2.0 125621 4 \n",
"7728 No Hybrid 1.5 212000 4 \n",
"1136 No Petrol 1.6 74800 4 \n",
"10640 Yes Petrol 2.4 92000 12 \n",
"\n",
" Gear box type Drive wheels Doors Wheel Color \\\n",
"15146 Automatic Front Четырехдверный Left wheel White \n",
"14145 Tiptronic Front Четырехдверный Right-hand drive Brown \n",
"8943 Tiptronic Front Четырехдверный Left wheel Beige \n",
"17889 Automatic 4x4 Четырехдверный Left wheel Green \n",
"9515 Automatic Front Четырехдверный Left wheel Grey \n",
"... ... ... ... ... ... \n",
"18201 Manual 4x4 Четырехдверный Left wheel Grey \n",
"7436 Automatic Front Четырехдверный Left wheel Silver \n",
"7728 Variator Front Четырехдверный Left wheel Silver \n",
"1136 Automatic Front Четырехдверный Left wheel Silver \n",
"10640 Tiptronic Front Четырехдверный Left wheel Silver \n",
"\n",
" Airbags Age Age_bin \n",
"15146 4 0 Новый \n",
"14145 2 13 Старый \n",
"8943 10 6 Средний \n",
"17889 5 16 Очень старый \n",
"9515 4 2 Новый \n",
"... ... ... ... \n",
"18201 4 13 Старый \n",
"7436 12 5 Новый \n",
"7728 8 12 Старый \n",
"1136 8 4 Новый \n",
"10640 8 7 Средний \n",
"\n",
"[8817 rows x 18 columns]"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"numeric_features_for_discritization = [\"Age\"]\n",
"\n",
"def discretize_features(df, features, bins=4, labels=[\"Новый\", \"Средний\", \"Старый\", \"Очень старый\"]):\n",
" for feature in features:\n",
" try:\n",
" df[f\"{feature}_bin\"] = pd.cut(df[feature], bins=bins, labels=labels) # type: ignore\n",
" except Exception as e:\n",
" print(f\"Ошибка при дискретизации признака {feature}: {e}\")\n",
" return df\n",
"\n",
"\n",
"train_df = discretize_features(train_df, numeric_features_for_discritization)\n",
"test_df = discretize_features(test_df, numeric_features_for_discritization)\n",
"\n",
"train_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Унитарное кодирование категориальных признаков"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy int64\n",
"Manufacturer object\n",
"Model object\n",
"Category object\n",
"Leather interior object\n",
"Fuel type object\n",
"Engine volume float64\n",
"Mileage int64\n",
"Cylinders int64\n",
"Gear box type object\n",
"Drive wheels object\n",
"Doors object\n",
"Wheel object\n",
"Color object\n",
"Airbags int64\n",
"Age int64\n",
"Age_bin category\n",
"dtype: object"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" <th>Age</th>\n",
" <th>...</th>\n",
" <th>Drive wheels_Rear</th>\n",
" <th>Doors_Двухдверный</th>\n",
" <th>Doors_Многодверный</th>\n",
" <th>Doors_Четырехдверный</th>\n",
" <th>Wheel_Left wheel</th>\n",
" <th>Wheel_Right-hand drive</th>\n",
" <th>Age_bin_Новый</th>\n",
" <th>Age_bin_Средний</th>\n",
" <th>Age_bin_Старый</th>\n",
" <th>Age_bin_Очень старый</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15146</th>\n",
" <td>18503</td>\n",
" <td>0</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>1.8</td>\n",
" <td>13000</td>\n",
" <td>4</td>\n",
" <td>White</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14145</th>\n",
" <td>9722</td>\n",
" <td>0</td>\n",
" <td>TOYOTA</td>\n",
" <td>Ractis</td>\n",
" <td>1.5</td>\n",
" <td>116800</td>\n",
" <td>4</td>\n",
" <td>Brown</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8943</th>\n",
" <td>15367</td>\n",
" <td>584</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Elantra</td>\n",
" <td>1.8</td>\n",
" <td>78222</td>\n",
" <td>4</td>\n",
" <td>Beige</td>\n",
" <td>10</td>\n",
" <td>6</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17889</th>\n",
" <td>11917</td>\n",
" <td>0</td>\n",
" <td>SUBARU</td>\n",
" <td>Forester L.L.BEAN</td>\n",
" <td>2.5</td>\n",
" <td>220000</td>\n",
" <td>4</td>\n",
" <td>Green</td>\n",
" <td>5</td>\n",
" <td>16</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9515</th>\n",
" <td>46919</td>\n",
" <td>1327</td>\n",
" <td>HYUNDAI</td>\n",
" <td>H1</td>\n",
" <td>2.5</td>\n",
" <td>71689</td>\n",
" <td>4</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18201</th>\n",
" <td>10349</td>\n",
" <td>0</td>\n",
" <td>AUDI</td>\n",
" <td>A4</td>\n",
" <td>2.4</td>\n",
" <td>150000</td>\n",
" <td>6</td>\n",
" <td>Grey</td>\n",
" <td>4</td>\n",
" <td>13</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7436</th>\n",
" <td>2038</td>\n",
" <td>765</td>\n",
" <td>KIA</td>\n",
" <td>Avella</td>\n",
" <td>2.0</td>\n",
" <td>125621</td>\n",
" <td>4</td>\n",
" <td>Silver</td>\n",
" <td>12</td>\n",
" <td>5</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7728</th>\n",
" <td>13485</td>\n",
" <td>843</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>1.5</td>\n",
" <td>212000</td>\n",
" <td>4</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1136</th>\n",
" <td>15677</td>\n",
" <td>0</td>\n",
" <td>FORD</td>\n",
" <td>Fiesta</td>\n",
" <td>1.6</td>\n",
" <td>74800</td>\n",
" <td>4</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10640</th>\n",
" <td>16308</td>\n",
" <td>751</td>\n",
" <td>KIA</td>\n",
" <td>Optima EX</td>\n",
" <td>2.4</td>\n",
" <td>92000</td>\n",
" <td>12</td>\n",
" <td>Silver</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8817 rows × 46 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Engine volume Mileage \\\n",
"15146 18503 0 TOYOTA Prius 1.8 13000 \n",
"14145 9722 0 TOYOTA Ractis 1.5 116800 \n",
"8943 15367 584 HYUNDAI Elantra 1.8 78222 \n",
"17889 11917 0 SUBARU Forester L.L.BEAN 2.5 220000 \n",
"9515 46919 1327 HYUNDAI H1 2.5 71689 \n",
"... ... ... ... ... ... ... \n",
"18201 10349 0 AUDI A4 2.4 150000 \n",
"7436 2038 765 KIA Avella 2.0 125621 \n",
"7728 13485 843 TOYOTA Prius 1.5 212000 \n",
"1136 15677 0 FORD Fiesta 1.6 74800 \n",
"10640 16308 751 KIA Optima EX 2.4 92000 \n",
"\n",
" Cylinders Color Airbags Age ... Drive wheels_Rear \\\n",
"15146 4 White 4 0 ... False \n",
"14145 4 Brown 2 13 ... False \n",
"8943 4 Beige 10 6 ... False \n",
"17889 4 Green 5 16 ... False \n",
"9515 4 Grey 4 2 ... False \n",
"... ... ... ... ... ... ... \n",
"18201 6 Grey 4 13 ... False \n",
"7436 4 Silver 12 5 ... False \n",
"7728 4 Silver 8 12 ... False \n",
"1136 4 Silver 8 4 ... False \n",
"10640 12 Silver 8 7 ... False \n",
"\n",
" Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный \\\n",
"15146 False False True \n",
"14145 False False True \n",
"8943 False False True \n",
"17889 False False True \n",
"9515 False False True \n",
"... ... ... ... \n",
"18201 False False True \n",
"7436 False False True \n",
"7728 False False True \n",
"1136 False False True \n",
"10640 False False True \n",
"\n",
" Wheel_Left wheel Wheel_Right-hand drive Age_bin_Новый \\\n",
"15146 True False True \n",
"14145 False True False \n",
"8943 True False False \n",
"17889 True False False \n",
"9515 True False True \n",
"... ... ... ... \n",
"18201 True False False \n",
"7436 True False True \n",
"7728 True False False \n",
"1136 True False True \n",
"10640 True False False \n",
"\n",
" Age_bin_Средний Age_bin_Старый Age_bin_Очень старый \n",
"15146 False False False \n",
"14145 False True False \n",
"8943 True False False \n",
"17889 False False True \n",
"9515 False False False \n",
"... ... ... ... \n",
"18201 False True False \n",
"7436 False False False \n",
"7728 False True False \n",
"1136 False False False \n",
"10640 True False False \n",
"\n",
"[8817 rows x 46 columns]"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"categorical_features_for_encoding = [\n",
" \"Leather interior\",\n",
" \"Category\",\n",
" \"Fuel type\",\n",
" \"Gear box type\",\n",
" \"Drive wheels\",\n",
" \"Doors\",\n",
" \"Wheel\",\n",
" \"Age_bin\",\n",
"]\n",
"\n",
"train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)\n",
"test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)\n",
"\n",
"train_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Масштабирование признаков"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Price int64\n",
"Levy int64\n",
"Manufacturer object\n",
"Model object\n",
"Engine volume float64\n",
"Mileage int64\n",
"Cylinders int64\n",
"Color object\n",
"Airbags int64\n",
"Age int64\n",
"Leather interior_No bool\n",
"Leather interior_Yes bool\n",
"Category_Cabriolet bool\n",
"Category_Coupe bool\n",
"Category_Goods wagon bool\n",
"Category_Hatchback bool\n",
"Category_Jeep bool\n",
"Category_Limousine bool\n",
"Category_Microbus bool\n",
"Category_Minivan bool\n",
"Category_Pickup bool\n",
"Category_Sedan bool\n",
"Category_Universal bool\n",
"Fuel type_CNG bool\n",
"Fuel type_Diesel bool\n",
"Fuel type_Hybrid bool\n",
"Fuel type_Hydrogen bool\n",
"Fuel type_LPG bool\n",
"Fuel type_Petrol bool\n",
"Fuel type_Plug-in Hybrid bool\n",
"Gear box type_Automatic bool\n",
"Gear box type_Manual bool\n",
"Gear box type_Tiptronic bool\n",
"Gear box type_Variator bool\n",
"Drive wheels_4x4 bool\n",
"Drive wheels_Front bool\n",
"Drive wheels_Rear bool\n",
"Doors_Двухдверный bool\n",
"Doors_Многодверный bool\n",
"Doors_Четырехдверный bool\n",
"Wheel_Left wheel bool\n",
"Wheel_Right-hand drive bool\n",
"Age_bin_Новый bool\n",
"Age_bin_Средний bool\n",
"Age_bin_Старый bool\n",
"Age_bin_Очень старый bool\n",
"dtype: object"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Price</th>\n",
" <th>Levy</th>\n",
" <th>Manufacturer</th>\n",
" <th>Model</th>\n",
" <th>Engine volume</th>\n",
" <th>Mileage</th>\n",
" <th>Cylinders</th>\n",
" <th>Color</th>\n",
" <th>Airbags</th>\n",
" <th>Age</th>\n",
" <th>...</th>\n",
" <th>Drive wheels_Rear</th>\n",
" <th>Doors_Двухдверный</th>\n",
" <th>Doors_Многодверный</th>\n",
" <th>Doors_Четырехдверный</th>\n",
" <th>Wheel_Left wheel</th>\n",
" <th>Wheel_Right-hand drive</th>\n",
" <th>Age_bin_Новый</th>\n",
" <th>Age_bin_Средний</th>\n",
" <th>Age_bin_Старый</th>\n",
" <th>Age_bin_Очень старый</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>15146</th>\n",
" <td>0.153774</td>\n",
" <td>-1.192982</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>-0.479341</td>\n",
" <td>-1.531744</td>\n",
" <td>-0.403213</td>\n",
" <td>White</td>\n",
" <td>-0.683755</td>\n",
" <td>-1.946936</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14145</th>\n",
" <td>-0.658018</td>\n",
" <td>-1.192982</td>\n",
" <td>TOYOTA</td>\n",
" <td>Ractis</td>\n",
" <td>-0.887855</td>\n",
" <td>-0.130245</td>\n",
" <td>-0.403213</td>\n",
" <td>Brown</td>\n",
" <td>-1.190217</td>\n",
" <td>0.879266</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8943</th>\n",
" <td>-0.136145</td>\n",
" <td>0.081576</td>\n",
" <td>HYUNDAI</td>\n",
" <td>Elantra</td>\n",
" <td>-0.479341</td>\n",
" <td>-0.651122</td>\n",
" <td>-0.403213</td>\n",
" <td>Beige</td>\n",
" <td>0.835631</td>\n",
" <td>-0.642535</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17889</th>\n",
" <td>-0.455093</td>\n",
" <td>-1.192982</td>\n",
" <td>SUBARU</td>\n",
" <td>Forester L.L.BEAN</td>\n",
" <td>0.473858</td>\n",
" <td>1.263152</td>\n",
" <td>-0.403213</td>\n",
" <td>Green</td>\n",
" <td>-0.430524</td>\n",
" <td>1.531466</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9515</th>\n",
" <td>2.780795</td>\n",
" <td>1.703146</td>\n",
" <td>HYUNDAI</td>\n",
" <td>H1</td>\n",
" <td>0.473858</td>\n",
" <td>-0.739330</td>\n",
" <td>-0.403213</td>\n",
" <td>Grey</td>\n",
" <td>-0.683755</td>\n",
" <td>-1.512135</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18201</th>\n",
" <td>-0.600053</td>\n",
" <td>-1.192982</td>\n",
" <td>AUDI</td>\n",
" <td>A4</td>\n",
" <td>0.337687</td>\n",
" <td>0.318018</td>\n",
" <td>1.538421</td>\n",
" <td>Grey</td>\n",
" <td>-0.683755</td>\n",
" <td>0.879266</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7436</th>\n",
" <td>-1.368394</td>\n",
" <td>0.476602</td>\n",
" <td>KIA</td>\n",
" <td>Avella</td>\n",
" <td>-0.206998</td>\n",
" <td>-0.011145</td>\n",
" <td>-0.403213</td>\n",
" <td>Silver</td>\n",
" <td>1.342092</td>\n",
" <td>-0.859935</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7728</th>\n",
" <td>-0.310134</td>\n",
" <td>0.646834</td>\n",
" <td>TOYOTA</td>\n",
" <td>Prius</td>\n",
" <td>-0.887855</td>\n",
" <td>1.155137</td>\n",
" <td>-0.403213</td>\n",
" <td>Silver</td>\n",
" <td>0.329169</td>\n",
" <td>0.661866</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1136</th>\n",
" <td>-0.107486</td>\n",
" <td>-1.192982</td>\n",
" <td>FORD</td>\n",
" <td>Fiesta</td>\n",
" <td>-0.751684</td>\n",
" <td>-0.697325</td>\n",
" <td>-0.403213</td>\n",
" <td>Silver</td>\n",
" <td>0.329169</td>\n",
" <td>-1.077335</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10640</th>\n",
" <td>-0.049151</td>\n",
" <td>0.446048</td>\n",
" <td>KIA</td>\n",
" <td>Optima EX</td>\n",
" <td>0.337687</td>\n",
" <td>-0.465093</td>\n",
" <td>7.363324</td>\n",
" <td>Silver</td>\n",
" <td>0.329169</td>\n",
" <td>-0.425135</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8817 rows × 46 columns</p>\n",
"</div>"
],
"text/plain": [
" Price Levy Manufacturer Model Engine volume \\\n",
"15146 0.153774 -1.192982 TOYOTA Prius -0.479341 \n",
"14145 -0.658018 -1.192982 TOYOTA Ractis -0.887855 \n",
"8943 -0.136145 0.081576 HYUNDAI Elantra -0.479341 \n",
"17889 -0.455093 -1.192982 SUBARU Forester L.L.BEAN 0.473858 \n",
"9515 2.780795 1.703146 HYUNDAI H1 0.473858 \n",
"... ... ... ... ... ... \n",
"18201 -0.600053 -1.192982 AUDI A4 0.337687 \n",
"7436 -1.368394 0.476602 KIA Avella -0.206998 \n",
"7728 -0.310134 0.646834 TOYOTA Prius -0.887855 \n",
"1136 -0.107486 -1.192982 FORD Fiesta -0.751684 \n",
"10640 -0.049151 0.446048 KIA Optima EX 0.337687 \n",
"\n",
" Mileage Cylinders Color Airbags Age ... \\\n",
"15146 -1.531744 -0.403213 White -0.683755 -1.946936 ... \n",
"14145 -0.130245 -0.403213 Brown -1.190217 0.879266 ... \n",
"8943 -0.651122 -0.403213 Beige 0.835631 -0.642535 ... \n",
"17889 1.263152 -0.403213 Green -0.430524 1.531466 ... \n",
"9515 -0.739330 -0.403213 Grey -0.683755 -1.512135 ... \n",
"... ... ... ... ... ... ... \n",
"18201 0.318018 1.538421 Grey -0.683755 0.879266 ... \n",
"7436 -0.011145 -0.403213 Silver 1.342092 -0.859935 ... \n",
"7728 1.155137 -0.403213 Silver 0.329169 0.661866 ... \n",
"1136 -0.697325 -0.403213 Silver 0.329169 -1.077335 ... \n",
"10640 -0.465093 7.363324 Silver 0.329169 -0.425135 ... \n",
"\n",
" Drive wheels_Rear Doors_Двухдверный Doors_Многодверный \\\n",
"15146 False False False \n",
"14145 False False False \n",
"8943 False False False \n",
"17889 False False False \n",
"9515 False False False \n",
"... ... ... ... \n",
"18201 False False False \n",
"7436 False False False \n",
"7728 False False False \n",
"1136 False False False \n",
"10640 False False False \n",
"\n",
" Doors_Четырехдверный Wheel_Left wheel Wheel_Right-hand drive \\\n",
"15146 True True False \n",
"14145 True False True \n",
"8943 True True False \n",
"17889 True True False \n",
"9515 True True False \n",
"... ... ... ... \n",
"18201 True True False \n",
"7436 True True False \n",
"7728 True True False \n",
"1136 True True False \n",
"10640 True True False \n",
"\n",
" Age_bin_Новый Age_bin_Средний Age_bin_Старый Age_bin_Очень старый \n",
"15146 True False False False \n",
"14145 False False True False \n",
"8943 False True False False \n",
"17889 False False False True \n",
"9515 True False False False \n",
"... ... ... ... ... \n",
"18201 False False True False \n",
"7436 True False False False \n",
"7728 False False True False \n",
"1136 True False False False \n",
"10640 False True False False \n",
"\n",
"[8817 rows x 46 columns]"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"scaler = StandardScaler()\n",
"\n",
"numeric_features_for_stardartization = [\n",
" \"Price\",\n",
" \"Levy\",\n",
" \"Engine volume\",\n",
" \"Mileage\",\n",
" \"Cylinders\",\n",
" \"Airbags\",\n",
" \"Age\",\n",
"]\n",
"\n",
"train_df[numeric_features_for_stardartization] = scaler.fit_transform(\n",
" train_df[numeric_features_for_stardartization]\n",
")\n",
"test_df[numeric_features_for_stardartization] = scaler.transform(\n",
" test_df[numeric_features_for_stardartization]\n",
")\n",
"\n",
"train_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Конструирование признаков с помощью Featuretools"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\featuretools\\entityset\\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column\n",
" warnings.warn(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n",
"c:\\Users\\user\\source\\repos\\mai_pi-33_zakharov\\.venv\\Lib\\site-packages\\woodwork\\type_sys\\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n",
" pd.to_datetime(\n"
]
}
],
"source": [
"es = ft.EntitySet(id=\"car_data\")\n",
"es = es.add_dataframe(dataframe_name=\"train\", dataframe=train_df, index=\"id\")\n",
"feature_matrix, feature_defs = ft.dfs(\n",
" entityset=es,\n",
" target_dataframe_name=\"train\",\n",
" max_depth=1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Feature: Price>,\n",
" <Feature: Levy>,\n",
" <Feature: Manufacturer>,\n",
" <Feature: Model>,\n",
" <Feature: Engine volume>,\n",
" <Feature: Mileage>,\n",
" <Feature: Cylinders>,\n",
" <Feature: Color>,\n",
" <Feature: Airbags>,\n",
" <Feature: Age>,\n",
" <Feature: Leather interior_No>,\n",
" <Feature: Leather interior_Yes>,\n",
" <Feature: Category_Cabriolet>,\n",
" <Feature: Category_Coupe>,\n",
" <Feature: Category_Goods wagon>,\n",
" <Feature: Category_Hatchback>,\n",
" <Feature: Category_Jeep>,\n",
" <Feature: Category_Limousine>,\n",
" <Feature: Category_Microbus>,\n",
" <Feature: Category_Minivan>,\n",
" <Feature: Category_Pickup>,\n",
" <Feature: Category_Sedan>,\n",
" <Feature: Category_Universal>,\n",
" <Feature: Fuel type_CNG>,\n",
" <Feature: Fuel type_Diesel>,\n",
" <Feature: Fuel type_Hybrid>,\n",
" <Feature: Fuel type_Hydrogen>,\n",
" <Feature: Fuel type_LPG>,\n",
" <Feature: Fuel type_Petrol>,\n",
" <Feature: Fuel type_Plug-in Hybrid>,\n",
" <Feature: Gear box type_Automatic>,\n",
" <Feature: Gear box type_Manual>,\n",
" <Feature: Gear box type_Tiptronic>,\n",
" <Feature: Gear box type_Variator>,\n",
" <Feature: Drive wheels_4x4>,\n",
" <Feature: Drive wheels_Front>,\n",
" <Feature: Drive wheels_Rear>,\n",
" <Feature: Doors_Двухдверный>,\n",
" <Feature: Doors_Многодверный>,\n",
" <Feature: Doors_Четырехдверный>,\n",
" <Feature: Wheel_Left wheel>,\n",
" <Feature: Wheel_Right-hand drive>,\n",
" <Feature: Age_bin_Новый>,\n",
" <Feature: Age_bin_Средний>,\n",
" <Feature: Age_bin_Старый>,\n",
" <Feature: Age_bin_Очень старый>]"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_defs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}