220 KiB
Raw Blame History

Загрузка набора данных

In [91]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


df = pd.read_csv("../data/car_price_prediction.csv")

df = df.drop(columns=["ID"])

df
Out[91]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8
2 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19232 8467 - MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5
19233 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8
19234 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4
19235 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4
19236 470 753 HYUNDAI Sonata 2012 Sedan Yes Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12

19237 rows × 17 columns

Анализ датасета и очистка данных

In [92]:
df.dtypes
Out[92]:
Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [93]:
df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()
Out[93]:
array([ 3.5,  3. ,  1.3,  2.5,  2. ,  1.8,  2.4,  4. ,  1.6,  3.3,  2.2,
        4.7,  1.5,  4.4,  1.4,  3.6,  2.3,  5.5,  2.8,  3.2,  3.8,  4.6,
        1.2,  5. ,  1.7,  2.9,  0.5,  1.9,  2.7,  4.8,  5.3,  0.4,  1.1,
        2.1,  0.7,  5.4,  3.7,  1. ,  2.6,  0.8,  0.2,  5.7,  6.7,  6.2,
        3.4,  6.3,  4.3,  4.2,  0. , 20. ,  0.3,  5.9,  5.6,  6. ,  0.6,
        6.8,  4.5,  7.3,  0.1,  3.1,  6.4,  3.9,  0.9,  5.2,  5.8])
In [94]:
df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()
Out[94]:
array([186005, 192000, 200000, ..., 140607, 307325, 186923])
In [95]:
df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()
Out[95]:
array([ 1399,  1018,     0,   862,   446,   891,   761,   751,   394,
        1053,  1055,  1079,   810,  2386,  1850,   531,   586,  1249,
        2455,   583,  1537,  1288,   915,  1750,   707,  1077,  1486,
        1091,   650,   382,  1436,  1194,   503,  1017,  1104,   639,
         629,   919,   781,   530,   640,   765,   777,   779,   934,
         769,   645,  1185,  1324,   830,  1187,  1111,   760,   642,
        1604,  1095,   966,   473,  1138,  1811,   988,   917,  1156,
         687, 11714,   836,  1347,  2866,  1646,   259,   609,   697,
         585,   475,   690,   308,  1823,  1361,  1273,   924,   584,
        2078,   831,  1172,   893,  1872,  1885,  1266,   447,  2148,
        1730,   730,   289,   502,   333,  1325,   247,   879,  1342,
        1327,  1598,  1514,  1058,   738,  1935,   481,  1522,  1282,
         456,   880,   900,   798,  1277,   442,  1051,   790,  1292,
        1047,   528,  1211,  1493,  1793,   574,   930,  1998,   271,
         706,  1481,  1677,  1661,  1286,  1408,  1090,   595,  1451,
        1267,   993,  1714,   878,   641,   749,  1511,   603,   353,
         877,  1236,  1141,   397,   784,  1024,  1357,  1301,   770,
         922,  1438,   753,   607,  1363,   638,   490,   431,   565,
         517,   833,   489,  1760,   986,  1841,  1620,  1360,   474,
        1099,   978,  1624,  1946,  1268,  1307,   696,   649,   666,
        2151,   551,   800,   971,  1323,  2377,  1845,  1083,   694,
         463,   419,   345,  1515,  1505,  2056,  1203,   729,   460,
        1356,   876,   911,  1190,   780,   448,  2410,  1848,  1148,
         834,  1275,  1028,  1197,   724,   890,  1705,   505,   789,
        2959,   518,   461,  1719,  2858,  3156,  2225,  2177,  1968,
        1888,  1308,  2736,  1103,   557,  2195,   843,  1664,   723,
        4508,   562,   501,  2018,  1076,  1202,  3301,   691,  1440,
        1869,  1178,   418,  1820,  1413,   488,  1304,   363,  2108,
         521,  1659,    87,  1411,  1528,  3292,  7058,  1578,   627,
         874,  1996,  1488,  5679,  1234,  5603,   400,   889,  3268,
         875,   949,  2265,   441,   742,   425,  2476,  2971,   614,
        1816,  1375,  1405,  2297,  1062,  1113,   420,  2469,   658,
        1951,  2670,  2578,  1995,  1032,   994,  1011,  2421,  1296,
         155,   494,   426,  1086,   961,  2236,  1829,   764,  1834,
        1054,   617,  1529,  2266,   637,   626,  1832,  1016,  2002,
        1756,   746,  1285,  2690,  1118,  5332,   980,  1807,   970,
        1228,  1195,  1132,  1768,  1384,  1080,  7063,  1817,  1452,
        1975,  1368,   702,  1974,  1781,  1036,   944,   663,   364,
        1539,  1345,  1680,  2209,   741,  1575,   695,  1317,   294,
        1525,   424,   997,  1473,  1552,  2819,  2188,  1668,  3057,
         799,  1502,  2606,   552,  1694,  1759,  1110,   399,  1470,
        1174,  5877,  1474,  1688,   526,   686,  5908,  1107,  2070,
        1468,  1246,  1685,   556,  1533,  1917,  1346,   732,   692,
         579,   421,   362,  3505,  1855,  2711,  1586,  3739,   681,
        1708,  2278,  1701,   722,  1482,   928,   827,   832,   527,
         604,   173,  1341,  3329,  1553,   859,   167,   916,   828,
        2082,  1176,  1108,   975,  3008,  1516,  2269,  1699,  2073,
        1031,  1503,  2364,  1030,  1442,  5666,  2715,  1437,  2067,
        1426,  2908,  1279,   866,  4283,   279,  2658,  3015,  2004,
        1391,  4736,   748,  1466,   644,   683,  2705,  1297,   731,
        1252,  2216,  3141,  3273,  1518,  1723,  1588,   972,   682,
        1094,   668,   175,   967,   402,  3894,  1960,  1599,  2000,
        2084,  1621,   714,  1109,  3989,   873,  1572,  1163,  1991,
        1716,  1673,  2562,  2874,   965,   462,   605,  1948,  1736,
        3518,  2054,  2467,  1681,  1272,  1205,   750,  2156,  2566,
         115,   524,  3184,   676,  1678,   612,   328,   955,  1441,
        1675,  3965,  2909,   623,   822,   867,  3025,  1993,   792,
         636,  4057,  3743,  2337,  2570,  2418,  2472,  3910,  1662,
        2123,  2628,  3208,  2080,  3699,  2913,   864,  2505,   870,
        7536,  1924,  1671,  1064,  1836,  1866,  4741,   841,  1369,
        5681,  3112,  1366,  2223,  1198,  1039,  3811,  3571,  1387,
        1171,  1365,  1531,  1590, 11706,  2308,  4860,  1641,  1045,
        1901])
In [96]:
df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()
Out[96]:
array([ 6,  4,  8,  1, 12,  3,  2, 16,  5,  7,  9, 10, 14])
In [97]:
df["Doors"].unique()
Out[97]:
array(['04-May', '02-Mar', '>5'], dtype=object)
In [98]:
df["Doors"] = df["Doors"].map(
    {"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()
Out[98]:
array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)
In [99]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[99]:
array([       1,        3,        6, ...,   627220,   872946, 26307500])
In [100]:
print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
Количество строк до удаления некорректных значений: 19237
Количество строк после удаления некорректных значений: 17574
In [101]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[101]:
array([     500,      549,      600, ...,   627220,   872946, 26307500])
In [102]:
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[102]:
array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020])
In [103]:
df
Out[103]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 6 Automatic 4x4 Четырехдверный Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3.0 192000 6 Tiptronic 4x4 Четырехдверный Left wheel Black 8
2 8467 0 HONDA FIT 2006 Hatchback No Petrol 1.3 200000 4 Variator Front Четырехдверный Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 4 Automatic 4x4 Четырехдверный Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 4 Automatic Front Четырехдверный Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19231 5802 1055 MERCEDES-BENZ E 350 2013 Sedan Yes Diesel 3.5 107800 6 Automatic Rear Четырехдверный Left wheel Grey 12
19232 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 300000 4 Manual Rear Двухдверный Left wheel Silver 5
19233 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 4 Tiptronic Front Четырехдверный Left wheel Red 8
19234 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2.0 116365 4 Automatic Front Четырехдверный Left wheel Grey 4
19235 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2.0 51258 4 Automatic Front Четырехдверный Left wheel Black 4

17574 rows × 17 columns

Очистка дубликатов и пропущенных значений

In [104]:
df.duplicated().sum()
Out[104]:
np.int64(2773)
In [105]:
df.drop_duplicates(inplace=True)
In [106]:
df.isna().sum()
Out[106]:
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

Очистка выбросов

In [107]:
df.dtypes
Out[107]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [108]:
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [109]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Количество строк до удаления выбросов: {len(df)}")

for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)

print(f"Количество строк после удаления выбросов: {len(df)}")
Количество строк до удаления выбросов: 14801
Количество строк после удаления выбросов: 12597
In [110]:
i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Разбиение на выборки

In [112]:
X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df.Category.value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df.Category.value_counts())
Размеры выборок:
Обучающая выборка: 8817 записей
Category
Sedan          3954
Jeep           2263
Hatchback      1554
Minivan         312
Coupe           251
Universal       180
Microbus        143
Goods wagon     120
Pickup           22
Cabriolet        16
Limousine         2
Name: count, dtype: int64
Тестовая выборка: 3780 записей
Category
Sedan          1692
Jeep            990
Hatchback       636
Minivan         151
Coupe           117
Universal        82
Goods wagon      52
Microbus         46
Pickup            8
Cabriolet         5
Limousine         1
Name: count, dtype: int64

Oversampling

In [113]:
def oversample(df):
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.Category.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.Category.value_counts())
Размеры выборок:
Обучающая выборка: 43494 записей
Category
Sedan          3954
Jeep           3954
Universal      3954
Hatchback      3954
Coupe          3954
Goods wagon    3954
Minivan        3954
Microbus       3954
Pickup         3954
Limousine      3954
Cabriolet      3954
Name: count, dtype: int64
Тестовая выборка: 18612 записей
Category
Hatchback      1692
Sedan          1692
Universal      1692
Jeep           1692
Coupe          1692
Minivan        1692
Goods wagon    1692
Microbus       1692
Pickup         1692
Cabriolet      1692
Limousine      1692
Name: count, dtype: int64

Ручной синтез признаков.

In [114]:
def age_create(df): 
    df["Age"] = 2020 - df["Prod. year"]
    df = df.drop("Prod. year", axis=1)
    return df

train_df = age_create(train_df)
test_df = age_create(test_df)
In [115]:
sorted_df = train_df.sort_values(by="Age")
sorted_df["Age"].unique()
Out[115]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21])
In [116]:
plt.figure(figsize=(4, 30))
plt.subplot(6, 1, i)
train_df.boxplot(column="Age")
Out[116]:
<Axes: >
No description has been provided for this image

Дискретизация числовых признаков

In [117]:
train_df.dtypes
Out[117]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object
In [118]:
numeric_features_for_discritization = ["Age"]

def discretize_features(df, features, bins=4, labels=["Новый", "Средний", "Старый", "Очень старый"]):
    for feature in features:
        try:
            df[f"{feature}_bin"] = pd.cut(df[feature], bins=bins, labels=labels)  # type: ignore
        except Exception as e:
            print(f"Ошибка при дискретизации признака {feature}: {e}")
    return df


train_df = discretize_features(train_df, numeric_features_for_discritization)
test_df = discretize_features(test_df, numeric_features_for_discritization)

train_df
Out[118]:
Price Levy Manufacturer Model Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags Age Age_bin
15146 18503 0 TOYOTA Prius Sedan No Petrol 1.8 13000 4 Automatic Front Четырехдверный Left wheel White 4 0 Новый
14145 9722 0 TOYOTA Ractis Sedan No Petrol 1.5 116800 4 Tiptronic Front Четырехдверный Right-hand drive Brown 2 13 Старый
8943 15367 584 HYUNDAI Elantra Sedan No Petrol 1.8 78222 4 Tiptronic Front Четырехдверный Left wheel Beige 10 6 Средний
17889 11917 0 SUBARU Forester L.L.BEAN Jeep Yes CNG 2.5 220000 4 Automatic 4x4 Четырехдверный Left wheel Green 5 16 Очень старый
9515 46919 1327 HYUNDAI H1 Universal Yes Diesel 2.5 71689 4 Automatic Front Четырехдверный Left wheel Grey 4 2 Новый
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 10349 0 AUDI A4 Sedan Yes Petrol 2.4 150000 6 Manual 4x4 Четырехдверный Left wheel Grey 4 13 Старый
7436 2038 765 KIA Avella Sedan Yes Petrol 2.0 125621 4 Automatic Front Четырехдверный Left wheel Silver 12 5 Новый
7728 13485 843 TOYOTA Prius Hatchback No Hybrid 1.5 212000 4 Variator Front Четырехдверный Left wheel Silver 8 12 Старый
1136 15677 0 FORD Fiesta Sedan No Petrol 1.6 74800 4 Automatic Front Четырехдверный Left wheel Silver 8 4 Новый
10640 16308 751 KIA Optima EX Sedan Yes Petrol 2.4 92000 12 Tiptronic Front Четырехдверный Left wheel Silver 8 7 Средний

8817 rows × 18 columns

Унитарное кодирование категориальных признаков

In [119]:
train_df.dtypes
Out[119]:
Price                  int64
Levy                   int64
Manufacturer          object
Model                 object
Category              object
Leather interior      object
Fuel type             object
Engine volume        float64
Mileage                int64
Cylinders              int64
Gear box type         object
Drive wheels          object
Doors                 object
Wheel                 object
Color                 object
Airbags                int64
Age                    int64
Age_bin             category
dtype: object
In [120]:
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]

train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)

train_df
Out[120]:
Price Levy Manufacturer Model Engine volume Mileage Cylinders Color Airbags Age ... Drive wheels_Rear Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный Wheel_Left wheel Wheel_Right-hand drive Age_bin_Новый Age_bin_Средний Age_bin_Старый Age_bin_Очень старый
15146 18503 0 TOYOTA Prius 1.8 13000 4 White 4 0 ... False False False True True False True False False False
14145 9722 0 TOYOTA Ractis 1.5 116800 4 Brown 2 13 ... False False False True False True False False True False
8943 15367 584 HYUNDAI Elantra 1.8 78222 4 Beige 10 6 ... False False False True True False False True False False
17889 11917 0 SUBARU Forester L.L.BEAN 2.5 220000 4 Green 5 16 ... False False False True True False False False False True
9515 46919 1327 HYUNDAI H1 2.5 71689 4 Grey 4 2 ... False False False True True False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 10349 0 AUDI A4 2.4 150000 6 Grey 4 13 ... False False False True True False False False True False
7436 2038 765 KIA Avella 2.0 125621 4 Silver 12 5 ... False False False True True False True False False False
7728 13485 843 TOYOTA Prius 1.5 212000 4 Silver 8 12 ... False False False True True False False False True False
1136 15677 0 FORD Fiesta 1.6 74800 4 Silver 8 4 ... False False False True True False True False False False
10640 16308 751 KIA Optima EX 2.4 92000 12 Silver 8 7 ... False False False True True False False True False False

8817 rows × 46 columns

Масштабирование признаков

In [121]:
train_df.dtypes
Out[121]:
Price                         int64
Levy                          int64
Manufacturer                 object
Model                        object
Engine volume               float64
Mileage                       int64
Cylinders                     int64
Color                        object
Airbags                       int64
Age                           int64
Leather interior_No            bool
Leather interior_Yes           bool
Category_Cabriolet             bool
Category_Coupe                 bool
Category_Goods wagon           bool
Category_Hatchback             bool
Category_Jeep                  bool
Category_Limousine             bool
Category_Microbus              bool
Category_Minivan               bool
Category_Pickup                bool
Category_Sedan                 bool
Category_Universal             bool
Fuel type_CNG                  bool
Fuel type_Diesel               bool
Fuel type_Hybrid               bool
Fuel type_Hydrogen             bool
Fuel type_LPG                  bool
Fuel type_Petrol               bool
Fuel type_Plug-in Hybrid       bool
Gear box type_Automatic        bool
Gear box type_Manual           bool
Gear box type_Tiptronic        bool
Gear box type_Variator         bool
Drive wheels_4x4               bool
Drive wheels_Front             bool
Drive wheels_Rear              bool
Doors_Двухдверный              bool
Doors_Многодверный             bool
Doors_Четырехдверный           bool
Wheel_Left wheel               bool
Wheel_Right-hand drive         bool
Age_bin_Новый                  bool
Age_bin_Средний                bool
Age_bin_Старый                 bool
Age_bin_Очень старый           bool
dtype: object
In [122]:
scaler = StandardScaler()

numeric_features_for_stardartization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

train_df[numeric_features_for_stardartization] = scaler.fit_transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)

train_df
Out[122]:
Price Levy Manufacturer Model Engine volume Mileage Cylinders Color Airbags Age ... Drive wheels_Rear Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный Wheel_Left wheel Wheel_Right-hand drive Age_bin_Новый Age_bin_Средний Age_bin_Старый Age_bin_Очень старый
15146 0.153774 -1.192982 TOYOTA Prius -0.479341 -1.531744 -0.403213 White -0.683755 -1.946936 ... False False False True True False True False False False
14145 -0.658018 -1.192982 TOYOTA Ractis -0.887855 -0.130245 -0.403213 Brown -1.190217 0.879266 ... False False False True False True False False True False
8943 -0.136145 0.081576 HYUNDAI Elantra -0.479341 -0.651122 -0.403213 Beige 0.835631 -0.642535 ... False False False True True False False True False False
17889 -0.455093 -1.192982 SUBARU Forester L.L.BEAN 0.473858 1.263152 -0.403213 Green -0.430524 1.531466 ... False False False True True False False False False True
9515 2.780795 1.703146 HYUNDAI H1 0.473858 -0.739330 -0.403213 Grey -0.683755 -1.512135 ... False False False True True False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 -0.600053 -1.192982 AUDI A4 0.337687 0.318018 1.538421 Grey -0.683755 0.879266 ... False False False True True False False False True False
7436 -1.368394 0.476602 KIA Avella -0.206998 -0.011145 -0.403213 Silver 1.342092 -0.859935 ... False False False True True False True False False False
7728 -0.310134 0.646834 TOYOTA Prius -0.887855 1.155137 -0.403213 Silver 0.329169 0.661866 ... False False False True True False False False True False
1136 -0.107486 -1.192982 FORD Fiesta -0.751684 -0.697325 -0.403213 Silver 0.329169 -1.077335 ... False False False True True False True False False False
10640 -0.049151 0.446048 KIA Optima EX 0.337687 -0.465093 7.363324 Silver 0.329169 -0.425135 ... False False False True True False False True False False

8817 rows × 46 columns

Конструирование признаков с помощью Featuretools

In [123]:
es = ft.EntitySet(id="car_data")
es = es.add_dataframe(dataframe_name="train", dataframe=train_df, index="id")
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  pd.to_datetime(
In [124]:
feature_defs
Out[124]:
[<Feature: Price>,
 <Feature: Levy>,
 <Feature: Manufacturer>,
 <Feature: Model>,
 <Feature: Engine volume>,
 <Feature: Mileage>,
 <Feature: Cylinders>,
 <Feature: Color>,
 <Feature: Airbags>,
 <Feature: Age>,
 <Feature: Leather interior_No>,
 <Feature: Leather interior_Yes>,
 <Feature: Category_Cabriolet>,
 <Feature: Category_Coupe>,
 <Feature: Category_Goods wagon>,
 <Feature: Category_Hatchback>,
 <Feature: Category_Jeep>,
 <Feature: Category_Limousine>,
 <Feature: Category_Microbus>,
 <Feature: Category_Minivan>,
 <Feature: Category_Pickup>,
 <Feature: Category_Sedan>,
 <Feature: Category_Universal>,
 <Feature: Fuel type_CNG>,
 <Feature: Fuel type_Diesel>,
 <Feature: Fuel type_Hybrid>,
 <Feature: Fuel type_Hydrogen>,
 <Feature: Fuel type_LPG>,
 <Feature: Fuel type_Petrol>,
 <Feature: Fuel type_Plug-in Hybrid>,
 <Feature: Gear box type_Automatic>,
 <Feature: Gear box type_Manual>,
 <Feature: Gear box type_Tiptronic>,
 <Feature: Gear box type_Variator>,
 <Feature: Drive wheels_4x4>,
 <Feature: Drive wheels_Front>,
 <Feature: Drive wheels_Rear>,
 <Feature: Doors_Двухдверный>,
 <Feature: Doors_Многодверный>,
 <Feature: Doors_Четырехдверный>,
 <Feature: Wheel_Left wheel>,
 <Feature: Wheel_Right-hand drive>,
 <Feature: Age_bin_Новый>,
 <Feature: Age_bin_Средний>,
 <Feature: Age_bin_Старый>,
 <Feature: Age_bin_Очень старый>]