205 KiB
Raw Blame History

Загрузка набора данных

In [971]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


df = pd.read_csv("../data/car_price_prediction.csv")

df = df.drop(columns=["ID"])

df
Out[971]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8
2 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19232 8467 - MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5
19233 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8
19234 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4
19235 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4
19236 470 753 HYUNDAI Sonata 2012 Sedan Yes Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12

19237 rows × 17 columns

Анализ датасета и очистка данных

In [972]:
df.dtypes
Out[972]:
Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [973]:
df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()
Out[973]:
array([ 3.5,  3. ,  1.3,  2.5,  2. ,  1.8,  2.4,  4. ,  1.6,  3.3,  2.2,
        4.7,  1.5,  4.4,  1.4,  3.6,  2.3,  5.5,  2.8,  3.2,  3.8,  4.6,
        1.2,  5. ,  1.7,  2.9,  0.5,  1.9,  2.7,  4.8,  5.3,  0.4,  1.1,
        2.1,  0.7,  5.4,  3.7,  1. ,  2.6,  0.8,  0.2,  5.7,  6.7,  6.2,
        3.4,  6.3,  4.3,  4.2,  0. , 20. ,  0.3,  5.9,  5.6,  6. ,  0.6,
        6.8,  4.5,  7.3,  0.1,  3.1,  6.4,  3.9,  0.9,  5.2,  5.8])
In [974]:
df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()
Out[974]:
array([186005, 192000, 200000, ..., 140607, 307325, 186923])
In [975]:
df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()
Out[975]:
array([ 1399,  1018,     0,   862,   446,   891,   761,   751,   394,
        1053,  1055,  1079,   810,  2386,  1850,   531,   586,  1249,
        2455,   583,  1537,  1288,   915,  1750,   707,  1077,  1486,
        1091,   650,   382,  1436,  1194,   503,  1017,  1104,   639,
         629,   919,   781,   530,   640,   765,   777,   779,   934,
         769,   645,  1185,  1324,   830,  1187,  1111,   760,   642,
        1604,  1095,   966,   473,  1138,  1811,   988,   917,  1156,
         687, 11714,   836,  1347,  2866,  1646,   259,   609,   697,
         585,   475,   690,   308,  1823,  1361,  1273,   924,   584,
        2078,   831,  1172,   893,  1872,  1885,  1266,   447,  2148,
        1730,   730,   289,   502,   333,  1325,   247,   879,  1342,
        1327,  1598,  1514,  1058,   738,  1935,   481,  1522,  1282,
         456,   880,   900,   798,  1277,   442,  1051,   790,  1292,
        1047,   528,  1211,  1493,  1793,   574,   930,  1998,   271,
         706,  1481,  1677,  1661,  1286,  1408,  1090,   595,  1451,
        1267,   993,  1714,   878,   641,   749,  1511,   603,   353,
         877,  1236,  1141,   397,   784,  1024,  1357,  1301,   770,
         922,  1438,   753,   607,  1363,   638,   490,   431,   565,
         517,   833,   489,  1760,   986,  1841,  1620,  1360,   474,
        1099,   978,  1624,  1946,  1268,  1307,   696,   649,   666,
        2151,   551,   800,   971,  1323,  2377,  1845,  1083,   694,
         463,   419,   345,  1515,  1505,  2056,  1203,   729,   460,
        1356,   876,   911,  1190,   780,   448,  2410,  1848,  1148,
         834,  1275,  1028,  1197,   724,   890,  1705,   505,   789,
        2959,   518,   461,  1719,  2858,  3156,  2225,  2177,  1968,
        1888,  1308,  2736,  1103,   557,  2195,   843,  1664,   723,
        4508,   562,   501,  2018,  1076,  1202,  3301,   691,  1440,
        1869,  1178,   418,  1820,  1413,   488,  1304,   363,  2108,
         521,  1659,    87,  1411,  1528,  3292,  7058,  1578,   627,
         874,  1996,  1488,  5679,  1234,  5603,   400,   889,  3268,
         875,   949,  2265,   441,   742,   425,  2476,  2971,   614,
        1816,  1375,  1405,  2297,  1062,  1113,   420,  2469,   658,
        1951,  2670,  2578,  1995,  1032,   994,  1011,  2421,  1296,
         155,   494,   426,  1086,   961,  2236,  1829,   764,  1834,
        1054,   617,  1529,  2266,   637,   626,  1832,  1016,  2002,
        1756,   746,  1285,  2690,  1118,  5332,   980,  1807,   970,
        1228,  1195,  1132,  1768,  1384,  1080,  7063,  1817,  1452,
        1975,  1368,   702,  1974,  1781,  1036,   944,   663,   364,
        1539,  1345,  1680,  2209,   741,  1575,   695,  1317,   294,
        1525,   424,   997,  1473,  1552,  2819,  2188,  1668,  3057,
         799,  1502,  2606,   552,  1694,  1759,  1110,   399,  1470,
        1174,  5877,  1474,  1688,   526,   686,  5908,  1107,  2070,
        1468,  1246,  1685,   556,  1533,  1917,  1346,   732,   692,
         579,   421,   362,  3505,  1855,  2711,  1586,  3739,   681,
        1708,  2278,  1701,   722,  1482,   928,   827,   832,   527,
         604,   173,  1341,  3329,  1553,   859,   167,   916,   828,
        2082,  1176,  1108,   975,  3008,  1516,  2269,  1699,  2073,
        1031,  1503,  2364,  1030,  1442,  5666,  2715,  1437,  2067,
        1426,  2908,  1279,   866,  4283,   279,  2658,  3015,  2004,
        1391,  4736,   748,  1466,   644,   683,  2705,  1297,   731,
        1252,  2216,  3141,  3273,  1518,  1723,  1588,   972,   682,
        1094,   668,   175,   967,   402,  3894,  1960,  1599,  2000,
        2084,  1621,   714,  1109,  3989,   873,  1572,  1163,  1991,
        1716,  1673,  2562,  2874,   965,   462,   605,  1948,  1736,
        3518,  2054,  2467,  1681,  1272,  1205,   750,  2156,  2566,
         115,   524,  3184,   676,  1678,   612,   328,   955,  1441,
        1675,  3965,  2909,   623,   822,   867,  3025,  1993,   792,
         636,  4057,  3743,  2337,  2570,  2418,  2472,  3910,  1662,
        2123,  2628,  3208,  2080,  3699,  2913,   864,  2505,   870,
        7536,  1924,  1671,  1064,  1836,  1866,  4741,   841,  1369,
        5681,  3112,  1366,  2223,  1198,  1039,  3811,  3571,  1387,
        1171,  1365,  1531,  1590, 11706,  2308,  4860,  1641,  1045,
        1901])
In [976]:
df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()
Out[976]:
array([ 6,  4,  8,  1, 12,  3,  2, 16,  5,  7,  9, 10, 14])
In [977]:
df["Doors"].unique()
Out[977]:
array(['04-May', '02-Mar', '>5'], dtype=object)
In [978]:
df["Doors"] = df["Doors"].map(
    {"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()
Out[978]:
array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)
In [979]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[979]:
array([       1,        3,        6, ...,   627220,   872946, 26307500])
In [980]:
print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
Количество строк до удаления некорректных значений: 19237
Количество строк после удаления некорректных значений: 17574
In [981]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[981]:
array([     500,      549,      600, ...,   627220,   872946, 26307500])
In [982]:
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[982]:
array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020])

Ручной синтез признаков.

In [983]:
df["Age"] = 2020 - df["Prod. year"]
df = df.drop("Prod. year", axis=1)
sorted_df = df.sort_values(by="Age")
sorted_df["Age"].unique()
Out[983]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 42, 43, 46, 47, 52, 55, 56, 63, 67, 77])
In [984]:
df
Out[984]:
Price Levy Manufacturer Model Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags Age
0 13328 1399 LEXUS RX 450 Jeep Yes Hybrid 3.5 186005 6 Automatic 4x4 Четырехдверный Left wheel Silver 12 10
1 16621 1018 CHEVROLET Equinox Jeep No Petrol 3.0 192000 6 Tiptronic 4x4 Четырехдверный Left wheel Black 8 9
2 8467 0 HONDA FIT Hatchback No Petrol 1.3 200000 4 Variator Front Четырехдверный Right-hand drive Black 2 14
3 3607 862 FORD Escape Jeep Yes Hybrid 2.5 168966 4 Automatic 4x4 Четырехдверный Left wheel White 0 9
4 11726 446 HONDA FIT Hatchback Yes Petrol 1.3 91901 4 Automatic Front Четырехдверный Left wheel Silver 4 6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19231 5802 1055 MERCEDES-BENZ E 350 Sedan Yes Diesel 3.5 107800 6 Automatic Rear Четырехдверный Left wheel Grey 12 7
19232 8467 0 MERCEDES-BENZ CLK 200 Coupe Yes CNG 2.0 300000 4 Manual Rear Двухдверный Left wheel Silver 5 21
19233 15681 831 HYUNDAI Sonata Sedan Yes Petrol 2.4 161600 4 Tiptronic Front Четырехдверный Left wheel Red 8 9
19234 26108 836 HYUNDAI Tucson Jeep Yes Diesel 2.0 116365 4 Automatic Front Четырехдверный Left wheel Grey 4 10
19235 5331 1288 CHEVROLET Captiva Jeep Yes Diesel 2.0 51258 4 Automatic Front Четырехдверный Left wheel Black 4 13

17574 rows × 17 columns

Очистка дубликатов и пропущенных значений

In [985]:
df.duplicated().sum()
Out[985]:
np.int64(2773)
In [986]:
df.drop_duplicates(inplace=True)
In [987]:
df.isna().sum()
Out[987]:
Price               0
Levy                0
Manufacturer        0
Model               0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
Age                 0
dtype: int64

Очистка выбросов

In [988]:
df.dtypes
Out[988]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object
In [989]:
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Age",
]

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [990]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Количество строк до удаления выбросов: {len(df)}")

for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)

print(f"Количество строк после удаления выбросов: {len(df)}")
Количество строк до удаления выбросов: 14801
Количество строк после удаления выбросов: 12597
In [991]:
i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Разбиение на выборки

In [992]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(f"Тестовая выборка: {test_df.shape[0]} записей")
Размеры выборок:
Обучающая выборка: 10077 записей
Тестовая выборка: 2520 записей

Дискретизация числовых признаков

In [993]:
train_df.dtypes
Out[993]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object
In [994]:
numeric_features_for_discritization = ["Age"]

def discretize_features(df, features, bins=4, labels=["Новый", "Средний", "Старый", "Очень старый"]):
    for feature in features:
        try:
            df[f"{feature}_bin"] = pd.cut(df[feature], bins=bins, labels=labels)  # type: ignore
        except Exception as e:
            print(f"Ошибка при дискретизации признака {feature}: {e}")
    return df


train_df = discretize_features(train_df, numeric_features_for_discritization)
test_df = discretize_features(test_df, numeric_features_for_discritization)

train_df
Out[994]:
Price Levy Manufacturer Model Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags Age Age_bin
14829 6743 966 DAEWOO Lacetti Sedan Yes Diesel 2.0 62227 4 Automatic Front Четырехдверный Left wheel White 4 11 Старый
3632 20005 583 HYUNDAI Elantra Sedan Yes Petrol 1.6 94479 4 Automatic Front Четырехдверный Left wheel Red 4 9 Средний
4982 13172 836 DODGE Caliber Hatchback No Petrol 2.0 114000 4 Variator Front Четырехдверный Left wheel Silver 8 10 Средний
16758 8781 584 HYUNDAI Elantra Sedan Yes Petrol 1.8 60000 4 Tiptronic Front Четырехдверный Left wheel Grey 10 6 Средний
6875 25086 0 TOYOTA Prius Hatchback No Hybrid 1.8 0 4 Automatic Front Четырехдверный Left wheel Silver 12 5 Новый
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 10349 0 AUDI A4 Sedan Yes Petrol 2.4 150000 6 Manual 4x4 Четырехдверный Left wheel Grey 4 13 Старый
7436 2038 765 KIA Avella Sedan Yes Petrol 2.0 125621 4 Automatic Front Четырехдверный Left wheel Silver 12 5 Новый
7728 13485 843 TOYOTA Prius Hatchback No Hybrid 1.5 212000 4 Variator Front Четырехдверный Left wheel Silver 8 12 Старый
1136 15677 0 FORD Fiesta Sedan No Petrol 1.6 74800 4 Automatic Front Четырехдверный Left wheel Silver 8 4 Новый
10640 16308 751 KIA Optima EX Sedan Yes Petrol 2.4 92000 12 Tiptronic Front Четырехдверный Left wheel Silver 8 7 Средний

10077 rows × 18 columns

Унитарное кодирование категориальных признаков

In [995]:
train_df.dtypes
Out[995]:
Price                  int64
Levy                   int64
Manufacturer          object
Model                 object
Category              object
Leather interior      object
Fuel type             object
Engine volume        float64
Mileage                int64
Cylinders              int64
Gear box type         object
Drive wheels          object
Doors                 object
Wheel                 object
Color                 object
Airbags                int64
Age                    int64
Age_bin             category
dtype: object
In [996]:
categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]

train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)

train_df
Out[996]:
Price Levy Manufacturer Model Engine volume Mileage Cylinders Color Airbags Age ... Drive wheels_Rear Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный Wheel_Left wheel Wheel_Right-hand drive Age_bin_Новый Age_bin_Средний Age_bin_Старый Age_bin_Очень старый
14829 6743 966 DAEWOO Lacetti 2.0 62227 4 White 4 11 ... False False False True True False False False True False
3632 20005 583 HYUNDAI Elantra 1.6 94479 4 Red 4 9 ... False False False True True False False True False False
4982 13172 836 DODGE Caliber 2.0 114000 4 Silver 8 10 ... False False False True True False False True False False
16758 8781 584 HYUNDAI Elantra 1.8 60000 4 Grey 10 6 ... False False False True True False False True False False
6875 25086 0 TOYOTA Prius 1.8 0 4 Silver 12 5 ... False False False True True False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 10349 0 AUDI A4 2.4 150000 6 Grey 4 13 ... False False False True True False False False True False
7436 2038 765 KIA Avella 2.0 125621 4 Silver 12 5 ... False False False True True False True False False False
7728 13485 843 TOYOTA Prius 1.5 212000 4 Silver 8 12 ... False False False True True False False False True False
1136 15677 0 FORD Fiesta 1.6 74800 4 Silver 8 4 ... False False False True True False True False False False
10640 16308 751 KIA Optima EX 2.4 92000 12 Silver 8 7 ... False False False True True False False True False False

10077 rows × 46 columns

Масштабирование признаков

In [997]:
train_df.dtypes
Out[997]:
Price                         int64
Levy                          int64
Manufacturer                 object
Model                        object
Engine volume               float64
Mileage                       int64
Cylinders                     int64
Color                        object
Airbags                       int64
Age                           int64
Leather interior_No            bool
Leather interior_Yes           bool
Category_Cabriolet             bool
Category_Coupe                 bool
Category_Goods wagon           bool
Category_Hatchback             bool
Category_Jeep                  bool
Category_Limousine             bool
Category_Microbus              bool
Category_Minivan               bool
Category_Pickup                bool
Category_Sedan                 bool
Category_Universal             bool
Fuel type_CNG                  bool
Fuel type_Diesel               bool
Fuel type_Hybrid               bool
Fuel type_Hydrogen             bool
Fuel type_LPG                  bool
Fuel type_Petrol               bool
Fuel type_Plug-in Hybrid       bool
Gear box type_Automatic        bool
Gear box type_Manual           bool
Gear box type_Tiptronic        bool
Gear box type_Variator         bool
Drive wheels_4x4               bool
Drive wheels_Front             bool
Drive wheels_Rear              bool
Doors_Двухдверный              bool
Doors_Многодверный             bool
Doors_Четырехдверный           bool
Wheel_Left wheel               bool
Wheel_Right-hand drive         bool
Age_bin_Новый                  bool
Age_bin_Средний                bool
Age_bin_Старый                 bool
Age_bin_Очень старый           bool
dtype: object
In [998]:
scaler = StandardScaler()

numeric_features_for_stardartization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

train_df[numeric_features_for_stardartization] = scaler.fit_transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)

train_df
Out[998]:
Price Levy Manufacturer Model Engine volume Mileage Cylinders Color Airbags Age ... Drive wheels_Rear Doors_Двухдверный Doors_Многодверный Doors_Четырехдверный Wheel_Left wheel Wheel_Right-hand drive Age_bin_Новый Age_bin_Средний Age_bin_Старый Age_bin_Очень старый
14829 -0.936428 0.909873 DAEWOO Lacetti -0.212078 -0.855905 -0.399820 White -0.681491 0.446831 ... False False False True True False False False True False
3632 0.288147 0.076376 HYUNDAI Elantra -0.757467 -0.422001 -0.399820 Red -0.681491 0.013523 ... False False False True True False False True False False
4982 -0.342793 0.626963 DODGE Caliber -0.212078 -0.159374 -0.399820 Silver 0.330763 0.230177 ... False False False True True False False True False False
16758 -0.748245 0.078552 HYUNDAI Elantra -0.484772 -0.885866 -0.399820 Grey 0.836890 -0.636438 ... False False False True True False False True False False
6875 0.757313 -1.192368 TOYOTA Prius -0.484772 -1.693079 -0.399820 Silver 1.343017 -0.853091 ... False False False True True False True False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 -0.603461 -1.192368 AUDI A4 0.333312 0.324954 1.520116 Grey -0.681491 0.880138 ... False False False True True False False False True False
7436 -1.370875 0.472450 KIA Avella -0.212078 -0.003030 -0.399820 Silver 1.343017 -0.853091 ... False False False True True False True False False False
7728 -0.313891 0.642196 TOYOTA Prius -0.893814 1.159074 -0.399820 Silver 0.330763 0.663484 ... False False False True True False False False True False
1136 -0.111488 -1.192368 FORD Fiesta -0.757467 -0.686753 -0.399820 Silver 0.330763 -1.069745 ... False False False True True False True False False False
10640 -0.053223 0.441983 KIA Optima EX 0.333312 -0.455352 7.279922 Silver 0.330763 -0.419784 ... False False False True True False False True False False

10077 rows × 46 columns

Конструирование признаков с помощью Featuretools

In [999]:
es = ft.EntitySet(id="car_data")
es = es.add_dataframe(dataframe_name="train", dataframe=train_df)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)

feature_defs
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:1717: UserWarning: Using first column as index. To change this, specify the index parameter
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:1379: SyntaxWarning: invalid escape sequence '\l'
  columns_string = "\l".join(column_typing_info)  # noqa: W605
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:1381: SyntaxWarning: invalid escape sequence '\l'
  label = "{%s (%d row%s)|%s\l}" % (  # noqa: W605
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[999], line 2
      1 es = ft.EntitySet(id="car_data")
----> 2 es = es.add_dataframe(dataframe_name="train", dataframe=train_df)
      3 feature_matrix, feature_defs = ft.dfs(
      4     entityset=es,
      5     target_dataframe_name="train",
      6     max_depth=1,
      7 )
      9 feature_defs

File c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:687, in EntitySet.add_dataframe(self, dataframe, dataframe_name, index, logical_types, semantic_tags, make_index, time_index, secondary_time_index, already_sorted)
    676     raise ValueError(
    677         "Cannot add dataframe to EntitySet without a name. "
    678         "Please provide a value for the dataframe_name parameter.",
    679     )
    681 index_was_created, index, dataframe = _get_or_create_index(
    682     index,
    683     make_index,
    684     dataframe,
    685 )
--> 687 dataframe.ww.init(
    688     name=dataframe_name,
    689     index=index,
    690     time_index=time_index,
    691     logical_types=logical_types,
    692     semantic_tags=semantic_tags,
    693     already_sorted=already_sorted,
    694 )
    695 if index_was_created:
    696     dataframe.ww.metadata["created_index"] = index

File c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\table_accessor.py:96, in WoodworkTableAccessor.init(self, **kwargs)
     44 def init(self, **kwargs):
     45     """Initializes Woodwork typing information for a DataFrame with a partial schema.
     46 
     47     Logical type priority:
   (...)
     94             Any errors resulting from skipping validation with invalid inputs may not be easily understood.
     95     """
---> 96     self.init_with_partial_schema(**kwargs)

File c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\table_accessor.py:199, in WoodworkTableAccessor.init_with_partial_schema(self, schema, index, time_index, logical_types, ignore_columns, already_sorted, name, semantic_tags, table_metadata, column_metadata, use_standard_tags, column_descriptions, column_origins, null_invalid_values, validate, **kwargs)
    147 """Initializes Woodwork typing information for a DataFrame with a partial schema.
    148 
    149 Logical type priority:
   (...)
    196         Any errors resulting from skipping validation with invalid inputs may not be easily understood.
    197 """
    198 if validate:
--> 199     _validate_accessor_params(
    200         self._dataframe,
    201         index,
    202         time_index,
    203         logical_types,
    204         ignore_columns,
    205         schema,
    206         use_standard_tags,
    207     )
    209 existing_logical_types = {}
    210 existing_col_descriptions = {}

File c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\table_accessor.py:1683, in _validate_accessor_params(dataframe, index, time_index, logical_types, ignore_columns, schema, use_standard_tags)
   1681         index = schema.index
   1682 if index is not None:
-> 1683     _check_index(dataframe, index)
   1684 if logical_types:
   1685     _check_logical_types(dataframe.columns, logical_types)

File c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\table_accessor.py:1718, in _check_index(dataframe, index)
   1715 if index is not None:
   1716     # User specifies a dataframe index that is not unique or contains null values
   1717     if not dataframe[index].is_unique:
-> 1718         raise IndexError("Index column must be unique")
   1720     if dataframe[index].isnull().any():
   1721         raise IndexError("Index contains null values")

IndexError: Index column must be unique