mai_pi-33_zakharov/lab3_2.ipynb at 5ab313468c4fd77d05bdcb99c6f833ce9bd863ae

Zakharov_Rostislav bd8c7a6d2b feat(lab3): finish preps

2024-12-07 10:51:50 +04:00

220 KiB

Raw Blame History

Загрузка набора данных¶

In [91]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import featuretools as ft
import re
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


df = pd.read_csv("../data/car_price_prediction.csv")

df = df.drop(columns=["ID"])

df

Out[91]:

	Price	Levy	Manufacturer	Model	Prod. year	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags
0	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005 km	6.0	Automatic	4x4	04-May	Left wheel	Silver	12
1	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3	192000 km	6.0	Tiptronic	4x4	04-May	Left wheel	Black	8
2	8467	-	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000 km	4.0	Variator	Front	04-May	Right-hand drive	Black	2
3	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966 km	4.0	Automatic	4x4	04-May	Left wheel	White	0
4	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901 km	4.0	Automatic	Front	04-May	Left wheel	Silver	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19232	8467	-	MERCEDES-BENZ	CLK 200	1999	Coupe	Yes	CNG	2.0 Turbo	300000 km	4.0	Manual	Rear	02-Mar	Left wheel	Silver	5
19233	15681	831	HYUNDAI	Sonata	2011	Sedan	Yes	Petrol	2.4	161600 km	4.0	Tiptronic	Front	04-May	Left wheel	Red	8
19234	26108	836	HYUNDAI	Tucson	2010	Jeep	Yes	Diesel	2	116365 km	4.0	Automatic	Front	04-May	Left wheel	Grey	4
19235	5331	1288	CHEVROLET	Captiva	2007	Jeep	Yes	Diesel	2	51258 km	4.0	Automatic	Front	04-May	Left wheel	Black	4
19236	470	753	HYUNDAI	Sonata	2012	Sedan	Yes	Hybrid	2.4	186923 km	4.0	Automatic	Front	04-May	Left wheel	White	12

19237 rows × 17 columns

Анализ датасета и очистка данных¶

In [92]:

df.dtypes

Out[92]:

Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [93]:

df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()

Out[93]:

array([ 3.5,  3. ,  1.3,  2.5,  2. ,  1.8,  2.4,  4. ,  1.6,  3.3,  2.2,
        4.7,  1.5,  4.4,  1.4,  3.6,  2.3,  5.5,  2.8,  3.2,  3.8,  4.6,
        1.2,  5. ,  1.7,  2.9,  0.5,  1.9,  2.7,  4.8,  5.3,  0.4,  1.1,
        2.1,  0.7,  5.4,  3.7,  1. ,  2.6,  0.8,  0.2,  5.7,  6.7,  6.2,
        3.4,  6.3,  4.3,  4.2,  0. , 20. ,  0.3,  5.9,  5.6,  6. ,  0.6,
        6.8,  4.5,  7.3,  0.1,  3.1,  6.4,  3.9,  0.9,  5.2,  5.8])

In [94]:

df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()

Out[94]:

array([186005, 192000, 200000, ..., 140607, 307325, 186923])

In [95]:

df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()

Out[95]:

array([ 1399,  1018,     0,   862,   446,   891,   761,   751,   394,
        1053,  1055,  1079,   810,  2386,  1850,   531,   586,  1249,
        2455,   583,  1537,  1288,   915,  1750,   707,  1077,  1486,
        1091,   650,   382,  1436,  1194,   503,  1017,  1104,   639,
         629,   919,   781,   530,   640,   765,   777,   779,   934,
         769,   645,  1185,  1324,   830,  1187,  1111,   760,   642,
        1604,  1095,   966,   473,  1138,  1811,   988,   917,  1156,
         687, 11714,   836,  1347,  2866,  1646,   259,   609,   697,
         585,   475,   690,   308,  1823,  1361,  1273,   924,   584,
        2078,   831,  1172,   893,  1872,  1885,  1266,   447,  2148,
        1730,   730,   289,   502,   333,  1325,   247,   879,  1342,
        1327,  1598,  1514,  1058,   738,  1935,   481,  1522,  1282,
         456,   880,   900,   798,  1277,   442,  1051,   790,  1292,
        1047,   528,  1211,  1493,  1793,   574,   930,  1998,   271,
         706,  1481,  1677,  1661,  1286,  1408,  1090,   595,  1451,
        1267,   993,  1714,   878,   641,   749,  1511,   603,   353,
         877,  1236,  1141,   397,   784,  1024,  1357,  1301,   770,
         922,  1438,   753,   607,  1363,   638,   490,   431,   565,
         517,   833,   489,  1760,   986,  1841,  1620,  1360,   474,
        1099,   978,  1624,  1946,  1268,  1307,   696,   649,   666,
        2151,   551,   800,   971,  1323,  2377,  1845,  1083,   694,
         463,   419,   345,  1515,  1505,  2056,  1203,   729,   460,
        1356,   876,   911,  1190,   780,   448,  2410,  1848,  1148,
         834,  1275,  1028,  1197,   724,   890,  1705,   505,   789,
        2959,   518,   461,  1719,  2858,  3156,  2225,  2177,  1968,
        1888,  1308,  2736,  1103,   557,  2195,   843,  1664,   723,
        4508,   562,   501,  2018,  1076,  1202,  3301,   691,  1440,
        1869,  1178,   418,  1820,  1413,   488,  1304,   363,  2108,
         521,  1659,    87,  1411,  1528,  3292,  7058,  1578,   627,
         874,  1996,  1488,  5679,  1234,  5603,   400,   889,  3268,
         875,   949,  2265,   441,   742,   425,  2476,  2971,   614,
        1816,  1375,  1405,  2297,  1062,  1113,   420,  2469,   658,
        1951,  2670,  2578,  1995,  1032,   994,  1011,  2421,  1296,
         155,   494,   426,  1086,   961,  2236,  1829,   764,  1834,
        1054,   617,  1529,  2266,   637,   626,  1832,  1016,  2002,
        1756,   746,  1285,  2690,  1118,  5332,   980,  1807,   970,
        1228,  1195,  1132,  1768,  1384,  1080,  7063,  1817,  1452,
        1975,  1368,   702,  1974,  1781,  1036,   944,   663,   364,
        1539,  1345,  1680,  2209,   741,  1575,   695,  1317,   294,
        1525,   424,   997,  1473,  1552,  2819,  2188,  1668,  3057,
         799,  1502,  2606,   552,  1694,  1759,  1110,   399,  1470,
        1174,  5877,  1474,  1688,   526,   686,  5908,  1107,  2070,
        1468,  1246,  1685,   556,  1533,  1917,  1346,   732,   692,
         579,   421,   362,  3505,  1855,  2711,  1586,  3739,   681,
        1708,  2278,  1701,   722,  1482,   928,   827,   832,   527,
         604,   173,  1341,  3329,  1553,   859,   167,   916,   828,
        2082,  1176,  1108,   975,  3008,  1516,  2269,  1699,  2073,
        1031,  1503,  2364,  1030,  1442,  5666,  2715,  1437,  2067,
        1426,  2908,  1279,   866,  4283,   279,  2658,  3015,  2004,
        1391,  4736,   748,  1466,   644,   683,  2705,  1297,   731,
        1252,  2216,  3141,  3273,  1518,  1723,  1588,   972,   682,
        1094,   668,   175,   967,   402,  3894,  1960,  1599,  2000,
        2084,  1621,   714,  1109,  3989,   873,  1572,  1163,  1991,
        1716,  1673,  2562,  2874,   965,   462,   605,  1948,  1736,
        3518,  2054,  2467,  1681,  1272,  1205,   750,  2156,  2566,
         115,   524,  3184,   676,  1678,   612,   328,   955,  1441,
        1675,  3965,  2909,   623,   822,   867,  3025,  1993,   792,
         636,  4057,  3743,  2337,  2570,  2418,  2472,  3910,  1662,
        2123,  2628,  3208,  2080,  3699,  2913,   864,  2505,   870,
        7536,  1924,  1671,  1064,  1836,  1866,  4741,   841,  1369,
        5681,  3112,  1366,  2223,  1198,  1039,  3811,  3571,  1387,
        1171,  1365,  1531,  1590, 11706,  2308,  4860,  1641,  1045,
        1901])

In [96]:

df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()

Out[96]:

array([ 6,  4,  8,  1, 12,  3,  2, 16,  5,  7,  9, 10, 14])

In [97]:

df["Doors"].unique()

Out[97]:

array(['04-May', '02-Mar', '>5'], dtype=object)

In [98]:

df["Doors"] = df["Doors"].map(
    {"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()

Out[98]:

array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)

In [99]:

sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()

Out[99]:

array([       1,        3,        6, ...,   627220,   872946, 26307500])

In [100]:

print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")

Количество строк до удаления некорректных значений: 19237
Количество строк после удаления некорректных значений: 17574

In [101]:

sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()

Out[101]:

array([     500,      549,      600, ...,   627220,   872946, 26307500])

In [102]:

sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()

Out[102]:

array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [103]:

df

Out[103]:

	Price	Levy	Manufacturer	Model	Prod. year	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags
0	13328	1399	LEXUS	RX 450	2010	Jeep	Yes	Hybrid	3.5	186005	6	Automatic	4x4	Четырехдверный	Left wheel	Silver	12
1	16621	1018	CHEVROLET	Equinox	2011	Jeep	No	Petrol	3.0	192000	6	Tiptronic	4x4	Четырехдверный	Left wheel	Black	8
2	8467	0	HONDA	FIT	2006	Hatchback	No	Petrol	1.3	200000	4	Variator	Front	Четырехдверный	Right-hand drive	Black	2
3	3607	862	FORD	Escape	2011	Jeep	Yes	Hybrid	2.5	168966	4	Automatic	4x4	Четырехдверный	Left wheel	White	0
4	11726	446	HONDA	FIT	2014	Hatchback	Yes	Petrol	1.3	91901	4	Automatic	Front	Четырехдверный	Left wheel	Silver	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
19231	5802	1055	MERCEDES-BENZ	E 350	2013	Sedan	Yes	Diesel	3.5	107800	6	Automatic	Rear	Четырехдверный	Left wheel	Grey	12
19232	8467	0	MERCEDES-BENZ	CLK 200	1999	Coupe	Yes	CNG	2.0	300000	4	Manual	Rear	Двухдверный	Left wheel	Silver	5
19233	15681	831	HYUNDAI	Sonata	2011	Sedan	Yes	Petrol	2.4	161600	4	Tiptronic	Front	Четырехдверный	Left wheel	Red	8
19234	26108	836	HYUNDAI	Tucson	2010	Jeep	Yes	Diesel	2.0	116365	4	Automatic	Front	Четырехдверный	Left wheel	Grey	4
19235	5331	1288	CHEVROLET	Captiva	2007	Jeep	Yes	Diesel	2.0	51258	4	Automatic	Front	Четырехдверный	Left wheel	Black	4

17574 rows × 17 columns

Очистка дубликатов и пропущенных значений¶

In [104]:

df.duplicated().sum()

Out[104]:

np.int64(2773)

In [105]:

df.drop_duplicates(inplace=True)

In [106]:

df.isna().sum()

Out[106]:

Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

Очистка выбросов¶

In [107]:

df.dtypes

Out[107]:

Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object

In [108]:

numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1

No description has been provided for this image

In [109]:

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Количество строк до удаления выбросов: {len(df)}")

for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)

print(f"Количество строк после удаления выбросов: {len(df)}")

Количество строк до удаления выбросов: 14801
Количество строк после удаления выбросов: 12597

In [110]:

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1

Разбиение на выборки¶

In [112]:

X = df
y = df["Category"]

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(train_df.Category.value_counts())
print(f"Тестовая выборка: {test_df.shape[0]} записей")
print(test_df.Category.value_counts())

Размеры выборок:
Обучающая выборка: 8817 записей
Category
Sedan          3954
Jeep           2263
Hatchback      1554
Minivan         312
Coupe           251
Universal       180
Microbus        143
Goods wagon     120
Pickup           22
Cabriolet        16
Limousine         2
Name: count, dtype: int64
Тестовая выборка: 3780 записей
Category
Sedan          1692
Jeep            990
Hatchback       636
Minivan         151
Coupe           117
Universal        82
Goods wagon      52
Microbus         46
Pickup            8
Cabriolet         5
Limousine         1
Name: count, dtype: int64

Oversampling¶

In [113]:

def oversample(df):
    X = df.drop("Category", axis=1)
    y = df["Category"]

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)  # type: ignore

    resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
    return resampled_df


train_df_overs = oversample(train_df)
test_df_overs = oversample(test_df)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df_overs.shape[0]} записей")
print(train_df_overs.Category.value_counts())
print(f"Тестовая выборка: {test_df_overs.shape[0]} записей")
print(test_df_overs.Category.value_counts())

Размеры выборок:
Обучающая выборка: 43494 записей
Category
Sedan          3954
Jeep           3954
Universal      3954
Hatchback      3954
Coupe          3954
Goods wagon    3954
Minivan        3954
Microbus       3954
Pickup         3954
Limousine      3954
Cabriolet      3954
Name: count, dtype: int64
Тестовая выборка: 18612 записей
Category
Hatchback      1692
Sedan          1692
Universal      1692
Jeep           1692
Coupe          1692
Minivan        1692
Goods wagon    1692
Microbus       1692
Pickup         1692
Cabriolet      1692
Limousine      1692
Name: count, dtype: int64

Ручной синтез признаков.¶

In [114]:

def age_create(df): 
    df["Age"] = 2020 - df["Prod. year"]
    df = df.drop("Prod. year", axis=1)
    return df

train_df = age_create(train_df)
test_df = age_create(test_df)

In [115]:

sorted_df = train_df.sort_values(by="Age")
sorted_df["Age"].unique()

Out[115]:

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21])

In [116]:

plt.figure(figsize=(4, 30))
plt.subplot(6, 1, i)
train_df.boxplot(column="Age")

Out[116]:

<Axes: >

Дискретизация числовых признаков¶

In [117]:

train_df.dtypes

Out[117]:

Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
Age                   int64
dtype: object

In [118]:

numeric_features_for_discritization = ["Age"]

def discretize_features(df, features, bins=4, labels=["Новый", "Средний", "Старый", "Очень старый"]):
    for feature in features:
        try:
            df[f"{feature}_bin"] = pd.cut(df[feature], bins=bins, labels=labels)  # type: ignore
        except Exception as e:
            print(f"Ошибка при дискретизации признака {feature}: {e}")
    return df


train_df = discretize_features(train_df, numeric_features_for_discritization)
test_df = discretize_features(test_df, numeric_features_for_discritization)

train_df

Out[118]:

	Price	Levy	Manufacturer	Model	Category	Leather interior	Fuel type	Engine volume	Mileage	Cylinders	Gear box type	Drive wheels	Doors	Wheel	Color	Airbags	Age	Age_bin
15146	18503	0	TOYOTA	Prius	Sedan	No	Petrol	1.8	13000	4	Automatic	Front	Четырехдверный	Left wheel	White	4	0	Новый
14145	9722	0	TOYOTA	Ractis	Sedan	No	Petrol	1.5	116800	4	Tiptronic	Front	Четырехдверный	Right-hand drive	Brown	2	13	Старый
8943	15367	584	HYUNDAI	Elantra	Sedan	No	Petrol	1.8	78222	4	Tiptronic	Front	Четырехдверный	Left wheel	Beige	10	6	Средний
17889	11917	0	SUBARU	Forester L.L.BEAN	Jeep	Yes	CNG	2.5	220000	4	Automatic	4x4	Четырехдверный	Left wheel	Green	5	16	Очень старый
9515	46919	1327	HYUNDAI	H1	Universal	Yes	Diesel	2.5	71689	4	Automatic	Front	Четырехдверный	Left wheel	Grey	4	2	Новый
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18201	10349	0	AUDI	A4	Sedan	Yes	Petrol	2.4	150000	6	Manual	4x4	Четырехдверный	Left wheel	Grey	4	13	Старый
7436	2038	765	KIA	Avella	Sedan	Yes	Petrol	2.0	125621	4	Automatic	Front	Четырехдверный	Left wheel	Silver	12	5	Новый
7728	13485	843	TOYOTA	Prius	Hatchback	No	Hybrid	1.5	212000	4	Variator	Front	Четырехдверный	Left wheel	Silver	8	12	Старый
1136	15677	0	FORD	Fiesta	Sedan	No	Petrol	1.6	74800	4	Automatic	Front	Четырехдверный	Left wheel	Silver	8	4	Новый
10640	16308	751	KIA	Optima EX	Sedan	Yes	Petrol	2.4	92000	12	Tiptronic	Front	Четырехдверный	Left wheel	Silver	8	7	Средний

8817 rows × 18 columns

Унитарное кодирование категориальных признаков¶

In [119]:

train_df.dtypes

Out[119]:

Price                  int64
Levy                   int64
Manufacturer          object
Model                 object
Category              object
Leather interior      object
Fuel type             object
Engine volume        float64
Mileage                int64
Cylinders              int64
Gear box type         object
Drive wheels          object
Doors                 object
Wheel                 object
Color                 object
Airbags                int64
Age                    int64
Age_bin             category
dtype: object

In [120]:

categorical_features_for_encoding = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    "Doors",
    "Wheel",
    "Age_bin",
]

train_df = pd.get_dummies(train_df, columns=categorical_features_for_encoding)
test_df = pd.get_dummies(test_df, columns=categorical_features_for_encoding)

train_df

Out[120]:

	Price	Levy	Manufacturer	Model	Engine volume	Mileage	Cylinders	Color	Airbags	Age	...	Drive wheels_Rear	Doors_Двухдверный	Doors_Многодверный	Doors_Четырехдверный	Wheel_Left wheel	Wheel_Right-hand drive	Age_bin_Новый	Age_bin_Средний	Age_bin_Старый	Age_bin_Очень старый
15146	18503	0	TOYOTA	Prius	1.8	13000	4	White	4	0	...	False	False	False	True	True	False	True	False	False	False
14145	9722	0	TOYOTA	Ractis	1.5	116800	4	Brown	2	13	...	False	False	False	True	False	True	False	False	True	False
8943	15367	584	HYUNDAI	Elantra	1.8	78222	4	Beige	10	6	...	False	False	False	True	True	False	False	True	False	False
17889	11917	0	SUBARU	Forester L.L.BEAN	2.5	220000	4	Green	5	16	...	False	False	False	True	True	False	False	False	False	True
9515	46919	1327	HYUNDAI	H1	2.5	71689	4	Grey	4	2	...	False	False	False	True	True	False	True	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18201	10349	0	AUDI	A4	2.4	150000	6	Grey	4	13	...	False	False	False	True	True	False	False	False	True	False
7436	2038	765	KIA	Avella	2.0	125621	4	Silver	12	5	...	False	False	False	True	True	False	True	False	False	False
7728	13485	843	TOYOTA	Prius	1.5	212000	4	Silver	8	12	...	False	False	False	True	True	False	False	False	True	False
1136	15677	0	FORD	Fiesta	1.6	74800	4	Silver	8	4	...	False	False	False	True	True	False	True	False	False	False
10640	16308	751	KIA	Optima EX	2.4	92000	12	Silver	8	7	...	False	False	False	True	True	False	False	True	False	False

8817 rows × 46 columns

Масштабирование признаков¶

In [121]:

train_df.dtypes

Out[121]:

Price                         int64
Levy                          int64
Manufacturer                 object
Model                        object
Engine volume               float64
Mileage                       int64
Cylinders                     int64
Color                        object
Airbags                       int64
Age                           int64
Leather interior_No            bool
Leather interior_Yes           bool
Category_Cabriolet             bool
Category_Coupe                 bool
Category_Goods wagon           bool
Category_Hatchback             bool
Category_Jeep                  bool
Category_Limousine             bool
Category_Microbus              bool
Category_Minivan               bool
Category_Pickup                bool
Category_Sedan                 bool
Category_Universal             bool
Fuel type_CNG                  bool
Fuel type_Diesel               bool
Fuel type_Hybrid               bool
Fuel type_Hydrogen             bool
Fuel type_LPG                  bool
Fuel type_Petrol               bool
Fuel type_Plug-in Hybrid       bool
Gear box type_Automatic        bool
Gear box type_Manual           bool
Gear box type_Tiptronic        bool
Gear box type_Variator         bool
Drive wheels_4x4               bool
Drive wheels_Front             bool
Drive wheels_Rear              bool
Doors_Двухдверный              bool
Doors_Многодверный             bool
Doors_Четырехдверный           bool
Wheel_Left wheel               bool
Wheel_Right-hand drive         bool
Age_bin_Новый                  bool
Age_bin_Средний                bool
Age_bin_Старый                 bool
Age_bin_Очень старый           bool
dtype: object

In [122]:

scaler = StandardScaler()

numeric_features_for_stardartization = [
    "Price",
    "Levy",
    "Engine volume",
    "Mileage",
    "Cylinders",
    "Airbags",
    "Age",
]

train_df[numeric_features_for_stardartization] = scaler.fit_transform(
    train_df[numeric_features_for_stardartization]
)
test_df[numeric_features_for_stardartization] = scaler.transform(
    test_df[numeric_features_for_stardartization]
)

train_df

Out[122]:

	Price	Levy	Manufacturer	Model	Engine volume	Mileage	Cylinders	Color	Airbags	Age	...	Drive wheels_Rear	Doors_Двухдверный	Doors_Многодверный	Doors_Четырехдверный	Wheel_Left wheel	Wheel_Right-hand drive	Age_bin_Новый	Age_bin_Средний	Age_bin_Старый	Age_bin_Очень старый
15146	0.153774	-1.192982	TOYOTA	Prius	-0.479341	-1.531744	-0.403213	White	-0.683755	-1.946936	...	False	False	False	True	True	False	True	False	False	False
14145	-0.658018	-1.192982	TOYOTA	Ractis	-0.887855	-0.130245	-0.403213	Brown	-1.190217	0.879266	...	False	False	False	True	False	True	False	False	True	False
8943	-0.136145	0.081576	HYUNDAI	Elantra	-0.479341	-0.651122	-0.403213	Beige	0.835631	-0.642535	...	False	False	False	True	True	False	False	True	False	False
17889	-0.455093	-1.192982	SUBARU	Forester L.L.BEAN	0.473858	1.263152	-0.403213	Green	-0.430524	1.531466	...	False	False	False	True	True	False	False	False	False	True
9515	2.780795	1.703146	HYUNDAI	H1	0.473858	-0.739330	-0.403213	Grey	-0.683755	-1.512135	...	False	False	False	True	True	False	True	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
18201	-0.600053	-1.192982	AUDI	A4	0.337687	0.318018	1.538421	Grey	-0.683755	0.879266	...	False	False	False	True	True	False	False	False	True	False
7436	-1.368394	0.476602	KIA	Avella	-0.206998	-0.011145	-0.403213	Silver	1.342092	-0.859935	...	False	False	False	True	True	False	True	False	False	False
7728	-0.310134	0.646834	TOYOTA	Prius	-0.887855	1.155137	-0.403213	Silver	0.329169	0.661866	...	False	False	False	True	True	False	False	False	True	False
1136	-0.107486	-1.192982	FORD	Fiesta	-0.751684	-0.697325	-0.403213	Silver	0.329169	-1.077335	...	False	False	False	True	True	False	True	False	False	False
10640	-0.049151	0.446048	KIA	Optima EX	0.337687	-0.465093	7.363324	Silver	0.329169	-0.425135	...	False	False	False	True	True	False	False	True	False	False

8817 rows × 46 columns

Конструирование признаков с помощью Featuretools¶

In [123]:

es = ft.EntitySet(id="car_data")
es = es.add_dataframe(dataframe_name="train", dataframe=train_df, index="id")
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    max_depth=1,
)

c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\featuretools\entityset\entityset.py:1733: UserWarning: index id not found in dataframe, creating new integer column
warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\woodwork\type_sys\utils.py:33: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
pd.to_datetime(

In [124]:

feature_defs

Out[124]:

[<Feature: Price>,
 <Feature: Levy>,
 <Feature: Manufacturer>,
 <Feature: Model>,
 <Feature: Engine volume>,
 <Feature: Mileage>,
 <Feature: Cylinders>,
 <Feature: Color>,
 <Feature: Airbags>,
 <Feature: Age>,
 <Feature: Leather interior_No>,
 <Feature: Leather interior_Yes>,
 <Feature: Category_Cabriolet>,
 <Feature: Category_Coupe>,
 <Feature: Category_Goods wagon>,
 <Feature: Category_Hatchback>,
 <Feature: Category_Jeep>,
 <Feature: Category_Limousine>,
 <Feature: Category_Microbus>,
 <Feature: Category_Minivan>,
 <Feature: Category_Pickup>,
 <Feature: Category_Sedan>,
 <Feature: Category_Universal>,
 <Feature: Fuel type_CNG>,
 <Feature: Fuel type_Diesel>,
 <Feature: Fuel type_Hybrid>,
 <Feature: Fuel type_Hydrogen>,
 <Feature: Fuel type_LPG>,
 <Feature: Fuel type_Petrol>,
 <Feature: Fuel type_Plug-in Hybrid>,
 <Feature: Gear box type_Automatic>,
 <Feature: Gear box type_Manual>,
 <Feature: Gear box type_Tiptronic>,
 <Feature: Gear box type_Variator>,
 <Feature: Drive wheels_4x4>,
 <Feature: Drive wheels_Front>,
 <Feature: Drive wheels_Rear>,
 <Feature: Doors_Двухдверный>,
 <Feature: Doors_Многодверный>,
 <Feature: Doors_Четырехдверный>,
 <Feature: Wheel_Left wheel>,
 <Feature: Wheel_Right-hand drive>,
 <Feature: Age_bin_Новый>,
 <Feature: Age_bin_Средний>,
 <Feature: Age_bin_Старый>,
 <Feature: Age_bin_Очень старый>]

220 KiB Raw Blame History Unescape Escape

Загрузка набора данных¶

Анализ датасета и очистка данных¶

Очистка дубликатов и пропущенных значений¶

Очистка выбросов¶

Разбиение на выборки¶

Oversampling¶

Ручной синтез признаков.¶

Дискретизация числовых признаков¶

Унитарное кодирование категориальных признаков¶

Масштабирование признаков¶

Конструирование признаков с помощью Featuretools¶

220 KiB

Raw Blame History