201 KiB
Raw Blame History

Загрузка набора данных

In [732]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import linear_model, tree, neighbors, ensemble
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.pipeline import make_pipeline


df = pd.read_csv("../data/car_price_prediction.csv")

df = df.drop(columns=["ID"])

random_state = 9

set_config(transform_output="pandas")

df
Out[732]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 km 6.0 Automatic 4x4 04-May Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3 192000 km 6.0 Tiptronic 4x4 04-May Left wheel Black 8
2 8467 - HONDA FIT 2006 Hatchback No Petrol 1.3 200000 km 4.0 Variator Front 04-May Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 km 4.0 Automatic 4x4 04-May Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 km 4.0 Automatic Front 04-May Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19232 8467 - MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 Turbo 300000 km 4.0 Manual Rear 02-Mar Left wheel Silver 5
19233 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 km 4.0 Tiptronic Front 04-May Left wheel Red 8
19234 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2 116365 km 4.0 Automatic Front 04-May Left wheel Grey 4
19235 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2 51258 km 4.0 Automatic Front 04-May Left wheel Black 4
19236 470 753 HYUNDAI Sonata 2012 Sedan Yes Hybrid 2.4 186923 km 4.0 Automatic Front 04-May Left wheel White 12

19237 rows × 17 columns

Анализ датасета и очистка данных

In [733]:
df.dtypes
Out[733]:
Price                 int64
Levy                 object
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume        object
Mileage              object
Cylinders           float64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [734]:
df["Engine volume"] = df["Engine volume"].str.replace("Turbo", "")
df["Engine volume"] = pd.to_numeric(df["Engine volume"])
df["Engine volume"].unique()
Out[734]:
array([ 3.5,  3. ,  1.3,  2.5,  2. ,  1.8,  2.4,  4. ,  1.6,  3.3,  2.2,
        4.7,  1.5,  4.4,  1.4,  3.6,  2.3,  5.5,  2.8,  3.2,  3.8,  4.6,
        1.2,  5. ,  1.7,  2.9,  0.5,  1.9,  2.7,  4.8,  5.3,  0.4,  1.1,
        2.1,  0.7,  5.4,  3.7,  1. ,  2.6,  0.8,  0.2,  5.7,  6.7,  6.2,
        3.4,  6.3,  4.3,  4.2,  0. , 20. ,  0.3,  5.9,  5.6,  6. ,  0.6,
        6.8,  4.5,  7.3,  0.1,  3.1,  6.4,  3.9,  0.9,  5.2,  5.8])
In [735]:
df["Mileage"] = df["Mileage"].str.replace("km", "")
df["Mileage"] = df["Mileage"].astype("int64")
df["Mileage"].unique()
Out[735]:
array([186005, 192000, 200000, ..., 140607, 307325, 186923])
In [736]:
df["Levy"] = df["Levy"].replace("-", "0")
df["Levy"] = df["Levy"].astype("int64")
df["Levy"].unique()
Out[736]:
array([ 1399,  1018,     0,   862,   446,   891,   761,   751,   394,
        1053,  1055,  1079,   810,  2386,  1850,   531,   586,  1249,
        2455,   583,  1537,  1288,   915,  1750,   707,  1077,  1486,
        1091,   650,   382,  1436,  1194,   503,  1017,  1104,   639,
         629,   919,   781,   530,   640,   765,   777,   779,   934,
         769,   645,  1185,  1324,   830,  1187,  1111,   760,   642,
        1604,  1095,   966,   473,  1138,  1811,   988,   917,  1156,
         687, 11714,   836,  1347,  2866,  1646,   259,   609,   697,
         585,   475,   690,   308,  1823,  1361,  1273,   924,   584,
        2078,   831,  1172,   893,  1872,  1885,  1266,   447,  2148,
        1730,   730,   289,   502,   333,  1325,   247,   879,  1342,
        1327,  1598,  1514,  1058,   738,  1935,   481,  1522,  1282,
         456,   880,   900,   798,  1277,   442,  1051,   790,  1292,
        1047,   528,  1211,  1493,  1793,   574,   930,  1998,   271,
         706,  1481,  1677,  1661,  1286,  1408,  1090,   595,  1451,
        1267,   993,  1714,   878,   641,   749,  1511,   603,   353,
         877,  1236,  1141,   397,   784,  1024,  1357,  1301,   770,
         922,  1438,   753,   607,  1363,   638,   490,   431,   565,
         517,   833,   489,  1760,   986,  1841,  1620,  1360,   474,
        1099,   978,  1624,  1946,  1268,  1307,   696,   649,   666,
        2151,   551,   800,   971,  1323,  2377,  1845,  1083,   694,
         463,   419,   345,  1515,  1505,  2056,  1203,   729,   460,
        1356,   876,   911,  1190,   780,   448,  2410,  1848,  1148,
         834,  1275,  1028,  1197,   724,   890,  1705,   505,   789,
        2959,   518,   461,  1719,  2858,  3156,  2225,  2177,  1968,
        1888,  1308,  2736,  1103,   557,  2195,   843,  1664,   723,
        4508,   562,   501,  2018,  1076,  1202,  3301,   691,  1440,
        1869,  1178,   418,  1820,  1413,   488,  1304,   363,  2108,
         521,  1659,    87,  1411,  1528,  3292,  7058,  1578,   627,
         874,  1996,  1488,  5679,  1234,  5603,   400,   889,  3268,
         875,   949,  2265,   441,   742,   425,  2476,  2971,   614,
        1816,  1375,  1405,  2297,  1062,  1113,   420,  2469,   658,
        1951,  2670,  2578,  1995,  1032,   994,  1011,  2421,  1296,
         155,   494,   426,  1086,   961,  2236,  1829,   764,  1834,
        1054,   617,  1529,  2266,   637,   626,  1832,  1016,  2002,
        1756,   746,  1285,  2690,  1118,  5332,   980,  1807,   970,
        1228,  1195,  1132,  1768,  1384,  1080,  7063,  1817,  1452,
        1975,  1368,   702,  1974,  1781,  1036,   944,   663,   364,
        1539,  1345,  1680,  2209,   741,  1575,   695,  1317,   294,
        1525,   424,   997,  1473,  1552,  2819,  2188,  1668,  3057,
         799,  1502,  2606,   552,  1694,  1759,  1110,   399,  1470,
        1174,  5877,  1474,  1688,   526,   686,  5908,  1107,  2070,
        1468,  1246,  1685,   556,  1533,  1917,  1346,   732,   692,
         579,   421,   362,  3505,  1855,  2711,  1586,  3739,   681,
        1708,  2278,  1701,   722,  1482,   928,   827,   832,   527,
         604,   173,  1341,  3329,  1553,   859,   167,   916,   828,
        2082,  1176,  1108,   975,  3008,  1516,  2269,  1699,  2073,
        1031,  1503,  2364,  1030,  1442,  5666,  2715,  1437,  2067,
        1426,  2908,  1279,   866,  4283,   279,  2658,  3015,  2004,
        1391,  4736,   748,  1466,   644,   683,  2705,  1297,   731,
        1252,  2216,  3141,  3273,  1518,  1723,  1588,   972,   682,
        1094,   668,   175,   967,   402,  3894,  1960,  1599,  2000,
        2084,  1621,   714,  1109,  3989,   873,  1572,  1163,  1991,
        1716,  1673,  2562,  2874,   965,   462,   605,  1948,  1736,
        3518,  2054,  2467,  1681,  1272,  1205,   750,  2156,  2566,
         115,   524,  3184,   676,  1678,   612,   328,   955,  1441,
        1675,  3965,  2909,   623,   822,   867,  3025,  1993,   792,
         636,  4057,  3743,  2337,  2570,  2418,  2472,  3910,  1662,
        2123,  2628,  3208,  2080,  3699,  2913,   864,  2505,   870,
        7536,  1924,  1671,  1064,  1836,  1866,  4741,   841,  1369,
        5681,  3112,  1366,  2223,  1198,  1039,  3811,  3571,  1387,
        1171,  1365,  1531,  1590, 11706,  2308,  4860,  1641,  1045,
        1901])
In [737]:
df["Cylinders"] = df["Cylinders"].astype("int64")
df["Cylinders"].unique()
Out[737]:
array([ 6,  4,  8,  1, 12,  3,  2, 16,  5,  7,  9, 10, 14])
In [738]:
df["Doors"].unique()
Out[738]:
array(['04-May', '02-Mar', '>5'], dtype=object)
In [739]:
df["Doors"] = df["Doors"].map(
    {"02-Mar": "Двухдверный", "04-May": "Четырехдверный", ">5": "Многодверный"}
)
df["Doors"].unique()
Out[739]:
array(['Четырехдверный', 'Двухдверный', 'Многодверный'], dtype=object)
In [740]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[740]:
array([       1,        3,        6, ...,   627220,   872946, 26307500])
In [741]:
print(f"Количество строк до удаления некорректных значений: {len(df)}")
df = df[df["Price"] >= 500]
print(f"Количество строк после удаления некорректных значений: {len(df)}")
Количество строк до удаления некорректных значений: 19237
Количество строк после удаления некорректных значений: 17574
In [742]:
sorted_df = df.sort_values(by="Price")
sorted_df["Price"].unique()
Out[742]:
array([     500,      549,      600, ...,   627220,   872946, 26307500])
In [743]:
sorted_df = df.sort_values(by="Prod. year")
sorted_df["Prod. year"].unique()
Out[743]:
array([1943, 1953, 1957, 1964, 1965, 1968, 1973, 1974, 1977, 1978, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020])
In [744]:
df
Out[744]:
Price Levy Manufacturer Model Prod. year Category Leather interior Fuel type Engine volume Mileage Cylinders Gear box type Drive wheels Doors Wheel Color Airbags
0 13328 1399 LEXUS RX 450 2010 Jeep Yes Hybrid 3.5 186005 6 Automatic 4x4 Четырехдверный Left wheel Silver 12
1 16621 1018 CHEVROLET Equinox 2011 Jeep No Petrol 3.0 192000 6 Tiptronic 4x4 Четырехдверный Left wheel Black 8
2 8467 0 HONDA FIT 2006 Hatchback No Petrol 1.3 200000 4 Variator Front Четырехдверный Right-hand drive Black 2
3 3607 862 FORD Escape 2011 Jeep Yes Hybrid 2.5 168966 4 Automatic 4x4 Четырехдверный Left wheel White 0
4 11726 446 HONDA FIT 2014 Hatchback Yes Petrol 1.3 91901 4 Automatic Front Четырехдверный Left wheel Silver 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19231 5802 1055 MERCEDES-BENZ E 350 2013 Sedan Yes Diesel 3.5 107800 6 Automatic Rear Четырехдверный Left wheel Grey 12
19232 8467 0 MERCEDES-BENZ CLK 200 1999 Coupe Yes CNG 2.0 300000 4 Manual Rear Двухдверный Left wheel Silver 5
19233 15681 831 HYUNDAI Sonata 2011 Sedan Yes Petrol 2.4 161600 4 Tiptronic Front Четырехдверный Left wheel Red 8
19234 26108 836 HYUNDAI Tucson 2010 Jeep Yes Diesel 2.0 116365 4 Automatic Front Четырехдверный Left wheel Grey 4
19235 5331 1288 CHEVROLET Captiva 2007 Jeep Yes Diesel 2.0 51258 4 Automatic Front Четырехдверный Left wheel Black 4

17574 rows × 17 columns

Очистка дубликатов и пропущенных значений

In [745]:
df.duplicated().sum()
Out[745]:
np.int64(2773)
In [746]:
df.drop_duplicates(inplace=True)
# df.duplicated().sum()
C:\Users\user\AppData\Local\Temp\ipykernel_30576\1689817098.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)
In [747]:
df.isna().sum()
Out[747]:
Price               0
Levy                0
Manufacturer        0
Model               0
Prod. year          0
Category            0
Leather interior    0
Fuel type           0
Engine volume       0
Mileage             0
Cylinders           0
Gear box type       0
Drive wheels        0
Doors               0
Wheel               0
Color               0
Airbags             0
dtype: int64

Очистка выбросов

In [748]:
df.dtypes
Out[748]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [749]:
numeric_features_with_outliers = [
    "Price",
    "Levy",
    "Mileage",
    "Prod. year",
]

i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [750]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print(f"Количество строк до удаления выбросов: {len(df)}")

for column in numeric_features_with_outliers:
    df = remove_outliers(df, column)

print(f"Количество строк после удаления выбросов: {len(df)}")
Количество строк до удаления выбросов: 14801
Количество строк после удаления выбросов: 12597
In [751]:
i = 1
for col in numeric_features_with_outliers:
    plt.figure(figsize=(4, 30))
    plt.subplot(6, 1, i)
    df.boxplot(column=col)
    i += 1
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Формирование конвейера для классификации данных

In [752]:
df.dtypes
Out[752]:
Price                 int64
Levy                  int64
Manufacturer         object
Model                object
Prod. year            int64
Category             object
Leather interior     object
Fuel type            object
Engine volume       float64
Mileage               int64
Cylinders             int64
Gear box type        object
Drive wheels         object
Doors                object
Wheel                object
Color                object
Airbags               int64
dtype: object
In [753]:
columns_to_drop = [
    "Model",
    "Manufacturer",
    "Color",
    "Doors",
    "Cylinders",
    "Mileage",
]

num_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype != "object"
]

cat_columns = [
    column
    for column in df.columns
    if column not in columns_to_drop and df[column].dtype == "object"
]

cat_cols_for_one_hot_enc = [
    "Leather interior",
    "Category",
    "Fuel type",
    "Gear box type",
    "Drive wheels",
    # "Doors",
    "Wheel",
]

cat_cols_for_num_enc = [
    # "Model",
    # "Manufacturer",
    # "Color",
]

num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
preprocessing_num = Pipeline(
    [
        ("imputer", num_imputer),
        ("scaler", num_scaler),
    ]
)

cat_imputer = SimpleImputer(strategy="constant", fill_value="unknown")
cat_one_hot_encoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, drop="first"
)
preprocessing_one_hot = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", cat_one_hot_encoder),
    ]
)

preprocessing_label_enc = Pipeline(
    [
        ("imputer", cat_imputer),
        ("encoder", OrdinalEncoder()),
    ]
)

features_preprocessing = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("prepocessing_one_hot", preprocessing_one_hot, cat_cols_for_one_hot_enc),
        ("prepocessing_label_enc", preprocessing_label_enc, cat_cols_for_num_enc),
        ("prepocessing_num", preprocessing_num, num_columns),
    ],
    remainder="passthrough",
)

drop_columns = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ("drop_columns", "drop", columns_to_drop),
    ],
    remainder="passthrough",
)

pipeline_end = Pipeline(
    [
        ("features_preprocessing", features_preprocessing),
        ("drop_columns", drop_columns),
    ]
)

Демонстрация работы конвейера для предобработки данных

In [754]:
preprocessing_result = pipeline_end.fit_transform(df)
df = pd.DataFrame(
    preprocessing_result,
    columns=pipeline_end.get_feature_names_out(),
)

df
Out[754]:
Leather interior_Yes Category_Coupe Category_Goods wagon Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan ... Gear box type_Tiptronic Gear box type_Variator Drive wheels_Front Drive wheels_Rear Wheel_Right-hand drive Price Levy Prod. year Engine volume Airbags
0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.332733 1.851925 -0.224219 1.849443 1.353782
1 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 -0.029023 1.024150 -0.007434 1.162057 0.340316
2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 1.0 0.0 1.0 -0.781060 -1.187596 -1.091356 -1.175055 -1.179883
3 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 -1.229294 0.685218 -0.007434 0.474671 -1.686616
4 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 -0.480485 -0.218599 0.642919 -1.175055 -0.673150
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
19225 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 -0.752100 1.217515 -0.874572 -0.625146 -0.673150
19226 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 1.0 0.0 0.0 1.0 1.0 -0.838888 -1.187596 -1.741709 1.162057 -1.686616
19232 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 -0.781060 -1.187596 -2.608847 -0.212715 -0.419784
19233 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 1.0 0.0 1.0 0.0 0.0 -0.115718 0.617867 -0.007434 0.337194 0.340316
19234 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.845956 0.628730 -0.224219 -0.212715 -0.673150

12597 rows × 28 columns

Разбиение на выборки

In [755]:
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42
)

print("Размеры выборок:")
print(f"Обучающая выборка: {train_df.shape[0]} записей")
print(f"Тестовая выборка: {test_df.shape[0]} записей")
Размеры выборок:
Обучающая выборка: 8817 записей
Тестовая выборка: 3780 записей
In [756]:
price_y_train = train_df["Price"]
price_y_test = test_df["Price"]
train_df.drop(columns=["Price"], inplace=True)
test_df.drop(columns=["Price"], inplace=True)

train_df
Out[756]:
Leather interior_Yes Category_Coupe Category_Goods wagon Category_Hatchback Category_Jeep Category_Limousine Category_Microbus Category_Minivan Category_Pickup Category_Sedan ... Gear box type_Manual Gear box type_Tiptronic Gear box type_Variator Drive wheels_Front Drive wheels_Rear Wheel_Right-hand drive Levy Prod. year Engine volume Airbags
15146 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 -1.187596 1.943625 -0.487669 -0.673150
14145 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 1.0 0.0 1.0 -1.187596 -0.874572 -0.900100 -1.179883
8943 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 1.0 0.0 0.0 0.081225 0.642919 -0.487669 0.847049
17889 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 -1.187596 -1.524925 0.474671 -0.419784
9515 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 1.695495 1.510056 0.474671 -0.673150
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18201 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 -1.187596 -0.874572 0.337194 -0.673150
7436 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.474472 0.859703 -0.212715 1.353782
7728 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 1.0 0.0 0.0 0.643938 -0.657787 -0.900100 0.340316
1136 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 -1.187596 1.076487 -0.762623 0.340316
10640 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 1.0 0.0 0.0 0.444055 0.426134 0.337194 0.340316

8817 rows × 27 columns

Формирование набора моделей

In [757]:
models = {
    "linear": {"model": linear_model.LinearRegression(n_jobs=-1)},
    # "linear_poly": {
    #     "model": make_pipeline(
    #         PolynomialFeatures(degree=2),
    #         linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    #     )
    # },
    # "linear_interact": {
    #     "model": make_pipeline(
    #         PolynomialFeatures(interaction_only=True),
    #         linear_model.LinearRegression(fit_intercept=False, n_jobs=-1),
    #     )
    # },
    "ridge": {"model": linear_model.RidgeCV()},
    "decision_tree": {
        "model": tree.DecisionTreeRegressor(max_depth=7, random_state=random_state)
    },
    "knn": {"model": neighbors.KNeighborsRegressor(n_neighbors=7, n_jobs=-1)},
    "random_forest": {
        "model": ensemble.RandomForestRegressor(
            max_depth=7, random_state=random_state, n_jobs=-1
        )
    },
}

Обучение и оценка моделей с помощью различных алгоритмов

In [758]:
for model_name in models.keys():
    print(f"Model: {model_name}")

    model = models[model_name]["model"]

    fitted_model = model.fit(train_df.values, price_y_train.values.ravel())

    y_train_pred = fitted_model.predict(train_df)
    y_test_pred = fitted_model.predict(test_df)

    models[model_name]["fitted"] = fitted_model
    models[model_name]["train_preds"] = y_train_pred
    models[model_name]["preds"] = y_test_pred

    models[model_name]["RMSE_train"] = math.sqrt(
        metrics.mean_squared_error(price_y_train, y_train_pred)
    )
    models[model_name]["RMSE_test"] = math.sqrt(
        metrics.mean_squared_error(price_y_test, y_test_pred)
    )
    models[model_name]["RMAE_test"] = math.sqrt(
        metrics.mean_absolute_error(price_y_test, y_test_pred)
    )
    models[model_name]["R2_test"] = metrics.r2_score(price_y_test, y_test_pred)
Model: linear
Model: ridge
Model: decision_tree
Model: knn
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but LinearRegression was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but LinearRegression was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RidgeCV was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RidgeCV was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but DecisionTreeRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but KNeighborsRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but KNeighborsRegressor was fitted without feature names
  warnings.warn(
Model: random_forest
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(
c:\Users\user\source\repos\mai_pi-33_zakharov\.venv\Lib\site-packages\sklearn\base.py:486: UserWarning: X has feature names, but RandomForestRegressor was fitted without feature names
  warnings.warn(

Вывод результатов оценки

In [759]:
reg_metrics = pd.DataFrame.from_dict(models, "index")[
    ["RMSE_train", "RMSE_test", "RMAE_test", "R2_test"]
]
reg_metrics.sort_values(by="RMSE_test").style.background_gradient(
    cmap="viridis", low=1, high=0.3, subset=["RMSE_train", "RMSE_test"]
).background_gradient(cmap="plasma", low=0.3, high=1, subset=["RMAE_test", "R2_test"])
Out[759]:
  RMSE_train RMSE_test RMAE_test R2_test
knn 0.536712 0.602377 0.643601 0.640894
random_forest 0.610511 0.634321 0.675524 0.601798
decision_tree 0.650704 0.683437 0.701393 0.537743
ridge 0.779199 0.788052 0.776391 0.385397
linear 0.779198 0.788056 0.776379 0.385390